diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-12-31 19:29:49 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-12-31 19:29:49 +0100 |
commit | ad4e119bf1a55c84dc7c6260588ed7db9e7199c6 (patch) | |
tree | cdbec18406ffa54dc3a7e2a4a0df4b6a9ca745c5 /crocoite | |
parent | cc1132a5b4677d089e024bcd0e16e1e817a3581c (diff) | |
download | crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.tar.gz crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.tar.bz2 crocoite-ad4e119bf1a55c84dc7c6260588ed7db9e7199c6.zip |
extract-screenshot: Remove URL from filename
URLs can get quite long, overflowing the file name length limit.
Instead use sequential filenames and output metadata to stdout.
Diffstat (limited to 'crocoite')
-rw-r--r-- | crocoite/tools.py | 27 |
1 file changed, 19 insertions, 8 deletions
diff --git a/crocoite/tools.py b/crocoite/tools.py index 9c5d836..f4d506d 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -24,10 +24,14 @@ Misc tools import shutil, sys, os, logging, argparse, json from io import BytesIO + from warcio.archiveiterator import ArchiveIterator from warcio.warcwriter import WARCWriter +from yarl import URL + from pkg_resources import parse_version, parse_requirements -from .util import packageUrl, getSoftwareInfo + +from .util import packageUrl, getSoftwareInfo, StrJsonEncoder def mergeWarc (files, output): unique = 0 @@ -98,13 +102,17 @@ def extractScreenshot (): Extract page screenshots from a WARC generated by crocoite into files """ - parser = argparse.ArgumentParser(description='Extract screenshots.') - parser.add_argument('-f', '--force', action='store_true', help='Overwrite existing files') - parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC') + parser = argparse.ArgumentParser(description='Extract screenshots from ' + 'WARC, write TSV-formatted info to stdout.') + parser.add_argument('-f', '--force', action='store_true', + help='Overwrite existing files') + parser.add_argument('input', type=argparse.FileType ('rb'), + help='Input WARC') parser.add_argument('prefix', help='Output file prefix') args = parser.parse_args() + i = 0 with args.input: for record in ArchiveIterator (args.input): headers = record.rec_headers @@ -113,15 +121,18 @@ def extractScreenshot (): 'X-Crocoite-Screenshot-Y-Offset' not in headers: continue - urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_') - xoff = 0 + url = URL (headers.get_header ('WARC-Target-URI')) yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset')) - outpath = f'{args.prefix}-{urlSanitized}-{xoff}-{yoff}.png' + outpath = f'{args.prefix}{i:05d}.png' if args.force or not os.path.exists (outpath): + json.dump ({'file': outpath, 'url': url, 'yoff': yoff}, + sys.stdout, cls=StrJsonEncoder) + sys.stdout.write ('\n') with open 
(outpath, 'wb') as out: shutil.copyfileobj (record.raw_stream, out) + i += 1 else: - print (f'not overwriting {outputh}') + print (f'not overwriting {outpath}', file=sys.stderr) class Errata: __slots__ = ('uuid', 'description', 'affects') |