From ad4e119bf1a55c84dc7c6260588ed7db9e7199c6 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Mon, 31 Dec 2018 19:29:49 +0100 Subject: extract-screenshot: Remove URL from filename MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit URLs can get quite long, overflowing the file name length limit. Instead use sequential filenames and output metadata to stdout. --- crocoite/tools.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/crocoite/tools.py b/crocoite/tools.py index 9c5d836..f4d506d 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -24,10 +24,14 @@ Misc tools import shutil, sys, os, logging, argparse, json from io import BytesIO + from warcio.archiveiterator import ArchiveIterator from warcio.warcwriter import WARCWriter +from yarl import URL + from pkg_resources import parse_version, parse_requirements -from .util import packageUrl, getSoftwareInfo + +from .util import packageUrl, getSoftwareInfo, StrJsonEncoder def mergeWarc (files, output): unique = 0 @@ -98,13 +102,17 @@ def extractScreenshot (): Extract page screenshots from a WARC generated by crocoite into files """ - parser = argparse.ArgumentParser(description='Extract screenshots.') - parser.add_argument('-f', '--force', action='store_true', help='Overwrite existing files') - parser.add_argument('input', type=argparse.FileType ('rb'), help='Input WARC') + parser = argparse.ArgumentParser(description='Extract screenshots from ' + 'WARC, write TSV-formatted info to stdout.') + parser.add_argument('-f', '--force', action='store_true', + help='Overwrite existing files') + parser.add_argument('input', type=argparse.FileType ('rb'), + help='Input WARC') parser.add_argument('prefix', help='Output file prefix') args = parser.parse_args() + i = 0 with args.input: for record in ArchiveIterator (args.input): headers = record.rec_headers @@ -113,15 +121,18 @@ def extractScreenshot (): 
'X-Crocoite-Screenshot-Y-Offset' not in headers: continue - urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_') - xoff = 0 + url = URL (headers.get_header ('WARC-Target-URI')) yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset')) - outpath = f'{args.prefix}-{urlSanitized}-{xoff}-{yoff}.png' + outpath = f'{args.prefix}{i:05d}.png' if args.force or not os.path.exists (outpath): + json.dump ({'file': outpath, 'url': url, 'yoff': yoff}, + sys.stdout, cls=StrJsonEncoder) + sys.stdout.write ('\n') with open (outpath, 'wb') as out: shutil.copyfileobj (record.raw_stream, out) + i += 1 else: - print (f'not overwriting {outputh}') + print (f'not overwriting {outpath}', file=sys.stderr) class Errata: __slots__ = ('uuid', 'description', 'affects') -- cgit v1.2.3