From 785ef19736cc9a21746e00a022b76fd756c162de Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Mon, 25 Jun 2018 19:55:48 +0200 Subject: warc: Save DOM-/image screenshot as WARC conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Judging from the docs this is the proper way to store these resources. Enable both for the IRC bot by default, since they won’t interfere with IA’s wayback machine. --- crocoite/tools.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'crocoite/tools.py') diff --git a/crocoite/tools.py b/crocoite/tools.py index bc92f8f..3aeaaad 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -80,18 +80,21 @@ def extractScreenshot (): args = parser.parse_args() - screenshotRe = re.compile (r'^urn:crocoite:screenshot-(\d+)-(\d+).png$', re.I) with args.input: - for record in ArchiveIterator(args.input): - uri = record.rec_headers.get_header('WARC-Target-URI') - if record.rec_type == 'resource': - m = screenshotRe.match (uri) - xoff, yoff = m.groups () - if m: - outpath = '{}-{}-{}.png'.format (args.prefix, xoff, yoff) - if args.force or not os.path.exists (outpath): - with open (outpath, 'wb') as out: - shutil.copyfileobj (record.raw_stream, out) - else: - print ('not overwriting {}'.format (outpath)) + for record in ArchiveIterator (args.input): + headers = record.rec_headers + if record.rec_type != 'conversion' or \ + headers['Content-Type'] != 'image/png' or \ + 'X-Crocoite-Screenshot-Y-Offset' not in headers: + continue + + urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_') + xoff = 0 + yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset')) + outpath = '{}-{}-{}-{}.png'.format (args.prefix, urlSanitized, xoff, yoff) + if args.force or not os.path.exists (outpath): + with open (outpath, 'wb') as out: + shutil.copyfileobj (record.raw_stream, out) + else: + print ('not overwriting {}'.format (outpath)) -- cgit v1.2.3