diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-06-25 19:55:48 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-06-25 19:55:48 +0200 |
commit | 785ef19736cc9a21746e00a022b76fd756c162de (patch) | |
tree | 041a8696c852294fe9573485831398933e26ee13 /crocoite/tools.py | |
parent | 344a6b449075a8fb42054801144c40760f791366 (diff) | |
download | crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.gz crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.bz2 crocoite-785ef19736cc9a21746e00a022b76fd756c162de.zip |
warc: Save DOM-/image screenshot as WARC conversion
Judging from the docs this is the proper way to store these resources.
Enable both for the IRC bot by default, since they won’t interfere with
IA’s wayback machine.
Diffstat (limited to 'crocoite/tools.py')
-rw-r--r-- | crocoite/tools.py | 29 |
1 files changed, 16 insertions, 13 deletions
```diff
diff --git a/crocoite/tools.py b/crocoite/tools.py
index bc92f8f..3aeaaad 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -80,18 +80,21 @@ def extractScreenshot ():
 
     args = parser.parse_args()
 
-    screenshotRe = re.compile (r'^urn:crocoite:screenshot-(\d+)-(\d+).png$', re.I)
     with args.input:
-        for record in ArchiveIterator(args.input):
-            uri = record.rec_headers.get_header('WARC-Target-URI')
-            if record.rec_type == 'resource':
-                m = screenshotRe.match (uri)
-                xoff, yoff = m.groups ()
-                if m:
-                    outpath = '{}-{}-{}.png'.format (args.prefix, xoff, yoff)
-                    if args.force or not os.path.exists (outpath):
-                        with open (outpath, 'wb') as out:
-                            shutil.copyfileobj (record.raw_stream, out)
-                    else:
-                        print ('not overwriting {}'.format (outpath))
+        for record in ArchiveIterator (args.input):
+            headers = record.rec_headers
+            if record.rec_type != 'conversion' or \
+                    headers['Content-Type'] != 'image/png' or \
+                    'X-Crocoite-Screenshot-Y-Offset' not in headers:
+                continue
+
+            urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_')
+            xoff = 0
+            yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset'))
+            outpath = '{}-{}-{}-{}.png'.format (args.prefix, urlSanitized, xoff, yoff)
+            if args.force or not os.path.exists (outpath):
+                with open (outpath, 'wb') as out:
+                    shutil.copyfileobj (record.raw_stream, out)
+            else:
+                print ('not overwriting {}'.format (outpath))
 
```