summary | refs | log | tree | commit | diff
path: root/crocoite/tools.py
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2018-06-25 19:55:48 +0200
committer: Lars-Dominik Braun <lars@6xq.net> 2018-06-25 19:55:48 +0200
commit: 785ef19736cc9a21746e00a022b76fd756c162de (patch)
tree: 041a8696c852294fe9573485831398933e26ee13 /crocoite/tools.py
parent: 344a6b449075a8fb42054801144c40760f791366 (diff)
download: crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.gz
crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.bz2
crocoite-785ef19736cc9a21746e00a022b76fd756c162de.zip
warc: Save DOM-/image screenshot as WARC conversion
Judging from the docs, this is the proper way to store these resources. Enable both for the IRC bot by default, since they won’t interfere with IA’s wayback machine.
Diffstat (limited to 'crocoite/tools.py')
-rw-r--r-- crocoite/tools.py 29
1 files changed, 16 insertions, 13 deletions
diff --git a/crocoite/tools.py b/crocoite/tools.py
index bc92f8f..3aeaaad 100644
--- a/crocoite/tools.py
+++ b/crocoite/tools.py
@@ -80,18 +80,21 @@ def extractScreenshot ():
args = parser.parse_args()
- screenshotRe = re.compile (r'^urn:crocoite:screenshot-(\d+)-(\d+).png$', re.I)
with args.input:
- for record in ArchiveIterator(args.input):
- uri = record.rec_headers.get_header('WARC-Target-URI')
- if record.rec_type == 'resource':
- m = screenshotRe.match (uri)
- xoff, yoff = m.groups ()
- if m:
- outpath = '{}-{}-{}.png'.format (args.prefix, xoff, yoff)
- if args.force or not os.path.exists (outpath):
- with open (outpath, 'wb') as out:
- shutil.copyfileobj (record.raw_stream, out)
- else:
- print ('not overwriting {}'.format (outpath))
+ for record in ArchiveIterator (args.input):
+ headers = record.rec_headers
+ if record.rec_type != 'conversion' or \
+ headers['Content-Type'] != 'image/png' or \
+ 'X-Crocoite-Screenshot-Y-Offset' not in headers:
+ continue
+
+ urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_')
+ xoff = 0
+ yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset'))
+ outpath = '{}-{}-{}-{}.png'.format (args.prefix, urlSanitized, xoff, yoff)
+ if args.force or not os.path.exists (outpath):
+ with open (outpath, 'wb') as out:
+ shutil.copyfileobj (record.raw_stream, out)
+ else:
+ print ('not overwriting {}'.format (outpath))