diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-06-25 19:55:48 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-06-25 19:55:48 +0200 |
commit | 785ef19736cc9a21746e00a022b76fd756c162de (patch) | |
tree | 041a8696c852294fe9573485831398933e26ee13 /crocoite/warc.py | |
parent | 344a6b449075a8fb42054801144c40760f791366 (diff) | |
download | crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.gz crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.bz2 crocoite-785ef19736cc9a21746e00a022b76fd756c162de.zip |
warc: Save DOM snapshots and screenshot images as WARC conversion records
Judging from the docs, this is the proper way to store these resources.
Enable both for the IRC bot by default, since they won’t interfere with
the Internet Archive’s Wayback Machine.
Diffstat (limited to 'crocoite/warc.py')
-rw-r--r-- | crocoite/warc.py | 39 |
1 file changed, 30 insertions, 9 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py index af04cf9..e472f16 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -38,7 +38,7 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent from .browser import Item class WarcHandler (EventHandler): - __slots__ = ('logger', 'writer', 'maxBodySize') + __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords') def __init__ (self, fd, logger=logging.getLogger(__name__), @@ -46,6 +46,9 @@ class WarcHandler (EventHandler): self.logger = logger self.writer = WARCWriter (fd, gzip=True) self.maxBodySize = maxBodySize + # maps document urls to WARC record ids, required for DomSnapshotEvent + # and ScreenshotEvent + self.documentRecords = {} def _writeRequest (self, item): writer = self.writer @@ -135,6 +138,9 @@ class WarcHandler (EventHandler): http_headers=httpHeaders) writer.write_record(record) + if item.resourceType == 'Document': + self.documentRecords[item.url] = record.rec_headers.get_header ('WARC-Record-ID') + def _writeScript (self, item): writer = self.writer encoding = 'utf-8' @@ -155,21 +161,36 @@ class WarcHandler (EventHandler): except ValueError as e: self.logger.error (e.args[0]) + def _addRefersTo (self, headers, url): + refersTo = self.documentRecords.get (url) + if refersTo: + headers['WARC-Refers-To'] = refersTo + else: + self.logger.error ('No document record found for {}'.format (url)) + return headers + def _writeDomSnapshot (self, item): writer = self.writer - httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') - record = writer.create_warc_record (item.url, 'response', + + warcHeaders = {'X-DOM-Snapshot': str (True), + 'X-Chrome-Viewport': item.viewport, + 'Content-Type': 'text/html; charset=utf-8', + } + + self._addRefersTo (warcHeaders, item.url) + + record = writer.create_warc_record (item.url, 'conversion', payload=BytesIO (item.document), - http_headers=httpHeaders, - warc_headers_dict={'X-DOM-Snapshot': str (True), - 'X-Chrome-Viewport': item.viewport}) 
+ warc_headers_dict=warcHeaders) writer.write_record (record) def _writeScreenshot (self, item): writer = self.writer - url = packageUrl ('screenshot-{}-{}.png'.format (0, item.yoff)) - record = writer.create_warc_record (url, 'resource', - payload=BytesIO (item.data), warc_headers_dict={'Content-Type': 'image/png'}) + warcHeaders = {'Content-Type': 'image/png', + 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff)} + self._addRefersTo (warcHeaders, item.url) + record = writer.create_warc_record (item.url, 'conversion', + payload=BytesIO (item.data), warc_headers_dict=warcHeaders) writer.write_record (record) def _writeControllerStart (self, item): |