From 785ef19736cc9a21746e00a022b76fd756c162de Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Mon, 25 Jun 2018 19:55:48 +0200 Subject: warc: Save DOM-/image screenshot as WARC conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Judging from the docs this is the proper way to store these resources. Enable both for the IRC bot by default, since they won’t interfere with IA’s wayback machine. --- crocoite/warc.py | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) (limited to 'crocoite/warc.py') diff --git a/crocoite/warc.py b/crocoite/warc.py index af04cf9..e472f16 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -38,7 +38,7 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent from .browser import Item class WarcHandler (EventHandler): - __slots__ = ('logger', 'writer', 'maxBodySize') + __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords') def __init__ (self, fd, logger=logging.getLogger(__name__), @@ -46,6 +46,9 @@ class WarcHandler (EventHandler): self.logger = logger self.writer = WARCWriter (fd, gzip=True) self.maxBodySize = maxBodySize + # maps document urls to WARC record ids, required for DomSnapshotEvent + # and ScreenshotEvent + self.documentRecords = {} def _writeRequest (self, item): writer = self.writer @@ -135,6 +138,9 @@ class WarcHandler (EventHandler): http_headers=httpHeaders) writer.write_record(record) + if item.resourceType == 'Document': + self.documentRecords[item.url] = record.rec_headers.get_header ('WARC-Record-ID') + def _writeScript (self, item): writer = self.writer encoding = 'utf-8' @@ -155,21 +161,36 @@ class WarcHandler (EventHandler): except ValueError as e: self.logger.error (e.args[0]) + def _addRefersTo (self, headers, url): + refersTo = self.documentRecords.get (url) + if refersTo: + headers['WARC-Refers-To'] = refersTo + else: + self.logger.error ('No document record found for {}'.format (url)) + return headers + def _writeDomSnapshot (self, item): writer = self.writer - httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') - record = writer.create_warc_record (item.url, 'response', + + warcHeaders = {'X-DOM-Snapshot': str (True), + 'X-Chrome-Viewport': item.viewport, + 'Content-Type': 'text/html; charset=utf-8', + } + + self._addRefersTo (warcHeaders, item.url) + + record = writer.create_warc_record (item.url, 'conversion', payload=BytesIO (item.document), - http_headers=httpHeaders, - warc_headers_dict={'X-DOM-Snapshot': str (True), - 'X-Chrome-Viewport': item.viewport}) + warc_headers_dict=warcHeaders) writer.write_record (record) def _writeScreenshot (self, item): writer = self.writer - url = packageUrl ('screenshot-{}-{}.png'.format (0, item.yoff)) - record = writer.create_warc_record (url, 'resource', - payload=BytesIO (item.data), warc_headers_dict={'Content-Type': 'image/png'}) + warcHeaders = {'Content-Type': 'image/png', + 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff)} + self._addRefersTo (warcHeaders, item.url) + record = writer.create_warc_record (item.url, 'conversion', + payload=BytesIO (item.data), warc_headers_dict=warcHeaders) writer.write_record (record) def _writeControllerStart (self, item): -- cgit v1.2.3