summaryrefslogtreecommitdiff
path: root/crocoite/warc.py
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2018-06-25 19:55:48 +0200
committerLars-Dominik Braun <lars@6xq.net>2018-06-25 19:55:48 +0200
commit785ef19736cc9a21746e00a022b76fd756c162de (patch)
tree041a8696c852294fe9573485831398933e26ee13 /crocoite/warc.py
parent344a6b449075a8fb42054801144c40760f791366 (diff)
downloadcrocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.gz
crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.bz2
crocoite-785ef19736cc9a21746e00a022b76fd756c162de.zip
warc: Save DOM-/image screenshot as WARC conversion
Judging from the docs this is the proper way to store these resources. Enable both for the IRC bot by default, since they won’t interfere with IA’s wayback machine.
Diffstat (limited to 'crocoite/warc.py')
-rw-r--r--crocoite/warc.py39
1 files changed, 30 insertions, 9 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py
index af04cf9..e472f16 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -38,7 +38,7 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
from .browser import Item
class WarcHandler (EventHandler):
- __slots__ = ('logger', 'writer', 'maxBodySize')
+ __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords')
def __init__ (self, fd,
logger=logging.getLogger(__name__),
@@ -46,6 +46,9 @@ class WarcHandler (EventHandler):
self.logger = logger
self.writer = WARCWriter (fd, gzip=True)
self.maxBodySize = maxBodySize
+ # maps document urls to WARC record ids, required for DomSnapshotEvent
+ # and ScreenshotEvent
+ self.documentRecords = {}
def _writeRequest (self, item):
writer = self.writer
@@ -135,6 +138,9 @@ class WarcHandler (EventHandler):
http_headers=httpHeaders)
writer.write_record(record)
+ if item.resourceType == 'Document':
+ self.documentRecords[item.url] = record.rec_headers.get_header ('WARC-Record-ID')
+
def _writeScript (self, item):
writer = self.writer
encoding = 'utf-8'
@@ -155,21 +161,36 @@ class WarcHandler (EventHandler):
except ValueError as e:
self.logger.error (e.args[0])
+ def _addRefersTo (self, headers, url):
+ refersTo = self.documentRecords.get (url)
+ if refersTo:
+ headers['WARC-Refers-To'] = refersTo
+ else:
+ self.logger.error ('No document record found for {}'.format (url))
+ return headers
+
def _writeDomSnapshot (self, item):
writer = self.writer
- httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
- record = writer.create_warc_record (item.url, 'response',
+
+ warcHeaders = {'X-DOM-Snapshot': str (True),
+ 'X-Chrome-Viewport': item.viewport,
+ 'Content-Type': 'text/html; charset=utf-8',
+ }
+
+ self._addRefersTo (warcHeaders, item.url)
+
+ record = writer.create_warc_record (item.url, 'conversion',
payload=BytesIO (item.document),
- http_headers=httpHeaders,
- warc_headers_dict={'X-DOM-Snapshot': str (True),
- 'X-Chrome-Viewport': item.viewport})
+ warc_headers_dict=warcHeaders)
writer.write_record (record)
def _writeScreenshot (self, item):
writer = self.writer
- url = packageUrl ('screenshot-{}-{}.png'.format (0, item.yoff))
- record = writer.create_warc_record (url, 'resource',
- payload=BytesIO (item.data), warc_headers_dict={'Content-Type': 'image/png'})
+ warcHeaders = {'Content-Type': 'image/png',
+ 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff)}
+ self._addRefersTo (warcHeaders, item.url)
+ record = writer.create_warc_record (item.url, 'conversion',
+ payload=BytesIO (item.data), warc_headers_dict=warcHeaders)
writer.write_record (record)
def _writeControllerStart (self, item):