diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-06-25 19:55:48 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-06-25 19:55:48 +0200 |
commit | 785ef19736cc9a21746e00a022b76fd756c162de (patch) | |
tree | 041a8696c852294fe9573485831398933e26ee13 | |
parent | 344a6b449075a8fb42054801144c40760f791366 (diff) | |
download | crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.gz crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.bz2 crocoite-785ef19736cc9a21746e00a022b76fd756c162de.zip |
warc: Save DOM snapshots and screenshots as WARC conversion records
Judging from the docs, this is the proper way to store these resources.
Enable both behaviors (domSnapshot and screenshot) for the IRC bot by default, since they won’t interfere with
the Internet Archive’s Wayback Machine.
-rw-r--r-- | contrib/celerycrocoite.py | 3 | ||||
-rw-r--r-- | crocoite/behavior.py | 17 | ||||
-rw-r--r-- | crocoite/browser.py | 10 | ||||
-rw-r--r-- | crocoite/controller.py | 8 | ||||
-rw-r--r-- | crocoite/tools.py | 29 | ||||
-rw-r--r-- | crocoite/util.py | 6 | ||||
-rw-r--r-- | crocoite/warc.py | 39 |
7 files changed, 73 insertions, 39 deletions
diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py index d0a02e9..3da43d9 100644 --- a/contrib/celerycrocoite.py +++ b/contrib/celerycrocoite.py @@ -192,12 +192,11 @@ def archive (bot, trigger): if not args: bot.reply ('Sorry, I don’t understand {}'.format (trigger.group (2))) return - blacklistedBehavior = {'domSnapshot', 'screenshot'} settings = dict (maxBodySize=args.maxBodySize, logBuffer=defaultSettings.logBuffer, idleTimeout=args.idleTimeout, timeout=args.timeout) args = dict (url=args.url, - enabledBehaviorNames=list (set (behavior.availableMap.keys())-blacklistedBehavior), + enabledBehaviorNames=list (behavior.availableMap.keys ()), settings=settings, recursive=args.recursive, concurrency=args.concurrency) q = bot.memory['crocoite']['q'] diff --git a/crocoite/behavior.py b/crocoite/behavior.py index ab859f8..b34d3d9 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -32,7 +32,7 @@ from collections import OrderedDict from html5lib.serializer import HTMLSerializer from pychrome.exceptions import TimeoutException -from .util import randomString, getFormattedViewportMetrics +from .util import randomString, getFormattedViewportMetrics, removeFragment from . 
import html from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker @@ -229,12 +229,13 @@ class DomSnapshot (Behavior): disallowedAttributes = html.eventAttributes stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes) serializer = HTMLSerializer () - yield DomSnapshotEvent (doc['documentURL'], serializer.render (stream, 'utf-8'), viewport) + yield DomSnapshotEvent (removeFragment (doc['documentURL']), serializer.render (stream, 'utf-8'), viewport) class ScreenshotEvent: - __slots__ = ('yoff', 'data') + __slots__ = ('yoff', 'data', 'url') - def __init__ (self, yoff, data): + def __init__ (self, url, yoff, data): + self.url = url self.yoff = yoff self.data = data @@ -248,6 +249,12 @@ class Screenshot (Behavior): def onfinish (self): tab = self.loader.tab + try: + url = removeFragment (tab.Page.getFrameTree ()['frameTree']['frame']['url']) + except KeyError: + logger.error ('frame has no url') + url = None + # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js # Hardcoded max texture size of 16,384 (crbug.com/770769) maxDim = 16*1024 @@ -260,7 +267,7 @@ class Screenshot (Behavior): height = min (contentSize['height'] - yoff, maxDim) clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1} data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data']) - yield ScreenshotEvent (yoff, data) + yield ScreenshotEvent (url, yoff, data) class Click (JsOnload): """ Generic link clicking """ diff --git a/crocoite/browser.py b/crocoite/browser.py index 1c09598..6a4bee2 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -41,9 +41,9 @@ class Item: def __init__ (self, tab): self.tab = tab - self.chromeRequest = None - self.chromeResponse = None - self.chromeFinished = None + self.chromeRequest = {} + self.chromeResponse = {} + self.chromeFinished = {} self.isRedirect = False self.failed = False @@ -128,6 
+128,10 @@ class Item: return text[0] return 'No status text available' + @property + def resourceType (self): + return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None)) + @staticmethod def _unfoldHeaders (headers): """ diff --git a/crocoite/controller.py b/crocoite/controller.py index 84001b7..ef042cc 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -71,11 +71,10 @@ class StatsHandler (EventHandler): self.stats['crashed'] += 1 import logging, time -from urllib.parse import urlsplit, urlunsplit from . import behavior as cbehavior from .browser import ChromeService, SiteLoader, Item -from .util import getFormattedViewportMetrics +from .util import getFormattedViewportMetrics, removeFragment class ControllerStart: __slots__ = ('payload') @@ -238,11 +237,6 @@ class PrefixLimit (RecursionPolicy): def __call__ (self, urls): return set (filter (lambda u: u.startswith (self.prefix), urls)) -def removeFragment (u): - """ Remove fragment from url (i.e. #hashvalue) """ - s = urlsplit (u) - return urlunsplit ((s.scheme, s.netloc, s.path, s.query, '')) - from .behavior import ExtractLinksEvent class RecursiveController (EventHandler): diff --git a/crocoite/tools.py b/crocoite/tools.py index bc92f8f..3aeaaad 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -80,18 +80,21 @@ def extractScreenshot (): args = parser.parse_args() - screenshotRe = re.compile (r'^urn:crocoite:screenshot-(\d+)-(\d+).png$', re.I) with args.input: - for record in ArchiveIterator(args.input): - uri = record.rec_headers.get_header('WARC-Target-URI') - if record.rec_type == 'resource': - m = screenshotRe.match (uri) - xoff, yoff = m.groups () - if m: - outpath = '{}-{}-{}.png'.format (args.prefix, xoff, yoff) - if args.force or not os.path.exists (outpath): - with open (outpath, 'wb') as out: - shutil.copyfileobj (record.raw_stream, out) - else: - print ('not overwriting {}'.format (outpath)) + for record in ArchiveIterator (args.input): + headers = 
record.rec_headers + if record.rec_type != 'conversion' or \ + headers['Content-Type'] != 'image/png' or \ + 'X-Crocoite-Screenshot-Y-Offset' not in headers: + continue + + urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_') + xoff = 0 + yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset')) + outpath = '{}-{}-{}-{}.png'.format (args.prefix, urlSanitized, xoff, yoff) + if args.force or not os.path.exists (outpath): + with open (outpath, 'wb') as out: + shutil.copyfileobj (record.raw_stream, out) + else: + print ('not overwriting {}'.format (outpath)) diff --git a/crocoite/util.py b/crocoite/util.py index ec257f1..fe43f01 100644 --- a/crocoite/util.py +++ b/crocoite/util.py @@ -23,6 +23,7 @@ Random utility functions """ import random +from urllib.parse import urlsplit, urlunsplit def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): if length is None: @@ -41,3 +42,8 @@ def getFormattedViewportMetrics (tab): return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'], layoutMetrics['layoutViewport']['clientHeight']) +def removeFragment (u): + """ Remove fragment from url (i.e. 
#hashvalue) """ + s = urlsplit (u) + return urlunsplit ((s.scheme, s.netloc, s.path, s.query, '')) + diff --git a/crocoite/warc.py b/crocoite/warc.py index af04cf9..e472f16 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -38,7 +38,7 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent from .browser import Item class WarcHandler (EventHandler): - __slots__ = ('logger', 'writer', 'maxBodySize') + __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords') def __init__ (self, fd, logger=logging.getLogger(__name__), @@ -46,6 +46,9 @@ class WarcHandler (EventHandler): self.logger = logger self.writer = WARCWriter (fd, gzip=True) self.maxBodySize = maxBodySize + # maps document urls to WARC record ids, required for DomSnapshotEvent + # and ScreenshotEvent + self.documentRecords = {} def _writeRequest (self, item): writer = self.writer @@ -135,6 +138,9 @@ class WarcHandler (EventHandler): http_headers=httpHeaders) writer.write_record(record) + if item.resourceType == 'Document': + self.documentRecords[item.url] = record.rec_headers.get_header ('WARC-Record-ID') + def _writeScript (self, item): writer = self.writer encoding = 'utf-8' @@ -155,21 +161,36 @@ class WarcHandler (EventHandler): except ValueError as e: self.logger.error (e.args[0]) + def _addRefersTo (self, headers, url): + refersTo = self.documentRecords.get (url) + if refersTo: + headers['WARC-Refers-To'] = refersTo + else: + self.logger.error ('No document record found for {}'.format (url)) + return headers + def _writeDomSnapshot (self, item): writer = self.writer - httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') - record = writer.create_warc_record (item.url, 'response', + + warcHeaders = {'X-DOM-Snapshot': str (True), + 'X-Chrome-Viewport': item.viewport, + 'Content-Type': 'text/html; charset=utf-8', + } + + self._addRefersTo (warcHeaders, item.url) + + record = writer.create_warc_record (item.url, 'conversion', payload=BytesIO (item.document), - 
http_headers=httpHeaders, - warc_headers_dict={'X-DOM-Snapshot': str (True), - 'X-Chrome-Viewport': item.viewport}) + warc_headers_dict=warcHeaders) writer.write_record (record) def _writeScreenshot (self, item): writer = self.writer - url = packageUrl ('screenshot-{}-{}.png'.format (0, item.yoff)) - record = writer.create_warc_record (url, 'resource', - payload=BytesIO (item.data), warc_headers_dict={'Content-Type': 'image/png'}) + warcHeaders = {'Content-Type': 'image/png', + 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff)} + self._addRefersTo (warcHeaders, item.url) + record = writer.create_warc_record (item.url, 'conversion', + payload=BytesIO (item.data), warc_headers_dict=warcHeaders) writer.write_record (record) def _writeControllerStart (self, item): |