From 785ef19736cc9a21746e00a022b76fd756c162de Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Mon, 25 Jun 2018 19:55:48 +0200 Subject: warc: Save DOM-/image screenshot as WARC conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Judging from the docs this is the proper way to store these resources. Enable both for the IRC bot by default, since they won’t interfere with IA’s wayback machine. --- crocoite/behavior.py | 17 ++++++++++++----- crocoite/browser.py | 10 +++++++--- crocoite/controller.py | 8 +------- crocoite/tools.py | 29 ++++++++++++++++------------- crocoite/util.py | 6 ++++++ crocoite/warc.py | 39 ++++++++++++++++++++++++++++++--------- 6 files changed, 72 insertions(+), 37 deletions(-) (limited to 'crocoite') diff --git a/crocoite/behavior.py b/crocoite/behavior.py index ab859f8..b34d3d9 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -32,7 +32,7 @@ from collections import OrderedDict from html5lib.serializer import HTMLSerializer from pychrome.exceptions import TimeoutException -from .util import randomString, getFormattedViewportMetrics +from .util import randomString, getFormattedViewportMetrics, removeFragment from . import html from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker @@ -229,12 +229,13 @@ class DomSnapshot (Behavior): disallowedAttributes = html.eventAttributes stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes) serializer = HTMLSerializer () - yield DomSnapshotEvent (doc['documentURL'], serializer.render (stream, 'utf-8'), viewport) + yield DomSnapshotEvent (removeFragment (doc['documentURL']), serializer.render (stream, 'utf-8'), viewport) class ScreenshotEvent: - __slots__ = ('yoff', 'data') + __slots__ = ('yoff', 'data', 'url') - def __init__ (self, yoff, data): + def __init__ (self, url, yoff, data): + self.url = url self.yoff = yoff self.data = data @@ -248,6 +249,12 @@ class Screenshot (Behavior): def onfinish (self): tab = self.loader.tab + try: + url = removeFragment (tab.Page.getFrameTree ()['frameTree']['frame']['url']) + except KeyError: + logger.error ('frame has no url') + url = None + # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js # Hardcoded max texture size of 16,384 (crbug.com/770769) maxDim = 16*1024 @@ -260,7 +267,7 @@ class Screenshot (Behavior): height = min (contentSize['height'] - yoff, maxDim) clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1} data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data']) - yield ScreenshotEvent (yoff, data) + yield ScreenshotEvent (url, yoff, data) class Click (JsOnload): """ Generic link clicking """ diff --git a/crocoite/browser.py b/crocoite/browser.py index 1c09598..6a4bee2 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -41,9 +41,9 @@ class Item: def __init__ (self, tab): self.tab = tab - self.chromeRequest = None - self.chromeResponse = None - self.chromeFinished = None + self.chromeRequest = {} + self.chromeResponse = {} + self.chromeFinished = {} self.isRedirect = False self.failed = False @@ -128,6 +128,10 @@ class Item: return text[0] return 'No status text available' + @property + def resourceType (self): + return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None)) + @staticmethod def _unfoldHeaders (headers): """ diff --git a/crocoite/controller.py b/crocoite/controller.py index 84001b7..ef042cc 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -71,11 +71,10 @@ class StatsHandler (EventHandler): self.stats['crashed'] += 1 import logging, time -from urllib.parse import urlsplit, urlunsplit from . import behavior as cbehavior from .browser import ChromeService, SiteLoader, Item -from .util import getFormattedViewportMetrics +from .util import getFormattedViewportMetrics, removeFragment class ControllerStart: __slots__ = ('payload') @@ -238,11 +237,6 @@ class PrefixLimit (RecursionPolicy): def __call__ (self, urls): return set (filter (lambda u: u.startswith (self.prefix), urls)) -def removeFragment (u): - """ Remove fragment from url (i.e. #hashvalue) """ - s = urlsplit (u) - return urlunsplit ((s.scheme, s.netloc, s.path, s.query, '')) - from .behavior import ExtractLinksEvent class RecursiveController (EventHandler): diff --git a/crocoite/tools.py b/crocoite/tools.py index bc92f8f..3aeaaad 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -80,18 +80,21 @@ def extractScreenshot (): args = parser.parse_args() - screenshotRe = re.compile (r'^urn:crocoite:screenshot-(\d+)-(\d+).png$', re.I) with args.input: - for record in ArchiveIterator(args.input): - uri = record.rec_headers.get_header('WARC-Target-URI') - if record.rec_type == 'resource': - m = screenshotRe.match (uri) - xoff, yoff = m.groups () - if m: - outpath = '{}-{}-{}.png'.format (args.prefix, xoff, yoff) - if args.force or not os.path.exists (outpath): - with open (outpath, 'wb') as out: - shutil.copyfileobj (record.raw_stream, out) - else: - print ('not overwriting {}'.format (outpath)) + for record in ArchiveIterator (args.input): + headers = record.rec_headers + if record.rec_type != 'conversion' or \ + headers['Content-Type'] != 'image/png' or \ + 'X-Crocoite-Screenshot-Y-Offset' not in headers: + continue + + urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_') + xoff = 0 + yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset')) + outpath = '{}-{}-{}-{}.png'.format (args.prefix, urlSanitized, xoff, yoff) + if args.force or not os.path.exists (outpath): + with open (outpath, 'wb') as out: + shutil.copyfileobj (record.raw_stream, out) + else: + print ('not overwriting {}'.format (outpath)) diff --git a/crocoite/util.py b/crocoite/util.py index ec257f1..fe43f01 100644 --- a/crocoite/util.py +++ b/crocoite/util.py @@ -23,6 +23,7 @@ Random utility functions """ import random +from urllib.parse import urlsplit, urlunsplit def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): if length is None: @@ -41,3 +42,8 @@ def getFormattedViewportMetrics (tab): return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'], layoutMetrics['layoutViewport']['clientHeight']) +def removeFragment (u): + """ Remove fragment from url (i.e. #hashvalue) """ + s = urlsplit (u) + return urlunsplit ((s.scheme, s.netloc, s.path, s.query, '')) + diff --git a/crocoite/warc.py b/crocoite/warc.py index af04cf9..e472f16 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -38,7 +38,7 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent from .browser import Item class WarcHandler (EventHandler): - __slots__ = ('logger', 'writer', 'maxBodySize') + __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords') def __init__ (self, fd, logger=logging.getLogger(__name__), @@ -46,6 +46,9 @@ class WarcHandler (EventHandler): self.logger = logger self.writer = WARCWriter (fd, gzip=True) self.maxBodySize = maxBodySize + # maps document urls to WARC record ids, required for DomSnapshotEvent + # and ScreenshotEvent + self.documentRecords = {} def _writeRequest (self, item): writer = self.writer @@ -135,6 +138,9 @@ class WarcHandler (EventHandler): http_headers=httpHeaders) writer.write_record(record) + if item.resourceType == 'Document': + self.documentRecords[item.url] = record.rec_headers.get_header ('WARC-Record-ID') + def _writeScript (self, item): writer = self.writer encoding = 'utf-8' @@ -155,21 +161,36 @@ class WarcHandler (EventHandler): except ValueError as e: self.logger.error (e.args[0]) + def _addRefersTo (self, headers, url): + refersTo = self.documentRecords.get (url) + if refersTo: + headers['WARC-Refers-To'] = refersTo + else: + self.logger.error ('No document record found for {}'.format (url)) + return headers + def _writeDomSnapshot (self, item): writer = self.writer - httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') - record = writer.create_warc_record (item.url, 'response', + + warcHeaders = {'X-DOM-Snapshot': str (True), + 'X-Chrome-Viewport': item.viewport, + 'Content-Type': 'text/html; charset=utf-8', + } + + self._addRefersTo (warcHeaders, item.url) + + record = writer.create_warc_record (item.url, 'conversion', payload=BytesIO (item.document), - http_headers=httpHeaders, - warc_headers_dict={'X-DOM-Snapshot': str (True), - 'X-Chrome-Viewport': item.viewport}) + warc_headers_dict=warcHeaders) writer.write_record (record) def _writeScreenshot (self, item): writer = self.writer - url = packageUrl ('screenshot-{}-{}.png'.format (0, item.yoff)) - record = writer.create_warc_record (url, 'resource', - payload=BytesIO (item.data), warc_headers_dict={'Content-Type': 'image/png'}) + warcHeaders = {'Content-Type': 'image/png', + 'X-Crocoite-Screenshot-Y-Offset': str (item.yoff)} + self._addRefersTo (warcHeaders, item.url) + record = writer.create_warc_record (item.url, 'conversion', + payload=BytesIO (item.data), warc_headers_dict=warcHeaders) writer.write_record (record) def _writeControllerStart (self, item): -- cgit v1.2.3