diff options
| author | Lars-Dominik Braun <lars@6xq.net> | 2018-06-25 19:55:48 +0200 | 
|---|---|---|
| committer | Lars-Dominik Braun <lars@6xq.net> | 2018-06-25 19:55:48 +0200 | 
| commit | 785ef19736cc9a21746e00a022b76fd756c162de (patch) | |
| tree | 041a8696c852294fe9573485831398933e26ee13 | |
| parent | 344a6b449075a8fb42054801144c40760f791366 (diff) | |
| download | crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.gz crocoite-785ef19736cc9a21746e00a022b76fd756c162de.tar.bz2 crocoite-785ef19736cc9a21746e00a022b76fd756c162de.zip | |
warc: Save DOM snapshots and screenshots as WARC conversion records
Judging from the docs, this is the proper way to store these resources.
Enable both for the IRC bot by default, since they won’t interfere with
IA’s Wayback Machine.
| -rw-r--r-- | contrib/celerycrocoite.py | 3 | ||||
| -rw-r--r-- | crocoite/behavior.py | 17 | ||||
| -rw-r--r-- | crocoite/browser.py | 10 | ||||
| -rw-r--r-- | crocoite/controller.py | 8 | ||||
| -rw-r--r-- | crocoite/tools.py | 29 | ||||
| -rw-r--r-- | crocoite/util.py | 6 | ||||
| -rw-r--r-- | crocoite/warc.py | 39 | 
7 files changed, 73 insertions, 39 deletions
| diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py index d0a02e9..3da43d9 100644 --- a/contrib/celerycrocoite.py +++ b/contrib/celerycrocoite.py @@ -192,12 +192,11 @@ def archive (bot, trigger):      if not args:          bot.reply ('Sorry, I don’t understand {}'.format (trigger.group (2)))          return -    blacklistedBehavior = {'domSnapshot', 'screenshot'}      settings = dict (maxBodySize=args.maxBodySize,              logBuffer=defaultSettings.logBuffer, idleTimeout=args.idleTimeout,              timeout=args.timeout)      args = dict (url=args.url, -            enabledBehaviorNames=list (set (behavior.availableMap.keys())-blacklistedBehavior), +            enabledBehaviorNames=list (behavior.availableMap.keys ()),              settings=settings, recursive=args.recursive,              concurrency=args.concurrency)      q = bot.memory['crocoite']['q'] diff --git a/crocoite/behavior.py b/crocoite/behavior.py index ab859f8..b34d3d9 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -32,7 +32,7 @@ from collections import OrderedDict  from html5lib.serializer import HTMLSerializer  from pychrome.exceptions import TimeoutException -from .util import randomString, getFormattedViewportMetrics +from .util import randomString, getFormattedViewportMetrics, removeFragment  from . 
import html  from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker @@ -229,12 +229,13 @@ class DomSnapshot (Behavior):                  disallowedAttributes = html.eventAttributes                  stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)                  serializer = HTMLSerializer () -                yield DomSnapshotEvent (doc['documentURL'], serializer.render (stream, 'utf-8'), viewport) +                yield DomSnapshotEvent (removeFragment (doc['documentURL']), serializer.render (stream, 'utf-8'), viewport)  class ScreenshotEvent: -    __slots__ = ('yoff', 'data') +    __slots__ = ('yoff', 'data', 'url') -    def __init__ (self, yoff, data): +    def __init__ (self, url, yoff, data): +        self.url = url          self.yoff = yoff          self.data = data @@ -248,6 +249,12 @@ class Screenshot (Behavior):      def onfinish (self):          tab = self.loader.tab +        try: +            url = removeFragment (tab.Page.getFrameTree ()['frameTree']['frame']['url']) +        except KeyError: +            logger.error ('frame has no url') +            url = None +          # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js          # Hardcoded max texture size of 16,384 (crbug.com/770769)          maxDim = 16*1024 @@ -260,7 +267,7 @@ class Screenshot (Behavior):              height = min (contentSize['height'] - yoff, maxDim)              clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1}              data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data']) -            yield ScreenshotEvent (yoff, data) +            yield ScreenshotEvent (url, yoff, data)  class Click (JsOnload):      """ Generic link clicking """ diff --git a/crocoite/browser.py b/crocoite/browser.py index 1c09598..6a4bee2 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -41,9 +41,9 @@ 
class Item:      def __init__ (self, tab):          self.tab = tab -        self.chromeRequest = None -        self.chromeResponse = None -        self.chromeFinished = None +        self.chromeRequest = {} +        self.chromeResponse = {} +        self.chromeFinished = {}          self.isRedirect = False          self.failed = False @@ -128,6 +128,10 @@ class Item:              return text[0]          return 'No status text available' +    @property +    def resourceType (self): +        return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None)) +      @staticmethod      def _unfoldHeaders (headers):          """ diff --git a/crocoite/controller.py b/crocoite/controller.py index 84001b7..ef042cc 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -71,11 +71,10 @@ class StatsHandler (EventHandler):              self.stats['crashed'] += 1  import logging, time -from urllib.parse import urlsplit, urlunsplit  from . import behavior as cbehavior  from .browser import ChromeService, SiteLoader, Item -from .util import getFormattedViewportMetrics +from .util import getFormattedViewportMetrics, removeFragment  class ControllerStart:      __slots__ = ('payload') @@ -238,11 +237,6 @@ class PrefixLimit (RecursionPolicy):      def __call__ (self, urls):          return set (filter (lambda u: u.startswith (self.prefix), urls)) -def removeFragment (u): -    """ Remove fragment from url (i.e. 
#hashvalue) """ -    s = urlsplit (u) -    return urlunsplit ((s.scheme, s.netloc, s.path, s.query, '')) -  from .behavior import ExtractLinksEvent  class RecursiveController (EventHandler): diff --git a/crocoite/tools.py b/crocoite/tools.py index bc92f8f..3aeaaad 100644 --- a/crocoite/tools.py +++ b/crocoite/tools.py @@ -80,18 +80,21 @@ def extractScreenshot ():      args = parser.parse_args() -    screenshotRe = re.compile (r'^urn:crocoite:screenshot-(\d+)-(\d+).png$', re.I)      with args.input: -        for record in ArchiveIterator(args.input): -            uri = record.rec_headers.get_header('WARC-Target-URI') -            if record.rec_type == 'resource': -                m = screenshotRe.match (uri) -                xoff, yoff = m.groups () -                if m: -                    outpath = '{}-{}-{}.png'.format (args.prefix, xoff, yoff) -                    if args.force or not os.path.exists (outpath): -                        with open (outpath, 'wb') as out: -                            shutil.copyfileobj (record.raw_stream, out) -                    else: -                        print ('not overwriting {}'.format (outpath)) +        for record in ArchiveIterator (args.input): +            headers = record.rec_headers +            if record.rec_type != 'conversion' or \ +                    headers['Content-Type'] != 'image/png' or \ +                    'X-Crocoite-Screenshot-Y-Offset' not in headers: +                continue + +            urlSanitized = headers.get_header('WARC-Target-URI').replace ('/', '_') +            xoff = 0 +            yoff = int (headers.get_header ('X-Crocoite-Screenshot-Y-Offset')) +            outpath = '{}-{}-{}-{}.png'.format (args.prefix, urlSanitized, xoff, yoff) +            if args.force or not os.path.exists (outpath): +                with open (outpath, 'wb') as out: +                    shutil.copyfileobj (record.raw_stream, out) +            else: +                print ('not overwriting {}'.format 
(outpath)) diff --git a/crocoite/util.py b/crocoite/util.py index ec257f1..fe43f01 100644 --- a/crocoite/util.py +++ b/crocoite/util.py @@ -23,6 +23,7 @@ Random utility functions  """  import random +from urllib.parse import urlsplit, urlunsplit  def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):      if length is None: @@ -41,3 +42,8 @@ def getFormattedViewportMetrics (tab):      return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],                  layoutMetrics['layoutViewport']['clientHeight']) +def removeFragment (u): +    """ Remove fragment from url (i.e. #hashvalue) """ +    s = urlsplit (u) +    return urlunsplit ((s.scheme, s.netloc, s.path, s.query, '')) + diff --git a/crocoite/warc.py b/crocoite/warc.py index af04cf9..e472f16 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -38,7 +38,7 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent  from .browser import Item  class WarcHandler (EventHandler): -    __slots__ = ('logger', 'writer', 'maxBodySize') +    __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords')      def __init__ (self, fd,              logger=logging.getLogger(__name__), @@ -46,6 +46,9 @@ class WarcHandler (EventHandler):          self.logger = logger          self.writer = WARCWriter (fd, gzip=True)          self.maxBodySize = maxBodySize +        # maps document urls to WARC record ids, required for DomSnapshotEvent +        # and ScreenshotEvent +        self.documentRecords = {}      def _writeRequest (self, item):          writer = self.writer @@ -135,6 +138,9 @@ class WarcHandler (EventHandler):                  http_headers=httpHeaders)          writer.write_record(record) +        if item.resourceType == 'Document': +            self.documentRecords[item.url] = record.rec_headers.get_header ('WARC-Record-ID') +      def _writeScript (self, item):          writer = self.writer          encoding = 'utf-8' @@ -155,21 +161,36 @@ class WarcHandler (EventHandler):        
  except ValueError as e:              self.logger.error (e.args[0]) +    def _addRefersTo (self, headers, url): +        refersTo = self.documentRecords.get (url) +        if refersTo: +            headers['WARC-Refers-To'] = refersTo +        else: +            self.logger.error ('No document record found for {}'.format (url)) +        return headers +      def _writeDomSnapshot (self, item):          writer = self.writer -        httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') -        record = writer.create_warc_record (item.url, 'response', + +        warcHeaders = {'X-DOM-Snapshot': str (True), +                'X-Chrome-Viewport': item.viewport, +                'Content-Type': 'text/html; charset=utf-8', +                } + +        self._addRefersTo (warcHeaders, item.url) + +        record = writer.create_warc_record (item.url, 'conversion',                  payload=BytesIO (item.document), -                http_headers=httpHeaders, -                warc_headers_dict={'X-DOM-Snapshot': str (True), -                        'X-Chrome-Viewport': item.viewport}) +                warc_headers_dict=warcHeaders)          writer.write_record (record)      def _writeScreenshot (self, item):          writer = self.writer -        url = packageUrl ('screenshot-{}-{}.png'.format (0, item.yoff)) -        record = writer.create_warc_record (url, 'resource', -                payload=BytesIO (item.data), warc_headers_dict={'Content-Type': 'image/png'}) +        warcHeaders = {'Content-Type': 'image/png', +                'X-Crocoite-Screenshot-Y-Offset': str (item.yoff)} +        self._addRefersTo (warcHeaders, item.url) +        record = writer.create_warc_record (item.url, 'conversion', +                payload=BytesIO (item.data), warc_headers_dict=warcHeaders)          writer.write_record (record)      def _writeControllerStart (self, item): | 
