1 files changed, 293 insertions, 122 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index 95e8160..1610751 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 crocoite contributors
+# Copyright (c) 2017–2018 crocoite contributors
 # 
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -19,31 +19,74 @@
 # THE SOFTWARE.
 
 """
-Generic and per-site behavior scripts
+Behavior scripts (i.e. subclasses of Behavior) are a powerful method to
+manipulate websites loaded into Chrome. They are executed by the controller
+after the page started loading (onload), after it has been idle for a while
+(onstop) and after loading was stopped (onfinish).
+
+The scripts exercise their power either through DevTools API calls or by
+injecting JavaScript into the page context. Thus they can manipulate both the
+browser itself (DevTools; modify resolution, get DOM snapshot) and the
+page (JavaScript; trigger JavaScript events, call web APIs).
+
+They also emit (yield) data processable by any consumer registered to the
+controller. This allows storing captured screenshots inside WARC files, for
+instance.
 """
 
-import logging
-from io import BytesIO
-from urllib.parse import urlsplit
-import os.path
-import pkg_resources
+import asyncio, json, os.path
 from base64 import b64decode
+from collections import OrderedDict
+import pkg_resources
+
+from html5lib.serializer import HTMLSerializer
+from yarl import URL
+import yaml
 
-from .util import randomString, packageUrl, getFormattedViewportMetrics
+from .util import getFormattedViewportMetrics
 from . import html
 from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
-from html5lib.serializer import HTMLSerializer
-from warcio.statusandheaders import StatusAndHeaders
+from .devtools import Crashed, TabException
+
+class Script:
+    """ A JavaScript resource """
 
-logger = logging.getLogger(__name__)
+    __slots__ = ('path', 'data')
+    datadir = 'data'
+
+    def __init__ (self, path=None, encoding='utf-8'):
+        self.path = path
+        if path:
+            self.data = pkg_resources.resource_string (__name__, os.path.join (self.datadir, path)).decode (encoding)
+
+    def __repr__ (self):
+        return f'<Script {self.path}>'
+
+    def __str__ (self):
+        return self.data
+
+    @property
+    def abspath (self):
+        return pkg_resources.resource_filename (__name__,
+                os.path.join (self.datadir, self.path))
+
+    @classmethod
+    def fromStr (cls, data, path=None):
+        s = cls ()  # build via cls (not Script) so subclasses get their own type
+        s.data = data
+        s.path = path
+        return s
 
 class Behavior:
+    __slots__ = ('loader', 'logger')
+
     # unique behavior name
     name = None
 
-    def __init__ (self, loader):
+    def __init__ (self, loader, logger):
         assert self.name is not None
         self.loader = loader
+        self.logger = logger.bind (context=type (self).__name__)
 
     def __contains__ (self, url):
         """
@@ -51,54 +94,97 @@ class Behavior:
         """
         return True
 
-    def loadScript (self, path, encoding='utf-8'):
-        return pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding)
+    def __repr__ (self):
+        return f'<Behavior {self.name}>'
 
-    def useScript (self, script, encoding='utf-8'):
-        writer = self.loader.writer
-        record = writer.create_warc_record (packageUrl ('script'), 'metadata',
-                payload=BytesIO (script.encode (encoding)),
-                warc_headers_dict={'Content-Type': 'application/javascript; charset={}'.format (encoding)})
-        writer.write_record (record)
+    async def onload (self):
+        """ After the page started loading """
+        # this is a dirty hack to make this function an async generator
+        return
+        yield # pragma: no cover
 
-    def onload (self):
-        """ Before loading the page """
-        pass
-
-    def onstop (self):
+    async def onstop (self):
         """ Before page loading is stopped """
-        pass
+        return
+        yield # pragma: no cover
 
-    def onfinish (self):
+    async def onfinish (self):
         """ After the site has stopped loading """
-        pass
-
-class HostnameFilter:
-    """ Limit behavior script to hostname """
-
-    hostname = None
-
-    def __contains__ (self, url):
-        url = urlsplit (url)
-        hostname = url.hostname.split ('.')[::-1]
-        return hostname[:2] == self.hostname
+        return
+        yield # pragma: no cover
 
 class JsOnload (Behavior):
     """ Execute JavaScript on page load """
 
-    scriptPath = None
+    __slots__ = ('script', 'context', 'options')
 
-    def __init__ (self, loader):
-        super ().__init__ (loader)
-        self.script = self.loadScript (self.scriptPath)
-        self.scriptHandle = None
+    scriptPath = None
 
-    def onload (self):
-        self.useScript (self.script)
-        self.scriptHandle = self.loader.tab.Page.addScriptToEvaluateOnNewDocument (source=self.script)['identifier']
+    def __init__ (self, loader, logger):
+        super ().__init__ (loader, logger)
+        self.script = Script (self.scriptPath)
+        self.context = None
+        # options passed to constructor
+        self.options = {}
 
-    def onstop (self):
-        self.loader.tab.Page.removeScriptToEvaluateOnNewDocument (identifier=self.scriptHandle)
+    async def onload (self):
+        tab = self.loader.tab
+        yield self.script
+
+        # This is slightly awkward, since we cannot compile the class into an
+        # objectId and then reference it. Therefore the script must return a
+        # class constructor, which is then called with a generic options
+        # parameter.
+        # XXX: is there a better way to do this?
+        result = await tab.Runtime.evaluate (expression=str (self.script))
+        self.logger.debug ('behavior onload inject',
+                uuid='a2da9b78-5648-44c5-bfa8-5c7573e13ad3', result=result)
+        exception = result.get ('exceptionDetails', None)
+        result = result['result']
+        assert result['type'] == 'function', result
+        assert result.get ('subtype') != 'error', exception
+        constructor = result['objectId']
+
+        if self.options:
+            yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}#options')
+
+        try:
+            result = await tab.Runtime.callFunctionOn (
+                    functionDeclaration='function(options){return new this(options);}',
+                    objectId=constructor,
+                    arguments=[{'value': self.options}])
+            self.logger.debug ('behavior onload start',
+                    uuid='6c0605ae-93b3-46b3-b575-ba45790909a7', result=result)
+            result = result['result']
+            assert result['type'] == 'object', result
+            assert result.get ('subtype') != 'error', result
+            self.context = result['objectId']
+        except TabException as e:
+            if e.args[0] == -32000:
+                # the site probably reloaded. ignore this, since we’ll be
+                # re-injected into the new site by the controller.
+                self.logger.error ('jsonload onload failed',
+                        uuid='c151a863-78d1-41f4-a8e6-c022a6c5d252',
+                        exception=e.args)
+            else:
+                raise
+
+    async def onstop (self):
+        tab = self.loader.tab
+        try:
+            assert self.context is not None
+            await tab.Runtime.callFunctionOn (functionDeclaration='function(){return this.stop();}',
+                    objectId=self.context)
+            await tab.Runtime.releaseObject (objectId=self.context)
+        except TabException as e:
+            # cannot do anything about that. Ignoring should be fine.
+            self.logger.error ('jsonload onstop failed',
+                    uuid='1786726f-c8ec-4f79-8769-30954d4e32f5',
+                    exception=e.args,
+                    objectId=self.context)
+
+        return
+        yield # pragma: no cover
 
 ### Generic scripts ###
 
@@ -106,24 +192,10 @@ class Scroll (JsOnload):
     name = 'scroll'
     scriptPath = 'scroll.js'
 
-    def __init__ (self, loader):
-        super ().__init__ (loader)
-        stopVarname = '__' + __package__ + '_stop__'
-        newStopVarname = randomString ()
-        self.script = self.script.replace (stopVarname, newStopVarname)
-        self.stopVarname = newStopVarname
-
-    def onstop (self):
-        super ().onstop ()
-        # removing the script does not stop it if running
-        script = '{} = true; window.scrollTo (0, 0);'.format (self.stopVarname)
-        self.useScript (script)
-        self.loader.tab.Runtime.evaluate (expression=script, returnByValue=True)
-
 class EmulateScreenMetrics (Behavior):
     name = 'emulateScreenMetrics'
 
-    def onstop (self):
+    async def onstop (self):
         """
         Emulate different screen sizes, causing the site to fetch assets (img
         srcset and css, for example) for different screen resolutions.
@@ -139,17 +211,32 @@ class EmulateScreenMetrics (Behavior):
                 {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
                 # 6th gen iPhone (portrait mode)
                 {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
-                # and reset
-                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
                 ]
         l = self.loader
         tab = l.tab
         for s in sizes:
-            tab.Emulation.setDeviceMetricsOverride (**s)
+            self.logger.debug ('device override',
+                    uuid='3d2d8096-1a75-4830-ad79-ae5f6f97071d', **s)
+            await tab.Emulation.setDeviceMetricsOverride (**s)
             # give the browser time to re-eval page and start requests
-            l.wait (1)
-        # XXX: this seems to be broken, it does not clear the override
-        #tab.Emulation.clearDeviceMetricsOverride ()
+            # XXX: should wait until loader is not busy any more
+            await asyncio.sleep (1)
+        self.logger.debug ('clear override',
+                uuid='f9401683-eb3a-4b86-9bb2-c8c5d876fc8d')
+        await tab.Emulation.clearDeviceMetricsOverride ()
+        return
+        yield # pragma: no cover
+
+class DomSnapshotEvent:
+    __slots__ = ('url', 'document', 'viewport')
+
+    def __init__ (self, url, document, viewport):
+        # XXX: document encoding?
+        assert isinstance (document, bytes)
+
+        self.url = url
+        self.document = document
+        self.viewport = viewport
 
 class DomSnapshot (Behavior):
     """
@@ -157,38 +244,39 @@ class DomSnapshot (Behavior):
 
     We could use DOMSnapshot.getSnapshot here, but the API is not stable
     yet. Also computed styles are not really necessary here.
-
-    XXX: Currently writes a response, when it should use “resource”. pywb
-    can’t handle that though.
     """
 
+    __slots__ = ('script', )
+
     name = 'domSnapshot'
 
-    def __init__ (self, loader):
-        super ().__init__ (loader)
-        self.script = self.loadScript ('canvas-snapshot.js')
+    def __init__ (self, loader, logger):
+        super ().__init__ (loader, logger)
+        self.script = Script ('canvas-snapshot.js')
 
-    def onfinish (self):
+    async def onfinish (self):
         tab = self.loader.tab
-        writer = self.loader.writer
 
-        self.useScript (self.script)
-        tab.Runtime.evaluate (expression=self.script, returnByValue=True)
+        yield self.script
+        await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
 
-        viewport = getFormattedViewportMetrics (tab)
-        dom = tab.DOM.getDocument (depth=-1, pierce=True)
+        viewport = await getFormattedViewportMetrics (tab)
+        dom = await tab.DOM.getDocument (depth=-1, pierce=True)
+        self.logger.debug ('dom snapshot document',
+                uuid='0c720784-8bd1-4fdc-a811-84394d753539', dom=dom)
         haveUrls = set ()
         for doc in ChromeTreeWalker (dom['root']).split ():
-            rawUrl = doc['documentURL']
-            if rawUrl in haveUrls:
+            url = URL (doc['documentURL'])
+            if url in haveUrls:
                 # ignore duplicate URLs. they are usually caused by
                 # javascript-injected iframes (advertising) with no(?) src
-                logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
-                continue
-            url = urlsplit (rawUrl)
-            if url.scheme in ('http', 'https'):
-                logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
-                haveUrls.add (rawUrl)
+                self.logger.warning ('dom snapshot duplicate',
+                        uuid='d44de989-98d4-456e-82e7-9d4c49acab5e')
+            elif url.scheme in ('http', 'https'):
+                self.logger.debug ('dom snapshot',
+                        uuid='ece7ff05-ccd9-44b5-b6a8-be25a24b96f4',
+                        base=doc["baseURL"])
+                haveUrls.add (url)
                 walker = ChromeTreeWalker (doc)
                 # remove script, to make the page static and noscript, because at the
                 # time we took the snapshot scripts were enabled
@@ -196,41 +284,89 @@ class DomSnapshot (Behavior):
                 disallowedAttributes = html.eventAttributes
                 stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
                 serializer = HTMLSerializer ()
-                httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
-                record = writer.create_warc_record (doc['documentURL'], 'response',
-                        payload=BytesIO (serializer.render (stream, 'utf-8')),
-                        http_headers=httpHeaders,
-                        warc_headers_dict={'X-DOM-Snapshot': str (True),
-                                'X-Chrome-Viewport': viewport})
-                writer.write_record (record)
+                yield DomSnapshotEvent (url.with_fragment(None), serializer.render (stream, 'utf-8'), viewport)
+
+class ScreenshotEvent:
+    __slots__ = ('yoff', 'data', 'url')
+
+    def __init__ (self, url, yoff, data):
+        self.url = url
+        self.yoff = yoff
+        self.data = data
 
 class Screenshot (Behavior):
     """
     Create screenshot from tab and write it to WARC
+
+    Chrome will allocate an additional 512MB of RAM when using this plugin.
     """
 
+    __slots__ = ('script', )
+
     name = 'screenshot'
 
-    def onfinish (self):
+    # Hardcoded max texture size of 16,384 (crbug.com/770769)
+    maxDim = 16*1024
+
+    def __init__ (self, loader, logger):
+        super ().__init__ (loader, logger)
+        self.script = Script ('screenshot.js')
+
+    async def onfinish (self):
         tab = self.loader.tab
-        writer = self.loader.writer
 
-        # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js
-        # Hardcoded max texture size of 16,384 (crbug.com/770769)
-        maxDim = 16*1024
-        metrics = tab.Page.getLayoutMetrics ()
+        # for top-level/full-screen elements with position: fixed we need to
+        # figure out their actual size (i.e. scrollHeight) and use that when
+        # overriding the viewport size.
+        # we could do this without javascript, but that would require several
+        # round-trips to Chrome or pulling down the entire DOM+computed styles
+        tab = self.loader.tab
+        yield self.script
+        result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
+        assert result['result']['type'] == 'object', result
+        result = result['result']['value']
+
+        # this is required to make the browser render more than just the small
+        # actual viewport (i.e. entire page).  see
+        # https://github.com/GoogleChrome/puppeteer/blob/45873ea737b4ebe4fa7d6f46256b2ea19ce18aa7/lib/Page.js#L805
+        metrics = await tab.Page.getLayoutMetrics ()
         contentSize = metrics['contentSize']
-        width = min (contentSize['width'], maxDim)
+        contentHeight = max (result + [contentSize['height']])
+
+        override = {
+                'width': 0,
+                'height': 0,
+                'deviceScaleFactor': 0,
+                'mobile': False,
+                'viewport': {'x': 0,
+                    'y': 0,
+                    'width': contentSize['width'],
+                    'height': contentHeight,
+                    'scale': 1}
+                }
+        self.logger.debug ('screenshot override',
+                uuid='e0affa18-cbb1-4d97-9d13-9a88f704b1b2', override=override)
+        await tab.Emulation.setDeviceMetricsOverride (**override)
+
+        tree = await tab.Page.getFrameTree ()
+        try:
+            url = URL (tree['frameTree']['frame']['url']).with_fragment (None)
+        except KeyError:
+            self.logger.error ('frame without url',
+                    uuid='edc2743d-b93e-4ba1-964e-db232f2f96ff', tree=tree)
+            url = None
+
+        width = min (contentSize['width'], self.maxDim)
         # we’re ignoring horizontal scroll intentionally. Most horizontal
         # layouts use JavaScript scrolling and don’t extend the viewport.
-        for yoff in range (0, contentSize['height'], maxDim):
-            height = min (contentSize['height'] - yoff, maxDim)
+        for yoff in range (0, contentHeight, self.maxDim):
+            height = min (contentHeight - yoff, self.maxDim)
             clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1}
-            data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data'])
-            url = packageUrl ('screenshot-{}-{}.png'.format (0, yoff))
-            record = writer.create_warc_record (url, 'resource',
-                    payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png'})
-            writer.write_record (record)
+            ret = await tab.Page.captureScreenshot (format='png', clip=clip)
+            data = b64decode (ret['data'])
+            yield ScreenshotEvent (url, yoff, data)
+
+        await tab.Emulation.clearDeviceMetricsOverride ()
 
 class Click (JsOnload):
     """ Generic link clicking """
@@ -238,6 +374,27 @@ class Click (JsOnload):
     name = 'click'
     scriptPath = 'click.js'
 
+    def __init__ (self, loader, logger):
+        super ().__init__ (loader, logger)
+        with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd:
+            self.options['sites'] = list (yaml.safe_load_all (fd))
+
+class ExtractLinksEvent:
+    __slots__ = ('links', )
+
+    def __init__ (self, links):
+        self.links = links
+
+    def __repr__ (self):
+        return f'<ExtractLinksEvent {self.links!r}>'
+
+def mapOrIgnore (f, l):
+    for e in l:
+        try:
+            yield f (e)
+        except Exception: # not bare: must not swallow GeneratorExit/KeyboardInterrupt
+            pass
+
 class ExtractLinks (Behavior):
     """
     Extract links from a page using JavaScript
@@ -246,23 +403,37 @@ class ExtractLinks (Behavior):
     manually resolve relative links.
     """
 
+    __slots__ = ('script', )
+
     name = 'extractLinks'
 
-    def __init__ (self, loader):
-        super ().__init__ (loader)
-        self.script = self.loadScript ('extract-links.js')
-        self.links = None
+    def __init__ (self, loader, logger):
+        super ().__init__ (loader, logger)
+        self.script = Script ('extract-links.js')
 
-    def onfinish (self):
+    async def onfinish (self):
         tab = self.loader.tab
-        self.useScript (self.script)
-        result = tab.Runtime.evaluate (expression=self.script, returnByValue=True)
-        self.links = list (set (result['result']['value']))
+        yield self.script
+        result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
+        yield ExtractLinksEvent (list (set (mapOrIgnore (URL, result['result']['value']))))
+
+class Crash (Behavior):
+    """ Crash the browser. For testing only. Obviously. """
+
+    name = 'crash'
+
+    async def onstop (self):
+        try:
+            await self.loader.tab.Page.crash ()
+        except Crashed:
+            pass
+        return
+        yield # pragma: no cover
 
 # available behavior scripts. Order matters, move those modifying the page
 # towards the end of available
-generic = [Scroll, EmulateScreenMetrics, Click, ExtractLinks]
-perSite = []
-available = generic + perSite + [Screenshot, DomSnapshot]
-availableNames = set (map (lambda x: x.name, available))
+available = [Scroll, Click, ExtractLinks, Screenshot, EmulateScreenMetrics, DomSnapshot]
+#available.append (Crash)
+# order matters, since behavior can modify the page (dom snapshots, for instance)
+availableMap = OrderedDict (map (lambda x: (x.name, x), available))