summary | refs | log | tree | commit | diff
path: root/crocoite/behavior.py
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite/behavior.py')
-rw-r--r-- crocoite/behavior.py 415
1 files changed, 293 insertions, 122 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index 95e8160..1610751 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 crocoite contributors
+# Copyright (c) 2017–2018 crocoite contributors
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@@ -19,31 +19,74 @@
# THE SOFTWARE.
"""
-Generic and per-site behavior scripts
+Behavior scripts (i.e. subclasses of Behavior) are a powerful method to
+manipulate websites loaded into Chrome. They are executed by the controller
+after the page started loading (onload), after it has been idle for a while
+(onstop) and after loading was stopped (onfinish).
+
+The scripts exercise their power either through DevTools API calls or by
+injecting JavaScript into the page context. Thus they can manipulate both the
+browser itself (DevTools; modify resolution, get DOM snapshot) and the
+page (JavaScript; trigger JavaScript events, call web APIs).
+
+They also emit (yield) data processable by any consumer registered to the
+controller. This allows storing captured screenshots inside WARC files, for
+instance.
"""
-import logging
-from io import BytesIO
-from urllib.parse import urlsplit
-import os.path
-import pkg_resources
+import asyncio, json, os.path
from base64 import b64decode
+from collections import OrderedDict
+import pkg_resources
+
+from html5lib.serializer import HTMLSerializer
+from yarl import URL
+import yaml
-from .util import randomString, packageUrl, getFormattedViewportMetrics
+from .util import getFormattedViewportMetrics
from . import html
from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
-from html5lib.serializer import HTMLSerializer
-from warcio.statusandheaders import StatusAndHeaders
+from .devtools import Crashed, TabException
+
+class Script:
+ """ A JavaScript resource """
-logger = logging.getLogger(__name__)
+ __slots__ = ('path', 'data')
+ datadir = 'data'
+
+ def __init__ (self, path=None, encoding='utf-8'):
+ self.path = path
+ if path:
+ self.data = pkg_resources.resource_string (__name__, os.path.join (self.datadir, path)).decode (encoding)
+
+ def __repr__ (self):
+ return f'<Script {self.path}>'
+
+ def __str__ (self):
+ return self.data
+
+ @property
+ def abspath (self):
+ return pkg_resources.resource_filename (__name__,
+ os.path.join (self.datadir, self.path))
+
+ @classmethod
+ def fromStr (cls, data, path=None):
+ s = Script ()
+ s.data = data
+ s.path = path
+ return s
class Behavior:
+ __slots__ = ('loader', 'logger')
+
# unique behavior name
name = None
- def __init__ (self, loader):
+ def __init__ (self, loader, logger):
assert self.name is not None
self.loader = loader
+ self.logger = logger.bind (context=type (self).__name__)
def __contains__ (self, url):
"""
@@ -51,54 +94,97 @@ class Behavior:
"""
return True
- def loadScript (self, path, encoding='utf-8'):
- return pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding)
+ def __repr__ (self):
+ return f'<Behavior {self.name}>'
- def useScript (self, script, encoding='utf-8'):
- writer = self.loader.writer
- record = writer.create_warc_record (packageUrl ('script'), 'metadata',
- payload=BytesIO (script.encode (encoding)),
- warc_headers_dict={'Content-Type': 'application/javascript; charset={}'.format (encoding)})
- writer.write_record (record)
+ async def onload (self):
+ """ After the page started loading """
+ # this is a dirty hack to make this function an async generator
+ return
+ yield # pragma: no cover
- def onload (self):
- """ Before loading the page """
- pass
-
- def onstop (self):
+ async def onstop (self):
""" Before page loading is stopped """
- pass
+ return
+ yield # pragma: no cover
- def onfinish (self):
+ async def onfinish (self):
""" After the site has stopped loading """
- pass
-
-class HostnameFilter:
- """ Limit behavior script to hostname """
-
- hostname = None
-
- def __contains__ (self, url):
- url = urlsplit (url)
- hostname = url.hostname.split ('.')[::-1]
- return hostname[:2] == self.hostname
+ return
+ yield # pragma: no cover
class JsOnload (Behavior):
""" Execute JavaScript on page load """
- scriptPath = None
+ __slots__ = ('script', 'context', 'options')
- def __init__ (self, loader):
- super ().__init__ (loader)
- self.script = self.loadScript (self.scriptPath)
- self.scriptHandle = None
+ scriptPath = None
- def onload (self):
- self.useScript (self.script)
- self.scriptHandle = self.loader.tab.Page.addScriptToEvaluateOnNewDocument (source=self.script)['identifier']
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
+ self.script = Script (self.scriptPath)
+ self.context = None
+ # options passed to constructor
+ self.options = {}
- def onstop (self):
- self.loader.tab.Page.removeScriptToEvaluateOnNewDocument (identifier=self.scriptHandle)
+ async def onload (self):
+ tab = self.loader.tab
+ yield self.script
+
+ # This is slightly awkward, since we cannot compile the class into an
+ # objectId and then reference it. Therefore the script must return a
+ # class constructor, which is then called with a generic options
+ # parameter.
+ # XXX: is there a better way to do this?
+ result = await tab.Runtime.evaluate (expression=str (self.script))
+ self.logger.debug ('behavior onload inject',
+ uuid='a2da9b78-5648-44c5-bfa8-5c7573e13ad3', result=result)
+ exception = result.get ('exceptionDetails', None)
+ result = result['result']
+ assert result['type'] == 'function', result
+ assert result.get ('subtype') != 'error', exception
+ constructor = result['objectId']
+
+ if self.options:
+ yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}#options')
+
+ try:
+ result = await tab.Runtime.callFunctionOn (
+ functionDeclaration='function(options){return new this(options);}',
+ objectId=constructor,
+ arguments=[{'value': self.options}])
+ self.logger.debug ('behavior onload start',
+ uuid='6c0605ae-93b3-46b3-b575-ba45790909a7', result=result)
+ result = result['result']
+ assert result['type'] == 'object', result
+ assert result.get ('subtype') != 'error', result
+ self.context = result['objectId']
+ except TabException as e:
+ if e.args[0] == -32000:
+ # the site probably reloaded. ignore this, since we’ll be
+ # re-injected into the new site by the controller.
+ self.logger.error ('jsonload onload failed',
+ uuid='c151a863-78d1-41f4-a8e6-c022a6c5d252',
+ exception=e.args)
+ else:
+ raise
+
+ async def onstop (self):
+ tab = self.loader.tab
+ try:
+ assert self.context is not None
+ await tab.Runtime.callFunctionOn (functionDeclaration='function(){return this.stop();}',
+ objectId=self.context)
+ await tab.Runtime.releaseObject (objectId=self.context)
+ except TabException as e:
+ # cannot do anything about that. Ignoring should be fine.
+ self.logger.error ('jsonload onstop failed',
+ uuid='1786726f-c8ec-4f79-8769-30954d4e32f5',
+ exception=e.args,
+ objectId=self.context)
+
+ return
+ yield # pragma: no cover
### Generic scripts ###
@@ -106,24 +192,10 @@ class Scroll (JsOnload):
name = 'scroll'
scriptPath = 'scroll.js'
- def __init__ (self, loader):
- super ().__init__ (loader)
- stopVarname = '__' + __package__ + '_stop__'
- newStopVarname = randomString ()
- self.script = self.script.replace (stopVarname, newStopVarname)
- self.stopVarname = newStopVarname
-
- def onstop (self):
- super ().onstop ()
- # removing the script does not stop it if running
- script = '{} = true; window.scrollTo (0, 0);'.format (self.stopVarname)
- self.useScript (script)
- self.loader.tab.Runtime.evaluate (expression=script, returnByValue=True)
-
class EmulateScreenMetrics (Behavior):
name = 'emulateScreenMetrics'
- def onstop (self):
+ async def onstop (self):
"""
Emulate different screen sizes, causing the site to fetch assets (img
srcset and css, for example) for different screen resolutions.
@@ -139,17 +211,32 @@ class EmulateScreenMetrics (Behavior):
{'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
# 6th gen iPhone (portrait mode)
{'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
- # and reset
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
]
l = self.loader
tab = l.tab
for s in sizes:
- tab.Emulation.setDeviceMetricsOverride (**s)
+ self.logger.debug ('device override',
+ uuid='3d2d8096-1a75-4830-ad79-ae5f6f97071d', **s)
+ await tab.Emulation.setDeviceMetricsOverride (**s)
# give the browser time to re-eval page and start requests
- l.wait (1)
- # XXX: this seems to be broken, it does not clear the override
- #tab.Emulation.clearDeviceMetricsOverride ()
+ # XXX: should wait until loader is not busy any more
+ await asyncio.sleep (1)
+ self.logger.debug ('clear override',
+ uuid='f9401683-eb3a-4b86-9bb2-c8c5d876fc8d')
+ await tab.Emulation.clearDeviceMetricsOverride ()
+ return
+ yield # pragma: no cover
+
+class DomSnapshotEvent:
+ __slots__ = ('url', 'document', 'viewport')
+
+ def __init__ (self, url, document, viewport):
+ # XXX: document encoding?
+ assert isinstance (document, bytes)
+
+ self.url = url
+ self.document = document
+ self.viewport = viewport
class DomSnapshot (Behavior):
"""
@@ -157,38 +244,39 @@ class DomSnapshot (Behavior):
We could use DOMSnapshot.getSnapshot here, but the API is not stable
yet. Also computed styles are not really necessary here.
-
- XXX: Currently writes a response, when it should use “resource”. pywb
- can’t handle that though.
"""
+ __slots__ = ('script', )
+
name = 'domSnapshot'
- def __init__ (self, loader):
- super ().__init__ (loader)
- self.script = self.loadScript ('canvas-snapshot.js')
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
+ self.script = Script ('canvas-snapshot.js')
- def onfinish (self):
+ async def onfinish (self):
tab = self.loader.tab
- writer = self.loader.writer
- self.useScript (self.script)
- tab.Runtime.evaluate (expression=self.script, returnByValue=True)
+ yield self.script
+ await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
- viewport = getFormattedViewportMetrics (tab)
- dom = tab.DOM.getDocument (depth=-1, pierce=True)
+ viewport = await getFormattedViewportMetrics (tab)
+ dom = await tab.DOM.getDocument (depth=-1, pierce=True)
+ self.logger.debug ('dom snapshot document',
+ uuid='0c720784-8bd1-4fdc-a811-84394d753539', dom=dom)
haveUrls = set ()
for doc in ChromeTreeWalker (dom['root']).split ():
- rawUrl = doc['documentURL']
- if rawUrl in haveUrls:
+ url = URL (doc['documentURL'])
+ if url in haveUrls:
# ignore duplicate URLs. they are usually caused by
# javascript-injected iframes (advertising) with no(?) src
- logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
- continue
- url = urlsplit (rawUrl)
- if url.scheme in ('http', 'https'):
- logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
- haveUrls.add (rawUrl)
+ self.logger.warning ('dom snapshot duplicate',
+ uuid='d44de989-98d4-456e-82e7-9d4c49acab5e')
+ elif url.scheme in ('http', 'https'):
+ self.logger.debug ('dom snapshot',
+ uuid='ece7ff05-ccd9-44b5-b6a8-be25a24b96f4',
+ base=doc["baseURL"])
+ haveUrls.add (url)
walker = ChromeTreeWalker (doc)
# remove script, to make the page static and noscript, because at the
# time we took the snapshot scripts were enabled
@@ -196,41 +284,89 @@ class DomSnapshot (Behavior):
disallowedAttributes = html.eventAttributes
stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
serializer = HTMLSerializer ()
- httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
- record = writer.create_warc_record (doc['documentURL'], 'response',
- payload=BytesIO (serializer.render (stream, 'utf-8')),
- http_headers=httpHeaders,
- warc_headers_dict={'X-DOM-Snapshot': str (True),
- 'X-Chrome-Viewport': viewport})
- writer.write_record (record)
+ yield DomSnapshotEvent (url.with_fragment(None), serializer.render (stream, 'utf-8'), viewport)
+
+class ScreenshotEvent:
+ __slots__ = ('yoff', 'data', 'url')
+
+ def __init__ (self, url, yoff, data):
+ self.url = url
+ self.yoff = yoff
+ self.data = data
class Screenshot (Behavior):
"""
Create screenshot from tab and write it to WARC
+
+ Chrome will allocate an additional 512MB of RAM when using this plugin.
"""
+ __slots__ = ('script')
+
name = 'screenshot'
- def onfinish (self):
+ # Hardcoded max texture size of 16,384 (crbug.com/770769)
+ maxDim = 16*1024
+
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
+ self.script = Script ('screenshot.js')
+
+ async def onfinish (self):
tab = self.loader.tab
- writer = self.loader.writer
- # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js
- # Hardcoded max texture size of 16,384 (crbug.com/770769)
- maxDim = 16*1024
- metrics = tab.Page.getLayoutMetrics ()
+ # for top-level/full-screen elements with position: fixed we need to
+ # figure out their actual size (i.e. scrollHeight) and use that when
+ # overriding the viewport size.
+ # we could do this without javascript, but that would require several
+ # round-trips to Chrome or pulling down the entire DOM+computed styles
+ tab = self.loader.tab
+ yield self.script
+ result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
+ assert result['result']['type'] == 'object', result
+ result = result['result']['value']
+
+ # this is required to make the browser render more than just the small
+ # actual viewport (i.e. entire page). see
+ # https://github.com/GoogleChrome/puppeteer/blob/45873ea737b4ebe4fa7d6f46256b2ea19ce18aa7/lib/Page.js#L805
+ metrics = await tab.Page.getLayoutMetrics ()
contentSize = metrics['contentSize']
- width = min (contentSize['width'], maxDim)
+ contentHeight = max (result + [contentSize['height']])
+
+ override = {
+ 'width': 0,
+ 'height': 0,
+ 'deviceScaleFactor': 0,
+ 'mobile': False,
+ 'viewport': {'x': 0,
+ 'y': 0,
+ 'width': contentSize['width'],
+ 'height': contentHeight,
+ 'scale': 1}
+ }
+ self.logger.debug ('screenshot override',
+ uuid='e0affa18-cbb1-4d97-9d13-9a88f704b1b2', override=override)
+ await tab.Emulation.setDeviceMetricsOverride (**override)
+
+ tree = await tab.Page.getFrameTree ()
+ try:
+ url = URL (tree['frameTree']['frame']['url']).with_fragment (None)
+ except KeyError:
+ self.logger.error ('frame without url',
+ uuid='edc2743d-b93e-4ba1-964e-db232f2f96ff', tree=tree)
+ url = None
+
+ width = min (contentSize['width'], self.maxDim)
# we’re ignoring horizontal scroll intentionally. Most horizontal
# layouts use JavaScript scrolling and don’t extend the viewport.
- for yoff in range (0, contentSize['height'], maxDim):
- height = min (contentSize['height'] - yoff, maxDim)
+ for yoff in range (0, contentHeight, self.maxDim):
+ height = min (contentHeight - yoff, self.maxDim)
clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1}
- data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data'])
- url = packageUrl ('screenshot-{}-{}.png'.format (0, yoff))
- record = writer.create_warc_record (url, 'resource',
- payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png'})
- writer.write_record (record)
+ ret = await tab.Page.captureScreenshot (format='png', clip=clip)
+ data = b64decode (ret['data'])
+ yield ScreenshotEvent (url, yoff, data)
+
+ await tab.Emulation.clearDeviceMetricsOverride ()
class Click (JsOnload):
""" Generic link clicking """
@@ -238,6 +374,27 @@ class Click (JsOnload):
name = 'click'
scriptPath = 'click.js'
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
+ with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd:
+ self.options['sites'] = list (yaml.safe_load_all (fd))
+
+class ExtractLinksEvent:
+ __slots__ = ('links', )
+
+ def __init__ (self, links):
+ self.links = links
+
+ def __repr__ (self):
+ return f'<ExtractLinksEvent {self.links!r}>'
+
+def mapOrIgnore (f, l):
+ for e in l:
+ try:
+ yield f (e)
+ except:
+ pass
+
class ExtractLinks (Behavior):
"""
Extract links from a page using JavaScript
@@ -246,23 +403,37 @@ class ExtractLinks (Behavior):
manually resolve relative links.
"""
+ __slots__ = ('script', )
+
name = 'extractLinks'
- def __init__ (self, loader):
- super ().__init__ (loader)
- self.script = self.loadScript ('extract-links.js')
- self.links = None
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
+ self.script = Script ('extract-links.js')
- def onfinish (self):
+ async def onfinish (self):
tab = self.loader.tab
- self.useScript (self.script)
- result = tab.Runtime.evaluate (expression=self.script, returnByValue=True)
- self.links = list (set (result['result']['value']))
+ yield self.script
+ result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
+ yield ExtractLinksEvent (list (set (mapOrIgnore (URL, result['result']['value']))))
+
+class Crash (Behavior):
+ """ Crash the browser. For testing only. Obviously. """
+
+ name = 'crash'
+
+ async def onstop (self):
+ try:
+ await self.loader.tab.Page.crash ()
+ except Crashed:
+ pass
+ return
+ yield # pragma: no cover
# available behavior scripts. Order matters, move those modifying the page
# towards the end of available
-generic = [Scroll, EmulateScreenMetrics, Click, ExtractLinks]
-perSite = []
-available = generic + perSite + [Screenshot, DomSnapshot]
-availableNames = set (map (lambda x: x.name, available))
+available = [Scroll, Click, ExtractLinks, Screenshot, EmulateScreenMetrics, DomSnapshot]
+#available.append (Crash)
+# order matters, since behavior can modify the page (dom snapshots, for instance)
+availableMap = OrderedDict (map (lambda x: (x.name, x), available))