diff options
Diffstat (limited to 'crocoite/behavior.py')
-rw-r--r-- | crocoite/behavior.py | 415 |
1 files changed, 293 insertions, 122 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py index 95e8160..1610751 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -1,4 +1,4 @@ -# Copyright (c) 2017 crocoite contributors +# Copyright (c) 2017–2018 crocoite contributors # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -19,31 +19,74 @@ # THE SOFTWARE. """ -Generic and per-site behavior scripts +Behavior scripts (i.e. subclasses of Behavior) are a powerful method to +manipulate websites loaded into Chrome. They are executed by the controller +after the page started loading (onload), after it has been idle for a while +(onstop) and after loading was stopped (onfinish). + +The script’s excercise their power either through DevTools API calls or by +injecting JavaScript into the page context. Thus they can manipulate both, the +browser itself (DevTools; modify resolution, get DOM snapshot) as well as the +page (JavaScript; trigger JavaScript events, call web API’s). + +They also emit (yield) data processable by any consumer registered to the +controller. This allows storing captured screenshots inside WARC files, for +instance. """ -import logging -from io import BytesIO -from urllib.parse import urlsplit -import os.path -import pkg_resources +import asyncio, json, os.path from base64 import b64decode +from collections import OrderedDict +import pkg_resources + +from html5lib.serializer import HTMLSerializer +from yarl import URL +import yaml -from .util import randomString, packageUrl, getFormattedViewportMetrics +from .util import getFormattedViewportMetrics from . import html from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker -from html5lib.serializer import HTMLSerializer -from warcio.statusandheaders import StatusAndHeaders +from .devtools import Crashed, TabException + +class Script: + """ A JavaScript resource """ -logger = logging.getLogger(__name__) + __slots__ = ('path', 'data') + datadir = 'data' + + def __init__ (self, path=None, encoding='utf-8'): + self.path = path + if path: + self.data = pkg_resources.resource_string (__name__, os.path.join (self.datadir, path)).decode (encoding) + + def __repr__ (self): + return f'<Script {self.path}>' + + def __str__ (self): + return self.data + + @property + def abspath (self): + return pkg_resources.resource_filename (__name__, + os.path.join (self.datadir, self.path)) + + @classmethod + def fromStr (cls, data, path=None): + s = Script () + s.data = data + s.path = path + return s class Behavior: + __slots__ = ('loader', 'logger') + # unique behavior name name = None - def __init__ (self, loader): + def __init__ (self, loader, logger): assert self.name is not None self.loader = loader + self.logger = logger.bind (context=type (self).__name__) def __contains__ (self, url): """ @@ -51,54 +94,97 @@ class Behavior: """ return True - def loadScript (self, path, encoding='utf-8'): - return pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding) + def __repr__ (self): + return f'<Behavior {self.name}>' - def useScript (self, script, encoding='utf-8'): - writer = self.loader.writer - record = writer.create_warc_record (packageUrl ('script'), 'metadata', - payload=BytesIO (script.encode (encoding)), - warc_headers_dict={'Content-Type': 'application/javascript; charset={}'.format (encoding)}) - writer.write_record (record) + async def onload (self): + """ After loading the page started """ + # this is a dirty hack to make this function an async generator + return + yield # pragma: no cover - def onload (self): - """ Before loading the page """ - pass - - def onstop (self): + async def onstop (self): """ Before page loading is stopped """ - pass + return + yield # pragma: no cover - def onfinish (self): + async def onfinish (self): """ After the site has stopped loading """ - pass - -class HostnameFilter: - """ Limit behavior script to hostname """ - - hostname = None - - def __contains__ (self, url): - url = urlsplit (url) - hostname = url.hostname.split ('.')[::-1] - return hostname[:2] == self.hostname + return + yield # pragma: no cover class JsOnload (Behavior): """ Execute JavaScript on page load """ - scriptPath = None + __slots__ = ('script', 'context', 'options') - def __init__ (self, loader): - super ().__init__ (loader) - self.script = self.loadScript (self.scriptPath) - self.scriptHandle = None + scriptPath = None - def onload (self): - self.useScript (self.script) - self.scriptHandle = self.loader.tab.Page.addScriptToEvaluateOnNewDocument (source=self.script)['identifier'] + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + self.script = Script (self.scriptPath) + self.context = None + # options passed to constructor + self.options = {} - def onstop (self): - self.loader.tab.Page.removeScriptToEvaluateOnNewDocument (identifier=self.scriptHandle) + async def onload (self): + tab = self.loader.tab + yield self.script + + # This is slightly awkward, since we cannot compile the class into an + # objectId and then reference it. Therefore the script must return a + # class constructor, which is then called with a generic options + # parameter. + # XXX: is there a better way to do this? + result = await tab.Runtime.evaluate (expression=str (self.script)) + self.logger.debug ('behavior onload inject', + uuid='a2da9b78-5648-44c5-bfa8-5c7573e13ad3', result=result) + exception = result.get ('exceptionDetails', None) + result = result['result'] + assert result['type'] == 'function', result + assert result.get ('subtype') != 'error', exception + constructor = result['objectId'] + + if self.options: + yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}#options') + + try: + result = await tab.Runtime.callFunctionOn ( + functionDeclaration='function(options){return new this(options);}', + objectId=constructor, + arguments=[{'value': self.options}]) + self.logger.debug ('behavior onload start', + uuid='6c0605ae-93b3-46b3-b575-ba45790909a7', result=result) + result = result['result'] + assert result['type'] == 'object', result + assert result.get ('subtype') != 'error', result + self.context = result['objectId'] + except TabException as e: + if e.args[0] == -32000: + # the site probably reloaded. ignore this, since we’ll be + # re-injected into the new site by the controller. + self.logger.error ('jsonload onload failed', + uuid='c151a863-78d1-41f4-a8e6-c022a6c5d252', + exception=e.args) + else: + raise + + async def onstop (self): + tab = self.loader.tab + try: + assert self.context is not None + await tab.Runtime.callFunctionOn (functionDeclaration='function(){return this.stop();}', + objectId=self.context) + await tab.Runtime.releaseObject (objectId=self.context) + except TabException as e: + # cannot do anything about that. Ignoring should be fine. + self.logger.error ('jsonload onstop failed', + uuid='1786726f-c8ec-4f79-8769-30954d4e32f5', + exception=e.args, + objectId=self.context) + + return + yield # pragma: no cover ### Generic scripts ### @@ -106,24 +192,10 @@ class Scroll (JsOnload): name = 'scroll' scriptPath = 'scroll.js' - def __init__ (self, loader): - super ().__init__ (loader) - stopVarname = '__' + __package__ + '_stop__' - newStopVarname = randomString () - self.script = self.script.replace (stopVarname, newStopVarname) - self.stopVarname = newStopVarname - - def onstop (self): - super ().onstop () - # removing the script does not stop it if running - script = '{} = true; window.scrollTo (0, 0);'.format (self.stopVarname) - self.useScript (script) - self.loader.tab.Runtime.evaluate (expression=script, returnByValue=True) - class EmulateScreenMetrics (Behavior): name = 'emulateScreenMetrics' - def onstop (self): + async def onstop (self): """ Emulate different screen sizes, causing the site to fetch assets (img srcset and css, for example) for different screen resolutions. @@ -139,17 +211,32 @@ class EmulateScreenMetrics (Behavior): {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True}, # 6th gen iPhone (portrait mode) {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True}, - # and reset - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False}, ] l = self.loader tab = l.tab for s in sizes: - tab.Emulation.setDeviceMetricsOverride (**s) + self.logger.debug ('device override', + uuid='3d2d8096-1a75-4830-ad79-ae5f6f97071d', **s) + await tab.Emulation.setDeviceMetricsOverride (**s) # give the browser time to re-eval page and start requests - l.wait (1) - # XXX: this seems to be broken, it does not clear the override - #tab.Emulation.clearDeviceMetricsOverride () + # XXX: should wait until loader is not busy any more + await asyncio.sleep (1) + self.logger.debug ('clear override', + uuid='f9401683-eb3a-4b86-9bb2-c8c5d876fc8d') + await tab.Emulation.clearDeviceMetricsOverride () + return + yield # pragma: no cover + +class DomSnapshotEvent: + __slots__ = ('url', 'document', 'viewport') + + def __init__ (self, url, document, viewport): + # XXX: document encoding? + assert isinstance (document, bytes) + + self.url = url + self.document = document + self.viewport = viewport class DomSnapshot (Behavior): """ @@ -157,38 +244,39 @@ class DomSnapshot (Behavior): We could use DOMSnapshot.getSnapshot here, but the API is not stable yet. Also computed styles are not really necessary here. - - XXX: Currently writes a response, when it should use “resource”. pywb - can’t handle that though. """ + __slots__ = ('script', ) + name = 'domSnapshot' - def __init__ (self, loader): - super ().__init__ (loader) - self.script = self.loadScript ('canvas-snapshot.js') + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + self.script = Script ('canvas-snapshot.js') - def onfinish (self): + async def onfinish (self): tab = self.loader.tab - writer = self.loader.writer - self.useScript (self.script) - tab.Runtime.evaluate (expression=self.script, returnByValue=True) + yield self.script + await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) - viewport = getFormattedViewportMetrics (tab) - dom = tab.DOM.getDocument (depth=-1, pierce=True) + viewport = await getFormattedViewportMetrics (tab) + dom = await tab.DOM.getDocument (depth=-1, pierce=True) + self.logger.debug ('dom snapshot document', + uuid='0c720784-8bd1-4fdc-a811-84394d753539', dom=dom) haveUrls = set () for doc in ChromeTreeWalker (dom['root']).split (): - rawUrl = doc['documentURL'] - if rawUrl in haveUrls: + url = URL (doc['documentURL']) + if url in haveUrls: # ignore duplicate URLs. they are usually caused by # javascript-injected iframes (advertising) with no(?) src - logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl)) - continue - url = urlsplit (rawUrl) - if url.scheme in ('http', 'https'): - logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL'])) - haveUrls.add (rawUrl) + self.logger.warning ('dom snapshot duplicate', + uuid='d44de989-98d4-456e-82e7-9d4c49acab5e') + elif url.scheme in ('http', 'https'): + self.logger.debug ('dom snapshot', + uuid='ece7ff05-ccd9-44b5-b6a8-be25a24b96f4', + base=doc["baseURL"]) + haveUrls.add (url) walker = ChromeTreeWalker (doc) # remove script, to make the page static and noscript, because at the # time we took the snapshot scripts were enabled @@ -196,41 +284,89 @@ class DomSnapshot (Behavior): disallowedAttributes = html.eventAttributes stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes) serializer = HTMLSerializer () - httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') - record = writer.create_warc_record (doc['documentURL'], 'response', - payload=BytesIO (serializer.render (stream, 'utf-8')), - http_headers=httpHeaders, - warc_headers_dict={'X-DOM-Snapshot': str (True), - 'X-Chrome-Viewport': viewport}) - writer.write_record (record) + yield DomSnapshotEvent (url.with_fragment(None), serializer.render (stream, 'utf-8'), viewport) + +class ScreenshotEvent: + __slots__ = ('yoff', 'data', 'url') + + def __init__ (self, url, yoff, data): + self.url = url + self.yoff = yoff + self.data = data class Screenshot (Behavior): """ Create screenshot from tab and write it to WARC + + Chrome will allocate an additional 512MB of RAM when using this plugin. """ + __slots__ = ('script') + name = 'screenshot' - def onfinish (self): + # Hardcoded max texture size of 16,384 (crbug.com/770769) + maxDim = 16*1024 + + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + self.script = Script ('screenshot.js') + + async def onfinish (self): tab = self.loader.tab - writer = self.loader.writer - # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js - # Hardcoded max texture size of 16,384 (crbug.com/770769) - maxDim = 16*1024 - metrics = tab.Page.getLayoutMetrics () + # for top-level/full-screen elements with position: fixed we need to + # figure out their actual size (i.e. scrollHeight) and use that when + # overriding the viewport size. + # we could do this without javascript, but that would require several + # round-trips to Chrome or pulling down the entire DOM+computed styles + tab = self.loader.tab + yield self.script + result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) + assert result['result']['type'] == 'object', result + result = result['result']['value'] + + # this is required to make the browser render more than just the small + # actual viewport (i.e. entire page). see + # https://github.com/GoogleChrome/puppeteer/blob/45873ea737b4ebe4fa7d6f46256b2ea19ce18aa7/lib/Page.js#L805 + metrics = await tab.Page.getLayoutMetrics () contentSize = metrics['contentSize'] - width = min (contentSize['width'], maxDim) + contentHeight = max (result + [contentSize['height']]) + + override = { + 'width': 0, + 'height': 0, + 'deviceScaleFactor': 0, + 'mobile': False, + 'viewport': {'x': 0, + 'y': 0, + 'width': contentSize['width'], + 'height': contentHeight, + 'scale': 1} + } + self.logger.debug ('screenshot override', + uuid='e0affa18-cbb1-4d97-9d13-9a88f704b1b2', override=override) + await tab.Emulation.setDeviceMetricsOverride (**override) + + tree = await tab.Page.getFrameTree () + try: + url = URL (tree['frameTree']['frame']['url']).with_fragment (None) + except KeyError: + self.logger.error ('frame without url', + uuid='edc2743d-b93e-4ba1-964e-db232f2f96ff', tree=tree) + url = None + + width = min (contentSize['width'], self.maxDim) # we’re ignoring horizontal scroll intentionally. Most horizontal # layouts use JavaScript scrolling and don’t extend the viewport. - for yoff in range (0, contentSize['height'], maxDim): - height = min (contentSize['height'] - yoff, maxDim) + for yoff in range (0, contentHeight, self.maxDim): + height = min (contentHeight - yoff, self.maxDim) clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1} - data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data']) - url = packageUrl ('screenshot-{}-{}.png'.format (0, yoff)) - record = writer.create_warc_record (url, 'resource', - payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png'}) - writer.write_record (record) + ret = await tab.Page.captureScreenshot (format='png', clip=clip) + data = b64decode (ret['data']) + yield ScreenshotEvent (url, yoff, data) + + await tab.Emulation.clearDeviceMetricsOverride () class Click (JsOnload): """ Generic link clicking """ @@ -238,6 +374,27 @@ class Click (JsOnload): name = 'click' scriptPath = 'click.js' + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd: + self.options['sites'] = list (yaml.safe_load_all (fd)) + +class ExtractLinksEvent: + __slots__ = ('links', ) + + def __init__ (self, links): + self.links = links + + def __repr__ (self): + return f'<ExtractLinksEvent {self.links!r}>' + +def mapOrIgnore (f, l): + for e in l: + try: + yield f (e) + except: + pass + class ExtractLinks (Behavior): """ Extract links from a page using JavaScript @@ -246,23 +403,37 @@ class ExtractLinks (Behavior): manually resolve relative links. """ + __slots__ = ('script', ) + name = 'extractLinks' - def __init__ (self, loader): - super ().__init__ (loader) - self.script = self.loadScript ('extract-links.js') - self.links = None + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + self.script = Script ('extract-links.js') - def onfinish (self): + async def onfinish (self): tab = self.loader.tab - self.useScript (self.script) - result = tab.Runtime.evaluate (expression=self.script, returnByValue=True) - self.links = list (set (result['result']['value'])) + yield self.script + result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) + yield ExtractLinksEvent (list (set (mapOrIgnore (URL, result['result']['value'])))) + +class Crash (Behavior): + """ Crash the browser. For testing only. Obviously. """ + + name = 'crash' + + async def onstop (self): + try: + await self.loader.tab.Page.crash () + except Crashed: + pass + return + yield # pragma: no cover # available behavior scripts. Order matters, move those modifying the page # towards the end of available -generic = [Scroll, EmulateScreenMetrics, Click, ExtractLinks] -perSite = [] -available = generic + perSite + [Screenshot, DomSnapshot] -availableNames = set (map (lambda x: x.name, available)) +available = [Scroll, Click, ExtractLinks, Screenshot, EmulateScreenMetrics, DomSnapshot] +#available.append (Crash) +# order matters, since behavior can modify the page (dom snapshots, for instance) +availableMap = OrderedDict (map (lambda x: (x.name, x), available)) |