diff options
Diffstat (limited to 'crocoite/behavior.py')
-rw-r--r-- | crocoite/behavior.py | 192 |
1 files changed, 137 insertions, 55 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py index eb5478b..1610751 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -35,35 +35,41 @@ instance. """ import asyncio, json, os.path -from urllib.parse import urlsplit from base64 import b64decode from collections import OrderedDict import pkg_resources from html5lib.serializer import HTMLSerializer +from yarl import URL import yaml -from .util import getFormattedViewportMetrics, removeFragment +from .util import getFormattedViewportMetrics from . import html from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker -from .devtools import Crashed +from .devtools import Crashed, TabException class Script: """ A JavaScript resource """ __slots__ = ('path', 'data') + datadir = 'data' def __init__ (self, path=None, encoding='utf-8'): self.path = path if path: - self.data = pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding) + self.data = pkg_resources.resource_string (__name__, os.path.join (self.datadir, path)).decode (encoding) def __repr__ (self): - return '<Script {}>'.format (self.path) + return f'<Script {self.path}>' def __str__ (self): return self.data + @property + def abspath (self): + return pkg_resources.resource_filename (__name__, + os.path.join (self.datadir, self.path)) + @classmethod def fromStr (cls, data, path=None): s = Script () @@ -89,33 +95,23 @@ class Behavior: return True def __repr__ (self): - return '<Behavior {}>'.format (self.name) + return f'<Behavior {self.name}>' async def onload (self): """ After loading the page started """ # this is a dirty hack to make this function an async generator return - yield + yield # pragma: no cover async def onstop (self): """ Before page loading is stopped """ return - yield + yield # pragma: no cover async def onfinish (self): """ After the site has stopped loading """ return - yield - -class HostnameFilter: - """ Limit behavior script to hostname """ - - hostname = None - - def __contains__ (self, url): - url = urlsplit (url) - hostname = url.hostname.split ('.')[::-1] - return hostname[:2] == self.hostname + yield # pragma: no cover class JsOnload (Behavior): """ Execute JavaScript on page load """ @@ -141,6 +137,8 @@ class JsOnload (Behavior): # parameter. # XXX: is there a better way to do this? result = await tab.Runtime.evaluate (expression=str (self.script)) + self.logger.debug ('behavior onload inject', + uuid='a2da9b78-5648-44c5-bfa8-5c7573e13ad3', result=result) exception = result.get ('exceptionDetails', None) result = result['result'] assert result['type'] == 'function', result @@ -148,23 +146,45 @@ class JsOnload (Behavior): constructor = result['objectId'] if self.options: - yield Script.fromStr (json.dumps (self.options, indent=2), '{}/options'.format (self.script.path)) - result = await tab.Runtime.callFunctionOn ( - functionDeclaration='function(options){return new this(options);}', - objectId=constructor, - arguments=[{'value': self.options}]) - result = result['result'] - assert result['type'] == 'object', result - assert result.get ('subtype') != 'error', result - self.context = result['objectId'] + yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}#options') + + try: + result = await tab.Runtime.callFunctionOn ( + functionDeclaration='function(options){return new this(options);}', + objectId=constructor, + arguments=[{'value': self.options}]) + self.logger.debug ('behavior onload start', + uuid='6c0605ae-93b3-46b3-b575-ba45790909a7', result=result) + result = result['result'] + assert result['type'] == 'object', result + assert result.get ('subtype') != 'error', result + self.context = result['objectId'] + except TabException as e: + if e.args[0] == -32000: + # the site probably reloaded. ignore this, since we’ll be + # re-injected into the new site by the controller. + self.logger.error ('jsonload onload failed', + uuid='c151a863-78d1-41f4-a8e6-c022a6c5d252', + exception=e.args) + else: + raise async def onstop (self): tab = self.loader.tab - assert self.context is not None - await tab.Runtime.callFunctionOn (functionDeclaration='function(){return this.stop();}', objectId=self.context) - await tab.Runtime.releaseObject (objectId=self.context) + try: + assert self.context is not None + await tab.Runtime.callFunctionOn (functionDeclaration='function(){return this.stop();}', + objectId=self.context) + await tab.Runtime.releaseObject (objectId=self.context) + except TabException as e: + # cannot do anything about that. Ignoring should be fine. + self.logger.error ('jsonload onstop failed', + uuid='1786726f-c8ec-4f79-8769-30954d4e32f5', + exception=e.args, + objectId=self.context) + return - yield + yield # pragma: no cover ### Generic scripts ### @@ -195,18 +215,25 @@ class EmulateScreenMetrics (Behavior): l = self.loader tab = l.tab for s in sizes: + self.logger.debug ('device override', + uuid='3d2d8096-1a75-4830-ad79-ae5f6f97071d', **s) await tab.Emulation.setDeviceMetricsOverride (**s) # give the browser time to re-eval page and start requests # XXX: should wait until loader is not busy any more await asyncio.sleep (1) + self.logger.debug ('clear override', + uuid='f9401683-eb3a-4b86-9bb2-c8c5d876fc8d') await tab.Emulation.clearDeviceMetricsOverride () return - yield + yield # pragma: no cover class DomSnapshotEvent: __slots__ = ('url', 'document', 'viewport') def __init__ (self, url, document, viewport): + # XXX: document encoding? + assert isinstance (document, bytes) + self.url = url self.document = document self.viewport = viewport @@ -235,18 +262,21 @@ class DomSnapshot (Behavior): viewport = await getFormattedViewportMetrics (tab) dom = await tab.DOM.getDocument (depth=-1, pierce=True) + self.logger.debug ('dom snapshot document', + uuid='0c720784-8bd1-4fdc-a811-84394d753539', dom=dom) haveUrls = set () for doc in ChromeTreeWalker (dom['root']).split (): - rawUrl = doc['documentURL'] - if rawUrl in haveUrls: + url = URL (doc['documentURL']) + if url in haveUrls: # ignore duplicate URLs. they are usually caused by # javascript-injected iframes (advertising) with no(?) src - self.logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl)) - continue - url = urlsplit (rawUrl) - if url.scheme in ('http', 'https'): - self.logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL'])) - haveUrls.add (rawUrl) + self.logger.warning ('dom snapshot duplicate', + uuid='d44de989-98d4-456e-82e7-9d4c49acab5e') + elif url.scheme in ('http', 'https'): + self.logger.debug ('dom snapshot', + uuid='ece7ff05-ccd9-44b5-b6a8-be25a24b96f4', + base=doc["baseURL"]) + haveUrls.add (url) walker = ChromeTreeWalker (doc) # remove script, to make the page static and noscript, because at the # time we took the snapshot scripts were enabled @@ -254,7 +284,7 @@ class DomSnapshot (Behavior): disallowedAttributes = html.eventAttributes stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes) serializer = HTMLSerializer () - yield DomSnapshotEvent (removeFragment (doc['documentURL']), serializer.render (stream, 'utf-8'), viewport) + yield DomSnapshotEvent (url.with_fragment(None), serializer.render (stream, 'utf-8'), viewport) class ScreenshotEvent: __slots__ = ('yoff', 'data', 'url') @@ -267,35 +297,77 @@ class ScreenshotEvent: class Screenshot (Behavior): """ Create screenshot from tab and write it to WARC + + Chrome will allocate an additional 512MB of RAM when using this plugin. """ + __slots__ = ('script') + name = 'screenshot' + # Hardcoded max texture size of 16,384 (crbug.com/770769) + maxDim = 16*1024 + + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) + self.script = Script ('screenshot.js') + async def onfinish (self): tab = self.loader.tab + # for top-level/full-screen elements with position: fixed we need to + # figure out their actual size (i.e. scrollHeight) and use that when + # overriding the viewport size. + # we could do this without javascript, but that would require several + # round-trips to Chrome or pulling down the entire DOM+computed styles + tab = self.loader.tab + yield self.script + result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) + assert result['result']['type'] == 'object', result + result = result['result']['value'] + + # this is required to make the browser render more than just the small + # actual viewport (i.e. entire page). see + # https://github.com/GoogleChrome/puppeteer/blob/45873ea737b4ebe4fa7d6f46256b2ea19ce18aa7/lib/Page.js#L805 + metrics = await tab.Page.getLayoutMetrics () + contentSize = metrics['contentSize'] + contentHeight = max (result + [contentSize['height']]) + + override = { + 'width': 0, + 'height': 0, + 'deviceScaleFactor': 0, + 'mobile': False, + 'viewport': {'x': 0, + 'y': 0, + 'width': contentSize['width'], + 'height': contentHeight, + 'scale': 1} + } + self.logger.debug ('screenshot override', + uuid='e0affa18-cbb1-4d97-9d13-9a88f704b1b2', override=override) + await tab.Emulation.setDeviceMetricsOverride (**override) + tree = await tab.Page.getFrameTree () try: - url = removeFragment (tree['frameTree']['frame']['url']) + url = URL (tree['frameTree']['frame']['url']).with_fragment (None) except KeyError: - self.logger.error ('frame without url', tree=tree) + self.logger.error ('frame without url', + uuid='edc2743d-b93e-4ba1-964e-db232f2f96ff', tree=tree) url = None - # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js - # Hardcoded max texture size of 16,384 (crbug.com/770769) - maxDim = 16*1024 - metrics = await tab.Page.getLayoutMetrics () - contentSize = metrics['contentSize'] - width = min (contentSize['width'], maxDim) + width = min (contentSize['width'], self.maxDim) # we’re ignoring horizontal scroll intentionally. Most horizontal # layouts use JavaScript scrolling and don’t extend the viewport. - for yoff in range (0, contentSize['height'], maxDim): - height = min (contentSize['height'] - yoff, maxDim) + for yoff in range (0, contentHeight, self.maxDim): + height = min (contentHeight - yoff, self.maxDim) clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1} ret = await tab.Page.captureScreenshot (format='png', clip=clip) data = b64decode (ret['data']) yield ScreenshotEvent (url, yoff, data) + await tab.Emulation.clearDeviceMetricsOverride () + class Click (JsOnload): """ Generic link clicking """ @@ -305,7 +377,7 @@ class Click (JsOnload): def __init__ (self, loader, logger): super ().__init__ (loader, logger) with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd: - self.options['sites'] = list (yaml.load_all (fd)) + self.options['sites'] = list (yaml.safe_load_all (fd)) class ExtractLinksEvent: __slots__ = ('links', ) @@ -313,6 +385,16 @@ class ExtractLinksEvent: def __init__ (self, links): self.links = links + def __repr__ (self): + return f'<ExtractLinksEvent {self.links!r}>' + +def mapOrIgnore (f, l): + for e in l: + try: + yield f (e) + except: + pass + class ExtractLinks (Behavior): """ Extract links from a page using JavaScript @@ -333,7 +415,7 @@ class ExtractLinks (Behavior): tab = self.loader.tab yield self.script result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True) - yield ExtractLinksEvent (list (set (result['result']['value']))) + yield ExtractLinksEvent (list (set (mapOrIgnore (URL, result['result']['value'])))) class Crash (Behavior): """ Crash the browser. For testing only. Obviously. """ @@ -346,7 +428,7 @@ class Crash (Behavior): except Crashed: pass return - yield + yield # pragma: no cover # available behavior scripts. Order matters, move those modifying the page # towards the end of available |