summaryrefslogtreecommitdiff
path: root/crocoite/behavior.py
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite/behavior.py')
-rw-r--r--crocoite/behavior.py192
1 files changed, 137 insertions, 55 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index eb5478b..1610751 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -35,35 +35,41 @@ instance.
"""
import asyncio, json, os.path
-from urllib.parse import urlsplit
from base64 import b64decode
from collections import OrderedDict
import pkg_resources
from html5lib.serializer import HTMLSerializer
+from yarl import URL
import yaml
-from .util import getFormattedViewportMetrics, removeFragment
+from .util import getFormattedViewportMetrics
from . import html
from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
-from .devtools import Crashed
+from .devtools import Crashed, TabException
class Script:
""" A JavaScript resource """
__slots__ = ('path', 'data')
+ datadir = 'data'
def __init__ (self, path=None, encoding='utf-8'):
self.path = path
if path:
- self.data = pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding)
+ self.data = pkg_resources.resource_string (__name__, os.path.join (self.datadir, path)).decode (encoding)
def __repr__ (self):
- return '<Script {}>'.format (self.path)
+ return f'<Script {self.path}>'
def __str__ (self):
return self.data
+ @property
+ def abspath (self):
+ return pkg_resources.resource_filename (__name__,
+ os.path.join (self.datadir, self.path))
+
@classmethod
def fromStr (cls, data, path=None):
s = Script ()
@@ -89,33 +95,23 @@ class Behavior:
return True
def __repr__ (self):
- return '<Behavior {}>'.format (self.name)
+ return f'<Behavior {self.name}>'
async def onload (self):
""" After loading the page started """
# this is a dirty hack to make this function an async generator
return
- yield
+ yield # pragma: no cover
async def onstop (self):
""" Before page loading is stopped """
return
- yield
+ yield # pragma: no cover
async def onfinish (self):
""" After the site has stopped loading """
return
- yield
-
-class HostnameFilter:
- """ Limit behavior script to hostname """
-
- hostname = None
-
- def __contains__ (self, url):
- url = urlsplit (url)
- hostname = url.hostname.split ('.')[::-1]
- return hostname[:2] == self.hostname
+ yield # pragma: no cover
class JsOnload (Behavior):
""" Execute JavaScript on page load """
@@ -141,6 +137,8 @@ class JsOnload (Behavior):
# parameter.
# XXX: is there a better way to do this?
result = await tab.Runtime.evaluate (expression=str (self.script))
+ self.logger.debug ('behavior onload inject',
+ uuid='a2da9b78-5648-44c5-bfa8-5c7573e13ad3', result=result)
exception = result.get ('exceptionDetails', None)
result = result['result']
assert result['type'] == 'function', result
@@ -148,23 +146,45 @@ class JsOnload (Behavior):
constructor = result['objectId']
if self.options:
- yield Script.fromStr (json.dumps (self.options, indent=2), '{}/options'.format (self.script.path))
- result = await tab.Runtime.callFunctionOn (
- functionDeclaration='function(options){return new this(options);}',
- objectId=constructor,
- arguments=[{'value': self.options}])
- result = result['result']
- assert result['type'] == 'object', result
- assert result.get ('subtype') != 'error', result
- self.context = result['objectId']
+ yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}#options')
+
+ try:
+ result = await tab.Runtime.callFunctionOn (
+ functionDeclaration='function(options){return new this(options);}',
+ objectId=constructor,
+ arguments=[{'value': self.options}])
+ self.logger.debug ('behavior onload start',
+ uuid='6c0605ae-93b3-46b3-b575-ba45790909a7', result=result)
+ result = result['result']
+ assert result['type'] == 'object', result
+ assert result.get ('subtype') != 'error', result
+ self.context = result['objectId']
+ except TabException as e:
+ if e.args[0] == -32000:
+ # the site probably reloaded. ignore this, since we’ll be
+ # re-injected into the new site by the controller.
+ self.logger.error ('jsonload onload failed',
+ uuid='c151a863-78d1-41f4-a8e6-c022a6c5d252',
+ exception=e.args)
+ else:
+ raise
async def onstop (self):
tab = self.loader.tab
- assert self.context is not None
- await tab.Runtime.callFunctionOn (functionDeclaration='function(){return this.stop();}', objectId=self.context)
- await tab.Runtime.releaseObject (objectId=self.context)
+ try:
+ assert self.context is not None
+ await tab.Runtime.callFunctionOn (functionDeclaration='function(){return this.stop();}',
+ objectId=self.context)
+ await tab.Runtime.releaseObject (objectId=self.context)
+ except TabException as e:
+ # cannot do anything about that. Ignoring should be fine.
+ self.logger.error ('jsonload onstop failed',
+ uuid='1786726f-c8ec-4f79-8769-30954d4e32f5',
+ exception=e.args,
+ objectId=self.context)
+
return
- yield
+ yield # pragma: no cover
### Generic scripts ###
@@ -195,18 +215,25 @@ class EmulateScreenMetrics (Behavior):
l = self.loader
tab = l.tab
for s in sizes:
+ self.logger.debug ('device override',
+ uuid='3d2d8096-1a75-4830-ad79-ae5f6f97071d', **s)
await tab.Emulation.setDeviceMetricsOverride (**s)
# give the browser time to re-eval page and start requests
# XXX: should wait until loader is not busy any more
await asyncio.sleep (1)
+ self.logger.debug ('clear override',
+ uuid='f9401683-eb3a-4b86-9bb2-c8c5d876fc8d')
await tab.Emulation.clearDeviceMetricsOverride ()
return
- yield
+ yield # pragma: no cover
class DomSnapshotEvent:
__slots__ = ('url', 'document', 'viewport')
def __init__ (self, url, document, viewport):
+ # XXX: document encoding?
+ assert isinstance (document, bytes)
+
self.url = url
self.document = document
self.viewport = viewport
@@ -235,18 +262,21 @@ class DomSnapshot (Behavior):
viewport = await getFormattedViewportMetrics (tab)
dom = await tab.DOM.getDocument (depth=-1, pierce=True)
+ self.logger.debug ('dom snapshot document',
+ uuid='0c720784-8bd1-4fdc-a811-84394d753539', dom=dom)
haveUrls = set ()
for doc in ChromeTreeWalker (dom['root']).split ():
- rawUrl = doc['documentURL']
- if rawUrl in haveUrls:
+ url = URL (doc['documentURL'])
+ if url in haveUrls:
# ignore duplicate URLs. they are usually caused by
# javascript-injected iframes (advertising) with no(?) src
- self.logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
- continue
- url = urlsplit (rawUrl)
- if url.scheme in ('http', 'https'):
- self.logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
- haveUrls.add (rawUrl)
+ self.logger.warning ('dom snapshot duplicate',
+ uuid='d44de989-98d4-456e-82e7-9d4c49acab5e')
+ elif url.scheme in ('http', 'https'):
+ self.logger.debug ('dom snapshot',
+ uuid='ece7ff05-ccd9-44b5-b6a8-be25a24b96f4',
+ base=doc["baseURL"])
+ haveUrls.add (url)
walker = ChromeTreeWalker (doc)
# remove script, to make the page static and noscript, because at the
# time we took the snapshot scripts were enabled
@@ -254,7 +284,7 @@ class DomSnapshot (Behavior):
disallowedAttributes = html.eventAttributes
stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
serializer = HTMLSerializer ()
- yield DomSnapshotEvent (removeFragment (doc['documentURL']), serializer.render (stream, 'utf-8'), viewport)
+ yield DomSnapshotEvent (url.with_fragment(None), serializer.render (stream, 'utf-8'), viewport)
class ScreenshotEvent:
__slots__ = ('yoff', 'data', 'url')
@@ -267,35 +297,77 @@ class ScreenshotEvent:
class Screenshot (Behavior):
"""
Create screenshot from tab and write it to WARC
+
+ Chrome will allocate an additional 512MB of RAM when using this plugin.
"""
+ __slots__ = ('script')
+
name = 'screenshot'
+ # Hardcoded max texture size of 16,384 (crbug.com/770769)
+ maxDim = 16*1024
+
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
+ self.script = Script ('screenshot.js')
+
async def onfinish (self):
tab = self.loader.tab
+ # for top-level/full-screen elements with position: fixed we need to
+ # figure out their actual size (i.e. scrollHeight) and use that when
+ # overriding the viewport size.
+ # we could do this without javascript, but that would require several
+ # round-trips to Chrome or pulling down the entire DOM+computed styles
+ tab = self.loader.tab
+ yield self.script
+ result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
+ assert result['result']['type'] == 'object', result
+ result = result['result']['value']
+
+ # this is required to make the browser render more than just the small
+ # actual viewport (i.e. entire page). see
+ # https://github.com/GoogleChrome/puppeteer/blob/45873ea737b4ebe4fa7d6f46256b2ea19ce18aa7/lib/Page.js#L805
+ metrics = await tab.Page.getLayoutMetrics ()
+ contentSize = metrics['contentSize']
+ contentHeight = max (result + [contentSize['height']])
+
+ override = {
+ 'width': 0,
+ 'height': 0,
+ 'deviceScaleFactor': 0,
+ 'mobile': False,
+ 'viewport': {'x': 0,
+ 'y': 0,
+ 'width': contentSize['width'],
+ 'height': contentHeight,
+ 'scale': 1}
+ }
+ self.logger.debug ('screenshot override',
+ uuid='e0affa18-cbb1-4d97-9d13-9a88f704b1b2', override=override)
+ await tab.Emulation.setDeviceMetricsOverride (**override)
+
tree = await tab.Page.getFrameTree ()
try:
- url = removeFragment (tree['frameTree']['frame']['url'])
+ url = URL (tree['frameTree']['frame']['url']).with_fragment (None)
except KeyError:
- self.logger.error ('frame without url', tree=tree)
+ self.logger.error ('frame without url',
+ uuid='edc2743d-b93e-4ba1-964e-db232f2f96ff', tree=tree)
url = None
- # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js
- # Hardcoded max texture size of 16,384 (crbug.com/770769)
- maxDim = 16*1024
- metrics = await tab.Page.getLayoutMetrics ()
- contentSize = metrics['contentSize']
- width = min (contentSize['width'], maxDim)
+ width = min (contentSize['width'], self.maxDim)
# we’re ignoring horizontal scroll intentionally. Most horizontal
# layouts use JavaScript scrolling and don’t extend the viewport.
- for yoff in range (0, contentSize['height'], maxDim):
- height = min (contentSize['height'] - yoff, maxDim)
+ for yoff in range (0, contentHeight, self.maxDim):
+ height = min (contentHeight - yoff, self.maxDim)
clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1}
ret = await tab.Page.captureScreenshot (format='png', clip=clip)
data = b64decode (ret['data'])
yield ScreenshotEvent (url, yoff, data)
+ await tab.Emulation.clearDeviceMetricsOverride ()
+
class Click (JsOnload):
""" Generic link clicking """
@@ -305,7 +377,7 @@ class Click (JsOnload):
def __init__ (self, loader, logger):
super ().__init__ (loader, logger)
with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd:
- self.options['sites'] = list (yaml.load_all (fd))
+ self.options['sites'] = list (yaml.safe_load_all (fd))
class ExtractLinksEvent:
__slots__ = ('links', )
@@ -313,6 +385,16 @@ class ExtractLinksEvent:
def __init__ (self, links):
self.links = links
+ def __repr__ (self):
+ return f'<ExtractLinksEvent {self.links!r}>'
+
+def mapOrIgnore (f, l):
+ for e in l:
+ try:
+ yield f (e)
+ except:
+ pass
+
class ExtractLinks (Behavior):
"""
Extract links from a page using JavaScript
@@ -333,7 +415,7 @@ class ExtractLinks (Behavior):
tab = self.loader.tab
yield self.script
result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
- yield ExtractLinksEvent (list (set (result['result']['value'])))
+ yield ExtractLinksEvent (list (set (mapOrIgnore (URL, result['result']['value']))))
class Crash (Behavior):
""" Crash the browser. For testing only. Obviously. """
@@ -346,7 +428,7 @@ class Crash (Behavior):
except Crashed:
pass
return
- yield
+ yield # pragma: no cover
# available behavior scripts. Order matters, move those modifying the page
# towards the end of available