summaryrefslogtreecommitdiff
path: root/crocoite/behavior.py
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite/behavior.py')
-rw-r--r--crocoite/behavior.py124
1 files changed, 80 insertions, 44 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index 95e8160..fd8fff8 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -22,21 +22,44 @@
Generic and per-site behavior scripts
"""
-import logging
+import logging, time
from io import BytesIO
from urllib.parse import urlsplit
import os.path
import pkg_resources
from base64 import b64decode
+from collections import OrderedDict
+
+from html5lib.serializer import HTMLSerializer
+from warcio.statusandheaders import StatusAndHeaders
+from pychrome.exceptions import TimeoutException
from .util import randomString, packageUrl, getFormattedViewportMetrics
from . import html
from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
-from html5lib.serializer import HTMLSerializer
-from warcio.statusandheaders import StatusAndHeaders
+from .browser import SiteLoader
logger = logging.getLogger(__name__)
+class Script:
+ """ A JavaScript resource """
+ def __init__ (self, path=None, encoding='utf-8'):
+ self.path = path
+ if path:
+ self.data = pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding)
+
+ def __repr__ (self):
+ return '<Script {}>'.format (self.path)
+
+ def __str__ (self):
+ return self.data
+
+ @classmethod
+ def fromStr (cls, data):
+ s = Script ()
+ s.data = data
+ return s
+
class Behavior:
# unique behavior name
name = None
@@ -51,27 +74,20 @@ class Behavior:
"""
return True
- def loadScript (self, path, encoding='utf-8'):
- return pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding)
-
- def useScript (self, script, encoding='utf-8'):
- writer = self.loader.writer
- record = writer.create_warc_record (packageUrl ('script'), 'metadata',
- payload=BytesIO (script.encode (encoding)),
- warc_headers_dict={'Content-Type': 'application/javascript; charset={}'.format (encoding)})
- writer.write_record (record)
+ def __repr__ (self):
+ return '<Behavior {}>'.format (self.name)
def onload (self):
""" Before loading the page """
- pass
+ yield from ()
def onstop (self):
""" Before page loading is stopped """
- pass
+ yield from ()
def onfinish (self):
""" After the site has stopped loading """
- pass
+ yield from ()
class HostnameFilter:
""" Limit behavior script to hostname """
@@ -90,15 +106,16 @@ class JsOnload (Behavior):
def __init__ (self, loader):
super ().__init__ (loader)
- self.script = self.loadScript (self.scriptPath)
+ self.script = Script (self.scriptPath)
self.scriptHandle = None
def onload (self):
- self.useScript (self.script)
- self.scriptHandle = self.loader.tab.Page.addScriptToEvaluateOnNewDocument (source=self.script)['identifier']
+ yield self.script
+ self.scriptHandle = self.loader.tab.Page.addScriptToEvaluateOnNewDocument (source=str (self.script))['identifier']
def onstop (self):
self.loader.tab.Page.removeScriptToEvaluateOnNewDocument (identifier=self.scriptHandle)
+ yield from ()
### Generic scripts ###
@@ -110,15 +127,15 @@ class Scroll (JsOnload):
super ().__init__ (loader)
stopVarname = '__' + __package__ + '_stop__'
newStopVarname = randomString ()
- self.script = self.script.replace (stopVarname, newStopVarname)
+ self.script.data = self.script.data.replace (stopVarname, newStopVarname)
self.stopVarname = newStopVarname
def onstop (self):
super ().onstop ()
# removing the script does not stop it if running
- script = '{} = true; window.scrollTo (0, 0);'.format (self.stopVarname)
- self.useScript (script)
- self.loader.tab.Runtime.evaluate (expression=script, returnByValue=True)
+ script = Script.fromStr ('{} = true; window.scrollTo (0, 0);'.format (self.stopVarname))
+ yield script
+ self.loader.tab.Runtime.evaluate (expression=str (script), returnByValue=True)
class EmulateScreenMetrics (Behavior):
name = 'emulateScreenMetrics'
@@ -147,9 +164,16 @@ class EmulateScreenMetrics (Behavior):
for s in sizes:
tab.Emulation.setDeviceMetricsOverride (**s)
# give the browser time to re-eval page and start requests
- l.wait (1)
+ time.sleep (1)
# XXX: this seems to be broken, it does not clear the override
#tab.Emulation.clearDeviceMetricsOverride ()
+ yield from ()
+
+class DomSnapshotEvent:
+ def __init__ (self, url, document, viewport):
+ self.url = url
+ self.document = document
+ self.viewport = viewport
class DomSnapshot (Behavior):
"""
@@ -166,14 +190,13 @@ class DomSnapshot (Behavior):
def __init__ (self, loader):
super ().__init__ (loader)
- self.script = self.loadScript ('canvas-snapshot.js')
+ self.script = Script ('canvas-snapshot.js')
def onfinish (self):
tab = self.loader.tab
- writer = self.loader.writer
- self.useScript (self.script)
- tab.Runtime.evaluate (expression=self.script, returnByValue=True)
+ yield self.script
+ tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
viewport = getFormattedViewportMetrics (tab)
dom = tab.DOM.getDocument (depth=-1, pierce=True)
@@ -196,13 +219,12 @@ class DomSnapshot (Behavior):
disallowedAttributes = html.eventAttributes
stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
serializer = HTMLSerializer ()
- httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
- record = writer.create_warc_record (doc['documentURL'], 'response',
- payload=BytesIO (serializer.render (stream, 'utf-8')),
- http_headers=httpHeaders,
- warc_headers_dict={'X-DOM-Snapshot': str (True),
- 'X-Chrome-Viewport': viewport})
- writer.write_record (record)
+ yield DomSnapshotEvent (doc['documentURL'], serializer.render (stream, 'utf-8'), viewport)
+
+class ScreenshotEvent:
+ def __init__ (self, yoff, data):
+ self.yoff = yoff
+ self.data = data
class Screenshot (Behavior):
"""
@@ -213,7 +235,6 @@ class Screenshot (Behavior):
def onfinish (self):
tab = self.loader.tab
- writer = self.loader.writer
# see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js
# Hardcoded max texture size of 16,384 (crbug.com/770769)
@@ -227,10 +248,7 @@ class Screenshot (Behavior):
height = min (contentSize['height'] - yoff, maxDim)
clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1}
data = b64decode (tab.Page.captureScreenshot (format='png', clip=clip)['data'])
- url = packageUrl ('screenshot-{}-{}.png'.format (0, yoff))
- record = writer.create_warc_record (url, 'resource',
- payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png'})
- writer.write_record (record)
+ yield ScreenshotEvent (yoff, data)
class Click (JsOnload):
""" Generic link clicking """
@@ -238,6 +256,10 @@ class Click (JsOnload):
name = 'click'
scriptPath = 'click.js'
+class ExtractLinksEvent:
+ def __init__ (self, links):
+ self.links = links
+
class ExtractLinks (Behavior):
"""
Extract links from a page using JavaScript
@@ -250,19 +272,33 @@ class ExtractLinks (Behavior):
def __init__ (self, loader):
super ().__init__ (loader)
- self.script = self.loadScript ('extract-links.js')
+ self.script = Script ('extract-links.js')
self.links = None
def onfinish (self):
tab = self.loader.tab
- self.useScript (self.script)
- result = tab.Runtime.evaluate (expression=self.script, returnByValue=True)
- self.links = list (set (result['result']['value']))
+ yield self.script
+ result = tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
+ yield ExtractLinksEvent (list (set (result['result']['value'])))
+
+class Crash (Behavior):
+ """ Crash the browser. For testing only. Obviously. """
+
+ name = 'crash'
+
+ def onstop (self):
+ try:
+ self.loader.tab.Page.crash (_timeout=1)
+ except TimeoutException:
+ pass
+ yield from ()
# available behavior scripts. Order matters, move those modifying the page
# towards the end of available
generic = [Scroll, EmulateScreenMetrics, Click, ExtractLinks]
perSite = []
available = generic + perSite + [Screenshot, DomSnapshot]
-availableNames = set (map (lambda x: x.name, available))
+#available.append (Crash)
+# order matters, since behavior can modify the page (dom snapshots, for instance)
+availableMap = OrderedDict (map (lambda x: (x.name, x), available))