Refactor behavior scripts

No functional changes, just cleanup. Replaces the onload and onsnapshot events with behavior scripts, and moves screen metric emulation, DOM snapshots and screenshots into behavior scripts as well.
author: Lars-Dominik Braun <lars@6xq.net> 2017-12-24 10:23:09 +0100
committer: Lars-Dominik Braun <lars@6xq.net> 2017-12-24 10:23:09 +0100
commit: d4951d1fe8be0941df9cafbfe57c21d26f66e8ee (patch)
tree: f24081e310a1bcbf1aea8ddc258d60050f012c0d
parent: fcb9ae93514d26085c8e0aebf2fc9b9c64a77453 (diff)
download: crocoite-d4951d1fe8be0941df9cafbfe57c21d26f66e8ee.tar.gz
crocoite-d4951d1fe8be0941df9cafbfe57c21d26f66e8ee.tar.bz2
crocoite-d4951d1fe8be0941df9cafbfe57c21d26f66e8ee.zip
8 files changed, 302 insertions, 194 deletions
diff --git a/README.rst b/README.rst
index 6fe9606..f86d8aa 100644
--- a/README.rst
+++ b/README.rst
@@ -31,20 +31,15 @@ One-shot commandline interface and pywb_ playback::
 
 .. _pywb: https://github.com/ikreymer/pywb
 
-Injecting JavaScript
-^^^^^^^^^^^^^^^^^^^^
+Behavior scripts
+^^^^^^^^^^^^^^^^
 
-A lot of sites need some form of interaction to load more content. Twitter for
+A lot of sites need some form of interaction to dynamically load more content. Twitter for
 instance continously loads new posts when scrolling to the bottom of the page.
-crocoite can emulate these user interactions by injecting JavaScript into the
-page before loading it. For instance ``--onload=scroll.js`` scrolls the page to
-the bottom.
-
-If extra work is required before taking a DOM snapshot, additional scripts can
-be run with ``--onsnapshot=canvas-snapshot.js``, which replaces all HTML
-``<canvas>`` elements with a static picture of their current contents.
-
-Example scripts can be found in the directory ``crocoite/data/``.
+crocoite can emulate these user interactions (and more) by combining control
+code written in Python with JavaScript injected into the page. The code can be
+limited to certain URLs or applied to every page loaded. By default all
+available scripts are enabled; see the command line flag ``--behavior``.
 
 Caveats
 -------
diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py
index 83be7a6..1923629 100644
--- a/contrib/celerycrocoite.py
+++ b/contrib/celerycrocoite.py
@@ -28,8 +28,7 @@ from sopel.tools import Identifier, SopelMemory
 import celery
 from urllib.parse import urlsplit
 
-import crocoite.cli
-from crocoite import behavior
+from crocoite import behavior, cli
 
 def prettyTimeDelta (seconds):
     """
@@ -82,22 +81,19 @@ def archive (bot, trigger):
         bot.reply ('{} is not a valid URL'.format (url))
         return
 
+    blacklistedBehavior = {'domSnapshot', 'screenshot'}
     args = {
             'url': url,
             'output': None,
-            'onload': ['scroll.js'] + behavior.getByUrl (url),
-            'onsnapshot': [],
+            'enabledBehaviorNames': list (behavior.availableNames-blacklistedBehavior),
             'browser': None,
             'logBuffer': 1000,
             'maxBodySize': 10*1024*1024,
             'idleTimeout': 10,
-            # 1 hour
-            'timeout': 1*60*60,
-            'domSnapshot': False,
-            'screenshot': False,
+            'timeout': 1*60*60, # 1 hour
             }
 
-    handle = crocoite.cli.archive.delay (**args)
+    handle = cli.archive.delay (**args)
     m = bot.memory['crocoite']
     jobs = m['jobs']
     # XXX: for some reason we cannot access the job’s state through handle,
@@ -106,9 +102,10 @@ def archive (bot, trigger):
 
     # pretty-print a few selected args
     showargs = {
-            'onload': ','.join (args['onload']),
+            'behavior': ','.join (args['enabledBehaviorNames']),
             'idleTimeout': prettyTimeDelta (args['idleTimeout']),
             'timeout': prettyTimeDelta (args['timeout']),
+            'maxBodySize': prettyBytes (args['maxBodySize']),
             }
     strargs = ', '.join (map (lambda x: '{}={}'.format (*x), showargs.items ()))
     bot.reply ('{} has been queued as {} with {}'.format (url, handle.id, strargs))
diff --git a/crocoite/__init__.py b/crocoite/__init__.py
index e23cd60..6fc86ce 100644
--- a/crocoite/__init__.py
+++ b/crocoite/__init__.py
@@ -18,16 +18,3 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
-import os
-
-def packageData (path):
-    """
-    Locate package data, see setup.py’s data_files
-    """
-    return os.path.join (os.path.dirname (__file__), 'data', path)
-
-def packageUrl (path):
-    """
-    Create URL for package data stored into WARC
-    """
-    return 'urn:' + __package__ + ':' + path
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index 13530fe..a7928e7 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -19,23 +19,225 @@
 # THE SOFTWARE.
 
 """
-Per-site JavaScript injections
+Generic and per-site behavior scripts
 """
 
+import logging
+from io import BytesIO
 from urllib.parse import urlsplit
+import os.path
+import pkg_resources
+from base64 import b64decode
 
-def getByUrl (url):
+from .util import randomString, packageUrl, getFormattedViewportMetrics
+from . import html
+from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
+from html5lib.serializer import HTMLSerializer
+from warcio.statusandheaders import StatusAndHeaders
+
+logger = logging.getLogger(__name__)
+
+class Behavior:
+    # unique behavior name
+    name = None
+
+    def __init__ (self, loader):
+        assert self.name is not None
+        self.loader = loader
+
+    def __contains__ (self, url):
+        """
+        Accept every URL by default
+        """
+        return True
+
+    def loadScript (self, path, encoding='utf-8'):
+        return pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding)
+
+    def useScript (self, script, encoding='utf-8'):
+        writer = self.loader.writer
+        record = writer.create_warc_record (packageUrl ('script'), 'metadata',
+                payload=BytesIO (script.encode (encoding)),
+                warc_headers_dict={'Content-Type': 'application/javascript; charset={}'.format (encoding)})
+        writer.write_record (record)
+
+    def onload (self):
+        """ Before loading the page """
+        pass
+
+    def onstop (self):
+        """ Before page loading is stopped """
+        pass
+
+    def onfinish (self):
+        """ After the site has stopped loading """
+        pass
+
+class HostnameFilter:
+    """ Limit behavior script to hostname """
+
+    hostname = None
+
+    def __contains__ (self, url):
+        url = urlsplit (url)
+        hostname = url.hostname.split ('.')[::-1]
+        return hostname[:2] == self.hostname
+
+class JsOnload (Behavior):
+    """ Execute JavaScript on page load """
+
+    scriptPath = None
+
+    def __init__ (self, loader):
+        super ().__init__ (loader)
+        self.script = self.loadScript (self.scriptPath)
+        self.scriptHandle = None
+
+    def onload (self):
+        self.useScript (self.script)
+        self.scriptHandle = self.loader.tab.Page.addScriptToEvaluateOnNewDocument (source=self.script)['identifier']
+
+    def onstop (self):
+        self.loader.tab.Page.removeScriptToEvaluateOnNewDocument (identifier=self.scriptHandle)
+
+### Generic scripts ###
+
+class Scroll (JsOnload):
+    name = 'scroll'
+    scriptPath = 'scroll.js'
+
+    def __init__ (self, loader):
+        super ().__init__ (loader)
+        stopVarname = '__' + __package__ + '_stop__'
+        newStopVarname = randomString ()
+        self.script = self.script.replace (stopVarname, newStopVarname)
+        self.stopVarname = newStopVarname
+
+    def onstop (self):
+        super ().onstop ()
+        # removing the script does not stop it if running
+        script = '{} = true; window.scrollTo (0, 0);'.format (self.stopVarname)
+        self.useScript (script)
+        self.loader.tab.Runtime.evaluate (expression=script, returnByValue=True)
+
+class EmulateScreenMetrics (Behavior):
+    name = 'emulateScreenMetrics'
+
+    def onstop (self):
+        """
+        Emulate different screen sizes, causing the site to fetch assets (img
+        srcset and css, for example) for different screen resolutions.
+        """
+        cssPpi = 96
+        sizes = [
+                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False},
+                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False},
+                # very dense display
+                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False},
+                # just a few samples:
+                # 1st gen iPhone (portrait mode)
+                {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
+                # 6th gen iPhone (portrait mode)
+                {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
+                # and reset
+                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
+                ]
+        l = self.loader
+        tab = l.tab
+        for s in sizes:
+            tab.Emulation.setDeviceMetricsOverride (**s)
+            # give the browser time to re-eval page and start requests
+            l.wait (1)
+        # XXX: this seems to be broken, it does not clear the override
+        #tab.Emulation.clearDeviceMetricsOverride ()
+
+class DomSnapshot (Behavior):
     """
-    Get site-specific onload behavior scripts
+    Get a DOM snapshot of tab and write it to WARC.
+
+    We could use DOMSnapshot.getSnapshot here, but the API is not stable
+    yet. Also computed styles are not really necessary here.
+
+    XXX: Currently writes a response, when it should use “resource”. pywb
+    can’t handle that though.
     """
-    url = urlsplit (url)
 
-    hostname = url.hostname.split ('.')[::-1]
+    name = 'domSnapshot'
+
+    def __init__ (self, loader):
+        super ().__init__ (loader)
+        self.script = self.loadScript ('canvas-snapshot.js')
+
+    def onfinish (self):
+        tab = self.loader.tab
+        writer = self.loader.writer
+
+        self.useScript (self.script)
+        tab.Runtime.evaluate (expression=self.script, returnByValue=True)
+
+        viewport = getFormattedViewportMetrics (tab)
+        dom = tab.DOM.getDocument (depth=-1, pierce=True)
+        haveUrls = set ()
+        for doc in ChromeTreeWalker (dom['root']).split ():
+            rawUrl = doc['documentURL']
+            if rawUrl in haveUrls:
+                # ignore duplicate URLs. they are usually caused by
+                # javascript-injected iframes (advertising) with no(?) src
+                logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
+                continue
+            url = urlsplit (rawUrl)
+            if url.scheme in ('http', 'https'):
+                logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
+                haveUrls.add (rawUrl)
+                walker = ChromeTreeWalker (doc)
+                # remove script, to make the page static and noscript, because at the
+                # time we took the snapshot scripts were enabled
+                disallowedTags = ['script', 'noscript']
+                disallowedAttributes = html.eventAttributes
+                stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
+                serializer = HTMLSerializer ()
+                httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
+                record = writer.create_warc_record (doc['documentURL'], 'response',
+                        payload=BytesIO (serializer.render (stream, 'utf-8')),
+                        http_headers=httpHeaders,
+                        warc_headers_dict={'X-DOM-Snapshot': str (True),
+                                'X-Chrome-Viewport': viewport})
+                writer.write_record (record)
+
+class Screenshot (Behavior):
+    """
+    Create screenshot from tab and write it to WARC
+    """
+
+    name = 'screenshot'
+
+    def onfinish (self):
+        tab = self.loader.tab
+        writer = self.loader.writer
+
+        viewport = getFormattedViewportMetrics (tab)
+        data = b64decode (tab.Page.captureScreenshot (format='png')['data'])
+        record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource',
+                payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png',
+                'X-Chrome-Viewport': viewport})
+        writer.write_record (record)
+
+### Site-specific scripts ###
+
+class Twitter (HostnameFilter, JsOnload):
+    name = 'twitter'
+    scriptPath = 'per-site/twitter.js'
+    hostname = ['com', 'twitter']
+
+class Instagram (HostnameFilter, JsOnload):
+    name = 'instagram'
+    scriptPath = 'per-site/instagram.js'
+    hostname = ['com', 'instagram']
 
-    if hostname[0] == 'com':
-        if hostname[1] == 'twitter':
-            return ['per-site/twitter.js']
-        elif hostname[1] == 'instagram':
-            return ['per-site/instagram.js']
-    return []
+# available behavior scripts. Order matters, move those modifying the page
+# towards the end of available
+generic = [Scroll, EmulateScreenMetrics]
+perSite = [Twitter, Instagram]
+available = generic + perSite + [Screenshot, DomSnapshot]
+availableNames = set (map (lambda x: x.name, available))
 
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 32e0959..86fee13 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -22,125 +22,19 @@
 Standalone and Celery command line interface
 """
 
-import os, random, logging, argparse
+import os, logging, argparse
 from io import BytesIO
 from datetime import datetime
-from base64 import b64decode
 import pychrome
 from urllib.parse import urlsplit
-from warcio.statusandheaders import StatusAndHeaders
-from html5lib.serializer import HTMLSerializer
 
 from celery import Celery
 from celery.utils.log import get_task_logger
 
-from . import html, packageData, packageUrl
+from . import behavior
 from .warc import WarcLoader, SerializingWARCWriter
-from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
 from .browser import ChromeService, NullService
-
-def getFormattedViewportMetrics (tab):
-    layoutMetrics = tab.Page.getLayoutMetrics ()
-    # XXX: I’m not entirely sure which one we should use here
-    return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
-                layoutMetrics['layoutViewport']['clientHeight'])
-
-def writeScript (path, source, writer):
-    record = writer.create_warc_record (packageUrl (path), 'metadata',
-            payload=BytesIO (source.encode ('utf8')),
-            warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'})
-    writer.write_record (record)
-
-def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
-    if length is None:
-        length = random.randint (16, 32)
-    return ''.join (map (lambda x: random.choice (chars), range (length)))
-
-def writeDOMSnapshot (tab, writer):
-    """
-    Get a DOM snapshot of tab and write it to WARC.
-
-    We could use DOMSnapshot.getSnapshot here, but the API is not stable
-    yet. Also computed styles are not really necessary here.
-
-    XXX: Currently writes a response, when it should use “resource”. pywb
-    can’t handle that though.
-    """
-    viewport = getFormattedViewportMetrics (tab)
-    dom = tab.DOM.getDocument (depth=-1, pierce=True)
-    haveUrls = set ()
-    for doc in ChromeTreeWalker (dom['root']).split ():
-        rawUrl = doc['documentURL']
-        if rawUrl in haveUrls:
-            # ignore duplicate URLs. they are usually caused by
-            # javascript-injected iframes (advertising) with no(?) src
-            logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
-            continue
-        url = urlsplit (rawUrl)
-        if url.scheme in ('http', 'https'):
-            logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
-            haveUrls.add (rawUrl)
-            walker = ChromeTreeWalker (doc)
-            # remove script, to make the page static and noscript, because at the
-            # time we took the snapshot scripts were enabled
-            disallowedTags = ['script', 'noscript']
-            disallowedAttributes = html.eventAttributes
-            stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
-            serializer = HTMLSerializer ()
-            httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
-            record = writer.create_warc_record (doc['documentURL'], 'response',
-                    payload=BytesIO (serializer.render (stream, 'utf-8')),
-                    http_headers=httpHeaders,
-                    warc_headers_dict={'X-DOM-Snapshot': str (True),
-                            'X-Chrome-Viewport': viewport})
-            writer.write_record (record)
-
-def emulateScreenMetrics (l):
-    """
-    Emulate different screen sizes, causing the site to fetch assets (img
-    srcset and css, for example) for different screen resolutions.
-    """
-    cssPpi = 96
-    sizes = [
-            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False},
-            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False},
-            # very dense display
-            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False},
-            # just a few samples:
-            # 1st gen iPhone (portrait mode)
-            {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
-            # 6th gen iPhone (portrait mode)
-            {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
-            # and reset
-            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
-            ]
-    for s in sizes:
-        l.tab.Emulation.setDeviceMetricsOverride (**s)
-        l.wait (1)
-    # XXX: this seems to be broken, it does not clear the override
-    #tab.Emulation.clearDeviceMetricsOverride ()
-    # wait until assets finished loading
-    l.waitIdle (2, 60)
-
-def loadScripts (paths, scripts=[]):
-    for p in paths:
-        if not os.path.exists (p):
-            # search for defaults scripts in package data directory
-            p = packageData (p)
-        with open (p, 'r') as fd:
-            scripts.append (fd.read ())
-    return '\n'.join (scripts)
-
-def writeScreenshot (tab, writer):
-    """
-    Create screenshot from tab and write it to WARC
-    """
-    viewport = getFormattedViewportMetrics (tab)
-    data = b64decode (tab.Page.captureScreenshot (format='png')['data'])
-    record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource',
-            payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png',
-            'X-Chrome-Viewport': viewport})
-    writer.write_record (record)
+from .util import packageUrl, getFormattedViewportMetrics
 
 app = Celery ('crocoite.distributed')
 app.config_from_object('celeryconfig')
@@ -148,8 +42,8 @@ logger = get_task_logger('crocoite.distributed.archive')
 
 # defaults can be changed below using argparse; track started state, because tasks are usually long-running
 @app.task(bind=True, track_started=True)
-def archive (self, url, output, onload, onsnapshot, browser,
-        logBuffer, maxBodySize, idleTimeout, timeout, domSnapshot, screenshot):
+def archive (self, url, output, browser, logBuffer, maxBodySize, idleTimeout,
+        timeout, enabledBehaviorNames):
     """
     Archive a single URL
 
@@ -164,16 +58,12 @@ def archive (self, url, output, onload, onsnapshot, browser,
 
     self.update_state (state='PROGRESS', meta={'step': 'start'})
 
-    stopVarname = '__' + __package__ + '_stop__'
-    # avoid sites messing with our scripts by using a random stop variable name
-    newStopVarname = randomString ()
-    onload = loadScripts (onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
-    stopVarname = newStopVarname
-
     service = ChromeService ()
     if browser:
         service = NullService (browser)
 
+    allBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available))
+
     with service as browser:
         browser = pychrome.Browser(url=browser)
 
@@ -201,36 +91,34 @@ def archive (self, url, output, onload, onsnapshot, browser,
                     }
             warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
             writer.write_record (warcinfo)
-            # save onload script as well
-            writeScript ('onload', onload, writer)
 
-            # inject our custom javascript to the page before loading
-            l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
+            # not all behavior scripts are allowed for every URL, filter them
+            enabledBehavior = list (filter (lambda x: url in x,
+                    map (lambda x: x (l), allBehavior)))
+
+            self.update_state (state='PROGRESS', meta={'step': 'onload'})
+            for b in enabledBehavior:
+                logger.debug ('starting onload behavior {}'.format (b.name))
+                b.onload ()
             l.start ()
 
             self.update_state (state='PROGRESS', meta={'step': 'fetch'})
             l.waitIdle (idleTimeout, timeout)
 
-            # get ready for snapshot: stop loading and scripts, disable events
-            l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
+            self.update_state (state='PROGRESS', meta={'step': 'onstop'})
+            for b in enabledBehavior:
+                logger.debug ('starting onstop behavior {}'.format (b.name))
+                b.onstop ()
+
             # if we stopped due to timeout, wait for remaining assets
             l.waitIdle (2, 60)
-
-            self.update_state (state='PROGRESS', meta={'step': 'emulateScreenMetrics'})
-            emulateScreenMetrics (l)
-
             l.stop ()
 
-            if domSnapshot:
-                self.update_state (state='PROGRESS', meta={'step': 'domSnapshot'})
-                script = loadScripts (onsnapshot)
-                writeScript ('onsnapshot', script, writer)
-                l.tab.Runtime.evaluate (expression=script, returnByValue=True)
-                writeDOMSnapshot (l.tab, writer)
+            self.update_state (state='PROGRESS', meta={'step': 'onfinish'})
+            for b in enabledBehavior:
+                logger.debug ('starting onfinish behavior {}'.format (b.name))
+                b.onfinish ()
 
-            if screenshot:
-                self.update_state (state='PROGRESS', meta={'step': 'screenshot'})
-                writeScreenshot (l.tab, writer)
             ret['stats'] = l.stats
         writer.flush ()
     if not output:
@@ -244,8 +132,6 @@ def stateCallback (data):
         print (data['task_id'], result['step'])
 
 def main ():
-    from crocoite import behavior
-
     parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
     parser.add_argument('--browser', help='DevTools URL', metavar='URL')
     parser.add_argument('--distributed', help='Use celery worker', action='store_true')
@@ -254,23 +140,19 @@ def main ():
     parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES')
     parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
     #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
-    parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE')
-    parser.add_argument('--no-behavior', default=True, action='store_false', help='Do not inject default behavior scripts', dest='behavior')
-    parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE')
-    parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot')
-    parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot')
+    parser.add_argument('--behavior', help='Comma-separated list of enabled behavior scripts',
+            dest='enabledBehaviorNames',
+            default=list (behavior.availableNames),
+            choices=list (behavior.availableNames))
     parser.add_argument('url', help='Website URL')
     parser.add_argument('output', help='WARC filename')
 
     args = parser.parse_args ()
-    if args.behavior:
-        args.onload.extend (['scroll.js'] + behavior.getByUrl (args.url))
 
     # prepare args for function
     distributed = args.distributed
     passArgs = vars (args)
     del passArgs['distributed']
-    del passArgs['behavior']
 
     if distributed:
         result = archive.delay (**passArgs)
diff --git a/crocoite/data/scroll.js b/crocoite/data/scroll.js
index 2b4dff1..0d1a4a7 100644
--- a/crocoite/data/scroll.js
+++ b/crocoite/data/scroll.js
@@ -1,5 +1,6 @@
 /*	Continuously scrolls the page
  */
+var __crocoite_stop__ = false;
 (function(){
 function scroll (event) {
 	if (__crocoite_stop__) {
diff --git a/crocoite/util.py b/crocoite/util.py
new file mode 100644
index 0000000..ec257f1
--- /dev/null
+++ b/crocoite/util.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2017 crocoite contributors
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"""
+Random utility functions
+"""
+
+import random
+
+def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
+    if length is None:
+        length = random.randint (16, 32)
+    return ''.join (map (lambda x: random.choice (chars), range (length)))
+
+def packageUrl (path):
+    """
+    Create URL for package data stored into WARC
+    """
+    return 'urn:' + __package__ + ':' + path
+
+def getFormattedViewportMetrics (tab):
+    layoutMetrics = tab.Page.getLayoutMetrics ()
+    # XXX: I’m not entirely sure which one we should use here
+    return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
+                layoutMetrics['layoutViewport']['clientHeight'])
+
diff --git a/crocoite/warc.py b/crocoite/warc.py
index d9afab2..540f673 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -24,8 +24,6 @@ Classes writing data to WARC files
 
 import logging
 import json
-from .browser import AccountingSiteLoader
-from . import packageUrl
 from http.server import BaseHTTPRequestHandler
 from base64 import b64decode
 from io import BytesIO
@@ -40,6 +38,9 @@ from queue import Queue
 from warcio.timeutils import datetime_to_iso_date
 from warcio.warcwriter import WARCWriter
 
+from .browser import AccountingSiteLoader
+from .util import packageUrl
+
 class SerializingWARCWriter (WARCWriter):
     """
     Serializing WARC writer using separate writer thread and queue for
author	Lars-Dominik Braun <lars@6xq.net>	2017-12-24 10:23:09 +0100
committer	Lars-Dominik Braun <lars@6xq.net>	2017-12-24 10:23:09 +0100
commit	d4951d1fe8be0941df9cafbfe57c21d26f66e8ee (patch)
tree	f24081e310a1bcbf1aea8ddc258d60050f012c0d
parent	fcb9ae93514d26085c8e0aebf2fc9b9c64a77453 (diff)
download	crocoite-d4951d1fe8be0941df9cafbfe57c21d26f66e8ee.tar.gz crocoite-d4951d1fe8be0941df9cafbfe57c21d26f66e8ee.tar.bz2 crocoite-d4951d1fe8be0941df9cafbfe57c21d26f66e8ee.zip