summary | refs | log | tree | commit | diff
path: root/crocoite/cli.py
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite/cli.py')
-rw-r--r-- crocoite/cli.py 174
1 files changed, 28 insertions, 146 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 32e0959..86fee13 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -22,125 +22,19 @@
Standalone and Celery command line interface
"""
-import os, random, logging, argparse
+import os, logging, argparse
from io import BytesIO
from datetime import datetime
-from base64 import b64decode
import pychrome
from urllib.parse import urlsplit
-from warcio.statusandheaders import StatusAndHeaders
-from html5lib.serializer import HTMLSerializer
from celery import Celery
from celery.utils.log import get_task_logger
-from . import html, packageData, packageUrl
+from . import behavior
from .warc import WarcLoader, SerializingWARCWriter
-from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
from .browser import ChromeService, NullService
-
-def getFormattedViewportMetrics (tab):
- layoutMetrics = tab.Page.getLayoutMetrics ()
- # XXX: I’m not entirely sure which one we should use here
- return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
- layoutMetrics['layoutViewport']['clientHeight'])
-
-def writeScript (path, source, writer):
- record = writer.create_warc_record (packageUrl (path), 'metadata',
- payload=BytesIO (source.encode ('utf8')),
- warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'})
- writer.write_record (record)
-
-def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
- if length is None:
- length = random.randint (16, 32)
- return ''.join (map (lambda x: random.choice (chars), range (length)))
-
-def writeDOMSnapshot (tab, writer):
- """
- Get a DOM snapshot of tab and write it to WARC.
-
- We could use DOMSnapshot.getSnapshot here, but the API is not stable
- yet. Also computed styles are not really necessary here.
-
- XXX: Currently writes a response, when it should use “resource”. pywb
- can’t handle that though.
- """
- viewport = getFormattedViewportMetrics (tab)
- dom = tab.DOM.getDocument (depth=-1, pierce=True)
- haveUrls = set ()
- for doc in ChromeTreeWalker (dom['root']).split ():
- rawUrl = doc['documentURL']
- if rawUrl in haveUrls:
- # ignore duplicate URLs. they are usually caused by
- # javascript-injected iframes (advertising) with no(?) src
- logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
- continue
- url = urlsplit (rawUrl)
- if url.scheme in ('http', 'https'):
- logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
- haveUrls.add (rawUrl)
- walker = ChromeTreeWalker (doc)
- # remove script, to make the page static and noscript, because at the
- # time we took the snapshot scripts were enabled
- disallowedTags = ['script', 'noscript']
- disallowedAttributes = html.eventAttributes
- stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
- serializer = HTMLSerializer ()
- httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
- record = writer.create_warc_record (doc['documentURL'], 'response',
- payload=BytesIO (serializer.render (stream, 'utf-8')),
- http_headers=httpHeaders,
- warc_headers_dict={'X-DOM-Snapshot': str (True),
- 'X-Chrome-Viewport': viewport})
- writer.write_record (record)
-
-def emulateScreenMetrics (l):
- """
- Emulate different screen sizes, causing the site to fetch assets (img
- srcset and css, for example) for different screen resolutions.
- """
- cssPpi = 96
- sizes = [
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False},
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False},
- # very dense display
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False},
- # just a few samples:
- # 1st gen iPhone (portrait mode)
- {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
- # 6th gen iPhone (portrait mode)
- {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
- # and reset
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
- ]
- for s in sizes:
- l.tab.Emulation.setDeviceMetricsOverride (**s)
- l.wait (1)
- # XXX: this seems to be broken, it does not clear the override
- #tab.Emulation.clearDeviceMetricsOverride ()
- # wait until assets finished loading
- l.waitIdle (2, 60)
-
-def loadScripts (paths, scripts=[]):
- for p in paths:
- if not os.path.exists (p):
- # search for defaults scripts in package data directory
- p = packageData (p)
- with open (p, 'r') as fd:
- scripts.append (fd.read ())
- return '\n'.join (scripts)
-
-def writeScreenshot (tab, writer):
- """
- Create screenshot from tab and write it to WARC
- """
- viewport = getFormattedViewportMetrics (tab)
- data = b64decode (tab.Page.captureScreenshot (format='png')['data'])
- record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource',
- payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png',
- 'X-Chrome-Viewport': viewport})
- writer.write_record (record)
+from .util import packageUrl, getFormattedViewportMetrics
app = Celery ('crocoite.distributed')
app.config_from_object('celeryconfig')
@@ -148,8 +42,8 @@ logger = get_task_logger('crocoite.distributed.archive')
# defaults can be changed below using argparse; track started state, because tasks are usually long-running
@app.task(bind=True, track_started=True)
-def archive (self, url, output, onload, onsnapshot, browser,
- logBuffer, maxBodySize, idleTimeout, timeout, domSnapshot, screenshot):
+def archive (self, url, output, browser, logBuffer, maxBodySize, idleTimeout,
+ timeout, enabledBehaviorNames):
"""
Archive a single URL
@@ -164,16 +58,12 @@ def archive (self, url, output, onload, onsnapshot, browser,
self.update_state (state='PROGRESS', meta={'step': 'start'})
- stopVarname = '__' + __package__ + '_stop__'
- # avoid sites messing with our scripts by using a random stop variable name
- newStopVarname = randomString ()
- onload = loadScripts (onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
- stopVarname = newStopVarname
-
service = ChromeService ()
if browser:
service = NullService (browser)
+ allBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available))
+
with service as browser:
browser = pychrome.Browser(url=browser)
@@ -201,36 +91,34 @@ def archive (self, url, output, onload, onsnapshot, browser,
}
warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
writer.write_record (warcinfo)
- # save onload script as well
- writeScript ('onload', onload, writer)
- # inject our custom javascript to the page before loading
- l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
+ # not all behavior scripts are allowed for every URL, filter them
+ enabledBehavior = list (filter (lambda x: url in x,
+ map (lambda x: x (l), allBehavior)))
+
+ self.update_state (state='PROGRESS', meta={'step': 'onload'})
+ for b in enabledBehavior:
+ logger.debug ('starting onload behavior {}'.format (b.name))
+ b.onload ()
l.start ()
self.update_state (state='PROGRESS', meta={'step': 'fetch'})
l.waitIdle (idleTimeout, timeout)
- # get ready for snapshot: stop loading and scripts, disable events
- l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
+ self.update_state (state='PROGRESS', meta={'step': 'onstop'})
+ for b in enabledBehavior:
+ logger.debug ('starting onstop behavior {}'.format (b.name))
+ b.onstop ()
+
# if we stopped due to timeout, wait for remaining assets
l.waitIdle (2, 60)
-
- self.update_state (state='PROGRESS', meta={'step': 'emulateScreenMetrics'})
- emulateScreenMetrics (l)
-
l.stop ()
- if domSnapshot:
- self.update_state (state='PROGRESS', meta={'step': 'domSnapshot'})
- script = loadScripts (onsnapshot)
- writeScript ('onsnapshot', script, writer)
- l.tab.Runtime.evaluate (expression=script, returnByValue=True)
- writeDOMSnapshot (l.tab, writer)
+ self.update_state (state='PROGRESS', meta={'step': 'onfinish'})
+ for b in enabledBehavior:
+ logger.debug ('starting onfinish behavior {}'.format (b.name))
+ b.onfinish ()
- if screenshot:
- self.update_state (state='PROGRESS', meta={'step': 'screenshot'})
- writeScreenshot (l.tab, writer)
ret['stats'] = l.stats
writer.flush ()
if not output:
@@ -244,8 +132,6 @@ def stateCallback (data):
print (data['task_id'], result['step'])
def main ():
- from crocoite import behavior
-
parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
parser.add_argument('--browser', help='DevTools URL', metavar='URL')
parser.add_argument('--distributed', help='Use celery worker', action='store_true')
@@ -254,23 +140,19 @@ def main ():
parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES')
parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
#parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
- parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE')
- parser.add_argument('--no-behavior', default=True, action='store_false', help='Do not inject default behavior scripts', dest='behavior')
- parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE')
- parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot')
- parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot')
+ parser.add_argument('--behavior', help='Comma-separated list of enabled behavior scripts',
+ dest='enabledBehaviorNames',
+ default=list (behavior.availableNames),
+ choices=list (behavior.availableNames))
parser.add_argument('url', help='Website URL')
parser.add_argument('output', help='WARC filename')
args = parser.parse_args ()
- if args.behavior:
- args.onload.extend (['scroll.js'] + behavior.getByUrl (args.url))
# prepare args for function
distributed = args.distributed
passArgs = vars (args)
del passArgs['distributed']
- del passArgs['behavior']
if distributed:
result = archive.delay (**passArgs)