diff options
Diffstat (limited to 'crocoite/cli.py')
-rw-r--r-- | crocoite/cli.py | 174 |
1 files changed, 28 insertions, 146 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py index 32e0959..86fee13 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -22,125 +22,19 @@ Standalone and Celery command line interface """ -import os, random, logging, argparse +import os, logging, argparse from io import BytesIO from datetime import datetime -from base64 import b64decode import pychrome from urllib.parse import urlsplit -from warcio.statusandheaders import StatusAndHeaders -from html5lib.serializer import HTMLSerializer from celery import Celery from celery.utils.log import get_task_logger -from . import html, packageData, packageUrl +from . import behavior from .warc import WarcLoader, SerializingWARCWriter -from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker from .browser import ChromeService, NullService - -def getFormattedViewportMetrics (tab): - layoutMetrics = tab.Page.getLayoutMetrics () - # XXX: I’m not entirely sure which one we should use here - return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'], - layoutMetrics['layoutViewport']['clientHeight']) - -def writeScript (path, source, writer): - record = writer.create_warc_record (packageUrl (path), 'metadata', - payload=BytesIO (source.encode ('utf8')), - warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'}) - writer.write_record (record) - -def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): - if length is None: - length = random.randint (16, 32) - return ''.join (map (lambda x: random.choice (chars), range (length))) - -def writeDOMSnapshot (tab, writer): - """ - Get a DOM snapshot of tab and write it to WARC. - - We could use DOMSnapshot.getSnapshot here, but the API is not stable - yet. Also computed styles are not really necessary here. - - XXX: Currently writes a response, when it should use “resource”. pywb - can’t handle that though. - """ - viewport = getFormattedViewportMetrics (tab) - dom = tab.DOM.getDocument (depth=-1, pierce=True) - haveUrls = set () - for doc in ChromeTreeWalker (dom['root']).split (): - rawUrl = doc['documentURL'] - if rawUrl in haveUrls: - # ignore duplicate URLs. they are usually caused by - # javascript-injected iframes (advertising) with no(?) src - logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl)) - continue - url = urlsplit (rawUrl) - if url.scheme in ('http', 'https'): - logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL'])) - haveUrls.add (rawUrl) - walker = ChromeTreeWalker (doc) - # remove script, to make the page static and noscript, because at the - # time we took the snapshot scripts were enabled - disallowedTags = ['script', 'noscript'] - disallowedAttributes = html.eventAttributes - stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes) - serializer = HTMLSerializer () - httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') - record = writer.create_warc_record (doc['documentURL'], 'response', - payload=BytesIO (serializer.render (stream, 'utf-8')), - http_headers=httpHeaders, - warc_headers_dict={'X-DOM-Snapshot': str (True), - 'X-Chrome-Viewport': viewport}) - writer.write_record (record) - -def emulateScreenMetrics (l): - """ - Emulate different screen sizes, causing the site to fetch assets (img - srcset and css, for example) for different screen resolutions. - """ - cssPpi = 96 - sizes = [ - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False}, - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False}, - # very dense display - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False}, - # just a few samples: - # 1st gen iPhone (portrait mode) - {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True}, - # 6th gen iPhone (portrait mode) - {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True}, - # and reset - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False}, - ] - for s in sizes: - l.tab.Emulation.setDeviceMetricsOverride (**s) - l.wait (1) - # XXX: this seems to be broken, it does not clear the override - #tab.Emulation.clearDeviceMetricsOverride () - # wait until assets finished loading - l.waitIdle (2, 60) - -def loadScripts (paths, scripts=[]): - for p in paths: - if not os.path.exists (p): - # search for defaults scripts in package data directory - p = packageData (p) - with open (p, 'r') as fd: - scripts.append (fd.read ()) - return '\n'.join (scripts) - -def writeScreenshot (tab, writer): - """ - Create screenshot from tab and write it to WARC - """ - viewport = getFormattedViewportMetrics (tab) - data = b64decode (tab.Page.captureScreenshot (format='png')['data']) - record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource', - payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png', - 'X-Chrome-Viewport': viewport}) - writer.write_record (record) +from .util import packageUrl, getFormattedViewportMetrics app = Celery ('crocoite.distributed') app.config_from_object('celeryconfig') @@ -148,8 +42,8 @@ logger = get_task_logger('crocoite.distributed.archive') # defaults can be changed below using argparse; track started state, because tasks are usually long-running @app.task(bind=True, track_started=True) -def archive (self, url, output, onload, onsnapshot, browser, - logBuffer, maxBodySize, idleTimeout, timeout, domSnapshot, screenshot): +def archive (self, url, output, browser, logBuffer, maxBodySize, idleTimeout, + timeout, enabledBehaviorNames): """ Archive a single URL @@ -164,16 +58,12 @@ def archive (self, url, output, onload, onsnapshot, browser, self.update_state (state='PROGRESS', meta={'step': 'start'}) - stopVarname = '__' + __package__ + '_stop__' - # avoid sites messing with our scripts by using a random stop variable name - newStopVarname = randomString () - onload = loadScripts (onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname) - stopVarname = newStopVarname - service = ChromeService () if browser: service = NullService (browser) + allBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available)) + with service as browser: browser = pychrome.Browser(url=browser) @@ -201,36 +91,34 @@ def archive (self, url, output, onload, onsnapshot, browser, } warcinfo = writer.create_warcinfo_record (filename=None, info=payload) writer.write_record (warcinfo) - # save onload script as well - writeScript ('onload', onload, writer) - # inject our custom javascript to the page before loading - l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload) + # not all behavior scripts are allowed for every URL, filter them + enabledBehavior = list (filter (lambda x: url in x, + map (lambda x: x (l), allBehavior))) + + self.update_state (state='PROGRESS', meta={'step': 'onload'}) + for b in enabledBehavior: + logger.debug ('starting onload behavior {}'.format (b.name)) + b.onload () l.start () self.update_state (state='PROGRESS', meta={'step': 'fetch'}) l.waitIdle (idleTimeout, timeout) - # get ready for snapshot: stop loading and scripts, disable events - l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True) + self.update_state (state='PROGRESS', meta={'step': 'onstop'}) + for b in enabledBehavior: + logger.debug ('starting onstop behavior {}'.format (b.name)) + b.onstop () + # if we stopped due to timeout, wait for remaining assets l.waitIdle (2, 60) - - self.update_state (state='PROGRESS', meta={'step': 'emulateScreenMetrics'}) - emulateScreenMetrics (l) - l.stop () - if domSnapshot: - self.update_state (state='PROGRESS', meta={'step': 'domSnapshot'}) - script = loadScripts (onsnapshot) - writeScript ('onsnapshot', script, writer) - l.tab.Runtime.evaluate (expression=script, returnByValue=True) - writeDOMSnapshot (l.tab, writer) + self.update_state (state='PROGRESS', meta={'step': 'onfinish'}) + for b in enabledBehavior: + logger.debug ('starting onfinish behavior {}'.format (b.name)) + b.onfinish () - if screenshot: - self.update_state (state='PROGRESS', meta={'step': 'screenshot'}) - writeScreenshot (l.tab, writer) ret['stats'] = l.stats writer.flush () if not output: @@ -244,8 +132,6 @@ def stateCallback (data): print (data['task_id'], result['step']) def main (): - from crocoite import behavior - parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') parser.add_argument('--browser', help='DevTools URL', metavar='URL') parser.add_argument('--distributed', help='Use celery worker', action='store_true') @@ -254,23 +140,19 @@ def main (): parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES') parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES') #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open') - parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE') - parser.add_argument('--no-behavior', default=True, action='store_false', help='Do not inject default behavior scripts', dest='behavior') - parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE') - parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot') - parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot') + parser.add_argument('--behavior', help='Comma-separated list of enabled behavior scripts', + dest='enabledBehaviorNames', + default=list (behavior.availableNames), + choices=list (behavior.availableNames)) parser.add_argument('url', help='Website URL') parser.add_argument('output', help='WARC filename') args = parser.parse_args () - if args.behavior: - args.onload.extend (['scroll.js'] + behavior.getByUrl (args.url)) # prepare args for function distributed = args.distributed passArgs = vars (args) del passArgs['distributed'] - del passArgs['behavior'] if distributed: result = archive.delay (**passArgs) |