diff options
Diffstat (limited to 'crocoite')
-rw-r--r-- | crocoite/__init__.py | 13 | ||||
-rw-r--r-- | crocoite/behavior.py | 224 | ||||
-rw-r--r-- | crocoite/cli.py | 174 | ||||
-rw-r--r-- | crocoite/data/scroll.js | 1 | ||||
-rw-r--r-- | crocoite/util.py | 43 | ||||
-rw-r--r-- | crocoite/warc.py | 5 |
6 files changed, 288 insertions, 172 deletions
diff --git a/crocoite/__init__.py b/crocoite/__init__.py index e23cd60..6fc86ce 100644 --- a/crocoite/__init__.py +++ b/crocoite/__init__.py @@ -18,16 +18,3 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -import os - -def packageData (path): - """ - Locate package data, see setup.py’s data_files - """ - return os.path.join (os.path.dirname (__file__), 'data', path) - -def packageUrl (path): - """ - Create URL for package data stored into WARC - """ - return 'urn:' + __package__ + ':' + path diff --git a/crocoite/behavior.py b/crocoite/behavior.py index 13530fe..a7928e7 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -19,23 +19,225 @@ # THE SOFTWARE. """ -Per-site JavaScript injections +Generic and per-site behavior scripts """ +import logging +from io import BytesIO from urllib.parse import urlsplit +import os.path +import pkg_resources +from base64 import b64decode -def getByUrl (url): +from .util import randomString, packageUrl, getFormattedViewportMetrics +from . import html +from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker +from html5lib.serializer import HTMLSerializer +from warcio.statusandheaders import StatusAndHeaders + +logger = logging.getLogger(__name__) + +class Behavior: + # unique behavior name + name = None + + def __init__ (self, loader): + assert self.name is not None + self.loader = loader + + def __contains__ (self, url): + """ + Accept every URL by default + """ + return True + + def loadScript (self, path, encoding='utf-8'): + return pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding) + + def useScript (self, script, encoding='utf-8'): + writer = self.loader.writer + record = writer.create_warc_record (packageUrl ('script'), 'metadata', + payload=BytesIO (script.encode (encoding)), + warc_headers_dict={'Content-Type': 'application/javascript; charset={}'.format (encoding)}) + writer.write_record (record) + + def onload (self): + """ Before loading the page """ + pass + + def onstop (self): + """ Before page loading is stopped """ + pass + + def onfinish (self): + """ After the site has stopped loading """ + pass + +class HostnameFilter: + """ Limit behavior script to hostname """ + + hostname = None + + def __contains__ (self, url): + url = urlsplit (url) + hostname = url.hostname.split ('.')[::-1] + return hostname[:2] == self.hostname + +class JsOnload (Behavior): + """ Execute JavaScript on page load """ + + scriptPath = None + + def __init__ (self, loader): + super ().__init__ (loader) + self.script = self.loadScript (self.scriptPath) + self.scriptHandle = None + + def onload (self): + self.useScript (self.script) + self.scriptHandle = self.loader.tab.Page.addScriptToEvaluateOnNewDocument (source=self.script)['identifier'] + + def onstop (self): + self.loader.tab.Page.removeScriptToEvaluateOnNewDocument (identifier=self.scriptHandle) + +### Generic scripts ### + +class Scroll (JsOnload): + name = 'scroll' + scriptPath = 'scroll.js' + + def __init__ (self, loader): + super ().__init__ (loader) + stopVarname = '__' + __package__ + '_stop__' + newStopVarname = randomString () + self.script = self.script.replace (stopVarname, newStopVarname) + self.stopVarname = newStopVarname + + def onstop (self): + super ().onstop () + # removing the script does not stop it if running + script = '{} = true; window.scrollTo (0, 0);'.format (self.stopVarname) + self.useScript (script) + self.loader.tab.Runtime.evaluate (expression=script, returnByValue=True) + +class EmulateScreenMetrics (Behavior): + name = 'emulateScreenMetrics' + + def onstop (self): + """ + Emulate different screen sizes, causing the site to fetch assets (img + srcset and css, for example) for different screen resolutions. + """ + cssPpi = 96 + sizes = [ + {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False}, + {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False}, + # very dense display + {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False}, + # just a few samples: + # 1st gen iPhone (portrait mode) + {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True}, + # 6th gen iPhone (portrait mode) + {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True}, + # and reset + {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False}, + ] + l = self.loader + tab = l.tab + for s in sizes: + tab.Emulation.setDeviceMetricsOverride (**s) + # give the browser time to re-eval page and start requests + l.wait (1) + # XXX: this seems to be broken, it does not clear the override + #tab.Emulation.clearDeviceMetricsOverride () + +class DomSnapshot (Behavior): """ - Get site-specific onload behavior scripts + Get a DOM snapshot of tab and write it to WARC. + + We could use DOMSnapshot.getSnapshot here, but the API is not stable + yet. Also computed styles are not really necessary here. + + XXX: Currently writes a response, when it should use “resource”. pywb + can’t handle that though. """ - url = urlsplit (url) - hostname = url.hostname.split ('.')[::-1] + name = 'domSnapshot' + + def __init__ (self, loader): + super ().__init__ (loader) + self.script = self.loadScript ('canvas-snapshot.js') + + def onfinish (self): + tab = self.loader.tab + writer = self.loader.writer + + self.useScript (self.script) + tab.Runtime.evaluate (expression=self.script, returnByValue=True) + + viewport = getFormattedViewportMetrics (tab) + dom = tab.DOM.getDocument (depth=-1, pierce=True) + haveUrls = set () + for doc in ChromeTreeWalker (dom['root']).split (): + rawUrl = doc['documentURL'] + if rawUrl in haveUrls: + # ignore duplicate URLs. they are usually caused by + # javascript-injected iframes (advertising) with no(?) src + logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl)) + continue + url = urlsplit (rawUrl) + if url.scheme in ('http', 'https'): + logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL'])) + haveUrls.add (rawUrl) + walker = ChromeTreeWalker (doc) + # remove script, to make the page static and noscript, because at the + # time we took the snapshot scripts were enabled + disallowedTags = ['script', 'noscript'] + disallowedAttributes = html.eventAttributes + stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes) + serializer = HTMLSerializer () + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record (doc['documentURL'], 'response', + payload=BytesIO (serializer.render (stream, 'utf-8')), + http_headers=httpHeaders, + warc_headers_dict={'X-DOM-Snapshot': str (True), + 'X-Chrome-Viewport': viewport}) + writer.write_record (record) + +class Screenshot (Behavior): + """ + Create screenshot from tab and write it to WARC + """ + + name = 'screenshot' + + def onfinish (self): + tab = self.loader.tab + writer = self.loader.writer + + viewport = getFormattedViewportMetrics (tab) + data = b64decode (tab.Page.captureScreenshot (format='png')['data']) + record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource', + payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png', + 'X-Chrome-Viewport': viewport}) + writer.write_record (record) + +### Site-specific scripts ### + +class Twitter (HostnameFilter, JsOnload): + name = 'twitter' + scriptPath = 'per-site/twitter.js' + hostname = ['com', 'twitter'] + +class Instagram (HostnameFilter, JsOnload): + name = 'instagram' + scriptPath = 'per-site/instagram.js' + hostname = ['com', 'instagram'] - if hostname[0] == 'com': - if hostname[1] == 'twitter': - return ['per-site/twitter.js'] - elif hostname[1] == 'instagram': - return ['per-site/instagram.js'] - return [] +# available behavior scripts. Order matters, move those modifying the page +# towards the end of available +generic = [Scroll, EmulateScreenMetrics] +perSite = [Twitter, Instagram] +available = generic + perSite + [Screenshot, DomSnapshot] +availableNames = set (map (lambda x: x.name, available)) diff --git a/crocoite/cli.py b/crocoite/cli.py index 32e0959..86fee13 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -22,125 +22,19 @@ Standalone and Celery command line interface """ -import os, random, logging, argparse +import os, logging, argparse from io import BytesIO from datetime import datetime -from base64 import b64decode import pychrome from urllib.parse import urlsplit -from warcio.statusandheaders import StatusAndHeaders -from html5lib.serializer import HTMLSerializer from celery import Celery from celery.utils.log import get_task_logger -from . import html, packageData, packageUrl +from . import behavior from .warc import WarcLoader, SerializingWARCWriter -from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker from .browser import ChromeService, NullService - -def getFormattedViewportMetrics (tab): - layoutMetrics = tab.Page.getLayoutMetrics () - # XXX: I’m not entirely sure which one we should use here - return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'], - layoutMetrics['layoutViewport']['clientHeight']) - -def writeScript (path, source, writer): - record = writer.create_warc_record (packageUrl (path), 'metadata', - payload=BytesIO (source.encode ('utf8')), - warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'}) - writer.write_record (record) - -def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): - if length is None: - length = random.randint (16, 32) - return ''.join (map (lambda x: random.choice (chars), range (length))) - -def writeDOMSnapshot (tab, writer): - """ - Get a DOM snapshot of tab and write it to WARC. - - We could use DOMSnapshot.getSnapshot here, but the API is not stable - yet. Also computed styles are not really necessary here. - - XXX: Currently writes a response, when it should use “resource”. pywb - can’t handle that though. - """ - viewport = getFormattedViewportMetrics (tab) - dom = tab.DOM.getDocument (depth=-1, pierce=True) - haveUrls = set () - for doc in ChromeTreeWalker (dom['root']).split (): - rawUrl = doc['documentURL'] - if rawUrl in haveUrls: - # ignore duplicate URLs. they are usually caused by - # javascript-injected iframes (advertising) with no(?) src - logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl)) - continue - url = urlsplit (rawUrl) - if url.scheme in ('http', 'https'): - logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL'])) - haveUrls.add (rawUrl) - walker = ChromeTreeWalker (doc) - # remove script, to make the page static and noscript, because at the - # time we took the snapshot scripts were enabled - disallowedTags = ['script', 'noscript'] - disallowedAttributes = html.eventAttributes - stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes) - serializer = HTMLSerializer () - httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') - record = writer.create_warc_record (doc['documentURL'], 'response', - payload=BytesIO (serializer.render (stream, 'utf-8')), - http_headers=httpHeaders, - warc_headers_dict={'X-DOM-Snapshot': str (True), - 'X-Chrome-Viewport': viewport}) - writer.write_record (record) - -def emulateScreenMetrics (l): - """ - Emulate different screen sizes, causing the site to fetch assets (img - srcset and css, for example) for different screen resolutions. - """ - cssPpi = 96 - sizes = [ - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False}, - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False}, - # very dense display - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False}, - # just a few samples: - # 1st gen iPhone (portrait mode) - {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True}, - # 6th gen iPhone (portrait mode) - {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True}, - # and reset - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False}, - ] - for s in sizes: - l.tab.Emulation.setDeviceMetricsOverride (**s) - l.wait (1) - # XXX: this seems to be broken, it does not clear the override - #tab.Emulation.clearDeviceMetricsOverride () - # wait until assets finished loading - l.waitIdle (2, 60) - -def loadScripts (paths, scripts=[]): - for p in paths: - if not os.path.exists (p): - # search for defaults scripts in package data directory - p = packageData (p) - with open (p, 'r') as fd: - scripts.append (fd.read ()) - return '\n'.join (scripts) - -def writeScreenshot (tab, writer): - """ - Create screenshot from tab and write it to WARC - """ - viewport = getFormattedViewportMetrics (tab) - data = b64decode (tab.Page.captureScreenshot (format='png')['data']) - record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource', - payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png', - 'X-Chrome-Viewport': viewport}) - writer.write_record (record) +from .util import packageUrl, getFormattedViewportMetrics app = Celery ('crocoite.distributed') app.config_from_object('celeryconfig') @@ -148,8 +42,8 @@ logger = get_task_logger('crocoite.distributed.archive') # defaults can be changed below using argparse; track started state, because tasks are usually long-running @app.task(bind=True, track_started=True) -def archive (self, url, output, onload, onsnapshot, browser, - logBuffer, maxBodySize, idleTimeout, timeout, domSnapshot, screenshot): +def archive (self, url, output, browser, logBuffer, maxBodySize, idleTimeout, + timeout, enabledBehaviorNames): """ Archive a single URL @@ -164,16 +58,12 @@ def archive (self, url, output, onload, onsnapshot, browser, self.update_state (state='PROGRESS', meta={'step': 'start'}) - stopVarname = '__' + __package__ + '_stop__' - # avoid sites messing with our scripts by using a random stop variable name - newStopVarname = randomString () - onload = loadScripts (onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname) - stopVarname = newStopVarname - service = ChromeService () if browser: service = NullService (browser) + allBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available)) + with service as browser: browser = pychrome.Browser(url=browser) @@ -201,36 +91,34 @@ def archive (self, url, output, onload, onsnapshot, browser, } warcinfo = writer.create_warcinfo_record (filename=None, info=payload) writer.write_record (warcinfo) - # save onload script as well - writeScript ('onload', onload, writer) - # inject our custom javascript to the page before loading - l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload) + # not all behavior scripts are allowed for every URL, filter them + enabledBehavior = list (filter (lambda x: url in x, + map (lambda x: x (l), allBehavior))) + + self.update_state (state='PROGRESS', meta={'step': 'onload'}) + for b in enabledBehavior: + logger.debug ('starting onload behavior {}'.format (b.name)) + b.onload () l.start () self.update_state (state='PROGRESS', meta={'step': 'fetch'}) l.waitIdle (idleTimeout, timeout) - # get ready for snapshot: stop loading and scripts, disable events - l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True) + self.update_state (state='PROGRESS', meta={'step': 'onstop'}) + for b in enabledBehavior: + logger.debug ('starting onstop behavior {}'.format (b.name)) + b.onstop () + # if we stopped due to timeout, wait for remaining assets l.waitIdle (2, 60) - - self.update_state (state='PROGRESS', meta={'step': 'emulateScreenMetrics'}) - emulateScreenMetrics (l) - l.stop () - if domSnapshot: - self.update_state (state='PROGRESS', meta={'step': 'domSnapshot'}) - script = loadScripts (onsnapshot) - writeScript ('onsnapshot', script, writer) - l.tab.Runtime.evaluate (expression=script, returnByValue=True) - writeDOMSnapshot (l.tab, writer) + self.update_state (state='PROGRESS', meta={'step': 'onfinish'}) + for b in enabledBehavior: + logger.debug ('starting onfinish behavior {}'.format (b.name)) + b.onfinish () - if screenshot: - self.update_state (state='PROGRESS', meta={'step': 'screenshot'}) - writeScreenshot (l.tab, writer) ret['stats'] = l.stats writer.flush () if not output: @@ -244,8 +132,6 @@ def stateCallback (data): print (data['task_id'], result['step']) def main (): - from crocoite import behavior - parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') parser.add_argument('--browser', help='DevTools URL', metavar='URL') parser.add_argument('--distributed', help='Use celery worker', action='store_true') @@ -254,23 +140,19 @@ def main (): parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES') parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES') #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open') - parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE') - parser.add_argument('--no-behavior', default=True, action='store_false', help='Do not inject default behavior scripts', dest='behavior') - parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE') - parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot') - parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot') + parser.add_argument('--behavior', help='Comma-separated list of enabled behavior scripts', + dest='enabledBehaviorNames', + default=list (behavior.availableNames), + choices=list (behavior.availableNames)) parser.add_argument('url', help='Website URL') parser.add_argument('output', help='WARC filename') args = parser.parse_args () - if args.behavior: - args.onload.extend (['scroll.js'] + behavior.getByUrl (args.url)) # prepare args for function distributed = args.distributed passArgs = vars (args) del passArgs['distributed'] - del passArgs['behavior'] if distributed: result = archive.delay (**passArgs) diff --git a/crocoite/data/scroll.js b/crocoite/data/scroll.js index 2b4dff1..0d1a4a7 100644 --- a/crocoite/data/scroll.js +++ b/crocoite/data/scroll.js @@ -1,5 +1,6 @@ /* Continuously scrolls the page */ +var __crocoite_stop__ = false; (function(){ function scroll (event) { if (__crocoite_stop__) { diff --git a/crocoite/util.py b/crocoite/util.py new file mode 100644 index 0000000..ec257f1 --- /dev/null +++ b/crocoite/util.py @@ -0,0 +1,43 @@ +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Random utility functions +""" + +import random + +def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): + if length is None: + length = random.randint (16, 32) + return ''.join (map (lambda x: random.choice (chars), range (length))) + +def packageUrl (path): + """ + Create URL for package data stored into WARC + """ + return 'urn:' + __package__ + ':' + path + +def getFormattedViewportMetrics (tab): + layoutMetrics = tab.Page.getLayoutMetrics () + # XXX: I’m not entirely sure which one we should use here + return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'], + layoutMetrics['layoutViewport']['clientHeight']) + diff --git a/crocoite/warc.py b/crocoite/warc.py index d9afab2..540f673 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -24,8 +24,6 @@ Classes writing data to WARC files import logging import json -from .browser import AccountingSiteLoader -from . import packageUrl from http.server import BaseHTTPRequestHandler from base64 import b64decode from io import BytesIO @@ -40,6 +38,9 @@ from queue import Queue from warcio.timeutils import datetime_to_iso_date from warcio.warcwriter import WARCWriter +from .browser import AccountingSiteLoader +from .util import packageUrl + class SerializingWARCWriter (WARCWriter): """ Serializing WARC writer using separate writer thread and queue for |