summaryrefslogtreecommitdiff
path: root/crocoite
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite')
-rw-r--r--crocoite/__init__.py13
-rw-r--r--crocoite/behavior.py224
-rw-r--r--crocoite/cli.py174
-rw-r--r--crocoite/data/scroll.js1
-rw-r--r--crocoite/util.py43
-rw-r--r--crocoite/warc.py5
6 files changed, 288 insertions, 172 deletions
diff --git a/crocoite/__init__.py b/crocoite/__init__.py
index e23cd60..6fc86ce 100644
--- a/crocoite/__init__.py
+++ b/crocoite/__init__.py
@@ -18,16 +18,3 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
-import os
-
-def packageData (path):
- """
- Locate package data, see setup.py’s data_files
- """
- return os.path.join (os.path.dirname (__file__), 'data', path)
-
-def packageUrl (path):
- """
- Create URL for package data stored into WARC
- """
- return 'urn:' + __package__ + ':' + path
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index 13530fe..a7928e7 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -19,23 +19,225 @@
# THE SOFTWARE.
"""
-Per-site JavaScript injections
+Generic and per-site behavior scripts
"""
+import logging
+from io import BytesIO
from urllib.parse import urlsplit
+import os.path
+import pkg_resources
+from base64 import b64decode
-def getByUrl (url):
+from .util import randomString, packageUrl, getFormattedViewportMetrics
+from . import html
+from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
+from html5lib.serializer import HTMLSerializer
+from warcio.statusandheaders import StatusAndHeaders
+
+logger = logging.getLogger(__name__)
+
+class Behavior:
+ # unique behavior name
+ name = None
+
+ def __init__ (self, loader):
+ assert self.name is not None
+ self.loader = loader
+
+ def __contains__ (self, url):
+ """
+ Accept every URL by default
+ """
+ return True
+
+ def loadScript (self, path, encoding='utf-8'):
+ return pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding)
+
+ def useScript (self, script, encoding='utf-8'):
+ writer = self.loader.writer
+ record = writer.create_warc_record (packageUrl ('script'), 'metadata',
+ payload=BytesIO (script.encode (encoding)),
+ warc_headers_dict={'Content-Type': 'application/javascript; charset={}'.format (encoding)})
+ writer.write_record (record)
+
+ def onload (self):
+ """ Before loading the page """
+ pass
+
+ def onstop (self):
+ """ Before page loading is stopped """
+ pass
+
+ def onfinish (self):
+ """ After the site has stopped loading """
+ pass
+
+class HostnameFilter:
+ """ Limit behavior script to hostname """
+
+ hostname = None
+
+ def __contains__ (self, url):
+ url = urlsplit (url)
+ hostname = url.hostname.split ('.')[::-1]
+ return hostname[:2] == self.hostname
+
+class JsOnload (Behavior):
+ """ Execute JavaScript on page load """
+
+ scriptPath = None
+
+ def __init__ (self, loader):
+ super ().__init__ (loader)
+ self.script = self.loadScript (self.scriptPath)
+ self.scriptHandle = None
+
+ def onload (self):
+ self.useScript (self.script)
+ self.scriptHandle = self.loader.tab.Page.addScriptToEvaluateOnNewDocument (source=self.script)['identifier']
+
+ def onstop (self):
+ self.loader.tab.Page.removeScriptToEvaluateOnNewDocument (identifier=self.scriptHandle)
+
+### Generic scripts ###
+
+class Scroll (JsOnload):
+ name = 'scroll'
+ scriptPath = 'scroll.js'
+
+ def __init__ (self, loader):
+ super ().__init__ (loader)
+ stopVarname = '__' + __package__ + '_stop__'
+ newStopVarname = randomString ()
+ self.script = self.script.replace (stopVarname, newStopVarname)
+ self.stopVarname = newStopVarname
+
+ def onstop (self):
+ super ().onstop ()
+ # removing the script does not stop it if running
+ script = '{} = true; window.scrollTo (0, 0);'.format (self.stopVarname)
+ self.useScript (script)
+ self.loader.tab.Runtime.evaluate (expression=script, returnByValue=True)
+
+class EmulateScreenMetrics (Behavior):
+ name = 'emulateScreenMetrics'
+
+ def onstop (self):
+ """
+ Emulate different screen sizes, causing the site to fetch assets (img
+ srcset and css, for example) for different screen resolutions.
+ """
+ cssPpi = 96
+ sizes = [
+ {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False},
+ {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False},
+ # very dense display
+ {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False},
+ # just a few samples:
+ # 1st gen iPhone (portrait mode)
+ {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
+ # 6th gen iPhone (portrait mode)
+ {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
+ # and reset
+ {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
+ ]
+ l = self.loader
+ tab = l.tab
+ for s in sizes:
+ tab.Emulation.setDeviceMetricsOverride (**s)
+ # give the browser time to re-eval page and start requests
+ l.wait (1)
+ # XXX: this seems to be broken, it does not clear the override
+ #tab.Emulation.clearDeviceMetricsOverride ()
+
+class DomSnapshot (Behavior):
"""
- Get site-specific onload behavior scripts
+ Get a DOM snapshot of tab and write it to WARC.
+
+ We could use DOMSnapshot.getSnapshot here, but the API is not stable
+ yet. Also computed styles are not really necessary here.
+
+ XXX: Currently writes a response, when it should use “resource”. pywb
+ can’t handle that though.
"""
- url = urlsplit (url)
- hostname = url.hostname.split ('.')[::-1]
+ name = 'domSnapshot'
+
+ def __init__ (self, loader):
+ super ().__init__ (loader)
+ self.script = self.loadScript ('canvas-snapshot.js')
+
+ def onfinish (self):
+ tab = self.loader.tab
+ writer = self.loader.writer
+
+ self.useScript (self.script)
+ tab.Runtime.evaluate (expression=self.script, returnByValue=True)
+
+ viewport = getFormattedViewportMetrics (tab)
+ dom = tab.DOM.getDocument (depth=-1, pierce=True)
+ haveUrls = set ()
+ for doc in ChromeTreeWalker (dom['root']).split ():
+ rawUrl = doc['documentURL']
+ if rawUrl in haveUrls:
+ # ignore duplicate URLs. they are usually caused by
+ # javascript-injected iframes (advertising) with no(?) src
+ logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
+ continue
+ url = urlsplit (rawUrl)
+ if url.scheme in ('http', 'https'):
+ logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
+ haveUrls.add (rawUrl)
+ walker = ChromeTreeWalker (doc)
+ # remove script, to make the page static and noscript, because at the
+ # time we took the snapshot scripts were enabled
+ disallowedTags = ['script', 'noscript']
+ disallowedAttributes = html.eventAttributes
+ stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
+ serializer = HTMLSerializer ()
+ httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
+ record = writer.create_warc_record (doc['documentURL'], 'response',
+ payload=BytesIO (serializer.render (stream, 'utf-8')),
+ http_headers=httpHeaders,
+ warc_headers_dict={'X-DOM-Snapshot': str (True),
+ 'X-Chrome-Viewport': viewport})
+ writer.write_record (record)
+
+class Screenshot (Behavior):
+ """
+ Create screenshot from tab and write it to WARC
+ """
+
+ name = 'screenshot'
+
+ def onfinish (self):
+ tab = self.loader.tab
+ writer = self.loader.writer
+
+ viewport = getFormattedViewportMetrics (tab)
+ data = b64decode (tab.Page.captureScreenshot (format='png')['data'])
+ record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource',
+ payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png',
+ 'X-Chrome-Viewport': viewport})
+ writer.write_record (record)
+
+### Site-specific scripts ###
+
+class Twitter (HostnameFilter, JsOnload):
+ name = 'twitter'
+ scriptPath = 'per-site/twitter.js'
+ hostname = ['com', 'twitter']
+
+class Instagram (HostnameFilter, JsOnload):
+ name = 'instagram'
+ scriptPath = 'per-site/instagram.js'
+ hostname = ['com', 'instagram']
- if hostname[0] == 'com':
- if hostname[1] == 'twitter':
- return ['per-site/twitter.js']
- elif hostname[1] == 'instagram':
- return ['per-site/instagram.js']
- return []
+# available behavior scripts. Order matters, move those modifying the page
+# towards the end of available
+generic = [Scroll, EmulateScreenMetrics]
+perSite = [Twitter, Instagram]
+available = generic + perSite + [Screenshot, DomSnapshot]
+availableNames = set (map (lambda x: x.name, available))
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 32e0959..86fee13 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -22,125 +22,19 @@
Standalone and Celery command line interface
"""
-import os, random, logging, argparse
+import os, logging, argparse
from io import BytesIO
from datetime import datetime
-from base64 import b64decode
import pychrome
from urllib.parse import urlsplit
-from warcio.statusandheaders import StatusAndHeaders
-from html5lib.serializer import HTMLSerializer
from celery import Celery
from celery.utils.log import get_task_logger
-from . import html, packageData, packageUrl
+from . import behavior
from .warc import WarcLoader, SerializingWARCWriter
-from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
from .browser import ChromeService, NullService
-
-def getFormattedViewportMetrics (tab):
- layoutMetrics = tab.Page.getLayoutMetrics ()
- # XXX: I’m not entirely sure which one we should use here
- return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
- layoutMetrics['layoutViewport']['clientHeight'])
-
-def writeScript (path, source, writer):
- record = writer.create_warc_record (packageUrl (path), 'metadata',
- payload=BytesIO (source.encode ('utf8')),
- warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'})
- writer.write_record (record)
-
-def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
- if length is None:
- length = random.randint (16, 32)
- return ''.join (map (lambda x: random.choice (chars), range (length)))
-
-def writeDOMSnapshot (tab, writer):
- """
- Get a DOM snapshot of tab and write it to WARC.
-
- We could use DOMSnapshot.getSnapshot here, but the API is not stable
- yet. Also computed styles are not really necessary here.
-
- XXX: Currently writes a response, when it should use “resource”. pywb
- can’t handle that though.
- """
- viewport = getFormattedViewportMetrics (tab)
- dom = tab.DOM.getDocument (depth=-1, pierce=True)
- haveUrls = set ()
- for doc in ChromeTreeWalker (dom['root']).split ():
- rawUrl = doc['documentURL']
- if rawUrl in haveUrls:
- # ignore duplicate URLs. they are usually caused by
- # javascript-injected iframes (advertising) with no(?) src
- logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
- continue
- url = urlsplit (rawUrl)
- if url.scheme in ('http', 'https'):
- logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
- haveUrls.add (rawUrl)
- walker = ChromeTreeWalker (doc)
- # remove script, to make the page static and noscript, because at the
- # time we took the snapshot scripts were enabled
- disallowedTags = ['script', 'noscript']
- disallowedAttributes = html.eventAttributes
- stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
- serializer = HTMLSerializer ()
- httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
- record = writer.create_warc_record (doc['documentURL'], 'response',
- payload=BytesIO (serializer.render (stream, 'utf-8')),
- http_headers=httpHeaders,
- warc_headers_dict={'X-DOM-Snapshot': str (True),
- 'X-Chrome-Viewport': viewport})
- writer.write_record (record)
-
-def emulateScreenMetrics (l):
- """
- Emulate different screen sizes, causing the site to fetch assets (img
- srcset and css, for example) for different screen resolutions.
- """
- cssPpi = 96
- sizes = [
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False},
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False},
- # very dense display
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False},
- # just a few samples:
- # 1st gen iPhone (portrait mode)
- {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
- # 6th gen iPhone (portrait mode)
- {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
- # and reset
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
- ]
- for s in sizes:
- l.tab.Emulation.setDeviceMetricsOverride (**s)
- l.wait (1)
- # XXX: this seems to be broken, it does not clear the override
- #tab.Emulation.clearDeviceMetricsOverride ()
- # wait until assets finished loading
- l.waitIdle (2, 60)
-
-def loadScripts (paths, scripts=[]):
- for p in paths:
- if not os.path.exists (p):
- # search for defaults scripts in package data directory
- p = packageData (p)
- with open (p, 'r') as fd:
- scripts.append (fd.read ())
- return '\n'.join (scripts)
-
-def writeScreenshot (tab, writer):
- """
- Create screenshot from tab and write it to WARC
- """
- viewport = getFormattedViewportMetrics (tab)
- data = b64decode (tab.Page.captureScreenshot (format='png')['data'])
- record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource',
- payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png',
- 'X-Chrome-Viewport': viewport})
- writer.write_record (record)
+from .util import packageUrl, getFormattedViewportMetrics
app = Celery ('crocoite.distributed')
app.config_from_object('celeryconfig')
@@ -148,8 +42,8 @@ logger = get_task_logger('crocoite.distributed.archive')
# defaults can be changed below using argparse; track started state, because tasks are usually long-running
@app.task(bind=True, track_started=True)
-def archive (self, url, output, onload, onsnapshot, browser,
- logBuffer, maxBodySize, idleTimeout, timeout, domSnapshot, screenshot):
+def archive (self, url, output, browser, logBuffer, maxBodySize, idleTimeout,
+ timeout, enabledBehaviorNames):
"""
Archive a single URL
@@ -164,16 +58,12 @@ def archive (self, url, output, onload, onsnapshot, browser,
self.update_state (state='PROGRESS', meta={'step': 'start'})
- stopVarname = '__' + __package__ + '_stop__'
- # avoid sites messing with our scripts by using a random stop variable name
- newStopVarname = randomString ()
- onload = loadScripts (onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
- stopVarname = newStopVarname
-
service = ChromeService ()
if browser:
service = NullService (browser)
+ allBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available))
+
with service as browser:
browser = pychrome.Browser(url=browser)
@@ -201,36 +91,34 @@ def archive (self, url, output, onload, onsnapshot, browser,
}
warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
writer.write_record (warcinfo)
- # save onload script as well
- writeScript ('onload', onload, writer)
- # inject our custom javascript to the page before loading
- l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
+ # not all behavior scripts are allowed for every URL, filter them
+ enabledBehavior = list (filter (lambda x: url in x,
+ map (lambda x: x (l), allBehavior)))
+
+ self.update_state (state='PROGRESS', meta={'step': 'onload'})
+ for b in enabledBehavior:
+ logger.debug ('starting onload behavior {}'.format (b.name))
+ b.onload ()
l.start ()
self.update_state (state='PROGRESS', meta={'step': 'fetch'})
l.waitIdle (idleTimeout, timeout)
- # get ready for snapshot: stop loading and scripts, disable events
- l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
+ self.update_state (state='PROGRESS', meta={'step': 'onstop'})
+ for b in enabledBehavior:
+ logger.debug ('starting onstop behavior {}'.format (b.name))
+ b.onstop ()
+
# if we stopped due to timeout, wait for remaining assets
l.waitIdle (2, 60)
-
- self.update_state (state='PROGRESS', meta={'step': 'emulateScreenMetrics'})
- emulateScreenMetrics (l)
-
l.stop ()
- if domSnapshot:
- self.update_state (state='PROGRESS', meta={'step': 'domSnapshot'})
- script = loadScripts (onsnapshot)
- writeScript ('onsnapshot', script, writer)
- l.tab.Runtime.evaluate (expression=script, returnByValue=True)
- writeDOMSnapshot (l.tab, writer)
+ self.update_state (state='PROGRESS', meta={'step': 'onfinish'})
+ for b in enabledBehavior:
+ logger.debug ('starting onfinish behavior {}'.format (b.name))
+ b.onfinish ()
- if screenshot:
- self.update_state (state='PROGRESS', meta={'step': 'screenshot'})
- writeScreenshot (l.tab, writer)
ret['stats'] = l.stats
writer.flush ()
if not output:
@@ -244,8 +132,6 @@ def stateCallback (data):
print (data['task_id'], result['step'])
def main ():
- from crocoite import behavior
-
parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
parser.add_argument('--browser', help='DevTools URL', metavar='URL')
parser.add_argument('--distributed', help='Use celery worker', action='store_true')
@@ -254,23 +140,19 @@ def main ():
parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES')
parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
#parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
- parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE')
- parser.add_argument('--no-behavior', default=True, action='store_false', help='Do not inject default behavior scripts', dest='behavior')
- parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE')
- parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot')
- parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot')
+ parser.add_argument('--behavior', help='Comma-separated list of enabled behavior scripts',
+ dest='enabledBehaviorNames',
+ default=list (behavior.availableNames),
+ choices=list (behavior.availableNames))
parser.add_argument('url', help='Website URL')
parser.add_argument('output', help='WARC filename')
args = parser.parse_args ()
- if args.behavior:
- args.onload.extend (['scroll.js'] + behavior.getByUrl (args.url))
# prepare args for function
distributed = args.distributed
passArgs = vars (args)
del passArgs['distributed']
- del passArgs['behavior']
if distributed:
result = archive.delay (**passArgs)
diff --git a/crocoite/data/scroll.js b/crocoite/data/scroll.js
index 2b4dff1..0d1a4a7 100644
--- a/crocoite/data/scroll.js
+++ b/crocoite/data/scroll.js
@@ -1,5 +1,6 @@
/* Continuously scrolls the page
*/
+var __crocoite_stop__ = false;
(function(){
function scroll (event) {
if (__crocoite_stop__) {
diff --git a/crocoite/util.py b/crocoite/util.py
new file mode 100644
index 0000000..ec257f1
--- /dev/null
+++ b/crocoite/util.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2017 crocoite contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"""
+Random utility functions
+"""
+
+import random
+
+def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
+ if length is None:
+ length = random.randint (16, 32)
+ return ''.join (map (lambda x: random.choice (chars), range (length)))
+
+def packageUrl (path):
+ """
+ Create URL for package data stored into WARC
+ """
+ return 'urn:' + __package__ + ':' + path
+
+def getFormattedViewportMetrics (tab):
+ layoutMetrics = tab.Page.getLayoutMetrics ()
+ # XXX: I’m not entirely sure which one we should use here
+ return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
+ layoutMetrics['layoutViewport']['clientHeight'])
+
diff --git a/crocoite/warc.py b/crocoite/warc.py
index d9afab2..540f673 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -24,8 +24,6 @@ Classes writing data to WARC files
import logging
import json
-from .browser import AccountingSiteLoader
-from . import packageUrl
from http.server import BaseHTTPRequestHandler
from base64 import b64decode
from io import BytesIO
@@ -40,6 +38,9 @@ from queue import Queue
from warcio.timeutils import datetime_to_iso_date
from warcio.warcwriter import WARCWriter
+from .browser import AccountingSiteLoader
+from .util import packageUrl
+
class SerializingWARCWriter (WARCWriter):
"""
Serializing WARC writer using separate writer thread and queue for