summaryrefslogtreecommitdiff
path: root/crocoite
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite')
-rw-r--r--crocoite/browser.py21
-rw-r--r--crocoite/cli.py351
2 files changed, 221 insertions, 151 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 765acbb..3e0e310 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -178,7 +178,7 @@ class SiteLoader:
resp = kwargs['response']
url = urlsplit (resp['url'])
if url.scheme in self.allowedSchemes:
- self.logger.debug ('response {} {}'.format (reqId, resp['url']))
+ self.logger.info ('response {} {}'.format (reqId, resp['url']))
item.setResponse (kwargs)
else:
self.logger.warn ('response: ignoring scheme {}'.format (url.scheme))
@@ -198,13 +198,13 @@ class SiteLoader:
assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url'])
url = urlsplit (resp['url'])
if url.scheme in self.allowedSchemes:
- self.logger.debug ('finished {} {}'.format (reqId, req['url']))
+ self.logger.info ('finished {} {}'.format (reqId, req['url']))
item.encodedDataLength = kwargs['encodedDataLength']
self.loadingFinished (item)
def _loadingFailed (self, **kwargs):
reqId = kwargs['requestId']
- self.logger.debug ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
+ self.logger.warn ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
item = self.requests.pop (reqId, None)
import subprocess
@@ -219,9 +219,18 @@ def ChromeService (binary='google-chrome-stable', host='localhost', port=9222, w
is not required with this method, since reads will block until Chrome is
ready.
"""
- s = socket.socket ()
- s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
- s.bind ((host, port))
+ while True:
+ s = socket.socket ()
+ s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ try:
+ s.bind ((host, port))
+ break
+ except OSError:
+ # try different port
+ if port < 65000:
+ port += 1
+ else:
+ raise
s.listen (10)
userDataDir = mkdtemp ()
args = [binary,
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 8a55269..3527ceb 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -18,159 +18,180 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
-def main ():
- import os, random, logging, argparse
- from io import BytesIO
- from base64 import b64decode
- import pychrome
- from urllib.parse import urlsplit
- from warcio.warcwriter import WARCWriter
- from warcio.statusandheaders import StatusAndHeaders
- from html5lib.serializer import HTMLSerializer
- from . import html, packageData, packageUrl, browser
- from .warc import WarcLoader
- from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
-
- def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
- if length is None:
- length = random.randint (16, 32)
- return ''.join (map (lambda x: random.choice (chars), range (length)))
-
- def getFormattedViewportMetrics (tab):
- layoutMetrics = tab.Page.getLayoutMetrics ()
- # XXX: I’m not entirely sure which one we should use here
- return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
- layoutMetrics['layoutViewport']['clientHeight'])
-
- def writeDOMSnapshot (tab, writer):
- """
- Get a DOM snapshot of tab and write it to WARC.
-
- We could use DOMSnapshot.getSnapshot here, but the API is not stable
- yet. Also computed styles are not really necessary here.
-
- XXX: Currently writes a response, when it should use “resource”. pywb
- can’t handle that though.
- """
- viewport = getFormattedViewportMetrics (tab)
- dom = tab.DOM.getDocument (depth=-1, pierce=True)
- haveUrls = set ()
- for doc in ChromeTreeWalker (dom['root']).split ():
- rawUrl = doc['documentURL']
- if rawUrl in haveUrls:
- # ignore duplicate URLs. they are usually caused by
- # javascript-injected iframes (advertising) with no(?) src
- logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
- continue
- url = urlsplit (rawUrl)
- if url.scheme in ('http', 'https'):
- logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
- haveUrls.add (rawUrl)
- walker = ChromeTreeWalker (doc)
- # remove script, to make the page static and noscript, because at the
- # time we took the snapshot scripts were enabled
- disallowedTags = ['script', 'noscript']
- disallowedAttributes = html.eventAttributes
- stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
- serializer = HTMLSerializer ()
- httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
- record = writer.create_warc_record (doc['documentURL'], 'response',
- payload=BytesIO (serializer.render (stream, 'utf-8')),
- http_headers=httpHeaders,
- warc_headers_dict={'X-DOM-Snapshot': str (True),
- 'X-Chrome-Viewport': viewport})
- writer.write_record (record)
-
- def emulateScreenMetrics (l):
- """
- Emulate different screen sizes, causing the site to fetch assets (img
- srcset and css, for example) for different screen resolutions.
- """
- cssPpi = 96
- sizes = [
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False},
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False},
- # very dense display
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False},
- # just a few samples:
- # 1st gen iPhone (portrait mode)
- {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
- # 6th gen iPhone (portrait mode)
- {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
- # and reset
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
- ]
- for s in sizes:
- l.tab.Emulation.setDeviceMetricsOverride (**s)
- l.wait (1)
- # XXX: this seems to be broken, it does not clear the override
- #tab.Emulation.clearDeviceMetricsOverride ()
- # wait until assets finished loading
- l.waitIdle (2, 10)
-
- def loadScripts (paths, scripts=[]):
- for p in paths:
- if not os.path.exists (p):
- # search for defaults scripts in package data directory
- p = packageData (p)
- with open (p, 'r') as fd:
- scripts.append (fd.read ())
- return '\n'.join (scripts)
-
- def writeScript (path, source, writer):
- record = writer.create_warc_record (packageUrl (path), 'metadata',
- payload=BytesIO (source.encode ('utf8')),
- warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'})
- writer.write_record (record)
-
- def writeScreenshot (tab, writer):
- """
- Create screenshot from tab and write it to WARC
- """
- viewport = getFormattedViewportMetrics (tab)
- data = b64decode (l.tab.Page.captureScreenshot (format='png')['data'])
- record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource',
- payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png',
- 'X-Chrome-Viewport': viewport})
- writer.write_record (record)
-
- logger = logging.getLogger(__name__)
- logging.basicConfig (level=logging.DEBUG)
+"""
+Standalone and Celery command line interface
+"""
- parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
- parser.add_argument('--browser', help='DevTools URL', metavar='URL')
- parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
- parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
- parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES')
- parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
- #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
- parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE')
- parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE')
- parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot')
- parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot')
- parser.add_argument('url', help='Website URL')
- parser.add_argument('output', help='WARC filename')
+import os, random, logging, argparse
+from io import BytesIO
+from datetime import datetime
+from base64 import b64decode
+import pychrome
+from urllib.parse import urlsplit
+from warcio.warcwriter import WARCWriter
+from warcio.statusandheaders import StatusAndHeaders
+from html5lib.serializer import HTMLSerializer
- args = parser.parse_args ()
+from celery import Celery
+from celery.utils.log import get_task_logger
+
+from . import html, packageData, packageUrl
+from .warc import WarcLoader
+from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
+from .browser import ChromeService, NullService
+
+def getFormattedViewportMetrics (tab):
+ layoutMetrics = tab.Page.getLayoutMetrics ()
+ # XXX: I’m not entirely sure which one we should use here
+ return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
+ layoutMetrics['layoutViewport']['clientHeight'])
+
+def writeScript (path, source, writer):
+ record = writer.create_warc_record (packageUrl (path), 'metadata',
+ payload=BytesIO (source.encode ('utf8')),
+ warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'})
+ writer.write_record (record)
+
+def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
+ if length is None:
+ length = random.randint (16, 32)
+ return ''.join (map (lambda x: random.choice (chars), range (length)))
+
+def writeDOMSnapshot (tab, writer):
+ """
+ Get a DOM snapshot of tab and write it to WARC.
+
+ We could use DOMSnapshot.getSnapshot here, but the API is not stable
+ yet. Also computed styles are not really necessary here.
+
+ XXX: Currently writes a response, when it should use “resource”. pywb
+ can’t handle that though.
+ """
+ viewport = getFormattedViewportMetrics (tab)
+ dom = tab.DOM.getDocument (depth=-1, pierce=True)
+ haveUrls = set ()
+ for doc in ChromeTreeWalker (dom['root']).split ():
+ rawUrl = doc['documentURL']
+ if rawUrl in haveUrls:
+ # ignore duplicate URLs. they are usually caused by
+ # javascript-injected iframes (advertising) with no(?) src
+ logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
+ continue
+ url = urlsplit (rawUrl)
+ if url.scheme in ('http', 'https'):
+ logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
+ haveUrls.add (rawUrl)
+ walker = ChromeTreeWalker (doc)
+ # remove script, to make the page static and noscript, because at the
+ # time we took the snapshot scripts were enabled
+ disallowedTags = ['script', 'noscript']
+ disallowedAttributes = html.eventAttributes
+ stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
+ serializer = HTMLSerializer ()
+ httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
+ record = writer.create_warc_record (doc['documentURL'], 'response',
+ payload=BytesIO (serializer.render (stream, 'utf-8')),
+ http_headers=httpHeaders,
+ warc_headers_dict={'X-DOM-Snapshot': str (True),
+ 'X-Chrome-Viewport': viewport})
+ writer.write_record (record)
+
+def emulateScreenMetrics (l):
+ """
+ Emulate different screen sizes, causing the site to fetch assets (img
+ srcset and css, for example) for different screen resolutions.
+ """
+ cssPpi = 96
+ sizes = [
+ {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False},
+ {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False},
+ # very dense display
+ {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False},
+ # just a few samples:
+ # 1st gen iPhone (portrait mode)
+ {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
+ # 6th gen iPhone (portrait mode)
+ {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
+ # and reset
+ {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
+ ]
+ for s in sizes:
+ l.tab.Emulation.setDeviceMetricsOverride (**s)
+ l.wait (1)
+ # XXX: this seems to be broken, it does not clear the override
+ #tab.Emulation.clearDeviceMetricsOverride ()
+ # wait until assets finished loading
+ l.waitIdle (2, 10)
+
+def loadScripts (paths, scripts=[]):
+ for p in paths:
+ if not os.path.exists (p):
+ # search for defaults scripts in package data directory
+ p = packageData (p)
+ with open (p, 'r') as fd:
+ scripts.append (fd.read ())
+ return '\n'.join (scripts)
+
+def writeScreenshot (tab, writer):
+ """
+ Create screenshot from tab and write it to WARC
+ """
+ viewport = getFormattedViewportMetrics (tab)
+ data = b64decode (tab.Page.captureScreenshot (format='png')['data'])
+ record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource',
+ payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png',
+ 'X-Chrome-Viewport': viewport})
+ writer.write_record (record)
+
+# XXX: rabbitmq is hardcoded
+app = Celery ('crocoite.distributed')
+app.config_from_object('celeryconfig')
+logger = get_task_logger('crocoite.distributed.archive')
+
+# defaults can be changed below using argparse; track started state, because tasks are usually long-running
+@app.task(bind=True, track_started=True)
+def archive (self, url, output, onload, onsnapshot, browser,
+ logBuffer, maxBodySize, idleTimeout, timeout, domSnapshot, screenshot):
+ """
+ Archive a single URL
+
+ Supports these config keys (celeryconfig):
+
+ warc_filename = '{domain}-{date}-{id}.warc.gz'
+ temp_dir = '/tmp/'
+ finished_dir = '/tmp/finished'
+ """
+
+ self.update_state (state='PROGRESS', meta={'step': 'start'})
stopVarname = '__' + __package__ + '_stop__'
# avoid sites messing with our scripts by using a random stop variable name
newStopVarname = randomString ()
- onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
+ onload = loadScripts (onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
stopVarname = newStopVarname
- service = browser.ChromeService ()
- if args.browser:
- service = browser.NullService (args.browser)
+ service = ChromeService ()
+ if browser:
+ service = NullService (browser)
- with service as browserUrl:
- browser = pychrome.Browser(url=browserUrl)
+ with service as browser:
+ browser = pychrome.Browser(url=browser)
- fd = open (args.output, 'wb')
+ if not output:
+ parsedUrl = urlsplit (url)
+ outFile = app.conf.warc_filename.format (
+ id=self.request.id,
+ domain=parsedUrl.hostname.replace ('/', '-'),
+ date=datetime.utcnow ().isoformat (),
+ )
+ outPath = os.path.join (app.conf.temp_dir, outFile)
+ fd = open (outPath, 'wb')
+ else:
+ fd = open (output, 'wb')
writer = WARCWriter (fd, gzip=True)
- with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer,
- maxBodySize=args.maxBodySize) as l:
+ with WarcLoader (browser, url, writer, logBuffer=logBuffer,
+ maxBodySize=maxBodySize) as l:
version = l.tab.Browser.getVersion ()
payload = {
'software': __package__,
@@ -187,25 +208,65 @@ def main ():
l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
l.start ()
- l.waitIdle (args.idleTimeout, args.timeout)
+ self.update_state (state='PROGRESS', meta={'step': 'fetch'})
+ l.waitIdle (idleTimeout, timeout)
# get ready for snapshot: stop loading and scripts, disable events
l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
# if we stopped due to timeout, wait for remaining assets
l.waitIdle (2, 10)
+ self.update_state (state='PROGRESS', meta={'step': 'emulateScreenMetrics'})
emulateScreenMetrics (l)
l.stop ()
- if args.domSnapshot:
- script = loadScripts (args.onsnapshot)
+ if domSnapshot:
+ self.update_state (state='PROGRESS', meta={'step': 'domSnapshot'})
+ script = loadScripts (onsnapshot)
writeScript ('onsnapshot', script, writer)
l.tab.Runtime.evaluate (expression=script, returnByValue=True)
writeDOMSnapshot (l.tab, writer)
- if args.screenshot:
+ if screenshot:
+ self.update_state (state='PROGRESS', meta={'step': 'screenshot'})
writeScreenshot (l.tab, writer)
+ if not output:
+ outPath = os.path.join (app.conf.finished_dir, outFile)
+ os.rename (fd.name, outPath)
+ return True
+
+def stateCallback (data):
+ result = data['result']
+ if data['status'] == 'PROGRESS':
+ print (data['task_id'], result['step'])
+
+def main ():
+ parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
+ parser.add_argument('--browser', help='DevTools URL', metavar='URL')
+ parser.add_argument('--distributed', help='Use celery worker', action='store_true')
+ parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
+ parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
+ parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES')
+ parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
+ #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
+ parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE')
+ parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE')
+ parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot')
+ parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot')
+ parser.add_argument('url', help='Website URL')
+ parser.add_argument('output', help='WARC filename')
+
+ args = parser.parse_args ()
+ distributed = args.distributed
+ passArgs = vars (args)
+ del passArgs['distributed']
+
+ if distributed:
+ result = archive.delay (**passArgs)
+ result.get (on_message=stateCallback)
+ else:
+ archive (**passArgs)
return True