summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2017-12-10 12:31:07 +0100
committerLars-Dominik Braun <lars@6xq.net>2017-12-17 16:40:16 +0100
commit84c3f69293fa79d752127410c7468038c907c96a (patch)
tree4a71dcddd6abc6eeda30ed40bd78d91518efde38
parentf816319081d5253974ddb70b655d55f4a880a77a (diff)
downloadcrocoite-84c3f69293fa79d752127410c7468038c907c96a.tar.gz
crocoite-84c3f69293fa79d752127410c7468038c907c96a.tar.bz2
crocoite-84c3f69293fa79d752127410c7468038c907c96a.zip
Add distributed archiving
Using celery. Also adds a plugin for the IRC bot sopel. Code still needs some love, but it should work.
-rw-r--r--README.rst38
-rw-r--r--contrib/celerycrocoite.py144
-rw-r--r--crocoite/browser.py21
-rw-r--r--crocoite/cli.py351
-rw-r--r--setup.py7
5 files changed, 407 insertions, 154 deletions
diff --git a/README.rst b/README.rst
index 3a7aa7c..3d5af5f 100644
--- a/README.rst
+++ b/README.rst
@@ -66,3 +66,41 @@ also saved. This causes its own set of issues though:
- JavaScript-based navigation does not work.
+Distributed crawling
+--------------------
+
+Configure the Celery application through an importable ``celeryconfig.py`` module:
+
+.. code:: python
+
+ broker_url = 'pyamqp://'
+ result_backend = 'rpc://'
+ warc_filename = '{domain}-{date}-{id}.warc.gz'
+ temp_dir = '/tmp/'
+ finished_dir = '/tmp/finished'
+
+Start a Celery worker::
+
+ celery -A crocoite.cli worker --loglevel=info
+
+Then queue an archive job::
+
+ crocoite-standalone --distributed …
+
+Alternatively, jobs can be queued from IRC with the `sopel <https://sopel.chat/>`__ bot, using the plugin in ``contrib/celerycrocoite.py``.
+
+~/.sopel/default.cfg
+
+.. code:: ini
+
+ [core]
+ nick = chromebot
+ host = irc.efnet.fr
+ port = 6667
+ owner = someone
+ extra = /path/to/crocoite/contrib
+ enable = celerycrocoite
+ channels = #somechannel
+
+Then, in #somechannel, say ``chromebot: ao <url>`` to archive a page (the command requires at least voice in the channel).
+
diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py
new file mode 100644
index 0000000..8fab046
--- /dev/null
+++ b/contrib/celerycrocoite.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2017 crocoite contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"""
+Module for Sopel IRC bot
+"""
+
+import os, logging
+from sopel.module import nickname_commands, require_chanmsg, thread, example, require_privilege, VOICE
+from sopel.tools import Identifier, SopelMemory
+import celery
+from urllib.parse import urlsplit
+
+import crocoite.cli
+
def setup (bot):
    """
    Create this module's entry in the bot memory with an empty job table

    Presumably called by Sopel once when the module is loaded.
    """
    memory = SopelMemory ()
    bot.memory['crocoite'] = memory
    # job id -> {'handle', 'trigger', 'state'}, filled by the archive command
    memory['jobs'] = {}
+
def isValidUrl (s):
    """
    Check whether s is an absolute http or https URL

    :param s: URL string, as typed by a user
    :return: True if s can be parsed, uses the http or https scheme and has
        a network location; False otherwise
    """
    try:
        url = urlsplit (s)
    except ValueError:
        # urlsplit raises for malformed input such as an unbalanced IPv6
        # bracket ("http://[foo"); treat that as just another invalid URL
        return False
    # bool () so callers always get True/False, never an empty string
    return url.scheme in {'http', 'https'} and bool (url.netloc)
+
@nickname_commands ('ao', 'archiveonly')
@require_chanmsg ()
@require_privilege (VOICE)
@thread (True)
@example ('ao http://example.com')
def archive (bot, trigger):
    """
    Archive a single page (no recursion) to WARC
    """

    # Flow: validate the URL, queue a crocoite.cli.archive task, register it
    # in the shared job table, then block until the task ends and report the
    # outcome to the channel.
    url = trigger.group(2)
    if not url:
        bot.reply ('Need a URL')
        return
    if not isValidUrl (url):
        bot.reply ('{} is not a valid URL'.format (url))
        return

    # keyword arguments of the crocoite.cli.archive task
    taskArgs = {
        'url': url,
        'output': None,
        'onload': ['scroll.js'],
        'onsnapshot': [],
        'browser': None,
        'logBuffer': 1000,
        'maxBodySize': 10*1024*1024,
        'idleTimeout': 10,
        # 1 hour
        'timeout': 1*60*60,
        'domSnapshot': False,
        'screenshot': False,
        }

    handle = crocoite.cli.archive.delay (**taskArgs)
    jobId = handle.id
    jobs = bot.memory['crocoite']['jobs']
    job = {'handle': handle, 'trigger': trigger, 'state': {}}
    jobs[jobId] = job
    bot.reply ('{} has been queued as {}'.format (url, jobId))

    # XXX: for some reason we cannot access the job's state through handle,
    # instead use a callback quirk
    def rememberState (data):
        job['state'] = data

    try:
        handle.get (on_message=rememberState)
        bot.reply ('{} ({}) finished'.format (url, jobId))
    except Exception as e:
        # json serialization does not work well with exceptions. If their class
        # names are unique we can still distinguish them.
        if type (e).__name__ == 'TaskRevokedError':
            bot.reply ('{} ({}) was revoked'.format (url, jobId))
        else:
            bot.reply ('{} ({}) failed'.format (url, jobId))
            logging.exception ('{} ({}) failed'.format (url, jobId))
    finally:
        # the job is over either way, drop it from the table
        del jobs[jobId]
+
@nickname_commands ('s', 'status')
@example ('s c251f09e-3c26-481f-96e0-4b5f58bd1170')
@require_chanmsg ()
def status (bot, trigger):
    """
    Retrieve status for a job
    """

    jobs = bot.memory['crocoite']['jobs']

    jobId = trigger.group(2)
    # One lookup instead of "in" followed by indexing: archive () runs in its
    # own thread and deletes the entry once the job ends, so the job could
    # vanish between the two steps and raise KeyError.
    job = jobs.get (jobId) if jobId else None
    if job is None:
        bot.reply ("Job not found.")
        return

    # last message stored by the archive command's on_message callback
    state = job['state']
    queuedBy = job['trigger']
    bot.reply ('{}: {}, queued {}, by {}'.format (job['handle'].id,
            state.get ('status', 'UNKNOWN'), queuedBy.time, queuedBy.nick))
+
@nickname_commands ('r', 'revoke')
@example ('r c251f09e-3c26-481f-96e0-4b5f58bd1170')
@require_privilege (VOICE)
@require_chanmsg ()
def revoke (bot, trigger):
    """
    Cancel (revoke) a job
    """

    jobs = bot.memory['crocoite']['jobs']

    jobId = trigger.group(2)
    # One lookup rather than "in" plus indexing: archive () removes finished
    # jobs from its own thread, which could otherwise raise KeyError here.
    job = jobs.get (jobId) if jobId else None
    if job is None:
        bot.reply ("Job not found.")
        return

    job['handle'].revoke (terminate=True)
    # response is handled by long-running initiation thread
+
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 765acbb..3e0e310 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -178,7 +178,7 @@ class SiteLoader:
resp = kwargs['response']
url = urlsplit (resp['url'])
if url.scheme in self.allowedSchemes:
- self.logger.debug ('response {} {}'.format (reqId, resp['url']))
+ self.logger.info ('response {} {}'.format (reqId, resp['url']))
item.setResponse (kwargs)
else:
self.logger.warn ('response: ignoring scheme {}'.format (url.scheme))
@@ -198,13 +198,13 @@ class SiteLoader:
assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url'])
url = urlsplit (resp['url'])
if url.scheme in self.allowedSchemes:
- self.logger.debug ('finished {} {}'.format (reqId, req['url']))
+ self.logger.info ('finished {} {}'.format (reqId, req['url']))
item.encodedDataLength = kwargs['encodedDataLength']
self.loadingFinished (item)
def _loadingFailed (self, **kwargs):
reqId = kwargs['requestId']
- self.logger.debug ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
+ self.logger.warn ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
item = self.requests.pop (reqId, None)
import subprocess
@@ -219,9 +219,18 @@ def ChromeService (binary='google-chrome-stable', host='localhost', port=9222, w
is not required with this method, since reads will block until Chrome is
ready.
"""
- s = socket.socket ()
- s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
- s.bind ((host, port))
+ while True:
+ s = socket.socket ()
+ s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ try:
+ s.bind ((host, port))
+ break
+ except OSError:
+ # try different port
+ if port < 65000:
+ port += 1
+ else:
+ raise
s.listen (10)
userDataDir = mkdtemp ()
args = [binary,
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 8a55269..3527ceb 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -18,159 +18,180 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
-def main ():
- import os, random, logging, argparse
- from io import BytesIO
- from base64 import b64decode
- import pychrome
- from urllib.parse import urlsplit
- from warcio.warcwriter import WARCWriter
- from warcio.statusandheaders import StatusAndHeaders
- from html5lib.serializer import HTMLSerializer
- from . import html, packageData, packageUrl, browser
- from .warc import WarcLoader
- from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
-
- def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
- if length is None:
- length = random.randint (16, 32)
- return ''.join (map (lambda x: random.choice (chars), range (length)))
-
- def getFormattedViewportMetrics (tab):
- layoutMetrics = tab.Page.getLayoutMetrics ()
- # XXX: I’m not entirely sure which one we should use here
- return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
- layoutMetrics['layoutViewport']['clientHeight'])
-
- def writeDOMSnapshot (tab, writer):
- """
- Get a DOM snapshot of tab and write it to WARC.
-
- We could use DOMSnapshot.getSnapshot here, but the API is not stable
- yet. Also computed styles are not really necessary here.
-
- XXX: Currently writes a response, when it should use “resource”. pywb
- can’t handle that though.
- """
- viewport = getFormattedViewportMetrics (tab)
- dom = tab.DOM.getDocument (depth=-1, pierce=True)
- haveUrls = set ()
- for doc in ChromeTreeWalker (dom['root']).split ():
- rawUrl = doc['documentURL']
- if rawUrl in haveUrls:
- # ignore duplicate URLs. they are usually caused by
- # javascript-injected iframes (advertising) with no(?) src
- logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
- continue
- url = urlsplit (rawUrl)
- if url.scheme in ('http', 'https'):
- logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
- haveUrls.add (rawUrl)
- walker = ChromeTreeWalker (doc)
- # remove script, to make the page static and noscript, because at the
- # time we took the snapshot scripts were enabled
- disallowedTags = ['script', 'noscript']
- disallowedAttributes = html.eventAttributes
- stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
- serializer = HTMLSerializer ()
- httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
- record = writer.create_warc_record (doc['documentURL'], 'response',
- payload=BytesIO (serializer.render (stream, 'utf-8')),
- http_headers=httpHeaders,
- warc_headers_dict={'X-DOM-Snapshot': str (True),
- 'X-Chrome-Viewport': viewport})
- writer.write_record (record)
-
- def emulateScreenMetrics (l):
- """
- Emulate different screen sizes, causing the site to fetch assets (img
- srcset and css, for example) for different screen resolutions.
- """
- cssPpi = 96
- sizes = [
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False},
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False},
- # very dense display
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False},
- # just a few samples:
- # 1st gen iPhone (portrait mode)
- {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
- # 6th gen iPhone (portrait mode)
- {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
- # and reset
- {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
- ]
- for s in sizes:
- l.tab.Emulation.setDeviceMetricsOverride (**s)
- l.wait (1)
- # XXX: this seems to be broken, it does not clear the override
- #tab.Emulation.clearDeviceMetricsOverride ()
- # wait until assets finished loading
- l.waitIdle (2, 10)
-
- def loadScripts (paths, scripts=[]):
- for p in paths:
- if not os.path.exists (p):
- # search for defaults scripts in package data directory
- p = packageData (p)
- with open (p, 'r') as fd:
- scripts.append (fd.read ())
- return '\n'.join (scripts)
-
- def writeScript (path, source, writer):
- record = writer.create_warc_record (packageUrl (path), 'metadata',
- payload=BytesIO (source.encode ('utf8')),
- warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'})
- writer.write_record (record)
-
- def writeScreenshot (tab, writer):
- """
- Create screenshot from tab and write it to WARC
- """
- viewport = getFormattedViewportMetrics (tab)
- data = b64decode (l.tab.Page.captureScreenshot (format='png')['data'])
- record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource',
- payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png',
- 'X-Chrome-Viewport': viewport})
- writer.write_record (record)
-
- logger = logging.getLogger(__name__)
- logging.basicConfig (level=logging.DEBUG)
+"""
+Standalone and Celery command line interface
+"""
- parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
- parser.add_argument('--browser', help='DevTools URL', metavar='URL')
- parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
- parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
- parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES')
- parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
- #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
- parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE')
- parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE')
- parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot')
- parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot')
- parser.add_argument('url', help='Website URL')
- parser.add_argument('output', help='WARC filename')
+import os, random, logging, argparse
+from io import BytesIO
+from datetime import datetime
+from base64 import b64decode
+import pychrome
+from urllib.parse import urlsplit
+from warcio.warcwriter import WARCWriter
+from warcio.statusandheaders import StatusAndHeaders
+from html5lib.serializer import HTMLSerializer
- args = parser.parse_args ()
+from celery import Celery
+from celery.utils.log import get_task_logger
+
+from . import html, packageData, packageUrl
+from .warc import WarcLoader
+from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
+from .browser import ChromeService, NullService
+
def getFormattedViewportMetrics (tab):
    """
    Return the size of tab's layout viewport formatted as 'WIDTHxHEIGHT'

    Values are taken from Page.getLayoutMetrics.
    """
    # XXX: I'm not entirely sure which one we should use here
    viewport = tab.Page.getLayoutMetrics ()['layoutViewport']
    return '{}x{}'.format (viewport['clientWidth'], viewport['clientHeight'])
+
def writeScript (path, source, writer):
    """
    Store JavaScript source in the WARC as a metadata record

    :param path: name of the script, turned into the record URL by packageUrl
    :param source: script text, written UTF-8 encoded
    :param writer: WARC writer that receives the record
    """
    target = packageUrl (path)
    body = BytesIO (source.encode ('utf8'))
    headers = {'Content-Type': 'application/javascript; charset=utf-8'}
    record = writer.create_warc_record (target, 'metadata', payload=body,
            warc_headers_dict=headers)
    writer.write_record (record)
+
def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
    """
    Build a random string

    Uses the random module, so the result is not suitable for secrets.

    :param length: number of characters; a random value from 16 to 32
        (inclusive) is picked if None
    :param chars: characters to draw from, with replacement
    :return: string of length characters taken from chars
    """
    if length is None:
        length = random.randint (16, 32)
    return ''.join (random.choice (chars) for _ in range (length))
+
def writeDOMSnapshot (tab, writer):
    """
    Serialize the DOM of every http(s) document in tab and write it to WARC.

    Each document URL is written at most once. Scripts, noscript elements
    and event handler attributes are stripped from the serialized HTML.

    DOMSnapshot.getSnapshot could be used instead, but its API is not stable
    yet and computed styles are not really needed here.

    XXX: Currently writes a response, when it should use "resource". pywb
    can't handle that though.
    """
    viewport = getFormattedViewportMetrics (tab)
    dom = tab.DOM.getDocument (depth=-1, pierce=True)
    seen = set ()
    for doc in ChromeTreeWalker (dom['root']).split ():
        docUrl = doc['documentURL']
        if docUrl in seen:
            # ignore duplicate URLs. they are usually caused by
            # javascript-injected iframes (advertising) with no(?) src
            logger.warning ('have DOM snapshot for URL {}, ignoring'.format (docUrl))
            continue
        if urlsplit (docUrl).scheme not in ('http', 'https'):
            continue

        logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
        seen.add (docUrl)
        # remove script, to make the page static and noscript, because at the
        # time we took the snapshot scripts were enabled
        withoutTags = StripTagFilter (ChromeTreeWalker (doc), ['script', 'noscript'])
        stream = StripAttributeFilter (withoutTags, html.eventAttributes)
        markup = HTMLSerializer ().render (stream, 'utf-8')
        record = writer.create_warc_record (doc['documentURL'], 'response',
                payload=BytesIO (markup),
                http_headers=StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1'),
                warc_headers_dict={'X-DOM-Snapshot': str (True),
                        'X-Chrome-Viewport': viewport})
        writer.write_record (record)
+
def emulateScreenMetrics (l):
    """
    Switch the tab of loader l through several emulated displays, so the
    site fetches assets (img srcset and css, for example) meant for other
    screen resolutions.
    """
    cssPpi = 96

    def display (width, height, scale, mobile=False):
        # keyword arguments for Emulation.setDeviceMetricsOverride
        return {'width': width, 'height': height,
                'deviceScaleFactor': scale, 'mobile': mobile}

    displays = (
        display (1920, 1080, 1.5),
        display (1920, 1080, 2),
        # very dense display
        display (1920, 1080, 4),
        # just a few samples:
        # 1st gen iPhone (portrait mode)
        display (320, 480, 163/cssPpi, True),
        # 6th gen iPhone (portrait mode)
        display (750, 1334, 326/cssPpi, True),
        # and reset
        display (1920, 1080, 1),
        )
    for override in displays:
        l.tab.Emulation.setDeviceMetricsOverride (**override)
        l.wait (1)
    # XXX: this seems to be broken, it does not clear the override
    #tab.Emulation.clearDeviceMetricsOverride ()
    # wait until assets finished loading
    l.waitIdle (2, 10)
+
def loadScripts (paths, scripts=None):
    """
    Read script files and join them into a single source string

    :param paths: file names; a name that does not exist on disk is looked
        up in the package data directory instead
    :param scripts: optional list of source strings placed before the file
        contents; it is not modified
    :return: all sources joined by newlines
    """
    # Work on a copy. Appending to a mutable default (or to the caller's
    # list) would make scripts pile up from one call to the next, e.g.
    # across tasks of a long-running worker.
    sources = list (scripts) if scripts is not None else []
    for p in paths:
        if not os.path.exists (p):
            # search for defaults scripts in package data directory
            p = packageData (p)
        with open (p, 'r') as fd:
            sources.append (fd.read ())
    return '\n'.join (sources)
+
def writeScreenshot (tab, writer):
    """
    Capture a PNG screenshot of tab and write it to WARC as a resource record
    """
    warcHeaders = {
        'Content-Type': 'image/png',
        'X-Chrome-Viewport': getFormattedViewportMetrics (tab),
        }
    # the screenshot is returned base64 encoded
    encoded = tab.Page.captureScreenshot (format='png')['data']
    record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource',
            payload=BytesIO (b64decode (encoded)), warc_headers_dict=warcHeaders)
    writer.write_record (record)
+
+# XXX: rabbitmq is hardcoded
+app = Celery ('crocoite.distributed')
+app.config_from_object('celeryconfig')
+logger = get_task_logger('crocoite.distributed.archive')
+
+# defaults can be changed below using argparse; track started state, because tasks are usually long-running
+@app.task(bind=True, track_started=True)
+def archive (self, url, output, onload, onsnapshot, browser,
+ logBuffer, maxBodySize, idleTimeout, timeout, domSnapshot, screenshot):
+ """
+ Archive a single URL
+
+ Supports these config keys (celeryconfig):
+
+ warc_filename = '{domain}-{date}-{id}.warc.gz'
+ temp_dir = '/tmp/'
+ finished_dir = '/tmp/finished'
+ """
+
+ self.update_state (state='PROGRESS', meta={'step': 'start'})
stopVarname = '__' + __package__ + '_stop__'
# avoid sites messing with our scripts by using a random stop variable name
newStopVarname = randomString ()
- onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
+ onload = loadScripts (onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
stopVarname = newStopVarname
- service = browser.ChromeService ()
- if args.browser:
- service = browser.NullService (args.browser)
+ service = ChromeService ()
+ if browser:
+ service = NullService (browser)
- with service as browserUrl:
- browser = pychrome.Browser(url=browserUrl)
+ with service as browser:
+ browser = pychrome.Browser(url=browser)
- fd = open (args.output, 'wb')
+ if not output:
+ parsedUrl = urlsplit (url)
+ outFile = app.conf.warc_filename.format (
+ id=self.request.id,
+ domain=parsedUrl.hostname.replace ('/', '-'),
+ date=datetime.utcnow ().isoformat (),
+ )
+ outPath = os.path.join (app.conf.temp_dir, outFile)
+ fd = open (outPath, 'wb')
+ else:
+ fd = open (output, 'wb')
writer = WARCWriter (fd, gzip=True)
- with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer,
- maxBodySize=args.maxBodySize) as l:
+ with WarcLoader (browser, url, writer, logBuffer=logBuffer,
+ maxBodySize=maxBodySize) as l:
version = l.tab.Browser.getVersion ()
payload = {
'software': __package__,
@@ -187,25 +208,65 @@ def main ():
l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
l.start ()
- l.waitIdle (args.idleTimeout, args.timeout)
+ self.update_state (state='PROGRESS', meta={'step': 'fetch'})
+ l.waitIdle (idleTimeout, timeout)
# get ready for snapshot: stop loading and scripts, disable events
l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
# if we stopped due to timeout, wait for remaining assets
l.waitIdle (2, 10)
+ self.update_state (state='PROGRESS', meta={'step': 'emulateScreenMetrics'})
emulateScreenMetrics (l)
l.stop ()
- if args.domSnapshot:
- script = loadScripts (args.onsnapshot)
+ if domSnapshot:
+ self.update_state (state='PROGRESS', meta={'step': 'domSnapshot'})
+ script = loadScripts (onsnapshot)
writeScript ('onsnapshot', script, writer)
l.tab.Runtime.evaluate (expression=script, returnByValue=True)
writeDOMSnapshot (l.tab, writer)
- if args.screenshot:
+ if screenshot:
+ self.update_state (state='PROGRESS', meta={'step': 'screenshot'})
writeScreenshot (l.tab, writer)
+ if not output:
+ outPath = os.path.join (app.conf.finished_dir, outFile)
+ os.rename (fd.name, outPath)
+ return True
+
def stateCallback (data):
    """
    Print progress updates received for a running task

    :param data: message dict handed to the on_message callback; the keys
        read here are 'status', 'task_id' and, for PROGRESS only, 'result'
    """
    if data['status'] != 'PROGRESS':
        return
    # Only touch the payload for PROGRESS messages. Those are the ones the
    # archive task sends with a {'step': ...} dict; other messages may carry
    # a different result or none at all.
    print (data['task_id'], data['result']['step'])
+
def main ():
    """
    Command line entry point

    Parses the command line and archives the given URL, either in this
    process or, with --distributed, by queueing the archive task and
    printing its progress until it is done.

    :return: 0, used as process exit status by the console script
    """
    parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
    parser.add_argument('--browser', help='DevTools URL', metavar='URL')
    parser.add_argument('--distributed', help='Use celery worker', action='store_true')
    parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
    parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
    parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES')
    parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
    #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
    parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE')
    parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE')
    parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot')
    parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot')
    parser.add_argument('url', help='Website URL')
    parser.add_argument('output', help='WARC filename')

    args = parser.parse_args ()
    # The remaining options map one to one onto archive's parameters. Work
    # on a copy so the parsed namespace itself stays untouched.
    passArgs = dict (vars (args))
    distributed = passArgs.pop ('distributed')

    if distributed:
        result = archive.delay (**passArgs)
        result.get (on_message=stateCallback)
    else:
        archive (**passArgs)
    # The console script wrapper hands this value to sys.exit (); returning
    # True would be reported as exit status 1, i.e. failure.
    return 0
diff --git a/setup.py b/setup.py
index 52747c0..0ab9249 100644
--- a/setup.py
+++ b/setup.py
@@ -13,13 +13,14 @@ setup(
'pychrome',
'warcio',
'html5lib>=0.999999999',
+ 'Celery',
],
entry_points={
'console_scripts': [
'crocoite-standalone = crocoite.cli:main',
],
},
- data_files=[
- ('crocoite/data', ['crocoite/data/onload.js']),
- ],
+ package_data={
+ 'crocoite': ['data/*'],
+ },
)