From 84c3f69293fa79d752127410c7468038c907c96a Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Sun, 10 Dec 2017 12:31:07 +0100
Subject: Add distributed archiving

Using celery. Also adds a plugin for the IRC bot sopel. The code still
needs some love, but it should work.
---
 README.rst                |  38 +++++
 contrib/celerycrocoite.py | 144 +++++++++++++++++++
 crocoite/browser.py       |  21 ++-
 crocoite/cli.py           | 351 +++++++++++++++++++++++++++-------------------
 setup.py                  |   7 +-
 5 files changed, 407 insertions(+), 154 deletions(-)
 create mode 100644 contrib/celerycrocoite.py

diff --git a/README.rst b/README.rst
index 3a7aa7c..3d5af5f 100644
--- a/README.rst
+++ b/README.rst
@@ -66,3 +66,41 @@ also saved. This causes its own set of issues though:
 
 - JavaScript-based navigation does not work.
 
+Distributed crawling
+--------------------
+
+Configure Celery with a module ``celeryconfig.py`` on the worker's Python
+path (usually the current working directory):
+
+.. code:: python
+
+    broker_url = 'pyamqp://'
+    result_backend = 'rpc://'
+    warc_filename = '{domain}-{date}-{id}.warc.gz'
+    temp_dir = '/tmp/'
+    finished_dir = '/tmp/finished'
+
+Start a Celery worker::
+
+    celery -A crocoite.cli worker --loglevel=info
+
+Then queue an archive job::
+
+    crocoite-standalone --distributed …
+
+Alternative: an IRC bot using sopel_ and the plugin contrib/celerycrocoite.py.
+
+Configure it via ~/.sopel/default.cfg:
+
+.. code:: ini
+
+    [core]
+    nick = chromebot
+    host = irc.efnet.fr
+    port = 6667
+    owner = someone
+    extra = /path/to/crocoite/contrib
+    enable = celerycrocoite
+    channels = #somechannel
+
+Then, in #somechannel, queue a job with ``chromebot: ao <url>``
+
+.. _sopel: https://sopel.chat/
+
diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py
new file mode 100644
index 0000000..8fab046
--- /dev/null
+++ b/contrib/celerycrocoite.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2017 crocoite contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"""
+Module for the Sopel IRC bot: queues crocoite archive jobs via Celery
+"""
+
+import logging
+from sopel.module import nickname_commands, require_chanmsg, thread, example, require_privilege, VOICE
+from sopel.tools import SopelMemory
+from urllib.parse import urlsplit
+
+import crocoite.cli
+
+def setup (bot):
+    m = bot.memory['crocoite'] = SopelMemory ()
+    m['jobs'] = {}
+
+def isValidUrl (s):
+    url = urlsplit (s)
+    return url.scheme in {'http', 'https'} and bool (url.netloc)
+
+@nickname_commands ('ao', 'archiveonly')
+@require_chanmsg ()
+@require_privilege (VOICE)
+@thread (True)
+@example ('ao http://example.com')
+def archive (bot, trigger):
+    """
+    Archive a single page (no recursion) to WARC
+    """
+
+    def updateState (job, data):
+        job['state'] = data
+
+    url = trigger.group(2)
+    if not url:
+        bot.reply ('Need a URL')
+        return
+    if not isValidUrl (url):
+        bot.reply ('{} is not a valid URL'.format (url))
+        return
+
+    args = {
+            'url': url,
+            'output': None,
+            'onload': ['scroll.js'],
+            'onsnapshot': [],
+            'browser': None,
+            'logBuffer': 1000,
+            'maxBodySize': 10*1024*1024,
+            'idleTimeout': 10,
+            # 1 hour
+            'timeout': 1*60*60,
+            'domSnapshot': False,
+            'screenshot': False,
+            }
+
+    handle = crocoite.cli.archive.delay (**args)
+    m = bot.memory['crocoite']
+    jobs = m['jobs']
+    # XXX: we cannot read the job’s state through handle directly; instead the
+    # on_message callback below records every state update in jobs[handle.id]
+    j = jobs[handle.id] = {'handle': handle, 'trigger': trigger, 'state': {}}
+    bot.reply ('{} has been queued as {}'.format (url, handle.id))
+    try:
+        handle.get (on_message=lambda x: updateState (j, x))
+        bot.reply ('{} ({}) finished'.format (url, handle.id))
+    except Exception as e:
+        # json serialization does not work well with exceptions. If their class
+        # names are unique we can still distinguish them.
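+        # (Matching on type (e).__name__ rather than on the class itself keeps
+        # this check working even if the result backend rebuilds the exception
+        # from its serialized form.)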
+        ename = type (e).__name__
+        if ename == 'TaskRevokedError':
+            bot.reply ('{} ({}) was revoked'.format (url, handle.id))
+        else:
+            bot.reply ('{} ({}) failed'.format (url, handle.id))
+            logging.exception ('{} ({}) failed'.format (url, handle.id))
+    finally:
+        del jobs[handle.id]
+
+@nickname_commands ('s', 'status')
+@example ('s c251f09e-3c26-481f-96e0-4b5f58bd1170')
+@require_chanmsg ()
+def status (bot, trigger):
+    """
+    Retrieve status for a job
+    """
+
+    m = bot.memory['crocoite']
+    jobs = m['jobs']
+
+    i = trigger.group(2)
+    if not i or i not in jobs:
+        bot.reply ('Job not found.')
+        return
+
+    j = jobs[i]
+    jtrigger = j['trigger']
+    jhandle = j['handle']
+    jstate = j['state']
+    bot.reply ('{}: {}, queued {}, by {}'.format (jhandle.id,
+            jstate.get ('status', 'UNKNOWN'), jtrigger.time, jtrigger.nick))
+
+@nickname_commands ('r', 'revoke')
+@example ('r c251f09e-3c26-481f-96e0-4b5f58bd1170')
+@require_privilege (VOICE)
+@require_chanmsg ()
+def revoke (bot, trigger):
+    """
+    Cancel (revoke) a job
+    """
+
+    m = bot.memory['crocoite']
+    jobs = m['jobs']
+
+    i = trigger.group(2)
+    if not i or i not in jobs:
+        bot.reply ('Job not found.')
+        return
+
+    j = jobs[i]
+    jhandle = j['handle']
+    jhandle.revoke (terminate=True)
+    # the final reply is sent by the long-running archive thread, whose
+    # handle.get () raises TaskRevokedError once the revocation takes effect
+
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 765acbb..3e0e310 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -178,7 +178,7 @@ class SiteLoader:
             resp = kwargs['response']
             url = urlsplit (resp['url'])
             if url.scheme in self.allowedSchemes:
-                self.logger.debug ('response {} {}'.format (reqId, resp['url']))
+                self.logger.info ('response {} {}'.format (reqId, resp['url']))
                 item.setResponse (kwargs)
             else:
                 self.logger.warn ('response: ignoring scheme {}'.format (url.scheme))
@@ -198,13 +198,13 @@ class SiteLoader:
             assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url'])
             url = urlsplit (resp['url'])
             if url.scheme in self.allowedSchemes:
-                self.logger.debug ('finished {} {}'.format (reqId, req['url']))
+                self.logger.info ('finished {} {}'.format (reqId, req['url']))
                 item.encodedDataLength = kwargs['encodedDataLength']
                 self.loadingFinished (item)
 
     def _loadingFailed (self, **kwargs):
         reqId = kwargs['requestId']
-        self.logger.debug ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
+        self.logger.warn ('failed {} {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
         item = self.requests.pop (reqId, None)
 
 import subprocess
@@ -219,9 +219,18 @@ def ChromeService (binary='google-chrome-stable', host='localhost', port=9222, w
     is not required with this method, since reads will block until Chrome is
     ready.
     """
-    s = socket.socket ()
-    s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-    s.bind ((host, port))
+    while True:
+        s = socket.socket ()
+        s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        try:
+            s.bind ((host, port))
+            break
+        except OSError:
+            # port is taken, try the next one
+            if port < 65000:
+                port += 1
+            else:
+                raise
     s.listen (10)
     userDataDir = mkdtemp ()
     args = [binary,
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 8a55269..3527ceb 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -18,159 +18,180 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
-def main (): - import os, random, logging, argparse - from io import BytesIO - from base64 import b64decode - import pychrome - from urllib.parse import urlsplit - from warcio.warcwriter import WARCWriter - from warcio.statusandheaders import StatusAndHeaders - from html5lib.serializer import HTMLSerializer - from . import html, packageData, packageUrl, browser - from .warc import WarcLoader - from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker - - def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): - if length is None: - length = random.randint (16, 32) - return ''.join (map (lambda x: random.choice (chars), range (length))) - - def getFormattedViewportMetrics (tab): - layoutMetrics = tab.Page.getLayoutMetrics () - # XXX: I’m not entirely sure which one we should use here - return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'], - layoutMetrics['layoutViewport']['clientHeight']) - - def writeDOMSnapshot (tab, writer): - """ - Get a DOM snapshot of tab and write it to WARC. - - We could use DOMSnapshot.getSnapshot here, but the API is not stable - yet. Also computed styles are not really necessary here. - - XXX: Currently writes a response, when it should use “resource”. pywb - can’t handle that though. - """ - viewport = getFormattedViewportMetrics (tab) - dom = tab.DOM.getDocument (depth=-1, pierce=True) - haveUrls = set () - for doc in ChromeTreeWalker (dom['root']).split (): - rawUrl = doc['documentURL'] - if rawUrl in haveUrls: - # ignore duplicate URLs. they are usually caused by - # javascript-injected iframes (advertising) with no(?) src - logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl)) - continue - url = urlsplit (rawUrl) - if url.scheme in ('http', 'https'): - logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL'])) - haveUrls.add (rawUrl) - walker = ChromeTreeWalker (doc) - # remove script, to make the page static and noscript, because at the - # time we took the snapshot scripts were enabled - disallowedTags = ['script', 'noscript'] - disallowedAttributes = html.eventAttributes - stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes) - serializer = HTMLSerializer () - httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') - record = writer.create_warc_record (doc['documentURL'], 'response', - payload=BytesIO (serializer.render (stream, 'utf-8')), - http_headers=httpHeaders, - warc_headers_dict={'X-DOM-Snapshot': str (True), - 'X-Chrome-Viewport': viewport}) - writer.write_record (record) - - def emulateScreenMetrics (l): - """ - Emulate different screen sizes, causing the site to fetch assets (img - srcset and css, for example) for different screen resolutions. 
- """ - cssPpi = 96 - sizes = [ - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False}, - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False}, - # very dense display - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False}, - # just a few samples: - # 1st gen iPhone (portrait mode) - {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True}, - # 6th gen iPhone (portrait mode) - {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True}, - # and reset - {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False}, - ] - for s in sizes: - l.tab.Emulation.setDeviceMetricsOverride (**s) - l.wait (1) - # XXX: this seems to be broken, it does not clear the override - #tab.Emulation.clearDeviceMetricsOverride () - # wait until assets finished loading - l.waitIdle (2, 10) - - def loadScripts (paths, scripts=[]): - for p in paths: - if not os.path.exists (p): - # search for defaults scripts in package data directory - p = packageData (p) - with open (p, 'r') as fd: - scripts.append (fd.read ()) - return '\n'.join (scripts) - - def writeScript (path, source, writer): - record = writer.create_warc_record (packageUrl (path), 'metadata', - payload=BytesIO (source.encode ('utf8')), - warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'}) - writer.write_record (record) - - def writeScreenshot (tab, writer): - """ - Create screenshot from tab and write it to WARC - """ - viewport = getFormattedViewportMetrics (tab) - data = b64decode (l.tab.Page.captureScreenshot (format='png')['data']) - record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource', - payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png', - 'X-Chrome-Viewport': viewport}) - writer.write_record (record) - - logger = logging.getLogger(__name__) - logging.basicConfig (level=logging.DEBUG) +""" +Standalone and Celery command line interface +""" - parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') - parser.add_argument('--browser', help='DevTools URL', metavar='URL') - parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC') - parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. 
no requests)', dest='idleTimeout', metavar='SEC') - parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES') - parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES') - #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open') - parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE') - parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE') - parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot') - parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot') - parser.add_argument('url', help='Website URL') - parser.add_argument('output', help='WARC filename') +import os, random, logging, argparse +from io import BytesIO +from datetime import datetime +from base64 import b64decode +import pychrome +from urllib.parse import urlsplit +from warcio.warcwriter import WARCWriter +from warcio.statusandheaders import StatusAndHeaders +from html5lib.serializer import HTMLSerializer - args = parser.parse_args () +from celery import Celery +from celery.utils.log import get_task_logger + +from . import html, packageData, packageUrl +from .warc import WarcLoader +from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker +from .browser import ChromeService, NullService + +def getFormattedViewportMetrics (tab): + layoutMetrics = tab.Page.getLayoutMetrics () + # XXX: I’m not entirely sure which one we should use here + return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'], + layoutMetrics['layoutViewport']['clientHeight']) + +def writeScript (path, source, writer): + record = writer.create_warc_record (packageUrl (path), 'metadata', + payload=BytesIO (source.encode ('utf8')), + warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'}) + writer.write_record (record) + +def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): + if length is None: + length = random.randint (16, 32) + return ''.join (map (lambda x: random.choice (chars), range (length))) + +def writeDOMSnapshot (tab, writer): + """ + Get a DOM snapshot of tab and write it to WARC. + + We could use DOMSnapshot.getSnapshot here, but the API is not stable + yet. Also computed styles are not really necessary here. + + XXX: Currently writes a response, when it should use “resource”. pywb + can’t handle that though. + """ + viewport = getFormattedViewportMetrics (tab) + dom = tab.DOM.getDocument (depth=-1, pierce=True) + haveUrls = set () + for doc in ChromeTreeWalker (dom['root']).split (): + rawUrl = doc['documentURL'] + if rawUrl in haveUrls: + # ignore duplicate URLs. they are usually caused by + # javascript-injected iframes (advertising) with no(?) 
src
+            logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
+            continue
+        url = urlsplit (rawUrl)
+        if url.scheme in ('http', 'https'):
+            logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
+            haveUrls.add (rawUrl)
+            walker = ChromeTreeWalker (doc)
+            # remove script (to make the page static) and noscript (because
+            # scripts were still enabled when the snapshot was taken)
+            disallowedTags = ['script', 'noscript']
+            disallowedAttributes = html.eventAttributes
+            stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
+            serializer = HTMLSerializer ()
+            httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
+            record = writer.create_warc_record (doc['documentURL'], 'response',
+                    payload=BytesIO (serializer.render (stream, 'utf-8')),
+                    http_headers=httpHeaders,
+                    warc_headers_dict={'X-DOM-Snapshot': str (True),
+                            'X-Chrome-Viewport': viewport})
+            writer.write_record (record)
+
+def emulateScreenMetrics (l):
+    """
+    Emulate different screen sizes, causing the site to fetch assets (img
+    srcset and css, for example) for different screen resolutions.
+    """
+    cssPpi = 96
+    sizes = [
+            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False},
+            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False},
+            # very dense display
+            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False},
+            # just a few samples:
+            # 1st gen iPhone (portrait mode)
+            {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
+            # 6th gen iPhone (portrait mode)
+            {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
+            # and reset
+            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
+            ]
+    for s in sizes:
+        l.tab.Emulation.setDeviceMetricsOverride (**s)
+        l.wait (1)
+    # XXX: this seems to be broken, it does not clear the override
+    #tab.Emulation.clearDeviceMetricsOverride ()
+    # wait until assets finished loading
+    l.waitIdle (2, 10)
+
+def loadScripts (paths, scripts=None):
+    # no mutable default argument here: this module lives inside a long-running
+    # worker process and the shared list would accumulate scripts across calls
+    if scripts is None:
+        scripts = []
+    for p in paths:
+        if not os.path.exists (p):
+            # search for default scripts in the package data directory
+            p = packageData (p)
+        with open (p, 'r') as fd:
+            scripts.append (fd.read ())
+    return '\n'.join (scripts)
+
+def writeScreenshot (tab, writer):
+    """
+    Create a screenshot of the tab and write it to WARC
+    """
+    viewport = getFormattedViewportMetrics (tab)
+    data = b64decode (tab.Page.captureScreenshot (format='png')['data'])
+    record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource',
+            payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png',
+            'X-Chrome-Viewport': viewport})
+    writer.write_record (record)
+
+# XXX: rabbitmq is hardcoded
+app = Celery ('crocoite.distributed')
+app.config_from_object('celeryconfig')
+logger = get_task_logger('crocoite.distributed.archive')
+
+# argument defaults are provided by the argparse setup in main () below; track
+# the STARTED state, because archive tasks are usually long-running
+@app.task(bind=True, track_started=True)
+def archive (self, url, output, onload, onsnapshot, browser,
+        logBuffer, maxBodySize, idleTimeout, timeout, domSnapshot, screenshot):
+    """
+    Archive a single URL
+
+    Supports these config keys (celeryconfig):
+
+    warc_filename = '{domain}-{date}-{id}.warc.gz'
+    temp_dir = '/tmp/'
+    finished_dir = '/tmp/finished'
+    """
+
+    self.update_state (state='PROGRESS', meta={'step': 'start'})
 
     stopVarname = '__' + __package__ + '_stop__'
     # avoid sites messing with our scripts by using a random stop variable name
     newStopVarname = randomString ()
-    onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
+    onload = loadScripts (onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
     stopVarname = newStopVarname
 
-    service = browser.ChromeService ()
-    if args.browser:
-        service = browser.NullService (args.browser)
+    service = ChromeService ()
+    if browser:
+        service = NullService (browser)
 
     with service as browserUrl:
         browser = pychrome.Browser(url=browserUrl)
 
-        fd = open (args.output, 'wb')
+        if not output:
+            parsedUrl = urlsplit (url)
+            outFile = app.conf.warc_filename.format (
+                    id=self.request.id,
+                    domain=parsedUrl.hostname.replace ('/', '-'),
+                    date=datetime.utcnow ().isoformat (),
+                    )
+            outPath = os.path.join (app.conf.temp_dir, outFile)
+            fd = open (outPath, 'wb')
+        else:
+            fd = open (output, 'wb')
         writer = WARCWriter (fd, gzip=True)
-        with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer,
-                maxBodySize=args.maxBodySize) as l:
+        with WarcLoader (browser, url, writer, logBuffer=logBuffer,
+                maxBodySize=maxBodySize) as l:
             version = l.tab.Browser.getVersion ()
             payload = {
                 'software': __package__,
@@ -187,25 +208,65 @@ def main ():
             l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
             l.start ()
 
-            l.waitIdle (args.idleTimeout, args.timeout)
+            self.update_state (state='PROGRESS', meta={'step': 'fetch'})
+            l.waitIdle (idleTimeout, timeout)
 
             # get ready for snapshot: stop loading and scripts, disable events
             l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
             # if we stopped due to timeout, wait for remaining assets
             l.waitIdle (2, 10)
+            self.update_state (state='PROGRESS', meta={'step': 'emulateScreenMetrics'})
             emulateScreenMetrics (l)
             l.stop ()
-            if args.domSnapshot:
-                script = loadScripts (args.onsnapshot)
+            if domSnapshot:
+                self.update_state (state='PROGRESS', meta={'step': 'domSnapshot'})
+                script = loadScripts (onsnapshot)
                 writeScript ('onsnapshot', script, writer)
                 l.tab.Runtime.evaluate (expression=script, returnByValue=True)
                 writeDOMSnapshot (l.tab, writer)
-            if args.screenshot:
+            if screenshot:
+                self.update_state (state='PROGRESS', meta={'step': 'screenshot'})
                 writeScreenshot (l.tab, writer)
+    if not output:
+        # move the finished WARC from temp_dir to finished_dir
+        outPath = os.path.join (app.conf.finished_dir, outFile)
+        os.rename (fd.name, outPath)
+    return True
+
+def stateCallback (data):
+    if data['status'] == 'PROGRESS':
+        print (data['task_id'], data['result']['step'])
+
+def main ():
+    parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
+    parser.add_argument('--browser', help='DevTools URL', metavar='URL')
+    parser.add_argument('--distributed', help='Queue the job to a Celery worker', action='store_true')
+    parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
+    parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
+    parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES')
+    parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
+    #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
+    parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE')
+    parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE')
+    parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot')
+    parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot')
+    parser.add_argument('url', help='Website URL')
+    parser.add_argument('output', help='WARC filename')
+
+    args = parser.parse_args ()
+    distributed = args.distributed
+    # pass everything except --distributed itself on to the archive task
+    passArgs = vars (args)
+    del passArgs['distributed']
+
+    if distributed:
+        result = archive.delay (**passArgs)
+        result.get (on_message=stateCallback)
+    else:
+        # run the task synchronously in this process
+        archive (**passArgs)
 
     return True
diff --git a/setup.py b/setup.py
index 52747c0..0ab9249 100644
--- a/setup.py
+++ b/setup.py
@@ -13,13 +13,14 @@ setup(
         'pychrome',
         'warcio',
         'html5lib>=0.999999999',
+        'Celery',
     ],
     entry_points={
         'console_scripts': [
             'crocoite-standalone = crocoite.cli:main',
         ],
     },
-    data_files=[
-        ('crocoite/data', ['crocoite/data/onload.js']),
-    ],
+    package_data={
+        'crocoite': ['data/*'],
+    },
 )
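
For reference, the new Celery task can also be queued directly from Python,
which is what the sopel plugin above does under the hood. A minimal sketch,
assuming the ``celeryconfig.py`` from the README is importable and a worker is
running; the argument values shown are the illustrative defaults used
elsewhere in this patch:

.. code:: python

    from crocoite.cli import archive

    # keyword arguments mirror the archive task's signature
    handle = archive.delay (url='http://example.com', output=None,
            onload=['scroll.js'], onsnapshot=[], browser=None,
            logBuffer=1000, maxBodySize=10*1024*1024, idleTimeout=10,
            timeout=1*60*60, domSnapshot=False, screenshot=False)

    def onMessage (data):
        # update_state () calls inside the task arrive here as PROGRESS messages
        if data['status'] == 'PROGRESS':
            print (data['task_id'], data['result']['step'])

    # blocks until the worker is done; progress is reported via on_message
    handle.get (on_message=onMessage)

With the celeryconfig shown in the README, a finished job is renamed from
temp_dir into finished_dir under a name like
``example.com-2017-12-10T12:31:07-<task id>.warc.gz``.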