diff options
| -rw-r--r-- | README.rst | 38 | ||||
| -rw-r--r-- | contrib/celerycrocoite.py | 144 | ||||
| -rw-r--r-- | crocoite/browser.py | 21 | ||||
| -rw-r--r-- | crocoite/cli.py | 351 | ||||
| -rw-r--r-- | setup.py | 7 | 
5 files changed, 407 insertions, 154 deletions
| @@ -66,3 +66,41 @@ also saved. This causes its own set of issues though:  - JavaScript-based navigation does not work. +Distributed crawling +-------------------- + +Configure using celeryconfig.py + +.. code:: python + +    broker_url = 'pyamqp://' +    result_backend = 'rpc://' +    warc_filename = '{domain}-{date}-{id}.warc.gz' +    temp_dir = '/tmp/' +    finished_dir = '/tmp/finished' + +Start a Celery worker:: + +    celery -A crocoite.cli worker --loglevel=info + +Then queue archive job:: + +    crocoite-standalone --distributed … + +Alternative: IRC bot using sopel_. Use contrib/celerycrocoite.py + +~/.sopel/default.cfg + +.. code:: ini + +    [core] +    nick = chromebot +    host = irc.efnet.fr +    port = 6667 +    owner = someone +    extra = /path/to/crocoite/contrib +    enable = celerycrocoite +    channels = #somechannel + +Then in #somechannel ``chromebot: ao <url>`` + diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py new file mode 100644 index 0000000..8fab046 --- /dev/null +++ b/contrib/celerycrocoite.py @@ -0,0 +1,144 @@ +# Copyright (c) 2017 crocoite contributors +#  +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +#  +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +#  +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Module for Sopel IRC bot +""" + +import os, logging +from sopel.module import nickname_commands, require_chanmsg, thread, example, require_privilege, VOICE +from sopel.tools import Identifier, SopelMemory +import celery +from urllib.parse import urlsplit + +import crocoite.cli + +def setup (bot): +    m = bot.memory['crocoite'] = SopelMemory () +    m['jobs'] = {} + +def isValidUrl (s): +    url = urlsplit (s) +    return url.scheme and url.netloc and url.scheme in {'http', 'https'} + +@nickname_commands ('ao', 'archiveonly') +@require_chanmsg () +@require_privilege (VOICE) +@thread (True) +@example ('ao http://example.com') +def archive (bot, trigger): +    """ +    Archive a single page (no recursion) to WARC +    """ + +    def updateState (job, data): +        job['state'] = data + +    url = trigger.group(2) +    if not url: +        bot.reply ('Need a URL') +        return +    if not isValidUrl (url): +        bot.reply ('{} is not a valid URL'.format (url)) +        return + +    args = { +            'url': url, +            'output': None, +            'onload': ['scroll.js'], +            'onsnapshot': [], +            'browser': None, +            'logBuffer': 1000, +            'maxBodySize': 10*1024*1024, +            'idleTimeout': 10, +            # 1 hour +            'timeout': 1*60*60, +            'domSnapshot': False, +            'screenshot': False, +            } + +    handle = crocoite.cli.archive.delay (**args) +    m = bot.memory['crocoite'] +    jobs = m['jobs'] +    # XXX: for some reason we cannot access the job’s state through handle, +    # instead use a callback quirk +    j = jobs[handle.id] = {'handle': handle, 'trigger': trigger, 'state': {}} +    bot.reply ('{} has been queued as {}'.format (url, handle.id)) +    try: +        result = handle.get (on_message=lambda x: updateState (j, x)) +        bot.reply ('{} ({}) finished'.format (url, handle.id)) +    except Exception as e: +        # json serialization does not work well with exceptions. If their class +        # names are unique we can still distinguish them. +        ename = type (e).__name__ +        if ename == 'TaskRevokedError': +            bot.reply ('{} ({}) was revoked'.format (url, handle.id)) +        else: +            bot.reply ('{} ({}) failed'.format (url, handle.id)) +            logging.exception ('{} ({}) failed'.format (url, handle.id)) +    finally: +        del jobs[handle.id] + +@nickname_commands ('s', 'status') +@example ('s c251f09e-3c26-481f-96e0-4b5f58bd1170') +@require_chanmsg () +def status (bot, trigger): +    """ +    Retrieve status for a job +    """ + +    m = bot.memory['crocoite'] +    jobs = m['jobs'] + +    i = trigger.group(2) +    if not i or i not in jobs: +        bot.reply("Job not found.") +        return +     +    j = jobs[i] +    jtrigger = j['trigger'] +    jhandle = j['handle'] +    jstate = j['state'] +    jresult = jstate.get ('result', {}) +    bot.reply ('{}: {}, queued {}, by {}'.format (jhandle.id, +            jstate.get ('status', 'UNKNOWN'), jtrigger.time, jtrigger.nick)) + +@nickname_commands ('r', 'revoke') +@example ('r c251f09e-3c26-481f-96e0-4b5f58bd1170') +@require_privilege (VOICE) +@require_chanmsg () +def revoke (bot, trigger): +    """ +    Cancel (revoke) a job +    """ + +    m = bot.memory['crocoite'] +    jobs = m['jobs'] + +    i = trigger.group(2) +    if not i or i not in jobs: +        bot.reply ("Job not found.") +        return +     +    j = jobs[i] +    jhandle = j['handle'] +    jhandle.revoke (terminate=True) +    # response is handled by long-running initiation thread + diff --git a/crocoite/browser.py b/crocoite/browser.py index 765acbb..3e0e310 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -178,7 +178,7 @@ class SiteLoader:          resp = kwargs['response']          url = urlsplit (resp['url'])          if url.scheme in self.allowedSchemes: -            self.logger.debug ('response {} {}'.format (reqId, resp['url'])) +            self.logger.info ('response {} {}'.format (reqId, resp['url']))              item.setResponse (kwargs)          else:              self.logger.warn ('response: ignoring scheme {}'.format (url.scheme)) @@ -198,13 +198,13 @@ class SiteLoader:          assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url'])          url = urlsplit (resp['url'])          if url.scheme in self.allowedSchemes: -            self.logger.debug ('finished {} {}'.format (reqId, req['url'])) +            self.logger.info ('finished {} {}'.format (reqId, req['url']))              item.encodedDataLength = kwargs['encodedDataLength']              self.loadingFinished (item)      def _loadingFailed (self, **kwargs):          reqId = kwargs['requestId'] -        self.logger.debug ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason'))) +        self.logger.warn ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))          item = self.requests.pop (reqId, None)  import subprocess @@ -219,9 +219,18 @@ def ChromeService (binary='google-chrome-stable', host='localhost', port=9222, w      is not required with this method, since reads will block until Chrome is      ready.      """ -    s = socket.socket () -    s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) -    s.bind ((host, port)) +    while True: +        s = socket.socket () +        s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) +        try: +            s.bind ((host, port)) +            break +        except OSError: +            # try different port +            if port < 65000: +                port += 1 +            else: +                raise      s.listen (10)      userDataDir = mkdtemp ()      args = [binary, diff --git a/crocoite/cli.py b/crocoite/cli.py index 8a55269..3527ceb 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -18,159 +18,180 @@  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN  # THE SOFTWARE. -def main (): -    import os, random, logging, argparse -    from io import BytesIO -    from base64 import b64decode -    import pychrome -    from urllib.parse import urlsplit -    from warcio.warcwriter import WARCWriter -    from warcio.statusandheaders import StatusAndHeaders -    from html5lib.serializer import HTMLSerializer -    from . import html, packageData, packageUrl, browser -    from .warc import WarcLoader -    from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker - -    def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): -        if length is None: -            length = random.randint (16, 32) -        return ''.join (map (lambda x: random.choice (chars), range (length))) -     -    def getFormattedViewportMetrics (tab): -        layoutMetrics = tab.Page.getLayoutMetrics () -        # XXX: I’m not entirely sure which one we should use here -        return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'], -                    layoutMetrics['layoutViewport']['clientHeight']) - -    def writeDOMSnapshot (tab, writer): -        """ -        Get a DOM snapshot of tab and write it to WARC. - -        We could use DOMSnapshot.getSnapshot here, but the API is not stable -        yet. Also computed styles are not really necessary here. - -        XXX: Currently writes a response, when it should use “resource”. pywb -        can’t handle that though. -        """ -        viewport = getFormattedViewportMetrics (tab) -        dom = tab.DOM.getDocument (depth=-1, pierce=True) -        haveUrls = set () -        for doc in ChromeTreeWalker (dom['root']).split (): -            rawUrl = doc['documentURL'] -            if rawUrl in haveUrls: -                # ignore duplicate URLs. they are usually caused by -                # javascript-injected iframes (advertising) with no(?) src -                logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl)) -                continue -            url = urlsplit (rawUrl) -            if url.scheme in ('http', 'https'): -                logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL'])) -                haveUrls.add (rawUrl) -                walker = ChromeTreeWalker (doc) -                # remove script, to make the page static and noscript, because at the -                # time we took the snapshot scripts were enabled -                disallowedTags = ['script', 'noscript'] -                disallowedAttributes = html.eventAttributes -                stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes) -                serializer = HTMLSerializer () -                httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') -                record = writer.create_warc_record (doc['documentURL'], 'response', -                        payload=BytesIO (serializer.render (stream, 'utf-8')), -                        http_headers=httpHeaders, -                        warc_headers_dict={'X-DOM-Snapshot': str (True), -                                'X-Chrome-Viewport': viewport}) -                writer.write_record (record) - -    def emulateScreenMetrics (l): -        """ -        Emulate different screen sizes, causing the site to fetch assets (img -        srcset and css, for example) for different screen resolutions. -        """ -        cssPpi = 96 -        sizes = [ -                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False}, -                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False}, -                # very dense display -                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False}, -                # just a few samples: -                # 1st gen iPhone (portrait mode) -                {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True}, -                # 6th gen iPhone (portrait mode) -                {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True}, -                # and reset -                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False}, -                ] -        for s in sizes: -            l.tab.Emulation.setDeviceMetricsOverride (**s) -            l.wait (1) -        # XXX: this seems to be broken, it does not clear the override -        #tab.Emulation.clearDeviceMetricsOverride () -        # wait until assets finished loading -        l.waitIdle (2, 10) - -    def loadScripts (paths, scripts=[]): -        for p in paths: -            if not os.path.exists (p): -                # search for defaults scripts in package data directory -                p = packageData (p) -            with open (p, 'r') as fd: -                scripts.append (fd.read ()) -        return '\n'.join (scripts) - -    def writeScript (path, source, writer): -        record = writer.create_warc_record (packageUrl (path), 'metadata', -                payload=BytesIO (source.encode ('utf8')), -                warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'}) -        writer.write_record (record) - -    def writeScreenshot (tab, writer): -        """ -        Create screenshot from tab and write it to WARC -        """ -        viewport = getFormattedViewportMetrics (tab) -        data = b64decode (l.tab.Page.captureScreenshot (format='png')['data']) -        record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource', -                payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png', -                'X-Chrome-Viewport': viewport}) -        writer.write_record (record) - -    logger = logging.getLogger(__name__) -    logging.basicConfig (level=logging.DEBUG) +""" +Standalone and Celery command line interface +""" -    parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') -    parser.add_argument('--browser', help='DevTools URL', metavar='URL') -    parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC') -    parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC') -    parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES') -    parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES') -    #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open') -    parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE') -    parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE') -    parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot') -    parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot') -    parser.add_argument('url', help='Website URL') -    parser.add_argument('output', help='WARC filename') +import os, random, logging, argparse +from io import BytesIO +from datetime import datetime +from base64 import b64decode +import pychrome +from urllib.parse import urlsplit +from warcio.warcwriter import WARCWriter +from warcio.statusandheaders import StatusAndHeaders +from html5lib.serializer import HTMLSerializer -    args = parser.parse_args () +from celery import Celery +from celery.utils.log import get_task_logger + +from . import html, packageData, packageUrl +from .warc import WarcLoader +from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker +from .browser import ChromeService, NullService + +def getFormattedViewportMetrics (tab): +    layoutMetrics = tab.Page.getLayoutMetrics () +    # XXX: I’m not entirely sure which one we should use here +    return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'], +                layoutMetrics['layoutViewport']['clientHeight']) + +def writeScript (path, source, writer): +    record = writer.create_warc_record (packageUrl (path), 'metadata', +            payload=BytesIO (source.encode ('utf8')), +            warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'}) +    writer.write_record (record) + +def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): +    if length is None: +        length = random.randint (16, 32) +    return ''.join (map (lambda x: random.choice (chars), range (length))) + +def writeDOMSnapshot (tab, writer): +    """ +    Get a DOM snapshot of tab and write it to WARC. + +    We could use DOMSnapshot.getSnapshot here, but the API is not stable +    yet. Also computed styles are not really necessary here. + +    XXX: Currently writes a response, when it should use “resource”. pywb +    can’t handle that though. +    """ +    viewport = getFormattedViewportMetrics (tab) +    dom = tab.DOM.getDocument (depth=-1, pierce=True) +    haveUrls = set () +    for doc in ChromeTreeWalker (dom['root']).split (): +        rawUrl = doc['documentURL'] +        if rawUrl in haveUrls: +            # ignore duplicate URLs. they are usually caused by +            # javascript-injected iframes (advertising) with no(?) src +            logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl)) +            continue +        url = urlsplit (rawUrl) +        if url.scheme in ('http', 'https'): +            logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL'])) +            haveUrls.add (rawUrl) +            walker = ChromeTreeWalker (doc) +            # remove script, to make the page static and noscript, because at the +            # time we took the snapshot scripts were enabled +            disallowedTags = ['script', 'noscript'] +            disallowedAttributes = html.eventAttributes +            stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes) +            serializer = HTMLSerializer () +            httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') +            record = writer.create_warc_record (doc['documentURL'], 'response', +                    payload=BytesIO (serializer.render (stream, 'utf-8')), +                    http_headers=httpHeaders, +                    warc_headers_dict={'X-DOM-Snapshot': str (True), +                            'X-Chrome-Viewport': viewport}) +            writer.write_record (record) + +def emulateScreenMetrics (l): +    """ +    Emulate different screen sizes, causing the site to fetch assets (img +    srcset and css, for example) for different screen resolutions. +    """ +    cssPpi = 96 +    sizes = [ +            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False}, +            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False}, +            # very dense display +            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False}, +            # just a few samples: +            # 1st gen iPhone (portrait mode) +            {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True}, +            # 6th gen iPhone (portrait mode) +            {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True}, +            # and reset +            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False}, +            ] +    for s in sizes: +        l.tab.Emulation.setDeviceMetricsOverride (**s) +        l.wait (1) +    # XXX: this seems to be broken, it does not clear the override +    #tab.Emulation.clearDeviceMetricsOverride () +    # wait until assets finished loading +    l.waitIdle (2, 10) + +def loadScripts (paths, scripts=[]): +    for p in paths: +        if not os.path.exists (p): +            # search for defaults scripts in package data directory +            p = packageData (p) +        with open (p, 'r') as fd: +            scripts.append (fd.read ()) +    return '\n'.join (scripts) + +def writeScreenshot (tab, writer): +    """ +    Create screenshot from tab and write it to WARC +    """ +    viewport = getFormattedViewportMetrics (tab) +    data = b64decode (tab.Page.captureScreenshot (format='png')['data']) +    record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource', +            payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png', +            'X-Chrome-Viewport': viewport}) +    writer.write_record (record) + +# XXX: rabbitmq is hardcoded +app = Celery ('crocoite.distributed') +app.config_from_object('celeryconfig') +logger = get_task_logger('crocoite.distributed.archive') + +# defaults can be changed below using argparse; track started state, because tasks are usually long-running +@app.task(bind=True, track_started=True) +def archive (self, url, output, onload, onsnapshot, browser, +        logBuffer, maxBodySize, idleTimeout, timeout, domSnapshot, screenshot): +    """ +    Archive a single URL + +    Supports these config keys (celeryconfig): + +    warc_filename = '{domain}-{date}-{id}.warc.gz' +    temp_dir = '/tmp/' +    finished_dir = '/tmp/finished' +    """ + +    self.update_state (state='PROGRESS', meta={'step': 'start'})      stopVarname = '__' + __package__ + '_stop__'      # avoid sites messing with our scripts by using a random stop variable name      newStopVarname = randomString () -    onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname) +    onload = loadScripts (onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)      stopVarname = newStopVarname -    service = browser.ChromeService () -    if args.browser: -        service = browser.NullService (args.browser) +    service = ChromeService () +    if browser: +        service = NullService (browser) -    with service as browserUrl: -        browser = pychrome.Browser(url=browserUrl) +    with service as browser: +        browser = pychrome.Browser(url=browser) -        fd = open (args.output, 'wb') +        if not output: +            parsedUrl = urlsplit (url) +            outFile = app.conf.warc_filename.format ( +                            id=self.request.id, +                            domain=parsedUrl.hostname.replace ('/', '-'), +                            date=datetime.utcnow ().isoformat (), +                            ) +            outPath = os.path.join (app.conf.temp_dir, outFile) +            fd = open (outPath, 'wb') +        else: +            fd = open (output, 'wb')          writer = WARCWriter (fd, gzip=True) -        with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer, -                maxBodySize=args.maxBodySize) as l: +        with WarcLoader (browser, url, writer, logBuffer=logBuffer, +                maxBodySize=maxBodySize) as l:              version = l.tab.Browser.getVersion ()              payload = {                      'software': __package__, @@ -187,25 +208,65 @@ def main ():              l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)              l.start () -            l.waitIdle (args.idleTimeout, args.timeout) +            self.update_state (state='PROGRESS', meta={'step': 'fetch'}) +            l.waitIdle (idleTimeout, timeout)              # get ready for snapshot: stop loading and scripts, disable events              l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)              # if we stopped due to timeout, wait for remaining assets              l.waitIdle (2, 10) +            self.update_state (state='PROGRESS', meta={'step': 'emulateScreenMetrics'})              emulateScreenMetrics (l)              l.stop () -            if args.domSnapshot: -                script = loadScripts (args.onsnapshot) +            if domSnapshot: +                self.update_state (state='PROGRESS', meta={'step': 'domSnapshot'}) +                script = loadScripts (onsnapshot)                  writeScript ('onsnapshot', script, writer)                  l.tab.Runtime.evaluate (expression=script, returnByValue=True)                  writeDOMSnapshot (l.tab, writer) -            if args.screenshot: +            if screenshot: +                self.update_state (state='PROGRESS', meta={'step': 'screenshot'})                  writeScreenshot (l.tab, writer) +    if not output: +        outPath = os.path.join (app.conf.finished_dir, outFile) +        os.rename (fd.name, outPath) +    return True + +def stateCallback (data): +    result = data['result'] +    if data['status'] == 'PROGRESS': +        print (data['task_id'], result['step']) + +def main (): +    parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') +    parser.add_argument('--browser', help='DevTools URL', metavar='URL') +    parser.add_argument('--distributed', help='Use celery worker', action='store_true') +    parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC') +    parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC') +    parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES') +    parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES') +    #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open') +    parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE') +    parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE') +    parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot') +    parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot') +    parser.add_argument('url', help='Website URL') +    parser.add_argument('output', help='WARC filename') + +    args = parser.parse_args () +    distributed = args.distributed +    passArgs = vars (args) +    del passArgs['distributed'] + +    if distributed: +        result = archive.delay (**passArgs) +        result.get (on_message=stateCallback) +    else: +        archive (**passArgs)      return True @@ -13,13 +13,14 @@ setup(          'pychrome',          'warcio',          'html5lib>=0.999999999', +        'Celery',      ],      entry_points={      'console_scripts': [              'crocoite-standalone = crocoite.cli:main',              ],      }, -    data_files=[ -        ('crocoite/data', ['crocoite/data/onload.js']), -        ], +    package_data={ +            'crocoite': ['data/*'], +    },  ) | 
