Add distributed archiving

Using celery. Also adds a plugin for the IRC bot sopel. Code still needs some love, but it should work.
author: Lars-Dominik Braun <lars@6xq.net> 2017-12-10 12:31:07 +0100
committer: Lars-Dominik Braun <lars@6xq.net> 2017-12-17 16:40:16 +0100
commit: 84c3f69293fa79d752127410c7468038c907c96a (patch)
tree: 4a71dcddd6abc6eeda30ed40bd78d91518efde38
parent: f816319081d5253974ddb70b655d55f4a880a77a (diff)
download: crocoite-84c3f69293fa79d752127410c7468038c907c96a.tar.gz
crocoite-84c3f69293fa79d752127410c7468038c907c96a.tar.bz2
crocoite-84c3f69293fa79d752127410c7468038c907c96a.zip
5 files changed, 407 insertions, 154 deletions
diff --git a/README.rst b/README.rst
index 3a7aa7c..3d5af5f 100644
--- a/README.rst
+++ b/README.rst
@@ -66,3 +66,41 @@ also saved. This causes its own set of issues though:
 
 - JavaScript-based navigation does not work.
 
+Distributed crawling
+--------------------
+
+Configure using celeryconfig.py
+
+.. code:: python
+
+    broker_url = 'pyamqp://'
+    result_backend = 'rpc://'
+    warc_filename = '{domain}-{date}-{id}.warc.gz'
+    temp_dir = '/tmp/'
+    finished_dir = '/tmp/finished'
+
+Start a Celery worker::
+
+    celery -A crocoite.cli worker --loglevel=info
+
+Then queue an archive job::
+
+    crocoite-standalone --distributed …
+
+Alternative: an IRC bot built on sopel_, using the plugin contrib/celerycrocoite.py.
+
+Example configuration in ~/.sopel/default.cfg:
+
+.. code:: ini
+
+    [core]
+    nick = chromebot
+    host = irc.efnet.fr
+    port = 6667
+    owner = someone
+    extra = /path/to/crocoite/contrib
+    enable = celerycrocoite
+    channels = #somechannel
+
+Then, in #somechannel, say ``chromebot: ao <url>``.
+
diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py
new file mode 100644
index 0000000..8fab046
--- /dev/null
+++ b/contrib/celerycrocoite.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2017 crocoite contributors
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"""
+Module for Sopel IRC bot
+"""
+
+import os, logging
+from sopel.module import nickname_commands, require_chanmsg, thread, example, require_privilege, VOICE
+from sopel.tools import Identifier, SopelMemory
+import celery
+from urllib.parse import urlsplit
+
+import crocoite.cli
+
+def setup (bot):
+    m = bot.memory['crocoite'] = SopelMemory ()
+    m['jobs'] = {}
+
+def isValidUrl (s):
+    url = urlsplit (s)  # valid means: http(s) scheme and a non-empty netloc
+    return url.scheme in {'http', 'https'} and bool (url.netloc)  # always a bool
+
+@nickname_commands ('ao', 'archiveonly')
+@require_chanmsg ()
+@require_privilege (VOICE)
+@thread (True)
+@example ('ao http://example.com')
+def archive (bot, trigger):
+    """
+    Archive a single page (no recursion) to WARC
+    """
+
+    def updateState (job, data):
+        job['state'] = data
+
+    url = trigger.group(2)
+    if not url:
+        bot.reply ('Need a URL')
+        return
+    if not isValidUrl (url):
+        bot.reply ('{} is not a valid URL'.format (url))
+        return
+
+    args = {
+            'url': url,
+            'output': None,
+            'onload': ['scroll.js'],
+            'onsnapshot': [],
+            'browser': None,
+            'logBuffer': 1000,
+            'maxBodySize': 10*1024*1024,
+            'idleTimeout': 10,
+            # 1 hour
+            'timeout': 1*60*60,
+            'domSnapshot': False,
+            'screenshot': False,
+            }
+
+    handle = crocoite.cli.archive.delay (**args)
+    m = bot.memory['crocoite']
+    jobs = m['jobs']
+    # XXX: for some reason we cannot access the job’s state through handle,
+    # instead use a callback quirk
+    j = jobs[handle.id] = {'handle': handle, 'trigger': trigger, 'state': {}}
+    bot.reply ('{} has been queued as {}'.format (url, handle.id))
+    try:
+        result = handle.get (on_message=lambda x: updateState (j, x))
+        bot.reply ('{} ({}) finished'.format (url, handle.id))
+    except Exception as e:
+        # json serialization does not work well with exceptions. If their class
+        # names are unique we can still distinguish them.
+        ename = type (e).__name__
+        if ename == 'TaskRevokedError':
+            bot.reply ('{} ({}) was revoked'.format (url, handle.id))
+        else:
+            bot.reply ('{} ({}) failed'.format (url, handle.id))
+            logging.exception ('{} ({}) failed'.format (url, handle.id))
+    finally:
+        del jobs[handle.id]
+
+@nickname_commands ('s', 'status')
+@example ('s c251f09e-3c26-481f-96e0-4b5f58bd1170')
+@require_chanmsg ()
+def status (bot, trigger):
+    """
+    Retrieve status for a job
+    """
+
+    m = bot.memory['crocoite']
+    jobs = m['jobs']
+
+    i = trigger.group(2)
+    if not i or i not in jobs:
+        bot.reply("Job not found.")
+        return
+    
+    j = jobs[i]
+    jtrigger = j['trigger']
+    jhandle = j['handle']
+    # latest message received by the on_message callback of the archive command
+    jstate = j['state']
+    bot.reply ('{}: {}, queued {}, by {}'.format (jhandle.id,
+            jstate.get ('status', 'UNKNOWN'), jtrigger.time, jtrigger.nick))
+
+@nickname_commands ('r', 'revoke')
+@example ('r c251f09e-3c26-481f-96e0-4b5f58bd1170')
+@require_privilege (VOICE)
+@require_chanmsg ()
+def revoke (bot, trigger):
+    """
+    Cancel (revoke) a job
+    """
+
+    m = bot.memory['crocoite']
+    jobs = m['jobs']
+
+    i = trigger.group(2)
+    if not i or i not in jobs:
+        bot.reply ("Job not found.")
+        return
+    
+    j = jobs[i]
+    jhandle = j['handle']
+    jhandle.revoke (terminate=True)
+    # response is handled by long-running initiation thread
+
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 765acbb..3e0e310 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -178,7 +178,7 @@ class SiteLoader:
         resp = kwargs['response']
         url = urlsplit (resp['url'])
         if url.scheme in self.allowedSchemes:
-            self.logger.debug ('response {} {}'.format (reqId, resp['url']))
+            self.logger.info ('response {} {}'.format (reqId, resp['url']))
             item.setResponse (kwargs)
         else:
             self.logger.warn ('response: ignoring scheme {}'.format (url.scheme))
@@ -198,13 +198,13 @@ class SiteLoader:
         assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url'])
         url = urlsplit (resp['url'])
         if url.scheme in self.allowedSchemes:
-            self.logger.debug ('finished {} {}'.format (reqId, req['url']))
+            self.logger.info ('finished {} {}'.format (reqId, req['url']))
             item.encodedDataLength = kwargs['encodedDataLength']
             self.loadingFinished (item)
 
     def _loadingFailed (self, **kwargs):
         reqId = kwargs['requestId']
-        self.logger.debug ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
+        self.logger.warn ('failed {} {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
         item = self.requests.pop (reqId, None)
 
 import subprocess
@@ -219,9 +219,18 @@ def ChromeService (binary='google-chrome-stable', host='localhost', port=9222, w
     is not required with this method, since reads will block until Chrome is
     ready.
     """
-    s = socket.socket ()
-    s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-    s.bind ((host, port))
+    while True:
+        s = socket.socket ()
+        s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        try:
+            s.bind ((host, port))
+            break
+        except OSError:
+            # try different port
+            if port < 65000:
+                port += 1
+            else:
+                raise
     s.listen (10)
     userDataDir = mkdtemp ()
     args = [binary,
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 8a55269..3527ceb 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -18,159 +18,180 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
-def main ():
-    import os, random, logging, argparse
-    from io import BytesIO
-    from base64 import b64decode
-    import pychrome
-    from urllib.parse import urlsplit
-    from warcio.warcwriter import WARCWriter
-    from warcio.statusandheaders import StatusAndHeaders
-    from html5lib.serializer import HTMLSerializer
-    from . import html, packageData, packageUrl, browser
-    from .warc import WarcLoader
-    from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
-
-    def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
-        if length is None:
-            length = random.randint (16, 32)
-        return ''.join (map (lambda x: random.choice (chars), range (length)))
-    
-    def getFormattedViewportMetrics (tab):
-        layoutMetrics = tab.Page.getLayoutMetrics ()
-        # XXX: I’m not entirely sure which one we should use here
-        return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
-                    layoutMetrics['layoutViewport']['clientHeight'])
-
-    def writeDOMSnapshot (tab, writer):
-        """
-        Get a DOM snapshot of tab and write it to WARC.
-
-        We could use DOMSnapshot.getSnapshot here, but the API is not stable
-        yet. Also computed styles are not really necessary here.
-
-        XXX: Currently writes a response, when it should use “resource”. pywb
-        can’t handle that though.
-        """
-        viewport = getFormattedViewportMetrics (tab)
-        dom = tab.DOM.getDocument (depth=-1, pierce=True)
-        haveUrls = set ()
-        for doc in ChromeTreeWalker (dom['root']).split ():
-            rawUrl = doc['documentURL']
-            if rawUrl in haveUrls:
-                # ignore duplicate URLs. they are usually caused by
-                # javascript-injected iframes (advertising) with no(?) src
-                logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
-                continue
-            url = urlsplit (rawUrl)
-            if url.scheme in ('http', 'https'):
-                logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
-                haveUrls.add (rawUrl)
-                walker = ChromeTreeWalker (doc)
-                # remove script, to make the page static and noscript, because at the
-                # time we took the snapshot scripts were enabled
-                disallowedTags = ['script', 'noscript']
-                disallowedAttributes = html.eventAttributes
-                stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
-                serializer = HTMLSerializer ()
-                httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
-                record = writer.create_warc_record (doc['documentURL'], 'response',
-                        payload=BytesIO (serializer.render (stream, 'utf-8')),
-                        http_headers=httpHeaders,
-                        warc_headers_dict={'X-DOM-Snapshot': str (True),
-                                'X-Chrome-Viewport': viewport})
-                writer.write_record (record)
-
-    def emulateScreenMetrics (l):
-        """
-        Emulate different screen sizes, causing the site to fetch assets (img
-        srcset and css, for example) for different screen resolutions.
-        """
-        cssPpi = 96
-        sizes = [
-                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False},
-                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False},
-                # very dense display
-                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False},
-                # just a few samples:
-                # 1st gen iPhone (portrait mode)
-                {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
-                # 6th gen iPhone (portrait mode)
-                {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
-                # and reset
-                {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
-                ]
-        for s in sizes:
-            l.tab.Emulation.setDeviceMetricsOverride (**s)
-            l.wait (1)
-        # XXX: this seems to be broken, it does not clear the override
-        #tab.Emulation.clearDeviceMetricsOverride ()
-        # wait until assets finished loading
-        l.waitIdle (2, 10)
-
-    def loadScripts (paths, scripts=[]):
-        for p in paths:
-            if not os.path.exists (p):
-                # search for defaults scripts in package data directory
-                p = packageData (p)
-            with open (p, 'r') as fd:
-                scripts.append (fd.read ())
-        return '\n'.join (scripts)
-
-    def writeScript (path, source, writer):
-        record = writer.create_warc_record (packageUrl (path), 'metadata',
-                payload=BytesIO (source.encode ('utf8')),
-                warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'})
-        writer.write_record (record)
-
-    def writeScreenshot (tab, writer):
-        """
-        Create screenshot from tab and write it to WARC
-        """
-        viewport = getFormattedViewportMetrics (tab)
-        data = b64decode (l.tab.Page.captureScreenshot (format='png')['data'])
-        record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource',
-                payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png',
-                'X-Chrome-Viewport': viewport})
-        writer.write_record (record)
-
-    logger = logging.getLogger(__name__)
-    logging.basicConfig (level=logging.DEBUG)
+"""
+Standalone and Celery command line interface
+"""
 
-    parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
-    parser.add_argument('--browser', help='DevTools URL', metavar='URL')
-    parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
-    parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
-    parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES')
-    parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
-    #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
-    parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE')
-    parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE')
-    parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot')
-    parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot')
-    parser.add_argument('url', help='Website URL')
-    parser.add_argument('output', help='WARC filename')
+import os, random, logging, argparse
+from io import BytesIO
+from datetime import datetime
+from base64 import b64decode
+import pychrome
+from urllib.parse import urlsplit
+from warcio.warcwriter import WARCWriter
+from warcio.statusandheaders import StatusAndHeaders
+from html5lib.serializer import HTMLSerializer
 
-    args = parser.parse_args ()
+from celery import Celery
+from celery.utils.log import get_task_logger
+
+from . import html, packageData, packageUrl
+from .warc import WarcLoader
+from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
+from .browser import ChromeService, NullService
+
+def getFormattedViewportMetrics (tab):
+    layoutMetrics = tab.Page.getLayoutMetrics ()
+    # XXX: I’m not entirely sure which one we should use here
+    return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
+                layoutMetrics['layoutViewport']['clientHeight'])
+
+def writeScript (path, source, writer):
+    record = writer.create_warc_record (packageUrl (path), 'metadata',
+            payload=BytesIO (source.encode ('utf8')),
+            warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'})
+    writer.write_record (record)
+
+def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'):
+    if length is None:
+        length = random.randint (16, 32)
+    return ''.join (map (lambda x: random.choice (chars), range (length)))
+
+def writeDOMSnapshot (tab, writer):
+    """
+    Get a DOM snapshot of tab and write it to WARC.
+
+    We could use DOMSnapshot.getSnapshot here, but the API is not stable
+    yet. Also computed styles are not really necessary here.
+
+    XXX: Currently writes a response, when it should use “resource”. pywb
+    can’t handle that though.
+    """
+    viewport = getFormattedViewportMetrics (tab)
+    dom = tab.DOM.getDocument (depth=-1, pierce=True)
+    haveUrls = set ()
+    for doc in ChromeTreeWalker (dom['root']).split ():
+        rawUrl = doc['documentURL']
+        if rawUrl in haveUrls:
+            # ignore duplicate URLs. they are usually caused by
+            # javascript-injected iframes (advertising) with no(?) src
+            logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
+            continue
+        url = urlsplit (rawUrl)
+        if url.scheme in ('http', 'https'):
+            logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
+            haveUrls.add (rawUrl)
+            walker = ChromeTreeWalker (doc)
+            # remove script, to make the page static and noscript, because at the
+            # time we took the snapshot scripts were enabled
+            disallowedTags = ['script', 'noscript']
+            disallowedAttributes = html.eventAttributes
+            stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
+            serializer = HTMLSerializer ()
+            httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
+            record = writer.create_warc_record (doc['documentURL'], 'response',
+                    payload=BytesIO (serializer.render (stream, 'utf-8')),
+                    http_headers=httpHeaders,
+                    warc_headers_dict={'X-DOM-Snapshot': str (True),
+                            'X-Chrome-Viewport': viewport})
+            writer.write_record (record)
+
+def emulateScreenMetrics (l):
+    """
+    Emulate different screen sizes, causing the site to fetch assets (img
+    srcset and css, for example) for different screen resolutions.
+    """
+    cssPpi = 96
+    sizes = [
+            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False},
+            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False},
+            # very dense display
+            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False},
+            # just a few samples:
+            # 1st gen iPhone (portrait mode)
+            {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
+            # 6th gen iPhone (portrait mode)
+            {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
+            # and reset
+            {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},
+            ]
+    for s in sizes:
+        l.tab.Emulation.setDeviceMetricsOverride (**s)
+        l.wait (1)
+    # XXX: this seems to be broken, it does not clear the override
+    #tab.Emulation.clearDeviceMetricsOverride ()
+    # wait until assets finished loading
+    l.waitIdle (2, 10)
+
+def loadScripts (paths, scripts=None):
+    # copy: never grow a shared default or the caller’s list between calls
+    scripts = list (scripts or [])
+    for p in paths:
+        # paths that do not exist are looked up in the package data directory
+        with open (p if os.path.exists (p) else packageData (p), 'r') as fd:
+            scripts.append (fd.read ())
+    return '\n'.join (scripts)
+
+def writeScreenshot (tab, writer):
+    """
+    Create screenshot from tab and write it to WARC
+    """
+    viewport = getFormattedViewportMetrics (tab)
+    data = b64decode (tab.Page.captureScreenshot (format='png')['data'])
+    record = writer.create_warc_record (packageUrl ('screenshot.png'), 'resource',
+            payload=BytesIO (data), warc_headers_dict={'Content-Type': 'image/png',
+            'X-Chrome-Viewport': viewport})
+    writer.write_record (record)
+
+# XXX: rabbitmq is hardcoded
+app = Celery ('crocoite.distributed')
+app.config_from_object('celeryconfig')
+logger = get_task_logger('crocoite.distributed.archive')
+
+# defaults can be changed below using argparse; track started state, because tasks are usually long-running
+@app.task(bind=True, track_started=True)
+def archive (self, url, output, onload, onsnapshot, browser,
+        logBuffer, maxBodySize, idleTimeout, timeout, domSnapshot, screenshot):
+    """
+    Archive a single URL
+
+    Supports these config keys (celeryconfig):
+
+    warc_filename = '{domain}-{date}-{id}.warc.gz'
+    temp_dir = '/tmp/'
+    finished_dir = '/tmp/finished'
+    """
+
+    self.update_state (state='PROGRESS', meta={'step': 'start'})
 
     stopVarname = '__' + __package__ + '_stop__'
     # avoid sites messing with our scripts by using a random stop variable name
     newStopVarname = randomString ()
-    onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
+    onload = loadScripts (onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)
     stopVarname = newStopVarname
 
-    service = browser.ChromeService ()
-    if args.browser:
-        service = browser.NullService (args.browser)
+    service = ChromeService ()
+    if browser:
+        service = NullService (browser)
 
-    with service as browserUrl:
-        browser = pychrome.Browser(url=browserUrl)
+    with service as browser:
+        browser = pychrome.Browser(url=browser)
 
-        fd = open (args.output, 'wb')
+        if not output:
+            parsedUrl = urlsplit (url)
+            outFile = app.conf.warc_filename.format (
+                            id=self.request.id,
+                            domain=parsedUrl.hostname.replace ('/', '-'),
+                            date=datetime.utcnow ().isoformat (),
+                            )
+            outPath = os.path.join (app.conf.temp_dir, outFile)
+            fd = open (outPath, 'wb')
+        else:
+            fd = open (output, 'wb')
         writer = WARCWriter (fd, gzip=True)
 
-        with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer,
-                maxBodySize=args.maxBodySize) as l:
+        with WarcLoader (browser, url, writer, logBuffer=logBuffer,
+                maxBodySize=maxBodySize) as l:
             version = l.tab.Browser.getVersion ()
             payload = {
                     'software': __package__,
@@ -187,25 +208,65 @@ def main ():
             l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload)
             l.start ()
 
-            l.waitIdle (args.idleTimeout, args.timeout)
+            self.update_state (state='PROGRESS', meta={'step': 'fetch'})
+            l.waitIdle (idleTimeout, timeout)
 
             # get ready for snapshot: stop loading and scripts, disable events
             l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True)
             # if we stopped due to timeout, wait for remaining assets
             l.waitIdle (2, 10)
 
+            self.update_state (state='PROGRESS', meta={'step': 'emulateScreenMetrics'})
             emulateScreenMetrics (l)
 
             l.stop ()
 
-            if args.domSnapshot:
-                script = loadScripts (args.onsnapshot)
+            if domSnapshot:
+                self.update_state (state='PROGRESS', meta={'step': 'domSnapshot'})
+                script = loadScripts (onsnapshot)
                 writeScript ('onsnapshot', script, writer)
                 l.tab.Runtime.evaluate (expression=script, returnByValue=True)
                 writeDOMSnapshot (l.tab, writer)
 
-            if args.screenshot:
+            if screenshot:
+                self.update_state (state='PROGRESS', meta={'step': 'screenshot'})
                 writeScreenshot (l.tab, writer)
+    if not output:
+        outPath = os.path.join (app.conf.finished_dir, outFile)
+        os.rename (fd.name, outPath)
+    return True
+
+def stateCallback (data):
+    result = data['result']
+    if data['status'] == 'PROGRESS':
+        print (data['task_id'], result['step'])
+
+def main ():
+    parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
+    parser.add_argument('--browser', help='DevTools URL', metavar='URL')
+    parser.add_argument('--distributed', help='Use celery worker', action='store_true')
+    parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
+    parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
+    parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer', metavar='LINES')
+    parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
+    #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
+    parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page', metavar='FILE')
+    parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot', metavar='FILE')
+    parser.add_argument('--no-screenshot', default=True, action='store_false', help='Do not create a screenshot of the website', dest='screenshot')
+    parser.add_argument('--no-dom-snapshot', default=True, action='store_false', help='Do not create a DOM snapshot of the website', dest='domSnapshot')
+    parser.add_argument('url', help='Website URL')
+    parser.add_argument('output', help='WARC filename')
+
+    args = parser.parse_args ()
+    distributed = args.distributed
+    passArgs = vars (args)
+    del passArgs['distributed']
+
+    if distributed:
+        result = archive.delay (**passArgs)
+        result.get (on_message=stateCallback)
+    else:
+        archive (**passArgs)
 
     return True
 
diff --git a/setup.py b/setup.py
index 52747c0..0ab9249 100644
--- a/setup.py
+++ b/setup.py
@@ -13,13 +13,14 @@ setup(
         'pychrome',
         'warcio',
         'html5lib>=0.999999999',
+        'Celery',
     ],
     entry_points={
     'console_scripts': [
             'crocoite-standalone = crocoite.cli:main',
             ],
     },
-    data_files=[
-        ('crocoite/data', ['crocoite/data/onload.js']),
-        ],
+    package_data={
+            'crocoite': ['data/*'],
+    },
 )
author	Lars-Dominik Braun <lars@6xq.net>	2017-12-10 12:31:07 +0100
committer	Lars-Dominik Braun <lars@6xq.net>	2017-12-17 16:40:16 +0100
commit	84c3f69293fa79d752127410c7468038c907c96a (patch)
tree	4a71dcddd6abc6eeda30ed40bd78d91518efde38
parent	f816319081d5253974ddb70b655d55f4a880a77a (diff)
download	crocoite-84c3f69293fa79d752127410c7468038c907c96a.tar.gz crocoite-84c3f69293fa79d752127410c7468038c907c96a.tar.bz2 crocoite-84c3f69293fa79d752127410c7468038c907c96a.zip