summary | refs | log | tree | commit | diff
path: root/crocoite/cli.py
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2018-04-29 10:44:33 +0200
committer: Lars-Dominik Braun <lars@6xq.net> 2018-05-04 16:00:05 +0200
commit: b30a44cfe9456deafc83e008f8501c391cd1e258 (patch)
tree: 91bd6a9f431b4155047e0f024fe9f5665766804e /crocoite/cli.py
parent: d15b498505dc0362fbd7e92bf7ba2945cad5a118 (diff)
download: crocoite-b30a44cfe9456deafc83e008f8501c391cd1e258.tar.gz
crocoite-b30a44cfe9456deafc83e008f8501c391cd1e258.tar.bz2
crocoite-b30a44cfe9456deafc83e008f8501c391cd1e258.zip
Move page archiving logic to SinglePageController
In preparation for recursive crawls.
Diffstat (limited to 'crocoite/cli.py')
-rw-r--r-- crocoite/cli.py 135
1 files changed, 21 insertions, 114 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 2cbbfa8..cac5b3b 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -19,112 +19,13 @@
# THE SOFTWARE.
"""
-Standalone and Celery command line interface
+Command line interface
"""
-import os, logging, argparse
-from io import BytesIO
-from datetime import datetime
-import pychrome
-from urllib.parse import urlsplit
+import logging, argparse
-from celery import Celery
-from celery.utils.log import get_task_logger
-
-from . import behavior, defaults
-from .warc import WarcLoader, SerializingWARCWriter
-from .browser import ChromeService, NullService
-from .util import packageUrl, getFormattedViewportMetrics
-
-app = Celery ('crocoite.distributed')
-app.config_from_object('celeryconfig')
-logger = get_task_logger('crocoite.distributed.archive')
-
-# defaults can be changed below using argparse; track started state, because tasks are usually long-running
-@app.task(bind=True, track_started=True)
-def archive (self, url, output, browser, logBuffer, maxBodySize, idleTimeout,
- timeout, enabledBehaviorNames):
- """
- Archive a single URL
-
- Supports these config keys (celeryconfig):
-
- warc_filename = '{domain}-{date}-{id}.warc.gz'
- temp_dir = '/tmp/'
- finished_dir = '/tmp/finished'
- """
-
- ret = {'stats': None}
-
- self.update_state (state='PROGRESS', meta={'step': 'start'})
-
- service = ChromeService ()
- if browser:
- service = NullService (browser)
-
- allBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available))
-
- with service as browser:
- browser = pychrome.Browser(url=browser)
-
- if not output:
- parsedUrl = urlsplit (url)
- outFile = app.conf.warc_filename.format (
- id=self.request.id,
- domain=parsedUrl.hostname.replace ('/', '-'),
- date=datetime.utcnow ().isoformat (),
- )
- outPath = os.path.join (app.conf.temp_dir, outFile)
- fd = open (outPath, 'wb')
- else:
- fd = open (output, 'wb')
- writer = SerializingWARCWriter (fd, gzip=True)
-
- with WarcLoader (browser, url, writer, logBuffer=logBuffer,
- maxBodySize=maxBodySize) as l:
- version = l.tab.Browser.getVersion ()
- payload = {
- 'software': __package__,
- 'browser': version['product'],
- 'useragent': version['userAgent'],
- 'viewport': getFormattedViewportMetrics (l.tab),
- }
- warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
- writer.write_record (warcinfo)
-
- # not all behavior scripts are allowed for every URL, filter them
- enabledBehavior = list (filter (lambda x: url in x,
- map (lambda x: x (l), allBehavior)))
-
- self.update_state (state='PROGRESS', meta={'step': 'onload'})
- for b in enabledBehavior:
- logger.debug ('starting onload behavior {}'.format (b.name))
- b.onload ()
- l.start ()
-
- self.update_state (state='PROGRESS', meta={'step': 'fetch'})
- l.waitIdle (idleTimeout, timeout)
-
- self.update_state (state='PROGRESS', meta={'step': 'onstop'})
- for b in enabledBehavior:
- logger.debug ('starting onstop behavior {}'.format (b.name))
- b.onstop ()
-
- # if we stopped due to timeout, wait for remaining assets
- l.waitIdle (2, 60)
- l.stop ()
-
- self.update_state (state='PROGRESS', meta={'step': 'onfinish'})
- for b in enabledBehavior:
- logger.debug ('starting onfinish behavior {}'.format (b.name))
- b.onfinish ()
-
- ret['stats'] = l.stats
- writer.flush ()
- if not output:
- outPath = os.path.join (app.conf.finished_dir, outFile)
- os.rename (fd.name, outPath)
- return ret
+from . import behavior
+from .controller import SinglePageController, defaultSettings, ControllerSettings
def stateCallback (data):
result = data['result']
@@ -134,34 +35,40 @@ def stateCallback (data):
def main ():
parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
parser.add_argument('--browser', help='DevTools URL', metavar='URL')
- parser.add_argument('--distributed', help='Use celery worker', action='store_true')
parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
- parser.add_argument('--log-buffer', default=defaults.logBuffer, type=int, dest='logBuffer', metavar='LINES')
- parser.add_argument('--max-body-size', default=defaults.maxBodySize, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
- #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')
+ parser.add_argument('--log-buffer', default=defaultSettings.logBuffer, type=int, dest='logBuffer', metavar='LINES')
+ parser.add_argument('--max-body-size', default=defaultSettings.maxBodySize, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES')
parser.add_argument('--behavior', help='Comma-separated list of enabled behavior scripts',
dest='enabledBehaviorNames',
default=list (behavior.availableNames),
choices=list (behavior.availableNames))
+ group = parser.add_mutually_exclusive_group (required=True)
+ group.add_argument('--output', help='WARC filename', metavar='FILE')
+ group.add_argument('--distributed', help='Use celery worker', action='store_true')
parser.add_argument('url', help='Website URL')
- parser.add_argument('output', help='WARC filename')
args = parser.parse_args ()
# prepare args for function
distributed = args.distributed
- passArgs = vars (args)
- del passArgs['distributed']
if distributed:
- result = archive.delay (**passArgs)
+ from .task import archive
+ settings = dict (maxBodySize=args.maxBodySize,
+ logBuffer=args.logBuffer, idleTimeout=args.idleTimeout,
+ timeout=args.timeout)
+ result = archive.delay (url=args.url, settings=settings,
+ enabledBehaviorNames=args.enabledBehaviorNames)
r = result.get (on_message=stateCallback)
else:
- # XXX: local evaluation does not init celery logging?
logging.basicConfig (level=logging.INFO)
- r = archive (**passArgs)
- print (r['stats'])
+ settings = ControllerSettings (maxBodySize=args.maxBodySize,
+ logBuffer=args.logBuffer, idleTimeout=args.idleTimeout,
+ timeout=args.timeout)
+ with open (args.output, 'wb') as fd:
+ controller = SinglePageController (args.url, fd, settings=settings)
+ controller.run ()
return True