diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-06-20 11:13:37 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-06-20 11:17:25 +0200 |
commit | 7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981 (patch) | |
tree | 15d0ca2e0374b7d00a05d5dd5de1e48838e71feb /crocoite/cli.py | |
parent | 06a06463c0367718b2ed1b2b7f081cff6ca998a0 (diff) | |
download | crocoite-7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981.tar.gz crocoite-7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981.tar.bz2 crocoite-7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981.zip |
Synchronous SiteLoader event handling
Previously a browser crash stalled the entire grab, since events from
pychrome were handled asynchronously in a different thread and
exceptions were not propagated to the main thread.
Now all browser events are stored in a queue and processed by the main
thread, allowing us to handle browser crashes gracefully (more or less).
This made the following additional changes necessary:
- Clear separation between producer (browser) and consumer (WARC, stats,
…)
- Behavior scripts now yield events as well, instead of accessing the
WARC writer
- WARC logging was removed (for now), and the WARC writer no longer requires
serialization
Diffstat (limited to 'crocoite/cli.py')
-rw-r--r-- | crocoite/cli.py | 19 |
1 file changed, 13 insertions, 6 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py index f6454da..d631f10 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -26,8 +26,9 @@ import logging, argparse, json, sys from . import behavior from .controller import RecursiveController, defaultSettings, \ - ControllerSettings, DepthLimit, PrefixLimit + ControllerSettings, DepthLimit, PrefixLimit, StatsHandler from .browser import NullService, ChromeService +from .warc import WarcHandler def parseRecursive (recursive, url): if recursive is None: @@ -41,6 +42,7 @@ def parseRecursive (recursive, url): def main (): parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') + parser.add_argument('--debug', help='Enable debug messages', action='store_true') parser.add_argument('--browser', help='DevTools URL', metavar='URL') parser.add_argument('--recursive', help='Follow links recursively') parser.add_argument('--concurrency', '-j', type=int, default=1) @@ -50,8 +52,8 @@ def main (): parser.add_argument('--max-body-size', default=defaultSettings.maxBodySize, type=int, dest='maxBodySize', help='Max body size', metavar='BYTES') parser.add_argument('--behavior', help='Comma-separated list of enabled behavior scripts', dest='enabledBehaviorNames', - default=list (behavior.availableNames), - choices=list (behavior.availableNames)) + default=list (behavior.availableMap.keys ()), + choices=list (behavior.availableMap.keys ())) group = parser.add_mutually_exclusive_group (required=True) group.add_argument('--output', help='WARC filename', metavar='FILE') group.add_argument('--distributed', help='Use celery worker', action='store_true') @@ -71,7 +73,8 @@ def main (): recursive=args.recursive, concurrency=args.concurrency) r = result.get () else: - logging.basicConfig (level=logging.INFO) + level = logging.DEBUG if args.debug else logging.INFO + logging.basicConfig (level=level) try: recursionPolicy = parseRecursive (args.recursive, args.url) @@ -84,9 +87,13 @@ def main (): 
logBuffer=args.logBuffer, idleTimeout=args.idleTimeout, timeout=args.timeout) with open (args.output, 'wb') as fd: + handler = [StatsHandler (), WarcHandler (fd)] + b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames)) controller = RecursiveController (args.url, fd, settings=settings, - recursionPolicy=recursionPolicy, service=service) - r = controller.run () + recursionPolicy=recursionPolicy, service=service, + handler=handler, behavior=b) + controller.run () + r = handler[0].stats json.dump (r, sys.stdout) return True |