From 824c6e91ae6fee1318e79c3ce1a43f98bc697c7b Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Fri, 4 May 2018 13:00:05 +0200 Subject: Add distributed recursive crawls --- crocoite/cli.py | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) (limited to 'crocoite/cli.py') diff --git a/crocoite/cli.py b/crocoite/cli.py index 196162e..15fac12 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -28,15 +28,11 @@ from . import behavior from .controller import RecursiveController, defaultSettings, \ ControllerSettings, DepthLimit, PrefixLimit -def stateCallback (data): - result = data['result'] - if data['status'] == 'PROGRESS': - print (data['task_id'], result['step']) - def main (): parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') parser.add_argument('--browser', help='DevTools URL', metavar='URL') parser.add_argument('--recursive', help='Follow links recursively') + parser.add_argument('--concurrency', '-j', type=int, default=1) parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC') parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC') parser.add_argument('--log-buffer', default=defaultSettings.logBuffer, type=int, dest='logBuffer', metavar='LINES') @@ -52,29 +48,28 @@ def main (): args = parser.parse_args () - # prepare args for function - distributed = args.distributed - if args.recursive is None: - recursionPolicy = DepthLimit (0) - elif args.recursive.isdigit (): - recursionPolicy = DepthLimit (int (args.recursive)) - elif args.recursive == 'prefix': - recursionPolicy = PrefixLimit (args.url) - else: - parser.error ('Invalid argument for --recursive') - - if distributed: - assert args.recursive is None, "Distributed crawls cannot be recursive right now, sorry" - - from .task import archive + if args.distributed: + if args.browser: + parser.error ('--browser is not supported for distributed jobs') + from . import task settings = dict (maxBodySize=args.maxBodySize, logBuffer=args.logBuffer, idleTimeout=args.idleTimeout, timeout=args.timeout) - result = archive.delay (url=args.url, settings=settings, - enabledBehaviorNames=args.enabledBehaviorNames) - r = result.get (on_message=stateCallback) + result = task.controller.delay (url=args.url, settings=settings, + enabledBehaviorNames=args.enabledBehaviorNames, + recursive=args.recursive, concurrency=args.concurrency) + r = result.get () else: logging.basicConfig (level=logging.INFO) + + if args.recursive is None: + recursionPolicy = DepthLimit (0) + elif args.recursive.isdigit (): + recursionPolicy = DepthLimit (int (args.recursive)) + elif args.recursive == 'prefix': + recursionPolicy = PrefixLimit (args.url) + else: + parser.error ('Invalid argument for --recursive') settings = ControllerSettings (maxBodySize=args.maxBodySize, logBuffer=args.logBuffer, idleTimeout=args.idleTimeout, timeout=args.timeout) -- cgit v1.2.3