path: root/crocoite/cli.py
author    Lars-Dominik Braun <lars@6xq.net>  2018-05-01 15:59:30 +0200
committer Lars-Dominik Braun <lars@6xq.net>  2018-05-04 16:00:05 +0200
commit    48cabe3b0c7e7a760de4d40af55717c024fdc3bf (patch)
tree      269b34c7d89b739ba0c0c2042db282ae20dd3ff7 /crocoite/cli.py
parent    de2b4036ac7ce2d242d49fe37337db890c30da3b (diff)
Add support for recursive crawls
Only local right now, not distributed.
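
The new --recursive flag accepts either a maximum link depth or the literal string 'prefix'. A minimal usage sketch follows; it assumes the positional url and output arguments implied by args.url/args.output in the diff below, and calls main() in-process as a stand-in for the installed console entry point:

    import sys
    from crocoite.cli import main

    # Hypothetical invocation: archive example.com plus pages one link deep,
    # writing the WARC to out.warc.gz. '--recursive' takes a number (depth
    # limit) or the literal 'prefix' (follow only URLs sharing the seed URL).
    sys.argv = ['crocoite', '--recursive', '1',
            'http://example.com/', 'out.warc.gz']
    main ()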
Diffstat (limited to 'crocoite/cli.py')
-rw-r--r--  crocoite/cli.py  17
1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 4cbce4a..196162e 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -25,7 +25,8 @@ Command line interface
 import logging, argparse, json, sys
 
 from . import behavior
-from .controller import SinglePageController, defaultSettings, ControllerSettings
+from .controller import RecursiveController, defaultSettings, \
+        ControllerSettings, DepthLimit, PrefixLimit
 
 def stateCallback (data):
     result = data['result']
@@ -35,6 +36,7 @@ def stateCallback (data):
 def main ():
     parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
     parser.add_argument('--browser', help='DevTools URL', metavar='URL')
+    parser.add_argument('--recursive', help='Follow links recursively')
     parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
     parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
     parser.add_argument('--log-buffer', default=defaultSettings.logBuffer, type=int, dest='logBuffer', metavar='LINES')
@@ -52,8 +54,18 @@
     # prepare args for function
     distributed = args.distributed
+    if args.recursive is None:
+        recursionPolicy = DepthLimit (0)
+    elif args.recursive.isdigit ():
+        recursionPolicy = DepthLimit (int (args.recursive))
+    elif args.recursive == 'prefix':
+        recursionPolicy = PrefixLimit (args.url)
+    else:
+        parser.error ('Invalid argument for --recursive')
     if distributed:
+        assert args.recursive is None, "Distributed crawls cannot be recursive right now, sorry"
+
         from .task import archive
         settings = dict (maxBodySize=args.maxBodySize,
                 logBuffer=args.logBuffer, idleTimeout=args.idleTimeout,
@@ -67,7 +79,8 @@
                 logBuffer=args.logBuffer, idleTimeout=args.idleTimeout,
                 timeout=args.timeout)
         with open (args.output, 'wb') as fd:
-            controller = SinglePageController (args.url, fd, settings=settings)
+            controller = RecursiveController (args.url, fd, settings=settings,
+                    recursionPolicy=recursionPolicy)
             r = controller.run ()
     json.dump (r, sys.stdout)
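
DepthLimit and PrefixLimit are imported from crocoite/controller.py, which is outside this diff. A rough sketch of the interface the dispatch above implies, assuming a policy is a callable that filters the set of newly discovered URLs (method names and bodies are guesses, not the actual implementation):

    class RecursionPolicy:
        # Decide which discovered URLs the crawl may follow next.
        def __call__ (self, urls):
            raise NotImplementedError

    class DepthLimit (RecursionPolicy):
        # Follow links up to a fixed depth; DepthLimit(0) disables recursion.
        def __init__ (self, maxdepth=0):
            self.maxdepth = maxdepth

        def __call__ (self, urls):
            if self.maxdepth <= 0:
                return set ()
            self.maxdepth -= 1
            return urls

    class PrefixLimit (RecursionPolicy):
        # Follow only URLs sharing a prefix, e.g. the seed URL passed above.
        def __init__ (self, prefix):
            self.prefix = prefix

        def __call__ (self, urls):
            return set (url for url in urls if url.startswith (self.prefix))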