Diffstat (limited to 'crocoite/cli.py')
-rw-r--r--  crocoite/cli.py  101
1 file changed, 36 insertions(+), 65 deletions(-)
diff --git a/crocoite/cli.py b/crocoite/cli.py
index d89384d..93b742b 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -42,6 +42,13 @@ from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, \
WarcHandlerConsumer, Level
from .devtools import Crashed
+def absurl (s):
+ """ argparse: Absolute URL """
+ u = URL (s)
+ if u.is_absolute ():
+ return u
+ raise argparse.ArgumentTypeError ('Must be absolute')
+
class SingleExitStatus(IntEnum):
""" Exit status for single-shot command line """
Ok = 0
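
The ``absurl`` helper added here is an argparse ``type=`` callback: argparse invokes it on the raw argument string and turns a raised ``ArgumentTypeError`` into a normal usage error. A minimal standalone sketch, assuming ``URL`` is ``yarl.URL`` (the actual import lies outside this hunk):

.. code:: python

    import argparse
    from yarl import URL  # assumption: crocoite's URL type

    def absurl (s):
        """ argparse: Absolute URL """
        u = URL (s)
        if u.is_absolute ():
            return u
        raise argparse.ArgumentTypeError ('Must be absolute')

    parser = argparse.ArgumentParser ()
    parser.add_argument ('url', type=absurl, metavar='URL')

    args = parser.parse_args (['http://example.com/'])
    print (args.url)  # http://example.com/
    # parser.parse_args (['foo/bar']) would instead exit with
    # "error: argument URL: Must be absolute".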
@@ -50,21 +57,8 @@ class SingleExitStatus(IntEnum):
Navigate = 3
def single ():
- """
- One-shot command line interface and pywb_ playback:
-
- .. code:: bash
-
- pip install pywb
- crocoite-grab http://example.com/ example.com.warc.gz
- rm -rf collections && wb-manager init test && wb-manager add test example.com.warc.gz
- wayback &
- $BROWSER http://localhost:8080
-
- .. _pywb: https://github.com/ikreymer/pywb
- """
- parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
- parser.add_argument('--browser', help='DevTools URL', metavar='URL')
+ parser = argparse.ArgumentParser(description='crocoite helper tools to fetch individual pages.')
+ parser.add_argument('--browser', help='DevTools URL', type=absurl, metavar='URL')
parser.add_argument('--timeout', default=1*60*60, type=int, help='Maximum time for archival', metavar='SEC')
parser.add_argument('--idle-timeout', default=30, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
parser.add_argument('--behavior', help='Enable behavior script',
@@ -77,7 +71,7 @@ def single ():
parser.add_argument('-k', '--insecure',
action='store_true',
help='Disable certificate validation')
- parser.add_argument('url', help='Website URL', type=URL, metavar='URL')
+ parser.add_argument('url', help='Website URL', type=absurl, metavar='URL')
parser.add_argument('output', help='WARC filename', metavar='FILE')
args = parser.parse_args ()
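
``SingleExitStatus``, whose definition the hunks only show in part (``Ok = 0`` and ``Navigate = 3`` appear as context lines), is an ``IntEnum``: members compare equal to plain integers, so ``single ()`` can hand them straight back to the shell. A small sketch limited to the two members visible above:

.. code:: python

    import sys
    from enum import IntEnum

    class SingleExitStatus (IntEnum):
        """ Exit status for single-shot command line """
        Ok = 0
        Navigate = 3  # remaining members are elided in the diff

    ret = SingleExitStatus.Ok
    assert ret == 0  # IntEnum compares equal to int
    sys.exit (ret)   # the shell receives the numeric value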
@@ -135,50 +129,40 @@ def parsePolicy (recursive, url):
return DepthLimit (int (recursive))
elif recursive == 'prefix':
return PrefixLimit (url)
- raise ValueError ('Unsupported')
+ raise argparse.ArgumentTypeError ('Unsupported recursion mode')
def recursive ():
- """
- crocoite is built with the Unix philosophy (“do one thing and do it well”) in
- mind. Thus ``crocoite-grab`` can only save a single page. If you want recursion
- use ``crocoite-recursive``, which follows hyperlinks according to ``--policy``.
- It can either recurse a maximum number of levels or grab all pages with the
- same prefix as the start URL:
-
- .. code:: bash
-
- crocoite-recursive --policy prefix http://www.example.com/dir/ output
-
- will save all pages in ``/dir/`` and below to individual files in the output
- directory ``output``. You can customize the command used to grab individual
- pages by appending it after ``output``. This way distributed grabs (ssh to a
- different machine and execute the job there, queue the command with Slurm, …)
- are possible.
- """
-
logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])
- parser = argparse.ArgumentParser(description='Recursively run crocoite-grab.')
- parser.add_argument('--policy', help='Recursion policy', metavar='POLICY')
- parser.add_argument('--tempdir', help='Directory for temporary files', metavar='DIR')
- parser.add_argument('--prefix', help='Output filename prefix, supports templates {host} and {date}', metavar='FILENAME', default='{host}-{date}-')
- parser.add_argument('--concurrency', '-j', help='Run at most N jobs', metavar='N', default=1, type=int)
- parser.add_argument('url', help='Seed URL', type=URL, metavar='URL')
- parser.add_argument('output', help='Output directory', metavar='DIR')
- parser.add_argument('command', help='Fetch command, supports templates {url} and {dest}', metavar='CMD', nargs='*', default=['crocoite-grab', '{url}', '{dest}'])
+ parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
+ parser.add_argument('-j', '--concurrency',
+ help='Run at most N jobs concurrently', metavar='N', default=1,
+ type=int)
+ parser.add_argument('-r', '--recursion', help='Recursion policy',
+ metavar='POLICY')
+ parser.add_argument('--tempdir', help='Directory for temporary files',
+ metavar='DIR')
+ parser.add_argument('url', help='Seed URL', type=absurl, metavar='URL')
+ parser.add_argument('output',
+ help='Output file, supports templates {host}, {date} and {seqnum}',
+ metavar='FILE')
+ parser.add_argument('command',
+ help='Fetch command, supports templates {url} and {dest}',
+ metavar='CMD', nargs='*',
+ default=['crocoite-single', '{url}', '{dest}'])
args = parser.parse_args ()
try:
- policy = parsePolicy (args.policy, args.url)
- except ValueError:
- parser.error ('Invalid argument for --policy')
+ policy = parsePolicy (args.recursion, args.url)
+ except argparse.ArgumentTypeError as e:
+ parser.error (str (e))
- os.makedirs (args.output, exist_ok=True)
-
- controller = RecursiveController (url=args.url, output=args.output,
- command=args.command, logger=logger, policy=policy,
- tempdir=args.tempdir, prefix=args.prefix,
- concurrency=args.concurrency)
+ try:
+ controller = RecursiveController (url=args.url, output=args.output,
+ command=args.command, logger=logger, policy=policy,
+ tempdir=args.tempdir, concurrency=args.concurrency)
+ except ValueError as e:
+ parser.error (str (e))
run = asyncio.ensure_future (controller.run ())
loop = asyncio.get_event_loop()
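
The help strings above describe ``output`` and ``command`` as templates with named fields (``{host}``, ``{date}``, ``{seqnum}``, ``{url}``, ``{dest}``). How crocoite expands them is not part of this hunk; the following ``str.format`` sketch is illustrative only, not the project's actual code:

.. code:: python

    from datetime import datetime
    from yarl import URL  # assumption: crocoite's URL type

    url = URL ('http://www.example.com/dir/')

    # Hypothetical expansion of the {host}/{date}/{seqnum} output template:
    dest = '{host}-{date}-{seqnum}.warc.gz'.format (
            host=url.host,
            date=datetime.utcnow ().date ().isoformat (),
            seqnum=0)
    # e.g. 'www.example.com-2019-01-20-0.warc.gz'

    # Hypothetical expansion of the default fetch command:
    command = [c.format (url=url, dest=dest)
            for c in ['crocoite-single', '{url}', '{dest}']]
    # ['crocoite-single', 'http://www.example.com/dir/', 'www.example.com-...']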
@@ -191,19 +175,6 @@ def recursive ():
return 0
def irc ():
- """
- A simple IRC bot (“chromebot”) is provided with the command ``crocoite-irc``.
- It reads its configuration from a config file like the example provided in
- ``contrib/chromebot.json`` and supports the following commands:
-
- a <url> -j <concurrency> -r <policy>
- Archive <url> with <concurrency> processes according to recursion <policy>
- s <uuid>
- Get job status for <uuid>
- r <uuid>
- Revoke or abort running job with <uuid>
- """
-
import json, re
from .irc import Chromebot