diff options
Diffstat (limited to 'crocoite/cli.py')
-rw-r--r-- | crocoite/cli.py | 168 |
1 files changed, 127 insertions, 41 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py index c3c41a4..04bbb19 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -22,27 +22,68 @@ Command line interface """ -import argparse, sys, signal, asyncio, os +import argparse, sys, signal, asyncio, os, json +from traceback import TracebackException from enum import IntEnum +from yarl import URL +from http.cookies import SimpleCookie +import pkg_resources +try: + import manhole + manhole.install (patch_fork=False, oneshot_on='USR1') +except ModuleNotFoundError: + pass -from . import behavior +from . import behavior, browser from .controller import SinglePageController, \ ControllerSettings, StatsHandler, LogHandler, \ RecursiveController, DepthLimit, PrefixLimit from .devtools import Passthrough, Process from .warc import WarcHandler -from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer +from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, \ + WarcHandlerConsumer, Level from .devtools import Crashed +def absurl (s): + """ argparse: Absolute URL """ + u = URL (s) + if u.is_absolute (): + return u + raise argparse.ArgumentTypeError ('Must be absolute') + +def cookie (s): + """ argparse: Cookie """ + c = SimpleCookie (s) + # for some reason the constructor does not raise an exception if the cookie + # supplied is invalid. It’ll simply be empty. + if len (c) != 1: + raise argparse.ArgumentTypeError ('Invalid cookie') + # we want a single Morsel + return next (iter (c.values ())) + +def cookiejar (f): + """ argparse: Cookies from file """ + cookies = [] + try: + with open (f, 'r') as fd: + for l in fd: + l = l.lstrip () + if l and not l.startswith ('#'): + cookies.append (cookie (l)) + except FileNotFoundError: + raise argparse.ArgumentTypeError (f'Cookie jar "{f}" does not exist') + return cookies + class SingleExitStatus(IntEnum): """ Exit status for single-shot command line """ Ok = 0 Fail = 1 BrowserCrash = 2 + Navigate = 3 def single (): - parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') - parser.add_argument('--browser', help='DevTools URL', metavar='URL') + parser = argparse.ArgumentParser(description='crocoite helper tools to fetch individual pages.') + parser.add_argument('--browser', help='DevTools URL', type=absurl, metavar='URL') parser.add_argument('--timeout', default=1*60*60, type=int, help='Maximum time for archival', metavar='SEC') parser.add_argument('--idle-timeout', default=30, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC') parser.add_argument('--behavior', help='Enable behavior script', @@ -50,7 +91,19 @@ def single (): default=list (behavior.availableMap.keys ()), choices=list (behavior.availableMap.keys ()), metavar='NAME', nargs='*') - parser.add_argument('url', help='Website URL', metavar='URL') + parser.add_argument('--warcinfo', help='Add extra information to warcinfo record', + metavar='JSON', type=json.loads) + # re-using curl’s short/long switch names whenever possible + parser.add_argument('-k', '--insecure', + action='store_true', + help='Disable certificate validation') + parser.add_argument ('-b', '--cookie', type=cookie, metavar='SET-COOKIE', + action='append', default=[], help='Cookies in Set-Cookie format.') + parser.add_argument ('-c', '--cookie-jar', dest='cookieJar', + type=cookiejar, metavar='FILE', + default=pkg_resources.resource_filename (__name__, 'data/cookies.txt'), + help='Cookie jar file, read-only.') + parser.add_argument('url', help='Website URL', type=absurl, metavar='URL') parser.add_argument('output', help='WARC filename', metavar='FILE') args = parser.parse_args () @@ -61,13 +114,19 @@ def single (): service = Process () if args.browser: service = Passthrough (args.browser) - settings = ControllerSettings (idleTimeout=args.idleTimeout, timeout=args.timeout) + settings = ControllerSettings ( + idleTimeout=args.idleTimeout, + timeout=args.timeout, + insecure=args.insecure, + cookies=args.cookieJar + args.cookie, + ) with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler: logger.connect (WarcHandlerConsumer (warcHandler)) handler = [StatsHandler (), LogHandler (logger), warcHandler] b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames)) controller = SinglePageController (url=args.url, settings=settings, - service=service, handler=handler, behavior=b, logger=logger) + service=service, handler=handler, behavior=b, logger=logger, + warcinfo=args.warcinfo) try: loop = asyncio.get_event_loop() run = asyncio.ensure_future (controller.run ()) @@ -79,9 +138,20 @@ def single (): ret = SingleExitStatus.Ok except Crashed: ret = SingleExitStatus.BrowserCrash + except asyncio.CancelledError: + # don’t log this one + pass + except browser.NavigateError: + ret = SingleExitStatus.Navigate + except Exception as e: + ret = SingleExitStatus.Fail + logger.error ('cli exception', + uuid='7fd69858-ecaa-4225-b213-8ab880aa3cc5', + traceback=list (TracebackException.from_exception (e).format ())) finally: r = handler[0].stats logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r) + logger.info ('exit', context='cli', uuid='9b1bd603-f7cd-4745-895a-5b894a5166f2', status=ret) return ret @@ -92,68 +162,84 @@ def parsePolicy (recursive, url): return DepthLimit (int (recursive)) elif recursive == 'prefix': return PrefixLimit (url) - raise ValueError ('Unsupported') + raise argparse.ArgumentTypeError ('Unsupported recursion mode') def recursive (): logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) - parser = argparse.ArgumentParser(description='Recursively run crocoite-grab.') - parser.add_argument('--policy', help='Recursion policy', metavar='POLICY') - parser.add_argument('--tempdir', help='Directory for temporary files', metavar='DIR') - parser.add_argument('--prefix', help='Output filename prefix, supports templates {host} and {date}', metavar='FILENAME', default='{host}-{date}-') - parser.add_argument('--concurrency', '-j', help='Run at most N jobs', metavar='N', default=1, type=int) - parser.add_argument('url', help='Seed URL', metavar='URL') - parser.add_argument('output', help='Output directory', metavar='DIR') - parser.add_argument('command', help='Fetch command, supports templates {url} and {dest}', metavar='CMD', nargs='*', default=['crocoite-grab', '{url}', '{dest}']) + parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') + parser.add_argument('-j', '--concurrency', + help='Run at most N jobs concurrently', metavar='N', default=1, + type=int) + parser.add_argument('-r', '--recursion', help='Recursion policy', + metavar='POLICY') + parser.add_argument('--tempdir', help='Directory for temporary files', + metavar='DIR') + parser.add_argument('url', help='Seed URL', type=absurl, metavar='URL') + parser.add_argument('output', + help='Output file, supports templates {host}, {date} and {seqnum}', + metavar='FILE') + parser.add_argument('command', + help='Fetch command, supports templates {url} and {dest}', + metavar='CMD', nargs='*', + default=['crocoite-single', '{url}', '{dest}']) args = parser.parse_args () try: - policy = parsePolicy (args.policy, args.url) - except ValueError: - parser.error ('Invalid argument for --policy') - - os.makedirs (args.output, exist_ok=True) + policy = parsePolicy (args.recursion, args.url) + except argparse.ArgumentTypeError as e: + parser.error (str (e)) - controller = RecursiveController (url=args.url, output=args.output, - command=args.command, logger=logger, policy=policy, - tempdir=args.tempdir, prefix=args.prefix, - concurrency=args.concurrency) + try: + controller = RecursiveController (url=args.url, output=args.output, + command=args.command, logger=logger, policy=policy, + tempdir=args.tempdir, concurrency=args.concurrency) + except ValueError as e: + parser.error (str (e)) + run = asyncio.ensure_future (controller.run ()) loop = asyncio.get_event_loop() - stop = lambda signum: controller.cancel () + stop = lambda signum: run.cancel () loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT) loop.add_signal_handler (signal.SIGTERM, stop, signal.SIGTERM) - loop.run_until_complete(controller.run ()) - loop.close() + try: + loop.run_until_complete(run) + except asyncio.CancelledError: + pass + finally: + loop.close() return 0 def irc (): - from configparser import ConfigParser + import json, re from .irc import Chromebot logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) parser = argparse.ArgumentParser(description='IRC bot.') - parser.add_argument('--config', '-c', help='Config file location', metavar='PATH', default='chromebot.ini') + parser.add_argument('--config', '-c', help='Config file location', metavar='PATH', default='chromebot.json') args = parser.parse_args () - config = ConfigParser () - config.read (args.config) + with open (args.config) as fd: + config = json.load (fd) s = config['irc'] + blacklist = dict (map (lambda x: (re.compile (x[0], re.I), x[1]), config['blacklist'].items ())) loop = asyncio.get_event_loop() bot = Chromebot ( - host=s.get ('host'), - port=s.getint ('port'), - ssl=s.getboolean ('ssl'), - nick=s.get ('nick'), - channels=[s.get ('channel')], - tempdir=s.get ('tempdir'), - destdir=s.get ('destdir'), - processLimit=s.getint ('process_limit'), + host=s['host'], + port=s['port'], + ssl=s['ssl'], + nick=s['nick'], + channels=s['channels'], + tempdir=config['tempdir'], + destdir=config['destdir'], + processLimit=config['process_limit'], logger=logger, + blacklist=blacklist, + needVoice=config['need_voice'], loop=loop) stop = lambda signum: bot.cancel () loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT) |