diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-10-23 16:24:38 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-10-23 16:25:33 +0200 |
commit | 3cc7b39bd5d3d54e0fcc569385ce105e63425a63 (patch) | |
tree | a3785a91ff69721d2656f42d3def27caa898a567 /crocoite | |
parent | 513dfcc432ce20e62623c97ca44352211c1422a0 (diff) | |
download | crocoite-3cc7b39bd5d3d54e0fcc569385ce105e63425a63.tar.gz crocoite-3cc7b39bd5d3d54e0fcc569385ce105e63425a63.tar.bz2 crocoite-3cc7b39bd5d3d54e0fcc569385ce105e63425a63.zip |
single: Set and recursive: check exit status
Use the exit status to signal that something went wrong. Check it within recursive,
increment the crashed counter, and do not move the resulting WARC, since it might
be broken.
Diffstat (limited to 'crocoite')
-rw-r--r-- | crocoite/cli.py | 25 | ||||
-rw-r--r-- | crocoite/controller.py | 21 |
2 files changed, 34 insertions, 12 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py index 6365c78..ab336e1 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -23,14 +23,21 @@ Command line interface """ import argparse, json, sys, signal +from enum import IntEnum from . import behavior from .controller import SinglePageController, defaultSettings, \ ControllerSettings, StatsHandler, LogHandler -from .browser import NullService, ChromeService +from .browser import NullService, ChromeService, BrowserCrashed from .warc import WarcHandler from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer +class SingleExitStatus(IntEnum): + """ Exit status for single-shot command line """ + Ok = 0 + Fail = 1 + BrowserCrash = 2 + def single (): parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') parser.add_argument('--browser', help='DevTools URL', metavar='URL') @@ -48,6 +55,7 @@ def single (): logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) + ret = SingleExitStatus.Fail service = ChromeService () if args.browser: service = NullService (args.browser) @@ -59,11 +67,16 @@ def single (): b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames)) controller = SinglePageController (args.url, fd, settings=settings, service=service, handler=handler, behavior=b, logger=logger) - controller.run () - r = handler[0].stats - logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r) + try: + controller.run () + ret = SingleExitStatus.Ok + except BrowserCrashed: + ret = SingleExitStatus.BrowserCrash + finally: + r = handler[0].stats + logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r) - return True + return ret import asyncio, os from .controller import RecursiveController, DepthLimit, PrefixLimit @@ -110,6 +123,8 @@ def recursive (): loop.run_until_complete(controller.run ()) loop.close() + return 0 + def irc (): from configparser import ConfigParser from 
.irc import Chromebot diff --git a/crocoite/controller.py b/crocoite/controller.py index 45d9442..b1f5f6f 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -56,7 +56,7 @@ class StatsHandler (EventHandler): acceptException = True def __init__ (self): - self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0} + self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} def push (self, item): if isinstance (item, Item): @@ -66,8 +66,6 @@ class StatsHandler (EventHandler): else: self.stats['finished'] += 1 self.stats['bytesRcv'] += item.encodedDataLength - elif isinstance (item, BrowserCrashed): - self.stats['crashed'] += 1 from .behavior import ExtractLinksEvent from itertools import islice @@ -321,14 +319,20 @@ class RecursiveController: command is usually crocoite-grab """ + logger = self.logger.bind (url=url) + def formatCommand (e): return e.format (url=url, dest=dest.name) def formatPrefix (p): return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ()) + def logStats (): + logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats) + if urlparse (url).scheme not in self.SCHEME_WHITELIST: self.stats['ignored'] += 1 + logStats () self.logger.warning ('scheme not whitelisted', url=url, uuid='57e838de-4494-4316-ae98-cd3a2ebf541b') return @@ -337,7 +341,6 @@ class RecursiveController: prefix=formatPrefix (self.prefix), suffix='.warc.gz', delete=False) destpath = os.path.join (self.output, os.path.basename (dest.name)) - logger = self.logger.bind (url=url) command = list (map (formatCommand, self.command)) logger.info ('fetch', uuid='1680f384-744c-4b8a-815b-7346e632e8db', command=command, destfile=destpath) process = await asyncio.create_subprocess_exec (*command, stdout=asyncio.subprocess.PIPE, @@ -356,10 +359,14 @@ class RecursiveController: elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff': for k in self.stats.keys (): self.stats[k] += data.get (k, 0) - 
logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats) + logStats () code = await process.wait() - # atomically move once finished - os.rename (dest.name, destpath) + if code == 0: + # atomically move once finished + os.rename (dest.name, destpath) + else: + self.stats['crashed'] += 1 + logStats () def cancel (self): """ Gracefully cancel this job, waiting for existing workers to shut down """ |