From 3cc7b39bd5d3d54e0fcc569385ce105e63425a63 Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Tue, 23 Oct 2018 16:24:38 +0200
Subject: single: Set and recursive: check exit status

Use exit status to signal something is wrong. Check it within recursive,
increment crashed counter and do not move the resulting WARC, it might be
broken.
---
 crocoite/cli.py        | 25 ++++++++++++++++++++-----
 crocoite/controller.py | 21 ++++++++++++++-------
 2 files changed, 34 insertions(+), 12 deletions(-)

(limited to 'crocoite')

diff --git a/crocoite/cli.py b/crocoite/cli.py
index 6365c78..ab336e1 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -23,14 +23,21 @@ Command line interface
 """
 
 import argparse, json, sys, signal
+from enum import IntEnum
 
 from . import behavior
 from .controller import SinglePageController, defaultSettings, \
         ControllerSettings, StatsHandler, LogHandler
-from .browser import NullService, ChromeService
+from .browser import NullService, ChromeService, BrowserCrashed
 from .warc import WarcHandler
 from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer
 
+class SingleExitStatus(IntEnum):
+    """ Exit status for single-shot command line """
+    Ok = 0
+    Fail = 1
+    BrowserCrash = 2
+
 def single ():
     parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
     parser.add_argument('--browser', help='DevTools URL', metavar='URL')
@@ -48,6 +55,7 @@ def single ():
 
     logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])
 
+    ret = SingleExitStatus.Fail
     service = ChromeService ()
     if args.browser:
         service = NullService (args.browser)
@@ -59,11 +67,16 @@ def single ():
         b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames))
         controller = SinglePageController (args.url, fd, settings=settings,
                 service=service, handler=handler, behavior=b, logger=logger)
-        controller.run ()
-        r = handler[0].stats
-        logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r)
+        try:
+            controller.run ()
+            ret = SingleExitStatus.Ok
+        except BrowserCrashed:
+            ret = SingleExitStatus.BrowserCrash
+        finally:
+            r = handler[0].stats
+            logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r)
 
-    return True
+    return ret
 
 import asyncio, os
 from .controller import RecursiveController, DepthLimit, PrefixLimit
@@ -110,6 +123,8 @@ def recursive ():
     loop.run_until_complete(controller.run ())
     loop.close()
 
+    return 0
+
 def irc ():
     from configparser import ConfigParser
     from .irc import Chromebot
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 45d9442..b1f5f6f 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -56,7 +56,7 @@ class StatsHandler (EventHandler):
     acceptException = True
 
     def __init__ (self):
-        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0}
+        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0}
 
     def push (self, item):
         if isinstance (item, Item):
@@ -66,8 +66,6 @@ class StatsHandler (EventHandler):
             else:
                 self.stats['finished'] += 1
                 self.stats['bytesRcv'] += item.encodedDataLength
-        elif isinstance (item, BrowserCrashed):
-            self.stats['crashed'] += 1
 
 from .behavior import ExtractLinksEvent
 from itertools import islice
@@ -321,14 +319,20 @@ class RecursiveController:
         command is usually crocoite-grab
         """
 
+        logger = self.logger.bind (url=url)
+
         def formatCommand (e):
             return e.format (url=url, dest=dest.name)
 
         def formatPrefix (p):
             return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ())
 
+        def logStats ():
+            logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
+
         if urlparse (url).scheme not in self.SCHEME_WHITELIST:
             self.stats['ignored'] += 1
+            logStats ()
             self.logger.warning ('scheme not whitelisted', url=url,
                     uuid='57e838de-4494-4316-ae98-cd3a2ebf541b')
             return
@@ -337,7 +341,6 @@ class RecursiveController:
                 prefix=formatPrefix (self.prefix), suffix='.warc.gz',
                 delete=False)
         destpath = os.path.join (self.output, os.path.basename (dest.name))
-        logger = self.logger.bind (url=url)
         command = list (map (formatCommand, self.command))
         logger.info ('fetch', uuid='1680f384-744c-4b8a-815b-7346e632e8db', command=command, destfile=destpath)
         process = await asyncio.create_subprocess_exec (*command, stdout=asyncio.subprocess.PIPE,
@@ -356,10 +359,14 @@ class RecursiveController:
             elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':
                 for k in self.stats.keys ():
                     self.stats[k] += data.get (k, 0)
-                logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
+                logStats ()
         code = await process.wait()
-        # atomically move once finished
-        os.rename (dest.name, destpath)
+        if code == 0:
+            # atomically move once finished
+            os.rename (dest.name, destpath)
+        else:
+            self.stats['crashed'] += 1
+            logStats ()
 
     def cancel (self):
         """ Gracefully cancel this job, waiting for existing workers to shut down """
-- 
cgit v1.2.3