diff options
Diffstat (limited to 'crocoite')
| -rw-r--r-- | crocoite/cli.py | 25 | ||||
| -rw-r--r-- | crocoite/controller.py | 21 | 
2 files changed, 34 insertions, 12 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 6365c78..ab336e1 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -23,14 +23,21 @@ Command line interface
 """
 
 import argparse, json, sys, signal
+from enum import IntEnum
 
 from . import behavior
 from .controller import SinglePageController, defaultSettings, \
         ControllerSettings, StatsHandler, LogHandler
-from .browser import NullService, ChromeService
+from .browser import NullService, ChromeService, BrowserCrashed
 from .warc import WarcHandler
 from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer
 
+class SingleExitStatus(IntEnum):
+    """ Exit status for single-shot command line """
+    Ok = 0
+    Fail = 1
+    BrowserCrash = 2
+
 def single ():
     parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
     parser.add_argument('--browser', help='DevTools URL', metavar='URL')
@@ -48,6 +55,7 @@ def single ():
 
     logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])
 
+    ret = SingleExitStatus.Fail
     service = ChromeService ()
     if args.browser:
         service = NullService (args.browser)
@@ -59,11 +67,16 @@ def single ():
         b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames))
         controller = SinglePageController (args.url, fd, settings=settings,
                 service=service, handler=handler, behavior=b, logger=logger)
-        controller.run ()
-        r = handler[0].stats
-        logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r)
+        try:
+            controller.run ()
+            ret = SingleExitStatus.Ok
+        except BrowserCrashed:
+            ret = SingleExitStatus.BrowserCrash
+        finally:
+            r = handler[0].stats
+            logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r)
 
-    return True
+    return ret
 
 import asyncio, os
 from .controller import RecursiveController, DepthLimit, PrefixLimit
@@ -110,6 +123,8 @@ def recursive ():
     loop.run_until_complete(controller.run ())
     loop.close()
 
+    return 0
+
 def irc ():
     from configparser import ConfigParser
     from .irc import Chromebot
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 45d9442..b1f5f6f 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -56,7 +56,7 @@ class StatsHandler (EventHandler):
     acceptException = True
 
     def __init__ (self):
-        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0}
+        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0}
 
     def push (self, item):
         if isinstance (item, Item):
@@ -66,8 +66,6 @@ class StatsHandler (EventHandler):
             else:
                 self.stats['finished'] += 1
                 self.stats['bytesRcv'] += item.encodedDataLength
-        elif isinstance (item, BrowserCrashed):
-            self.stats['crashed'] += 1
 
 from .behavior import ExtractLinksEvent
 from itertools import islice
@@ -321,14 +319,20 @@ class RecursiveController:
         command is usually crocoite-grab
         """
 
+        logger = self.logger.bind (url=url)
+
         def formatCommand (e):
             return e.format (url=url, dest=dest.name)
 
         def formatPrefix (p):
             return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ())
 
+        def logStats ():
+            logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
+
         if urlparse (url).scheme not in self.SCHEME_WHITELIST:
             self.stats['ignored'] += 1
+            logStats ()
             self.logger.warning ('scheme not whitelisted', url=url,
                     uuid='57e838de-4494-4316-ae98-cd3a2ebf541b')
             return
@@ -337,7 +341,6 @@ class RecursiveController:
                 prefix=formatPrefix (self.prefix), suffix='.warc.gz',
                 delete=False)
         destpath = os.path.join (self.output, os.path.basename (dest.name))
-        logger = self.logger.bind (url=url)
         command = list (map (formatCommand, self.command))
         logger.info ('fetch', uuid='1680f384-744c-4b8a-815b-7346e632e8db', command=command, destfile=destpath)
         process = await asyncio.create_subprocess_exec (*command, stdout=asyncio.subprocess.PIPE,
@@ -356,10 +359,14 @@ class RecursiveController:
             elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':
                 for k in self.stats.keys ():
                     self.stats[k] += data.get (k, 0)
-                logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
+                logStats ()
         code = await process.wait()
-        # atomically move once finished
-        os.rename (dest.name, destpath)
+        if code == 0:
+            # atomically move once finished
+            os.rename (dest.name, destpath)
+        else:
+            self.stats['crashed'] += 1
+            logStats ()
 
     def cancel (self):
         """ Gracefully cancel this job, waiting for existing workers to shut down """
