summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- crocoite/cli.py 25
-rw-r--r-- crocoite/controller.py 21
2 files changed, 34 insertions, 12 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 6365c78..ab336e1 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -23,14 +23,21 @@ Command line interface
"""
import argparse, json, sys, signal
+from enum import IntEnum
from . import behavior
from .controller import SinglePageController, defaultSettings, \
ControllerSettings, StatsHandler, LogHandler
-from .browser import NullService, ChromeService
+from .browser import NullService, ChromeService, BrowserCrashed
from .warc import WarcHandler
from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer
+class SingleExitStatus(IntEnum):
+ """ Exit status for single-shot command line """
+ Ok = 0
+ Fail = 1
+ BrowserCrash = 2
+
def single ():
parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
parser.add_argument('--browser', help='DevTools URL', metavar='URL')
@@ -48,6 +55,7 @@ def single ():
logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])
+ ret = SingleExitStatus.Fail
service = ChromeService ()
if args.browser:
service = NullService (args.browser)
@@ -59,11 +67,16 @@ def single ():
b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames))
controller = SinglePageController (args.url, fd, settings=settings,
service=service, handler=handler, behavior=b, logger=logger)
- controller.run ()
- r = handler[0].stats
- logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r)
+ try:
+ controller.run ()
+ ret = SingleExitStatus.Ok
+ except BrowserCrashed:
+ ret = SingleExitStatus.BrowserCrash
+ finally:
+ r = handler[0].stats
+ logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r)
- return True
+ return ret
import asyncio, os
from .controller import RecursiveController, DepthLimit, PrefixLimit
@@ -110,6 +123,8 @@ def recursive ():
loop.run_until_complete(controller.run ())
loop.close()
+ return 0
+
def irc ():
from configparser import ConfigParser
from .irc import Chromebot
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 45d9442..b1f5f6f 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -56,7 +56,7 @@ class StatsHandler (EventHandler):
acceptException = True
def __init__ (self):
- self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0}
+ self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0}
def push (self, item):
if isinstance (item, Item):
@@ -66,8 +66,6 @@ class StatsHandler (EventHandler):
else:
self.stats['finished'] += 1
self.stats['bytesRcv'] += item.encodedDataLength
- elif isinstance (item, BrowserCrashed):
- self.stats['crashed'] += 1
from .behavior import ExtractLinksEvent
from itertools import islice
@@ -321,14 +319,20 @@ class RecursiveController:
command is usually crocoite-grab
"""
+ logger = self.logger.bind (url=url)
+
def formatCommand (e):
return e.format (url=url, dest=dest.name)
def formatPrefix (p):
return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ())
+ def logStats ():
+ logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
+
if urlparse (url).scheme not in self.SCHEME_WHITELIST:
self.stats['ignored'] += 1
+ logStats ()
self.logger.warning ('scheme not whitelisted', url=url,
uuid='57e838de-4494-4316-ae98-cd3a2ebf541b')
return
@@ -337,7 +341,6 @@ class RecursiveController:
prefix=formatPrefix (self.prefix), suffix='.warc.gz',
delete=False)
destpath = os.path.join (self.output, os.path.basename (dest.name))
- logger = self.logger.bind (url=url)
command = list (map (formatCommand, self.command))
logger.info ('fetch', uuid='1680f384-744c-4b8a-815b-7346e632e8db', command=command, destfile=destpath)
process = await asyncio.create_subprocess_exec (*command, stdout=asyncio.subprocess.PIPE,
@@ -356,10 +359,14 @@ class RecursiveController:
elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':
for k in self.stats.keys ():
self.stats[k] += data.get (k, 0)
- logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
+ logStats ()
code = await process.wait()
- # atomically move once finished
- os.rename (dest.name, destpath)
+ if code == 0:
+ # atomically move once finished
+ os.rename (dest.name, destpath)
+ else:
+ self.stats['crashed'] += 1
+ logStats ()
def cancel (self):
""" Gracefully cancel this job, waiting for existing workers to shut down """