summary | refs | log | tree | commit | diff
path: root/crocoite
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2018-10-23 16:24:38 +0200
committer Lars-Dominik Braun <lars@6xq.net> 2018-10-23 16:25:33 +0200
commit 3cc7b39bd5d3d54e0fcc569385ce105e63425a63 (patch)
tree a3785a91ff69721d2656f42d3def27caa898a567 /crocoite
parent 513dfcc432ce20e62623c97ca44352211c1422a0 (diff)
download crocoite-3cc7b39bd5d3d54e0fcc569385ce105e63425a63.tar.gz
crocoite-3cc7b39bd5d3d54e0fcc569385ce105e63425a63.tar.bz2
crocoite-3cc7b39bd5d3d54e0fcc569385ce105e63425a63.zip
single: Set and recursive: check exit status
Use the exit status to signal that something went wrong. Check it within recursive: on a non-zero status, increment the crashed counter and do not move the resulting WARC, since it might be broken.
Diffstat (limited to 'crocoite')
-rw-r--r-- crocoite/cli.py 25
-rw-r--r-- crocoite/controller.py 21
2 files changed, 34 insertions, 12 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 6365c78..ab336e1 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -23,14 +23,21 @@ Command line interface
"""
import argparse, json, sys, signal
+from enum import IntEnum
from . import behavior
from .controller import SinglePageController, defaultSettings, \
ControllerSettings, StatsHandler, LogHandler
-from .browser import NullService, ChromeService
+from .browser import NullService, ChromeService, BrowserCrashed
from .warc import WarcHandler
from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer
+class SingleExitStatus(IntEnum):
+ """ Exit status for single-shot command line """
+ Ok = 0
+ Fail = 1
+ BrowserCrash = 2
+
def single ():
parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
parser.add_argument('--browser', help='DevTools URL', metavar='URL')
@@ -48,6 +55,7 @@ def single ():
logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])
+ ret = SingleExitStatus.Fail
service = ChromeService ()
if args.browser:
service = NullService (args.browser)
@@ -59,11 +67,16 @@ def single ():
b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames))
controller = SinglePageController (args.url, fd, settings=settings,
service=service, handler=handler, behavior=b, logger=logger)
- controller.run ()
- r = handler[0].stats
- logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r)
+ try:
+ controller.run ()
+ ret = SingleExitStatus.Ok
+ except BrowserCrashed:
+ ret = SingleExitStatus.BrowserCrash
+ finally:
+ r = handler[0].stats
+ logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r)
- return True
+ return ret
import asyncio, os
from .controller import RecursiveController, DepthLimit, PrefixLimit
@@ -110,6 +123,8 @@ def recursive ():
loop.run_until_complete(controller.run ())
loop.close()
+ return 0
+
def irc ():
from configparser import ConfigParser
from .irc import Chromebot
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 45d9442..b1f5f6f 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -56,7 +56,7 @@ class StatsHandler (EventHandler):
acceptException = True
def __init__ (self):
- self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0}
+ self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0}
def push (self, item):
if isinstance (item, Item):
@@ -66,8 +66,6 @@ class StatsHandler (EventHandler):
else:
self.stats['finished'] += 1
self.stats['bytesRcv'] += item.encodedDataLength
- elif isinstance (item, BrowserCrashed):
- self.stats['crashed'] += 1
from .behavior import ExtractLinksEvent
from itertools import islice
@@ -321,14 +319,20 @@ class RecursiveController:
command is usually crocoite-grab
"""
+ logger = self.logger.bind (url=url)
+
def formatCommand (e):
return e.format (url=url, dest=dest.name)
def formatPrefix (p):
return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ())
+ def logStats ():
+ logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
+
if urlparse (url).scheme not in self.SCHEME_WHITELIST:
self.stats['ignored'] += 1
+ logStats ()
self.logger.warning ('scheme not whitelisted', url=url,
uuid='57e838de-4494-4316-ae98-cd3a2ebf541b')
return
@@ -337,7 +341,6 @@ class RecursiveController:
prefix=formatPrefix (self.prefix), suffix='.warc.gz',
delete=False)
destpath = os.path.join (self.output, os.path.basename (dest.name))
- logger = self.logger.bind (url=url)
command = list (map (formatCommand, self.command))
logger.info ('fetch', uuid='1680f384-744c-4b8a-815b-7346e632e8db', command=command, destfile=destpath)
process = await asyncio.create_subprocess_exec (*command, stdout=asyncio.subprocess.PIPE,
@@ -356,10 +359,14 @@ class RecursiveController:
elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':
for k in self.stats.keys ():
self.stats[k] += data.get (k, 0)
- logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
+ logStats ()
code = await process.wait()
- # atomically move once finished
- os.rename (dest.name, destpath)
+ if code == 0:
+ # atomically move once finished
+ os.rename (dest.name, destpath)
+ else:
+ self.stats['crashed'] += 1
+ logStats ()
def cancel (self):
""" Gracefully cancel this job, waiting for existing workers to shut down """