summaryrefslogtreecommitdiff
path: root/crocoite/controller.py
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2018-10-23 16:24:38 +0200
committerLars-Dominik Braun <lars@6xq.net>2018-10-23 16:25:33 +0200
commit3cc7b39bd5d3d54e0fcc569385ce105e63425a63 (patch)
treea3785a91ff69721d2656f42d3def27caa898a567 /crocoite/controller.py
parent513dfcc432ce20e62623c97ca44352211c1422a0 (diff)
downloadcrocoite-3cc7b39bd5d3d54e0fcc569385ce105e63425a63.tar.gz
crocoite-3cc7b39bd5d3d54e0fcc569385ce105e63425a63.tar.bz2
crocoite-3cc7b39bd5d3d54e0fcc569385ce105e63425a63.zip
single: Set and recursive: check exit status
Use exit status to signal something is wrong. Check it within recursive, increment crashed counter and do not move the resulting WARC, it might be broken.
Diffstat (limited to 'crocoite/controller.py')
-rw-r--r--crocoite/controller.py21
1 file changed, 14 insertions, 7 deletions
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 45d9442..b1f5f6f 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -56,7 +56,7 @@ class StatsHandler (EventHandler):
acceptException = True
def __init__ (self):
- self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0}
+ self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0}
def push (self, item):
if isinstance (item, Item):
@@ -66,8 +66,6 @@ class StatsHandler (EventHandler):
else:
self.stats['finished'] += 1
self.stats['bytesRcv'] += item.encodedDataLength
- elif isinstance (item, BrowserCrashed):
- self.stats['crashed'] += 1
from .behavior import ExtractLinksEvent
from itertools import islice
@@ -321,14 +319,20 @@ class RecursiveController:
command is usually crocoite-grab
"""
+ logger = self.logger.bind (url=url)
+
def formatCommand (e):
return e.format (url=url, dest=dest.name)
def formatPrefix (p):
return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ())
+ def logStats ():
+ logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
+
if urlparse (url).scheme not in self.SCHEME_WHITELIST:
self.stats['ignored'] += 1
+ logStats ()
self.logger.warning ('scheme not whitelisted', url=url,
uuid='57e838de-4494-4316-ae98-cd3a2ebf541b')
return
@@ -337,7 +341,6 @@ class RecursiveController:
prefix=formatPrefix (self.prefix), suffix='.warc.gz',
delete=False)
destpath = os.path.join (self.output, os.path.basename (dest.name))
- logger = self.logger.bind (url=url)
command = list (map (formatCommand, self.command))
logger.info ('fetch', uuid='1680f384-744c-4b8a-815b-7346e632e8db', command=command, destfile=destpath)
process = await asyncio.create_subprocess_exec (*command, stdout=asyncio.subprocess.PIPE,
@@ -356,10 +359,14 @@ class RecursiveController:
elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':
for k in self.stats.keys ():
self.stats[k] += data.get (k, 0)
- logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
+ logStats ()
code = await process.wait()
- # atomically move once finished
- os.rename (dest.name, destpath)
+ if code == 0:
+ # atomically move once finished
+ os.rename (dest.name, destpath)
+ else:
+ self.stats['crashed'] += 1
+ logStats ()
def cancel (self):
""" Gracefully cancel this job, waiting for existing workers to shut down """