summaryrefslogtreecommitdiff
path: root/contrib/celerycrocoite.py
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2018-06-20 11:13:37 +0200
committerLars-Dominik Braun <lars@6xq.net>2018-06-20 11:17:25 +0200
commit7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981 (patch)
tree15d0ca2e0374b7d00a05d5dd5de1e48838e71feb /contrib/celerycrocoite.py
parent06a06463c0367718b2ed1b2b7f081cff6ca998a0 (diff)
downloadcrocoite-7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981.tar.gz
crocoite-7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981.tar.bz2
crocoite-7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981.zip
Synchronous SiteLoader event handling
Previously a browser crash stalled the entire grab, since events from pychrome were handled asynchronously in a different thread and exceptions were not propagated to the main thread. Now all browser events are stored in a queue and processed by the main thread, allowing us to handle browser crashes gracefully (more or less). This made the following additional changes necessary:
- Clear separation between producer (browser) and consumer (WARC, stats, …)
- Behavior scripts now yield events as well, instead of accessing the WARC writer
- WARC logging was removed (for now) and WARC writer does not require serialization any more
Diffstat (limited to 'contrib/celerycrocoite.py')
-rw-r--r--contrib/celerycrocoite.py9
1 files changed, 4 insertions, 5 deletions
diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py
index 26c35ce..d0a02e9 100644
--- a/contrib/celerycrocoite.py
+++ b/contrib/celerycrocoite.py
@@ -88,10 +88,9 @@ def checkCompletedJobs (bot, jobs):
if Identifier (channel) not in bot.channels:
continue
try:
- result = handle.get (timeout=0.1)
- stats = result['stats']
- bot.msg (channel, '{}: {} ({}) finished. {} requests, {} failed, {} received.'.format (user, url,
- handle.id, stats['requests'], stats['failed'],
+ stats = handle.get (timeout=0.1)
+ bot.msg (channel, '{}: {} ({}) finished. {} crashed, {} requests, {} failed, {} received.'.format (user, url,
+ handle.id, stats['crashed'], stats['requests'], stats['failed'],
prettyBytes (stats['bytesRcv'])))
delete.add (handle.id)
except celery.exceptions.TimeoutError:
@@ -198,7 +197,7 @@ def archive (bot, trigger):
logBuffer=defaultSettings.logBuffer, idleTimeout=args.idleTimeout,
timeout=args.timeout)
args = dict (url=args.url,
- enabledBehaviorNames=list (behavior.availableNames-blacklistedBehavior),
+ enabledBehaviorNames=list (set (behavior.availableMap.keys())-blacklistedBehavior),
settings=settings, recursive=args.recursive,
concurrency=args.concurrency)
q = bot.memory['crocoite']['q']