diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-06-20 11:13:37 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-06-20 11:17:25 +0200 |
commit | 7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981 (patch) | |
tree | 15d0ca2e0374b7d00a05d5dd5de1e48838e71feb /contrib | |
parent | 06a06463c0367718b2ed1b2b7f081cff6ca998a0 (diff) | |
download | crocoite-7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981.tar.gz crocoite-7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981.tar.bz2 crocoite-7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981.zip |
Synchronous SiteLoader event handling
Previously a browser crash stalled the entire grab, since events from
pychrome were handled asynchronously in a different thread and
exceptions were not propagated to the main thread.
Now all browser events are stored in a queue and processed by the main
thread, allowing us to handle browser crashes gracefully (more or less).
This made the following additional changes necessary:
- Clear separation between producer (browser) and consumer (WARC, stats, …)
- Behavior scripts now yield events as well, instead of accessing the WARC writer
- WARC logging was removed (for now) and WARC writer does not require serialization any more
Diffstat (limited to 'contrib')
-rw-r--r-- | contrib/celerycrocoite.py | 9 |
1 file changed, 4 insertions, 5 deletions
```diff
diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py
index 26c35ce..d0a02e9 100644
--- a/contrib/celerycrocoite.py
+++ b/contrib/celerycrocoite.py
@@ -88,10 +88,9 @@ def checkCompletedJobs (bot, jobs):
         if Identifier (channel) not in bot.channels:
             continue
         try:
-            result = handle.get (timeout=0.1)
-            stats = result['stats']
-            bot.msg (channel, '{}: {} ({}) finished. {} requests, {} failed, {} received.'.format (user, url,
-                    handle.id, stats['requests'], stats['failed'],
+            stats = handle.get (timeout=0.1)
+            bot.msg (channel, '{}: {} ({}) finished. {} crashed, {} requests, {} failed, {} received.'.format (user, url,
+                    handle.id, stats['crashed'], stats['requests'], stats['failed'],
                     prettyBytes (stats['bytesRcv'])))
             delete.add (handle.id)
         except celery.exceptions.TimeoutError:
@@ -198,7 +197,7 @@ def archive (bot, trigger):
             logBuffer=defaultSettings.logBuffer, idleTimeout=args.idleTimeout,
             timeout=args.timeout)
     args = dict (url=args.url,
-            enabledBehaviorNames=list (behavior.availableNames-blacklistedBehavior),
+            enabledBehaviorNames=list (set (behavior.availableMap.keys())-blacklistedBehavior),
             settings=settings, recursive=args.recursive,
             concurrency=args.concurrency)
     q = bot.memory['crocoite']['q']
```