From 7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981 Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Wed, 20 Jun 2018 11:13:37 +0200
Subject: Synchronous SiteLoader event handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously a browser crash stalled the entire grab, since events from
pychrome were handled asynchronously in a different thread and exceptions
were not propagated to the main thread. Now all browser events are stored
in a queue and processed by the main thread, allowing us to handle browser
crashes gracefully (more or less).

This made the following additional changes necessary:

- Clear separation between producer (browser) and consumer (WARC, stats, …)
- Behavior scripts now yield events as well, instead of accessing the WARC
  writer
- WARC logging was removed (for now) and WARC writer does not require
  serialization any more
---
 crocoite/behavior.py | 124 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 80 insertions(+), 44 deletions(-)

(limited to 'crocoite/behavior.py')

diff --git a/crocoite/behavior.py b/crocoite/behavior.py index 95e8160..fd8fff8 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -22,21 +22,44 @@ Generic and per-site behavior scripts """ -import logging +import logging, time from io import BytesIO from urllib.parse import urlsplit import os.path import pkg_resources from base64 import b64decode +from collections import OrderedDict + +from html5lib.serializer import HTMLSerializer +from warcio.statusandheaders import StatusAndHeaders +from pychrome.exceptions import TimeoutException from .util import randomString, packageUrl, getFormattedViewportMetrics from . 
import html from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker -from html5lib.serializer import HTMLSerializer -from warcio.statusandheaders import StatusAndHeaders +from .browser import SiteLoader logger = logging.getLogger(__name__) +class Script: + """ A JavaScript resource """ + def __init__ (self, path=None, encoding='utf-8'): + self.path = path + if path: + self.data = pkg_resources.resource_string (__name__, os.path.join ('data', path)).decode (encoding) + + def __repr__ (self): + return '