From 3deded13df1339ef59a760c188804adffd9ed902 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 28 Jul 2018 20:25:49 +0200 Subject: Reintroduce WARC logging Commit 7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981 removed logging to WARC files. Add it again, but with a different implementation.. Credits to structlog for inspiration. --- crocoite/behavior.py | 34 ++++++------ crocoite/browser.py | 48 ++++++++++------- crocoite/cli.py | 16 +++--- crocoite/controller.py | 56 +++++++++++-------- crocoite/logger.py | 137 +++++++++++++++++++++++++++++++++++++++++++++++ crocoite/task.py | 9 ++-- crocoite/test_browser.py | 15 +++++- crocoite/test_logger.py | 60 +++++++++++++++++++++ crocoite/warc.py | 38 +++++++++++-- 9 files changed, 337 insertions(+), 76 deletions(-) create mode 100644 crocoite/logger.py create mode 100644 crocoite/test_logger.py diff --git a/crocoite/behavior.py b/crocoite/behavior.py index b34d3d9..8c24c59 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -22,7 +22,7 @@ Generic and per-site behavior scripts """ -import logging, time +import time from urllib.parse import urlsplit import os.path import pkg_resources @@ -36,8 +36,6 @@ from .util import randomString, getFormattedViewportMetrics, removeFragment from . import html from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker -logger = logging.getLogger(__name__) - class Script: """ A JavaScript resource """ @@ -61,14 +59,15 @@ class Script: return s class Behavior: - __slots__ = ('loader') + __slots__ = ('loader', 'logger') # unique behavior name name = None - def __init__ (self, loader): + def __init__ (self, loader, logger): assert self.name is not None self.loader = loader + self.logger = logger.bind (context=type (self).__name__) def __contains__ (self, url): """ @@ -108,8 +107,8 @@ class JsOnload (Behavior): scriptPath = None - def __init__ (self, loader): - super ().__init__ (loader) + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) self.script = Script (self.scriptPath) self.scriptHandle = None @@ -129,8 +128,8 @@ class Scroll (JsOnload): name = 'scroll' scriptPath = 'scroll.js' - def __init__ (self, loader): - super ().__init__ (loader) + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) stopVarname = '__' + __package__ + '_stop__' newStopVarname = randomString () self.script.data = self.script.data.replace (stopVarname, newStopVarname) @@ -198,8 +197,8 @@ class DomSnapshot (Behavior): name = 'domSnapshot' - def __init__ (self, loader): - super ().__init__ (loader) + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) self.script = Script ('canvas-snapshot.js') def onfinish (self): @@ -216,11 +215,11 @@ class DomSnapshot (Behavior): if rawUrl in haveUrls: # ignore duplicate URLs. they are usually caused by # javascript-injected iframes (advertising) with no(?) 
src - logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl)) + self.logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl)) continue url = urlsplit (rawUrl) if url.scheme in ('http', 'https'): - logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL'])) + self.logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL'])) haveUrls.add (rawUrl) walker = ChromeTreeWalker (doc) # remove script, to make the page static and noscript, because at the @@ -249,10 +248,11 @@ class Screenshot (Behavior): def onfinish (self): tab = self.loader.tab + tree = tab.Page.getFrameTree () try: - url = removeFragment (tab.Page.getFrameTree ()['frameTree']['frame']['url']) + url = removeFragment (tree['frameTree']['frame']['url']) except KeyError: - logger.error ('frame has no url') + self.logger.error ('frame without url', tree=tree) url = None # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js @@ -293,8 +293,8 @@ class ExtractLinks (Behavior): name = 'extractLinks' - def __init__ (self, loader): - super ().__init__ (loader) + def __init__ (self, loader, logger): + super ().__init__ (loader, logger) self.script = Script ('extract-links.js') def onfinish (self): diff --git a/crocoite/browser.py b/crocoite/browser.py index 6a4bee2..fbd12fd 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -22,12 +22,12 @@ Chrome browser interactions. """ -import logging from urllib.parse import urlsplit from base64 import b64decode from collections import deque from threading import Event from http.server import BaseHTTPRequestHandler +from .logger import Level import pychrome @@ -174,11 +174,11 @@ class SiteLoader: __slots__ = ('requests', 'browser', 'url', 'logger', 'queue', 'notify', 'tab') allowedSchemes = {'http', 'https'} - def __init__ (self, browser, url, logger=logging.getLogger(__name__)): + def __init__ (self, browser, url, logger): self.requests = {} self.browser = pychrome.Browser (url=browser) self.url = url - self.logger = logger + self.logger = logger.bind (context=type (self).__name__, url=url) self.queue = deque () self.notify = Event () @@ -236,6 +236,7 @@ class SiteLoader: def _requestWillBeSent (self, **kwargs): reqId = kwargs['requestId'] req = kwargs['request'] + logger = self.logger.bind (reqId=reqId, reqUrl=req['url']) url = urlsplit (req['url']) if url.scheme not in self.allowedSchemes: @@ -252,14 +253,15 @@ class SiteLoader: resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']} item.setFinished (resp) item.isRedirect = True - self.logger.info ('redirected request {} has url {}'.format (reqId, req['url'])) + logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=req['url']) self._append (item) else: - self.logger.warning ('request {} already exists, overwriting.'.format (reqId)) + logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b') item = Item (self.tab) item.setRequest (kwargs) self.requests[reqId] = item + logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f') def _responseReceived (self, **kwargs): reqId = kwargs['requestId'] @@ -268,12 +270,13 @@ class SiteLoader: return resp = kwargs['response'] + logger = self.logger.bind (reqId=reqId, respUrl=resp['url']) url = urlsplit (resp['url']) if url.scheme in self.allowedSchemes: - self.logger.info ('response {} {}'.format (reqId, resp['url'])) + logger.debug 
('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0') item.setResponse (kwargs) else: - self.logger.warning ('response: ignoring scheme {}'.format (url.scheme)) + logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666') def _loadingFinished (self, **kwargs): """ @@ -286,17 +289,22 @@ class SiteLoader: # we never recorded this request (blacklisted scheme, for example) return req = item.request + logger = self.logger.bind (reqId=reqId, reqUrl=req['url']) resp = item.response - assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url']) + if req['url'] != resp['url']: + logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=resp['url']) url = urlsplit (resp['url']) if url.scheme in self.allowedSchemes: - self.logger.info ('finished {} {}'.format (reqId, req['url'])) + logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02') item.setFinished (kwargs) self._append (item) def _loadingFailed (self, **kwargs): reqId = kwargs['requestId'] - self.logger.warning ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason'))) + self.logger.warning ('loading failed', + uuid='68410f13-6eea-453e-924e-c1af4601748b', + errorText=kwargs['errorText'], + blockedReason=kwargs.get ('blockedReason')) item = self.requests.pop (reqId, None) item.failed = True self._append (item) @@ -304,25 +312,29 @@ class SiteLoader: def _entryAdded (self, **kwargs): """ Log entry added """ entry = kwargs['entry'] - level = {'verbose': logging.DEBUG, 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR}[entry['level']] - self.logger.log (level, 'console: {}: {}'.format (entry['source'], entry['text']), extra={'raw': entry}) + level = {'verbose': Level.DEBUG, 'info': Level.INFO, + 'warning': Level.WARNING, + 'error': Level.ERROR}.get (entry.pop ('level'), Level.INFO) + self.logger (level, 'console', **entry) def _javascriptDialogOpening (self, **kwargs): t = kwargs.get ('type') if t in {'alert', 'confirm', 'prompt'}: - self.logger.info ('javascript opened a dialog: {}, {}, canceling'.format (t, kwargs.get ('message'))) + self.logger.info ('js dialog', + uuid='d6f07ce2-648e-493b-a1df-f353bed27c84', + action='cancel', type=t, message=kwargs.get ('message')) self.tab.Page.handleJavaScriptDialog (accept=False) elif t == 'beforeunload': # we must accept this one, otherwise the page will not unload/close - self.logger.info ('javascript opened a dialog: {}, {}, procceeding'.format (t, kwargs.get ('message'))) + self.logger.info ('js dialog', + uuid='96399b99-9834-4c8f-bd93-cb9fa2225abd', + action='proceed', type=t, message=kwargs.get ('message')) self.tab.Page.handleJavaScriptDialog (accept=True) else: - self.logger.warning ('unknown javascript dialog type {}'.format (t)) + self.logger.warning ('js dialog unknown', uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs) def _targetCrashed (self, **kwargs): - self.logger.error ('browser crashed') + self.logger.error ('browser crashed', uuid='6fe2b3be-ff01-4503-b30c-ad6aeea953ef') # priority message self._appendleft (BrowserCrashed ()) diff --git a/crocoite/cli.py b/crocoite/cli.py index d631f10..8e225d9 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -22,13 +22,14 @@ Command line interface """ -import logging, argparse, json, sys +import argparse, json, sys from . 
import behavior from .controller import RecursiveController, defaultSettings, \ ControllerSettings, DepthLimit, PrefixLimit, StatsHandler from .browser import NullService, ChromeService from .warc import WarcHandler +from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer def parseRecursive (recursive, url): if recursive is None: @@ -42,7 +43,6 @@ def parseRecursive (recursive, url): def main (): parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') - parser.add_argument('--debug', help='Enable debug messages', action='store_true') parser.add_argument('--browser', help='DevTools URL', metavar='URL') parser.add_argument('--recursive', help='Follow links recursively') parser.add_argument('--concurrency', '-j', type=int, default=1) @@ -73,8 +73,7 @@ def main (): recursive=args.recursive, concurrency=args.concurrency) r = result.get () else: - level = logging.DEBUG if args.debug else logging.INFO - logging.basicConfig (level=level) + logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) try: recursionPolicy = parseRecursive (args.recursive, args.url) @@ -86,15 +85,16 @@ def main (): settings = ControllerSettings (maxBodySize=args.maxBodySize, logBuffer=args.logBuffer, idleTimeout=args.idleTimeout, timeout=args.timeout) - with open (args.output, 'wb') as fd: - handler = [StatsHandler (), WarcHandler (fd)] + with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler: + logger.connect (WarcHandlerConsumer (warcHandler)) + handler = [StatsHandler (), warcHandler] b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames)) controller = RecursiveController (args.url, fd, settings=settings, recursionPolicy=recursionPolicy, service=service, - handler=handler, behavior=b) + handler=handler, behavior=b, logger=logger) controller.run () r = handler[0].stats - json.dump (r, sys.stdout) + logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r) return True diff --git a/crocoite/controller.py b/crocoite/controller.py index ef042cc..cbf0037 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -70,7 +70,7 @@ class StatsHandler (EventHandler): elif isinstance (item, BrowserCrashed): self.stats['crashed'] += 1 -import logging, time +import time from . import behavior as cbehavior from .browser import ChromeService, SiteLoader, Item @@ -92,14 +92,15 @@ class SinglePageController: __slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger', 'handler') - def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \ - logger=logging.getLogger(__name__), settings=defaultSettings, handler=[]): + def __init__ (self, url, output, logger, \ + service=ChromeService (), behavior=cbehavior.available, \ + settings=defaultSettings, handler=[]): self.url = url self.output = output self.service = service self.behavior = behavior self.settings = settings - self.logger = logger + self.logger = logger.bind (context=type (self).__name__, url=url) self.handler = handler def processItem (self, item): @@ -113,17 +114,22 @@ class SinglePageController: h.push (item) def run (self): + logger = self.logger def processQueue (): # XXX: this is very ugly code and does not work well. 
figure out a # better way to impose timeouts and still process all items in the # queue queue = l.queue - self.logger.debug ('processing at least {} queue items'.format (len (queue))) + logger.debug ('process queue', + uuid='dafbf76b-a37e-44db-a021-efb5593b81f8', + queuelen=len (queue)) while True: now = time.time () elapsed = now-start maxTimeout = max (min (self.settings.idleTimeout, self.settings.timeout-elapsed), 0) - self.logger.debug ('max timeout is {} with elapsed {}'.format (maxTimeout, elapsed)) + logger.debug ('timeout status', + uuid='49550447-37e3-49ff-9a73-34da1c3e5984', + maxTimeout=maxTimeout, elapsed=elapsed) # skip waiting if there is work to do. processes all items in # queue, regardless of timeouts, i.e. you need to make sure the # queue will actually be empty at some point. @@ -131,7 +137,9 @@ class SinglePageController: if not l.notify.wait (maxTimeout): assert len (queue) == 0, "event must be sent" # timed out - self.logger.debug ('timed out after {}'.format (elapsed)) + logger.debug ('timeout', + uuid='6a7e0083-7c1a-45ba-b1ed-dbc4f26697c6', + elapsed=elapsed) break else: l.notify.clear () @@ -141,14 +149,16 @@ class SinglePageController: for i in range (1000): try: item = queue.popleft () - self.logger.debug ('queue pop: {!r}, len now {}'.format (item, len (queue))) + logger.debug ('queue pop', + uuid='adc96bfa-026d-4092-b732-4a022a1a92ca', + item=item, queuelen=len (queue)) except IndexError: break self.processItem (item) if maxTimeout == 0: break - with self.service as browser, SiteLoader (browser, self.url, logger=self.logger) as l: + with self.service as browser, SiteLoader (browser, self.url, logger=logger) as l: start = time.time () version = l.tab.Browser.getVersion () @@ -162,10 +172,9 @@ class SinglePageController: # not all behavior scripts are allowed for every URL, filter them enabledBehavior = list (filter (lambda x: self.url in x, - map (lambda x: x (l), self.behavior))) + map (lambda x: x (l, logger), self.behavior))) for b in enabledBehavior: - self.logger.debug ('starting onload {}'.format (b)) # I decided against using the queue here to limit memory # usage (screenshot behavior would put all images into # queue before we could process them) @@ -176,7 +185,6 @@ class SinglePageController: processQueue () for b in enabledBehavior: - self.logger.debug ('starting onstop {}'.format (b)) for item in b.onstop (): self.processItem (item) @@ -184,7 +192,6 @@ class SinglePageController: processQueue () for b in enabledBehavior: - self.logger.debug ('starting onfinish {}'.format (b)) for item in b.onfinish (): self.processItem (item) @@ -249,15 +256,16 @@ class RecursiveController (EventHandler): __slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger', 'recursionPolicy', 'handler', 'urls', 'have') - def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \ - logger=logging.getLogger(__name__), settings=defaultSettings, + def __init__ (self, url, output, logger, + service=ChromeService (), behavior=cbehavior.available, \ + settings=defaultSettings, \ recursionPolicy=DepthLimit (0), handler=[]): self.url = url self.output = output self.service = service self.behavior = behavior self.settings = settings - self.logger = logger + self.logger = logger.bind (context=type(self).__name__, url=url) self.recursionPolicy = recursionPolicy self.handler = handler self.handler.append (self) @@ -269,19 +277,22 @@ class RecursiveController (EventHandler): """ for u in urls: try: - c = SinglePageController (u, self.output, 
self.service, - self.behavior, self.logger, self.settings, self.handler) + c = SinglePageController (url=u, output=self.output, service=self.service, + behavior=self.behavior, logger=self.logger, + settings=self.settings, handler=self.handler) c.run () except BrowserCrashed: # this is fine if reported - self.logger.error ('browser crashed for {}'.format (u)) + self.logger.error ('browser crashed', uuid='42582cbe-fb83-47ce-b330-d022a1c3b331') def run (self): self.have = set () self.urls = set ([self.url]) while self.urls: - self.logger.info ('retrieving {} urls'.format (len (self.urls))) + self.logger.info ('recursing', + uuid='5b8498e4-868d-413c-a67e-004516b8452c', + numurls=len (self.urls)) self.have.update (self.urls) fetchurls = self.urls @@ -296,8 +307,7 @@ class RecursiveController (EventHandler): def push (self, item): if isinstance (item, ExtractLinksEvent): - self.logger.debug ('adding extracted links: {}'.format (item.links)) + self.logger.debug ('extracted links', + uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=item.links) self.urls.update (map (removeFragment, item.links)) - else: - self.logger.debug ('{} got unhandled event {!r}'.format (self, item)) diff --git a/crocoite/logger.py b/crocoite/logger.py new file mode 100644 index 0000000..4b43e02 --- /dev/null +++ b/crocoite/logger.py @@ -0,0 +1,137 @@ +# Copyright (c) 2017–2018 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Simple logger inspired by structlog. + +It is usually used like this: Classes are passed a logger instance. They bind +context to their name, so identifying the source of messages is easier. Every +log message carries a unique id (uuid) for automated identification as well as +a short human-readable message (msg) and arbitrary payload. 
+""" + +import sys, json +from datetime import datetime +from functools import partial +from enum import IntEnum + +class Level(IntEnum): + DEBUG = 0 + INFO = 1 + WARNING = 2 + ERROR = 3 + +class Logger: + def __init__ (self, consumer=[], bindings={}): + self.bindings = bindings + self.consumer = consumer + + def __call__ (self, level, *args, **kwargs): + if not isinstance (level, Level): + level = Level[level.upper ()] + kwargs['level'] = level + if args: + if len (args) == 1: + args, = args + kwargs['msg'] = args + # do not overwrite arguments + for k, v in self.bindings.items (): + if k not in kwargs: + kwargs[k] = v + for c in self.consumer: + kwargs = c (**kwargs) + return kwargs + + def __getattr__ (self, k): + """ Bind all method names to level, so Logger.info, Logger.warning, … work """ + return partial (self.__call__, k) + + def bind (self, **kwargs): + d = self.bindings.copy () + d.update (kwargs) + # consumer is not a copy intentionally, so attaching to the parent + # logger will attach to all children as well + return self.__class__ (consumer=self.consumer, bindings=d) + + def unbind (self, **kwargs): + d = self.bindings.copy () + for k in kwargs.keys (): + del d[k] + return self.__class__ (consumer=self.consumer, bindings=d) + + def connect (self, consumer): + self.consumer.append (consumer) + + def disconnect (self, consumer): + self.consumer.remove (consumer) + +class Consumer: + def __call__ (self, level, *args, **kwargs): + raise NotImplementedError () + +class NullConsumer (Consumer): + def __call__ (self, **kwargs): + return kwargs + +class PrintConsumer (Consumer): + """ + Simple printing consumer + """ + def __call__ (self, **kwargs): + sys.stderr.write (str (kwargs)) + sys.stderr.write ('\n') + return kwargs + +class JsonEncoder (json.JSONEncoder): + def default (self, obj): + if isinstance (obj, datetime): + return obj.isoformat () + + # make sure serialization always succeeds + try: + return json.JSONEncoder.default(self, obj) + except TypeError: + return str (obj) + +class JsonPrintConsumer (Consumer): + def __init__ (self, minLevel=Level.INFO): + self.minLevel = minLevel + + def __call__ (self, **kwargs): + if kwargs['level'] >= self.minLevel: + json.dump (kwargs, sys.stdout, cls=JsonEncoder) + sys.stdout.write ('\n') + return kwargs + +class DatetimeConsumer (Consumer): + def __call__ (self, **kwargs): + kwargs['date'] = datetime.utcnow () + return kwargs + +class WarcHandlerConsumer (Consumer): + def __init__ (self, warc, minLevel=Level.INFO): + self.warc = warc + self.minLevel = minLevel + + def __call__ (self, **kwargs): + if kwargs['level'] >= self.minLevel: + self.warc._writeLog (json.dumps (kwargs, cls=JsonEncoder)) + return kwargs + diff --git a/crocoite/task.py b/crocoite/task.py index 6b3c9d1..06dd022 100644 --- a/crocoite/task.py +++ b/crocoite/task.py @@ -22,7 +22,7 @@ Celery distributed tasks """ -import os, logging +import os from urllib.parse import urlsplit from datetime import datetime @@ -113,8 +113,8 @@ class DistributedRecursiveController (RecursiveController): __slots__ = ('concurrency', 'stats') - def __init__ (self, url, service=ChromeService (), behavior=behavior.available, \ - logger=logging.getLogger(__name__), settings=defaultSettings, + def __init__ (self, url, logger, service=ChromeService (), behavior=behavior.available, \ + settings=defaultSettings, recursionPolicy=DepthLimit (0), concurrency=1): super ().__init__ (url, None, service, behavior, logger, settings, recursionPolicy) self.concurrency = concurrency @@ -134,10 +134,11 @@ 
class DistributedRecursiveController (RecursiveController): def controller (self, url, settings, enabledBehaviorNames, recursive, concurrency): """ Recursive controller """ + logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()]) recursionPolicy = parseRecursive (recursive, url) enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available)) settings = ControllerSettings (**settings) - c = DistributedRecursiveController (url, None, behavior=enabledBehavior, + c = DistributedRecursiveController (url, None, logger=logger, behavior=enabledBehavior, settings=settings, recursionPolicy=recursionPolicy, concurrency=concurrency) c.run () return dict (c.stats) diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py index 9d37efb..dfcd71c 100644 --- a/crocoite/test_browser.py +++ b/crocoite/test_browser.py @@ -24,6 +24,7 @@ from http.server import BaseHTTPRequestHandler from pychrome.exceptions import TimeoutException from .browser import Item, SiteLoader, ChromeService, NullService, BrowserCrashed +from .logger import Logger, Consumer class TItem (Item): """ This should be as close to Item as possible """ @@ -102,12 +103,22 @@ def http (): p.terminate () p.join () +class AssertConsumer (Consumer): + def __call__ (self, **kwargs): + assert 'uuid' in kwargs + assert 'msg' in kwargs + assert 'context' in kwargs + +@pytest.fixture +def logger (): + return Logger (consumer=[AssertConsumer ()]) + @pytest.fixture -def loader (http): +def loader (http, logger): def f (path): if path.startswith ('/'): path = 'http://localhost:8000{}'.format (path) - return SiteLoader (browser, path) + return SiteLoader (browser, path, logger) print ('loader setup') with ChromeService () as browser: yield f diff --git a/crocoite/test_logger.py b/crocoite/test_logger.py new file mode 100644 index 0000000..8a34aab --- /dev/null +++ b/crocoite/test_logger.py @@ -0,0 +1,60 @@ +import pytest +from .logger import Logger, Consumer, NullConsumer + +@pytest.fixture +def logger (): + return Logger (consumer=[NullConsumer ()]) + +class QueueConsumer (Consumer): + def __init__ (self): + self.data = [] + + def __call__ (self, **kwargs): + self.data.append (kwargs) + return kwargs + +def test_bind (logger): + # simple bind + logger = logger.bind (foo='bar') + ret = logger.debug () + assert ret['foo'] == 'bar' + + # additional + ret = logger.debug (bar='baz') + assert ret['foo'] == 'bar' + assert ret['bar'] == 'baz' + + # override + ret = logger.debug (foo='baz') + assert ret['foo'] == 'baz' + + # unbind + logger = logger.unbind (foo=None) + ret = logger.debug () + assert 'foo' not in ret + +def test_consumer (logger): + c = QueueConsumer () + logger.connect (c) + ret = logger.debug (foo='bar') + assert len (c.data) == 1 + assert c.data[0] == ret + assert ret['foo'] == 'bar' + c.data = [] + + # inheritance + logger = logger.bind (inherit=1) + ret = logger.debug (foo='bar') + assert len (c.data) == 1 + assert c.data[0] == ret + assert ret['foo'] == 'bar' + assert ret['inherit'] == 1 + c.data = [] + + # removal + logger.disconnect (c) + ret = logger.debug (foo='bar') + assert len (c.data) == 0 + assert ret['foo'] == 'bar' + assert ret['inherit'] == 1 + diff --git a/crocoite/warc.py b/crocoite/warc.py index 47dd9dc..a4a70ac 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -22,8 +22,7 @@ Classes writing data to WARC files """ -import logging -import json +import json, threading from io import BytesIO from warcio.statusandheaders import StatusAndHeaders from urllib.parse import 
urlsplit @@ -38,18 +37,31 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent from .browser import Item class WarcHandler (EventHandler): - __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords') + __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords', 'log', + 'maxLogSize', 'logEncoding') def __init__ (self, fd, - logger=logging.getLogger(__name__), + logger, maxBodySize=defaultSettings.maxBodySize): self.logger = logger self.writer = WARCWriter (fd, gzip=True) self.maxBodySize = maxBodySize + + self.logEncoding = 'utf-8' + self.log = BytesIO () + # max log buffer size (bytes) + self.maxLogSize = 500*1024 + # maps document urls to WARC record ids, required for DomSnapshotEvent # and ScreenshotEvent self.documentRecords = {} + def __enter__ (self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._flushLogEntries () + def _writeRequest (self, item): writer = self.writer @@ -204,6 +216,24 @@ class WarcHandler (EventHandler): warcinfo = writer.create_warcinfo_record (filename=None, info=item.payload) writer.write_record (warcinfo) + def _flushLogEntries (self): + writer = self.writer + self.log.seek (0) + # XXX: we should use the type continuation here + record = writer.create_warc_record (packageUrl ('log'), 'resource', payload=self.log, + warc_headers_dict={'Content-Type': 'text/plain; encoding={}'.format (self.logEncoding)}) + writer.write_record (record) + self.log = BytesIO () + + def _writeLog (self, item): + """ Handle log entries, called by .logger.WarcHandlerConsumer only """ + self.log.write (item.encode (self.logEncoding)) + self.log.write (b'\n') + # instead of locking, check we’re running in the main thread + if self.log.tell () > self.maxLogSize and \ + threading.current_thread () is threading.main_thread (): + self._flushLogEntries () + route = {Script: _writeScript, Item: _writeItem, DomSnapshotEvent: _writeDomSnapshot, -- cgit v1.2.3
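Usage note (not part of the patch): the sketch below shows roughly how the new pieces are meant to be wired together, mirroring what cli.py does in the diff above. The output filename, URL and uuid are placeholders chosen for illustration, not values taken from the patch, and it assumes crocoite (with warcio) is installed at this revision.

from crocoite.logger import (Logger, DatetimeConsumer, JsonPrintConsumer,
        WarcHandlerConsumer)
from crocoite.warc import WarcHandler

# Consumers form a pipeline: each receives the event as keyword arguments and
# returns the (possibly augmented) dict. DatetimeConsumer stamps the event,
# JsonPrintConsumer serializes INFO and above to stdout as JSON.
logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])

# 'example.warc.gz' is a placeholder path. WarcHandler is a context manager
# now; leaving the block flushes the buffered log as a resource record.
with open ('example.warc.gz', 'wb') as fd, WarcHandler (fd, logger) as warcHandler:
    # Tee the same events into the WARC: WarcHandlerConsumer appends them to
    # the handler's log buffer, which is written out as text/plain.
    logger.connect (WarcHandlerConsumer (warcHandler))

    # Components bind their context once; the child logger shares the parent's
    # consumer list, so the WARC consumer attached above sees its events too.
    childLogger = logger.bind (context='Example', url='http://example.com')

    # Level methods (.debug/.info/.warning/.error) are resolved via
    # __getattr__; every event carries a uuid, a short msg and free-form payload.
    childLogger.info ('fetch finished',
            uuid='00000000-0000-0000-0000-000000000000', status=200)

Because bind () copies only the bindings and shares the consumer list, attaching or detaching a consumer on the root logger also affects every child created from it, which is how a single WarcHandlerConsumer ends up capturing events from all components.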