summaryrefslogtreecommitdiff
path: root/crocoite
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite')
-rw-r--r--crocoite/behavior.py34
-rw-r--r--crocoite/browser.py48
-rw-r--r--crocoite/cli.py16
-rw-r--r--crocoite/controller.py56
-rw-r--r--crocoite/logger.py137
-rw-r--r--crocoite/task.py9
-rw-r--r--crocoite/test_browser.py15
-rw-r--r--crocoite/test_logger.py60
-rw-r--r--crocoite/warc.py38
9 files changed, 337 insertions, 76 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index b34d3d9..8c24c59 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -22,7 +22,7 @@
Generic and per-site behavior scripts
"""
-import logging, time
+import time
from urllib.parse import urlsplit
import os.path
import pkg_resources
@@ -36,8 +36,6 @@ from .util import randomString, getFormattedViewportMetrics, removeFragment
from . import html
from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
-logger = logging.getLogger(__name__)
-
class Script:
""" A JavaScript resource """
@@ -61,14 +59,15 @@ class Script:
return s
class Behavior:
- __slots__ = ('loader')
+ __slots__ = ('loader', 'logger')
# unique behavior name
name = None
- def __init__ (self, loader):
+ def __init__ (self, loader, logger):
assert self.name is not None
self.loader = loader
+ self.logger = logger.bind (context=type (self).__name__)
def __contains__ (self, url):
"""
@@ -108,8 +107,8 @@ class JsOnload (Behavior):
scriptPath = None
- def __init__ (self, loader):
- super ().__init__ (loader)
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
self.script = Script (self.scriptPath)
self.scriptHandle = None
@@ -129,8 +128,8 @@ class Scroll (JsOnload):
name = 'scroll'
scriptPath = 'scroll.js'
- def __init__ (self, loader):
- super ().__init__ (loader)
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
stopVarname = '__' + __package__ + '_stop__'
newStopVarname = randomString ()
self.script.data = self.script.data.replace (stopVarname, newStopVarname)
@@ -198,8 +197,8 @@ class DomSnapshot (Behavior):
name = 'domSnapshot'
- def __init__ (self, loader):
- super ().__init__ (loader)
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
self.script = Script ('canvas-snapshot.js')
def onfinish (self):
@@ -216,11 +215,11 @@ class DomSnapshot (Behavior):
if rawUrl in haveUrls:
# ignore duplicate URLs. they are usually caused by
# javascript-injected iframes (advertising) with no(?) src
- logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
+ self.logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
continue
url = urlsplit (rawUrl)
if url.scheme in ('http', 'https'):
- logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
+ self.logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
haveUrls.add (rawUrl)
walker = ChromeTreeWalker (doc)
# remove script, to make the page static and noscript, because at the
@@ -249,10 +248,11 @@ class Screenshot (Behavior):
def onfinish (self):
tab = self.loader.tab
+ tree = tab.Page.getFrameTree ()
try:
- url = removeFragment (tab.Page.getFrameTree ()['frameTree']['frame']['url'])
+ url = removeFragment (tree['frameTree']['frame']['url'])
except KeyError:
- logger.error ('frame has no url')
+ self.logger.error ('frame without url', tree=tree)
url = None
# see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js
@@ -293,8 +293,8 @@ class ExtractLinks (Behavior):
name = 'extractLinks'
- def __init__ (self, loader):
- super ().__init__ (loader)
+ def __init__ (self, loader, logger):
+ super ().__init__ (loader, logger)
self.script = Script ('extract-links.js')
def onfinish (self):
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 6a4bee2..fbd12fd 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -22,12 +22,12 @@
Chrome browser interactions.
"""
-import logging
from urllib.parse import urlsplit
from base64 import b64decode
from collections import deque
from threading import Event
from http.server import BaseHTTPRequestHandler
+from .logger import Level
import pychrome
@@ -174,11 +174,11 @@ class SiteLoader:
__slots__ = ('requests', 'browser', 'url', 'logger', 'queue', 'notify', 'tab')
allowedSchemes = {'http', 'https'}
- def __init__ (self, browser, url, logger=logging.getLogger(__name__)):
+ def __init__ (self, browser, url, logger):
self.requests = {}
self.browser = pychrome.Browser (url=browser)
self.url = url
- self.logger = logger
+ self.logger = logger.bind (context=type (self).__name__, url=url)
self.queue = deque ()
self.notify = Event ()
@@ -236,6 +236,7 @@ class SiteLoader:
def _requestWillBeSent (self, **kwargs):
reqId = kwargs['requestId']
req = kwargs['request']
+ logger = self.logger.bind (reqId=reqId, reqUrl=req['url'])
url = urlsplit (req['url'])
if url.scheme not in self.allowedSchemes:
@@ -252,14 +253,15 @@ class SiteLoader:
resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']}
item.setFinished (resp)
item.isRedirect = True
- self.logger.info ('redirected request {} has url {}'.format (reqId, req['url']))
+ logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=req['url'])
self._append (item)
else:
- self.logger.warning ('request {} already exists, overwriting.'.format (reqId))
+ logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b')
item = Item (self.tab)
item.setRequest (kwargs)
self.requests[reqId] = item
+ logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f')
def _responseReceived (self, **kwargs):
reqId = kwargs['requestId']
@@ -268,12 +270,13 @@ class SiteLoader:
return
resp = kwargs['response']
+ logger = self.logger.bind (reqId=reqId, respUrl=resp['url'])
url = urlsplit (resp['url'])
if url.scheme in self.allowedSchemes:
- self.logger.info ('response {} {}'.format (reqId, resp['url']))
+ logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0')
item.setResponse (kwargs)
else:
- self.logger.warning ('response: ignoring scheme {}'.format (url.scheme))
+ logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666')
def _loadingFinished (self, **kwargs):
"""
@@ -286,17 +289,22 @@ class SiteLoader:
# we never recorded this request (blacklisted scheme, for example)
return
req = item.request
+ logger = self.logger.bind (reqId=reqId, reqUrl=req['url'])
resp = item.response
- assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url'])
+ if req['url'] != resp['url']:
+ logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=resp['url'])
url = urlsplit (resp['url'])
if url.scheme in self.allowedSchemes:
- self.logger.info ('finished {} {}'.format (reqId, req['url']))
+ logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02')
item.setFinished (kwargs)
self._append (item)
def _loadingFailed (self, **kwargs):
reqId = kwargs['requestId']
- self.logger.warning ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
+ self.logger.warning ('loading failed',
+ uuid='68410f13-6eea-453e-924e-c1af4601748b',
+ errorText=kwargs['errorText'],
+ blockedReason=kwargs.get ('blockedReason'))
item = self.requests.pop (reqId, None)
item.failed = True
self._append (item)
@@ -304,25 +312,29 @@ class SiteLoader:
def _entryAdded (self, **kwargs):
""" Log entry added """
entry = kwargs['entry']
- level = {'verbose': logging.DEBUG, 'info': logging.INFO,
- 'warning': logging.WARNING,
- 'error': logging.ERROR}[entry['level']]
- self.logger.log (level, 'console: {}: {}'.format (entry['source'], entry['text']), extra={'raw': entry})
+ level = {'verbose': Level.DEBUG, 'info': Level.INFO,
+ 'warning': Level.WARNING,
+ 'error': Level.ERROR}.get (entry.pop ('level'), Level.INFO)
+ self.logger (level, 'console', **entry)
def _javascriptDialogOpening (self, **kwargs):
t = kwargs.get ('type')
if t in {'alert', 'confirm', 'prompt'}:
- self.logger.info ('javascript opened a dialog: {}, {}, canceling'.format (t, kwargs.get ('message')))
+ self.logger.info ('js dialog',
+ uuid='d6f07ce2-648e-493b-a1df-f353bed27c84',
+ action='cancel', type=t, message=kwargs.get ('message'))
self.tab.Page.handleJavaScriptDialog (accept=False)
elif t == 'beforeunload':
# we must accept this one, otherwise the page will not unload/close
- self.logger.info ('javascript opened a dialog: {}, {}, procceeding'.format (t, kwargs.get ('message')))
+ self.logger.info ('js dialog',
+ uuid='96399b99-9834-4c8f-bd93-cb9fa2225abd',
+ action='proceed', type=t, message=kwargs.get ('message'))
self.tab.Page.handleJavaScriptDialog (accept=True)
else:
- self.logger.warning ('unknown javascript dialog type {}'.format (t))
+ self.logger.warning ('js dialog unknown', uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs)
def _targetCrashed (self, **kwargs):
- self.logger.error ('browser crashed')
+ self.logger.error ('browser crashed', uuid='6fe2b3be-ff01-4503-b30c-ad6aeea953ef')
# priority message
self._appendleft (BrowserCrashed ())
diff --git a/crocoite/cli.py b/crocoite/cli.py
index d631f10..8e225d9 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -22,13 +22,14 @@
Command line interface
"""
-import logging, argparse, json, sys
+import argparse, json, sys
from . import behavior
from .controller import RecursiveController, defaultSettings, \
ControllerSettings, DepthLimit, PrefixLimit, StatsHandler
from .browser import NullService, ChromeService
from .warc import WarcHandler
+from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer
def parseRecursive (recursive, url):
if recursive is None:
@@ -42,7 +43,6 @@ def parseRecursive (recursive, url):
def main ():
parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
- parser.add_argument('--debug', help='Enable debug messages', action='store_true')
parser.add_argument('--browser', help='DevTools URL', metavar='URL')
parser.add_argument('--recursive', help='Follow links recursively')
parser.add_argument('--concurrency', '-j', type=int, default=1)
@@ -73,8 +73,7 @@ def main ():
recursive=args.recursive, concurrency=args.concurrency)
r = result.get ()
else:
- level = logging.DEBUG if args.debug else logging.INFO
- logging.basicConfig (level=level)
+ logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])
try:
recursionPolicy = parseRecursive (args.recursive, args.url)
@@ -86,15 +85,16 @@ def main ():
settings = ControllerSettings (maxBodySize=args.maxBodySize,
logBuffer=args.logBuffer, idleTimeout=args.idleTimeout,
timeout=args.timeout)
- with open (args.output, 'wb') as fd:
- handler = [StatsHandler (), WarcHandler (fd)]
+ with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler:
+ logger.connect (WarcHandlerConsumer (warcHandler))
+ handler = [StatsHandler (), warcHandler]
b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames))
controller = RecursiveController (args.url, fd, settings=settings,
recursionPolicy=recursionPolicy, service=service,
- handler=handler, behavior=b)
+ handler=handler, behavior=b, logger=logger)
controller.run ()
r = handler[0].stats
- json.dump (r, sys.stdout)
+ logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r)
return True
diff --git a/crocoite/controller.py b/crocoite/controller.py
index ef042cc..cbf0037 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -70,7 +70,7 @@ class StatsHandler (EventHandler):
elif isinstance (item, BrowserCrashed):
self.stats['crashed'] += 1
-import logging, time
+import time
from . import behavior as cbehavior
from .browser import ChromeService, SiteLoader, Item
@@ -92,14 +92,15 @@ class SinglePageController:
__slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger', 'handler')
- def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \
- logger=logging.getLogger(__name__), settings=defaultSettings, handler=[]):
+ def __init__ (self, url, output, logger, \
+ service=ChromeService (), behavior=cbehavior.available, \
+ settings=defaultSettings, handler=[]):
self.url = url
self.output = output
self.service = service
self.behavior = behavior
self.settings = settings
- self.logger = logger
+ self.logger = logger.bind (context=type (self).__name__, url=url)
self.handler = handler
def processItem (self, item):
@@ -113,17 +114,22 @@ class SinglePageController:
h.push (item)
def run (self):
+ logger = self.logger
def processQueue ():
# XXX: this is very ugly code and does not work well. figure out a
# better way to impose timeouts and still process all items in the
# queue
queue = l.queue
- self.logger.debug ('processing at least {} queue items'.format (len (queue)))
+ logger.debug ('process queue',
+ uuid='dafbf76b-a37e-44db-a021-efb5593b81f8',
+ queuelen=len (queue))
while True:
now = time.time ()
elapsed = now-start
maxTimeout = max (min (self.settings.idleTimeout, self.settings.timeout-elapsed), 0)
- self.logger.debug ('max timeout is {} with elapsed {}'.format (maxTimeout, elapsed))
+ logger.debug ('timeout status',
+ uuid='49550447-37e3-49ff-9a73-34da1c3e5984',
+ maxTimeout=maxTimeout, elapsed=elapsed)
# skip waiting if there is work to do. processes all items in
# queue, regardless of timeouts, i.e. you need to make sure the
# queue will actually be empty at some point.
@@ -131,7 +137,9 @@ class SinglePageController:
if not l.notify.wait (maxTimeout):
assert len (queue) == 0, "event must be sent"
# timed out
- self.logger.debug ('timed out after {}'.format (elapsed))
+ logger.debug ('timeout',
+ uuid='6a7e0083-7c1a-45ba-b1ed-dbc4f26697c6',
+ elapsed=elapsed)
break
else:
l.notify.clear ()
@@ -141,14 +149,16 @@ class SinglePageController:
for i in range (1000):
try:
item = queue.popleft ()
- self.logger.debug ('queue pop: {!r}, len now {}'.format (item, len (queue)))
+ logger.debug ('queue pop',
+ uuid='adc96bfa-026d-4092-b732-4a022a1a92ca',
+ item=item, queuelen=len (queue))
except IndexError:
break
self.processItem (item)
if maxTimeout == 0:
break
- with self.service as browser, SiteLoader (browser, self.url, logger=self.logger) as l:
+ with self.service as browser, SiteLoader (browser, self.url, logger=logger) as l:
start = time.time ()
version = l.tab.Browser.getVersion ()
@@ -162,10 +172,9 @@ class SinglePageController:
# not all behavior scripts are allowed for every URL, filter them
enabledBehavior = list (filter (lambda x: self.url in x,
- map (lambda x: x (l), self.behavior)))
+ map (lambda x: x (l, logger), self.behavior)))
for b in enabledBehavior:
- self.logger.debug ('starting onload {}'.format (b))
# I decided against using the queue here to limit memory
# usage (screenshot behavior would put all images into
# queue before we could process them)
@@ -176,7 +185,6 @@ class SinglePageController:
processQueue ()
for b in enabledBehavior:
- self.logger.debug ('starting onstop {}'.format (b))
for item in b.onstop ():
self.processItem (item)
@@ -184,7 +192,6 @@ class SinglePageController:
processQueue ()
for b in enabledBehavior:
- self.logger.debug ('starting onfinish {}'.format (b))
for item in b.onfinish ():
self.processItem (item)
@@ -249,15 +256,16 @@ class RecursiveController (EventHandler):
__slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger',
'recursionPolicy', 'handler', 'urls', 'have')
- def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \
- logger=logging.getLogger(__name__), settings=defaultSettings,
+ def __init__ (self, url, output, logger,
+ service=ChromeService (), behavior=cbehavior.available, \
+ settings=defaultSettings, \
recursionPolicy=DepthLimit (0), handler=[]):
self.url = url
self.output = output
self.service = service
self.behavior = behavior
self.settings = settings
- self.logger = logger
+ self.logger = logger.bind (context=type(self).__name__, url=url)
self.recursionPolicy = recursionPolicy
self.handler = handler
self.handler.append (self)
@@ -269,19 +277,22 @@ class RecursiveController (EventHandler):
"""
for u in urls:
try:
- c = SinglePageController (u, self.output, self.service,
- self.behavior, self.logger, self.settings, self.handler)
+ c = SinglePageController (url=u, output=self.output, service=self.service,
+ behavior=self.behavior, logger=self.logger,
+ settings=self.settings, handler=self.handler)
c.run ()
except BrowserCrashed:
# this is fine if reported
- self.logger.error ('browser crashed for {}'.format (u))
+ self.logger.error ('browser crashed', uuid='42582cbe-fb83-47ce-b330-d022a1c3b331')
def run (self):
self.have = set ()
self.urls = set ([self.url])
while self.urls:
- self.logger.info ('retrieving {} urls'.format (len (self.urls)))
+ self.logger.info ('recursing',
+ uuid='5b8498e4-868d-413c-a67e-004516b8452c',
+ numurls=len (self.urls))
self.have.update (self.urls)
fetchurls = self.urls
@@ -296,8 +307,7 @@ class RecursiveController (EventHandler):
def push (self, item):
if isinstance (item, ExtractLinksEvent):
- self.logger.debug ('adding extracted links: {}'.format (item.links))
+ self.logger.debug ('extracted links',
+ uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=item.links)
self.urls.update (map (removeFragment, item.links))
- else:
- self.logger.debug ('{} got unhandled event {!r}'.format (self, item))
diff --git a/crocoite/logger.py b/crocoite/logger.py
new file mode 100644
index 0000000..4b43e02
--- /dev/null
+++ b/crocoite/logger.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2017–2018 crocoite contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+"""
+Simple logger inspired by structlog.
+
+It is usually used like this: Classes are passed a logger instance. They bind
+context to their name, so identifying the source of messages is easier. Every
+log message carries a unique id (uuid) for automated identification as well as
+a short human-readable message (msg) and arbitrary payload.
+"""
+
+import sys, json
+from datetime import datetime
+from functools import partial
+from enum import IntEnum
+
+class Level(IntEnum):
+ DEBUG = 0
+ INFO = 1
+ WARNING = 2
+ ERROR = 3
+
+class Logger:
+ def __init__ (self, consumer=[], bindings={}):
+ self.bindings = bindings
+ self.consumer = consumer
+
+ def __call__ (self, level, *args, **kwargs):
+ if not isinstance (level, Level):
+ level = Level[level.upper ()]
+ kwargs['level'] = level
+ if args:
+ if len (args) == 1:
+ args, = args
+ kwargs['msg'] = args
+ # do not overwrite arguments
+ for k, v in self.bindings.items ():
+ if k not in kwargs:
+ kwargs[k] = v
+ for c in self.consumer:
+ kwargs = c (**kwargs)
+ return kwargs
+
+ def __getattr__ (self, k):
+ """ Bind all method names to level, so Logger.info, Logger.warning, … work """
+ return partial (self.__call__, k)
+
+ def bind (self, **kwargs):
+ d = self.bindings.copy ()
+ d.update (kwargs)
+ # consumer is not a copy intentionally, so attaching to the parent
+ # logger will attach to all children as well
+ return self.__class__ (consumer=self.consumer, bindings=d)
+
+ def unbind (self, **kwargs):
+ d = self.bindings.copy ()
+ for k in kwargs.keys ():
+ del d[k]
+ return self.__class__ (consumer=self.consumer, bindings=d)
+
+ def connect (self, consumer):
+ self.consumer.append (consumer)
+
+ def disconnect (self, consumer):
+ self.consumer.remove (consumer)
+
+class Consumer:
+ def __call__ (self, level, *args, **kwargs):
+ raise NotImplementedError ()
+
+class NullConsumer (Consumer):
+ def __call__ (self, **kwargs):
+ return kwargs
+
+class PrintConsumer (Consumer):
+ """
+ Simple printing consumer
+ """
+ def __call__ (self, **kwargs):
+ sys.stderr.write (str (kwargs))
+ sys.stderr.write ('\n')
+ return kwargs
+
+class JsonEncoder (json.JSONEncoder):
+ def default (self, obj):
+ if isinstance (obj, datetime):
+ return obj.isoformat ()
+
+ # make sure serialization always succeeds
+ try:
+ return json.JSONEncoder.default(self, obj)
+ except TypeError:
+ return str (obj)
+
+class JsonPrintConsumer (Consumer):
+ def __init__ (self, minLevel=Level.INFO):
+ self.minLevel = minLevel
+
+ def __call__ (self, **kwargs):
+ if kwargs['level'] >= self.minLevel:
+ json.dump (kwargs, sys.stdout, cls=JsonEncoder)
+ sys.stdout.write ('\n')
+ return kwargs
+
+class DatetimeConsumer (Consumer):
+ def __call__ (self, **kwargs):
+ kwargs['date'] = datetime.utcnow ()
+ return kwargs
+
+class WarcHandlerConsumer (Consumer):
+ def __init__ (self, warc, minLevel=Level.INFO):
+ self.warc = warc
+ self.minLevel = minLevel
+
+ def __call__ (self, **kwargs):
+ if kwargs['level'] >= self.minLevel:
+ self.warc._writeLog (json.dumps (kwargs, cls=JsonEncoder))
+ return kwargs
+
diff --git a/crocoite/task.py b/crocoite/task.py
index 6b3c9d1..06dd022 100644
--- a/crocoite/task.py
+++ b/crocoite/task.py
@@ -22,7 +22,7 @@
Celery distributed tasks
"""
-import os, logging
+import os
from urllib.parse import urlsplit
from datetime import datetime
@@ -113,8 +113,8 @@ class DistributedRecursiveController (RecursiveController):
__slots__ = ('concurrency', 'stats')
- def __init__ (self, url, service=ChromeService (), behavior=behavior.available, \
- logger=logging.getLogger(__name__), settings=defaultSettings,
+ def __init__ (self, url, logger, service=ChromeService (), behavior=behavior.available, \
+ settings=defaultSettings,
recursionPolicy=DepthLimit (0), concurrency=1):
super ().__init__ (url, None, service, behavior, logger, settings, recursionPolicy)
self.concurrency = concurrency
@@ -134,10 +134,11 @@ class DistributedRecursiveController (RecursiveController):
def controller (self, url, settings, enabledBehaviorNames, recursive, concurrency):
""" Recursive controller """
+ logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])
recursionPolicy = parseRecursive (recursive, url)
enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available))
settings = ControllerSettings (**settings)
- c = DistributedRecursiveController (url, None, behavior=enabledBehavior,
+ c = DistributedRecursiveController (url, None, logger=logger, behavior=enabledBehavior,
settings=settings, recursionPolicy=recursionPolicy, concurrency=concurrency)
c.run ()
return dict (c.stats)
diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py
index 9d37efb..dfcd71c 100644
--- a/crocoite/test_browser.py
+++ b/crocoite/test_browser.py
@@ -24,6 +24,7 @@ from http.server import BaseHTTPRequestHandler
from pychrome.exceptions import TimeoutException
from .browser import Item, SiteLoader, ChromeService, NullService, BrowserCrashed
+from .logger import Logger, Consumer
class TItem (Item):
""" This should be as close to Item as possible """
@@ -102,12 +103,22 @@ def http ():
p.terminate ()
p.join ()
+class AssertConsumer (Consumer):
+ def __call__ (self, **kwargs):
+ assert 'uuid' in kwargs
+ assert 'msg' in kwargs
+ assert 'context' in kwargs
+
+@pytest.fixture
+def logger ():
+ return Logger (consumer=[AssertConsumer ()])
+
@pytest.fixture
-def loader (http):
+def loader (http, logger):
def f (path):
if path.startswith ('/'):
path = 'http://localhost:8000{}'.format (path)
- return SiteLoader (browser, path)
+ return SiteLoader (browser, path, logger)
print ('loader setup')
with ChromeService () as browser:
yield f
diff --git a/crocoite/test_logger.py b/crocoite/test_logger.py
new file mode 100644
index 0000000..8a34aab
--- /dev/null
+++ b/crocoite/test_logger.py
@@ -0,0 +1,60 @@
+import pytest
+from .logger import Logger, Consumer, NullConsumer
+
+@pytest.fixture
+def logger ():
+ return Logger (consumer=[NullConsumer ()])
+
+class QueueConsumer (Consumer):
+ def __init__ (self):
+ self.data = []
+
+ def __call__ (self, **kwargs):
+ self.data.append (kwargs)
+ return kwargs
+
+def test_bind (logger):
+ # simple bind
+ logger = logger.bind (foo='bar')
+ ret = logger.debug ()
+ assert ret['foo'] == 'bar'
+
+ # additional
+ ret = logger.debug (bar='baz')
+ assert ret['foo'] == 'bar'
+ assert ret['bar'] == 'baz'
+
+ # override
+ ret = logger.debug (foo='baz')
+ assert ret['foo'] == 'baz'
+
+ # unbind
+ logger = logger.unbind (foo=None)
+ ret = logger.debug ()
+ assert 'foo' not in ret
+
+def test_consumer (logger):
+ c = QueueConsumer ()
+ logger.connect (c)
+ ret = logger.debug (foo='bar')
+ assert len (c.data) == 1
+ assert c.data[0] == ret
+ assert ret['foo'] == 'bar'
+ c.data = []
+
+ # inheritance
+ logger = logger.bind (inherit=1)
+ ret = logger.debug (foo='bar')
+ assert len (c.data) == 1
+ assert c.data[0] == ret
+ assert ret['foo'] == 'bar'
+ assert ret['inherit'] == 1
+ c.data = []
+
+ # removal
+ logger.disconnect (c)
+ ret = logger.debug (foo='bar')
+ assert len (c.data) == 0
+ assert ret['foo'] == 'bar'
+ assert ret['inherit'] == 1
+
diff --git a/crocoite/warc.py b/crocoite/warc.py
index 47dd9dc..a4a70ac 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -22,8 +22,7 @@
Classes writing data to WARC files
"""
-import logging
-import json
+import json, threading
from io import BytesIO
from warcio.statusandheaders import StatusAndHeaders
from urllib.parse import urlsplit
@@ -38,18 +37,31 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
from .browser import Item
class WarcHandler (EventHandler):
- __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords')
+ __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords', 'log',
+ 'maxLogSize', 'logEncoding')
def __init__ (self, fd,
- logger=logging.getLogger(__name__),
+ logger,
maxBodySize=defaultSettings.maxBodySize):
self.logger = logger
self.writer = WARCWriter (fd, gzip=True)
self.maxBodySize = maxBodySize
+
+ self.logEncoding = 'utf-8'
+ self.log = BytesIO ()
+ # max log buffer size (bytes)
+ self.maxLogSize = 500*1024
+
# maps document urls to WARC record ids, required for DomSnapshotEvent
# and ScreenshotEvent
self.documentRecords = {}
+ def __enter__ (self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self._flushLogEntries ()
+
def _writeRequest (self, item):
writer = self.writer
@@ -204,6 +216,24 @@ class WarcHandler (EventHandler):
warcinfo = writer.create_warcinfo_record (filename=None, info=item.payload)
writer.write_record (warcinfo)
+ def _flushLogEntries (self):
+ writer = self.writer
+ self.log.seek (0)
+ # XXX: we should use the type continuation here
+ record = writer.create_warc_record (packageUrl ('log'), 'resource', payload=self.log,
+ warc_headers_dict={'Content-Type': 'text/plain; encoding={}'.format (self.logEncoding)})
+ writer.write_record (record)
+ self.log = BytesIO ()
+
+ def _writeLog (self, item):
+ """ Handle log entries, called by .logger.WarcHandlerConsumer only """
+ self.log.write (item.encode (self.logEncoding))
+ self.log.write (b'\n')
+ # instead of locking, check we’re running in the main thread
+ if self.log.tell () > self.maxLogSize and \
+ threading.current_thread () is threading.main_thread ():
+ self._flushLogEntries ()
+
route = {Script: _writeScript,
Item: _writeItem,
DomSnapshotEvent: _writeDomSnapshot,