diff options
Diffstat (limited to 'crocoite')
| -rw-r--r-- | crocoite/behavior.py | 34 | ||||
| -rw-r--r-- | crocoite/browser.py | 48 | ||||
| -rw-r--r-- | crocoite/cli.py | 16 | ||||
| -rw-r--r-- | crocoite/controller.py | 56 | ||||
| -rw-r--r-- | crocoite/logger.py | 137 | ||||
| -rw-r--r-- | crocoite/task.py | 9 | ||||
| -rw-r--r-- | crocoite/test_browser.py | 15 | ||||
| -rw-r--r-- | crocoite/test_logger.py | 60 | ||||
| -rw-r--r-- | crocoite/warc.py | 38 | 
9 files changed, 337 insertions, 76 deletions
| diff --git a/crocoite/behavior.py b/crocoite/behavior.py index b34d3d9..8c24c59 100644 --- a/crocoite/behavior.py +++ b/crocoite/behavior.py @@ -22,7 +22,7 @@  Generic and per-site behavior scripts  """ -import logging, time +import time  from urllib.parse import urlsplit  import os.path  import pkg_resources @@ -36,8 +36,6 @@ from .util import randomString, getFormattedViewportMetrics, removeFragment  from . import html  from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker -logger = logging.getLogger(__name__) -  class Script:      """ A JavaScript resource """ @@ -61,14 +59,15 @@ class Script:          return s  class Behavior: -    __slots__ = ('loader') +    __slots__ = ('loader', 'logger')      # unique behavior name      name = None -    def __init__ (self, loader): +    def __init__ (self, loader, logger):          assert self.name is not None          self.loader = loader +        self.logger = logger.bind (context=type (self).__name__)      def __contains__ (self, url):          """ @@ -108,8 +107,8 @@ class JsOnload (Behavior):      scriptPath = None -    def __init__ (self, loader): -        super ().__init__ (loader) +    def __init__ (self, loader, logger): +        super ().__init__ (loader, logger)          self.script = Script (self.scriptPath)          self.scriptHandle = None @@ -129,8 +128,8 @@ class Scroll (JsOnload):      name = 'scroll'      scriptPath = 'scroll.js' -    def __init__ (self, loader): -        super ().__init__ (loader) +    def __init__ (self, loader, logger): +        super ().__init__ (loader, logger)          stopVarname = '__' + __package__ + '_stop__'          newStopVarname = randomString ()          self.script.data = self.script.data.replace (stopVarname, newStopVarname) @@ -198,8 +197,8 @@ class DomSnapshot (Behavior):      name = 'domSnapshot' -    def __init__ (self, loader): -        super ().__init__ (loader) +    def __init__ (self, loader, logger): +        super ().__init__ (loader, logger)          self.script = Script ('canvas-snapshot.js')      def onfinish (self): @@ -216,11 +215,11 @@ class DomSnapshot (Behavior):              if rawUrl in haveUrls:                  # ignore duplicate URLs. they are usually caused by                  # javascript-injected iframes (advertising) with no(?) src -                logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl)) +                self.logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))                  continue              url = urlsplit (rawUrl)              if url.scheme in ('http', 'https'): -                logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL'])) +                self.logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))                  haveUrls.add (rawUrl)                  walker = ChromeTreeWalker (doc)                  # remove script, to make the page static and noscript, because at the @@ -249,10 +248,11 @@ class Screenshot (Behavior):      def onfinish (self):          tab = self.loader.tab +        tree = tab.Page.getFrameTree ()          try: -            url = removeFragment (tab.Page.getFrameTree ()['frameTree']['frame']['url']) +            url = removeFragment (tree['frameTree']['frame']['url'])          except KeyError: -            logger.error ('frame has no url') +            self.logger.error ('frame without url', tree=tree)              url = None          # see https://github.com/GoogleChrome/puppeteer/blob/230be28b067b521f0577206899db01f0ca7fc0d2/examples/screenshots-longpage.js @@ -293,8 +293,8 @@ class ExtractLinks (Behavior):      name = 'extractLinks' -    def __init__ (self, loader): -        super ().__init__ (loader) +    def __init__ (self, loader, logger): +        super ().__init__ (loader, logger)          self.script = Script ('extract-links.js')      def onfinish (self): diff --git a/crocoite/browser.py b/crocoite/browser.py index 6a4bee2..fbd12fd 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -22,12 +22,12 @@  Chrome browser interactions.  """ -import logging  from urllib.parse import urlsplit  from base64 import b64decode  from collections import deque  from threading import Event  from http.server import BaseHTTPRequestHandler +from .logger import Level  import pychrome @@ -174,11 +174,11 @@ class SiteLoader:      __slots__ = ('requests', 'browser', 'url', 'logger', 'queue', 'notify', 'tab')      allowedSchemes = {'http', 'https'} -    def __init__ (self, browser, url, logger=logging.getLogger(__name__)): +    def __init__ (self, browser, url, logger):          self.requests = {}          self.browser = pychrome.Browser (url=browser)          self.url = url -        self.logger = logger +        self.logger = logger.bind (context=type (self).__name__, url=url)          self.queue = deque ()          self.notify = Event () @@ -236,6 +236,7 @@ class SiteLoader:      def _requestWillBeSent (self, **kwargs):          reqId = kwargs['requestId']          req = kwargs['request'] +        logger = self.logger.bind (reqId=reqId, reqUrl=req['url'])          url = urlsplit (req['url'])          if url.scheme not in self.allowedSchemes: @@ -252,14 +253,15 @@ class SiteLoader:                  resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']}                  item.setFinished (resp)                  item.isRedirect = True -                self.logger.info ('redirected request {} has url {}'.format (reqId, req['url'])) +                logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=req['url'])                  self._append (item)              else: -                self.logger.warning ('request {} already exists, overwriting.'.format (reqId)) +                logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b')          item = Item (self.tab)          item.setRequest (kwargs)          self.requests[reqId] = item +        logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f')      def _responseReceived (self, **kwargs):          reqId = kwargs['requestId'] @@ -268,12 +270,13 @@ class SiteLoader:              return          resp = kwargs['response'] +        logger = self.logger.bind (reqId=reqId, respUrl=resp['url'])          url = urlsplit (resp['url'])          if url.scheme in self.allowedSchemes: -            self.logger.info ('response {} {}'.format (reqId, resp['url'])) +            logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0')              item.setResponse (kwargs)          else: -            self.logger.warning ('response: ignoring scheme {}'.format (url.scheme)) +            logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666')      def _loadingFinished (self, **kwargs):          """ @@ -286,17 +289,22 @@ class SiteLoader:              # we never recorded this request (blacklisted scheme, for example)              return          req = item.request +        logger = self.logger.bind (reqId=reqId, reqUrl=req['url'])          resp = item.response -        assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url']) +        if req['url'] != resp['url']: +            logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=resp['url'])          url = urlsplit (resp['url'])          if url.scheme in self.allowedSchemes: -            self.logger.info ('finished {} {}'.format (reqId, req['url'])) +            logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02')              item.setFinished (kwargs)              self._append (item)      def _loadingFailed (self, **kwargs):          reqId = kwargs['requestId'] -        self.logger.warning ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason'))) +        self.logger.warning ('loading failed', +                uuid='68410f13-6eea-453e-924e-c1af4601748b', +                errorText=kwargs['errorText'], +                blockedReason=kwargs.get ('blockedReason'))          item = self.requests.pop (reqId, None)          item.failed = True          self._append (item) @@ -304,25 +312,29 @@ class SiteLoader:      def _entryAdded (self, **kwargs):          """ Log entry added """          entry = kwargs['entry'] -        level = {'verbose': logging.DEBUG, 'info': logging.INFO, -                'warning': logging.WARNING, -                'error': logging.ERROR}[entry['level']] -        self.logger.log (level, 'console: {}: {}'.format (entry['source'], entry['text']), extra={'raw': entry}) +        level = {'verbose': Level.DEBUG, 'info': Level.INFO, +                'warning': Level.WARNING, +                'error': Level.ERROR}.get (entry.pop ('level'), Level.INFO) +        self.logger (level, 'console', **entry)      def _javascriptDialogOpening (self, **kwargs):          t = kwargs.get ('type')          if t in {'alert', 'confirm', 'prompt'}: -            self.logger.info ('javascript opened a dialog: {}, {}, canceling'.format (t, kwargs.get ('message'))) +            self.logger.info ('js dialog', +                    uuid='d6f07ce2-648e-493b-a1df-f353bed27c84', +                    action='cancel', type=t, message=kwargs.get ('message'))              self.tab.Page.handleJavaScriptDialog (accept=False)          elif t == 'beforeunload':              # we must accept this one, otherwise the page will not unload/close -            self.logger.info ('javascript opened a dialog: {}, {}, procceeding'.format (t, kwargs.get ('message'))) +            self.logger.info ('js dialog', +                    uuid='96399b99-9834-4c8f-bd93-cb9fa2225abd', +                    action='proceed', type=t, message=kwargs.get ('message'))              self.tab.Page.handleJavaScriptDialog (accept=True)          else: -            self.logger.warning ('unknown javascript dialog type {}'.format (t)) +            self.logger.warning ('js dialog unknown', uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs)      def _targetCrashed (self, **kwargs): -        self.logger.error ('browser crashed') +        self.logger.error ('browser crashed', uuid='6fe2b3be-ff01-4503-b30c-ad6aeea953ef')          # priority message          self._appendleft (BrowserCrashed ()) diff --git a/crocoite/cli.py b/crocoite/cli.py index d631f10..8e225d9 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -22,13 +22,14 @@  Command line interface  """ -import logging, argparse, json, sys +import argparse, json, sys  from . import behavior  from .controller import RecursiveController, defaultSettings, \          ControllerSettings, DepthLimit, PrefixLimit, StatsHandler  from .browser import NullService, ChromeService  from .warc import WarcHandler +from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer  def parseRecursive (recursive, url):      if recursive is None: @@ -42,7 +43,6 @@ def parseRecursive (recursive, url):  def main ():      parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') -    parser.add_argument('--debug', help='Enable debug messages', action='store_true')      parser.add_argument('--browser', help='DevTools URL', metavar='URL')      parser.add_argument('--recursive', help='Follow links recursively')      parser.add_argument('--concurrency', '-j', type=int, default=1) @@ -73,8 +73,7 @@ def main ():                  recursive=args.recursive, concurrency=args.concurrency)          r = result.get ()      else: -        level = logging.DEBUG if args.debug else logging.INFO -        logging.basicConfig (level=level) +        logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])          try:              recursionPolicy = parseRecursive (args.recursive, args.url) @@ -86,15 +85,16 @@ def main ():          settings = ControllerSettings (maxBodySize=args.maxBodySize,                  logBuffer=args.logBuffer, idleTimeout=args.idleTimeout,                  timeout=args.timeout) -        with open (args.output, 'wb') as fd: -            handler = [StatsHandler (), WarcHandler (fd)] +        with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler: +            logger.connect (WarcHandlerConsumer (warcHandler)) +            handler = [StatsHandler (), warcHandler]              b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames))              controller = RecursiveController (args.url, fd, settings=settings,                      recursionPolicy=recursionPolicy, service=service, -                    handler=handler, behavior=b) +                    handler=handler, behavior=b, logger=logger)              controller.run ()              r = handler[0].stats -    json.dump (r, sys.stdout) +            logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r)      return True diff --git a/crocoite/controller.py b/crocoite/controller.py index ef042cc..cbf0037 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -70,7 +70,7 @@ class StatsHandler (EventHandler):          elif isinstance (item, BrowserCrashed):              self.stats['crashed'] += 1 -import logging, time +import time  from . import behavior as cbehavior  from .browser import ChromeService, SiteLoader, Item @@ -92,14 +92,15 @@ class SinglePageController:      __slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger', 'handler') -    def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \ -            logger=logging.getLogger(__name__), settings=defaultSettings, handler=[]): +    def __init__ (self, url, output, logger, \ +            service=ChromeService (), behavior=cbehavior.available, \ +            settings=defaultSettings, handler=[]):          self.url = url          self.output = output          self.service = service          self.behavior = behavior          self.settings = settings -        self.logger = logger +        self.logger = logger.bind (context=type (self).__name__, url=url)          self.handler = handler      def processItem (self, item): @@ -113,17 +114,22 @@ class SinglePageController:              h.push (item)      def run (self): +        logger = self.logger          def processQueue ():              # XXX: this is very ugly code and does not work well. figure out a              # better way to impose timeouts and still process all items in the              # queue              queue = l.queue -            self.logger.debug ('processing at least {} queue items'.format (len (queue))) +            logger.debug ('process queue', +                    uuid='dafbf76b-a37e-44db-a021-efb5593b81f8', +                    queuelen=len (queue))              while True:                  now = time.time ()                  elapsed = now-start                  maxTimeout = max (min (self.settings.idleTimeout, self.settings.timeout-elapsed), 0) -                self.logger.debug ('max timeout is {} with elapsed {}'.format (maxTimeout, elapsed)) +                logger.debug ('timeout status', +                        uuid='49550447-37e3-49ff-9a73-34da1c3e5984', +                        maxTimeout=maxTimeout, elapsed=elapsed)                  # skip waiting if there is work to do. processes all items in                  # queue, regardless of timeouts, i.e. you need to make sure the                  # queue will actually be empty at some point. @@ -131,7 +137,9 @@ class SinglePageController:                      if not l.notify.wait (maxTimeout):                          assert len (queue) == 0, "event must be sent"                          # timed out -                        self.logger.debug ('timed out after {}'.format (elapsed)) +                        logger.debug ('timeout', +                                uuid='6a7e0083-7c1a-45ba-b1ed-dbc4f26697c6', +                                elapsed=elapsed)                          break                      else:                          l.notify.clear () @@ -141,14 +149,16 @@ class SinglePageController:                  for i in range (1000):                      try:                          item = queue.popleft () -                        self.logger.debug ('queue pop: {!r}, len now {}'.format (item, len (queue))) +                        logger.debug ('queue pop', +                                uuid='adc96bfa-026d-4092-b732-4a022a1a92ca', +                                item=item, queuelen=len (queue))                      except IndexError:                          break                      self.processItem (item)                  if maxTimeout == 0:                      break -        with self.service as browser, SiteLoader (browser, self.url, logger=self.logger) as l: +        with self.service as browser, SiteLoader (browser, self.url, logger=logger) as l:              start = time.time ()              version = l.tab.Browser.getVersion () @@ -162,10 +172,9 @@ class SinglePageController:              # not all behavior scripts are allowed for every URL, filter them              enabledBehavior = list (filter (lambda x: self.url in x, -                    map (lambda x: x (l), self.behavior))) +                    map (lambda x: x (l, logger), self.behavior)))              for b in enabledBehavior: -                self.logger.debug ('starting onload {}'.format (b))                  # I decided against using the queue here to limit memory                  # usage (screenshot behavior would put all images into                  # queue before we could process them) @@ -176,7 +185,6 @@ class SinglePageController:              processQueue ()              for b in enabledBehavior: -                self.logger.debug ('starting onstop {}'.format (b))                  for item in b.onstop ():                      self.processItem (item) @@ -184,7 +192,6 @@ class SinglePageController:              processQueue ()              for b in enabledBehavior: -                self.logger.debug ('starting onfinish {}'.format (b))                  for item in b.onfinish ():                      self.processItem (item) @@ -249,15 +256,16 @@ class RecursiveController (EventHandler):      __slots__ = ('url', 'output', 'service', 'behavior', 'settings', 'logger',              'recursionPolicy', 'handler', 'urls', 'have') -    def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \ -            logger=logging.getLogger(__name__), settings=defaultSettings, +    def __init__ (self, url, output, logger, +            service=ChromeService (), behavior=cbehavior.available, \ +            settings=defaultSettings, \              recursionPolicy=DepthLimit (0), handler=[]):          self.url = url          self.output = output          self.service = service          self.behavior = behavior          self.settings = settings -        self.logger = logger +        self.logger = logger.bind (context=type(self).__name__, url=url)          self.recursionPolicy = recursionPolicy          self.handler = handler          self.handler.append (self) @@ -269,19 +277,22 @@ class RecursiveController (EventHandler):          """          for u in urls:              try: -                c = SinglePageController (u, self.output, self.service, -                        self.behavior, self.logger, self.settings, self.handler) +                c = SinglePageController (url=u, output=self.output, service=self.service, +                        behavior=self.behavior, logger=self.logger, +                        settings=self.settings, handler=self.handler)                  c.run ()              except BrowserCrashed:                  # this is fine if reported -                self.logger.error ('browser crashed for {}'.format (u)) +                self.logger.error ('browser crashed', uuid='42582cbe-fb83-47ce-b330-d022a1c3b331')      def run (self):          self.have = set ()          self.urls = set ([self.url])          while self.urls: -            self.logger.info ('retrieving {} urls'.format (len (self.urls))) +            self.logger.info ('recursing', +                    uuid='5b8498e4-868d-413c-a67e-004516b8452c', +                    numurls=len (self.urls))              self.have.update (self.urls)              fetchurls = self.urls @@ -296,8 +307,7 @@ class RecursiveController (EventHandler):      def push (self, item):          if isinstance (item, ExtractLinksEvent): -            self.logger.debug ('adding extracted links: {}'.format (item.links)) +            self.logger.debug ('extracted links', +                    uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=item.links)              self.urls.update (map (removeFragment, item.links)) -        else: -            self.logger.debug ('{} got unhandled event {!r}'.format (self, item)) diff --git a/crocoite/logger.py b/crocoite/logger.py new file mode 100644 index 0000000..4b43e02 --- /dev/null +++ b/crocoite/logger.py @@ -0,0 +1,137 @@ +# Copyright (c) 2017–2018 crocoite contributors +#  +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +#  +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +#  +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Simple logger inspired by structlog. + +It is usually used like this: Classes are passed a logger instance. They bind +context to their name, so identifying the source of messages is easier. Every +log message carries a unique id (uuid) for automated identification as well as +a short human-readable message (msg) and arbitrary payload. +""" + +import sys, json +from datetime import datetime +from functools import partial +from enum import IntEnum + +class Level(IntEnum): +    DEBUG = 0 +    INFO = 1 +    WARNING = 2 +    ERROR = 3 + +class Logger: +    def __init__ (self, consumer=[], bindings={}): +        self.bindings = bindings +        self.consumer = consumer + +    def __call__ (self, level, *args, **kwargs): +        if not isinstance (level, Level): +            level = Level[level.upper ()] +        kwargs['level'] = level +        if args: +            if len (args) == 1: +                args, = args +            kwargs['msg'] = args +        # do not overwrite arguments +        for k, v in self.bindings.items (): +            if k not in kwargs: +                kwargs[k] = v +        for c in self.consumer: +            kwargs = c (**kwargs) +        return kwargs + +    def __getattr__ (self, k): +        """ Bind all method names to level, so Logger.info, Logger.warning, … work """ +        return partial (self.__call__, k) + +    def bind (self, **kwargs): +        d = self.bindings.copy () +        d.update (kwargs) +        # consumer is not a copy intentionally, so attaching to the parent +        # logger will attach to all children as well +        return self.__class__ (consumer=self.consumer, bindings=d) + +    def unbind (self, **kwargs): +        d = self.bindings.copy () +        for k in kwargs.keys (): +            del d[k] +        return self.__class__ (consumer=self.consumer, bindings=d) + +    def connect (self, consumer): +        self.consumer.append (consumer) + +    def disconnect (self, consumer): +        self.consumer.remove (consumer) + +class Consumer: +    def __call__ (self, level, *args, **kwargs): +        raise NotImplementedError () + +class NullConsumer (Consumer): +    def __call__ (self, **kwargs): +        return kwargs + +class PrintConsumer (Consumer): +    """ +    Simple printing consumer +    """ +    def __call__ (self, **kwargs): +        sys.stderr.write (str (kwargs)) +        sys.stderr.write ('\n') +        return kwargs + +class JsonEncoder (json.JSONEncoder): +    def default (self, obj): +        if isinstance (obj, datetime): +            return obj.isoformat () + +        # make sure serialization always succeeds +        try: +            return json.JSONEncoder.default(self, obj) +        except TypeError: +            return str (obj) + +class JsonPrintConsumer (Consumer): +    def __init__ (self, minLevel=Level.INFO): +        self.minLevel = minLevel + +    def __call__ (self, **kwargs): +        if kwargs['level'] >= self.minLevel: +            json.dump (kwargs, sys.stdout, cls=JsonEncoder) +            sys.stdout.write ('\n') +        return kwargs + +class DatetimeConsumer (Consumer): +    def __call__ (self, **kwargs): +        kwargs['date'] = datetime.utcnow () +        return kwargs + +class WarcHandlerConsumer (Consumer): +    def __init__ (self, warc, minLevel=Level.INFO): +        self.warc = warc +        self.minLevel = minLevel + +    def __call__ (self, **kwargs): +        if kwargs['level'] >= self.minLevel: +            self.warc._writeLog (json.dumps (kwargs, cls=JsonEncoder)) +        return kwargs + diff --git a/crocoite/task.py b/crocoite/task.py index 6b3c9d1..06dd022 100644 --- a/crocoite/task.py +++ b/crocoite/task.py @@ -22,7 +22,7 @@  Celery distributed tasks  """ -import os, logging +import os  from urllib.parse import urlsplit  from datetime import datetime @@ -113,8 +113,8 @@ class DistributedRecursiveController (RecursiveController):      __slots__ = ('concurrency', 'stats') -    def __init__ (self, url, service=ChromeService (), behavior=behavior.available, \ -            logger=logging.getLogger(__name__), settings=defaultSettings, +    def __init__ (self, url, logger, service=ChromeService (), behavior=behavior.available, \ +            settings=defaultSettings,              recursionPolicy=DepthLimit (0), concurrency=1):          super ().__init__ (url, None, service, behavior, logger, settings, recursionPolicy)          self.concurrency = concurrency @@ -134,10 +134,11 @@ class DistributedRecursiveController (RecursiveController):  def controller (self, url, settings, enabledBehaviorNames, recursive, concurrency):      """ Recursive controller """ +    logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])      recursionPolicy = parseRecursive (recursive, url)      enabledBehavior = list (filter (lambda x: x.name in enabledBehaviorNames, behavior.available))      settings = ControllerSettings (**settings) -    c = DistributedRecursiveController (url, None, behavior=enabledBehavior, +    c = DistributedRecursiveController (url, None, logger=logger, behavior=enabledBehavior,              settings=settings, recursionPolicy=recursionPolicy, concurrency=concurrency)      c.run ()      return dict (c.stats) diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py index 9d37efb..dfcd71c 100644 --- a/crocoite/test_browser.py +++ b/crocoite/test_browser.py @@ -24,6 +24,7 @@ from http.server import BaseHTTPRequestHandler  from pychrome.exceptions import TimeoutException  from .browser import Item, SiteLoader, ChromeService, NullService, BrowserCrashed +from .logger import Logger, Consumer  class TItem (Item):      """ This should be as close to Item as possible """ @@ -102,12 +103,22 @@ def http ():      p.terminate ()      p.join () +class AssertConsumer (Consumer): +    def __call__ (self, **kwargs): +        assert 'uuid' in kwargs +        assert 'msg' in kwargs +        assert 'context' in kwargs + +@pytest.fixture +def logger (): +    return Logger (consumer=[AssertConsumer ()]) +  @pytest.fixture -def loader (http): +def loader (http, logger):      def f (path):          if path.startswith ('/'):              path = 'http://localhost:8000{}'.format (path) -        return SiteLoader (browser, path) +        return SiteLoader (browser, path, logger)      print ('loader setup')      with ChromeService () as browser:          yield f diff --git a/crocoite/test_logger.py b/crocoite/test_logger.py new file mode 100644 index 0000000..8a34aab --- /dev/null +++ b/crocoite/test_logger.py @@ -0,0 +1,60 @@ +import pytest +from .logger import Logger, Consumer, NullConsumer + +@pytest.fixture +def logger (): +    return Logger (consumer=[NullConsumer ()]) + +class QueueConsumer (Consumer): +    def __init__ (self): +        self.data = [] + +    def __call__ (self, **kwargs): +        self.data.append (kwargs) +        return kwargs + +def test_bind (logger): +    # simple bind +    logger = logger.bind (foo='bar') +    ret = logger.debug () +    assert ret['foo'] == 'bar' + +    # additional +    ret = logger.debug (bar='baz') +    assert ret['foo'] == 'bar' +    assert ret['bar'] == 'baz' +     +    # override +    ret = logger.debug (foo='baz') +    assert ret['foo'] == 'baz' + +    # unbind +    logger = logger.unbind (foo=None) +    ret = logger.debug () +    assert 'foo' not in ret + +def test_consumer (logger): +    c = QueueConsumer () +    logger.connect (c) +    ret = logger.debug (foo='bar') +    assert len (c.data) == 1 +    assert c.data[0] == ret +    assert ret['foo'] == 'bar' +    c.data = [] + +    # inheritance +    logger = logger.bind (inherit=1) +    ret = logger.debug (foo='bar') +    assert len (c.data) == 1 +    assert c.data[0] == ret +    assert ret['foo'] == 'bar' +    assert ret['inherit'] == 1 +    c.data = [] + +    # removal +    logger.disconnect (c) +    ret = logger.debug (foo='bar') +    assert len (c.data) == 0 +    assert ret['foo'] == 'bar' +    assert ret['inherit'] == 1 + diff --git a/crocoite/warc.py b/crocoite/warc.py index 47dd9dc..a4a70ac 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -22,8 +22,7 @@  Classes writing data to WARC files  """ -import logging -import json +import json, threading  from io import BytesIO  from warcio.statusandheaders import StatusAndHeaders  from urllib.parse import urlsplit @@ -38,18 +37,31 @@ from .behavior import Script, DomSnapshotEvent, ScreenshotEvent  from .browser import Item  class WarcHandler (EventHandler): -    __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords') +    __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords', 'log', +            'maxLogSize', 'logEncoding')      def __init__ (self, fd, -            logger=logging.getLogger(__name__), +            logger,              maxBodySize=defaultSettings.maxBodySize):          self.logger = logger          self.writer = WARCWriter (fd, gzip=True)          self.maxBodySize = maxBodySize + +        self.logEncoding = 'utf-8' +        self.log = BytesIO () +        # max log buffer size (bytes) +        self.maxLogSize = 500*1024 +          # maps document urls to WARC record ids, required for DomSnapshotEvent          # and ScreenshotEvent          self.documentRecords = {} +    def __enter__ (self): +        return self + +    def __exit__(self, exc_type, exc_value, traceback): +        self._flushLogEntries () +      def _writeRequest (self, item):          writer = self.writer @@ -204,6 +216,24 @@ class WarcHandler (EventHandler):          warcinfo = writer.create_warcinfo_record (filename=None, info=item.payload)          writer.write_record (warcinfo) +    def _flushLogEntries (self): +        writer = self.writer +        self.log.seek (0) +        # XXX: we should use the type continuation here +        record = writer.create_warc_record (packageUrl ('log'), 'resource', payload=self.log, +                warc_headers_dict={'Content-Type': 'text/plain; encoding={}'.format (self.logEncoding)}) +        writer.write_record (record) +        self.log = BytesIO () + +    def _writeLog (self, item): +        """ Handle log entries, called by .logger.WarcHandlerConsumer only """ +        self.log.write (item.encode (self.logEncoding)) +        self.log.write (b'\n') +        # instead of locking, check we’re running in the main thread +        if self.log.tell () > self.maxLogSize and \ +                threading.current_thread () is threading.main_thread (): +            self._flushLogEntries () +      route = {Script: _writeScript,              Item: _writeItem,              DomSnapshotEvent: _writeDomSnapshot, | 
