diff options
| -rw-r--r-- | crocoite/cli.py | 4 | ||||
| -rw-r--r-- | crocoite/controller.py | 23 | 
2 files changed, 25 insertions, 2 deletions
| diff --git a/crocoite/cli.py b/crocoite/cli.py index ac7e648..73ddca1 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -26,7 +26,7 @@ import argparse, json, sys  from . import behavior  from .controller import SinglePageController, defaultSettings, \ -        ControllerSettings, StatsHandler +        ControllerSettings, StatsHandler, LogHandler  from .browser import NullService, ChromeService  from .warc import WarcHandler  from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer @@ -55,7 +55,7 @@ def single ():              idleTimeout=args.idleTimeout, timeout=args.timeout)      with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler:          logger.connect (WarcHandlerConsumer (warcHandler)) -        handler = [StatsHandler (), warcHandler] +        handler = [StatsHandler (), LogHandler (logger), warcHandler]          b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames))          controller = SinglePageController (args.url, fd, settings=settings,                  service=service, handler=handler, behavior=b, logger=logger) diff --git a/crocoite/controller.py b/crocoite/controller.py index 9dae96f..01edc44 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -69,6 +69,29 @@ class StatsHandler (EventHandler):          elif isinstance (item, BrowserCrashed):              self.stats['crashed'] += 1 +from .behavior import ExtractLinksEvent +from itertools import islice + +class LogHandler (EventHandler): +    """ Handle items by logging information about them """ + +    __slots__ = ('logger') + +    def __init__ (self, logger): +        self.logger = logger.bind (context=type (self).__name__) + +    def push (self, item): +        if isinstance (item, ExtractLinksEvent): +            # limit number of links per message, so json blob won’t get too big +            it = iter (item.links) +            limit = 100 +            while True: +                limitlinks = list (islice (it, 0, limit)) +                if not limitlinks: +                    break +                self.logger.info ('extracted links', context=type (item).__name__, +                        uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=limitlinks) +  import time, platform  from . import behavior as cbehavior | 
