diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-08-21 13:46:08 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-09-25 16:15:13 +0200 |
commit | fd383fd5f5bac0a4cebbacf7e1ffccfd0be04e50 (patch) | |
tree | eb3823a81ca6fbca3f133ed24732e94504d1c0bf | |
parent | 53e4df3fe732417988532e5b3d8b4dc7e781a3df (diff) | |
download | crocoite-fd383fd5f5bac0a4cebbacf7e1ffccfd0be04e50.tar.gz crocoite-fd383fd5f5bac0a4cebbacf7e1ffccfd0be04e50.tar.bz2 crocoite-fd383fd5f5bac0a4cebbacf7e1ffccfd0be04e50.zip |
Log extracted links
-rw-r--r-- | crocoite/cli.py | 4 | ||||
-rw-r--r-- | crocoite/controller.py | 23 |
2 files changed, 25 insertions, 2 deletions
```diff
diff --git a/crocoite/cli.py b/crocoite/cli.py
index ac7e648..73ddca1 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -26,7 +26,7 @@ import argparse, json, sys
 
 from . import behavior
 from .controller import SinglePageController, defaultSettings, \
-        ControllerSettings, StatsHandler
+        ControllerSettings, StatsHandler, LogHandler
 from .browser import NullService, ChromeService
 from .warc import WarcHandler
 from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer
@@ -55,7 +55,7 @@ def single ():
             idleTimeout=args.idleTimeout, timeout=args.timeout)
     with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler:
         logger.connect (WarcHandlerConsumer (warcHandler))
-        handler = [StatsHandler (), warcHandler]
+        handler = [StatsHandler (), LogHandler (logger), warcHandler]
         b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames))
         controller = SinglePageController (args.url, fd, settings=settings,
                 service=service, handler=handler, behavior=b, logger=logger)
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 9dae96f..01edc44 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -69,6 +69,29 @@ class StatsHandler (EventHandler):
         elif isinstance (item, BrowserCrashed):
             self.stats['crashed'] += 1
 
+from .behavior import ExtractLinksEvent
+from itertools import islice
+
+class LogHandler (EventHandler):
+    """ Handle items by logging information about them """
+
+    __slots__ = ('logger')
+
+    def __init__ (self, logger):
+        self.logger = logger.bind (context=type (self).__name__)
+
+    def push (self, item):
+        if isinstance (item, ExtractLinksEvent):
+            # limit number of links per message, so json blob won’t get too big
+            it = iter (item.links)
+            limit = 100
+            while True:
+                limitlinks = list (islice (it, 0, limit))
+                if not limitlinks:
+                    break
+                self.logger.info ('extracted links', context=type (item).__name__,
+                        uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=limitlinks)
+
 import time, platform
 
 from . import behavior as cbehavior
```