From fd383fd5f5bac0a4cebbacf7e1ffccfd0be04e50 Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Tue, 21 Aug 2018 13:46:08 +0200
Subject: Log extracted links

Add LogHandler, an EventHandler that logs the links carried by an
ExtractLinksEvent. Links are emitted in batches of at most 100 per log
message, so a single JSON blob does not get too big. The handler is
registered in the single-page CLI between StatsHandler and the WARC
handler.
---
 crocoite/cli.py        |  4 ++--
 crocoite/controller.py | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/crocoite/cli.py b/crocoite/cli.py
index ac7e648..73ddca1 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -26,7 +26,7 @@ import argparse, json, sys
 
 from . import behavior
 from .controller import SinglePageController, defaultSettings, \
-        ControllerSettings, StatsHandler
+        ControllerSettings, StatsHandler, LogHandler
 from .browser import NullService, ChromeService
 from .warc import WarcHandler
 from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer
@@ -55,7 +55,7 @@ def single ():
             idleTimeout=args.idleTimeout, timeout=args.timeout)
     with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler:
         logger.connect (WarcHandlerConsumer (warcHandler))
-        handler = [StatsHandler (), warcHandler]
+        handler = [StatsHandler (), LogHandler (logger), warcHandler]
         b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames))
         controller = SinglePageController (args.url, fd, settings=settings,
                 service=service, handler=handler, behavior=b, logger=logger)
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 9dae96f..01edc44 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -69,6 +69,29 @@ class StatsHandler (EventHandler):
         elif isinstance (item, BrowserCrashed):
             self.stats['crashed'] += 1
 
+from .behavior import ExtractLinksEvent
+from itertools import islice
+
+class LogHandler (EventHandler):
+    """ Handle items by logging information about them """
+
+    __slots__ = ('logger',)
+
+    def __init__ (self, logger):
+        self.logger = logger.bind (context=type (self).__name__)
+
+    def push (self, item):
+        if isinstance (item, ExtractLinksEvent):
+            # limit number of links per message, so json blob won’t get too big
+            it = iter (item.links)
+            limit = 100
+            while True:
+                limitlinks = list (islice (it, 0, limit))
+                if not limitlinks:
+                    break
+                self.logger.info ('extracted links', context=type (item).__name__,
+                        uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=limitlinks)
+
 import time, platform
 
 from . import behavior as cbehavior
-- 
cgit v1.2.3