summary | refs | log | tree | commit | diff
path: root/crocoite
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite')
-rw-r--r-- crocoite/cli.py 4
-rw-r--r-- crocoite/controller.py 23
2 files changed, 25 insertions, 2 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index ac7e648..73ddca1 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -26,7 +26,7 @@ import argparse, json, sys
from . import behavior
from .controller import SinglePageController, defaultSettings, \
- ControllerSettings, StatsHandler
+ ControllerSettings, StatsHandler, LogHandler
from .browser import NullService, ChromeService
from .warc import WarcHandler
from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer
@@ -55,7 +55,7 @@ def single ():
idleTimeout=args.idleTimeout, timeout=args.timeout)
with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler:
logger.connect (WarcHandlerConsumer (warcHandler))
- handler = [StatsHandler (), warcHandler]
+ handler = [StatsHandler (), LogHandler (logger), warcHandler]
b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames))
controller = SinglePageController (args.url, fd, settings=settings,
service=service, handler=handler, behavior=b, logger=logger)
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 9dae96f..01edc44 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -69,6 +69,29 @@ class StatsHandler (EventHandler):
elif isinstance (item, BrowserCrashed):
self.stats['crashed'] += 1
+from .behavior import ExtractLinksEvent
+from itertools import islice
+
+class LogHandler (EventHandler):
+    """ Handle items by logging information about them """
+
+    __slots__ = ('logger', )
+
+    def __init__ (self, logger):
+        """ Keep a logger bound to this class’ name as context """
+        self.logger = logger.bind (context=type (self).__name__)
+
+    def push (self, item):
+        """ Log links of an ExtractLinksEvent in batches; ignore other items """
+        if isinstance (item, ExtractLinksEvent):
+            # limit number of links per message, so json blob won’t get too big
+            it = iter (item.links)
+            limit = 100
+            # islice consumes `it`, an empty batch means it is exhausted
+            for limitlinks in iter (lambda: list (islice (it, limit)), []):
+                self.logger.info ('extracted links', context=type (item).__name__,
+                        uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=limitlinks)
+
import time, platform
from . import behavior as cbehavior