summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2018-08-21 13:46:08 +0200
committer Lars-Dominik Braun <lars@6xq.net> 2018-09-25 16:15:13 +0200
commit fd383fd5f5bac0a4cebbacf7e1ffccfd0be04e50 (patch)
tree eb3823a81ca6fbca3f133ed24732e94504d1c0bf
parent 53e4df3fe732417988532e5b3d8b4dc7e781a3df (diff)
download crocoite-fd383fd5f5bac0a4cebbacf7e1ffccfd0be04e50.tar.gz
crocoite-fd383fd5f5bac0a4cebbacf7e1ffccfd0be04e50.tar.bz2
crocoite-fd383fd5f5bac0a4cebbacf7e1ffccfd0be04e50.zip
Log extracted links
-rw-r--r-- crocoite/cli.py 4
-rw-r--r-- crocoite/controller.py 23
2 files changed, 25 insertions, 2 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index ac7e648..73ddca1 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -26,7 +26,7 @@ import argparse, json, sys
from . import behavior
from .controller import SinglePageController, defaultSettings, \
- ControllerSettings, StatsHandler
+ ControllerSettings, StatsHandler, LogHandler
from .browser import NullService, ChromeService
from .warc import WarcHandler
from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, WarcHandlerConsumer
@@ -55,7 +55,7 @@ def single ():
idleTimeout=args.idleTimeout, timeout=args.timeout)
with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler:
logger.connect (WarcHandlerConsumer (warcHandler))
- handler = [StatsHandler (), warcHandler]
+ handler = [StatsHandler (), LogHandler (logger), warcHandler]
b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames))
controller = SinglePageController (args.url, fd, settings=settings,
service=service, handler=handler, behavior=b, logger=logger)
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 9dae96f..01edc44 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -69,6 +69,29 @@ class StatsHandler (EventHandler):
elif isinstance (item, BrowserCrashed):
self.stats['crashed'] += 1
+from .behavior import ExtractLinksEvent
+from itertools import islice
+
+class LogHandler (EventHandler):
+    """
+    Handle items by logging information about them.
+
+    Only ExtractLinksEvent items produce log output; every other item is
+    ignored.
+    """
+
+    # The trailing comma matters: ('logger') is merely a parenthesized string,
+    # not a one-element tuple.
+    __slots__ = ('logger', )
+
+    def __init__ (self, logger):
+        """
+        :param logger: Logger to write to. It is bound with this class’ name
+            as context before being stored.
+        """
+        self.logger = logger.bind (context=type (self).__name__)
+
+    def push (self, item):
+        """
+        Log the links of an ExtractLinksEvent in batches of at most 100.
+
+        :param item: Item to inspect; anything but an ExtractLinksEvent is
+            silently skipped.
+        """
+        if isinstance (item, ExtractLinksEvent):
+            # limit number of links per message, so json blob won’t get too big
+            it = iter (item.links)
+            limit = 100
+            while True:
+                # consume the next batch; an empty batch means we are done
+                limitlinks = list (islice (it, limit))
+                if not limitlinks:
+                    break
+                self.logger.info ('extracted links', context=type (item).__name__,
+                        uuid='8ee5e9c9-1130-4c5c-88ff-718508546e0c', links=limitlinks)
+
import time, platform
from . import behavior as cbehavior