From eb818f0c6eb86461a0db1845876f2a0b39b99b7f Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Tue, 1 May 2018 10:37:08 +0200
Subject: behavior: Add link extraction script

---
 crocoite/controller.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'crocoite/controller.py')

diff --git a/crocoite/controller.py b/crocoite/controller.py
index a338559..00958a9 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -40,6 +40,13 @@ from .browser import ChromeService
 from .warc import WarcLoader, SerializingWARCWriter
 from .util import getFormattedViewportMetrics
 
+def firstOrNone (it):
+    """ Return first item of iterator it or None if empty """
+    try:
+        return next (it)
+    except StopIteration:
+        return None
+
 class SinglePageController:
     """
     Archive a single page url to file output.
@@ -55,7 +62,7 @@ class SinglePageController:
         self.logger = logger
 
     def run (self):
-        ret = {'stats': None}
+        ret = {'stats': None, 'links': None}
 
         with self.service as browser:
             browser = pychrome.Browser (url=browser)
@@ -77,6 +84,8 @@ class SinglePageController:
                 # not all behavior scripts are allowed for every URL, filter them
                 enabledBehavior = list (filter (lambda x: self.url in x,
                         map (lambda x: x (l), self.behavior)))
+                linksBehavior = firstOrNone (filter (lambda x: isinstance (x, cbehavior.ExtractLinks),
+                        enabledBehavior))
 
                 for b in enabledBehavior:
                     self.logger.debug ('starting onload behavior {}'.format (b.name))
@@ -98,6 +107,7 @@ class SinglePageController:
                     b.onfinish ()
 
                 ret['stats'] = l.stats
+                ret['links'] = linksBehavior.links if linksBehavior else None
             writer.flush ()
         return ret
 
-- 
cgit v1.2.3