From eb818f0c6eb86461a0db1845876f2a0b39b99b7f Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Tue, 1 May 2018 10:37:08 +0200
Subject: behavior: Add link extraction script

---
 crocoite/behavior.py           | 22 ++++++++++++++++++++--
 crocoite/cli.py                |  5 +++--
 crocoite/controller.py         | 12 +++++++++++-
 crocoite/data/extract-links.js |  9 +++++++++
 4 files changed, 43 insertions(+), 5 deletions(-)
 create mode 100644 crocoite/data/extract-links.js

(limited to 'crocoite')

diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index f6dfd3f..c658699 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -238,11 +238,29 @@ class Click (JsOnload):
     name = 'click'
     scriptPath = 'click.js'
 
-### Site-specific scripts ###
+class ExtractLinks (Behavior):
+    """
+    Extract links from a page using JavaScript
+
+    We could retrieve a HTML snapshot and extract links here, but we’d have to
+    manually resolve relative links.
+    """
+
+    name = 'extractLinks'
+
+    def __init__ (self, loader):
+        super ().__init__ (loader)
+        self.script = self.loadScript ('extract-links.js')
+        self.links = None
+
+    def onfinish (self):
+        tab = self.loader.tab
+        self.useScript (self.script)
+        self.links = list (set (tab.Runtime.evaluate (expression=self.script, returnByValue=True)['result']['value']))
 
 # available behavior scripts. Order matters, move those modifying the page
 # towards the end of available
-generic = [Scroll, EmulateScreenMetrics, Click]
+generic = [Scroll, EmulateScreenMetrics, Click, ExtractLinks]
 perSite = []
 available = generic + perSite + [Screenshot, DomSnapshot]
 availableNames = set (map (lambda x: x.name, available))
diff --git a/crocoite/cli.py b/crocoite/cli.py
index cac5b3b..4cbce4a 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -22,7 +22,7 @@
 Command line interface
 """
 
-import logging, argparse
+import logging, argparse, json, sys
 
 from . import behavior
 from .controller import SinglePageController, defaultSettings, ControllerSettings
@@ -68,7 +68,8 @@ def main ():
             timeout=args.timeout)
     with open (args.output, 'wb') as fd:
         controller = SinglePageController (args.url, fd, settings=settings)
-        controller.run ()
+        r = controller.run ()
+        json.dump (r, sys.stdout)
 
     return True
 
diff --git a/crocoite/controller.py b/crocoite/controller.py
index a338559..00958a9 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -40,6 +40,13 @@ from .browser import ChromeService
 from .warc import WarcLoader, SerializingWARCWriter
 from .util import getFormattedViewportMetrics
 
+def firstOrNone (it):
+    """ Return first item of iterator it or None if empty """
+    try:
+        return next (it)
+    except StopIteration:
+        return None
+
 class SinglePageController:
     """
     Archive a single page url to file output.
@@ -55,7 +62,7 @@ class SinglePageController:
         self.logger = logger
 
     def run (self):
-        ret = {'stats': None}
+        ret = {'stats': None, 'links': None}
 
         with self.service as browser:
             browser = pychrome.Browser (url=browser)
@@ -77,6 +84,8 @@ class SinglePageController:
                 # not all behavior scripts are allowed for every URL, filter them
                 enabledBehavior = list (filter (lambda x: self.url in x,
                         map (lambda x: x (l), self.behavior)))
+                linksBehavior = firstOrNone (filter (lambda x: isinstance (x, cbehavior.ExtractLinks),
+                        enabledBehavior))
 
                 for b in enabledBehavior:
                     self.logger.debug ('starting onload behavior {}'.format (b.name))
@@ -98,6 +107,7 @@ class SinglePageController:
                     b.onfinish ()
 
                 ret['stats'] = l.stats
+                ret['links'] = linksBehavior.links if linksBehavior else None
             writer.flush ()
         return ret
 
diff --git a/crocoite/data/extract-links.js b/crocoite/data/extract-links.js
new file mode 100644
index 0000000..f2a37aa
--- /dev/null
+++ b/crocoite/data/extract-links.js
@@ -0,0 +1,9 @@
+/* Extract links from a page
+ */
+let x = document.body.querySelectorAll('a[href]');
+let ret = [];
+let index = 0;
+for( index=0; index < x.length; index++ ) {
+    ret.push (x[index].href);
+}
+ret; /* immediately return results, for use with Runtime.evaluate() */
-- 
cgit v1.2.3