summary refs log tree commit diff
path: root/crocoite
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2018-05-01 10:37:08 +0200
committer Lars-Dominik Braun <lars@6xq.net> 2018-05-04 16:00:05 +0200
commit eb818f0c6eb86461a0db1845876f2a0b39b99b7f (patch)
tree 2b25cb248e4a2aca0577a336f48ae112c7e54408 /crocoite
parent d8193765df85293f825abc486ac5cb1f5ac0a028 (diff)
download crocoite-eb818f0c6eb86461a0db1845876f2a0b39b99b7f.tar.gz
crocoite-eb818f0c6eb86461a0db1845876f2a0b39b99b7f.tar.bz2
crocoite-eb818f0c6eb86461a0db1845876f2a0b39b99b7f.zip
behavior: Add link extraction script
Diffstat (limited to 'crocoite')
-rw-r--r-- crocoite/behavior.py 22
-rw-r--r-- crocoite/cli.py 5
-rw-r--r-- crocoite/controller.py 12
-rw-r--r-- crocoite/data/extract-links.js 9
4 files changed, 43 insertions, 5 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index f6dfd3f..c658699 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -238,11 +238,29 @@ class Click (JsOnload):
name = 'click'
scriptPath = 'click.js'
-### Site-specific scripts ###
+class ExtractLinks (Behavior):
+    """
+    Extract links from a page using JavaScript
+
+    We could retrieve a HTML snapshot and extract links here, but we’d have to
+    manually resolve relative links.
+
+    After onfinish() ran, self.links is a sorted list of unique link targets;
+    it stays None until then.
+    """
+
+    name = 'extractLinks'
+
+    def __init__ (self, loader):
+        super ().__init__ (loader)
+        self.script = self.loadScript ('extract-links.js')
+        self.links = None
+
+    def onfinish (self):
+        tab = self.loader.tab
+        self.useScript (self.script)
+        response = tab.Runtime.evaluate (expression=self.script, returnByValue=True)
+        # If the script throws, the response carries no list under
+        # result.value; report no links instead of failing with a KeyError.
+        # NOTE(review): assumes the response is a plain dict -- confirm.
+        value = response.get ('result', {}).get ('value')
+        # sorted() keeps the output stable across runs, unlike list (set (…))
+        self.links = sorted (set (value)) if isinstance (value, list) else []
# available behavior scripts. Order matters, move those modifying the page
# towards the end of available
-generic = [Scroll, EmulateScreenMetrics, Click]
+generic = [Scroll, EmulateScreenMetrics, Click, ExtractLinks]
perSite = []
available = generic + perSite + [Screenshot, DomSnapshot]
availableNames = set (map (lambda x: x.name, available))
diff --git a/crocoite/cli.py b/crocoite/cli.py
index cac5b3b..4cbce4a 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -22,7 +22,7 @@
Command line interface
"""
-import logging, argparse
+import logging, argparse, json, sys
from . import behavior
from .controller import SinglePageController, defaultSettings, ControllerSettings
@@ -68,7 +68,8 @@ def main ():
timeout=args.timeout)
with open (args.output, 'wb') as fd:
controller = SinglePageController (args.url, fd, settings=settings)
- controller.run ()
+ r = controller.run ()
+ json.dump (r, sys.stdout)
return True
diff --git a/crocoite/controller.py b/crocoite/controller.py
index a338559..00958a9 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -40,6 +40,13 @@ from .browser import ChromeService
from .warc import WarcLoader, SerializingWARCWriter
from .util import getFormattedViewportMetrics
+def firstOrNone (it):
+    """
+    Return first item of iterable it or None if it is empty
+    """
+    # iter() lets plain iterables (lists, sets) work, not just iterators
+    return next (iter (it), None)
+
class SinglePageController:
"""
Archive a single page url to file output.
@@ -55,7 +62,7 @@ class SinglePageController:
self.logger = logger
def run (self):
- ret = {'stats': None}
+ ret = {'stats': None, 'links': None}
with self.service as browser:
browser = pychrome.Browser (url=browser)
@@ -77,6 +84,8 @@ class SinglePageController:
# not all behavior scripts are allowed for every URL, filter them
enabledBehavior = list (filter (lambda x: self.url in x,
map (lambda x: x (l), self.behavior)))
+ linksBehavior = firstOrNone (filter (lambda x: isinstance (x, cbehavior.ExtractLinks),
+ enabledBehavior))
for b in enabledBehavior:
self.logger.debug ('starting onload behavior {}'.format (b.name))
@@ -98,6 +107,7 @@ class SinglePageController:
b.onfinish ()
ret['stats'] = l.stats
+ ret['links'] = linksBehavior.links if linksBehavior else None
writer.flush ()
return ret
diff --git a/crocoite/data/extract-links.js b/crocoite/data/extract-links.js
new file mode 100644
index 0000000..f2a37aa
--- /dev/null
+++ b/crocoite/data/extract-links.js
@@ -0,0 +1,9 @@
+/* Extract links from a page. Wrapped in a function so the locals do not leak
+ * into (or collide with) the page's global scope on repeated evaluation.
+ */
+(function () {
+    const body = document.body; /* null for documents without a <body> */
+    if (!body) { return []; }
+    const anchors = body.querySelectorAll ('a[href]');
+    return Array.from (anchors, (a) => a.href);
+})(); /* immediately return results, for use with Runtime.evaluate() */