behavior: Add link extraction script

author: Lars-Dominik Braun <lars@6xq.net> 2018-05-01 10:37:08 +0200
committer: Lars-Dominik Braun <lars@6xq.net> 2018-05-04 16:00:05 +0200
commit: eb818f0c6eb86461a0db1845876f2a0b39b99b7f (patch)
tree: 2b25cb248e4a2aca0577a336f48ae112c7e54408 /crocoite
parent: d8193765df85293f825abc486ac5cb1f5ac0a028 (diff)
download: crocoite-eb818f0c6eb86461a0db1845876f2a0b39b99b7f.tar.gz crocoite-eb818f0c6eb86461a0db1845876f2a0b39b99b7f.tar.bz2 crocoite-eb818f0c6eb86461a0db1845876f2a0b39b99b7f.zip
4 files changed, 43 insertions, 5 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index f6dfd3f..c658699 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -238,11 +238,29 @@ class Click (JsOnload):
     name = 'click'
     scriptPath = 'click.js'
 
-### Site-specific scripts ###
+class ExtractLinks (Behavior):
+    """
+    Extract links from a page using JavaScript
+
+    Parsing an HTML snapshot ourselves would mean resolving relative links by hand.
+    """
+
+    name = 'extractLinks'
+
+    def __init__ (self, loader):
+        super ().__init__ (loader)
+        self.script = self.loadScript ('extract-links.js')
+        self.links = None # stays None until onfinish () has run
+
+    def onfinish (self):
+        tab = self.loader.tab
+        self.useScript (self.script)
+        response = tab.Runtime.evaluate (expression=self.script, returnByValue=True)
+        self.links = list (set (response['result']['value'])) # duplicates dropped, order arbitrary
 
 # available behavior scripts. Order matters, move those modifying the page
 # towards the end of available
-generic = [Scroll, EmulateScreenMetrics, Click]
+generic = [Scroll, EmulateScreenMetrics, Click, ExtractLinks]
 perSite = []
 available = generic + perSite + [Screenshot, DomSnapshot]
 availableNames = set (map (lambda x: x.name, available))
diff --git a/crocoite/cli.py b/crocoite/cli.py
index cac5b3b..4cbce4a 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -22,7 +22,7 @@
 Command line interface
 """
 
-import logging, argparse
+import logging, argparse, json, sys
 
 from . import behavior
 from .controller import SinglePageController, defaultSettings, ControllerSettings
@@ -68,7 +68,8 @@ def main ():
                 timeout=args.timeout)
         with open (args.output, 'wb') as fd:
             controller = SinglePageController (args.url, fd, settings=settings)
-            controller.run ()
+            r = controller.run ()
+    json.dump (r, sys.stdout)
 
     return True
 
diff --git a/crocoite/controller.py b/crocoite/controller.py
index a338559..00958a9 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -40,6 +40,13 @@ from .browser import ChromeService
 from .warc import WarcLoader, SerializingWARCWriter
 from .util import getFormattedViewportMetrics
 
+def firstOrNone (it):
+    """
+    Return the first item of iterator it, or None if it is exhausted. A None
+    item and an empty iterator are indistinguishable to the caller.
+    """
+    return next (it, None)
+
 class SinglePageController:
     """
     Archive a single page url to file output.
@@ -55,7 +62,7 @@ class SinglePageController:
         self.logger = logger
 
     def run (self):
-        ret = {'stats': None}
+        ret = {'stats': None, 'links': None}
 
         with self.service as browser:
             browser = pychrome.Browser (url=browser)
@@ -77,6 +84,8 @@ class SinglePageController:
                 # not all behavior scripts are allowed for every URL, filter them
                 enabledBehavior = list (filter (lambda x: self.url in x,
                         map (lambda x: x (l), self.behavior)))
+                linksBehavior = firstOrNone (filter (lambda x: isinstance (x, cbehavior.ExtractLinks),
+                        enabledBehavior))
 
                 for b in enabledBehavior:
                     self.logger.debug ('starting onload behavior {}'.format (b.name))
@@ -98,6 +107,7 @@ class SinglePageController:
                     b.onfinish ()
 
                 ret['stats'] = l.stats
+                ret['links'] = linksBehavior.links if linksBehavior else None
             writer.flush ()
         return ret
 
diff --git a/crocoite/data/extract-links.js b/crocoite/data/extract-links.js
new file mode 100644
index 0000000..f2a37aa
--- /dev/null
+++ b/crocoite/data/extract-links.js
@@ -0,0 +1,9 @@
+/*	Extract links from a page
+ */
+(function () {
+    /* Wrapped in a function so no binding leaks into the page's global scope,
+     * where a name clash or a second evaluation would raise a SyntaxError.
+     * document.body is null if the document has no body, so query document. */
+    const hrefs = Array.from (document.querySelectorAll ('a[href]'), (a) => a.href);
+    return hrefs.filter ((h) => typeof h === 'string'); /* SVG <a>.href is an object */
+}) (); /* immediately return results, for use with Runtime.evaluate() */
author	Lars-Dominik Braun <lars@6xq.net>	2018-05-01 10:37:08 +0200
committer	Lars-Dominik Braun <lars@6xq.net>	2018-05-04 16:00:05 +0200
commit	eb818f0c6eb86461a0db1845876f2a0b39b99b7f (patch)
tree	2b25cb248e4a2aca0577a336f48ae112c7e54408 /crocoite
parent	d8193765df85293f825abc486ac5cb1f5ac0a028 (diff)
download	crocoite-eb818f0c6eb86461a0db1845876f2a0b39b99b7f.tar.gz crocoite-eb818f0c6eb86461a0db1845876f2a0b39b99b7f.tar.bz2 crocoite-eb818f0c6eb86461a0db1845876f2a0b39b99b7f.zip