author     Lars-Dominik Braun <lars@6xq.net>  2018-05-01 15:59:30 +0200
committer  Lars-Dominik Braun <lars@6xq.net>  2018-05-04 16:00:05 +0200
commit     48cabe3b0c7e7a760de4d40af55717c024fdc3bf
tree       269b34c7d89b739ba0c0c2042db282ae20dd3ff7
parent     de2b4036ac7ce2d242d49fe37337db890c30da3b
Add support for recursive crawls
Only local right now, not distributed.
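
For reference, a rough sketch of what this commit enables from Python, using the RecursiveController and PrefixLimit introduced below (URL and output path are placeholders; the Chrome service and behavior scripts fall back to the constructor defaults):

    from crocoite.controller import RecursiveController, PrefixLimit, defaultSettings

    with open ('example.warc.gz', 'wb') as fd:
        controller = RecursiveController ('http://example.com/foo', fd,
                settings=defaultSettings,
                recursionPolicy=PrefixLimit ('http://example.com/foo'))
        result = controller.run ()   # {'stats': {...}}, JSON-serializable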
-rw-r--r--  crocoite/cli.py          17
-rw-r--r--  crocoite/controller.py  100
2 files changed, 115 insertions(+), 2 deletions(-)
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 4cbce4a..196162e 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -25,7 +25,8 @@ Command line interface
 import logging, argparse, json, sys
 from . import behavior
-from .controller import SinglePageController, defaultSettings, ControllerSettings
+from .controller import RecursiveController, defaultSettings, \
+        ControllerSettings, DepthLimit, PrefixLimit
 def stateCallback (data):
     result = data['result']
@@ -35,6 +36,7 @@ def stateCallback (data):
 def main ():
     parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
     parser.add_argument('--browser', help='DevTools URL', metavar='URL')
+    parser.add_argument('--recursive', help='Follow links recursively')
     parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
     parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
     parser.add_argument('--log-buffer', default=defaultSettings.logBuffer, type=int, dest='logBuffer', metavar='LINES')
@@ -52,8 +54,18 @@ def main ():
     # prepare args for function
     distributed = args.distributed
+    if args.recursive is None:
+        recursionPolicy = DepthLimit (0)
+    elif args.recursive.isdigit ():
+        recursionPolicy = DepthLimit (int (args.recursive))
+    elif args.recursive == 'prefix':
+        recursionPolicy = PrefixLimit (args.url)
+    else:
+        parser.error ('Invalid argument for --recursive')
     if distributed:
+        assert args.recursive is None, "Distributed crawls cannot be recursive right now, sorry"
+
         from .task import archive
         settings = dict (maxBodySize=args.maxBodySize,
                 logBuffer=args.logBuffer, idleTimeout=args.idleTimeout,
@@ -67,7 +79,8 @@ def main ():
                 logBuffer=args.logBuffer, idleTimeout=args.idleTimeout,
                 timeout=args.timeout)
         with open (args.output, 'wb') as fd:
-            controller = SinglePageController (args.url, fd, settings=settings)
+            controller = RecursiveController (args.url, fd, settings=settings,
+                    recursionPolicy=recursionPolicy)
             r = controller.run ()
             json.dump (r, sys.stdout)
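
The accepted values of --recursive map onto the two policies defined in controller.py below; a standalone illustration of that branch with sample values (not part of the commit, URL is a placeholder):

    from crocoite.controller import DepthLimit, PrefixLimit

    url = 'http://example.com/'
    for value in (None, '2', 'prefix'):
        if value is None:
            policy = DepthLimit (0)            # default: archive just the page itself
        elif value.isdigit ():
            policy = DepthLimit (int (value))  # follow links that many levels deep
        elif value == 'prefix':
            policy = PrefixLimit (url)         # stay underneath the start URL
        print (value, policy)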
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 00958a9..638cb6c 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -32,6 +32,7 @@ class ControllerSettings:
 defaultSettings = ControllerSettings ()
 import logging
+from urllib.parse import urlsplit, urlunsplit
 import pychrome
@@ -111,3 +112,102 @@ class SinglePageController:
         writer.flush ()
         return ret
+from collections import UserDict
+
+class IntegerDict (UserDict):
+    """ Dict that adds values per key, i.e. {1: 2}+{1: 1}={1: 3} """
+    def __add__ (self, b):
+        newdict = self.__class__ (self)
+        for k, v in b.items ():
+            if k in self:
+                newdict[k] += v
+            else:
+                newdict[k] = v
+        return newdict
+
+class RecursionPolicy:
+    """ Abstract recursion policy """
+    def __call__ (self, urls):
+        raise NotImplementedError
+
+class DepthLimit (RecursionPolicy):
+    """
+    Limit recursion by depth.
+
+    depth==0 means no recursion, depth==1 is the page and outgoing links, …
+    """
+    def __init__ (self, maxdepth=0):
+        self.maxdepth = maxdepth
+
+    def __call__ (self, urls):
+        if self.maxdepth <= 0:
+            return {}
+        else:
+            self.maxdepth -= 1
+            return urls
+
+    def __repr__ (self):
+        return '<DepthLimit {}>'.format (self.maxdepth)
+
+class PrefixLimit (RecursionPolicy):
+    """
+    Limit recursion by prefix
+
+    i.e. prefix=http://example.com/foo
+    ignored: http://example.com/bar http://offsite.example/foo
+    accepted: http://example.com/foobar http://example.com/foo/bar
+    """
+    def __init__ (self, prefix):
+        self.prefix = prefix
+
+    def __call__ (self, urls):
+        return set (filter (lambda u: u.startswith (self.prefix), urls))
+
+def removeFragment (u):
+    """ Remove fragment from url (e.g. #hashvalue) """
+    s = urlsplit (u)
+    return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
+
+class RecursiveController:
+    """
+    Simple recursive controller
+
+    Visits links according to recursionPolicy
+    """
+
+    def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \
+            logger=logging.getLogger(__name__), settings=defaultSettings,
+            recursionPolicy=DepthLimit (0)):
+        self.url = url
+        self.output = output
+        self.service = service
+        self.behavior = behavior
+        self.settings = settings
+        self.logger = logger
+        self.recursionPolicy = recursionPolicy
+
+    def run (self):
+        have = set ()
+        urls = set ([self.url])
+        ret = {'stats': IntegerDict ()}
+
+        while urls:
+            self.logger.info ('retrieving {} urls'.format (len (urls)))
+            result = []
+            for u in urls:
+                c = SinglePageController (u, self.output, self.service,
+                        self.behavior, self.logger, self.settings)
+                result.append (c.run ())
+
+            have.update (urls)
+            urls = set ()
+            for r in result:
+                ret['stats'] += r['stats']
+                urls.update (map (removeFragment, r['links']))
+            urls.difference_update (have)
+
+            urls = self.recursionPolicy (urls)
+        # everything in ret must be serializable
+        ret['stats'] = dict (ret['stats'])
+        return ret
+
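
The policies are plain callables over URL sets, and DepthLimit is stateful: each crawl level calls it once and decrements the remaining depth. A minimal sketch of their behavior, plus the IntegerDict arithmetic RecursiveController.run uses to merge per-page stats (all values illustrative):

    from crocoite.controller import DepthLimit, PrefixLimit, IntegerDict

    depth = DepthLimit (1)
    print (depth ({'http://example.com/a'}))   # passed through, remaining depth now 0
    print (depth ({'http://example.com/b'}))   # {} - recursion stops here

    prefix = PrefixLimit ('http://example.com/foo')
    print (prefix ({'http://example.com/foobar', 'http://example.com/bar'}))
    # only http://example.com/foobar survives the prefix filter

    # per-key addition, as used for ret['stats'] += r['stats']
    print (IntegerDict ({'requests': 2}) + {'requests': 1})   # {'requests': 3}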