From 48cabe3b0c7e7a760de4d40af55717c024fdc3bf Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Tue, 1 May 2018 15:59:30 +0200
Subject: Add support for recursive crawls

Only local right now, not distributed.
---
 crocoite/cli.py        |  17 ++++++++-
 crocoite/controller.py | 100 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 2 deletions(-)

diff --git a/crocoite/cli.py b/crocoite/cli.py
index 4cbce4a..196162e 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -25,7 +25,8 @@ Command line interface
 
 import logging, argparse, json, sys
 from . import behavior
-from .controller import SinglePageController, defaultSettings, ControllerSettings
+from .controller import RecursiveController, defaultSettings, \
+        ControllerSettings, DepthLimit, PrefixLimit
 
 def stateCallback (data):
     result = data['result']
@@ -35,6 +36,7 @@
 def main ():
     parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
     parser.add_argument('--browser', help='DevTools URL', metavar='URL')
+    parser.add_argument('--recursive', help='Follow links recursively')
     parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
     parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
     parser.add_argument('--log-buffer', default=defaultSettings.logBuffer, type=int, dest='logBuffer', metavar='LINES')
@@ -52,8 +54,18 @@
 
     # prepare args for function
     distributed = args.distributed
+    if args.recursive is None:
+        recursionPolicy = DepthLimit (0)
+    elif args.recursive.isdigit ():
+        recursionPolicy = DepthLimit (int (args.recursive))
+    elif args.recursive == 'prefix':
+        recursionPolicy = PrefixLimit (args.url)
+    else:
+        parser.error ('Invalid argument for --recursive')
 
     if distributed:
+        assert args.recursive is None, "Distributed crawls cannot be recursive right now, sorry"
+
         from .task import archive
         settings = dict (maxBodySize=args.maxBodySize,
                 logBuffer=args.logBuffer, idleTimeout=args.idleTimeout,
@@ -67,7 +79,8 @@
                 logBuffer=args.logBuffer, idleTimeout=args.idleTimeout,
                 timeout=args.timeout)
         with open (args.output, 'wb') as fd:
-            controller = SinglePageController (args.url, fd, settings=settings)
+            controller = RecursiveController (args.url, fd, settings=settings,
+                    recursionPolicy=recursionPolicy)
             r = controller.run ()
             json.dump (r, sys.stdout)
 
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 00958a9..638cb6c 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -32,6 +32,7 @@ class ControllerSettings:
 defaultSettings = ControllerSettings ()
 
 import logging
+from urllib.parse import urlsplit, urlunsplit
 
 import pychrome
 
@@ -111,3 +112,102 @@ class SinglePageController:
             writer.flush ()
 
         return ret
+from collections import UserDict
+
+class IntegerDict (UserDict):
+    """ Dict with dict/dict per-item arithmetic propagation, i.e. {1: 2}+{1: 1}={1: 3} """
+    def __add__ (self, b):
+        newdict = self.__class__ (self)
+        for k, v in b.items ():
+            if k in self:
+                newdict[k] += v
+            else:
+                newdict[k] = v
+        return newdict
+
+class RecursionPolicy:
+    """ Abstract recursion policy """
+    def __call__ (self, urls):
+        raise NotImplementedError
+
+class DepthLimit (RecursionPolicy):
+    """
+    Limit recursion by depth.
+
+    depth==0 means no recursion, depth==1 is the page and outgoing links, …
+    """
+    def __init__ (self, maxdepth=0):
+        self.maxdepth = maxdepth
+
+    def __call__ (self, urls):
+        if self.maxdepth <= 0:
+            return {}
+        else:
+            self.maxdepth -= 1
+            return urls
+
+    def __repr__ (self):
+        return '<DepthLimit {}>'.format (self.maxdepth)
+
+class PrefixLimit (RecursionPolicy):
+    """
+    Limit recursion by prefix
+
+    i.e. prefix=http://example.com/foo
+    ignored: http://example.com/bar http://offsite.example/foo
+    accepted: http://example.com/foobar http://example.com/foo/bar
+    """
+    def __init__ (self, prefix):
+        self.prefix = prefix
+
+    def __call__ (self, urls):
+        return set (filter (lambda u: u.startswith (self.prefix), urls))
+
+def removeFragment (u):
+    """ Remove fragment from url (i.e. #hashvalue) """
+    s = urlsplit (u)
+    return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
+
+class RecursiveController:
+    """
+    Simple recursive controller
+
+    Visits links according to recursionPolicy
+    """
+
+    def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available, \
+            logger=logging.getLogger(__name__), settings=defaultSettings,
+            recursionPolicy=DepthLimit (0)):
+        self.url = url
+        self.output = output
+        self.service = service
+        self.behavior = behavior
+        self.settings = settings
+        self.logger = logger
+        self.recursionPolicy = recursionPolicy
+
+    def run (self):
+        have = set ()
+        urls = set ([self.url])
+        ret = {'stats': IntegerDict ()}
+
+        while urls:
+            self.logger.info ('retrieving {} urls'.format (len (urls)))
+            result = []
+            for u in urls:
+                c = SinglePageController (u, self.output, self.service,
+                        self.behavior, self.logger, self.settings)
+                result.append (c.run ())
+
+            have.update (urls)
+            urls = set ()
+            for r in result:
+                ret['stats'] += r['stats']
+                urls.update (map (removeFragment, r['links']))
+            urls.difference_update (have)
+
+            urls = self.recursionPolicy (urls)
+        # everything in ret must be serializable
+        ret['stats'] = dict (ret['stats'])
+        return ret
+
-- 
cgit v1.2.3
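
Usage sketch: the recursion policies added above are plain callables that take the set of newly discovered URLs and return the subset to crawl in the next round, and RecursiveController feeds each round of SinglePageController results back through the active policy until nothing is left. Below is a minimal sketch of driving it from Python, assuming this patch is applied and crocoite is importable; all URLs and the output file name are illustrative placeholders, not part of the patch.

    # Minimal sketch, assuming this patch is applied. The URLs and the
    # output file name are placeholders, not part of the patch.
    from crocoite.controller import RecursiveController, DepthLimit, PrefixLimit

    # Policies can be exercised in isolation: each call consumes the set of
    # candidate URLs and returns the ones to fetch in the next round.
    candidates = {'http://example.com/foo/bar', 'http://example.com/baz',
            'http://offsite.example/foo'}
    print (PrefixLimit ('http://example.com/foo') (candidates))
    # prints {'http://example.com/foo/bar'}

    # A recursive grab, one level deep (the page plus its outgoing links),
    # writing everything into a single WARC. The default ChromeService
    # launches a local Chrome instance.
    with open ('example.warc.gz', 'wb') as fd:
        controller = RecursiveController ('http://example.com/', fd,
                recursionPolicy=DepthLimit (1))
        result = controller.run ()  # serializable dict with crawl 'stats'

From the command line the same is expressed through the new flag, i.e. --recursive=N for a depth limit or --recursive=prefix to stay below the start URL, per the argument handling in cli.py above.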