| -rw-r--r-- | crocoite/cli.py        |  17 |
| -rw-r--r-- | crocoite/controller.py | 100 |
2 files changed, 115 insertions, 2 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 4cbce4a..196162e 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -25,7 +25,8 @@ Command line interface
 import logging, argparse, json, sys
 
 from . import behavior
-from .controller import SinglePageController, defaultSettings, ControllerSettings
+from .controller import RecursiveController, defaultSettings, \
+        ControllerSettings, DepthLimit, PrefixLimit
 
 def stateCallback (data):
     result = data['result']
@@ -35,6 +36,7 @@ def stateCallback (data):
 def main ():
     parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
     parser.add_argument('--browser', help='DevTools URL', metavar='URL')
+    parser.add_argument('--recursive', help='Follow links recursively')
     parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival', metavar='SEC')
     parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
     parser.add_argument('--log-buffer', default=defaultSettings.logBuffer, type=int, dest='logBuffer', metavar='LINES')
@@ -52,8 +54,18 @@ def main ():
 
     # prepare args for function
     distributed = args.distributed
+    if args.recursive is None:
+        recursionPolicy = DepthLimit (0)
+    elif args.recursive.isdigit ():
+        recursionPolicy = DepthLimit (int (args.recursive))
+    elif args.recursive == 'prefix':
+        recursionPolicy = PrefixLimit (args.url)
+    else:
+        parser.error ('Invalid argument for --recursive')
 
     if distributed:
+        assert args.recursive is None, "Distributed crawls cannot be recursive right now, sorry"
+
         from .task import archive
         settings = dict (maxBodySize=args.maxBodySize,
                 logBuffer=args.logBuffer, idleTimeout=args.idleTimeout,
@@ -67,7 +79,8 @@ def main ():
                 logBuffer=args.logBuffer, idleTimeout=args.idleTimeout,
                 timeout=args.timeout)
         with open (args.output, 'wb') as fd:
-            controller = SinglePageController (args.url, fd, settings=settings)
+            controller = RecursiveController (args.url, fd, settings=settings,
+                    recursionPolicy=recursionPolicy)
             r = controller.run ()
 
     json.dump (r, sys.stdout)
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 00958a9..638cb6c 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -32,6 +32,7 @@ class ControllerSettings:
 defaultSettings = ControllerSettings ()
 
 import logging
+from urllib.parse import urlsplit, urlunsplit
 
 import pychrome
 
@@ -111,3 +112,102 @@ class SinglePageController:
             writer.flush ()
 
         return ret
+from collections import UserDict
+
+class IntegerDict (UserDict):
+    """ Dict with dict/dict per-item arithmetic propagation, e.g. {1: 2}+{1: 1}={1: 3} """
+    def __add__ (self, b):
+        newdict = self.__class__ (self)
+        for k, v in b.items ():
+            if k in self:
+                newdict[k] += v
+            else:
+                newdict[k] = v
+        return newdict
+
+class RecursionPolicy:
+    """ Abstract recursion policy """
+    def __call__ (self, urls):
+        raise NotImplementedError
+
+class DepthLimit (RecursionPolicy):
+    """
+    Limit recursion by depth.
+
+    depth==0 means no recursion, depth==1 is the page and outgoing links, …
+    """
+    def __init__ (self, maxdepth=0):
+        self.maxdepth = maxdepth
+
+    def __call__ (self, urls):
+        if self.maxdepth <= 0:
+            return {}
+        else:
+            self.maxdepth -= 1
+            return urls
+
+    def __repr__ (self):
+        return '<DepthLimit {}>'.format (self.maxdepth)
+
+class PrefixLimit (RecursionPolicy):
+    """
+    Limit recursion by prefix,
+
+    e.g. prefix=http://example.com/foo
+    ignored: http://example.com/bar http://offsite.example/foo
+    accepted: http://example.com/foobar http://example.com/foo/bar
+    """
+    def __init__ (self, prefix):
+        self.prefix = prefix
+
+    def __call__ (self, urls):
+        return set (filter (lambda u: u.startswith (self.prefix), urls))
+
+def removeFragment (u):
+    """ Remove fragment from url (e.g. #hashvalue) """
+    s = urlsplit (u)
+    return urlunsplit ((s.scheme, s.netloc, s.path, s.query, ''))
+
+class RecursiveController:
+    """
+    Simple recursive controller
+
+    Visits links according to recursionPolicy
+    """
+
+    def __init__ (self, url, output, service=ChromeService (), behavior=cbehavior.available,
+            logger=logging.getLogger(__name__), settings=defaultSettings,
+            recursionPolicy=DepthLimit (0)):
+        self.url = url
+        self.output = output
+        self.service = service
+        self.behavior = behavior
+        self.settings = settings
+        self.logger = logger
+        self.recursionPolicy = recursionPolicy
+
+    def run (self):
+        have = set ()
+        urls = set ([self.url])
+        ret = {'stats': IntegerDict ()}
+
+        while urls:
+            self.logger.info ('retrieving {} urls'.format (len (urls)))
+            result = []
+            for u in urls:
+                c = SinglePageController (u, self.output, self.service,
+                        self.behavior, self.logger, self.settings)
+                result.append (c.run ())
+
+            have.update (urls)
+            urls = set ()
+            for r in result:
+                ret['stats'] += r['stats']
+                urls.update (map (removeFragment, r['links']))
+            urls.difference_update (have)
+
+            urls = self.recursionPolicy (urls)
+        # everything in ret must be serializable
+        ret['stats'] = dict (ret['stats'])
+        return ret
+
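IntegerDict exists so the per-page stats dicts returned by each SinglePageController run can be merged key by key in RecursiveController.run. A minimal sketch of the semantics (the stat keys below are hypothetical, for illustration only):

    from crocoite.controller import IntegerDict

    a = IntegerDict ({'requests': 2, 'bytesRcv': 1024})
    b = IntegerDict ({'requests': 1, 'failed': 1})
    # keys present in both operands are added; keys present in only
    # one operand are carried over unchanged
    print (dict (a + b))   # {'requests': 3, 'bytesRcv': 1024, 'failed': 1}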
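Note that DepthLimit is stateful: each call decrements maxdepth, and RecursiveController applies the policy once per crawl round, so the number effectively limits recursion rounds rather than being a per-link property. A small sketch of that behaviour (URLs made up):

    from crocoite.controller import DepthLimit

    policy = DepthLimit (1)
    print (policy ({'http://example.com/a'}))   # passes the set through; maxdepth is now 0
    print (policy ({'http://example.com/b'}))   # returns {} -- an empty (falsy) dict, ending the crawl loop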
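In RecursiveController.run the harvested links are stripped of their fragment via removeFragment and deduplicated against the have set before the policy sees them. A sketch combining both helpers, reusing the URLs from the PrefixLimit docstring:

    from crocoite.controller import PrefixLimit, removeFragment

    urls = {'http://example.com/foo/bar#top',
            'http://example.com/bar',
            'http://offsite.example/foo'}
    policy = PrefixLimit ('http://example.com/foo')
    # fragment is removed first, then only URLs starting with the prefix survive
    print (policy (set (map (removeFragment, urls))))   # {'http://example.com/foo/bar'}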
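On the command line, --recursive accepts either a decimal depth or the literal value prefix. Assuming the console entry point for crocoite.cli:main is installed as crocoite-grab and that url and output are the existing positional arguments (both lie outside the hunks shown here), invocations would look like:

    crocoite-grab --recursive 1 http://example.com/ example.warc.gz
    crocoite-grab --recursive prefix http://example.com/blog/ blog.warc.gz

Omitting the flag keeps the old single-page behaviour (DepthLimit (0)).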
