diff options
| author | Lars-Dominik Braun <lars@6xq.net> | 2019-05-30 14:51:11 +0300 | 
|---|---|---|
| committer | Lars-Dominik Braun <lars@6xq.net> | 2019-05-30 14:52:48 +0300 | 
| commit | 6067fa02b8252c3e084a9d9b0cd122ec217e5ee7 (patch) | |
| tree | e84ab7bd96beb46e9f2b8fc97838d7ef6143f30a /crocoite | |
| parent | 79d9adf23e9993ae36ad9a89108ab79eec32882a (diff) | |
| download | crocoite-6067fa02b8252c3e084a9d9b0cd122ec217e5ee7.tar.gz crocoite-6067fa02b8252c3e084a9d9b0cd122ec217e5ee7.tar.bz2 crocoite-6067fa02b8252c3e084a9d9b0cd122ec217e5ee7.zip | |
controller: Fix DepthLimit
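The policy itself must be stateless, since there can be multiple
ExtractLinks events. Previously DepthLimit decremented its own maxdepth on
every call, so each additional event reduced the remaining depth again.
The depth is now stored with each URL entry and compared against maxdepth.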
Diffstat (limited to 'crocoite')
| -rw-r--r-- | crocoite/controller.py | 42 | ||||
| -rw-r--r-- | crocoite/test_controller.py | 15 | 
2 files changed, 45 insertions, 12 deletions
```diff
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 432d434..61153ca 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -233,6 +233,24 @@ class SinglePageController:
             else:
                 handle.cancel ()
 
+class SetEntry:
+    """ A object, to be used with sets, that compares equality only on its
+    primary property. """
+    def __init__ (self, value, **props):
+        self.value = value
+        for k, v in props.items ():
+            setattr (self, k, v)
+
+    def __eq__ (self, b):
+        assert isinstance (b, SetEntry)
+        return self.value == b.value
+
+    def __hash__ (self):
+        return hash (self.value)
+
+    def __repr__ (self):
+        return f'<SetEntry {self.value!r}>'
+
 class RecursionPolicy:
     """ Abstract recursion policy """
 
@@ -251,16 +269,14 @@ class DepthLimit (RecursionPolicy):
     __slots__ = ('maxdepth', )
 
     def __init__ (self, maxdepth=0):
-        if maxdepth < 0 or maxdepth > 1:
-            raise ValueError ('Unsupported')
         self.maxdepth = maxdepth
 
     def __call__ (self, urls):
-        if self.maxdepth <= 0:
-            return {}
-        else:
-            self.maxdepth -= 1
-            return urls
+        newurls = set ()
+        for u in urls:
+            if u.depth <= self.maxdepth:
+                newurls.add (u)
+        return newurls
 
     def __repr__ (self):
         return f'<DepthLimit {self.maxdepth}>'
@@ -280,7 +296,7 @@ class PrefixLimit (RecursionPolicy):
         self.prefix = prefix
 
     def __call__ (self, urls):
-        return set (filter (lambda u: str(u).startswith (str (self.prefix)), urls))
+        return set (filter (lambda u: str(u.value).startswith (str (self.prefix)), urls))
 
 class RecursiveController:
     """
@@ -310,13 +326,17 @@ class RecursiveController:
         # keep in sync with StatsHandler
         self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0, 'ignored': 0}
 
-    async def fetch (self, url):
+    async def fetch (self, entry):
         """
         Fetch a single URL using an external command
 
         command is usually crocoite-grab
         """
 
+        assert isinstance (entry, SetEntry)
+
+        url = entry.value
+        depth = entry.depth
         logger = self.logger.bind (url=url)
 
         def formatCommand (e):
@@ -356,7 +376,7 @@ class RecursiveController:
                 data = json.loads (data)
                 uuid = data.get ('uuid')
                 if uuid == '8ee5e9c9-1130-4c5c-88ff-718508546e0c':
-                    links = set (self.policy (map (lambda x: URL(x).with_fragment(None), data.get ('links', []))))
+                    links = set (self.policy (map (lambda x: SetEntry (URL(x).with_fragment(None), depth=depth+1), data.get ('links', []))))
                     links.difference_update (self.have)
                     self.pending.update (links)
                 elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':
@@ -387,7 +407,7 @@ class RecursiveController:
 
         try:
             self.have = set ()
-            self.pending = set ([self.url])
+            self.pending = set ([SetEntry (self.url, depth=0)])
 
             while self.pending:
                 # since pending is a set this picks a random item, which is fine
diff --git a/crocoite/test_controller.py b/crocoite/test_controller.py
index c88b80d..6f92e23 100644
--- a/crocoite/test_controller.py
+++ b/crocoite/test_controller.py
@@ -26,7 +26,7 @@ from aiohttp import web
 import pytest
 
 from .logger import Logger
-from .controller import ControllerSettings, SinglePageController
+from .controller import ControllerSettings, SinglePageController, SetEntry
 from .devtools import Process
 from .test_browser import loader
 
@@ -104,3 +104,16 @@ window.setInterval (function () { fetch('/').then (function (e) { console.log (e
     finally:
         await runner.cleanup ()
 
+def test_set_entry ():
+    a = SetEntry (1, a=2, b=3)
+    assert a == a
+    assert hash (a) == hash (a)
+
+    b = SetEntry (1, a=2, b=4)
+    assert a == b
+    assert hash (a) == hash (b)
+
+    c = SetEntry (2, a=2, b=3)
+    assert a != c
+    assert hash (a) != hash (c)
+
```
