From 6067fa02b8252c3e084a9d9b0cd122ec217e5ee7 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun <lars@6xq.net> Date: Thu, 30 May 2019 14:51:11 +0300 Subject: controller: Fix DepthLimit The policy itself must be stateless, since there can be multiple ExtractLinks events (which would cause DepthLimit to reduce its depth every time). --- crocoite/controller.py | 42 +++++++++++++++++++++++++++++++----------- crocoite/test_controller.py | 15 ++++++++++++++- 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/crocoite/controller.py b/crocoite/controller.py index 432d434..61153ca 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -233,6 +233,24 @@ class SinglePageController: else: handle.cancel () +class SetEntry: + """ A object, to be used with sets, that compares equality only on its + primary property. """ + def __init__ (self, value, **props): + self.value = value + for k, v in props.items (): + setattr (self, k, v) + + def __eq__ (self, b): + assert isinstance (b, SetEntry) + return self.value == b.value + + def __hash__ (self): + return hash (self.value) + + def __repr__ (self): + return f'<SetEntry {self.value!r}>' + class RecursionPolicy: """ Abstract recursion policy """ @@ -251,16 +269,14 @@ class DepthLimit (RecursionPolicy): __slots__ = ('maxdepth', ) def __init__ (self, maxdepth=0): - if maxdepth < 0 or maxdepth > 1: - raise ValueError ('Unsupported') self.maxdepth = maxdepth def __call__ (self, urls): - if self.maxdepth <= 0: - return {} - else: - self.maxdepth -= 1 - return urls + newurls = set () + for u in urls: + if u.depth <= self.maxdepth: + newurls.add (u) + return newurls def __repr__ (self): return f'<DepthLimit {self.maxdepth}>' @@ -280,7 +296,7 @@ class PrefixLimit (RecursionPolicy): self.prefix = prefix def __call__ (self, urls): - return set (filter (lambda u: str(u).startswith (str (self.prefix)), urls)) + return set (filter (lambda u: str(u.value).startswith (str (self.prefix)), urls)) class RecursiveController: """ @@ -310,13 +326,17 @@ class RecursiveController: # keep in sync
with StatsHandler self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0, 'ignored': 0} - async def fetch (self, url): + async def fetch (self, entry): """ Fetch a single URL using an external command command is usually crocoite-grab """ + assert isinstance (entry, SetEntry) + + url = entry.value + depth = entry.depth logger = self.logger.bind (url=url) def formatCommand (e): @@ -356,7 +376,7 @@ class RecursiveController: data = json.loads (data) uuid = data.get ('uuid') if uuid == '8ee5e9c9-1130-4c5c-88ff-718508546e0c': - links = set (self.policy (map (lambda x: URL(x).with_fragment(None), data.get ('links', [])))) + links = set (self.policy (map (lambda x: SetEntry (URL(x).with_fragment(None), depth=depth+1), data.get ('links', [])))) links.difference_update (self.have) self.pending.update (links) elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff': @@ -387,7 +407,7 @@ class RecursiveController: try: self.have = set () - self.pending = set ([self.url]) + self.pending = set ([SetEntry (self.url, depth=0)]) while self.pending: # since pending is a set this picks a random item, which is fine diff --git a/crocoite/test_controller.py b/crocoite/test_controller.py index c88b80d..6f92e23 100644 --- a/crocoite/test_controller.py +++ b/crocoite/test_controller.py @@ -26,7 +26,7 @@ from aiohttp import web import pytest from .logger import Logger -from .controller import ControllerSettings, SinglePageController +from .controller import ControllerSettings, SinglePageController, SetEntry from .devtools import Process from .test_browser import loader @@ -104,3 +104,16 @@ window.setInterval (function () { fetch('/').then (function (e) { console.log (e finally: await runner.cleanup () +def test_set_entry (): + a = SetEntry (1, a=2, b=3) + assert a == a + assert hash (a) == hash (a) + + b = SetEntry (1, a=2, b=4) + assert a == b + assert hash (a) == hash (b) + + c = SetEntry (2, a=2, b=3) + assert a != c + assert hash (a) != hash (c) + -- cgit v1.2.3