summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2019-05-30 14:51:11 +0300
committer Lars-Dominik Braun <lars@6xq.net> 2019-05-30 14:52:48 +0300
commit 6067fa02b8252c3e084a9d9b0cd122ec217e5ee7 (patch)
tree e84ab7bd96beb46e9f2b8fc97838d7ef6143f30a
parent 79d9adf23e9993ae36ad9a89108ab79eec32882a (diff)
download crocoite-6067fa02b8252c3e084a9d9b0cd122ec217e5ee7.tar.gz
crocoite-6067fa02b8252c3e084a9d9b0cd122ec217e5ee7.tar.bz2
crocoite-6067fa02b8252c3e084a9d9b0cd122ec217e5ee7.zip
controller: Fix DepthLimit
The policy itself must be stateless, since there can be multiple ExtractLinks events (which would cause DepthLimit to reduce its depth every time).
-rw-r--r-- crocoite/controller.py 42
-rw-r--r-- crocoite/test_controller.py 15
2 files changed, 45 insertions, 12 deletions
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 432d434..61153ca 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -233,6 +233,24 @@ class SinglePageController:
else:
handle.cancel ()
+class SetEntry:
+ """ A object, to be used with sets, that compares equality only on its
+ primary property. """
+ def __init__ (self, value, **props):
+ self.value = value
+ for k, v in props.items ():
+ setattr (self, k, v)
+
+ def __eq__ (self, b):
+ assert isinstance (b, SetEntry)
+ return self.value == b.value
+
+ def __hash__ (self):
+ return hash (self.value)
+
+ def __repr__ (self):
+ return f'<SetEntry {self.value!r}>'
+
class RecursionPolicy:
""" Abstract recursion policy """
@@ -251,16 +269,14 @@ class DepthLimit (RecursionPolicy):
__slots__ = ('maxdepth', )
def __init__ (self, maxdepth=0):
- if maxdepth < 0 or maxdepth > 1:
- raise ValueError ('Unsupported')
self.maxdepth = maxdepth
def __call__ (self, urls):
- if self.maxdepth <= 0:
- return {}
- else:
- self.maxdepth -= 1
- return urls
+ newurls = set ()
+ for u in urls:
+ if u.depth <= self.maxdepth:
+ newurls.add (u)
+ return newurls
def __repr__ (self):
return f'<DepthLimit {self.maxdepth}>'
@@ -280,7 +296,7 @@ class PrefixLimit (RecursionPolicy):
self.prefix = prefix
def __call__ (self, urls):
- return set (filter (lambda u: str(u).startswith (str (self.prefix)), urls))
+ return set (filter (lambda u: str(u.value).startswith (str (self.prefix)), urls))
class RecursiveController:
"""
@@ -310,13 +326,17 @@ class RecursiveController:
# keep in sync with StatsHandler
self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0, 'ignored': 0}
- async def fetch (self, url):
+ async def fetch (self, entry):
"""
Fetch a single URL using an external command
command is usually crocoite-grab
"""
+ assert isinstance (entry, SetEntry)
+
+ url = entry.value
+ depth = entry.depth
logger = self.logger.bind (url=url)
def formatCommand (e):
@@ -356,7 +376,7 @@ class RecursiveController:
data = json.loads (data)
uuid = data.get ('uuid')
if uuid == '8ee5e9c9-1130-4c5c-88ff-718508546e0c':
- links = set (self.policy (map (lambda x: URL(x).with_fragment(None), data.get ('links', []))))
+ links = set (self.policy (map (lambda x: SetEntry (URL(x).with_fragment(None), depth=depth+1), data.get ('links', []))))
links.difference_update (self.have)
self.pending.update (links)
elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':
@@ -387,7 +407,7 @@ class RecursiveController:
try:
self.have = set ()
- self.pending = set ([self.url])
+ self.pending = set ([SetEntry (self.url, depth=0)])
while self.pending:
# since pending is a set this picks a random item, which is fine
diff --git a/crocoite/test_controller.py b/crocoite/test_controller.py
index c88b80d..6f92e23 100644
--- a/crocoite/test_controller.py
+++ b/crocoite/test_controller.py
@@ -26,7 +26,7 @@ from aiohttp import web
import pytest
from .logger import Logger
-from .controller import ControllerSettings, SinglePageController
+from .controller import ControllerSettings, SinglePageController, SetEntry
from .devtools import Process
from .test_browser import loader
@@ -104,3 +104,16 @@ window.setInterval (function () { fetch('/').then (function (e) { console.log (e
finally:
await runner.cleanup ()
+def test_set_entry ():
+ a = SetEntry (1, a=2, b=3)
+ assert a == a
+ assert hash (a) == hash (a)
+
+ b = SetEntry (1, a=2, b=4)
+ assert a == b
+ assert hash (a) == hash (b)
+
+ c = SetEntry (2, a=2, b=3)
+ assert a != c
+ assert hash (a) != hash (c)
+