summary refs log tree commit diff
path: root/crocoite/controller.py
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2018-05-04 13:00:05 +0200
committer Lars-Dominik Braun <lars@6xq.net> 2018-05-04 16:00:05 +0200
commit 824c6e91ae6fee1318e79c3ce1a43f98bc697c7b (patch)
tree c8662f34d26247202f0d8eebc4f37224f16f18ec /crocoite/controller.py
parent 48cabe3b0c7e7a760de4d40af55717c024fdc3bf (diff)
download crocoite-824c6e91ae6fee1318e79c3ce1a43f98bc697c7b.tar.gz
crocoite-824c6e91ae6fee1318e79c3ce1a43f98bc697c7b.tar.bz2
crocoite-824c6e91ae6fee1318e79c3ce1a43f98bc697c7b.zip
Add distributed recursive crawls
Diffstat (limited to 'crocoite/controller.py')
-rw-r--r-- crocoite/controller.py 22
1 files changed, 17 insertions, 5 deletions
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 638cb6c..113c139 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -29,6 +29,10 @@ class ControllerSettings:
self.idleTimeout = idleTimeout
self.timeout = timeout
+ def toDict (self):
+ return dict (logBuffer=self.logBuffer, maxBodySize=self.maxBodySize,
+ idleTimeout=self.idleTimeout, timeout=self.timeout)
+
defaultSettings = ControllerSettings ()
import logging
@@ -186,6 +190,18 @@ class RecursiveController:
self.logger = logger
self.recursionPolicy = recursionPolicy
+ def fetch (self, urls):
+ """
+ Overrideable fetch action for URLs. Defaults to sequential
+ SinglePageController.
+ """
+ result = []
+ for u in urls:
+ c = SinglePageController (u, self.output, self.service,
+ self.behavior, self.logger, self.settings)
+ result.append (c.run ())
+ return result
+
def run (self):
have = set ()
urls = set ([self.url])
@@ -193,11 +209,7 @@ class RecursiveController:
while urls:
self.logger.info ('retrieving {} urls'.format (len (urls)))
- result = []
- for u in urls:
- c = SinglePageController (u, self.output, self.service,
- self.behavior, self.logger, self.settings)
- result.append (c.run ())
+ result = self.fetch (urls)
have.update (urls)
urls = set ()