From 824c6e91ae6fee1318e79c3ce1a43f98bc697c7b Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Fri, 4 May 2018 13:00:05 +0200
Subject: Add distributed recursive crawls

---
 crocoite/controller.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'crocoite/controller.py')

diff --git a/crocoite/controller.py b/crocoite/controller.py
index 638cb6c..113c139 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -29,6 +29,10 @@ class ControllerSettings:
         self.idleTimeout = idleTimeout
         self.timeout = timeout
 
+    def toDict (self):
+        return dict (logBuffer=self.logBuffer, maxBodySize=self.maxBodySize,
+                idleTimeout=self.idleTimeout, timeout=self.timeout)
+
 defaultSettings = ControllerSettings ()
 
 import logging
@@ -186,6 +190,18 @@ class RecursiveController:
         self.logger = logger
         self.recursionPolicy = recursionPolicy
 
+    def fetch (self, urls):
+        """
+        Overrideable fetch action for URLs. Defaults to sequential
+        SinglePageController.
+        """
+        result = []
+        for u in urls:
+            c = SinglePageController (u, self.output, self.service,
+                    self.behavior, self.logger, self.settings)
+            result.append (c.run ())
+        return result
+
     def run (self):
         have = set ()
         urls = set ([self.url])
@@ -193,11 +209,7 @@ class RecursiveController:
 
         while urls:
             self.logger.info ('retrieving {} urls'.format (len (urls)))
-            result = []
-            for u in urls:
-                c = SinglePageController (u, self.output, self.service,
-                        self.behavior, self.logger, self.settings)
-                result.append (c.run ())
+            result = self.fetch (urls)
 
             have.update (urls)
             urls = set ()
-- 
cgit v1.2.3