diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-05-04 13:00:05 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-05-04 16:00:05 +0200 |
commit | 824c6e91ae6fee1318e79c3ce1a43f98bc697c7b (patch) | |
tree | c8662f34d26247202f0d8eebc4f37224f16f18ec /crocoite/controller.py | |
parent | 48cabe3b0c7e7a760de4d40af55717c024fdc3bf (diff) | |
download | crocoite-824c6e91ae6fee1318e79c3ce1a43f98bc697c7b.tar.gz crocoite-824c6e91ae6fee1318e79c3ce1a43f98bc697c7b.tar.bz2 crocoite-824c6e91ae6fee1318e79c3ce1a43f98bc697c7b.zip |
Add distributed recursive crawls
Diffstat (limited to 'crocoite/controller.py')
-rw-r--r-- | crocoite/controller.py | 22 |
1 files changed, 17 insertions, 5 deletions
```diff
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 638cb6c..113c139 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -29,6 +29,10 @@ class ControllerSettings:
         self.idleTimeout = idleTimeout
         self.timeout = timeout
 
+    def toDict (self):
+        return dict (logBuffer=self.logBuffer, maxBodySize=self.maxBodySize,
+                idleTimeout=self.idleTimeout, timeout=self.timeout)
+
 defaultSettings = ControllerSettings ()
 
 import logging
@@ -186,6 +190,18 @@ class RecursiveController:
         self.logger = logger
         self.recursionPolicy = recursionPolicy
 
+    def fetch (self, urls):
+        """
+        Overrideable fetch action for URLs. Defaults to sequential
+        SinglePageController.
+        """
+        result = []
+        for u in urls:
+            c = SinglePageController (u, self.output, self.service,
+                    self.behavior, self.logger, self.settings)
+            result.append (c.run ())
+        return result
+
     def run (self):
         have = set ()
         urls = set ([self.url])
@@ -193,11 +209,7 @@ class RecursiveController:
 
         while urls:
             self.logger.info ('retrieving {} urls'.format (len (urls)))
-            result = []
-            for u in urls:
-                c = SinglePageController (u, self.output, self.service,
-                        self.behavior, self.logger, self.settings)
-                result.append (c.run ())
+            result = self.fetch (urls)
 
             have.update (urls)
             urls = set ()
```