diff options
Diffstat (limited to 'crocoite')
-rw-r--r-- | crocoite/controller.py | 10 |
1 files changed, 9 insertions, 1 deletions
diff --git a/crocoite/controller.py b/crocoite/controller.py index fffd024..9c153b8 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -292,6 +292,8 @@ class RecursiveController: __slots__ = ('url', 'output', 'command', 'logger', 'policy', 'have', 'pending', 'stats', 'prefix', 'tempdir', 'running', 'concurrency') + SCHEME_WHITELIST = {'http', 'https'} + def __init__ (self, url, output, command, logger, prefix='{host}-{date}-', tempdir=None, policy=DepthLimit (0), concurrency=1): self.url = url @@ -306,7 +308,7 @@ class RecursiveController: # max number of tasks running self.concurrency = concurrency # keep in sync with StatsHandler - self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0} + self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0, 'ignored': 0} async def fetch (self, url): """ @@ -320,6 +322,12 @@ class RecursiveController: def formatPrefix (p): return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ()) + if urlparse (url).scheme not in self.SCHEME_WHITELIST: + self.stats['ignored'] += 1 + self.logger.warning ('scheme not whitelisted', url=url, + uuid='57e838de-4494-4316-ae98-cd3a2ebf541b') + return + dest = tempfile.NamedTemporaryFile (dir=self.tempdir, prefix=formatPrefix (self.prefix), suffix='.warc.gz', delete=False) |