diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-09-25 16:49:00 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-09-25 17:01:26 +0200 |
commit | e06aa345dbf60d00874f2af4cdd1aee196c25fe8 (patch) | |
tree | e871415f686c78e54e8e5a280b3ccfb15821f1d4 | |
parent | dcf48e1eb4f61fde83e525ddfe0850efbf1d79bd (diff) | |
download | crocoite-e06aa345dbf60d00874f2af4cdd1aee196c25fe8.tar.gz crocoite-e06aa345dbf60d00874f2af4cdd1aee196c25fe8.tar.bz2 crocoite-e06aa345dbf60d00874f2af4cdd1aee196c25fe8.zip |
Prevent recursing into arbitrary schemes
HTTP(S) only.
-rw-r--r-- | crocoite/controller.py | 10 |
1 file changed, 9 insertions, 1 deletion
```diff
diff --git a/crocoite/controller.py b/crocoite/controller.py
index fffd024..9c153b8 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -292,6 +292,8 @@ class RecursiveController:
     __slots__ = ('url', 'output', 'command', 'logger', 'policy', 'have',
             'pending', 'stats', 'prefix', 'tempdir', 'running', 'concurrency')
 
+    SCHEME_WHITELIST = {'http', 'https'}
+
     def __init__ (self, url, output, command, logger, prefix='{host}-{date}-',
             tempdir=None, policy=DepthLimit (0), concurrency=1):
         self.url = url
@@ -306,7 +308,7 @@ class RecursiveController:
         # max number of tasks running
         self.concurrency = concurrency
         # keep in sync with StatsHandler
-        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0}
+        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0, 'ignored': 0}
 
     async def fetch (self, url):
         """
@@ -320,6 +322,12 @@ class RecursiveController:
         def formatPrefix (p):
             return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ())
 
+        if urlparse (url).scheme not in self.SCHEME_WHITELIST:
+            self.stats['ignored'] += 1
+            self.logger.warning ('scheme not whitelisted', url=url,
+                    uuid='57e838de-4494-4316-ae98-cd3a2ebf541b')
+            return
+
         dest = tempfile.NamedTemporaryFile (dir=self.tempdir,
                 prefix=formatPrefix (self.prefix), suffix='.warc.gz',
                 delete=False)
```