summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net>, 2018-09-25 16:49:00 +0200
committer: Lars-Dominik Braun <lars@6xq.net>, 2018-09-25 17:01:26 +0200
commit: e06aa345dbf60d00874f2af4cdd1aee196c25fe8 (patch)
tree: e871415f686c78e54e8e5a280b3ccfb15821f1d4
parent: dcf48e1eb4f61fde83e525ddfe0850efbf1d79bd (diff)
download: crocoite-e06aa345dbf60d00874f2af4cdd1aee196c25fe8.tar.gz
crocoite-e06aa345dbf60d00874f2af4cdd1aee196c25fe8.tar.bz2
crocoite-e06aa345dbf60d00874f2af4cdd1aee196c25fe8.zip
Prevent recursing into arbitrary schemes
HTTP(S) only.
-rw-r--r-- crocoite/controller.py 10
1 file changed, 9 insertions, 1 deletion
diff --git a/crocoite/controller.py b/crocoite/controller.py
index fffd024..9c153b8 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -292,6 +292,8 @@ class RecursiveController:
__slots__ = ('url', 'output', 'command', 'logger', 'policy', 'have',
'pending', 'stats', 'prefix', 'tempdir', 'running', 'concurrency')
+ SCHEME_WHITELIST = {'http', 'https'}
+
def __init__ (self, url, output, command, logger, prefix='{host}-{date}-',
tempdir=None, policy=DepthLimit (0), concurrency=1):
self.url = url
@@ -306,7 +308,7 @@ class RecursiveController:
# max number of tasks running
self.concurrency = concurrency
# keep in sync with StatsHandler
- self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0}
+ self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0, 'ignored': 0}
async def fetch (self, url):
"""
@@ -320,6 +322,12 @@ class RecursiveController:
def formatPrefix (p):
return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ())
+ if urlparse (url).scheme not in self.SCHEME_WHITELIST:
+ self.stats['ignored'] += 1
+ self.logger.warning ('scheme not whitelisted', url=url,
+ uuid='57e838de-4494-4316-ae98-cd3a2ebf541b')
+ return
+
dest = tempfile.NamedTemporaryFile (dir=self.tempdir,
prefix=formatPrefix (self.prefix), suffix='.warc.gz',
delete=False)