diff options
Diffstat (limited to 'crocoite')
| -rw-r--r-- | crocoite/controller.py | 10 | 
1 files changed, 9 insertions, 1 deletions
diff --git a/crocoite/controller.py b/crocoite/controller.py
index fffd024..9c153b8 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -292,6 +292,8 @@ class RecursiveController:
     __slots__ = ('url', 'output', 'command', 'logger', 'policy', 'have',
             'pending', 'stats', 'prefix', 'tempdir', 'running', 'concurrency')
 
+    SCHEME_WHITELIST = {'http', 'https'}
+
     def __init__ (self, url, output, command, logger, prefix='{host}-{date}-',
             tempdir=None, policy=DepthLimit (0), concurrency=1):
         self.url = url
@@ -306,7 +308,7 @@ class RecursiveController:
         # max number of tasks running
         self.concurrency = concurrency
         # keep in sync with StatsHandler
-        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0}
+        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0, 'crashed': 0, 'ignored': 0}
 
     async def fetch (self, url):
         """
@@ -320,6 +322,12 @@ class RecursiveController:
         def formatPrefix (p):
             return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ())
 
+        if urlparse (url).scheme not in self.SCHEME_WHITELIST:
+            self.stats['ignored'] += 1
+            self.logger.warning ('scheme not whitelisted', url=url,
+                    uuid='57e838de-4494-4316-ae98-cd3a2ebf541b')
+            return
+
         dest = tempfile.NamedTemporaryFile (dir=self.tempdir,
                 prefix=formatPrefix (self.prefix), suffix='.warc.gz',
                 delete=False)
