diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-12-18 12:34:25 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-12-21 20:28:51 +0100 |
commit | 5e444dd6511d97308a84ae9c86ebf14547d01f01 (patch) | |
tree | 0852c081163ff3456038fb08ad4e47d0d47a6167 /crocoite/controller.py | |
parent | e19635a75cc1ab206be12ecf2b1c9a909baa9c21 (diff) | |
download | crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.tar.gz crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.tar.bz2 crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.zip |
Parse URLs by default
Use the yarl library (already pulled in by aiohttp). No URL that is
processed should be a plain string.
Diffstat (limited to 'crocoite/controller.py')
-rw-r--r-- | crocoite/controller.py | 9 |
1 file changed, 4 insertions, 5 deletions
diff --git a/crocoite/controller.py b/crocoite/controller.py
index f8b1420..c646a61 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -26,12 +26,11 @@ import time
 import tempfile, asyncio, json, os
 from itertools import islice
 from datetime import datetime
-from urllib.parse import urlparse
 from operator import attrgetter
 
 from . import behavior as cbehavior
 from .browser import SiteLoader, Item
-from .util import getFormattedViewportMetrics, getSoftwareInfo, removeFragment
+from .util import getFormattedViewportMetrics, getSoftwareInfo
 from .behavior import ExtractLinksEvent
 
 class ControllerSettings:
@@ -316,12 +315,12 @@ class RecursiveController:
             return e.format (url=url, dest=dest.name)
 
         def formatPrefix (p):
-            return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ())
+            return p.format (host=url.host, date=datetime.utcnow ().isoformat ())
 
         def logStats ():
             logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
 
-        if urlparse (url).scheme not in self.SCHEME_WHITELIST:
+        if url.scheme not in self.SCHEME_WHITELIST:
             self.stats['ignored'] += 1
             logStats ()
             self.logger.warning ('scheme not whitelisted', url=url,
@@ -344,7 +343,7 @@ class RecursiveController:
                 data = json.loads (data)
                 uuid = data.get ('uuid')
                 if uuid == '8ee5e9c9-1130-4c5c-88ff-718508546e0c':
-                    links = set (self.policy (map (removeFragment, data.get ('links', []))))
+                    links = set (self.policy (map (lambda x: x.with_fragment(None), data.get ('links', []))))
                     links.difference_update (self.have)
                     self.pending.update (links)
                 elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':