summary | refs | log | tree | commit | diff
path: root/crocoite/controller.py
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net>, 2018-12-18 12:34:25 +0100
committer: Lars-Dominik Braun <lars@6xq.net>, 2018-12-21 20:28:51 +0100
commit: 5e444dd6511d97308a84ae9c86ebf14547d01f01 (patch)
tree: 0852c081163ff3456038fb08ad4e47d0d47a6167 /crocoite/controller.py
parent: e19635a75cc1ab206be12ecf2b1c9a909baa9c21 (diff)
download: crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.tar.gz
crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.tar.bz2
crocoite-5e444dd6511d97308a84ae9c86ebf14547d01f01.zip
Parse URLs by default
Use library yarl (already pulled in by aiohttp). No URL processed should be a string.
Diffstat (limited to 'crocoite/controller.py')
-rw-r--r-- crocoite/controller.py 9
1 file changed, 4 insertions, 5 deletions
diff --git a/crocoite/controller.py b/crocoite/controller.py
index f8b1420..c646a61 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -26,12 +26,11 @@ import time
import tempfile, asyncio, json, os
from itertools import islice
from datetime import datetime
-from urllib.parse import urlparse
from operator import attrgetter
from . import behavior as cbehavior
from .browser import SiteLoader, Item
-from .util import getFormattedViewportMetrics, getSoftwareInfo, removeFragment
+from .util import getFormattedViewportMetrics, getSoftwareInfo
from .behavior import ExtractLinksEvent
class ControllerSettings:
@@ -316,12 +315,12 @@ class RecursiveController:
return e.format (url=url, dest=dest.name)
def formatPrefix (p):
- return p.format (host=urlparse (url).hostname, date=datetime.utcnow ().isoformat ())
+ return p.format (host=url.host, date=datetime.utcnow ().isoformat ())
def logStats ():
logger.info ('stats', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **self.stats)
- if urlparse (url).scheme not in self.SCHEME_WHITELIST:
+ if url.scheme not in self.SCHEME_WHITELIST:
self.stats['ignored'] += 1
logStats ()
self.logger.warning ('scheme not whitelisted', url=url,
@@ -344,7 +343,7 @@ class RecursiveController:
data = json.loads (data)
uuid = data.get ('uuid')
if uuid == '8ee5e9c9-1130-4c5c-88ff-718508546e0c':
- links = set (self.policy (map (removeFragment, data.get ('links', []))))
+ links = set (self.policy (map (lambda x: x.with_fragment(None), data.get ('links', []))))
links.difference_update (self.have)
self.pending.update (links)
elif uuid == '24d92d16-770e-4088-b769-4020e127a7ff':