summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2019-05-12 15:37:48 +0300
committer: Lars-Dominik Braun <lars@6xq.net> 2019-05-12 15:37:48 +0300
commit: ba5dbfd061d328a2140f0a7541ef0fdb6acf5903 (patch)
tree: d092aa0e9b401404b6463db22741511165ca1575
parent: 0299acfb6edf7d54ed112834a2b639567f782ab4 (diff)
download: crocoite-ba5dbfd061d328a2140f0a7541ef0fdb6acf5903.tar.gz
download: crocoite-ba5dbfd061d328a2140f0a7541ef0fdb6acf5903.tar.bz2
download: crocoite-ba5dbfd061d328a2140f0a7541ef0fdb6acf5903.zip
behavior: Ignore invalid URLs when extracting links
Fixes #18.
-rw-r--r-- crocoite/behavior.py 9
-rw-r--r-- crocoite/test_behavior.py 11
2 files changed, 18 insertions, 2 deletions
diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index dca9ea0..d079603 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -328,6 +328,13 @@ class ExtractLinksEvent:
def __repr__ (self):
return f'<ExtractLinksEvent {self.links!r}>'
+def mapOrIgnore (f, l):
+ for e in l:
+ try:
+ yield f (e)
+ except:
+ pass
+
class ExtractLinks (Behavior):
"""
Extract links from a page using JavaScript
@@ -348,7 +355,7 @@ class ExtractLinks (Behavior):
tab = self.loader.tab
yield self.script
result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
- yield ExtractLinksEvent (list (set (map (URL, result['result']['value']))))
+ yield ExtractLinksEvent (list (set (mapOrIgnore (URL, result['result']['value']))))
class Crash (Behavior):
""" Crash the browser. For testing only. Obviously. """
diff --git a/crocoite/test_behavior.py b/crocoite/test_behavior.py
index 7a723c6..9a13c65 100644
--- a/crocoite/test_behavior.py
+++ b/crocoite/test_behavior.py
@@ -30,7 +30,7 @@ import pkg_resources
from .logger import Logger
from .devtools import Process
from .behavior import Scroll, Behavior, ExtractLinks, ExtractLinksEvent, Crash, \
- Screenshot, ScreenshotEvent, DomSnapshot, DomSnapshotEvent
+ Screenshot, ScreenshotEvent, DomSnapshot, DomSnapshotEvent, mapOrIgnore
from .controller import SinglePageController, EventHandler
from .devtools import Crashed
@@ -139,6 +139,7 @@ async def test_extract_links ():
<a href="http://example.com/absolute/">foo</a>
<a href="https://example.com/absolute/secure">foo</a>
<a href="#anchor">foo</a>
+ <a href="http://neue_preise_f%c3%bcr_zahnimplantate_k%c3%b6nnten_sie_%c3%bcberraschen">foo</a>
<a href="/hidden/visibility" style="visibility: hidden">foo</a>
<a href="/hidden/display" style="display: none">foo</a>
@@ -252,3 +253,11 @@ async def test_dom_snapshot ():
finally:
await runner.cleanup ()
+def test_mapOrIgnore ():
+ def fail (x):
+ if x < 50:
+ raise Exception ()
+ return x+1
+
+ assert list (mapOrIgnore (fail, range (100))) == list (range (51, 101))
+