From ba5dbfd061d328a2140f0a7541ef0fdb6acf5903 Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun <lars@6xq.net>
Date: Sun, 12 May 2019 15:37:48 +0300
Subject: behavior: Ignore invalid URLs when extracting links

Fixes #18.
---
 crocoite/behavior.py      |  9 ++++++++-
 crocoite/test_behavior.py | 11 ++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/crocoite/behavior.py b/crocoite/behavior.py
index dca9ea0..d079603 100644
--- a/crocoite/behavior.py
+++ b/crocoite/behavior.py
@@ -328,6 +328,13 @@ class ExtractLinksEvent:
     def __repr__ (self):
         return f'<ExtractLinksEvent {self.links!r}>'
 
+def mapOrIgnore (f, l): # lazily yield f (e) for each e in l, dropping items for which f raises
+    for e in l:
+        try:
+            yield f (e)
+        except Exception: # not bare: GeneratorExit/KeyboardInterrupt must propagate, else close() fails
+            pass
+
 class ExtractLinks (Behavior):
     """
     Extract links from a page using JavaScript
@@ -348,7 +355,7 @@ class ExtractLinks (Behavior):
         tab = self.loader.tab
         yield self.script
         result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
-        yield ExtractLinksEvent (list (set (map (URL, result['result']['value']))))
+        yield ExtractLinksEvent (list (set (mapOrIgnore (URL, result['result']['value']))))
 
 class Crash (Behavior):
     """ Crash the browser. For testing only. Obviously. """
diff --git a/crocoite/test_behavior.py b/crocoite/test_behavior.py
index 7a723c6..9a13c65 100644
--- a/crocoite/test_behavior.py
+++ b/crocoite/test_behavior.py
@@ -30,7 +30,7 @@ import pkg_resources
 from .logger import Logger
 from .devtools import Process
 from .behavior import Scroll, Behavior, ExtractLinks, ExtractLinksEvent, Crash, \
-        Screenshot, ScreenshotEvent, DomSnapshot, DomSnapshotEvent
+        Screenshot, ScreenshotEvent, DomSnapshot, DomSnapshotEvent, mapOrIgnore
 from .controller import SinglePageController, EventHandler
 from .devtools import Crashed
 
@@ -139,6 +139,7 @@ async def test_extract_links ():
                 <a href="http://example.com/absolute/">foo</a>
                 <a href="https://example.com/absolute/secure">foo</a>
                 <a href="#anchor">foo</a>
+                <a href="http://neue_preise_f%c3%bcr_zahnimplantate_k%c3%b6nnten_sie_%c3%bcberraschen">foo</a>
 
                 <a href="/hidden/visibility" style="visibility: hidden">foo</a>
                 <a href="/hidden/display" style="display: none">foo</a>
@@ -252,3 +253,11 @@ async def test_dom_snapshot ():
     finally:
         await runner.cleanup ()
 
+def test_mapOrIgnore ():
+    """ Items whose mapping raises are skipped; the remaining results keep their order """
+    def rejectSmall (n):
+        if n >= 50:
+            return n+1
+        raise Exception ()
+    assert list (mapOrIgnore (rejectSmall, range (100))) == [i+1 for i in range (50, 100)]
+
-- 
cgit v1.2.3