Ignore duplicate URLs when saving DOM snapshot

author: Lars-Dominik Braun <lars@6xq.net> 2017-11-25 14:35:05 +0100
committer: Lars-Dominik Braun <lars@6xq.net> 2017-11-25 14:35:05 +0100
commit: ee736e28ed4c5cdae395f5851eb2bad96d8078ef (patch)
tree: 40ba2e782c7b8d454098c7295644896e4ea3b35c
parent: 85d51603acdc19d6cafe47307f279eb6c4df1a03 (diff)
download: crocoite-ee736e28ed4c5cdae395f5851eb2bad96d8078ef.tar.gz crocoite-ee736e28ed4c5cdae395f5851eb2bad96d8078ef.tar.bz2 crocoite-ee736e28ed4c5cdae395f5851eb2bad96d8078ef.zip
1 file changed, 10 insertions, 1 deletion
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 7a29cc7..a2ac958 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -352,9 +352,18 @@ def main ():
         """
         viewport = getFormattedViewportMetrics (tab)
         dom = tab.DOM.getDocument (depth=-1, pierce=True)
+        haveUrls = set ()
         for doc in ChromeTreeWalker (dom['root']).split ():
-            url = urlsplit (doc['documentURL'])
+            rawUrl = doc['documentURL']
+            if rawUrl in haveUrls:
+                # ignore duplicate URLs. they are usually caused by
+                # javascript-injected iframes (advertising) with no(?) src
+                logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
+                continue
+            url = urlsplit (rawUrl)
             if url.scheme in ('http', 'https'):
+                logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
+                haveUrls.add (rawUrl)
                 walker = ChromeTreeWalker (doc)
                 # remove script, to make the page static and noscript, because at the
                 # time we took the snapshot scripts were enabled
author	Lars-Dominik Braun <lars@6xq.net>	2017-11-25 14:35:05 +0100
committer	Lars-Dominik Braun <lars@6xq.net>	2017-11-25 14:35:05 +0100
commit	ee736e28ed4c5cdae395f5851eb2bad96d8078ef (patch)
tree	40ba2e782c7b8d454098c7295644896e4ea3b35c
parent	85d51603acdc19d6cafe47307f279eb6c4df1a03 (diff)
download	crocoite-ee736e28ed4c5cdae395f5851eb2bad96d8078ef.tar.gz crocoite-ee736e28ed4c5cdae395f5851eb2bad96d8078ef.tar.bz2 crocoite-ee736e28ed4c5cdae395f5851eb2bad96d8078ef.zip