diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2017-11-25 14:35:05 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2017-11-25 14:35:05 +0100 |
commit | ee736e28ed4c5cdae395f5851eb2bad96d8078ef (patch) | |
tree | 40ba2e782c7b8d454098c7295644896e4ea3b35c /crocoite | |
parent | 85d51603acdc19d6cafe47307f279eb6c4df1a03 (diff) | |
download | crocoite-ee736e28ed4c5cdae395f5851eb2bad96d8078ef.tar.gz crocoite-ee736e28ed4c5cdae395f5851eb2bad96d8078ef.tar.bz2 crocoite-ee736e28ed4c5cdae395f5851eb2bad96d8078ef.zip |
Ignore duplicate URLs when saving DOM snapshot
Diffstat (limited to 'crocoite')
-rw-r--r-- | crocoite/cli.py | 11 |
1 file changed, 10 insertions, 1 deletion
```diff
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 7a29cc7..a2ac958 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -352,9 +352,18 @@ def main ():
         """
         viewport = getFormattedViewportMetrics (tab)
         dom = tab.DOM.getDocument (depth=-1, pierce=True)
+        haveUrls = set ()
         for doc in ChromeTreeWalker (dom['root']).split ():
-            url = urlsplit (doc['documentURL'])
+            rawUrl = doc['documentURL']
+            if rawUrl in haveUrls:
+                # ignore duplicate URLs. they are usually caused by
+                # javascript-injected iframes (advertising) with no(?) src
+                logger.warning ('have DOM snapshot for URL {}, ignoring'.format (rawUrl))
+                continue
+            url = urlsplit (rawUrl)
             if url.scheme in ('http', 'https'):
+                logger.debug ('saving DOM snapshot for url {}, base {}'.format (doc['documentURL'], doc['baseURL']))
+                haveUrls.add (rawUrl)
                 walker = ChromeTreeWalker (doc)
                 # remove script, to make the page static and noscript, because at the
                 # time we took the snapshot scripts were enabled
```