summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2017-11-24 19:54:23 +0100
committer: Lars-Dominik Braun <lars@6xq.net> 2017-11-24 19:54:23 +0100
commit: c1fda1e1899d12d6a582b07a6a69b4e2743867ac (patch)
tree: 0474cce78384efa0d55266ad810c30bf732ffe3a
parent: 5cb096ace6582bba06c9bc625c750d5a83dd18ea (diff)
download: crocoite-c1fda1e1899d12d6a582b07a6a69b4e2743867ac.tar.gz
crocoite-c1fda1e1899d12d6a582b07a6a69b4e2743867ac.tar.bz2
crocoite-c1fda1e1899d12d6a582b07a6a69b4e2743867ac.zip
DOM snapshot: Save frames/subdocuments as well
Request all subdocuments with pierce=True, split the result and save each document. Playback with pywb works, because timestamps of the snapshots are close to each other.
-rw-r--r-- crocoite/cli.py 49
1 file changed, 36 insertions, 13 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 5db4329..60d6661 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -101,6 +101,8 @@ class ChromeTreeWalker (TreeWalker):
elif name == '#document':
for child in node.get ('children', []):
yield from self.recurse (child)
+ else:
+ assert False, name
else:
default_namespace = constants.namespaces["html"]
attributes = node.get ('attributes', [])
@@ -113,10 +115,27 @@ class ChromeTreeWalker (TreeWalker):
yield self.endTag ('', name)
def __iter__ (self):
- from pprint import pprint
assert self.tree['nodeName'] == '#document'
return self.recurse (self.tree)
+ def split (self):
+ """
+ Split response returned by DOM.getDocument(pierce=True) into independent documents
+ """
+ def recurse (node):
+ contentDocument = node.get ('contentDocument')
+ if contentDocument:
+ assert contentDocument['nodeName'] == '#document'
+ yield contentDocument
+ yield from recurse (contentDocument)
+
+ for child in node.get ('children', []):
+ yield from recurse (child)
+
+ if self.tree['nodeName'] == '#document':
+ yield self.tree
+ yield from recurse (self.tree)
+
class StripTagFilter (Filter):
"""
Remove arbitrary tags
@@ -308,18 +327,22 @@ def main ():
can’t handle that though.
"""
viewport = getFormattedViewportMetrics (tab)
- dom = tab.DOM.getDocument (depth=-1)
- # remove script, to make the page static and noscript, because at the
- # time we took the snapshot scripts were enabled
- stream = StripTagFilter (ChromeTreeWalker (dom['root']), ['script', 'noscript'])
- serializer = HTMLSerializer ()
- httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
- record = writer.create_warc_record (dom['root']['documentURL'], 'response',
- payload=BytesIO (serializer.render (stream, 'utf8')),
- http_headers=httpHeaders,
- warc_headers_dict={'X-DOM-Snapshot': str (True),
- 'X-Chrome-Viewport': viewport})
- writer.write_record (record)
+ dom = tab.DOM.getDocument (depth=-1, pierce=True)
+ for doc in ChromeTreeWalker (dom['root']).split ():
+ url = urlsplit (doc['documentURL'])
+ if url.scheme in ('http', 'https'):
+ walker = ChromeTreeWalker (doc)
+ # remove script, to make the page static and noscript, because at the
+ # time we took the snapshot scripts were enabled
+ stream = StripTagFilter (walker, ['script', 'noscript'])
+ serializer = HTMLSerializer ()
+ httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
+ record = writer.create_warc_record (doc['documentURL'], 'response',
+ payload=BytesIO (serializer.render (stream, 'utf8')),
+ http_headers=httpHeaders,
+ warc_headers_dict={'X-DOM-Snapshot': str (True),
+ 'X-Chrome-Viewport': viewport})
+ writer.write_record (record)
def emulateScreenMetrics (tab):
"""