From c1fda1e1899d12d6a582b07a6a69b4e2743867ac Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun <lars@6xq.net>
Date: Fri, 24 Nov 2017 19:54:23 +0100
Subject: DOM snapshot: Save frames/subdocuments as well

Request the DOM with pierce=True so that frames/subdocuments are included,
split the result and save each http(s) document as a separate record. Playback
with pywb works because the timestamps of the snapshots are close to each other.
---
 crocoite/cli.py | 49 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 13 deletions(-)

(limited to 'crocoite/cli.py')

diff --git a/crocoite/cli.py b/crocoite/cli.py
index 5db4329..60d6661 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -101,6 +101,8 @@ class ChromeTreeWalker (TreeWalker):
             elif name == '#document':
                 for child in node.get ('children', []):
                     yield from self.recurse (child)
+            else:
+                assert False, name
         else:
             default_namespace = constants.namespaces["html"]
             attributes = node.get ('attributes', [])
@@ -113,10 +115,27 @@ class ChromeTreeWalker (TreeWalker):
             yield self.endTag ('', name)
 
     def __iter__ (self):
-        from pprint import pprint
         assert self.tree['nodeName'] == '#document'
         return self.recurse (self.tree)
 
+    def split (self):
+        """
+        Yield the root document, then every nested contentDocument of a DOM.getDocument(pierce=True) response, walking the tree depth-first
+        """
+        def recurse (node):
+            contentDocument = node.get ('contentDocument')
+            if contentDocument:
+                assert contentDocument['nodeName'] == '#document'
+                yield contentDocument
+                yield from recurse (contentDocument)  # a subdocument may itself contain further subdocuments
+
+            for child in node.get ('children', []):
+                yield from recurse (child)
+
+        if self.tree['nodeName'] == '#document':
+            yield self.tree  # the top-level document is yielded before any subdocument
+        yield from recurse (self.tree)
+
 class StripTagFilter (Filter):
     """
     Remove arbitrary tags
@@ -308,18 +327,22 @@ def main ():
         can’t handle that though.
         """
         viewport = getFormattedViewportMetrics (tab)
-        dom = tab.DOM.getDocument (depth=-1)
-        # remove script, to make the page static and noscript, because at the
-        # time we took the snapshot scripts were enabled
-        stream = StripTagFilter (ChromeTreeWalker (dom['root']), ['script', 'noscript'])
-        serializer = HTMLSerializer ()
-        httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
-        record = writer.create_warc_record (dom['root']['documentURL'], 'response',
-                payload=BytesIO (serializer.render (stream, 'utf8')),
-                http_headers=httpHeaders,
-                warc_headers_dict={'X-DOM-Snapshot': str (True),
-                        'X-Chrome-Viewport': viewport})
-        writer.write_record (record)
+        dom = tab.DOM.getDocument (depth=-1, pierce=True)
+        for doc in ChromeTreeWalker (dom['root']).split ():
+            url = urlsplit (doc['documentURL'])
+            if url.scheme in ('http', 'https'):
+                walker = ChromeTreeWalker (doc)
+                # remove script, to make the page static and noscript, because at the
+                # time we took the snapshot scripts were enabled
+                stream = StripTagFilter (walker, ['script', 'noscript'])
+                serializer = HTMLSerializer ()
+                httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
+                record = writer.create_warc_record (doc['documentURL'], 'response',
+                        payload=BytesIO (serializer.render (stream, 'utf8')),
+                        http_headers=httpHeaders,
+                        warc_headers_dict={'X-DOM-Snapshot': str (True),
+                                'X-Chrome-Viewport': viewport})
+                writer.write_record (record)
 
     def emulateScreenMetrics (tab):
         """
-- 
cgit v1.2.3