From c1fda1e1899d12d6a582b07a6a69b4e2743867ac Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Fri, 24 Nov 2017 19:54:23 +0100 Subject: DOM snapshot: Save frames/subdocuments as well Request all subdocuments with pierce=True, split the result and save each document. Playback with pywb works, because timestamps of the snapshots are close to each other. --- crocoite/cli.py | 49 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 13 deletions(-) (limited to 'crocoite/cli.py') diff --git a/crocoite/cli.py b/crocoite/cli.py index 5db4329..60d6661 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -101,6 +101,8 @@ class ChromeTreeWalker (TreeWalker): elif name == '#document': for child in node.get ('children', []): yield from self.recurse (child) + else: + assert False, name else: default_namespace = constants.namespaces["html"] attributes = node.get ('attributes', []) @@ -113,10 +115,27 @@ class ChromeTreeWalker (TreeWalker): yield self.endTag ('', name) def __iter__ (self): - from pprint import pprint assert self.tree['nodeName'] == '#document' return self.recurse (self.tree) + def split (self): + """ + Split response returned by DOM.getDocument(pierce=True) into independent documents + """ + def recurse (node): + contentDocument = node.get ('contentDocument') + if contentDocument: + assert contentDocument['nodeName'] == '#document' + yield contentDocument + yield from recurse (contentDocument) + + for child in node.get ('children', []): + yield from recurse (child) + + if self.tree['nodeName'] == '#document': + yield self.tree + yield from recurse (self.tree) + class StripTagFilter (Filter): """ Remove arbitrary tags @@ -308,18 +327,22 @@ def main (): can’t handle that though. """ viewport = getFormattedViewportMetrics (tab) - dom = tab.DOM.getDocument (depth=-1) - # remove script, to make the page static and noscript, because at the - # time we took the snapshot scripts were enabled - stream = StripTagFilter (ChromeTreeWalker (dom['root']), ['script', 'noscript']) - serializer = HTMLSerializer () - httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') - record = writer.create_warc_record (dom['root']['documentURL'], 'response', - payload=BytesIO (serializer.render (stream, 'utf8')), - http_headers=httpHeaders, - warc_headers_dict={'X-DOM-Snapshot': str (True), - 'X-Chrome-Viewport': viewport}) - writer.write_record (record) + dom = tab.DOM.getDocument (depth=-1, pierce=True) + for doc in ChromeTreeWalker (dom['root']).split (): + url = urlsplit (doc['documentURL']) + if url.scheme in ('http', 'https'): + walker = ChromeTreeWalker (doc) + # remove script, to make the page static and noscript, because at the + # time we took the snapshot scripts were enabled + stream = StripTagFilter (walker, ['script', 'noscript']) + serializer = HTMLSerializer () + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record (doc['documentURL'], 'response', + payload=BytesIO (serializer.render (stream, 'utf8')), + http_headers=httpHeaders, + warc_headers_dict={'X-DOM-Snapshot': str (True), + 'X-Chrome-Viewport': viewport}) + writer.write_record (record) def emulateScreenMetrics (tab): """ -- cgit v1.2.3