diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2017-11-24 19:54:23 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2017-11-24 19:54:23 +0100 |
commit | c1fda1e1899d12d6a582b07a6a69b4e2743867ac (patch) | |
tree | 0474cce78384efa0d55266ad810c30bf732ffe3a /crocoite | |
parent | 5cb096ace6582bba06c9bc625c750d5a83dd18ea (diff) | |
download | crocoite-c1fda1e1899d12d6a582b07a6a69b4e2743867ac.tar.gz crocoite-c1fda1e1899d12d6a582b07a6a69b4e2743867ac.tar.bz2 crocoite-c1fda1e1899d12d6a582b07a6a69b4e2743867ac.zip |
DOM snapshot: Save frames/subdocuments as well
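Request the DOM with DOM.getDocument(pierce=True) so that frames/subdocuments
are included, split the result into independent documents and save each
http(s) document as its own snapshot record. Playback with pywb works because
the timestamps of the snapshots are close to each other.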
Diffstat (limited to 'crocoite')
-rw-r--r-- | crocoite/cli.py | 49 |
1 files changed, 36 insertions, 13 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py index 5db4329..60d6661 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -101,6 +101,8 @@ class ChromeTreeWalker (TreeWalker): elif name == '#document': for child in node.get ('children', []): yield from self.recurse (child) + else: + assert False, name else: default_namespace = constants.namespaces["html"] attributes = node.get ('attributes', []) @@ -113,10 +115,27 @@ class ChromeTreeWalker (TreeWalker): yield self.endTag ('', name) def __iter__ (self): - from pprint import pprint assert self.tree['nodeName'] == '#document' return self.recurse (self.tree) + def split (self): + """ + Split response returned by DOM.getDocument(pierce=True) into independent documents + """ + def recurse (node): + contentDocument = node.get ('contentDocument') + if contentDocument: + assert contentDocument['nodeName'] == '#document' + yield contentDocument + yield from recurse (contentDocument) + + for child in node.get ('children', []): + yield from recurse (child) + + if self.tree['nodeName'] == '#document': + yield self.tree + yield from recurse (self.tree) + class StripTagFilter (Filter): """ Remove arbitrary tags @@ -308,18 +327,22 @@ def main (): can’t handle that though. 
""" viewport = getFormattedViewportMetrics (tab) - dom = tab.DOM.getDocument (depth=-1) - # remove script, to make the page static and noscript, because at the - # time we took the snapshot scripts were enabled - stream = StripTagFilter (ChromeTreeWalker (dom['root']), ['script', 'noscript']) - serializer = HTMLSerializer () - httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') - record = writer.create_warc_record (dom['root']['documentURL'], 'response', - payload=BytesIO (serializer.render (stream, 'utf8')), - http_headers=httpHeaders, - warc_headers_dict={'X-DOM-Snapshot': str (True), - 'X-Chrome-Viewport': viewport}) - writer.write_record (record) + dom = tab.DOM.getDocument (depth=-1, pierce=True) + for doc in ChromeTreeWalker (dom['root']).split (): + url = urlsplit (doc['documentURL']) + if url.scheme in ('http', 'https'): + walker = ChromeTreeWalker (doc) + # remove script, to make the page static and noscript, because at the + # time we took the snapshot scripts were enabled + stream = StripTagFilter (walker, ['script', 'noscript']) + serializer = HTMLSerializer () + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record (doc['documentURL'], 'response', + payload=BytesIO (serializer.render (stream, 'utf8')), + http_headers=httpHeaders, + warc_headers_dict={'X-DOM-Snapshot': str (True), + 'X-Chrome-Viewport': viewport}) + writer.write_record (record) def emulateScreenMetrics (tab): """ |