author     Lars-Dominik Braun <lars@6xq.net>    2017-11-20 19:19:05 +0100
committer  Lars-Dominik Braun <lars@6xq.net>    2017-11-20 19:25:33 +0100
commit     ca01f82227a8b79f1cbc4f5e0be5434804dc3c0e
tree       dd8aafb9b1672f70985eb5dd14635eb8635dd5e3
parent     0b8a8e88a3c33c14e52241190ee6478cb2acd49d
Add page created from DOM snapshot
-rw-r--r--  README.rst       20
-rw-r--r--  crocoite/cli.py  107
-rw-r--r--  setup.py           1
3 files changed, 119 insertions(+), 9 deletions(-)
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -10,6 +10,7 @@ Dependencies
 - Python 3
 - pychrome_
 - warcio_
+- html5lib
 
 .. _pychrome: https://github.com/fate0/pychrome
 .. _warcio: https://github.com/webrecorder/warcio
@@ -34,7 +35,20 @@ Caveats
 -------
 
 - Original HTTP requests/responses are not available. They are rebuilt from
-  data available. Character encoding for text documents is changed to UTF-8.
-- Some sites request different assets based on screen resolution, some fetch
-  different scripts based on user agent.
+  parsed data. Character encoding for text documents is changed to UTF-8.
+- Some sites request assets based on screen resolution, pixel ratio and
+  supported image formats (webp). Replaying those with different parameters
+  won’t work, since assets for those are missing. Example: missguided.com.
+- Some fetch different scripts based on user agent. Example: youtube.com.
+- Requests containing randomly generated JavaScript callback function names
+  won’t work. Example: weather.com.
+
+Most of these issues can be worked around by using the DOM snapshot, which is
+also saved. This causes its own set of issues though:
+
+- JavaScript-based navigation does not work.
+- Scripts modifying styles based on scrolling position are stuck at the end of
+  page state at the moment. Example: twitter.com
+- CSS-based asset loading (screen size, pixel ratio, …) still does not work.
+- Canvas contents are probably not preserved.
 
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 24192fe..aee00e3 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -31,6 +31,11 @@ from io import BytesIO
 import argparse
 import tempfile
 
+from html5lib.treewalkers.base import TreeWalker
+from html5lib.filters.base import Filter
+from html5lib.serializer import HTMLSerializer
+from html5lib import constants
+
 logger = logging.getLogger(__name__)
 
 # 10 MB, encoded! (i.e. actual data can be larger due to compression)
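As context for the html5lib imports added above, here is a minimal sketch of the
library's stock token-stream pipeline (parse, walk, serialize) that the classes
added further down plug into; the `page_html` input is purely illustrative:

```python
# Illustrative use of html5lib's streaming pipeline: parse markup, walk it as a
# token stream and serialize it back. The patch below swaps the stock
# parser/walker for Chrome's DOM.getDocument output.
import html5lib
from html5lib.serializer import HTMLSerializer

page_html = '<p>Hello <b>world</b></p>'     # made-up input
document = html5lib.parse (page_html)       # etree-based document
walker = html5lib.getTreeWalker ('etree')   # stock tree walker
print (HTMLSerializer ().render (walker (document)))
```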
@@ -76,6 +81,58 @@ class WARCLogHandler (BufferingHandler):
         finally:
             self.release ()
 
+class ChromeTreeWalker (TreeWalker):
+    """
+    Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument
+    """
+
+    def recurse (self, node):
+        name = node['nodeName']
+        if name.startswith ('#'):
+            if name == '#text':
+                yield from self.text (node['nodeValue'])
+            elif name == '#comment':
+                yield self.comment (node['nodeValue'])
+            elif name == '#document':
+                for child in node.get ('children', []):
+                    yield from self.recurse (child)
+        else:
+            default_namespace = constants.namespaces["html"]
+            attributes = node.get ('attributes', [])
+            convertedAttr = {}
+            for i in range (0, len (attributes), 2):
+                convertedAttr[(default_namespace, attributes[i])] = attributes[i+1]
+            yield self.startTag (default_namespace, name, convertedAttr)
+            for child in node.get ('children', []):
+                yield from self.recurse (child)
+            yield self.endTag ('', name)
+
+    def __iter__ (self):
+        from pprint import pprint
+        assert self.tree['nodeName'] == '#document'
+        return self.recurse (self.tree)
+
+class StripTagFilter (Filter):
+    """
+    Remove arbitrary tags
+    """
+
+    def __init__ (self, source, tags):
+        Filter.__init__ (self, source)
+        self.tags = set (map (str.lower, tags))
+
+    def __iter__(self):
+        delete = 0
+        for token in Filter.__iter__(self):
+            tokenType = token['type']
+            if tokenType == 'StartTag':
+                if delete > 0 or token['name'].lower () in self.tags:
+                    delete += 1
+            if delete == 0:
+                yield token
+            if tokenType == 'EndTag' and delete > 0:
+                delete -= 1
+
 def main ():
     def getStatusText (response):
         text = response.get ('statusText')
@@ -219,7 +276,7 @@ def main ():
                     writer.write_record(record)
                 except pychrome.exceptions.CallMethodException:
                     logger.error ('no data for {} {} {}'.format (resp['url'],
-                            resp['status'], kwargs.get ('requestId')))
+                            resp['status'], reqId))
             else:
                 logger.warn ('body for {} is too large, {} bytes'.format (resp['url'],
                         kwargs['encodedDataLength']))
@@ -229,6 +286,36 @@ def main ():
         if reqId in requests:
             del requests[reqId]
 
+    def getFormattedViewportMetrics (tab):
+        layoutMetrics = tab.Page.getLayoutMetrics ()
+        # XXX: I’m not entirely sure which one we should use here
+        return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
+                layoutMetrics['layoutViewport']['clientHeight'])
+
+    def writeDOMSnapshot (tab, writer):
+        """
+        Get a DOM snapshot of tab and write it to WARC.
+
+        We could use DOMSnapshot.getSnapshot here, but the API is not stable
+        yet. Also computed styles are not really necessary here.
+
+        XXX: Currently writes a response, when it should use “resource”. pywb
+        can’t handle that though.
+        """
+        viewport = getFormattedViewportMetrics (tab)
+        dom = tab.DOM.getDocument (depth=-1)
+        # remove script, to make the page static and noscript, because at the
+        # time we took the snapshot scripts were enabled
+        stream = StripTagFilter (ChromeTreeWalker (dom['root']), ['script', 'noscript'])
+        serializer = HTMLSerializer ()
+        httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
+        record = writer.create_warc_record (dom['root']['documentURL'], 'response',
+                payload=BytesIO (serializer.render (stream, 'utf8')),
+                http_headers=httpHeaders,
+                warc_headers_dict={'X-DOM-Snapshot': str (True),
+                        'X-Chrome-Viewport': viewport})
+        writer.write_record (record)
+
     logging.basicConfig (level=logging.DEBUG)
 
     parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
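For illustration only, a sketch of how the two new classes combine outside the
crawler. The `snapshot` dict is a hand-written stand-in mimicking the shape of
Chrome's DOM.getDocument reply (nodeName, nodeValue, flat attribute list,
children), and the import assumes the classes live in crocoite.cli as added by
this patch:

```python
# Hypothetical usage of the classes introduced above; 'snapshot' is a made-up
# stand-in for a DOM.getDocument node tree, not real Chrome output.
from html5lib.serializer import HTMLSerializer
from crocoite.cli import ChromeTreeWalker, StripTagFilter  # per this patch

snapshot = {
    'nodeName': '#document',
    'children': [
        {'nodeName': 'HTML', 'attributes': [], 'children': [
            {'nodeName': 'BODY', 'attributes': [], 'children': [
                {'nodeName': 'SCRIPT', 'attributes': ['src', 'app.js'], 'children': []},
                {'nodeName': 'P', 'attributes': [], 'children': [
                    {'nodeName': '#text', 'nodeValue': 'Hello'},
                ]},
            ]},
        ]},
    ],
}

# Drop script/noscript so the stored page stays static, then serialize.
stream = StripTagFilter (ChromeTreeWalker (snapshot), ['script', 'noscript'])
body = HTMLSerializer ().render (stream, 'utf8')  # UTF-8 bytes, <script> removed
```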
+ """ + viewport = getFormattedViewportMetrics (tab) + dom = tab.DOM.getDocument (depth=-1) + # remove script, to make the page static and noscript, because at the + # time we took the snapshot scripts were enabled + stream = StripTagFilter (ChromeTreeWalker (dom['root']), ['script', 'noscript']) + serializer = HTMLSerializer () + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record (dom['root']['documentURL'], 'response', + payload=BytesIO (serializer.render (stream, 'utf8')), + http_headers=httpHeaders, + warc_headers_dict={'X-DOM-Snapshot': str (True), + 'X-Chrome-Viewport': viewport}) + writer.write_record (record) + logging.basicConfig (level=logging.DEBUG) parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') @@ -268,15 +355,11 @@ def main (): fd = open (args.output, 'wb') writer = WARCWriter (fd, gzip=True) version = tab.Browser.getVersion () - # assuming these don’t change - layoutMetrics = tab.Page.getLayoutMetrics () payload = { 'software': __package__, 'browser': version['product'], 'useragent': version['userAgent'], - # XXX: I’m not entirely sure which one we should use here - 'viewport': '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'], - layoutMetrics['layoutViewport']['clientHeight']) + 'viewport': getFormattedViewportMetrics (tab), } warcinfo = writer.create_warcinfo_record (filename=None, info=payload) writer.write_record (warcinfo) @@ -308,6 +391,18 @@ def main (): if len (requests) == 0: break + # disable events + tab.Page.stopLoading () + tab.Network.disable () + tab.Page.disable () + tab.Network.requestWillBeSent = None + tab.Network.responseReceived = None + tab.Network.loadingFinished = None + tab.Network.loadingFailed = None + tab.Page.loadEventFired = None + + writeDOMSnapshot (tab, writer) + tab.stop() if not args.keepTab: browser.close_tab(tab) @@ -12,6 +12,7 @@ setup( install_requires=[ 'pychrome', 'warcio', + 'html5lib>=0.999999999', ], entry_points={ 'console_scripts': [ |