From 23b3fed7b44e4059901ea2d09c866d385fa05bfc Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun <lars@6xq.net>
Date: Sat, 15 Jun 2019 13:51:41 +0200
Subject: html: Fix CDATA walking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CDATA branch of the tree walker was missing the “from” keyword (“yield”
instead of “yield from”), so it returned a generator instead of token dicts.
Properly recreate CDATA elements now.
---
 crocoite/html.py      | 11 ++++++-----
 crocoite/test_html.py | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 5 deletions(-)

(limited to 'crocoite')
diff --git a/crocoite/html.py b/crocoite/html.py
index ae5b03c..30f6ca5 100644
--- a/crocoite/html.py
+++ b/crocoite/html.py
@@ -107,6 +107,8 @@ eventAttributes = {'onabort',
         'onvolumechange',
         'onwaiting'}
 
+default_namespace = constants.namespaces["html"]
+
 class ChromeTreeWalker (TreeWalker):
     """
     Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument
@@ -123,14 +125,13 @@ class ChromeTreeWalker (TreeWalker):
                 for child in node.get ('children', []):
                     yield from self.recurse (child)
             elif name == '#cdata-section':
-                # html5lib cannot generate cdata. text should be fine. This
-                # only happens when using Chrome’s inline XML display.
-                yield self.text (node['nodeValue'])
+                # html5lib cannot generate cdata, so we’re faking it by using
+                # an empty tag
+                yield from self.emptyTag (default_namespace,
+                        '![CDATA[' + node['nodeValue'] + ']]', {})
             else:
                 assert False, (name, node)
         else:
-            default_namespace = constants.namespaces["html"]
-
             attributes = node.get ('attributes', [])
             convertedAttr = {}
             for i in range (0, len (attributes), 2):
diff --git a/crocoite/test_html.py b/crocoite/test_html.py
index c71697a..c17903b 100644
--- a/crocoite/test_html.py
+++ b/crocoite/test_html.py
@@ -18,9 +18,11 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+import asyncio
 import pytest, html5lib
 from html5lib.serializer import HTMLSerializer
 from html5lib.treewalkers import getTreeWalker
+from aiohttp import web
 
 from .html import StripTagFilter, StripAttributeFilter, ChromeTreeWalker
 from .test_devtools import tab, browser
@@ -58,3 +60,37 @@ async def test_treewalker (tab):
         elif i == 1:
             assert result == framehtml
 
+cdataDoc = '<test><![CDATA[Hello world]]></test>'
+xmlHeader = '<?xml version="1.0" encoding="UTF-8"?>'
+async def hello(request):
+    return web.Response(text=xmlHeader + cdataDoc, content_type='text/xml')
+
+@pytest.fixture
+async def server ():
+    """ Simple HTTP server serving an XML document with a CDATA section """
+    app = web.Application()
+    app.add_routes([web.get('/test.xml', hello)])
+    runner = web.AppRunner(app)
+    await runner.setup()
+    site = web.TCPSite(runner, 'localhost', 8080)
+    await site.start()
+    yield app
+    await runner.cleanup ()
+
+@pytest.mark.asyncio
+async def test_treewalker_cdata (tab, server):
+    ret = await tab.Page.navigate (url='http://localhost:8080/test.xml')
+    # wait until loaded XXX: replace with idle check
+    await asyncio.sleep (0.5)
+    dom = await tab.DOM.getDocument (depth=-1, pierce=True)
+    docs = list (ChromeTreeWalker (dom['root']).split ())
+    assert len(docs) == 1
+    for i, doc in enumerate (docs):
+        walker = ChromeTreeWalker (doc)
+        serializer = HTMLSerializer ()
+        result = serializer.render (iter(walker))
+        # chrome will display a pretty-printed viewer *plus* the original
+        # source (stripped of its xml header)
+        assert cdataDoc in result
+
+
-- 
cgit v1.2.3