summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2019-06-15 13:51:41 +0200
committer Lars-Dominik Braun <lars@6xq.net> 2019-06-17 14:06:42 +0200
commit 23b3fed7b44e4059901ea2d09c866d385fa05bfc (patch)
tree dfabfc03ea3ac1376d8a5073f14f602973e30b29
parent 158f55eb7fb24fa26727a008ad44964390171060 (diff)
download crocoite-23b3fed7b44e4059901ea2d09c866d385fa05bfc.tar.gz
crocoite-23b3fed7b44e4059901ea2d09c866d385fa05bfc.tar.bz2
crocoite-23b3fed7b44e4059901ea2d09c866d385fa05bfc.zip
html: Fix CDATA walking
The “from” keyword was missing after “yield” in the CDATA branch, so the walker returned a generator instead of dicts. CDATA elements are now properly recreated.
-rw-r--r-- crocoite/html.py 11
-rw-r--r-- crocoite/test_html.py 36
2 files changed, 42 insertions, 5 deletions
diff --git a/crocoite/html.py b/crocoite/html.py
index ae5b03c..30f6ca5 100644
--- a/crocoite/html.py
+++ b/crocoite/html.py
@@ -107,6 +107,8 @@ eventAttributes = {'onabort',
'onvolumechange',
'onwaiting'}
+default_namespace = constants.namespaces["html"]
+
class ChromeTreeWalker (TreeWalker):
"""
Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument
@@ -123,14 +125,13 @@ class ChromeTreeWalker (TreeWalker):
for child in node.get ('children', []):
yield from self.recurse (child)
elif name == '#cdata-section':
- # html5lib cannot generate cdata. text should be fine. This
- # only happens when using Chrome’s inline XML display.
- yield self.text (node['nodeValue'])
+ # html5lib cannot generate cdata, so we’re faking it by using
+ # an empty tag
+ yield from self.emptyTag (default_namespace,
+ '![CDATA[' + node['nodeValue'] + ']]', {})
else:
assert False, (name, node)
else:
- default_namespace = constants.namespaces["html"]
-
attributes = node.get ('attributes', [])
convertedAttr = {}
for i in range (0, len (attributes), 2):
diff --git a/crocoite/test_html.py b/crocoite/test_html.py
index c71697a..c17903b 100644
--- a/crocoite/test_html.py
+++ b/crocoite/test_html.py
@@ -18,9 +18,11 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
+import asyncio
import pytest, html5lib
from html5lib.serializer import HTMLSerializer
from html5lib.treewalkers import getTreeWalker
+from aiohttp import web
from .html import StripTagFilter, StripAttributeFilter, ChromeTreeWalker
from .test_devtools import tab, browser
@@ -58,3 +60,37 @@ async def test_treewalker (tab):
elif i == 1:
assert result == framehtml
+cdataDoc = '<test><![CDATA[Hello world]]></test>'
+xmlHeader = '<?xml version="1.0" encoding="UTF-8"?>'
+async def hello(request):
+ return web.Response(text=xmlHeader + cdataDoc, content_type='text/xml')
+
+@pytest.fixture
+async def server ():
+ """ Simple HTTP server for testing notifications """
+ app = web.Application()
+ app.add_routes([web.get('/test.xml', hello)])
+ runner = web.AppRunner(app)
+ await runner.setup()
+ site = web.TCPSite(runner, 'localhost', 8080)
+ await site.start()
+ yield app
+ await runner.cleanup ()
+
+@pytest.mark.asyncio
+async def test_treewalker_cdata (tab, server):
+ ret = await tab.Page.navigate (url='http://localhost:8080/test.xml')
+ # wait until loaded XXX: replace with idle check
+ await asyncio.sleep (0.5)
+ dom = await tab.DOM.getDocument (depth=-1, pierce=True)
+ docs = list (ChromeTreeWalker (dom['root']).split ())
+ assert len(docs) == 1
+ for i, doc in enumerate (docs):
+ walker = ChromeTreeWalker (doc)
+ serializer = HTMLSerializer ()
+ result = serializer.render (iter(walker))
+ # chrome will display a pretty-printed viewer *plus* the original
+ # source (stripped of its xml header)
+ assert cdataDoc in result
+
+