diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2019-06-15 13:51:41 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2019-06-17 14:06:42 +0200 |
commit | 23b3fed7b44e4059901ea2d09c866d385fa05bfc (patch) | |
tree | dfabfc03ea3ac1376d8a5073f14f602973e30b29 /crocoite | |
parent | 158f55eb7fb24fa26727a008ad44964390171060 (diff) | |
download | crocoite-23b3fed7b44e4059901ea2d09c866d385fa05bfc.tar.gz crocoite-23b3fed7b44e4059901ea2d09c866d385fa05bfc.tar.bz2 crocoite-23b3fed7b44e4059901ea2d09c866d385fa05bfc.zip |
html: Fix CDATA walking
The CDATA branch was missing the “from” keyword after “yield”, so it yielded a
generator instead of dicts. Properly recreate CDATA elements now.
Diffstat (limited to 'crocoite')
-rw-r--r-- | crocoite/html.py | 11 | ||||
-rw-r--r-- | crocoite/test_html.py | 36 |
2 files changed, 42 insertions, 5 deletions
diff --git a/crocoite/html.py b/crocoite/html.py index ae5b03c..30f6ca5 100644 --- a/crocoite/html.py +++ b/crocoite/html.py @@ -107,6 +107,8 @@ eventAttributes = {'onabort', 'onvolumechange', 'onwaiting'} +default_namespace = constants.namespaces["html"] + class ChromeTreeWalker (TreeWalker): """ Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument @@ -123,14 +125,13 @@ class ChromeTreeWalker (TreeWalker): for child in node.get ('children', []): yield from self.recurse (child) elif name == '#cdata-section': - # html5lib cannot generate cdata. text should be fine. This - # only happens when using Chrome’s inline XML display. - yield self.text (node['nodeValue']) + # html5lib cannot generate cdata, so we’re faking it by using + # an empty tag + yield from self.emptyTag (default_namespace, + '![CDATA[' + node['nodeValue'] + ']]', {}) else: assert False, (name, node) else: - default_namespace = constants.namespaces["html"] - attributes = node.get ('attributes', []) convertedAttr = {} for i in range (0, len (attributes), 2): diff --git a/crocoite/test_html.py b/crocoite/test_html.py index c71697a..c17903b 100644 --- a/crocoite/test_html.py +++ b/crocoite/test_html.py @@ -18,9 +18,11 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
+import asyncio import pytest, html5lib from html5lib.serializer import HTMLSerializer from html5lib.treewalkers import getTreeWalker +from aiohttp import web from .html import StripTagFilter, StripAttributeFilter, ChromeTreeWalker from .test_devtools import tab, browser @@ -58,3 +60,37 @@ async def test_treewalker (tab): elif i == 1: assert result == framehtml +cdataDoc = '<test><![CDATA[Hello world]]></test>' +xmlHeader = '<?xml version="1.0" encoding="UTF-8"?>' +async def hello(request): + return web.Response(text=xmlHeader + cdataDoc, content_type='text/xml') + +@pytest.fixture +async def server (): + """ Simple HTTP server for testing notifications """ + app = web.Application() + app.add_routes([web.get('/test.xml', hello)]) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8080) + await site.start() + yield app + await runner.cleanup () + +@pytest.mark.asyncio +async def test_treewalker_cdata (tab, server): + ret = await tab.Page.navigate (url='http://localhost:8080/test.xml') + # wait until loaded XXX: replace with idle check + await asyncio.sleep (0.5) + dom = await tab.DOM.getDocument (depth=-1, pierce=True) + docs = list (ChromeTreeWalker (dom['root']).split ()) + assert len(docs) == 1 + for i, doc in enumerate (docs): + walker = ChromeTreeWalker (doc) + serializer = HTMLSerializer () + result = serializer.render (iter(walker)) + # chrome will display a pretty-printed viewer *plus* the original + # source (stripped of its xml header) + assert cdataDoc in result + + |