From 23b3fed7b44e4059901ea2d09c866d385fa05bfc Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 15 Jun 2019 13:51:41 +0200 Subject: html: Fix CDATA walking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Missing “from” keyword, returned generator instead of dicts. Properly recreate CDATA elements now. --- crocoite/html.py | 11 ++++++----- crocoite/test_html.py | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/crocoite/html.py b/crocoite/html.py index ae5b03c..30f6ca5 100644 --- a/crocoite/html.py +++ b/crocoite/html.py @@ -107,6 +107,8 @@ eventAttributes = {'onabort', 'onvolumechange', 'onwaiting'} +default_namespace = constants.namespaces["html"] + class ChromeTreeWalker (TreeWalker): """ Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument @@ -123,14 +125,13 @@ class ChromeTreeWalker (TreeWalker): for child in node.get ('children', []): yield from self.recurse (child) elif name == '#cdata-section': - # html5lib cannot generate cdata. text should be fine. This - # only happens when using Chrome’s inline XML display. - yield self.text (node['nodeValue']) + # html5lib cannot generate cdata, so we’re faking it by using + # an empty tag + yield from self.emptyTag (default_namespace, + '![CDATA[' + node['nodeValue'] + ']]', {}) else: assert False, (name, node) else: - default_namespace = constants.namespaces["html"] - attributes = node.get ('attributes', []) convertedAttr = {} for i in range (0, len (attributes), 2): diff --git a/crocoite/test_html.py b/crocoite/test_html.py index c71697a..c17903b 100644 --- a/crocoite/test_html.py +++ b/crocoite/test_html.py @@ -18,9 +18,11 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. +import asyncio import pytest, html5lib from html5lib.serializer import HTMLSerializer from html5lib.treewalkers import getTreeWalker +from aiohttp import web from .html import StripTagFilter, StripAttributeFilter, ChromeTreeWalker from .test_devtools import tab, browser @@ -58,3 +60,37 @@ async def test_treewalker (tab): elif i == 1: assert result == framehtml +cdataDoc = '' +xmlHeader = '' +async def hello(request): + return web.Response(text=xmlHeader + cdataDoc, content_type='text/xml') + +@pytest.fixture +async def server (): + """ Simple HTTP server for testing notifications """ + app = web.Application() + app.add_routes([web.get('/test.xml', hello)]) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8080) + await site.start() + yield app + await runner.cleanup () + +@pytest.mark.asyncio +async def test_treewalker_cdata (tab, server): + ret = await tab.Page.navigate (url='http://localhost:8080/test.xml') + # wait until loaded XXX: replace with idle check + await asyncio.sleep (0.5) + dom = await tab.DOM.getDocument (depth=-1, pierce=True) + docs = list (ChromeTreeWalker (dom['root']).split ()) + assert len(docs) == 1 + for i, doc in enumerate (docs): + walker = ChromeTreeWalker (doc) + serializer = HTMLSerializer () + result = serializer.render (iter(walker)) + # chrome will display a pretty-printed viewer *plus* the original + # source (stripped of its xml header) + assert cdataDoc in result + + -- cgit v1.2.3