diff options
| author | Lars-Dominik Braun <lars@6xq.net> | 2019-06-15 13:51:41 +0200 | 
|---|---|---|
| committer | Lars-Dominik Braun <lars@6xq.net> | 2019-06-17 14:06:42 +0200 | 
| commit | 23b3fed7b44e4059901ea2d09c866d385fa05bfc (patch) | |
| tree | dfabfc03ea3ac1376d8a5073f14f602973e30b29 | |
| parent | 158f55eb7fb24fa26727a008ad44964390171060 (diff) | |
| download | crocoite-23b3fed7b44e4059901ea2d09c866d385fa05bfc.tar.gz crocoite-23b3fed7b44e4059901ea2d09c866d385fa05bfc.tar.bz2 crocoite-23b3fed7b44e4059901ea2d09c866d385fa05bfc.zip | |
html: Fix CDATA walking
The CDATA branch was missing the “from” keyword after “yield”, so it yielded a
generator instead of token dicts. CDATA elements are now properly recreated.
| -rw-r--r-- | crocoite/html.py | 11 | ||||
| -rw-r--r-- | crocoite/test_html.py | 36 | 
2 files changed, 42 insertions, 5 deletions
| diff --git a/crocoite/html.py b/crocoite/html.py index ae5b03c..30f6ca5 100644 --- a/crocoite/html.py +++ b/crocoite/html.py @@ -107,6 +107,8 @@ eventAttributes = {'onabort',          'onvolumechange',          'onwaiting'} +default_namespace = constants.namespaces["html"] +  class ChromeTreeWalker (TreeWalker):      """      Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument @@ -123,14 +125,13 @@ class ChromeTreeWalker (TreeWalker):                  for child in node.get ('children', []):                      yield from self.recurse (child)              elif name == '#cdata-section': -                # html5lib cannot generate cdata. text should be fine. This -                # only happens when using Chrome’s inline XML display. -                yield self.text (node['nodeValue']) +                # html5lib cannot generate cdata, so we’re faking it by using +                # an empty tag +                yield from self.emptyTag (default_namespace, +                        '![CDATA[' + node['nodeValue'] + ']]', {})              else:                  assert False, (name, node)          else: -            default_namespace = constants.namespaces["html"] -              attributes = node.get ('attributes', [])              convertedAttr = {}              for i in range (0, len (attributes), 2): diff --git a/crocoite/test_html.py b/crocoite/test_html.py index c71697a..c17903b 100644 --- a/crocoite/test_html.py +++ b/crocoite/test_html.py @@ -18,9 +18,11 @@  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN  # THE SOFTWARE. 
+import asyncio  import pytest, html5lib  from html5lib.serializer import HTMLSerializer  from html5lib.treewalkers import getTreeWalker +from aiohttp import web  from .html import StripTagFilter, StripAttributeFilter, ChromeTreeWalker  from .test_devtools import tab, browser @@ -58,3 +60,37 @@ async def test_treewalker (tab):          elif i == 1:              assert result == framehtml +cdataDoc = '<test><![CDATA[Hello world]]></test>' +xmlHeader = '<?xml version="1.0" encoding="UTF-8"?>' +async def hello(request): +    return web.Response(text=xmlHeader + cdataDoc, content_type='text/xml') + +@pytest.fixture +async def server (): +    """ Simple HTTP server for testing notifications """ +    app = web.Application() +    app.add_routes([web.get('/test.xml', hello)]) +    runner = web.AppRunner(app) +    await runner.setup() +    site = web.TCPSite(runner, 'localhost', 8080) +    await site.start() +    yield app +    await runner.cleanup () + +@pytest.mark.asyncio +async def test_treewalker_cdata (tab, server): +    ret = await tab.Page.navigate (url='http://localhost:8080/test.xml') +    # wait until loaded XXX: replace with idle check +    await asyncio.sleep (0.5) +    dom = await tab.DOM.getDocument (depth=-1, pierce=True) +    docs = list (ChromeTreeWalker (dom['root']).split ()) +    assert len(docs) == 1 +    for i, doc in enumerate (docs): +        walker = ChromeTreeWalker (doc) +        serializer = HTMLSerializer () +        result = serializer.render (iter(walker)) +        # chrome will display a pretty-printed viewer *plus* the original +        # source (stripped of its xml header) +        assert cdataDoc in result + + | 
