From be5f9525649ac39fd9b72a0ad1e6442c72034834 Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Sat, 5 Jan 2019 16:29:08 +0100
Subject: html: Handle CDATA

When loading XML documents Chrome presents a pretty-printed version to
the user, which still contains the original XML when exporting via
DOM.getDocument. Not sure how to test this.
---
 crocoite/html.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/crocoite/html.py b/crocoite/html.py
index fec9760..ae5b03c 100644
--- a/crocoite/html.py
+++ b/crocoite/html.py
@@ -122,8 +122,12 @@ class ChromeTreeWalker (TreeWalker):
             elif name == '#document':
                 for child in node.get ('children', []):
                     yield from self.recurse (child)
+            elif name == '#cdata-section':
+                # html5lib cannot generate cdata. text should be fine. This
+                # only happens when using Chrome’s inline XML display.
+                yield self.text (node['nodeValue'])
             else:
-                assert False, name
+                assert False, (name, node)
         else:
             default_namespace = constants.namespaces["html"]
 
-- 
cgit v1.2.3