From 6b1d593ec841ebe18dcbdd18902f7faad0868fd4 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sun, 26 Nov 2017 10:23:34 +0100 Subject: DOM snapshot: Generate valid HTML5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some tags are “void”, i.e. cannot contain contents and don’t have a closing tag. --- crocoite/cli.py | 19 ++++++++++++------- crocoite/html.py | 21 +++++++++++++++++++-- 2 files changed, 31 insertions(+), 9 deletions(-) (limited to 'crocoite') diff --git a/crocoite/cli.py b/crocoite/cli.py index a2ac958..640d207 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -107,14 +107,20 @@ class ChromeTreeWalker (TreeWalker): assert False, name else: default_namespace = constants.namespaces["html"] + attributes = node.get ('attributes', []) convertedAttr = {} for i in range (0, len (attributes), 2): convertedAttr[(default_namespace, attributes[i])] = attributes[i+1] - yield self.startTag (default_namespace, name, convertedAttr) - for child in node.get ('children', []): - yield from self.recurse (child) - yield self.endTag ('', name) + + children = node.get ('children', []) + if name.lower() in html.voidTags and not children: + yield from self.emptyTag (default_namespace, name, convertedAttr) + else: + yield self.startTag (default_namespace, name, convertedAttr) + for child in node.get ('children', []): + yield from self.recurse (child) + yield self.endTag ('', name) def __iter__ (self): assert self.tree['nodeName'] == '#document' @@ -151,7 +157,7 @@ class StripTagFilter (Filter): delete = 0 for token in Filter.__iter__(self): tokenType = token['type'] - if tokenType == 'StartTag': + if tokenType in {'StartTag', 'EmptyTag'}: if delete > 0 or token['name'].lower () in self.tags: delete += 1 if delete == 0: @@ -172,8 +178,7 @@ class StripAttributeFilter (Filter): default_namespace = constants.namespaces["html"] for token in Filter.__iter__(self): data = token.get ('data') - # XXX: Handle EmptyTag - if data and token['type'] == 'StartTag': + if data and token['type'] in {'StartTag', 'EmptyTag'}: newdata = {} for (namespace, k), v in data.items (): if k.lower () not in self.attributes: diff --git a/crocoite/html.py b/crocoite/html.py index 75ac022..34fe26b 100644 --- a/crocoite/html.py +++ b/crocoite/html.py @@ -18,8 +18,24 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. +# HTML void tags, see https://html.spec.whatwg.org/multipage/syntax.html#void-elements +voidTags = {'area', + 'base', + 'br', + 'col', + 'embed', + 'hr', + 'img', + 'input', + 'link', + 'meta', + 'param', + 'source', + 'track', + 'wbr'} + # source: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes -eventAttributes = ['onabort', +eventAttributes = {'onabort', 'onautocomplete', 'onautocompleteerror', 'onblur', @@ -81,4 +97,5 @@ eventAttributes = ['onabort', 'ontimeupdate', 'ontoggle', 'onvolumechange', - 'onwaiting'] + 'onwaiting'} + -- cgit v1.2.3