summary refs log tree commit diff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2017-11-26 10:23:34 +0100
committer Lars-Dominik Braun <lars@6xq.net> 2017-11-26 10:25:13 +0100
commit 6b1d593ec841ebe18dcbdd18902f7faad0868fd4 (patch)
tree f437aa9275f93492d5a860fb1790cb59d2d2e5a5
parent ee736e28ed4c5cdae395f5851eb2bad96d8078ef (diff)
download crocoite-6b1d593ec841ebe18dcbdd18902f7faad0868fd4.zip
crocoite-6b1d593ec841ebe18dcbdd18902f7faad0868fd4.tar.gz
crocoite-6b1d593ec841ebe18dcbdd18902f7faad0868fd4.tar.bz2
DOM snapshot: Generate valid HTML5
Some tags are “void”, i.e. they cannot contain any content and do not have a closing tag.
-rw-r--r-- crocoite/cli.py 19
-rw-r--r-- crocoite/html.py 21
2 files changed, 31 insertions, 9 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index a2ac958..640d207 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -107,14 +107,20 @@ class ChromeTreeWalker (TreeWalker):
assert False, name
else:
default_namespace = constants.namespaces["html"]
+
attributes = node.get ('attributes', [])
convertedAttr = {}
for i in range (0, len (attributes), 2):
convertedAttr[(default_namespace, attributes[i])] = attributes[i+1]
- yield self.startTag (default_namespace, name, convertedAttr)
- for child in node.get ('children', []):
- yield from self.recurse (child)
- yield self.endTag ('', name)
+
+ children = node.get ('children', [])
+ if name.lower() in html.voidTags and not children:
+ yield from self.emptyTag (default_namespace, name, convertedAttr)
+ else:
+ yield self.startTag (default_namespace, name, convertedAttr)
+ for child in node.get ('children', []):
+ yield from self.recurse (child)
+ yield self.endTag ('', name)
def __iter__ (self):
assert self.tree['nodeName'] == '#document'
@@ -151,7 +157,7 @@ class StripTagFilter (Filter):
delete = 0
for token in Filter.__iter__(self):
tokenType = token['type']
- if tokenType == 'StartTag':
+ if tokenType in {'StartTag', 'EmptyTag'}:
if delete > 0 or token['name'].lower () in self.tags:
delete += 1
if delete == 0:
@@ -172,8 +178,7 @@ class StripAttributeFilter (Filter):
default_namespace = constants.namespaces["html"]
for token in Filter.__iter__(self):
data = token.get ('data')
- # XXX: Handle EmptyTag
- if data and token['type'] == 'StartTag':
+ if data and token['type'] in {'StartTag', 'EmptyTag'}:
newdata = {}
for (namespace, k), v in data.items ():
if k.lower () not in self.attributes:
diff --git a/crocoite/html.py b/crocoite/html.py
index 75ac022..34fe26b 100644
--- a/crocoite/html.py
+++ b/crocoite/html.py
@@ -18,8 +18,24 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
+# HTML void tags, see https://html.spec.whatwg.org/multipage/syntax.html#void-elements
+voidTags = {'area',
+ 'base',
+ 'br',
+ 'col',
+ 'embed',
+ 'hr',
+ 'img',
+ 'input',
+ 'link',
+ 'meta',
+ 'param',
+ 'source',
+ 'track',
+ 'wbr'}
+
# source: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes
-eventAttributes = ['onabort',
+eventAttributes = {'onabort',
'onautocomplete',
'onautocompleteerror',
'onblur',
@@ -81,4 +97,5 @@ eventAttributes = ['onabort',
'ontimeupdate',
'ontoggle',
'onvolumechange',
- 'onwaiting']
+ 'onwaiting'}
+