From 39b18663ebf045c1cb7a9ee1c40bacd45c785ee3 Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun <lars@6xq.net>
Date: Sat, 25 Nov 2017 14:32:46 +0100
Subject: Strip on* HTML attributes

They can carry JavaScript as well and should not be allowed for DOM
snapshots.
---
 crocoite/cli.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'crocoite/cli.py')

diff --git a/crocoite/cli.py b/crocoite/cli.py
index f070625..fee72c1 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -36,6 +36,8 @@ from html5lib.filters.base import Filter
 from html5lib.serializer import HTMLSerializer
 from html5lib import constants
 
+from . import html
+
 logger = logging.getLogger(__name__)
 
 # 10 MB, encoded! (i.e. actual data can be larger due to compression)
@@ -157,6 +159,28 @@ class StripTagFilter (Filter):
             if tokenType == 'EndTag' and delete > 0:
                 delete -= 1
 
+class StripAttributeFilter (Filter):
+    """
+    Remove the given HTML attributes (matched case-insensitively, in any
+    namespace) from the StartTag and EmptyTag tokens of the source stream
+    """
+
+    def __init__ (self, source, attributes):
+        Filter.__init__ (self, source)
+        self.attributes = set (map (str.lower, attributes))
+
+    def __iter__(self):
+        for token in Filter.__iter__(self):
+            data = token.get ('data')
+            # html5lib tree walkers emit void elements (img, input, etc.) as
+            # EmptyTag tokens; they carry attributes too and must be filtered
+            if data and token['type'] in ('StartTag', 'EmptyTag'):
+                # attribute keys are (namespace, name) tuples
+                token['data'] = {(namespace, k): v
+                        for (namespace, k), v in data.items ()
+                        if k.lower () not in self.attributes}
+            yield token
+
 def main ():
     def getStatusText (response):
         text = response.get ('statusText')
@@ -334,7 +358,9 @@ def main ():
                 walker = ChromeTreeWalker (doc)
                 # remove script, to make the page static and noscript, because at the
                 # time we took the snapshot scripts were enabled
-                stream = StripTagFilter (walker, ['script', 'noscript'])
+                disallowedTags = ['script', 'noscript']
+                disallowedAttributes = html.eventAttributes
+                stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
                 serializer = HTMLSerializer ()
                 httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
                 record = writer.create_warc_record (doc['documentURL'], 'response',
-- 
cgit v1.2.3