summaryrefslogtreecommitdiff
path: root/crocoite/cli.py
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2017-11-25 14:32:46 +0100
committerLars-Dominik Braun <lars@6xq.net>2017-11-25 14:32:46 +0100
commit39b18663ebf045c1cb7a9ee1c40bacd45c785ee3 (patch)
tree3960f38b794f53d708d4c6d5aceed417736f5996 /crocoite/cli.py
parentde94e6bc320ddc38f4b0baf006c254378be5d845 (diff)
downloadcrocoite-39b18663ebf045c1cb7a9ee1c40bacd45c785ee3.tar.gz
crocoite-39b18663ebf045c1cb7a9ee1c40bacd45c785ee3.tar.bz2
crocoite-39b18663ebf045c1cb7a9ee1c40bacd45c785ee3.zip
Strip on* HTML attributes
They can carry JavaScript as well and should not be allowed for DOM snapshots.
Diffstat (limited to 'crocoite/cli.py')
-rw-r--r--crocoite/cli.py28
1 files changed, 27 insertions, 1 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index f070625..fee72c1 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -36,6 +36,8 @@ from html5lib.filters.base import Filter
from html5lib.serializer import HTMLSerializer
from html5lib import constants
+from . import html
+
logger = logging.getLogger(__name__)
# 10 MB, encoded! (i.e. actual data can be larger due to compression)
@@ -157,6 +159,28 @@ class StripTagFilter (Filter):
if tokenType == 'EndTag' and delete > 0:
delete -= 1
+class StripAttributeFilter (Filter):
+    """
+    Remove arbitrary HTML attributes, matching names case-insensitively
+    """
+
+    def __init__ (self, source, attributes):
+        # source: token stream to wrap; attributes: names to strip
+        Filter.__init__ (self, source)
+        self.attributes = set (map (str.lower, attributes))
+
+    def __iter__(self):
+        # Void elements (<img>, <input>, etc.) are emitted as EmptyTag tokens
+        # and can carry on* handlers as well, so filter both token types.
+        for token in Filter.__iter__(self):
+            data = token.get ('data')
+            if data and token['type'] in {'StartTag', 'EmptyTag'}:
+                # attribute keys are (namespace, name) tuples
+                token['data'] = {(namespace, k): v
+                        for (namespace, k), v in data.items ()
+                        if k.lower () not in self.attributes}
+            yield token
+
def main ():
def getStatusText (response):
text = response.get ('statusText')
@@ -334,7 +358,9 @@ def main ():
walker = ChromeTreeWalker (doc)
# remove script, to make the page static and noscript, because at the
# time we took the snapshot scripts were enabled
- stream = StripTagFilter (walker, ['script', 'noscript'])
+ disallowedTags = ['script', 'noscript']
+ disallowedAttributes = html.eventAttributes
+ stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
serializer = HTMLSerializer ()
httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
record = writer.create_warc_record (doc['documentURL'], 'response',