From 39b18663ebf045c1cb7a9ee1c40bacd45c785ee3 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 25 Nov 2017 14:32:46 +0100 Subject: Strip on* HTML attributes They can carry JavaScript as well and should not be allowed for DOM snapshots. --- crocoite/cli.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'crocoite/cli.py') diff --git a/crocoite/cli.py b/crocoite/cli.py index f070625..fee72c1 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -36,6 +36,8 @@ from html5lib.filters.base import Filter from html5lib.serializer import HTMLSerializer from html5lib import constants +from . import html + logger = logging.getLogger(__name__) # 10 MB, encoded! (i.e. actual data can be larger due to compression) @@ -157,6 +159,28 @@ class StripTagFilter (Filter): if tokenType == 'EndTag' and delete > 0: delete -= 1 +class StripAttributeFilter (Filter): + """ + Remove arbitrary HTML attributes + """ + + def __init__ (self, source, attributes): + Filter.__init__ (self, source) + self.attributes = set (map (str.lower, attributes)) + + def __iter__(self): + default_namespace = constants.namespaces["html"] + for token in Filter.__iter__(self): + data = token.get ('data') + # XXX: Handle EmptyTag + if data and token['type'] == 'StartTag': + newdata = {} + for (namespace, k), v in data.items (): + if k.lower () not in self.attributes: + newdata[(namespace, k)] = v + token['data'] = newdata + yield token + def main (): def getStatusText (response): text = response.get ('statusText') @@ -334,7 +358,9 @@ def main (): walker = ChromeTreeWalker (doc) # remove script, to make the page static and noscript, because at the # time we took the snapshot scripts were enabled - stream = StripTagFilter (walker, ['script', 'noscript']) + disallowedTags = ['script', 'noscript'] + disallowedAttributes = html.eventAttributes + stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes) serializer = HTMLSerializer () httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') record = writer.create_warc_record (doc['documentURL'], 'response', -- cgit v1.2.3