From 39b18663ebf045c1cb7a9ee1c40bacd45c785ee3 Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Sat, 25 Nov 2017 14:32:46 +0100
Subject: Strip on* HTML attributes

They can carry JavaScript as well and should not be allowed for DOM
snapshots.
---
 crocoite/cli.py  | 28 ++++++++++++++++++-
 crocoite/html.py | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 crocoite/html.py

diff --git a/crocoite/cli.py b/crocoite/cli.py
index f070625..fee72c1 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -36,6 +36,8 @@ from html5lib.filters.base import Filter
 from html5lib.serializer import HTMLSerializer
 from html5lib import constants
 
+from . import html
+
 logger = logging.getLogger(__name__)
 
 # 10 MB, encoded! (i.e. actual data can be larger due to compression)
@@ -157,6 +159,28 @@ class StripTagFilter (Filter):
             if tokenType == 'EndTag' and delete > 0:
                 delete -= 1
 
+class StripAttributeFilter (Filter):
+    """
+    Remove arbitrary HTML attributes, matched case-insensitively by name
+    """
+
+    def __init__ (self, source, attributes):
+        Filter.__init__ (self, source)
+        self.attributes = set (map (str.lower, attributes))
+
+    def __iter__(self):
+        # attribute dicts are keyed by (namespace, name) tuples
+        for token in Filter.__iter__(self):
+            data = token.get ('data')
+            # EmptyTag tokens carry attributes as well, filter them too
+            if data and token['type'] in ('StartTag', 'EmptyTag'):
+                newdata = {}
+                for (namespace, k), v in data.items ():
+                    if k.lower () not in self.attributes:
+                        newdata[(namespace, k)] = v
+                token['data'] = newdata
+            yield token
+
 def main ():
     def getStatusText (response):
         text = response.get ('statusText')
@@ -334,7 +358,9 @@ def main ():
         walker = ChromeTreeWalker (doc)
         # remove script, to make the page static and noscript, because at the
         # time we took the snapshot scripts were enabled
-        stream = StripTagFilter (walker, ['script', 'noscript'])
+        disallowedTags = ['script', 'noscript']
+        disallowedAttributes = html.eventAttributes
+        stream = StripAttributeFilter
(StripTagFilter (walker, disallowedTags), disallowedAttributes) serializer = HTMLSerializer () httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') record = writer.create_warc_record (doc['documentURL'], 'response', diff --git a/crocoite/html.py b/crocoite/html.py new file mode 100644 index 0000000..75ac022 --- /dev/null +++ b/crocoite/html.py @@ -0,0 +1,84 @@ +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+
+# source: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes
+eventAttributes = ['onabort',
+    'onautocomplete',
+    'onautocompleteerror',
+    'onblur',
+    'oncancel',
+    'oncanplay',
+    'oncanplaythrough',
+    'onchange',
+    'onclick',
+    'onclose',
+    'oncontextmenu',
+    'oncuechange',
+    'ondblclick',
+    'ondrag',
+    'ondragend',
+    'ondragenter',
+    'ondragexit',
+    'ondragleave',
+    'ondragover',
+    'ondragstart',
+    'ondrop',
+    'ondurationchange',
+    'onemptied',
+    'onended',
+    'onerror',
+    'onfocus',
+    'oninput',
+    'oninvalid',
+    'onkeydown',
+    'onkeypress',
+    'onkeyup',
+    'onload',
+    'onloadeddata',
+    'onloadedmetadata',
+    'onloadstart',
+    'onmousedown',
+    'onmouseenter',
+    'onmouseleave',
+    'onmousemove',
+    'onmouseout',
+    'onmouseover',
+    'onmouseup',
+    'onmousewheel',
+    'onpause',
+    'onplay',
+    'onplaying',
+    'onprogress',
+    'onratechange',
+    'onreset',
+    'onresize',
+    'onscroll',
+    'onseeked',
+    'onseeking',
+    'onselect',
+    'onshow',
+    'onsort',
+    'onstalled',
+    'onsubmit',
+    'onsuspend',
+    'ontimeupdate',
+    'ontoggle',
+    'onvolumechange',
+    'onwaiting']
-- 
cgit v1.2.3