summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- crocoite/cli.py 28
-rw-r--r-- crocoite/html.py 84
2 files changed, 111 insertions, 1 deletions
diff --git a/crocoite/cli.py b/crocoite/cli.py
index f070625..fee72c1 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -36,6 +36,8 @@ from html5lib.filters.base import Filter
from html5lib.serializer import HTMLSerializer
from html5lib import constants
+from . import html
+
logger = logging.getLogger(__name__)
# 10 MB, encoded! (i.e. actual data can be larger due to compression)
@@ -157,6 +159,28 @@ class StripTagFilter (Filter):
if tokenType == 'EndTag' and delete > 0:
delete -= 1
class StripAttributeFilter (Filter):
    """
    Remove arbitrary HTML attributes

    Token-stream filter that drops every attribute whose name is listed in
    `attributes` from both start tags and empty (void element) tags, e.g.
    ``<img onerror="...">``. Names are compared case-insensitively. All other
    tokens are passed through untouched.
    """

    # Token types whose 'data' field holds the element's attribute dict.
    _tagTypes = frozenset (['StartTag', 'EmptyTag'])

    def __init__ (self, source, attributes):
        """
        :param source: token stream to wrap
        :param attributes: iterable of attribute names (str) to strip
        """
        Filter.__init__ (self, source)
        # lower-cased once here, so the per-token check is a set lookup
        self.attributes = set (map (str.lower, attributes))

    def __iter__(self):
        """
        Yield the tokens of the wrapped stream with unwanted attributes
        removed from tag tokens
        """
        for token in Filter.__iter__(self):
            data = token.get ('data')
            if data and token['type'] in self._tagTypes:
                # attribute dicts are keyed by (namespace, name) tuples
                token['data'] = {
                        (namespace, k): v
                        for (namespace, k), v in data.items ()
                        if k.lower () not in self.attributes}
            yield token
+
def main ():
def getStatusText (response):
text = response.get ('statusText')
@@ -334,7 +358,9 @@ def main ():
walker = ChromeTreeWalker (doc)
# remove script, to make the page static and noscript, because at the
# time we took the snapshot scripts were enabled
- stream = StripTagFilter (walker, ['script', 'noscript'])
+ disallowedTags = ['script', 'noscript']
+ disallowedAttributes = html.eventAttributes
+ stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
serializer = HTMLSerializer ()
httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
record = writer.create_warc_record (doc['documentURL'], 'response',
diff --git a/crocoite/html.py b/crocoite/html.py
new file mode 100644
index 0000000..75ac022
--- /dev/null
+++ b/crocoite/html.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2017 crocoite contributors
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
# source: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes
# Names of the global on* event handler attributes, kept in alphabetical order.
eventAttributes = [
    'onabort',
    'onautocomplete',
    'onautocompleteerror',
    'onblur',
    'oncancel',
    'oncanplay',
    'oncanplaythrough',
    'onchange',
    'onclick',
    'onclose',
    'oncontextmenu',
    'oncuechange',
    'ondblclick',
    'ondrag',
    'ondragend',
    'ondragenter',
    'ondragexit',
    'ondragleave',
    'ondragover',
    'ondragstart',
    'ondrop',
    'ondurationchange',
    'onemptied',
    'onended',
    'onerror',
    'onfocus',
    'oninput',
    'oninvalid',
    'onkeydown',
    'onkeypress',
    'onkeyup',
    'onload',
    'onloadeddata',
    'onloadedmetadata',
    'onloadstart',
    'onmousedown',
    'onmouseenter',
    'onmouseleave',
    'onmousemove',
    'onmouseout',
    'onmouseover',
    'onmouseup',
    'onmousewheel',
    'onpause',
    'onplay',
    'onplaying',
    'onprogress',
    'onratechange',
    'onreset',
    'onresize',
    'onscroll',
    'onseeked',
    'onseeking',
    'onselect',
    'onshow',
    'onsort',
    'onstalled',
    'onsubmit',
    'onsuspend',
    'ontimeupdate',
    'ontoggle',
    'onvolumechange',
    'onwaiting',
]