diff options
Diffstat (limited to 'crocoite/cli.py')
-rw-r--r-- | crocoite/cli.py | 28 |
1 files changed, 27 insertions, 1 deletions
class StripAttributeFilter (Filter):
    """
    Remove arbitrary HTML attributes

    Wraps a token stream and drops every attribute whose name is listed in
    ``attributes`` from the tokens that carry attributes. Attribute names are
    compared case-insensitively. All other tokens pass through unchanged.

    :param source: Token stream to filter, handed to ``Filter.__init__``
    :param attributes: Iterable of attribute names (strings) to remove
    """

    # Token types whose ``data`` is an attribute mapping. Void elements
    # (``<img>``, ``<input>``, …) are reported as ``EmptyTag`` instead of
    # ``StartTag``, so both must be processed or their attributes survive.
    # NOTE(review): assumes EmptyTag tokens use the same
    # ``{(namespace, name): value}`` layout as StartTag — confirm against
    # the tree walker feeding this filter.
    tagTypes = frozenset (['StartTag', 'EmptyTag'])

    def __init__ (self, source, attributes):
        Filter.__init__ (self, source)
        # lowercase once here, so __iter__ only has to normalize the
        # attribute name found in the document
        self.attributes = set (map (str.lower, attributes))

    def __iter__ (self):
        """
        Yield every token of the source with disallowed attributes removed
        """
        for token in Filter.__iter__ (self):
            # check the type first: text and comment tokens also have a
            # ``data`` entry, but it is not an attribute mapping
            if token['type'] in self.tagTypes:
                data = token.get ('data')
                if data:
                    # build a new mapping instead of deleting while iterating
                    token['data'] = {
                            (namespace, k): v
                            for (namespace, k), v in data.items ()
                            if k.lower () not in self.attributes
                            }
            yield token