From 6b656f5ccffe79f7cdb11fce3dd72359b3cbbc1b Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun <lars@6xq.net>
Date: Wed, 29 Nov 2017 13:07:08 +0100
Subject: Refactoring

Reusable browser communication and WARC writing.
---
 crocoite/html.py | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)

(limited to 'crocoite/html.py')

diff --git a/crocoite/html.py b/crocoite/html.py
index 34fe26b..f891101 100644
--- a/crocoite/html.py
+++ b/crocoite/html.py
@@ -18,6 +18,10 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
+"""
+HTML helper
+"""
+
 # HTML void tags, see https://html.spec.whatwg.org/multipage/syntax.html#void-elements
 voidTags = {'area',
         'base',
@@ -99,3 +103,106 @@ eventAttributes = {'onabort',
         'onvolumechange',
         'onwaiting'}
 
+from html5lib.treewalkers.base import TreeWalker
+from html5lib.filters.base import Filter
+from html5lib.serializer import HTMLSerializer
+from html5lib import constants
+
+class ChromeTreeWalker (TreeWalker):
+    """
+    Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument
+    """
+
+    def recurse (self, node):
+        """ Yield the tokens for node and, depth-first, for its children """
+        name = node['nodeName']
+        children = node.get ('children', [])
+        if node.get ('nodeType') == 10:
+            # DocumentType node: nodeName is the doctype name, not an element
+            yield self.doctype (name, node.get ('publicId'), node.get ('systemId'))
+        elif name == '#text':
+            yield from self.text (node['nodeValue'])
+        elif name == '#comment':
+            yield self.comment (node['nodeValue'])
+        elif name == '#document':
+            for child in children:
+                yield from self.recurse (child)
+        elif name.startswith ('#'):
+            assert False, name
+        else:
+            namespace = constants.namespaces["html"]
+            # attributes arrive as a flat [name, value, name, value, ...] list
+            flat = node.get ('attributes', [])
+            attrs = {(namespace, k): v for k, v in zip (flat[::2], flat[1::2])}
+            if name.lower () in voidTags and not children:
+                yield from self.emptyTag (namespace, name, attrs)
+            else:
+                yield self.startTag (namespace, name, attrs)
+                for child in children:
+                    yield from self.recurse (child)
+                yield self.endTag (namespace, name) # must match the start tag
+
+    def __iter__ (self):
+        assert self.tree['nodeName'] == '#document'
+        return self.recurse (self.tree)
+
+    def split (self):
+        """
+        Split response returned by DOM.getDocument(pierce=True) into independent documents
+        """
+        def recurse (node):
+            contentDocument = node.get ('contentDocument')
+            if contentDocument:
+                assert contentDocument['nodeName'] == '#document'
+                yield contentDocument
+                yield from recurse (contentDocument)
+
+            for child in node.get ('children', []):
+                yield from recurse (child)
+
+        if self.tree['nodeName'] == '#document':
+            yield self.tree
+        yield from recurse (self.tree)
+
+class StripTagFilter (Filter):
+    """
+    Drop the given tags (names compared case-insensitively) and their subtrees
+    """
+
+    def __init__ (self, source, tags):
+        Filter.__init__ (self, source)
+        self.tags = set (map (str.lower, tags))
+
+    def __iter__(self):
+        depth = 0 # number of open elements that are currently being dropped
+        for token in Filter.__iter__(self):
+            kind = token['type']
+            drop = depth > 0 or (kind in {'StartTag', 'EmptyTag'} and
+                    token['name'].lower () in self.tags)
+            if not drop:
+                yield token
+            else:
+                # only StartTag opens a subtree; EmptyTag has no matching EndTag
+                depth += {'StartTag': 1, 'EndTag': -1}.get (kind, 0)
+
+class StripAttributeFilter (Filter):
+    """
+    Drop the given attributes (names compared case-insensitively, in any
+    namespace) from StartTag and EmptyTag tokens
+    """
+
+    def __init__ (self, source, attributes):
+        Filter.__init__ (self, source)
+        self.attributes = {name.lower () for name in attributes}
+
+    def __iter__(self):
+        blocked = self.attributes
+        for token in Filter.__iter__(self):
+            data = token.get ('data')
+            # 'data' of a tag token maps (namespace, name) to the value
+            if data and token['type'] in {'StartTag', 'EmptyTag'}:
+                # the attribute dict is replaced on the token itself
+                token['data'] = {key: value for key, value in data.items ()
+                        if key[1].lower () not in blocked}
+            yield token
+
-- 
cgit v1.2.3