summary refs log tree commit diff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2017-11-20 19:19:05 +0100
committer Lars-Dominik Braun <lars@6xq.net> 2017-11-20 19:25:33 +0100
commit ca01f82227a8b79f1cbc4f5e0be5434804dc3c0e (patch)
tree dd8aafb9b1672f70985eb5dd14635eb8635dd5e3
parent 0b8a8e88a3c33c14e52241190ee6478cb2acd49d (diff)
download crocoite-ca01f82227a8b79f1cbc4f5e0be5434804dc3c0e.tar.gz
crocoite-ca01f82227a8b79f1cbc4f5e0be5434804dc3c0e.tar.bz2
crocoite-ca01f82227a8b79f1cbc4f5e0be5434804dc3c0e.zip
Add page created from DOM snapshot
-rw-r--r-- README.rst 20
-rw-r--r-- crocoite/cli.py 107
-rw-r--r-- setup.py 1
3 files changed, 119 insertions, 9 deletions
diff --git a/README.rst b/README.rst
index 7eea272..f66da27 100644
--- a/README.rst
+++ b/README.rst
@@ -10,6 +10,7 @@ Dependencies
- Python 3
- pychrome_
- warcio_
+- html5lib
.. _pychrome: https://github.com/fate0/pychrome
.. _warcio: https://github.com/webrecorder/warcio
@@ -34,7 +35,20 @@ Caveats
-------
- Original HTTP requests/responses are not available. They are rebuilt from
- data available. Character encoding for text documents is changed to UTF-8.
-- Some sites request different assets based on screen resolution, some fetch
- different scripts based on user agent.
+ parsed data. Character encoding for text documents is changed to UTF-8.
+- Some sites request assets based on screen resolution, pixel ratio and
+ supported image formats (webp). Replaying those with different parameters
+ won’t work, since assets for those are missing. Example: missguided.com.
+- Some sites fetch different scripts based on the user agent. Example: youtube.com.
+- Requests containing randomly generated JavaScript callback function names
+ won’t work. Example: weather.com.
+
+Most of these issues can be worked around by using the DOM snapshot, which is
+also saved. This causes its own set of issues though:
+
+- JavaScript-based navigation does not work.
+- Scripts modifying styles based on scrolling position are stuck in the
+ end-of-page state at the moment. Example: twitter.com
+- CSS-based asset loading (screen size, pixel ratio, …) still does not work.
+- Canvas contents are probably not preserved.
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 24192fe..aee00e3 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -31,6 +31,11 @@ from io import BytesIO
import argparse
import tempfile
+from html5lib.treewalkers.base import TreeWalker
+from html5lib.filters.base import Filter
+from html5lib.serializer import HTMLSerializer
+from html5lib import constants
+
logger = logging.getLogger(__name__)
# 10 MB, encoded! (i.e. actual data can be larger due to compression)
@@ -76,6 +81,58 @@ class WARCLogHandler (BufferingHandler):
finally:
self.release ()
+class ChromeTreeWalker (TreeWalker):
+ """
+ Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument
+ """
+
+ def recurse (self, node):
+ name = node['nodeName']
+ if name.startswith ('#'):
+ if name == '#text':
+ yield from self.text (node['nodeValue'])
+ elif name == '#comment':
+ yield self.comment (node['nodeValue'])
+ elif name == '#document':
+ for child in node.get ('children', []):
+ yield from self.recurse (child)
+ else:
+ default_namespace = constants.namespaces["html"]
+ attributes = node.get ('attributes', [])
+ convertedAttr = {}
+ for i in range (0, len (attributes), 2):
+ convertedAttr[(default_namespace, attributes[i])] = attributes[i+1]
+ yield self.startTag (default_namespace, name, convertedAttr)
+ for child in node.get ('children', []):
+ yield from self.recurse (child)
+ yield self.endTag ('', name)
+
+ def __iter__ (self):
+ from pprint import pprint
+ assert self.tree['nodeName'] == '#document'
+ return self.recurse (self.tree)
+
+class StripTagFilter (Filter):
+ """
+ Remove arbitrary tags
+ """
+
+ def __init__ (self, source, tags):
+ Filter.__init__ (self, source)
+ self.tags = set (map (str.lower, tags))
+
+ def __iter__(self):
+ delete = 0
+ for token in Filter.__iter__(self):
+ tokenType = token['type']
+ if tokenType == 'StartTag':
+ if delete > 0 or token['name'].lower () in self.tags:
+ delete += 1
+ if delete == 0:
+ yield token
+ if tokenType == 'EndTag' and delete > 0:
+ delete -= 1
+
def main ():
def getStatusText (response):
text = response.get ('statusText')
@@ -219,7 +276,7 @@ def main ():
writer.write_record(record)
except pychrome.exceptions.CallMethodException:
logger.error ('no data for {} {} {}'.format (resp['url'],
- resp['status'], kwargs.get ('requestId')))
+ resp['status'], reqId))
else:
logger.warn ('body for {} is too large, {} bytes'.format (resp['url'], kwargs['encodedDataLength']))
@@ -229,6 +286,36 @@ def main ():
if reqId in requests:
del requests[reqId]
+ def getFormattedViewportMetrics (tab):
+ layoutMetrics = tab.Page.getLayoutMetrics ()
+ # XXX: I’m not entirely sure which one we should use here
+ return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
+ layoutMetrics['layoutViewport']['clientHeight'])
+
+ def writeDOMSnapshot (tab, writer):
+ """
+ Get a DOM snapshot of tab and write it to WARC.
+
+ We could use DOMSnapshot.getSnapshot here, but the API is not stable
+ yet. Also computed styles are not really necessary here.
+
+ XXX: Currently writes a response, when it should use “resource”. pywb
+ can’t handle that though.
+ """
+ viewport = getFormattedViewportMetrics (tab)
+ dom = tab.DOM.getDocument (depth=-1)
+ # remove script, to make the page static and noscript, because at the
+ # time we took the snapshot scripts were enabled
+ stream = StripTagFilter (ChromeTreeWalker (dom['root']), ['script', 'noscript'])
+ serializer = HTMLSerializer ()
+ httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1')
+ record = writer.create_warc_record (dom['root']['documentURL'], 'response',
+ payload=BytesIO (serializer.render (stream, 'utf8')),
+ http_headers=httpHeaders,
+ warc_headers_dict={'X-DOM-Snapshot': str (True),
+ 'X-Chrome-Viewport': viewport})
+ writer.write_record (record)
+
logging.basicConfig (level=logging.DEBUG)
parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
@@ -268,15 +355,11 @@ def main ():
fd = open (args.output, 'wb')
writer = WARCWriter (fd, gzip=True)
version = tab.Browser.getVersion ()
- # assuming these don’t change
- layoutMetrics = tab.Page.getLayoutMetrics ()
payload = {
'software': __package__,
'browser': version['product'],
'useragent': version['userAgent'],
- # XXX: I’m not entirely sure which one we should use here
- 'viewport': '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'],
- layoutMetrics['layoutViewport']['clientHeight'])
+ 'viewport': getFormattedViewportMetrics (tab),
}
warcinfo = writer.create_warcinfo_record (filename=None, info=payload)
writer.write_record (warcinfo)
@@ -308,6 +391,18 @@ def main ():
if len (requests) == 0:
break
+ # disable events
+ tab.Page.stopLoading ()
+ tab.Network.disable ()
+ tab.Page.disable ()
+ tab.Network.requestWillBeSent = None
+ tab.Network.responseReceived = None
+ tab.Network.loadingFinished = None
+ tab.Network.loadingFailed = None
+ tab.Page.loadEventFired = None
+
+ writeDOMSnapshot (tab, writer)
+
tab.stop()
if not args.keepTab:
browser.close_tab(tab)
diff --git a/setup.py b/setup.py
index 939cec9..52747c0 100644
--- a/setup.py
+++ b/setup.py
@@ -12,6 +12,7 @@ setup(
install_requires=[
'pychrome',
'warcio',
+ 'html5lib>=0.999999999',
],
entry_points={
'console_scripts': [