From ca01f82227a8b79f1cbc4f5e0be5434804dc3c0e Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Mon, 20 Nov 2017 19:19:05 +0100 Subject: Add page created from DOM snapshot --- README.rst | 20 +++++++++-- crocoite/cli.py | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- setup.py | 1 + 3 files changed, 119 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index 7eea272..f66da27 100644 --- a/README.rst +++ b/README.rst @@ -10,6 +10,7 @@ Dependencies - Python 3 - pychrome_ - warcio_ +- html5lib .. _pychrome: https://github.com/fate0/pychrome .. _warcio: https://github.com/webrecorder/warcio @@ -34,7 +35,20 @@ Caveats ------- - Original HTTP requests/responses are not available. They are rebuilt from - data available. Character encoding for text documents is changed to UTF-8. -- Some sites request different assets based on screen resolution, some fetch - different scripts based on user agent. + parsed data. Character encoding for text documents is changed to UTF-8. +- Some sites request assets based on screen resolution, pixel ratio and + supported image formats (webp). Replaying those with different parameters + won’t work, since assets for those are missing. Example: missguided.com. +- Some sites fetch different scripts based on user agent. Example: youtube.com. +- Requests containing randomly generated JavaScript callback function names + won’t work. Example: weather.com. + +Most of these issues can be worked around by using the DOM snapshot, which is +also saved. This causes its own set of issues though: + +- JavaScript-based navigation does not work. +- Scripts modifying styles based on scrolling position are currently stuck in + their end-of-page state. Example: twitter.com. +- CSS-based asset loading (screen size, pixel ratio, …) still does not work. +- Canvas contents are probably not preserved. 
diff --git a/crocoite/cli.py b/crocoite/cli.py index 24192fe..aee00e3 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -31,6 +31,11 @@ from io import BytesIO import argparse import tempfile +from html5lib.treewalkers.base import TreeWalker +from html5lib.filters.base import Filter +from html5lib.serializer import HTMLSerializer +from html5lib import constants + logger = logging.getLogger(__name__) # 10 MB, encoded! (i.e. actual data can be larger due to compression) @@ -76,6 +81,58 @@ class WARCLogHandler (BufferingHandler): finally: self.release () +class ChromeTreeWalker (TreeWalker): + """ + Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument + """ + + def recurse (self, node): + name = node['nodeName'] + if name.startswith ('#'): + if name == '#text': + yield from self.text (node['nodeValue']) + elif name == '#comment': + yield self.comment (node['nodeValue']) + elif name == '#document': + for child in node.get ('children', []): + yield from self.recurse (child) + else: + default_namespace = constants.namespaces["html"] + attributes = node.get ('attributes', []) + convertedAttr = {} + for i in range (0, len (attributes), 2): + convertedAttr[(default_namespace, attributes[i])] = attributes[i+1] + yield self.startTag (default_namespace, name, convertedAttr) + for child in node.get ('children', []): + yield from self.recurse (child) + yield self.endTag ('', name) + + def __iter__ (self): + from pprint import pprint + assert self.tree['nodeName'] == '#document' + return self.recurse (self.tree) + +class StripTagFilter (Filter): + """ + Remove arbitrary tags + """ + + def __init__ (self, source, tags): + Filter.__init__ (self, source) + self.tags = set (map (str.lower, tags)) + + def __iter__(self): + delete = 0 + for token in Filter.__iter__(self): + tokenType = token['type'] + if tokenType == 'StartTag': + if delete > 0 or token['name'].lower () in self.tags: + delete += 1 + if delete == 0: + yield token + if tokenType == 'EndTag' and 
delete > 0: + delete -= 1 + def main (): def getStatusText (response): text = response.get ('statusText') @@ -219,7 +276,7 @@ def main (): writer.write_record(record) except pychrome.exceptions.CallMethodException: logger.error ('no data for {} {} {}'.format (resp['url'], - resp['status'], kwargs.get ('requestId'))) + resp['status'], reqId)) else: logger.warn ('body for {} is too large, {} bytes'.format (resp['url'], kwargs['encodedDataLength'])) @@ -229,6 +286,36 @@ def main (): if reqId in requests: del requests[reqId] + def getFormattedViewportMetrics (tab): + layoutMetrics = tab.Page.getLayoutMetrics () + # XXX: I’m not entirely sure which one we should use here + return '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'], + layoutMetrics['layoutViewport']['clientHeight']) + + def writeDOMSnapshot (tab, writer): + """ + Get a DOM snapshot of tab and write it to WARC. + + We could use DOMSnapshot.getSnapshot here, but the API is not stable + yet. Also computed styles are not really necessary here. + + XXX: Currently writes a response, when it should use “resource”. pywb + can’t handle that though. 
+ """ + viewport = getFormattedViewportMetrics (tab) + dom = tab.DOM.getDocument (depth=-1) + # remove script, to make the page static and noscript, because at the + # time we took the snapshot scripts were enabled + stream = StripTagFilter (ChromeTreeWalker (dom['root']), ['script', 'noscript']) + serializer = HTMLSerializer () + httpHeaders = StatusAndHeaders('200 OK', {}, protocol='HTTP/1.1') + record = writer.create_warc_record (dom['root']['documentURL'], 'response', + payload=BytesIO (serializer.render (stream, 'utf8')), + http_headers=httpHeaders, + warc_headers_dict={'X-DOM-Snapshot': str (True), + 'X-Chrome-Viewport': viewport}) + writer.write_record (record) + logging.basicConfig (level=logging.DEBUG) parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') @@ -268,15 +355,11 @@ def main (): fd = open (args.output, 'wb') writer = WARCWriter (fd, gzip=True) version = tab.Browser.getVersion () - # assuming these don’t change - layoutMetrics = tab.Page.getLayoutMetrics () payload = { 'software': __package__, 'browser': version['product'], 'useragent': version['userAgent'], - # XXX: I’m not entirely sure which one we should use here - 'viewport': '{}x{}'.format (layoutMetrics['layoutViewport']['clientWidth'], - layoutMetrics['layoutViewport']['clientHeight']) + 'viewport': getFormattedViewportMetrics (tab), } warcinfo = writer.create_warcinfo_record (filename=None, info=payload) writer.write_record (warcinfo) @@ -308,6 +391,18 @@ def main (): if len (requests) == 0: break + # disable events + tab.Page.stopLoading () + tab.Network.disable () + tab.Page.disable () + tab.Network.requestWillBeSent = None + tab.Network.responseReceived = None + tab.Network.loadingFinished = None + tab.Network.loadingFailed = None + tab.Page.loadEventFired = None + + writeDOMSnapshot (tab, writer) + tab.stop() if not args.keepTab: browser.close_tab(tab) diff --git a/setup.py b/setup.py index 939cec9..52747c0 100644 --- a/setup.py +++ b/setup.py @@ 
-12,6 +12,7 @@ setup( install_requires=[ 'pychrome', 'warcio', + 'html5lib>=0.999999999', ], entry_points={ 'console_scripts': [ -- cgit v1.2.3