diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2017-11-29 13:07:08 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2017-11-29 13:07:08 +0100 |
commit | 6b656f5ccffe79f7cdb11fce3dd72359b3cbbc1b (patch) | |
tree | d822bf4e43baf23819a54f6c14dc3736c26e39d9 /crocoite | |
parent | 6b1d593ec841ebe18dcbdd18902f7faad0868fd4 (diff) | |
download | crocoite-6b656f5ccffe79f7cdb11fce3dd72359b3cbbc1b.tar.gz crocoite-6b656f5ccffe79f7cdb11fce3dd72359b3cbbc1b.tar.bz2 crocoite-6b656f5ccffe79f7cdb11fce3dd72359b3cbbc1b.zip |
Refactoring
Reusable browser communication and WARC writing.
Diffstat (limited to 'crocoite')
-rw-r--r-- | crocoite/__init__.py | 32 | ||||
-rw-r--r-- | crocoite/browser.py | 209 | ||||
-rw-r--r-- | crocoite/cli.py | 452 | ||||
-rw-r--r-- | crocoite/html.py | 107 | ||||
-rw-r--r-- | crocoite/warc.py | 174 |
5 files changed, 571 insertions, 403 deletions
diff --git a/crocoite/__init__.py b/crocoite/__init__.py index 066c83e..e23cd60 100644 --- a/crocoite/__init__.py +++ b/crocoite/__init__.py @@ -1,3 +1,33 @@ -#!/usr/bin/env python3 +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +import os +def packageData (path): + """ + Locate package data, see setup.py’s data_files + """ + return os.path.join (os.path.dirname (__file__), 'data', path) + +def packageUrl (path): + """ + Create URL for package data stored into WARC + """ + return 'urn:' + __package__ + ':' + path diff --git a/crocoite/browser.py b/crocoite/browser.py new file mode 100644 index 0000000..756fd64 --- /dev/null +++ b/crocoite/browser.py @@ -0,0 +1,209 @@ +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Chrome browser interactions. +""" + +import logging +from urllib.parse import urlsplit + +class Item: + """ + Simple wrapper containing Chrome request and response + """ + + def __init__ (self): + self._chromeRequest = None + self._chromeResponse = None + self.encodedDataLength = 0 + + def __repr__ (self): + return '<Item {}>'.format (self.request['url']) + + @property + def request (self): + return self._chromeRequest['request'] + + @property + def response (self): + return self._chromeResponse['response'] + + @property + def initiator (self): + return self._chromeRequest['initiator'] + + @property + def id (self): + return self._chromeRequest['requestId'] + + def setRequest (self, req): + self._chromeRequest = req + + def setResponse (self, resp): + self._chromeResponse = resp + +class SiteLoader: + """ + Load site in Chrome and monitor network requests + + XXX: track popup windows/new tabs and close them + """ + + allowedSchemes = {'http', 'https'} + + def __init__ (self, browser, url, logger=logging.getLogger(__name__)): + self.requests = {} + self.browser = browser + self.url = url + self.logger = logger + + self.tab = browser.new_tab() + + def __enter__ (self): + tab = self.tab + # setup callbacks + tab.Network.requestWillBeSent = self._requestWillBeSent + tab.Network.responseReceived = self._responseReceived + tab.Network.loadingFinished = self._loadingFinished + tab.Network.loadingFailed = self._loadingFailed + #tab.Page.loadEventFired = loadEventFired + + # start the tab + tab.start() + + # enable events + tab.Network.enable() + tab.Page.enable () + tab.Network.clearBrowserCache () + if tab.Network.canClearBrowserCookies ()['result']: + tab.Network.clearBrowserCookies () + + return self + + def __len__ (self): + return len (self.requests) + + def start (self): + self.tab.Page.navigate(url=self.url) + + def wait (self, timeout=1): + self.tab.wait (timeout) + + def waitIdle (self, idleTimeout=1, maxTimeout=60): + step = 0 + for i in range (0, maxTimeout): + self.wait (1) + if len (self) == 0: + step += 1 + if step > idleTimeout: + break + else: + step = 0 + + def stop (self): + """ + Stop loading site + + XXX: stop executing scripts + """ + + tab = self.tab + + tab.Page.stopLoading () + tab.Network.disable () + tab.Page.disable () + tab.Network.requestWillBeSent = None + tab.Network.responseReceived = None + tab.Network.loadingFinished = None + tab.Network.loadingFailed = None + tab.Page.loadEventFired = None + + def __exit__ (self, exc_type, exc_value, traceback): + self.tab.stop () + self.browser.close_tab(self.tab) + return False + + def loadingFinished (self, item): + self.logger.debug ('item finished {}'.format (item)) + + # internal chrome callbacks + def _requestWillBeSent (self, **kwargs): + reqId = kwargs['requestId'] + req = kwargs['request'] + + url = urlsplit (req['url']) + if url.scheme not in self.allowedSchemes: + return + + item = self.requests.get (reqId) + if item: + # redirects never “finish” loading, but yield another requestWillBeSent with this key set + redirectResp = kwargs.get ('redirectResponse') + if redirectResp: + resp = {'requestId': reqId, 'response': redirectResp} + item.setResponse (resp) + self.loadingFinished (item, redirect=True) + self.logger.debug ('redirected request {} has url {}'.format (reqId, req['url'])) + else: + self.logger.warn ('request {} already exists, overwriting.'.format (reqId)) + + item = Item () + item.setRequest (kwargs) + self.requests[reqId] = item + + def _responseReceived (self, **kwargs): + reqId = kwargs['requestId'] + item = self.requests.get (reqId) + if item is None: + return + + resp = kwargs['response'] + url = urlsplit (resp['url']) + if url.scheme in self.allowedSchemes: + self.logger.debug ('response {} {}'.format (reqId, resp['url'])) + item.setResponse (kwargs) + else: + self.logger.warn ('response: ignoring scheme {}'.format (url.scheme)) + + def _loadingFinished (self, **kwargs): + """ + Item was fully loaded. For some items the request body is not available + when responseReceived is fired, thus move everything here. + """ + reqId = kwargs['requestId'] + item = self.requests.pop (reqId, None) + if item is None: + # we never recorded this request (blacklisted scheme, for example) + return + req = item.request + resp = item.response + assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url']) + url = urlsplit (resp['url']) + if url.scheme in self.allowedSchemes: + self.logger.debug ('finished {} {}'.format (reqId, req['url'])) + item.encodedDataLength = kwargs['encodedDataLength'] + self.loadingFinished (item) + + def _loadingFailed (self, **kwargs): + reqId = kwargs['requestId'] + self.logger.debug ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason'))) + item = self.requests.pop (reqId, None) + diff --git a/crocoite/cli.py b/crocoite/cli.py index 640d207..a97f0ce 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -18,327 +18,23 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -import os, sys, json -import pychrome -from urllib.parse import urlsplit -from warcio.warcwriter import WARCWriter -from warcio.statusandheaders import StatusAndHeaders -from base64 import b64decode -import logging -from logging.handlers import BufferingHandler -from http.server import BaseHTTPRequestHandler -from io import BytesIO -import argparse -import tempfile, random - -from html5lib.treewalkers.base import TreeWalker -from html5lib.filters.base import Filter -from html5lib.serializer import HTMLSerializer -from html5lib import constants - -from . import html - -logger = logging.getLogger(__name__) - -# 10 MB, encoded! (i.e. actual data can be larger due to compression) -maxBodySize = 10*1024*1024 - -def packageData (path): - """ - Locate package data, see setup.py’s data_files - """ - return os.path.join (os.path.dirname (__file__), 'data', path) - -def packageUrl (path): - """ - Create URL for package data stored into WARC - """ - return 'urn:' + __package__ + ':' + path - -def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): - if length is None: - length = random.randint (16, 32) - return ''.join (map (lambda x: random.choice (chars), range (length))) - -class WARCLogHandler (BufferingHandler): - """ - Buffered log handler, flushing to warcio - """ - - contentType = 'text/plain; charset=utf-8' - - def __init__ (self, capacity, warcfile): - BufferingHandler.__init__ (self, capacity) - self.warcfile = warcfile - - def flush (self): - self.acquire () - try: - buf = '' - for record in self.buffer: - buf += self.format (record) - buf += '\n' - # XXX: record type? - record = self.warcfile.create_warc_record ( - packageUrl ('log'), 'metadata', - payload=BytesIO (buf.encode ('utf8')), - warc_headers_dict={'Content-Type': self.contentType}) - self.warcfile.write_record(record) - self.buffer = [] - finally: - self.release () - -class ChromeTreeWalker (TreeWalker): - """ - Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument - """ - - def recurse (self, node): - name = node['nodeName'] - if name.startswith ('#'): - if name == '#text': - yield from self.text (node['nodeValue']) - elif name == '#comment': - yield self.comment (node['nodeValue']) - elif name == '#document': - for child in node.get ('children', []): - yield from self.recurse (child) - else: - assert False, name - else: - default_namespace = constants.namespaces["html"] - - attributes = node.get ('attributes', []) - convertedAttr = {} - for i in range (0, len (attributes), 2): - convertedAttr[(default_namespace, attributes[i])] = attributes[i+1] - - children = node.get ('children', []) - if name.lower() in html.voidTags and not children: - yield from self.emptyTag (default_namespace, name, convertedAttr) - else: - yield self.startTag (default_namespace, name, convertedAttr) - for child in node.get ('children', []): - yield from self.recurse (child) - yield self.endTag ('', name) - - def __iter__ (self): - assert self.tree['nodeName'] == '#document' - return self.recurse (self.tree) - - def split (self): - """ - Split response returned by DOM.getDocument(pierce=True) into independent documents - """ - def recurse (node): - contentDocument = node.get ('contentDocument') - if contentDocument: - assert contentDocument['nodeName'] == '#document' - yield contentDocument - yield from recurse (contentDocument) - - for child in node.get ('children', []): - yield from recurse (child) - - if self.tree['nodeName'] == '#document': - yield self.tree - yield from recurse (self.tree) - -class StripTagFilter (Filter): - """ - Remove arbitrary tags - """ - - def __init__ (self, source, tags): - Filter.__init__ (self, source) - self.tags = set (map (str.lower, tags)) - - def __iter__(self): - delete = 0 - for token in Filter.__iter__(self): - tokenType = token['type'] - if tokenType in {'StartTag', 'EmptyTag'}: - if delete > 0 or token['name'].lower () in self.tags: - delete += 1 - if delete == 0: - yield token - if tokenType == 'EndTag' and delete > 0: - delete -= 1 - -class StripAttributeFilter (Filter): - """ - Remove arbitrary HTML attributes - """ - - def __init__ (self, source, attributes): - Filter.__init__ (self, source) - self.attributes = set (map (str.lower, attributes)) - - def __iter__(self): - default_namespace = constants.namespaces["html"] - for token in Filter.__iter__(self): - data = token.get ('data') - if data and token['type'] in {'StartTag', 'EmptyTag'}: - newdata = {} - for (namespace, k), v in data.items (): - if k.lower () not in self.attributes: - newdata[(namespace, k)] = v - token['data'] = newdata - yield token - def main (): - def getStatusText (response): - text = response.get ('statusText') - if text: - return text - text = BaseHTTPRequestHandler.responses.get (response['status']) - if text: - return text[0] - return 'No status text available' - - def requestWillBeSent (**kwargs): - req = kwargs.get ('request') - reqId = kwargs['requestId'] - url = urlsplit (req['url']) - if url.scheme in ('http', 'https'): - logger.debug ('sending {} {}'.format (reqId, req['url'])) - if reqId in requests: - redirectResp = kwargs.get ('redirectResponse') - if redirectResp: - requests[reqId]['response'] = redirectResp - # XXX: can we retrieve the original response body right now? - itemToWarc (reqId, requests[reqId], ignoreBody=True) - else: - logger.warn ('request {} already exists, overwriting.'.format (reqId)) - requests[reqId] = {} - requests[reqId]['request'] = req - requests[reqId]['initiator'] = kwargs['initiator'] - else: - logger.warn ('request: ignoring scheme {}'.format (url.scheme)) - - def responseReceived (**kwargs): - resp = kwargs['response'] - reqId = kwargs['requestId'] - url = urlsplit (resp['url']) - if url.scheme in ('http', 'https') and reqId in requests: - logger.debug ('response {} {}'.format (reqId, resp['url'])) - requests[reqId]['response'] = resp - else: - logger.warn ('response: ignoring scheme {}'.format (url.scheme)) - - def loadEventFired (**kwargs): - """ - Equivalent to DOM ready JavaScript event - """ - root = tab.DOM.getDocument () - rootId = root['root']['nodeId'] - links = tab.DOM.querySelectorAll (nodeId=rootId, selector='a') - #for i in links['nodeIds']: - # print ('link', tab.DOM.getAttributes (nodeId=i)) - - def loadingFinished (**kwargs): - """ - Item was fully loaded. For some items the request body is not available - when responseReceived is fired, thus move everything here. - """ - reqId = kwargs['requestId'] - # we never recorded this request - if reqId not in requests: - return - item = requests[reqId] - req = item['request'] - resp = item['response'] - url = urlsplit (resp['url']) - if url.scheme in ('http', 'https'): - logger.debug ('finished {} {}'.format (reqId, req['url'])) - itemToWarc (reqId, item, kwargs['encodedDataLength']) - del requests[reqId] - - def itemToWarc (reqId, item, encodedDataLength=0, ignoreBody=False): - req = item['request'] - resp = item['response'] - url = urlsplit (resp['url']) - - # overwrite request headers with those actually sent - newReqHeaders = resp.get ('requestHeaders') - if newReqHeaders: - req['headers'] = newReqHeaders - - postData = req.get ('postData') - if postData: - postData = BytesIO (postData.encode ('utf8')) - path = url.path - if url.query: - path += '?' + url.query - httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path), - req['headers'], protocol='HTTP/1.1', is_http_request=True) - initiator = item['initiator'] - warcHeaders = { - 'X-Chrome-Initiator': json.dumps (initiator), - } - record = writer.create_warc_record(req['url'], 'request', - payload=postData, http_headers=httpHeaders, - warc_headers_dict=warcHeaders) - writer.write_record(record) - concurrentTo = record.rec_headers['WARC-Record-ID'] - - # check body size first, since we’re loading everything into memory - if encodedDataLength < maxBodySize: - try: - if ignoreBody: - rawBody = b'' - base64Encoded = True - else: - body = tab.Network.getResponseBody (requestId=reqId) - rawBody = body['body'] - base64Encoded = body['base64Encoded'] - if base64Encoded: - rawBody = b64decode (rawBody) - else: - rawBody = rawBody.encode ('utf8') - - httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'], - getStatusText (resp)), resp['headers'], protocol='HTTP/1.1') - - # Content is saved decompressed and decoded, remove these headers - blacklistedHeaders = {'transfer-encoding', 'content-encoding'} - for h in blacklistedHeaders: - httpHeaders.remove_header (h) - - # chrome sends nothing but utf8 encoded text. Fortunately HTTP - # headers take precedence over the document’s <meta>, thus we can - # easily override those. - contentType = resp['mimeType'] - if not base64Encoded: - contentType += '; charset=utf-8' - httpHeaders.replace_header ('content-type', contentType) - - httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody))) - - warcHeaders = { - 'WARC-Concurrent-To': concurrentTo, - 'WARC-IP-Address': resp.get ('remoteIPAddress', ''), - 'X-Chrome-Protocol': resp.get ('protocol', ''), - 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), - 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')), - 'X-Chrome-Base64Body': str (base64Encoded), - } - record = writer.create_warc_record(resp['url'], 'response', - warc_headers_dict=warcHeaders, payload=BytesIO (rawBody), - http_headers=httpHeaders) - writer.write_record(record) - except pychrome.exceptions.CallMethodException: - logger.error ('no data for {} {} {}'.format (resp['url'], - resp['status'], reqId)) - else: - logger.warn ('body for {} is too large, {} bytes'.format (resp['url'], kwargs['encodedDataLength'])) - - def loadingFailed (**kwargs): - reqId = kwargs['requestId'] - logger.debug ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason'))) - if reqId in requests: - del requests[reqId] - + import os, random, logging, argparse + from io import BytesIO + import pychrome + from urllib.parse import urlsplit + from warcio.warcwriter import WARCWriter + from warcio.statusandheaders import StatusAndHeaders + from html5lib.serializer import HTMLSerializer + from . import html, packageData, packageUrl + from .warc import WarcLoader + from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker + + def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): + if length is None: + length = random.randint (16, 32) + return ''.join (map (lambda x: random.choice (chars), range (length))) + def getFormattedViewportMetrics (tab): layoutMetrics = tab.Page.getLayoutMetrics () # XXX: I’m not entirely sure which one we should use here @@ -384,7 +80,7 @@ def main (): 'X-Chrome-Viewport': viewport}) writer.write_record (record) - def emulateScreenMetrics (tab): + def emulateScreenMetrics (l): """ Emulate different screen sizes, causing the site to fetch assets (img srcset and css, for example) for different screen resolutions. @@ -404,13 +100,12 @@ def main (): {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False}, ] for s in sizes: - tab.Emulation.setDeviceMetricsOverride (**s) - tab.wait (1) + l.tab.Emulation.setDeviceMetricsOverride (**s) + l.wait (1) # XXX: this seems to be broken, it does not clear the override #tab.Emulation.clearDeviceMetricsOverride () # wait until assets finished loading - while len (requests) != 0: - tab.wait (1) + l.waitIdle (2, 10) def loadScripts (paths, scripts=[]): for p in paths: @@ -427,6 +122,7 @@ def main (): warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'}) writer.write_record (record) + logger = logging.getLogger(__name__) logging.basicConfig (level=logging.DEBUG) parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') @@ -434,7 +130,8 @@ def main (): parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival') parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout') parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer') - parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open') + parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size in bytes') + #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open') parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page') parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot') parser.add_argument('url', help='Website URL') @@ -448,93 +145,44 @@ def main (): onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname) stopVarname = newStopVarname - # temporary store for requests - requests = {} - - # create a browser instance browser = pychrome.Browser(url=args.browser) - # create a tab - tab = browser.new_tab() - - # setup callbacks - tab.Network.requestWillBeSent = requestWillBeSent - tab.Network.responseReceived = responseReceived - tab.Network.loadingFinished = loadingFinished - tab.Network.loadingFailed = loadingFailed - tab.Page.loadEventFired = loadEventFired - - # start the tab - tab.start() - fd = open (args.output, 'wb') writer = WARCWriter (fd, gzip=True) - version = tab.Browser.getVersion () - payload = { - 'software': __package__, - 'browser': version['product'], - 'useragent': version['userAgent'], - 'viewport': getFormattedViewportMetrics (tab), - } - warcinfo = writer.create_warcinfo_record (filename=None, info=payload) - writer.write_record (warcinfo) - - warcLogger = WARCLogHandler (args.logBuffer, writer) - logger.addHandler (warcLogger) - - # save onload script - writeScript ('onload', onload, writer) - # enable events - tab.Network.enable() - tab.Page.enable () - tab.Network.clearBrowserCache () - if tab.Network.canClearBrowserCookies ()['result']: - tab.Network.clearBrowserCookies () - # inject our custom javascript to the page before loading - tab.Page.addScriptToEvaluateOnNewDocument (source=onload) - - tab.Page.navigate(url=args.url) - - idleTimeout = 0 - for i in range (0, args.timeout): - tab.wait (1) - if len (requests) == 0: - idleTimeout += 1 - if idleTimeout > args.idleTimeout: - break - else: - idleTimeout = 0 + with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer, + maxBodySize=args.maxBodySize) as l: + version = l.tab.Browser.getVersion () + payload = { + 'software': __package__, + 'browser': version['product'], + 'useragent': version['userAgent'], + 'viewport': getFormattedViewportMetrics (l.tab), + } + warcinfo = writer.create_warcinfo_record (filename=None, info=payload) + writer.write_record (warcinfo) + # save onload script as well + writeScript ('onload', onload, writer) - # get ready for snapshot: stop loading and scripts, disable events - tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True) - # if we stopped due to timeout, wait for remaining assets - while len (requests) != 0: - tab.wait (1) + # inject our custom javascript to the page before loading + l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload) + l.start () - emulateScreenMetrics (tab) + l.waitIdle (args.idleTimeout, args.timeout) - tab.Page.stopLoading () - tab.Network.disable () - tab.Page.disable () - tab.Network.requestWillBeSent = None - tab.Network.responseReceived = None - tab.Network.loadingFinished = None - tab.Network.loadingFailed = None - tab.Page.loadEventFired = None + # get ready for snapshot: stop loading and scripts, disable events + l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True) + # if we stopped due to timeout, wait for remaining assets + l.waitIdle (2, 10) - script = loadScripts (args.onsnapshot) - writeScript ('onsnapshot', script, writer) - tab.Runtime.evaluate (expression=script, returnByValue=True) - writeDOMSnapshot (tab, writer) + emulateScreenMetrics (l) - tab.stop() - if not args.keepTab: - browser.close_tab(tab) + l.stop () - logger.removeHandler (warcLogger) - warcLogger.flush () - fd.close () + script = loadScripts (args.onsnapshot) + writeScript ('onsnapshot', script, writer) + l.tab.Runtime.evaluate (expression=script, returnByValue=True) + writeDOMSnapshot (l.tab, writer) return True diff --git a/crocoite/html.py b/crocoite/html.py index 34fe26b..f891101 100644 --- a/crocoite/html.py +++ b/crocoite/html.py @@ -18,6 +18,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. +""" +HTML helper +""" + # HTML void tags, see https://html.spec.whatwg.org/multipage/syntax.html#void-elements voidTags = {'area', 'base', @@ -99,3 +103,106 @@ eventAttributes = {'onabort', 'onvolumechange', 'onwaiting'} +from html5lib.treewalkers.base import TreeWalker +from html5lib.filters.base import Filter +from html5lib.serializer import HTMLSerializer +from html5lib import constants + +class ChromeTreeWalker (TreeWalker): + """ + Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument + """ + + def recurse (self, node): + name = node['nodeName'] + if name.startswith ('#'): + if name == '#text': + yield from self.text (node['nodeValue']) + elif name == '#comment': + yield self.comment (node['nodeValue']) + elif name == '#document': + for child in node.get ('children', []): + yield from self.recurse (child) + else: + assert False, name + else: + default_namespace = constants.namespaces["html"] + + attributes = node.get ('attributes', []) + convertedAttr = {} + for i in range (0, len (attributes), 2): + convertedAttr[(default_namespace, attributes[i])] = attributes[i+1] + + children = node.get ('children', []) + if name.lower() in voidTags and not children: + yield from self.emptyTag (default_namespace, name, convertedAttr) + else: + yield self.startTag (default_namespace, name, convertedAttr) + for child in node.get ('children', []): + yield from self.recurse (child) + yield self.endTag ('', name) + + def __iter__ (self): + assert self.tree['nodeName'] == '#document' + return self.recurse (self.tree) + + def split (self): + """ + Split response returned by DOM.getDocument(pierce=True) into independent documents + """ + def recurse (node): + contentDocument = node.get ('contentDocument') + if contentDocument: + assert contentDocument['nodeName'] == '#document' + yield contentDocument + yield from recurse (contentDocument) + + for child in node.get ('children', []): + yield from recurse (child) + + if self.tree['nodeName'] == '#document': + yield self.tree + yield from recurse (self.tree) + +class StripTagFilter (Filter): + """ + Remove arbitrary tags + """ + + def __init__ (self, source, tags): + Filter.__init__ (self, source) + self.tags = set (map (str.lower, tags)) + + def __iter__(self): + delete = 0 + for token in Filter.__iter__(self): + tokenType = token['type'] + if tokenType in {'StartTag', 'EmptyTag'}: + if delete > 0 or token['name'].lower () in self.tags: + delete += 1 + if delete == 0: + yield token + if tokenType == 'EndTag' and delete > 0: + delete -= 1 + +class StripAttributeFilter (Filter): + """ + Remove arbitrary HTML attributes + """ + + def __init__ (self, source, attributes): + Filter.__init__ (self, source) + self.attributes = set (map (str.lower, attributes)) + + def __iter__(self): + default_namespace = constants.namespaces["html"] + for token in Filter.__iter__(self): + data = token.get ('data') + if data and token['type'] in {'StartTag', 'EmptyTag'}: + newdata = {} + for (namespace, k), v in data.items (): + if k.lower () not in self.attributes: + newdata[(namespace, k)] = v + token['data'] = newdata + yield token + diff --git a/crocoite/warc.py b/crocoite/warc.py new file mode 100644 index 0000000..e06b1c7 --- /dev/null +++ b/crocoite/warc.py @@ -0,0 +1,174 @@ +# Copyright (c) 2017 crocoite contributors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Classes writing data to WARC files +""" + +import logging +import json +from .browser import SiteLoader +from . import packageUrl +from http.server import BaseHTTPRequestHandler +from base64 import b64decode +from io import BytesIO +from warcio.statusandheaders import StatusAndHeaders +from urllib.parse import urlsplit +from logging.handlers import BufferingHandler +import pychrome + +class WARCLogHandler (BufferingHandler): + """ + Buffered log handler, flushing to warcio + """ + + contentType = 'text/plain; charset=utf-8' + + def __init__ (self, capacity, warcfile): + BufferingHandler.__init__ (self, capacity) + self.warcfile = warcfile + + def flush (self): + self.acquire () + try: + if self.buffer: + buf = '' + for record in self.buffer: + buf += self.format (record) + buf += '\n' + # XXX: record type? + record = self.warcfile.create_warc_record ( + packageUrl ('log'), 'metadata', + payload=BytesIO (buf.encode ('utf8')), + warc_headers_dict={'Content-Type': self.contentType}) + self.warcfile.write_record(record) + self.buffer = [] + finally: + self.release () + +class WarcLoader (SiteLoader): + def __init__ (self, browser, url, writer, + logger=logging.getLogger(__name__), logBuffer=1000, + maxBodySize=10*1024*1024): + SiteLoader.__init__ (self, browser, url, logger) + self.writer = writer + self.maxBodySize = maxBodySize + self.warcLogger = WARCLogHandler (logBuffer, writer) + self.logger.addHandler (self.warcLogger) + + def __exit__ (self, exc_type, exc_value, traceback): + self.logger.removeHandler (self.warcLogger) + self.warcLogger.flush () + return SiteLoader.__exit__ (self, exc_type, exc_value, traceback) + + @staticmethod + def getStatusText (response): + text = response.get ('statusText') + if text: + return text + text = BaseHTTPRequestHandler.responses.get (response['status']) + if text: + return text[0] + return 'No status text available' + + def loadingFinished (self, item, redirect=False): + writer = self.writer + + req = item.request + reqId = item.id + resp = item.response + url = urlsplit (resp['url']) + + # overwrite request headers with those actually sent + newReqHeaders = resp.get ('requestHeaders') + if newReqHeaders: + req['headers'] = newReqHeaders + + postData = req.get ('postData') + if postData: + postData = BytesIO (postData.encode ('utf8')) + path = url.path + if url.query: + path += '?' + url.query + httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path), + req['headers'], protocol='HTTP/1.1', is_http_request=True) + initiator = item.initiator + warcHeaders = { + 'X-Chrome-Initiator': json.dumps (initiator), + } + record = writer.create_warc_record(req['url'], 'request', + payload=postData, http_headers=httpHeaders, + warc_headers_dict=warcHeaders) + writer.write_record(record) + concurrentTo = record.rec_headers['WARC-Record-ID'] + + # now the response + warcHeaders = { + 'WARC-Concurrent-To': concurrentTo, + 'WARC-IP-Address': resp.get ('remoteIPAddress', ''), + 'X-Chrome-Protocol': resp.get ('protocol', ''), + 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), + 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')), + } + + rawBody = b'' + base64Encoded = False + try: + # check body size first, since we’re loading everything into memory + if item.encodedDataLength < self.maxBodySize: + body = self.tab.Network.getResponseBody (requestId=reqId) + rawBody = body['body'] + base64Encoded = body['base64Encoded'] + if base64Encoded: + rawBody = b64decode (rawBody) + warcHeaders['X-Chrome-Base64Body'] = str (True) + else: + rawBody = rawBody.encode ('utf8') + else: + self.logger.error ('body for {} too large {} vs {}'.format (reqId, + item.encodedDataLength, self.maxBodySize)) + except pychrome.exceptions.CallMethodException: + self.logger.error ('no data for {} {} {}'.format (resp['url'], + resp['status'], reqId)) + + httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'], + self.getStatusText (resp)), resp['headers'], protocol='HTTP/1.1') + + # Content is saved decompressed and decoded, remove these headers + blacklistedHeaders = {'transfer-encoding', 'content-encoding'} + for h in blacklistedHeaders: + httpHeaders.remove_header (h) + + # chrome sends nothing but utf8 encoded text. Fortunately HTTP + # headers take precedence over the document’s <meta>, thus we can + # easily override those. + contentType = resp.get ('mimeType') + if contentType: + if not base64Encoded: + contentType += '; charset=utf-8' + httpHeaders.replace_header ('content-type', contentType) + + httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody))) + + record = writer.create_warc_record(resp['url'], 'response', + warc_headers_dict=warcHeaders, payload=BytesIO (rawBody), + http_headers=httpHeaders) + writer.write_record(record) + |