From 6b656f5ccffe79f7cdb11fce3dd72359b3cbbc1b Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Wed, 29 Nov 2017 13:07:08 +0100 Subject: Refactoring Reusable browser communication and WARC writing. --- crocoite/cli.py | 452 +++++++------------------------------------------------- 1 file changed, 50 insertions(+), 402 deletions(-) (limited to 'crocoite/cli.py') diff --git a/crocoite/cli.py b/crocoite/cli.py index 640d207..a97f0ce 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -18,327 +18,23 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -import os, sys, json -import pychrome -from urllib.parse import urlsplit -from warcio.warcwriter import WARCWriter -from warcio.statusandheaders import StatusAndHeaders -from base64 import b64decode -import logging -from logging.handlers import BufferingHandler -from http.server import BaseHTTPRequestHandler -from io import BytesIO -import argparse -import tempfile, random - -from html5lib.treewalkers.base import TreeWalker -from html5lib.filters.base import Filter -from html5lib.serializer import HTMLSerializer -from html5lib import constants - -from . import html - -logger = logging.getLogger(__name__) - -# 10 MB, encoded! (i.e. actual data can be larger due to compression) -maxBodySize = 10*1024*1024 - -def packageData (path): - """ - Locate package data, see setup.py’s data_files - """ - return os.path.join (os.path.dirname (__file__), 'data', path) - -def packageUrl (path): - """ - Create URL for package data stored into WARC - """ - return 'urn:' + __package__ + ':' + path - -def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): - if length is None: - length = random.randint (16, 32) - return ''.join (map (lambda x: random.choice (chars), range (length))) - -class WARCLogHandler (BufferingHandler): - """ - Buffered log handler, flushing to warcio - """ - - contentType = 'text/plain; charset=utf-8' - - def __init__ (self, capacity, warcfile): - BufferingHandler.__init__ (self, capacity) - self.warcfile = warcfile - - def flush (self): - self.acquire () - try: - buf = '' - for record in self.buffer: - buf += self.format (record) - buf += '\n' - # XXX: record type? - record = self.warcfile.create_warc_record ( - packageUrl ('log'), 'metadata', - payload=BytesIO (buf.encode ('utf8')), - warc_headers_dict={'Content-Type': self.contentType}) - self.warcfile.write_record(record) - self.buffer = [] - finally: - self.release () - -class ChromeTreeWalker (TreeWalker): - """ - Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument - """ - - def recurse (self, node): - name = node['nodeName'] - if name.startswith ('#'): - if name == '#text': - yield from self.text (node['nodeValue']) - elif name == '#comment': - yield self.comment (node['nodeValue']) - elif name == '#document': - for child in node.get ('children', []): - yield from self.recurse (child) - else: - assert False, name - else: - default_namespace = constants.namespaces["html"] - - attributes = node.get ('attributes', []) - convertedAttr = {} - for i in range (0, len (attributes), 2): - convertedAttr[(default_namespace, attributes[i])] = attributes[i+1] - - children = node.get ('children', []) - if name.lower() in html.voidTags and not children: - yield from self.emptyTag (default_namespace, name, convertedAttr) - else: - yield self.startTag (default_namespace, name, convertedAttr) - for child in node.get ('children', []): - yield from self.recurse (child) - yield self.endTag ('', name) - - def __iter__ (self): - assert self.tree['nodeName'] == '#document' - return self.recurse (self.tree) - - def split (self): - """ - Split response returned by DOM.getDocument(pierce=True) into independent documents - """ - def recurse (node): - contentDocument = node.get ('contentDocument') - if contentDocument: - assert contentDocument['nodeName'] == '#document' - yield contentDocument - yield from recurse (contentDocument) - - for child in node.get ('children', []): - yield from recurse (child) - - if self.tree['nodeName'] == '#document': - yield self.tree - yield from recurse (self.tree) - -class StripTagFilter (Filter): - """ - Remove arbitrary tags - """ - - def __init__ (self, source, tags): - Filter.__init__ (self, source) - self.tags = set (map (str.lower, tags)) - - def __iter__(self): - delete = 0 - for token in Filter.__iter__(self): - tokenType = token['type'] - if tokenType in {'StartTag', 'EmptyTag'}: - if delete > 0 or token['name'].lower () in self.tags: - delete += 1 - if delete == 0: - yield token - if tokenType == 'EndTag' and delete > 0: - delete -= 1 - -class StripAttributeFilter (Filter): - """ - Remove arbitrary HTML attributes - """ - - def __init__ (self, source, attributes): - Filter.__init__ (self, source) - self.attributes = set (map (str.lower, attributes)) - - def __iter__(self): - default_namespace = constants.namespaces["html"] - for token in Filter.__iter__(self): - data = token.get ('data') - if data and token['type'] in {'StartTag', 'EmptyTag'}: - newdata = {} - for (namespace, k), v in data.items (): - if k.lower () not in self.attributes: - newdata[(namespace, k)] = v - token['data'] = newdata - yield token - def main (): - def getStatusText (response): - text = response.get ('statusText') - if text: - return text - text = BaseHTTPRequestHandler.responses.get (response['status']) - if text: - return text[0] - return 'No status text available' - - def requestWillBeSent (**kwargs): - req = kwargs.get ('request') - reqId = kwargs['requestId'] - url = urlsplit (req['url']) - if url.scheme in ('http', 'https'): - logger.debug ('sending {} {}'.format (reqId, req['url'])) - if reqId in requests: - redirectResp = kwargs.get ('redirectResponse') - if redirectResp: - requests[reqId]['response'] = redirectResp - # XXX: can we retrieve the original response body right now? - itemToWarc (reqId, requests[reqId], ignoreBody=True) - else: - logger.warn ('request {} already exists, overwriting.'.format (reqId)) - requests[reqId] = {} - requests[reqId]['request'] = req - requests[reqId]['initiator'] = kwargs['initiator'] - else: - logger.warn ('request: ignoring scheme {}'.format (url.scheme)) - - def responseReceived (**kwargs): - resp = kwargs['response'] - reqId = kwargs['requestId'] - url = urlsplit (resp['url']) - if url.scheme in ('http', 'https') and reqId in requests: - logger.debug ('response {} {}'.format (reqId, resp['url'])) - requests[reqId]['response'] = resp - else: - logger.warn ('response: ignoring scheme {}'.format (url.scheme)) - - def loadEventFired (**kwargs): - """ - Equivalent to DOM ready JavaScript event - """ - root = tab.DOM.getDocument () - rootId = root['root']['nodeId'] - links = tab.DOM.querySelectorAll (nodeId=rootId, selector='a') - #for i in links['nodeIds']: - # print ('link', tab.DOM.getAttributes (nodeId=i)) - - def loadingFinished (**kwargs): - """ - Item was fully loaded. For some items the request body is not available - when responseReceived is fired, thus move everything here. - """ - reqId = kwargs['requestId'] - # we never recorded this request - if reqId not in requests: - return - item = requests[reqId] - req = item['request'] - resp = item['response'] - url = urlsplit (resp['url']) - if url.scheme in ('http', 'https'): - logger.debug ('finished {} {}'.format (reqId, req['url'])) - itemToWarc (reqId, item, kwargs['encodedDataLength']) - del requests[reqId] - - def itemToWarc (reqId, item, encodedDataLength=0, ignoreBody=False): - req = item['request'] - resp = item['response'] - url = urlsplit (resp['url']) - - # overwrite request headers with those actually sent - newReqHeaders = resp.get ('requestHeaders') - if newReqHeaders: - req['headers'] = newReqHeaders - - postData = req.get ('postData') - if postData: - postData = BytesIO (postData.encode ('utf8')) - path = url.path - if url.query: - path += '?' + url.query - httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path), - req['headers'], protocol='HTTP/1.1', is_http_request=True) - initiator = item['initiator'] - warcHeaders = { - 'X-Chrome-Initiator': json.dumps (initiator), - } - record = writer.create_warc_record(req['url'], 'request', - payload=postData, http_headers=httpHeaders, - warc_headers_dict=warcHeaders) - writer.write_record(record) - concurrentTo = record.rec_headers['WARC-Record-ID'] - - # check body size first, since we’re loading everything into memory - if encodedDataLength < maxBodySize: - try: - if ignoreBody: - rawBody = b'' - base64Encoded = True - else: - body = tab.Network.getResponseBody (requestId=reqId) - rawBody = body['body'] - base64Encoded = body['base64Encoded'] - if base64Encoded: - rawBody = b64decode (rawBody) - else: - rawBody = rawBody.encode ('utf8') - - httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'], - getStatusText (resp)), resp['headers'], protocol='HTTP/1.1') - - # Content is saved decompressed and decoded, remove these headers - blacklistedHeaders = {'transfer-encoding', 'content-encoding'} - for h in blacklistedHeaders: - httpHeaders.remove_header (h) - - # chrome sends nothing but utf8 encoded text. Fortunately HTTP - # headers take precedence over the document’s , thus we can - # easily override those. - contentType = resp['mimeType'] - if not base64Encoded: - contentType += '; charset=utf-8' - httpHeaders.replace_header ('content-type', contentType) - - httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody))) - - warcHeaders = { - 'WARC-Concurrent-To': concurrentTo, - 'WARC-IP-Address': resp.get ('remoteIPAddress', ''), - 'X-Chrome-Protocol': resp.get ('protocol', ''), - 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), - 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')), - 'X-Chrome-Base64Body': str (base64Encoded), - } - record = writer.create_warc_record(resp['url'], 'response', - warc_headers_dict=warcHeaders, payload=BytesIO (rawBody), - http_headers=httpHeaders) - writer.write_record(record) - except pychrome.exceptions.CallMethodException: - logger.error ('no data for {} {} {}'.format (resp['url'], - resp['status'], reqId)) - else: - logger.warn ('body for {} is too large, {} bytes'.format (resp['url'], kwargs['encodedDataLength'])) - - def loadingFailed (**kwargs): - reqId = kwargs['requestId'] - logger.debug ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason'))) - if reqId in requests: - del requests[reqId] - + import os, random, logging, argparse + from io import BytesIO + import pychrome + from urllib.parse import urlsplit + from warcio.warcwriter import WARCWriter + from warcio.statusandheaders import StatusAndHeaders + from html5lib.serializer import HTMLSerializer + from . import html, packageData, packageUrl + from .warc import WarcLoader + from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker + + def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): + if length is None: + length = random.randint (16, 32) + return ''.join (map (lambda x: random.choice (chars), range (length))) + def getFormattedViewportMetrics (tab): layoutMetrics = tab.Page.getLayoutMetrics () # XXX: I’m not entirely sure which one we should use here @@ -384,7 +80,7 @@ def main (): 'X-Chrome-Viewport': viewport}) writer.write_record (record) - def emulateScreenMetrics (tab): + def emulateScreenMetrics (l): """ Emulate different screen sizes, causing the site to fetch assets (img srcset and css, for example) for different screen resolutions. @@ -404,13 +100,12 @@ def main (): {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False}, ] for s in sizes: - tab.Emulation.setDeviceMetricsOverride (**s) - tab.wait (1) + l.tab.Emulation.setDeviceMetricsOverride (**s) + l.wait (1) # XXX: this seems to be broken, it does not clear the override #tab.Emulation.clearDeviceMetricsOverride () # wait until assets finished loading - while len (requests) != 0: - tab.wait (1) + l.waitIdle (2, 10) def loadScripts (paths, scripts=[]): for p in paths: @@ -427,6 +122,7 @@ def main (): warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'}) writer.write_record (record) + logger = logging.getLogger(__name__) logging.basicConfig (level=logging.DEBUG) parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') @@ -434,7 +130,8 @@ def main (): parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival') parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout') parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer') - parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open') + parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size in bytes') + #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open') parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page') parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot') parser.add_argument('url', help='Website URL') @@ -448,93 +145,44 @@ def main (): onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname) stopVarname = newStopVarname - # temporary store for requests - requests = {} - - # create a browser instance browser = pychrome.Browser(url=args.browser) - # create a tab - tab = browser.new_tab() - - # setup callbacks - tab.Network.requestWillBeSent = requestWillBeSent - tab.Network.responseReceived = responseReceived - tab.Network.loadingFinished = loadingFinished - tab.Network.loadingFailed = loadingFailed - tab.Page.loadEventFired = loadEventFired - - # start the tab - tab.start() - fd = open (args.output, 'wb') writer = WARCWriter (fd, gzip=True) - version = tab.Browser.getVersion () - payload = { - 'software': __package__, - 'browser': version['product'], - 'useragent': version['userAgent'], - 'viewport': getFormattedViewportMetrics (tab), - } - warcinfo = writer.create_warcinfo_record (filename=None, info=payload) - writer.write_record (warcinfo) - - warcLogger = WARCLogHandler (args.logBuffer, writer) - logger.addHandler (warcLogger) - - # save onload script - writeScript ('onload', onload, writer) - # enable events - tab.Network.enable() - tab.Page.enable () - tab.Network.clearBrowserCache () - if tab.Network.canClearBrowserCookies ()['result']: - tab.Network.clearBrowserCookies () - # inject our custom javascript to the page before loading - tab.Page.addScriptToEvaluateOnNewDocument (source=onload) - - tab.Page.navigate(url=args.url) - - idleTimeout = 0 - for i in range (0, args.timeout): - tab.wait (1) - if len (requests) == 0: - idleTimeout += 1 - if idleTimeout > args.idleTimeout: - break - else: - idleTimeout = 0 + with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer, + maxBodySize=args.maxBodySize) as l: + version = l.tab.Browser.getVersion () + payload = { + 'software': __package__, + 'browser': version['product'], + 'useragent': version['userAgent'], + 'viewport': getFormattedViewportMetrics (l.tab), + } + warcinfo = writer.create_warcinfo_record (filename=None, info=payload) + writer.write_record (warcinfo) + # save onload script as well + writeScript ('onload', onload, writer) - # get ready for snapshot: stop loading and scripts, disable events - tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True) - # if we stopped due to timeout, wait for remaining assets - while len (requests) != 0: - tab.wait (1) + # inject our custom javascript to the page before loading + l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload) + l.start () - emulateScreenMetrics (tab) + l.waitIdle (args.idleTimeout, args.timeout) - tab.Page.stopLoading () - tab.Network.disable () - tab.Page.disable () - tab.Network.requestWillBeSent = None - tab.Network.responseReceived = None - tab.Network.loadingFinished = None - tab.Network.loadingFailed = None - tab.Page.loadEventFired = None + # get ready for snapshot: stop loading and scripts, disable events + l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True) + # if we stopped due to timeout, wait for remaining assets + l.waitIdle (2, 10) - script = loadScripts (args.onsnapshot) - writeScript ('onsnapshot', script, writer) - tab.Runtime.evaluate (expression=script, returnByValue=True) - writeDOMSnapshot (tab, writer) + emulateScreenMetrics (l) - tab.stop() - if not args.keepTab: - browser.close_tab(tab) + l.stop () - logger.removeHandler (warcLogger) - warcLogger.flush () - fd.close () + script = loadScripts (args.onsnapshot) + writeScript ('onsnapshot', script, writer) + l.tab.Runtime.evaluate (expression=script, returnByValue=True) + writeDOMSnapshot (l.tab, writer) return True -- cgit v1.2.3