diff options
| -rw-r--r-- | crocoite/__init__.py | 32 | ||||
| -rw-r--r-- | crocoite/browser.py | 209 | ||||
| -rw-r--r-- | crocoite/cli.py | 452 | ||||
| -rw-r--r-- | crocoite/html.py | 107 | ||||
| -rw-r--r-- | crocoite/warc.py | 174 | 
5 files changed, 571 insertions, 403 deletions
| diff --git a/crocoite/__init__.py b/crocoite/__init__.py index 066c83e..e23cd60 100644 --- a/crocoite/__init__.py +++ b/crocoite/__init__.py @@ -1,3 +1,33 @@ -#!/usr/bin/env python3 +# Copyright (c) 2017 crocoite contributors +#  +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +#  +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +#  +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+import os +def packageData (path): +    """ +    Locate package data, see setup.py’s data_files +    """ +    return os.path.join (os.path.dirname (__file__), 'data', path) + +def packageUrl (path): +    """ +    Create URL for package data stored into WARC +    """ +    return 'urn:' + __package__ + ':' + path diff --git a/crocoite/browser.py b/crocoite/browser.py new file mode 100644 index 0000000..756fd64 --- /dev/null +++ b/crocoite/browser.py @@ -0,0 +1,209 @@ +# Copyright (c) 2017 crocoite contributors +#  +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +#  +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +#  +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Chrome browser interactions. 
+""" + +import logging +from urllib.parse import urlsplit + +class Item: +    """ +    Simple wrapper containing Chrome request and response +    """ + +    def __init__ (self): +        self._chromeRequest = None +        self._chromeResponse = None +        self.encodedDataLength = 0 + +    def __repr__ (self): +        return '<Item {}>'.format (self.request['url']) + +    @property +    def request (self): +        return self._chromeRequest['request'] + +    @property +    def response (self): +        return self._chromeResponse['response'] + +    @property +    def initiator (self): +        return self._chromeRequest['initiator'] + +    @property +    def id (self): +        return self._chromeRequest['requestId'] + +    def setRequest (self, req): +        self._chromeRequest = req + +    def setResponse (self, resp): +        self._chromeResponse = resp + +class SiteLoader: +    """ +    Load site in Chrome and monitor network requests + +    XXX: track popup windows/new tabs and close them +    """ + +    allowedSchemes = {'http', 'https'} + +    def __init__ (self, browser, url, logger=logging.getLogger(__name__)): +        self.requests = {} +        self.browser = browser +        self.url = url +        self.logger = logger + +        self.tab = browser.new_tab() + +    def __enter__ (self): +        tab = self.tab +        # setup callbacks +        tab.Network.requestWillBeSent = self._requestWillBeSent +        tab.Network.responseReceived = self._responseReceived +        tab.Network.loadingFinished = self._loadingFinished +        tab.Network.loadingFailed = self._loadingFailed +        #tab.Page.loadEventFired = loadEventFired + +        # start the tab +        tab.start() + +        # enable events +        tab.Network.enable() +        tab.Page.enable () +        tab.Network.clearBrowserCache () +        if tab.Network.canClearBrowserCookies ()['result']: +            tab.Network.clearBrowserCookies () + +        return self + +    def 
__len__ (self): +        return len (self.requests) + +    def start (self): +        self.tab.Page.navigate(url=self.url) + +    def wait (self, timeout=1): +        self.tab.wait (timeout) + +    def waitIdle (self, idleTimeout=1, maxTimeout=60): +        step = 0 +        for i in range (0, maxTimeout): +            self.wait (1) +            if len (self) == 0: +                step += 1 +                if step > idleTimeout: +                    break +            else: +                step = 0 + +    def stop (self): +        """ +        Stop loading site + +        XXX: stop executing scripts +        """ + +        tab = self.tab + +        tab.Page.stopLoading () +        tab.Network.disable () +        tab.Page.disable () +        tab.Network.requestWillBeSent = None +        tab.Network.responseReceived = None +        tab.Network.loadingFinished = None +        tab.Network.loadingFailed = None +        tab.Page.loadEventFired = None + +    def __exit__ (self, exc_type, exc_value, traceback): +        self.tab.stop () +        self.browser.close_tab(self.tab) +        return False + +    def loadingFinished (self, item): +        self.logger.debug ('item finished {}'.format (item)) + +    # internal chrome callbacks +    def _requestWillBeSent (self, **kwargs): +        reqId = kwargs['requestId'] +        req = kwargs['request'] + +        url = urlsplit (req['url']) +        if url.scheme not in self.allowedSchemes: +            return + +        item = self.requests.get (reqId) +        if item: +            # redirects never “finish” loading, but yield another requestWillBeSent with this key set +            redirectResp = kwargs.get ('redirectResponse') +            if redirectResp: +                resp = {'requestId': reqId, 'response': redirectResp} +                item.setResponse (resp) +                self.loadingFinished (item, redirect=True) +                self.logger.debug ('redirected request {} has url {}'.format (reqId, 
req['url'])) +            else: +                self.logger.warn ('request {} already exists, overwriting.'.format (reqId)) + +        item = Item () +        item.setRequest (kwargs) +        self.requests[reqId] = item + +    def _responseReceived (self, **kwargs): +        reqId = kwargs['requestId'] +        item = self.requests.get (reqId) +        if item is None: +            return + +        resp = kwargs['response'] +        url = urlsplit (resp['url']) +        if url.scheme in self.allowedSchemes: +            self.logger.debug ('response {} {}'.format (reqId, resp['url'])) +            item.setResponse (kwargs) +        else: +            self.logger.warn ('response: ignoring scheme {}'.format (url.scheme)) + +    def _loadingFinished (self, **kwargs): +        """ +        Item was fully loaded. For some items the request body is not available +        when responseReceived is fired, thus move everything here. +        """ +        reqId = kwargs['requestId'] +        item = self.requests.pop (reqId, None) +        if item is None: +            # we never recorded this request (blacklisted scheme, for example) +            return +        req = item.request +        resp = item.response +        assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url']) +        url = urlsplit (resp['url']) +        if url.scheme in self.allowedSchemes: +            self.logger.debug ('finished {} {}'.format (reqId, req['url'])) +            item.encodedDataLength = kwargs['encodedDataLength'] +            self.loadingFinished (item) + +    def _loadingFailed (self, **kwargs): +        reqId = kwargs['requestId'] +        self.logger.debug ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason'))) +        item = self.requests.pop (reqId, None) + diff --git a/crocoite/cli.py b/crocoite/cli.py index 640d207..a97f0ce 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -18,327 +18,23 @@  # 
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN  # THE SOFTWARE. -import os, sys, json -import pychrome -from urllib.parse import urlsplit -from warcio.warcwriter import WARCWriter -from warcio.statusandheaders import StatusAndHeaders -from base64 import b64decode -import logging -from logging.handlers import BufferingHandler -from http.server import BaseHTTPRequestHandler -from io import BytesIO -import argparse -import tempfile, random - -from html5lib.treewalkers.base import TreeWalker -from html5lib.filters.base import Filter -from html5lib.serializer import HTMLSerializer -from html5lib import constants - -from . import html - -logger = logging.getLogger(__name__) - -# 10 MB, encoded! (i.e. actual data can be larger due to compression) -maxBodySize = 10*1024*1024 - -def packageData (path): -    """ -    Locate package data, see setup.py’s data_files -    """ -    return os.path.join (os.path.dirname (__file__), 'data', path) - -def packageUrl (path): -    """ -    Create URL for package data stored into WARC -    """ -    return 'urn:' + __package__ + ':' + path - -def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): -    if length is None: -        length = random.randint (16, 32) -    return ''.join (map (lambda x: random.choice (chars), range (length))) - -class WARCLogHandler (BufferingHandler): -    """ -    Buffered log handler, flushing to warcio -    """ - -    contentType = 'text/plain; charset=utf-8' - -    def __init__ (self, capacity, warcfile): -        BufferingHandler.__init__ (self, capacity) -        self.warcfile = warcfile - -    def flush (self): -        self.acquire () -        try: -            buf = '' -            for record in self.buffer: -                buf += self.format (record) -                buf += '\n' -            # XXX: record type? 
-            record = self.warcfile.create_warc_record ( -                    packageUrl ('log'), 'metadata', -                    payload=BytesIO (buf.encode ('utf8')), -                    warc_headers_dict={'Content-Type': self.contentType}) -            self.warcfile.write_record(record) -            self.buffer = [] -        finally: -            self.release () - -class ChromeTreeWalker (TreeWalker): -    """ -    Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument -    """ - -    def recurse (self, node): -        name = node['nodeName'] -        if name.startswith ('#'): -            if name == '#text': -                yield from self.text (node['nodeValue']) -            elif name == '#comment': -                yield self.comment (node['nodeValue']) -            elif name == '#document': -                for child in node.get ('children', []): -                    yield from self.recurse (child) -            else: -                assert False, name -        else: -            default_namespace = constants.namespaces["html"] - -            attributes = node.get ('attributes', []) -            convertedAttr = {} -            for i in range (0, len (attributes), 2): -                convertedAttr[(default_namespace, attributes[i])] = attributes[i+1] - -            children = node.get ('children', []) -            if name.lower() in html.voidTags and not children: -                yield from self.emptyTag (default_namespace, name, convertedAttr) -            else: -                yield self.startTag (default_namespace, name, convertedAttr) -                for child in node.get ('children', []): -                    yield from self.recurse (child) -                yield self.endTag ('', name) - -    def __iter__ (self): -        assert self.tree['nodeName'] == '#document' -        return self.recurse (self.tree) - -    def split (self): -        """ -        Split response returned by DOM.getDocument(pierce=True) into independent 
documents -        """ -        def recurse (node): -            contentDocument = node.get ('contentDocument') -            if contentDocument: -                assert contentDocument['nodeName'] == '#document' -                yield contentDocument -                yield from recurse (contentDocument) - -            for child in node.get ('children', []): -                yield from recurse (child) - -        if self.tree['nodeName'] == '#document': -            yield self.tree -        yield from recurse (self.tree) - -class StripTagFilter (Filter): -    """ -    Remove arbitrary tags -    """ - -    def __init__ (self, source, tags): -        Filter.__init__ (self, source) -        self.tags = set (map (str.lower, tags)) - -    def __iter__(self): -        delete = 0 -        for token in Filter.__iter__(self): -            tokenType = token['type'] -            if tokenType in {'StartTag', 'EmptyTag'}: -                if delete > 0 or token['name'].lower () in self.tags: -                    delete += 1 -            if delete == 0: -                yield token -            if tokenType == 'EndTag' and delete > 0: -                delete -= 1 - -class StripAttributeFilter (Filter): -    """ -    Remove arbitrary HTML attributes -    """ - -    def __init__ (self, source, attributes): -        Filter.__init__ (self, source) -        self.attributes = set (map (str.lower, attributes)) - -    def __iter__(self): -        default_namespace = constants.namespaces["html"] -        for token in Filter.__iter__(self): -            data = token.get ('data') -            if data and token['type'] in {'StartTag', 'EmptyTag'}: -                newdata = {} -                for (namespace, k), v in data.items (): -                    if k.lower () not in self.attributes: -                        newdata[(namespace, k)] = v -                token['data'] = newdata -            yield token -  def main (): -    def getStatusText (response): -        text = response.get 
('statusText') -        if text: -            return text -        text = BaseHTTPRequestHandler.responses.get (response['status']) -        if text: -            return text[0] -        return 'No status text available' - -    def requestWillBeSent (**kwargs): -        req = kwargs.get ('request') -        reqId = kwargs['requestId'] -        url = urlsplit (req['url']) -        if url.scheme in ('http', 'https'): -            logger.debug ('sending {} {}'.format (reqId, req['url'])) -            if reqId in requests: -                redirectResp = kwargs.get ('redirectResponse') -                if redirectResp: -                    requests[reqId]['response'] = redirectResp -                    # XXX: can we retrieve the original response body right now? -                    itemToWarc (reqId, requests[reqId], ignoreBody=True) -                else: -                    logger.warn ('request {} already exists, overwriting.'.format (reqId)) -            requests[reqId] = {} -            requests[reqId]['request'] = req -            requests[reqId]['initiator'] = kwargs['initiator'] -        else: -            logger.warn ('request: ignoring scheme {}'.format (url.scheme)) - -    def responseReceived (**kwargs): -        resp = kwargs['response'] -        reqId = kwargs['requestId'] -        url = urlsplit (resp['url']) -        if url.scheme in ('http', 'https') and reqId in requests: -            logger.debug ('response {} {}'.format (reqId, resp['url'])) -            requests[reqId]['response'] = resp -        else: -            logger.warn ('response: ignoring scheme {}'.format (url.scheme)) - -    def loadEventFired (**kwargs): -        """ -        Equivalent to DOM ready JavaScript event -        """ -        root = tab.DOM.getDocument () -        rootId = root['root']['nodeId'] -        links = tab.DOM.querySelectorAll (nodeId=rootId, selector='a') -        #for i in links['nodeIds']: -        #    print ('link', tab.DOM.getAttributes (nodeId=i)) - -    
def loadingFinished (**kwargs): -        """ -        Item was fully loaded. For some items the request body is not available -        when responseReceived is fired, thus move everything here. -        """ -        reqId = kwargs['requestId'] -        # we never recorded this request -        if reqId not in requests: -            return -        item = requests[reqId] -        req = item['request'] -        resp = item['response'] -        url = urlsplit (resp['url']) -        if url.scheme in ('http', 'https'): -            logger.debug ('finished {} {}'.format (reqId, req['url'])) -            itemToWarc (reqId, item, kwargs['encodedDataLength']) -            del requests[reqId] - -    def itemToWarc (reqId, item, encodedDataLength=0, ignoreBody=False): -        req = item['request'] -        resp = item['response'] -        url = urlsplit (resp['url']) - -        # overwrite request headers with those actually sent -        newReqHeaders = resp.get ('requestHeaders') -        if newReqHeaders: -            req['headers'] = newReqHeaders - -        postData = req.get ('postData') -        if postData: -            postData = BytesIO (postData.encode ('utf8')) -        path = url.path -        if url.query: -            path += '?' 
+ url.query -        httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path), -                req['headers'], protocol='HTTP/1.1', is_http_request=True) -        initiator = item['initiator'] -        warcHeaders = { -                'X-Chrome-Initiator': json.dumps (initiator), -                } -        record = writer.create_warc_record(req['url'], 'request', -                payload=postData, http_headers=httpHeaders, -                warc_headers_dict=warcHeaders) -        writer.write_record(record) -        concurrentTo = record.rec_headers['WARC-Record-ID'] - -        # check body size first, since we’re loading everything into memory -        if encodedDataLength < maxBodySize: -            try: -                if ignoreBody: -                    rawBody = b'' -                    base64Encoded = True -                else: -                    body = tab.Network.getResponseBody (requestId=reqId) -                    rawBody = body['body'] -                    base64Encoded = body['base64Encoded'] -                    if base64Encoded: -                        rawBody = b64decode (rawBody) -                    else: -                        rawBody = rawBody.encode ('utf8') - -                httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'], -                        getStatusText (resp)), resp['headers'], protocol='HTTP/1.1') - -                # Content is saved decompressed and decoded, remove these headers -                blacklistedHeaders = {'transfer-encoding', 'content-encoding'} -                for h in blacklistedHeaders: -                    httpHeaders.remove_header (h) - -                # chrome sends nothing but utf8 encoded text. Fortunately HTTP -                # headers take precedence over the document’s <meta>, thus we can -                # easily override those. 
-                contentType = resp['mimeType'] -                if not base64Encoded: -                    contentType += '; charset=utf-8' -                httpHeaders.replace_header ('content-type', contentType) - -                httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody))) - -                warcHeaders = { -                        'WARC-Concurrent-To': concurrentTo, -                        'WARC-IP-Address': resp.get ('remoteIPAddress', ''), -                        'X-Chrome-Protocol': resp.get ('protocol', ''), -                        'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), -                        'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')), -                        'X-Chrome-Base64Body': str (base64Encoded), -                        } -                record = writer.create_warc_record(resp['url'], 'response', -                        warc_headers_dict=warcHeaders, payload=BytesIO (rawBody), -                        http_headers=httpHeaders) -                writer.write_record(record) -            except pychrome.exceptions.CallMethodException: -                logger.error ('no data for {} {} {}'.format (resp['url'], -                        resp['status'], reqId)) -        else: -            logger.warn ('body for {} is too large, {} bytes'.format (resp['url'], kwargs['encodedDataLength'])) - -    def loadingFailed (**kwargs): -        reqId = kwargs['requestId'] -        logger.debug ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason'))) -        if reqId in requests: -            del requests[reqId] - +    import os, random, logging, argparse +    from io import BytesIO +    import pychrome +    from urllib.parse import urlsplit +    from warcio.warcwriter import WARCWriter +    from warcio.statusandheaders import StatusAndHeaders +    from html5lib.serializer import HTMLSerializer +    from . 
import html, packageData, packageUrl +    from .warc import WarcLoader +    from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker + +    def randomString (length=None, chars='abcdefghijklmnopqrstuvwxyz'): +        if length is None: +            length = random.randint (16, 32) +        return ''.join (map (lambda x: random.choice (chars), range (length))) +          def getFormattedViewportMetrics (tab):          layoutMetrics = tab.Page.getLayoutMetrics ()          # XXX: I’m not entirely sure which one we should use here @@ -384,7 +80,7 @@ def main ():                                  'X-Chrome-Viewport': viewport})                  writer.write_record (record) -    def emulateScreenMetrics (tab): +    def emulateScreenMetrics (l):          """          Emulate different screen sizes, causing the site to fetch assets (img          srcset and css, for example) for different screen resolutions. @@ -404,13 +100,12 @@ def main ():                  {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1, 'mobile': False},                  ]          for s in sizes: -            tab.Emulation.setDeviceMetricsOverride (**s) -            tab.wait (1) +            l.tab.Emulation.setDeviceMetricsOverride (**s) +            l.wait (1)          # XXX: this seems to be broken, it does not clear the override          #tab.Emulation.clearDeviceMetricsOverride ()          # wait until assets finished loading -        while len (requests) != 0: -            tab.wait (1) +        l.waitIdle (2, 10)      def loadScripts (paths, scripts=[]):          for p in paths: @@ -427,6 +122,7 @@ def main ():                  warc_headers_dict={'Content-Type': 'application/javascript; charset=utf-8'})          writer.write_record (record) +    logger = logging.getLogger(__name__)      logging.basicConfig (level=logging.DEBUG)      parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.') @@ -434,7 +130,8 @@ def main ():      
parser.add_argument('--timeout', default=10, type=int, help='Maximum time for archival')      parser.add_argument('--idle-timeout', default=2, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout')      parser.add_argument('--log-buffer', default=1000, type=int, dest='logBuffer') -    parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open') +    parser.add_argument('--max-body-size', default=10*1024*1024, type=int, dest='maxBodySize', help='Max body size in bytes') +    #parser.add_argument('--keep-tab', action='store_true', default=False, dest='keepTab', help='Keep tab open')      parser.add_argument('--onload', default=[], action='append', help='Inject JavaScript file before loading page')      parser.add_argument('--onsnapshot', default=[], action='append', help='Run JavaScript files before creating DOM snapshot')      parser.add_argument('url', help='Website URL') @@ -448,93 +145,44 @@ def main ():      onload = loadScripts (args.onload, ['var {} = false;\n'.format (stopVarname)]).replace (stopVarname, newStopVarname)      stopVarname = newStopVarname -    # temporary store for requests -    requests = {} - -    # create a browser instance      browser = pychrome.Browser(url=args.browser) -    # create a tab -    tab = browser.new_tab() - -    # setup callbacks -    tab.Network.requestWillBeSent = requestWillBeSent -    tab.Network.responseReceived = responseReceived -    tab.Network.loadingFinished = loadingFinished -    tab.Network.loadingFailed = loadingFailed -    tab.Page.loadEventFired = loadEventFired - -    # start the tab -    tab.start() -      fd = open (args.output, 'wb')      writer = WARCWriter (fd, gzip=True) -    version = tab.Browser.getVersion () -    payload = { -            'software': __package__, -            'browser': version['product'], -            'useragent': version['userAgent'], -            'viewport': getFormattedViewportMetrics (tab), -            } -    
warcinfo = writer.create_warcinfo_record (filename=None, info=payload) -    writer.write_record (warcinfo) - -    warcLogger = WARCLogHandler (args.logBuffer, writer) -    logger.addHandler (warcLogger) - -    # save onload script -    writeScript ('onload', onload, writer) -    # enable events -    tab.Network.enable() -    tab.Page.enable () -    tab.Network.clearBrowserCache () -    if tab.Network.canClearBrowserCookies ()['result']: -        tab.Network.clearBrowserCookies () -    # inject our custom javascript to the page before loading -    tab.Page.addScriptToEvaluateOnNewDocument (source=onload) - -    tab.Page.navigate(url=args.url) - -    idleTimeout = 0 -    for i in range (0, args.timeout): -        tab.wait (1) -        if len (requests) == 0: -            idleTimeout += 1 -            if idleTimeout > args.idleTimeout: -                break -        else: -            idleTimeout = 0 +    with WarcLoader (browser, args.url, writer, logBuffer=args.logBuffer, +            maxBodySize=args.maxBodySize) as l: +        version = l.tab.Browser.getVersion () +        payload = { +                'software': __package__, +                'browser': version['product'], +                'useragent': version['userAgent'], +                'viewport': getFormattedViewportMetrics (l.tab), +                } +        warcinfo = writer.create_warcinfo_record (filename=None, info=payload) +        writer.write_record (warcinfo) +        # save onload script as well +        writeScript ('onload', onload, writer) -    # get ready for snapshot: stop loading and scripts, disable events -    tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True) -    # if we stopped due to timeout, wait for remaining assets -    while len (requests) != 0: -        tab.wait (1) +        # inject our custom javascript to the page before loading +        l.tab.Page.addScriptToEvaluateOnNewDocument (source=onload) +        l.start () 
-    emulateScreenMetrics (tab) +        l.waitIdle (args.idleTimeout, args.timeout) -    tab.Page.stopLoading () -    tab.Network.disable () -    tab.Page.disable () -    tab.Network.requestWillBeSent = None -    tab.Network.responseReceived = None -    tab.Network.loadingFinished = None -    tab.Network.loadingFailed = None -    tab.Page.loadEventFired = None +        # get ready for snapshot: stop loading and scripts, disable events +        l.tab.Runtime.evaluate (expression='{} = true; window.scrollTo (0, 0);'.format (stopVarname), returnByValue=True) +        # if we stopped due to timeout, wait for remaining assets +        l.waitIdle (2, 10) -    script = loadScripts (args.onsnapshot) -    writeScript ('onsnapshot', script, writer) -    tab.Runtime.evaluate (expression=script, returnByValue=True) -    writeDOMSnapshot (tab, writer) +        emulateScreenMetrics (l) -    tab.stop() -    if not args.keepTab: -        browser.close_tab(tab) +        l.stop () -    logger.removeHandler (warcLogger) -    warcLogger.flush () -    fd.close () +        script = loadScripts (args.onsnapshot) +        writeScript ('onsnapshot', script, writer) +        l.tab.Runtime.evaluate (expression=script, returnByValue=True) +        writeDOMSnapshot (l.tab, writer)      return True diff --git a/crocoite/html.py b/crocoite/html.py index 34fe26b..f891101 100644 --- a/crocoite/html.py +++ b/crocoite/html.py @@ -18,6 +18,10 @@  # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN  # THE SOFTWARE. 
+""" +HTML helper +""" +  # HTML void tags, see https://html.spec.whatwg.org/multipage/syntax.html#void-elements  voidTags = {'area',          'base', @@ -99,3 +103,106 @@ eventAttributes = {'onabort',          'onvolumechange',          'onwaiting'} +from html5lib.treewalkers.base import TreeWalker +from html5lib.filters.base import Filter +from html5lib.serializer import HTMLSerializer +from html5lib import constants + +class ChromeTreeWalker (TreeWalker): +    """ +    Recursive html5lib TreeWalker for Google Chrome method DOM.getDocument +    """ + +    def recurse (self, node): +        name = node['nodeName'] +        if name.startswith ('#'): +            if name == '#text': +                yield from self.text (node['nodeValue']) +            elif name == '#comment': +                yield self.comment (node['nodeValue']) +            elif name == '#document': +                for child in node.get ('children', []): +                    yield from self.recurse (child) +            else: +                assert False, name +        else: +            default_namespace = constants.namespaces["html"] + +            attributes = node.get ('attributes', []) +            convertedAttr = {} +            for i in range (0, len (attributes), 2): +                convertedAttr[(default_namespace, attributes[i])] = attributes[i+1] + +            children = node.get ('children', []) +            if name.lower() in voidTags and not children: +                yield from self.emptyTag (default_namespace, name, convertedAttr) +            else: +                yield self.startTag (default_namespace, name, convertedAttr) +                for child in node.get ('children', []): +                    yield from self.recurse (child) +                yield self.endTag ('', name) + +    def __iter__ (self): +        assert self.tree['nodeName'] == '#document' +        return self.recurse (self.tree) + +    def split (self): +        """ +        Split response returned by 
DOM.getDocument(pierce=True) into independent documents +        """ +        def recurse (node): +            contentDocument = node.get ('contentDocument') +            if contentDocument: +                assert contentDocument['nodeName'] == '#document' +                yield contentDocument +                yield from recurse (contentDocument) + +            for child in node.get ('children', []): +                yield from recurse (child) + +        if self.tree['nodeName'] == '#document': +            yield self.tree +        yield from recurse (self.tree) + +class StripTagFilter (Filter): +    """ +    Remove arbitrary tags +    """ + +    def __init__ (self, source, tags): +        Filter.__init__ (self, source) +        self.tags = set (map (str.lower, tags)) + +    def __iter__(self): +        delete = 0 +        for token in Filter.__iter__(self): +            tokenType = token['type'] +            if tokenType in {'StartTag', 'EmptyTag'}: +                if delete > 0 or token['name'].lower () in self.tags: +                    delete += 1 +            if delete == 0: +                yield token +            if tokenType == 'EndTag' and delete > 0: +                delete -= 1 + +class StripAttributeFilter (Filter): +    """ +    Remove arbitrary HTML attributes +    """ + +    def __init__ (self, source, attributes): +        Filter.__init__ (self, source) +        self.attributes = set (map (str.lower, attributes)) + +    def __iter__(self): +        default_namespace = constants.namespaces["html"] +        for token in Filter.__iter__(self): +            data = token.get ('data') +            if data and token['type'] in {'StartTag', 'EmptyTag'}: +                newdata = {} +                for (namespace, k), v in data.items (): +                    if k.lower () not in self.attributes: +                        newdata[(namespace, k)] = v +                token['data'] = newdata +            yield token + diff --git a/crocoite/warc.py 
b/crocoite/warc.py new file mode 100644 index 0000000..e06b1c7 --- /dev/null +++ b/crocoite/warc.py @@ -0,0 +1,174 @@ +# Copyright (c) 2017 crocoite contributors +#  +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +#  +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +#  +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +""" +Classes writing data to WARC files +""" + +import logging +import json +from .browser import SiteLoader +from . 
import packageUrl
from http.server import BaseHTTPRequestHandler
from base64 import b64decode
from io import BytesIO
from warcio.statusandheaders import StatusAndHeaders
from urllib.parse import urlsplit
from logging.handlers import BufferingHandler
import pychrome

class WARCLogHandler (BufferingHandler):
    """
    Buffered log handler, flushing the formatted records into a single
    WARC metadata record per flush.
    """

    # Content-Type of the WARC payload written by flush()
    contentType = 'text/plain; charset=utf-8'

    def __init__ (self, capacity, warcfile):
        """
        :param capacity: number of log records buffered before auto-flush
        :param warcfile: warcio writer the records are flushed to
        """
        BufferingHandler.__init__ (self, capacity)
        self.warcfile = warcfile

    def flush (self):
        self.acquire ()
        try:
            if self.buffer:
                # join instead of repeated += — the old loop was quadratic
                # in the total payload size for large buffers
                buf = '\n'.join (map (self.format, self.buffer)) + '\n'
                # XXX: record type?
                record = self.warcfile.create_warc_record (
                        packageUrl ('log'), 'metadata',
                        payload=BytesIO (buf.encode ('utf8')),
                        warc_headers_dict={'Content-Type': self.contentType})
                self.warcfile.write_record (record)
                self.buffer = []
        finally:
            self.release ()

class WarcLoader (SiteLoader):
    """
    SiteLoader that archives every finished network request/response pair,
    plus the session log, into a WARC file.
    """

    def __init__ (self, browser, url, writer,
            logger=logging.getLogger(__name__), logBuffer=1000,
            maxBodySize=10*1024*1024):
        """
        :param browser: browser handle passed through to SiteLoader
        :param url: URL to load
        :param writer: warcio WARC writer all records are appended to
        :param logger: logger that additionally gets captured into the WARC
        :param logBuffer: WARCLogHandler capacity (records per flush)
        :param maxBodySize: upper bound (bytes) on response bodies fetched
                            into memory
        """
        SiteLoader.__init__ (self, browser, url, logger)
        self.writer = writer
        self.maxBodySize = maxBodySize
        self.warcLogger = WARCLogHandler (logBuffer, writer)
        self.logger.addHandler (self.warcLogger)

    def __exit__ (self, exc_type, exc_value, traceback):
        # Detach first so the final flush does not capture its own output,
        # then write any remaining buffered log records.
        self.logger.removeHandler (self.warcLogger)
        self.warcLogger.flush ()
        return SiteLoader.__exit__ (self, exc_type, exc_value, traceback)

    @staticmethod
    def getStatusText (response):
        """
        Best-effort human-readable status text for a DevTools response dict:
        prefer the text the server actually sent, fall back to the standard
        phrase for the status code, then to a fixed placeholder.
        """
        text = response.get ('statusText')
        if text:
            return text
        # BaseHTTPRequestHandler.responses maps code -> (shortname, longname)
        text = BaseHTTPRequestHandler.responses.get (response['status'])
        if text:
            return text[0]
        return 'No status text available'

    def loadingFinished (self, item, redirect=False):
        """
        Write one WARC request record and one WARC response record for a
        finished network item.

        :param item: network item with .request/.response/.id/.initiator/
                     .encodedDataLength (presumably assembled from DevTools
                     Network events — TODO confirm against browser.py)
        :param redirect: currently unused — NOTE(review): confirm whether
                         redirects need separate handling
        """
        writer = self.writer

        req = item.request
        reqId = item.id
        resp = item.response
        url = urlsplit (resp['url'])

        # overwrite request headers with those actually sent
        newReqHeaders = resp.get ('requestHeaders')
        if newReqHeaders:
            req['headers'] = newReqHeaders

        postData = req.get ('postData')
        if postData:
            # NOTE(review): assumes the POST body is utf-8 text; binary
            # request bodies would be corrupted here — confirm upstream
            postData = BytesIO (postData.encode ('utf8'))
        # reconstruct the request line's path (incl. query string)
        path = url.path
        if url.query:
            path += '?' + url.query
        httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format (req['method'], path),
                req['headers'], protocol='HTTP/1.1', is_http_request=True)
        initiator = item.initiator
        warcHeaders = {
                'X-Chrome-Initiator': json.dumps (initiator),
                }
        record = writer.create_warc_record(req['url'], 'request',
                payload=postData, http_headers=httpHeaders,
                warc_headers_dict=warcHeaders)
        writer.write_record(record)
        # link the response record to this request record
        concurrentTo = record.rec_headers['WARC-Record-ID']

        # now the response
        warcHeaders = {
                'WARC-Concurrent-To': concurrentTo,
                'WARC-IP-Address': resp.get ('remoteIPAddress', ''),
                'X-Chrome-Protocol': resp.get ('protocol', ''),
                'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
                'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
                }

        rawBody = b''
        base64Encoded = False
        try:
            # check body size first, since we’re loading everything into memory.
            # NOTE(review): encodedDataLength is the on-the-wire (possibly
            # compressed) size, not the decoded body size, so the fetched
            # body may still exceed maxBodySize — confirm this is acceptable
            if item.encodedDataLength < self.maxBodySize:
                body = self.tab.Network.getResponseBody (requestId=reqId)
                rawBody = body['body']
                base64Encoded = body['base64Encoded']
                if base64Encoded:
                    rawBody = b64decode (rawBody)
                    warcHeaders['X-Chrome-Base64Body'] = str (True)
                else:
                    rawBody = rawBody.encode ('utf8')
            else:
                self.logger.error ('body for {} too large {} vs {}'.format (reqId,
                        item.encodedDataLength, self.maxBodySize))
        except pychrome.exceptions.CallMethodException:
            # body may already be gone from the browser's cache; archive the
            # record with an empty payload rather than failing the grab
            self.logger.error ('no data for {} {} {}'.format (resp['url'],
                    resp['status'], reqId))

        httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'],
                self.getStatusText (resp)), resp['headers'], protocol='HTTP/1.1')

        # Content is saved decompressed and decoded, remove these headers
        blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
        for h in blacklistedHeaders:
            httpHeaders.remove_header (h)

        # chrome sends nothing but utf8 encoded text. Fortunately HTTP
        # headers take precedence over the document’s <meta>, thus we can
        # easily override those.
        contentType = resp.get ('mimeType')
        if contentType:
            if not base64Encoded:
                contentType += '; charset=utf-8'
            httpHeaders.replace_header ('content-type', contentType)

        # body was decoded above, so the stored length must be recomputed
        httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody)))

        record = writer.create_warc_record(resp['url'], 'response',
                warc_headers_dict=warcHeaders, payload=BytesIO (rawBody),
                http_headers=httpHeaders)
        writer.write_record(record)
