diff options
| -rw-r--r-- | README.rst | 2 | ||||
| -rw-r--r-- | crocoite/browser.py | 326 | ||||
| -rw-r--r-- | crocoite/controller.py | 12 | ||||
| -rw-r--r-- | crocoite/logger.py | 2 | ||||
| -rw-r--r-- | crocoite/test_browser.py | 531 | ||||
| -rw-r--r-- | crocoite/test_warc.py | 145 | ||||
| -rw-r--r-- | crocoite/warc.py | 96 | ||||
| -rw-r--r-- | setup.py | 1 | 
8 files changed, 630 insertions, 485 deletions
| @@ -24,6 +24,7 @@ These dependencies must be present to run crocoite:  - warcio_  - html5lib_  - yarl_ +- multidict_  - bottom_ (IRC client)  - `Google Chrome`_ @@ -35,6 +36,7 @@ These dependencies must be present to run crocoite:  .. _bottom: https://github.com/numberoverzero/bottom  .. _Google Chrome: https://www.google.com/chrome/  .. _yarl: https://yarl.readthedocs.io/ +.. _multidict: https://multidict.readthedocs.io/  The following commands clone the repository from GitHub_, set up a virtual  environment and install crocoite: diff --git a/crocoite/browser.py b/crocoite/browser.py index 3de61f0..50561ed 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -23,80 +23,197 @@ Chrome browser interactions.  """  import asyncio -from base64 import b64decode +from base64 import b64decode, b64encode +from datetime import datetime, timedelta  from http.server import BaseHTTPRequestHandler +  from yarl import URL +from multidict import CIMultiDict  from .logger import Level  from .devtools import Browser, TabException -class Item: -    """ -    Simple wrapper containing Chrome request and response -    """ +# These two classes’ only purpose is so we can later tell whether a body was +# base64-encoded or a unicode string +class Base64Body (bytes): +    def __new__ (cls, value): +        return bytes.__new__ (cls, b64decode (value)) + +    @classmethod +    def fromBytes (cls, b): +        """ For testing """ +        return cls (b64encode (b)) + +class UnicodeBody (bytes): +    def __new__ (cls, value): +        if type (value) is not str: +            raise TypeError ('expecting unicode string') + +        return bytes.__new__ (cls, value.encode ('utf-8')) -    __slots__ = ('chromeRequest', 'chromeResponse', 'chromeFinished', -            'isRedirect', 'failed', 'body', 'requestBody') +class Request: +    __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp') -    def __init__ (self): -        self.chromeRequest = {} -        self.chromeResponse = {} -        self.chromeFinished = {} -        self.isRedirect = False -        self.failed = False -        self.body = None -        self.requestBody = None +    def __init__ (self, method=None, headers=None, body=None): +        self.headers = headers +        self.body = body +        self.hasPostData = False +        self.initiator = None +        # HTTP method +        self.method = method +        self.timestamp = None      def __repr__ (self): -        return f'<Item {self.url}>' - -    @property -    def request (self): -        return self.chromeRequest.get ('request', {}) - -    @property -    def response (self): -        return self.chromeResponse.get ('response', {}) - -    @property -    def initiator (self): -        return self.chromeRequest['initiator'] - -    @property -    def id (self): -        return self.chromeRequest['requestId'] - -    @property -    def encodedDataLength (self): -        return self.chromeFinished['encodedDataLength'] - -    @property -    def url (self): -        return URL (self.response.get ('url', self.request.get ('url'))) - -    @property -    def requestHeaders (self): -        # the response object may contain refined headers, which were -        # *actually* sent over the wire -        return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers'])) - -    @property -    def responseHeaders (self): -        return self._unfoldHeaders (self.response['headers']) - -    @property -    def statusText (self): -        text = self.response.get ('statusText') -        if text: -            return text -        text = BaseHTTPRequestHandler.responses.get (self.response['status']) -        if text: -            return text[0] -        return 'No status text available' - -    @property -    def resourceType (self): -        return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None)) +        return f'Request({self.method!r}, {self.headers!r}, {self.body!r})' + +    def __eq__ (self, b): +        if b is None: +            return False + +        if not isinstance (b, Request): +            raise TypeError ('Can only compare equality with Request.') + +        # do not compare hasPostData (only required to fetch body) and +        # timestamp (depends on time) +        return self.headers == b.headers and \ +                self.body == b.body and \ +                self.initiator == b.initiator and \ +                self.method == b.method + +class Response: +    __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived', +            'timestamp', 'mimeType') + +    def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None): +        self.status = status +        self.statusText = statusText +        self.headers = headers +        self.body = body +        # bytes received over the network (not body size!) +        self.bytesReceived = 0 +        self.timestamp = None +        self.mimeType = mimeType + +    def __repr__ (self): +        return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})' + +    def __eq__ (self, b): +        if b is None: +            return False + +        if not isinstance (b, Response): +            raise TypeError ('Can only compare equality with Response.') + +        # do not compare bytesReceived (depends on network), timestamp +        # (depends on time) and statusText (does not matter) +        return self.status == b.status and \ +                self.statusText == b.statusText and \ +                self.headers == b.headers and \ +                self.body == b.body and \ +                self.mimeType == b.mimeType + +class ReferenceTimestamp: +    """ Map relative timestamp to absolute timestamp """ + +    def __init__ (self, relative, absolute): +        self.relative = timedelta (seconds=relative) +        self.absolute = datetime.utcfromtimestamp (absolute) + +    def __call__ (self, relative): +        if not isinstance (relative, timedelta): +            relative = timedelta (seconds=relative) +        return self.absolute + (relative-self.relative) + +class RequestResponsePair: +    __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress', +            'protocol', 'resourceType', '_time') + +    def __init__ (self, id=None, url=None, request=None, response=None): +        self.request = request +        self.response = response +        self.id = id +        self.url = url +        self.remoteIpAddress = None +        self.protocol = None +        self.resourceType = None +        self._time = None + +    def __repr__ (self): +        return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})' + +    def __eq__ (self, b): +        if not isinstance (b, RequestResponsePair): +            raise TypeError (f'Can only compare with {self.__class__.__name__}') + +        # do not compare id and _time. These depend on external factors and do +        # not influence the request/response *content* +        return self.request == b.request and \ +                self.response == b.response and \ +                self.url == b.url and \ +                self.remoteIpAddress == b.remoteIpAddress and \ +                self.protocol == b.protocol and \ +                self.resourceType == b.resourceType + +    def fromRequestWillBeSent (self, req): +        """ Set request data from Chrome Network.requestWillBeSent event """ +        r = req['request'] + +        self.id = req['requestId'] +        self.url = URL (r['url']) +        self.resourceType = req.get ('type') +        self._time = ReferenceTimestamp (req['timestamp'], req['wallTime']) + +        assert self.request is None, req +        self.request = Request () +        self.request.initiator = req['initiator'] +        self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) +        self.request.hasPostData = r.get ('hasPostData', False) +        self.request.method = r['method'] +        self.request.timestamp = self._time (req['timestamp']) +        if self.request.hasPostData: +            postData = r.get ('postData') +            if postData is not None: +                self.request.body = UnicodeBody (postData) + +    def fromResponse (self, r, timestamp=None, resourceType=None): +        """ +        Set response data from Chrome’s Response object. +         +        Request must exist. Updates if response was set before. Sometimes +        fromResponseReceived is triggered twice by Chrome. No idea why. +        """ +        assert self.request is not None, (self.request, r) + +        if not timestamp: +            timestamp = self.request.timestamp + +        self.remoteIpAddress = r.get ('remoteIPAddress') +        self.protocol = r.get ('protocol') +        if resourceType: +            self.resourceType = resourceType + +        # a response may contain updated request headers (i.e. those actually +        # sent over the wire) +        if 'requestHeaders' in r: +            self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders'])) + +        self.response = Response () +        self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) +        self.response.status = r['status'] +        self.response.statusText = r['statusText'] +        self.response.timestamp = timestamp +        self.response.mimeType = r['mimeType'] + +    def fromResponseReceived (self, resp): +        """ Set response data from Chrome Network.responseReceived """ +        return self.fromResponse (resp['response'], +                self._time (resp['timestamp']), resp['type']) + +    def fromLoadingFinished (self, data): +        self.response.bytesReceived = data['encodedDataLength'] + +    def fromLoadingFailed (self, data): +        self.response = None      @staticmethod      def _unfoldHeaders (headers): @@ -110,44 +227,26 @@ class Item:                  items.append ((k, v))          return items -    def setRequest (self, req): -        self.chromeRequest = req - -    def setResponse (self, resp): -        self.chromeResponse = resp - -    def setFinished (self, finished): -        self.chromeFinished = finished -      async def prefetchRequestBody (self, tab): -        # request body -        req = self.request -        postData = req.get ('postData') -        if postData: -            self.requestBody = postData.encode ('utf8'), False -        elif req.get ('hasPostData', False): +        if self.request.hasPostData and self.request.body is None:              try:                  postData = await tab.Network.getRequestPostData (requestId=self.id) -                postData = postData['postData'] -                self.requestBody = b64decode (postData), True +                self.request.body = UnicodeBody (postData['postData'])              except TabException: -                self.requestBody = None +                self.request.body = None          else: -            self.requestBody = None, False +            self.request.body = None      async def prefetchResponseBody (self, tab): -        # get response body +        """ Fetch response body """          try:              body = await tab.Network.getResponseBody (requestId=self.id) -            rawBody = body['body'] -            base64Encoded = body['base64Encoded'] -            if base64Encoded: -                rawBody = b64decode (rawBody) +            if body['base64Encoded']: +                self.response.body = Base64Body (body['body'])              else: -                rawBody = rawBody.encode ('utf8') -            self.body = rawBody, base64Encoded +                self.response.body = UnicodeBody (body['body'])          except TabException: -            self.body = None +            self.response.body = None  class VarChangeEvent:      """ Notify when variable is changed """ @@ -179,14 +278,14 @@ class SiteLoader:      XXX: track popup windows/new tabs and close them      """ -    __slots__ = ('requests', 'browser', 'url', 'logger', 'tab', '_iterRunning', 'idle', '_framesLoading') +    __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning', +            'idle', '_framesLoading')      allowedSchemes = {'http', 'https'} -    def __init__ (self, browser, url, logger): +    def __init__ (self, browser, logger):          self.requests = {}          self.browser = Browser (url=browser) -        self.url = url -        self.logger = logger.bind (context=type (self).__name__, url=url) +        self.logger = logger.bind (context=type (self).__name__)          self._iterRunning = []          self.idle = VarChangeEvent (True) @@ -250,7 +349,7 @@ class SiteLoader:                  result = t.result ()                  if result is None:                      pass -                elif isinstance (result, Item): +                elif isinstance (result, RequestResponsePair):                      yield result                  else:                      method, data = result @@ -263,8 +362,8 @@ class SiteLoader:              running = pending              self._iterRunning = running -    async def start (self): -        await self.tab.Page.navigate(url=self.url) +    async def navigate (self, url): +        await self.tab.Page.navigate(url=url)      # internal chrome callbacks      async def _requestWillBeSent (self, **kwargs): @@ -282,21 +381,24 @@ class SiteLoader:              # redirects never “finish” loading, but yield another requestWillBeSent with this key set              redirectResp = kwargs.get ('redirectResponse')              if redirectResp: -                # create fake responses -                resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']} -                item.setResponse (resp) -                resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']} -                item.setFinished (resp) -                item.isRedirect = True +                if item.url != url: +                    # this happens for unknown reasons. the docs simply state +                    # it can differ in case of a redirect. Fix it and move on. +                    logger.warning ('redirect url differs', +                            uuid='558a7df7-2258-4fe4-b16d-22b6019cc163', +                            expected=item.url) +                    redirectResp['url'] = str (item.url) +                item.fromResponse (redirectResp)                  logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url) +                # XXX: queue this? no need to wait for it                  await item.prefetchRequestBody (self.tab) -                # cannot fetch request body due to race condition (item id reused) +                # cannot fetch response body due to race condition (item id reused)                  ret = item              else:                  logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b') -        item = Item () -        item.setRequest (kwargs) +        item = RequestResponsePair () +        item.fromRequestWillBeSent (kwargs)          self.requests[reqId] = item          logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f') @@ -315,7 +417,7 @@ class SiteLoader:              logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url)          if url.scheme in self.allowedSchemes:              logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0') -            item.setResponse (kwargs) +            item.fromResponseReceived (kwargs)          else:              logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666') @@ -333,19 +435,21 @@ class SiteLoader:          logger = self.logger.bind (reqId=reqId, reqUrl=item.url)          if item.url.scheme in self.allowedSchemes:              logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02') -            item.setFinished (kwargs) +            item.fromLoadingFinished (kwargs) +            # XXX queue both              await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab))              return item      async def _loadingFailed (self, **kwargs):          reqId = kwargs['requestId'] -        self.logger.warning ('loading failed', +        logger = self.logger.bind (reqId=reqId) +        logger.warning ('loading failed',                  uuid='68410f13-6eea-453e-924e-c1af4601748b',                  errorText=kwargs['errorText'],                  blockedReason=kwargs.get ('blockedReason'))          item = self.requests.pop (reqId, None)          if item is not None: -            item.failed = True +            item.fromLoadingFailed (kwargs)              return item      async def _entryAdded (self, **kwargs): diff --git a/crocoite/controller.py b/crocoite/controller.py index 504fa23..a64a8dc 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -30,7 +30,7 @@ from operator import attrgetter  from yarl import URL  from . import behavior as cbehavior -from .browser import SiteLoader, Item +from .browser import SiteLoader, RequestResponsePair  from .util import getFormattedViewportMetrics, getSoftwareInfo  from .behavior import ExtractLinksEvent @@ -61,13 +61,13 @@ class StatsHandler (EventHandler):          self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0}      def push (self, item): -        if isinstance (item, Item): +        if isinstance (item, RequestResponsePair):              self.stats['requests'] += 1 -            if item.failed: +            if not item.response:                  self.stats['failed'] += 1              else:                  self.stats['finished'] += 1 -                self.stats['bytesRcv'] += item.encodedDataLength +                self.stats['bytesRcv'] += item.response.bytesReceived  class LogHandler (EventHandler):      """ Handle items by logging information about them """ @@ -126,7 +126,7 @@ class SinglePageController:              async for item in l:                  self.processItem (item) -        async with self.service as browser, SiteLoader (browser, self.url, logger=logger) as l: +        async with self.service as browser, SiteLoader (browser, logger=logger) as l:              handle = asyncio.ensure_future (processQueue ())              start = time.time () @@ -153,7 +153,7 @@ class SinglePageController:                      }              self.processItem (ControllerStart (payload)) -            await l.start () +            await l.navigate (self.url)              for b in enabledBehavior:                  async for item in b.onload ():                      self.processItem (item) diff --git a/crocoite/logger.py b/crocoite/logger.py index d882eaf..82d7f5b 100644 --- a/crocoite/logger.py +++ b/crocoite/logger.py @@ -105,7 +105,7 @@ class PrintConsumer (Consumer):          return kwargs  class JsonPrintConsumer (Consumer): -    def __init__ (self, minLevel=Level.INFO): +    def __init__ (self, minLevel=Level.DEBUG):          self.minLevel = minLevel      def __call__ (self, **kwargs): diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py index 8008855..4bf2c64 100644 --- a/crocoite/test_browser.py +++ b/crocoite/test_browser.py @@ -19,103 +19,28 @@  # THE SOFTWARE.  import asyncio, socket -import pytest  from operator import itemgetter -from aiohttp import web  from http.server import BaseHTTPRequestHandler +from datetime import datetime + +from yarl import URL +from aiohttp import web +from multidict import CIMultiDict -from .browser import Item, SiteLoader, VarChangeEvent +from hypothesis import given +import hypothesis.strategies as st +import pytest + +from .browser import RequestResponsePair, SiteLoader, VarChangeEvent, Request, \ +        UnicodeBody, ReferenceTimestamp, Base64Body, UnicodeBody, Request, \ +        Response  from .logger import Logger, Consumer  from .devtools import Crashed, Process  # if you want to know what’s going on: +#import logging  #logging.basicConfig(level=logging.DEBUG) -class TItem (Item): -    """ This should be as close to Item as possible """ - -    __slots__ = ('bodySend', '_body', '_requestBody') -    base = 'http://localhost:8000/' - -    def __init__ (self, path, status, headers, bodyReceive, bodySend=None, requestBody=None, failed=False, isRedirect=False): -        super ().__init__ () -        self.chromeResponse = {'response': {'headers': headers, 'status': status, 'url': self.base + path}} -        self.body = bodyReceive, False -        self.bodySend = bodyReceive if not bodySend else bodySend -        self.requestBody = requestBody, False -        self.failed = failed -        self.isRedirect = isRedirect - -testItems = [ -    TItem ('binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00\x01\x02', failed=True), -    TItem ('attachment', 200,  -            {'Content-Type': 'text/plain; charset=utf-8', -            'Content-Disposition': 'attachment; filename="attachment.txt"', -            }, -            'This is a simple text file with umlauts. ÄÖU.'.encode ('utf8'), failed=True), -    TItem ('encoding/utf8', 200, {'Content-Type': 'text/plain; charset=utf-8'}, -            'This is a test, äöü μνψκ ¥¥¥¿ýý¡'.encode ('utf8')), -    TItem ('encoding/iso88591', 200, {'Content-Type': 'text/plain; charset=ISO-8859-1'}, -            'This is a test, äöü.'.encode ('utf8'), -            'This is a test, äöü.'.encode ('ISO-8859-1')), -    TItem ('encoding/latin1', 200, {'Content-Type': 'text/plain; charset=latin1'}, -            'This is a test, äöü.'.encode ('utf8'), -            'This is a test, äöü.'.encode ('latin1')), -    TItem ('image', 200, {'Content-Type': 'image/png'}, -            # 1×1 png image -            b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'), -    TItem ('empty', 200, {'Content-Type': 'text/plain'}, b''), -    TItem ('headers/duplicate', 200, [('Content-Type', 'text/plain'), ('Duplicate', '1'), ('Duplicate', '2')], b''), -    TItem ('headers/fetch/req', 200, {'Content-Type': 'text/plain'}, b''), -    TItem ('headers/fetch/html', 200, {'Content-Type': 'text/html'}, -            r"""<html><body><script> -            let h = new Headers([["custom", "1"]]); -            fetch("/headers/fetch/req", {"method": "GET", "headers": h}).then(x => console.log("done")); -            </script></body></html>""".encode ('utf8')), -    TItem ('redirect/301/empty', 301, {'Location': '/empty'}, b'', isRedirect=True), -    TItem ('redirect/301/redirect/301/empty', 301, {'Location': '/redirect/301/empty'}, b'', isRedirect=True), -    TItem ('nonexistent', 404, {}, b''), -    TItem ('html', 200, {'Content-Type': 'text/html'}, -            '<html><body><img src="/image"><img src="/nonexistent"></body></html>'.encode ('utf8')), -    TItem ('html/alert', 200, {'Content-Type': 'text/html'}, -            '<html><body><script>window.addEventListener("beforeunload", function (e) { e.returnValue = "bye?"; return e.returnValue; }); alert("stopping here"); if (confirm("are you sure?") || prompt ("42?")) { window.location = "/nonexistent"; }</script><script>document.write(\'<img src="/image">\');</script></body></html>'.encode ('utf8')), -    TItem ('html/fetchPost', 200, {'Content-Type': 'text/html'}, -            r"""<html><body><script> -            let a = fetch("/html/fetchPost/binary", {"method": "POST", "body": "\x00"}); -            let b = fetch("/html/fetchPost/form", {"method": "POST", "body": new URLSearchParams({"data": "!"})}); -            let c = fetch("/html/fetchPost/binary/large", {"method": "POST", "body": "\x00".repeat(100*1024)}); -            let d = fetch("/html/fetchPost/form/large", {"method": "POST", "body": new URLSearchParams({"data": "!".repeat(100*1024)})}); -            </script></body></html>""".encode ('utf8')), -    TItem ('html/fetchPost/binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'\x00'), -    TItem ('html/fetchPost/form', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=%21'), -    # XXX: these should trigger the need for getRequestPostData, but they don’t. oh well. -    TItem ('html/fetchPost/binary/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=(100*1024)*b'\x00'), -    TItem ('html/fetchPost/form/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=' + (100*1024)*b'%21'), -    ] -testItemMap = dict ([(item.url.path, item) for item in testItems]) - -def itemToResponse (item): -    async def f (req): -        headers = item.response['headers'] -        return web.Response(body=item.bodySend, status=item.response['status'], -                headers=headers) -    return f - -@pytest.fixture -async def server (): -    """ Simple HTTP server for testing notifications """ -    import logging -    logging.basicConfig(level=logging.DEBUG) -    app = web.Application() -    for item in testItems: -        app.router.add_route ('*', item.url.path, itemToResponse (item)) -    runner = web.AppRunner(app) -    await runner.setup() -    site = web.TCPSite(runner, 'localhost', 8080) -    await site.start() -    yield app -    await runner.cleanup () -  class AssertConsumer (Consumer):      def __call__ (self, **kwargs):          assert 'uuid' in kwargs @@ -128,134 +53,14 @@ def logger ():      return Logger (consumer=[AssertConsumer ()])  @pytest.fixture -async def loader (server, logger): -    def f (path): -        if path.startswith ('/'): -            path = 'http://localhost:8080{}'.format (path) -        return SiteLoader (browser, path, logger) -    async with Process () as browser: -        yield f - -async def itemsLoaded (l, items): -    items = dict ([(i.url.path, i) for i in items]) -    async for item in l: -        assert item.chromeResponse is not None -        golden = items.pop (item.url.path) -        if not golden: -            assert False, f'url {item.url} not supposed to be fetched' -        assert item.failed == golden.failed -        if item.failed: -            # response will be invalid if request failed -            if not items: -                break -            else: -                continue -        assert item.isRedirect == golden.isRedirect -        if golden.isRedirect: -            assert item.body is None -        else: -            assert item.body[0] == golden.body[0] -        assert item.requestBody[0] == golden.requestBody[0] -        assert item.response['status'] == golden.response['status'] -        assert item.statusText == BaseHTTPRequestHandler.responses.get (item.response['status'])[0] -        for k, v in golden.responseHeaders: -            actual = list (map (itemgetter (1), filter (lambda x: x[0] == k, item.responseHeaders))) -            assert v in actual -         -        # we’re done when everything has been loaded -        if not items: -            break - -async def literalItem (lf, item, deps=[]): -    async with lf (item.url.path) as l: -        await l.start () -        await asyncio.wait_for (itemsLoaded (l, [item] + deps), timeout=30) - -@pytest.mark.asyncio -async def test_empty (loader): -    await literalItem (loader, testItemMap['/empty']) - -@pytest.mark.asyncio -async def test_headers_duplicate (loader): -    """ -    Some headers, like Set-Cookie can be present multiple times. Chrome -    separates these with a newline. -    """ -    async with loader ('/headers/duplicate') as l: -        await l.start () -        async for it in l: -            if it.url.path == '/headers/duplicate': -                assert not it.failed -                dup = list (filter (lambda x: x[0] == 'Duplicate', it.responseHeaders)) -                assert len(dup) == 2 -                assert list(sorted(map(itemgetter(1), dup))) == ['1', '2'] -                break - -@pytest.mark.asyncio -async def test_headers_req (loader): -    """ -    Custom request headers. JavaScript’s Headers() does not support duplicate -    headers, so we can’t generate those. -    """ -    async with loader ('/headers/fetch/html') as l: -        await l.start () -        async for it in l: -            if it.url.path == '/headers/fetch/req': -                assert not it.failed -                dup = list (filter (lambda x: x[0] == 'custom', it.requestHeaders)) -                assert len(dup) == 1 -                assert list(sorted(map(itemgetter(1), dup))) == ['1'] -                break - -@pytest.mark.asyncio -async def test_redirect (loader): -    await literalItem (loader, testItemMap['/redirect/301/empty'], [testItemMap['/empty']]) -    # chained redirects -    await literalItem (loader, testItemMap['/redirect/301/redirect/301/empty'], [testItemMap['/redirect/301/empty'], testItemMap['/empty']]) - -@pytest.mark.asyncio -async def test_encoding (loader): -    """ Text responses are transformed to UTF-8. Make sure this works -    correctly. """ -    for item in {testItemMap['/encoding/utf8'], testItemMap['/encoding/latin1'], testItemMap['/encoding/iso88591']}: -        await literalItem (loader, item) - -@pytest.mark.asyncio -async def test_binary (loader): -    """ Browser should ignore content it cannot display (i.e. octet-stream) """ -    await literalItem (loader, testItemMap['/binary']) - -@pytest.mark.asyncio -async def test_image (loader): -    """ Images should be displayed inline """ -    await literalItem (loader, testItemMap['/image']) - -@pytest.mark.asyncio -async def test_attachment (loader): -    """ And downloads won’t work in headless mode, even if it’s just a text file """ -    await literalItem (loader, testItemMap['/attachment']) - -@pytest.mark.asyncio -async def test_html (loader): -    await literalItem (loader, testItemMap['/html'], [testItemMap['/image'], testItemMap['/nonexistent']]) -    # make sure alerts are dismissed correctly (image won’t load otherwise) -    await literalItem (loader, testItemMap['/html/alert'], [testItemMap['/image']]) - -@pytest.mark.asyncio -async def test_post (loader): -    """ XHR POST request with binary data""" -    await literalItem (loader, testItemMap['/html/fetchPost'], -            [testItemMap['/html/fetchPost/binary'], -            testItemMap['/html/fetchPost/binary/large'], -            testItemMap['/html/fetchPost/form'], -            testItemMap['/html/fetchPost/form/large']]) +async def loader (logger): +    async with Process () as browser, SiteLoader (browser, logger) as l: +        yield l  @pytest.mark.asyncio  async def test_crash (loader): -    async with loader ('/html') as l: -        await l.start () -        with pytest.raises (Crashed): -            await l.tab.Page.crash () +    with pytest.raises (Crashed): +        await loader.tab.Page.crash ()  @pytest.mark.asyncio  async def test_invalidurl (loader): @@ -267,15 +72,16 @@ async def test_invalidurl (loader):      try:          resolved = await loop.getaddrinfo (host, None)      except socket.gaierror: -        async with loader (f'http://{host}/') as l: -            await l.start () -            async for it in l: -                assert it.failed -                break +        url = URL.build (scheme='http', host=host) +        await loader.navigate (url) +        async for it in loader: +            assert it.request is not None +            assert it.url == url +            assert it.response is None +            break      else:          pytest.skip (f'host {host} resolved to {resolved}') -  @pytest.mark.asyncio  async def test_varchangeevent ():      e = VarChangeEvent (True) @@ -299,3 +105,290 @@ async def test_varchangeevent ():      assert ret == False      assert e.get () == ret +timestamp = st.one_of ( +                st.integers(min_value=0, max_value=2**32-1), +                st.floats (min_value=0, max_value=2**32-1), +                ) + +@given(timestamp, timestamp, timestamp) +def test_referencetimestamp (relativeA, absoluteA, relativeB): +    ts = ReferenceTimestamp (relativeA, absoluteA) +    absoluteA = datetime.utcfromtimestamp (absoluteA) +    absoluteB = ts (relativeB) +    assert (absoluteA < absoluteB and relativeA < relativeB) or \ +            (absoluteA >= absoluteB and relativeA >= relativeB) +    assert abs ((absoluteB - absoluteA).total_seconds () - (relativeB - relativeA)) < 10e-6 + +def hostname (): +    # XXX: find a better way to generate hostnames +    return st.text (alphabet=st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789-'), min_size=1, max_size=253) + +def urls (): +    """ Build http/https URL """ +    scheme = st.sampled_from (['http', 'https']) +    # Path must start with a slash +    pathSt = st.builds (lambda x: '/' + x, st.text ()) +    args = st.fixed_dictionaries ({ +            'scheme': scheme, +            'host': hostname (), +            'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)), +            'path': pathSt, +            'query_string': st.text (), +            'fragment': st.text (), +            }) +    return st.builds (lambda x: URL.build (**x), args) + +def urlsStr (): +    return st.builds (lambda x: str (x), urls ()) + +asciiText = st.text (st.characters (min_codepoint=32, max_codepoint=126)) + +def chromeHeaders (): +    # token as defined by https://tools.ietf.org/html/rfc7230#section-3.2.6 +    token = st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789!#$%&\'*+-.^_`|~') +    # XXX: the value should be asciiText without leading/trailing spaces +    return st.dictionaries (token, token) + +def fixedDicts (fixed, dynamic): +    return st.builds (lambda x, y: x.update (y), st.fixed_dictionaries (fixed), st.lists (dynamic)) + +def chromeRequestWillBeSent (reqid, url): +    methodSt = st.sampled_from (['GET', 'POST', 'PUT', 'DELETE']) +    return st.fixed_dictionaries ({ +            'requestId': reqid, +            'initiator': st.just ('Test'), +            'wallTime': timestamp, +            'timestamp': timestamp, +            'request': st.fixed_dictionaries ({ +                'url': url, +                'method': methodSt, +                'headers': chromeHeaders (), +                # XXX: postData, hasPostData +                }) +            }) + +def chromeResponseReceived (reqid, url): +    mimeTypeSt = st.one_of (st.none (), st.just ('text/html')) +    remoteIpAddressSt = st.one_of (st.none (), st.just ('127.0.0.1')) +    protocolSt = st.one_of (st.none (), st.just ('h2')) +    statusCodeSt = st.integers (min_value=100, max_value=999) +    typeSt = st.sampled_from (['Document', 'Stylesheet', 'Image', 'Media', +            'Font', 'Script', 'TextTrack', 'XHR', 'Fetch', 'EventSource', +            'WebSocket', 'Manifest', 'SignedExchange', 'Ping', +            'CSPViolationReport', 'Other']) +    return st.fixed_dictionaries ({ +            'requestId': reqid, +            'timestamp': timestamp, +            'type': typeSt, +            'response': st.fixed_dictionaries ({ +                'url': url, +                'requestHeaders': chromeHeaders (), # XXX: make this optional +                'headers': chromeHeaders (), +                'status': statusCodeSt, +                'statusText': asciiText, +                'mimeType': mimeTypeSt, +                'remoteIPAddress': remoteIpAddressSt, +                'protocol': protocolSt, +                }) +            }) + +def chromeReqResp (): +    # XXX: will this gnerated the same url for all testcases? +    reqid = st.shared (st.text (), 'reqresp') +    url = st.shared (urlsStr (), 'reqresp') +    return st.tuples (chromeRequestWillBeSent (reqid, url), +            chromeResponseReceived (reqid, url)) + +def requestResponsePair (): +    def f (creq, cresp, hasPostData, reqBody, respBody): +        i = RequestResponsePair () +        i.fromRequestWillBeSent (creq) +        i.request.hasPostData = hasPostData +        if hasPostData: +            i.request.body = reqBody + +        if cresp is not None: +            i.fromResponseReceived (cresp) +            if respBody is not None: +                i.response.body = respBody +        return i + +    bodySt = st.one_of ( +            st.none (), +            st.builds (UnicodeBody, st.text ()), +            st.builds (Base64Body.fromBytes, st.binary ()) +            ) +    return st.builds (lambda reqresp, hasPostData, reqBody, respBody: +            f (reqresp[0], reqresp[1], hasPostData, reqBody, respBody), +            chromeReqResp (), st.booleans (), bodySt, bodySt) + +@given(chromeReqResp ()) +def test_requestResponsePair (creqresp): +    creq, cresp = creqresp + +    item = RequestResponsePair () + +    assert item.id is None +    assert item.url is None +    assert item.request is None +    assert item.response is None + +    item.fromRequestWillBeSent (creq) + +    assert item.id == creq['requestId'] +    url = URL (creq['request']['url']) +    assert item.url == url +    assert item.request is not None +    assert item.request.timestamp == datetime.utcfromtimestamp (creq['wallTime']) +    assert set (item.request.headers.keys ()) == set (creq['request']['headers'].keys ()) +    assert item.response is None + +    item.fromResponseReceived (cresp) + +    # url will not be overwritten +    assert item.id == creq['requestId'] == cresp['requestId'] +    assert item.url == url +    assert item.request is not None +    assert set (item.request.headers.keys ()) == set (cresp['response']['requestHeaders'].keys ()) +    assert item.response is not None +    assert set (item.response.headers.keys ()) == set (cresp['response']['headers'].keys ()) +    assert (item.response.timestamp - item.request.timestamp).total_seconds () - \ +            (cresp['timestamp'] - creq['timestamp']) < 10e-6 + +@given(chromeReqResp ()) +def test_requestResponsePair_eq (creqresp): +    creq, cresp = creqresp + +    item = RequestResponsePair () +    item2 = RequestResponsePair () +    assert item == item +    assert item == item2 + +    item.fromRequestWillBeSent (creq) +    assert item != item2 +    item2.fromRequestWillBeSent (creq) +    assert item == item +    assert item == item2 + +    item.fromResponseReceived (cresp) +    assert item != item2 +    item2.fromResponseReceived (cresp) +    assert item == item +    assert item == item2 + +    # XXX: test for inequality with different parameters + +### Google Chrome integration tests ### + +serverUrl = URL.build (scheme='http', host='localhost', port=8080) +items = [ +    RequestResponsePair ( +        url=serverUrl.with_path ('/encoding/utf-8'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]), +            body=UnicodeBody ('äöü'), mimeType='text/html') +        ), +    RequestResponsePair ( +        url=serverUrl.with_path ('/encoding/latin1'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=latin1')]), +            body=UnicodeBody ('äöü'), mimeType='text/html') +        ), +    RequestResponsePair ( +        url=serverUrl.with_path ('/encoding/utf-16'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-16')]), +            body=UnicodeBody ('äöü'), mimeType='text/html') +        ), +    RequestResponsePair ( +        url=serverUrl.with_path ('/encoding/ISO-8859-1'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=ISO-8859-1')]), +            body=UnicodeBody ('äöü'), mimeType='text/html') +        ), +    RequestResponsePair ( +        url=serverUrl.with_path ('/status/200'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/plain')]), +            body=b'', +            mimeType='text/plain'), +        ), +    # redirects never have a response body +    RequestResponsePair ( +        url=serverUrl.with_path ('/status/301'), +        request=Request (method='GET'), +        response=Response (status=301, +            headers=CIMultiDict ([('Content-Type', 'text/plain'), +                ('Location', str (serverUrl.with_path ('/status/301/redirected')))]), +            body=None, +            mimeType='text/plain'), +        ), +    RequestResponsePair ( +        url=serverUrl.with_path ('/image/png'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'image/png')]), +            body=Base64Body.fromBytes (b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'), +            mimeType='image/png'), +        ), +    RequestResponsePair ( +        url=serverUrl.with_path ('/script/alert'), +        request=Request (method='GET'), +        response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]), +            body=UnicodeBody ('''<html><body><script> +window.addEventListener("beforeunload", function (e) { +    e.returnValue = "bye?"; +    return e.returnValue; +}); +alert("stopping here"); +if (confirm("are you sure?") || prompt ("42?")) { +    window.location = "/nonexistent"; +} +</script></body></html>'''), mimeType='text/html') +        ), +    ] + +@pytest.mark.asyncio +# would be nice if we could use hypothesis here somehow +@pytest.mark.parametrize("golden", items) +async def test_integration_item (loader, golden): +    async def f (req): +        body = golden.response.body +        contentType = golden.response.headers.get ('content-type', '') if golden.response.headers is not None else '' +        charsetOff = contentType.find ('charset=') +        if isinstance (body, UnicodeBody) and charsetOff != -1: +            encoding = contentType[charsetOff+len ('charset='):] +            body = golden.response.body.decode ('utf-8').encode (encoding) +        return web.Response (body=body, status=golden.response.status, +                headers=golden.response.headers) + +    app = web.Application () +    app.router.add_route (golden.request.method, golden.url.path, f) +    runner = web.AppRunner(app) +    await runner.setup() +    site = web.TCPSite(runner, serverUrl.host, serverUrl.port) +    await site.start() + +    try: +        await loader.navigate (golden.url) + +        it = loader.__aiter__ () +        item = await it.__anext__ () + +        # we do not know this in advance +        item.request.initiator = None +        item.request.headers = None +        item.remoteIpAddress = None +        item.protocol = None +        item.resourceType = None + +        if item.response: +            assert item.response.statusText is not None +            item.response.statusText = None + +            del item.response.headers['server'] +            del item.response.headers['content-length'] +            del item.response.headers['date'] +        assert item == golden +    finally: +        await runner.cleanup () + diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py index 7f2635b..954e8c8 100644 --- a/crocoite/test_warc.py +++ b/crocoite/test_warc.py @@ -24,6 +24,7 @@ from operator import itemgetter  from warcio.archiveiterator import ArchiveIterator  from yarl import URL +from multidict import CIMultiDict  from hypothesis import given, reproduce_failure  import hypothesis.strategies as st  import pytest @@ -32,7 +33,8 @@ from .warc import WarcHandler  from .logger import Logger, WarcHandlerConsumer  from .controller import ControllerStart  from .behavior import Script, ScreenshotEvent, DomSnapshotEvent -from .browser import Item +from .browser import RequestResponsePair, Base64Body, UnicodeBody +from .test_browser import requestResponsePair, urls  def test_log ():      logger = Logger () @@ -66,50 +68,6 @@ def test_log ():                  data = json.loads (l.strip ())                  assert data == golden.pop (0) -def hostname (): -    # XXX: find a better way to generate hostnames -    return st.text (alphabet=st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789-'), min_size=1, max_size=253) - -def urls (): -    """ Build http/https URL """ -    scheme = st.one_of (st.just ('http'), st.just ('https')) -    # Path must start with a slash -    pathSt = st.builds (lambda x: '/' + x, st.text ()) -    args = st.fixed_dictionaries ({ -            'scheme': scheme, -            'host': hostname (), -            'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)), -            'path': pathSt, -            'query_string': st.text (), -            'fragment': st.text (), -            }) -    return st.builds (lambda x: URL.build (**x), args) - -def item (): -    def f (url, requestBody, body, mimeType): -        i = Item () -        # XXX: we really need some level of abstraction. Testing is a nightmare. -        i.setRequest ({'requestId': 'myid', 'initiator': 'Test', 'wallTime': 0, 'timestamp': 1, 'request': {'url': str (url), 'method': 'GET', 'headers': {'None': 'None'}}}) -        i.setResponse ({'requestId': 'myid', 'timestamp': 2, 'type': 'Document', 'response': {'url': str (url), 'requestHeaders': {'foo': 'bar', 'Set-Cookie': 'line1\nline2'}, 'headers': {'Response': 'Headers', 'Content-Length': '12345'}, 'status': 200}}) -        if mimeType is not None: -            i.chromeResponse['response']['mimeType'] = 'text/html' -        i.requestBody = requestBody -        i.body = body -        return i - -    def failedItem (url): -        i = Item () -        i.setRequest ({'requestId': 'myid', 'initiator': 'Test', 'wallTime': 0, 'timestamp': 1, 'request': {'url': str (url), 'method': 'GET', 'headers': {'None': 'None'}}}) -        i.failed = True -        return i - -    bodySt = st.one_of (st.none (), st.tuples (st.one_of (st.none (), st.binary ()), st.booleans ())) -    mimeTypeSt = st.one_of (st.none (), st.just ('text/html')) -    return st.one_of ( -            st.builds (failedItem, urls ()), -            st.builds (f, urls (), bodySt, bodySt, mimeTypeSt), -            ) -  def jsonObject ():      """ JSON-encodable objects """      return st.dictionaries (st.text (), st.one_of (st.integers (), st.text ())) @@ -123,7 +81,7 @@ def event ():              st.builds (Script.fromStr, st.text (), st.one_of(st.none (), st.text ())),              st.builds (ScreenshotEvent, urls (), st.integers (), st.binary ()),              st.builds (DomSnapshotEvent, urls (), st.builds (lambda x: x.encode ('utf-8'), st.text ()), viewport()), -            item (), +            requestResponsePair (),              )  @given (st.lists (event ())) @@ -136,7 +94,7 @@ def test_push (golden):      # null logger      logger = Logger () -    with NamedTemporaryFile() as fd: +    with open('/tmp/test.warc.gz', 'w+b') as fd:          with WarcHandler (fd, logger) as handler:              for g in golden:                  handler.push (g) @@ -191,10 +149,7 @@ def test_push (golden):                  assert headers['X-DOM-Snapshot'] == 'True'                  assert rec.raw_stream.read () == g.document -            elif isinstance (g, Item): -                if g.failed: -                    continue - +            elif isinstance (g, RequestResponsePair):                  rec = next (it)                  # request @@ -204,54 +159,56 @@ def test_push (golden):                  assert URL (headers['warc-target-uri']) == g.url                  assert headers['x-chrome-request-id'] == g.id -                assert sorted (rec.http_headers.headers, key=itemgetter (0)) == sorted (g.requestHeaders, key=itemgetter (0)) -                if g.requestBody: -                    if g.requestBody[0] is None: -                        assert not rec.raw_stream.read () +                assert CIMultiDict (rec.http_headers.headers) == g.request.headers +                if g.request.hasPostData: +                    if g.request.body is not None: +                        assert rec.raw_stream.read () == g.request.body +                        assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.request.body, Base64Body)), (headers['x-chrome-base64body'], g.request.body)                      else: -                        assert rec.raw_stream.read () == g.requestBody[0], g.requestBody -                        assert str (headers['x-chrome-base64body'] or False) == str (g.requestBody[1]), (headers['x-chrome-base64body'], g.requestBody) +                        # body fetch failed +                        assert headers['warc-truncated'] == 'unspecified' +                        assert not rec.raw_stream.read ()                  else: -                    # body fetch failed -                    assert headers['warc-truncated'] == 'unspecified' +                    assert not rec.raw_stream.read ()                  # response -                rec = next (it) -                headers = rec.rec_headers -                httpheaders = rec.http_headers -                assert headers['warc-type'] == 'response' -                checkWarcinfoId (headers) -                assert URL (headers['warc-target-uri']) == g.url -                assert headers['x-chrome-request-id'] == g.id - -                # these are checked separately -                blacklistedHeaders = {'content-type', 'content-length'} -                sortedHeaders = lambda l: sorted (filter (lambda x: x[0].lower() not in blacklistedHeaders, l), key=itemgetter (0)) -                assert sortedHeaders (httpheaders.headers) == sortedHeaders (g.responseHeaders) - -                expectedContentType = g.response.get ('mimeType') -                if expectedContentType is not None: -                    assert httpheaders['content-type'].startswith (expectedContentType) - -                if g.body: -                    if g.body[0] is None: -                        assert not rec.raw_stream.read () -                        #assert httpheaders['content-length'] == '0' +                if g.response: +                    rec = next (it) +                    headers = rec.rec_headers +                    httpheaders = rec.http_headers +                    assert headers['warc-type'] == 'response' +                    checkWarcinfoId (headers) +                    assert URL (headers['warc-target-uri']) == g.url +                    assert headers['x-chrome-request-id'] == g.id + +                    # these are checked separately +                    filteredHeaders = CIMultiDict (httpheaders.headers) +                    for b in {'content-type', 'content-length'}: +                        if b in g.response.headers: +                            g.response.headers.popall (b) +                        if b in filteredHeaders: +                            filteredHeaders.popall (b) +                    assert filteredHeaders == g.response.headers + +                    expectedContentType = g.response.mimeType +                    if expectedContentType is not None: +                        assert httpheaders['content-type'].startswith (expectedContentType) + +                    if g.response.body is not None: +                        assert rec.raw_stream.read () == g.response.body +                        assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.response.body, Base64Body)) +                        assert httpheaders['content-length'] == str (len (g.response.body)) +                        # body is never truncated if it exists +                        assert headers['warc-truncated'] is None + +                        # unencoded strings are converted to utf8 +                        if isinstance (g.response.body, UnicodeBody) and httpheaders['content-type'] is not None: +                            assert httpheaders['content-type'].endswith ('; charset=utf-8')                      else: -                        assert rec.raw_stream.read () == g.body[0] -                        assert str (headers['x-chrome-base64body'] or False) == str (g.body[1]) -                        assert httpheaders['content-length'] == str (len (g.body[0])) - -                    # body is never truncated if it exists -                    assert headers['warc-truncated'] is None - -                    # unencoded strings are converted to utf8 -                    if not g.body[1] and httpheaders['content-type'] is not None: -                        assert httpheaders['content-type'].endswith ('; charset=utf-8') -                else: -                    # body fetch failed -                    assert headers['warc-truncated'] == 'unspecified' -                    # content-length header should be kept intact +                        # body fetch failed +                        assert headers['warc-truncated'] == 'unspecified' +                        assert not rec.raw_stream.read () +                        # content-length header should be kept intact              else:                  assert False, f"invalid golden type {type(g)}" # pragma: no cover diff --git a/crocoite/warc.py b/crocoite/warc.py index dbd9ebc..cb1f2f7 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -25,6 +25,7 @@ Classes writing data to WARC files  import json, threading  from io import BytesIO  from datetime import datetime +from http.server import BaseHTTPRequestHandler  from warcio.timeutils import datetime_to_iso_date  from warcio.warcwriter import WARCWriter @@ -33,7 +34,7 @@ from warcio.statusandheaders import StatusAndHeaders  from .util import packageUrl, StrJsonEncoder  from .controller import EventHandler, ControllerStart  from .behavior import Script, DomSnapshotEvent, ScreenshotEvent -from .browser import Item +from .browser import RequestResponsePair, UnicodeBody, Base64Body  class WarcHandler (EventHandler):      __slots__ = ('logger', 'writer', 'documentRecords', 'log', @@ -86,66 +87,51 @@ class WarcHandler (EventHandler):          url = item.url          path = url.relative().with_fragment(None) -        httpHeaders = StatusAndHeaders(f'{req["method"]} {path} HTTP/1.1', -                item.requestHeaders, protocol='HTTP/1.1', is_http_request=True) -        initiator = item.initiator +        httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1', +                req.headers, protocol='HTTP/1.1', is_http_request=True)          warcHeaders = { -                'X-Chrome-Initiator': json.dumps (initiator), +                'X-Chrome-Initiator': json.dumps (req.initiator),                  'X-Chrome-Request-ID': item.id, -                'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])), +                'WARC-Date': datetime_to_iso_date (req.timestamp),                  } -        if item.requestBody is not None: -            payload, payloadBase64Encoded = item.requestBody -        else: +        body = item.request.body +        if item.request.hasPostData and body is None:              # oops, don’t know what went wrong here -            logger.error ('requestBody missing', uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') +            logger.error ('requestBody missing', +                    uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')              warcHeaders['WARC-Truncated'] = 'unspecified' -            payload = None - -        if payload is not None: -            payload = BytesIO (payload) -            warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded) +        else: +            warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body) +            body = BytesIO (body)          record = self.writeRecord (url, 'request', -                payload=payload, http_headers=httpHeaders, +                payload=body, http_headers=httpHeaders,                  warc_headers_dict=warcHeaders)          return record.rec_headers['WARC-Record-ID']      def _writeResponse (self, item, concurrentTo):          # fetch the body          reqId = item.id -        rawBody = None -        base64Encoded = False -        bodyTruncated = None -        if item.isRedirect or item.body is None: -            # redirects reuse the same request, thus we cannot safely retrieve -            # the body (i.e getResponseBody may return the new location’s -            # body). No body available means we failed to retrieve it. -            bodyTruncated = 'unspecified' -        else: -            rawBody, base64Encoded = item.body          # now the response          resp = item.response          warcHeaders = {                  'WARC-Concurrent-To': concurrentTo, -                'WARC-IP-Address': resp.get ('remoteIPAddress', ''), -                'X-Chrome-Protocol': resp.get ('protocol', ''), -                'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), -                'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),                  'X-Chrome-Request-ID': item.id, -                'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp ( -                        item.chromeRequest['wallTime']+ -                        (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))), +                'WARC-Date': datetime_to_iso_date (resp.timestamp),                  } -        if bodyTruncated: -            warcHeaders['WARC-Truncated'] = bodyTruncated -        else: -            warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded) - -        httpHeaders = StatusAndHeaders(f'{resp["status"]} {item.statusText}', -                item.responseHeaders, -                protocol='HTTP/1.1') +        # conditional WARC headers +        if item.remoteIpAddress: +            warcHeaders['WARC-IP-Address'] = item.remoteIpAddress +        if item.protocol: +            warcHeaders['X-Chrome-Protocol'] = item.protocol + +        # HTTP headers +        statusText = resp.statusText or \ +                BaseHTTPRequestHandler.responses.get ( +                resp.status, ('No status text available', ))[0] +        httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}', +                resp.headers, protocol='HTTP/1.1')          # Content is saved decompressed and decoded, remove these headers          blacklistedHeaders = {'transfer-encoding', 'content-encoding'} @@ -155,20 +141,23 @@ class WarcHandler (EventHandler):          # chrome sends nothing but utf8 encoded text. Fortunately HTTP          # headers take precedence over the document’s <meta>, thus we can          # easily override those. -        contentType = resp.get ('mimeType') +        contentType = resp.mimeType          if contentType: -            if not base64Encoded: +            if isinstance (resp.body, UnicodeBody):                  contentType += '; charset=utf-8'              httpHeaders.replace_header ('Content-Type', contentType) -        if rawBody is not None: -            httpHeaders.replace_header ('Content-Length', str (len (rawBody))) -            bodyIo = BytesIO (rawBody) +        # response body +        body = resp.body +        if body is None: +            warcHeaders['WARC-Truncated'] = 'unspecified'          else: -            bodyIo = BytesIO () +            httpHeaders.replace_header ('Content-Length', str (len (body))) +            warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body) +            body = BytesIO (body)          record = self.writeRecord (item.url, 'response', -                warc_headers_dict=warcHeaders, payload=bodyIo, +                warc_headers_dict=warcHeaders, payload=body,                  http_headers=httpHeaders)          if item.resourceType == 'Document': @@ -184,12 +173,11 @@ class WarcHandler (EventHandler):                  f'application/javascript; charset={encoding}'})      def _writeItem (self, item): -        if item.failed: -            # should have been handled by the logger already -            return - +        assert item.request          concurrentTo = self._writeRequest (item) -        self._writeResponse (item, concurrentTo) +        # items that failed loading don’t have a response +        if item.response: +            self._writeResponse (item, concurrentTo)      def _addRefersTo (self, headers, url):          refersTo = self.documentRecords.get (url) @@ -247,7 +235,7 @@ class WarcHandler (EventHandler):              self._flushLogEntries ()      route = {Script: _writeScript, -            Item: _writeItem, +            RequestResponsePair: _writeItem,              DomSnapshotEvent: _writeDomSnapshot,              ScreenshotEvent: _writeScreenshot,              ControllerStart: _writeControllerStart, @@ -18,6 +18,7 @@ setup(          'aiohttp',          'PyYAML',          'yarl', +        'multidict',      ],      entry_points={      'console_scripts': [ | 
