diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2019-01-03 19:34:17 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2019-01-03 19:37:27 +0100 |
commit | 9d7974e3e7e8a4575ea61cb33a30fa291d12ae38 (patch) | |
tree | 5311396c0d74eaa35e1eff1e1641c0bd157cde25 | |
parent | ad4e119bf1a55c84dc7c6260588ed7db9e7199c6 (diff) | |
download | crocoite-9d7974e3e7e8a4575ea61cb33a30fa291d12ae38.tar.gz crocoite-9d7974e3e7e8a4575ea61cb33a30fa291d12ae38.tar.bz2 crocoite-9d7974e3e7e8a4575ea61cb33a30fa291d12ae38.zip |
browser: Turn Item into RequestResponsePair
Previously Item was just a simple wrapper around Chrome’s Network.*
events. This turned out to be quite nasty when testing, so its
replacement, RequestResponsePair, does some level of abstraction. This
makes testing a lot easier, since we now can simply instantiate it
without building a proper DevTools event.
Should come without any functional changes.
-rw-r--r-- | README.rst | 2 | ||||
-rw-r--r-- | crocoite/browser.py | 326 | ||||
-rw-r--r-- | crocoite/controller.py | 12 | ||||
-rw-r--r-- | crocoite/logger.py | 2 | ||||
-rw-r--r-- | crocoite/test_browser.py | 531 | ||||
-rw-r--r-- | crocoite/test_warc.py | 145 | ||||
-rw-r--r-- | crocoite/warc.py | 96 | ||||
-rw-r--r-- | setup.py | 1 |
8 files changed, 630 insertions, 485 deletions
@@ -24,6 +24,7 @@ These dependencies must be present to run crocoite: - warcio_ - html5lib_ - yarl_ +- multidict_ - bottom_ (IRC client) - `Google Chrome`_ @@ -35,6 +36,7 @@ These dependencies must be present to run crocoite: .. _bottom: https://github.com/numberoverzero/bottom .. _Google Chrome: https://www.google.com/chrome/ .. _yarl: https://yarl.readthedocs.io/ +.. _multidict: https://multidict.readthedocs.io/ The following commands clone the repository from GitHub_, set up a virtual environment and install crocoite: diff --git a/crocoite/browser.py b/crocoite/browser.py index 3de61f0..50561ed 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -23,80 +23,197 @@ Chrome browser interactions. """ import asyncio -from base64 import b64decode +from base64 import b64decode, b64encode +from datetime import datetime, timedelta from http.server import BaseHTTPRequestHandler + from yarl import URL +from multidict import CIMultiDict from .logger import Level from .devtools import Browser, TabException -class Item: - """ - Simple wrapper containing Chrome request and response - """ +# These two classes’ only purpose is so we can later tell whether a body was +# base64-encoded or a unicode string +class Base64Body (bytes): + def __new__ (cls, value): + return bytes.__new__ (cls, b64decode (value)) + + @classmethod + def fromBytes (cls, b): + """ For testing """ + return cls (b64encode (b)) + +class UnicodeBody (bytes): + def __new__ (cls, value): + if type (value) is not str: + raise TypeError ('expecting unicode string') + + return bytes.__new__ (cls, value.encode ('utf-8')) - __slots__ = ('chromeRequest', 'chromeResponse', 'chromeFinished', - 'isRedirect', 'failed', 'body', 'requestBody') +class Request: + __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp') - def __init__ (self): - self.chromeRequest = {} - self.chromeResponse = {} - self.chromeFinished = {} - self.isRedirect = False - self.failed = False - self.body = None - 
self.requestBody = None + def __init__ (self, method=None, headers=None, body=None): + self.headers = headers + self.body = body + self.hasPostData = False + self.initiator = None + # HTTP method + self.method = method + self.timestamp = None def __repr__ (self): - return f'<Item {self.url}>' - - @property - def request (self): - return self.chromeRequest.get ('request', {}) - - @property - def response (self): - return self.chromeResponse.get ('response', {}) - - @property - def initiator (self): - return self.chromeRequest['initiator'] - - @property - def id (self): - return self.chromeRequest['requestId'] - - @property - def encodedDataLength (self): - return self.chromeFinished['encodedDataLength'] - - @property - def url (self): - return URL (self.response.get ('url', self.request.get ('url'))) - - @property - def requestHeaders (self): - # the response object may contain refined headers, which were - # *actually* sent over the wire - return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers'])) - - @property - def responseHeaders (self): - return self._unfoldHeaders (self.response['headers']) - - @property - def statusText (self): - text = self.response.get ('statusText') - if text: - return text - text = BaseHTTPRequestHandler.responses.get (self.response['status']) - if text: - return text[0] - return 'No status text available' - - @property - def resourceType (self): - return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None)) + return f'Request({self.method!r}, {self.headers!r}, {self.body!r})' + + def __eq__ (self, b): + if b is None: + return False + + if not isinstance (b, Request): + raise TypeError ('Can only compare equality with Request.') + + # do not compare hasPostData (only required to fetch body) and + # timestamp (depends on time) + return self.headers == b.headers and \ + self.body == b.body and \ + self.initiator == b.initiator and \ + self.method == b.method + +class Response: + __slots__ = 
('status', 'statusText', 'headers', 'body', 'bytesReceived', + 'timestamp', 'mimeType') + + def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None): + self.status = status + self.statusText = statusText + self.headers = headers + self.body = body + # bytes received over the network (not body size!) + self.bytesReceived = 0 + self.timestamp = None + self.mimeType = mimeType + + def __repr__ (self): + return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})' + + def __eq__ (self, b): + if b is None: + return False + + if not isinstance (b, Response): + raise TypeError ('Can only compare equality with Response.') + + # do not compare bytesReceived (depends on network), timestamp + # (depends on time) and statusText (does not matter) + return self.status == b.status and \ + self.statusText == b.statusText and \ + self.headers == b.headers and \ + self.body == b.body and \ + self.mimeType == b.mimeType + +class ReferenceTimestamp: + """ Map relative timestamp to absolute timestamp """ + + def __init__ (self, relative, absolute): + self.relative = timedelta (seconds=relative) + self.absolute = datetime.utcfromtimestamp (absolute) + + def __call__ (self, relative): + if not isinstance (relative, timedelta): + relative = timedelta (seconds=relative) + return self.absolute + (relative-self.relative) + +class RequestResponsePair: + __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress', + 'protocol', 'resourceType', '_time') + + def __init__ (self, id=None, url=None, request=None, response=None): + self.request = request + self.response = response + self.id = id + self.url = url + self.remoteIpAddress = None + self.protocol = None + self.resourceType = None + self._time = None + + def __repr__ (self): + return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})' + + def __eq__ (self, b): + if not isinstance (b, RequestResponsePair): + raise 
TypeError (f'Can only compare with {self.__class__.__name__}') + + # do not compare id and _time. These depend on external factors and do + # not influence the request/response *content* + return self.request == b.request and \ + self.response == b.response and \ + self.url == b.url and \ + self.remoteIpAddress == b.remoteIpAddress and \ + self.protocol == b.protocol and \ + self.resourceType == b.resourceType + + def fromRequestWillBeSent (self, req): + """ Set request data from Chrome Network.requestWillBeSent event """ + r = req['request'] + + self.id = req['requestId'] + self.url = URL (r['url']) + self.resourceType = req.get ('type') + self._time = ReferenceTimestamp (req['timestamp'], req['wallTime']) + + assert self.request is None, req + self.request = Request () + self.request.initiator = req['initiator'] + self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) + self.request.hasPostData = r.get ('hasPostData', False) + self.request.method = r['method'] + self.request.timestamp = self._time (req['timestamp']) + if self.request.hasPostData: + postData = r.get ('postData') + if postData is not None: + self.request.body = UnicodeBody (postData) + + def fromResponse (self, r, timestamp=None, resourceType=None): + """ + Set response data from Chrome’s Response object. + + Request must exist. Updates if response was set before. Sometimes + fromResponseReceived is triggered twice by Chrome. No idea why. + """ + assert self.request is not None, (self.request, r) + + if not timestamp: + timestamp = self.request.timestamp + + self.remoteIpAddress = r.get ('remoteIPAddress') + self.protocol = r.get ('protocol') + if resourceType: + self.resourceType = resourceType + + # a response may contain updated request headers (i.e. 
those actually + # sent over the wire) + if 'requestHeaders' in r: + self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders'])) + + self.response = Response () + self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) + self.response.status = r['status'] + self.response.statusText = r['statusText'] + self.response.timestamp = timestamp + self.response.mimeType = r['mimeType'] + + def fromResponseReceived (self, resp): + """ Set response data from Chrome Network.responseReceived """ + return self.fromResponse (resp['response'], + self._time (resp['timestamp']), resp['type']) + + def fromLoadingFinished (self, data): + self.response.bytesReceived = data['encodedDataLength'] + + def fromLoadingFailed (self, data): + self.response = None @staticmethod def _unfoldHeaders (headers): @@ -110,44 +227,26 @@ class Item: items.append ((k, v)) return items - def setRequest (self, req): - self.chromeRequest = req - - def setResponse (self, resp): - self.chromeResponse = resp - - def setFinished (self, finished): - self.chromeFinished = finished - async def prefetchRequestBody (self, tab): - # request body - req = self.request - postData = req.get ('postData') - if postData: - self.requestBody = postData.encode ('utf8'), False - elif req.get ('hasPostData', False): + if self.request.hasPostData and self.request.body is None: try: postData = await tab.Network.getRequestPostData (requestId=self.id) - postData = postData['postData'] - self.requestBody = b64decode (postData), True + self.request.body = UnicodeBody (postData['postData']) except TabException: - self.requestBody = None + self.request.body = None else: - self.requestBody = None, False + self.request.body = None async def prefetchResponseBody (self, tab): - # get response body + """ Fetch response body """ try: body = await tab.Network.getResponseBody (requestId=self.id) - rawBody = body['body'] - base64Encoded = body['base64Encoded'] - if base64Encoded: - rawBody = b64decode 
(rawBody) + if body['base64Encoded']: + self.response.body = Base64Body (body['body']) else: - rawBody = rawBody.encode ('utf8') - self.body = rawBody, base64Encoded + self.response.body = UnicodeBody (body['body']) except TabException: - self.body = None + self.response.body = None class VarChangeEvent: """ Notify when variable is changed """ @@ -179,14 +278,14 @@ class SiteLoader: XXX: track popup windows/new tabs and close them """ - __slots__ = ('requests', 'browser', 'url', 'logger', 'tab', '_iterRunning', 'idle', '_framesLoading') + __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning', + 'idle', '_framesLoading') allowedSchemes = {'http', 'https'} - def __init__ (self, browser, url, logger): + def __init__ (self, browser, logger): self.requests = {} self.browser = Browser (url=browser) - self.url = url - self.logger = logger.bind (context=type (self).__name__, url=url) + self.logger = logger.bind (context=type (self).__name__) self._iterRunning = [] self.idle = VarChangeEvent (True) @@ -250,7 +349,7 @@ class SiteLoader: result = t.result () if result is None: pass - elif isinstance (result, Item): + elif isinstance (result, RequestResponsePair): yield result else: method, data = result @@ -263,8 +362,8 @@ class SiteLoader: running = pending self._iterRunning = running - async def start (self): - await self.tab.Page.navigate(url=self.url) + async def navigate (self, url): + await self.tab.Page.navigate(url=url) # internal chrome callbacks async def _requestWillBeSent (self, **kwargs): @@ -282,21 +381,24 @@ class SiteLoader: # redirects never “finish” loading, but yield another requestWillBeSent with this key set redirectResp = kwargs.get ('redirectResponse') if redirectResp: - # create fake responses - resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']} - item.setResponse (resp) - resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']} - item.setFinished (resp) - item.isRedirect = 
True + if item.url != url: + # this happens for unknown reasons. the docs simply state + # it can differ in case of a redirect. Fix it and move on. + logger.warning ('redirect url differs', + uuid='558a7df7-2258-4fe4-b16d-22b6019cc163', + expected=item.url) + redirectResp['url'] = str (item.url) + item.fromResponse (redirectResp) logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url) + # XXX: queue this? no need to wait for it await item.prefetchRequestBody (self.tab) - # cannot fetch request body due to race condition (item id reused) + # cannot fetch response body due to race condition (item id reused) ret = item else: logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b') - item = Item () - item.setRequest (kwargs) + item = RequestResponsePair () + item.fromRequestWillBeSent (kwargs) self.requests[reqId] = item logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f') @@ -315,7 +417,7 @@ class SiteLoader: logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url) if url.scheme in self.allowedSchemes: logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0') - item.setResponse (kwargs) + item.fromResponseReceived (kwargs) else: logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666') @@ -333,19 +435,21 @@ class SiteLoader: logger = self.logger.bind (reqId=reqId, reqUrl=item.url) if item.url.scheme in self.allowedSchemes: logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02') - item.setFinished (kwargs) + item.fromLoadingFinished (kwargs) + # XXX queue both await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab)) return item async def _loadingFailed (self, **kwargs): reqId = kwargs['requestId'] - self.logger.warning ('loading failed', + logger = self.logger.bind (reqId=reqId) + logger.warning ('loading failed', uuid='68410f13-6eea-453e-924e-c1af4601748b', 
errorText=kwargs['errorText'], blockedReason=kwargs.get ('blockedReason')) item = self.requests.pop (reqId, None) if item is not None: - item.failed = True + item.fromLoadingFailed (kwargs) return item async def _entryAdded (self, **kwargs): diff --git a/crocoite/controller.py b/crocoite/controller.py index 504fa23..a64a8dc 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -30,7 +30,7 @@ from operator import attrgetter from yarl import URL from . import behavior as cbehavior -from .browser import SiteLoader, Item +from .browser import SiteLoader, RequestResponsePair from .util import getFormattedViewportMetrics, getSoftwareInfo from .behavior import ExtractLinksEvent @@ -61,13 +61,13 @@ class StatsHandler (EventHandler): self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} def push (self, item): - if isinstance (item, Item): + if isinstance (item, RequestResponsePair): self.stats['requests'] += 1 - if item.failed: + if not item.response: self.stats['failed'] += 1 else: self.stats['finished'] += 1 - self.stats['bytesRcv'] += item.encodedDataLength + self.stats['bytesRcv'] += item.response.bytesReceived class LogHandler (EventHandler): """ Handle items by logging information about them """ @@ -126,7 +126,7 @@ class SinglePageController: async for item in l: self.processItem (item) - async with self.service as browser, SiteLoader (browser, self.url, logger=logger) as l: + async with self.service as browser, SiteLoader (browser, logger=logger) as l: handle = asyncio.ensure_future (processQueue ()) start = time.time () @@ -153,7 +153,7 @@ class SinglePageController: } self.processItem (ControllerStart (payload)) - await l.start () + await l.navigate (self.url) for b in enabledBehavior: async for item in b.onload (): self.processItem (item) diff --git a/crocoite/logger.py b/crocoite/logger.py index d882eaf..82d7f5b 100644 --- a/crocoite/logger.py +++ b/crocoite/logger.py @@ -105,7 +105,7 @@ class PrintConsumer (Consumer): return 
kwargs class JsonPrintConsumer (Consumer): - def __init__ (self, minLevel=Level.INFO): + def __init__ (self, minLevel=Level.DEBUG): self.minLevel = minLevel def __call__ (self, **kwargs): diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py index 8008855..4bf2c64 100644 --- a/crocoite/test_browser.py +++ b/crocoite/test_browser.py @@ -19,103 +19,28 @@ # THE SOFTWARE. import asyncio, socket -import pytest from operator import itemgetter -from aiohttp import web from http.server import BaseHTTPRequestHandler +from datetime import datetime + +from yarl import URL +from aiohttp import web +from multidict import CIMultiDict -from .browser import Item, SiteLoader, VarChangeEvent +from hypothesis import given +import hypothesis.strategies as st +import pytest + +from .browser import RequestResponsePair, SiteLoader, VarChangeEvent, Request, \ + UnicodeBody, ReferenceTimestamp, Base64Body, UnicodeBody, Request, \ + Response from .logger import Logger, Consumer from .devtools import Crashed, Process # if you want to know what’s going on: +#import logging #logging.basicConfig(level=logging.DEBUG) -class TItem (Item): - """ This should be as close to Item as possible """ - - __slots__ = ('bodySend', '_body', '_requestBody') - base = 'http://localhost:8000/' - - def __init__ (self, path, status, headers, bodyReceive, bodySend=None, requestBody=None, failed=False, isRedirect=False): - super ().__init__ () - self.chromeResponse = {'response': {'headers': headers, 'status': status, 'url': self.base + path}} - self.body = bodyReceive, False - self.bodySend = bodyReceive if not bodySend else bodySend - self.requestBody = requestBody, False - self.failed = failed - self.isRedirect = isRedirect - -testItems = [ - TItem ('binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00\x01\x02', failed=True), - TItem ('attachment', 200, - {'Content-Type': 'text/plain; charset=utf-8', - 'Content-Disposition': 'attachment; filename="attachment.txt"', - }, - 'This is a 
simple text file with umlauts. ÄÖU.'.encode ('utf8'), failed=True), - TItem ('encoding/utf8', 200, {'Content-Type': 'text/plain; charset=utf-8'}, - 'This is a test, äöü μνψκ ¥¥¥¿ýý¡'.encode ('utf8')), - TItem ('encoding/iso88591', 200, {'Content-Type': 'text/plain; charset=ISO-8859-1'}, - 'This is a test, äöü.'.encode ('utf8'), - 'This is a test, äöü.'.encode ('ISO-8859-1')), - TItem ('encoding/latin1', 200, {'Content-Type': 'text/plain; charset=latin1'}, - 'This is a test, äöü.'.encode ('utf8'), - 'This is a test, äöü.'.encode ('latin1')), - TItem ('image', 200, {'Content-Type': 'image/png'}, - # 1×1 png image - b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'), - TItem ('empty', 200, {'Content-Type': 'text/plain'}, b''), - TItem ('headers/duplicate', 200, [('Content-Type', 'text/plain'), ('Duplicate', '1'), ('Duplicate', '2')], b''), - TItem ('headers/fetch/req', 200, {'Content-Type': 'text/plain'}, b''), - TItem ('headers/fetch/html', 200, {'Content-Type': 'text/html'}, - r"""<html><body><script> - let h = new Headers([["custom", "1"]]); - fetch("/headers/fetch/req", {"method": "GET", "headers": h}).then(x => console.log("done")); - </script></body></html>""".encode ('utf8')), - TItem ('redirect/301/empty', 301, {'Location': '/empty'}, b'', isRedirect=True), - TItem ('redirect/301/redirect/301/empty', 301, {'Location': '/redirect/301/empty'}, b'', isRedirect=True), - TItem ('nonexistent', 404, {}, b''), - TItem ('html', 200, {'Content-Type': 'text/html'}, - '<html><body><img src="/image"><img src="/nonexistent"></body></html>'.encode ('utf8')), - TItem ('html/alert', 200, {'Content-Type': 'text/html'}, - '<html><body><script>window.addEventListener("beforeunload", function (e) { e.returnValue = "bye?"; return e.returnValue; }); alert("stopping here"); if (confirm("are you sure?") || prompt ("42?")) { window.location = 
"/nonexistent"; }</script><script>document.write(\'<img src="/image">\');</script></body></html>'.encode ('utf8')), - TItem ('html/fetchPost', 200, {'Content-Type': 'text/html'}, - r"""<html><body><script> - let a = fetch("/html/fetchPost/binary", {"method": "POST", "body": "\x00"}); - let b = fetch("/html/fetchPost/form", {"method": "POST", "body": new URLSearchParams({"data": "!"})}); - let c = fetch("/html/fetchPost/binary/large", {"method": "POST", "body": "\x00".repeat(100*1024)}); - let d = fetch("/html/fetchPost/form/large", {"method": "POST", "body": new URLSearchParams({"data": "!".repeat(100*1024)})}); - </script></body></html>""".encode ('utf8')), - TItem ('html/fetchPost/binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'\x00'), - TItem ('html/fetchPost/form', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=%21'), - # XXX: these should trigger the need for getRequestPostData, but they don’t. oh well. - TItem ('html/fetchPost/binary/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=(100*1024)*b'\x00'), - TItem ('html/fetchPost/form/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=' + (100*1024)*b'%21'), - ] -testItemMap = dict ([(item.url.path, item) for item in testItems]) - -def itemToResponse (item): - async def f (req): - headers = item.response['headers'] - return web.Response(body=item.bodySend, status=item.response['status'], - headers=headers) - return f - -@pytest.fixture -async def server (): - """ Simple HTTP server for testing notifications """ - import logging - logging.basicConfig(level=logging.DEBUG) - app = web.Application() - for item in testItems: - app.router.add_route ('*', item.url.path, itemToResponse (item)) - runner = web.AppRunner(app) - await runner.setup() - site = web.TCPSite(runner, 'localhost', 8080) - await site.start() - yield app - await runner.cleanup () - class AssertConsumer (Consumer): def 
__call__ (self, **kwargs): assert 'uuid' in kwargs @@ -128,134 +53,14 @@ def logger (): return Logger (consumer=[AssertConsumer ()]) @pytest.fixture -async def loader (server, logger): - def f (path): - if path.startswith ('/'): - path = 'http://localhost:8080{}'.format (path) - return SiteLoader (browser, path, logger) - async with Process () as browser: - yield f - -async def itemsLoaded (l, items): - items = dict ([(i.url.path, i) for i in items]) - async for item in l: - assert item.chromeResponse is not None - golden = items.pop (item.url.path) - if not golden: - assert False, f'url {item.url} not supposed to be fetched' - assert item.failed == golden.failed - if item.failed: - # response will be invalid if request failed - if not items: - break - else: - continue - assert item.isRedirect == golden.isRedirect - if golden.isRedirect: - assert item.body is None - else: - assert item.body[0] == golden.body[0] - assert item.requestBody[0] == golden.requestBody[0] - assert item.response['status'] == golden.response['status'] - assert item.statusText == BaseHTTPRequestHandler.responses.get (item.response['status'])[0] - for k, v in golden.responseHeaders: - actual = list (map (itemgetter (1), filter (lambda x: x[0] == k, item.responseHeaders))) - assert v in actual - - # we’re done when everything has been loaded - if not items: - break - -async def literalItem (lf, item, deps=[]): - async with lf (item.url.path) as l: - await l.start () - await asyncio.wait_for (itemsLoaded (l, [item] + deps), timeout=30) - -@pytest.mark.asyncio -async def test_empty (loader): - await literalItem (loader, testItemMap['/empty']) - -@pytest.mark.asyncio -async def test_headers_duplicate (loader): - """ - Some headers, like Set-Cookie can be present multiple times. Chrome - separates these with a newline. 
- """ - async with loader ('/headers/duplicate') as l: - await l.start () - async for it in l: - if it.url.path == '/headers/duplicate': - assert not it.failed - dup = list (filter (lambda x: x[0] == 'Duplicate', it.responseHeaders)) - assert len(dup) == 2 - assert list(sorted(map(itemgetter(1), dup))) == ['1', '2'] - break - -@pytest.mark.asyncio -async def test_headers_req (loader): - """ - Custom request headers. JavaScript’s Headers() does not support duplicate - headers, so we can’t generate those. - """ - async with loader ('/headers/fetch/html') as l: - await l.start () - async for it in l: - if it.url.path == '/headers/fetch/req': - assert not it.failed - dup = list (filter (lambda x: x[0] == 'custom', it.requestHeaders)) - assert len(dup) == 1 - assert list(sorted(map(itemgetter(1), dup))) == ['1'] - break - -@pytest.mark.asyncio -async def test_redirect (loader): - await literalItem (loader, testItemMap['/redirect/301/empty'], [testItemMap['/empty']]) - # chained redirects - await literalItem (loader, testItemMap['/redirect/301/redirect/301/empty'], [testItemMap['/redirect/301/empty'], testItemMap['/empty']]) - -@pytest.mark.asyncio -async def test_encoding (loader): - """ Text responses are transformed to UTF-8. Make sure this works - correctly. """ - for item in {testItemMap['/encoding/utf8'], testItemMap['/encoding/latin1'], testItemMap['/encoding/iso88591']}: - await literalItem (loader, item) - -@pytest.mark.asyncio -async def test_binary (loader): - """ Browser should ignore content it cannot display (i.e. 
octet-stream) """ - await literalItem (loader, testItemMap['/binary']) - -@pytest.mark.asyncio -async def test_image (loader): - """ Images should be displayed inline """ - await literalItem (loader, testItemMap['/image']) - -@pytest.mark.asyncio -async def test_attachment (loader): - """ And downloads won’t work in headless mode, even if it’s just a text file """ - await literalItem (loader, testItemMap['/attachment']) - -@pytest.mark.asyncio -async def test_html (loader): - await literalItem (loader, testItemMap['/html'], [testItemMap['/image'], testItemMap['/nonexistent']]) - # make sure alerts are dismissed correctly (image won’t load otherwise) - await literalItem (loader, testItemMap['/html/alert'], [testItemMap['/image']]) - -@pytest.mark.asyncio -async def test_post (loader): - """ XHR POST request with binary data""" - await literalItem (loader, testItemMap['/html/fetchPost'], - [testItemMap['/html/fetchPost/binary'], - testItemMap['/html/fetchPost/binary/large'], - testItemMap['/html/fetchPost/form'], - testItemMap['/html/fetchPost/form/large']]) +async def loader (logger): + async with Process () as browser, SiteLoader (browser, logger) as l: + yield l @pytest.mark.asyncio async def test_crash (loader): - async with loader ('/html') as l: - await l.start () - with pytest.raises (Crashed): - await l.tab.Page.crash () + with pytest.raises (Crashed): + await loader.tab.Page.crash () @pytest.mark.asyncio async def test_invalidurl (loader): @@ -267,15 +72,16 @@ async def test_invalidurl (loader): try: resolved = await loop.getaddrinfo (host, None) except socket.gaierror: - async with loader (f'http://{host}/') as l: - await l.start () - async for it in l: - assert it.failed - break + url = URL.build (scheme='http', host=host) + await loader.navigate (url) + async for it in loader: + assert it.request is not None + assert it.url == url + assert it.response is None + break else: pytest.skip (f'host {host} resolved to {resolved}') - @pytest.mark.asyncio async 
def test_varchangeevent (): e = VarChangeEvent (True) @@ -299,3 +105,290 @@ async def test_varchangeevent (): assert ret == False assert e.get () == ret +timestamp = st.one_of ( + st.integers(min_value=0, max_value=2**32-1), + st.floats (min_value=0, max_value=2**32-1), + ) + +@given(timestamp, timestamp, timestamp) +def test_referencetimestamp (relativeA, absoluteA, relativeB): + ts = ReferenceTimestamp (relativeA, absoluteA) + absoluteA = datetime.utcfromtimestamp (absoluteA) + absoluteB = ts (relativeB) + assert (absoluteA < absoluteB and relativeA < relativeB) or \ + (absoluteA >= absoluteB and relativeA >= relativeB) + assert abs ((absoluteB - absoluteA).total_seconds () - (relativeB - relativeA)) < 10e-6 + +def hostname (): + # XXX: find a better way to generate hostnames + return st.text (alphabet=st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789-'), min_size=1, max_size=253) + +def urls (): + """ Build http/https URL """ + scheme = st.sampled_from (['http', 'https']) + # Path must start with a slash + pathSt = st.builds (lambda x: '/' + x, st.text ()) + args = st.fixed_dictionaries ({ + 'scheme': scheme, + 'host': hostname (), + 'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)), + 'path': pathSt, + 'query_string': st.text (), + 'fragment': st.text (), + }) + return st.builds (lambda x: URL.build (**x), args) + +def urlsStr (): + return st.builds (lambda x: str (x), urls ()) + +asciiText = st.text (st.characters (min_codepoint=32, max_codepoint=126)) + +def chromeHeaders (): + # token as defined by https://tools.ietf.org/html/rfc7230#section-3.2.6 + token = st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789!#$%&\'*+-.^_`|~') + # XXX: the value should be asciiText without leading/trailing spaces + return st.dictionaries (token, token) + +def fixedDicts (fixed, dynamic): + return st.builds (lambda x, y: x.update (y), st.fixed_dictionaries (fixed), st.lists (dynamic)) + +def chromeRequestWillBeSent (reqid, url): + methodSt = 
st.sampled_from (['GET', 'POST', 'PUT', 'DELETE']) + return st.fixed_dictionaries ({ + 'requestId': reqid, + 'initiator': st.just ('Test'), + 'wallTime': timestamp, + 'timestamp': timestamp, + 'request': st.fixed_dictionaries ({ + 'url': url, + 'method': methodSt, + 'headers': chromeHeaders (), + # XXX: postData, hasPostData + }) + }) + +def chromeResponseReceived (reqid, url): + mimeTypeSt = st.one_of (st.none (), st.just ('text/html')) + remoteIpAddressSt = st.one_of (st.none (), st.just ('127.0.0.1')) + protocolSt = st.one_of (st.none (), st.just ('h2')) + statusCodeSt = st.integers (min_value=100, max_value=999) + typeSt = st.sampled_from (['Document', 'Stylesheet', 'Image', 'Media', + 'Font', 'Script', 'TextTrack', 'XHR', 'Fetch', 'EventSource', + 'WebSocket', 'Manifest', 'SignedExchange', 'Ping', + 'CSPViolationReport', 'Other']) + return st.fixed_dictionaries ({ + 'requestId': reqid, + 'timestamp': timestamp, + 'type': typeSt, + 'response': st.fixed_dictionaries ({ + 'url': url, + 'requestHeaders': chromeHeaders (), # XXX: make this optional + 'headers': chromeHeaders (), + 'status': statusCodeSt, + 'statusText': asciiText, + 'mimeType': mimeTypeSt, + 'remoteIPAddress': remoteIpAddressSt, + 'protocol': protocolSt, + }) + }) + +def chromeReqResp (): + # XXX: will this gnerated the same url for all testcases? 
+ reqid = st.shared (st.text (), 'reqresp') + url = st.shared (urlsStr (), 'reqresp') + return st.tuples (chromeRequestWillBeSent (reqid, url), + chromeResponseReceived (reqid, url)) + +def requestResponsePair (): + def f (creq, cresp, hasPostData, reqBody, respBody): + i = RequestResponsePair () + i.fromRequestWillBeSent (creq) + i.request.hasPostData = hasPostData + if hasPostData: + i.request.body = reqBody + + if cresp is not None: + i.fromResponseReceived (cresp) + if respBody is not None: + i.response.body = respBody + return i + + bodySt = st.one_of ( + st.none (), + st.builds (UnicodeBody, st.text ()), + st.builds (Base64Body.fromBytes, st.binary ()) + ) + return st.builds (lambda reqresp, hasPostData, reqBody, respBody: + f (reqresp[0], reqresp[1], hasPostData, reqBody, respBody), + chromeReqResp (), st.booleans (), bodySt, bodySt) + +@given(chromeReqResp ()) +def test_requestResponsePair (creqresp): + creq, cresp = creqresp + + item = RequestResponsePair () + + assert item.id is None + assert item.url is None + assert item.request is None + assert item.response is None + + item.fromRequestWillBeSent (creq) + + assert item.id == creq['requestId'] + url = URL (creq['request']['url']) + assert item.url == url + assert item.request is not None + assert item.request.timestamp == datetime.utcfromtimestamp (creq['wallTime']) + assert set (item.request.headers.keys ()) == set (creq['request']['headers'].keys ()) + assert item.response is None + + item.fromResponseReceived (cresp) + + # url will not be overwritten + assert item.id == creq['requestId'] == cresp['requestId'] + assert item.url == url + assert item.request is not None + assert set (item.request.headers.keys ()) == set (cresp['response']['requestHeaders'].keys ()) + assert item.response is not None + assert set (item.response.headers.keys ()) == set (cresp['response']['headers'].keys ()) + assert (item.response.timestamp - item.request.timestamp).total_seconds () - \ + (cresp['timestamp'] - 
creq['timestamp']) < 10e-6 + +@given(chromeReqResp ()) +def test_requestResponsePair_eq (creqresp): + creq, cresp = creqresp + + item = RequestResponsePair () + item2 = RequestResponsePair () + assert item == item + assert item == item2 + + item.fromRequestWillBeSent (creq) + assert item != item2 + item2.fromRequestWillBeSent (creq) + assert item == item + assert item == item2 + + item.fromResponseReceived (cresp) + assert item != item2 + item2.fromResponseReceived (cresp) + assert item == item + assert item == item2 + + # XXX: test for inequality with different parameters + +### Google Chrome integration tests ### + +serverUrl = URL.build (scheme='http', host='localhost', port=8080) +items = [ + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/utf-8'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/latin1'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=latin1')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/utf-16'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-16')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/ISO-8859-1'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=ISO-8859-1')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/status/200'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/plain')]), + body=b'', + mimeType='text/plain'), + 
), + # redirects never have a response body + RequestResponsePair ( + url=serverUrl.with_path ('/status/301'), + request=Request (method='GET'), + response=Response (status=301, + headers=CIMultiDict ([('Content-Type', 'text/plain'), + ('Location', str (serverUrl.with_path ('/status/301/redirected')))]), + body=None, + mimeType='text/plain'), + ), + RequestResponsePair ( + url=serverUrl.with_path ('/image/png'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'image/png')]), + body=Base64Body.fromBytes (b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'), + mimeType='image/png'), + ), + RequestResponsePair ( + url=serverUrl.with_path ('/script/alert'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]), + body=UnicodeBody ('''<html><body><script> +window.addEventListener("beforeunload", function (e) { + e.returnValue = "bye?"; + return e.returnValue; +}); +alert("stopping here"); +if (confirm("are you sure?") || prompt ("42?")) { + window.location = "/nonexistent"; +} +</script></body></html>'''), mimeType='text/html') + ), + ] + +@pytest.mark.asyncio +# would be nice if we could use hypothesis here somehow +@pytest.mark.parametrize("golden", items) +async def test_integration_item (loader, golden): + async def f (req): + body = golden.response.body + contentType = golden.response.headers.get ('content-type', '') if golden.response.headers is not None else '' + charsetOff = contentType.find ('charset=') + if isinstance (body, UnicodeBody) and charsetOff != -1: + encoding = contentType[charsetOff+len ('charset='):] + body = golden.response.body.decode ('utf-8').encode (encoding) + return web.Response (body=body, status=golden.response.status, + headers=golden.response.headers) + + app = 
web.Application () + app.router.add_route (golden.request.method, golden.url.path, f) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, serverUrl.host, serverUrl.port) + await site.start() + + try: + await loader.navigate (golden.url) + + it = loader.__aiter__ () + item = await it.__anext__ () + + # we do not know this in advance + item.request.initiator = None + item.request.headers = None + item.remoteIpAddress = None + item.protocol = None + item.resourceType = None + + if item.response: + assert item.response.statusText is not None + item.response.statusText = None + + del item.response.headers['server'] + del item.response.headers['content-length'] + del item.response.headers['date'] + assert item == golden + finally: + await runner.cleanup () + diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py index 7f2635b..954e8c8 100644 --- a/crocoite/test_warc.py +++ b/crocoite/test_warc.py @@ -24,6 +24,7 @@ from operator import itemgetter from warcio.archiveiterator import ArchiveIterator from yarl import URL +from multidict import CIMultiDict from hypothesis import given, reproduce_failure import hypothesis.strategies as st import pytest @@ -32,7 +33,8 @@ from .warc import WarcHandler from .logger import Logger, WarcHandlerConsumer from .controller import ControllerStart from .behavior import Script, ScreenshotEvent, DomSnapshotEvent -from .browser import Item +from .browser import RequestResponsePair, Base64Body, UnicodeBody +from .test_browser import requestResponsePair, urls def test_log (): logger = Logger () @@ -66,50 +68,6 @@ def test_log (): data = json.loads (l.strip ()) assert data == golden.pop (0) -def hostname (): - # XXX: find a better way to generate hostnames - return st.text (alphabet=st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789-'), min_size=1, max_size=253) - -def urls (): - """ Build http/https URL """ - scheme = st.one_of (st.just ('http'), st.just ('https')) - # Path must start with a slash - pathSt = 
st.builds (lambda x: '/' + x, st.text ()) - args = st.fixed_dictionaries ({ - 'scheme': scheme, - 'host': hostname (), - 'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)), - 'path': pathSt, - 'query_string': st.text (), - 'fragment': st.text (), - }) - return st.builds (lambda x: URL.build (**x), args) - -def item (): - def f (url, requestBody, body, mimeType): - i = Item () - # XXX: we really need some level of abstraction. Testing is a nightmare. - i.setRequest ({'requestId': 'myid', 'initiator': 'Test', 'wallTime': 0, 'timestamp': 1, 'request': {'url': str (url), 'method': 'GET', 'headers': {'None': 'None'}}}) - i.setResponse ({'requestId': 'myid', 'timestamp': 2, 'type': 'Document', 'response': {'url': str (url), 'requestHeaders': {'foo': 'bar', 'Set-Cookie': 'line1\nline2'}, 'headers': {'Response': 'Headers', 'Content-Length': '12345'}, 'status': 200}}) - if mimeType is not None: - i.chromeResponse['response']['mimeType'] = 'text/html' - i.requestBody = requestBody - i.body = body - return i - - def failedItem (url): - i = Item () - i.setRequest ({'requestId': 'myid', 'initiator': 'Test', 'wallTime': 0, 'timestamp': 1, 'request': {'url': str (url), 'method': 'GET', 'headers': {'None': 'None'}}}) - i.failed = True - return i - - bodySt = st.one_of (st.none (), st.tuples (st.one_of (st.none (), st.binary ()), st.booleans ())) - mimeTypeSt = st.one_of (st.none (), st.just ('text/html')) - return st.one_of ( - st.builds (failedItem, urls ()), - st.builds (f, urls (), bodySt, bodySt, mimeTypeSt), - ) - def jsonObject (): """ JSON-encodable objects """ return st.dictionaries (st.text (), st.one_of (st.integers (), st.text ())) @@ -123,7 +81,7 @@ def event (): st.builds (Script.fromStr, st.text (), st.one_of(st.none (), st.text ())), st.builds (ScreenshotEvent, urls (), st.integers (), st.binary ()), st.builds (DomSnapshotEvent, urls (), st.builds (lambda x: x.encode ('utf-8'), st.text ()), viewport()), - item (), + requestResponsePair (), 
) @given (st.lists (event ())) @@ -136,7 +94,7 @@ def test_push (golden): # null logger logger = Logger () - with NamedTemporaryFile() as fd: + with open('/tmp/test.warc.gz', 'w+b') as fd: with WarcHandler (fd, logger) as handler: for g in golden: handler.push (g) @@ -191,10 +149,7 @@ def test_push (golden): assert headers['X-DOM-Snapshot'] == 'True' assert rec.raw_stream.read () == g.document - elif isinstance (g, Item): - if g.failed: - continue - + elif isinstance (g, RequestResponsePair): rec = next (it) # request @@ -204,54 +159,56 @@ def test_push (golden): assert URL (headers['warc-target-uri']) == g.url assert headers['x-chrome-request-id'] == g.id - assert sorted (rec.http_headers.headers, key=itemgetter (0)) == sorted (g.requestHeaders, key=itemgetter (0)) - if g.requestBody: - if g.requestBody[0] is None: - assert not rec.raw_stream.read () + assert CIMultiDict (rec.http_headers.headers) == g.request.headers + if g.request.hasPostData: + if g.request.body is not None: + assert rec.raw_stream.read () == g.request.body + assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.request.body, Base64Body)), (headers['x-chrome-base64body'], g.request.body) else: - assert rec.raw_stream.read () == g.requestBody[0], g.requestBody - assert str (headers['x-chrome-base64body'] or False) == str (g.requestBody[1]), (headers['x-chrome-base64body'], g.requestBody) + # body fetch failed + assert headers['warc-truncated'] == 'unspecified' + assert not rec.raw_stream.read () else: - # body fetch failed - assert headers['warc-truncated'] == 'unspecified' + assert not rec.raw_stream.read () # response - rec = next (it) - headers = rec.rec_headers - httpheaders = rec.http_headers - assert headers['warc-type'] == 'response' - checkWarcinfoId (headers) - assert URL (headers['warc-target-uri']) == g.url - assert headers['x-chrome-request-id'] == g.id - - # these are checked separately - blacklistedHeaders = {'content-type', 'content-length'} - sortedHeaders = 
lambda l: sorted (filter (lambda x: x[0].lower() not in blacklistedHeaders, l), key=itemgetter (0)) - assert sortedHeaders (httpheaders.headers) == sortedHeaders (g.responseHeaders) - - expectedContentType = g.response.get ('mimeType') - if expectedContentType is not None: - assert httpheaders['content-type'].startswith (expectedContentType) - - if g.body: - if g.body[0] is None: - assert not rec.raw_stream.read () - #assert httpheaders['content-length'] == '0' + if g.response: + rec = next (it) + headers = rec.rec_headers + httpheaders = rec.http_headers + assert headers['warc-type'] == 'response' + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url + assert headers['x-chrome-request-id'] == g.id + + # these are checked separately + filteredHeaders = CIMultiDict (httpheaders.headers) + for b in {'content-type', 'content-length'}: + if b in g.response.headers: + g.response.headers.popall (b) + if b in filteredHeaders: + filteredHeaders.popall (b) + assert filteredHeaders == g.response.headers + + expectedContentType = g.response.mimeType + if expectedContentType is not None: + assert httpheaders['content-type'].startswith (expectedContentType) + + if g.response.body is not None: + assert rec.raw_stream.read () == g.response.body + assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.response.body, Base64Body)) + assert httpheaders['content-length'] == str (len (g.response.body)) + # body is never truncated if it exists + assert headers['warc-truncated'] is None + + # unencoded strings are converted to utf8 + if isinstance (g.response.body, UnicodeBody) and httpheaders['content-type'] is not None: + assert httpheaders['content-type'].endswith ('; charset=utf-8') else: - assert rec.raw_stream.read () == g.body[0] - assert str (headers['x-chrome-base64body'] or False) == str (g.body[1]) - assert httpheaders['content-length'] == str (len (g.body[0])) - - # body is never truncated if it exists - assert 
headers['warc-truncated'] is None - - # unencoded strings are converted to utf8 - if not g.body[1] and httpheaders['content-type'] is not None: - assert httpheaders['content-type'].endswith ('; charset=utf-8') - else: - # body fetch failed - assert headers['warc-truncated'] == 'unspecified' - # content-length header should be kept intact + # body fetch failed + assert headers['warc-truncated'] == 'unspecified' + assert not rec.raw_stream.read () + # content-length header should be kept intact else: assert False, f"invalid golden type {type(g)}" # pragma: no cover diff --git a/crocoite/warc.py b/crocoite/warc.py index dbd9ebc..cb1f2f7 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -25,6 +25,7 @@ Classes writing data to WARC files import json, threading from io import BytesIO from datetime import datetime +from http.server import BaseHTTPRequestHandler from warcio.timeutils import datetime_to_iso_date from warcio.warcwriter import WARCWriter @@ -33,7 +34,7 @@ from warcio.statusandheaders import StatusAndHeaders from .util import packageUrl, StrJsonEncoder from .controller import EventHandler, ControllerStart from .behavior import Script, DomSnapshotEvent, ScreenshotEvent -from .browser import Item +from .browser import RequestResponsePair, UnicodeBody, Base64Body class WarcHandler (EventHandler): __slots__ = ('logger', 'writer', 'documentRecords', 'log', @@ -86,66 +87,51 @@ class WarcHandler (EventHandler): url = item.url path = url.relative().with_fragment(None) - httpHeaders = StatusAndHeaders(f'{req["method"]} {path} HTTP/1.1', - item.requestHeaders, protocol='HTTP/1.1', is_http_request=True) - initiator = item.initiator + httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1', + req.headers, protocol='HTTP/1.1', is_http_request=True) warcHeaders = { - 'X-Chrome-Initiator': json.dumps (initiator), + 'X-Chrome-Initiator': json.dumps (req.initiator), 'X-Chrome-Request-ID': item.id, - 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp 
(item.chromeRequest['wallTime'])), + 'WARC-Date': datetime_to_iso_date (req.timestamp), } - if item.requestBody is not None: - payload, payloadBase64Encoded = item.requestBody - else: + body = item.request.body + if item.request.hasPostData and body is None: # oops, don’t know what went wrong here - logger.error ('requestBody missing', uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') + logger.error ('requestBody missing', + uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') warcHeaders['WARC-Truncated'] = 'unspecified' - payload = None - - if payload is not None: - payload = BytesIO (payload) - warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded) + else: + warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body) + body = BytesIO (body) record = self.writeRecord (url, 'request', - payload=payload, http_headers=httpHeaders, + payload=body, http_headers=httpHeaders, warc_headers_dict=warcHeaders) return record.rec_headers['WARC-Record-ID'] def _writeResponse (self, item, concurrentTo): # fetch the body reqId = item.id - rawBody = None - base64Encoded = False - bodyTruncated = None - if item.isRedirect or item.body is None: - # redirects reuse the same request, thus we cannot safely retrieve - # the body (i.e getResponseBody may return the new location’s - # body). No body available means we failed to retrieve it. 
- bodyTruncated = 'unspecified' - else: - rawBody, base64Encoded = item.body # now the response resp = item.response warcHeaders = { 'WARC-Concurrent-To': concurrentTo, - 'WARC-IP-Address': resp.get ('remoteIPAddress', ''), - 'X-Chrome-Protocol': resp.get ('protocol', ''), - 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), - 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')), 'X-Chrome-Request-ID': item.id, - 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp ( - item.chromeRequest['wallTime']+ - (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))), + 'WARC-Date': datetime_to_iso_date (resp.timestamp), } - if bodyTruncated: - warcHeaders['WARC-Truncated'] = bodyTruncated - else: - warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded) - - httpHeaders = StatusAndHeaders(f'{resp["status"]} {item.statusText}', - item.responseHeaders, - protocol='HTTP/1.1') + # conditional WARC headers + if item.remoteIpAddress: + warcHeaders['WARC-IP-Address'] = item.remoteIpAddress + if item.protocol: + warcHeaders['X-Chrome-Protocol'] = item.protocol + + # HTTP headers + statusText = resp.statusText or \ + BaseHTTPRequestHandler.responses.get ( + resp.status, ('No status text available', ))[0] + httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}', + resp.headers, protocol='HTTP/1.1') # Content is saved decompressed and decoded, remove these headers blacklistedHeaders = {'transfer-encoding', 'content-encoding'} @@ -155,20 +141,23 @@ class WarcHandler (EventHandler): # chrome sends nothing but utf8 encoded text. Fortunately HTTP # headers take precedence over the document’s <meta>, thus we can # easily override those. 
- contentType = resp.get ('mimeType') + contentType = resp.mimeType if contentType: - if not base64Encoded: + if isinstance (resp.body, UnicodeBody): contentType += '; charset=utf-8' httpHeaders.replace_header ('Content-Type', contentType) - if rawBody is not None: - httpHeaders.replace_header ('Content-Length', str (len (rawBody))) - bodyIo = BytesIO (rawBody) + # response body + body = resp.body + if body is None: + warcHeaders['WARC-Truncated'] = 'unspecified' else: - bodyIo = BytesIO () + httpHeaders.replace_header ('Content-Length', str (len (body))) + warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body) + body = BytesIO (body) record = self.writeRecord (item.url, 'response', - warc_headers_dict=warcHeaders, payload=bodyIo, + warc_headers_dict=warcHeaders, payload=body, http_headers=httpHeaders) if item.resourceType == 'Document': @@ -184,12 +173,11 @@ class WarcHandler (EventHandler): f'application/javascript; charset={encoding}'}) def _writeItem (self, item): - if item.failed: - # should have been handled by the logger already - return - + assert item.request concurrentTo = self._writeRequest (item) - self._writeResponse (item, concurrentTo) + # items that failed loading don’t have a response + if item.response: + self._writeResponse (item, concurrentTo) def _addRefersTo (self, headers, url): refersTo = self.documentRecords.get (url) @@ -247,7 +235,7 @@ class WarcHandler (EventHandler): self._flushLogEntries () route = {Script: _writeScript, - Item: _writeItem, + RequestResponsePair: _writeItem, DomSnapshotEvent: _writeDomSnapshot, ScreenshotEvent: _writeScreenshot, ControllerStart: _writeControllerStart, @@ -18,6 +18,7 @@ setup( 'aiohttp', 'PyYAML', 'yarl', + 'multidict', ], entry_points={ 'console_scripts': [ |