diff options
-rw-r--r-- | README.rst | 2 | ||||
-rw-r--r-- | crocoite/browser.py | 326 | ||||
-rw-r--r-- | crocoite/controller.py | 12 | ||||
-rw-r--r-- | crocoite/logger.py | 2 | ||||
-rw-r--r-- | crocoite/test_browser.py | 531 | ||||
-rw-r--r-- | crocoite/test_warc.py | 145 | ||||
-rw-r--r-- | crocoite/warc.py | 96 | ||||
-rw-r--r-- | setup.py | 1 |
8 files changed, 630 insertions, 485 deletions
@@ -24,6 +24,7 @@ These dependencies must be present to run crocoite: - warcio_ - html5lib_ - yarl_ +- multidict_ - bottom_ (IRC client) - `Google Chrome`_ @@ -35,6 +36,7 @@ These dependencies must be present to run crocoite: .. _bottom: https://github.com/numberoverzero/bottom .. _Google Chrome: https://www.google.com/chrome/ .. _yarl: https://yarl.readthedocs.io/ +.. _multidict: https://multidict.readthedocs.io/ The following commands clone the repository from GitHub_, set up a virtual environment and install crocoite: diff --git a/crocoite/browser.py b/crocoite/browser.py index 3de61f0..50561ed 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -23,80 +23,197 @@ Chrome browser interactions. """ import asyncio -from base64 import b64decode +from base64 import b64decode, b64encode +from datetime import datetime, timedelta from http.server import BaseHTTPRequestHandler + from yarl import URL +from multidict import CIMultiDict from .logger import Level from .devtools import Browser, TabException -class Item: - """ - Simple wrapper containing Chrome request and response - """ +# These two classes’ only purpose is so we can later tell whether a body was +# base64-encoded or a unicode string +class Base64Body (bytes): + def __new__ (cls, value): + return bytes.__new__ (cls, b64decode (value)) + + @classmethod + def fromBytes (cls, b): + """ For testing """ + return cls (b64encode (b)) + +class UnicodeBody (bytes): + def __new__ (cls, value): + if type (value) is not str: + raise TypeError ('expecting unicode string') + + return bytes.__new__ (cls, value.encode ('utf-8')) - __slots__ = ('chromeRequest', 'chromeResponse', 'chromeFinished', - 'isRedirect', 'failed', 'body', 'requestBody') +class Request: + __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp') - def __init__ (self): - self.chromeRequest = {} - self.chromeResponse = {} - self.chromeFinished = {} - self.isRedirect = False - self.failed = False - self.body = None - self.requestBody = None + def __init__ (self, method=None, headers=None, body=None): + self.headers = headers + self.body = body + self.hasPostData = False + self.initiator = None + # HTTP method + self.method = method + self.timestamp = None def __repr__ (self): - return f'<Item {self.url}>' - - @property - def request (self): - return self.chromeRequest.get ('request', {}) - - @property - def response (self): - return self.chromeResponse.get ('response', {}) - - @property - def initiator (self): - return self.chromeRequest['initiator'] - - @property - def id (self): - return self.chromeRequest['requestId'] - - @property - def encodedDataLength (self): - return self.chromeFinished['encodedDataLength'] - - @property - def url (self): - return URL (self.response.get ('url', self.request.get ('url'))) - - @property - def requestHeaders (self): - # the response object may contain refined headers, which were - # *actually* sent over the wire - return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers'])) - - @property - def responseHeaders (self): - return self._unfoldHeaders (self.response['headers']) - - @property - def statusText (self): - text = self.response.get ('statusText') - if text: - return text - text = BaseHTTPRequestHandler.responses.get (self.response['status']) - if text: - return text[0] - return 'No status text available' - - @property - def resourceType (self): - return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None)) + return f'Request({self.method!r}, {self.headers!r}, {self.body!r})' + + def __eq__ (self, b): + if b is None: + return False + + if not isinstance (b, Request): + raise TypeError ('Can only compare equality with Request.') + + # do not compare hasPostData (only required to fetch body) and + # timestamp (depends on time) + return self.headers == b.headers and \ + self.body == b.body and \ + self.initiator == b.initiator and \ + self.method == b.method + +class Response: + __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived', + 'timestamp', 'mimeType') + + def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None): + self.status = status + self.statusText = statusText + self.headers = headers + self.body = body + # bytes received over the network (not body size!) + self.bytesReceived = 0 + self.timestamp = None + self.mimeType = mimeType + + def __repr__ (self): + return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})' + + def __eq__ (self, b): + if b is None: + return False + + if not isinstance (b, Response): + raise TypeError ('Can only compare equality with Response.') + + # do not compare bytesReceived (depends on network), timestamp + # (depends on time) and statusText (does not matter) + return self.status == b.status and \ + self.statusText == b.statusText and \ + self.headers == b.headers and \ + self.body == b.body and \ + self.mimeType == b.mimeType + +class ReferenceTimestamp: + """ Map relative timestamp to absolute timestamp """ + + def __init__ (self, relative, absolute): + self.relative = timedelta (seconds=relative) + self.absolute = datetime.utcfromtimestamp (absolute) + + def __call__ (self, relative): + if not isinstance (relative, timedelta): + relative = timedelta (seconds=relative) + return self.absolute + (relative-self.relative) + +class RequestResponsePair: + __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress', + 'protocol', 'resourceType', '_time') + + def __init__ (self, id=None, url=None, request=None, response=None): + self.request = request + self.response = response + self.id = id + self.url = url + self.remoteIpAddress = None + self.protocol = None + self.resourceType = None + self._time = None + + def __repr__ (self): + return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})' + + def __eq__ (self, b): + if not isinstance (b, RequestResponsePair): + raise TypeError (f'Can only compare with {self.__class__.__name__}') + + # do not compare id and _time. These depend on external factors and do + # not influence the request/response *content* + return self.request == b.request and \ + self.response == b.response and \ + self.url == b.url and \ + self.remoteIpAddress == b.remoteIpAddress and \ + self.protocol == b.protocol and \ + self.resourceType == b.resourceType + + def fromRequestWillBeSent (self, req): + """ Set request data from Chrome Network.requestWillBeSent event """ + r = req['request'] + + self.id = req['requestId'] + self.url = URL (r['url']) + self.resourceType = req.get ('type') + self._time = ReferenceTimestamp (req['timestamp'], req['wallTime']) + + assert self.request is None, req + self.request = Request () + self.request.initiator = req['initiator'] + self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) + self.request.hasPostData = r.get ('hasPostData', False) + self.request.method = r['method'] + self.request.timestamp = self._time (req['timestamp']) + if self.request.hasPostData: + postData = r.get ('postData') + if postData is not None: + self.request.body = UnicodeBody (postData) + + def fromResponse (self, r, timestamp=None, resourceType=None): + """ + Set response data from Chrome’s Response object. + + Request must exist. Updates if response was set before. Sometimes + fromResponseReceived is triggered twice by Chrome. No idea why. + """ + assert self.request is not None, (self.request, r) + + if not timestamp: + timestamp = self.request.timestamp + + self.remoteIpAddress = r.get ('remoteIPAddress') + self.protocol = r.get ('protocol') + if resourceType: + self.resourceType = resourceType + + # a response may contain updated request headers (i.e. those actually + # sent over the wire) + if 'requestHeaders' in r: + self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders'])) + + self.response = Response () + self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) + self.response.status = r['status'] + self.response.statusText = r['statusText'] + self.response.timestamp = timestamp + self.response.mimeType = r['mimeType'] + + def fromResponseReceived (self, resp): + """ Set response data from Chrome Network.responseReceived """ + return self.fromResponse (resp['response'], + self._time (resp['timestamp']), resp['type']) + + def fromLoadingFinished (self, data): + self.response.bytesReceived = data['encodedDataLength'] + + def fromLoadingFailed (self, data): + self.response = None @staticmethod def _unfoldHeaders (headers): @@ -110,44 +227,26 @@ class Item: items.append ((k, v)) return items - def setRequest (self, req): - self.chromeRequest = req - - def setResponse (self, resp): - self.chromeResponse = resp - - def setFinished (self, finished): - self.chromeFinished = finished - async def prefetchRequestBody (self, tab): - # request body - req = self.request - postData = req.get ('postData') - if postData: - self.requestBody = postData.encode ('utf8'), False - elif req.get ('hasPostData', False): + if self.request.hasPostData and self.request.body is None: try: postData = await tab.Network.getRequestPostData (requestId=self.id) - postData = postData['postData'] - self.requestBody = b64decode (postData), True + self.request.body = UnicodeBody (postData['postData']) except TabException: - self.requestBody = None + self.request.body = None else: - self.requestBody = None, False + self.request.body = None async def prefetchResponseBody (self, tab): - # get response body + """ Fetch response body """ try: body = await tab.Network.getResponseBody (requestId=self.id) - rawBody = body['body'] - base64Encoded = body['base64Encoded'] - if base64Encoded: - rawBody = b64decode (rawBody) + if body['base64Encoded']: + self.response.body = Base64Body (body['body']) else: - rawBody = rawBody.encode ('utf8') - self.body = rawBody, base64Encoded + self.response.body = UnicodeBody (body['body']) except TabException: - self.body = None + self.response.body = None class VarChangeEvent: """ Notify when variable is changed """ @@ -179,14 +278,14 @@ class SiteLoader: XXX: track popup windows/new tabs and close them """ - __slots__ = ('requests', 'browser', 'url', 'logger', 'tab', '_iterRunning', 'idle', '_framesLoading') + __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning', + 'idle', '_framesLoading') allowedSchemes = {'http', 'https'} - def __init__ (self, browser, url, logger): + def __init__ (self, browser, logger): self.requests = {} self.browser = Browser (url=browser) - self.url = url - self.logger = logger.bind (context=type (self).__name__, url=url) + self.logger = logger.bind (context=type (self).__name__) self._iterRunning = [] self.idle = VarChangeEvent (True) @@ -250,7 +349,7 @@ class SiteLoader: result = t.result () if result is None: pass - elif isinstance (result, Item): + elif isinstance (result, RequestResponsePair): yield result else: method, data = result @@ -263,8 +362,8 @@ class SiteLoader: running = pending self._iterRunning = running - async def start (self): - await self.tab.Page.navigate(url=self.url) + async def navigate (self, url): + await self.tab.Page.navigate(url=url) # internal chrome callbacks async def _requestWillBeSent (self, **kwargs): @@ -282,21 +381,24 @@ class SiteLoader: # redirects never “finish” loading, but yield another requestWillBeSent with this key set redirectResp = kwargs.get ('redirectResponse') if redirectResp: - # create fake responses - resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']} - item.setResponse (resp) - resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']} - item.setFinished (resp) - item.isRedirect = True + if item.url != url: + # this happens for unknown reasons. the docs simply state + # it can differ in case of a redirect. Fix it and move on. + logger.warning ('redirect url differs', + uuid='558a7df7-2258-4fe4-b16d-22b6019cc163', + expected=item.url) + redirectResp['url'] = str (item.url) + item.fromResponse (redirectResp) logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url) + # XXX: queue this? no need to wait for it await item.prefetchRequestBody (self.tab) - # cannot fetch request body due to race condition (item id reused) + # cannot fetch response body due to race condition (item id reused) ret = item else: logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b') - item = Item () - item.setRequest (kwargs) + item = RequestResponsePair () + item.fromRequestWillBeSent (kwargs) self.requests[reqId] = item logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f') @@ -315,7 +417,7 @@ class SiteLoader: logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url) if url.scheme in self.allowedSchemes: logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0') - item.setResponse (kwargs) + item.fromResponseReceived (kwargs) else: logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666') @@ -333,19 +435,21 @@ class SiteLoader: logger = self.logger.bind (reqId=reqId, reqUrl=item.url) if item.url.scheme in self.allowedSchemes: logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02') - item.setFinished (kwargs) + item.fromLoadingFinished (kwargs) + # XXX queue both await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab)) return item async def _loadingFailed (self, **kwargs): reqId = kwargs['requestId'] - self.logger.warning ('loading failed', + logger = self.logger.bind (reqId=reqId) + logger.warning ('loading failed', uuid='68410f13-6eea-453e-924e-c1af4601748b', errorText=kwargs['errorText'], blockedReason=kwargs.get ('blockedReason')) item = self.requests.pop (reqId, None) if item is not None: - item.failed = True + item.fromLoadingFailed (kwargs) return item async def _entryAdded (self, **kwargs): diff --git a/crocoite/controller.py b/crocoite/controller.py index 504fa23..a64a8dc 100644 --- a/crocoite/controller.py +++ b/crocoite/controller.py @@ -30,7 +30,7 @@ from operator import attrgetter from yarl import URL from . import behavior as cbehavior -from .browser import SiteLoader, Item +from .browser import SiteLoader, RequestResponsePair from .util import getFormattedViewportMetrics, getSoftwareInfo from .behavior import ExtractLinksEvent @@ -61,13 +61,13 @@ class StatsHandler (EventHandler): self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} def push (self, item): - if isinstance (item, Item): + if isinstance (item, RequestResponsePair): self.stats['requests'] += 1 - if item.failed: + if not item.response: self.stats['failed'] += 1 else: self.stats['finished'] += 1 - self.stats['bytesRcv'] += item.encodedDataLength + self.stats['bytesRcv'] += item.response.bytesReceived class LogHandler (EventHandler): """ Handle items by logging information about them """ @@ -126,7 +126,7 @@ class SinglePageController: async for item in l: self.processItem (item) - async with self.service as browser, SiteLoader (browser, self.url, logger=logger) as l: + async with self.service as browser, SiteLoader (browser, logger=logger) as l: handle = asyncio.ensure_future (processQueue ()) start = time.time () @@ -153,7 +153,7 @@ class SinglePageController: } self.processItem (ControllerStart (payload)) - await l.start () + await l.navigate (self.url) for b in enabledBehavior: async for item in b.onload (): self.processItem (item) diff --git a/crocoite/logger.py b/crocoite/logger.py index d882eaf..82d7f5b 100644 --- a/crocoite/logger.py +++ b/crocoite/logger.py @@ -105,7 +105,7 @@ class PrintConsumer (Consumer): return kwargs class JsonPrintConsumer (Consumer): - def __init__ (self, minLevel=Level.INFO): + def __init__ (self, minLevel=Level.DEBUG): self.minLevel = minLevel def __call__ (self, **kwargs): diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py index 8008855..4bf2c64 100644 --- a/crocoite/test_browser.py +++ b/crocoite/test_browser.py @@ -19,103 +19,28 @@ # THE SOFTWARE. import asyncio, socket -import pytest from operator import itemgetter -from aiohttp import web from http.server import BaseHTTPRequestHandler +from datetime import datetime + +from yarl import URL +from aiohttp import web +from multidict import CIMultiDict -from .browser import Item, SiteLoader, VarChangeEvent +from hypothesis import given +import hypothesis.strategies as st +import pytest + +from .browser import RequestResponsePair, SiteLoader, VarChangeEvent, Request, \ + UnicodeBody, ReferenceTimestamp, Base64Body, UnicodeBody, Request, \ + Response from .logger import Logger, Consumer from .devtools import Crashed, Process # if you want to know what’s going on: +#import logging #logging.basicConfig(level=logging.DEBUG) -class TItem (Item): - """ This should be as close to Item as possible """ - - __slots__ = ('bodySend', '_body', '_requestBody') - base = 'http://localhost:8000/' - - def __init__ (self, path, status, headers, bodyReceive, bodySend=None, requestBody=None, failed=False, isRedirect=False): - super ().__init__ () - self.chromeResponse = {'response': {'headers': headers, 'status': status, 'url': self.base + path}} - self.body = bodyReceive, False - self.bodySend = bodyReceive if not bodySend else bodySend - self.requestBody = requestBody, False - self.failed = failed - self.isRedirect = isRedirect - -testItems = [ - TItem ('binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00\x01\x02', failed=True), - TItem ('attachment', 200, - {'Content-Type': 'text/plain; charset=utf-8', - 'Content-Disposition': 'attachment; filename="attachment.txt"', - }, - 'This is a simple text file with umlauts. ÄÖU.'.encode ('utf8'), failed=True), - TItem ('encoding/utf8', 200, {'Content-Type': 'text/plain; charset=utf-8'}, - 'This is a test, äöü μνψκ ¥¥¥¿ýý¡'.encode ('utf8')), - TItem ('encoding/iso88591', 200, {'Content-Type': 'text/plain; charset=ISO-8859-1'}, - 'This is a test, äöü.'.encode ('utf8'), - 'This is a test, äöü.'.encode ('ISO-8859-1')), - TItem ('encoding/latin1', 200, {'Content-Type': 'text/plain; charset=latin1'}, - 'This is a test, äöü.'.encode ('utf8'), - 'This is a test, äöü.'.encode ('latin1')), - TItem ('image', 200, {'Content-Type': 'image/png'}, - # 1×1 png image - b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'), - TItem ('empty', 200, {'Content-Type': 'text/plain'}, b''), - TItem ('headers/duplicate', 200, [('Content-Type', 'text/plain'), ('Duplicate', '1'), ('Duplicate', '2')], b''), - TItem ('headers/fetch/req', 200, {'Content-Type': 'text/plain'}, b''), - TItem ('headers/fetch/html', 200, {'Content-Type': 'text/html'}, - r"""<html><body><script> - let h = new Headers([["custom", "1"]]); - fetch("/headers/fetch/req", {"method": "GET", "headers": h}).then(x => console.log("done")); - </script></body></html>""".encode ('utf8')), - TItem ('redirect/301/empty', 301, {'Location': '/empty'}, b'', isRedirect=True), - TItem ('redirect/301/redirect/301/empty', 301, {'Location': '/redirect/301/empty'}, b'', isRedirect=True), - TItem ('nonexistent', 404, {}, b''), - TItem ('html', 200, {'Content-Type': 'text/html'}, - '<html><body><img src="/image"><img src="/nonexistent"></body></html>'.encode ('utf8')), - TItem ('html/alert', 200, {'Content-Type': 'text/html'}, - '<html><body><script>window.addEventListener("beforeunload", function (e) { e.returnValue = "bye?"; return e.returnValue; }); alert("stopping here"); if (confirm("are you sure?") || prompt ("42?")) { window.location = "/nonexistent"; }</script><script>document.write(\'<img src="/image">\');</script></body></html>'.encode ('utf8')), - TItem ('html/fetchPost', 200, {'Content-Type': 'text/html'}, - r"""<html><body><script> - let a = fetch("/html/fetchPost/binary", {"method": "POST", "body": "\x00"}); - let b = fetch("/html/fetchPost/form", {"method": "POST", "body": new URLSearchParams({"data": "!"})}); - let c = fetch("/html/fetchPost/binary/large", {"method": "POST", "body": "\x00".repeat(100*1024)}); - let d = fetch("/html/fetchPost/form/large", {"method": "POST", "body": new URLSearchParams({"data": "!".repeat(100*1024)})}); - </script></body></html>""".encode ('utf8')), - TItem ('html/fetchPost/binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'\x00'), - TItem ('html/fetchPost/form', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=%21'), - # XXX: these should trigger the need for getRequestPostData, but they don’t. oh well. - TItem ('html/fetchPost/binary/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=(100*1024)*b'\x00'), - TItem ('html/fetchPost/form/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=' + (100*1024)*b'%21'), - ] -testItemMap = dict ([(item.url.path, item) for item in testItems]) - -def itemToResponse (item): - async def f (req): - headers = item.response['headers'] - return web.Response(body=item.bodySend, status=item.response['status'], - headers=headers) - return f - -@pytest.fixture -async def server (): - """ Simple HTTP server for testing notifications """ - import logging - logging.basicConfig(level=logging.DEBUG) - app = web.Application() - for item in testItems: - app.router.add_route ('*', item.url.path, itemToResponse (item)) - runner = web.AppRunner(app) - await runner.setup() - site = web.TCPSite(runner, 'localhost', 8080) - await site.start() - yield app - await runner.cleanup () - class AssertConsumer (Consumer): def __call__ (self, **kwargs): assert 'uuid' in kwargs @@ -128,134 +53,14 @@ def logger (): return Logger (consumer=[AssertConsumer ()]) @pytest.fixture -async def loader (server, logger): - def f (path): - if path.startswith ('/'): - path = 'http://localhost:8080{}'.format (path) - return SiteLoader (browser, path, logger) - async with Process () as browser: - yield f - -async def itemsLoaded (l, items): - items = dict ([(i.url.path, i) for i in items]) - async for item in l: - assert item.chromeResponse is not None - golden = items.pop (item.url.path) - if not golden: - assert False, f'url {item.url} not supposed to be fetched' - assert item.failed == golden.failed - if item.failed: - # response will be invalid if request failed - if not items: - break - else: - continue - assert item.isRedirect == golden.isRedirect - if golden.isRedirect: - assert item.body is None - else: - assert item.body[0] == golden.body[0] - assert item.requestBody[0] == golden.requestBody[0] - assert item.response['status'] == golden.response['status'] - assert item.statusText == BaseHTTPRequestHandler.responses.get (item.response['status'])[0] - for k, v in golden.responseHeaders: - actual = list (map (itemgetter (1), filter (lambda x: x[0] == k, item.responseHeaders))) - assert v in actual - - # we’re done when everything has been loaded - if not items: - break - -async def literalItem (lf, item, deps=[]): - async with lf (item.url.path) as l: - await l.start () - await asyncio.wait_for (itemsLoaded (l, [item] + deps), timeout=30) - -@pytest.mark.asyncio -async def test_empty (loader): - await literalItem (loader, testItemMap['/empty']) - -@pytest.mark.asyncio -async def test_headers_duplicate (loader): - """ - Some headers, like Set-Cookie can be present multiple times. Chrome - separates these with a newline. - """ - async with loader ('/headers/duplicate') as l: - await l.start () - async for it in l: - if it.url.path == '/headers/duplicate': - assert not it.failed - dup = list (filter (lambda x: x[0] == 'Duplicate', it.responseHeaders)) - assert len(dup) == 2 - assert list(sorted(map(itemgetter(1), dup))) == ['1', '2'] - break - -@pytest.mark.asyncio -async def test_headers_req (loader): - """ - Custom request headers. JavaScript’s Headers() does not support duplicate - headers, so we can’t generate those. - """ - async with loader ('/headers/fetch/html') as l: - await l.start () - async for it in l: - if it.url.path == '/headers/fetch/req': - assert not it.failed - dup = list (filter (lambda x: x[0] == 'custom', it.requestHeaders)) - assert len(dup) == 1 - assert list(sorted(map(itemgetter(1), dup))) == ['1'] - break - -@pytest.mark.asyncio -async def test_redirect (loader): - await literalItem (loader, testItemMap['/redirect/301/empty'], [testItemMap['/empty']]) - # chained redirects - await literalItem (loader, testItemMap['/redirect/301/redirect/301/empty'], [testItemMap['/redirect/301/empty'], testItemMap['/empty']]) - -@pytest.mark.asyncio -async def test_encoding (loader): - """ Text responses are transformed to UTF-8. Make sure this works - correctly. """ - for item in {testItemMap['/encoding/utf8'], testItemMap['/encoding/latin1'], testItemMap['/encoding/iso88591']}: - await literalItem (loader, item) - -@pytest.mark.asyncio -async def test_binary (loader): - """ Browser should ignore content it cannot display (i.e. octet-stream) """ - await literalItem (loader, testItemMap['/binary']) - -@pytest.mark.asyncio -async def test_image (loader): - """ Images should be displayed inline """ - await literalItem (loader, testItemMap['/image']) - -@pytest.mark.asyncio -async def test_attachment (loader): - """ And downloads won’t work in headless mode, even if it’s just a text file """ - await literalItem (loader, testItemMap['/attachment']) - -@pytest.mark.asyncio -async def test_html (loader): - await literalItem (loader, testItemMap['/html'], [testItemMap['/image'], testItemMap['/nonexistent']]) - # make sure alerts are dismissed correctly (image won’t load otherwise) - await literalItem (loader, testItemMap['/html/alert'], [testItemMap['/image']]) - -@pytest.mark.asyncio -async def test_post (loader): - """ XHR POST request with binary data""" - await literalItem (loader, testItemMap['/html/fetchPost'], - [testItemMap['/html/fetchPost/binary'], - testItemMap['/html/fetchPost/binary/large'], - testItemMap['/html/fetchPost/form'], - testItemMap['/html/fetchPost/form/large']]) +async def loader (logger): + async with Process () as browser, SiteLoader (browser, logger) as l: + yield l @pytest.mark.asyncio async def test_crash (loader): - async with loader ('/html') as l: - await l.start () - with pytest.raises (Crashed): - await l.tab.Page.crash () + with pytest.raises (Crashed): + await loader.tab.Page.crash () @pytest.mark.asyncio async def test_invalidurl (loader): @@ -267,15 +72,16 @@ async def test_invalidurl (loader): try: resolved = await loop.getaddrinfo (host, None) except socket.gaierror: - async with loader (f'http://{host}/') as l: - await l.start () - async for it in l: - assert it.failed - break + url = URL.build (scheme='http', host=host) + await loader.navigate (url) + async for it in loader: + assert it.request is not None + assert it.url == url + assert it.response is None + break else: pytest.skip (f'host {host} resolved to {resolved}') - @pytest.mark.asyncio async def test_varchangeevent (): e = VarChangeEvent (True) @@ -299,3 +105,290 @@ async def test_varchangeevent (): assert ret == False assert e.get () == ret +timestamp = st.one_of ( + st.integers(min_value=0, max_value=2**32-1), + st.floats (min_value=0, max_value=2**32-1), + ) + +@given(timestamp, timestamp, timestamp) +def test_referencetimestamp (relativeA, absoluteA, relativeB): + ts = ReferenceTimestamp (relativeA, absoluteA) + absoluteA = datetime.utcfromtimestamp (absoluteA) + absoluteB = ts (relativeB) + assert (absoluteA < absoluteB and relativeA < relativeB) or \ + (absoluteA >= absoluteB and relativeA >= relativeB) + assert abs ((absoluteB - absoluteA).total_seconds () - (relativeB - relativeA)) < 10e-6 + +def hostname (): + # XXX: find a better way to generate hostnames + return st.text (alphabet=st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789-'), min_size=1, max_size=253) + +def urls (): + """ Build http/https URL """ + scheme = st.sampled_from (['http', 'https']) + # Path must start with a slash + pathSt = st.builds (lambda x: '/' + x, st.text ()) + args = st.fixed_dictionaries ({ + 'scheme': scheme, + 'host': hostname (), + 'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)), + 'path': pathSt, + 'query_string': st.text (), + 'fragment': st.text (), + }) + return st.builds (lambda x: URL.build (**x), args) + +def urlsStr (): + return st.builds (lambda x: str (x), urls ()) + +asciiText = st.text (st.characters (min_codepoint=32, max_codepoint=126)) + +def chromeHeaders (): + # token as defined by https://tools.ietf.org/html/rfc7230#section-3.2.6 + token = st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789!#$%&\'*+-.^_`|~') + # XXX: the value should be asciiText without leading/trailing spaces + return st.dictionaries (token, token) + +def fixedDicts (fixed, dynamic): + return st.builds (lambda x, y: x.update (y), st.fixed_dictionaries (fixed), st.lists (dynamic)) + +def chromeRequestWillBeSent (reqid, url): + methodSt = st.sampled_from (['GET', 'POST', 'PUT', 'DELETE']) + return st.fixed_dictionaries ({ + 'requestId': reqid, + 'initiator': st.just ('Test'), + 'wallTime': timestamp, + 'timestamp': timestamp, + 'request': st.fixed_dictionaries ({ + 'url': url, + 'method': methodSt, + 'headers': chromeHeaders (), + # XXX: postData, hasPostData + }) + }) + +def chromeResponseReceived (reqid, url): + mimeTypeSt = st.one_of (st.none (), st.just ('text/html')) + remoteIpAddressSt = st.one_of (st.none (), st.just ('127.0.0.1')) + protocolSt = st.one_of (st.none (), st.just ('h2')) + statusCodeSt = st.integers (min_value=100, max_value=999) + typeSt = st.sampled_from (['Document', 'Stylesheet', 'Image', 'Media', + 'Font', 'Script', 'TextTrack', 'XHR', 'Fetch', 'EventSource', + 'WebSocket', 'Manifest', 'SignedExchange', 'Ping', + 'CSPViolationReport', 'Other']) + return st.fixed_dictionaries ({ + 'requestId': reqid, + 'timestamp': timestamp, + 'type': typeSt, + 'response': st.fixed_dictionaries ({ + 'url': url, + 'requestHeaders': chromeHeaders (), # XXX: make this optional + 'headers': chromeHeaders (), + 'status': statusCodeSt, + 'statusText': asciiText, + 'mimeType': mimeTypeSt, + 'remoteIPAddress': remoteIpAddressSt, + 'protocol': protocolSt, + }) + }) + +def chromeReqResp (): + # XXX: will this gnerated the same url for all testcases? + reqid = st.shared (st.text (), 'reqresp') + url = st.shared (urlsStr (), 'reqresp') + return st.tuples (chromeRequestWillBeSent (reqid, url), + chromeResponseReceived (reqid, url)) + +def requestResponsePair (): + def f (creq, cresp, hasPostData, reqBody, respBody): + i = RequestResponsePair () + i.fromRequestWillBeSent (creq) + i.request.hasPostData = hasPostData + if hasPostData: + i.request.body = reqBody + + if cresp is not None: + i.fromResponseReceived (cresp) + if respBody is not None: + i.response.body = respBody + return i + + bodySt = st.one_of ( + st.none (), + st.builds (UnicodeBody, st.text ()), + st.builds (Base64Body.fromBytes, st.binary ()) + ) + return st.builds (lambda reqresp, hasPostData, reqBody, respBody: + f (reqresp[0], reqresp[1], hasPostData, reqBody, respBody), + chromeReqResp (), st.booleans (), bodySt, bodySt) + +@given(chromeReqResp ()) +def test_requestResponsePair (creqresp): + creq, cresp = creqresp + + item = RequestResponsePair () + + assert item.id is None + assert item.url is None + assert item.request is None + assert item.response is None + + item.fromRequestWillBeSent (creq) + + assert item.id == creq['requestId'] + url = URL (creq['request']['url']) + assert item.url == url + assert item.request is not None + assert item.request.timestamp == datetime.utcfromtimestamp (creq['wallTime']) + assert set (item.request.headers.keys ()) == set (creq['request']['headers'].keys ()) + assert item.response is None + + item.fromResponseReceived (cresp) + + # url will not be overwritten + assert item.id == creq['requestId'] == cresp['requestId'] + assert item.url == url + assert item.request is not None + assert set (item.request.headers.keys ()) == set (cresp['response']['requestHeaders'].keys ()) + assert item.response is not None + assert set (item.response.headers.keys ()) == set (cresp['response']['headers'].keys ()) + assert (item.response.timestamp - item.request.timestamp).total_seconds () - \ + (cresp['timestamp'] - creq['timestamp']) < 10e-6 + +@given(chromeReqResp ()) +def test_requestResponsePair_eq (creqresp): + creq, cresp = creqresp + + item = RequestResponsePair () + item2 = RequestResponsePair () + assert item == item + assert item == item2 + + item.fromRequestWillBeSent (creq) + assert item != item2 + item2.fromRequestWillBeSent (creq) + assert item == item + assert item == item2 + + item.fromResponseReceived (cresp) + assert item != item2 + item2.fromResponseReceived (cresp) + assert item == item + assert item == item2 + + # XXX: test for inequality with different parameters + +### Google Chrome integration tests ### + +serverUrl = URL.build (scheme='http', host='localhost', port=8080) +items = [ + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/utf-8'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/latin1'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=latin1')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/utf-16'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-16')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/encoding/ISO-8859-1'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=ISO-8859-1')]), + body=UnicodeBody ('äöü'), mimeType='text/html') + ), + RequestResponsePair ( + url=serverUrl.with_path ('/status/200'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/plain')]), + body=b'', + mimeType='text/plain'), + ), + # redirects never have a response body + RequestResponsePair ( + url=serverUrl.with_path ('/status/301'), + request=Request (method='GET'), + response=Response (status=301, + headers=CIMultiDict ([('Content-Type', 'text/plain'), + ('Location', str (serverUrl.with_path ('/status/301/redirected')))]), + body=None, + mimeType='text/plain'), + ), + RequestResponsePair ( + url=serverUrl.with_path ('/image/png'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'image/png')]), + body=Base64Body.fromBytes (b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'), + mimeType='image/png'), + ), + RequestResponsePair ( + url=serverUrl.with_path ('/script/alert'), + request=Request (method='GET'), + response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]), + body=UnicodeBody ('''<html><body><script> +window.addEventListener("beforeunload", function (e) { + e.returnValue = "bye?"; + return e.returnValue; +}); +alert("stopping here"); +if (confirm("are you sure?") || prompt ("42?")) { + window.location = "/nonexistent"; +} +</script></body></html>'''), mimeType='text/html') + ), + ] + +@pytest.mark.asyncio +# would be nice if we could use hypothesis here somehow +@pytest.mark.parametrize("golden", items) +async def test_integration_item (loader, golden): + async def f (req): + body = golden.response.body + contentType = golden.response.headers.get ('content-type', '') if golden.response.headers is not None else '' + charsetOff = contentType.find ('charset=') + if isinstance (body, UnicodeBody) and charsetOff != -1: + encoding = contentType[charsetOff+len ('charset='):] + body = golden.response.body.decode ('utf-8').encode (encoding) + return web.Response (body=body, status=golden.response.status, + headers=golden.response.headers) + + app = web.Application () + app.router.add_route (golden.request.method, golden.url.path, f) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, serverUrl.host, serverUrl.port) + await site.start() + + try: + await loader.navigate (golden.url) + + it = loader.__aiter__ () + item = await it.__anext__ () + + # we do not know this in advance + item.request.initiator = None + item.request.headers = None + item.remoteIpAddress = None + item.protocol = None + item.resourceType = None + + if item.response: + assert item.response.statusText is not None + item.response.statusText = None + + del item.response.headers['server'] + del item.response.headers['content-length'] + del item.response.headers['date'] + assert item == golden + finally: + await runner.cleanup () + diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py index 7f2635b..954e8c8 100644 --- a/crocoite/test_warc.py +++ b/crocoite/test_warc.py @@ -24,6 +24,7 @@ from operator import itemgetter from warcio.archiveiterator import ArchiveIterator from yarl import URL +from multidict import CIMultiDict from hypothesis import given, reproduce_failure import hypothesis.strategies as st import pytest @@ -32,7 +33,8 @@ from .warc import WarcHandler from .logger import Logger, WarcHandlerConsumer from .controller import ControllerStart from .behavior import Script, ScreenshotEvent, DomSnapshotEvent -from .browser import Item +from .browser import RequestResponsePair, Base64Body, UnicodeBody +from .test_browser import requestResponsePair, urls def test_log (): logger = Logger () @@ -66,50 +68,6 @@ def test_log (): data = json.loads (l.strip ()) assert data == golden.pop (0) -def hostname (): - # XXX: find a better way to generate hostnames - return st.text (alphabet=st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789-'), min_size=1, max_size=253) - -def urls (): - """ Build http/https URL """ - scheme = st.one_of (st.just ('http'), st.just ('https')) - # Path must start with a slash - pathSt = st.builds (lambda x: '/' + x, st.text ()) - args = st.fixed_dictionaries ({ - 'scheme': scheme, - 'host': hostname (), - 'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)), - 'path': pathSt, - 'query_string': st.text (), - 'fragment': st.text (), - }) - return st.builds (lambda x: URL.build (**x), args) - -def item (): - def f (url, requestBody, body, mimeType): - i = Item () - # XXX: we really need some level of abstraction. Testing is a nightmare. - i.setRequest ({'requestId': 'myid', 'initiator': 'Test', 'wallTime': 0, 'timestamp': 1, 'request': {'url': str (url), 'method': 'GET', 'headers': {'None': 'None'}}}) - i.setResponse ({'requestId': 'myid', 'timestamp': 2, 'type': 'Document', 'response': {'url': str (url), 'requestHeaders': {'foo': 'bar', 'Set-Cookie': 'line1\nline2'}, 'headers': {'Response': 'Headers', 'Content-Length': '12345'}, 'status': 200}}) - if mimeType is not None: - i.chromeResponse['response']['mimeType'] = 'text/html' - i.requestBody = requestBody - i.body = body - return i - - def failedItem (url): - i = Item () - i.setRequest ({'requestId': 'myid', 'initiator': 'Test', 'wallTime': 0, 'timestamp': 1, 'request': {'url': str (url), 'method': 'GET', 'headers': {'None': 'None'}}}) - i.failed = True - return i - - bodySt = st.one_of (st.none (), st.tuples (st.one_of (st.none (), st.binary ()), st.booleans ())) - mimeTypeSt = st.one_of (st.none (), st.just ('text/html')) - return st.one_of ( - st.builds (failedItem, urls ()), - st.builds (f, urls (), bodySt, bodySt, mimeTypeSt), - ) - def jsonObject (): """ JSON-encodable objects """ return st.dictionaries (st.text (), st.one_of (st.integers (), st.text ())) @@ -123,7 +81,7 @@ def event (): st.builds (Script.fromStr, st.text (), st.one_of(st.none (), st.text ())), st.builds (ScreenshotEvent, urls (), st.integers (), st.binary ()), st.builds (DomSnapshotEvent, urls (), st.builds (lambda x: x.encode ('utf-8'), st.text ()), viewport()), - item (), + requestResponsePair (), ) @given (st.lists (event ())) @@ -136,7 +94,7 @@ def test_push (golden): # null logger logger = Logger () - with NamedTemporaryFile() as fd: + with open('/tmp/test.warc.gz', 'w+b') as fd: with WarcHandler (fd, logger) as handler: for g in golden: handler.push (g) @@ -191,10 +149,7 @@ def test_push (golden): assert headers['X-DOM-Snapshot'] == 'True' assert rec.raw_stream.read () == g.document - elif isinstance (g, Item): - if g.failed: - continue - + elif isinstance (g, RequestResponsePair): rec = next (it) # request @@ -204,54 +159,56 @@ def test_push (golden): assert URL (headers['warc-target-uri']) == g.url assert headers['x-chrome-request-id'] == g.id - assert sorted (rec.http_headers.headers, key=itemgetter (0)) == sorted (g.requestHeaders, key=itemgetter (0)) - if g.requestBody: - if g.requestBody[0] is None: - assert not rec.raw_stream.read () + assert CIMultiDict (rec.http_headers.headers) == g.request.headers + if g.request.hasPostData: + if g.request.body is not None: + assert rec.raw_stream.read () == g.request.body + assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.request.body, Base64Body)), (headers['x-chrome-base64body'], g.request.body) else: - assert rec.raw_stream.read () == g.requestBody[0], g.requestBody - assert str (headers['x-chrome-base64body'] or False) == str (g.requestBody[1]), (headers['x-chrome-base64body'], g.requestBody) + # body fetch failed + assert headers['warc-truncated'] == 'unspecified' + assert not rec.raw_stream.read () else: - # body fetch failed - assert headers['warc-truncated'] == 'unspecified' + assert not rec.raw_stream.read () # response - rec = next (it) - headers = rec.rec_headers - httpheaders = rec.http_headers - assert headers['warc-type'] == 'response' - checkWarcinfoId (headers) - assert URL (headers['warc-target-uri']) == g.url - assert headers['x-chrome-request-id'] == g.id - - # these are checked separately - blacklistedHeaders = {'content-type', 'content-length'} - sortedHeaders = lambda l: sorted (filter (lambda x: x[0].lower() not in blacklistedHeaders, l), key=itemgetter (0)) - assert sortedHeaders (httpheaders.headers) == sortedHeaders (g.responseHeaders) - - expectedContentType = g.response.get ('mimeType') - if expectedContentType is not None: - assert httpheaders['content-type'].startswith (expectedContentType) - - if g.body: - if g.body[0] is None: - assert not rec.raw_stream.read () - #assert httpheaders['content-length'] == '0' + if g.response: + rec = next (it) + headers = rec.rec_headers + httpheaders = rec.http_headers + assert headers['warc-type'] == 'response' + checkWarcinfoId (headers) + assert URL (headers['warc-target-uri']) == g.url + assert headers['x-chrome-request-id'] == g.id + + # these are checked separately + filteredHeaders = CIMultiDict (httpheaders.headers) + for b in {'content-type', 'content-length'}: + if b in g.response.headers: + g.response.headers.popall (b) + if b in filteredHeaders: + filteredHeaders.popall (b) + assert filteredHeaders == g.response.headers + + expectedContentType = g.response.mimeType + if expectedContentType is not None: + assert httpheaders['content-type'].startswith (expectedContentType) + + if g.response.body is not None: + assert rec.raw_stream.read () == g.response.body + assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.response.body, Base64Body)) + assert httpheaders['content-length'] == str (len (g.response.body)) + # body is never truncated if it exists + assert headers['warc-truncated'] is None + + # unencoded strings are converted to utf8 + if isinstance (g.response.body, UnicodeBody) and httpheaders['content-type'] is not None: + assert httpheaders['content-type'].endswith ('; charset=utf-8') else: - assert rec.raw_stream.read () == g.body[0] - assert str (headers['x-chrome-base64body'] or False) == str (g.body[1]) - assert httpheaders['content-length'] == str (len (g.body[0])) - - # body is never truncated if it exists - assert headers['warc-truncated'] is None - - # unencoded strings are converted to utf8 - if not g.body[1] and httpheaders['content-type'] is not None: - assert httpheaders['content-type'].endswith ('; charset=utf-8') - else: - # body fetch failed - assert headers['warc-truncated'] == 'unspecified' - # content-length header should be kept intact + # body fetch failed + assert headers['warc-truncated'] == 'unspecified' + assert not rec.raw_stream.read () + # content-length header should be kept intact else: assert False, f"invalid golden type {type(g)}" # pragma: no cover diff --git a/crocoite/warc.py b/crocoite/warc.py index dbd9ebc..cb1f2f7 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -25,6 +25,7 @@ Classes writing data to WARC files import json, threading from io import BytesIO from datetime import datetime +from http.server import BaseHTTPRequestHandler from warcio.timeutils import datetime_to_iso_date from warcio.warcwriter import WARCWriter @@ -33,7 +34,7 @@ from warcio.statusandheaders import StatusAndHeaders from .util import packageUrl, StrJsonEncoder from .controller import EventHandler, ControllerStart from .behavior import Script, DomSnapshotEvent, ScreenshotEvent -from .browser import Item +from .browser import RequestResponsePair, UnicodeBody, Base64Body class WarcHandler (EventHandler): __slots__ = ('logger', 'writer', 'documentRecords', 'log', @@ -86,66 +87,51 @@ class WarcHandler (EventHandler): url = item.url path = url.relative().with_fragment(None) - httpHeaders = StatusAndHeaders(f'{req["method"]} {path} HTTP/1.1', - item.requestHeaders, protocol='HTTP/1.1', is_http_request=True) - initiator = item.initiator + httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1', + req.headers, protocol='HTTP/1.1', is_http_request=True) warcHeaders = { - 'X-Chrome-Initiator': json.dumps (initiator), + 'X-Chrome-Initiator': json.dumps (req.initiator), 'X-Chrome-Request-ID': item.id, - 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])), + 'WARC-Date': datetime_to_iso_date (req.timestamp), } - if item.requestBody is not None: - payload, payloadBase64Encoded = item.requestBody - else: + body = item.request.body + if item.request.hasPostData and body is None: # oops, don’t know what went wrong here - logger.error ('requestBody missing', uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') + logger.error ('requestBody missing', + uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') warcHeaders['WARC-Truncated'] = 'unspecified' - payload = None - - if payload is not None: - payload = BytesIO (payload) - warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded) + else: + warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body) + body = BytesIO (body) record = self.writeRecord (url, 'request', - payload=payload, http_headers=httpHeaders, + payload=body, http_headers=httpHeaders, warc_headers_dict=warcHeaders) return record.rec_headers['WARC-Record-ID'] def _writeResponse (self, item, concurrentTo): # fetch the body reqId = item.id - rawBody = None - base64Encoded = False - bodyTruncated = None - if item.isRedirect or item.body is None: - # redirects reuse the same request, thus we cannot safely retrieve - # the body (i.e getResponseBody may return the new location’s - # body). No body available means we failed to retrieve it. - bodyTruncated = 'unspecified' - else: - rawBody, base64Encoded = item.body # now the response resp = item.response warcHeaders = { 'WARC-Concurrent-To': concurrentTo, - 'WARC-IP-Address': resp.get ('remoteIPAddress', ''), - 'X-Chrome-Protocol': resp.get ('protocol', ''), - 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), - 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')), 'X-Chrome-Request-ID': item.id, - 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp ( - item.chromeRequest['wallTime']+ - (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))), + 'WARC-Date': datetime_to_iso_date (resp.timestamp), } - if bodyTruncated: - warcHeaders['WARC-Truncated'] = bodyTruncated - else: - warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded) - - httpHeaders = StatusAndHeaders(f'{resp["status"]} {item.statusText}', - item.responseHeaders, - protocol='HTTP/1.1') + # conditional WARC headers + if item.remoteIpAddress: + warcHeaders['WARC-IP-Address'] = item.remoteIpAddress + if item.protocol: + warcHeaders['X-Chrome-Protocol'] = item.protocol + + # HTTP headers + statusText = resp.statusText or \ + BaseHTTPRequestHandler.responses.get ( + resp.status, ('No status text available', ))[0] + httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}', + resp.headers, protocol='HTTP/1.1') # Content is saved decompressed and decoded, remove these headers blacklistedHeaders = {'transfer-encoding', 'content-encoding'} @@ -155,20 +141,23 @@ class WarcHandler (EventHandler): # chrome sends nothing but utf8 encoded text. Fortunately HTTP # headers take precedence over the document’s <meta>, thus we can # easily override those. - contentType = resp.get ('mimeType') + contentType = resp.mimeType if contentType: - if not base64Encoded: + if isinstance (resp.body, UnicodeBody): contentType += '; charset=utf-8' httpHeaders.replace_header ('Content-Type', contentType) - if rawBody is not None: - httpHeaders.replace_header ('Content-Length', str (len (rawBody))) - bodyIo = BytesIO (rawBody) + # response body + body = resp.body + if body is None: + warcHeaders['WARC-Truncated'] = 'unspecified' else: - bodyIo = BytesIO () + httpHeaders.replace_header ('Content-Length', str (len (body))) + warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body) + body = BytesIO (body) record = self.writeRecord (item.url, 'response', - warc_headers_dict=warcHeaders, payload=bodyIo, + warc_headers_dict=warcHeaders, payload=body, http_headers=httpHeaders) if item.resourceType == 'Document': @@ -184,12 +173,11 @@ class WarcHandler (EventHandler): f'application/javascript; charset={encoding}'}) def _writeItem (self, item): - if item.failed: - # should have been handled by the logger already - return - + assert item.request concurrentTo = self._writeRequest (item) - self._writeResponse (item, concurrentTo) + # items that failed loading don’t have a response + if item.response: + self._writeResponse (item, concurrentTo) def _addRefersTo (self, headers, url): refersTo = self.documentRecords.get (url) @@ -247,7 +235,7 @@ class WarcHandler (EventHandler): self._flushLogEntries () route = {Script: _writeScript, - Item: _writeItem, + RequestResponsePair: _writeItem, DomSnapshotEvent: _writeDomSnapshot, ScreenshotEvent: _writeScreenshot, ControllerStart: _writeControllerStart, @@ -18,6 +18,7 @@ setup( 'aiohttp', 'PyYAML', 'yarl', + 'multidict', ], entry_points={ 'console_scripts': [ |