summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2019-01-03 19:34:17 +0100
committerLars-Dominik Braun <lars@6xq.net>2019-01-03 19:37:27 +0100
commit9d7974e3e7e8a4575ea61cb33a30fa291d12ae38 (patch)
tree5311396c0d74eaa35e1eff1e1641c0bd157cde25
parentad4e119bf1a55c84dc7c6260588ed7db9e7199c6 (diff)
downloadcrocoite-9d7974e3e7e8a4575ea61cb33a30fa291d12ae38.tar.gz
crocoite-9d7974e3e7e8a4575ea61cb33a30fa291d12ae38.tar.bz2
crocoite-9d7974e3e7e8a4575ea61cb33a30fa291d12ae38.zip
browser: Turn Item into RequestResponsePair
Previously Item was just a simple wrapper around Chrome’s Network.* events. This turned out to be quite nasty when testing, so its replacement, RequestResponsePair, does some level of abstraction. This makes testing alot easier, since we now can simply instantiate it without building a proper DevTools event. Should come without any functional changes.
-rw-r--r--README.rst2
-rw-r--r--crocoite/browser.py326
-rw-r--r--crocoite/controller.py12
-rw-r--r--crocoite/logger.py2
-rw-r--r--crocoite/test_browser.py531
-rw-r--r--crocoite/test_warc.py145
-rw-r--r--crocoite/warc.py96
-rw-r--r--setup.py1
8 files changed, 630 insertions, 485 deletions
diff --git a/README.rst b/README.rst
index 71d9947..45d2e0f 100644
--- a/README.rst
+++ b/README.rst
@@ -24,6 +24,7 @@ These dependencies must be present to run crocoite:
- warcio_
- html5lib_
- yarl_
+- multidict_
- bottom_ (IRC client)
- `Google Chrome`_
@@ -35,6 +36,7 @@ These dependencies must be present to run crocoite:
.. _bottom: https://github.com/numberoverzero/bottom
.. _Google Chrome: https://www.google.com/chrome/
.. _yarl: https://yarl.readthedocs.io/
+.. _multidict: https://multidict.readthedocs.io/
The following commands clone the repository from GitHub_, set up a virtual
environment and install crocoite:
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 3de61f0..50561ed 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -23,80 +23,197 @@ Chrome browser interactions.
"""
import asyncio
-from base64 import b64decode
+from base64 import b64decode, b64encode
+from datetime import datetime, timedelta
from http.server import BaseHTTPRequestHandler
+
from yarl import URL
+from multidict import CIMultiDict
from .logger import Level
from .devtools import Browser, TabException
-class Item:
- """
- Simple wrapper containing Chrome request and response
- """
+# These two classes’ only purpose is so we can later tell whether a body was
+# base64-encoded or a unicode string
+class Base64Body (bytes):
+ def __new__ (cls, value):
+ return bytes.__new__ (cls, b64decode (value))
+
+ @classmethod
+ def fromBytes (cls, b):
+ """ For testing """
+ return cls (b64encode (b))
+
+class UnicodeBody (bytes):
+ def __new__ (cls, value):
+ if type (value) is not str:
+ raise TypeError ('expecting unicode string')
+
+ return bytes.__new__ (cls, value.encode ('utf-8'))
- __slots__ = ('chromeRequest', 'chromeResponse', 'chromeFinished',
- 'isRedirect', 'failed', 'body', 'requestBody')
+class Request:
+ __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp')
- def __init__ (self):
- self.chromeRequest = {}
- self.chromeResponse = {}
- self.chromeFinished = {}
- self.isRedirect = False
- self.failed = False
- self.body = None
- self.requestBody = None
+ def __init__ (self, method=None, headers=None, body=None):
+ self.headers = headers
+ self.body = body
+ self.hasPostData = False
+ self.initiator = None
+ # HTTP method
+ self.method = method
+ self.timestamp = None
def __repr__ (self):
- return f'<Item {self.url}>'
-
- @property
- def request (self):
- return self.chromeRequest.get ('request', {})
-
- @property
- def response (self):
- return self.chromeResponse.get ('response', {})
-
- @property
- def initiator (self):
- return self.chromeRequest['initiator']
-
- @property
- def id (self):
- return self.chromeRequest['requestId']
-
- @property
- def encodedDataLength (self):
- return self.chromeFinished['encodedDataLength']
-
- @property
- def url (self):
- return URL (self.response.get ('url', self.request.get ('url')))
-
- @property
- def requestHeaders (self):
- # the response object may contain refined headers, which were
- # *actually* sent over the wire
- return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers']))
-
- @property
- def responseHeaders (self):
- return self._unfoldHeaders (self.response['headers'])
-
- @property
- def statusText (self):
- text = self.response.get ('statusText')
- if text:
- return text
- text = BaseHTTPRequestHandler.responses.get (self.response['status'])
- if text:
- return text[0]
- return 'No status text available'
-
- @property
- def resourceType (self):
- return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None))
+ return f'Request({self.method!r}, {self.headers!r}, {self.body!r})'
+
+ def __eq__ (self, b):
+ if b is None:
+ return False
+
+ if not isinstance (b, Request):
+ raise TypeError ('Can only compare equality with Request.')
+
+ # do not compare hasPostData (only required to fetch body) and
+ # timestamp (depends on time)
+ return self.headers == b.headers and \
+ self.body == b.body and \
+ self.initiator == b.initiator and \
+ self.method == b.method
+
+class Response:
+ __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived',
+ 'timestamp', 'mimeType')
+
+ def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None):
+ self.status = status
+ self.statusText = statusText
+ self.headers = headers
+ self.body = body
+ # bytes received over the network (not body size!)
+ self.bytesReceived = 0
+ self.timestamp = None
+ self.mimeType = mimeType
+
+ def __repr__ (self):
+ return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})'
+
+ def __eq__ (self, b):
+ if b is None:
+ return False
+
+ if not isinstance (b, Response):
+ raise TypeError ('Can only compare equality with Response.')
+
+ # do not compare bytesReceived (depends on network), timestamp
+ # (depends on time) and statusText (does not matter)
+ return self.status == b.status and \
+ self.statusText == b.statusText and \
+ self.headers == b.headers and \
+ self.body == b.body and \
+ self.mimeType == b.mimeType
+
+class ReferenceTimestamp:
+ """ Map relative timestamp to absolute timestamp """
+
+ def __init__ (self, relative, absolute):
+ self.relative = timedelta (seconds=relative)
+ self.absolute = datetime.utcfromtimestamp (absolute)
+
+ def __call__ (self, relative):
+ if not isinstance (relative, timedelta):
+ relative = timedelta (seconds=relative)
+ return self.absolute + (relative-self.relative)
+
+class RequestResponsePair:
+ __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress',
+ 'protocol', 'resourceType', '_time')
+
+ def __init__ (self, id=None, url=None, request=None, response=None):
+ self.request = request
+ self.response = response
+ self.id = id
+ self.url = url
+ self.remoteIpAddress = None
+ self.protocol = None
+ self.resourceType = None
+ self._time = None
+
+ def __repr__ (self):
+ return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})'
+
+ def __eq__ (self, b):
+ if not isinstance (b, RequestResponsePair):
+ raise TypeError (f'Can only compare with {self.__class__.__name__}')
+
+ # do not compare id and _time. These depend on external factors and do
+ # not influence the request/response *content*
+ return self.request == b.request and \
+ self.response == b.response and \
+ self.url == b.url and \
+ self.remoteIpAddress == b.remoteIpAddress and \
+ self.protocol == b.protocol and \
+ self.resourceType == b.resourceType
+
+ def fromRequestWillBeSent (self, req):
+ """ Set request data from Chrome Network.requestWillBeSent event """
+ r = req['request']
+
+ self.id = req['requestId']
+ self.url = URL (r['url'])
+ self.resourceType = req.get ('type')
+ self._time = ReferenceTimestamp (req['timestamp'], req['wallTime'])
+
+ assert self.request is None, req
+ self.request = Request ()
+ self.request.initiator = req['initiator']
+ self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
+ self.request.hasPostData = r.get ('hasPostData', False)
+ self.request.method = r['method']
+ self.request.timestamp = self._time (req['timestamp'])
+ if self.request.hasPostData:
+ postData = r.get ('postData')
+ if postData is not None:
+ self.request.body = UnicodeBody (postData)
+
+ def fromResponse (self, r, timestamp=None, resourceType=None):
+ """
+ Set response data from Chrome’s Response object.
+
+ Request must exist. Updates if response was set before. Sometimes
+ fromResponseReceived is triggered twice by Chrome. No idea why.
+ """
+ assert self.request is not None, (self.request, r)
+
+ if not timestamp:
+ timestamp = self.request.timestamp
+
+ self.remoteIpAddress = r.get ('remoteIPAddress')
+ self.protocol = r.get ('protocol')
+ if resourceType:
+ self.resourceType = resourceType
+
+ # a response may contain updated request headers (i.e. those actually
+ # sent over the wire)
+ if 'requestHeaders' in r:
+ self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders']))
+
+ self.response = Response ()
+ self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
+ self.response.status = r['status']
+ self.response.statusText = r['statusText']
+ self.response.timestamp = timestamp
+ self.response.mimeType = r['mimeType']
+
+ def fromResponseReceived (self, resp):
+ """ Set response data from Chrome Network.responseReceived """
+ return self.fromResponse (resp['response'],
+ self._time (resp['timestamp']), resp['type'])
+
+ def fromLoadingFinished (self, data):
+ self.response.bytesReceived = data['encodedDataLength']
+
+ def fromLoadingFailed (self, data):
+ self.response = None
@staticmethod
def _unfoldHeaders (headers):
@@ -110,44 +227,26 @@ class Item:
items.append ((k, v))
return items
- def setRequest (self, req):
- self.chromeRequest = req
-
- def setResponse (self, resp):
- self.chromeResponse = resp
-
- def setFinished (self, finished):
- self.chromeFinished = finished
-
async def prefetchRequestBody (self, tab):
- # request body
- req = self.request
- postData = req.get ('postData')
- if postData:
- self.requestBody = postData.encode ('utf8'), False
- elif req.get ('hasPostData', False):
+ if self.request.hasPostData and self.request.body is None:
try:
postData = await tab.Network.getRequestPostData (requestId=self.id)
- postData = postData['postData']
- self.requestBody = b64decode (postData), True
+ self.request.body = UnicodeBody (postData['postData'])
except TabException:
- self.requestBody = None
+ self.request.body = None
else:
- self.requestBody = None, False
+ self.request.body = None
async def prefetchResponseBody (self, tab):
- # get response body
+ """ Fetch response body """
try:
body = await tab.Network.getResponseBody (requestId=self.id)
- rawBody = body['body']
- base64Encoded = body['base64Encoded']
- if base64Encoded:
- rawBody = b64decode (rawBody)
+ if body['base64Encoded']:
+ self.response.body = Base64Body (body['body'])
else:
- rawBody = rawBody.encode ('utf8')
- self.body = rawBody, base64Encoded
+ self.response.body = UnicodeBody (body['body'])
except TabException:
- self.body = None
+ self.response.body = None
class VarChangeEvent:
""" Notify when variable is changed """
@@ -179,14 +278,14 @@ class SiteLoader:
XXX: track popup windows/new tabs and close them
"""
- __slots__ = ('requests', 'browser', 'url', 'logger', 'tab', '_iterRunning', 'idle', '_framesLoading')
+ __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning',
+ 'idle', '_framesLoading')
allowedSchemes = {'http', 'https'}
- def __init__ (self, browser, url, logger):
+ def __init__ (self, browser, logger):
self.requests = {}
self.browser = Browser (url=browser)
- self.url = url
- self.logger = logger.bind (context=type (self).__name__, url=url)
+ self.logger = logger.bind (context=type (self).__name__)
self._iterRunning = []
self.idle = VarChangeEvent (True)
@@ -250,7 +349,7 @@ class SiteLoader:
result = t.result ()
if result is None:
pass
- elif isinstance (result, Item):
+ elif isinstance (result, RequestResponsePair):
yield result
else:
method, data = result
@@ -263,8 +362,8 @@ class SiteLoader:
running = pending
self._iterRunning = running
- async def start (self):
- await self.tab.Page.navigate(url=self.url)
+ async def navigate (self, url):
+ await self.tab.Page.navigate(url=url)
# internal chrome callbacks
async def _requestWillBeSent (self, **kwargs):
@@ -282,21 +381,24 @@ class SiteLoader:
# redirects never “finish” loading, but yield another requestWillBeSent with this key set
redirectResp = kwargs.get ('redirectResponse')
if redirectResp:
- # create fake responses
- resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']}
- item.setResponse (resp)
- resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']}
- item.setFinished (resp)
- item.isRedirect = True
+ if item.url != url:
+ # this happens for unknown reasons. the docs simply state
+ # it can differ in case of a redirect. Fix it and move on.
+ logger.warning ('redirect url differs',
+ uuid='558a7df7-2258-4fe4-b16d-22b6019cc163',
+ expected=item.url)
+ redirectResp['url'] = str (item.url)
+ item.fromResponse (redirectResp)
logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url)
+ # XXX: queue this? no need to wait for it
await item.prefetchRequestBody (self.tab)
- # cannot fetch request body due to race condition (item id reused)
+ # cannot fetch response body due to race condition (item id reused)
ret = item
else:
logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b')
- item = Item ()
- item.setRequest (kwargs)
+ item = RequestResponsePair ()
+ item.fromRequestWillBeSent (kwargs)
self.requests[reqId] = item
logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f')
@@ -315,7 +417,7 @@ class SiteLoader:
logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url)
if url.scheme in self.allowedSchemes:
logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0')
- item.setResponse (kwargs)
+ item.fromResponseReceived (kwargs)
else:
logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666')
@@ -333,19 +435,21 @@ class SiteLoader:
logger = self.logger.bind (reqId=reqId, reqUrl=item.url)
if item.url.scheme in self.allowedSchemes:
logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02')
- item.setFinished (kwargs)
+ item.fromLoadingFinished (kwargs)
+ # XXX queue both
await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab))
return item
async def _loadingFailed (self, **kwargs):
reqId = kwargs['requestId']
- self.logger.warning ('loading failed',
+ logger = self.logger.bind (reqId=reqId)
+ logger.warning ('loading failed',
uuid='68410f13-6eea-453e-924e-c1af4601748b',
errorText=kwargs['errorText'],
blockedReason=kwargs.get ('blockedReason'))
item = self.requests.pop (reqId, None)
if item is not None:
- item.failed = True
+ item.fromLoadingFailed (kwargs)
return item
async def _entryAdded (self, **kwargs):
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 504fa23..a64a8dc 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -30,7 +30,7 @@ from operator import attrgetter
from yarl import URL
from . import behavior as cbehavior
-from .browser import SiteLoader, Item
+from .browser import SiteLoader, RequestResponsePair
from .util import getFormattedViewportMetrics, getSoftwareInfo
from .behavior import ExtractLinksEvent
@@ -61,13 +61,13 @@ class StatsHandler (EventHandler):
self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0}
def push (self, item):
- if isinstance (item, Item):
+ if isinstance (item, RequestResponsePair):
self.stats['requests'] += 1
- if item.failed:
+ if not item.response:
self.stats['failed'] += 1
else:
self.stats['finished'] += 1
- self.stats['bytesRcv'] += item.encodedDataLength
+ self.stats['bytesRcv'] += item.response.bytesReceived
class LogHandler (EventHandler):
""" Handle items by logging information about them """
@@ -126,7 +126,7 @@ class SinglePageController:
async for item in l:
self.processItem (item)
- async with self.service as browser, SiteLoader (browser, self.url, logger=logger) as l:
+ async with self.service as browser, SiteLoader (browser, logger=logger) as l:
handle = asyncio.ensure_future (processQueue ())
start = time.time ()
@@ -153,7 +153,7 @@ class SinglePageController:
}
self.processItem (ControllerStart (payload))
- await l.start ()
+ await l.navigate (self.url)
for b in enabledBehavior:
async for item in b.onload ():
self.processItem (item)
diff --git a/crocoite/logger.py b/crocoite/logger.py
index d882eaf..82d7f5b 100644
--- a/crocoite/logger.py
+++ b/crocoite/logger.py
@@ -105,7 +105,7 @@ class PrintConsumer (Consumer):
return kwargs
class JsonPrintConsumer (Consumer):
- def __init__ (self, minLevel=Level.INFO):
+ def __init__ (self, minLevel=Level.DEBUG):
self.minLevel = minLevel
def __call__ (self, **kwargs):
diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py
index 8008855..4bf2c64 100644
--- a/crocoite/test_browser.py
+++ b/crocoite/test_browser.py
@@ -19,103 +19,28 @@
# THE SOFTWARE.
import asyncio, socket
-import pytest
from operator import itemgetter
-from aiohttp import web
from http.server import BaseHTTPRequestHandler
+from datetime import datetime
+
+from yarl import URL
+from aiohttp import web
+from multidict import CIMultiDict
-from .browser import Item, SiteLoader, VarChangeEvent
+from hypothesis import given
+import hypothesis.strategies as st
+import pytest
+
+from .browser import RequestResponsePair, SiteLoader, VarChangeEvent, Request, \
+ UnicodeBody, ReferenceTimestamp, Base64Body, UnicodeBody, Request, \
+ Response
from .logger import Logger, Consumer
from .devtools import Crashed, Process
# if you want to know what’s going on:
+#import logging
#logging.basicConfig(level=logging.DEBUG)
-class TItem (Item):
- """ This should be as close to Item as possible """
-
- __slots__ = ('bodySend', '_body', '_requestBody')
- base = 'http://localhost:8000/'
-
- def __init__ (self, path, status, headers, bodyReceive, bodySend=None, requestBody=None, failed=False, isRedirect=False):
- super ().__init__ ()
- self.chromeResponse = {'response': {'headers': headers, 'status': status, 'url': self.base + path}}
- self.body = bodyReceive, False
- self.bodySend = bodyReceive if not bodySend else bodySend
- self.requestBody = requestBody, False
- self.failed = failed
- self.isRedirect = isRedirect
-
-testItems = [
- TItem ('binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00\x01\x02', failed=True),
- TItem ('attachment', 200,
- {'Content-Type': 'text/plain; charset=utf-8',
- 'Content-Disposition': 'attachment; filename="attachment.txt"',
- },
- 'This is a simple text file with umlauts. ÄÖU.'.encode ('utf8'), failed=True),
- TItem ('encoding/utf8', 200, {'Content-Type': 'text/plain; charset=utf-8'},
- 'This is a test, äöü μνψκ ¥¥¥¿ýý¡'.encode ('utf8')),
- TItem ('encoding/iso88591', 200, {'Content-Type': 'text/plain; charset=ISO-8859-1'},
- 'This is a test, äöü.'.encode ('utf8'),
- 'This is a test, äöü.'.encode ('ISO-8859-1')),
- TItem ('encoding/latin1', 200, {'Content-Type': 'text/plain; charset=latin1'},
- 'This is a test, äöü.'.encode ('utf8'),
- 'This is a test, äöü.'.encode ('latin1')),
- TItem ('image', 200, {'Content-Type': 'image/png'},
- # 1×1 png image
- b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'),
- TItem ('empty', 200, {'Content-Type': 'text/plain'}, b''),
- TItem ('headers/duplicate', 200, [('Content-Type', 'text/plain'), ('Duplicate', '1'), ('Duplicate', '2')], b''),
- TItem ('headers/fetch/req', 200, {'Content-Type': 'text/plain'}, b''),
- TItem ('headers/fetch/html', 200, {'Content-Type': 'text/html'},
- r"""<html><body><script>
- let h = new Headers([["custom", "1"]]);
- fetch("/headers/fetch/req", {"method": "GET", "headers": h}).then(x => console.log("done"));
- </script></body></html>""".encode ('utf8')),
- TItem ('redirect/301/empty', 301, {'Location': '/empty'}, b'', isRedirect=True),
- TItem ('redirect/301/redirect/301/empty', 301, {'Location': '/redirect/301/empty'}, b'', isRedirect=True),
- TItem ('nonexistent', 404, {}, b''),
- TItem ('html', 200, {'Content-Type': 'text/html'},
- '<html><body><img src="/image"><img src="/nonexistent"></body></html>'.encode ('utf8')),
- TItem ('html/alert', 200, {'Content-Type': 'text/html'},
- '<html><body><script>window.addEventListener("beforeunload", function (e) { e.returnValue = "bye?"; return e.returnValue; }); alert("stopping here"); if (confirm("are you sure?") || prompt ("42?")) { window.location = "/nonexistent"; }</script><script>document.write(\'<img src="/image">\');</script></body></html>'.encode ('utf8')),
- TItem ('html/fetchPost', 200, {'Content-Type': 'text/html'},
- r"""<html><body><script>
- let a = fetch("/html/fetchPost/binary", {"method": "POST", "body": "\x00"});
- let b = fetch("/html/fetchPost/form", {"method": "POST", "body": new URLSearchParams({"data": "!"})});
- let c = fetch("/html/fetchPost/binary/large", {"method": "POST", "body": "\x00".repeat(100*1024)});
- let d = fetch("/html/fetchPost/form/large", {"method": "POST", "body": new URLSearchParams({"data": "!".repeat(100*1024)})});
- </script></body></html>""".encode ('utf8')),
- TItem ('html/fetchPost/binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'\x00'),
- TItem ('html/fetchPost/form', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=%21'),
- # XXX: these should trigger the need for getRequestPostData, but they don’t. oh well.
- TItem ('html/fetchPost/binary/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=(100*1024)*b'\x00'),
- TItem ('html/fetchPost/form/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=' + (100*1024)*b'%21'),
- ]
-testItemMap = dict ([(item.url.path, item) for item in testItems])
-
-def itemToResponse (item):
- async def f (req):
- headers = item.response['headers']
- return web.Response(body=item.bodySend, status=item.response['status'],
- headers=headers)
- return f
-
-@pytest.fixture
-async def server ():
- """ Simple HTTP server for testing notifications """
- import logging
- logging.basicConfig(level=logging.DEBUG)
- app = web.Application()
- for item in testItems:
- app.router.add_route ('*', item.url.path, itemToResponse (item))
- runner = web.AppRunner(app)
- await runner.setup()
- site = web.TCPSite(runner, 'localhost', 8080)
- await site.start()
- yield app
- await runner.cleanup ()
-
class AssertConsumer (Consumer):
def __call__ (self, **kwargs):
assert 'uuid' in kwargs
@@ -128,134 +53,14 @@ def logger ():
return Logger (consumer=[AssertConsumer ()])
@pytest.fixture
-async def loader (server, logger):
- def f (path):
- if path.startswith ('/'):
- path = 'http://localhost:8080{}'.format (path)
- return SiteLoader (browser, path, logger)
- async with Process () as browser:
- yield f
-
-async def itemsLoaded (l, items):
- items = dict ([(i.url.path, i) for i in items])
- async for item in l:
- assert item.chromeResponse is not None
- golden = items.pop (item.url.path)
- if not golden:
- assert False, f'url {item.url} not supposed to be fetched'
- assert item.failed == golden.failed
- if item.failed:
- # response will be invalid if request failed
- if not items:
- break
- else:
- continue
- assert item.isRedirect == golden.isRedirect
- if golden.isRedirect:
- assert item.body is None
- else:
- assert item.body[0] == golden.body[0]
- assert item.requestBody[0] == golden.requestBody[0]
- assert item.response['status'] == golden.response['status']
- assert item.statusText == BaseHTTPRequestHandler.responses.get (item.response['status'])[0]
- for k, v in golden.responseHeaders:
- actual = list (map (itemgetter (1), filter (lambda x: x[0] == k, item.responseHeaders)))
- assert v in actual
-
- # we’re done when everything has been loaded
- if not items:
- break
-
-async def literalItem (lf, item, deps=[]):
- async with lf (item.url.path) as l:
- await l.start ()
- await asyncio.wait_for (itemsLoaded (l, [item] + deps), timeout=30)
-
-@pytest.mark.asyncio
-async def test_empty (loader):
- await literalItem (loader, testItemMap['/empty'])
-
-@pytest.mark.asyncio
-async def test_headers_duplicate (loader):
- """
- Some headers, like Set-Cookie can be present multiple times. Chrome
- separates these with a newline.
- """
- async with loader ('/headers/duplicate') as l:
- await l.start ()
- async for it in l:
- if it.url.path == '/headers/duplicate':
- assert not it.failed
- dup = list (filter (lambda x: x[0] == 'Duplicate', it.responseHeaders))
- assert len(dup) == 2
- assert list(sorted(map(itemgetter(1), dup))) == ['1', '2']
- break
-
-@pytest.mark.asyncio
-async def test_headers_req (loader):
- """
- Custom request headers. JavaScript’s Headers() does not support duplicate
- headers, so we can’t generate those.
- """
- async with loader ('/headers/fetch/html') as l:
- await l.start ()
- async for it in l:
- if it.url.path == '/headers/fetch/req':
- assert not it.failed
- dup = list (filter (lambda x: x[0] == 'custom', it.requestHeaders))
- assert len(dup) == 1
- assert list(sorted(map(itemgetter(1), dup))) == ['1']
- break
-
-@pytest.mark.asyncio
-async def test_redirect (loader):
- await literalItem (loader, testItemMap['/redirect/301/empty'], [testItemMap['/empty']])
- # chained redirects
- await literalItem (loader, testItemMap['/redirect/301/redirect/301/empty'], [testItemMap['/redirect/301/empty'], testItemMap['/empty']])
-
-@pytest.mark.asyncio
-async def test_encoding (loader):
- """ Text responses are transformed to UTF-8. Make sure this works
- correctly. """
- for item in {testItemMap['/encoding/utf8'], testItemMap['/encoding/latin1'], testItemMap['/encoding/iso88591']}:
- await literalItem (loader, item)
-
-@pytest.mark.asyncio
-async def test_binary (loader):
- """ Browser should ignore content it cannot display (i.e. octet-stream) """
- await literalItem (loader, testItemMap['/binary'])
-
-@pytest.mark.asyncio
-async def test_image (loader):
- """ Images should be displayed inline """
- await literalItem (loader, testItemMap['/image'])
-
-@pytest.mark.asyncio
-async def test_attachment (loader):
- """ And downloads won’t work in headless mode, even if it’s just a text file """
- await literalItem (loader, testItemMap['/attachment'])
-
-@pytest.mark.asyncio
-async def test_html (loader):
- await literalItem (loader, testItemMap['/html'], [testItemMap['/image'], testItemMap['/nonexistent']])
- # make sure alerts are dismissed correctly (image won’t load otherwise)
- await literalItem (loader, testItemMap['/html/alert'], [testItemMap['/image']])
-
-@pytest.mark.asyncio
-async def test_post (loader):
- """ XHR POST request with binary data"""
- await literalItem (loader, testItemMap['/html/fetchPost'],
- [testItemMap['/html/fetchPost/binary'],
- testItemMap['/html/fetchPost/binary/large'],
- testItemMap['/html/fetchPost/form'],
- testItemMap['/html/fetchPost/form/large']])
+async def loader (logger):
+ async with Process () as browser, SiteLoader (browser, logger) as l:
+ yield l
@pytest.mark.asyncio
async def test_crash (loader):
- async with loader ('/html') as l:
- await l.start ()
- with pytest.raises (Crashed):
- await l.tab.Page.crash ()
+ with pytest.raises (Crashed):
+ await loader.tab.Page.crash ()
@pytest.mark.asyncio
async def test_invalidurl (loader):
@@ -267,15 +72,16 @@ async def test_invalidurl (loader):
try:
resolved = await loop.getaddrinfo (host, None)
except socket.gaierror:
- async with loader (f'http://{host}/') as l:
- await l.start ()
- async for it in l:
- assert it.failed
- break
+ url = URL.build (scheme='http', host=host)
+ await loader.navigate (url)
+ async for it in loader:
+ assert it.request is not None
+ assert it.url == url
+ assert it.response is None
+ break
else:
pytest.skip (f'host {host} resolved to {resolved}')
-
@pytest.mark.asyncio
async def test_varchangeevent ():
e = VarChangeEvent (True)
@@ -299,3 +105,290 @@ async def test_varchangeevent ():
assert ret == False
assert e.get () == ret
+timestamp = st.one_of (
+ st.integers(min_value=0, max_value=2**32-1),
+ st.floats (min_value=0, max_value=2**32-1),
+ )
+
+@given(timestamp, timestamp, timestamp)
+def test_referencetimestamp (relativeA, absoluteA, relativeB):
+ ts = ReferenceTimestamp (relativeA, absoluteA)
+ absoluteA = datetime.utcfromtimestamp (absoluteA)
+ absoluteB = ts (relativeB)
+ assert (absoluteA < absoluteB and relativeA < relativeB) or \
+ (absoluteA >= absoluteB and relativeA >= relativeB)
+ assert abs ((absoluteB - absoluteA).total_seconds () - (relativeB - relativeA)) < 10e-6
+
+def hostname ():
+ # XXX: find a better way to generate hostnames
+ return st.text (alphabet=st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789-'), min_size=1, max_size=253)
+
+def urls ():
+ """ Build http/https URL """
+ scheme = st.sampled_from (['http', 'https'])
+ # Path must start with a slash
+ pathSt = st.builds (lambda x: '/' + x, st.text ())
+ args = st.fixed_dictionaries ({
+ 'scheme': scheme,
+ 'host': hostname (),
+ 'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)),
+ 'path': pathSt,
+ 'query_string': st.text (),
+ 'fragment': st.text (),
+ })
+ return st.builds (lambda x: URL.build (**x), args)
+
+def urlsStr ():
+ return st.builds (lambda x: str (x), urls ())
+
+asciiText = st.text (st.characters (min_codepoint=32, max_codepoint=126))
+
+def chromeHeaders ():
+ # token as defined by https://tools.ietf.org/html/rfc7230#section-3.2.6
+ token = st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789!#$%&\'*+-.^_`|~')
+ # XXX: the value should be asciiText without leading/trailing spaces
+ return st.dictionaries (token, token)
+
+def fixedDicts (fixed, dynamic):
+ return st.builds (lambda x, y: x.update (y), st.fixed_dictionaries (fixed), st.lists (dynamic))
+
+def chromeRequestWillBeSent (reqid, url):
+ methodSt = st.sampled_from (['GET', 'POST', 'PUT', 'DELETE'])
+ return st.fixed_dictionaries ({
+ 'requestId': reqid,
+ 'initiator': st.just ('Test'),
+ 'wallTime': timestamp,
+ 'timestamp': timestamp,
+ 'request': st.fixed_dictionaries ({
+ 'url': url,
+ 'method': methodSt,
+ 'headers': chromeHeaders (),
+ # XXX: postData, hasPostData
+ })
+ })
+
+def chromeResponseReceived (reqid, url):
+ mimeTypeSt = st.one_of (st.none (), st.just ('text/html'))
+ remoteIpAddressSt = st.one_of (st.none (), st.just ('127.0.0.1'))
+ protocolSt = st.one_of (st.none (), st.just ('h2'))
+ statusCodeSt = st.integers (min_value=100, max_value=999)
+ typeSt = st.sampled_from (['Document', 'Stylesheet', 'Image', 'Media',
+ 'Font', 'Script', 'TextTrack', 'XHR', 'Fetch', 'EventSource',
+ 'WebSocket', 'Manifest', 'SignedExchange', 'Ping',
+ 'CSPViolationReport', 'Other'])
+ return st.fixed_dictionaries ({
+ 'requestId': reqid,
+ 'timestamp': timestamp,
+ 'type': typeSt,
+ 'response': st.fixed_dictionaries ({
+ 'url': url,
+ 'requestHeaders': chromeHeaders (), # XXX: make this optional
+ 'headers': chromeHeaders (),
+ 'status': statusCodeSt,
+ 'statusText': asciiText,
+ 'mimeType': mimeTypeSt,
+ 'remoteIPAddress': remoteIpAddressSt,
+ 'protocol': protocolSt,
+ })
+ })
+
+def chromeReqResp ():
+ # XXX: will this gnerated the same url for all testcases?
+ reqid = st.shared (st.text (), 'reqresp')
+ url = st.shared (urlsStr (), 'reqresp')
+ return st.tuples (chromeRequestWillBeSent (reqid, url),
+ chromeResponseReceived (reqid, url))
+
+def requestResponsePair ():
+ def f (creq, cresp, hasPostData, reqBody, respBody):
+ i = RequestResponsePair ()
+ i.fromRequestWillBeSent (creq)
+ i.request.hasPostData = hasPostData
+ if hasPostData:
+ i.request.body = reqBody
+
+ if cresp is not None:
+ i.fromResponseReceived (cresp)
+ if respBody is not None:
+ i.response.body = respBody
+ return i
+
+ bodySt = st.one_of (
+ st.none (),
+ st.builds (UnicodeBody, st.text ()),
+ st.builds (Base64Body.fromBytes, st.binary ())
+ )
+ return st.builds (lambda reqresp, hasPostData, reqBody, respBody:
+ f (reqresp[0], reqresp[1], hasPostData, reqBody, respBody),
+ chromeReqResp (), st.booleans (), bodySt, bodySt)
+
+@given(chromeReqResp ())
+def test_requestResponsePair (creqresp):
+ creq, cresp = creqresp
+
+ item = RequestResponsePair ()
+
+ assert item.id is None
+ assert item.url is None
+ assert item.request is None
+ assert item.response is None
+
+ item.fromRequestWillBeSent (creq)
+
+ assert item.id == creq['requestId']
+ url = URL (creq['request']['url'])
+ assert item.url == url
+ assert item.request is not None
+ assert item.request.timestamp == datetime.utcfromtimestamp (creq['wallTime'])
+ assert set (item.request.headers.keys ()) == set (creq['request']['headers'].keys ())
+ assert item.response is None
+
+ item.fromResponseReceived (cresp)
+
+ # url will not be overwritten
+ assert item.id == creq['requestId'] == cresp['requestId']
+ assert item.url == url
+ assert item.request is not None
+ assert set (item.request.headers.keys ()) == set (cresp['response']['requestHeaders'].keys ())
+ assert item.response is not None
+ assert set (item.response.headers.keys ()) == set (cresp['response']['headers'].keys ())
+ assert (item.response.timestamp - item.request.timestamp).total_seconds () - \
+ (cresp['timestamp'] - creq['timestamp']) < 10e-6
+
+@given(chromeReqResp ())
+def test_requestResponsePair_eq (creqresp):
+ creq, cresp = creqresp
+
+ item = RequestResponsePair ()
+ item2 = RequestResponsePair ()
+ assert item == item
+ assert item == item2
+
+ item.fromRequestWillBeSent (creq)
+ assert item != item2
+ item2.fromRequestWillBeSent (creq)
+ assert item == item
+ assert item == item2
+
+ item.fromResponseReceived (cresp)
+ assert item != item2
+ item2.fromResponseReceived (cresp)
+ assert item == item
+ assert item == item2
+
+ # XXX: test for inequality with different parameters
+
+### Google Chrome integration tests ###
+
+serverUrl = URL.build (scheme='http', host='localhost', port=8080)
+items = [
+ RequestResponsePair (
+ url=serverUrl.with_path ('/encoding/utf-8'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]),
+ body=UnicodeBody ('äöü'), mimeType='text/html')
+ ),
+ RequestResponsePair (
+ url=serverUrl.with_path ('/encoding/latin1'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=latin1')]),
+ body=UnicodeBody ('äöü'), mimeType='text/html')
+ ),
+ RequestResponsePair (
+ url=serverUrl.with_path ('/encoding/utf-16'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-16')]),
+ body=UnicodeBody ('äöü'), mimeType='text/html')
+ ),
+ RequestResponsePair (
+ url=serverUrl.with_path ('/encoding/ISO-8859-1'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=ISO-8859-1')]),
+ body=UnicodeBody ('äöü'), mimeType='text/html')
+ ),
+ RequestResponsePair (
+ url=serverUrl.with_path ('/status/200'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/plain')]),
+ body=b'',
+ mimeType='text/plain'),
+ ),
+ # redirects never have a response body
+ RequestResponsePair (
+ url=serverUrl.with_path ('/status/301'),
+ request=Request (method='GET'),
+ response=Response (status=301,
+ headers=CIMultiDict ([('Content-Type', 'text/plain'),
+ ('Location', str (serverUrl.with_path ('/status/301/redirected')))]),
+ body=None,
+ mimeType='text/plain'),
+ ),
+ RequestResponsePair (
+ url=serverUrl.with_path ('/image/png'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'image/png')]),
+ body=Base64Body.fromBytes (b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'),
+ mimeType='image/png'),
+ ),
+ RequestResponsePair (
+ url=serverUrl.with_path ('/script/alert'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]),
+ body=UnicodeBody ('''<html><body><script>
+window.addEventListener("beforeunload", function (e) {
+ e.returnValue = "bye?";
+ return e.returnValue;
+});
+alert("stopping here");
+if (confirm("are you sure?") || prompt ("42?")) {
+ window.location = "/nonexistent";
+}
+</script></body></html>'''), mimeType='text/html')
+ ),
+ ]
+
+@pytest.mark.asyncio
+# would be nice if we could use hypothesis here somehow
+@pytest.mark.parametrize("golden", items)
+async def test_integration_item (loader, golden):
+ async def f (req):
+ body = golden.response.body
+ contentType = golden.response.headers.get ('content-type', '') if golden.response.headers is not None else ''
+ charsetOff = contentType.find ('charset=')
+ if isinstance (body, UnicodeBody) and charsetOff != -1:
+ encoding = contentType[charsetOff+len ('charset='):]
+ body = golden.response.body.decode ('utf-8').encode (encoding)
+ return web.Response (body=body, status=golden.response.status,
+ headers=golden.response.headers)
+
+ app = web.Application ()
+ app.router.add_route (golden.request.method, golden.url.path, f)
+ runner = web.AppRunner(app)
+ await runner.setup()
+ site = web.TCPSite(runner, serverUrl.host, serverUrl.port)
+ await site.start()
+
+ try:
+ await loader.navigate (golden.url)
+
+ it = loader.__aiter__ ()
+ item = await it.__anext__ ()
+
+ # we do not know this in advance
+ item.request.initiator = None
+ item.request.headers = None
+ item.remoteIpAddress = None
+ item.protocol = None
+ item.resourceType = None
+
+ if item.response:
+ assert item.response.statusText is not None
+ item.response.statusText = None
+
+ del item.response.headers['server']
+ del item.response.headers['content-length']
+ del item.response.headers['date']
+ assert item == golden
+ finally:
+ await runner.cleanup ()
+
diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py
index 7f2635b..954e8c8 100644
--- a/crocoite/test_warc.py
+++ b/crocoite/test_warc.py
@@ -24,6 +24,7 @@ from operator import itemgetter
from warcio.archiveiterator import ArchiveIterator
from yarl import URL
+from multidict import CIMultiDict
from hypothesis import given, reproduce_failure
import hypothesis.strategies as st
import pytest
@@ -32,7 +33,8 @@ from .warc import WarcHandler
from .logger import Logger, WarcHandlerConsumer
from .controller import ControllerStart
from .behavior import Script, ScreenshotEvent, DomSnapshotEvent
-from .browser import Item
+from .browser import RequestResponsePair, Base64Body, UnicodeBody
+from .test_browser import requestResponsePair, urls
def test_log ():
logger = Logger ()
@@ -66,50 +68,6 @@ def test_log ():
data = json.loads (l.strip ())
assert data == golden.pop (0)
-def hostname ():
- # XXX: find a better way to generate hostnames
- return st.text (alphabet=st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789-'), min_size=1, max_size=253)
-
-def urls ():
- """ Build http/https URL """
- scheme = st.one_of (st.just ('http'), st.just ('https'))
- # Path must start with a slash
- pathSt = st.builds (lambda x: '/' + x, st.text ())
- args = st.fixed_dictionaries ({
- 'scheme': scheme,
- 'host': hostname (),
- 'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)),
- 'path': pathSt,
- 'query_string': st.text (),
- 'fragment': st.text (),
- })
- return st.builds (lambda x: URL.build (**x), args)
-
-def item ():
- def f (url, requestBody, body, mimeType):
- i = Item ()
- # XXX: we really need some level of abstraction. Testing is a nightmare.
- i.setRequest ({'requestId': 'myid', 'initiator': 'Test', 'wallTime': 0, 'timestamp': 1, 'request': {'url': str (url), 'method': 'GET', 'headers': {'None': 'None'}}})
- i.setResponse ({'requestId': 'myid', 'timestamp': 2, 'type': 'Document', 'response': {'url': str (url), 'requestHeaders': {'foo': 'bar', 'Set-Cookie': 'line1\nline2'}, 'headers': {'Response': 'Headers', 'Content-Length': '12345'}, 'status': 200}})
- if mimeType is not None:
- i.chromeResponse['response']['mimeType'] = 'text/html'
- i.requestBody = requestBody
- i.body = body
- return i
-
- def failedItem (url):
- i = Item ()
- i.setRequest ({'requestId': 'myid', 'initiator': 'Test', 'wallTime': 0, 'timestamp': 1, 'request': {'url': str (url), 'method': 'GET', 'headers': {'None': 'None'}}})
- i.failed = True
- return i
-
- bodySt = st.one_of (st.none (), st.tuples (st.one_of (st.none (), st.binary ()), st.booleans ()))
- mimeTypeSt = st.one_of (st.none (), st.just ('text/html'))
- return st.one_of (
- st.builds (failedItem, urls ()),
- st.builds (f, urls (), bodySt, bodySt, mimeTypeSt),
- )
-
def jsonObject ():
""" JSON-encodable objects """
return st.dictionaries (st.text (), st.one_of (st.integers (), st.text ()))
@@ -123,7 +81,7 @@ def event ():
st.builds (Script.fromStr, st.text (), st.one_of(st.none (), st.text ())),
st.builds (ScreenshotEvent, urls (), st.integers (), st.binary ()),
st.builds (DomSnapshotEvent, urls (), st.builds (lambda x: x.encode ('utf-8'), st.text ()), viewport()),
- item (),
+ requestResponsePair (),
)
@given (st.lists (event ()))
@@ -136,7 +94,7 @@ def test_push (golden):
# null logger
logger = Logger ()
- with NamedTemporaryFile() as fd:
+ with open('/tmp/test.warc.gz', 'w+b') as fd:
with WarcHandler (fd, logger) as handler:
for g in golden:
handler.push (g)
@@ -191,10 +149,7 @@ def test_push (golden):
assert headers['X-DOM-Snapshot'] == 'True'
assert rec.raw_stream.read () == g.document
- elif isinstance (g, Item):
- if g.failed:
- continue
-
+ elif isinstance (g, RequestResponsePair):
rec = next (it)
# request
@@ -204,54 +159,56 @@ def test_push (golden):
assert URL (headers['warc-target-uri']) == g.url
assert headers['x-chrome-request-id'] == g.id
- assert sorted (rec.http_headers.headers, key=itemgetter (0)) == sorted (g.requestHeaders, key=itemgetter (0))
- if g.requestBody:
- if g.requestBody[0] is None:
- assert not rec.raw_stream.read ()
+ assert CIMultiDict (rec.http_headers.headers) == g.request.headers
+ if g.request.hasPostData:
+ if g.request.body is not None:
+ assert rec.raw_stream.read () == g.request.body
+ assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.request.body, Base64Body)), (headers['x-chrome-base64body'], g.request.body)
else:
- assert rec.raw_stream.read () == g.requestBody[0], g.requestBody
- assert str (headers['x-chrome-base64body'] or False) == str (g.requestBody[1]), (headers['x-chrome-base64body'], g.requestBody)
+ # body fetch failed
+ assert headers['warc-truncated'] == 'unspecified'
+ assert not rec.raw_stream.read ()
else:
- # body fetch failed
- assert headers['warc-truncated'] == 'unspecified'
+ assert not rec.raw_stream.read ()
# response
- rec = next (it)
- headers = rec.rec_headers
- httpheaders = rec.http_headers
- assert headers['warc-type'] == 'response'
- checkWarcinfoId (headers)
- assert URL (headers['warc-target-uri']) == g.url
- assert headers['x-chrome-request-id'] == g.id
-
- # these are checked separately
- blacklistedHeaders = {'content-type', 'content-length'}
- sortedHeaders = lambda l: sorted (filter (lambda x: x[0].lower() not in blacklistedHeaders, l), key=itemgetter (0))
- assert sortedHeaders (httpheaders.headers) == sortedHeaders (g.responseHeaders)
-
- expectedContentType = g.response.get ('mimeType')
- if expectedContentType is not None:
- assert httpheaders['content-type'].startswith (expectedContentType)
-
- if g.body:
- if g.body[0] is None:
- assert not rec.raw_stream.read ()
- #assert httpheaders['content-length'] == '0'
+ if g.response:
+ rec = next (it)
+ headers = rec.rec_headers
+ httpheaders = rec.http_headers
+ assert headers['warc-type'] == 'response'
+ checkWarcinfoId (headers)
+ assert URL (headers['warc-target-uri']) == g.url
+ assert headers['x-chrome-request-id'] == g.id
+
+ # these are checked separately
+ filteredHeaders = CIMultiDict (httpheaders.headers)
+ for b in {'content-type', 'content-length'}:
+ if b in g.response.headers:
+ g.response.headers.popall (b)
+ if b in filteredHeaders:
+ filteredHeaders.popall (b)
+ assert filteredHeaders == g.response.headers
+
+ expectedContentType = g.response.mimeType
+ if expectedContentType is not None:
+ assert httpheaders['content-type'].startswith (expectedContentType)
+
+ if g.response.body is not None:
+ assert rec.raw_stream.read () == g.response.body
+ assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.response.body, Base64Body))
+ assert httpheaders['content-length'] == str (len (g.response.body))
+ # body is never truncated if it exists
+ assert headers['warc-truncated'] is None
+
+ # unencoded strings are converted to utf8
+ if isinstance (g.response.body, UnicodeBody) and httpheaders['content-type'] is not None:
+ assert httpheaders['content-type'].endswith ('; charset=utf-8')
else:
- assert rec.raw_stream.read () == g.body[0]
- assert str (headers['x-chrome-base64body'] or False) == str (g.body[1])
- assert httpheaders['content-length'] == str (len (g.body[0]))
-
- # body is never truncated if it exists
- assert headers['warc-truncated'] is None
-
- # unencoded strings are converted to utf8
- if not g.body[1] and httpheaders['content-type'] is not None:
- assert httpheaders['content-type'].endswith ('; charset=utf-8')
- else:
- # body fetch failed
- assert headers['warc-truncated'] == 'unspecified'
- # content-length header should be kept intact
+ # body fetch failed
+ assert headers['warc-truncated'] == 'unspecified'
+ assert not rec.raw_stream.read ()
+ # content-length header should be kept intact
else:
assert False, f"invalid golden type {type(g)}" # pragma: no cover
diff --git a/crocoite/warc.py b/crocoite/warc.py
index dbd9ebc..cb1f2f7 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -25,6 +25,7 @@ Classes writing data to WARC files
import json, threading
from io import BytesIO
from datetime import datetime
+from http.server import BaseHTTPRequestHandler
from warcio.timeutils import datetime_to_iso_date
from warcio.warcwriter import WARCWriter
@@ -33,7 +34,7 @@ from warcio.statusandheaders import StatusAndHeaders
from .util import packageUrl, StrJsonEncoder
from .controller import EventHandler, ControllerStart
from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
-from .browser import Item
+from .browser import RequestResponsePair, UnicodeBody, Base64Body
class WarcHandler (EventHandler):
__slots__ = ('logger', 'writer', 'documentRecords', 'log',
@@ -86,66 +87,51 @@ class WarcHandler (EventHandler):
url = item.url
path = url.relative().with_fragment(None)
- httpHeaders = StatusAndHeaders(f'{req["method"]} {path} HTTP/1.1',
- item.requestHeaders, protocol='HTTP/1.1', is_http_request=True)
- initiator = item.initiator
+ httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1',
+ req.headers, protocol='HTTP/1.1', is_http_request=True)
warcHeaders = {
- 'X-Chrome-Initiator': json.dumps (initiator),
+ 'X-Chrome-Initiator': json.dumps (req.initiator),
'X-Chrome-Request-ID': item.id,
- 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])),
+ 'WARC-Date': datetime_to_iso_date (req.timestamp),
}
- if item.requestBody is not None:
- payload, payloadBase64Encoded = item.requestBody
- else:
+ body = item.request.body
+ if item.request.hasPostData and body is None:
# oops, don’t know what went wrong here
- logger.error ('requestBody missing', uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
+ logger.error ('requestBody missing',
+ uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
warcHeaders['WARC-Truncated'] = 'unspecified'
- payload = None
-
- if payload is not None:
- payload = BytesIO (payload)
- warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded)
+ else:
+ warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body)
+ body = BytesIO (body)
record = self.writeRecord (url, 'request',
- payload=payload, http_headers=httpHeaders,
+ payload=body, http_headers=httpHeaders,
warc_headers_dict=warcHeaders)
return record.rec_headers['WARC-Record-ID']
def _writeResponse (self, item, concurrentTo):
# fetch the body
reqId = item.id
- rawBody = None
- base64Encoded = False
- bodyTruncated = None
- if item.isRedirect or item.body is None:
- # redirects reuse the same request, thus we cannot safely retrieve
- # the body (i.e getResponseBody may return the new location’s
- # body). No body available means we failed to retrieve it.
- bodyTruncated = 'unspecified'
- else:
- rawBody, base64Encoded = item.body
# now the response
resp = item.response
warcHeaders = {
'WARC-Concurrent-To': concurrentTo,
- 'WARC-IP-Address': resp.get ('remoteIPAddress', ''),
- 'X-Chrome-Protocol': resp.get ('protocol', ''),
- 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
- 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
'X-Chrome-Request-ID': item.id,
- 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (
- item.chromeRequest['wallTime']+
- (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))),
+ 'WARC-Date': datetime_to_iso_date (resp.timestamp),
}
- if bodyTruncated:
- warcHeaders['WARC-Truncated'] = bodyTruncated
- else:
- warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded)
-
- httpHeaders = StatusAndHeaders(f'{resp["status"]} {item.statusText}',
- item.responseHeaders,
- protocol='HTTP/1.1')
+ # conditional WARC headers
+ if item.remoteIpAddress:
+ warcHeaders['WARC-IP-Address'] = item.remoteIpAddress
+ if item.protocol:
+ warcHeaders['X-Chrome-Protocol'] = item.protocol
+
+ # HTTP headers
+ statusText = resp.statusText or \
+ BaseHTTPRequestHandler.responses.get (
+ resp.status, ('No status text available', ))[0]
+ httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}',
+ resp.headers, protocol='HTTP/1.1')
# Content is saved decompressed and decoded, remove these headers
blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
@@ -155,20 +141,23 @@ class WarcHandler (EventHandler):
# chrome sends nothing but utf8 encoded text. Fortunately HTTP
# headers take precedence over the document’s <meta>, thus we can
# easily override those.
- contentType = resp.get ('mimeType')
+ contentType = resp.mimeType
if contentType:
- if not base64Encoded:
+ if isinstance (resp.body, UnicodeBody):
contentType += '; charset=utf-8'
httpHeaders.replace_header ('Content-Type', contentType)
- if rawBody is not None:
- httpHeaders.replace_header ('Content-Length', str (len (rawBody)))
- bodyIo = BytesIO (rawBody)
+ # response body
+ body = resp.body
+ if body is None:
+ warcHeaders['WARC-Truncated'] = 'unspecified'
else:
- bodyIo = BytesIO ()
+ httpHeaders.replace_header ('Content-Length', str (len (body)))
+ warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body)
+ body = BytesIO (body)
record = self.writeRecord (item.url, 'response',
- warc_headers_dict=warcHeaders, payload=bodyIo,
+ warc_headers_dict=warcHeaders, payload=body,
http_headers=httpHeaders)
if item.resourceType == 'Document':
@@ -184,12 +173,11 @@ class WarcHandler (EventHandler):
f'application/javascript; charset={encoding}'})
def _writeItem (self, item):
- if item.failed:
- # should have been handled by the logger already
- return
-
+ assert item.request
concurrentTo = self._writeRequest (item)
- self._writeResponse (item, concurrentTo)
+ # items that failed loading don’t have a response
+ if item.response:
+ self._writeResponse (item, concurrentTo)
def _addRefersTo (self, headers, url):
refersTo = self.documentRecords.get (url)
@@ -247,7 +235,7 @@ class WarcHandler (EventHandler):
self._flushLogEntries ()
route = {Script: _writeScript,
- Item: _writeItem,
+ RequestResponsePair: _writeItem,
DomSnapshotEvent: _writeDomSnapshot,
ScreenshotEvent: _writeScreenshot,
ControllerStart: _writeControllerStart,
diff --git a/setup.py b/setup.py
index 982fa92..7cf6b32 100644
--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,7 @@ setup(
'aiohttp',
'PyYAML',
'yarl',
+ 'multidict',
],
entry_points={
'console_scripts': [