summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- README.rst 2
-rw-r--r-- crocoite/browser.py 326
-rw-r--r-- crocoite/controller.py 12
-rw-r--r-- crocoite/logger.py 2
-rw-r--r-- crocoite/test_browser.py 531
-rw-r--r-- crocoite/test_warc.py 145
-rw-r--r-- crocoite/warc.py 96
-rw-r--r-- setup.py 1
8 files changed, 630 insertions, 485 deletions
diff --git a/README.rst b/README.rst
index 71d9947..45d2e0f 100644
--- a/README.rst
+++ b/README.rst
@@ -24,6 +24,7 @@ These dependencies must be present to run crocoite:
- warcio_
- html5lib_
- yarl_
+- multidict_
- bottom_ (IRC client)
- `Google Chrome`_
@@ -35,6 +36,7 @@ These dependencies must be present to run crocoite:
.. _bottom: https://github.com/numberoverzero/bottom
.. _Google Chrome: https://www.google.com/chrome/
.. _yarl: https://yarl.readthedocs.io/
+.. _multidict: https://multidict.readthedocs.io/
The following commands clone the repository from GitHub_, set up a virtual
environment and install crocoite:
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 3de61f0..50561ed 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -23,80 +23,197 @@ Chrome browser interactions.
"""
import asyncio
-from base64 import b64decode
+from base64 import b64decode, b64encode
+from datetime import datetime, timedelta
from http.server import BaseHTTPRequestHandler
+
from yarl import URL
+from multidict import CIMultiDict
from .logger import Level
from .devtools import Browser, TabException
-class Item:
- """
- Simple wrapper containing Chrome request and response
- """
+# These two classes’ only purpose is so we can later tell whether a body was
+# base64-encoded or a unicode string
+class Base64Body (bytes):
+ """ bytes subclass holding the decoded form of a base64-encoded value """
+ def __new__ (cls, value):
+ # value is base64-encoded; the instance stores the decoded bytes
+ return bytes.__new__ (cls, b64decode (value))
+
+ @classmethod
+ def fromBytes (cls, b):
+ """ For testing: build an instance from raw (not yet encoded) bytes """
+ return cls (b64encode (b))
+
+class UnicodeBody (bytes):
+ """ bytes subclass holding the UTF-8 encoding of a unicode string """
+ def __new__ (cls, value):
+ # exact type check: anything but a plain str (including str
+ # subclasses) is rejected
+ if type (value) is not str:
+ raise TypeError ('expecting unicode string')
+
+ return bytes.__new__ (cls, value.encode ('utf-8'))
- __slots__ = ('chromeRequest', 'chromeResponse', 'chromeFinished',
- 'isRedirect', 'failed', 'body', 'requestBody')
+class Request:
+ __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp')
- def __init__ (self):
- self.chromeRequest = {}
- self.chromeResponse = {}
- self.chromeFinished = {}
- self.isRedirect = False
- self.failed = False
- self.body = None
- self.requestBody = None
+ def __init__ (self, method=None, headers=None, body=None):
+ self.headers = headers
+ self.body = body
+ self.hasPostData = False
+ self.initiator = None
+ # HTTP method
+ self.method = method
+ self.timestamp = None
def __repr__ (self):
- return f'<Item {self.url}>'
-
- @property
- def request (self):
- return self.chromeRequest.get ('request', {})
-
- @property
- def response (self):
- return self.chromeResponse.get ('response', {})
-
- @property
- def initiator (self):
- return self.chromeRequest['initiator']
-
- @property
- def id (self):
- return self.chromeRequest['requestId']
-
- @property
- def encodedDataLength (self):
- return self.chromeFinished['encodedDataLength']
-
- @property
- def url (self):
- return URL (self.response.get ('url', self.request.get ('url')))
-
- @property
- def requestHeaders (self):
- # the response object may contain refined headers, which were
- # *actually* sent over the wire
- return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers']))
-
- @property
- def responseHeaders (self):
- return self._unfoldHeaders (self.response['headers'])
-
- @property
- def statusText (self):
- text = self.response.get ('statusText')
- if text:
- return text
- text = BaseHTTPRequestHandler.responses.get (self.response['status'])
- if text:
- return text[0]
- return 'No status text available'
-
- @property
- def resourceType (self):
- return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None))
+ return f'Request({self.method!r}, {self.headers!r}, {self.body!r})'
+
+ def __eq__ (self, b):
+ if b is None:
+ return False
+
+ if not isinstance (b, Request):
+ raise TypeError ('Can only compare equality with Request.')
+
+ # do not compare hasPostData (only required to fetch body) and
+ # timestamp (depends on time)
+ return self.headers == b.headers and \
+ self.body == b.body and \
+ self.initiator == b.initiator and \
+ self.method == b.method
+
+class Response:
+ """ Plain container for the response half of a request/response pair """
+ __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived',
+ 'timestamp', 'mimeType')
+
+ def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None):
+ self.status = status
+ self.statusText = statusText
+ self.headers = headers
+ self.body = body
+ # bytes received over the network (not body size!)
+ self.bytesReceived = 0
+ self.timestamp = None
+ self.mimeType = mimeType
+
+ def __repr__ (self):
+ return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})'
+
+ def __eq__ (self, b):
+ """
+ Compare response content. None is never equal; any other
+ non-Response object raises TypeError.
+ """
+ if b is None:
+ return False
+
+ if not isinstance (b, Response):
+ raise TypeError ('Can only compare equality with Response.')
+
+ # do not compare bytesReceived (depends on network) and timestamp
+ # (depends on time).
+ # NOTE(review): statusText *is* compared below, although it arguably
+ # does not matter -- confirm whether that is intended
+ return self.status == b.status and \
+ self.statusText == b.statusText and \
+ self.headers == b.headers and \
+ self.body == b.body and \
+ self.mimeType == b.mimeType
+
+class ReferenceTimestamp:
+ """
+ Map relative timestamp to absolute timestamp
+
+ Constructed from one reference pair (relative, absolute), both given in
+ seconds. Calling the instance with another relative timestamp returns
+ the matching absolute datetime.
+ """
+
+ def __init__ (self, relative, absolute):
+ self.relative = timedelta (seconds=relative)
+ # naive datetime, interpreted as UTC
+ self.absolute = datetime.utcfromtimestamp (absolute)
+
+ def __call__ (self, relative):
+ # accept plain seconds as well as a timedelta
+ if not isinstance (relative, timedelta):
+ relative = timedelta (seconds=relative)
+ # shift the absolute reference by the distance between both relative values
+ return self.absolute + (relative-self.relative)
+
+class RequestResponsePair:
+ __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress',
+ 'protocol', 'resourceType', '_time')
+
+ def __init__ (self, id=None, url=None, request=None, response=None):
+ self.request = request
+ self.response = response
+ self.id = id
+ self.url = url
+ self.remoteIpAddress = None
+ self.protocol = None
+ self.resourceType = None
+ self._time = None
+
+ def __repr__ (self):
+ return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})'
+
+ def __eq__ (self, b):
+ """
+ Compare content of two pairs. Raises TypeError for any other type,
+ including None.
+ """
+ if not isinstance (b, RequestResponsePair):
+ raise TypeError (f'Can only compare with {self.__class__.__name__}')
+
+ # do not compare id and _time. These depend on external factors and do
+ # not influence the request/response *content*
+ return self.request == b.request and \
+ self.response == b.response and \
+ self.url == b.url and \
+ self.remoteIpAddress == b.remoteIpAddress and \
+ self.protocol == b.protocol and \
+ self.resourceType == b.resourceType
+
+ def fromRequestWillBeSent (self, req):
+ """ Set request data from Chrome Network.requestWillBeSent event """
+ r = req['request']
+
+ self.id = req['requestId']
+ self.url = URL (r['url'])
+ self.resourceType = req.get ('type')
+ self._time = ReferenceTimestamp (req['timestamp'], req['wallTime'])
+
+ assert self.request is None, req
+ self.request = Request ()
+ self.request.initiator = req['initiator']
+ self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
+ self.request.hasPostData = r.get ('hasPostData', False)
+ self.request.method = r['method']
+ self.request.timestamp = self._time (req['timestamp'])
+ if self.request.hasPostData:
+ postData = r.get ('postData')
+ if postData is not None:
+ self.request.body = UnicodeBody (postData)
+
+ def fromResponse (self, r, timestamp=None, resourceType=None):
+ """
+ Set response data from Chrome’s Response object.
+
+ Request must exist. Updates if response was set before. Sometimes
+ fromResponseReceived is triggered twice by Chrome. No idea why.
+ """
+ assert self.request is not None, (self.request, r)
+
+ if not timestamp:
+ timestamp = self.request.timestamp
+
+ self.remoteIpAddress = r.get ('remoteIPAddress')
+ self.protocol = r.get ('protocol')
+ if resourceType:
+ self.resourceType = resourceType
+
+ # a response may contain updated request headers (i.e. those actually
+ # sent over the wire)
+ if 'requestHeaders' in r:
+ self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders']))
+
+ self.response = Response ()
+ self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
+ self.response.status = r['status']
+ self.response.statusText = r['statusText']
+ self.response.timestamp = timestamp
+ self.response.mimeType = r['mimeType']
+
+ def fromResponseReceived (self, resp):
+ """ Set response data from Chrome Network.responseReceived """
+ return self.fromResponse (resp['response'],
+ self._time (resp['timestamp']), resp['type'])
+
+ def fromLoadingFinished (self, data):
+ """
+ Record the number of bytes received, taken from the event’s
+ encodedDataLength. A response must have been set before.
+ """
+ self.response.bytesReceived = data['encodedDataLength']
+
+ def fromLoadingFailed (self, data):
+ """ Drop any response that was set; data is currently unused """
+ self.response = None
@staticmethod
def _unfoldHeaders (headers):
@@ -110,44 +227,26 @@ class Item:
items.append ((k, v))
return items
- def setRequest (self, req):
- self.chromeRequest = req
-
- def setResponse (self, resp):
- self.chromeResponse = resp
-
- def setFinished (self, finished):
- self.chromeFinished = finished
-
async def prefetchRequestBody (self, tab):
- # request body
- req = self.request
- postData = req.get ('postData')
- if postData:
- self.requestBody = postData.encode ('utf8'), False
- elif req.get ('hasPostData', False):
+ """
+ Fetch request body from the tab, unless the request has none or the
+ body was already delivered inline with requestWillBeSent. An existing
+ body is never overwritten.
+ """
+ if self.request.hasPostData and self.request.body is None:
try:
postData = await tab.Network.getRequestPostData (requestId=self.id)
- postData = postData['postData']
- self.requestBody = b64decode (postData), True
+ self.request.body = UnicodeBody (postData['postData'])
except TabException:
- self.requestBody = None
+ # body could not be retrieved, leave it unset
+ self.request.body = None
- else:
- self.requestBody = None, False
async def prefetchResponseBody (self, tab):
- # get response body
+ """ Fetch response body """
try:
body = await tab.Network.getResponseBody (requestId=self.id)
- rawBody = body['body']
- base64Encoded = body['base64Encoded']
- if base64Encoded:
- rawBody = b64decode (rawBody)
+ if body['base64Encoded']:
+ self.response.body = Base64Body (body['body'])
else:
- rawBody = rawBody.encode ('utf8')
- self.body = rawBody, base64Encoded
+ self.response.body = UnicodeBody (body['body'])
except TabException:
- self.body = None
+ self.response.body = None
class VarChangeEvent:
""" Notify when variable is changed """
@@ -179,14 +278,14 @@ class SiteLoader:
XXX: track popup windows/new tabs and close them
"""
- __slots__ = ('requests', 'browser', 'url', 'logger', 'tab', '_iterRunning', 'idle', '_framesLoading')
+ __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning',
+ 'idle', '_framesLoading')
allowedSchemes = {'http', 'https'}
- def __init__ (self, browser, url, logger):
+ def __init__ (self, browser, logger):
self.requests = {}
self.browser = Browser (url=browser)
- self.url = url
- self.logger = logger.bind (context=type (self).__name__, url=url)
+ self.logger = logger.bind (context=type (self).__name__)
self._iterRunning = []
self.idle = VarChangeEvent (True)
@@ -250,7 +349,7 @@ class SiteLoader:
result = t.result ()
if result is None:
pass
- elif isinstance (result, Item):
+ elif isinstance (result, RequestResponsePair):
yield result
else:
method, data = result
@@ -263,8 +362,8 @@ class SiteLoader:
running = pending
self._iterRunning = running
- async def start (self):
- await self.tab.Page.navigate(url=self.url)
+ async def navigate (self, url):
+ await self.tab.Page.navigate(url=url)
# internal chrome callbacks
async def _requestWillBeSent (self, **kwargs):
@@ -282,21 +381,24 @@ class SiteLoader:
# redirects never “finish” loading, but yield another requestWillBeSent with this key set
redirectResp = kwargs.get ('redirectResponse')
if redirectResp:
- # create fake responses
- resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']}
- item.setResponse (resp)
- resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']}
- item.setFinished (resp)
- item.isRedirect = True
+ if item.url != url:
+ # this happens for unknown reasons. the docs simply state
+ # it can differ in case of a redirect. Fix it and move on.
+ logger.warning ('redirect url differs',
+ uuid='558a7df7-2258-4fe4-b16d-22b6019cc163',
+ expected=item.url)
+ redirectResp['url'] = str (item.url)
+ item.fromResponse (redirectResp)
logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url)
+ # XXX: queue this? no need to wait for it
await item.prefetchRequestBody (self.tab)
- # cannot fetch request body due to race condition (item id reused)
+ # cannot fetch response body due to race condition (item id reused)
ret = item
else:
logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b')
- item = Item ()
- item.setRequest (kwargs)
+ item = RequestResponsePair ()
+ item.fromRequestWillBeSent (kwargs)
self.requests[reqId] = item
logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f')
@@ -315,7 +417,7 @@ class SiteLoader:
logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url)
if url.scheme in self.allowedSchemes:
logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0')
- item.setResponse (kwargs)
+ item.fromResponseReceived (kwargs)
else:
logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666')
@@ -333,19 +435,21 @@ class SiteLoader:
logger = self.logger.bind (reqId=reqId, reqUrl=item.url)
if item.url.scheme in self.allowedSchemes:
logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02')
- item.setFinished (kwargs)
+ item.fromLoadingFinished (kwargs)
+ # XXX queue both
await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab))
return item
async def _loadingFailed (self, **kwargs):
reqId = kwargs['requestId']
- self.logger.warning ('loading failed',
+ logger = self.logger.bind (reqId=reqId)
+ logger.warning ('loading failed',
uuid='68410f13-6eea-453e-924e-c1af4601748b',
errorText=kwargs['errorText'],
blockedReason=kwargs.get ('blockedReason'))
item = self.requests.pop (reqId, None)
if item is not None:
- item.failed = True
+ item.fromLoadingFailed (kwargs)
return item
async def _entryAdded (self, **kwargs):
diff --git a/crocoite/controller.py b/crocoite/controller.py
index 504fa23..a64a8dc 100644
--- a/crocoite/controller.py
+++ b/crocoite/controller.py
@@ -30,7 +30,7 @@ from operator import attrgetter
from yarl import URL
from . import behavior as cbehavior
-from .browser import SiteLoader, Item
+from .browser import SiteLoader, RequestResponsePair
from .util import getFormattedViewportMetrics, getSoftwareInfo
from .behavior import ExtractLinksEvent
@@ -61,13 +61,13 @@ class StatsHandler (EventHandler):
self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0}
def push (self, item):
- if isinstance (item, Item):
+ if isinstance (item, RequestResponsePair):
self.stats['requests'] += 1
- if item.failed:
+ if not item.response:
self.stats['failed'] += 1
else:
self.stats['finished'] += 1
- self.stats['bytesRcv'] += item.encodedDataLength
+ self.stats['bytesRcv'] += item.response.bytesReceived
class LogHandler (EventHandler):
""" Handle items by logging information about them """
@@ -126,7 +126,7 @@ class SinglePageController:
async for item in l:
self.processItem (item)
- async with self.service as browser, SiteLoader (browser, self.url, logger=logger) as l:
+ async with self.service as browser, SiteLoader (browser, logger=logger) as l:
handle = asyncio.ensure_future (processQueue ())
start = time.time ()
@@ -153,7 +153,7 @@ class SinglePageController:
}
self.processItem (ControllerStart (payload))
- await l.start ()
+ await l.navigate (self.url)
for b in enabledBehavior:
async for item in b.onload ():
self.processItem (item)
diff --git a/crocoite/logger.py b/crocoite/logger.py
index d882eaf..82d7f5b 100644
--- a/crocoite/logger.py
+++ b/crocoite/logger.py
@@ -105,7 +105,7 @@ class PrintConsumer (Consumer):
return kwargs
class JsonPrintConsumer (Consumer):
- def __init__ (self, minLevel=Level.INFO):
+ def __init__ (self, minLevel=Level.DEBUG):
self.minLevel = minLevel
def __call__ (self, **kwargs):
diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py
index 8008855..4bf2c64 100644
--- a/crocoite/test_browser.py
+++ b/crocoite/test_browser.py
@@ -19,103 +19,28 @@
# THE SOFTWARE.
import asyncio, socket
-import pytest
from operator import itemgetter
-from aiohttp import web
from http.server import BaseHTTPRequestHandler
+from datetime import datetime
+
+from yarl import URL
+from aiohttp import web
+from multidict import CIMultiDict
-from .browser import Item, SiteLoader, VarChangeEvent
+from hypothesis import given
+import hypothesis.strategies as st
+import pytest
+
+from .browser import RequestResponsePair, SiteLoader, VarChangeEvent, Request, \
+ UnicodeBody, ReferenceTimestamp, Base64Body, UnicodeBody, Request, \
+ Response
from .logger import Logger, Consumer
from .devtools import Crashed, Process
# if you want to know what’s going on:
+#import logging
#logging.basicConfig(level=logging.DEBUG)
-class TItem (Item):
- """ This should be as close to Item as possible """
-
- __slots__ = ('bodySend', '_body', '_requestBody')
- base = 'http://localhost:8000/'
-
- def __init__ (self, path, status, headers, bodyReceive, bodySend=None, requestBody=None, failed=False, isRedirect=False):
- super ().__init__ ()
- self.chromeResponse = {'response': {'headers': headers, 'status': status, 'url': self.base + path}}
- self.body = bodyReceive, False
- self.bodySend = bodyReceive if not bodySend else bodySend
- self.requestBody = requestBody, False
- self.failed = failed
- self.isRedirect = isRedirect
-
-testItems = [
- TItem ('binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00\x01\x02', failed=True),
- TItem ('attachment', 200,
- {'Content-Type': 'text/plain; charset=utf-8',
- 'Content-Disposition': 'attachment; filename="attachment.txt"',
- },
- 'This is a simple text file with umlauts. ÄÖU.'.encode ('utf8'), failed=True),
- TItem ('encoding/utf8', 200, {'Content-Type': 'text/plain; charset=utf-8'},
- 'This is a test, äöü μνψκ ¥¥¥¿ýý¡'.encode ('utf8')),
- TItem ('encoding/iso88591', 200, {'Content-Type': 'text/plain; charset=ISO-8859-1'},
- 'This is a test, äöü.'.encode ('utf8'),
- 'This is a test, äöü.'.encode ('ISO-8859-1')),
- TItem ('encoding/latin1', 200, {'Content-Type': 'text/plain; charset=latin1'},
- 'This is a test, äöü.'.encode ('utf8'),
- 'This is a test, äöü.'.encode ('latin1')),
- TItem ('image', 200, {'Content-Type': 'image/png'},
- # 1×1 png image
- b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'),
- TItem ('empty', 200, {'Content-Type': 'text/plain'}, b''),
- TItem ('headers/duplicate', 200, [('Content-Type', 'text/plain'), ('Duplicate', '1'), ('Duplicate', '2')], b''),
- TItem ('headers/fetch/req', 200, {'Content-Type': 'text/plain'}, b''),
- TItem ('headers/fetch/html', 200, {'Content-Type': 'text/html'},
- r"""<html><body><script>
- let h = new Headers([["custom", "1"]]);
- fetch("/headers/fetch/req", {"method": "GET", "headers": h}).then(x => console.log("done"));
- </script></body></html>""".encode ('utf8')),
- TItem ('redirect/301/empty', 301, {'Location': '/empty'}, b'', isRedirect=True),
- TItem ('redirect/301/redirect/301/empty', 301, {'Location': '/redirect/301/empty'}, b'', isRedirect=True),
- TItem ('nonexistent', 404, {}, b''),
- TItem ('html', 200, {'Content-Type': 'text/html'},
- '<html><body><img src="/image"><img src="/nonexistent"></body></html>'.encode ('utf8')),
- TItem ('html/alert', 200, {'Content-Type': 'text/html'},
- '<html><body><script>window.addEventListener("beforeunload", function (e) { e.returnValue = "bye?"; return e.returnValue; }); alert("stopping here"); if (confirm("are you sure?") || prompt ("42?")) { window.location = "/nonexistent"; }</script><script>document.write(\'<img src="/image">\');</script></body></html>'.encode ('utf8')),
- TItem ('html/fetchPost', 200, {'Content-Type': 'text/html'},
- r"""<html><body><script>
- let a = fetch("/html/fetchPost/binary", {"method": "POST", "body": "\x00"});
- let b = fetch("/html/fetchPost/form", {"method": "POST", "body": new URLSearchParams({"data": "!"})});
- let c = fetch("/html/fetchPost/binary/large", {"method": "POST", "body": "\x00".repeat(100*1024)});
- let d = fetch("/html/fetchPost/form/large", {"method": "POST", "body": new URLSearchParams({"data": "!".repeat(100*1024)})});
- </script></body></html>""".encode ('utf8')),
- TItem ('html/fetchPost/binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'\x00'),
- TItem ('html/fetchPost/form', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=%21'),
- # XXX: these should trigger the need for getRequestPostData, but they don’t. oh well.
- TItem ('html/fetchPost/binary/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=(100*1024)*b'\x00'),
- TItem ('html/fetchPost/form/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=' + (100*1024)*b'%21'),
- ]
-testItemMap = dict ([(item.url.path, item) for item in testItems])
-
-def itemToResponse (item):
- async def f (req):
- headers = item.response['headers']
- return web.Response(body=item.bodySend, status=item.response['status'],
- headers=headers)
- return f
-
-@pytest.fixture
-async def server ():
- """ Simple HTTP server for testing notifications """
- import logging
- logging.basicConfig(level=logging.DEBUG)
- app = web.Application()
- for item in testItems:
- app.router.add_route ('*', item.url.path, itemToResponse (item))
- runner = web.AppRunner(app)
- await runner.setup()
- site = web.TCPSite(runner, 'localhost', 8080)
- await site.start()
- yield app
- await runner.cleanup ()
-
class AssertConsumer (Consumer):
def __call__ (self, **kwargs):
assert 'uuid' in kwargs
@@ -128,134 +53,14 @@ def logger ():
return Logger (consumer=[AssertConsumer ()])
@pytest.fixture
-async def loader (server, logger):
- def f (path):
- if path.startswith ('/'):
- path = 'http://localhost:8080{}'.format (path)
- return SiteLoader (browser, path, logger)
- async with Process () as browser:
- yield f
-
-async def itemsLoaded (l, items):
- items = dict ([(i.url.path, i) for i in items])
- async for item in l:
- assert item.chromeResponse is not None
- golden = items.pop (item.url.path)
- if not golden:
- assert False, f'url {item.url} not supposed to be fetched'
- assert item.failed == golden.failed
- if item.failed:
- # response will be invalid if request failed
- if not items:
- break
- else:
- continue
- assert item.isRedirect == golden.isRedirect
- if golden.isRedirect:
- assert item.body is None
- else:
- assert item.body[0] == golden.body[0]
- assert item.requestBody[0] == golden.requestBody[0]
- assert item.response['status'] == golden.response['status']
- assert item.statusText == BaseHTTPRequestHandler.responses.get (item.response['status'])[0]
- for k, v in golden.responseHeaders:
- actual = list (map (itemgetter (1), filter (lambda x: x[0] == k, item.responseHeaders)))
- assert v in actual
-
- # we’re done when everything has been loaded
- if not items:
- break
-
-async def literalItem (lf, item, deps=[]):
- async with lf (item.url.path) as l:
- await l.start ()
- await asyncio.wait_for (itemsLoaded (l, [item] + deps), timeout=30)
-
-@pytest.mark.asyncio
-async def test_empty (loader):
- await literalItem (loader, testItemMap['/empty'])
-
-@pytest.mark.asyncio
-async def test_headers_duplicate (loader):
- """
- Some headers, like Set-Cookie can be present multiple times. Chrome
- separates these with a newline.
- """
- async with loader ('/headers/duplicate') as l:
- await l.start ()
- async for it in l:
- if it.url.path == '/headers/duplicate':
- assert not it.failed
- dup = list (filter (lambda x: x[0] == 'Duplicate', it.responseHeaders))
- assert len(dup) == 2
- assert list(sorted(map(itemgetter(1), dup))) == ['1', '2']
- break
-
-@pytest.mark.asyncio
-async def test_headers_req (loader):
- """
- Custom request headers. JavaScript’s Headers() does not support duplicate
- headers, so we can’t generate those.
- """
- async with loader ('/headers/fetch/html') as l:
- await l.start ()
- async for it in l:
- if it.url.path == '/headers/fetch/req':
- assert not it.failed
- dup = list (filter (lambda x: x[0] == 'custom', it.requestHeaders))
- assert len(dup) == 1
- assert list(sorted(map(itemgetter(1), dup))) == ['1']
- break
-
-@pytest.mark.asyncio
-async def test_redirect (loader):
- await literalItem (loader, testItemMap['/redirect/301/empty'], [testItemMap['/empty']])
- # chained redirects
- await literalItem (loader, testItemMap['/redirect/301/redirect/301/empty'], [testItemMap['/redirect/301/empty'], testItemMap['/empty']])
-
-@pytest.mark.asyncio
-async def test_encoding (loader):
- """ Text responses are transformed to UTF-8. Make sure this works
- correctly. """
- for item in {testItemMap['/encoding/utf8'], testItemMap['/encoding/latin1'], testItemMap['/encoding/iso88591']}:
- await literalItem (loader, item)
-
-@pytest.mark.asyncio
-async def test_binary (loader):
- """ Browser should ignore content it cannot display (i.e. octet-stream) """
- await literalItem (loader, testItemMap['/binary'])
-
-@pytest.mark.asyncio
-async def test_image (loader):
- """ Images should be displayed inline """
- await literalItem (loader, testItemMap['/image'])
-
-@pytest.mark.asyncio
-async def test_attachment (loader):
- """ And downloads won’t work in headless mode, even if it’s just a text file """
- await literalItem (loader, testItemMap['/attachment'])
-
-@pytest.mark.asyncio
-async def test_html (loader):
- await literalItem (loader, testItemMap['/html'], [testItemMap['/image'], testItemMap['/nonexistent']])
- # make sure alerts are dismissed correctly (image won’t load otherwise)
- await literalItem (loader, testItemMap['/html/alert'], [testItemMap['/image']])
-
-@pytest.mark.asyncio
-async def test_post (loader):
- """ XHR POST request with binary data"""
- await literalItem (loader, testItemMap['/html/fetchPost'],
- [testItemMap['/html/fetchPost/binary'],
- testItemMap['/html/fetchPost/binary/large'],
- testItemMap['/html/fetchPost/form'],
- testItemMap['/html/fetchPost/form/large']])
+async def loader (logger):
+ async with Process () as browser, SiteLoader (browser, logger) as l:
+ yield l
@pytest.mark.asyncio
async def test_crash (loader):
- async with loader ('/html') as l:
- await l.start ()
- with pytest.raises (Crashed):
- await l.tab.Page.crash ()
+ with pytest.raises (Crashed):
+ await loader.tab.Page.crash ()
@pytest.mark.asyncio
async def test_invalidurl (loader):
@@ -267,15 +72,16 @@ async def test_invalidurl (loader):
try:
resolved = await loop.getaddrinfo (host, None)
except socket.gaierror:
- async with loader (f'http://{host}/') as l:
- await l.start ()
- async for it in l:
- assert it.failed
- break
+ url = URL.build (scheme='http', host=host)
+ await loader.navigate (url)
+ async for it in loader:
+ assert it.request is not None
+ assert it.url == url
+ assert it.response is None
+ break
else:
pytest.skip (f'host {host} resolved to {resolved}')
-
@pytest.mark.asyncio
async def test_varchangeevent ():
e = VarChangeEvent (True)
@@ -299,3 +105,290 @@ async def test_varchangeevent ():
assert ret == False
assert e.get () == ret
+timestamp = st.one_of (
+ st.integers(min_value=0, max_value=2**32-1),
+ st.floats (min_value=0, max_value=2**32-1),
+ )
+
+@given(timestamp, timestamp, timestamp)
+def test_referencetimestamp (relativeA, absoluteA, relativeB):
+ ts = ReferenceTimestamp (relativeA, absoluteA)
+ absoluteA = datetime.utcfromtimestamp (absoluteA)
+ absoluteB = ts (relativeB)
+ assert (absoluteA < absoluteB and relativeA < relativeB) or \
+ (absoluteA >= absoluteB and relativeA >= relativeB)
+ assert abs ((absoluteB - absoluteA).total_seconds () - (relativeB - relativeA)) < 10e-6
+
+def hostname ():
+ # XXX: find a better way to generate hostnames
+ return st.text (alphabet=st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789-'), min_size=1, max_size=253)
+
+def urls ():
+ """ Build http/https URL """
+ scheme = st.sampled_from (['http', 'https'])
+ # Path must start with a slash
+ pathSt = st.builds (lambda x: '/' + x, st.text ())
+ args = st.fixed_dictionaries ({
+ 'scheme': scheme,
+ 'host': hostname (),
+ 'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)),
+ 'path': pathSt,
+ 'query_string': st.text (),
+ 'fragment': st.text (),
+ })
+ return st.builds (lambda x: URL.build (**x), args)
+
+def urlsStr ():
+ return st.builds (lambda x: str (x), urls ())
+
+asciiText = st.text (st.characters (min_codepoint=32, max_codepoint=126))
+
+def chromeHeaders ():
+ # token as defined by https://tools.ietf.org/html/rfc7230#section-3.2.6
+ token = st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789!#$%&\'*+-.^_`|~')
+ # XXX: the value should be asciiText without leading/trailing spaces
+ return st.dictionaries (token, token)
+
+def fixedDicts (fixed, dynamic): # merge drawn extras into the fixed dict; presumably dynamic yields (key, value) pairs -- confirm with callers
+ return st.builds (lambda x, y: {**x, **dict (y)}, st.fixed_dictionaries (fixed), st.lists (dynamic))
+
+def chromeRequestWillBeSent (reqid, url):
+ methodSt = st.sampled_from (['GET', 'POST', 'PUT', 'DELETE'])
+ return st.fixed_dictionaries ({
+ 'requestId': reqid,
+ 'initiator': st.just ('Test'),
+ 'wallTime': timestamp,
+ 'timestamp': timestamp,
+ 'request': st.fixed_dictionaries ({
+ 'url': url,
+ 'method': methodSt,
+ 'headers': chromeHeaders (),
+ # XXX: postData, hasPostData
+ })
+ })
+
+def chromeResponseReceived (reqid, url):
+ mimeTypeSt = st.one_of (st.none (), st.just ('text/html'))
+ remoteIpAddressSt = st.one_of (st.none (), st.just ('127.0.0.1'))
+ protocolSt = st.one_of (st.none (), st.just ('h2'))
+ statusCodeSt = st.integers (min_value=100, max_value=999)
+ typeSt = st.sampled_from (['Document', 'Stylesheet', 'Image', 'Media',
+ 'Font', 'Script', 'TextTrack', 'XHR', 'Fetch', 'EventSource',
+ 'WebSocket', 'Manifest', 'SignedExchange', 'Ping',
+ 'CSPViolationReport', 'Other'])
+ return st.fixed_dictionaries ({
+ 'requestId': reqid,
+ 'timestamp': timestamp,
+ 'type': typeSt,
+ 'response': st.fixed_dictionaries ({
+ 'url': url,
+ 'requestHeaders': chromeHeaders (), # XXX: make this optional
+ 'headers': chromeHeaders (),
+ 'status': statusCodeSt,
+ 'statusText': asciiText,
+ 'mimeType': mimeTypeSt,
+ 'remoteIPAddress': remoteIpAddressSt,
+ 'protocol': protocolSt,
+ })
+ })
+
+def chromeReqResp ():
+ # XXX: will this generate the same url for all testcases? Keys must differ: shared strategies with equal keys draw one common value.
+ reqid = st.shared (st.text (), 'reqresp-id')
+ url = st.shared (urlsStr (), 'reqresp-url')
+ return st.tuples (chromeRequestWillBeSent (reqid, url),
+ chromeResponseReceived (reqid, url))
+
+def requestResponsePair ():
+ def f (creq, cresp, hasPostData, reqBody, respBody):
+ i = RequestResponsePair ()
+ i.fromRequestWillBeSent (creq)
+ i.request.hasPostData = hasPostData
+ if hasPostData:
+ i.request.body = reqBody
+
+ if cresp is not None:
+ i.fromResponseReceived (cresp)
+ if respBody is not None:
+ i.response.body = respBody
+ return i
+
+ bodySt = st.one_of (
+ st.none (),
+ st.builds (UnicodeBody, st.text ()),
+ st.builds (Base64Body.fromBytes, st.binary ())
+ )
+ return st.builds (lambda reqresp, hasPostData, reqBody, respBody:
+ f (reqresp[0], reqresp[1], hasPostData, reqBody, respBody),
+ chromeReqResp (), st.booleans (), bodySt, bodySt)
+
+@given(chromeReqResp ())
+def test_requestResponsePair (creqresp):
+ creq, cresp = creqresp
+
+ item = RequestResponsePair ()
+
+ assert item.id is None
+ assert item.url is None
+ assert item.request is None
+ assert item.response is None
+
+ item.fromRequestWillBeSent (creq)
+
+ assert item.id == creq['requestId']
+ url = URL (creq['request']['url'])
+ assert item.url == url
+ assert item.request is not None
+ assert item.request.timestamp == datetime.utcfromtimestamp (creq['wallTime'])
+ assert set (item.request.headers.keys ()) == set (creq['request']['headers'].keys ())
+ assert item.response is None
+
+ item.fromResponseReceived (cresp)
+
+ # url will not be overwritten
+ assert item.id == creq['requestId'] == cresp['requestId']
+ assert item.url == url
+ assert item.request is not None
+ assert set (item.request.headers.keys ()) == set (cresp['response']['requestHeaders'].keys ())
+ assert item.response is not None
+ assert set (item.response.headers.keys ()) == set (cresp['response']['headers'].keys ())
+ assert abs ((item.response.timestamp - item.request.timestamp).total_seconds () - \
+ (cresp['timestamp'] - creq['timestamp'])) < 10e-6 # abs: a negative deviation must fail as well
+
+@given(chromeReqResp ())
+def test_requestResponsePair_eq (creqresp):
+ creq, cresp = creqresp
+
+ item = RequestResponsePair ()
+ item2 = RequestResponsePair ()
+ assert item == item
+ assert item == item2
+
+ item.fromRequestWillBeSent (creq)
+ assert item != item2
+ item2.fromRequestWillBeSent (creq)
+ assert item == item
+ assert item == item2
+
+ item.fromResponseReceived (cresp)
+ assert item != item2
+ item2.fromResponseReceived (cresp)
+ assert item == item
+ assert item == item2
+
+ # XXX: test for inequality with different parameters
+
+### Google Chrome integration tests ###
+
+serverUrl = URL.build (scheme='http', host='localhost', port=8080)
+items = [
+ RequestResponsePair (
+ url=serverUrl.with_path ('/encoding/utf-8'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]),
+ body=UnicodeBody ('äöü'), mimeType='text/html')
+ ),
+ RequestResponsePair (
+ url=serverUrl.with_path ('/encoding/latin1'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=latin1')]),
+ body=UnicodeBody ('äöü'), mimeType='text/html')
+ ),
+ RequestResponsePair (
+ url=serverUrl.with_path ('/encoding/utf-16'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-16')]),
+ body=UnicodeBody ('äöü'), mimeType='text/html')
+ ),
+ RequestResponsePair (
+ url=serverUrl.with_path ('/encoding/ISO-8859-1'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=ISO-8859-1')]),
+ body=UnicodeBody ('äöü'), mimeType='text/html')
+ ),
+ RequestResponsePair (
+ url=serverUrl.with_path ('/status/200'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/plain')]),
+ body=b'',
+ mimeType='text/plain'),
+ ),
+ # redirects never have a response body
+ RequestResponsePair (
+ url=serverUrl.with_path ('/status/301'),
+ request=Request (method='GET'),
+ response=Response (status=301,
+ headers=CIMultiDict ([('Content-Type', 'text/plain'),
+ ('Location', str (serverUrl.with_path ('/status/301/redirected')))]),
+ body=None,
+ mimeType='text/plain'),
+ ),
+ RequestResponsePair (
+ url=serverUrl.with_path ('/image/png'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'image/png')]),
+ body=Base64Body.fromBytes (b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'),
+ mimeType='image/png'),
+ ),
+ RequestResponsePair (
+ url=serverUrl.with_path ('/script/alert'),
+ request=Request (method='GET'),
+ response=Response (status=200, headers=CIMultiDict ([('Content-Type', 'text/html; charset=utf-8')]),
+ body=UnicodeBody ('''<html><body><script>
+window.addEventListener("beforeunload", function (e) {
+ e.returnValue = "bye?";
+ return e.returnValue;
+});
+alert("stopping here");
+if (confirm("are you sure?") || prompt ("42?")) {
+ window.location = "/nonexistent";
+}
+</script></body></html>'''), mimeType='text/html')
+ ),
+ ]
+
+@pytest.mark.asyncio
+# would be nice if we could use hypothesis here somehow
+@pytest.mark.parametrize("golden", items)
+async def test_integration_item (loader, golden):
+ async def f (req):
+ body = golden.response.body
+ contentType = golden.response.headers.get ('content-type', '') if golden.response.headers is not None else ''
+ charsetOff = contentType.find ('charset=')
+ if isinstance (body, UnicodeBody) and charsetOff != -1:
+ encoding = contentType[charsetOff+len ('charset='):]
+ body = golden.response.body.decode ('utf-8').encode (encoding)
+ return web.Response (body=body, status=golden.response.status,
+ headers=golden.response.headers)
+
+ app = web.Application ()
+ app.router.add_route (golden.request.method, golden.url.path, f)
+ runner = web.AppRunner(app)
+ await runner.setup()
+ site = web.TCPSite(runner, serverUrl.host, serverUrl.port)
+ await site.start()
+
+ try:
+ await loader.navigate (golden.url)
+
+ it = loader.__aiter__ ()
+ item = await it.__anext__ ()
+
+ # we do not know this in advance
+ item.request.initiator = None
+ item.request.headers = None
+ item.remoteIpAddress = None
+ item.protocol = None
+ item.resourceType = None
+
+ if item.response:
+ assert item.response.statusText is not None
+ item.response.statusText = None
+
+ del item.response.headers['server']
+ del item.response.headers['content-length']
+ del item.response.headers['date']
+ assert item == golden
+ finally:
+ await runner.cleanup ()
+
diff --git a/crocoite/test_warc.py b/crocoite/test_warc.py
index 7f2635b..954e8c8 100644
--- a/crocoite/test_warc.py
+++ b/crocoite/test_warc.py
@@ -24,6 +24,7 @@ from operator import itemgetter
from warcio.archiveiterator import ArchiveIterator
from yarl import URL
+from multidict import CIMultiDict
from hypothesis import given, reproduce_failure
import hypothesis.strategies as st
import pytest
@@ -32,7 +33,8 @@ from .warc import WarcHandler
from .logger import Logger, WarcHandlerConsumer
from .controller import ControllerStart
from .behavior import Script, ScreenshotEvent, DomSnapshotEvent
-from .browser import Item
+from .browser import RequestResponsePair, Base64Body, UnicodeBody
+from .test_browser import requestResponsePair, urls
def test_log ():
logger = Logger ()
@@ -66,50 +68,6 @@ def test_log ():
data = json.loads (l.strip ())
assert data == golden.pop (0)
-def hostname ():
- # XXX: find a better way to generate hostnames
- return st.text (alphabet=st.sampled_from('abcdefghijklmnopqrstuvwxyz0123456789-'), min_size=1, max_size=253)
-
-def urls ():
- """ Build http/https URL """
- scheme = st.one_of (st.just ('http'), st.just ('https'))
- # Path must start with a slash
- pathSt = st.builds (lambda x: '/' + x, st.text ())
- args = st.fixed_dictionaries ({
- 'scheme': scheme,
- 'host': hostname (),
- 'port': st.one_of (st.none (), st.integers (min_value=1, max_value=2**16-1)),
- 'path': pathSt,
- 'query_string': st.text (),
- 'fragment': st.text (),
- })
- return st.builds (lambda x: URL.build (**x), args)
-
-def item ():
- def f (url, requestBody, body, mimeType):
- i = Item ()
- # XXX: we really need some level of abstraction. Testing is a nightmare.
- i.setRequest ({'requestId': 'myid', 'initiator': 'Test', 'wallTime': 0, 'timestamp': 1, 'request': {'url': str (url), 'method': 'GET', 'headers': {'None': 'None'}}})
- i.setResponse ({'requestId': 'myid', 'timestamp': 2, 'type': 'Document', 'response': {'url': str (url), 'requestHeaders': {'foo': 'bar', 'Set-Cookie': 'line1\nline2'}, 'headers': {'Response': 'Headers', 'Content-Length': '12345'}, 'status': 200}})
- if mimeType is not None:
- i.chromeResponse['response']['mimeType'] = 'text/html'
- i.requestBody = requestBody
- i.body = body
- return i
-
- def failedItem (url):
- i = Item ()
- i.setRequest ({'requestId': 'myid', 'initiator': 'Test', 'wallTime': 0, 'timestamp': 1, 'request': {'url': str (url), 'method': 'GET', 'headers': {'None': 'None'}}})
- i.failed = True
- return i
-
- bodySt = st.one_of (st.none (), st.tuples (st.one_of (st.none (), st.binary ()), st.booleans ()))
- mimeTypeSt = st.one_of (st.none (), st.just ('text/html'))
- return st.one_of (
- st.builds (failedItem, urls ()),
- st.builds (f, urls (), bodySt, bodySt, mimeTypeSt),
- )
-
def jsonObject ():
""" JSON-encodable objects """
return st.dictionaries (st.text (), st.one_of (st.integers (), st.text ()))
@@ -123,7 +81,7 @@ def event ():
st.builds (Script.fromStr, st.text (), st.one_of(st.none (), st.text ())),
st.builds (ScreenshotEvent, urls (), st.integers (), st.binary ()),
st.builds (DomSnapshotEvent, urls (), st.builds (lambda x: x.encode ('utf-8'), st.text ()), viewport()),
- item (),
+ requestResponsePair (),
)
@given (st.lists (event ()))
@@ -136,7 +94,7 @@ def test_push (golden):
# null logger
logger = Logger ()
- with NamedTemporaryFile() as fd:
+ with open('/tmp/test.warc.gz', 'w+b') as fd:
with WarcHandler (fd, logger) as handler:
for g in golden:
handler.push (g)
@@ -191,10 +149,7 @@ def test_push (golden):
assert headers['X-DOM-Snapshot'] == 'True'
assert rec.raw_stream.read () == g.document
- elif isinstance (g, Item):
- if g.failed:
- continue
-
+ elif isinstance (g, RequestResponsePair):
rec = next (it)
# request
@@ -204,54 +159,56 @@ def test_push (golden):
assert URL (headers['warc-target-uri']) == g.url
assert headers['x-chrome-request-id'] == g.id
- assert sorted (rec.http_headers.headers, key=itemgetter (0)) == sorted (g.requestHeaders, key=itemgetter (0))
- if g.requestBody:
- if g.requestBody[0] is None:
- assert not rec.raw_stream.read ()
+ assert CIMultiDict (rec.http_headers.headers) == g.request.headers
+ if g.request.hasPostData:
+ if g.request.body is not None:
+ assert rec.raw_stream.read () == g.request.body
+ assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.request.body, Base64Body)), (headers['x-chrome-base64body'], g.request.body)
else:
- assert rec.raw_stream.read () == g.requestBody[0], g.requestBody
- assert str (headers['x-chrome-base64body'] or False) == str (g.requestBody[1]), (headers['x-chrome-base64body'], g.requestBody)
+ # body fetch failed
+ assert headers['warc-truncated'] == 'unspecified'
+ assert not rec.raw_stream.read ()
else:
- # body fetch failed
- assert headers['warc-truncated'] == 'unspecified'
+ assert not rec.raw_stream.read ()
# response
- rec = next (it)
- headers = rec.rec_headers
- httpheaders = rec.http_headers
- assert headers['warc-type'] == 'response'
- checkWarcinfoId (headers)
- assert URL (headers['warc-target-uri']) == g.url
- assert headers['x-chrome-request-id'] == g.id
-
- # these are checked separately
- blacklistedHeaders = {'content-type', 'content-length'}
- sortedHeaders = lambda l: sorted (filter (lambda x: x[0].lower() not in blacklistedHeaders, l), key=itemgetter (0))
- assert sortedHeaders (httpheaders.headers) == sortedHeaders (g.responseHeaders)
-
- expectedContentType = g.response.get ('mimeType')
- if expectedContentType is not None:
- assert httpheaders['content-type'].startswith (expectedContentType)
-
- if g.body:
- if g.body[0] is None:
- assert not rec.raw_stream.read ()
- #assert httpheaders['content-length'] == '0'
+ if g.response:
+ rec = next (it)
+ headers = rec.rec_headers
+ httpheaders = rec.http_headers
+ assert headers['warc-type'] == 'response'
+ checkWarcinfoId (headers)
+ assert URL (headers['warc-target-uri']) == g.url
+ assert headers['x-chrome-request-id'] == g.id
+
+ # these are checked separately
+ filteredHeaders = CIMultiDict (httpheaders.headers)
+ for b in {'content-type', 'content-length'}:
+ if b in g.response.headers:
+ g.response.headers.popall (b)
+ if b in filteredHeaders:
+ filteredHeaders.popall (b)
+ assert filteredHeaders == g.response.headers
+
+ expectedContentType = g.response.mimeType
+ if expectedContentType is not None:
+ assert httpheaders['content-type'].startswith (expectedContentType)
+
+ if g.response.body is not None:
+ assert rec.raw_stream.read () == g.response.body
+ assert str (headers['x-chrome-base64body'] or False) == str (isinstance (g.response.body, Base64Body))
+ assert httpheaders['content-length'] == str (len (g.response.body))
+ # body is never truncated if it exists
+ assert headers['warc-truncated'] is None
+
+ # unencoded strings are converted to utf8
+ if isinstance (g.response.body, UnicodeBody) and httpheaders['content-type'] is not None:
+ assert httpheaders['content-type'].endswith ('; charset=utf-8')
else:
- assert rec.raw_stream.read () == g.body[0]
- assert str (headers['x-chrome-base64body'] or False) == str (g.body[1])
- assert httpheaders['content-length'] == str (len (g.body[0]))
-
- # body is never truncated if it exists
- assert headers['warc-truncated'] is None
-
- # unencoded strings are converted to utf8
- if not g.body[1] and httpheaders['content-type'] is not None:
- assert httpheaders['content-type'].endswith ('; charset=utf-8')
- else:
- # body fetch failed
- assert headers['warc-truncated'] == 'unspecified'
- # content-length header should be kept intact
+ # body fetch failed
+ assert headers['warc-truncated'] == 'unspecified'
+ assert not rec.raw_stream.read ()
+ # content-length header should be kept intact
else:
assert False, f"invalid golden type {type(g)}" # pragma: no cover
diff --git a/crocoite/warc.py b/crocoite/warc.py
index dbd9ebc..cb1f2f7 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -25,6 +25,7 @@ Classes writing data to WARC files
import json, threading
from io import BytesIO
from datetime import datetime
+from http.server import BaseHTTPRequestHandler
from warcio.timeutils import datetime_to_iso_date
from warcio.warcwriter import WARCWriter
@@ -33,7 +34,7 @@ from warcio.statusandheaders import StatusAndHeaders
from .util import packageUrl, StrJsonEncoder
from .controller import EventHandler, ControllerStart
from .behavior import Script, DomSnapshotEvent, ScreenshotEvent
-from .browser import Item
+from .browser import RequestResponsePair, UnicodeBody, Base64Body
class WarcHandler (EventHandler):
__slots__ = ('logger', 'writer', 'documentRecords', 'log',
@@ -86,66 +87,51 @@ class WarcHandler (EventHandler):
url = item.url
path = url.relative().with_fragment(None)
- httpHeaders = StatusAndHeaders(f'{req["method"]} {path} HTTP/1.1',
- item.requestHeaders, protocol='HTTP/1.1', is_http_request=True)
- initiator = item.initiator
+ httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1',
+ req.headers, protocol='HTTP/1.1', is_http_request=True)
warcHeaders = {
- 'X-Chrome-Initiator': json.dumps (initiator),
+ 'X-Chrome-Initiator': json.dumps (req.initiator),
'X-Chrome-Request-ID': item.id,
- 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])),
+ 'WARC-Date': datetime_to_iso_date (req.timestamp),
}
- if item.requestBody is not None:
- payload, payloadBase64Encoded = item.requestBody
- else:
+ body = item.request.body
+ if item.request.hasPostData and body is None:
# oops, don’t know what went wrong here
- logger.error ('requestBody missing', uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
+ logger.error ('requestBody missing',
+ uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
warcHeaders['WARC-Truncated'] = 'unspecified'
- payload = None
-
- if payload is not None:
- payload = BytesIO (payload)
- warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded)
+ else:
+ warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body)
+ body = BytesIO (body)
record = self.writeRecord (url, 'request',
- payload=payload, http_headers=httpHeaders,
+ payload=body, http_headers=httpHeaders,
warc_headers_dict=warcHeaders)
return record.rec_headers['WARC-Record-ID']
def _writeResponse (self, item, concurrentTo):
# fetch the body
reqId = item.id
- rawBody = None
- base64Encoded = False
- bodyTruncated = None
- if item.isRedirect or item.body is None:
- # redirects reuse the same request, thus we cannot safely retrieve
- # the body (i.e getResponseBody may return the new location’s
- # body). No body available means we failed to retrieve it.
- bodyTruncated = 'unspecified'
- else:
- rawBody, base64Encoded = item.body
# now the response
resp = item.response
warcHeaders = {
'WARC-Concurrent-To': concurrentTo,
- 'WARC-IP-Address': resp.get ('remoteIPAddress', ''),
- 'X-Chrome-Protocol': resp.get ('protocol', ''),
- 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
- 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
'X-Chrome-Request-ID': item.id,
- 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (
- item.chromeRequest['wallTime']+
- (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))),
+ 'WARC-Date': datetime_to_iso_date (resp.timestamp),
}
- if bodyTruncated:
- warcHeaders['WARC-Truncated'] = bodyTruncated
- else:
- warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded)
-
- httpHeaders = StatusAndHeaders(f'{resp["status"]} {item.statusText}',
- item.responseHeaders,
- protocol='HTTP/1.1')
+ # conditional WARC headers
+ if item.remoteIpAddress:
+ warcHeaders['WARC-IP-Address'] = item.remoteIpAddress
+ if item.protocol:
+ warcHeaders['X-Chrome-Protocol'] = item.protocol
+
+ # HTTP headers
+ statusText = resp.statusText or \
+ BaseHTTPRequestHandler.responses.get (
+ resp.status, ('No status text available', ))[0]
+ httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}',
+ resp.headers, protocol='HTTP/1.1')
# Content is saved decompressed and decoded, remove these headers
blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
@@ -155,20 +141,23 @@ class WarcHandler (EventHandler):
# chrome sends nothing but utf8 encoded text. Fortunately HTTP
# headers take precedence over the document’s <meta>, thus we can
# easily override those.
- contentType = resp.get ('mimeType')
+ contentType = resp.mimeType
if contentType:
- if not base64Encoded:
+ if isinstance (resp.body, UnicodeBody):
contentType += '; charset=utf-8'
httpHeaders.replace_header ('Content-Type', contentType)
- if rawBody is not None:
- httpHeaders.replace_header ('Content-Length', str (len (rawBody)))
- bodyIo = BytesIO (rawBody)
+ # response body
+ body = resp.body
+ if body is None:
+ warcHeaders['WARC-Truncated'] = 'unspecified'
else:
- bodyIo = BytesIO ()
+ httpHeaders.replace_header ('Content-Length', str (len (body)))
+ warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body)
+ body = BytesIO (body)
record = self.writeRecord (item.url, 'response',
- warc_headers_dict=warcHeaders, payload=bodyIo,
+ warc_headers_dict=warcHeaders, payload=body,
http_headers=httpHeaders)
if item.resourceType == 'Document':
@@ -184,12 +173,11 @@ class WarcHandler (EventHandler):
f'application/javascript; charset={encoding}'})
def _writeItem (self, item):
- if item.failed:
- # should have been handled by the logger already
- return
-
+ assert item.request
concurrentTo = self._writeRequest (item)
- self._writeResponse (item, concurrentTo)
+ # items that failed loading don’t have a response
+ if item.response:
+ self._writeResponse (item, concurrentTo)
def _addRefersTo (self, headers, url):
refersTo = self.documentRecords.get (url)
@@ -247,7 +235,7 @@ class WarcHandler (EventHandler):
self._flushLogEntries ()
route = {Script: _writeScript,
- Item: _writeItem,
+ RequestResponsePair: _writeItem,
DomSnapshotEvent: _writeDomSnapshot,
ScreenshotEvent: _writeScreenshot,
ControllerStart: _writeControllerStart,
diff --git a/setup.py b/setup.py
index 982fa92..7cf6b32 100644
--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,7 @@ setup(
'aiohttp',
'PyYAML',
'yarl',
+ 'multidict',
],
entry_points={
'console_scripts': [