summaryrefslogtreecommitdiff
path: root/crocoite/browser.py
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2019-01-03 19:34:17 +0100
committerLars-Dominik Braun <lars@6xq.net>2019-01-03 19:37:27 +0100
commit9d7974e3e7e8a4575ea61cb33a30fa291d12ae38 (patch)
tree5311396c0d74eaa35e1eff1e1641c0bd157cde25 /crocoite/browser.py
parentad4e119bf1a55c84dc7c6260588ed7db9e7199c6 (diff)
downloadcrocoite-9d7974e3e7e8a4575ea61cb33a30fa291d12ae38.tar.gz
crocoite-9d7974e3e7e8a4575ea61cb33a30fa291d12ae38.tar.bz2
crocoite-9d7974e3e7e8a4575ea61cb33a30fa291d12ae38.zip
browser: Turn Item into RequestResponsePair
Previously Item was just a simple wrapper around Chrome’s Network.* events. This turned out to be quite nasty when testing, so its replacement, RequestResponsePair, does some level of abstraction. This makes testing alot easier, since we now can simply instantiate it without building a proper DevTools event. Should come without any functional changes.
Diffstat (limited to 'crocoite/browser.py')
-rw-r--r--crocoite/browser.py326
1 files changed, 215 insertions, 111 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 3de61f0..50561ed 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -23,80 +23,197 @@ Chrome browser interactions.
"""
import asyncio
-from base64 import b64decode
+from base64 import b64decode, b64encode
+from datetime import datetime, timedelta
from http.server import BaseHTTPRequestHandler
+
from yarl import URL
+from multidict import CIMultiDict
from .logger import Level
from .devtools import Browser, TabException
-class Item:
- """
- Simple wrapper containing Chrome request and response
- """
+# These two classes’ only purpose is so we can later tell whether a body was
+# base64-encoded or a unicode string
+class Base64Body (bytes):
+ def __new__ (cls, value):
+ return bytes.__new__ (cls, b64decode (value))
+
+ @classmethod
+ def fromBytes (cls, b):
+ """ For testing """
+ return cls (b64encode (b))
+
+class UnicodeBody (bytes):
+ def __new__ (cls, value):
+ if type (value) is not str:
+ raise TypeError ('expecting unicode string')
+
+ return bytes.__new__ (cls, value.encode ('utf-8'))
- __slots__ = ('chromeRequest', 'chromeResponse', 'chromeFinished',
- 'isRedirect', 'failed', 'body', 'requestBody')
+class Request:
+ __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp')
- def __init__ (self):
- self.chromeRequest = {}
- self.chromeResponse = {}
- self.chromeFinished = {}
- self.isRedirect = False
- self.failed = False
- self.body = None
- self.requestBody = None
+ def __init__ (self, method=None, headers=None, body=None):
+ self.headers = headers
+ self.body = body
+ self.hasPostData = False
+ self.initiator = None
+ # HTTP method
+ self.method = method
+ self.timestamp = None
def __repr__ (self):
- return f'<Item {self.url}>'
-
- @property
- def request (self):
- return self.chromeRequest.get ('request', {})
-
- @property
- def response (self):
- return self.chromeResponse.get ('response', {})
-
- @property
- def initiator (self):
- return self.chromeRequest['initiator']
-
- @property
- def id (self):
- return self.chromeRequest['requestId']
-
- @property
- def encodedDataLength (self):
- return self.chromeFinished['encodedDataLength']
-
- @property
- def url (self):
- return URL (self.response.get ('url', self.request.get ('url')))
-
- @property
- def requestHeaders (self):
- # the response object may contain refined headers, which were
- # *actually* sent over the wire
- return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers']))
-
- @property
- def responseHeaders (self):
- return self._unfoldHeaders (self.response['headers'])
-
- @property
- def statusText (self):
- text = self.response.get ('statusText')
- if text:
- return text
- text = BaseHTTPRequestHandler.responses.get (self.response['status'])
- if text:
- return text[0]
- return 'No status text available'
-
- @property
- def resourceType (self):
- return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None))
+ return f'Request({self.method!r}, {self.headers!r}, {self.body!r})'
+
+ def __eq__ (self, b):
+ if b is None:
+ return False
+
+ if not isinstance (b, Request):
+ raise TypeError ('Can only compare equality with Request.')
+
+ # do not compare hasPostData (only required to fetch body) and
+ # timestamp (depends on time)
+ return self.headers == b.headers and \
+ self.body == b.body and \
+ self.initiator == b.initiator and \
+ self.method == b.method
+
+class Response:
+ __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived',
+ 'timestamp', 'mimeType')
+
+ def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None):
+ self.status = status
+ self.statusText = statusText
+ self.headers = headers
+ self.body = body
+ # bytes received over the network (not body size!)
+ self.bytesReceived = 0
+ self.timestamp = None
+ self.mimeType = mimeType
+
+ def __repr__ (self):
+ return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})'
+
+ def __eq__ (self, b):
+ if b is None:
+ return False
+
+ if not isinstance (b, Response):
+ raise TypeError ('Can only compare equality with Response.')
+
+ # do not compare bytesReceived (depends on network), timestamp
+ # (depends on time) and statusText (does not matter)
+ return self.status == b.status and \
+ self.statusText == b.statusText and \
+ self.headers == b.headers and \
+ self.body == b.body and \
+ self.mimeType == b.mimeType
+
+class ReferenceTimestamp:
+ """ Map relative timestamp to absolute timestamp """
+
+ def __init__ (self, relative, absolute):
+ self.relative = timedelta (seconds=relative)
+ self.absolute = datetime.utcfromtimestamp (absolute)
+
+ def __call__ (self, relative):
+ if not isinstance (relative, timedelta):
+ relative = timedelta (seconds=relative)
+ return self.absolute + (relative-self.relative)
+
+class RequestResponsePair:
+ __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress',
+ 'protocol', 'resourceType', '_time')
+
+ def __init__ (self, id=None, url=None, request=None, response=None):
+ self.request = request
+ self.response = response
+ self.id = id
+ self.url = url
+ self.remoteIpAddress = None
+ self.protocol = None
+ self.resourceType = None
+ self._time = None
+
+ def __repr__ (self):
+ return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})'
+
+ def __eq__ (self, b):
+ if not isinstance (b, RequestResponsePair):
+ raise TypeError (f'Can only compare with {self.__class__.__name__}')
+
+ # do not compare id and _time. These depend on external factors and do
+ # not influence the request/response *content*
+ return self.request == b.request and \
+ self.response == b.response and \
+ self.url == b.url and \
+ self.remoteIpAddress == b.remoteIpAddress and \
+ self.protocol == b.protocol and \
+ self.resourceType == b.resourceType
+
+ def fromRequestWillBeSent (self, req):
+ """ Set request data from Chrome Network.requestWillBeSent event """
+ r = req['request']
+
+ self.id = req['requestId']
+ self.url = URL (r['url'])
+ self.resourceType = req.get ('type')
+ self._time = ReferenceTimestamp (req['timestamp'], req['wallTime'])
+
+ assert self.request is None, req
+ self.request = Request ()
+ self.request.initiator = req['initiator']
+ self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
+ self.request.hasPostData = r.get ('hasPostData', False)
+ self.request.method = r['method']
+ self.request.timestamp = self._time (req['timestamp'])
+ if self.request.hasPostData:
+ postData = r.get ('postData')
+ if postData is not None:
+ self.request.body = UnicodeBody (postData)
+
+ def fromResponse (self, r, timestamp=None, resourceType=None):
+ """
+ Set response data from Chrome’s Response object.
+
+ Request must exist. Updates if response was set before. Sometimes
+ fromResponseReceived is triggered twice by Chrome. No idea why.
+ """
+ assert self.request is not None, (self.request, r)
+
+ if not timestamp:
+ timestamp = self.request.timestamp
+
+ self.remoteIpAddress = r.get ('remoteIPAddress')
+ self.protocol = r.get ('protocol')
+ if resourceType:
+ self.resourceType = resourceType
+
+ # a response may contain updated request headers (i.e. those actually
+ # sent over the wire)
+ if 'requestHeaders' in r:
+ self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders']))
+
+ self.response = Response ()
+ self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
+ self.response.status = r['status']
+ self.response.statusText = r['statusText']
+ self.response.timestamp = timestamp
+ self.response.mimeType = r['mimeType']
+
+ def fromResponseReceived (self, resp):
+ """ Set response data from Chrome Network.responseReceived """
+ return self.fromResponse (resp['response'],
+ self._time (resp['timestamp']), resp['type'])
+
+ def fromLoadingFinished (self, data):
+ self.response.bytesReceived = data['encodedDataLength']
+
+ def fromLoadingFailed (self, data):
+ self.response = None
@staticmethod
def _unfoldHeaders (headers):
@@ -110,44 +227,26 @@ class Item:
items.append ((k, v))
return items
- def setRequest (self, req):
- self.chromeRequest = req
-
- def setResponse (self, resp):
- self.chromeResponse = resp
-
- def setFinished (self, finished):
- self.chromeFinished = finished
-
async def prefetchRequestBody (self, tab):
- # request body
- req = self.request
- postData = req.get ('postData')
- if postData:
- self.requestBody = postData.encode ('utf8'), False
- elif req.get ('hasPostData', False):
+ if self.request.hasPostData and self.request.body is None:
try:
postData = await tab.Network.getRequestPostData (requestId=self.id)
- postData = postData['postData']
- self.requestBody = b64decode (postData), True
+ self.request.body = UnicodeBody (postData['postData'])
except TabException:
- self.requestBody = None
+ self.request.body = None
else:
- self.requestBody = None, False
+ self.request.body = None
async def prefetchResponseBody (self, tab):
- # get response body
+ """ Fetch response body """
try:
body = await tab.Network.getResponseBody (requestId=self.id)
- rawBody = body['body']
- base64Encoded = body['base64Encoded']
- if base64Encoded:
- rawBody = b64decode (rawBody)
+ if body['base64Encoded']:
+ self.response.body = Base64Body (body['body'])
else:
- rawBody = rawBody.encode ('utf8')
- self.body = rawBody, base64Encoded
+ self.response.body = UnicodeBody (body['body'])
except TabException:
- self.body = None
+ self.response.body = None
class VarChangeEvent:
""" Notify when variable is changed """
@@ -179,14 +278,14 @@ class SiteLoader:
XXX: track popup windows/new tabs and close them
"""
- __slots__ = ('requests', 'browser', 'url', 'logger', 'tab', '_iterRunning', 'idle', '_framesLoading')
+ __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning',
+ 'idle', '_framesLoading')
allowedSchemes = {'http', 'https'}
- def __init__ (self, browser, url, logger):
+ def __init__ (self, browser, logger):
self.requests = {}
self.browser = Browser (url=browser)
- self.url = url
- self.logger = logger.bind (context=type (self).__name__, url=url)
+ self.logger = logger.bind (context=type (self).__name__)
self._iterRunning = []
self.idle = VarChangeEvent (True)
@@ -250,7 +349,7 @@ class SiteLoader:
result = t.result ()
if result is None:
pass
- elif isinstance (result, Item):
+ elif isinstance (result, RequestResponsePair):
yield result
else:
method, data = result
@@ -263,8 +362,8 @@ class SiteLoader:
running = pending
self._iterRunning = running
- async def start (self):
- await self.tab.Page.navigate(url=self.url)
+ async def navigate (self, url):
+ await self.tab.Page.navigate(url=url)
# internal chrome callbacks
async def _requestWillBeSent (self, **kwargs):
@@ -282,21 +381,24 @@ class SiteLoader:
# redirects never “finish” loading, but yield another requestWillBeSent with this key set
redirectResp = kwargs.get ('redirectResponse')
if redirectResp:
- # create fake responses
- resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']}
- item.setResponse (resp)
- resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']}
- item.setFinished (resp)
- item.isRedirect = True
+ if item.url != url:
+ # this happens for unknown reasons. the docs simply state
+ # it can differ in case of a redirect. Fix it and move on.
+ logger.warning ('redirect url differs',
+ uuid='558a7df7-2258-4fe4-b16d-22b6019cc163',
+ expected=item.url)
+ redirectResp['url'] = str (item.url)
+ item.fromResponse (redirectResp)
logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url)
+ # XXX: queue this? no need to wait for it
await item.prefetchRequestBody (self.tab)
- # cannot fetch request body due to race condition (item id reused)
+ # cannot fetch response body due to race condition (item id reused)
ret = item
else:
logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b')
- item = Item ()
- item.setRequest (kwargs)
+ item = RequestResponsePair ()
+ item.fromRequestWillBeSent (kwargs)
self.requests[reqId] = item
logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f')
@@ -315,7 +417,7 @@ class SiteLoader:
logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url)
if url.scheme in self.allowedSchemes:
logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0')
- item.setResponse (kwargs)
+ item.fromResponseReceived (kwargs)
else:
logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666')
@@ -333,19 +435,21 @@ class SiteLoader:
logger = self.logger.bind (reqId=reqId, reqUrl=item.url)
if item.url.scheme in self.allowedSchemes:
logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02')
- item.setFinished (kwargs)
+ item.fromLoadingFinished (kwargs)
+ # XXX queue both
await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab))
return item
async def _loadingFailed (self, **kwargs):
reqId = kwargs['requestId']
- self.logger.warning ('loading failed',
+ logger = self.logger.bind (reqId=reqId)
+ logger.warning ('loading failed',
uuid='68410f13-6eea-453e-924e-c1af4601748b',
errorText=kwargs['errorText'],
blockedReason=kwargs.get ('blockedReason'))
item = self.requests.pop (reqId, None)
if item is not None:
- item.failed = True
+ item.fromLoadingFailed (kwargs)
return item
async def _entryAdded (self, **kwargs):