summaryrefslogtreecommitdiff
path: root/crocoite/browser.py
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite/browser.py')
-rw-r--r--crocoite/browser.py326
1 files changed, 215 insertions, 111 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py
index 3de61f0..50561ed 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -23,80 +23,197 @@ Chrome browser interactions.
"""
import asyncio
-from base64 import b64decode
+from base64 import b64decode, b64encode
+from datetime import datetime, timedelta
from http.server import BaseHTTPRequestHandler
+
from yarl import URL
+from multidict import CIMultiDict
from .logger import Level
from .devtools import Browser, TabException
-class Item:
- """
- Simple wrapper containing Chrome request and response
- """
+# These two classes’ only purpose is so we can later tell whether a body was
+# base64-encoded or a unicode string
+class Base64Body (bytes):
+ def __new__ (cls, value):
+ return bytes.__new__ (cls, b64decode (value))
+
+ @classmethod
+ def fromBytes (cls, b):
+ """ For testing """
+ return cls (b64encode (b))
+
+class UnicodeBody (bytes):
+ def __new__ (cls, value):
+ if type (value) is not str:
+ raise TypeError ('expecting unicode string')
+
+ return bytes.__new__ (cls, value.encode ('utf-8'))
- __slots__ = ('chromeRequest', 'chromeResponse', 'chromeFinished',
- 'isRedirect', 'failed', 'body', 'requestBody')
+class Request:
+ __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp')
- def __init__ (self):
- self.chromeRequest = {}
- self.chromeResponse = {}
- self.chromeFinished = {}
- self.isRedirect = False
- self.failed = False
- self.body = None
- self.requestBody = None
+ def __init__ (self, method=None, headers=None, body=None):
+ self.headers = headers
+ self.body = body
+ self.hasPostData = False
+ self.initiator = None
+ # HTTP method
+ self.method = method
+ self.timestamp = None
def __repr__ (self):
- return f'<Item {self.url}>'
-
- @property
- def request (self):
- return self.chromeRequest.get ('request', {})
-
- @property
- def response (self):
- return self.chromeResponse.get ('response', {})
-
- @property
- def initiator (self):
- return self.chromeRequest['initiator']
-
- @property
- def id (self):
- return self.chromeRequest['requestId']
-
- @property
- def encodedDataLength (self):
- return self.chromeFinished['encodedDataLength']
-
- @property
- def url (self):
- return URL (self.response.get ('url', self.request.get ('url')))
-
- @property
- def requestHeaders (self):
- # the response object may contain refined headers, which were
- # *actually* sent over the wire
- return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers']))
-
- @property
- def responseHeaders (self):
- return self._unfoldHeaders (self.response['headers'])
-
- @property
- def statusText (self):
- text = self.response.get ('statusText')
- if text:
- return text
- text = BaseHTTPRequestHandler.responses.get (self.response['status'])
- if text:
- return text[0]
- return 'No status text available'
-
- @property
- def resourceType (self):
- return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None))
+ return f'Request({self.method!r}, {self.headers!r}, {self.body!r})'
+
+ def __eq__ (self, b):
+ if b is None:
+ return False
+
+ if not isinstance (b, Request):
+ raise TypeError ('Can only compare equality with Request.')
+
+ # do not compare hasPostData (only required to fetch body) and
+ # timestamp (depends on time)
+ return self.headers == b.headers and \
+ self.body == b.body and \
+ self.initiator == b.initiator and \
+ self.method == b.method
+
+class Response:
+ __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived',
+ 'timestamp', 'mimeType')
+
+ def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None):
+ self.status = status
+ self.statusText = statusText
+ self.headers = headers
+ self.body = body
+ # bytes received over the network (not body size!)
+ self.bytesReceived = 0
+ self.timestamp = None
+ self.mimeType = mimeType
+
+ def __repr__ (self):
+ return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})'
+
+ def __eq__ (self, b):
+ if b is None:
+ return False
+
+ if not isinstance (b, Response):
+ raise TypeError ('Can only compare equality with Response.')
+
+ # do not compare bytesReceived (depends on network), timestamp
+ # (depends on time) and statusText (does not matter)
+ return self.status == b.status and \
+ self.statusText == b.statusText and \
+ self.headers == b.headers and \
+ self.body == b.body and \
+ self.mimeType == b.mimeType
+
+class ReferenceTimestamp:
+ """ Map relative timestamp to absolute timestamp """
+
+ def __init__ (self, relative, absolute):
+ self.relative = timedelta (seconds=relative)
+ self.absolute = datetime.utcfromtimestamp (absolute)
+
+ def __call__ (self, relative):
+ if not isinstance (relative, timedelta):
+ relative = timedelta (seconds=relative)
+ return self.absolute + (relative-self.relative)
+
+class RequestResponsePair:
+ __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress',
+ 'protocol', 'resourceType', '_time')
+
+ def __init__ (self, id=None, url=None, request=None, response=None):
+ self.request = request
+ self.response = response
+ self.id = id
+ self.url = url
+ self.remoteIpAddress = None
+ self.protocol = None
+ self.resourceType = None
+ self._time = None
+
+ def __repr__ (self):
+ return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})'
+
+ def __eq__ (self, b):
+ if not isinstance (b, RequestResponsePair):
+ raise TypeError (f'Can only compare with {self.__class__.__name__}')
+
+ # do not compare id and _time. These depend on external factors and do
+ # not influence the request/response *content*
+ return self.request == b.request and \
+ self.response == b.response and \
+ self.url == b.url and \
+ self.remoteIpAddress == b.remoteIpAddress and \
+ self.protocol == b.protocol and \
+ self.resourceType == b.resourceType
+
+ def fromRequestWillBeSent (self, req):
+ """ Set request data from Chrome Network.requestWillBeSent event """
+ r = req['request']
+
+ self.id = req['requestId']
+ self.url = URL (r['url'])
+ self.resourceType = req.get ('type')
+ self._time = ReferenceTimestamp (req['timestamp'], req['wallTime'])
+
+ assert self.request is None, req
+ self.request = Request ()
+ self.request.initiator = req['initiator']
+ self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
+ self.request.hasPostData = r.get ('hasPostData', False)
+ self.request.method = r['method']
+ self.request.timestamp = self._time (req['timestamp'])
+ if self.request.hasPostData:
+ postData = r.get ('postData')
+ if postData is not None:
+ self.request.body = UnicodeBody (postData)
+
+ def fromResponse (self, r, timestamp=None, resourceType=None):
+ """
+ Set response data from Chrome’s Response object.
+
+ Request must exist. Updates if response was set before. Sometimes
+ fromResponseReceived is triggered twice by Chrome. No idea why.
+ """
+ assert self.request is not None, (self.request, r)
+
+ if not timestamp:
+ timestamp = self.request.timestamp
+
+ self.remoteIpAddress = r.get ('remoteIPAddress')
+ self.protocol = r.get ('protocol')
+ if resourceType:
+ self.resourceType = resourceType
+
+ # a response may contain updated request headers (i.e. those actually
+ # sent over the wire)
+ if 'requestHeaders' in r:
+ self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders']))
+
+ self.response = Response ()
+ self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
+ self.response.status = r['status']
+ self.response.statusText = r['statusText']
+ self.response.timestamp = timestamp
+ self.response.mimeType = r['mimeType']
+
+ def fromResponseReceived (self, resp):
+ """ Set response data from Chrome Network.responseReceived """
+ return self.fromResponse (resp['response'],
+ self._time (resp['timestamp']), resp['type'])
+
+ def fromLoadingFinished (self, data):
+ self.response.bytesReceived = data['encodedDataLength']
+
+ def fromLoadingFailed (self, data):
+ self.response = None
@staticmethod
def _unfoldHeaders (headers):
@@ -110,44 +227,26 @@ class Item:
items.append ((k, v))
return items
- def setRequest (self, req):
- self.chromeRequest = req
-
- def setResponse (self, resp):
- self.chromeResponse = resp
-
- def setFinished (self, finished):
- self.chromeFinished = finished
-
async def prefetchRequestBody (self, tab):
- # request body
- req = self.request
- postData = req.get ('postData')
- if postData:
- self.requestBody = postData.encode ('utf8'), False
- elif req.get ('hasPostData', False):
+ if self.request.hasPostData and self.request.body is None:
try:
postData = await tab.Network.getRequestPostData (requestId=self.id)
- postData = postData['postData']
- self.requestBody = b64decode (postData), True
+ self.request.body = UnicodeBody (postData['postData'])
except TabException:
- self.requestBody = None
+ self.request.body = None
else:
- self.requestBody = None, False
+ self.request.body = None
async def prefetchResponseBody (self, tab):
- # get response body
+ """ Fetch response body """
try:
body = await tab.Network.getResponseBody (requestId=self.id)
- rawBody = body['body']
- base64Encoded = body['base64Encoded']
- if base64Encoded:
- rawBody = b64decode (rawBody)
+ if body['base64Encoded']:
+ self.response.body = Base64Body (body['body'])
else:
- rawBody = rawBody.encode ('utf8')
- self.body = rawBody, base64Encoded
+ self.response.body = UnicodeBody (body['body'])
except TabException:
- self.body = None
+ self.response.body = None
class VarChangeEvent:
""" Notify when variable is changed """
@@ -179,14 +278,14 @@ class SiteLoader:
XXX: track popup windows/new tabs and close them
"""
- __slots__ = ('requests', 'browser', 'url', 'logger', 'tab', '_iterRunning', 'idle', '_framesLoading')
+ __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning',
+ 'idle', '_framesLoading')
allowedSchemes = {'http', 'https'}
- def __init__ (self, browser, url, logger):
+ def __init__ (self, browser, logger):
self.requests = {}
self.browser = Browser (url=browser)
- self.url = url
- self.logger = logger.bind (context=type (self).__name__, url=url)
+ self.logger = logger.bind (context=type (self).__name__)
self._iterRunning = []
self.idle = VarChangeEvent (True)
@@ -250,7 +349,7 @@ class SiteLoader:
result = t.result ()
if result is None:
pass
- elif isinstance (result, Item):
+ elif isinstance (result, RequestResponsePair):
yield result
else:
method, data = result
@@ -263,8 +362,8 @@ class SiteLoader:
running = pending
self._iterRunning = running
- async def start (self):
- await self.tab.Page.navigate(url=self.url)
+ async def navigate (self, url):
+ await self.tab.Page.navigate(url=url)
# internal chrome callbacks
async def _requestWillBeSent (self, **kwargs):
@@ -282,21 +381,24 @@ class SiteLoader:
# redirects never “finish” loading, but yield another requestWillBeSent with this key set
redirectResp = kwargs.get ('redirectResponse')
if redirectResp:
- # create fake responses
- resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']}
- item.setResponse (resp)
- resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']}
- item.setFinished (resp)
- item.isRedirect = True
+ if item.url != url:
+ # this happens for unknown reasons. the docs simply state
+ # it can differ in case of a redirect. Fix it and move on.
+ logger.warning ('redirect url differs',
+ uuid='558a7df7-2258-4fe4-b16d-22b6019cc163',
+ expected=item.url)
+ redirectResp['url'] = str (item.url)
+ item.fromResponse (redirectResp)
logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url)
+ # XXX: queue this? no need to wait for it
await item.prefetchRequestBody (self.tab)
- # cannot fetch request body due to race condition (item id reused)
+ # cannot fetch response body due to race condition (item id reused)
ret = item
else:
logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b')
- item = Item ()
- item.setRequest (kwargs)
+ item = RequestResponsePair ()
+ item.fromRequestWillBeSent (kwargs)
self.requests[reqId] = item
logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f')
@@ -315,7 +417,7 @@ class SiteLoader:
logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url)
if url.scheme in self.allowedSchemes:
logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0')
- item.setResponse (kwargs)
+ item.fromResponseReceived (kwargs)
else:
logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666')
@@ -333,19 +435,21 @@ class SiteLoader:
logger = self.logger.bind (reqId=reqId, reqUrl=item.url)
if item.url.scheme in self.allowedSchemes:
logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02')
- item.setFinished (kwargs)
+ item.fromLoadingFinished (kwargs)
+ # XXX queue both
await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab))
return item
async def _loadingFailed (self, **kwargs):
reqId = kwargs['requestId']
- self.logger.warning ('loading failed',
+ logger = self.logger.bind (reqId=reqId)
+ logger.warning ('loading failed',
uuid='68410f13-6eea-453e-924e-c1af4601748b',
errorText=kwargs['errorText'],
blockedReason=kwargs.get ('blockedReason'))
item = self.requests.pop (reqId, None)
if item is not None:
- item.failed = True
+ item.fromLoadingFailed (kwargs)
return item
async def _entryAdded (self, **kwargs):