summary | refs | log | tree | commit | diff
path: root/crocoite/browser.py
diff options
context:
space:
mode:
Diffstat (limited to 'crocoite/browser.py')
-rw-r--r-- crocoite/browser.py 444
1 files changed, 286 insertions, 158 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py
index c472746..3518789 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -23,84 +23,197 @@ Chrome browser interactions.
"""
import asyncio
-from urllib.parse import urlsplit
-from base64 import b64decode
+from base64 import b64decode, b64encode
+from datetime import datetime, timedelta
from http.server import BaseHTTPRequestHandler
+from yarl import URL
+from multidict import CIMultiDict
+
from .logger import Level
from .devtools import Browser, TabException
-class Item:
- """
- Simple wrapper containing Chrome request and response
- """
+# These two classes’ only purpose is so we can later tell whether a body was
+# base64-encoded or a unicode string
+class Base64Body (bytes):
+ def __new__ (cls, value):
+ return bytes.__new__ (cls, b64decode (value))
+
+ @classmethod
+ def fromBytes (cls, b):
+ """ For testing """
+ return cls (b64encode (b))
+
+class UnicodeBody (bytes):
+ def __new__ (cls, value):
+ if type (value) is not str:
+ raise TypeError ('expecting unicode string')
- __slots__ = ('chromeRequest', 'chromeResponse', 'chromeFinished',
- 'isRedirect', 'failed', 'body', 'requestBody')
+ return bytes.__new__ (cls, value.encode ('utf-8'))
- def __init__ (self):
- self.chromeRequest = {}
- self.chromeResponse = {}
- self.chromeFinished = {}
- self.isRedirect = False
- self.failed = False
- self.body = None
- self.requestBody = None
+class Request:
+ __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp')
+
+ def __init__ (self, method=None, headers=None, body=None):
+ self.headers = headers
+ self.body = body
+ self.hasPostData = False
+ self.initiator = None
+ # HTTP method
+ self.method = method
+ self.timestamp = None
+
+ def __repr__ (self):
+ return f'Request({self.method!r}, {self.headers!r}, {self.body!r})'
+
+ def __eq__ (self, b):
+ if b is None:
+ return False
+
+ if not isinstance (b, Request):
+ raise TypeError ('Can only compare equality with Request.')
+
+ # do not compare hasPostData (only required to fetch body) and
+ # timestamp (depends on time)
+ return self.headers == b.headers and \
+ self.body == b.body and \
+ self.initiator == b.initiator and \
+ self.method == b.method
+
+class Response:
+ __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived',
+ 'timestamp', 'mimeType')
+
+ def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None):
+ self.status = status
+ self.statusText = statusText
+ self.headers = headers
+ self.body = body
+ # bytes received over the network (not body size!)
+ self.bytesReceived = 0
+ self.timestamp = None
+ self.mimeType = mimeType
+
+ def __repr__ (self):
+ return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})'
+
+ def __eq__ (self, b):
+ if b is None:
+ return False
+
+ if not isinstance (b, Response):
+ raise TypeError ('Can only compare equality with Response.')
+
+ # do not compare bytesReceived (depends on network) and timestamp
+ # (depends on time). XXX: statusText should not matter, but is compared below
+ return self.status == b.status and \
+ self.statusText == b.statusText and \
+ self.headers == b.headers and \
+ self.body == b.body and \
+ self.mimeType == b.mimeType
+
+class ReferenceTimestamp:
+ """ Map relative timestamp to absolute timestamp """
+
+ def __init__ (self, relative, absolute):
+ self.relative = timedelta (seconds=relative)
+ self.absolute = datetime.utcfromtimestamp (absolute)
+
+ def __call__ (self, relative):
+ if not isinstance (relative, timedelta):
+ relative = timedelta (seconds=relative)
+ return self.absolute + (relative-self.relative)
+
+class RequestResponsePair:
+ __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress',
+ 'protocol', 'resourceType', '_time')
+
+ def __init__ (self, id=None, url=None, request=None, response=None):
+ self.request = request
+ self.response = response
+ self.id = id
+ self.url = url
+ self.remoteIpAddress = None
+ self.protocol = None
+ self.resourceType = None
+ self._time = None
def __repr__ (self):
- return '<Item {}>'.format (self.url)
-
- @property
- def request (self):
- return self.chromeRequest.get ('request', {})
-
- @property
- def response (self):
- return self.chromeResponse.get ('response', {})
-
- @property
- def initiator (self):
- return self.chromeRequest['initiator']
-
- @property
- def id (self):
- return self.chromeRequest['requestId']
-
- @property
- def encodedDataLength (self):
- return self.chromeFinished['encodedDataLength']
-
- @property
- def url (self):
- return self.response.get ('url', self.request.get ('url'))
-
- @property
- def parsedUrl (self):
- return urlsplit (self.url)
-
- @property
- def requestHeaders (self):
- # the response object may contain refined headers, which were
- # *actually* sent over the wire
- return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers']))
-
- @property
- def responseHeaders (self):
- return self._unfoldHeaders (self.response['headers'])
-
- @property
- def statusText (self):
- text = self.response.get ('statusText')
- if text:
- return text
- text = BaseHTTPRequestHandler.responses.get (self.response['status'])
- if text:
- return text[0]
- return 'No status text available'
-
- @property
- def resourceType (self):
- return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None))
+ return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})'
+
+ def __eq__ (self, b):
+ if not isinstance (b, RequestResponsePair):
+ raise TypeError (f'Can only compare with {self.__class__.__name__}')
+
+ # do not compare id and _time. These depend on external factors and do
+ # not influence the request/response *content*
+ return self.request == b.request and \
+ self.response == b.response and \
+ self.url == b.url and \
+ self.remoteIpAddress == b.remoteIpAddress and \
+ self.protocol == b.protocol and \
+ self.resourceType == b.resourceType
+
+ def fromRequestWillBeSent (self, req):
+ """ Set request data from Chrome Network.requestWillBeSent event """
+ r = req['request']
+
+ self.id = req['requestId']
+ self.url = URL (r['url'])
+ self.resourceType = req.get ('type')
+ self._time = ReferenceTimestamp (req['timestamp'], req['wallTime'])
+
+ assert self.request is None, req
+ self.request = Request ()
+ self.request.initiator = req['initiator']
+ self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
+ self.request.hasPostData = r.get ('hasPostData', False)
+ self.request.method = r['method']
+ self.request.timestamp = self._time (req['timestamp'])
+ if self.request.hasPostData:
+ postData = r.get ('postData')
+ if postData is not None:
+ self.request.body = UnicodeBody (postData)
+
+ def fromResponse (self, r, timestamp=None, resourceType=None):
+ """
+ Set response data from Chrome’s Response object.
+
+ Request must exist. Updates if response was set before. Sometimes
+ fromResponseReceived is triggered twice by Chrome. No idea why.
+ """
+ assert self.request is not None, (self.request, r)
+
+ if not timestamp:
+ timestamp = self.request.timestamp
+
+ self.remoteIpAddress = r.get ('remoteIPAddress')
+ self.protocol = r.get ('protocol')
+ if resourceType:
+ self.resourceType = resourceType
+
+ # a response may contain updated request headers (i.e. those actually
+ # sent over the wire)
+ if 'requestHeaders' in r:
+ self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders']))
+
+ self.response = Response ()
+ self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
+ self.response.status = r['status']
+ self.response.statusText = r['statusText']
+ self.response.timestamp = timestamp
+ self.response.mimeType = r['mimeType']
+
+ def fromResponseReceived (self, resp):
+ """ Set response data from Chrome Network.responseReceived """
+ return self.fromResponse (resp['response'],
+ self._time (resp['timestamp']), resp['type'])
+
+ def fromLoadingFinished (self, data):
+ self.response.bytesReceived = data['encodedDataLength']
+
+ def fromLoadingFailed (self, data):
+ self.response = None
@staticmethod
def _unfoldHeaders (headers):
@@ -114,67 +227,46 @@ class Item:
items.append ((k, v))
return items
- def setRequest (self, req):
- self.chromeRequest = req
-
- def setResponse (self, resp):
- self.chromeResponse = resp
-
- def setFinished (self, finished):
- self.chromeFinished = finished
-
async def prefetchRequestBody (self, tab):
- # request body
- req = self.request
- postData = req.get ('postData')
- if postData:
- self.requestBody = postData.encode ('utf8'), False
- elif req.get ('hasPostData', False):
+ if self.request.hasPostData and self.request.body is None:
try:
postData = await tab.Network.getRequestPostData (requestId=self.id)
- postData = postData['postData']
- self.requestBody = b64decode (postData), True
+ self.request.body = UnicodeBody (postData['postData'])
except TabException:
- self.requestBody = None
- else:
- self.requestBody = None, False
+ self.request.body = None
async def prefetchResponseBody (self, tab):
- # get response body
+ """ Fetch response body """
try:
body = await tab.Network.getResponseBody (requestId=self.id)
- rawBody = body['body']
- base64Encoded = body['base64Encoded']
- if base64Encoded:
- rawBody = b64decode (rawBody)
+ if body['base64Encoded']:
+ self.response.body = Base64Body (body['body'])
else:
- rawBody = rawBody.encode ('utf8')
- self.body = rawBody, base64Encoded
+ self.response.body = UnicodeBody (body['body'])
except TabException:
- self.body = None
+ self.response.body = None
+
+class NavigateError (IOError):
+ pass
-class VarChangeEvent:
- """ Notify when variable is changed """
+class PageIdle:
+ """ Page idle event """
- __slots__ = ('_value', 'event')
+ __slots__ = ('idle', )
- def __init__ (self, value):
- self._value = value
- self.event = asyncio.Event()
+ def __init__ (self, idle):
+ self.idle = idle
- def set (self, value):
- if value != self._value:
- self._value = value
- # unblock waiting threads
- self.event.set ()
- self.event.clear ()
+ def __bool__ (self):
+ return self.idle
- def get (self):
- return self._value
+class FrameNavigated:
+ __slots__ = ('id', 'url', 'mimeType')
- async def wait (self):
- await self.event.wait ()
- return self._value
+ def __init__ (self, id, url, mimeType):
+ self.id = id
+ self.url = URL (url)
+ self.mimeType = mimeType
class SiteLoader:
"""
@@ -183,18 +275,18 @@ class SiteLoader:
XXX: track popup windows/new tabs and close them
"""
- __slots__ = ('requests', 'browser', 'url', 'logger', 'tab', '_iterRunning', 'idle', '_framesLoading')
+ __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning',
+ '_framesLoading', '_rootFrame')
allowedSchemes = {'http', 'https'}
- def __init__ (self, browser, url, logger):
+ def __init__ (self, browser, logger):
self.requests = {}
self.browser = Browser (url=browser)
- self.url = url
- self.logger = logger.bind (context=type (self).__name__, url=url)
+ self.logger = logger.bind (context=type (self).__name__)
self._iterRunning = []
- self.idle = VarChangeEvent (True)
self._framesLoading = set ()
+ self._rootFrame = None
async def __aenter__ (self):
tab = self.tab = await self.browser.__aenter__ ()
@@ -236,6 +328,7 @@ class SiteLoader:
tab.Page.javascriptDialogOpening: self._javascriptDialogOpening,
tab.Page.frameStartedLoading: self._frameStartedLoading,
tab.Page.frameStoppedLoading: self._frameStoppedLoading,
+ tab.Page.frameNavigated: self._frameNavigated,
}
# The implementation is a little advanced. Why? The goal here is to
@@ -247,36 +340,46 @@ class SiteLoader:
# we need to block (yield) for every item completed, but not
# handled by the consumer (caller).
running = self._iterRunning
- running.append (asyncio.ensure_future (self.tab.get ()))
+ tabGetTask = asyncio.ensure_future (self.tab.get ())
+ running.append (tabGetTask)
while True:
done, pending = await asyncio.wait (running, return_when=asyncio.FIRST_COMPLETED)
for t in done:
result = t.result ()
if result is None:
pass
- elif isinstance (result, Item):
- yield result
- else:
+ elif t == tabGetTask:
method, data = result
f = handler.get (method, None)
if f is not None:
task = asyncio.ensure_future (f (**data))
pending.add (task)
- pending.add (asyncio.ensure_future (self.tab.get ()))
+ tabGetTask = asyncio.ensure_future (self.tab.get ())
+ pending.add (tabGetTask)
+ else:
+ yield result
running = pending
self._iterRunning = running
- async def start (self):
- await self.tab.Page.navigate(url=self.url)
+ async def navigate (self, url):
+ ret = await self.tab.Page.navigate(url=url)
+ self.logger.debug ('navigate',
+ uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret)
+ if 'errorText' in ret:
+ raise NavigateError (ret['errorText'])
+ self._rootFrame = ret['frameId']
# internal chrome callbacks
async def _requestWillBeSent (self, **kwargs):
+ self.logger.debug ('requestWillBeSent',
+ uuid='b828d75a-650d-42d2-8c66-14f4547512da', args=kwargs)
+
reqId = kwargs['requestId']
req = kwargs['request']
- logger = self.logger.bind (reqId=reqId, reqUrl=req['url'])
+ url = URL (req['url'])
+ logger = self.logger.bind (reqId=reqId, reqUrl=url)
- url = urlsplit (req['url'])
if url.scheme not in self.allowedSchemes:
return
@@ -286,38 +389,44 @@ class SiteLoader:
# redirects never “finish” loading, but yield another requestWillBeSent with this key set
redirectResp = kwargs.get ('redirectResponse')
if redirectResp:
- # create fake responses
- resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']}
- item.setResponse (resp)
- resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']}
- item.setFinished (resp)
- item.isRedirect = True
- logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=req['url'])
+ if item.url != url:
+ # this happens for unknown reasons. the docs simply state
+ # it can differ in case of a redirect. Fix it and move on.
+ logger.warning ('redirect url differs',
+ uuid='558a7df7-2258-4fe4-b16d-22b6019cc163',
+ expected=item.url)
+ redirectResp['url'] = str (item.url)
+ item.fromResponse (redirectResp)
+ logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url)
+ # XXX: queue this? no need to wait for it
await item.prefetchRequestBody (self.tab)
- # cannot fetch request body due to race condition (item id reused)
+ # cannot fetch response body due to race condition (item id reused)
ret = item
else:
logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b')
- item = Item ()
- item.setRequest (kwargs)
+ item = RequestResponsePair ()
+ item.fromRequestWillBeSent (kwargs)
self.requests[reqId] = item
- logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f')
return ret
async def _responseReceived (self, **kwargs):
+ self.logger.debug ('responseReceived',
+ uuid='ecd67e69-401a-41cb-b4ec-eeb1f1ec6abb', args=kwargs)
+
reqId = kwargs['requestId']
item = self.requests.get (reqId)
if item is None:
return
resp = kwargs['response']
- logger = self.logger.bind (reqId=reqId, respUrl=resp['url'])
- url = urlsplit (resp['url'])
+ url = URL (resp['url'])
+ logger = self.logger.bind (reqId=reqId, respUrl=url)
+ if item.url != url:
+ logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url)
if url.scheme in self.allowedSchemes:
- logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0')
- item.setResponse (kwargs)
+ item.fromResponseReceived (kwargs)
else:
logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666')
@@ -326,32 +435,37 @@ class SiteLoader:
Item was fully loaded. For some items the request body is not available
when responseReceived is fired, thus move everything here.
"""
+ self.logger.debug ('loadingFinished',
+ uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs)
+
reqId = kwargs['requestId']
item = self.requests.pop (reqId, None)
if item is None:
# we never recorded this request (blacklisted scheme, for example)
return
+ if not item.response:
+ # chrome failed to send us a responseReceived event for this item,
+ # so we can’t record it (missing request/response headers)
+ self.logger.error ('response missing',
+ uuid='fac3ab96-3f9b-4c5a-95c7-f83b675cdcb9', requestId=item.id)
+ return
+
req = item.request
- logger = self.logger.bind (reqId=reqId, reqUrl=req['url'])
- resp = item.response
- if req['url'] != resp['url']:
- logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=resp['url'])
- url = urlsplit (resp['url'])
- if url.scheme in self.allowedSchemes:
- logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02')
- item.setFinished (kwargs)
+ if item.url.scheme in self.allowedSchemes:
+ item.fromLoadingFinished (kwargs)
+ # XXX queue both
await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab))
return item
async def _loadingFailed (self, **kwargs):
+ self.logger.info ('loadingFailed',
+ uuid='4a944e85-5fae-4aa6-9e7c-e578b29392e4', args=kwargs)
+
reqId = kwargs['requestId']
- self.logger.warning ('loading failed',
- uuid='68410f13-6eea-453e-924e-c1af4601748b',
- errorText=kwargs['errorText'],
- blockedReason=kwargs.get ('blockedReason'))
+ logger = self.logger.bind (reqId=reqId)
item = self.requests.pop (reqId, None)
if item is not None:
- item.failed = True
+ item.fromLoadingFailed (kwargs)
return item
async def _entryAdded (self, **kwargs):
@@ -381,11 +495,25 @@ class SiteLoader:
uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs)
async def _frameStartedLoading (self, **kwargs):
+ self.logger.debug ('frameStartedLoading',
+ uuid='bbeb39c0-3304-4221-918e-f26bd443c566', args=kwargs)
+
self._framesLoading.add (kwargs['frameId'])
- self.idle.set (False)
+ return PageIdle (False)
async def _frameStoppedLoading (self, **kwargs):
+ self.logger.debug ('frameStoppedLoading',
+ uuid='fcbe8110-511c-4cbb-ac2b-f61a5782c5a0', args=kwargs)
+
self._framesLoading.remove (kwargs['frameId'])
if not self._framesLoading:
- self.idle.set (True)
+ return PageIdle (True)
+
+ async def _frameNavigated (self, **kwargs):
+ self.logger.debug ('frameNavigated',
+ uuid='0e876f7d-7129-4612-8632-686f42ac6e1f', args=kwargs)
+ frame = kwargs['frame']
+ if self._rootFrame == frame['id']:
+ assert frame.get ('parentId', None) is None, "root frame must not have a parent"
+ return FrameNavigated (frame['id'], frame['url'], frame['mimeType'])