diff options
Diffstat (limited to 'crocoite/browser.py')
-rw-r--r-- | crocoite/browser.py | 444 |
1 files changed, 286 insertions, 158 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py index c472746..3518789 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -23,84 +23,197 @@ Chrome browser interactions. """ import asyncio -from urllib.parse import urlsplit -from base64 import b64decode +from base64 import b64decode, b64encode +from datetime import datetime, timedelta from http.server import BaseHTTPRequestHandler +from yarl import URL +from multidict import CIMultiDict + from .logger import Level from .devtools import Browser, TabException -class Item: - """ - Simple wrapper containing Chrome request and response - """ +# These two classes’ only purpose is so we can later tell whether a body was +# base64-encoded or a unicode string +class Base64Body (bytes): + def __new__ (cls, value): + return bytes.__new__ (cls, b64decode (value)) + + @classmethod + def fromBytes (cls, b): + """ For testing """ + return cls (b64encode (b)) + +class UnicodeBody (bytes): + def __new__ (cls, value): + if type (value) is not str: + raise TypeError ('expecting unicode string') - __slots__ = ('chromeRequest', 'chromeResponse', 'chromeFinished', - 'isRedirect', 'failed', 'body', 'requestBody') + return bytes.__new__ (cls, value.encode ('utf-8')) - def __init__ (self): - self.chromeRequest = {} - self.chromeResponse = {} - self.chromeFinished = {} - self.isRedirect = False - self.failed = False - self.body = None - self.requestBody = None +class Request: + __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp') + + def __init__ (self, method=None, headers=None, body=None): + self.headers = headers + self.body = body + self.hasPostData = False + self.initiator = None + # HTTP method + self.method = method + self.timestamp = None + + def __repr__ (self): + return f'Request({self.method!r}, {self.headers!r}, {self.body!r})' + + def __eq__ (self, b): + if b is None: + return False + + if not isinstance (b, Request): + raise TypeError ('Can only compare equality with Request.') + + # do not compare hasPostData (only required to fetch body) and + # timestamp (depends on time) + return self.headers == b.headers and \ + self.body == b.body and \ + self.initiator == b.initiator and \ + self.method == b.method + +class Response: + __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived', + 'timestamp', 'mimeType') + + def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None): + self.status = status + self.statusText = statusText + self.headers = headers + self.body = body + # bytes received over the network (not body size!) + self.bytesReceived = 0 + self.timestamp = None + self.mimeType = mimeType + + def __repr__ (self): + return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})' + + def __eq__ (self, b): + if b is None: + return False + + if not isinstance (b, Response): + raise TypeError ('Can only compare equality with Response.') + + # do not compare bytesReceived (depends on network), timestamp + # (depends on time) and statusText (does not matter) + return self.status == b.status and \ + self.statusText == b.statusText and \ + self.headers == b.headers and \ + self.body == b.body and \ + self.mimeType == b.mimeType + +class ReferenceTimestamp: + """ Map relative timestamp to absolute timestamp """ + + def __init__ (self, relative, absolute): + self.relative = timedelta (seconds=relative) + self.absolute = datetime.utcfromtimestamp (absolute) + + def __call__ (self, relative): + if not isinstance (relative, timedelta): + relative = timedelta (seconds=relative) + return self.absolute + (relative-self.relative) + +class RequestResponsePair: + __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress', + 'protocol', 'resourceType', '_time') + + def __init__ (self, id=None, url=None, request=None, response=None): + self.request = request + self.response = response + self.id = id + self.url = url + self.remoteIpAddress = None + self.protocol = None + self.resourceType = None + self._time = None def __repr__ (self): - return '<Item {}>'.format (self.url) - - @property - def request (self): - return self.chromeRequest.get ('request', {}) - - @property - def response (self): - return self.chromeResponse.get ('response', {}) - - @property - def initiator (self): - return self.chromeRequest['initiator'] - - @property - def id (self): - return self.chromeRequest['requestId'] - - @property - def encodedDataLength (self): - return self.chromeFinished['encodedDataLength'] - - @property - def url (self): - return self.response.get ('url', self.request.get ('url')) - - @property - def parsedUrl (self): - return urlsplit (self.url) - - @property - def requestHeaders (self): - # the response object may contain refined headers, which were - # *actually* sent over the wire - return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers'])) - - @property - def responseHeaders (self): - return self._unfoldHeaders (self.response['headers']) - - @property - def statusText (self): - text = self.response.get ('statusText') - if text: - return text - text = BaseHTTPRequestHandler.responses.get (self.response['status']) - if text: - return text[0] - return 'No status text available' - - @property - def resourceType (self): - return self.chromeResponse.get ('type', self.chromeRequest.get ('type', None)) + return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})' + + def __eq__ (self, b): + if not isinstance (b, RequestResponsePair): + raise TypeError (f'Can only compare with {self.__class__.__name__}') + + # do not compare id and _time. These depend on external factors and do + # not influence the request/response *content* + return self.request == b.request and \ + self.response == b.response and \ + self.url == b.url and \ + self.remoteIpAddress == b.remoteIpAddress and \ + self.protocol == b.protocol and \ + self.resourceType == b.resourceType + + def fromRequestWillBeSent (self, req): + """ Set request data from Chrome Network.requestWillBeSent event """ + r = req['request'] + + self.id = req['requestId'] + self.url = URL (r['url']) + self.resourceType = req.get ('type') + self._time = ReferenceTimestamp (req['timestamp'], req['wallTime']) + + assert self.request is None, req + self.request = Request () + self.request.initiator = req['initiator'] + self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) + self.request.hasPostData = r.get ('hasPostData', False) + self.request.method = r['method'] + self.request.timestamp = self._time (req['timestamp']) + if self.request.hasPostData: + postData = r.get ('postData') + if postData is not None: + self.request.body = UnicodeBody (postData) + + def fromResponse (self, r, timestamp=None, resourceType=None): + """ + Set response data from Chrome’s Response object. + + Request must exist. Updates if response was set before. Sometimes + fromResponseReceived is triggered twice by Chrome. No idea why. + """ + assert self.request is not None, (self.request, r) + + if not timestamp: + timestamp = self.request.timestamp + + self.remoteIpAddress = r.get ('remoteIPAddress') + self.protocol = r.get ('protocol') + if resourceType: + self.resourceType = resourceType + + # a response may contain updated request headers (i.e. those actually + # sent over the wire) + if 'requestHeaders' in r: + self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders'])) + + self.response = Response () + self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) + self.response.status = r['status'] + self.response.statusText = r['statusText'] + self.response.timestamp = timestamp + self.response.mimeType = r['mimeType'] + + def fromResponseReceived (self, resp): + """ Set response data from Chrome Network.responseReceived """ + return self.fromResponse (resp['response'], + self._time (resp['timestamp']), resp['type']) + + def fromLoadingFinished (self, data): + self.response.bytesReceived = data['encodedDataLength'] + + def fromLoadingFailed (self, data): + self.response = None @staticmethod def _unfoldHeaders (headers): @@ -114,67 +227,46 @@ class Item: items.append ((k, v)) return items - def setRequest (self, req): - self.chromeRequest = req - - def setResponse (self, resp): - self.chromeResponse = resp - - def setFinished (self, finished): - self.chromeFinished = finished - async def prefetchRequestBody (self, tab): - # request body - req = self.request - postData = req.get ('postData') - if postData: - self.requestBody = postData.encode ('utf8'), False - elif req.get ('hasPostData', False): + if self.request.hasPostData and self.request.body is None: try: postData = await tab.Network.getRequestPostData (requestId=self.id) - postData = postData['postData'] - self.requestBody = b64decode (postData), True + self.request.body = UnicodeBody (postData['postData']) except TabException: - self.requestBody = None - else: - self.requestBody = None, False + self.request.body = None async def prefetchResponseBody (self, tab): - # get response body + """ Fetch response body """ try: body = await tab.Network.getResponseBody (requestId=self.id) - rawBody = body['body'] - base64Encoded = body['base64Encoded'] - if base64Encoded: - rawBody = b64decode (rawBody) + if body['base64Encoded']: + self.response.body = Base64Body (body['body']) else: - rawBody = rawBody.encode ('utf8') - self.body = rawBody, base64Encoded + self.response.body = UnicodeBody (body['body']) except TabException: - self.body = None + self.response.body = None + +class NavigateError (IOError): + pass -class VarChangeEvent: - """ Notify when variable is changed """ +class PageIdle: + """ Page idle event """ - __slots__ = ('_value', 'event') + __slots__ = ('idle', ) - def __init__ (self, value): - self._value = value - self.event = asyncio.Event() + def __init__ (self, idle): + self.idle = idle - def set (self, value): - if value != self._value: - self._value = value - # unblock waiting threads - self.event.set () - self.event.clear () + def __bool__ (self): + return self.idle - def get (self): - return self._value +class FrameNavigated: + __slots__ = ('id', 'url', 'mimeType') - async def wait (self): - await self.event.wait () - return self._value + def __init__ (self, id, url, mimeType): + self.id = id + self.url = URL (url) + self.mimeType = mimeType class SiteLoader: """ @@ -183,18 +275,18 @@ class SiteLoader: XXX: track popup windows/new tabs and close them """ - __slots__ = ('requests', 'browser', 'url', 'logger', 'tab', '_iterRunning', 'idle', '_framesLoading') + __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning', + '_framesLoading', '_rootFrame') allowedSchemes = {'http', 'https'} - def __init__ (self, browser, url, logger): + def __init__ (self, browser, logger): self.requests = {} self.browser = Browser (url=browser) - self.url = url - self.logger = logger.bind (context=type (self).__name__, url=url) + self.logger = logger.bind (context=type (self).__name__) self._iterRunning = [] - self.idle = VarChangeEvent (True) self._framesLoading = set () + self._rootFrame = None async def __aenter__ (self): tab = self.tab = await self.browser.__aenter__ () @@ -236,6 +328,7 @@ class SiteLoader: tab.Page.javascriptDialogOpening: self._javascriptDialogOpening, tab.Page.frameStartedLoading: self._frameStartedLoading, tab.Page.frameStoppedLoading: self._frameStoppedLoading, + tab.Page.frameNavigated: self._frameNavigated, } # The implementation is a little advanced. Why? The goal here is to @@ -247,36 +340,46 @@ class SiteLoader: # we need to block (yield) for every item completed, but not # handled by the consumer (caller). running = self._iterRunning - running.append (asyncio.ensure_future (self.tab.get ())) + tabGetTask = asyncio.ensure_future (self.tab.get ()) + running.append (tabGetTask) while True: done, pending = await asyncio.wait (running, return_when=asyncio.FIRST_COMPLETED) for t in done: result = t.result () if result is None: pass - elif isinstance (result, Item): - yield result - else: + elif t == tabGetTask: method, data = result f = handler.get (method, None) if f is not None: task = asyncio.ensure_future (f (**data)) pending.add (task) - pending.add (asyncio.ensure_future (self.tab.get ())) + tabGetTask = asyncio.ensure_future (self.tab.get ()) + pending.add (tabGetTask) + else: + yield result running = pending self._iterRunning = running - async def start (self): - await self.tab.Page.navigate(url=self.url) + async def navigate (self, url): + ret = await self.tab.Page.navigate(url=url) + self.logger.debug ('navigate', + uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret) + if 'errorText' in ret: + raise NavigateError (ret['errorText']) + self._rootFrame = ret['frameId'] # internal chrome callbacks async def _requestWillBeSent (self, **kwargs): + self.logger.debug ('requestWillBeSent', + uuid='b828d75a-650d-42d2-8c66-14f4547512da', args=kwargs) + reqId = kwargs['requestId'] req = kwargs['request'] - logger = self.logger.bind (reqId=reqId, reqUrl=req['url']) + url = URL (req['url']) + logger = self.logger.bind (reqId=reqId, reqUrl=url) - url = urlsplit (req['url']) if url.scheme not in self.allowedSchemes: return @@ -286,38 +389,44 @@ class SiteLoader: # redirects never “finish” loading, but yield another requestWillBeSent with this key set redirectResp = kwargs.get ('redirectResponse') if redirectResp: - # create fake responses - resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']} - item.setResponse (resp) - resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']} - item.setFinished (resp) - item.isRedirect = True - logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=req['url']) + if item.url != url: + # this happens for unknown reasons. the docs simply state + # it can differ in case of a redirect. Fix it and move on. + logger.warning ('redirect url differs', + uuid='558a7df7-2258-4fe4-b16d-22b6019cc163', + expected=item.url) + redirectResp['url'] = str (item.url) + item.fromResponse (redirectResp) + logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url) + # XXX: queue this? no need to wait for it await item.prefetchRequestBody (self.tab) - # cannot fetch request body due to race condition (item id reused) + # cannot fetch response body due to race condition (item id reused) ret = item else: logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b') - item = Item () - item.setRequest (kwargs) + item = RequestResponsePair () + item.fromRequestWillBeSent (kwargs) self.requests[reqId] = item - logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f') return ret async def _responseReceived (self, **kwargs): + self.logger.debug ('responseReceived', + uuid='ecd67e69-401a-41cb-b4ec-eeb1f1ec6abb', args=kwargs) + reqId = kwargs['requestId'] item = self.requests.get (reqId) if item is None: return resp = kwargs['response'] - logger = self.logger.bind (reqId=reqId, respUrl=resp['url']) - url = urlsplit (resp['url']) + url = URL (resp['url']) + logger = self.logger.bind (reqId=reqId, respUrl=url) + if item.url != url: + logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url) if url.scheme in self.allowedSchemes: - logger.debug ('response', uuid='84461c4e-e8ef-4cbd-8e8e-e10a901c8bd0') - item.setResponse (kwargs) + item.fromResponseReceived (kwargs) else: logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666') @@ -326,32 +435,37 @@ class SiteLoader: Item was fully loaded. For some items the request body is not available when responseReceived is fired, thus move everything here. """ + self.logger.debug ('loadingFinished', + uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs) + reqId = kwargs['requestId'] item = self.requests.pop (reqId, None) if item is None: # we never recorded this request (blacklisted scheme, for example) return + if not item.response: + # chrome failed to send us a responseReceived event for this item, + # so we can’t record it (missing request/response headers) + self.logger.error ('response missing', + uuid='fac3ab96-3f9b-4c5a-95c7-f83b675cdcb9', requestId=item.id) + return + req = item.request - logger = self.logger.bind (reqId=reqId, reqUrl=req['url']) - resp = item.response - if req['url'] != resp['url']: - logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=resp['url']) - url = urlsplit (resp['url']) - if url.scheme in self.allowedSchemes: - logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02') - item.setFinished (kwargs) + if item.url.scheme in self.allowedSchemes: + item.fromLoadingFinished (kwargs) + # XXX queue both await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab)) return item async def _loadingFailed (self, **kwargs): + self.logger.info ('loadingFailed', + uuid='4a944e85-5fae-4aa6-9e7c-e578b29392e4', args=kwargs) + reqId = kwargs['requestId'] - self.logger.warning ('loading failed', - uuid='68410f13-6eea-453e-924e-c1af4601748b', - errorText=kwargs['errorText'], - blockedReason=kwargs.get ('blockedReason')) + logger = self.logger.bind (reqId=reqId) item = self.requests.pop (reqId, None) if item is not None: - item.failed = True + item.fromLoadingFailed (kwargs) return item async def _entryAdded (self, **kwargs): @@ -381,11 +495,25 @@ class SiteLoader: uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs) async def _frameStartedLoading (self, **kwargs): + self.logger.debug ('frameStartedLoading', + uuid='bbeb39c0-3304-4221-918e-f26bd443c566', args=kwargs) + self._framesLoading.add (kwargs['frameId']) - self.idle.set (False) + return PageIdle (False) async def _frameStoppedLoading (self, **kwargs): + self.logger.debug ('frameStoppedLoading', + uuid='fcbe8110-511c-4cbb-ac2b-f61a5782c5a0', args=kwargs) + self._framesLoading.remove (kwargs['frameId']) if not self._framesLoading: - self.idle.set (True) + return PageIdle (True) + + async def _frameNavigated (self, **kwargs): + self.logger.debug ('frameNavigated', + uuid='0e876f7d-7129-4612-8632-686f42ac6e1f', args=kwargs) + frame = kwargs['frame'] + if self._rootFrame == frame['id']: + assert frame.get ('parentId', None) is None, "root frame must not have a parent" + return FrameNavigated (frame['id'], frame['url'], frame['mimeType']) |