diff options
Diffstat (limited to 'crocoite/browser.py')
-rw-r--r-- | crocoite/browser.py | 913 |
1 files changed, 396 insertions, 517 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py index e58ebcf..3518789 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -22,91 +22,198 @@ Chrome browser interactions. """ -import logging -from urllib.parse import urlsplit -from base64 import b64decode -import pychrome +import asyncio +from base64 import b64decode, b64encode +from datetime import datetime, timedelta +from http.server import BaseHTTPRequestHandler -class Item: - """ - Simple wrapper containing Chrome request and response - """ +from yarl import URL +from multidict import CIMultiDict - def __init__ (self, tab): - self.tab = tab - self.chromeRequest = None - self.chromeResponse = None - self.chromeFinished = None +from .logger import Level +from .devtools import Browser, TabException - def __repr__ (self): - return '<Item {}>'.format (self.request['url']) +# These two classes’ only purpose is so we can later tell whether a body was +# base64-encoded or a unicode string +class Base64Body (bytes): + def __new__ (cls, value): + return bytes.__new__ (cls, b64decode (value)) - @property - def request (self): - return self.chromeRequest['request'] + @classmethod + def fromBytes (cls, b): + """ For testing """ + return cls (b64encode (b)) - @property - def response (self): - return self.chromeResponse['response'] +class UnicodeBody (bytes): + def __new__ (cls, value): + if type (value) is not str: + raise TypeError ('expecting unicode string') - @property - def initiator (self): - return self.chromeRequest['initiator'] + return bytes.__new__ (cls, value.encode ('utf-8')) - @property - def id (self): - return self.chromeRequest['requestId'] +class Request: + __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp') - @property - def encodedDataLength (self): - return self.chromeFinished['encodedDataLength'] + def __init__ (self, method=None, headers=None, body=None): + self.headers = headers + self.body = body + self.hasPostData = False + self.initiator = None + # HTTP method + self.method = method + self.timestamp = None - @property - def url (self): - return self.response['url'] + def __repr__ (self): + return f'Request({self.method!r}, {self.headers!r}, {self.body!r})' + + def __eq__ (self, b): + if b is None: + return False + + if not isinstance (b, Request): + raise TypeError ('Can only compare equality with Request.') + + # do not compare hasPostData (only required to fetch body) and + # timestamp (depends on time) + return self.headers == b.headers and \ + self.body == b.body and \ + self.initiator == b.initiator and \ + self.method == b.method + +class Response: + __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived', + 'timestamp', 'mimeType') + + def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None): + self.status = status + self.statusText = statusText + self.headers = headers + self.body = body + # bytes received over the network (not body size!) + self.bytesReceived = 0 + self.timestamp = None + self.mimeType = mimeType - @property - def parsedUrl (self): - return urlsplit (self.url) + def __repr__ (self): + return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})' + + def __eq__ (self, b): + if b is None: + return False + + if not isinstance (b, Response): + raise TypeError ('Can only compare equality with Response.') + + # do not compare bytesReceived (depends on network), timestamp + # (depends on time) and statusText (does not matter) + return self.status == b.status and \ + self.statusText == b.statusText and \ + self.headers == b.headers and \ + self.body == b.body and \ + self.mimeType == b.mimeType + +class ReferenceTimestamp: + """ Map relative timestamp to absolute timestamp """ + + def __init__ (self, relative, absolute): + self.relative = timedelta (seconds=relative) + self.absolute = datetime.utcfromtimestamp (absolute) + + def __call__ (self, relative): + if not isinstance (relative, timedelta): + relative = timedelta (seconds=relative) + return self.absolute + (relative-self.relative) + +class RequestResponsePair: + __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress', + 'protocol', 'resourceType', '_time') + + def __init__ (self, id=None, url=None, request=None, response=None): + self.request = request + self.response = response + self.id = id + self.url = url + self.remoteIpAddress = None + self.protocol = None + self.resourceType = None + self._time = None - @property - def body (self): - """ Return response body or None """ - try: - body = self.tab.Network.getResponseBody (requestId=self.id, _timeout=60) - rawBody = body['body'] - base64Encoded = body['base64Encoded'] - if base64Encoded: - rawBody = b64decode (rawBody) - else: - rawBody = rawBody.encode ('utf8') - return rawBody, base64Encoded - except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException): - raise ValueError ('Cannot fetch response body') - - @property - def requestBody (self): - """ Get request/POST body """ - req = self.request - postData = req.get ('postData') - if postData: - return postData.encode ('utf8'), False - elif req.get ('hasPostData', False): - try: - return b64decode (self.tab.Network.getRequestPostData (requestId=self.id, _timeout=60)['postData']), True - except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException): - raise ValueError ('Cannot fetch request body') - return None, False + def __repr__ (self): + return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})' + + def __eq__ (self, b): + if not isinstance (b, RequestResponsePair): + raise TypeError (f'Can only compare with {self.__class__.__name__}') + + # do not compare id and _time. These depend on external factors and do + # not influence the request/response *content* + return self.request == b.request and \ + self.response == b.response and \ + self.url == b.url and \ + self.remoteIpAddress == b.remoteIpAddress and \ + self.protocol == b.protocol and \ + self.resourceType == b.resourceType + + def fromRequestWillBeSent (self, req): + """ Set request data from Chrome Network.requestWillBeSent event """ + r = req['request'] + + self.id = req['requestId'] + self.url = URL (r['url']) + self.resourceType = req.get ('type') + self._time = ReferenceTimestamp (req['timestamp'], req['wallTime']) + + assert self.request is None, req + self.request = Request () + self.request.initiator = req['initiator'] + self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) + self.request.hasPostData = r.get ('hasPostData', False) + self.request.method = r['method'] + self.request.timestamp = self._time (req['timestamp']) + if self.request.hasPostData: + postData = r.get ('postData') + if postData is not None: + self.request.body = UnicodeBody (postData) + + def fromResponse (self, r, timestamp=None, resourceType=None): + """ + Set response data from Chrome’s Response object. + + Request must exist. Updates if response was set before. Sometimes + fromResponseReceived is triggered twice by Chrome. No idea why. + """ + assert self.request is not None, (self.request, r) + + if not timestamp: + timestamp = self.request.timestamp + + self.remoteIpAddress = r.get ('remoteIPAddress') + self.protocol = r.get ('protocol') + if resourceType: + self.resourceType = resourceType + + # a response may contain updated request headers (i.e. those actually + # sent over the wire) + if 'requestHeaders' in r: + self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders'])) + + self.response = Response () + self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers'])) + self.response.status = r['status'] + self.response.statusText = r['statusText'] + self.response.timestamp = timestamp + self.response.mimeType = r['mimeType'] - @property - def requestHeaders (self): - # the response object may contain refined headers, which were - # *actually* sent over the wire - return self._unfoldHeaders (self.response.get ('requestHeaders', self.request['headers'])) + def fromResponseReceived (self, resp): + """ Set response data from Chrome Network.responseReceived """ + return self.fromResponse (resp['response'], + self._time (resp['timestamp']), resp['type']) - @property - def responseHeaders (self): - return self._unfoldHeaders (self.response['headers']) + def fromLoadingFinished (self, data): + self.response.bytesReceived = data['encodedDataLength'] + + def fromLoadingFailed (self, data): + self.response = None @staticmethod def _unfoldHeaders (headers): @@ -120,14 +227,46 @@ class Item: items.append ((k, v)) return items - def setRequest (self, req): - self.chromeRequest = req + async def prefetchRequestBody (self, tab): + if self.request.hasPostData and self.request.body is None: + try: + postData = await tab.Network.getRequestPostData (requestId=self.id) + self.request.body = UnicodeBody (postData['postData']) + except TabException: + self.request.body = None + + async def prefetchResponseBody (self, tab): + """ Fetch response body """ + try: + body = await tab.Network.getResponseBody (requestId=self.id) + if body['base64Encoded']: + self.response.body = Base64Body (body['body']) + else: + self.response.body = UnicodeBody (body['body']) + except TabException: + self.response.body = None - def setResponse (self, resp): - self.chromeResponse = resp +class NavigateError (IOError): + pass - def setFinished (self, finished): - self.chromeFinished = finished +class PageIdle: + """ Page idle event """ + + __slots__ = ('idle', ) + + def __init__ (self, idle): + self.idle = idle + + def __bool__ (self): + return self.idle + +class FrameNavigated: + __slots__ = ('id', 'url', 'mimeType') + + def __init__ (self, id, url, mimeType): + self.id = id + self.url = URL (url) + self.mimeType = mimeType class SiteLoader: """ @@ -136,505 +275,245 @@ class SiteLoader: XXX: track popup windows/new tabs and close them """ + __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning', + '_framesLoading', '_rootFrame') allowedSchemes = {'http', 'https'} - def __init__ (self, browser, url, logger=logging.getLogger(__name__)): + def __init__ (self, browser, logger): self.requests = {} - self.browser = browser - self.url = url - self.logger = logger + self.browser = Browser (url=browser) + self.logger = logger.bind (context=type (self).__name__) + self._iterRunning = [] - self.tab = browser.new_tab() + self._framesLoading = set () + self._rootFrame = None - def __enter__ (self): - tab = self.tab - # setup callbacks - tab.Network.requestWillBeSent = self._requestWillBeSent - tab.Network.responseReceived = self._responseReceived - tab.Network.loadingFinished = self._loadingFinished - tab.Network.loadingFailed = self._loadingFailed - tab.Log.entryAdded = self._entryAdded - #tab.Page.loadEventFired = loadEventFired - tab.Page.javascriptDialogOpening = self._javascriptDialogOpening - - # start the tab - tab.start() + async def __aenter__ (self): + tab = self.tab = await self.browser.__aenter__ () # enable events - tab.Log.enable () - tab.Network.enable() - tab.Page.enable () - tab.Network.clearBrowserCache () - if tab.Network.canClearBrowserCookies ()['result']: - tab.Network.clearBrowserCookies () - + await asyncio.gather (*[ + tab.Log.enable (), + tab.Network.enable(), + tab.Page.enable (), + tab.Inspector.enable (), + tab.Network.clearBrowserCache (), + tab.Network.clearBrowserCookies (), + ]) return self + async def __aexit__ (self, exc_type, exc_value, traceback): + for task in self._iterRunning: + # ignore any results from stuff we did not end up using anyway + if not task.done (): + task.cancel () + self._iterRunning = [] + await self.browser.__aexit__ (exc_type, exc_value, traceback) + self.tab = None + return False + def __len__ (self): return len (self.requests) - def start (self): - self.tab.Page.navigate(url=self.url) - - def wait (self, timeout=1): - self.tab.wait (timeout) - - def waitIdle (self, idleTimeout=1, maxTimeout=60): - step = 0 - for i in range (0, maxTimeout): - self.wait (1) - if len (self) == 0: - step += 1 - if step > idleTimeout: - break - else: - step = 0 - - def stop (self): - """ - Stop loading site - - XXX: stop executing scripts - """ - + async def __aiter__ (self): + """ Retrieve network items """ tab = self.tab + assert tab is not None + handler = { + tab.Network.requestWillBeSent: self._requestWillBeSent, + tab.Network.responseReceived: self._responseReceived, + tab.Network.loadingFinished: self._loadingFinished, + tab.Network.loadingFailed: self._loadingFailed, + tab.Log.entryAdded: self._entryAdded, + tab.Page.javascriptDialogOpening: self._javascriptDialogOpening, + tab.Page.frameStartedLoading: self._frameStartedLoading, + tab.Page.frameStoppedLoading: self._frameStoppedLoading, + tab.Page.frameNavigated: self._frameNavigated, + } + + # The implementation is a little advanced. Why? The goal here is to + # process events from the tab as quickly as possible (i.e. + # asynchronously). We need to make sure that JavaScript dialogs are + # handled immediately for instance. Otherwise they stall every + # other request. Also, we don’t want to use an unbounded queue, + # since the items yielded can get quite big (response body). Thus + # we need to block (yield) for every item completed, but not + # handled by the consumer (caller). + running = self._iterRunning + tabGetTask = asyncio.ensure_future (self.tab.get ()) + running.append (tabGetTask) + while True: + done, pending = await asyncio.wait (running, return_when=asyncio.FIRST_COMPLETED) + for t in done: + result = t.result () + if result is None: + pass + elif t == tabGetTask: + method, data = result + f = handler.get (method, None) + if f is not None: + task = asyncio.ensure_future (f (**data)) + pending.add (task) + tabGetTask = asyncio.ensure_future (self.tab.get ()) + pending.add (tabGetTask) + else: + yield result - tab.Page.stopLoading () - tab.Network.disable () - tab.Page.disable () - tab.Log.disable () - # XXX: we can’t drain the event queue directly, so insert (yet another) wait - tab.wait (1) - tab.Network.requestWillBeSent = None - tab.Network.responseReceived = None - tab.Network.loadingFinished = None - tab.Network.loadingFailed = None - tab.Page.loadEventFired = None - tab.Page.javascriptDialogOpening = None - tab.Log.entryAdded = None - - def __exit__ (self, exc_type, exc_value, traceback): - self.tab.stop () - self.browser.close_tab(self.tab) - return False - - # overrideable callbacks - def loadingFinished (self, item, redirect=False): - pass + running = pending + self._iterRunning = running - def loadingFailed (self, item): - pass + async def navigate (self, url): + ret = await self.tab.Page.navigate(url=url) + self.logger.debug ('navigate', + uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret) + if 'errorText' in ret: + raise NavigateError (ret['errorText']) + self._rootFrame = ret['frameId'] # internal chrome callbacks - def _requestWillBeSent (self, **kwargs): + async def _requestWillBeSent (self, **kwargs): + self.logger.debug ('requestWillBeSent', + uuid='b828d75a-650d-42d2-8c66-14f4547512da', args=kwargs) + reqId = kwargs['requestId'] req = kwargs['request'] + url = URL (req['url']) + logger = self.logger.bind (reqId=reqId, reqUrl=url) - url = urlsplit (req['url']) if url.scheme not in self.allowedSchemes: return + ret = None item = self.requests.get (reqId) if item: # redirects never “finish” loading, but yield another requestWillBeSent with this key set redirectResp = kwargs.get ('redirectResponse') if redirectResp: - # create fake responses - resp = {'requestId': reqId, 'response': redirectResp, 'timestamp': kwargs['timestamp']} - item.setResponse (resp) - resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']} - item.setFinished (resp) - self.loadingFinished (item, redirect=True) - self.logger.info ('redirected request {} has url {}'.format (reqId, req['url'])) + if item.url != url: + # this happens for unknown reasons. the docs simply state + # it can differ in case of a redirect. Fix it and move on. + logger.warning ('redirect url differs', + uuid='558a7df7-2258-4fe4-b16d-22b6019cc163', + expected=item.url) + redirectResp['url'] = str (item.url) + item.fromResponse (redirectResp) + logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url) + # XXX: queue this? no need to wait for it + await item.prefetchRequestBody (self.tab) + # cannot fetch response body due to race condition (item id reused) + ret = item else: - self.logger.warning ('request {} already exists, overwriting.'.format (reqId)) + logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b') - item = Item (self.tab) - item.setRequest (kwargs) + item = RequestResponsePair () + item.fromRequestWillBeSent (kwargs) self.requests[reqId] = item - def _responseReceived (self, **kwargs): + return ret + + async def _responseReceived (self, **kwargs): + self.logger.debug ('responseReceived', + uuid='ecd67e69-401a-41cb-b4ec-eeb1f1ec6abb', args=kwargs) + reqId = kwargs['requestId'] item = self.requests.get (reqId) if item is None: return resp = kwargs['response'] - url = urlsplit (resp['url']) + url = URL (resp['url']) + logger = self.logger.bind (reqId=reqId, respUrl=url) + if item.url != url: + logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url) if url.scheme in self.allowedSchemes: - self.logger.info ('response {} {}'.format (reqId, resp['url'])) - item.setResponse (kwargs) + item.fromResponseReceived (kwargs) else: - self.logger.warning ('response: ignoring scheme {}'.format (url.scheme)) + logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666') - def _loadingFinished (self, **kwargs): + async def _loadingFinished (self, **kwargs): """ Item was fully loaded. For some items the request body is not available when responseReceived is fired, thus move everything here. """ + self.logger.debug ('loadingFinished', + uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs) + reqId = kwargs['requestId'] item = self.requests.pop (reqId, None) if item is None: # we never recorded this request (blacklisted scheme, for example) return + if not item.response: + # chrome failed to send us a responseReceived event for this item, + # so we can’t record it (missing request/response headers) + self.logger.error ('response missing', + uuid='fac3ab96-3f9b-4c5a-95c7-f83b675cdcb9', requestId=item.id) + return + req = item.request - resp = item.response - assert req['url'] == resp['url'], 'req and resp urls are not the same {} vs {}'.format (req['url'], resp['url']) - url = urlsplit (resp['url']) - if url.scheme in self.allowedSchemes: - self.logger.info ('finished {} {}'.format (reqId, req['url'])) - item.setFinished (kwargs) - self.loadingFinished (item) + if item.url.scheme in self.allowedSchemes: + item.fromLoadingFinished (kwargs) + # XXX queue both + await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab)) + return item + + async def _loadingFailed (self, **kwargs): + self.logger.info ('loadingFailed', + uuid='4a944e85-5fae-4aa6-9e7c-e578b29392e4', args=kwargs) - def _loadingFailed (self, **kwargs): reqId = kwargs['requestId'] - self.logger.warning ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason'))) + logger = self.logger.bind (reqId=reqId) item = self.requests.pop (reqId, None) - self.loadingFailed (item) + if item is not None: + item.fromLoadingFailed (kwargs) + return item - def _entryAdded (self, **kwargs): + async def _entryAdded (self, **kwargs): """ Log entry added """ entry = kwargs['entry'] - level = {'verbose': logging.DEBUG, 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR}[entry['level']] - self.logger.log (level, 'console: {}: {}'.format (entry['source'], entry['text']), extra={'raw': entry}) + level = {'verbose': Level.DEBUG, 'info': Level.INFO, + 'warning': Level.WARNING, + 'error': Level.ERROR}.get (entry.pop ('level'), Level.INFO) + entry['uuid'] = 'e62ffb5a-0521-459c-a3d9-1124551934d2' + self.logger (level, 'console', **entry) - def _javascriptDialogOpening (self, **kwargs): + async def _javascriptDialogOpening (self, **kwargs): t = kwargs.get ('type') if t in {'alert', 'confirm', 'prompt'}: - self.logger.info ('javascript opened a dialog: {}, {}, canceling'.format (t, kwargs.get ('message'))) - self.tab.Page.handleJavaScriptDialog (accept=False) + self.logger.info ('js dialog', + uuid='d6f07ce2-648e-493b-a1df-f353bed27c84', + action='cancel', type=t, message=kwargs.get ('message')) + await self.tab.Page.handleJavaScriptDialog (accept=False) elif t == 'beforeunload': # we must accept this one, otherwise the page will not unload/close - self.logger.info ('javascript opened a dialog: {}, {}, procceeding'.format (t, kwargs.get ('message'))) - self.tab.Page.handleJavaScriptDialog (accept=True) - else: - self.logger.warning ('unknown javascript dialog type {}'.format (t)) - -class AccountingSiteLoader (SiteLoader): - """ - SiteLoader that keeps basic statistics about retrieved pages. - """ - - def __init__ (self, browser, url, logger=logging.getLogger(__name__)): - super ().__init__ (browser, url, logger) - - self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} - - def loadingFinished (self, item, redirect=False): - super ().loadingFinished (item, redirect) - - self.stats['finished'] += 1 - self.stats['bytesRcv'] += item.encodedDataLength - - def loadingFailed (self, item): - super ().loadingFailed (item) - - self.stats['failed'] += 1 - - def _requestWillBeSent (self, **kwargs): - super ()._requestWillBeSent (**kwargs) - - self.stats['requests'] += 1 - -import subprocess -from tempfile import mkdtemp -import socket, shutil - -class ChromeService: - """ - Start Chrome with socket activation (i.e. pass listening socket). Polling - is not required with this method, since reads will block until Chrome is - ready. - """ - - def __init__ (self, binary='google-chrome-stable', host='localhost', port=9222, windowSize=(1920, 1080)): - self.binary = binary - self.host = host - self.port = port - self.windowSize = windowSize - self.p = None - - def __enter__ (self): - assert self.p is None - - port = self.port - while True: - s = socket.socket () - s.setsockopt (socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - try: - s.bind ((self.host, port)) - break - except OSError: - # try different port - if port < 65000: - port += 1 - else: - raise - s.listen (10) - self.userDataDir = mkdtemp () - args = [self.binary, - '--window-size={},{}'.format (*self.windowSize), - '--user-data-dir={}'.format (self.userDataDir), # use temporory user dir - '--no-default-browser-check', - '--no-first-run', # don’t show first run screen - '--disable-breakpad', # no error reports - '--disable-extensions', - '--disable-infobars', - '--disable-notifications', # no libnotify - '--headless', - '--disable-gpu', - '--hide-scrollbars', # hide scrollbars on screenshots - '--mute-audio', # don’t play any audio - '--remote-debugging-socket-fd={}'.format (s.fileno ()), - '--homepage=about:blank', - 'about:blank'] - # start new session, so ^C does not affect subprocess - self.p = subprocess.Popen (args, pass_fds=[s.fileno()], start_new_session=True, - stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL) - s.close () - - return 'http://{}:{}'.format (self.host, port) - - def __exit__ (self, *exc): - self.p.terminate () - self.p.wait () - shutil.rmtree (self.userDataDir) - self.p = None - -class NullService: - def __init__ (self, url): - self.url = url - - def __enter__ (self): - return self.url - - def __exit__ (self, *exc): - pass - -### tests ### - -import unittest, time -from http.server import BaseHTTPRequestHandler - -class TestHTTPRequestHandler (BaseHTTPRequestHandler): - encodingTestString = { - 'latin1': 'äöü', - 'utf-8': 'äöü', - 'ISO-8859-1': 'äöü', - } - binaryTestData = b'\x00\x01\x02' - # 1×1 pixel PNG - imageTestData = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82' - htmlTestData = '<html><body><img src="/image"><img src="/nonexistent"></body></html>' - alertData = '<html><body><script>window.addEventListener("beforeunload", function (e) { e.returnValue = "bye?"; return e.returnValue; }); alert("stopping here"); if (confirm("are you sure?") || prompt ("42?")) { window.location = "/nonexistent"; }</script><img src="/image"></body></html>' - - def do_GET(self): - path = self.path - if path.startswith ('/redirect/301'): - self.send_response(301) - self.send_header ('Location', path[13:]) - self.end_headers() - elif path == '/empty': - self.send_response (200) - self.end_headers () - elif path.startswith ('/encoding'): - # send text data with different encodings - _, _, encoding = path.split ('/', 3) - self.send_response (200) - self.send_header ('Content-Type', 'text/plain; charset={}'.format (encoding)) - self.end_headers () - self.wfile.write (self.encodingTestString[encoding].encode (encoding)) - elif path == '/binary': - # send binary data - self.send_response (200) - self.send_header ('Content-Type', 'application/octet-stream') - self.send_header ('Content-Length', len (self.binaryTestData)) - self.end_headers () - self.wfile.write (self.binaryTestData) - elif path == '/image': - # send binary data - self.send_response (200) - self.send_header ('Content-Type', 'image/png') - self.end_headers () - self.wfile.write (self.imageTestData) - elif path == '/attachment': - self.send_response (200) - self.send_header ('Content-Type', 'text/plain; charset=utf-8') - self.send_header ('Content-Disposition', 'attachment; filename="attachment.txt"') - self.end_headers () - self.wfile.write (self.encodingTestString['utf-8'].encode ('utf-8')) - elif path == '/html': - self.send_response (200) - self.send_header ('Content-Type', 'text/html; charset=utf-8') - self.end_headers () - self.wfile.write (self.htmlTestData.encode ('utf-8')) - elif path == '/alert': - self.send_response (200) - self.send_header ('Content-Type', 'text/html; charset=utf-8') - self.end_headers () - self.wfile.write (self.alertData.encode ('utf-8')) - else: - self.send_response (404) - self.end_headers () - - def log_message (self, format, *args): - pass - -def startServer (): - import http.server - PORT = 8000 - httpd = http.server.HTTPServer (("localhost", PORT), TestHTTPRequestHandler) - httpd.serve_forever() - -class TestSiteLoaderAdapter (SiteLoader): - def __init__ (self, browser, url): - SiteLoader.__init__ (self, browser, url) - self.finished = [] - - def loadingFinished (self, item, redirect=False): - self.finished.append (item) - -class TestSiteLoader (unittest.TestCase): - def setUp (self): - from multiprocessing import Process - self.server = Process (target=startServer) - self.server.start () - self.baseurl = 'http://localhost:8000/' - self.service = ChromeService () - browserUrl = self.service.__enter__ () - self.browser = pychrome.Browser(url=browserUrl) - - def buildAdapter (self, path): - return TestSiteLoaderAdapter (self.browser, '{}{}'.format (self.baseurl, path)) - - def assertUrls (self, l, expect): - urls = set (map (lambda x: x.parsedUrl.path, l.finished)) - expect = set (expect) - self.assertEqual (urls, expect) - - def test_wait (self): - waittime = 2 - with self.buildAdapter ('empty') as l: - l.start () - before = time.time () - l.wait (waittime) - after = time.time () - self.assertTrue ((after-before) >= waittime) - - def test_empty (self): - with self.buildAdapter ('empty') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 1) - - def test_redirect301 (self): - with self.buildAdapter ('redirect/301/empty') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 2) - self.assertUrls (l, ['/redirect/301/empty', '/empty']) - for item in l.finished: - if item.parsedUrl.path == '/empty': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], b'') - elif item.parsedUrl.path == '/redirect/301/empty': - self.assertEqual (item.response['status'], 301) - else: - self.fail ('unknown url') - - def test_redirect301multi (self): - with self.buildAdapter ('redirect/301/redirect/301/empty') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 3) - self.assertUrls (l, ['/redirect/301/redirect/301/empty', '/redirect/301/empty', '/empty']) - for item in l.finished: - if item.parsedUrl.path == '/empty': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], b'') - elif item.parsedUrl.path in {'/redirect/301/empty', \ - '/redirect/301/redirect/301/empty'}: - self.assertEqual (item.response['status'], 301) - else: - self.fail ('unknown url') - - def test_encoding (self): - """ Text responses are transformed to UTF-8. Make sure this works - correctly. """ - for encoding, expected in TestHTTPRequestHandler.encodingTestString.items (): - with self.buildAdapter ('encoding/{}'.format (encoding)) as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 1) - self.assertUrls (l, ['/encoding/{}'.format (encoding)]) - self.assertEqual (l.finished[0].body[0], expected.encode ('utf8')) - - def test_binary (self): - """ Browser should ignore content it cannot display (i.e. octet-stream) """ - with self.buildAdapter ('binary') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 0) - - def test_image (self): - """ Images should be displayed inline """ - with self.buildAdapter ('image') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 1) - self.assertUrls (l, ['/image']) - self.assertEqual (l.finished[0].body[0], TestHTTPRequestHandler.imageTestData) - - def test_attachment (self): - """ And downloads won’t work in headless mode """ - with self.buildAdapter ('attachment') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 0) - - def test_html (self): - with self.buildAdapter ('html') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 3) - self.assertUrls (l, ['/html', '/image', '/nonexistent']) - for item in l.finished: - if item.parsedUrl.path == '/html': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.htmlTestData.encode ('utf-8')) - elif item.parsedUrl.path == '/image': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.imageTestData) - elif item.parsedUrl.path == '/nonexistent': - self.assertEqual (item.response['status'], 404) - else: - self.fail ('unknown url') - - def test_alert (self): - with self.buildAdapter ('alert') as l: - l.start () - l.waitIdle () - self.assertUrls (l, ['/alert', '/image']) - for item in l.finished: - if item.parsedUrl.path == '/alert': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.alertData.encode ('utf-8')) - elif item.parsedUrl.path == '/image': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.imageTestData) - else: - self.fail ('unknown url') - - def tearDown (self): - self.service.__exit__ (None, None, None) - self.server.terminate () - self.server.join () - -if __name__ == '__main__': - import sys - if sys.argv[1] == 'server': - startServer () + self.logger.info ('js dialog', + uuid='96399b99-9834-4c8f-bd93-cb9fa2225abd', + action='proceed', type=t, message=kwargs.get ('message')) + await self.tab.Page.handleJavaScriptDialog (accept=True) + else: # pragma: no cover + self.logger.warning ('js dialog unknown', + uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs) + + async def _frameStartedLoading (self, **kwargs): + self.logger.debug ('frameStartedLoading', + uuid='bbeb39c0-3304-4221-918e-f26bd443c566', args=kwargs) + + self._framesLoading.add (kwargs['frameId']) + return PageIdle (False) + + async def _frameStoppedLoading (self, **kwargs): + self.logger.debug ('frameStoppedLoading', + uuid='fcbe8110-511c-4cbb-ac2b-f61a5782c5a0', args=kwargs) + + self._framesLoading.remove (kwargs['frameId']) + if not self._framesLoading: + return PageIdle (True) + + async def _frameNavigated (self, **kwargs): + self.logger.debug ('frameNavigated', + uuid='0e876f7d-7129-4612-8632-686f42ac6e1f', args=kwargs) + frame = kwargs['frame'] + if self._rootFrame == frame['id']: + assert frame.get ('parentId', None) is None, "root frame must not have a parent" + return FrameNavigated (frame['id'], frame['url'], frame['mimeType']) |