From 60fe79f2d898757f4f20aa89015e86cd63ef7871 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 3 Nov 2018 13:32:38 +0100 Subject: Switch site loader to async DevTools communication --- crocoite/browser.py | 233 +++++++++++++++++++++++++---------------------- crocoite/test_browser.py | 232 ++++++++++++++++++++++------------------------ 2 files changed, 236 insertions(+), 229 deletions(-) diff --git a/crocoite/browser.py b/crocoite/browser.py index b5ea4e3..515d06b 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -22,30 +22,32 @@ Chrome browser interactions. """ +import asyncio from urllib.parse import urlsplit from base64 import b64decode from collections import deque from threading import Event from http.server import BaseHTTPRequestHandler -from .logger import Level -import pychrome +from .logger import Level +from .devtools import Browser, TabException class Item: """ Simple wrapper containing Chrome request and response """ - __slots__ = ('tab', 'chromeRequest', 'chromeResponse', 'chromeFinished', - 'isRedirect', 'failed') + __slots__ = ('chromeRequest', 'chromeResponse', 'chromeFinished', + 'isRedirect', 'failed', 'body', 'requestBody') def __init__ (self, tab): - self.tab = tab self.chromeRequest = {} self.chromeResponse = {} self.chromeFinished = {} self.isRedirect = False self.failed = False + self.body = None + self.requestBody = None def __repr__ (self): return ''.format (self.url) @@ -78,36 +80,6 @@ class Item: def parsedUrl (self): return urlsplit (self.url) - @property - def body (self): - """ Return response body or None """ - try: - body = self.tab.Network.getResponseBody (requestId=self.id, _timeout=10) - rawBody = body['body'] - base64Encoded = body['base64Encoded'] - if base64Encoded: - rawBody = b64decode (rawBody) - else: - rawBody = rawBody.encode ('utf8') - return rawBody, base64Encoded - except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException): - raise ValueError ('Cannot fetch response body') - - @property - def requestBody (self): - """ Get request/POST body """ - req = self.request - postData = req.get ('postData') - if postData: - return postData.encode ('utf8'), False - elif req.get ('hasPostData', False): - try: - postData = self.tab.Network.getRequestPostData (requestId=self.id, _timeout=10)['postData'] - return b64decode (postData), True - except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException): - raise ValueError ('Cannot fetch request body') - return None, False - @property def requestHeaders (self): # the response object may contain refined headers, which were @@ -153,87 +125,131 @@ class Item: def setFinished (self, finished): self.chromeFinished = finished -class BrowserCrashed (Exception): - pass + async def prefetchRequestBody (self, tab): + # request body + req = self.request + postData = req.get ('postData') + if postData: + self.requestBody = postData.encode ('utf8'), False + elif req.get ('hasPostData', False): + try: + postData = await tab.Network.getRequestPostData (requestId=self.id) + postData = postData['postData'] + self.requestBody = b64decode (postData), True + except TabException: + self.requestBody = None + else: + self.requestBody = None, False + + async def prefetchResponseBody (self, tab): + # get response body + try: + body = await tab.Network.getResponseBody (requestId=self.id) + rawBody = body['body'] + base64Encoded = body['base64Encoded'] + if base64Encoded: + rawBody = b64decode (rawBody) + else: + rawBody = rawBody.encode ('utf8') + self.body = rawBody, base64Encoded + except TabException: + self.body = None class SiteLoader: """ Load site in Chrome and monitor network requests - Chrome’s raw devtools events are preprocessed here (asynchronously, in a - different thread, spawned by pychrome) and put into a deque. There - are two reasons for this: First of all, it makes consumer exception - handling alot easier (no need to propagate them to the main thread). And - secondly, browser crashes must be handled before everything else, as they - result in a loss of communication with the browser itself (i.e. we can’t - fetch a resource’s body any more). - XXX: track popup windows/new tabs and close them """ - __slots__ = ('requests', 'browser', 'url', 'logger', 'queue', 'notify', 'tab') + __slots__ = ('requests', 'browser', 'url', 'logger', 'tab', '_iterRunning') allowedSchemes = {'http', 'https'} def __init__ (self, browser, url, logger): self.requests = {} - self.browser = pychrome.Browser (url=browser) + self.browser = Browser (url=browser) self.url = url self.logger = logger.bind (context=type (self).__name__, url=url) - self.queue = deque () - self.notify = Event () + self._iterRunning = [] - def __enter__ (self): - tab = self.tab = self.browser.new_tab() - # setup callbacks - tab.Network.requestWillBeSent = self._requestWillBeSent - tab.Network.responseReceived = self._responseReceived - tab.Network.loadingFinished = self._loadingFinished - tab.Network.loadingFailed = self._loadingFailed - tab.Log.entryAdded = self._entryAdded - tab.Page.javascriptDialogOpening = self._javascriptDialogOpening - tab.Inspector.targetCrashed = self._targetCrashed - - # start the tab - tab.start() + async def __aenter__ (self): + tab = self.tab = await self.browser.__aenter__ () # enable events - tab.Log.enable () - tab.Network.enable() - tab.Page.enable () - tab.Inspector.enable () - tab.Network.clearBrowserCache () - if tab.Network.canClearBrowserCookies ()['result']: - tab.Network.clearBrowserCookies () + await asyncio.gather (*[ + tab.Log.enable (), + tab.Network.enable(), + tab.Page.enable (), + tab.Inspector.enable (), + tab.Network.clearBrowserCache (), + ]) + resp = await tab.Network.canClearBrowserCookies () + if resp['result']: + await tab.Network.clearBrowserCookies () return self - def __exit__ (self, exc_type, exc_value, traceback): - self.tab.Page.stopLoading () - self.tab.stop () - self.browser.close_tab(self.tab) + async def __aexit__ (self, exc_type, exc_value, traceback): + for task in self._iterRunning: + # ignore any results from stuff we did not end up using anyway + if not task.done (): + task.cancel () + self._iterRunning = [] + await self.browser.__aexit__ (exc_type, exc_value, traceback) + self.tab = None return False def __len__ (self): return len (self.requests) - def __iter__ (self): - return iter (self.queue) - - def start (self): - self.tab.Page.navigate(url=self.url) - - # use event to signal presence of new items. This way the controller - # can wait for them without polling. - def _append (self, item): - self.queue.append (item) - self.notify.set () - - def _appendleft (self, item): - self.queue.appendleft (item) - self.notify.set () + async def __aiter__ (self): + """ Retrieve network items """ + tab = self.tab + assert tab is not None + handler = { + tab.Network.requestWillBeSent: self._requestWillBeSent, + tab.Network.responseReceived: self._responseReceived, + tab.Network.loadingFinished: self._loadingFinished, + tab.Network.loadingFailed: self._loadingFailed, + tab.Log.entryAdded: self._entryAdded, + tab.Page.javascriptDialogOpening: self._javascriptDialogOpening, + #tab.Inspector.targetCrashed: self._targetCrashed, + } + + # The implementation is a little advanced. Why? The goal here is to + # process events from the tab as quickly as possible (i.e. + # asynchronously). We need to make sure that JavaScript dialogs are + # handled immediately for instance. Otherwise they stall every + # other request. Also, we don’t want to use an unbounded queue, + # since the items yielded can get quite big (response body). Thus + # we need to block (yield) for every item completed, but not + # handled by the consumer (caller). + running = self._iterRunning + running.append (asyncio.ensure_future (self.tab.get ())) + while True: + done, pending = await asyncio.wait (running, return_when=asyncio.FIRST_COMPLETED) + for t in done: + result = t.result () + if result is None: + pass + elif isinstance (result, Item): + yield result + else: + method, data = result + f = handler.get (method, None) + if f is not None: + task = asyncio.ensure_future (f (**data)) + pending.add (task) + pending.add (asyncio.ensure_future (self.tab.get ())) + + running = pending + self._iterRunning = running + + async def start (self): + await self.tab.Page.navigate(url=self.url) # internal chrome callbacks - def _requestWillBeSent (self, **kwargs): + async def _requestWillBeSent (self, **kwargs): reqId = kwargs['requestId'] req = kwargs['request'] logger = self.logger.bind (reqId=reqId, reqUrl=req['url']) @@ -242,6 +258,7 @@ class SiteLoader: if url.scheme not in self.allowedSchemes: return + ret = None item = self.requests.get (reqId) if item: # redirects never “finish” loading, but yield another requestWillBeSent with this key set @@ -254,7 +271,9 @@ class SiteLoader: item.setFinished (resp) item.isRedirect = True logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=req['url']) - self._append (item) + await item.prefetchRequestBody (self.tab) + # cannot fetch request body due to race condition (item id reused) + ret = item else: logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b') @@ -263,7 +282,9 @@ class SiteLoader: self.requests[reqId] = item logger.debug ('request', uuid='55c17564-1bd0-4499-8724-fa7aad65478f') - def _responseReceived (self, **kwargs): + return ret + + async def _responseReceived (self, **kwargs): reqId = kwargs['requestId'] item = self.requests.get (reqId) if item is None: @@ -278,7 +299,7 @@ class SiteLoader: else: logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666') - def _loadingFinished (self, **kwargs): + async def _loadingFinished (self, **kwargs): """ Item was fully loaded. For some items the request body is not available when responseReceived is fired, thus move everything here. @@ -297,9 +318,10 @@ class SiteLoader: if url.scheme in self.allowedSchemes: logger.info ('finished', uuid='5a8b4bad-f86a-4fe6-a53e-8da4130d6a02') item.setFinished (kwargs) - self._append (item) + await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab)) + return item - def _loadingFailed (self, **kwargs): + async def _loadingFailed (self, **kwargs): reqId = kwargs['requestId'] self.logger.warning ('loading failed', uuid='68410f13-6eea-453e-924e-c1af4601748b', @@ -307,9 +329,9 @@ class SiteLoader: blockedReason=kwargs.get ('blockedReason')) item = self.requests.pop (reqId, None) item.failed = True - self._append (item) + return item - def _entryAdded (self, **kwargs): + async def _entryAdded (self, **kwargs): """ Log entry added """ entry = kwargs['entry'] level = {'verbose': Level.DEBUG, 'info': Level.INFO, @@ -318,26 +340,22 @@ class SiteLoader: entry['uuid'] = 'e62ffb5a-0521-459c-a3d9-1124551934d2' self.logger (level, 'console', **entry) - def _javascriptDialogOpening (self, **kwargs): + async def _javascriptDialogOpening (self, **kwargs): t = kwargs.get ('type') if t in {'alert', 'confirm', 'prompt'}: self.logger.info ('js dialog', uuid='d6f07ce2-648e-493b-a1df-f353bed27c84', action='cancel', type=t, message=kwargs.get ('message')) - self.tab.Page.handleJavaScriptDialog (accept=False) + await self.tab.Page.handleJavaScriptDialog (accept=False) elif t == 'beforeunload': # we must accept this one, otherwise the page will not unload/close self.logger.info ('js dialog', uuid='96399b99-9834-4c8f-bd93-cb9fa2225abd', action='proceed', type=t, message=kwargs.get ('message')) - self.tab.Page.handleJavaScriptDialog (accept=True) - else: - self.logger.warning ('js dialog unknown', uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs) - - def _targetCrashed (self, **kwargs): - self.logger.error ('browser crashed', uuid='6fe2b3be-ff01-4503-b30c-ad6aeea953ef') - # priority message - self._appendleft (BrowserCrashed ()) + await self.tab.Page.handleJavaScriptDialog (accept=True) + else: # pragma: no cover + self.logger.warning ('js dialog unknown', + uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs) import subprocess, os, time from tempfile import mkdtemp @@ -397,6 +415,7 @@ class ChromeService: self.p.wait () shutil.rmtree (self.userDataDir) self.p = None + return False class NullService: __slots__ = ('url') @@ -408,5 +427,5 @@ class NullService: return self.url def __exit__ (self, *exc): - pass + return False diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py index 5c7fc69..030ffb1 100644 --- a/crocoite/test_browser.py +++ b/crocoite/test_browser.py @@ -18,13 +18,19 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. +import logging +import asyncio import pytest from operator import itemgetter +from aiohttp import web from http.server import BaseHTTPRequestHandler -from pychrome.exceptions import TimeoutException -from .browser import Item, SiteLoader, ChromeService, NullService, BrowserCrashed -from .logger import Logger, Consumer +from .browser import Item, SiteLoader, ChromeService, NullService +from .logger import Logger, Consumer, JsonPrintConsumer +from .devtools import Crashed + +# if you want to know what’s going on: +#logging.basicConfig(level=logging.DEBUG) class TItem (Item): """ This should be as close to Item as possible """ @@ -32,21 +38,14 @@ class TItem (Item): __slots__ = ('bodySend', '_body', '_requestBody') base = 'http://localhost:8000/' - def __init__ (self, path, status, headers, bodyReceive, bodySend=None, requestBody=None, failed=False): + def __init__ (self, path, status, headers, bodyReceive, bodySend=None, requestBody=None, failed=False, isRedirect=False): super ().__init__ (tab=None) self.chromeResponse = {'response': {'headers': headers, 'status': status, 'url': self.base + path}} - self._body = bodyReceive, False + self.body = bodyReceive, False self.bodySend = bodyReceive if not bodySend else bodySend - self._requestBody = requestBody, False + self.requestBody = requestBody, False self.failed = failed - - @property - def body (self): - return self._body - - @property - def requestBody (self): - return self._requestBody + self.isRedirect = isRedirect testItems = [ TItem ('binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00\x01\x02', failed=True), @@ -66,15 +65,15 @@ testItems = [ TItem ('image', 200, {'Content-Type': 'image/png'}, # 1×1 png image b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'), - TItem ('empty', 200, {}, b''), - TItem ('redirect/301/empty', 301, {'Location': '/empty'}, b''), - TItem ('redirect/301/redirect/301/empty', 301, {'Location': '/redirect/301/empty'}, b''), + TItem ('empty', 200, {'Content-Type': 'text/plain'}, b''), + TItem ('redirect/301/empty', 301, {'Location': '/empty'}, b'', isRedirect=True), + TItem ('redirect/301/redirect/301/empty', 301, {'Location': '/redirect/301/empty'}, b'', isRedirect=True), TItem ('nonexistent', 404, {}, b''), - TItem ('html', 200, {'Content-Type': 'html'}, + TItem ('html', 200, {'Content-Type': 'text/html'}, ''.encode ('utf8')), - TItem ('html/alert', 200, {'Content-Type': 'html'}, - ''.encode ('utf8')), - TItem ('html/fetchPost', 200, {'Content-Type': 'html'}, + TItem ('html/alert', 200, {'Content-Type': 'text/html'}, + ''.encode ('utf8')), + TItem ('html/fetchPost', 200, {'Content-Type': 'text/html'}, r"""