From 7730e0d64ec895091a0dd7eb0e3c6ce2ed02d981 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Wed, 20 Jun 2018 11:13:37 +0200 Subject: Synchronous SiteLoader event handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously a browser crash stalled the entire grab, since events from pychrome were handled asynchronously in a different thread and exceptions were not propagated to the main thread. Now all browser events are stored in a queue and processed by the main thread, allowing us to handle browser crashes gracefully (more or less). This made the following additional changes necessary: - Clear separation between producer (browser) and consumer (WARC, stats, …) - Behavior scripts now yield events as well, instead of accessing the WARC writer - WARC logging was removed (for now) and WARC writer does not require serialization any more --- crocoite/browser.py | 417 +++++++++++++++++++++------------------------------- 1 file changed, 165 insertions(+), 252 deletions(-) (limited to 'crocoite/browser.py') diff --git a/crocoite/browser.py b/crocoite/browser.py index a891ce7..57d0dd0 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -25,6 +25,10 @@ Chrome browser interactions. import logging from urllib.parse import urlsplit from base64 import b64decode +from http.server import BaseHTTPRequestHandler +from collections import deque +from threading import Event + import pychrome class Item: @@ -37,6 +41,8 @@ class Item: self.chromeRequest = None self.chromeResponse = None self.chromeFinished = None + self.isRedirect = False + self.failed = False def __repr__ (self): return ''.format (self.request['url']) @@ -47,6 +53,7 @@ class Item: @property def response (self): + assert not self.failed, "you must not access response if failed is set" return self.chromeResponse['response'] @property @@ -73,7 +80,7 @@ class Item: def body (self): """ Return response body or None """ try: - body = self.tab.Network.getResponseBody (requestId=self.id, _timeout=60) + body = self.tab.Network.getResponseBody (requestId=self.id, _timeout=10) rawBody = body['body'] base64Encoded = body['base64Encoded'] if base64Encoded: @@ -93,7 +100,7 @@ class Item: return postData.encode ('utf8'), False elif req.get ('hasPostData', False): try: - return b64decode (self.tab.Network.getRequestPostData (requestId=self.id, _timeout=60)['postData']), True + return b64decode (self.tab.Network.getRequestPostData (requestId=self.id, _timeout=10)['postData']), True except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException): raise ValueError ('Cannot fetch request body') return None, False @@ -108,6 +115,16 @@ class Item: def responseHeaders (self): return self._unfoldHeaders (self.response['headers']) + @property + def statusText (self): + text = self.response.get ('statusText') + if text: + return text + text = BaseHTTPRequestHandler.responses.get (self.response['status']) + if text: + return text[0] + return 'No status text available' + @staticmethod def _unfoldHeaders (headers): """ @@ -129,10 +146,21 @@ class Item: def setFinished (self, finished): self.chromeFinished = finished +class BrowserCrashed (Exception): + pass + class SiteLoader: """ Load site in Chrome and monitor network requests + Chrome’s raw devtools events are preprocessed here (asynchronously, in a + different thread, spawned by pychrome) and put into a deque. There + are two reasons for this: First of all, it makes consumer exception + handling alot easier (no need to propagate them to the main thread). And + secondly, browser crashes must be handled before everything else, as they + result in a loss of communication with the browser itself (i.e. we can’t + fetch a resource’s body any more). + XXX: track popup windows/new tabs and close them """ @@ -140,22 +168,22 @@ class SiteLoader: def __init__ (self, browser, url, logger=logging.getLogger(__name__)): self.requests = {} - self.browser = browser + self.browser = pychrome.Browser (url=browser) self.url = url self.logger = logger - - self.tab = browser.new_tab() + self.queue = deque () + self.notify = Event () def __enter__ (self): - tab = self.tab + tab = self.tab = self.browser.new_tab() # setup callbacks tab.Network.requestWillBeSent = self._requestWillBeSent tab.Network.responseReceived = self._responseReceived tab.Network.loadingFinished = self._loadingFinished tab.Network.loadingFailed = self._loadingFailed tab.Log.entryAdded = self._entryAdded - #tab.Page.loadEventFired = loadEventFired tab.Page.javascriptDialogOpening = self._javascriptDialogOpening + tab.Inspector.targetCrashed = self._targetCrashed # start the tab tab.start() @@ -164,66 +192,37 @@ class SiteLoader: tab.Log.enable () tab.Network.enable() tab.Page.enable () + tab.Inspector.enable () tab.Network.clearBrowserCache () if tab.Network.canClearBrowserCookies ()['result']: tab.Network.clearBrowserCookies () return self + def __exit__ (self, exc_type, exc_value, traceback): + self.tab.Page.stopLoading () + self.tab.stop () + self.browser.close_tab(self.tab) + return False + def __len__ (self): return len (self.requests) + def __iter__ (self): + return iter (self.queue) + def start (self): self.tab.Page.navigate(url=self.url) - def wait (self, timeout=1): - self.tab.wait (timeout) - - def waitIdle (self, idleTimeout=1, maxTimeout=60): - step = 0 - for i in range (0, maxTimeout): - self.wait (1) - if len (self) == 0: - step += 1 - if step > idleTimeout: - break - else: - step = 0 - - def stop (self): - """ - Stop loading site - - XXX: stop executing scripts - """ + # use event to signal presence of new items. This way the controller + # can wait for them without polling. + def _append (self, item): + self.queue.append (item) + self.notify.set () - tab = self.tab - - tab.Page.stopLoading () - tab.Network.disable () - tab.Page.disable () - tab.Log.disable () - # XXX: we can’t drain the event queue directly, so insert (yet another) wait - tab.wait (1) - tab.Network.requestWillBeSent = None - tab.Network.responseReceived = None - tab.Network.loadingFinished = None - tab.Network.loadingFailed = None - tab.Page.loadEventFired = None - tab.Page.javascriptDialogOpening = None - tab.Log.entryAdded = None - - def __exit__ (self, exc_type, exc_value, traceback): - self.tab.stop () - self.browser.close_tab(self.tab) - return False - - # overrideable callbacks - def loadingFinished (self, item, redirect=False): - pass - - def loadingFailed (self, item): - pass + def _appendleft (self, item): + self.queue.appendleft (item) + self.notify.set () # internal chrome callbacks def _requestWillBeSent (self, **kwargs): @@ -244,8 +243,9 @@ class SiteLoader: item.setResponse (resp) resp = {'requestId': reqId, 'encodedDataLength': 0, 'timestamp': kwargs['timestamp']} item.setFinished (resp) - self.loadingFinished (item, redirect=True) + item.isRedirect = True self.logger.info ('redirected request {} has url {}'.format (reqId, req['url'])) + self._append (item) else: self.logger.warning ('request {} already exists, overwriting.'.format (reqId)) @@ -284,13 +284,14 @@ class SiteLoader: if url.scheme in self.allowedSchemes: self.logger.info ('finished {} {}'.format (reqId, req['url'])) item.setFinished (kwargs) - self.loadingFinished (item) + self._append (item) def _loadingFailed (self, **kwargs): reqId = kwargs['requestId'] self.logger.warning ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason'))) item = self.requests.pop (reqId, None) - self.loadingFailed (item) + item.failed = True + self._append (item) def _entryAdded (self, **kwargs): """ Log entry added """ @@ -312,31 +313,10 @@ class SiteLoader: else: self.logger.warning ('unknown javascript dialog type {}'.format (t)) -class AccountingSiteLoader (SiteLoader): - """ - SiteLoader that keeps basic statistics about retrieved pages. - """ - - def __init__ (self, browser, url, logger=logging.getLogger(__name__)): - super ().__init__ (browser, url, logger) - - self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} - - def loadingFinished (self, item, redirect=False): - super ().loadingFinished (item, redirect) - - self.stats['finished'] += 1 - self.stats['bytesRcv'] += item.encodedDataLength - - def loadingFailed (self, item): - super ().loadingFailed (item) - - self.stats['failed'] += 1 - - def _requestWillBeSent (self, **kwargs): - super ()._requestWillBeSent (**kwargs) - - self.stats['requests'] += 1 + def _targetCrashed (self, **kwargs): + self.logger.error ('browser crashed') + # priority message + self._appendleft (BrowserCrashed ()) import subprocess, os, time from tempfile import mkdtemp @@ -357,7 +337,6 @@ class ChromeService: def __enter__ (self): assert self.p is None self.userDataDir = mkdtemp () - print (self.userDataDir) args = [self.binary, '--window-size={},{}'.format (*self.windowSize), '--user-data-dir={}'.format (self.userDataDir), # use temporory user dir @@ -413,69 +392,65 @@ class NullService: ### tests ### import unittest, time -from http.server import BaseHTTPRequestHandler +from operator import itemgetter -class TestHTTPRequestHandler (BaseHTTPRequestHandler): - encodingTestString = { - 'latin1': 'äöü', - 'utf-8': 'äöü', - 'ISO-8859-1': 'äöü', - } - binaryTestData = b'\x00\x01\x02' - # 1×1 pixel PNG - imageTestData = b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82' - htmlTestData = '' - alertData = '' +class TestItem (Item): + """ This should be as close to Item as possible """ + + base = 'http://localhost:8000/' + + def __init__ (self, path, status, headers, bodyReceive, bodySend=None): + super ().__init__ (tab=None) + self.chromeResponse = {'response': {'headers': headers, 'status': status, 'url': self.base + path}} + self._body = bodyReceive, False + self.bodySend = bodyReceive if not bodySend else bodySend + @property + def body (self): + return self._body + +testItems = [ + TestItem ('binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00\x01\x02'), + TestItem ('attachment', 200, + {'Content-Type': 'text/plain; charset=utf-8', + 'Content-Disposition': 'attachment; filename="attachment.txt"', + }, + 'This is a simple text file with umlauts. ÄÖU.'.encode ('utf8')), + TestItem ('encoding/utf8', 200, {'Content-Type': 'text/plain; charset=utf-8'}, + 'This is a test, äöü μνψκ ¥¥¥¿ýý¡'.encode ('utf8')), + TestItem ('encoding/iso88591', 200, {'Content-Type': 'text/plain; charset=ISO-8859-1'}, + 'This is a test, äöü.'.encode ('utf8'), + 'This is a test, äöü.'.encode ('ISO-8859-1')), + TestItem ('encoding/latin1', 200, {'Content-Type': 'text/plain; charset=latin1'}, + 'This is a test, äöü.'.encode ('utf8'), + 'This is a test, äöü.'.encode ('latin1')), + TestItem ('image', 200, {'Content-Type': 'image/png'}, + # 1×1 png image + b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x00\x00\x00\nIDAT\x08\x1dc\xf8\x0f\x00\x01\x01\x01\x006_g\x80\x00\x00\x00\x00IEND\xaeB`\x82'), + TestItem ('empty', 200, {}, b''), + TestItem ('redirect/301/empty', 301, {'Location': '/empty'}, b''), + TestItem ('redirect/301/redirect/301/empty', 301, {'Location': '/redirect/301/empty'}, b''), + TestItem ('nonexistent', 404, {}, b''), + TestItem ('html', 200, {'Content-Type': 'html'}, + ''.encode ('utf8')), + TestItem ('html/alert', 200, {'Content-Type': 'html'}, + ''.encode ('utf8')), + ] +testItemMap = dict ([(item.parsedUrl.path, item) for item in testItems]) + +class TestHTTPRequestHandler (BaseHTTPRequestHandler): def do_GET(self): - path = self.path - if path.startswith ('/redirect/301'): - self.send_response(301) - self.send_header ('Location', path[13:]) + item = testItemMap.get (self.path) + if item: + self.send_response (item.response['status']) + for k, v in item.response['headers'].items (): + self.send_header (k, v) + body = item.bodySend + self.send_header ('Content-Length', len (body)) self.end_headers() - elif path == '/empty': - self.send_response (200) - self.end_headers () - elif path.startswith ('/encoding'): - # send text data with different encodings - _, _, encoding = path.split ('/', 3) - self.send_response (200) - self.send_header ('Content-Type', 'text/plain; charset={}'.format (encoding)) - self.end_headers () - self.wfile.write (self.encodingTestString[encoding].encode (encoding)) - elif path == '/binary': - # send binary data - self.send_response (200) - self.send_header ('Content-Type', 'application/octet-stream') - self.send_header ('Content-Length', len (self.binaryTestData)) - self.end_headers () - self.wfile.write (self.binaryTestData) - elif path == '/image': - # send binary data - self.send_response (200) - self.send_header ('Content-Type', 'image/png') - self.end_headers () - self.wfile.write (self.imageTestData) - elif path == '/attachment': - self.send_response (200) - self.send_header ('Content-Type', 'text/plain; charset=utf-8') - self.send_header ('Content-Disposition', 'attachment; filename="attachment.txt"') - self.end_headers () - self.wfile.write (self.encodingTestString['utf-8'].encode ('utf-8')) - elif path == '/html': - self.send_response (200) - self.send_header ('Content-Type', 'text/html; charset=utf-8') - self.end_headers () - self.wfile.write (self.htmlTestData.encode ('utf-8')) - elif path == '/alert': - self.send_response (200) - self.send_header ('Content-Type', 'text/html; charset=utf-8') - self.end_headers () - self.wfile.write (self.alertData.encode ('utf-8')) - else: - self.send_response (404) - self.end_headers () - + self.wfile.write (body) + return + def log_message (self, format, *args): pass @@ -485,144 +460,82 @@ def startServer (): httpd = http.server.HTTPServer (("localhost", PORT), TestHTTPRequestHandler) httpd.serve_forever() -class TestSiteLoaderAdapter (SiteLoader): - def __init__ (self, browser, url): - SiteLoader.__init__ (self, browser, url) - self.finished = [] - - def loadingFinished (self, item, redirect=False): - self.finished.append (item) - class TestSiteLoader (unittest.TestCase): def setUp (self): from multiprocessing import Process self.server = Process (target=startServer) self.server.start () - self.baseurl = 'http://localhost:8000/' + self.baseurl = 'http://localhost:8000' self.service = ChromeService () - browserUrl = self.service.__enter__ () - self.browser = pychrome.Browser(url=browserUrl) + self.browser = self.service.__enter__ () def buildAdapter (self, path): - return TestSiteLoaderAdapter (self.browser, '{}{}'.format (self.baseurl, path)) - - def assertUrls (self, l, expect): - urls = set (map (lambda x: x.parsedUrl.path, l.finished)) - expect = set (expect) - self.assertEqual (urls, expect) - - def test_wait (self): - waittime = 2 - with self.buildAdapter ('empty') as l: + self.assertTrue (path.startswith ('/')) + return SiteLoader (self.browser, '{}{}'.format (self.baseurl, path)) + + def assertItems (self, l, items): + items = dict ([(i.parsedUrl.path, i) for i in items]) + timeout = 5 + while True: + if not l.notify.wait (timeout) and len (items) > 0: + self.fail ('timeout') + if len (l.queue) > 0: + item = l.queue.popleft () + if isinstance (item, Exception): + raise item + self.assertIsNot (item.chromeResponse, None, msg='url={}'.format (item.request['url'])) + golden = items.pop (item.parsedUrl.path) + if not golden: + self.fail ('url {} not supposed to be fetched'.format (item.url)) + self.assertEqual (item.body[0], golden.body[0], msg='body for url={}'.format (item.request['url'])) + self.assertEqual (item.response['status'], golden.response['status']) + for k, v in golden.responseHeaders: + actual = list (map (itemgetter (1), filter (lambda x: x[0] == k, item.responseHeaders))) + self.assertIn (v, actual) + + # check queue at least once + if not items: + break + + def assertLiteralItem (self, item, deps=[]): + with self.buildAdapter (item.parsedUrl.path) as l: l.start () - before = time.time () - l.wait (waittime) - after = time.time () - self.assertTrue ((after-before) >= waittime) + self.assertItems (l, [item] + deps) def test_empty (self): - with self.buildAdapter ('empty') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 1) + self.assertLiteralItem (testItemMap['/empty']) - def test_redirect301 (self): - with self.buildAdapter ('redirect/301/empty') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 2) - self.assertUrls (l, ['/redirect/301/empty', '/empty']) - for item in l.finished: - if item.parsedUrl.path == '/empty': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], b'') - elif item.parsedUrl.path == '/redirect/301/empty': - self.assertEqual (item.response['status'], 301) - else: - self.fail ('unknown url') - - def test_redirect301multi (self): - with self.buildAdapter ('redirect/301/redirect/301/empty') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 3) - self.assertUrls (l, ['/redirect/301/redirect/301/empty', '/redirect/301/empty', '/empty']) - for item in l.finished: - if item.parsedUrl.path == '/empty': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], b'') - elif item.parsedUrl.path in {'/redirect/301/empty', \ - '/redirect/301/redirect/301/empty'}: - self.assertEqual (item.response['status'], 301) - else: - self.fail ('unknown url') + def test_redirect (self): + self.assertLiteralItem (testItemMap['/redirect/301/empty'], [testItemMap['/empty']]) + # chained redirects + self.assertLiteralItem (testItemMap['/redirect/301/redirect/301/empty'], [testItemMap['/redirect/301/empty'], testItemMap['/empty']]) def test_encoding (self): """ Text responses are transformed to UTF-8. Make sure this works correctly. """ - for encoding, expected in TestHTTPRequestHandler.encodingTestString.items (): - with self.buildAdapter ('encoding/{}'.format (encoding)) as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 1) - self.assertUrls (l, ['/encoding/{}'.format (encoding)]) - self.assertEqual (l.finished[0].body[0], expected.encode ('utf8')) + for item in {testItemMap['/encoding/utf8'], testItemMap['/encoding/latin1'], testItemMap['/encoding/iso88591']}: + self.assertLiteralItem (item) def test_binary (self): """ Browser should ignore content it cannot display (i.e. octet-stream) """ - with self.buildAdapter ('binary') as l: + with self.buildAdapter ('/binary') as l: l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 0) + self.assertItems (l, []) def test_image (self): """ Images should be displayed inline """ - with self.buildAdapter ('image') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 1) - self.assertUrls (l, ['/image']) - self.assertEqual (l.finished[0].body[0], TestHTTPRequestHandler.imageTestData) + self.assertLiteralItem (testItemMap['/image']) def test_attachment (self): - """ And downloads won’t work in headless mode """ - with self.buildAdapter ('attachment') as l: + """ And downloads won’t work in headless mode, even if it’s just a text file """ + with self.buildAdapter ('/attachment') as l: l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 0) + self.assertItems (l, []) def test_html (self): - with self.buildAdapter ('html') as l: - l.start () - l.waitIdle () - self.assertEqual (len (l.finished), 3) - self.assertUrls (l, ['/html', '/image', '/nonexistent']) - for item in l.finished: - if item.parsedUrl.path == '/html': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.htmlTestData.encode ('utf-8')) - elif item.parsedUrl.path == '/image': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.imageTestData) - elif item.parsedUrl.path == '/nonexistent': - self.assertEqual (item.response['status'], 404) - else: - self.fail ('unknown url') - - def test_alert (self): - with self.buildAdapter ('alert') as l: - l.start () - l.waitIdle () - self.assertUrls (l, ['/alert', '/image']) - for item in l.finished: - if item.parsedUrl.path == '/alert': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.alertData.encode ('utf-8')) - elif item.parsedUrl.path == '/image': - self.assertEqual (item.response['status'], 200) - self.assertEqual (item.body[0], TestHTTPRequestHandler.imageTestData) - else: - self.fail ('unknown url') + self.assertLiteralItem (testItemMap['/html'], [testItemMap['/image'], testItemMap['/nonexistent']]) + # make sure alerts are dismissed correctly (image won’t load otherwise) + self.assertLiteralItem (testItemMap['/html/alert'], [testItemMap['/image']]) def tearDown (self): self.service.__exit__ (None, None, None) -- cgit v1.2.3