From fabd84cb10beab2b2e5aed7489fc04df9fda7e83 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 4 Aug 2018 15:31:12 +0200 Subject: Properly handle failure to retrieve request body MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just truncate the WARC record like we do with responses. Also add a few tests, but they’re not covering the call to getRequestPostData. Not sure what we have to do here. --- crocoite/browser.py | 4 +++- crocoite/test_browser.py | 35 ++++++++++++++++++++++++++++++++--- crocoite/warc.py | 16 +++++++++++++++- 3 files changed, 50 insertions(+), 5 deletions(-) (limited to 'crocoite') diff --git a/crocoite/browser.py b/crocoite/browser.py index fbd12fd..c3ef5ce 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -103,7 +103,8 @@ class Item: return postData.encode ('utf8'), False elif req.get ('hasPostData', False): try: - return b64decode (self.tab.Network.getRequestPostData (requestId=self.id, _timeout=10)['postData']), True + postData = self.tab.Network.getRequestPostData (requestId=self.id, _timeout=10)['postData'] + return b64decode (postData), True except (pychrome.exceptions.CallMethodException, pychrome.exceptions.TimeoutException): raise ValueError ('Cannot fetch request body') return None, False @@ -315,6 +316,7 @@ class SiteLoader: level = {'verbose': Level.DEBUG, 'info': Level.INFO, 'warning': Level.WARNING, 'error': Level.ERROR}.get (entry.pop ('level'), Level.INFO) + entry['uuid'] = 'e62ffb5a-0521-459c-a3d9-1124551934d2' self.logger (level, 'console', **entry) def _javascriptDialogOpening (self, **kwargs): diff --git a/crocoite/test_browser.py b/crocoite/test_browser.py index dfcd71c..483a298 100644 --- a/crocoite/test_browser.py +++ b/crocoite/test_browser.py @@ -29,19 +29,24 @@ from .logger import Logger, Consumer class TItem (Item): """ This should be as close to Item as possible """ - __slots__ = ('bodySend', '_body') + __slots__ = ('bodySend', '_body', '_requestBody') base = 'http://localhost:8000/' - def __init__ (self, path, status, headers, bodyReceive, bodySend=None): + def __init__ (self, path, status, headers, bodyReceive, bodySend=None, requestBody=None): super ().__init__ (tab=None) self.chromeResponse = {'response': {'headers': headers, 'status': status, 'url': self.base + path}} self._body = bodyReceive, False self.bodySend = bodyReceive if not bodySend else bodySend + self._requestBody = requestBody, False @property def body (self): return self._body + @property + def requestBody (self): + return self._requestBody + testItems = [ TItem ('binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00\x01\x02'), TItem ('attachment', 200, @@ -68,6 +73,18 @@ testItems = [ ''.encode ('utf8')), TItem ('html/alert', 200, {'Content-Type': 'html'}, ''.encode ('utf8')), + TItem ('html/fetchPost', 200, {'Content-Type': 'html'}, + r"""""".encode ('utf8')), + TItem ('html/fetchPost/binary', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'\x00'), + TItem ('html/fetchPost/form', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=%21'), + # XXX: these should trigger the need for getRequestPostData, but they don’t. oh well. + TItem ('html/fetchPost/binary/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=(100*1024)*b'\x00'), + TItem ('html/fetchPost/form/large', 200, {'Content-Type': 'application/octet-stream'}, b'\x00', requestBody=b'data=' + (100*1024)*b'%21'), ] testItemMap = dict ([(item.parsedUrl.path, item) for item in testItems]) @@ -83,7 +100,9 @@ class RequestHandler (BaseHTTPRequestHandler): self.end_headers() self.wfile.write (body) return - + + do_POST = do_GET + def log_message (self, format, *args): pass @@ -134,11 +153,13 @@ def itemsLoaded (l, items): item = l.queue.popleft () if isinstance (item, Exception): raise item + assert not item.failed assert item.chromeResponse is not None golden = items.pop (item.parsedUrl.path) if not golden: assert False, 'url {} not supposed to be fetched'.format (item.url) assert item.body[0] == golden.body[0] + assert item.requestBody[0] == golden.requestBody[0] assert item.response['status'] == golden.response['status'] assert item.statusText == BaseHTTPRequestHandler.responses.get (item.response['status'])[0] for k, v in golden.responseHeaders: @@ -189,6 +210,14 @@ def test_html (loader): # make sure alerts are dismissed correctly (image won’t load otherwise) literalItem (loader, testItemMap['/html/alert'], [testItemMap['/image']]) +def test_post (loader): + """ XHR POST request with binary data""" + literalItem (loader, testItemMap['/html/fetchPost'], + [testItemMap['/html/fetchPost/binary'], + testItemMap['/html/fetchPost/binary/large'], + testItemMap['/html/fetchPost/form'], + testItemMap['/html/fetchPost/form/large']]) + def test_crash (loader): with loader ('/html') as l: l.start () diff --git a/crocoite/warc.py b/crocoite/warc.py index 32fe5d6..9b97e75 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -84,6 +84,7 @@ class WarcHandler (EventHandler): return record def _writeRequest (self, item): + logger = self.logger.bind (reqId=item.id) req = item.request resp = item.response @@ -97,9 +98,21 @@ class WarcHandler (EventHandler): initiator = item.initiator warcHeaders = { 'X-Chrome-Initiator': json.dumps (initiator), + 'X-Chrome-Request-ID': item.id, 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])), } - payload, payloadBase64Encoded = item.requestBody + try: + bodyTruncated = None + payload, payloadBase64Encoded = item.requestBody + except ValueError: + # oops, don’t know what went wrong here + bodyTruncated = 'unspecified' + logger.error ('requestBody missing', uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') + + if bodyTruncated: + warcHeaders['WARC-Truncated'] = bodyTruncated + payload = None + if payload: payload = BytesIO (payload) warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded) @@ -139,6 +152,7 @@ class WarcHandler (EventHandler): 'X-Chrome-Protocol': resp.get ('protocol', ''), 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')), + 'X-Chrome-Request-ID': item.id, 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp ( item.chromeRequest['wallTime']+ (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))), -- cgit v1.2.3