From 9d7974e3e7e8a4575ea61cb33a30fa291d12ae38 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Thu, 3 Jan 2019 19:34:17 +0100 Subject: browser: Turn Item into RequestResponsePair MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously Item was just a simple wrapper around Chrome’s Network.* events. This turned out to be quite nasty when testing, so its replacement, RequestResponsePair, does some level of abstraction. This makes testing alot easier, since we now can simply instantiate it without building a proper DevTools event. Should come without any functional changes. --- crocoite/warc.py | 96 +++++++++++++++++++++++++------------------------------- 1 file changed, 42 insertions(+), 54 deletions(-) (limited to 'crocoite/warc.py') diff --git a/crocoite/warc.py b/crocoite/warc.py index dbd9ebc..cb1f2f7 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -25,6 +25,7 @@ Classes writing data to WARC files import json, threading from io import BytesIO from datetime import datetime +from http.server import BaseHTTPRequestHandler from warcio.timeutils import datetime_to_iso_date from warcio.warcwriter import WARCWriter @@ -33,7 +34,7 @@ from warcio.statusandheaders import StatusAndHeaders from .util import packageUrl, StrJsonEncoder from .controller import EventHandler, ControllerStart from .behavior import Script, DomSnapshotEvent, ScreenshotEvent -from .browser import Item +from .browser import RequestResponsePair, UnicodeBody, Base64Body class WarcHandler (EventHandler): __slots__ = ('logger', 'writer', 'documentRecords', 'log', @@ -86,66 +87,51 @@ class WarcHandler (EventHandler): url = item.url path = url.relative().with_fragment(None) - httpHeaders = StatusAndHeaders(f'{req["method"]} {path} HTTP/1.1', - item.requestHeaders, protocol='HTTP/1.1', is_http_request=True) - initiator = item.initiator + httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1', + req.headers, protocol='HTTP/1.1', is_http_request=True) warcHeaders = { - 'X-Chrome-Initiator': json.dumps (initiator), + 'X-Chrome-Initiator': json.dumps (req.initiator), 'X-Chrome-Request-ID': item.id, - 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (item.chromeRequest['wallTime'])), + 'WARC-Date': datetime_to_iso_date (req.timestamp), } - if item.requestBody is not None: - payload, payloadBase64Encoded = item.requestBody - else: + body = item.request.body + if item.request.hasPostData and body is None: # oops, don’t know what went wrong here - logger.error ('requestBody missing', uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') + logger.error ('requestBody missing', + uuid='ee9adc58-e723-4595-9feb-312a67ead6a0') warcHeaders['WARC-Truncated'] = 'unspecified' - payload = None - - if payload is not None: - payload = BytesIO (payload) - warcHeaders['X-Chrome-Base64Body'] = str (payloadBase64Encoded) + else: + warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body) + body = BytesIO (body) record = self.writeRecord (url, 'request', - payload=payload, http_headers=httpHeaders, + payload=body, http_headers=httpHeaders, warc_headers_dict=warcHeaders) return record.rec_headers['WARC-Record-ID'] def _writeResponse (self, item, concurrentTo): # fetch the body reqId = item.id - rawBody = None - base64Encoded = False - bodyTruncated = None - if item.isRedirect or item.body is None: - # redirects reuse the same request, thus we cannot safely retrieve - # the body (i.e getResponseBody may return the new location’s - # body). No body available means we failed to retrieve it. - bodyTruncated = 'unspecified' - else: - rawBody, base64Encoded = item.body # now the response resp = item.response warcHeaders = { 'WARC-Concurrent-To': concurrentTo, - 'WARC-IP-Address': resp.get ('remoteIPAddress', ''), - 'X-Chrome-Protocol': resp.get ('protocol', ''), - 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), - 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')), 'X-Chrome-Request-ID': item.id, - 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp ( - item.chromeRequest['wallTime']+ - (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))), + 'WARC-Date': datetime_to_iso_date (resp.timestamp), } - if bodyTruncated: - warcHeaders['WARC-Truncated'] = bodyTruncated - else: - warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded) - - httpHeaders = StatusAndHeaders(f'{resp["status"]} {item.statusText}', - item.responseHeaders, - protocol='HTTP/1.1') + # conditional WARC headers + if item.remoteIpAddress: + warcHeaders['WARC-IP-Address'] = item.remoteIpAddress + if item.protocol: + warcHeaders['X-Chrome-Protocol'] = item.protocol + + # HTTP headers + statusText = resp.statusText or \ + BaseHTTPRequestHandler.responses.get ( + resp.status, ('No status text available', ))[0] + httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}', + resp.headers, protocol='HTTP/1.1') # Content is saved decompressed and decoded, remove these headers blacklistedHeaders = {'transfer-encoding', 'content-encoding'} @@ -155,20 +141,23 @@ class WarcHandler (EventHandler): # chrome sends nothing but utf8 encoded text. Fortunately HTTP # headers take precedence over the document’s , thus we can # easily override those. - contentType = resp.get ('mimeType') + contentType = resp.mimeType if contentType: - if not base64Encoded: + if isinstance (resp.body, UnicodeBody): contentType += '; charset=utf-8' httpHeaders.replace_header ('Content-Type', contentType) - if rawBody is not None: - httpHeaders.replace_header ('Content-Length', str (len (rawBody))) - bodyIo = BytesIO (rawBody) + # response body + body = resp.body + if body is None: + warcHeaders['WARC-Truncated'] = 'unspecified' else: - bodyIo = BytesIO () + httpHeaders.replace_header ('Content-Length', str (len (body))) + warcHeaders['X-Chrome-Base64Body'] = str (type (body) is Base64Body) + body = BytesIO (body) record = self.writeRecord (item.url, 'response', - warc_headers_dict=warcHeaders, payload=bodyIo, + warc_headers_dict=warcHeaders, payload=body, http_headers=httpHeaders) if item.resourceType == 'Document': @@ -184,12 +173,11 @@ class WarcHandler (EventHandler): f'application/javascript; charset={encoding}'}) def _writeItem (self, item): - if item.failed: - # should have been handled by the logger already - return - + assert item.request concurrentTo = self._writeRequest (item) - self._writeResponse (item, concurrentTo) + # items that failed loading don’t have a response + if item.response: + self._writeResponse (item, concurrentTo) def _addRefersTo (self, headers, url): refersTo = self.documentRecords.get (url) @@ -247,7 +235,7 @@ class WarcHandler (EventHandler): self._flushLogEntries () route = {Script: _writeScript, - Item: _writeItem, + RequestResponsePair: _writeItem, DomSnapshotEvent: _writeDomSnapshot, ScreenshotEvent: _writeScreenshot, ControllerStart: _writeControllerStart, -- cgit v1.2.3