From 0b9824c35f2053b27c3c0d494dc1d29aac9aaa81 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Mon, 25 Jun 2018 20:30:52 +0200 Subject: warc: Add metadata to truncated records MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Specifically for a) redirects (body missing) b) bodies larger than size limit and c) whenever we couldn’t fetch the response body for whatever reason. We gave it our best shot, but still failed miserably. Future generations will certainly appreciate that. Eh, maybe. Hopefully. Will they? --- crocoite/warc.py | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) (limited to 'crocoite') diff --git a/crocoite/warc.py b/crocoite/warc.py index e472f16..47dd9dc 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -78,40 +78,45 @@ class WarcHandler (EventHandler): return record.rec_headers['WARC-Record-ID'] - def _getBody (self, item): + def _writeResponse (self, item, concurrentTo): + # fetch the body reqId = item.id - - rawBody = b'' + rawBody = None base64Encoded = False + bodyTruncated = None if item.isRedirect: # redirects reuse the same request, thus we cannot safely retrieve # the body (i.e getResponseBody may return the new location’s - # body). This is fine. - pass + # body). + bodyTruncated = 'unspecified' elif item.encodedDataLength > self.maxBodySize: + bodyTruncated = 'length' # check body size first, since we’re loading everything into memory - raise ValueError ('body for {} too large {} vs {}'.format (reqId, + self.logger.error ('body for {} too large {} vs {}'.format (reqId, item.encodedDataLength, self.maxBodySize)) else: - rawBody, base64Encoded = item.body - return rawBody, base64Encoded - - def _writeResponse (self, item, concurrentTo, rawBody, base64Encoded): - writer = self.writer - resp = item.response + try: + rawBody, base64Encoded = item.body + except ValueError: + # oops, don’t know what went wrong here + bodyTruncated = 'unspecified' # now the response + resp = item.response warcHeaders = { 'WARC-Concurrent-To': concurrentTo, 'WARC-IP-Address': resp.get ('remoteIPAddress', ''), 'X-Chrome-Protocol': resp.get ('protocol', ''), 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')), - 'X-Chrome-Base64Body': str (base64Encoded), 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp ( item.chromeRequest['wallTime']+ (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))), } + if bodyTruncated: + warcHeaders['WARC-Truncated'] = bodyTruncated + else: + warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded) httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'], item.statusText), item.responseHeaders, @@ -131,10 +136,15 @@ class WarcHandler (EventHandler): contentType += '; charset=utf-8' httpHeaders.replace_header ('content-type', contentType) - httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody))) + if rawBody is not None: + httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody))) + bodyIo = BytesIO (rawBody) + else: + bodyIo = BytesIO () + writer = self.writer record = writer.create_warc_record(resp['url'], 'response', - warc_headers_dict=warcHeaders, payload=BytesIO (rawBody), + warc_headers_dict=warcHeaders, payload=bodyIo, http_headers=httpHeaders) writer.write_record(record) @@ -153,13 +163,9 @@ class WarcHandler (EventHandler): if item.failed: # should have been handled by the logger already return - try: - # write neither request nor response if we cannot retrieve the body - rawBody, base64Encoded = self._getBody (item) - concurrentTo = self._writeRequest (item) - self._writeResponse (item, concurrentTo, rawBody, base64Encoded) - except ValueError as e: - self.logger.error (e.args[0]) + + concurrentTo = self._writeRequest (item) + self._writeResponse (item, concurrentTo) def _addRefersTo (self, headers, url): refersTo = self.documentRecords.get (url) -- cgit v1.2.3