diff options
-rw-r--r-- | crocoite/warc.py | 50 |
1 files changed, 28 insertions, 22 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py index e472f16..47dd9dc 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -78,40 +78,45 @@ class WarcHandler (EventHandler): return record.rec_headers['WARC-Record-ID'] - def _getBody (self, item): + def _writeResponse (self, item, concurrentTo): + # fetch the body reqId = item.id - - rawBody = b'' + rawBody = None base64Encoded = False + bodyTruncated = None if item.isRedirect: # redirects reuse the same request, thus we cannot safely retrieve # the body (i.e getResponseBody may return the new location’s - # body). This is fine. - pass + # body). + bodyTruncated = 'unspecified' elif item.encodedDataLength > self.maxBodySize: + bodyTruncated = 'length' # check body size first, since we’re loading everything into memory - raise ValueError ('body for {} too large {} vs {}'.format (reqId, + self.logger.error ('body for {} too large {} vs {}'.format (reqId, item.encodedDataLength, self.maxBodySize)) else: - rawBody, base64Encoded = item.body - return rawBody, base64Encoded - - def _writeResponse (self, item, concurrentTo, rawBody, base64Encoded): - writer = self.writer - resp = item.response + try: + rawBody, base64Encoded = item.body + except ValueError: + # oops, don’t know what went wrong here + bodyTruncated = 'unspecified' # now the response + resp = item.response warcHeaders = { 'WARC-Concurrent-To': concurrentTo, 'WARC-IP-Address': resp.get ('remoteIPAddress', ''), 'X-Chrome-Protocol': resp.get ('protocol', ''), 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')), - 'X-Chrome-Base64Body': str (base64Encoded), 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp ( item.chromeRequest['wallTime']+ (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))), } + if bodyTruncated: + warcHeaders['WARC-Truncated'] = bodyTruncated + else: + warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded) httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'], item.statusText), item.responseHeaders, @@ -131,10 +136,15 @@ class WarcHandler (EventHandler): contentType += '; charset=utf-8' httpHeaders.replace_header ('content-type', contentType) - httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody))) + if rawBody is not None: + httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody))) + bodyIo = BytesIO (rawBody) + else: + bodyIo = BytesIO () + writer = self.writer record = writer.create_warc_record(resp['url'], 'response', - warc_headers_dict=warcHeaders, payload=BytesIO (rawBody), + warc_headers_dict=warcHeaders, payload=bodyIo, http_headers=httpHeaders) writer.write_record(record) @@ -153,13 +163,9 @@ class WarcHandler (EventHandler): if item.failed: # should have been handled by the logger already return - try: - # write neither request nor response if we cannot retrieve the body - rawBody, base64Encoded = self._getBody (item) - concurrentTo = self._writeRequest (item) - self._writeResponse (item, concurrentTo, rawBody, base64Encoded) - except ValueError as e: - self.logger.error (e.args[0]) + + concurrentTo = self._writeRequest (item) + self._writeResponse (item, concurrentTo) def _addRefersTo (self, headers, url): refersTo = self.documentRecords.get (url) |