diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2018-06-25 20:30:52 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2018-06-25 20:44:30 +0200 |
commit | 0b9824c35f2053b27c3c0d494dc1d29aac9aaa81 (patch) | |
tree | b2dc44aee67ede80a3a5b99633f02ff89a7a1f1a | |
parent | 785ef19736cc9a21746e00a022b76fd756c162de (diff) | |
download | crocoite-0b9824c35f2053b27c3c0d494dc1d29aac9aaa81.tar.gz crocoite-0b9824c35f2053b27c3c0d494dc1d29aac9aaa81.tar.bz2 crocoite-0b9824c35f2053b27c3c0d494dc1d29aac9aaa81.zip |
warc: Add metadata to truncated records
Specifically for a) redirects (body missing) b) bodies larger than size
limit and c) whenever we couldn’t fetch the response body for whatever
reason.
We gave it our best shot, but still failed miserably. Future generations
will certainly appreciate that. Eh, maybe. Hopefully. Will they?
-rw-r--r-- | crocoite/warc.py | 50 |
1 files changed, 28 insertions, 22 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py index e472f16..47dd9dc 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -78,40 +78,45 @@ class WarcHandler (EventHandler): return record.rec_headers['WARC-Record-ID'] - def _getBody (self, item): + def _writeResponse (self, item, concurrentTo): + # fetch the body reqId = item.id - - rawBody = b'' + rawBody = None base64Encoded = False + bodyTruncated = None if item.isRedirect: # redirects reuse the same request, thus we cannot safely retrieve # the body (i.e getResponseBody may return the new location’s - # body). This is fine. - pass + # body). + bodyTruncated = 'unspecified' elif item.encodedDataLength > self.maxBodySize: + bodyTruncated = 'length' # check body size first, since we’re loading everything into memory - raise ValueError ('body for {} too large {} vs {}'.format (reqId, + self.logger.error ('body for {} too large {} vs {}'.format (reqId, item.encodedDataLength, self.maxBodySize)) else: - rawBody, base64Encoded = item.body - return rawBody, base64Encoded - - def _writeResponse (self, item, concurrentTo, rawBody, base64Encoded): - writer = self.writer - resp = item.response + try: + rawBody, base64Encoded = item.body + except ValueError: + # oops, don’t know what went wrong here + bodyTruncated = 'unspecified' # now the response + resp = item.response warcHeaders = { 'WARC-Concurrent-To': concurrentTo, 'WARC-IP-Address': resp.get ('remoteIPAddress', ''), 'X-Chrome-Protocol': resp.get ('protocol', ''), 'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')), 'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')), - 'X-Chrome-Base64Body': str (base64Encoded), 'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp ( item.chromeRequest['wallTime']+ (item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))), } + if bodyTruncated: + warcHeaders['WARC-Truncated'] = bodyTruncated + else: + warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded) httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'], item.statusText), item.responseHeaders, @@ -131,10 +136,15 @@ class WarcHandler (EventHandler): contentType += '; charset=utf-8' httpHeaders.replace_header ('content-type', contentType) - httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody))) + if rawBody is not None: + httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody))) + bodyIo = BytesIO (rawBody) + else: + bodyIo = BytesIO () + writer = self.writer record = writer.create_warc_record(resp['url'], 'response', - warc_headers_dict=warcHeaders, payload=BytesIO (rawBody), + warc_headers_dict=warcHeaders, payload=bodyIo, http_headers=httpHeaders) writer.write_record(record) @@ -153,13 +163,9 @@ class WarcHandler (EventHandler): if item.failed: # should have been handled by the logger already return - try: - # write neither request nor response if we cannot retrieve the body - rawBody, base64Encoded = self._getBody (item) - concurrentTo = self._writeRequest (item) - self._writeResponse (item, concurrentTo, rawBody, base64Encoded) - except ValueError as e: - self.logger.error (e.args[0]) + + concurrentTo = self._writeRequest (item) + self._writeResponse (item, concurrentTo) def _addRefersTo (self, headers, url): refersTo = self.documentRecords.get (url) |