summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net>, 2018-06-25 20:30:52 +0200
committer: Lars-Dominik Braun <lars@6xq.net>, 2018-06-25 20:44:30 +0200
commit: 0b9824c35f2053b27c3c0d494dc1d29aac9aaa81 (patch)
tree: b2dc44aee67ede80a3a5b99633f02ff89a7a1f1a
parent: 785ef19736cc9a21746e00a022b76fd756c162de (diff)
download: crocoite-0b9824c35f2053b27c3c0d494dc1d29aac9aaa81.tar.gz
crocoite-0b9824c35f2053b27c3c0d494dc1d29aac9aaa81.tar.bz2
crocoite-0b9824c35f2053b27c3c0d494dc1d29aac9aaa81.zip
warc: Add metadata to truncated records
Specifically for a) redirects (body missing), b) bodies larger than the size limit, and c) whenever we couldn’t fetch the response body for whatever reason. We gave it our best shot, but still failed miserably. Future generations will certainly appreciate that. Eh, maybe. Hopefully. Will they?
-rw-r--r-- crocoite/warc.py 50
1 file changed, 28 insertions, 22 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py
index e472f16..47dd9dc 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -78,40 +78,45 @@ class WarcHandler (EventHandler):
return record.rec_headers['WARC-Record-ID']
- def _getBody (self, item):
+ def _writeResponse (self, item, concurrentTo):
+ # fetch the body
reqId = item.id
-
- rawBody = b''
+ rawBody = None
base64Encoded = False
+ bodyTruncated = None
if item.isRedirect:
# redirects reuse the same request, thus we cannot safely retrieve
# the body (i.e getResponseBody may return the new location’s
- # body). This is fine.
- pass
+ # body).
+ bodyTruncated = 'unspecified'
elif item.encodedDataLength > self.maxBodySize:
+ bodyTruncated = 'length'
# check body size first, since we’re loading everything into memory
- raise ValueError ('body for {} too large {} vs {}'.format (reqId,
+ self.logger.error ('body for {} too large {} vs {}'.format (reqId,
item.encodedDataLength, self.maxBodySize))
else:
- rawBody, base64Encoded = item.body
- return rawBody, base64Encoded
-
- def _writeResponse (self, item, concurrentTo, rawBody, base64Encoded):
- writer = self.writer
- resp = item.response
+ try:
+ rawBody, base64Encoded = item.body
+ except ValueError:
+ # oops, don’t know what went wrong here
+ bodyTruncated = 'unspecified'
# now the response
+ resp = item.response
warcHeaders = {
'WARC-Concurrent-To': concurrentTo,
'WARC-IP-Address': resp.get ('remoteIPAddress', ''),
'X-Chrome-Protocol': resp.get ('protocol', ''),
'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
- 'X-Chrome-Base64Body': str (base64Encoded),
'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (
item.chromeRequest['wallTime']+
(item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))),
}
+ if bodyTruncated:
+ warcHeaders['WARC-Truncated'] = bodyTruncated
+ else:
+ warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded)
httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'],
item.statusText), item.responseHeaders,
@@ -131,10 +136,15 @@ class WarcHandler (EventHandler):
contentType += '; charset=utf-8'
httpHeaders.replace_header ('content-type', contentType)
- httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody)))
+ if rawBody is not None:
+ httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody)))
+ bodyIo = BytesIO (rawBody)
+ else:
+ bodyIo = BytesIO ()
+ writer = self.writer
record = writer.create_warc_record(resp['url'], 'response',
- warc_headers_dict=warcHeaders, payload=BytesIO (rawBody),
+ warc_headers_dict=warcHeaders, payload=bodyIo,
http_headers=httpHeaders)
writer.write_record(record)
@@ -153,13 +163,9 @@ class WarcHandler (EventHandler):
if item.failed:
# should have been handled by the logger already
return
- try:
- # write neither request nor response if we cannot retrieve the body
- rawBody, base64Encoded = self._getBody (item)
- concurrentTo = self._writeRequest (item)
- self._writeResponse (item, concurrentTo, rawBody, base64Encoded)
- except ValueError as e:
- self.logger.error (e.args[0])
+
+ concurrentTo = self._writeRequest (item)
+ self._writeResponse (item, concurrentTo)
def _addRefersTo (self, headers, url):
refersTo = self.documentRecords.get (url)