summary | refs | log | tree | commit | diff
path: root/crocoite/warc.py
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net>, 2018-06-25 20:30:52 +0200
committer: Lars-Dominik Braun <lars@6xq.net>, 2018-06-25 20:44:30 +0200
commit: 0b9824c35f2053b27c3c0d494dc1d29aac9aaa81 (patch)
tree: b2dc44aee67ede80a3a5b99633f02ff89a7a1f1a /crocoite/warc.py
parent: 785ef19736cc9a21746e00a022b76fd756c162de (diff)
download: crocoite-0b9824c35f2053b27c3c0d494dc1d29aac9aaa81.tar.gz
crocoite-0b9824c35f2053b27c3c0d494dc1d29aac9aaa81.tar.bz2
crocoite-0b9824c35f2053b27c3c0d494dc1d29aac9aaa81.zip
warc: Add metadata to truncated records
Specifically for a) redirects (body missing), b) bodies larger than the size limit, and c) whenever we couldn’t fetch the response body for whatever reason. We gave it our best shot, but still failed miserably. Future generations will certainly appreciate that. Eh, maybe. Hopefully. Will they?
Diffstat (limited to 'crocoite/warc.py')
-rw-r--r-- crocoite/warc.py 50
1 files changed, 28 insertions, 22 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py
index e472f16..47dd9dc 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -78,40 +78,45 @@ class WarcHandler (EventHandler):
return record.rec_headers['WARC-Record-ID']
- def _getBody (self, item):
+ def _writeResponse (self, item, concurrentTo):
+ # fetch the body
reqId = item.id
-
- rawBody = b''
+ rawBody = None
base64Encoded = False
+ bodyTruncated = None
if item.isRedirect:
# redirects reuse the same request, thus we cannot safely retrieve
# the body (i.e getResponseBody may return the new location’s
- # body). This is fine.
- pass
+ # body).
+ bodyTruncated = 'unspecified'
elif item.encodedDataLength > self.maxBodySize:
+ bodyTruncated = 'length'
# check body size first, since we’re loading everything into memory
- raise ValueError ('body for {} too large {} vs {}'.format (reqId,
+ self.logger.error ('body for {} too large {} vs {}'.format (reqId,
item.encodedDataLength, self.maxBodySize))
else:
- rawBody, base64Encoded = item.body
- return rawBody, base64Encoded
-
- def _writeResponse (self, item, concurrentTo, rawBody, base64Encoded):
- writer = self.writer
- resp = item.response
+ try:
+ rawBody, base64Encoded = item.body
+ except ValueError:
+ # oops, don’t know what went wrong here
+ bodyTruncated = 'unspecified'
# now the response
+ resp = item.response
warcHeaders = {
'WARC-Concurrent-To': concurrentTo,
'WARC-IP-Address': resp.get ('remoteIPAddress', ''),
'X-Chrome-Protocol': resp.get ('protocol', ''),
'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
- 'X-Chrome-Base64Body': str (base64Encoded),
'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (
item.chromeRequest['wallTime']+
(item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))),
}
+ if bodyTruncated:
+ warcHeaders['WARC-Truncated'] = bodyTruncated
+ else:
+ warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded)
httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'],
item.statusText), item.responseHeaders,
@@ -131,10 +136,15 @@ class WarcHandler (EventHandler):
contentType += '; charset=utf-8'
httpHeaders.replace_header ('content-type', contentType)
- httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody)))
+ if rawBody is not None:
+ httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody)))
+ bodyIo = BytesIO (rawBody)
+ else:
+ bodyIo = BytesIO ()
+ writer = self.writer
record = writer.create_warc_record(resp['url'], 'response',
- warc_headers_dict=warcHeaders, payload=BytesIO (rawBody),
+ warc_headers_dict=warcHeaders, payload=bodyIo,
http_headers=httpHeaders)
writer.write_record(record)
@@ -153,13 +163,9 @@ class WarcHandler (EventHandler):
if item.failed:
# should have been handled by the logger already
return
- try:
- # write neither request nor response if we cannot retrieve the body
- rawBody, base64Encoded = self._getBody (item)
- concurrentTo = self._writeRequest (item)
- self._writeResponse (item, concurrentTo, rawBody, base64Encoded)
- except ValueError as e:
- self.logger.error (e.args[0])
+
+ concurrentTo = self._writeRequest (item)
+ self._writeResponse (item, concurrentTo)
def _addRefersTo (self, headers, url):
refersTo = self.documentRecords.get (url)