summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- crocoite/warc.py 50
1 files changed, 28 insertions, 22 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py
index e472f16..47dd9dc 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -78,40 +78,45 @@ class WarcHandler (EventHandler):
return record.rec_headers['WARC-Record-ID']
- def _getBody (self, item):
+ def _writeResponse (self, item, concurrentTo):
+ # fetch the body
reqId = item.id
-
- rawBody = b''
+ rawBody = None
base64Encoded = False
+ bodyTruncated = None
if item.isRedirect:
# redirects reuse the same request, thus we cannot safely retrieve
# the body (i.e getResponseBody may return the new location’s
- # body). This is fine.
- pass
+ # body).
+ bodyTruncated = 'unspecified'
elif item.encodedDataLength > self.maxBodySize:
+ bodyTruncated = 'length'
# check body size first, since we’re loading everything into memory
- raise ValueError ('body for {} too large {} vs {}'.format (reqId,
+ self.logger.error ('body for {} too large {} vs {}'.format (reqId,
item.encodedDataLength, self.maxBodySize))
else:
- rawBody, base64Encoded = item.body
- return rawBody, base64Encoded
-
- def _writeResponse (self, item, concurrentTo, rawBody, base64Encoded):
- writer = self.writer
- resp = item.response
+ try:
+ rawBody, base64Encoded = item.body
+ except ValueError:
+ # oops, don’t know what went wrong here
+ bodyTruncated = 'unspecified'
# now the response
+ resp = item.response
warcHeaders = {
'WARC-Concurrent-To': concurrentTo,
'WARC-IP-Address': resp.get ('remoteIPAddress', ''),
'X-Chrome-Protocol': resp.get ('protocol', ''),
'X-Chrome-FromDiskCache': str (resp.get ('fromDiskCache')),
'X-Chrome-ConnectionReused': str (resp.get ('connectionReused')),
- 'X-Chrome-Base64Body': str (base64Encoded),
'WARC-Date': datetime_to_iso_date (datetime.utcfromtimestamp (
item.chromeRequest['wallTime']+
(item.chromeResponse['timestamp']-item.chromeRequest['timestamp']))),
}
+ if bodyTruncated:
+ warcHeaders['WARC-Truncated'] = bodyTruncated
+ else:
+ warcHeaders['X-Chrome-Base64Body'] = str (base64Encoded)
httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'],
item.statusText), item.responseHeaders,
@@ -131,10 +136,15 @@ class WarcHandler (EventHandler):
contentType += '; charset=utf-8'
httpHeaders.replace_header ('content-type', contentType)
- httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody)))
+ if rawBody is not None:
+ httpHeaders.replace_header ('content-length', '{:d}'.format (len (rawBody)))
+ bodyIo = BytesIO (rawBody)
+ else:
+ bodyIo = BytesIO ()
+ writer = self.writer
record = writer.create_warc_record(resp['url'], 'response',
- warc_headers_dict=warcHeaders, payload=BytesIO (rawBody),
+ warc_headers_dict=warcHeaders, payload=bodyIo,
http_headers=httpHeaders)
writer.write_record(record)
@@ -153,13 +163,9 @@ class WarcHandler (EventHandler):
if item.failed:
# should have been handled by the logger already
return
- try:
- # write neither request nor response if we cannot retrieve the body
- rawBody, base64Encoded = self._getBody (item)
- concurrentTo = self._writeRequest (item)
- self._writeResponse (item, concurrentTo, rawBody, base64Encoded)
- except ValueError as e:
- self.logger.error (e.args[0])
+
+ concurrentTo = self._writeRequest (item)
+ self._writeResponse (item, concurrentTo)
def _addRefersTo (self, headers, url):
refersTo = self.documentRecords.get (url)