diff options
Diffstat (limited to 'crocoite/warc.py')
-rw-r--r-- | crocoite/warc.py | 20 |
1 files changed, 12 insertions, 8 deletions
diff --git a/crocoite/warc.py b/crocoite/warc.py
index 252e8cb..8e443fd 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -136,9 +136,16 @@ class WarcLoader (SiteLoader):
         rawBody = b''
         base64Encoded = False
-        try:
+        if redirect:
+            # redirects reuse the same request, thus we cannot safely retrieve
+            # the body (i.e getResponseBody may return the new location’s body)
+            pass
+        elif item.encodedDataLength > self.maxBodySize:
             # check body size first, since we’re loading everything into memory
-            if item.encodedDataLength < self.maxBodySize:
+            self.logger.error ('body for {} too large {} vs {}'.format (reqId,
+                    item.encodedDataLength, self.maxBodySize))
+        else:
+            try:
                 body = self.tab.Network.getResponseBody (requestId=reqId)
                 rawBody = body['body']
                 base64Encoded = body['base64Encoded']
@@ -147,12 +154,9 @@ class WarcLoader (SiteLoader):
                     warcHeaders['X-Chrome-Base64Body'] = str (True)
                 else:
                     rawBody = rawBody.encode ('utf8')
-            else:
-                self.logger.error ('body for {} too large {} vs {}'.format (reqId,
-                        item.encodedDataLength, self.maxBodySize))
-        except pychrome.exceptions.CallMethodException:
-            self.logger.error ('no data for {} {} {}'.format (resp['url'],
-                    resp['status'], reqId))
+            except pychrome.exceptions.CallMethodException:
+                self.logger.error ('no data for {} {} {}'.format (resp['url'],
+                        resp['status'], reqId))
         httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'],
                 self.getStatusText (resp)), resp['headers'], protocol='HTTP/1.1')