From 6879a7e6a7625129d3fbec2db8016eae07196f76 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sun, 17 Dec 2017 19:44:37 +0100 Subject: Don’t fetch redirected request body MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can’t do that safely due to a race-condition. --- crocoite/warc.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'crocoite') diff --git a/crocoite/warc.py b/crocoite/warc.py index 252e8cb..8e443fd 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -136,9 +136,16 @@ class WarcLoader (SiteLoader): rawBody = b'' base64Encoded = False - try: + if redirect: + # redirects reuse the same request, thus we cannot safely retrieve + # the body (i.e getResponseBody may return the new location’s body) + pass + elif item.encodedDataLength > self.maxBodySize: # check body size first, since we’re loading everything into memory - if item.encodedDataLength < self.maxBodySize: + self.logger.error ('body for {} too large {} vs {}'.format (reqId, + item.encodedDataLength, self.maxBodySize)) + else: + try: body = self.tab.Network.getResponseBody (requestId=reqId) rawBody = body['body'] base64Encoded = body['base64Encoded'] @@ -147,12 +154,9 @@ class WarcLoader (SiteLoader): warcHeaders['X-Chrome-Base64Body'] = str (True) else: rawBody = rawBody.encode ('utf8') - else: - self.logger.error ('body for {} too large {} vs {}'.format (reqId, - item.encodedDataLength, self.maxBodySize)) - except pychrome.exceptions.CallMethodException: - self.logger.error ('no data for {} {} {}'.format (resp['url'], - resp['status'], reqId)) + except pychrome.exceptions.CallMethodException: + self.logger.error ('no data for {} {} {}'.format (resp['url'], + resp['status'], reqId)) httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'], self.getStatusText (resp)), resp['headers'], protocol='HTTP/1.1') -- cgit v1.2.3