diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2017-12-17 19:44:37 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2017-12-17 19:44:37 +0100 |
commit | 6879a7e6a7625129d3fbec2db8016eae07196f76 (patch) | |
tree | 3cae8bd042a185c1ba9b8719eae5a1e1c26abf27 /crocoite | |
parent | 84c3f69293fa79d752127410c7468038c907c96a (diff) | |
download | crocoite-6879a7e6a7625129d3fbec2db8016eae07196f76.tar.gz crocoite-6879a7e6a7625129d3fbec2db8016eae07196f76.tar.bz2 crocoite-6879a7e6a7625129d3fbec2db8016eae07196f76.zip |
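Don’t fetch redirected request body

We can’t do that safely due to a race condition: a redirect reuses the same request, so getResponseBody may return the new location’s body instead of the redirect’s own.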
Diffstat (limited to 'crocoite')
-rw-r--r-- | crocoite/warc.py | 20 |
1 file changed, 12 insertions, 8 deletions
```diff
diff --git a/crocoite/warc.py b/crocoite/warc.py
index 252e8cb..8e443fd 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -136,9 +136,16 @@ class WarcLoader (SiteLoader):
 
         rawBody = b''
         base64Encoded = False
-        try:
+        if redirect:
+            # redirects reuse the same request, thus we cannot safely retrieve
+            # the body (i.e getResponseBody may return the new location’s body)
+            pass
+        elif item.encodedDataLength > self.maxBodySize:
             # check body size first, since we’re loading everything into memory
-            if item.encodedDataLength < self.maxBodySize:
+            self.logger.error ('body for {} too large {} vs {}'.format (reqId,
+                    item.encodedDataLength, self.maxBodySize))
+        else:
+            try:
                 body = self.tab.Network.getResponseBody (requestId=reqId)
                 rawBody = body['body']
                 base64Encoded = body['base64Encoded']
@@ -147,12 +154,9 @@ class WarcLoader (SiteLoader):
                     warcHeaders['X-Chrome-Base64Body'] = str (True)
                 else:
                     rawBody = rawBody.encode ('utf8')
-            else:
-                self.logger.error ('body for {} too large {} vs {}'.format (reqId,
-                        item.encodedDataLength, self.maxBodySize))
-        except pychrome.exceptions.CallMethodException:
-            self.logger.error ('no data for {} {} {}'.format (resp['url'],
-                    resp['status'], reqId))
+            except pychrome.exceptions.CallMethodException:
+                self.logger.error ('no data for {} {} {}'.format (resp['url'],
+                        resp['status'], reqId))
 
         httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'],
                 self.getStatusText (resp)), resp['headers'], protocol='HTTP/1.1')
```