From 6879a7e6a7625129d3fbec2db8016eae07196f76 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sun, 17 Dec 2017 19:44:37 +0100 Subject: Don’t fetch redirected request body MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can’t do that safely due to a race-condition. --- README.rst | 1 + crocoite/warc.py | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index 3d5af5f..145477f 100644 --- a/README.rst +++ b/README.rst @@ -60,6 +60,7 @@ Caveats won’t work. Example: weather.com. - Range requests (Range: bytes=1-100) are captured as-is, making playback difficult +- Content body of HTTP redirects cannot be retrieved due to a race condition Most of these issues can be worked around by using the DOM snapshot, which is also saved. This causes its own set of issues though: diff --git a/crocoite/warc.py b/crocoite/warc.py index 252e8cb..8e443fd 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -136,9 +136,16 @@ class WarcLoader (SiteLoader): rawBody = b'' base64Encoded = False - try: + if redirect: + # redirects reuse the same request, thus we cannot safely retrieve + # the body (i.e. getResponseBody may return the new location’s body) + pass + elif item.encodedDataLength > self.maxBodySize: # check body size first, since we’re loading everything into memory - if item.encodedDataLength < self.maxBodySize: + self.logger.error ('body for {} too large {} vs {}'.format (reqId, + item.encodedDataLength, self.maxBodySize)) + else: + try: body = self.tab.Network.getResponseBody (requestId=reqId) rawBody = body['body'] base64Encoded = body['base64Encoded'] @@ -147,12 +154,9 @@ class WarcLoader (SiteLoader): warcHeaders['X-Chrome-Base64Body'] = str (True) else: rawBody = rawBody.encode ('utf8') - else: - self.logger.error ('body for {} too large {} vs {}'.format (reqId, - item.encodedDataLength, self.maxBodySize)) - except pychrome.exceptions.CallMethodException: - 
self.logger.error ('no data for {} {} {}'.format (resp['url'], - resp['status'], reqId)) + except pychrome.exceptions.CallMethodException: + self.logger.error ('no data for {} {} {}'.format (resp['url'], + resp['status'], reqId)) httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'], self.getStatusText (resp)), resp['headers'], protocol='HTTP/1.1') -- cgit v1.2.3