summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net>, 2017-12-17 19:44:37 +0100
committer: Lars-Dominik Braun <lars@6xq.net>, 2017-12-17 19:44:37 +0100
commit: 6879a7e6a7625129d3fbec2db8016eae07196f76 (patch)
tree: 3cae8bd042a185c1ba9b8719eae5a1e1c26abf27
parent: 84c3f69293fa79d752127410c7468038c907c96a (diff)
download: crocoite-6879a7e6a7625129d3fbec2db8016eae07196f76.zip
crocoite-6879a7e6a7625129d3fbec2db8016eae07196f76.tar.gz
crocoite-6879a7e6a7625129d3fbec2db8016eae07196f76.tar.bz2
Don’t fetch redirected request body
We can’t do that safely due to a race-condition.
-rw-r--r-- README.rst | 1
-rw-r--r-- crocoite/warc.py | 20
2 files changed, 13 insertions, 8 deletions
diff --git a/README.rst b/README.rst
index 3d5af5f..145477f 100644
--- a/README.rst
+++ b/README.rst
@@ -60,6 +60,7 @@ Caveats
won’t work. Example: weather.com.
- Range requests (Range: bytes=1-100) are captured as-is, making playback
difficult
+- Content body of HTTP redirects cannot be retrieved due to a race condition
Most of these issues can be worked around by using the DOM snapshot, which is
also saved. This causes its own set of issues though:
diff --git a/crocoite/warc.py b/crocoite/warc.py
index 252e8cb..8e443fd 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -136,9 +136,16 @@ class WarcLoader (SiteLoader):
rawBody = b''
base64Encoded = False
- try:
+ if redirect:
+ # redirects reuse the same request, thus we cannot safely retrieve
+ # the body (i.e getResponseBody may return the new location’s body)
+ pass
+ elif item.encodedDataLength > self.maxBodySize:
# check body size first, since we’re loading everything into memory
- if item.encodedDataLength < self.maxBodySize:
+ self.logger.error ('body for {} too large {} vs {}'.format (reqId,
+ item.encodedDataLength, self.maxBodySize))
+ else:
+ try:
body = self.tab.Network.getResponseBody (requestId=reqId)
rawBody = body['body']
base64Encoded = body['base64Encoded']
@@ -147,12 +154,9 @@ class WarcLoader (SiteLoader):
warcHeaders['X-Chrome-Base64Body'] = str (True)
else:
rawBody = rawBody.encode ('utf8')
- else:
- self.logger.error ('body for {} too large {} vs {}'.format (reqId,
- item.encodedDataLength, self.maxBodySize))
- except pychrome.exceptions.CallMethodException:
- self.logger.error ('no data for {} {} {}'.format (resp['url'],
- resp['status'], reqId))
+ except pychrome.exceptions.CallMethodException:
+ self.logger.error ('no data for {} {} {}'.format (resp['url'],
+ resp['status'], reqId))
httpHeaders = StatusAndHeaders('{} {}'.format (resp['status'],
self.getStatusText (resp)), resp['headers'], protocol='HTTP/1.1')