diff options
-rw-r--r-- | crocoite/browser.py | 21 | ||||
-rw-r--r-- | crocoite/warc.py | 13 |
2 files changed, 21 insertions, 13 deletions
diff --git a/crocoite/browser.py b/crocoite/browser.py index baa0d83..b4ade56 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -24,13 +24,15 @@ Chrome browser interactions. import logging from urllib.parse import urlsplit +from base64 import b64decode class Item: """ Simple wrapper containing Chrome request and response """ - def __init__ (self): + def __init__ (self, tab): + self.tab = tab self.chromeRequest = None self.chromeResponse = None self.chromeFinished = None @@ -58,6 +60,21 @@ class Item: def encodedDataLength (self): return self.chromeFinished['encodedDataLength'] + @property + def body (self): + """ Return response body or None """ + try: + body = self.tab.Network.getResponseBody (requestId=self.id) + rawBody = body['body'] + base64Encoded = body['base64Encoded'] + if base64Encoded: + rawBody = b64decode (rawBody) + else: + rawBody = rawBody.encode ('utf8') + return rawBody + except pychrome.exceptions.CallMethodException: + return None + def setRequest (self, req): self.chromeRequest = req @@ -185,7 +202,7 @@ class SiteLoader: else: self.logger.warn ('request {} already exists, overwriting.'.format (reqId)) - item = Item () + item = Item (self.tab) item.setRequest (kwargs) self.requests[reqId] = item diff --git a/crocoite/warc.py b/crocoite/warc.py index e04bee4..b56d315 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -25,12 +25,10 @@ Classes writing data to WARC files import logging import json from http.server import BaseHTTPRequestHandler -from base64 import b64decode from io import BytesIO from warcio.statusandheaders import StatusAndHeaders from urllib.parse import urlsplit from logging.handlers import BufferingHandler -import pychrome from datetime import datetime from threading import Thread from queue import Queue @@ -188,15 +186,8 @@ class WarcLoader (AccountingSiteLoader): raise ValueError ('body for {} too large {} vs {}'.format (reqId, item.encodedDataLength, self.maxBodySize)) else: - try: - body = self.tab.Network.getResponseBody (requestId=reqId) - rawBody = body['body'] - base64Encoded = body['base64Encoded'] - if base64Encoded: - rawBody = b64decode (rawBody) - else: - rawBody = rawBody.encode ('utf8') - except pychrome.exceptions.CallMethodException: + rawBody = item.body + if rawBody is None: raise ValueError ('no data for {} {} {}'.format (resp['url'], resp['status'], reqId)) return rawBody, base64Encoded |