diff options
| -rw-r--r-- | crocoite/browser.py | 21 | ||||
| -rw-r--r-- | crocoite/warc.py | 13 | 
2 files changed, 21 insertions, 13 deletions
| diff --git a/crocoite/browser.py b/crocoite/browser.py index baa0d83..b4ade56 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -24,13 +24,15 @@ Chrome browser interactions.  import logging  from urllib.parse import urlsplit +from base64 import b64decode  class Item:      """      Simple wrapper containing Chrome request and response      """ -    def __init__ (self): +    def __init__ (self, tab): +        self.tab = tab          self.chromeRequest = None          self.chromeResponse = None          self.chromeFinished = None @@ -58,6 +60,21 @@ class Item:      def encodedDataLength (self):          return self.chromeFinished['encodedDataLength'] +    @property +    def body (self): +        """ Return response body or None """ +        try: +            body = self.tab.Network.getResponseBody (requestId=self.id) +            rawBody = body['body'] +            base64Encoded = body['base64Encoded'] +            if base64Encoded: +                rawBody = b64decode (rawBody) +            else: +                rawBody = rawBody.encode ('utf8') +            return rawBody +        except pychrome.exceptions.CallMethodException: +            return None +      def setRequest (self, req):          self.chromeRequest = req @@ -185,7 +202,7 @@ class SiteLoader:              else:                  self.logger.warn ('request {} already exists, overwriting.'.format (reqId)) -        item = Item () +        item = Item (self.tab)          item.setRequest (kwargs)          self.requests[reqId] = item diff --git a/crocoite/warc.py b/crocoite/warc.py index e04bee4..b56d315 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -25,12 +25,10 @@ Classes writing data to WARC files  import logging  import json  from http.server import BaseHTTPRequestHandler -from base64 import b64decode  from io import BytesIO  from warcio.statusandheaders import StatusAndHeaders  from urllib.parse import urlsplit  from logging.handlers import BufferingHandler -import pychrome  from datetime import datetime  from threading import Thread  from queue import Queue @@ -188,15 +186,8 @@ class WarcLoader (AccountingSiteLoader):              raise ValueError ('body for {} too large {} vs {}'.format (reqId,                      item.encodedDataLength, self.maxBodySize))          else: -            try: -                body = self.tab.Network.getResponseBody (requestId=reqId) -                rawBody = body['body'] -                base64Encoded = body['base64Encoded'] -                if base64Encoded: -                    rawBody = b64decode (rawBody) -                else: -                    rawBody = rawBody.encode ('utf8') -            except pychrome.exceptions.CallMethodException: +            rawBody = item.body +            if rawBody is None:                  raise ValueError ('no data for {} {} {}'.format (resp['url'],                      resp['status'], reqId))          return rawBody, base64Encoded | 
