diff options
Diffstat (limited to 'crocoite')
| -rw-r--r-- | crocoite/browser.py | 35 | ||||
| -rw-r--r-- | crocoite/cli.py | 10 | ||||
| -rw-r--r-- | crocoite/warc.py | 10 | 
3 files changed, 46 insertions, 9 deletions
| diff --git a/crocoite/browser.py b/crocoite/browser.py index e7eb4e2..e1feda9 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -148,8 +148,12 @@ class SiteLoader:          self.browser.close_tab(self.tab)          return False -    def loadingFinished (self, item): -        self.logger.debug ('item finished {}'.format (item)) +    # overrideable callbacks +    def loadingFinished (self, item, redirect=False): +        pass + +    def loadingFailed (self, item): +        pass      # internal chrome callbacks      def _requestWillBeSent (self, **kwargs): @@ -213,6 +217,33 @@ class SiteLoader:          reqId = kwargs['requestId']          self.logger.warn ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))          item = self.requests.pop (reqId, None) +        self.loadingFailed (item) + +class AccountingSiteLoader (SiteLoader): +    """ +    SiteLoader that keeps basic statistics about retrieved pages. +    """ + +    def __init__ (self, browser, url, logger=logging.getLogger(__name__)): +        super ().__init__ (browser, url, logger) + +        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} + +    def loadingFinished (self, item, redirect=False): +        super ().loadingFinished (item, redirect) + +        self.stats['finished'] += 1 +        self.stats['bytesRcv'] += item.encodedDataLength + +    def loadingFailed (self, item): +        super ().loadingFailed (item) + +        self.stats['failed'] += 1 + +    def _requestWillBeSent (self, **kwargs): +        super ()._requestWillBeSent (**kwargs) + +        self.stats['requests'] += 1  import subprocess  from tempfile import mkdtemp diff --git a/crocoite/cli.py b/crocoite/cli.py index 0880a53..32e0959 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -160,6 +160,8 @@ def archive (self, url, output, onload, onsnapshot, browser,      finished_dir = '/tmp/finished'      """ +    ret = {'stats': None} +      self.update_state (state='PROGRESS', meta={'step': 'start'})      stopVarname = '__' + __package__ + '_stop__' @@ -229,11 +231,12 @@ def archive (self, url, output, onload, onsnapshot, browser,              if screenshot:                  self.update_state (state='PROGRESS', meta={'step': 'screenshot'})                  writeScreenshot (l.tab, writer) +            ret['stats'] = l.stats          writer.flush ()      if not output:          outPath = os.path.join (app.conf.finished_dir, outFile)          os.rename (fd.name, outPath) -    return True +    return ret  def stateCallback (data):      result = data['result'] @@ -271,11 +274,12 @@ def main ():      if distributed:          result = archive.delay (**passArgs) -        result.get (on_message=stateCallback) +        r = result.get (on_message=stateCallback)      else:          # XXX: local evaluation does not init celery logging?          logging.basicConfig (level=logging.INFO) -        archive (**passArgs) +        r = archive (**passArgs) +    print (r['stats'])      return True diff --git a/crocoite/warc.py b/crocoite/warc.py index 1c844bc..d9afab2 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -24,7 +24,7 @@ Classes writing data to WARC files  import logging  import json -from .browser import SiteLoader +from .browser import AccountingSiteLoader  from . import packageUrl  from http.server import BaseHTTPRequestHandler  from base64 import b64decode @@ -100,11 +100,11 @@ class WARCLogHandler (BufferingHandler):          finally:              self.release () -class WarcLoader (SiteLoader): +class WarcLoader (AccountingSiteLoader):      def __init__ (self, browser, url, writer,              logger=logging.getLogger(__name__), logBuffer=1000,              maxBodySize=10*1024*1024): -        SiteLoader.__init__ (self, browser, url, logger) +        super ().__init__ (browser, url, logger)          self.writer = writer          self.maxBodySize = maxBodySize          self.warcLogger = WARCLogHandler (logBuffer, writer) @@ -113,7 +113,7 @@ class WarcLoader (SiteLoader):      def __exit__ (self, exc_type, exc_value, traceback):          self.logger.removeHandler (self.warcLogger)          self.warcLogger.flush () -        return SiteLoader.__exit__ (self, exc_type, exc_value, traceback) +        return super ().__exit__ (exc_type, exc_value, traceback)      @staticmethod      def getStatusText (response): @@ -244,6 +244,8 @@ class WarcLoader (SiteLoader):          writer.write_record(record)      def loadingFinished (self, item, redirect=False): +        super ().loadingFinished (item, redirect) +          writer = self.writer          req = item.request | 
