diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2017-12-22 17:43:52 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2017-12-22 17:43:52 +0100 |
commit | fd279ff3168c91be2ed8a012af6395034475ccf5 (patch) | |
tree | f4a8b55db5f158a4be4cf8c48aa82d944c206595 /crocoite | |
parent | bcfbdd9b45b7e872ee77e1366197443d855d8c7c (diff) | |
download | crocoite-fd279ff3168c91be2ed8a012af6395034475ccf5.tar.gz crocoite-fd279ff3168c91be2ed8a012af6395034475ccf5.tar.bz2 crocoite-fd279ff3168c91be2ed8a012af6395034475ccf5.zip |
Add simple stats-keeping SiteLoader
Diffstat (limited to 'crocoite')
-rw-r--r-- | crocoite/browser.py | 35 | ||||
-rw-r--r-- | crocoite/cli.py | 10 | ||||
-rw-r--r-- | crocoite/warc.py | 10 |
3 files changed, 46 insertions, 9 deletions
Note: the page scrape collapsed this diff onto a few lines. Line breaks below are restored from the +/- markers and the hunk-header line counts. The original leading whitespace (and the exact wrap points of the multi-line `__init__` signature in warc.py) was lost, so indentation is a best-effort reconstruction using 4-space levels; verify against the repository before applying.

```diff
diff --git a/crocoite/browser.py b/crocoite/browser.py
index e7eb4e2..e1feda9 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -148,8 +148,12 @@ class SiteLoader:
         self.browser.close_tab(self.tab)
         return False

-    def loadingFinished (self, item):
-        self.logger.debug ('item finished {}'.format (item))
+    # overrideable callbacks
+    def loadingFinished (self, item, redirect=False):
+        pass
+
+    def loadingFailed (self, item):
+        pass

     # internal chrome callbacks
     def _requestWillBeSent (self, **kwargs):
@@ -213,6 +217,33 @@ class SiteLoader:
         reqId = kwargs['requestId']
         self.logger.warn ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
         item = self.requests.pop (reqId, None)
+        self.loadingFailed (item)
+
+class AccountingSiteLoader (SiteLoader):
+    """
+    SiteLoader that keeps basic statistics about retrieved pages.
+    """
+
+    def __init__ (self, browser, url, logger=logging.getLogger(__name__)):
+        super ().__init__ (browser, url, logger)
+
+        self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0}
+
+    def loadingFinished (self, item, redirect=False):
+        super ().loadingFinished (item, redirect)
+
+        self.stats['finished'] += 1
+        self.stats['bytesRcv'] += item.encodedDataLength
+
+    def loadingFailed (self, item):
+        super ().loadingFailed (item)
+
+        self.stats['failed'] += 1
+
+    def _requestWillBeSent (self, **kwargs):
+        super ()._requestWillBeSent (**kwargs)
+
+        self.stats['requests'] += 1

 import subprocess
 from tempfile import mkdtemp
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 0880a53..32e0959 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -160,6 +160,8 @@ def archive (self, url, output, onload, onsnapshot, browser,
     finished_dir = '/tmp/finished'
     """

+    ret = {'stats': None}
+
     self.update_state (state='PROGRESS', meta={'step': 'start'})

     stopVarname = '__' + __package__ + '_stop__'
@@ -229,11 +231,12 @@ def archive (self, url, output, onload, onsnapshot, browser,
             if screenshot:
                 self.update_state (state='PROGRESS', meta={'step': 'screenshot'})
                 writeScreenshot (l.tab, writer)
+            ret['stats'] = l.stats
         writer.flush ()
         if not output:
             outPath = os.path.join (app.conf.finished_dir, outFile)
             os.rename (fd.name, outPath)
-    return True
+    return ret

 def stateCallback (data):
     result = data['result']
@@ -271,11 +274,12 @@ def main ():

     if distributed:
         result = archive.delay (**passArgs)
-        result.get (on_message=stateCallback)
+        r = result.get (on_message=stateCallback)
     else:
         # XXX: local evaluation does not init celery logging?
         logging.basicConfig (level=logging.INFO)
-        archive (**passArgs)
+        r = archive (**passArgs)
+    print (r['stats'])

     return True

diff --git a/crocoite/warc.py b/crocoite/warc.py
index 1c844bc..d9afab2 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -24,7 +24,7 @@ Classes writing data to WARC files

 import logging
 import json
-from .browser import SiteLoader
+from .browser import AccountingSiteLoader
 from . import packageUrl
 from http.server import BaseHTTPRequestHandler
 from base64 import b64decode
@@ -100,11 +100,11 @@ class WARCLogHandler (BufferingHandler):
         finally:
             self.release ()

-class WarcLoader (SiteLoader):
+class WarcLoader (AccountingSiteLoader):
     def __init__ (self, browser, url, writer,
             logger=logging.getLogger(__name__),
             logBuffer=1000, maxBodySize=10*1024*1024):
-        SiteLoader.__init__ (self, browser, url, logger)
+        super ().__init__ (browser, url, logger)
         self.writer = writer
         self.maxBodySize = maxBodySize
         self.warcLogger = WARCLogHandler (logBuffer, writer)
@@ -113,7 +113,7 @@ class WarcLoader (SiteLoader):
     def __exit__ (self, exc_type, exc_value, traceback):
         self.logger.removeHandler (self.warcLogger)
         self.warcLogger.flush ()
-        return SiteLoader.__exit__ (self, exc_type, exc_value, traceback)
+        return super ().__exit__ (exc_type, exc_value, traceback)

     @staticmethod
     def getStatusText (response):
@@ -244,6 +244,8 @@ class WarcLoader (SiteLoader):
         writer.write_record(record)

     def loadingFinished (self, item, redirect=False):
+        super ().loadingFinished (item, redirect)
+
         writer = self.writer
         req = item.request

```