From fd279ff3168c91be2ed8a012af6395034475ccf5 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Fri, 22 Dec 2017 17:43:52 +0100 Subject: Add simple stats-keeping SiteLoader --- contrib/celerycrocoite.py | 15 ++++++++++++++- crocoite/browser.py | 35 +++++++++++++++++++++++++++++++++-- crocoite/cli.py | 10 +++++++--- crocoite/warc.py | 10 ++++++---- 4 files changed, 60 insertions(+), 10 deletions(-) diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py index ede58be..83be7a6 100644 --- a/contrib/celerycrocoite.py +++ b/contrib/celerycrocoite.py @@ -43,6 +43,16 @@ def prettyTimeDelta (seconds): s = filter (lambda x: x[0] != 0, s) return ' '.join (map (lambda x: '{}{}'.format (*x), s)) +def prettyBytes (b): + """ + Pretty-print bytes + """ + prefixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB'] + while b >= 1024 and len (prefixes) > 1: + b /= 1024 + prefixes.pop (0) + return '{:.1f} {}'.format (b, prefixes[0]) + def setup (bot): m = bot.memory['crocoite'] = SopelMemory () m['jobs'] = {} @@ -105,7 +115,10 @@ def archive (bot, trigger): try: result = handle.get (on_message=lambda x: updateState (j, x)) - bot.reply ('{} ({}) finished'.format (url, handle.id)) + stats = result['stats'] + bot.reply ('{} ({}) finished. {} requests, {} failed, {} received.'.format (url, + handle.id, stats['requests'], stats['failed'], + prettyBytes (stats['bytesRcv']))) except Exception as e: # json serialization does not work well with exceptions. If their class # names are unique we can still distinguish them. diff --git a/crocoite/browser.py b/crocoite/browser.py index e7eb4e2..e1feda9 100644 --- a/crocoite/browser.py +++ b/crocoite/browser.py @@ -148,8 +148,12 @@ class SiteLoader: self.browser.close_tab(self.tab) return False - def loadingFinished (self, item): - self.logger.debug ('item finished {}'.format (item)) + # overrideable callbacks + def loadingFinished (self, item, redirect=False): + pass + + def loadingFailed (self, item): + pass # internal chrome callbacks def _requestWillBeSent (self, **kwargs): @@ -213,6 +217,33 @@ class SiteLoader: reqId = kwargs['requestId'] self.logger.warn ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason'))) item = self.requests.pop (reqId, None) + self.loadingFailed (item) + +class AccountingSiteLoader (SiteLoader): + """ + SiteLoader that keeps basic statistics about retrieved pages. + """ + + def __init__ (self, browser, url, logger=logging.getLogger(__name__)): + super ().__init__ (browser, url, logger) + + self.stats = {'requests': 0, 'finished': 0, 'failed': 0, 'bytesRcv': 0} + + def loadingFinished (self, item, redirect=False): + super ().loadingFinished (item, redirect) + + self.stats['finished'] += 1 + self.stats['bytesRcv'] += item.encodedDataLength + + def loadingFailed (self, item): + super ().loadingFailed (item) + + self.stats['failed'] += 1 + + def _requestWillBeSent (self, **kwargs): + super ()._requestWillBeSent (**kwargs) + + self.stats['requests'] += 1 import subprocess from tempfile import mkdtemp diff --git a/crocoite/cli.py b/crocoite/cli.py index 0880a53..32e0959 100644 --- a/crocoite/cli.py +++ b/crocoite/cli.py @@ -160,6 +160,8 @@ def archive (self, url, output, onload, onsnapshot, browser, finished_dir = '/tmp/finished' """ + ret = {'stats': None} + self.update_state (state='PROGRESS', meta={'step': 'start'}) stopVarname = '__' + __package__ + '_stop__' @@ -229,11 +231,12 @@ def archive (self, url, output, onload, onsnapshot, browser, if screenshot: self.update_state (state='PROGRESS', meta={'step': 'screenshot'}) writeScreenshot (l.tab, writer) + ret['stats'] = l.stats writer.flush () if not output: outPath = os.path.join (app.conf.finished_dir, outFile) os.rename (fd.name, outPath) - return True + return ret def stateCallback (data): result = data['result'] @@ -271,11 +274,12 @@ def main (): if distributed: result = archive.delay (**passArgs) - result.get (on_message=stateCallback) + r = result.get (on_message=stateCallback) else: # XXX: local evaluation does not init celery logging? logging.basicConfig (level=logging.INFO) - archive (**passArgs) + r = archive (**passArgs) + print (r['stats']) return True diff --git a/crocoite/warc.py b/crocoite/warc.py index 1c844bc..d9afab2 100644 --- a/crocoite/warc.py +++ b/crocoite/warc.py @@ -24,7 +24,7 @@ Classes writing data to WARC files import logging import json -from .browser import SiteLoader +from .browser import AccountingSiteLoader from . import packageUrl from http.server import BaseHTTPRequestHandler from base64 import b64decode @@ -100,11 +100,11 @@ class WARCLogHandler (BufferingHandler): finally: self.release () -class WarcLoader (SiteLoader): +class WarcLoader (AccountingSiteLoader): def __init__ (self, browser, url, writer, logger=logging.getLogger(__name__), logBuffer=1000, maxBodySize=10*1024*1024): - SiteLoader.__init__ (self, browser, url, logger) + super ().__init__ (browser, url, logger) self.writer = writer self.maxBodySize = maxBodySize self.warcLogger = WARCLogHandler (logBuffer, writer) @@ -113,7 +113,7 @@ class WarcLoader (SiteLoader): def __exit__ (self, exc_type, exc_value, traceback): self.logger.removeHandler (self.warcLogger) self.warcLogger.flush () - return SiteLoader.__exit__ (self, exc_type, exc_value, traceback) + return super ().__exit__ (exc_type, exc_value, traceback) @staticmethod def getStatusText (response): @@ -244,6 +244,8 @@ class WarcLoader (SiteLoader): writer.write_record(record) def loadingFinished (self, item, redirect=False): + super ().loadingFinished (item, redirect) + writer = self.writer req = item.request -- cgit v1.2.3