summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- contrib/celerycrocoite.py 15
-rw-r--r-- crocoite/browser.py 35
-rw-r--r-- crocoite/cli.py 10
-rw-r--r-- crocoite/warc.py 10
4 files changed, 60 insertions, 10 deletions
diff --git a/contrib/celerycrocoite.py b/contrib/celerycrocoite.py
index ede58be..83be7a6 100644
--- a/contrib/celerycrocoite.py
+++ b/contrib/celerycrocoite.py
@@ -43,6 +43,16 @@ def prettyTimeDelta (seconds):
s = filter (lambda x: x[0] != 0, s)
return ' '.join (map (lambda x: '{}{}'.format (*x), s))
def prettyBytes (b):
    """
    Pretty-print bytes

    Scales a byte count to a binary unit (B, KiB, MiB, GiB or TiB) and formats
    it with one decimal place, e.g. ``1536`` becomes ``'1.5 KiB'``.

    :param b: Number of bytes. Negative values are scaled by magnitude and
        keep their sign.
    :return: Human-readable string consisting of the scaled number and unit.
        Values beyond the TiB range stay in TiB.
    """
    units = ('B', 'KiB', 'MiB', 'GiB', 'TiB')
    lastUnit = len (units) - 1
    value = b
    idx = 0
    # Decide on the value as it will be printed (one decimal place).
    # Otherwise 1048575 bytes would be shown as '1024.0 KiB' instead of
    # '1.0 MiB'.
    while idx < lastUnit and round (abs (value), 1) >= 1024:
        value /= 1024
        idx += 1
    return '{:.1f} {}'.format (value, units[idx])
+
def setup (bot):
m = bot.memory['crocoite'] = SopelMemory ()
m['jobs'] = {}
@@ -105,7 +115,10 @@ def archive (bot, trigger):
try:
result = handle.get (on_message=lambda x: updateState (j, x))
- bot.reply ('{} ({}) finished'.format (url, handle.id))
+ stats = result['stats']
+ bot.reply ('{} ({}) finished. {} requests, {} failed, {} received.'.format (url,
+ handle.id, stats['requests'], stats['failed'],
+ prettyBytes (stats['bytesRcv'])))
except Exception as e:
# json serialization does not work well with exceptions. If their class
# names are unique we can still distinguish them.
diff --git a/crocoite/browser.py b/crocoite/browser.py
index e7eb4e2..e1feda9 100644
--- a/crocoite/browser.py
+++ b/crocoite/browser.py
@@ -148,8 +148,12 @@ class SiteLoader:
self.browser.close_tab(self.tab)
return False
- def loadingFinished (self, item):
- self.logger.debug ('item finished {}'.format (item))
+ # overrideable callbacks
    def loadingFinished (self, item, redirect=False):
        """
        Overrideable callback; the base implementation does nothing.

        Subclasses override this to act on a finished item.

        :param item: The item that finished loading (presumably the entry
            tracked in self.requests -- TODO confirm against the caller)
        :param redirect: NOTE(review): presumably True when the item ended in
            a redirect; the caller is not visible here -- confirm
        """
        pass
+
    def loadingFailed (self, item):
        """
        Overrideable callback; the base implementation does nothing.

        Invoked by the internal failure handler after it has removed the
        request from self.requests.

        :param item: The removed request item, or None if the failed request
            id was not tracked
        """
        pass
# internal chrome callbacks
def _requestWillBeSent (self, **kwargs):
@@ -213,6 +217,33 @@ class SiteLoader:
reqId = kwargs['requestId']
self.logger.warn ('failed {} {}'.format (reqId, kwargs['errorText'], kwargs.get ('blockedReason')))
item = self.requests.pop (reqId, None)
+ self.loadingFailed (item)
+
class AccountingSiteLoader (SiteLoader):
    """
    SiteLoader variant that counts what it loads.

    The counters are kept in the ``stats`` dict under the keys ``requests``,
    ``finished``, ``failed`` and ``bytesRcv``.
    """

    def __init__ (self, browser, url, logger=logging.getLogger(__name__)):
        super ().__init__ (browser, url, logger)

        # all counters start at zero
        self.stats = dict.fromkeys (('requests', 'finished', 'failed', 'bytesRcv'), 0)

    def _requestWillBeSent (self, **kwargs):
        """ Let the base class handle the event, then count the request """
        super ()._requestWillBeSent (**kwargs)

        counters = self.stats
        counters['requests'] += 1

    def loadingFinished (self, item, redirect=False):
        """ Count a finished item and add its encodedDataLength to bytesRcv """
        super ().loadingFinished (item, redirect)

        counters = self.stats
        counters['finished'] += 1
        counters['bytesRcv'] += item.encodedDataLength

    def loadingFailed (self, item):
        """ Count a failed item """
        super ().loadingFailed (item)

        counters = self.stats
        counters['failed'] += 1
import subprocess
from tempfile import mkdtemp
diff --git a/crocoite/cli.py b/crocoite/cli.py
index 0880a53..32e0959 100644
--- a/crocoite/cli.py
+++ b/crocoite/cli.py
@@ -160,6 +160,8 @@ def archive (self, url, output, onload, onsnapshot, browser,
finished_dir = '/tmp/finished'
"""
+ ret = {'stats': None}
+
self.update_state (state='PROGRESS', meta={'step': 'start'})
stopVarname = '__' + __package__ + '_stop__'
@@ -229,11 +231,12 @@ def archive (self, url, output, onload, onsnapshot, browser,
if screenshot:
self.update_state (state='PROGRESS', meta={'step': 'screenshot'})
writeScreenshot (l.tab, writer)
+ ret['stats'] = l.stats
writer.flush ()
if not output:
outPath = os.path.join (app.conf.finished_dir, outFile)
os.rename (fd.name, outPath)
- return True
+ return ret
def stateCallback (data):
result = data['result']
@@ -271,11 +274,12 @@ def main ():
if distributed:
result = archive.delay (**passArgs)
- result.get (on_message=stateCallback)
+ r = result.get (on_message=stateCallback)
else:
# XXX: local evaluation does not init celery logging?
logging.basicConfig (level=logging.INFO)
- archive (**passArgs)
+ r = archive (**passArgs)
+ print (r['stats'])
return True
diff --git a/crocoite/warc.py b/crocoite/warc.py
index 1c844bc..d9afab2 100644
--- a/crocoite/warc.py
+++ b/crocoite/warc.py
@@ -24,7 +24,7 @@ Classes writing data to WARC files
import logging
import json
-from .browser import SiteLoader
+from .browser import AccountingSiteLoader
from . import packageUrl
from http.server import BaseHTTPRequestHandler
from base64 import b64decode
@@ -100,11 +100,11 @@ class WARCLogHandler (BufferingHandler):
finally:
self.release ()
-class WarcLoader (SiteLoader):
+class WarcLoader (AccountingSiteLoader):
def __init__ (self, browser, url, writer,
logger=logging.getLogger(__name__), logBuffer=1000,
maxBodySize=10*1024*1024):
- SiteLoader.__init__ (self, browser, url, logger)
+ super ().__init__ (browser, url, logger)
self.writer = writer
self.maxBodySize = maxBodySize
self.warcLogger = WARCLogHandler (logBuffer, writer)
@@ -113,7 +113,7 @@ class WarcLoader (SiteLoader):
def __exit__ (self, exc_type, exc_value, traceback):
self.logger.removeHandler (self.warcLogger)
self.warcLogger.flush ()
- return SiteLoader.__exit__ (self, exc_type, exc_value, traceback)
+ return super ().__exit__ (exc_type, exc_value, traceback)
@staticmethod
def getStatusText (response):
@@ -244,6 +244,8 @@ class WarcLoader (SiteLoader):
writer.write_record(record)
def loadingFinished (self, item, redirect=False):
+ super ().loadingFinished (item, redirect)
+
writer = self.writer
req = item.request