summary | refs | log | tree | commit | diff
path: root/lulua
diff options
context:
space:
mode:
Diffstat (limited to 'lulua')
-rw-r--r-- lulua/stats.py 58
-rw-r--r-- lulua/test_stats.py 15
2 files changed, 72 insertions, 1 deletions
diff --git a/lulua/stats.py b/lulua/stats.py
index 6665ac8..a7980d6 100644
--- a/lulua/stats.py
+++ b/lulua/stats.py
@@ -22,6 +22,7 @@ import sys, operator, pickle, argparse, logging, yaml, math, time
from operator import itemgetter
from itertools import chain, groupby, product
from collections import defaultdict
+from decimal import Decimal
from .layout import *
from .keyboard import defaultKeyboards
@@ -326,6 +327,58 @@ def latinImeDict (args):
p = count/total
print (f' word={word},f={f(p)}')
+def corpusStats (args):
+    """ Add corpus size figures to a metadata document
+
+    Reads pickled statistics from stdin and a YAML document from
+    args.metadata, stores the summed character and word counts under the
+    document's 'stats' key and writes the result to stdout as YAML.
+    """
+    # NOTE: pickle must only be fed trusted input, it can execute code on load
+    collected = pickle.load (sys.stdin.buffer)
+    document = yaml.safe_load (args.metadata)
+
+    characterCount = sum (collected['simple'].combinations.values ())
+    wordCount = sum (collected['words'].words.values ())
+    document['stats'] = {'characters': characterCount, 'words': wordCount}
+
+    yaml.dump (document, sys.stdout)
+    # a trailing document separator allows concatenating several outputs
+    print ('---')
+
+def approx (i):
+    """ Get approximate human-readable string for large number
+
+    The value is divided by 1000 until its one-decimal rounding is below
+    1000 or the largest unit (billion) is reached; anything bigger stays in
+    billions. Returns a tuple (integer part, first decimal digit, unit name).
+    """
+
+    units = ['', 'thousand', 'million', 'billion']
+    base = Decimal (1000)
+    i = Decimal (i)
+    unit = 0
+    while True:
+        # Decide on the *rounded* value, so that for example 999999 carries
+        # over to 1.0 million instead of being reported as 1000.0 thousand.
+        rounded = round (i, 1)
+        if rounded < base or unit == len (units)-1:
+            break
+        i /= base
+        unit += 1
+    return int (rounded), int (rounded%1*10), units[unit]
+
+def corpusHtml (args):
+    """ Render corpus metadata as HTML table
+
+    Reads a stream of YAML documents from stdin (empty documents are
+    skipped) and prints one table row per document, sorted by
+    case-insensitive source name, followed by a row with the summed word and
+    character counts.
+    """
+    # local import, so the block does not depend on the file's import header
+    from html import escape
+
+    def numberCells (value):
+        """ Format a number as integer/fraction cell pair """
+        whole, frac, unit = approx (value)
+        return f'<td class="numint">{whole}.</td><td class="numfrac">{frac}\u202f{unit}</td>'
+
+    meta = [x for x in yaml.safe_load_all (sys.stdin) if x is not None]
+    total = {'words': 0, 'characters': 0}
+    # header row is closed properly (</tr> was missing before </thead>)
+    print ('<table class="pure-table"><thead><tr><th>Source</th><th colspan="2"></th><th colspan="2">Words</th><th colspan="2">Characters</th></tr></thead><tbody>')
+    for c in sorted (meta, key=lambda x: x['source']['name'].lower ()):
+        source = c['source']
+        print ('<tr>')
+        # metadata text is escaped, it may contain characters like & or <
+        print (f'<td><a href="{escape (str (source["url"]))}">{escape (str (source["name"]))}</a></td>')
+        count = c.get ('count')
+        if count:
+            # count is a pair (number, unit name); number is split into thousands
+            print (f'<td class="numint">{count[0]//1000:d},</td><td class="numfrac">{count[0]%1000:03d}\u202f{escape (str (count[1]))}</td>')
+        else:
+            print ('<td class="numint"></td><td class="numfrac"></td>')
+
+        stats = c['stats']
+        for k in ('words', 'characters'):
+            print (numberCells (stats[k]))
+            total[k] += stats[k]
+        print ('</tr>')
+
+    print ('<tr><td>Total</td><td class="numint"></td><td class="numfrac"></td>')
+    for k in ('words', 'characters'):
+        print (numberCells (total[k]))
+    print ('</tr>')
+    print ('</tbody></table>')
+
def main ():
parser = argparse.ArgumentParser(description='Process statistics files.')
parser.add_argument('-l', '--layout', metavar='LAYOUT', help='Keyboard layout name')
@@ -350,6 +403,11 @@ def main ():
sp.set_defaults (func=fingerHand)
sp = subparsers.add_parser('latinime')
sp.set_defaults (func=latinImeDict)
+ sp = subparsers.add_parser('corpusstats')
+ sp.add_argument('metadata', type=argparse.FileType ('r'))
+ sp.set_defaults (func=corpusStats)
+ sp = subparsers.add_parser('corpushtml')
+ sp.set_defaults (func=corpusHtml)
logging.basicConfig (level=logging.INFO)
args = parser.parse_args()
diff --git a/lulua/test_stats.py b/lulua/test_stats.py
index 2fff6ce..9e3ed77 100644
--- a/lulua/test_stats.py
+++ b/lulua/test_stats.py
@@ -21,7 +21,7 @@
import operator
import pytest
-from .stats import updateDictOp
+from .stats import updateDictOp, approx
def test_updateDictOp ():
a = {1: 3}
@@ -37,3 +37,16 @@ def test_updateDictOp ():
assert a == {'foo': {1: 3+7}}
assert b == {'foo': {1: 7}}
+def test_approx ():
+    """ approx returns (integer part, first decimal digit, unit name) """
+    expected = [
+        # values below one thousand carry no unit
+        (0, (0, 0, '')),
+        (0.01, (0, 0, '')),
+        (0.05, (0, 1, '')),
+        (1, (1, 0, '')),
+        (100, (100, 0, '')),
+        (999.9, (999, 9, '')),
+        # every factor of one thousand moves on to the next unit
+        (10**3, (1, 0, 'thousand')),
+        (10**6, (1, 0, 'million')),
+        (10**9, (1, 0, 'billion')),
+        # no unit beyond billion exists
+        (10**12, (1000, 0, 'billion')),
+        ]
+    for value, result in expected:
+        assert approx (value) == result
+