diff options
Diffstat (limited to 'lulua')
-rw-r--r-- | lulua/stats.py | 58 | ||||
-rw-r--r-- | lulua/test_stats.py | 15 |
2 files changed, 72 insertions, 1 deletions
# lulua/stats.py -- definitions added by this change.
# They rely on the module's existing top-level imports (sys, pickle, yaml)
# plus the one new import below.
from decimal import Decimal

def corpusStats (args):
    """
    Get corpus stats from stat files

    Reads pickled statistics from stdin and YAML metadata from the open
    file args.metadata, adds a 'stats' mapping with the total number of
    characters and words to the metadata and dumps the result to stdout as
    YAML.
    """
    # NOTE: unpickling can execute arbitrary code; only feed this stat files
    # that were produced by this tool.
    stats = pickle.load (sys.stdin.buffer)
    meta = yaml.safe_load (args.metadata)

    meta['stats'] = dict (
            characters=sum (stats['simple'].combinations.values ()),
            words=sum (stats['words'].words.values ()),
            )

    yaml.dump (meta, sys.stdout)
    # make document concatable
    print ('---')

def approx (i):
    """
    Get approximate human-readable string for large number

    Returns a tuple (integer part, first decimal digit, unit name), with the
    value scaled to the largest fitting unit and rounded to one decimal
    place. Numbers beyond the largest unit stay in that unit, i.e.
    approx (10**12) is (1000, 0, 'billion').
    """

    units = ('', 'thousand', 'million', 'billion')
    last = len (units) - 1
    base = Decimal (1000)
    value = Decimal (i)
    unit = 0
    while value >= base and unit < last:
        value /= base
        unit += 1
    value = round (value, 1)
    # Rounding can carry into the next magnitude (999.95 thousand becomes
    # 1000.0 thousand), which must be shown as 1.0 of the next unit.
    if value >= base and unit < last:
        value = round (value / base, 1)
        unit += 1
    return int (value), int (value%1*10), units[unit]

def _approxCells (n):
    """ Format n as two table cells: integer part and fraction with unit """
    whole, frac, unit = approx (n)
    return f'<td class="numint">{whole}.</td><td class="numfrac">{frac}\u202f{unit}</td>'

def corpusHtml (args):
    """
    Print HTML table of all corpora

    Reads a stream of YAML documents (as written by corpusStats) from stdin
    and prints one table row per corpus, sorted case-insensitively by source
    name, followed by a row with the total word and character counts.
    """
    from html import escape

    fields = ('words', 'characters')
    meta = [doc for doc in yaml.safe_load_all (sys.stdin) if doc is not None]
    total = dict.fromkeys (fields, 0)
    print ('<table class="pure-table"><thead><tr><th>Source</th><th colspan="2"></th><th colspan="2">Words</th><th colspan="2">Characters</th></tr></thead><tbody>')
    for c in sorted (meta, key=lambda x: x['source']['name'].lower ()):
        source = c['source']
        print ('<tr>')
        # metadata is free-form text, escape it before embedding into markup
        print (f'<td><a href="{escape (str (source["url"]))}">{escape (str (source["name"]))}</a></td>')
        count = c.get ('count')
        if count:
            # count is a pair (number, unit name)
            print (f'<td class="numint">{count[0]//1000:d},</td><td class="numfrac">{count[0]%1000:03d}\u202f{escape (str (count[1]))}</td>')
        else:
            print ('<td class="numint"></td><td class="numfrac"></td>')

        stats = c['stats']
        for k in fields:
            print (_approxCells (stats[k]))
            total[k] += stats[k]
        print ('</tr>')

    print ('<tr><td>Total</td><td class="numint"></td><td class="numfrac"></td>')
    for k in fields:
        print (_approxCells (total[k]))
    print ('</tr>')
    print ('</tbody></table>')

# In main () the new subcommands are registered as:
#   sp = subparsers.add_parser('corpusstats')
#   sp.add_argument('metadata', type=argparse.FileType ('r'))
#   sp.set_defaults (func=corpusStats)
#   sp = subparsers.add_parser('corpushtml')
#   sp.set_defaults (func=corpusHtml)

# lulua/test_stats.py -- test added by this change.
# The test module imports approx via: from .stats import updateDictOp, approx
def test_approx ():
    assert approx (0) == (0, 0, '')
    assert approx (0.01) == (0, 0, '')
    assert approx (0.05) == (0, 1, '')
    assert approx (1) == (1, 0, '')
    assert approx (100) == (100, 0, '')
    assert approx (999.9) == (999, 9, '')

    assert approx (10**3) == (1, 0, 'thousand')
    assert approx (10**6) == (1, 0, 'million')
    assert approx (10**9) == (1, 0, 'billion')
    assert approx (10**12) == (1000, 0, 'billion')

    # rounding must carry over into the next unit
    assert approx (999949) == (999, 9, 'thousand')
    assert approx (999960) == (1, 0, 'million')
    assert approx (999960000) == (1, 0, 'billion')