From 14daa5644598836fd6321038c6b0a496c7874374 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sun, 10 Nov 2019 09:44:35 +0100 Subject: doc: Auto-generate corpus table --- doc/index.html | 33 +++++------------------------- doc/style.css | 11 ++++++++++ gen.sh | 28 ++++++++++++++++++++++++-- lulua/stats.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++ lulua/test_stats.py | 15 +++++++++++++- 5 files changed, 114 insertions(+), 31 deletions(-) diff --git a/doc/index.html b/doc/index.html index a390ddf..e930892 100644 --- a/doc/index.html +++ b/doc/index.html @@ -7,9 +7,8 @@ - - - + + @@ -119,32 +118,10 @@ The corpus used for the following analysis consists of

- + + #include "corpus.html" +

- summing up to roughly - 1.2 billion words or - 7.6 billion characters. - The plot below shows ا ل ي م و ن can be considered the most frequently used letters in the Arabic language. diff --git a/doc/style.css b/doc/style.css index 4f9d63f..d6adf0f 100644 --- a/doc/style.css +++ b/doc/style.css @@ -141,3 +141,14 @@ div.fingerhandstats .fingers .thumb { border: 0.1em solid #2aa198; /* cyan */ } +.pure-table td.numint { + text-align: right; + padding-right: 0; +} + +.pure-table td.numfrac { + border-left: none; + text-align: left; + padding-left: 0; +} + diff --git a/gen.sh b/gen.sh index 762eecc..de1fed0 100755 --- a/gen.sh +++ b/gen.sh @@ -3,6 +3,7 @@ layouts="ar-lulua ar-asmo663 ar-linux ar-malas ar-phonetic ar-osman ar-khorshid" layoutsXmodmap="ar-lulua" +corpora="`ls corpus`" cat < \$out +rule analyze-corpusstats + command = lulua-analyze -l ar-lulua corpusstats \$metadata < \$stats > \$out + +rule analyze-corpushtml + command = cat \$in | lulua-analyze -l ar-lulua corpushtml > \$out + rule wordlist command = lulua-analyze -l ar-lulua latinime < \$in > \$out @@ -111,6 +118,7 @@ build \$docdir/_build/fonts/IBMPlexArabic-Regular.woff2: cp \$fontdir/IBMPlexAra build \$docdir/_build/fonts/IBMPlexArabic-Thin.woff2: cp \$fontdir/IBMPlexArabic-Thin.woff2 || \$docdir/_build/fonts EOF +# targets for every layout for l in $layouts; do cat <= base and len (units) > 1: + i /= base + units.pop (0) + i = round (i, 1) + return int (i), int (i%1*10), units[0] + +def corpusHtml (args): + meta = list (filter (lambda x: x is not None, yaml.safe_load_all (sys.stdin))) + total = {'words': 0, 'characters': 0} + print ('') + for c in sorted (meta, key=lambda x: x['source']['name'].lower ()): + print ('') + print (f'') + count = c.get ('count') + if count: + print (f'') + else: + print ('') + + stats = c.get ('stats') + for k in ('words', 'characters'): + i = approx (stats[k]) + print (f'') + print ('') + + for k in ('words', 'characters'): + total[k] += c['stats'][k] + print ('') + for k in ('words', 'characters'): + i = approx (total[k]) + print (f'') + print ('') + print ('
SourceWordsCharacters
{c["source"]["name"]}{count[0]//1000:d},{count[0]%1000:03d}\u202f{count[1]}{i[0]}.{i[1]}\u202f{i[2]}
Total{i[0]}.{i[1]}\u202f{i[2]}
') + def main (): parser = argparse.ArgumentParser(description='Process statistics files.') parser.add_argument('-l', '--layout', metavar='LAYOUT', help='Keyboard layout name') @@ -350,6 +403,11 @@ def main (): sp.set_defaults (func=fingerHand) sp = subparsers.add_parser('latinime') sp.set_defaults (func=latinImeDict) + sp = subparsers.add_parser('corpusstats') + sp.add_argument('metadata', type=argparse.FileType ('r')) + sp.set_defaults (func=corpusStats) + sp = subparsers.add_parser('corpushtml') + sp.set_defaults (func=corpusHtml) logging.basicConfig (level=logging.INFO) args = parser.parse_args() diff --git a/lulua/test_stats.py b/lulua/test_stats.py index 2fff6ce..9e3ed77 100644 --- a/lulua/test_stats.py +++ b/lulua/test_stats.py @@ -21,7 +21,7 @@ import operator import pytest -from .stats import updateDictOp +from .stats import updateDictOp, approx def test_updateDictOp (): a = {1: 3} @@ -37,3 +37,16 @@ def test_updateDictOp (): assert a == {'foo': {1: 3+7}} assert b == {'foo': {1: 7}} +def test_approx (): + assert approx (0) == (0, 0, '') + assert approx (0.01) == (0, 0, '') + assert approx (0.05) == (0, 1, '') + assert approx (1) == (1, 0, '') + assert approx (100) == (100, 0, '') + assert approx (999.9) == (999, 9, '') + + assert approx (10**3) == (1, 0, 'thousand') + assert approx (10**6) == (1, 0, 'million') + assert approx (10**9) == (1, 0, 'billion') + assert approx (10**12) == (1000, 0, 'billion') + -- cgit v1.2.3