From 14daa5644598836fd6321038c6b0a496c7874374 Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
- summing up to roughly
- 1.2 billion words or
- 7.6 billion characters.
-
The plot below shows that ا ل ي م و ن can be
considered the most frequently used letters in the Arabic language.
diff --git a/doc/style.css b/doc/style.css
index 4f9d63f..d6adf0f 100644
--- a/doc/style.css
+++ b/doc/style.css
@@ -141,3 +141,14 @@ div.fingerhandstats .fingers .thumb {
border: 0.1em solid #2aa198; /* cyan */
}
+.pure-table td.numint {
+ text-align: right;
+ padding-right: 0;
+}
+
+.pure-table td.numfrac {
+ border-left: none;
+ text-align: left;
+ padding-left: 0;
+}
+
diff --git a/gen.sh b/gen.sh
index 762eecc..de1fed0 100755
--- a/gen.sh
+++ b/gen.sh
@@ -3,6 +3,7 @@
layouts="ar-lulua ar-asmo663 ar-linux ar-malas ar-phonetic ar-osman ar-khorshid"
layoutsXmodmap="ar-lulua"
+corpora="`ls corpus`"
cat <
')
+
def main ():
parser = argparse.ArgumentParser(description='Process statistics files.')
parser.add_argument('-l', '--layout', metavar='LAYOUT', help='Keyboard layout name')
@@ -350,6 +403,11 @@ def main ():
sp.set_defaults (func=fingerHand)
sp = subparsers.add_parser('latinime')
sp.set_defaults (func=latinImeDict)
+ sp = subparsers.add_parser('corpusstats')
+ sp.add_argument('metadata', type=argparse.FileType ('r'))
+ sp.set_defaults (func=corpusStats)
+ sp = subparsers.add_parser('corpushtml')
+ sp.set_defaults (func=corpusHtml)
logging.basicConfig (level=logging.INFO)
args = parser.parse_args()
diff --git a/lulua/test_stats.py b/lulua/test_stats.py
index 2fff6ce..9e3ed77 100644
--- a/lulua/test_stats.py
+++ b/lulua/test_stats.py
@@ -21,7 +21,7 @@
import operator
import pytest
-from .stats import updateDictOp
+from .stats import updateDictOp, approx
def test_updateDictOp ():
a = {1: 3}
@@ -37,3 +37,16 @@ def test_updateDictOp ():
assert a == {'foo': {1: 3+7}}
assert b == {'foo': {1: 7}}
+def test_approx ():
+ assert approx (0) == (0, 0, '')
+ assert approx (0.01) == (0, 0, '')
+ assert approx (0.05) == (0, 1, '')
+ assert approx (1) == (1, 0, '')
+ assert approx (100) == (100, 0, '')
+ assert approx (999.9) == (999, 9, '')
+
+ assert approx (10**3) == (1, 0, 'thousand')
+ assert approx (10**6) == (1, 0, 'million')
+ assert approx (10**9) == (1, 0, 'billion')
+ assert approx (10**12) == (1000, 0, 'billion')
+
--
cgit v1.2.3
')
+ for c in sorted (meta, key=lambda x: x['source']['name'].lower ()):
+ print ('Source Words Characters ')
+ print (f' ')
+
+ for k in ('words', 'characters'):
+ total[k] += c['stats'][k]
+ print ('{c["source"]["name"]} ')
+ count = c.get ('count')
+ if count:
+ print (f'{count[0]//1000:d}, {count[0]%1000:03d}\u202f{count[1]} ')
+ else:
+ print ('')
+
+ stats = c.get ('stats')
+ for k in ('words', 'characters'):
+ i = approx (stats[k])
+ print (f' {i[0]}. {i[1]}\u202f{i[2]} ')
+ print (' ')
+ print ('Total ')
+ for k in ('words', 'characters'):
+ i = approx (total[k])
+ print (f' {i[0]}. {i[1]}\u202f{i[2]} ')
+ print ('