summaryrefslogtreecommitdiff
path: root/lulua
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2019-11-03 21:37:30 +0100
committerLars-Dominik Braun <lars@6xq.net>2019-11-03 21:37:30 +0100
commit9c09c62a00aad211484e23940e70ab8dfe1ea65c (patch)
tree8fde0ca2358f1d3dd18def1e5dae00b50182f156 /lulua
parent80a7db488ac82fdf67a2d30fc3c9bf2709612c17 (diff)
downloadlulua-9c09c62a00aad211484e23940e70ab8dfe1ea65c.tar.gz
lulua-9c09c62a00aad211484e23940e70ab8dfe1ea65c.tar.bz2
lulua-9c09c62a00aad211484e23940e70ab8dfe1ea65c.zip
stats: Word stats+LatinIME renderer
Add word stats (well, really tokens separated by non-letter symbols, but that’s alright) and support for dumping them into a LatinIME-compatible dictionary file (for Android).
Diffstat (limited to 'lulua')
-rw-r--r--lulua/stats.py71
1 files changed, 69 insertions, 2 deletions
diff --git a/lulua/stats.py b/lulua/stats.py
index 598a7cd..06168c2 100644
--- a/lulua/stats.py
+++ b/lulua/stats.py
@@ -18,7 +18,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
-import sys, operator, pickle, argparse, logging, yaml
+import sys, operator, pickle, argparse, logging, yaml, math, time
from operator import itemgetter
from itertools import chain, groupby, product
from collections import defaultdict
@@ -157,7 +157,41 @@ class TriadStats (Stats):
def update (self, other):
updateDictOp (self.triads, other.triads, operator.add)
-allStats = [SimpleStats, RunlenStats, TriadStats]
+class WordStats (Stats):
+ """
+ Word stats
+ """
+
+ __slots__ = ('words', '_currentWord', '_writer')
+
+ name = 'words'
+
+ def __init__ (self, writer):
+ self._writer = writer
+
+ self._currentWord = []
+ self.words = defaultdict (int)
+
+ def process (self, event):
+ if isinstance (event, SkipEvent):
+ # reset
+ self._currentWord = []
+ elif isinstance (event, ButtonCombination):
+ text = self._writer.layout.getText (event)
+ for t in text:
+ cat = unicodedata.category (t)
+ if cat in {'Lo', 'Mn'}:
+ # arabic letter or diacritic (non-spacing mark), everything
+ # else is considered a word-delimiter
+ self._currentWord.append (t)
+ elif self._currentWord:
+ self.words[''.join (self._currentWord)] += 1
+ self._currentWord = []
+
+ def update (self, other):
+ updateDictOp (self.words, other.words, operator.add)
+
+allStats = [SimpleStats, RunlenStats, TriadStats, WordStats]
def unpickleAll (fd):
while True:
@@ -207,8 +241,14 @@ def pretty (args):
for k, v in combined.items ():
print (f'{k:2d} {v:10d} {v/total*100:5.1f}%')
+ print ('triads')
for triad, count in sorted (stats['triads'].triads.items (), key=itemgetter (1)):
print (f'{triad} {count:10d}')
+
+ print ('words')
+ for word, count in sorted (stats['words'].words.items (), key=itemgetter (1)):
+ print (f'{word} {count:10d}')
+
effort = Carpalx (models['mod01'], writer)
effort.addTriads (stats['triads'].triads)
print ('total effort (carpalx)', effort.effort)
@@ -257,6 +297,31 @@ def fingerHand (args):
print ('\t</div>\n\t</div>')
print ('</div>')
+def latinImeDict (args):
+ """
+ Create a dictionary for Android’s LatinIME input method from WordStats
+
+ see https://android.googlesource.com/platform/packages/inputmethods/LatinIME/+/master/dictionaries/sample.combined
+ """
+
+ def f (p):
+ """
+ Word probability to logarithmic f-value.
+
+ p = 1/(1.15^(255-f))
+ """
+ return 255+int (round (math.log (p, 1.15)))
+
+ stats = pickle.load (sys.stdin.buffer)
+ now = int (round (time.time ()))
+
+ print ('# auto-generated by ' + __package__)
+ print (f'dictionary=main:ar,locale=ar,description=Arabic wordlist,date={now},version=1')
+ total = sum (stats['words'].words.values ())
+ for word, count in sorted (stats['words'].words.items (), key=itemgetter (1), reverse=True):
+ p = count/total
+ print (f' word={word},f={f(p)}')
+
def main ():
parser = argparse.ArgumentParser(description='Process statistics files.')
parser.add_argument('-l', '--layout', metavar='LAYOUT', help='Keyboard layout name')
@@ -279,6 +344,8 @@ def main ():
sp.set_defaults (func=keyHeatmap)
sp = subparsers.add_parser('fingerhand')
sp.set_defaults (func=fingerHand)
+ sp = subparsers.add_parser('latinime')
+ sp.set_defaults (func=latinImeDict)
logging.basicConfig (level=logging.INFO)
args = parser.parse_args()