From 9c09c62a00aad211484e23940e70ab8dfe1ea65c Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun <lars@6xq.net>
Date: Sun, 3 Nov 2019 21:37:30 +0100
Subject: stats: Word stats+LatinIME renderer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add word stats (well, tokens separated by non-letter symbols really, but
that’s alright) and dumping into a LatinIME-compatible dictionary file
(for Android).
---
 README.rst     |  3 ++-
 gen.sh         | 14 +++++++++---
 lulua/stats.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 82 insertions(+), 6 deletions(-)

diff --git a/README.rst b/README.rst
index e26a49b..123246a 100644
--- a/README.rst
+++ b/README.rst
@@ -31,7 +31,8 @@ To get a pretty picture (SVG) of your layout render it:
 
    lulua-render -l evolved.yaml svg evolved.svg
 
-It is highly recommended to use pypy3_ instead of CPython.
+It is highly recommended to use pypy3_ instead of CPython and a machine with
+lots of RAM (at least 16 GB).
 
 .. _pypy3: http://pypy.org/
 
diff --git a/gen.sh b/gen.sh
index df59e57..3500b22 100755
--- a/gen.sh
+++ b/gen.sh
@@ -40,13 +40,12 @@ rule render-xmodmap
 rule analyze-heat
     command = lulua-analyze -l \$layout keyheatmap < \$in > \$out
 
-# XXX: add lulua-analyze combine here
 rule write-bbcarabic
-    command = find \$in -type f | lulua-write bbcarabic \$layout > \$out
+    command = find \$in -type f | lulua-write bbcarabic \$layout | lulua-analyze combine > \$out
     pool = write
 
 rule write-aljazeera
-    command = find \$in -type f | lulua-write aljazeera \$layout > \$out
+    command = find \$in -type f | lulua-write aljazeera \$layout | lulua-analyze combine > \$out
     pool = write
 
 rule write-epub
@@ -73,6 +72,9 @@ rule letterfreq
 rule analyze-fingerhand
     command = lulua-analyze -l \$layout fingerhand < \$in > \$out
 
+rule wordlist
+    command = lulua-analyze -l ar-lulua latinime < \$in > \$out
+
 rule cpp
     command = gcc -E -x c -nostdinc -MMD -MF \$out.d -C -P -I \$docdir/_temp \$in -o \$out
     depfile = \$out.d
@@ -81,6 +83,9 @@ rule cpp
 rule cp
     command = cp \$in \$out
 
+rule gz
+    command = gzip -c \$in > \$out
+
 ### build targets ###
 build \$docdir/_build: mkdir
 build \$docdir/_build/fonts: mkdir
@@ -89,6 +94,9 @@ build \$docdir/_build/index.html: cpp \$docdir/index.html || \$docdir/_build
 build \$docdir/_build/letterfreq.json: letterfreq \$statsdir/ar-lulua/all.pickle || \$docdir/_build
 build \$docdir/_build/style.css: cp \$docdir/style.css || \$docdir/_build
 build \$docdir/_build/lulua-logo.svg: cp \$docdir/lulua-logo.svg || \$docdir/_build
+# wordlist
+build \$docdir/_temp/lulua.combined: wordlist \$statsdir/ar-lulua/all.pickle || \$docdir/_temp
+build \$docdir/_build/lulua.combined.gz: gz \$docdir/_temp/lulua.combined || \$docdir/_build
 
 
 build \$docdir/_build/fonts/IBMPlexArabic-Regular.woff2: cp \$fontdir/IBMPlexArabic-Regular.woff2 || \$docdir/_build/fonts
diff --git a/lulua/stats.py b/lulua/stats.py
index 598a7cd..06168c2 100644
--- a/lulua/stats.py
+++ b/lulua/stats.py
@@ -18,7 +18,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 
-import sys, operator, pickle, argparse, logging, yaml
+import sys, operator, pickle, argparse, logging, yaml, math, time
 from operator import itemgetter
 from itertools import chain, groupby, product
 from collections import defaultdict
@@ -157,7 +157,41 @@ class TriadStats (Stats):
     def update (self, other):
         updateDictOp (self.triads, other.triads, operator.add)
 
-allStats = [SimpleStats, RunlenStats, TriadStats]
+class WordStats (Stats):
+    """
+    Count words, i.e. runs of letters (Lo) and non-spacing marks (Mn)
+    """
+
+    __slots__ = ('words', '_currentWord', '_writer')
+
+    name = 'words'
+
+    def __init__ (self, writer):
+        self._writer = writer
+
+        self._currentWord = []
+        self.words = defaultdict (int)
+
+    def process (self, event):
+        if isinstance (event, SkipEvent):
+            # reset: discard the unfinished word without counting it
+            self._currentWord = []
+        elif isinstance (event, ButtonCombination):
+            text = self._writer.layout.getText (event)
+            for t in text:
+                cat = unicodedata.category (t)
+                if cat in {'Lo', 'Mn'}:
+                    # letter (Lo, e.g. Arabic) or diacritic (non-spacing mark,
+                    # Mn); everything else is considered a word delimiter
+                    self._currentWord.append (t)
+                elif self._currentWord:
+                    self.words[''.join (self._currentWord)] += 1
+                    self._currentWord = []
+
+    def update (self, other):
+        updateDictOp (self.words, other.words, operator.add)
+
+allStats = [SimpleStats, RunlenStats, TriadStats, WordStats]
 
 def unpickleAll (fd):
     while True:
@@ -207,8 +241,14 @@ def pretty (args):
     for k, v in combined.items ():
         print (f'{k:2d} {v:10d} {v/total*100:5.1f}%')
 
+    print ('triads')
     for triad, count in sorted (stats['triads'].triads.items (), key=itemgetter (1)):
         print (f'{triad} {count:10d}')
+
+    print ('words')
+    for word, count in sorted (stats['words'].words.items (), key=itemgetter (1)):
+        print (f'{word} {count:10d}')
+
     effort = Carpalx (models['mod01'], writer)
     effort.addTriads (stats['triads'].triads)
     print ('total effort (carpalx)', effort.effort)
@@ -257,6 +297,31 @@ def fingerHand (args):
         print ('\t</div>\n\t</div>')
     print ('</div>')
 
+def latinImeDict (args):
+    """
+    Print a dictionary for Android’s LatinIME input method, built from the
+    stats pickled on stdin (WordStats), most frequent word first;
+    see https://android.googlesource.com/platform/packages/inputmethods/LatinIME/+/master/dictionaries/sample.combined
+    """
+
+    def f (p):
+        """
+        Word probability to logarithmic f-value, i.e. the equation below
+        solved for f and rounded to the nearest integer:
+        p = 1/(1.15^(255-f))
+        """
+        return 255+int (round (math.log (p, 1.15)))
+
+    stats = pickle.load (sys.stdin.buffer)
+    now = int (round (time.time ()))
+
+    print ('# auto-generated by ' + __package__)
+    print (f'dictionary=main:ar,locale=ar,description=Arabic wordlist,date={now},version=1')
+    total = sum (stats['words'].words.values ())
+    for word, count in sorted (stats['words'].words.items (), key=itemgetter (1), reverse=True):
+        p = count/total
+        print (f' word={word},f={f(p)}')
+
 def main ():
     parser = argparse.ArgumentParser(description='Process statistics files.')
     parser.add_argument('-l', '--layout', metavar='LAYOUT', help='Keyboard layout name')
@@ -279,6 +344,8 @@ def main ():
     sp.set_defaults (func=keyHeatmap)
     sp = subparsers.add_parser('fingerhand')
     sp.set_defaults (func=fingerHand)
+    sp = subparsers.add_parser('latinime')
+    sp.set_defaults (func=latinImeDict)
 
     logging.basicConfig (level=logging.INFO)
     args = parser.parse_args()
-- 
cgit v1.2.3