From 9c09c62a00aad211484e23940e70ab8dfe1ea65c Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sun, 3 Nov 2019 21:37:30 +0100 Subject: stats: Word stats+LatinIME renderer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add word stats (well, tokens separated by non-letter symbols really, but that’s alright) and dumping into a LatinIME-compatible dictionary file (for Android). --- README.rst | 3 ++- gen.sh | 14 +++++++++--- lulua/stats.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 82 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index e26a49b..123246a 100644 --- a/README.rst +++ b/README.rst @@ -31,7 +31,8 @@ To get a pretty picture (SVG) of your layout render it: lulua-render -l evolved.yaml svg evolved.svg -It is highly recommended to use pypy3_ instead of CPython. +It is highly recommended to use pypy3_ instead of CPython and a machine with +lots of RAM (at least 16 GB). .. _pypy3: http://pypy.org/ diff --git a/gen.sh b/gen.sh index df59e57..3500b22 100755 --- a/gen.sh +++ b/gen.sh @@ -40,13 +40,12 @@ rule render-xmodmap rule analyze-heat command = lulua-analyze -l \$layout keyheatmap < \$in > \$out -# XXX: add lulua-analyze combine here rule write-bbcarabic - command = find \$in -type f | lulua-write bbcarabic \$layout > \$out + command = find \$in -type f | lulua-write bbcarabic \$layout | lulua-analyze combine > \$out pool = write rule write-aljazeera - command = find \$in -type f | lulua-write aljazeera \$layout > \$out + command = find \$in -type f | lulua-write aljazeera \$layout | lulua-analyze combine > \$out pool = write rule write-epub @@ -73,6 +72,9 @@ rule letterfreq rule analyze-fingerhand command = lulua-analyze -l \$layout fingerhand < \$in > \$out +rule wordlist + command = lulua-analyze -l ar-lulua latinime < \$in > \$out + rule cpp command = gcc -E -x c -nostdinc -MMD -MF \$out.d -C -P -I \$docdir/_temp \$in -o \$out depfile = \$out.d @@ 
-81,6 +83,9 @@ rule cpp rule cp command = cp \$in \$out +rule gz + command = gzip -c \$in > \$out + ### build targets ### build \$docdir/_build: mkdir build \$docdir/_build/fonts: mkdir @@ -89,6 +94,9 @@ build \$docdir/_build/index.html: cpp \$docdir/index.html || \$docdir/_build build \$docdir/_build/letterfreq.json: letterfreq \$statsdir/ar-lulua/all.pickle || \$docdir/_build build \$docdir/_build/style.css: cp \$docdir/style.css || \$docdir/_build build \$docdir/_build/lulua-logo.svg: cp \$docdir/lulua-logo.svg || \$docdir/_build +# wordlist +build \$docdir/_temp/lulua.combined: wordlist \$statsdir/ar-lulua/all.pickle || \$docdir/_temp +build \$docdir/_build/lulua.combined.gz: gz \$docdir/_temp/lulua.combined || \$docdir/_build build \$docdir/_build/fonts/IBMPlexArabic-Regular.woff2: cp \$fontdir/IBMPlexArabic-Regular.woff2 || \$docdir/_build/fonts diff --git a/lulua/stats.py b/lulua/stats.py index 598a7cd..06168c2 100644 --- a/lulua/stats.py +++ b/lulua/stats.py @@ -18,7 +18,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
-import sys, operator, pickle, argparse, logging, yaml +import sys, operator, pickle, argparse, logging, yaml, math, time from operator import itemgetter from itertools import chain, groupby, product from collections import defaultdict @@ -157,7 +157,41 @@ class TriadStats (Stats): def update (self, other): updateDictOp (self.triads, other.triads, operator.add) -allStats = [SimpleStats, RunlenStats, TriadStats] +class WordStats (Stats): + """ + Word stats + """ + + __slots__ = ('words', '_currentWord', '_writer') + + name = 'words' + + def __init__ (self, writer): + self._writer = writer + + self._currentWord = [] + self.words = defaultdict (int) + + def process (self, event): + if isinstance (event, SkipEvent): + # reset + self._currentWord = [] + elif isinstance (event, ButtonCombination): + text = self._writer.layout.getText (event) + for t in text: + cat = unicodedata.category (t) + if cat in {'Lo', 'Mn'}: + # arabic letter or diacritic (non-spacing mark), everything + # else is considered a word-delimiter + self._currentWord.append (t) + elif self._currentWord: + self.words[''.join (self._currentWord)] += 1 + self._currentWord = [] + + def update (self, other): + updateDictOp (self.words, other.words, operator.add) + +allStats = [SimpleStats, RunlenStats, TriadStats, WordStats] def unpickleAll (fd): while True: @@ -207,8 +241,14 @@ def pretty (args): for k, v in combined.items (): print (f'{k:2d} {v:10d} {v/total*100:5.1f}%') + print ('triads') for triad, count in sorted (stats['triads'].triads.items (), key=itemgetter (1)): print (f'{triad} {count:10d}') + + print ('words') + for word, count in sorted (stats['words'].words.items (), key=itemgetter (1)): + print (f'{word} {count:10d}') + effort = Carpalx (models['mod01'], writer) effort.addTriads (stats['triads'].triads) print ('total effort (carpalx)', effort.effort) @@ -257,6 +297,31 @@ def fingerHand (args): print ('\t\n\t') print ('') +def latinImeDict (args): + """ + Create a dictionary for Android’s 
LatinIME input method from WordStats + + see https://android.googlesource.com/platform/packages/inputmethods/LatinIME/+/master/dictionaries/sample.combined + """ + + def f (p): + """ + Word probability to logarithmic f-value. + + p = 1/(1.15^(255-f)) + """ + return 255+int (round (math.log (p, 1.15))) + + stats = pickle.load (sys.stdin.buffer) + now = int (round (time.time ())) + + print ('# auto-generated by ' + __package__) + print (f'dictionary=main:ar,locale=ar,description=Arabic wordlist,date={now},version=1') + total = sum (stats['words'].words.values ()) + for word, count in sorted (stats['words'].words.items (), key=itemgetter (1), reverse=True): + p = count/total + print (f' word={word},f={f(p)}') + def main (): parser = argparse.ArgumentParser(description='Process statistics files.') parser.add_argument('-l', '--layout', metavar='LAYOUT', help='Keyboard layout name') @@ -279,6 +344,8 @@ def main (): sp.set_defaults (func=keyHeatmap) sp = subparsers.add_parser('fingerhand') sp.set_defaults (func=fingerHand) + sp = subparsers.add_parser('latinime') + sp.set_defaults (func=latinImeDict) logging.basicConfig (level=logging.INFO) args = parser.parse_args() -- cgit v1.2.3