summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net>, 2019-09-26 10:11:31 +0200
committer: Lars-Dominik Braun <lars@6xq.net>, 2019-09-26 10:11:31 +0200
commit: 3502bf134512597ae445eee8d015c688c4a7bcfe (patch)
tree: b318e4a0033afea2b496f3e8fe1a69cb6a314024
parent: 33f56e596e5228d82f504926370cecc166766879 (diff)
download: lulua-3502bf134512597ae445eee8d015c688c4a7bcfe.tar.gz
lulua-3502bf134512597ae445eee8d015c688c4a7bcfe.tar.bz2
lulua-3502bf134512597ae445eee8d015c688c4a7bcfe.zip
plot: Add cumulative letter stats
-rw-r--r-- doc/index.html 4
-rw-r--r-- lulua/plot.py 37
2 files changed, 32 insertions, 9 deletions
diff --git a/doc/index.html b/doc/index.html
index 4e14658..474e444 100644
--- a/doc/index.html
+++ b/doc/index.html
@@ -137,8 +137,10 @@
<p>
summing up to roughly 1.5 billion characters.
<!-- -->
- The plot below shows <bdo dir="ltr" lang="ar">ا ل ي و م ن</bdo> can be
+ The plot below shows <bdo dir="ltr" lang="ar">ا ل ي م و ن</bdo> can be
considered the most frequently used letters in the Arabic language.
+ <!-- -->
+ Together they account for more than 50% of all letters in the corpus.
</p>
</div>
</div>
diff --git a/lulua/plot.py b/lulua/plot.py
index 2a7d678..885500a 100644
--- a/lulua/plot.py
+++ b/lulua/plot.py
@@ -21,7 +21,7 @@
import sys, argparse, json, unicodedata, pickle, logging
from operator import itemgetter
from bokeh.plotting import figure
-from bokeh.models import ColumnDataSource
+from bokeh.models import ColumnDataSource, LinearAxis, Range1d
from bokeh.embed import json_item
from .layout import *
@@ -42,10 +42,6 @@ def letterfreq (args):
keyboard = defaultKeyboards['ibmpc105']
layout = defaultLayouts[args.layout].specialize (keyboard)
- xdata = []
- xlabel = []
- ydata = []
- ydataAbs = []
# letter-based binning, in case multiple buttons are mapped to the same
# letter.
@@ -59,15 +55,40 @@ def letterfreq (args):
combinationTotal = sum (binned.values ())
logging.info (f'total binned combinations {combinationTotal}')
+ xdata = []
+ xlabel = []
+ ydata = []
+ ydataAbs = []
+ ydataCumAbs = []
+ ydataCumRel = []
+
+ cumSum = combinationTotal
for i, (k, v) in enumerate (sorted (binned.items (), key=itemgetter (1))):
xdata.append (i)
xlabel.append (k)
- ydata.append (v/combinationTotal*100)
+ ydata.append (v/combinationTotal)
ydataAbs.append (v)
- source = ColumnDataSource(data=dict(x=xdata, letters=xlabel, rel=ydata, abs=ydataAbs))
- p = figure(plot_width=1000, plot_height=500, x_range=xlabel, sizing_mode='scale_both', tooltips=[('frequency', '@rel%'), ('count', '@abs')])
+ # cumulative
+ ydataCumAbs.append (cumSum)
+ ydataCumRel.append (cumSum/combinationTotal)
+ cumSum -= v
+
+ source = ColumnDataSource(data=dict(x=xdata, letters=xlabel, rel=ydata, abs=ydataAbs, cum=ydataCumAbs, cumRel=ydataCumRel))
+ p = figure(
+ plot_width=1000,
+ plot_height=500,
+ x_range=xlabel,
+ y_range=(0, max (ydata)),
+ sizing_mode='scale_both',
+ tooltips=[('frequency', '@rel'), ('cumulative', '@cumRel'), ('count', '@abs')],
+ )
p.vbar(x='letters', width=0.5, top='rel', color="#dc322f", source=source)
+
+ p.extra_y_ranges = {"cum": Range1d()}
+ p.line ('letters', 'cumRel', source=source, y_range_name='cum', line_width=2)
+ p.add_layout(LinearAxis(y_range_name="cum"), 'right')
+
p.xgrid.grid_line_color = None
p.xaxis.major_label_text_font_size = "2em"
p.xaxis.major_label_text_font_size = "2em"