plot: Add cumulative letter stats

author: Lars-Dominik Braun <lars@6xq.net> 2019-09-26 10:11:31 +0200
committer: Lars-Dominik Braun <lars@6xq.net> 2019-09-26 10:11:31 +0200
commit: 3502bf134512597ae445eee8d015c688c4a7bcfe (patch)
tree: b318e4a0033afea2b496f3e8fe1a69cb6a314024
parent: 33f56e596e5228d82f504926370cecc166766879 (diff)
download: lulua-3502bf134512597ae445eee8d015c688c4a7bcfe.tar.gz lulua-3502bf134512597ae445eee8d015c688c4a7bcfe.tar.bz2 lulua-3502bf134512597ae445eee8d015c688c4a7bcfe.zip
2 files changed, 32 insertions, 9 deletions
diff --git a/doc/index.html b/doc/index.html
index 4e14658..474e444 100644
--- a/doc/index.html
+++ b/doc/index.html
@@ -137,8 +137,10 @@
 		<p>
 		summing up to roughly 1.5 billion characters.
 		<!-- -->
-		The plot below shows <bdo dir="ltr" lang="ar">ا ل ي و م ن</bdo> can be
+		The plot below shows <bdo dir="ltr" lang="ar">ا ل ي م و ن</bdo> can be
 		considered the most frequently used letters in the Arabic language.
+		<!-- -->
+		Together they account for more than 50% of all letters in the corpus.
 		</p>
 	</div>
 	</div>
diff --git a/lulua/plot.py b/lulua/plot.py
index 2a7d678..885500a 100644
--- a/lulua/plot.py
+++ b/lulua/plot.py
@@ -21,7 +21,7 @@
 import sys, argparse, json, unicodedata, pickle, logging
 from operator import itemgetter
 from bokeh.plotting import figure
-from bokeh.models import ColumnDataSource
+from bokeh.models import ColumnDataSource, LinearAxis, Range1d
 from bokeh.embed import json_item
 
 from .layout import *
@@ -42,10 +42,6 @@ def letterfreq (args):
     keyboard = defaultKeyboards['ibmpc105']
     layout = defaultLayouts[args.layout].specialize (keyboard)
 
-    xdata = []
-    xlabel = []
-    ydata = []
-    ydataAbs = []
 
     # letter-based binning, in case multiple buttons are mapped to the same
     # letter.
@@ -59,15 +55,40 @@ def letterfreq (args):
     combinationTotal = sum (binned.values ())
     logging.info (f'total binned combinations {combinationTotal}')
 
+    xdata = []
+    xlabel = []
+    ydata = []
+    ydataAbs = []
+    ydataCumAbs = []
+    ydataCumRel = []
+
+    cumSum = combinationTotal
     for i, (k, v) in enumerate (sorted (binned.items (), key=itemgetter (1))):
         xdata.append (i)
         xlabel.append (k)
-        ydata.append (v/combinationTotal*100)
+        ydata.append (v/combinationTotal)
         ydataAbs.append (v)
 
-    source = ColumnDataSource(data=dict(x=xdata, letters=xlabel, rel=ydata, abs=ydataAbs))
-    p = figure(plot_width=1000, plot_height=500, x_range=xlabel, sizing_mode='scale_both', tooltips=[('frequency', '@rel%'), ('count', '@abs')])
+        # cumulative
+        ydataCumAbs.append (cumSum)
+        ydataCumRel.append (cumSum/combinationTotal)
+        cumSum -= v
+
+    source = ColumnDataSource(data=dict(x=xdata, letters=xlabel, rel=ydata, abs=ydataAbs, cum=ydataCumAbs, cumRel=ydataCumRel))
+    p = figure(
+            plot_width=1000,
+            plot_height=500,
+            x_range=xlabel,
+            y_range=(0, max (ydata)),
+            sizing_mode='scale_both',
+            tooltips=[('frequency', '@rel'), ('cumulative', '@cumRel'), ('count', '@abs')],
+            )
     p.vbar(x='letters', width=0.5, top='rel', color="#dc322f", source=source)
+
+    p.extra_y_ranges = {"cum": Range1d()}
+    p.line ('letters', 'cumRel', source=source, y_range_name='cum', line_width=2)
+    p.add_layout(LinearAxis(y_range_name="cum"), 'right')
+
     p.xgrid.grid_line_color = None
     p.xaxis.major_label_text_font_size = "2em"
     p.xaxis.major_label_text_font_size = "2em"
author	Lars-Dominik Braun <lars@6xq.net>	2019-09-26 10:11:31 +0200
committer	Lars-Dominik Braun <lars@6xq.net>	2019-09-26 10:11:31 +0200
commit	3502bf134512597ae445eee8d015c688c4a7bcfe (patch)
tree	b318e4a0033afea2b496f3e8fe1a69cb6a314024
parent	33f56e596e5228d82f504926370cecc166766879 (diff)
download	lulua-3502bf134512597ae445eee8d015c688c4a7bcfe.tar.gz lulua-3502bf134512597ae445eee8d015c688c4a7bcfe.tar.bz2 lulua-3502bf134512597ae445eee8d015c688c4a7bcfe.zip