doc: Auto-generate corpus table

author: Lars-Dominik Braun <lars@6xq.net> 2019-11-10 09:44:35 +0100
committer: Lars-Dominik Braun <lars@6xq.net> 2019-11-16 10:17:26 +0100
commit: 14daa5644598836fd6321038c6b0a496c7874374 (patch)
tree: 2309443ffec0acea7b8eda095dba17436c0deb0a
parent: 38c9ed5b042ae488ee12287bf8c19457189889aa (diff)
download: lulua-14daa5644598836fd6321038c6b0a496c7874374.tar.gz
lulua-14daa5644598836fd6321038c6b0a496c7874374.tar.bz2
lulua-14daa5644598836fd6321038c6b0a496c7874374.zip
5 files changed, 114 insertions, 31 deletions
diff --git a/doc/index.html b/doc/index.html
index a390ddf..e930892 100644
--- a/doc/index.html
+++ b/doc/index.html
@@ -7,9 +7,8 @@
   <meta name="viewport" content="width=device-width, initial-scale=1">
 
 	<link href="https://fonts.googleapis.com/css?family=IBM+Plex+Mono|IBM+Plex+Sans:100,400&display=swap" rel="stylesheet">
-	<link rel="stylesheet" href="https://unpkg.com/purecss@1.0.1/build/base-min.css" crossorigin="anonymous">
-	<link rel="stylesheet" href="https://unpkg.com/purecss@1.0.1/build/grids-min.css" crossorigin="anonymous">
-	<link rel="stylesheet" href="https://unpkg.com/purecss@1.0.1/build/grids-responsive-min.css" crossorigin="anonymous">
+	<link rel="stylesheet" href="https://unpkg.com/purecss@1.0.1/build/pure-min.css" integrity="sha384-oAOxQR6DkCoMliIh8yFnu25d7Eq/PHS21PClpwjOTeU2jRSq11vu66rf90/cZr47" crossorigin="anonymous">
+	<link rel="stylesheet" href="https://unpkg.com/purecss@1.0.1/build/grids-responsive-min.css">
 	<script src="https://cdn.pydata.org/bokeh/release/bokeh-1.3.4.min.js"></script>
 	<link rel="stylesheet" href="style.css">
 </head>
@@ -119,32 +118,10 @@
 		<!-- -->
 		The corpus used for the following analysis consists of
 		</p>
-		<ul>
-			<li><a href="https://dumps.wikimedia.org/arwiki/20190701/">a
-			dump</a> of the <a href="https://ar.wikipedia.org/">Arabic
-			Wikipedia</a> as of July 2019, extracted using
-			<a href="https://github.com/attardi/wikiextractor/tree/3162bb6c3c9ebd2d15be507aa11d6fa818a454ac">wikiextractor</a>
-			containing 857,386 articles</li>
-			<li>547,110 articles from
-			<a href="https://www.aljazeera.net/">aljazeera.net</a>, an
-			Arabic-language news site</li>
-			<li>149,901 articles from <a href="http://www.bbc.com/arabic">BBC
-			Arabic</a>, another Arabic-language news site</li>
-			<li>116,754 documents from the
-			<a href="https://conferences.unite.un.org/UNCorpus/en/DownloadOverview">United Nations Parallel Corpus v1.0</a></li>
-			<li>subtitles from 94,093 movies based on a
-			<a href="http://opus.nlpl.eu/OpenSubtitles-v2018.php">2018 OpenSubtitles dump</a></li>
-			<li>1,709 ebooks from <a
-			href="https://www.hindawi.org/books">hindawi.org</a></li>
-			<li>and a plain-text copy of the Quran from <a
-			href="http://tanzil.net/docs/download">tanzil.net</a> using the
-			options Simple Enhanced and Text (for inclusion of diacritics)</li>
-		</ul>
+
+		#include "corpus.html"
+
 		<p>
-		summing up to roughly
-		1.2 billion words or
-		7.6 billion characters. <!-- == combined button presses -->
-		<!-- -->
 		The plot below shows <bdo dir="ltr" lang="ar">ا ل ي م و ن</bdo> can be
 		considered the most frequently used letters in the Arabic language.
 		<!-- -->
diff --git a/doc/style.css b/doc/style.css
index 4f9d63f..d6adf0f 100644
--- a/doc/style.css
+++ b/doc/style.css
@@ -141,3 +141,14 @@ div.fingerhandstats .fingers .thumb {
 	border: 0.1em solid #2aa198; /* cyan */
 }
 
+.pure-table td.numint {
+	text-align: right;
+	padding-right: 0;
+}
+
+.pure-table td.numfrac {
+	border-left: none;
+	text-align: left;
+	padding-left: 0;
+}
+
diff --git a/gen.sh b/gen.sh
index 762eecc..de1fed0 100755
--- a/gen.sh
+++ b/gen.sh
@@ -3,6 +3,7 @@
 
 layouts="ar-lulua ar-asmo663 ar-linux ar-malas ar-phonetic ar-osman ar-khorshid"
 layoutsXmodmap="ar-lulua"
+corpora="`ls corpus`"
 
 cat <<EOF
 ### auto-generated by gen.sh. Do not edit. ###
@@ -80,6 +81,12 @@ rule letterfreq
 rule analyze-fingerhand
     command = lulua-analyze -l \$layout fingerhand < \$in > \$out
 
+rule analyze-corpusstats
+    command = lulua-analyze -l ar-lulua corpusstats \$metadata < \$stats > \$out
+
+rule analyze-corpushtml
+    command = cat \$in | lulua-analyze -l ar-lulua corpushtml > \$out
+
 rule wordlist
     command = lulua-analyze -l ar-lulua latinime < \$in > \$out
 
@@ -111,6 +118,7 @@ build \$docdir/_build/fonts/IBMPlexArabic-Regular.woff2: cp \$fontdir/IBMPlexAra
 build \$docdir/_build/fonts/IBMPlexArabic-Thin.woff2: cp \$fontdir/IBMPlexArabic-Thin.woff2 || \$docdir/_build/fonts
 EOF
 
+# targets for every layout
 for l in $layouts; do
 cat <<EOF
 build \$statsdir/${l}: mkdir
@@ -124,7 +132,7 @@ build \$statsdir/${l}/aljazeera.pickle: write-aljazeera \$corpusdir/aljazeera/ra
 build \$statsdir/${l}/hindawi.pickle: write-epub \$corpusdir/hindawi/raw || \$statsdir/${l}
     layout = ${l}
 
-build \$statsdir/${l}/tanzil.pickle: write-tanzil \$corpusdir/tanzil-quaran/plain.txt.lz || \$statsdir/${l}
+build \$statsdir/${l}/tanzil-quaran.pickle: write-tanzil \$corpusdir/tanzil-quaran/plain.txt.lz || \$statsdir/${l}
     layout = ${l}
 
 build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l}
@@ -136,7 +144,7 @@ build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei ||
 build \$statsdir/${l}/opensubtitles-2018.pickle: write-opensubtitles \$corpusdir/opensubtitles-2018 || \$statsdir/${l}
     layout = ${l}
 
-build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l}
+build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil-quaran.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l}
 
 build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build
     layout = ${l}
@@ -153,6 +161,7 @@ build \$docdir/_temp/${l}-fingerhand.html: analyze-fingerhand \$statsdir/${l}/al
 EOF
 done
 
+# layouts with xmodmap support
 for l in $layoutsXmodmap; do
 cat <<EOF
 build \$docdir/_build/${l}.xmodmap: render-xmodmap || \$docdir/_build
@@ -161,3 +170,18 @@ build \$docdir/_build/${l}.xmodmap: render-xmodmap || \$docdir/_build
 EOF
 done
 
+# statistics for each corpus (ar-lulua) and html rendering
+outfiles=""
+for c in $corpora; do
+cat <<EOF
+build \$docdir/_temp/metadata-$c.yaml: analyze-corpusstats \$statsdir/ar-lulua/$c.pickle \$corpusdir/$c/metadata.yaml || \$docdir/_temp \$corpusdir/$c/metadata.yaml
+    metadata = \$corpusdir/$c/metadata.yaml
+    stats = \$statsdir/ar-lulua/$c.pickle
+EOF
+outfiles+=" \$docdir/_temp/metadata-$c.yaml"
+done
+
+cat <<EOF
+build \$docdir/_temp/corpus.html: analyze-corpushtml $outfiles || \$docdir/_temp
+EOF
+
diff --git a/lulua/stats.py b/lulua/stats.py
index 6665ac8..a7980d6 100644
--- a/lulua/stats.py
+++ b/lulua/stats.py
@@ -22,6 +22,7 @@ import sys, operator, pickle, argparse, logging, yaml, math, time
 from operator import itemgetter
 from itertools import chain, groupby, product
 from collections import defaultdict
+from decimal import Decimal
 
 from .layout import *
 from .keyboard import defaultKeyboards
@@ -326,6 +327,58 @@ def latinImeDict (args):
         p = count/total
         print (f' word={word},f={f(p)}')
 
+def corpusStats (args):
+    """ Get corpus stats from stat files """
+    stats = pickle.load (sys.stdin.buffer)
+    meta = yaml.safe_load (args.metadata)
+
+    meta['stats'] = dict (characters=sum (stats['simple'].combinations.values ()),
+            words=sum (stats['words'].words.values ()))
+
+    yaml.dump (meta, sys.stdout)
+    # end with a YAML document separator so several outputs can be concatenated
+    print ('---')
+
+def approx (i):
+    """ Get approximate human-readable string for large number """
+
+    units = ['', 'thousand', 'million', 'billion']
+    base = Decimal (1000)
+    i = Decimal (i)
+    while i >= base and len (units) > 1:
+        i /= base
+        units.pop (0)
+    i = round (i, 1)
+    return int (i), int (i%1*10), units[0]
+
+def corpusHtml (args):
+    meta = list (filter (lambda x: x is not None, yaml.safe_load_all (sys.stdin)))
+    total = {'words': 0, 'characters': 0}
+    print ('<table class="pure-table"><thead><tr><th>Source</th><th colspan="2"></th><th colspan="2">Words</th><th colspan="2">Characters</th></tr></thead><tbody>')
+    for c in sorted (meta, key=lambda x: x['source']['name'].lower ()):
+        print ('<tr>')
+        print (f'<td><a href="{c["source"]["url"]}">{c["source"]["name"]}</a></td>')
+        count = c.get ('count')
+        if count:
+            print (f'<td class="numint">{count[0]//1000:d},</td><td class="numfrac">{count[0]%1000:03d}\u202f{count[1]}</td>')
+        else:
+            print ('<td class="numint"></td><td class="numfrac"></td>')
+
+        stats = c.get ('stats')
+        for k in ('words', 'characters'):
+            i = approx (stats[k])
+            print (f'<td class="numint">{i[0]}.</td><td class="numfrac">{i[1]}\u202f{i[2]}</td>')
+        print ('</tr>')
+
+        for k in ('words', 'characters'):
+            total[k] += c['stats'][k]
+    print ('<tr><td>Total</td><td class="numint"></td><td class="numfrac"></td>')
+    for k in ('words', 'characters'):
+        i = approx (total[k])
+        print (f'<td class="numint">{i[0]}.</td><td class="numfrac">{i[1]}\u202f{i[2]}</td>')
+    print ('</tr>')
+    print ('</tbody></table>')
+
 def main ():
     parser = argparse.ArgumentParser(description='Process statistics files.')
     parser.add_argument('-l', '--layout', metavar='LAYOUT', help='Keyboard layout name')
@@ -350,6 +403,11 @@ def main ():
     sp.set_defaults (func=fingerHand)
     sp = subparsers.add_parser('latinime')
     sp.set_defaults (func=latinImeDict)
+    sp = subparsers.add_parser('corpusstats')
+    sp.add_argument('metadata', type=argparse.FileType ('r'))
+    sp.set_defaults (func=corpusStats)
+    sp = subparsers.add_parser('corpushtml')
+    sp.set_defaults (func=corpusHtml)
 
     logging.basicConfig (level=logging.INFO)
     args = parser.parse_args()
diff --git a/lulua/test_stats.py b/lulua/test_stats.py
index 2fff6ce..9e3ed77 100644
--- a/lulua/test_stats.py
+++ b/lulua/test_stats.py
@@ -21,7 +21,7 @@
 import operator
 import pytest
 
-from .stats import updateDictOp
+from .stats import updateDictOp, approx
 
 def test_updateDictOp ():
     a = {1: 3}
@@ -37,3 +37,16 @@ def test_updateDictOp ():
     assert a == {'foo': {1: 3+7}}
     assert b == {'foo': {1: 7}}
 
+def test_approx ():
+    assert approx (0) == (0, 0, '')
+    assert approx (0.01) == (0, 0, '')
+    assert approx (0.05) == (0, 1, '')
+    assert approx (1) == (1, 0, '')
+    assert approx (100) == (100, 0, '')
+    assert approx (999.9) == (999, 9, '')
+
+    assert approx (10**3) == (1, 0, 'thousand')
+    assert approx (10**6) == (1, 0, 'million')
+    assert approx (10**9) == (1, 0, 'billion')
+    assert approx (10**12) == (1000, 0, 'billion')
+
author	Lars-Dominik Braun <lars@6xq.net>	2019-11-10 09:44:35 +0100
committer	Lars-Dominik Braun <lars@6xq.net>	2019-11-16 10:17:26 +0100
commit	14daa5644598836fd6321038c6b0a496c7874374 (patch)
tree	2309443ffec0acea7b8eda095dba17436c0deb0a
parent	38c9ed5b042ae488ee12287bf8c19457189889aa (diff)
download	lulua-14daa5644598836fd6321038c6b0a496c7874374.tar.gz lulua-14daa5644598836fd6321038c6b0a496c7874374.tar.bz2 lulua-14daa5644598836fd6321038c6b0a496c7874374.zip