3 files changed, 26 insertions, 11 deletions
diff --git a/doc/index.html b/doc/index.html
index 6749647..19151b0 100644
--- a/doc/index.html
+++ b/doc/index.html
@@ -120,16 +120,18 @@
 		The corpus used for the following analysis consists of
 		</p>
 		<ul>
-			<li>547,110 articles from
-			<a href="https://www.aljazeera.net/">aljazeera.net</a>, an
-			Arabic-language news site</li>
-			<li>149,901 articles from <a href="http://www.bbc.com/arabic">BBC
-			Arabic</a>, another Arabic-language news site</li>
 			<li><a href="https://dumps.wikimedia.org/arwiki/20190701/">a
 			dump</a> of the <a href="https://ar.wikipedia.org/">Arabic
 			Wikipedia</a> as of July 2019, extracted using
 			<a href="https://github.com/attardi/wikiextractor/tree/3162bb6c3c9ebd2d15be507aa11d6fa818a454ac">wikiextractor</a>
 			containing 857,386 articles</li>
+			<li>547,110 articles from
+			<a href="https://www.aljazeera.net/">aljazeera.net</a>, an
+			Arabic-language news site</li>
+			<li>149,901 articles from <a href="http://www.bbc.com/arabic">BBC
+			Arabic</a>, another Arabic-language news site</li>
+			<li>116,754 documents from the
+			<a href="https://conferences.unite.un.org/UNCorpus/en/DownloadOverview">United Nations Parallel Corpus v1.0</a></li>
 			<li>1,709 ebooks from <a
 			href="https://www.hindawi.org/books">hindawi.org</a></li>
 			<li>and a plain-text copy of the Quran from <a
@@ -137,12 +139,14 @@
 			options Simple Enhanced and Text (for inclusion of diacritics)</li>
 		</ul>
 		<p>
-		summing up to roughly two billion characters.
+		summing up to roughly
+		825 million words or
+		5.5 billion characters. <!-- == combined button presses -->
 		<!-- -->
 		The plot below shows <bdo dir="ltr" lang="ar">ا ل ي م و ن</bdo> can be
 		considered the most frequently used letters in the Arabic language.
 		<!-- -->
-		Together they account for more than 50% of all letters in the corpus.
+		Together they account for more than 55% of all letters in the corpus.
 		</p>
 	</div>
 	</div>
diff --git a/gen.sh b/gen.sh
index 3500b22..588b8ba 100755
--- a/gen.sh
+++ b/gen.sh
@@ -56,6 +56,10 @@ rule write-tanzil
     command = echo \$in | lulua-write text \$layout | lulua-analyze combine > \$out
     pool = write
 
+rule write-tei2
+    command = find \$in -type f -name '*.xml' | lulua-write tei2 \$layout | lulua-analyze combine > \$out
+    pool = write
+
 rule write-arwiki
     command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out
     pool = write
@@ -122,7 +126,10 @@ build \$statsdir/${l}/tanzil.pickle: write-tanzil \$corpusdir/tanzil-quaran/plai
 build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l}
     layout = ${l}
 
-build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle || \$statsdir/${l}
+build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei || \$statsdir/${l}
+    layout = ${l}
+
+build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle || \$statsdir/${l}
 
 build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build
     layout = ${l}
diff --git a/lulua/stats.py b/lulua/stats.py
index 06168c2..6665ac8 100644
--- a/lulua/stats.py
+++ b/lulua/stats.py
@@ -218,13 +218,16 @@ def pretty (args):
     writer = Writer (layout)
 
     buttonPresses = sum (stats['simple'].buttons.values ())
+    print ('button presses', buttonPresses)
     for k, v in sorted (stats['simple'].buttons.items (), key=itemgetter (1)):
         print (f'{k} {v:10d} {v/buttonPresses*100:5.1f}%')
-    print ('combinations')
+
     combinationTotal = sum (stats['simple'].combinations.values ())
+    print ('combinations', combinationTotal)
     for k, v in sorted (stats['simple'].combinations.items (), key=itemgetter (1)):
         t = displayText (layout.getText (k))
         print (f'{t:4s} {k} {v:10d} {v/combinationTotal*100:5.1f}%')
+
     print ('unknown')
     for k, v in sorted (stats['simple'].unknown.items (), key=itemgetter (1)):
         print (f'{k!r} {v:10d}')
@@ -245,9 +248,10 @@ def pretty (args):
     for triad, count in sorted (stats['triads'].triads.items (), key=itemgetter (1)):
         print (f'{triad} {count:10d}')
 
-    print ('words')
+    totalWords = sum (stats['words'].words.values ())
+    print ('words', totalWords)
     for word, count in sorted (stats['words'].words.items (), key=itemgetter (1)):
-        print (f'{word} {count:10d}')
+        print (f'{word:20s} {count/totalWords*100:2.5f} {count:10d}')
 
     effort = Carpalx (models['mod01'], writer)
     effort.addTriads (stats['triads'].triads)