diff options
-rw-r--r-- | doc/index.html | 18 | ||||
-rwxr-xr-x | gen.sh | 9 | ||||
-rw-r--r-- | lulua/stats.py | 10 |
3 files changed, 26 insertions, 11 deletions
diff --git a/doc/index.html b/doc/index.html index 6749647..19151b0 100644 --- a/doc/index.html +++ b/doc/index.html @@ -120,16 +120,18 @@ The corpus used for the following analysis consists of </p> <ul> - <li>547,110 articles from - <a href="https://www.aljazeera.net/">aljazeera.net</a>, an - Arabic-language news site</li> - <li>149,901 articles from <a href="http://www.bbc.com/arabic">BBC - Arabic</a>, another Arabic-language news site</li> <li><a href="https://dumps.wikimedia.org/arwiki/20190701/">a dump</a> of the <a href="https://ar.wikipedia.org/">Arabic Wikipedia</a> as of July 2019, extracted using <a href="https://github.com/attardi/wikiextractor/tree/3162bb6c3c9ebd2d15be507aa11d6fa818a454ac">wikiextractor</a> containing 857,386 articles</li> + <li>547,110 articles from + <a href="https://www.aljazeera.net/">aljazeera.net</a>, an + Arabic-language news site</li> + <li>149,901 articles from <a href="http://www.bbc.com/arabic">BBC + Arabic</a>, another Arabic-language news site</li> + <li>116,754 documents from the + <a href="https://conferences.unite.un.org/UNCorpus/en/DownloadOverview">United Nations Parallel Corpus v1.0</a></li> <li>1,709 ebooks from <a href="https://www.hindawi.org/books">hindawi.org</a></li> <li>and a plain-text copy of the Quran from <a @@ -137,12 +139,14 @@ options Simple Enhanced and Text (for inclusion of diacritics)</li> </ul> <p> - summing up to roughly two billion characters. + summing up to roughly + 825 million words or + 5.5 billion characters. <!-- == combined button presses --> <!-- --> The plot below shows <bdo dir="ltr" lang="ar">ا ل ي م و ن</bdo> can be considered the most frequently used letters in the Arabic language. <!-- --> - Together they account for more than 50% of all letters in the corpus. + Together they account for more than 55% of all letters in the corpus. </p> </div> </div> @@ -56,6 +56,10 @@ rule write-tanzil command = echo \$in | lulua-write text \$layout | lulua-analyze combine > \$out pool = write +rule write-tei2 + command = find \$in -type f -name '*.xml' | lulua-write tei2 \$layout | lulua-analyze combine > \$out + pool = write + rule write-arwiki command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out pool = write @@ -122,7 +126,10 @@ build \$statsdir/${l}/tanzil.pickle: write-tanzil \$corpusdir/tanzil-quaran/plai build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l} layout = ${l} -build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle || \$statsdir/${l} +build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei || \$statsdir/${l} + layout = ${l} + +build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle || \$statsdir/${l} build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build layout = ${l} diff --git a/lulua/stats.py b/lulua/stats.py index 06168c2..6665ac8 100644 --- a/lulua/stats.py +++ b/lulua/stats.py @@ -218,13 +218,16 @@ def pretty (args): writer = Writer (layout) buttonPresses = sum (stats['simple'].buttons.values ()) + print ('button presses', buttonPresses) for k, v in sorted (stats['simple'].buttons.items (), key=itemgetter (1)): print (f'{k} {v:10d} {v/buttonPresses*100:5.1f}%') - print ('combinations') + combinationTotal = sum (stats['simple'].combinations.values ()) + print ('combinations', combinationTotal) for k, v in sorted (stats['simple'].combinations.items (), key=itemgetter (1)): t = displayText (layout.getText (k)) print (f'{t:4s} {k} {v:10d} {v/combinationTotal*100:5.1f}%') + print ('unknown') for k, v in sorted (stats['simple'].unknown.items (), key=itemgetter (1)): print (f'{k!r} {v:10d}') @@ -245,9 +248,10 @@ def pretty (args): for triad, count in sorted (stats['triads'].triads.items (), key=itemgetter (1)): print (f'{triad} {count:10d}') - print ('words') + totalWords = sum (stats['words'].words.values ()) + print ('words', totalWords) for word, count in sorted (stats['words'].words.items (), key=itemgetter (1)): - print (f'{word} {count:10d}') + print (f'{word:20s} {count/totalWords*100:2.5f} {count:10d}') effort = Carpalx (models['mod01'], writer) effort.addTriads (stats['triads'].triads) |