From e31d8731531b41a909bfe33ddc134de07f0a7bab Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Wed, 6 Nov 2019 19:18:08 +0100 Subject: Add United Nations Parallel Corpus v1.0 See issue #5. --- doc/index.html | 18 +++++++++++------- gen.sh | 9 ++++++++- lulua/stats.py | 10 +++++++--- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/doc/index.html b/doc/index.html index 6749647..19151b0 100644 --- a/doc/index.html +++ b/doc/index.html @@ -120,16 +120,18 @@ The corpus used for the following analysis consists of

- summing up to roughly two billion characters. + summing up to roughly + 825 million words or + 5.5 billion characters. The plot below shows ا ل ي م و ن can be considered the most frequently used letters in the Arabic language. - Together they account for more than 50% of all letters in the corpus. + Together they account for more than 55% of all letters in the corpus.

diff --git a/gen.sh b/gen.sh index 3500b22..588b8ba 100755 --- a/gen.sh +++ b/gen.sh @@ -56,6 +56,10 @@ rule write-tanzil command = echo \$in | lulua-write text \$layout | lulua-analyze combine > \$out pool = write +rule write-tei2 + command = find \$in -type f -name '*.xml' | lulua-write tei2 \$layout | lulua-analyze combine > \$out + pool = write + rule write-arwiki command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out pool = write @@ -122,7 +126,10 @@ build \$statsdir/${l}/tanzil.pickle: write-tanzil \$corpusdir/tanzil-quaran/plai build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l} layout = ${l} -build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle || \$statsdir/${l} +build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei || \$statsdir/${l} + layout = ${l} + +build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle || \$statsdir/${l} build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build layout = ${l} diff --git a/lulua/stats.py b/lulua/stats.py index 06168c2..6665ac8 100644 --- a/lulua/stats.py +++ b/lulua/stats.py @@ -218,13 +218,16 @@ def pretty (args): writer = Writer (layout) buttonPresses = sum (stats['simple'].buttons.values ()) + print ('button presses', buttonPresses) for k, v in sorted (stats['simple'].buttons.items (), key=itemgetter (1)): print (f'{k} {v:10d} {v/buttonPresses*100:5.1f}%') - print ('combinations') + combinationTotal = sum (stats['simple'].combinations.values ()) + print ('combinations', combinationTotal) for k, v in sorted (stats['simple'].combinations.items (), key=itemgetter (1)): t = displayText (layout.getText (k)) print (f'{t:4s} {k} {v:10d} {v/combinationTotal*100:5.1f}%') + print ('unknown') for k, v in sorted (stats['simple'].unknown.items (), key=itemgetter (1)): print (f'{k!r} {v:10d}') @@ -245,9 +248,10 @@ def pretty (args): for triad, count in sorted (stats['triads'].triads.items (), key=itemgetter (1)): print (f'{triad} {count:10d}') - print ('words') + totalWords = sum (stats['words'].words.values ()) + print ('words', totalWords) for word, count in sorted (stats['words'].words.items (), key=itemgetter (1)): - print (f'{word} {count:10d}') + print (f'{word:20s} {count/totalWords*100:2.5f} {count:10d}') effort = Carpalx (models['mod01'], writer) effort.addTriads (stats['triads'].triads) -- cgit v1.2.3