summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2019-11-06 19:18:08 +0100
committer Lars-Dominik Braun <lars@6xq.net> 2019-11-08 21:34:11 +0100
commit e31d8731531b41a909bfe33ddc134de07f0a7bab (patch)
tree 56da3225cfdd4e239c78173803412c1f9e1b5e36
parent 43ad3e898a28798ac2f928041999997c24e7bf3c (diff)
download lulua-e31d8731531b41a909bfe33ddc134de07f0a7bab.tar.gz
lulua-e31d8731531b41a909bfe33ddc134de07f0a7bab.tar.bz2
lulua-e31d8731531b41a909bfe33ddc134de07f0a7bab.zip
Add United Nations Parallel Corpus v1.0
See issue #5.
-rw-r--r-- doc/index.html 18
-rwxr-xr-x gen.sh 9
-rw-r--r-- lulua/stats.py 10
3 files changed, 26 insertions, 11 deletions
diff --git a/doc/index.html b/doc/index.html
index 6749647..19151b0 100644
--- a/doc/index.html
+++ b/doc/index.html
@@ -120,16 +120,18 @@
The corpus used for the following analysis consists of
</p>
<ul>
- <li>547,110 articles from
- <a href="https://www.aljazeera.net/">aljazeera.net</a>, an
- Arabic-language news site</li>
- <li>149,901 articles from <a href="http://www.bbc.com/arabic">BBC
- Arabic</a>, another Arabic-language news site</li>
<li><a href="https://dumps.wikimedia.org/arwiki/20190701/">a
dump</a> of the <a href="https://ar.wikipedia.org/">Arabic
Wikipedia</a> as of July 2019, extracted using
<a href="https://github.com/attardi/wikiextractor/tree/3162bb6c3c9ebd2d15be507aa11d6fa818a454ac">wikiextractor</a>
containing 857,386 articles</li>
+ <li>547,110 articles from
+ <a href="https://www.aljazeera.net/">aljazeera.net</a>, an
+ Arabic-language news site</li>
+ <li>149,901 articles from <a href="http://www.bbc.com/arabic">BBC
+ Arabic</a>, another Arabic-language news site</li>
+ <li>116,754 documents from the
+ <a href="https://conferences.unite.un.org/UNCorpus/en/DownloadOverview">United Nations Parallel Corpus v1.0</a></li>
<li>1,709 ebooks from <a
href="https://www.hindawi.org/books">hindawi.org</a></li>
<li>and a plain-text copy of the Quran from <a
@@ -137,12 +139,14 @@
options Simple Enhanced and Text (for inclusion of diacritics)</li>
</ul>
<p>
- summing up to roughly two billion characters.
+ summing up to roughly
+ 825 million words or
+ 5.5 billion characters. <!-- == combined button presses -->
<!-- -->
The plot below shows <bdo dir="ltr" lang="ar">ا ل ي م و ن</bdo> can be
considered the most frequently used letters in the Arabic language.
<!-- -->
- Together they account for more than 50% of all letters in the corpus.
+ Together they account for more than 55% of all letters in the corpus.
</p>
</div>
</div>
diff --git a/gen.sh b/gen.sh
index 3500b22..588b8ba 100755
--- a/gen.sh
+++ b/gen.sh
@@ -56,6 +56,10 @@ rule write-tanzil
command = echo \$in | lulua-write text \$layout | lulua-analyze combine > \$out
pool = write
+rule write-tei2
+ command = find \$in -type f -name '*.xml' | lulua-write tei2 \$layout | lulua-analyze combine > \$out
+ pool = write
+
rule write-arwiki
command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out
pool = write
@@ -122,7 +126,10 @@ build \$statsdir/${l}/tanzil.pickle: write-tanzil \$corpusdir/tanzil-quaran/plai
build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l}
layout = ${l}
-build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle || \$statsdir/${l}
+build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei || \$statsdir/${l}
+ layout = ${l}
+
+build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle || \$statsdir/${l}
build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build
layout = ${l}
diff --git a/lulua/stats.py b/lulua/stats.py
index 06168c2..6665ac8 100644
--- a/lulua/stats.py
+++ b/lulua/stats.py
@@ -218,13 +218,16 @@ def pretty (args):
writer = Writer (layout)
buttonPresses = sum (stats['simple'].buttons.values ())
+ print ('button presses', buttonPresses)
for k, v in sorted (stats['simple'].buttons.items (), key=itemgetter (1)):
print (f'{k} {v:10d} {v/buttonPresses*100:5.1f}%')
- print ('combinations')
+
combinationTotal = sum (stats['simple'].combinations.values ())
+ print ('combinations', combinationTotal)
for k, v in sorted (stats['simple'].combinations.items (), key=itemgetter (1)):
t = displayText (layout.getText (k))
print (f'{t:4s} {k} {v:10d} {v/combinationTotal*100:5.1f}%')
+
print ('unknown')
for k, v in sorted (stats['simple'].unknown.items (), key=itemgetter (1)):
print (f'{k!r} {v:10d}')
@@ -245,9 +248,10 @@ def pretty (args):
for triad, count in sorted (stats['triads'].triads.items (), key=itemgetter (1)):
print (f'{triad} {count:10d}')
- print ('words')
+ totalWords = sum (stats['words'].words.values ())
+ print ('words', totalWords)
for word, count in sorted (stats['words'].words.items (), key=itemgetter (1)):
- print (f'{word} {count:10d}')
+ print (f'{word:20s} {count/totalWords*100:2.5f} {count:10d}')
effort = Carpalx (models['mod01'], writer)
effort.addTriads (stats['triads'].triads)