diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2019-11-06 19:18:08 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2019-11-08 21:34:11 +0100 |
commit | e31d8731531b41a909bfe33ddc134de07f0a7bab (patch) | |
tree | 56da3225cfdd4e239c78173803412c1f9e1b5e36 /gen.sh | |
parent | 43ad3e898a28798ac2f928041999997c24e7bf3c (diff) | |
download | lulua-e31d8731531b41a909bfe33ddc134de07f0a7bab.tar.gz lulua-e31d8731531b41a909bfe33ddc134de07f0a7bab.tar.bz2 lulua-e31d8731531b41a909bfe33ddc134de07f0a7bab.zip |
Add United Nations Parallel Corpus v1.0
See issue #5.
Diffstat (limited to 'gen.sh')
-rwxr-xr-x | gen.sh | 9 |
1 file changed, 8 insertions, 1 deletion
```diff
@@ -56,6 +56,10 @@ rule write-tanzil
     command = echo \$in | lulua-write text \$layout | lulua-analyze combine > \$out
     pool = write
 
+rule write-tei2
+    command = find \$in -type f -name '*.xml' | lulua-write tei2 \$layout | lulua-analyze combine > \$out
+    pool = write
+
 rule write-arwiki
     command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out
     pool = write
@@ -122,7 +126,10 @@ build \$statsdir/${l}/tanzil.pickle: write-tanzil \$corpusdir/tanzil-quaran/plai
 build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l}
     layout = ${l}
 
-build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle || \$statsdir/${l}
+build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei || \$statsdir/${l}
+    layout = ${l}
+
+build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle || \$statsdir/${l}
 
 build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build
     layout = ${l}
```