summary | refs | log | tree | commit | diff
path: root/gen.sh
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2019-11-08 16:06:37 +0100
committer: Lars-Dominik Braun <lars@6xq.net> 2019-11-08 21:34:15 +0100
commit: 38c9ed5b042ae488ee12287bf8c19457189889aa (patch)
tree: d4f49039eec711aa7c9ee21c691f46bc89316e48 /gen.sh
parent: e31d8731531b41a909bfe33ddc134de07f0a7bab (diff)
download: lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.tar.gz
lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.tar.bz2
lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.zip
Add OpenSubtitles corpus
See issue #5.
Diffstat (limited to 'gen.sh')
-rwxr-xr-x gen.sh 9
1 files changed, 8 insertions, 1 deletions
diff --git a/gen.sh b/gen.sh
index 588b8ba..762eecc 100755
--- a/gen.sh
+++ b/gen.sh
@@ -60,6 +60,10 @@ rule write-tei2
command = find \$in -type f -name '*.xml' | lulua-write tei2 \$layout | lulua-analyze combine > \$out
pool = write
+rule write-opensubtitles
+ command = find \$in -type f -name '*.xml' | lulua-write opensubtitles \$layout | lulua-analyze combine > \$out
+ pool = write
+
rule write-arwiki
command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out
pool = write
@@ -129,7 +133,10 @@ build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-2019
build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei || \$statsdir/${l}
layout = ${l}
-build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle || \$statsdir/${l}
+build \$statsdir/${l}/opensubtitles-2018.pickle: write-opensubtitles \$corpusdir/opensubtitles-2018 || \$statsdir/${l}
+ layout = ${l}
+
+build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l}
build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build
layout = ${l}