From 1deb60037ed061c5dd973005b81c106561032ebe Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 25 Apr 2020 08:14:27 +0200 Subject: Improve lulua-write Introduce composable filters, switch to brotli-compressed tarballs, which has good ratios and fast decompression, reducing I/O significantly. --- gen.sh | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'gen.sh') diff --git a/gen.sh b/gen.sh index 24ba833..2bf35a0 100755 --- a/gen.sh +++ b/gen.sh @@ -47,35 +47,35 @@ rule analyze-heat command = lulua-analyze -l \$layout keyheatmap < \$in > \$out rule write-bbcarabic - command = find \$in -type f | lulua-write bbcarabic \$layout | lulua-analyze combine > \$out + command = find \$in | lulua-write \$layout file brotli tar bbcarabic | lulua-analyze combine > \$out pool = write rule write-aljazeera - command = find \$in -type f | lulua-write aljazeera \$layout | lulua-analyze combine > \$out + command = find \$in | lulua-write \$layout file brotli tar aljazeera | lulua-analyze combine > \$out pool = write rule write-epub - command = find \$in -type f | lulua-write epub \$layout | lulua-analyze combine > \$out + command = find \$in | lulua-write \$layout epub | lulua-analyze combine > \$out pool = write rule write-tanzil - command = echo \$in | lulua-write text \$layout | lulua-analyze combine > \$out + command = find \$in | lulua-write \$layout file text | lulua-analyze combine > \$out pool = write rule write-tei2 - command = find \$in -type f -name '*.xml' | lulua-write tei2 \$layout | lulua-analyze combine > \$out + command = find \$in | lulua-write \$layout file brotli tar xml tei2 | lulua-analyze combine > \$out pool = write rule write-opensubtitles - command = find \$in -type f -name '*.xml' | lulua-write opensubtitles \$layout | lulua-analyze combine > \$out + command = find \$in | lulua-write \$layout file brotli tar xml opensubtitles | lulua-analyze combine > \$out pool = write rule write-arwiki - command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out + command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write \$layout json | lulua-analyze combine > \$out pool = write rule write-osm - command = \$osmconvert --csv='name:ar' \$in | sort -u | lulua-write lines \$layout | lulua-analyze combine > \$out + command = \$osmconvert --csv='name:ar' \$in | sort -u | lulua-write \$layout lines | lulua-analyze combine > \$out pool = write rule combine @@ -152,21 +152,27 @@ build \$reportdir/ar-lulua-w64.zip: zipR \$tempdir/ar-lulua-w64 | $deps EOF +bbcarabicfiles=`find $corpusdir/bbcarabic/ -type f -name '*.tar.br' | tr '\n' ' '` +aljazeerafiles=`find $corpusdir/aljazeera/ -type f -name '*.tar.br' | tr '\n' ' '` +unfiles=`find $corpusdir/un-v1.0-tei/ -type f -name '*.tar.br' | tr '\n' ' '` +opensubtitlesfiles=`find $corpusdir/opensubtitles-2018/ -type f -name '*.tar.br' | tr '\n' ' '` +hindawifiles=`find $corpusdir/hindawi/ -type f -name '*.epub' | tr '\n' ' '` + # targets for every layout for l in $layouts; do cat <