blob: 17d42140768536573284434dd4ce42a08f888283 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
|
#!/bin/sh
# Generate build.ninja that builds the docs/stats/…
#
# The ninja file is written to stdout. Corpora are discovered relative to the
# current directory, so run this from the directory that contains corpus/.

# Keyboard layouts for which per-layout build targets are generated.
layouts="ar-lulua ar-asmo663 ar-linux ar-malas ar-phonetic ar-osman ar-khorshid"
# Layouts for which an additional xmodmap target is generated.
layoutsXmodmap="ar-lulua"

# Print the names of all entries of directory $1 on a single line, separated
# by spaces. Prints an empty line if the directory is missing or empty.
# A glob is used instead of parsing the output of ls.
list_corpora() {
  corpusnames=""
  for corpuspath in "$1"/*; do
    # An unmatched glob stays literal; skip it so the result is empty then.
    [ -e "$corpuspath" ] || continue
    corpusnames="${corpusnames:+$corpusnames }${corpuspath##*/}"
  done
  printf '%s\n' "$corpusnames"
}

# One corpus per entry of ./corpus.
corpora="$(list_corpora corpus)"
cat <<EOF
### auto-generated by gen.sh. Do not edit. ###
### settings ###
corpusdir=corpus
statsdir=stats
docdir=doc
wikiextractor=3rdparty/wikiextractor/WikiExtractor.py
osmconvert=3rdparty/osmctools/src/osmconvert
fontdir=3rdparty/plex/IBM-Plex-Arabic/fonts/complete/woff2/
optrounds=100000
# pin layers, keep hand-optimized numbers, keep top row free
optpins=0;1;2;0,B*;3,*
optmodel=mod01
### pools ###
# lulua-write uses internal parallelization and should not be run more than
# once concurrently. It also uses alot of memory, so…
pool write
depth = 1
### rules ###
rule opt
command = lulua-optimize -n \$optrounds -r -p \$optpins -l ar-lulua -m \$optmodel < \$in > \$out
rule render-svg
command = lulua-render -l \$layout svg \$out
rule render-svg-heat
command = lulua-render -l \$layout svg --heatmap=\$in \$out
rule render-xmodmap
command = lulua-render -l \$layout xmodmap \$out
rule analyze-heat
command = lulua-analyze -l \$layout keyheatmap < \$in > \$out
rule write-bbcarabic
command = find \$in -type f | lulua-write bbcarabic \$layout | lulua-analyze combine > \$out
pool = write
rule write-aljazeera
command = find \$in -type f | lulua-write aljazeera \$layout | lulua-analyze combine > \$out
pool = write
rule write-epub
command = find \$in -type f | lulua-write epub \$layout | lulua-analyze combine > \$out
pool = write
rule write-tanzil
command = echo \$in | lulua-write text \$layout | lulua-analyze combine > \$out
pool = write
rule write-tei2
command = find \$in -type f -name '*.xml' | lulua-write tei2 \$layout | lulua-analyze combine > \$out
pool = write
rule write-opensubtitles
command = find \$in -type f -name '*.xml' | lulua-write opensubtitles \$layout | lulua-analyze combine > \$out
pool = write
rule write-arwiki
command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out
pool = write
rule write-osm
command = \$osmconvert --csv='name:ar' \$in | sort -u | lulua-write lines \$layout | lulua-analyze combine > \$out
pool = write
rule combine
command = cat \$in | lulua-analyze combine > \$out
rule mkdir
command = mkdir -p \$out
rule letterfreq
command = lulua-analyze -l ar-lulua letterfreq < \$in > \$out
rule analyze-fingerhand
command = lulua-analyze -l \$layout fingerhand < \$in > \$out
rule analyze-corpusstats
command = lulua-analyze -l ar-lulua corpusstats \$metadata < \$stats > \$out
rule analyze-corpushtml
command = cat \$in | lulua-analyze -l ar-lulua corpushtml > \$out
rule wordlist
command = lulua-analyze -l ar-lulua latinime < \$in > \$out
rule html
command = m4 -I \$docdir/_temp \$template > \$out
rule cp
command = cp \$in \$out
rule gz
command = gzip -c \$in > \$out
rule configure-make
command = cd \$in && autoreconf --install && ./configure && make
### build targets ###
build \$docdir/_build: mkdir
build \$docdir/_build/fonts: mkdir
build \$docdir/_temp: mkdir
build \$docdir/_build/letterfreq.json: letterfreq \$statsdir/ar-lulua/all.pickle || \$docdir/_build
build \$docdir/_build/style.css: cp \$docdir/style.css || \$docdir/_build
build \$docdir/_build/lulua-logo.svg: cp \$docdir/lulua-logo.svg || \$docdir/_build
# wordlist
build \$docdir/_temp/lulua.combined: wordlist \$statsdir/ar-lulua/all.pickle || \$docdir/_temp
build \$docdir/_build/lulua.combined.gz: gz \$docdir/_temp/lulua.combined || \$docdir/_build
build \$docdir/_build/fonts/IBMPlexArabic-Regular.woff2: cp \$fontdir/IBMPlexArabic-Regular.woff2 || \$docdir/_build/fonts
build \$docdir/_build/fonts/IBMPlexArabic-Thin.woff2: cp \$fontdir/IBMPlexArabic-Thin.woff2 || \$docdir/_build/fonts
# build osmconvert
build \$osmconvert: configure-make 3rdparty/osmctools
EOF
# targets for every layout
#
# Print the build statements for each layout name given as an argument: the
# stats directory, one pickle per corpus, the combined all.pickle, the plain
# and heatmap SVG renderings and the fingerhand HTML fragment.
#
# Side effect: appends every layout's fingerhand HTML path to the global
# variable fingerhandfiles. The POSIX form var="$var …" is used because the
# script runs under /bin/sh, where the += operator is not available.
#
# The "layout = …" bindings are indented with spaces; ninja only attaches a
# binding to the preceding build statement if it is indented.
emit_layout_targets() {
  for l in "$@"; do
    cat <<EOF
build \$statsdir/${l}: mkdir
build \$statsdir/${l}/bbcarabic.pickle: write-bbcarabic \$corpusdir/bbcarabic/raw || \$statsdir/${l}
  layout = ${l}
build \$statsdir/${l}/aljazeera.pickle: write-aljazeera \$corpusdir/aljazeera/raw || \$statsdir/${l}
  layout = ${l}
build \$statsdir/${l}/hindawi.pickle: write-epub \$corpusdir/hindawi/raw || \$statsdir/${l}
  layout = ${l}
build \$statsdir/${l}/tanzil-quaran.pickle: write-tanzil \$corpusdir/tanzil-quaran/plain.txt.lz || \$statsdir/${l}
  layout = ${l}
build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l}
  layout = ${l}
build \$statsdir/${l}/osm.pickle: write-osm \$corpusdir/osm/planet-191104.osm.pbf || \$statsdir/${l} \$osmconvert
  layout = ${l}
build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei/raw || \$statsdir/${l}
  layout = ${l}
build \$statsdir/${l}/opensubtitles-2018.pickle: write-opensubtitles \$corpusdir/opensubtitles-2018/raw || \$statsdir/${l}
  layout = ${l}
build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil-quaran.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/osm.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l}
build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build
  layout = ${l}
build \$docdir/_temp/${l}-heat.yaml: analyze-heat \$statsdir/${l}/all.pickle || \$docdir/_temp
  layout = ${l}
build \$docdir/_build/${l}-heat.svg: render-svg-heat \$docdir/_temp/${l}-heat.yaml || \$docdir/_build
  layout = ${l}
build \$docdir/_temp/${l}-fingerhand.html: analyze-fingerhand \$statsdir/${l}/all.pickle || \$docdir/_temp
  layout = ${l}
EOF
    # included by index.html and thus must be its dependencies
    fingerhandfiles="${fingerhandfiles} \$docdir/_temp/${l}-fingerhand.html"
  done
}
# Start from an empty list so a value inherited from the environment cannot
# leak into the generated file.
fingerhandfiles=""
# $layouts is left unquoted on purpose: it is a space-separated list.
emit_layout_targets $layouts
# layouts with xmodmap support
#
# Print one render-xmodmap build statement for each layout name given as an
# argument. The "layout = …" binding is indented with spaces; ninja only
# attaches a binding to the preceding build statement if it is indented.
emit_xmodmap_targets() {
  for l in "$@"; do
    cat <<EOF
build \$docdir/_build/${l}.xmodmap: render-xmodmap || \$docdir/_build
  layout = ${l}
EOF
  done
}
# $layoutsXmodmap is left unquoted on purpose: it is a space-separated list.
emit_xmodmap_targets $layoutsXmodmap
# statistics for each corpus (ar-lulua) and html rendering
#
# For each corpus name given as an argument, print an analyze-corpusstats
# build statement producing metadata-<corpus>.yaml. Afterwards print the
# analyze-corpushtml statement for corpus.html, which takes all of those
# yaml files as inputs.
#
# The list of yaml files is collected in the global variable outfiles using
# the POSIX form var="$var …", because the script runs under /bin/sh, where
# the += operator is not available. The "metadata = …" and "stats = …"
# bindings are indented with spaces; ninja only attaches a binding to the
# preceding build statement if it is indented.
emit_corpus_targets() {
  outfiles=""
  for c in "$@"; do
    cat <<EOF
build \$docdir/_temp/metadata-${c}.yaml: analyze-corpusstats \$statsdir/ar-lulua/${c}.pickle \$corpusdir/${c}/metadata.yaml || \$docdir/_temp \$corpusdir/${c}/metadata.yaml
  metadata = \$corpusdir/${c}/metadata.yaml
  stats = \$statsdir/ar-lulua/${c}.pickle
EOF
    outfiles="${outfiles} \$docdir/_temp/metadata-${c}.yaml"
  done
  cat <<EOF
build \$docdir/_temp/corpus.html: analyze-corpushtml $outfiles || \$docdir/_temp
EOF
}
# $corpora is left unquoted on purpose: it is a whitespace-separated list.
emit_corpus_targets $corpora
# html, which depends on several other files generated above
#
# Print the build statement for index.html. Its inputs are the template,
# corpus.html and whatever paths the global variable fingerhandfiles holds at
# the time of the call. The "template = …" binding is indented with spaces;
# ninja only attaches a binding to the preceding build statement if it is
# indented.
emit_index_target() {
  cat <<EOF
build \$docdir/_build/index.html: html \$docdir/index.html \$docdir/_temp/corpus.html $fingerhandfiles || \$docdir/_build
  template = \$docdir/index.html
EOF
}
emit_index_target
|