summaryrefslogtreecommitdiff
path: root/gen.sh
blob: de1fed04c7114cb0607b5e9f8fb798ef78fec7d2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/bin/sh
# Generate build.ninja that builds the docs/stats/…

# Layouts for which the per-layout targets are emitted.
layouts='ar-lulua ar-asmo663 ar-linux ar-malas ar-phonetic ar-osman ar-khorshid'
# Layouts that additionally get an xmodmap rendering target.
layoutsXmodmap='ar-lulua'
# Names of the entries found in ./corpus, as listed by ls (one per line).
corpora=$(ls corpus)

# Static part of the generated ninja file: settings, pools, rules and the
# layout-independent build targets. The here-doc delimiter is quoted, so the
# shell performs no expansion and every $name below is printed literally.
cat <<'EOF'
### auto-generated by gen.sh. Do not edit. ###

### settings ###
corpusdir=corpus
statsdir=stats
docdir=doc
wikiextractor=3rdparty/wikiextractor/WikiExtractor.py
fontdir=3rdparty/plex/IBM-Plex-Arabic/fonts/complete/woff2/
optrounds=100000
# pin layers, keep hand-optimized numbers, keep top row free
optpins=0;1;2;0,B*;3,*
optmodel=mod01

### pools ###
# lulua-write uses internal parallelization and should not be run more than
# once concurrently. It also uses alot of memory, so…
pool write
    depth = 1

### rules ###
rule opt
    command = lulua-optimize -n $optrounds -r -p $optpins -l ar-lulua -m $optmodel < $in > $out

rule render-svg
    command = lulua-render -l $layout svg $out

rule render-svg-heat
    command = lulua-render -l $layout svg --heatmap=$in $out

rule render-xmodmap
    command = lulua-render -l $layout xmodmap $out

rule analyze-heat
    command = lulua-analyze -l $layout keyheatmap < $in > $out

rule write-bbcarabic
    command = find $in -type f | lulua-write bbcarabic $layout | lulua-analyze combine > $out
    pool = write

rule write-aljazeera
    command = find $in -type f | lulua-write aljazeera $layout | lulua-analyze combine > $out
    pool = write

rule write-epub
    command = find $in -type f | lulua-write epub $layout | lulua-analyze combine > $out
    pool = write

rule write-tanzil
    command = echo $in | lulua-write text $layout | lulua-analyze combine > $out
    pool = write

rule write-tei2
    command = find $in -type f -name '*.xml' | lulua-write tei2 $layout | lulua-analyze combine > $out
    pool = write

rule write-opensubtitles
    command = find $in -type f -name '*.xml' | lulua-write opensubtitles $layout | lulua-analyze combine > $out
    pool = write

rule write-arwiki
    command = $wikiextractor -ns 0 --json -o - $in 2>/dev/null | jq .text | lulua-write json $layout | lulua-analyze combine > $out
    pool = write

rule combine
    command = cat $in | lulua-analyze combine > $out

rule mkdir
    command = mkdir -p $out

rule letterfreq
    command = lulua-analyze -l ar-lulua letterfreq < $in > $out

rule analyze-fingerhand
    command = lulua-analyze -l $layout fingerhand < $in > $out

rule analyze-corpusstats
    command = lulua-analyze -l ar-lulua corpusstats $metadata < $stats > $out

rule analyze-corpushtml
    command = cat $in | lulua-analyze -l ar-lulua corpushtml > $out

rule wordlist
    command = lulua-analyze -l ar-lulua latinime < $in > $out

rule cpp
    command = gcc -E -x c -nostdinc -MMD -MF $out.d -C -P -I $docdir/_temp $in -o $out
    depfile = $out.d
    deps = gcc

rule cp
    command = cp $in $out

rule gz
    command = gzip -c $in > $out

### build targets ###
build $docdir/_build: mkdir
build $docdir/_build/fonts: mkdir
build $docdir/_temp: mkdir
build $docdir/_build/index.html: cpp $docdir/index.html || $docdir/_build
build $docdir/_build/letterfreq.json: letterfreq $statsdir/ar-lulua/all.pickle || $docdir/_build
build $docdir/_build/style.css: cp $docdir/style.css || $docdir/_build
build $docdir/_build/lulua-logo.svg: cp $docdir/lulua-logo.svg || $docdir/_build
# wordlist
build $docdir/_temp/lulua.combined: wordlist $statsdir/ar-lulua/all.pickle || $docdir/_temp
build $docdir/_build/lulua.combined.gz: gz $docdir/_temp/lulua.combined || $docdir/_build


build $docdir/_build/fonts/IBMPlexArabic-Regular.woff2: cp $fontdir/IBMPlexArabic-Regular.woff2 || $docdir/_build/fonts
build $docdir/_build/fonts/IBMPlexArabic-Thin.woff2: cp $fontdir/IBMPlexArabic-Thin.woff2 || $docdir/_build/fonts
EOF

# Per-layout targets: the stats directory, one pickle per corpus source, the
# combined all.pickle, and the renderings/analyses derived from it.
for l in $layouts; do
    # Path prefixes as they must appear in the output; the escaped \$ keeps
    # the variable references literal instead of expanding them here.
    sdir="\$statsdir/${l}"
    cdir="\$corpusdir"
    bdir="\$docdir/_build"
    tdir="\$docdir/_temp"
    cat <<EOF
build $sdir: mkdir

build $sdir/bbcarabic.pickle: write-bbcarabic $cdir/bbcarabic/raw || $sdir
    layout = ${l}

build $sdir/aljazeera.pickle: write-aljazeera $cdir/aljazeera/raw || $sdir
    layout = ${l}

build $sdir/hindawi.pickle: write-epub $cdir/hindawi/raw || $sdir
    layout = ${l}

build $sdir/tanzil-quaran.pickle: write-tanzil $cdir/tanzil-quaran/plain.txt.lz || $sdir
    layout = ${l}

build $sdir/arwiki.pickle: write-arwiki $cdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || $sdir
    layout = ${l}

build $sdir/un-v1.0-tei.pickle: write-tei2 $cdir/un-v1.0-tei || $sdir
    layout = ${l}

build $sdir/opensubtitles-2018.pickle: write-opensubtitles $cdir/opensubtitles-2018 || $sdir
    layout = ${l}

build $sdir/all.pickle: combine $sdir/bbcarabic.pickle $sdir/aljazeera.pickle $sdir/tanzil-quaran.pickle $sdir/arwiki.pickle $sdir/hindawi.pickle $sdir/un-v1.0-tei.pickle $sdir/opensubtitles-2018.pickle || $sdir

build $bdir/${l}.svg: render-svg || $bdir
    layout = ${l}

build $tdir/${l}-heat.yaml: analyze-heat $sdir/all.pickle || $tdir
    layout = ${l}

build $bdir/${l}-heat.svg: render-svg-heat $tdir/${l}-heat.yaml || $bdir
    layout = ${l}

build $tdir/${l}-fingerhand.html: analyze-fingerhand $sdir/all.pickle || $tdir
    layout = ${l}

EOF
done

# Every layout listed in layoutsXmodmap gets an extra xmodmap rendering target.
# The format string is single-quoted, so $docdir is printed literally; only the
# two %s placeholders are filled with the layout name.
for l in $layoutsXmodmap; do
    printf 'build $docdir/_build/%s.xmodmap: render-xmodmap || $docdir/_build\n    layout = %s\n\n' "$l" "$l"
done

# statistics for each corpus (ar-lulua) and html rendering
#
# emit_corpus_targets CORPUS...
#   Prints one analyze-corpusstats target per CORPUS name, followed by the
#   corpus.html target that takes all resulting metadata files as inputs.
#   Side effect: leaves the accumulated list of metadata targets in the
#   global variable outfiles (each entry preceded by a single space).
emit_corpus_targets() {
    outfiles=""
    for c in "$@"; do
        cat <<EOF
build \$docdir/_temp/metadata-$c.yaml: analyze-corpusstats \$statsdir/ar-lulua/$c.pickle \$corpusdir/$c/metadata.yaml || \$docdir/_temp \$corpusdir/$c/metadata.yaml
    metadata = \$corpusdir/$c/metadata.yaml
    stats = \$statsdir/ar-lulua/$c.pickle
EOF
        # Plain reassignment rather than `+=`: this script declares #!/bin/sh,
        # and `+=` is a bash/ksh extension. A strict POSIX shell treats
        # `outfiles+=...` as a command name, leaving the list empty.
        outfiles="$outfiles \$docdir/_temp/metadata-$c.yaml"
    done

    cat <<EOF
build \$docdir/_temp/corpus.html: analyze-corpushtml $outfiles || \$docdir/_temp
EOF
}

# Deliberately unquoted: corpora is a whitespace-separated list of names.
emit_corpus_targets $corpora