summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2019-11-16 13:40:39 +0100
committer Lars-Dominik Braun <lars@6xq.net> 2019-11-16 21:06:57 +0100
commit 41f342e12b975e785de9d755d38eb92cf38f5ec5 (patch)
tree a0b9f6716fc94d005350d66091bcf42a0c2f87f4
parent c543f42fe50d9b10175059ee5ed9186f30445f40 (diff)
download lulua-41f342e12b975e785de9d755d38eb92cf38f5ec5.tar.gz
lulua-41f342e12b975e785de9d755d38eb92cf38f5ec5.tar.bz2
lulua-41f342e12b975e785de9d755d38eb92cf38f5ec5.zip
Add OpenStreetMap label corpus
Extract node labels (name:ar) from OpenStreetMap’s planet dump. The result leans heavily towards a few common words (“street”, obviously), but we should be fine since the corpus is not that large.
-rw-r--r-- .gitmodules 3
m--------- 3rdparty/osmctools 0
-rw-r--r-- corpus/osm/metadata.yaml 5
-rwxr-xr-x gen.sh 15
-rw-r--r-- lulua/text.py 5
5 files changed, 27 insertions, 1 deletion
diff --git a/.gitmodules b/.gitmodules
index 7a73284..b4e9f51 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,6 @@
[submodule "3rdparty/plex"]
path = 3rdparty/plex
url = https://github.com/IBM/plex.git
+[submodule "3rdparty/osmctools"]
+ path = 3rdparty/osmctools
+ url = https://gitlab.com/osm-c-tools/osmctools.git
diff --git a/3rdparty/osmctools b/3rdparty/osmctools
new file mode 160000
+Subproject f341f5f237737594c1b024338f0a2fc04fabdff
diff --git a/corpus/osm/metadata.yaml b/corpus/osm/metadata.yaml
new file mode 100644
index 0000000..e3aff73
--- /dev/null
+++ b/corpus/osm/metadata.yaml
@@ -0,0 +1,5 @@
+source:
+ name: OpenStreetMap Arabic Labels
+ url: https://planet.openstreetmap.org/
+date: 2019-11-04
+count: [376148, labels]
diff --git a/gen.sh b/gen.sh
index 589035a..17d4214 100755
--- a/gen.sh
+++ b/gen.sh
@@ -13,6 +13,7 @@ corpusdir=corpus
statsdir=stats
docdir=doc
wikiextractor=3rdparty/wikiextractor/WikiExtractor.py
+osmconvert=3rdparty/osmctools/src/osmconvert
fontdir=3rdparty/plex/IBM-Plex-Arabic/fonts/complete/woff2/
optrounds=100000
# pin layers, keep hand-optimized numbers, keep top row free
@@ -69,6 +70,10 @@ rule write-arwiki
command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out
pool = write
+rule write-osm
+ command = \$osmconvert --csv='name:ar' \$in | sort -u | lulua-write lines \$layout | lulua-analyze combine > \$out
+ pool = write
+
rule combine
command = cat \$in | lulua-analyze combine > \$out
@@ -99,6 +104,9 @@ rule cp
rule gz
command = gzip -c \$in > \$out
+rule configure-make
+ command = cd \$in && autoreconf --install && ./configure && make
+
### build targets ###
build \$docdir/_build: mkdir
build \$docdir/_build/fonts: mkdir
@@ -114,6 +122,8 @@ build \$docdir/_build/lulua.combined.gz: gz \$docdir/_temp/lulua.combined || \$d
build \$docdir/_build/fonts/IBMPlexArabic-Regular.woff2: cp \$fontdir/IBMPlexArabic-Regular.woff2 || \$docdir/_build/fonts
build \$docdir/_build/fonts/IBMPlexArabic-Thin.woff2: cp \$fontdir/IBMPlexArabic-Thin.woff2 || \$docdir/_build/fonts
+# build osmconvert
+build \$osmconvert: configure-make 3rdparty/osmctools
EOF
# targets for every layout
@@ -136,13 +146,16 @@ build \$statsdir/${l}/tanzil-quaran.pickle: write-tanzil \$corpusdir/tanzil-quar
build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l}
layout = ${l}
+build \$statsdir/${l}/osm.pickle: write-osm \$corpusdir/osm/planet-191104.osm.pbf || \$statsdir/${l} \$osmconvert
+ layout = ${l}
+
build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei/raw || \$statsdir/${l}
layout = ${l}
build \$statsdir/${l}/opensubtitles-2018.pickle: write-opensubtitles \$corpusdir/opensubtitles-2018/raw || \$statsdir/${l}
layout = ${l}
-build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil-quaran.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l}
+build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil-quaran.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/osm.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l}
build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build
layout = ${l}
diff --git a/lulua/text.py b/lulua/text.py
index 382877b..2d8398d 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -159,6 +159,10 @@ def sourceText (item):
with LzipFile (item.rstrip ()) as fd:
yield fd.read ().decode ('utf-8')
+def sourceLines (item):
+ """ Read items (i.e. lines) as is """
+ yield item
+
def sourceJson (item):
yield json.loads (item)
@@ -210,6 +214,7 @@ sources = dict(
epub=sourceEpub,
tei2=sourceTEI2,
opensubtitles=sourceOpenSubtitles,
+ lines=sourceLines,
)
charMap = {