diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2019-11-16 13:40:39 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2019-11-16 21:06:57 +0100 |
commit | 41f342e12b975e785de9d755d38eb92cf38f5ec5 (patch) | |
tree | a0b9f6716fc94d005350d66091bcf42a0c2f87f4 | |
parent | c543f42fe50d9b10175059ee5ed9186f30445f40 (diff) | |
download | lulua-41f342e12b975e785de9d755d38eb92cf38f5ec5.tar.gz lulua-41f342e12b975e785de9d755d38eb92cf38f5ec5.tar.bz2 lulua-41f342e12b975e785de9d755d38eb92cf38f5ec5.zip |
Add OpenStreetMap label corpus
Extract node labels (name:ar) from OpenStreetMap’s planet dump. The result
leans heavily towards a few common words (“street”, obviously), but we should
be fine since this corpus is not that large.
-rw-r--r-- | .gitmodules | 3 | ||||
m--------- | 3rdparty/osmctools | 0 | ||||
-rw-r--r-- | corpus/osm/metadata.yaml | 5 | ||||
-rwxr-xr-x | gen.sh | 15 | ||||
-rw-r--r-- | lulua/text.py | 5 |
5 files changed, 27 insertions, 1 deletions
diff --git a/.gitmodules b/.gitmodules index 7a73284..b4e9f51 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "3rdparty/plex"] path = 3rdparty/plex url = https://github.com/IBM/plex.git +[submodule "3rdparty/osmctools"] + path = 3rdparty/osmctools + url = https://gitlab.com/osm-c-tools/osmctools.git diff --git a/3rdparty/osmctools b/3rdparty/osmctools new file mode 160000 +Subproject f341f5f237737594c1b024338f0a2fc04fabdff diff --git a/corpus/osm/metadata.yaml b/corpus/osm/metadata.yaml new file mode 100644 index 0000000..e3aff73 --- /dev/null +++ b/corpus/osm/metadata.yaml @@ -0,0 +1,5 @@ +source: + name: OpenStreetMap Arabic Labels + url: https://planet.openstreetmap.org/ +date: 2019-11-04 +count: [376148, labels] @@ -13,6 +13,7 @@ corpusdir=corpus statsdir=stats docdir=doc wikiextractor=3rdparty/wikiextractor/WikiExtractor.py +osmconvert=3rdparty/osmctools/src/osmconvert fontdir=3rdparty/plex/IBM-Plex-Arabic/fonts/complete/woff2/ optrounds=100000 # pin layers, keep hand-optimized numbers, keep top row free @@ -69,6 +70,10 @@ rule write-arwiki command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out pool = write +rule write-osm + command = \$osmconvert --csv='name:ar' \$in | sort -u | lulua-write lines \$layout | lulua-analyze combine > \$out + pool = write + rule combine command = cat \$in | lulua-analyze combine > \$out @@ -99,6 +104,9 @@ rule cp rule gz command = gzip -c \$in > \$out +rule configure-make + command = cd \$in && autoreconf --install && ./configure && make + ### build targets ### build \$docdir/_build: mkdir build \$docdir/_build/fonts: mkdir @@ -114,6 +122,8 @@ build \$docdir/_build/lulua.combined.gz: gz \$docdir/_temp/lulua.combined || \$d build \$docdir/_build/fonts/IBMPlexArabic-Regular.woff2: cp \$fontdir/IBMPlexArabic-Regular.woff2 || \$docdir/_build/fonts build \$docdir/_build/fonts/IBMPlexArabic-Thin.woff2: cp 
\$fontdir/IBMPlexArabic-Thin.woff2 || \$docdir/_build/fonts +# build osmconvert +build \$osmconvert: configure-make 3rdparty/osmctools EOF # targets for every layout @@ -136,13 +146,16 @@ build \$statsdir/${l}/tanzil-quaran.pickle: write-tanzil \$corpusdir/tanzil-quar build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l} layout = ${l} +build \$statsdir/${l}/osm.pickle: write-osm \$corpusdir/osm/planet-191104.osm.pbf || \$statsdir/${l} \$osmconvert + layout = ${l} + build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei/raw || \$statsdir/${l} layout = ${l} build \$statsdir/${l}/opensubtitles-2018.pickle: write-opensubtitles \$corpusdir/opensubtitles-2018/raw || \$statsdir/${l} layout = ${l} -build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil-quaran.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l} +build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil-quaran.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/osm.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l} build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build layout = ${l} diff --git a/lulua/text.py b/lulua/text.py index 382877b..2d8398d 100644 --- a/lulua/text.py +++ b/lulua/text.py @@ -159,6 +159,10 @@ def sourceText (item): with LzipFile (item.rstrip ()) as fd: yield fd.read ().decode ('utf-8') +def sourceLines (item): + """ Read items (i.e. lines) as is """ + yield item + def sourceJson (item): yield json.loads (item) @@ -210,6 +214,7 @@ sources = dict( epub=sourceEpub, tei2=sourceTEI2, opensubtitles=sourceOpenSubtitles, + lines=sourceLines, ) charMap = { |