From 41f342e12b975e785de9d755d38eb92cf38f5ec5 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 16 Nov 2019 13:40:39 +0100 Subject: Add OpenStreetMap label corpus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract node labels (name:ar) from OpenStreetMap’s planet dump. Heavily leans towards a few common words (“street”, obviously), but we should be fine since the corpus is not that large. --- .gitmodules | 3 +++ 3rdparty/osmctools | 1 + corpus/osm/metadata.yaml | 5 +++++ gen.sh | 15 ++++++++++++++- lulua/text.py | 5 +++++ 5 files changed, 28 insertions(+), 1 deletion(-) create mode 160000 3rdparty/osmctools create mode 100644 corpus/osm/metadata.yaml diff --git a/.gitmodules b/.gitmodules index 7a73284..b4e9f51 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "3rdparty/plex"] path = 3rdparty/plex url = https://github.com/IBM/plex.git +[submodule "3rdparty/osmctools"] + path = 3rdparty/osmctools + url = https://gitlab.com/osm-c-tools/osmctools.git diff --git a/3rdparty/osmctools b/3rdparty/osmctools new file mode 160000 index 0000000..f341f5f --- /dev/null +++ b/3rdparty/osmctools @@ -0,0 +1 @@ +Subproject commit f341f5f237737594c1b024338f0a2fc04fabdff3 diff --git a/corpus/osm/metadata.yaml b/corpus/osm/metadata.yaml new file mode 100644 index 0000000..e3aff73 --- /dev/null +++ b/corpus/osm/metadata.yaml @@ -0,0 +1,5 @@ +source: + name: OpenStreetMap Arabic Labels + url: https://planet.openstreetmap.org/ +date: 2019-11-04 +count: [376148, labels] diff --git a/gen.sh b/gen.sh index 589035a..17d4214 100755 --- a/gen.sh +++ b/gen.sh @@ -13,6 +13,7 @@ corpusdir=corpus statsdir=stats docdir=doc wikiextractor=3rdparty/wikiextractor/WikiExtractor.py +osmconvert=3rdparty/osmctools/src/osmconvert fontdir=3rdparty/plex/IBM-Plex-Arabic/fonts/complete/woff2/ optrounds=100000 # pin layers, keep hand-optimized numbers, keep top row free @@ -69,6 +70,10 @@ rule write-arwiki command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out pool = write +rule write-osm + command = \$osmconvert --csv='name:ar' \$in | sort -u | lulua-write lines \$layout | lulua-analyze combine > \$out + pool = write + rule combine command = cat \$in | lulua-analyze combine > \$out @@ -99,6 +104,9 @@ rule cp rule gz command = gzip -c \$in > \$out +rule configure-make + command = cd \$in && autoreconf --install && ./configure && make + ### build targets ### build \$docdir/_build: mkdir build \$docdir/_build/fonts: mkdir @@ -114,6 +122,8 @@ build \$docdir/_build/lulua.combined.gz: gz \$docdir/_temp/lulua.combined || \$d build \$docdir/_build/fonts/IBMPlexArabic-Regular.woff2: cp \$fontdir/IBMPlexArabic-Regular.woff2 || \$docdir/_build/fonts build \$docdir/_build/fonts/IBMPlexArabic-Thin.woff2: cp \$fontdir/IBMPlexArabic-Thin.woff2 || \$docdir/_build/fonts +# build osmconvert +build \$osmconvert: configure-make 3rdparty/osmctools EOF # targets for every layout @@ -136,13 +146,16 @@ build \$statsdir/${l}/tanzil-quaran.pickle: write-tanzil \$corpusdir/tanzil-quar build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l} layout = ${l} +build \$statsdir/${l}/osm.pickle: write-osm \$corpusdir/osm/planet-191104.osm.pbf || \$statsdir/${l} \$osmconvert + layout = ${l} + build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei/raw || \$statsdir/${l} layout = ${l} build \$statsdir/${l}/opensubtitles-2018.pickle: write-opensubtitles \$corpusdir/opensubtitles-2018/raw || \$statsdir/${l} layout = ${l} -build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil-quaran.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l} +build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil-quaran.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/osm.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l} build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build layout = ${l} diff --git a/lulua/text.py b/lulua/text.py index 382877b..2d8398d 100644 --- a/lulua/text.py +++ b/lulua/text.py @@ -159,6 +159,10 @@ def sourceText (item): with LzipFile (item.rstrip ()) as fd: yield fd.read ().decode ('utf-8') +def sourceLines (item): + """ Read items (i.e. lines) as is """ + yield item + def sourceJson (item): yield json.loads (item) @@ -210,6 +214,7 @@ sources = dict( epub=sourceEpub, tei2=sourceTEI2, opensubtitles=sourceOpenSubtitles, + lines=sourceLines, ) charMap = { -- cgit v1.2.3