summary | refs | log | tree | commit | diff
path: root/lulua
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net>, 2019-11-16 13:40:39 +0100
committer: Lars-Dominik Braun <lars@6xq.net>, 2019-11-16 21:06:57 +0100
commit: 41f342e12b975e785de9d755d38eb92cf38f5ec5 (patch)
tree: a0b9f6716fc94d005350d66091bcf42a0c2f87f4 /lulua
parent: c543f42fe50d9b10175059ee5ed9186f30445f40 (diff)
download: lulua-41f342e12b975e785de9d755d38eb92cf38f5ec5.tar.gz
lulua-41f342e12b975e785de9d755d38eb92cf38f5ec5.tar.bz2
lulua-41f342e12b975e785de9d755d38eb92cf38f5ec5.zip
Add OpenStreetMap label corpus
Extract node labels (name:ar) from OpenStreetMap’s planet dump. The resulting corpus leans heavily towards a few common words (“street”, obviously), but we should be fine since the corpus is not that large.
Diffstat (limited to 'lulua')
-rw-r--r-- lulua/text.py 5
1 files changed, 5 insertions, 0 deletions
diff --git a/lulua/text.py b/lulua/text.py
index 382877b..2d8398d 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -159,6 +159,10 @@ def sourceText (item):
with LzipFile (item.rstrip ()) as fd:
yield fd.read ().decode ('utf-8')
+def sourceLines (item):
+ """ Read items (i.e. lines) as is """
+ yield item
+
def sourceJson (item):
yield json.loads (item)
@@ -210,6 +214,7 @@ sources = dict(
epub=sourceEpub,
tei2=sourceTEI2,
opensubtitles=sourceOpenSubtitles,
+ lines=sourceLines,
)
charMap = {