From 41f342e12b975e785de9d755d38eb92cf38f5ec5 Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Sat, 16 Nov 2019 13:40:39 +0100
Subject: Add OpenStreetMap label corpus
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extract node labels (name:ar) from OpenStreetMap’s planet dump. Heavily
leans towards a few common words (“street”, obviously), but we should be
fine since the corpus is not that large.
---
 lulua/text.py | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'lulua')

diff --git a/lulua/text.py b/lulua/text.py
index 382877b..2d8398d 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -159,6 +159,10 @@ def sourceText (item):
     with LzipFile (item.rstrip ()) as fd:
         yield fd.read ().decode ('utf-8')
 
+def sourceLines (item):
+    """ Read items (i.e. lines) as is """
+    yield item
+
 def sourceJson (item):
     yield json.loads (item)
 
@@ -210,6 +214,7 @@ sources = dict(
     epub=sourceEpub,
     tei2=sourceTEI2,
     opensubtitles=sourceOpenSubtitles,
+    lines=sourceLines,
     )
 
 charMap = {
-- 
cgit v1.2.3