diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2019-11-16 13:40:39 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2019-11-16 21:06:57 +0100 |
commit | 41f342e12b975e785de9d755d38eb92cf38f5ec5 (patch) | |
tree | a0b9f6716fc94d005350d66091bcf42a0c2f87f4 /lulua | |
parent | c543f42fe50d9b10175059ee5ed9186f30445f40 (diff) | |
download | lulua-41f342e12b975e785de9d755d38eb92cf38f5ec5.tar.gz lulua-41f342e12b975e785de9d755d38eb92cf38f5ec5.tar.bz2 lulua-41f342e12b975e785de9d755d38eb92cf38f5ec5.zip |
Add OpenStreetMap label corpus
Extract node labels (name:ar) from OpenStreetMap’s planet dump. The result
leans heavily towards a few common words (“street”, obviously), but we should
be fine since the corpus is not that large.
Diffstat (limited to 'lulua')
-rw-r--r-- | lulua/text.py | 5 |
1 files changed, 5 insertions, 0 deletions
```diff
diff --git a/lulua/text.py b/lulua/text.py
index 382877b..2d8398d 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -159,6 +159,10 @@ def sourceText (item):
     with LzipFile (item.rstrip ()) as fd:
         yield fd.read ().decode ('utf-8')
 
+def sourceLines (item):
+    """ Read items (i.e. lines) as is """
+    yield item
+
 def sourceJson (item):
     yield json.loads (item)
 
@@ -210,6 +214,7 @@ sources = dict(
     epub=sourceEpub,
     tei2=sourceTEI2,
     opensubtitles=sourceOpenSubtitles,
+    lines=sourceLines,
     )
 
 charMap = {
```