summary | refs | log | tree | commit | diff
path: root/lulua
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2019-11-08 16:06:37 +0100
committer Lars-Dominik Braun <lars@6xq.net> 2019-11-08 21:34:15 +0100
commit 38c9ed5b042ae488ee12287bf8c19457189889aa (patch)
tree d4f49039eec711aa7c9ee21c691f46bc89316e48 /lulua
parent e31d8731531b41a909bfe33ddc134de07f0a7bab (diff)
download lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.tar.gz
lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.tar.bz2
lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.zip
Add OpenSubtitles corpus
See issue #5.
Diffstat (limited to 'lulua')
-rw-r--r-- lulua/text.py 18
1 files changed, 18 insertions, 0 deletions
diff --git a/lulua/text.py b/lulua/text.py
index 182c717..382877b 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -185,6 +185,23 @@ def sourceTEI2 (item):
except Exception:
logging.error (f'invalid xml document {item}')
+def sourceOpenSubtitles (item):
+ """
+ XML-based format used by the (raw!) OpenSubtitles dump found here:
+ http://opus.nlpl.eu/OpenSubtitles-v2018.php
+ """
+ with open (item.rstrip (), 'rb') as fd:
+ try:
+ out = []
+ doc = xml.dom.minidom.parse (fd)
+ for s in doc.getElementsByTagName ('s'):
+ # strip newlines, which are mostly unintentional due to
+ # pretty-printed xml structure
+ out.append (getText (s.childNodes).strip ())
+ yield '\n'.join (out)
+ except Exception as e:
+ logging.error (f'invalid xml document {item} {e}')
+
sources = dict(
aljazeera=partial(sourceHtml, f['aljazeera']),
bbcarabic=partial(sourceHtml, f['bbcarabic']),
@@ -192,6 +209,7 @@ sources = dict(
json=sourceJson,
epub=sourceEpub,
tei2=sourceTEI2,
+ opensubtitles=sourceOpenSubtitles,
)
charMap = {