diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2019-11-08 16:06:37 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2019-11-08 21:34:15 +0100 |
commit | 38c9ed5b042ae488ee12287bf8c19457189889aa (patch) | |
tree | d4f49039eec711aa7c9ee21c691f46bc89316e48 /lulua | |
parent | e31d8731531b41a909bfe33ddc134de07f0a7bab (diff) | |
download | lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.tar.gz lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.tar.bz2 lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.zip |
Add OpenSubtitles corpus
See issue #5.
Diffstat (limited to 'lulua')
-rw-r--r-- | lulua/text.py | 18 |
1 files changed, 18 insertions, 0 deletions
```diff
diff --git a/lulua/text.py b/lulua/text.py
index 182c717..382877b 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -185,6 +185,23 @@ def sourceTEI2 (item):
         except Exception:
             logging.error (f'invalid xml document {item}')
 
+def sourceOpenSubtitles (item):
+    """
+    XML-based format used by the (raw!) OpenSubtitles dump found here:
+    http://opus.nlpl.eu/OpenSubtitles-v2018.php
+    """
+    with open (item.rstrip (), 'rb') as fd:
+        try:
+            out = []
+            doc = xml.dom.minidom.parse (fd)
+            for s in doc.getElementsByTagName ('s'):
+                # strip newlines, which are mostly unintentional due to
+                # pretty-printed xml structure
+                out.append (getText (s.childNodes).strip ())
+            yield '\n'.join (out)
+        except Exception as e:
+            logging.error (f'invalid xml document {item} {e}')
+
 sources = dict(
     aljazeera=partial(sourceHtml, f['aljazeera']),
     bbcarabic=partial(sourceHtml, f['bbcarabic']),
@@ -192,6 +209,7 @@ sources = dict(
     json=sourceJson,
     epub=sourceEpub,
     tei2=sourceTEI2,
+    opensubtitles=sourceOpenSubtitles,
     )
 
 charMap = {
```