From 38c9ed5b042ae488ee12287bf8c19457189889aa Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Fri, 8 Nov 2019 16:06:37 +0100
Subject: Add OpenSubtitles corpus

See issue #5.
---
 doc/index.html |  6 ++++--
 gen.sh         |  9 ++++++++-
 lulua/text.py  | 18 ++++++++++++++++++
 3 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/doc/index.html b/doc/index.html
index 19151b0..a390ddf 100644
--- a/doc/index.html
+++ b/doc/index.html
@@ -132,6 +132,8 @@ Arabic, another Arabic-language news site
   • 116,754 documents from the United Nations Parallel Corpus v1.0
+  • subtitles from 94,093 movies based on a
+    2018 OpenSubtitles dump
   • 1,709 ebooks from hindawi.org
   • and a plain-text copy of the Quran from tanzil.net

     summing up to roughly
-    825 million words or
-    5.5 billion characters.
+    1.2 billion words or
+    7.6 billion characters.
     The plot below shows ا ل ي م و ن can be considered the most frequently
     used letters in the Arabic language.

diff --git a/gen.sh b/gen.sh
index 588b8ba..762eecc 100755
--- a/gen.sh
+++ b/gen.sh
@@ -60,6 +60,10 @@ rule write-tei2
     command = find \$in -type f -name '*.xml' | lulua-write tei2 \$layout | lulua-analyze combine > \$out
     pool = write
 
+rule write-opensubtitles
+    command = find \$in -type f -name '*.xml' | lulua-write opensubtitles \$layout | lulua-analyze combine > \$out
+    pool = write
+
 rule write-arwiki
     command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out
     pool = write
@@ -129,7 +133,10 @@ build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-2019
 build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei || \$statsdir/${l}
     layout = ${l}
 
-build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle || \$statsdir/${l}
+build \$statsdir/${l}/opensubtitles-2018.pickle: write-opensubtitles \$corpusdir/opensubtitles-2018 || \$statsdir/${l}
+    layout = ${l}
+
+build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l}
 
 build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build
     layout = ${l}

diff --git a/lulua/text.py b/lulua/text.py
index 182c717..382877b 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -185,6 +185,23 @@ def sourceTEI2 (item):
         except Exception:
             logging.error (f'invalid xml document {item}')
 
+def sourceOpenSubtitles (item):
+    """
+    XML-based format used by the (raw!) OpenSubtitles dump found here:
+    http://opus.nlpl.eu/OpenSubtitles-v2018.php
+    """
+    with open (item.rstrip (), 'rb') as fd:
+        try:
+            out = []
+            doc = xml.dom.minidom.parse (fd)
+            for s in doc.getElementsByTagName ('s'):
+                # strip newlines, which are mostly unintentional due to
+                # pretty-printed xml structure
+                out.append (getText (s.childNodes).strip ())
+            yield '\n'.join (out)
+        except Exception as e:
+            logging.error (f'invalid xml document {item} {e}')
+
 sources = dict(
     aljazeera=partial(sourceHtml, f['aljazeera']),
     bbcarabic=partial(sourceHtml, f['bbcarabic']),
@@ -192,6 +209,7 @@ sources = dict(
     json=sourceJson,
     epub=sourceEpub,
     tei2=sourceTEI2,
+    opensubtitles=sourceOpenSubtitles,
 )
 
 charMap = {
-- 
cgit v1.2.3
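For orientation, the new opensubtitles source added above does the following with each XML file of the raw dump: it collects the text of every <s> (sentence) element, strips the newlines introduced by the pretty-printed markup, and joins the sentences into one block of text for the downstream lulua-analyze combine step. The standalone Python sketch below reproduces that logic against an invented sample document; the getText helper here is only an assumed stand-in for the one already defined in lulua/text.py, and the sample markup is illustrative, not taken from the corpus.

    import xml.dom.minidom

    def getText (nodes):
        # assumed stand-in for the getText helper in lulua/text.py:
        # recursively concatenate the text content below the given nodes
        parts = []
        for node in nodes:
            if node.nodeType == node.TEXT_NODE:
                parts.append (node.data)
            else:
                parts.append (getText (node.childNodes))
        return ''.join (parts)

    # invented sample resembling one pretty-printed subtitle file
    sample = """<document>
      <s id="1">
        <time id="T1S" value="00:00:01,000" />
        مرحبا بالعالم
      </s>
      <s id="2">
        كيف حالك؟
      </s>
    </document>"""

    doc = xml.dom.minidom.parseString (sample)
    out = []
    for s in doc.getElementsByTagName ('s'):
        # drop the surrounding whitespace that only exists because of indentation
        out.append (getText (s.childNodes).strip ())
    print ('\n'.join (out))
    # prints the two sample sentences, one per line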