Add OpenSubtitles corpus

See issue #5.
author: Lars-Dominik Braun <lars@6xq.net> 2019-11-08 16:06:37 +0100
committer: Lars-Dominik Braun <lars@6xq.net> 2019-11-08 21:34:15 +0100
commit: 38c9ed5b042ae488ee12287bf8c19457189889aa (patch)
tree: d4f49039eec711aa7c9ee21c691f46bc89316e48
parent: e31d8731531b41a909bfe33ddc134de07f0a7bab (diff)
download: lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.tar.gz
lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.tar.bz2
lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.zip
3 files changed, 30 insertions, 3 deletions
diff --git a/doc/index.html b/doc/index.html
index 19151b0..a390ddf 100644
--- a/doc/index.html
+++ b/doc/index.html
@@ -132,6 +132,8 @@
 			Arabic</a>, another Arabic-language news site</li>
 			<li>116,754 documents from the
 			<a href="https://conferences.unite.un.org/UNCorpus/en/DownloadOverview">United Nations Parallel Corpus v1.0</a></li>
+			<li>subtitles from 94,093 movies based on a
+			<a href="http://opus.nlpl.eu/OpenSubtitles-v2018.php">2018 OpenSubtitles dump</a></li>
 			<li>1,709 ebooks from <a
 			href="https://www.hindawi.org/books">hindawi.org</a></li>
 			<li>and a plain-text copy of the Quran from <a
@@ -140,8 +142,8 @@
 		</ul>
 		<p>
 		summing up to roughly
-		825 million words or
-		5.5 billion characters. <!-- == combined button presses -->
+		1.2 billion words or
+		7.6 billion characters. <!-- == combined button presses -->
 		<!-- -->
 		The plot below shows <bdo dir="ltr" lang="ar">ا ل ي م و ن</bdo> can be
 		considered the most frequently used letters in the Arabic language.
diff --git a/gen.sh b/gen.sh
index 588b8ba..762eecc 100755
--- a/gen.sh
+++ b/gen.sh
@@ -60,6 +60,10 @@ rule write-tei2
     command = find \$in -type f -name '*.xml' | lulua-write tei2 \$layout | lulua-analyze combine > \$out
     pool = write
 
+rule write-opensubtitles
+    command = find \$in -type f -name '*.xml' | lulua-write opensubtitles \$layout | lulua-analyze combine > \$out
+    pool = write
+
 rule write-arwiki
     command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out
     pool = write
@@ -129,7 +133,10 @@ build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-2019
 build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei || \$statsdir/${l}
     layout = ${l}
 
-build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle || \$statsdir/${l}
+build \$statsdir/${l}/opensubtitles-2018.pickle: write-opensubtitles \$corpusdir/opensubtitles-2018 || \$statsdir/${l}
+    layout = ${l}
+
+build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l}
 
 build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build
     layout = ${l}
diff --git a/lulua/text.py b/lulua/text.py
index 182c717..382877b 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -185,6 +185,23 @@ def sourceTEI2 (item):
         except Exception:
             logging.error (f'invalid xml document {item}')
 
+def sourceOpenSubtitles (item):
+    """
+    Yield the stripped getText() of every <s> element, joined by newlines, for
+    one XML file of the (raw!) dump at http://opus.nlpl.eu/OpenSubtitles-v2018.php
+    """
+    with open (item.rstrip (), 'rb') as fd:  # item may end in whitespace; strip it for open()
+        try:
+            out = []
+            doc = xml.dom.minidom.parse (fd)
+            for s in doc.getElementsByTagName ('s'):
+                # strip surrounding whitespace (mostly newlines), which is
+                # largely unintentional due to pretty-printed xml structure
+                out.append (getText (s.childNodes).strip ())
+            yield '\n'.join (out)  # a single string per input file
+        except Exception as e:  # on failure: log the error and yield nothing
+            logging.error (f'invalid xml document {item} {e}')
+
 sources = dict(
     aljazeera=partial(sourceHtml, f['aljazeera']),
     bbcarabic=partial(sourceHtml, f['bbcarabic']),
@@ -192,6 +209,7 @@ sources = dict(
     json=sourceJson,
     epub=sourceEpub,
     tei2=sourceTEI2,
+    opensubtitles=sourceOpenSubtitles,
     )
 
 charMap = {
author	Lars-Dominik Braun <lars@6xq.net>	2019-11-08 16:06:37 +0100
committer	Lars-Dominik Braun <lars@6xq.net>	2019-11-08 21:34:15 +0100
commit	38c9ed5b042ae488ee12287bf8c19457189889aa (patch)
tree	d4f49039eec711aa7c9ee21c691f46bc89316e48
parent	e31d8731531b41a909bfe33ddc134de07f0a7bab (diff)
download	lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.tar.gz lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.tar.bz2 lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.zip