summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2019-11-08 16:06:37 +0100
committer Lars-Dominik Braun <lars@6xq.net> 2019-11-08 21:34:15 +0100
commit 38c9ed5b042ae488ee12287bf8c19457189889aa (patch)
tree d4f49039eec711aa7c9ee21c691f46bc89316e48
parent e31d8731531b41a909bfe33ddc134de07f0a7bab (diff)
download lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.tar.gz
lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.tar.bz2
lulua-38c9ed5b042ae488ee12287bf8c19457189889aa.zip
Add OpenSubtitles corpus
See issue #5.
-rw-r--r-- doc/index.html 6
-rwxr-xr-x gen.sh 9
-rw-r--r-- lulua/text.py 18
3 files changed, 30 insertions, 3 deletions
diff --git a/doc/index.html b/doc/index.html
index 19151b0..a390ddf 100644
--- a/doc/index.html
+++ b/doc/index.html
@@ -132,6 +132,8 @@
Arabic</a>, another Arabic-language news site</li>
<li>116,754 documents from the
<a href="https://conferences.unite.un.org/UNCorpus/en/DownloadOverview">United Nations Parallel Corpus v1.0</a></li>
+ <li>subtitles from 94,093 movies based on a
+ <a href="http://opus.nlpl.eu/OpenSubtitles-v2018.php">2018 OpenSubtitles dump</a></li>
<li>1,709 ebooks from <a
href="https://www.hindawi.org/books">hindawi.org</a></li>
<li>and a plain-text copy of the Quran from <a
@@ -140,8 +142,8 @@
</ul>
<p>
summing up to roughly
- 825 million words or
- 5.5 billion characters. <!-- == combined button presses -->
+ 1.2 billion words or
+ 7.6 billion characters. <!-- == combined button presses -->
<!-- -->
The plot below shows <bdo dir="ltr" lang="ar">ا ل ي م و ن</bdo> can be
considered the most frequently used letters in the Arabic language.
diff --git a/gen.sh b/gen.sh
index 588b8ba..762eecc 100755
--- a/gen.sh
+++ b/gen.sh
@@ -60,6 +60,10 @@ rule write-tei2
command = find \$in -type f -name '*.xml' | lulua-write tei2 \$layout | lulua-analyze combine > \$out
pool = write
+rule write-opensubtitles
+ command = find \$in -type f -name '*.xml' | lulua-write opensubtitles \$layout | lulua-analyze combine > \$out
+ pool = write
+
rule write-arwiki
command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out
pool = write
@@ -129,7 +133,10 @@ build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-2019
build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei || \$statsdir/${l}
layout = ${l}
-build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle || \$statsdir/${l}
+build \$statsdir/${l}/opensubtitles-2018.pickle: write-opensubtitles \$corpusdir/opensubtitles-2018 || \$statsdir/${l}
+ layout = ${l}
+
+build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l}
build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build
layout = ${l}
diff --git a/lulua/text.py b/lulua/text.py
index 182c717..382877b 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -185,6 +185,23 @@ def sourceTEI2 (item):
except Exception:
logging.error (f'invalid xml document {item}')
+def sourceOpenSubtitles (item):
+ """
+ Yield, as one newline-joined string, the text of every <s> element in the file named by item, a raw (!) OpenSubtitles XML dump as found here:
+ http://opus.nlpl.eu/OpenSubtitles-v2018.php (exceptions raised while processing the document are logged instead of propagated)
+ """
+ with open (item.rstrip (), 'rb') as fd:
+ try:
+ out = []
+ doc = xml.dom.minidom.parse (fd)
+ for s in doc.getElementsByTagName ('s'):
+ # strip leading/trailing whitespace (mostly newlines), which is
+ # mostly unintentional due to pretty-printed xml structure
+ out.append (getText (s.childNodes).strip ())
+ yield '\n'.join (out)
+ except Exception as e:
+ logging.error (f'invalid xml document {item} {e}')
+
sources = dict(
aljazeera=partial(sourceHtml, f['aljazeera']),
bbcarabic=partial(sourceHtml, f['bbcarabic']),
@@ -192,6 +209,7 @@ sources = dict(
json=sourceJson,
epub=sourceEpub,
tei2=sourceTEI2,
+ opensubtitles=sourceOpenSubtitles,
)
charMap = {