summary refs log tree commit diff
path: root/corpus
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2020-05-10 10:48:38 +0200
committer Lars-Dominik Braun <lars@6xq.net> 2020-05-10 10:49:23 +0200
commit 244197b5f8b1f4d73d4ab9ac838334860b55662c (patch)
tree ce2d8cd63289f3457f227adea6fbcb0896e8dc26 /corpus
parent 58a062d4f93b2ba362a94522d1a9b8e7c7f13469 (diff)
download lulua-244197b5f8b1f4d73d4ab9ac838334860b55662c.tar.gz
lulua-244197b5f8b1f4d73d4ab9ac838334860b55662c.tar.bz2
lulua-244197b5f8b1f4d73d4ab9ac838334860b55662c.zip
report: Add translated source table, asymmetry definition
Also fix the layout break point.
Diffstat (limited to 'corpus')
-rw-r--r-- corpus/aljazeera/metadata.yaml 10
-rw-r--r-- corpus/arwiki/metadata.yaml 10
-rw-r--r-- corpus/bbcarabic/metadata.yaml 10
-rw-r--r-- corpus/hindawi/metadata.yaml 10
-rw-r--r-- corpus/opensubtitles-2018/metadata.yaml 10
-rw-r--r-- corpus/osm/metadata.yaml 10
-rw-r--r-- corpus/tanzil-quaran/metadata.yaml 4
-rw-r--r-- corpus/un-v1.0-tei/metadata.yaml 10
8 files changed, 59 insertions, 15 deletions
diff --git a/corpus/aljazeera/metadata.yaml b/corpus/aljazeera/metadata.yaml
index f5908c6..623cf91 100644
--- a/corpus/aljazeera/metadata.yaml
+++ b/corpus/aljazeera/metadata.yaml
@@ -1,7 +1,13 @@
source:
- name: Al-Jazeera
+ name:
+ en: Al-Jazeera
+ ar: الجزيرة
url: https://www.aljazeera.net/
extractor:
name: Custom scripts
date: 2019-07
-count: [547110, articles]
+count:
+ num: 547110
+ kind:
+ en: articles
+ ar: مقالة
diff --git a/corpus/arwiki/metadata.yaml b/corpus/arwiki/metadata.yaml
index 2a1ff72..b033919 100644
--- a/corpus/arwiki/metadata.yaml
+++ b/corpus/arwiki/metadata.yaml
@@ -1,8 +1,14 @@
source:
- name: Arabic Wikipedia
+ name:
+ en: Arabic Wikipedia
+ ar: ويكيبيديا العربية
url: https://ar.wikipedia.org/
extractor:
name: wikiextractor
url: https://github.com/attardi/wikiextractor/tree/3162bb6c3c9ebd2d15be507aa11d6fa818a454ac
date: 2019-07-01
-count: [857386, articles]
+count:
+ num: 857386
+ kind:
+ en: articles
+ ar: مقالة
diff --git a/corpus/bbcarabic/metadata.yaml b/corpus/bbcarabic/metadata.yaml
index d1c06a5..085b80f 100644
--- a/corpus/bbcarabic/metadata.yaml
+++ b/corpus/bbcarabic/metadata.yaml
@@ -1,7 +1,13 @@
source:
- name: BBC Arabic
+ name:
+ en: BBC Arabic
+ ar: بي بي سي العربية
url: http://www.bbc.com/arabic
extractor:
name: Custom scripts
date: 2019-07
-count: [149901, articles]
+count:
+ num: 149901
+ kind:
+ en: articles
+ ar: مقالة
diff --git a/corpus/hindawi/metadata.yaml b/corpus/hindawi/metadata.yaml
index c92e428..5cadffc 100644
--- a/corpus/hindawi/metadata.yaml
+++ b/corpus/hindawi/metadata.yaml
@@ -1,5 +1,11 @@
source:
- name: hindawi.org
+ name:
+ en: hindawi.org
+ ar: هنداوي
url: https://www.hindawi.org/books
date: 2019-10-02
-count: [1709, books]
+count:
+ num: 1709
+ kind:
+ en: books
+ ar: كتاب
diff --git a/corpus/opensubtitles-2018/metadata.yaml b/corpus/opensubtitles-2018/metadata.yaml
index 310ae82..32454db 100644
--- a/corpus/opensubtitles-2018/metadata.yaml
+++ b/corpus/opensubtitles-2018/metadata.yaml
@@ -1,5 +1,11 @@
source:
- name: ORPUS OpenSubtitles 2018
+ name:
+ en: ORPUS OpenSubtitles 2018
+ ar: ORPUS OpenSubtitles 2018
url: http://opus.nlpl.eu/OpenSubtitles-v2018.php
date: 2018
-count: [94093, movies]
+count:
+ num: 94093
+ kind:
+ en: movies
+ ar: فيلم
diff --git a/corpus/osm/metadata.yaml b/corpus/osm/metadata.yaml
index e3aff73..1759531 100644
--- a/corpus/osm/metadata.yaml
+++ b/corpus/osm/metadata.yaml
@@ -1,5 +1,11 @@
source:
- name: OpenStreetMap Arabic Labels
+ name:
+ en: OpenStreetMap Arabic Labels
+ ar: خريطة الشارع المفتوحة Arabic Labels
url: https://planet.openstreetmap.org/
date: 2019-11-04
-count: [376148, labels]
+count:
+ num: 376148
+ kind:
+ en: labels
+ ar: labels
diff --git a/corpus/tanzil-quaran/metadata.yaml b/corpus/tanzil-quaran/metadata.yaml
index 0207da4..9932aa5 100644
--- a/corpus/tanzil-quaran/metadata.yaml
+++ b/corpus/tanzil-quaran/metadata.yaml
@@ -1,5 +1,7 @@
source:
- name: tanzil.net Quran
+ name:
+ en: tanzil.net Quran
+ ar: القرآن (بن tanzil.net)
url: http://tanzil.net/docs/download
# notes: options Simple Enhanced and Text (for inclusion of diacritics)
date: 2019-10-02
diff --git a/corpus/un-v1.0-tei/metadata.yaml b/corpus/un-v1.0-tei/metadata.yaml
index 36eaa6d..224da94 100644
--- a/corpus/un-v1.0-tei/metadata.yaml
+++ b/corpus/un-v1.0-tei/metadata.yaml
@@ -1,5 +1,11 @@
source:
- name: United Nations Parallel Corpus v1.0
+ name:
+ en: United Nations Parallel Corpus v1.0
+ ar: الأمم المتحدة Parallel Corpus v1.0
url: https://conferences.unite.un.org/UNCorpus/en/DownloadOverview
date: 2016
-count: [116754, documents]
+count:
+ num: 116754
+ kind:
+ en: documents
+ ar: ملف