diff options
Diffstat (limited to 'corpus')
-rw-r--r-- | corpus/aljazeera/metadata.yaml | 10 | ||||
-rw-r--r-- | corpus/arwiki/metadata.yaml | 10 | ||||
-rw-r--r-- | corpus/bbcarabic/metadata.yaml | 10 | ||||
-rw-r--r-- | corpus/hindawi/metadata.yaml | 10 | ||||
-rw-r--r-- | corpus/opensubtitles-2018/metadata.yaml | 10 | ||||
-rw-r--r-- | corpus/osm/metadata.yaml | 10 | ||||
-rw-r--r-- | corpus/tanzil-quaran/metadata.yaml | 4 | ||||
-rw-r--r-- | corpus/un-v1.0-tei/metadata.yaml | 10 |
8 files changed, 59 insertions, 15 deletions
diff --git a/corpus/aljazeera/metadata.yaml b/corpus/aljazeera/metadata.yaml index f5908c6..623cf91 100644 --- a/corpus/aljazeera/metadata.yaml +++ b/corpus/aljazeera/metadata.yaml @@ -1,7 +1,13 @@ source: - name: Al-Jazeera + name: + en: Al-Jazeera + ar: الجزيرة url: https://www.aljazeera.net/ extractor: name: Custom scripts date: 2019-07 -count: [547110, articles] +count: + num: 547110 + kind: + en: articles + ar: مقالة diff --git a/corpus/arwiki/metadata.yaml b/corpus/arwiki/metadata.yaml index 2a1ff72..b033919 100644 --- a/corpus/arwiki/metadata.yaml +++ b/corpus/arwiki/metadata.yaml @@ -1,8 +1,14 @@ source: - name: Arabic Wikipedia + name: + en: Arabic Wikipedia + ar: ويكيبيديا العربية url: https://ar.wikipedia.org/ extractor: name: wikiextractor url: https://github.com/attardi/wikiextractor/tree/3162bb6c3c9ebd2d15be507aa11d6fa818a454ac date: 2019-07-01 -count: [857386, articles] +count: + num: 857386 + kind: + en: articles + ar: مقالة diff --git a/corpus/bbcarabic/metadata.yaml b/corpus/bbcarabic/metadata.yaml index d1c06a5..085b80f 100644 --- a/corpus/bbcarabic/metadata.yaml +++ b/corpus/bbcarabic/metadata.yaml @@ -1,7 +1,13 @@ source: - name: BBC Arabic + name: + en: BBC Arabic + ar: بي بي سي العربية url: http://www.bbc.com/arabic extractor: name: Custom scripts date: 2019-07 -count: [149901, articles] +count: + num: 149901 + kind: + en: articles + ar: مقالة diff --git a/corpus/hindawi/metadata.yaml b/corpus/hindawi/metadata.yaml index c92e428..5cadffc 100644 --- a/corpus/hindawi/metadata.yaml +++ b/corpus/hindawi/metadata.yaml @@ -1,5 +1,11 @@ source: - name: hindawi.org + name: + en: hindawi.org + ar: هنداوي url: https://www.hindawi.org/books date: 2019-10-02 -count: [1709, books] +count: + num: 1709 + kind: + en: books + ar: كتاب diff --git a/corpus/opensubtitles-2018/metadata.yaml b/corpus/opensubtitles-2018/metadata.yaml index 310ae82..32454db 100644 --- a/corpus/opensubtitles-2018/metadata.yaml +++ b/corpus/opensubtitles-2018/metadata.yaml @@ -1,5 +1,11 @@ source: - name: ORPUS OpenSubtitles 2018 + name: + en: ORPUS OpenSubtitles 2018 + ar: ORPUS OpenSubtitles 2018 url: http://opus.nlpl.eu/OpenSubtitles-v2018.php date: 2018 -count: [94093, movies] +count: + num: 94093 + kind: + en: movies + ar: فيلم diff --git a/corpus/osm/metadata.yaml b/corpus/osm/metadata.yaml index e3aff73..1759531 100644 --- a/corpus/osm/metadata.yaml +++ b/corpus/osm/metadata.yaml @@ -1,5 +1,11 @@ source: - name: OpenStreetMap Arabic Labels + name: + en: OpenStreetMap Arabic Labels + ar: خريطة الشارع المفتوحة Arabic Labels url: https://planet.openstreetmap.org/ date: 2019-11-04 -count: [376148, labels] +count: + num: 376148 + kind: + en: labels + ar: labels diff --git a/corpus/tanzil-quaran/metadata.yaml b/corpus/tanzil-quaran/metadata.yaml index 0207da4..9932aa5 100644 --- a/corpus/tanzil-quaran/metadata.yaml +++ b/corpus/tanzil-quaran/metadata.yaml @@ -1,5 +1,7 @@ source: - name: tanzil.net Quran + name: + en: tanzil.net Quran + ar: القرآن (بن tanzil.net) url: http://tanzil.net/docs/download # notes: options Simple Enhanced and Text (for inclusion of diacritics) date: 2019-10-02 diff --git a/corpus/un-v1.0-tei/metadata.yaml b/corpus/un-v1.0-tei/metadata.yaml index 36eaa6d..224da94 100644 --- a/corpus/un-v1.0-tei/metadata.yaml +++ b/corpus/un-v1.0-tei/metadata.yaml @@ -1,5 +1,11 @@ source: - name: United Nations Parallel Corpus v1.0 + name: + en: United Nations Parallel Corpus v1.0 + ar: الأمم المتحدة Parallel Corpus v1.0 url: https://conferences.unite.un.org/UNCorpus/en/DownloadOverview date: 2016 -count: [116754, documents] +count: + num: 116754 + kind: + en: documents + ar: ملف |