From 810c8ff0bea17214b4e4c5ce802ad89b5ad6e2c9 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sat, 30 Nov 2019 14:19:00 +0100 Subject: Add missing corpus metadata files --- corpus/aljazeera/metadata.yaml | 7 +++++++ corpus/arwiki/metadata.yaml | 8 ++++++++ corpus/bbcarabic/metadata.yaml | 7 +++++++ corpus/hindawi/metadata.yaml | 5 +++++ corpus/opensubtitles-2018/metadata.yaml | 5 +++++ corpus/tanzil-quaran/metadata.yaml | 5 +++++ corpus/un-v1.0-tei/metadata.yaml | 5 +++++ 7 files changed, 42 insertions(+) create mode 100644 corpus/aljazeera/metadata.yaml create mode 100644 corpus/arwiki/metadata.yaml create mode 100644 corpus/bbcarabic/metadata.yaml create mode 100644 corpus/hindawi/metadata.yaml create mode 100644 corpus/opensubtitles-2018/metadata.yaml create mode 100644 corpus/tanzil-quaran/metadata.yaml create mode 100644 corpus/un-v1.0-tei/metadata.yaml (limited to 'corpus') diff --git a/corpus/aljazeera/metadata.yaml b/corpus/aljazeera/metadata.yaml new file mode 100644 index 0000000..f5908c6 --- /dev/null +++ b/corpus/aljazeera/metadata.yaml @@ -0,0 +1,7 @@ +source: + name: Al-Jazeera + url: https://www.aljazeera.net/ +extractor: + name: Custom scripts +date: 2019-07 +count: [547110, articles] diff --git a/corpus/arwiki/metadata.yaml b/corpus/arwiki/metadata.yaml new file mode 100644 index 0000000..2a1ff72 --- /dev/null +++ b/corpus/arwiki/metadata.yaml @@ -0,0 +1,8 @@ +source: + name: Arabic Wikipedia + url: https://ar.wikipedia.org/ +extractor: + name: wikiextractor + url: https://github.com/attardi/wikiextractor/tree/3162bb6c3c9ebd2d15be507aa11d6fa818a454ac +date: 2019-07-01 +count: [857386, articles] diff --git a/corpus/bbcarabic/metadata.yaml b/corpus/bbcarabic/metadata.yaml new file mode 100644 index 0000000..d1c06a5 --- /dev/null +++ b/corpus/bbcarabic/metadata.yaml @@ -0,0 +1,7 @@ +source: + name: BBC Arabic + url: http://www.bbc.com/arabic +extractor: + name: Custom scripts +date: 2019-07 +count: [149901, articles] diff --git 
a/corpus/hindawi/metadata.yaml b/corpus/hindawi/metadata.yaml new file mode 100644 index 0000000..c92e428 --- /dev/null +++ b/corpus/hindawi/metadata.yaml @@ -0,0 +1,5 @@ +source: + name: hindawi.org + url: https://www.hindawi.org/books +date: 2019-10-02 +count: [1709, books] diff --git a/corpus/opensubtitles-2018/metadata.yaml b/corpus/opensubtitles-2018/metadata.yaml new file mode 100644 index 0000000..310ae82 --- /dev/null +++ b/corpus/opensubtitles-2018/metadata.yaml @@ -0,0 +1,5 @@ +source: + name: OPUS OpenSubtitles 2018 + url: http://opus.nlpl.eu/OpenSubtitles-v2018.php +date: 2018 +count: [94093, movies] diff --git a/corpus/tanzil-quaran/metadata.yaml b/corpus/tanzil-quaran/metadata.yaml new file mode 100644 index 0000000..0207da4 --- /dev/null +++ b/corpus/tanzil-quaran/metadata.yaml @@ -0,0 +1,5 @@ +source: + name: tanzil.net Quran + url: http://tanzil.net/docs/download +# notes: options Simple Enhanced and Text (for inclusion of diacritics) +date: 2019-10-02 diff --git a/corpus/un-v1.0-tei/metadata.yaml b/corpus/un-v1.0-tei/metadata.yaml new file mode 100644 index 0000000..36eaa6d --- /dev/null +++ b/corpus/un-v1.0-tei/metadata.yaml @@ -0,0 +1,5 @@ +source: + name: United Nations Parallel Corpus v1.0 + url: https://conferences.unite.un.org/UNCorpus/en/DownloadOverview +date: 2016 +count: [116754, documents] -- cgit v1.2.3