diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2019-11-30 14:19:00 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2019-11-30 14:19:00 +0100 |
commit | 810c8ff0bea17214b4e4c5ce802ad89b5ad6e2c9 (patch) | |
tree | f3db99cc0870af583c4b1d7d3f2376e943014fc1 | |
parent | feb84c69b4e3e7294f69456b50a1fb678566cf24 (diff) | |
download | lulua-810c8ff0bea17214b4e4c5ce802ad89b5ad6e2c9.tar.gz lulua-810c8ff0bea17214b4e4c5ce802ad89b5ad6e2c9.tar.bz2 lulua-810c8ff0bea17214b4e4c5ce802ad89b5ad6e2c9.zip |
Add missing corpus metadata files
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | corpus/aljazeera/metadata.yaml | 7 | ||||
-rw-r--r-- | corpus/arwiki/metadata.yaml | 8 | ||||
-rw-r--r-- | corpus/bbcarabic/metadata.yaml | 7 | ||||
-rw-r--r-- | corpus/hindawi/metadata.yaml | 5 | ||||
-rw-r--r-- | corpus/opensubtitles-2018/metadata.yaml | 5 | ||||
-rw-r--r-- | corpus/tanzil-quaran/metadata.yaml | 5 | ||||
-rw-r--r-- | corpus/un-v1.0-tei/metadata.yaml | 5 |
8 files changed, 42 insertions, 1 deletions
@@ -8,4 +8,3 @@ doc/_temp .ninja_* build.ninja stats/ -corpus/ diff --git a/corpus/aljazeera/metadata.yaml b/corpus/aljazeera/metadata.yaml new file mode 100644 index 0000000..f5908c6 --- /dev/null +++ b/corpus/aljazeera/metadata.yaml @@ -0,0 +1,7 @@ +source: + name: Al-Jazeera + url: https://www.aljazeera.net/ +extractor: + name: Custom scripts +date: 2019-07 +count: [547110, articles] diff --git a/corpus/arwiki/metadata.yaml b/corpus/arwiki/metadata.yaml new file mode 100644 index 0000000..2a1ff72 --- /dev/null +++ b/corpus/arwiki/metadata.yaml @@ -0,0 +1,8 @@ +source: + name: Arabic Wikipedia + url: https://ar.wikipedia.org/ +extractor: + name: wikiextractor + url: https://github.com/attardi/wikiextractor/tree/3162bb6c3c9ebd2d15be507aa11d6fa818a454ac +date: 2019-07-01 +count: [857386, articles] diff --git a/corpus/bbcarabic/metadata.yaml b/corpus/bbcarabic/metadata.yaml new file mode 100644 index 0000000..d1c06a5 --- /dev/null +++ b/corpus/bbcarabic/metadata.yaml @@ -0,0 +1,7 @@ +source: + name: BBC Arabic + url: http://www.bbc.com/arabic +extractor: + name: Custom scripts +date: 2019-07 +count: [149901, articles] diff --git a/corpus/hindawi/metadata.yaml b/corpus/hindawi/metadata.yaml new file mode 100644 index 0000000..c92e428 --- /dev/null +++ b/corpus/hindawi/metadata.yaml @@ -0,0 +1,5 @@ +source: + name: hindawi.org + url: https://www.hindawi.org/books +date: 2019-10-02 +count: [1709, books] diff --git a/corpus/opensubtitles-2018/metadata.yaml b/corpus/opensubtitles-2018/metadata.yaml new file mode 100644 index 0000000..310ae82 --- /dev/null +++ b/corpus/opensubtitles-2018/metadata.yaml @@ -0,0 +1,5 @@ +source: + name: OPUS OpenSubtitles 2018 + url: http://opus.nlpl.eu/OpenSubtitles-v2018.php +date: 2018 +count: [94093, movies] diff --git a/corpus/tanzil-quaran/metadata.yaml b/corpus/tanzil-quaran/metadata.yaml new file mode 100644 index 0000000..0207da4 --- /dev/null +++ b/corpus/tanzil-quaran/metadata.yaml @@ -0,0 +1,5 @@ +source: + name: tanzil.net Quran + url: http://tanzil.net/docs/download +# notes: options Simple Enhanced and Text (for inclusion of diacritics) +date: 2019-10-02 diff --git a/corpus/un-v1.0-tei/metadata.yaml b/corpus/un-v1.0-tei/metadata.yaml new file mode 100644 index 0000000..36eaa6d --- /dev/null +++ b/corpus/un-v1.0-tei/metadata.yaml @@ -0,0 +1,5 @@ +source: + name: United Nations Parallel Corpus v1.0 + url: https://conferences.unite.un.org/UNCorpus/en/DownloadOverview +date: 2016 +count: [116754, documents] |