summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2019-11-30 14:19:00 +0100
committerLars-Dominik Braun <lars@6xq.net>2019-11-30 14:19:00 +0100
commit810c8ff0bea17214b4e4c5ce802ad89b5ad6e2c9 (patch)
treef3db99cc0870af583c4b1d7d3f2376e943014fc1
parentfeb84c69b4e3e7294f69456b50a1fb678566cf24 (diff)
downloadlulua-810c8ff0bea17214b4e4c5ce802ad89b5ad6e2c9.tar.gz
lulua-810c8ff0bea17214b4e4c5ce802ad89b5ad6e2c9.tar.bz2
lulua-810c8ff0bea17214b4e4c5ce802ad89b5ad6e2c9.zip
Add missing corpus metadata files
-rw-r--r--.gitignore1
-rw-r--r--corpus/aljazeera/metadata.yaml7
-rw-r--r--corpus/arwiki/metadata.yaml8
-rw-r--r--corpus/bbcarabic/metadata.yaml7
-rw-r--r--corpus/hindawi/metadata.yaml5
-rw-r--r--corpus/opensubtitles-2018/metadata.yaml5
-rw-r--r--corpus/tanzil-quaran/metadata.yaml5
-rw-r--r--corpus/un-v1.0-tei/metadata.yaml5
8 files changed, 42 insertions, 1 deletions
diff --git a/.gitignore b/.gitignore
index 41a91da..26b54ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,4 +8,3 @@ doc/_temp
.ninja_*
build.ninja
stats/
-corpus/
diff --git a/corpus/aljazeera/metadata.yaml b/corpus/aljazeera/metadata.yaml
new file mode 100644
index 0000000..f5908c6
--- /dev/null
+++ b/corpus/aljazeera/metadata.yaml
@@ -0,0 +1,7 @@
+source:
+ name: Al-Jazeera
+ url: https://www.aljazeera.net/
+extractor:
+ name: Custom scripts
+date: 2019-07
+count: [547110, articles]
diff --git a/corpus/arwiki/metadata.yaml b/corpus/arwiki/metadata.yaml
new file mode 100644
index 0000000..2a1ff72
--- /dev/null
+++ b/corpus/arwiki/metadata.yaml
@@ -0,0 +1,8 @@
+source:
+ name: Arabic Wikipedia
+ url: https://ar.wikipedia.org/
+extractor:
+ name: wikiextractor
+ url: https://github.com/attardi/wikiextractor/tree/3162bb6c3c9ebd2d15be507aa11d6fa818a454ac
+date: 2019-07-01
+count: [857386, articles]
diff --git a/corpus/bbcarabic/metadata.yaml b/corpus/bbcarabic/metadata.yaml
new file mode 100644
index 0000000..d1c06a5
--- /dev/null
+++ b/corpus/bbcarabic/metadata.yaml
@@ -0,0 +1,7 @@
+source:
+ name: BBC Arabic
+ url: http://www.bbc.com/arabic
+extractor:
+ name: Custom scripts
+date: 2019-07
+count: [149901, articles]
diff --git a/corpus/hindawi/metadata.yaml b/corpus/hindawi/metadata.yaml
new file mode 100644
index 0000000..c92e428
--- /dev/null
+++ b/corpus/hindawi/metadata.yaml
@@ -0,0 +1,5 @@
+source:
+ name: hindawi.org
+ url: https://www.hindawi.org/books
+date: 2019-10-02
+count: [1709, books]
diff --git a/corpus/opensubtitles-2018/metadata.yaml b/corpus/opensubtitles-2018/metadata.yaml
new file mode 100644
index 0000000..310ae82
--- /dev/null
+++ b/corpus/opensubtitles-2018/metadata.yaml
@@ -0,0 +1,5 @@
+source:
+ name: OPUS OpenSubtitles 2018
+ url: http://opus.nlpl.eu/OpenSubtitles-v2018.php
+date: 2018
+count: [94093, movies]
diff --git a/corpus/tanzil-quaran/metadata.yaml b/corpus/tanzil-quaran/metadata.yaml
new file mode 100644
index 0000000..0207da4
--- /dev/null
+++ b/corpus/tanzil-quaran/metadata.yaml
@@ -0,0 +1,5 @@
+source:
+ name: tanzil.net Quran
+ url: http://tanzil.net/docs/download
+# notes: options "Simple Enhanced" and "Text" (for inclusion of diacritics)
+date: 2019-10-02
diff --git a/corpus/un-v1.0-tei/metadata.yaml b/corpus/un-v1.0-tei/metadata.yaml
new file mode 100644
index 0000000..36eaa6d
--- /dev/null
+++ b/corpus/un-v1.0-tei/metadata.yaml
@@ -0,0 +1,5 @@
+source:
+ name: United Nations Parallel Corpus v1.0
+ url: https://conferences.unite.un.org/UNCorpus/en/DownloadOverview
+date: 2016
+count: [116754, documents]