From 244197b5f8b1f4d73d4ab9ac838334860b55662c Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Sun, 10 May 2020 10:48:38 +0200 Subject: report: Add translated source table, asymmetry definition Also fix the layout break point. --- corpus/aljazeera/metadata.yaml | 10 +- corpus/arwiki/metadata.yaml | 10 +- corpus/bbcarabic/metadata.yaml | 10 +- corpus/hindawi/metadata.yaml | 10 +- corpus/opensubtitles-2018/metadata.yaml | 10 +- corpus/osm/metadata.yaml | 10 +- corpus/tanzil-quaran/metadata.yaml | 4 +- corpus/un-v1.0-tei/metadata.yaml | 10 +- lulua/data/report/index.html | 156 ++++++++++++++++++++++---------- lulua/data/report/style.css | 27 ++++-- lulua/report.py | 16 +++- 11 files changed, 197 insertions(+), 76 deletions(-) diff --git a/corpus/aljazeera/metadata.yaml b/corpus/aljazeera/metadata.yaml index f5908c6..623cf91 100644 --- a/corpus/aljazeera/metadata.yaml +++ b/corpus/aljazeera/metadata.yaml @@ -1,7 +1,13 @@ source: - name: Al-Jazeera + name: + en: Al-Jazeera + ar: الجزيرة url: https://www.aljazeera.net/ extractor: name: Custom scripts date: 2019-07 -count: [547110, articles] +count: + num: 547110 + kind: + en: articles + ar: مقالة diff --git a/corpus/arwiki/metadata.yaml b/corpus/arwiki/metadata.yaml index 2a1ff72..b033919 100644 --- a/corpus/arwiki/metadata.yaml +++ b/corpus/arwiki/metadata.yaml @@ -1,8 +1,14 @@ source: - name: Arabic Wikipedia + name: + en: Arabic Wikipedia + ar: ويكيبيديا العربية url: https://ar.wikipedia.org/ extractor: name: wikiextractor url: https://github.com/attardi/wikiextractor/tree/3162bb6c3c9ebd2d15be507aa11d6fa818a454ac date: 2019-07-01 -count: [857386, articles] +count: + num: 857386 + kind: + en: articles + ar: مقالة diff --git a/corpus/bbcarabic/metadata.yaml b/corpus/bbcarabic/metadata.yaml index d1c06a5..085b80f 100644 --- a/corpus/bbcarabic/metadata.yaml +++ b/corpus/bbcarabic/metadata.yaml @@ -1,7 +1,13 @@ source: - name: BBC Arabic + name: + en: BBC Arabic + ar: بي بي سي العربية url: http://www.bbc.com/arabic 
extractor: name: Custom scripts date: 2019-07 -count: [149901, articles] +count: + num: 149901 + kind: + en: articles + ar: مقالة diff --git a/corpus/hindawi/metadata.yaml b/corpus/hindawi/metadata.yaml index c92e428..5cadffc 100644 --- a/corpus/hindawi/metadata.yaml +++ b/corpus/hindawi/metadata.yaml @@ -1,5 +1,11 @@ source: - name: hindawi.org + name: + en: hindawi.org + ar: هنداوي url: https://www.hindawi.org/books date: 2019-10-02 -count: [1709, books] +count: + num: 1709 + kind: + en: books + ar: كتاب diff --git a/corpus/opensubtitles-2018/metadata.yaml b/corpus/opensubtitles-2018/metadata.yaml index 310ae82..32454db 100644 --- a/corpus/opensubtitles-2018/metadata.yaml +++ b/corpus/opensubtitles-2018/metadata.yaml @@ -1,5 +1,11 @@ source: - name: ORPUS OpenSubtitles 2018 + name: + en: ORPUS OpenSubtitles 2018 + ar: ORPUS OpenSubtitles 2018 url: http://opus.nlpl.eu/OpenSubtitles-v2018.php date: 2018 -count: [94093, movies] +count: + num: 94093 + kind: + en: movies + ar: فيلم diff --git a/corpus/osm/metadata.yaml b/corpus/osm/metadata.yaml index e3aff73..1759531 100644 --- a/corpus/osm/metadata.yaml +++ b/corpus/osm/metadata.yaml @@ -1,5 +1,11 @@ source: - name: OpenStreetMap Arabic Labels + name: + en: OpenStreetMap Arabic Labels + ar: خريطة الشارع المفتوحة Arabic Labels url: https://planet.openstreetmap.org/ date: 2019-11-04 -count: [376148, labels] +count: + num: 376148 + kind: + en: labels + ar: labels diff --git a/corpus/tanzil-quaran/metadata.yaml b/corpus/tanzil-quaran/metadata.yaml index 0207da4..9932aa5 100644 --- a/corpus/tanzil-quaran/metadata.yaml +++ b/corpus/tanzil-quaran/metadata.yaml @@ -1,5 +1,7 @@ source: - name: tanzil.net Quran + name: + en: tanzil.net Quran + ar: القرآن (بن tanzil.net) url: http://tanzil.net/docs/download # notes: options Simple Enhanced and Text (for inclusion of diacritics) date: 2019-10-02 diff --git a/corpus/un-v1.0-tei/metadata.yaml b/corpus/un-v1.0-tei/metadata.yaml index 36eaa6d..224da94 100644 --- 
a/corpus/un-v1.0-tei/metadata.yaml +++ b/corpus/un-v1.0-tei/metadata.yaml @@ -1,5 +1,11 @@ source: - name: United Nations Parallel Corpus v1.0 + name: + en: United Nations Parallel Corpus v1.0 + ar: الأمم المتحدة Parallel Corpus v1.0 url: https://conferences.unite.un.org/UNCorpus/en/DownloadOverview date: 2016 -count: [116754, documents] +count: + num: 116754 + kind: + en: documents + ar: ملف diff --git a/lulua/data/report/index.html b/lulua/data/report/index.html index 0e4c779..cda1c9d 100644 --- a/lulua/data/report/index.html +++ b/lulua/data/report/index.html @@ -2,13 +2,15 @@ - لؤلؤة + لؤلؤة: لوحة مفاتيح عربية - + + + {# bokeh #} {% for f in bokehres.js_files -%} @@ -26,7 +28,7 @@

- +

لوحة مفاتيح عربية

Ergonomic Arabic Keyboard Layout

@@ -44,9 +46,9 @@
-
+
-
+

This is work in progress and contributions are welcome. Head over to GitHub to see where @@ -56,9 +58,9 @@

-
+
-
+

Goals

    @@ -77,9 +79,9 @@
-
+
-
+

Usage

@@ -99,9 +101,9 @@
-
+
-
+

Learn more

@@ -111,12 +113,12 @@
-
+

الأبجدية العربية

-
+

The Arabic alphabet

@@ -131,15 +133,56 @@ The novel corpus built for the following analysis consists of

+
+
+
+ +
+
+
+
+ + {% for c in corpus|sort(attribute='source.name.en') %} + + + {% set count = c.get ('count') %} + {% if count %} + {# use new style formatting, for some reason %7,d does not work #} + + {% else %} + + {% endif %} -
مصدركلامحروف
{{ c.source.name.ar }}{{ '{:7,d}'.format(count.num)|arabnum }} {{ count.kind.ar }}
- {% for c in corpus|sort(attribute='source.name') %} + {% set stats = c.get ('stats') %} + {% for k in ('words', 'characters') %} + {% set i = stats[k]|approx('ar') %} + + {% endfor %} + + {% endfor %} + + {% for k in ('words', 'characters') %} + {% set i = corpustotal[k]|approx('ar') %} + + {% endfor %} + +
SourceWordsCharacters
{{ '%5.1f'|format(i[0])|arabnum }} {{ i[1] }}
مجموع{{ '%5.1f'|format(i[0])|arabnum }} {{ i[1] }}
+
+
+
+
+
+
+
+
+ + {% for c in corpus|sort(attribute='source.name.en') %} - + {% set count = c.get ('count') %} {% if count %} {# use new style formatting, for some reason %7,d does not work #} - + {% else %} {% endif %} @@ -147,17 +190,28 @@ {% set stats = c.get ('stats') %} {% for k in ('words', 'characters') %} {% set i = stats[k]|approx %} - + {% endfor %} {% endfor %} {% for k in ('words', 'characters') %} {% set i = corpustotal[k]|approx %} - + {% endfor %}
SourceWordsCharacters
{{ c.source.name }}{{ c.source.name.en }}{{ '{:7,d}'.format(count[0])|numspace }} {{ count[1] }}{{ '{:7,d}'.format(count.num)|numspace }} {{ count.kind.en }}{{ '%5.1f'|format(i[0])|numspace }} {{ i[1] }}{{ '%5.1f'|format(i[0])|numspace }} {{ i[1] }}
Total{{ '%5.1f'|format(i[0])|numspace }} {{ i[1] }}{{ '%5.1f'|format(i[0])|numspace }} {{ i[1] }}
+
+

+ Like the other datasets, the chosen Quran representation does not include + all Quranic diacritization symbols. + + This makes the comparison fairer, since most keyboards presented below do + not include any of them. +

+
+

The plot below shows ا ل ي م و ن can be @@ -174,9 +228,9 @@

-
+
-
+

Arabic letter frequency distribution

@@ -187,9 +241,9 @@
-
+
-
+

Layout properties

@@ -206,7 +260,9 @@
cyan
thumb
-

Asymmetry is defined as the difference between left and right hand usage.

+

Asymmetry is defined as the difference between left and right hand button + usage, \(b_{left}\) and \(b_{right}\), each taken relative to the total button usage \(b_{total}\), and includes the thumb:

+ $$a = \frac{b_{left}}{b_{total}} - \frac{b_{right}}{b_{total}}$$

The layout proposed uses four layers and assumes a 102/105 key ISO @@ -289,9 +345,9 @@