summary refs log tree commit diff
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2019-11-06 16:48:20 +0100
committer Lars-Dominik Braun <lars@6xq.net> 2019-11-06 19:18:53 +0100
commit 43ad3e898a28798ac2f928041999997c24e7bf3c (patch)
tree e67264949651ecfffbacf5ee9fa6aff001333523
parent b5ed0d1a5fb280e616d442fd7f928f366cb68bc4 (diff)
download lulua-43ad3e898a28798ac2f928041999997c24e7bf3c.tar.gz
lulua-43ad3e898a28798ac2f928041999997c24e7bf3c.tar.bz2
lulua-43ad3e898a28798ac2f928041999997c24e7bf3c.zip
text: Add TEI.2 parser
-rw-r--r-- lulua/text.py 28
1 files changed, 27 insertions, 1 deletions
diff --git a/lulua/text.py b/lulua/text.py
index b4b4b91..182c717 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -22,7 +22,7 @@
Text/corpus handling tools
"""
-import sys, argparse, pickle, json, logging
+import sys, argparse, pickle, json, logging, xml.dom.minidom
from io import StringIO
from functools import partial
from multiprocessing import Process, Queue, cpu_count, current_process
@@ -162,12 +162,36 @@ def sourceText (item):
def sourceJson (item):
yield json.loads (item)
+def getText (nodelist):
+ rc = []
+ for node in nodelist:
+ if node.nodeType == node.TEXT_NODE:
+ rc.append(node.data)
+ return ''.join(rc)
+
+def sourceTEI2 (item):
+ """ TEI.2 format used for United Nations parallel corpus """
+ with open (item.rstrip (), 'rb') as fd:
+ try:
+ out = []
+ doc = xml.dom.minidom.parse (fd)
+ for text in doc.getElementsByTagName ('text'):
+ for body in text.getElementsByTagName ('body'):
+ for p in body.getElementsByTagName ('p'):
+ for s in p.getElementsByTagName ('s'):
+ out.append (getText (s.childNodes))
+ out.append ('')
+ yield '\n'.join (out)
+ except Exception:
+ logging.error (f'invalid xml document {item}')
+
sources = dict(
aljazeera=partial(sourceHtml, f['aljazeera']),
bbcarabic=partial(sourceHtml, f['bbcarabic']),
text=sourceText,
json=sourceJson,
epub=sourceEpub,
+ tei2=sourceTEI2,
)
charMap = {
@@ -191,6 +215,8 @@ charMap = {
'9': '٩',
'?': '؟',
';': '؛',
+ 'ﻹ': 'لإ',
+ 'ﻷ': 'لأ',
# nbsp
'\u00a0': ' ',
}