diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2019-11-06 16:48:20 +0100 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2019-11-06 19:18:53 +0100 |
commit | 43ad3e898a28798ac2f928041999997c24e7bf3c (patch) | |
tree | e67264949651ecfffbacf5ee9fa6aff001333523 /lulua | |
parent | b5ed0d1a5fb280e616d442fd7f928f366cb68bc4 (diff) | |
download | lulua-43ad3e898a28798ac2f928041999997c24e7bf3c.tar.gz lulua-43ad3e898a28798ac2f928041999997c24e7bf3c.tar.bz2 lulua-43ad3e898a28798ac2f928041999997c24e7bf3c.zip |
text: Add TEI.2 parser
Diffstat (limited to 'lulua')
-rw-r--r-- | lulua/text.py | 28 |
1 files changed, 27 insertions, 1 deletions
```diff
diff --git a/lulua/text.py b/lulua/text.py
index b4b4b91..182c717 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -22,7 +22,7 @@
 Text/corpus handling tools
 """
 
-import sys, argparse, pickle, json, logging
+import sys, argparse, pickle, json, logging, xml.dom.minidom
 from io import StringIO
 from functools import partial
 from multiprocessing import Process, Queue, cpu_count, current_process
@@ -162,12 +162,36 @@ def sourceText (item):
 def sourceJson (item):
     yield json.loads (item)
 
+def getText (nodelist):
+    rc = []
+    for node in nodelist:
+        if node.nodeType == node.TEXT_NODE:
+            rc.append(node.data)
+    return ''.join(rc)
+
+def sourceTEI2 (item):
+    """ TEI.2 format used for United Nations parallel corpus """
+    with open (item.rstrip (), 'rb') as fd:
+        try:
+            out = []
+            doc = xml.dom.minidom.parse (fd)
+            for text in doc.getElementsByTagName ('text'):
+                for body in text.getElementsByTagName ('body'):
+                    for p in body.getElementsByTagName ('p'):
+                        for s in p.getElementsByTagName ('s'):
+                            out.append (getText (s.childNodes))
+                        out.append ('')
+            yield '\n'.join (out)
+        except Exception:
+            logging.error (f'invalid xml document {item}')
+
 sources = dict(
     aljazeera=partial(sourceHtml, f['aljazeera']),
     bbcarabic=partial(sourceHtml, f['bbcarabic']),
     text=sourceText,
     json=sourceJson,
     epub=sourceEpub,
+    tei2=sourceTEI2,
     )
 
 charMap = {
@@ -191,6 +215,8 @@ charMap = {
     '9': '٩',
     '?': '؟',
     ';': '؛',
+    'ﻹ': 'لإ',
+    'ﻷ': 'لأ',
     # nbsp
     '\u00a0': ' ',
     }
```