diff options
-rw-r--r-- | .gitignore | 3 | ||||
-rwxr-xr-x | gen.sh | 34 | ||||
-rw-r--r-- | lulua/test_text.py | 14 | ||||
-rw-r--r-- | lulua/text.py | 292 | ||||
-rw-r--r-- | setup.py | 2 |
5 files changed, 244 insertions, 101 deletions
@@ -6,3 +6,6 @@ __pycache__ .ninja_* build.ninja _build +corpus/*/*.tar.br +corpus/arwiki/*.bz2 +corpus/osm/*.pbf @@ -47,35 +47,35 @@ rule analyze-heat command = lulua-analyze -l \$layout keyheatmap < \$in > \$out rule write-bbcarabic - command = find \$in -type f | lulua-write bbcarabic \$layout | lulua-analyze combine > \$out + command = find \$in | lulua-write \$layout file brotli tar bbcarabic | lulua-analyze combine > \$out pool = write rule write-aljazeera - command = find \$in -type f | lulua-write aljazeera \$layout | lulua-analyze combine > \$out + command = find \$in | lulua-write \$layout file brotli tar aljazeera | lulua-analyze combine > \$out pool = write rule write-epub - command = find \$in -type f | lulua-write epub \$layout | lulua-analyze combine > \$out + command = find \$in | lulua-write \$layout epub | lulua-analyze combine > \$out pool = write rule write-tanzil - command = echo \$in | lulua-write text \$layout | lulua-analyze combine > \$out + command = find \$in | lulua-write \$layout file text | lulua-analyze combine > \$out pool = write rule write-tei2 - command = find \$in -type f -name '*.xml' | lulua-write tei2 \$layout | lulua-analyze combine > \$out + command = find \$in | lulua-write \$layout file brotli tar xml tei2 | lulua-analyze combine > \$out pool = write rule write-opensubtitles - command = find \$in -type f -name '*.xml' | lulua-write opensubtitles \$layout | lulua-analyze combine > \$out + command = find \$in | lulua-write \$layout file brotli tar xml opensubtitles | lulua-analyze combine > \$out pool = write rule write-arwiki - command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write json \$layout | lulua-analyze combine > \$out + command = \$wikiextractor -ns 0 --json -o - \$in 2>/dev/null | jq .text | lulua-write \$layout json | lulua-analyze combine > \$out pool = write rule write-osm - command = \$osmconvert --csv='name:ar' \$in | sort -u | lulua-write lines \$layout | lulua-analyze combine > \$out + command = \$osmconvert --csv='name:ar' \$in | sort -u | lulua-write \$layout lines | lulua-analyze combine > \$out pool = write rule combine @@ -152,21 +152,27 @@ build \$reportdir/ar-lulua-w64.zip: zipR \$tempdir/ar-lulua-w64 | $deps EOF +bbcarabicfiles=`find $corpusdir/bbcarabic/ -type f -name '*.tar.br' | tr '\n' ' '` +aljazeerafiles=`find $corpusdir/aljazeera/ -type f -name '*.tar.br' | tr '\n' ' '` +unfiles=`find $corpusdir/un-v1.0-tei/ -type f -name '*.tar.br' | tr '\n' ' '` +opensubtitlesfiles=`find $corpusdir/opensubtitles-2018/ -type f -name '*.tar.br' | tr '\n' ' '` +hindawifiles=`find $corpusdir/hindawi/ -type f -name '*.epub' | tr '\n' ' '` + # targets for every layout for l in $layouts; do cat <<EOF build \$statsdir/${l}: mkdir -build \$statsdir/${l}/bbcarabic.pickle: write-bbcarabic \$corpusdir/bbcarabic/raw || \$statsdir/${l} +build \$statsdir/${l}/bbcarabic.pickle: write-bbcarabic $bbcarabicfiles || \$statsdir/${l} layout = ${l} -build \$statsdir/${l}/aljazeera.pickle: write-aljazeera \$corpusdir/aljazeera/raw || \$statsdir/${l} +build \$statsdir/${l}/aljazeera.pickle: write-aljazeera $aljazeerafiles || \$statsdir/${l} layout = ${l} -build \$statsdir/${l}/hindawi.pickle: write-epub \$corpusdir/hindawi/raw || \$statsdir/${l} +build \$statsdir/${l}/hindawi.pickle: write-epub $hindawifiles || \$statsdir/${l} layout = ${l} -build \$statsdir/${l}/tanzil-quaran.pickle: write-tanzil \$corpusdir/tanzil-quaran/plain.txt.lz || \$statsdir/${l} +build \$statsdir/${l}/tanzil-quaran.pickle: write-tanzil \$corpusdir/tanzil-quaran/plain.txt || \$statsdir/${l} layout = ${l} build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l} @@ -175,10 +181,10 @@ build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-2019 build \$statsdir/${l}/osm.pickle: write-osm \$corpusdir/osm/planet-191104.osm.pbf || \$statsdir/${l} \$osmconvert layout = ${l} -build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 \$corpusdir/un-v1.0-tei/raw || \$statsdir/${l} +build \$statsdir/${l}/un-v1.0-tei.pickle: write-tei2 $unfiles || \$statsdir/${l} layout = ${l} -build \$statsdir/${l}/opensubtitles-2018.pickle: write-opensubtitles \$corpusdir/opensubtitles-2018/raw || \$statsdir/${l} +build \$statsdir/${l}/opensubtitles-2018.pickle: write-opensubtitles $opensubtitlesfiles || \$statsdir/${l} layout = ${l} build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil-quaran.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/osm.pickle \$statsdir/${l}/hindawi.pickle \$statsdir/${l}/un-v1.0-tei.pickle \$statsdir/${l}/opensubtitles-2018.pickle || \$statsdir/${l} diff --git a/lulua/test_text.py b/lulua/test_text.py index 65aa3a1..cd8ae7a 100644 --- a/lulua/test_text.py +++ b/lulua/test_text.py @@ -18,7 +18,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. -from .text import charMap, mapChars +import brotli +from io import BytesIO + +from .text import charMap, mapChars, BrotliFile def test_map_chars_mapped (): """ Make sure all chars in the map are mapped correctly """ @@ -46,3 +49,12 @@ def test_map_chars_not_mapped (): outText = mapChars (inText, charMap) assert outText == expectText +def test_brotlifile (): + compressed = brotli.compress (b'hello world') + for chunk in (1, 2, 3, 1024, None): + f = BrotliFile (BytesIO (compressed), chunk) + s = f.read (1) + assert s == b'h' + s = f.read () + assert s == b'ello world' + diff --git a/lulua/text.py b/lulua/text.py index 41be637..77b259e 100644 --- a/lulua/text.py +++ b/lulua/text.py @@ -22,17 +22,19 @@ Text/corpus handling tools """ -import sys, argparse, pickle, json, logging, xml.dom.minidom -from io import StringIO +import sys, argparse, pickle, json, logging, xml.dom.minidom, queue +from io import StringIO, BytesIO from functools import partial +from itertools import chain from multiprocessing import Process, Queue, cpu_count, current_process from subprocess import Popen, PIPE + from tqdm import tqdm import ebooklib from ebooklib import epub - import html5lib from html5lib.filters.base import Filter +import brotli from .keyboard import defaultKeyboards from .layout import defaultLayouts @@ -108,40 +110,78 @@ class HTMLSerializer(object): else: assert False -f = dict( - aljazeera=lambda x: x['name'] == 'div' and x['data'].get ((None, 'id')) == 'DynamicContentContainer', - bbcarabic=lambda x: x['name'] == 'div' and x['data'].get ((None, 'property')) == 'articleBody', - ) - -class LzipFile: - __slots__ = ('p', ) +class BrotliFile: + __slots__ = ('decompressor', 'readchunk', 'fd', 'buf') - def __init__ (self, path): - self.p = Popen (['/usr/bin/lzip', '-c', '-d', path], stdout=PIPE) + def __init__ (self, fd, readchunk=100*1024): + self.fd = fd + self.readchunk = readchunk + self.decompressor = brotli.Decompressor () + self.buf = b'' def __enter__ (self): return self def __exit__ (self, exc_type, exc_val, exc_tb): - self.close () return True def read (self, num=None): - return self.p.stdout.read (num) + while not self.decompressor.is_finished (): + if num is not None and len (self.buf) >= num: + break + self.buf += self.decompressor.process (self.fd.read (self.readchunk)) + if num is not None: + b = self.buf[0:num] + self.buf = self.buf[num:] + else: + b = self.buf + self.buf = b'' + return b + + def seekable (self): + return False def close (self): - self.p.wait () - assert self.p.returncode == 0 + self.decompressor = None -def sourceHtml (selectFunc, item): - with LzipFile (item.rstrip ()) as fd: - document = html5lib.parse (fd) - walker = html5lib.getTreeWalker("etree") - stream = walker (document) - s = HTMLSerializer() - yield ''.join (s.serialize(Select (stream, selectFunc))) +def filterTar (fd): + # Python’s tarfile module is painfully slow. We can do better. + pos = 0 + blocksize = 512 + emptyblock = b'\0'*blocksize -def sourceEpub (item): + while True: + # read header + header = fd.read (blocksize) + pos += blocksize + if header == b'' or header == emptyblock: + break + assert header[256:256+8] == b'\0ustar ', (header[256:256+8]) + size = int (header[124:124+12].rstrip (b'\0'), base=8) + + # read body + if size > 0: + yield BytesIO (fd.read (size)) + pos += size + + # align to next 512 byte block + into = pos%blocksize + if into != 0: + pad = blocksize-into + fd.read (pad) + pos += pad + +def filterBrotli (fd): + yield BrotliFile (fd) + +def filterHtml (selectFunc, fd): + document = html5lib.parse (fd) + walker = html5lib.getTreeWalker("etree") + stream = walker (document) + s = HTMLSerializer() + yield ''.join (s.serialize(Select (stream, selectFunc))) + +def filterEpub (item): """ epub reader """ book = epub.read_epub (item.rstrip ()) logging.debug (f'reading ebook {item}') @@ -155,66 +195,95 @@ def sourceEpub (item): s = HTMLSerializer() yield ''.join (s.serialize (stream)) -def sourceText (item): - with LzipFile (item.rstrip ()) as fd: - yield fd.read ().decode ('utf-8') +def filterText (fd): + yield fd.read ().decode ('utf-8') -def sourceLines (item): +def filterLines (item): """ Read items (i.e. lines) as is """ yield item -def sourceJson (item): +def filterJson (item): yield json.loads (item) +def filterFile (item): + with open (item.rstrip (), 'rb') as fd: + yield fd + +def filterXml (fd): + try: + yield xml.dom.minidom.parse (fd) + except Exception: + logging.error (f'invalid xml document {fd}') + def getText (nodelist): + """ Helper to retrieve text from an XML node list """ rc = [] for node in nodelist: if node.nodeType == node.TEXT_NODE: rc.append(node.data) return ''.join(rc) -def sourceTEI2 (item): +def filterTEI2 (doc): """ TEI.2 format used for United Nations parallel corpus """ - with open (item.rstrip (), 'rb') as fd: - try: - out = [] - doc = xml.dom.minidom.parse (fd) - for text in doc.getElementsByTagName ('text'): - for body in text.getElementsByTagName ('body'): - for p in body.getElementsByTagName ('p'): - for s in p.getElementsByTagName ('s'): - out.append (getText (s.childNodes)) - out.append ('') - yield '\n'.join (out) - except Exception: - logging.error (f'invalid xml document {item}') - -def sourceOpenSubtitles (item): + out = [] + for text in doc.getElementsByTagName ('text'): + for body in text.getElementsByTagName ('body'): + for p in body.getElementsByTagName ('p'): + for s in p.getElementsByTagName ('s'): + out.append (getText (s.childNodes)) + out.append ('') + yield '\n'.join (out) + +def filterOpenSubtitles (doc): """ XML-based format used by the (raw!) OpenSubtitles dump found here: http://opus.nlpl.eu/OpenSubtitles-v2018.php """ - with open (item.rstrip (), 'rb') as fd: - try: - out = [] - doc = xml.dom.minidom.parse (fd) - for s in doc.getElementsByTagName ('s'): - # strip newlines, which are mostly unintentional due to - # pretty-printed xml structure - out.append (getText (s.childNodes).strip ()) - yield '\n'.join (out) - except Exception as e: - logging.error (f'invalid xml document {item} {e}') - -sources = dict( - aljazeera=partial(sourceHtml, f['aljazeera']), - bbcarabic=partial(sourceHtml, f['bbcarabic']), - text=sourceText, - json=sourceJson, - epub=sourceEpub, - tei2=sourceTEI2, - opensubtitles=sourceOpenSubtitles, - lines=sourceLines, + + out = [] + for s in doc.getElementsByTagName ('s'): + # strip newlines, which are mostly unintentional due to + # pretty-printed xml structure + out.append (getText (s.childNodes).strip ()) + yield '\n'.join (out) + +def filterMediawikiMarkdown (text): + """ + Convert mediawiki to markdown + """ + p = subprocess.Popen (['pandoc', '-f', 'mediawiki', '-t', 'markdown'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE) + p.stdin.write (text.encode ('utf-8')) + p.stdin.close () + text = p.stdout.read ().decode ('utf-8') + ret = p.wait () + # make sure we’re not leaking fd’s + p.stdout.close () + del p + if ret != 0: + logging.error ('pandoc rejected document') + else: + yield text + +f = dict( + aljazeera=lambda x: x['name'] == 'div' and x['data'].get ((None, 'id')) == 'DynamicContentContainer', + bbcarabic=lambda x: x['name'] == 'div' and x['data'].get ((None, 'property')) == 'articleBody', + ) + +filterAvail = dict( + aljazeera=partial(filterHtml, f['aljazeera']), + bbcarabic=partial(filterHtml, f['bbcarabic']), + text=filterText, + json=filterJson, + epub=filterEpub, + tei2=filterTEI2, + opensubtitles=filterOpenSubtitles, + lines=filterLines, + xml=filterXml, + file=filterFile, + tar=filterTar, + mediawikimarkdown=filterMediawikiMarkdown, + brotli=filterBrotli, ) charMap = { @@ -248,7 +317,14 @@ def mapChars (text, m): """ For all characters in text, replace if found in map m or keep as-is """ return ''.join (map (lambda x: m.get (x, x), text)) -def writeWorker (layout, sourceFunc, inq, outq): +def apply (fs, items): + """ Apply the first function fs[0] to all items, flatten the result and repeat """ + if not fs: + return items + else: + return apply (fs[1:], chain.from_iterable (map (fs[0], items))) + +def writeWorker (layout, funcs, inq, outq, statusq, benchmark): try: keyboard = defaultKeyboards['ibmpc105'] combined = makeCombined (keyboard) @@ -259,8 +335,13 @@ def writeWorker (layout, sourceFunc, inq, outq): if item is None: break - # extract (can be multiple items per source) - for text in sourceFunc (item): + # extract (can be multiple texts per item) + i = 0 + for text in apply (funcs, [item]): + if benchmark: + i += 1 + continue + # map chars; make sure we’re using unix line endings, which is # only one character text = mapChars (text, charMap).replace ('\r\n', '\n') @@ -278,28 +359,41 @@ def writeWorker (layout, sourceFunc, inq, outq): for s in stats: combined[s.name].update (s) - itemsProcessed += 1 - + i += 1 + # only update ocasionally, this is an expensive operation + statusq.put (i) + itemsProcessed += i if itemsProcessed > 0: outq.put (combined) else: outq.put (None) except Exception as e: # async exceptions - outq.put (None) - raise + outq.put (e) + +def statusWorker (statusq): + with tqdm (unit='item', smoothing=0) as bar: + while True: + try: + num = statusq.get (block=True, timeout=1) + if num is None: + break + bar.update (n=num) + except queue.Empty: + bar.update (n=0) def write (): """ Extract corpus source file, convert to plain text, map chars and create stats """ parser = argparse.ArgumentParser(description='Import text and create stats.') - parser.add_argument('-v', '--verbose', action='store_true', help='Enable debugging output') + parser.add_argument('--benchmark', action='store_true', help='Benchmark filter, no stats') + parser.add_argument('-j', '--jobs', metavar='NUM', + default=cpu_count (), type=int, help='Number of parallel jobs') parser.add_argument('-k', '--keyboard', metavar='KEYBOARD', default='ibmpc105', help='Physical keyboard name') - parser.add_argument('-j', '--jobs', metavar='NUM', - default=cpu_count (), help='Number of parallel jobs') - parser.add_argument('source', metavar='SOURCE', choices=sources.keys(), help='Data source extractor name') + parser.add_argument('-v', '--verbose', action='store_true', help='Enable debugging output') parser.add_argument('layout', metavar='LAYOUT', help='Keyboard layout name') + parser.add_argument('filter', metavar='FILTER', choices=filterAvail.keys(), nargs='+', help='Data filter') args = parser.parse_args() @@ -310,30 +404,36 @@ def write (): keyboard = defaultKeyboards[args.keyboard] layout = defaultLayouts[args.layout].specialize (keyboard) + filterSel = [filterAvail[x] for x in args.filter] # limit queue sizes to limit memory usage inq = Queue (args.jobs*2) outq = Queue (args.jobs+1) + statusq = Queue (args.jobs+1) logging.info (f'using {args.jobs} workers') workers = [] for i in range (args.jobs): p = Process(target=writeWorker, - args=(layout, sources[args.source], inq, outq), + args=(layout, filterSel, inq, outq, statusq, args.benchmark), daemon=True, name=f'worker-{i}') p.start() workers.append (p) + statusp = Process(target=statusWorker, + args=(statusq,), + daemon=True, + name=f'status') + statusp.start() + try: - with tqdm (unit='item') as bar: - for l in sys.stdin: - inq.put (l) - bar.update (n=1) - - # something is wrong - if not outq.empty (): - return 1 + for l in sys.stdin: + inq.put (l) + + # something is wrong + if not outq.empty (): + break except KeyboardInterrupt: pass @@ -342,12 +442,32 @@ def write (): for w in workers: inq.put (None) item = outq.get () + if isinstance (item, Exception): + raise item if item is not None: pickle.dump (item, sys.stdout.buffer, pickle.HIGHEST_PROTOCOL) assert outq.empty () + + statusq.put (None) + statusp.join () + # and then we can kill them for w in workers: w.join () return 0 +import bz2, sys, json, subprocess +from xml.etree.ElementTree import iterparse +def extractMediawiki (): + parser = argparse.ArgumentParser(description='Extract raw wikitext from mediawiki dump.') + parser.add_argument('file', metavar='FILE', help='bzip2-compressed dump') + args = parser.parse_args() + + with bz2.open (args.file, 'r') as fd: + for event, elem in iterparse (fd, ['start', 'end']): + if event == 'end' and elem.tag == '{http://www.mediawiki.org/xml/export-0.10/}text': + text = ''.join (elem.itertext ()) + json.dump (text, sys.stdout, ensure_ascii=False) + sys.stdout.write ('\n') + @@ -40,6 +40,7 @@ setup( 'html5lib', 'ebooklib', 'jinja2', + 'brotli', ], entry_points={ 'console_scripts': [ @@ -49,6 +50,7 @@ setup( 'lulua-import = lulua.layout:importFrom', 'lulua-optimize = lulua.optimize:optimize', 'lulua-write = lulua.text:write', + 'lulua-extract-mediawiki = lulua.text:extractMediawiki', ], }, package_data = { |