diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2019-10-03 17:23:53 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2019-10-03 17:23:53 +0200 |
commit | 2d45ef655f8791037373ab83174fc6c3596227b0 (patch) | |
tree | a05d506928fcc16f8dfdddb860c6ce4c5193bfc4 /lulua | |
parent | 8048f6351fb4611134c2f6e2d9129ec025376914 (diff) | |
download | lulua-2d45ef655f8791037373ab83174fc6c3596227b0.tar.gz lulua-2d45ef655f8791037373ab83174fc6c3596227b0.tar.bz2 lulua-2d45ef655f8791037373ab83174fc6c3596227b0.zip |
text: Add epub reader and hindawi corpus
See issue #5.
Diffstat (limited to 'lulua')
-rw-r--r-- | lulua/text.py | 71 |
1 file changed, 50 insertions, 21 deletions
diff --git a/lulua/text.py b/lulua/text.py index 98c7824..b4b4b91 100644 --- a/lulua/text.py +++ b/lulua/text.py @@ -28,6 +28,8 @@ from functools import partial from multiprocessing import Process, Queue, cpu_count, current_process from subprocess import Popen, PIPE from tqdm import tqdm +import ebooklib +from ebooklib import epub import html5lib from html5lib.filters.base import Filter @@ -137,20 +139,35 @@ def sourceHtml (selectFunc, item): walker = html5lib.getTreeWalker("etree") stream = walker (document) s = HTMLSerializer() - return ''.join (s.serialize(Select (stream, selectFunc))) + yield ''.join (s.serialize(Select (stream, selectFunc))) + +def sourceEpub (item): + """ epub reader """ + book = epub.read_epub (item.rstrip ()) + logging.debug (f'reading ebook {item}') + for item in book.get_items_of_type (ebooklib.ITEM_DOCUMENT): + logging.debug (f'got item {item.get_name ()}') + # XXX: in theory html5lib should be able to detect the encoding of + # bytes(), but it does not. + document = html5lib.parse (item.get_content ().decode ('utf-8')) + walker = html5lib.getTreeWalker("etree") + stream = walker (document) + s = HTMLSerializer() + yield ''.join (s.serialize (stream)) def sourceText (item): with LzipFile (item.rstrip ()) as fd: - return fd.read ().decode ('utf-8') + yield fd.read ().decode ('utf-8') def sourceJson (item): - return json.loads (item) + yield json.loads (item) sources = dict( aljazeera=partial(sourceHtml, f['aljazeera']), bbcarabic=partial(sourceHtml, f['bbcarabic']), text=sourceText, json=sourceJson, + epub=sourceEpub, ) charMap = { @@ -184,6 +201,7 @@ def writeWorker (args, inq, outq): layout = defaultLayouts['null'].specialize (keyboard) w = Writer (layout) combined = dict ((cls.name, cls(w)) for cls in allStats) + itemsProcessed = 0 while True: keyboard = defaultKeyboards[args.keyboard] @@ -194,25 +212,30 @@ def writeWorker (args, inq, outq): if item is None: break - # extract - text = sources[args.source] (item) - text = ''.join (map 
(lambda x: charMap.get (x, x), text)) - # XXX sanity checks, disable - for c in charMap.keys (): - if c in text: - #print (c, 'is in text', file=sys.stderr) - assert False, c - - # stats - stats = [cls(w) for cls in allStats] - for match, event in w.type (StringIO (text)): + # extract (can be multiple items per source) + for text in sources[args.source] (item): + text = ''.join (map (lambda x: charMap.get (x, x), text)) + # XXX sanity checks, disable + for c in charMap.keys (): + if c in text: + #print (c, 'is in text', file=sys.stderr) + assert False, c + + # stats + stats = [cls(w) for cls in allStats] + for match, event in w.type (StringIO (text)): + for s in stats: + s.process (event) + for s in stats: - s.process (event) + combined[s.name].update (s) - for s in stats: - combined[s.name].update (s) + itemsProcessed += 1 - outq.put (combined) + if itemsProcessed > 0: + outq.put (combined) + else: + outq.put (None) except Exception as e: # async exceptions outq.put (None) @@ -222,6 +245,7 @@ def write (): """ Extract corpus source file, convert to plain text, map chars and create stats """ parser = argparse.ArgumentParser(description='Import text and create stats.') + parser.add_argument('-v', '--verbose', action='store_true', help='Enable debugging output') parser.add_argument('-k', '--keyboard', metavar='KEYBOARD', default='ibmpc105', help='Physical keyboard name') parser.add_argument('-j', '--jobs', metavar='NUM', @@ -231,7 +255,10 @@ def write (): args = parser.parse_args() - logging.basicConfig (level=logging.INFO) + if args.verbose: + logging.basicConfig (level=logging.DEBUG) + else: + logging.basicConfig (level=logging.INFO) # limit queue sizes to limit memory usage inq = Queue (args.jobs*2) @@ -260,7 +287,9 @@ def write (): # every one of them will consume exactly one item and write one in return for w in workers: inq.put (None) - pickle.dump (outq.get (), sys.stdout.buffer, pickle.HIGHEST_PROTOCOL) + item = outq.get () + if item is not None: + 
pickle.dump (item, sys.stdout.buffer, pickle.HIGHEST_PROTOCOL) assert outq.empty () # and then we can kill them for w in workers: |