From 2d45ef655f8791037373ab83174fc6c3596227b0 Mon Sep 17 00:00:00 2001 From: Lars-Dominik Braun Date: Thu, 3 Oct 2019 17:23:53 +0200 Subject: text: Add epub reader and hindawi corpus See issue #5. --- doc/index.html | 6 +++-- gen.sh | 10 ++++++++- lulua/text.py | 71 +++++++++++++++++++++++++++++++++++++++++----------------- setup.py | 1 + 4 files changed, 64 insertions(+), 24 deletions(-) diff --git a/doc/index.html b/doc/index.html index f9daf88..6749647 100644 --- a/doc/index.html +++ b/doc/index.html @@ -129,13 +129,15 @@ dump of the Arabic Wikipedia as of July 2019, extracted using wikiextractor - containing 857386 articles + containing 857,386 articles +
  • 1,709 ebooks from hindawi.org
  • and a plain-text copy of the Quran from tanzil.net using the options Simple Enhanced and Text (for inclusion of diacritics)
  • - summing up to roughly 1.5 billion characters. + summing up to roughly two billion characters. The plot below shows ا ل ي م و ن can be considered the most frequently used letters in the Arabic language. diff --git a/gen.sh b/gen.sh index 77fbf81..0d7a066 100755 --- a/gen.sh +++ b/gen.sh @@ -39,6 +39,7 @@ rule render-xmodmap rule analyze-heat command = lulua-analyze -l \$layout keyheatmap < \$in > \$out +# XXX: add lulua-analyze combine here rule write-bbcarabic command = find \$in -type f | lulua-write bbcarabic \$layout > \$out pool = write @@ -47,6 +48,10 @@ rule write-aljazeera command = find \$in -type f | lulua-write aljazeera \$layout > \$out pool = write +rule write-epub + command = find \$in -type f | lulua-write epub \$layout | lulua-analyze combine > \$out + pool = write + rule write-tanzil command = echo \$in | lulua-write text \$layout | lulua-analyze combine > \$out pool = write @@ -98,13 +103,16 @@ build \$statsdir/${l}/bbcarabic.pickle: write-bbcarabic \$corpusdir/bbcarabic/ra build \$statsdir/${l}/aljazeera.pickle: write-aljazeera \$corpusdir/aljazeera/raw || \$statsdir/${l} layout = ${l} +build \$statsdir/${l}/hindawi.pickle: write-epub \$corpusdir/hindawi/raw || \$statsdir/${l} + layout = ${l} + build \$statsdir/${l}/tanzil.pickle: write-tanzil \$corpusdir/tanzil-quaran/plain.txt.lz || \$statsdir/${l} layout = ${l} build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l} layout = ${l} -build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle || \$statsdir/${l} +build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle || \$statsdir/${l} build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build layout = ${l} diff --git a/lulua/text.py b/lulua/text.py index 98c7824..b4b4b91 100644 --- a/lulua/text.py +++ b/lulua/text.py @@ -28,6 +28,8 @@ from functools import partial from multiprocessing import Process, Queue, cpu_count, current_process from subprocess import Popen, PIPE from tqdm import tqdm +import ebooklib +from ebooklib import epub import html5lib from html5lib.filters.base import Filter @@ -137,20 +139,35 @@ def sourceHtml (selectFunc, item): walker = html5lib.getTreeWalker("etree") stream = walker (document) s = HTMLSerializer() - return ''.join (s.serialize(Select (stream, selectFunc))) + yield ''.join (s.serialize(Select (stream, selectFunc))) + +def sourceEpub (item): + """ epub reader """ + book = epub.read_epub (item.rstrip ()) + logging.debug (f'reading ebook {item}') + for item in book.get_items_of_type (ebooklib.ITEM_DOCUMENT): + logging.debug (f'got item {item.get_name ()}') + # XXX: in theory html5lib should be able to detect the encoding of + # bytes(), but it does not. + document = html5lib.parse (item.get_content ().decode ('utf-8')) + walker = html5lib.getTreeWalker("etree") + stream = walker (document) + s = HTMLSerializer() + yield ''.join (s.serialize (stream)) def sourceText (item): with LzipFile (item.rstrip ()) as fd: - return fd.read ().decode ('utf-8') + yield fd.read ().decode ('utf-8') def sourceJson (item): - return json.loads (item) + yield json.loads (item) sources = dict( aljazeera=partial(sourceHtml, f['aljazeera']), bbcarabic=partial(sourceHtml, f['bbcarabic']), text=sourceText, json=sourceJson, + epub=sourceEpub, ) charMap = { @@ -184,6 +201,7 @@ def writeWorker (args, inq, outq): layout = defaultLayouts['null'].specialize (keyboard) w = Writer (layout) combined = dict ((cls.name, cls(w)) for cls in allStats) + itemsProcessed = 0 while True: keyboard = defaultKeyboards[args.keyboard] @@ -194,25 +212,30 @@ def writeWorker (args, inq, outq): if item is None: break - # extract - text = sources[args.source] (item) - text = ''.join (map (lambda x: charMap.get (x, x), text)) - # XXX sanity checks, disable - for c in charMap.keys (): - if c in text: - #print (c, 'is in text', file=sys.stderr) - assert False, c - - # stats - stats = [cls(w) for cls in allStats] - for match, event in w.type (StringIO (text)): + # extract (can be multiple items per source) + for text in sources[args.source] (item): + text = ''.join (map (lambda x: charMap.get (x, x), text)) + # XXX sanity checks, disable + for c in charMap.keys (): + if c in text: + #print (c, 'is in text', file=sys.stderr) + assert False, c + + # stats + stats = [cls(w) for cls in allStats] + for match, event in w.type (StringIO (text)): + for s in stats: + s.process (event) + for s in stats: - s.process (event) + combined[s.name].update (s) - for s in stats: - combined[s.name].update (s) + itemsProcessed += 1 - outq.put (combined) + if itemsProcessed > 0: + outq.put (combined) + else: + outq.put (None) except Exception as e: # async exceptions outq.put (None) @@ -222,6 +245,7 @@ def write (): """ Extract corpus source file, convert to plain text, map chars and create stats """ parser = argparse.ArgumentParser(description='Import text and create stats.') + parser.add_argument('-v', '--verbose', action='store_true', help='Enable debugging output') parser.add_argument('-k', '--keyboard', metavar='KEYBOARD', default='ibmpc105', help='Physical keyboard name') parser.add_argument('-j', '--jobs', metavar='NUM', @@ -231,7 +255,10 @@ def write (): args = parser.parse_args() - logging.basicConfig (level=logging.INFO) + if args.verbose: + logging.basicConfig (level=logging.DEBUG) + else: + logging.basicConfig (level=logging.INFO) # limit queue sizes to limit memory usage inq = Queue (args.jobs*2) @@ -260,7 +287,9 @@ def write (): # every one of them will consume exactly one item and write one in return for w in workers: inq.put (None) - pickle.dump (outq.get (), sys.stdout.buffer, pickle.HIGHEST_PROTOCOL) + item = outq.get () + if item is not None: + pickle.dump (item, sys.stdout.buffer, pickle.HIGHEST_PROTOCOL) assert outq.empty () # and then we can kill them for w in workers: diff --git a/setup.py b/setup.py index 56c64a6..2e20067 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,7 @@ setup( 'bokeh', 'tqdm', 'html5lib', + 'ebooklib', ], entry_points={ 'console_scripts': [ -- cgit v1.2.3