diff options
author | Lars-Dominik Braun <lars@6xq.net> | 2019-10-03 17:23:53 +0200 |
---|---|---|
committer | Lars-Dominik Braun <lars@6xq.net> | 2019-10-03 17:23:53 +0200 |
commit | 2d45ef655f8791037373ab83174fc6c3596227b0 (patch) | |
tree | a05d506928fcc16f8dfdddb860c6ce4c5193bfc4 | |
parent | 8048f6351fb4611134c2f6e2d9129ec025376914 (diff) | |
download | lulua-2d45ef655f8791037373ab83174fc6c3596227b0.tar.gz lulua-2d45ef655f8791037373ab83174fc6c3596227b0.tar.bz2 lulua-2d45ef655f8791037373ab83174fc6c3596227b0.zip |
text: Add epub reader and hindawi corpus
See issue #5.
-rw-r--r-- | doc/index.html | 6 | ||||
-rwxr-xr-x | gen.sh | 10 | ||||
-rw-r--r-- | lulua/text.py | 71 | ||||
-rw-r--r-- | setup.py | 1 |
4 files changed, 64 insertions, 24 deletions
diff --git a/doc/index.html b/doc/index.html index f9daf88..6749647 100644 --- a/doc/index.html +++ b/doc/index.html @@ -129,13 +129,15 @@ dump</a> of the <a href="https://ar.wikipedia.org/">Arabic Wikipedia</a> as of July 2019, extracted using <a href="https://github.com/attardi/wikiextractor/tree/3162bb6c3c9ebd2d15be507aa11d6fa818a454ac">wikiextractor</a> - containing 857386 articles</li> + containing 857,386 articles</li> + <li>1,709 ebooks from <a + href="https://www.hindawi.org/books">hindawi.org</a></li> <li>and a plain-text copy of the Quran from <a href="http://tanzil.net/docs/download">tanzil.net</a> using the options Simple Enhanced and Text (for inclusion of diacritics)</li> </ul> <p> - summing up to roughly 1.5 billion characters. + summing up to roughly two billion characters. <!-- --> The plot below shows <bdo dir="ltr" lang="ar">ا ل ي م و ن</bdo> can be considered the most frequently used letters in the Arabic language. @@ -39,6 +39,7 @@ rule render-xmodmap rule analyze-heat command = lulua-analyze -l \$layout keyheatmap < \$in > \$out +# XXX: add lulua-analyze combine here rule write-bbcarabic command = find \$in -type f | lulua-write bbcarabic \$layout > \$out pool = write @@ -47,6 +48,10 @@ rule write-aljazeera command = find \$in -type f | lulua-write aljazeera \$layout > \$out pool = write +rule write-epub + command = find \$in -type f | lulua-write epub \$layout | lulua-analyze combine > \$out + pool = write + rule write-tanzil command = echo \$in | lulua-write text \$layout | lulua-analyze combine > \$out pool = write @@ -98,13 +103,16 @@ build \$statsdir/${l}/bbcarabic.pickle: write-bbcarabic \$corpusdir/bbcarabic/ra build \$statsdir/${l}/aljazeera.pickle: write-aljazeera \$corpusdir/aljazeera/raw || \$statsdir/${l} layout = ${l} +build \$statsdir/${l}/hindawi.pickle: write-epub \$corpusdir/hindawi/raw || \$statsdir/${l} + layout = ${l} + build \$statsdir/${l}/tanzil.pickle: write-tanzil \$corpusdir/tanzil-quaran/plain.txt.lz || \$statsdir/${l} layout = ${l} build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l} layout = ${l} -build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle || \$statsdir/${l} +build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle || \$statsdir/${l} build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build layout = ${l} diff --git a/lulua/text.py b/lulua/text.py index 98c7824..b4b4b91 100644 --- a/lulua/text.py +++ b/lulua/text.py @@ -28,6 +28,8 @@ from functools import partial from multiprocessing import Process, Queue, cpu_count, current_process from subprocess import Popen, PIPE from tqdm import tqdm +import ebooklib +from ebooklib import epub import html5lib from html5lib.filters.base import Filter @@ -137,20 +139,35 @@ def sourceHtml (selectFunc, item): walker = html5lib.getTreeWalker("etree") stream = walker (document) s = HTMLSerializer() - return ''.join (s.serialize(Select (stream, selectFunc))) + yield ''.join (s.serialize(Select (stream, selectFunc))) + +def sourceEpub (item): + """ epub reader """ + book = epub.read_epub (item.rstrip ()) + logging.debug (f'reading ebook {item}') + for item in book.get_items_of_type (ebooklib.ITEM_DOCUMENT): + logging.debug (f'got item {item.get_name ()}') + # XXX: in theory html5lib should be able to detect the encoding of + # bytes(), but it does not. + document = html5lib.parse (item.get_content ().decode ('utf-8')) + walker = html5lib.getTreeWalker("etree") + stream = walker (document) + s = HTMLSerializer() + yield ''.join (s.serialize (stream)) def sourceText (item): with LzipFile (item.rstrip ()) as fd: - return fd.read ().decode ('utf-8') + yield fd.read ().decode ('utf-8') def sourceJson (item): - return json.loads (item) + yield json.loads (item) sources = dict( aljazeera=partial(sourceHtml, f['aljazeera']), bbcarabic=partial(sourceHtml, f['bbcarabic']), text=sourceText, json=sourceJson, + epub=sourceEpub, ) charMap = { @@ -184,6 +201,7 @@ def writeWorker (args, inq, outq): layout = defaultLayouts['null'].specialize (keyboard) w = Writer (layout) combined = dict ((cls.name, cls(w)) for cls in allStats) + itemsProcessed = 0 while True: keyboard = defaultKeyboards[args.keyboard] @@ -194,25 +212,30 @@ def writeWorker (args, inq, outq): if item is None: break - # extract - text = sources[args.source] (item) - text = ''.join (map (lambda x: charMap.get (x, x), text)) - # XXX sanity checks, disable - for c in charMap.keys (): - if c in text: - #print (c, 'is in text', file=sys.stderr) - assert False, c - - # stats - stats = [cls(w) for cls in allStats] - for match, event in w.type (StringIO (text)): + # extract (can be multiple items per source) + for text in sources[args.source] (item): + text = ''.join (map (lambda x: charMap.get (x, x), text)) + # XXX sanity checks, disable + for c in charMap.keys (): + if c in text: + #print (c, 'is in text', file=sys.stderr) + assert False, c + + # stats + stats = [cls(w) for cls in allStats] + for match, event in w.type (StringIO (text)): + for s in stats: + s.process (event) + for s in stats: - s.process (event) + combined[s.name].update (s) - for s in stats: - combined[s.name].update (s) + itemsProcessed += 1 - outq.put (combined) + if itemsProcessed > 0: + outq.put (combined) + else: + outq.put (None) except Exception as e: # async exceptions outq.put (None) @@ -222,6 +245,7 @@ def write (): """ Extract corpus source file, convert to plain text, map chars and create stats """ parser = argparse.ArgumentParser(description='Import text and create stats.') + parser.add_argument('-v', '--verbose', action='store_true', help='Enable debugging output') parser.add_argument('-k', '--keyboard', metavar='KEYBOARD', default='ibmpc105', help='Physical keyboard name') parser.add_argument('-j', '--jobs', metavar='NUM', @@ -231,7 +255,10 @@ def write (): args = parser.parse_args() - logging.basicConfig (level=logging.INFO) + if args.verbose: + logging.basicConfig (level=logging.DEBUG) + else: + logging.basicConfig (level=logging.INFO) # limit queue sizes to limit memory usage inq = Queue (args.jobs*2) @@ -260,7 +287,9 @@ def write (): # every one of them will consume exactly one item and write one in return for w in workers: inq.put (None) - pickle.dump (outq.get (), sys.stdout.buffer, pickle.HIGHEST_PROTOCOL) + item = outq.get () + if item is not None: + pickle.dump (item, sys.stdout.buffer, pickle.HIGHEST_PROTOCOL) assert outq.empty () # and then we can kill them for w in workers: @@ -38,6 +38,7 @@ setup( 'bokeh', 'tqdm', 'html5lib', + 'ebooklib', ], entry_points={ 'console_scripts': [ |