text: Add epub reader and hindawi corpus

See issue #5.
author: Lars-Dominik Braun <lars@6xq.net> 2019-10-03 17:23:53 +0200
committer: Lars-Dominik Braun <lars@6xq.net> 2019-10-03 17:23:53 +0200
commit: 2d45ef655f8791037373ab83174fc6c3596227b0 (patch)
tree: a05d506928fcc16f8dfdddb860c6ce4c5193bfc4
parent: 8048f6351fb4611134c2f6e2d9129ec025376914 (diff)
download: lulua-2d45ef655f8791037373ab83174fc6c3596227b0.tar.gz
lulua-2d45ef655f8791037373ab83174fc6c3596227b0.tar.bz2
lulua-2d45ef655f8791037373ab83174fc6c3596227b0.zip
4 files changed, 64 insertions, 24 deletions
diff --git a/doc/index.html b/doc/index.html
index f9daf88..6749647 100644
--- a/doc/index.html
+++ b/doc/index.html
@@ -129,13 +129,15 @@
 			dump</a> of the <a href="https://ar.wikipedia.org/">Arabic
 			Wikipedia</a> as of July 2019, extracted using
 			<a href="https://github.com/attardi/wikiextractor/tree/3162bb6c3c9ebd2d15be507aa11d6fa818a454ac">wikiextractor</a>
-			containing 857386 articles</li>
+			containing 857,386 articles</li>
+			<li>1,709 ebooks from <a
+			href="https://www.hindawi.org/books">hindawi.org</a></li>
 			<li>and a plain-text copy of the Quran from <a
 			href="http://tanzil.net/docs/download">tanzil.net</a> using the
 			options Simple Enhanced and Text (for inclusion of diacritics)</li>
 		</ul>
 		<p>
-		summing up to roughly 1.5 billion characters.
+		summing up to roughly two billion characters.
 		<!-- -->
 		The plot below shows <bdo dir="ltr" lang="ar">ا ل ي م و ن</bdo> can be
 		considered the most frequently used letters in the Arabic language.
diff --git a/gen.sh b/gen.sh
index 77fbf81..0d7a066 100755
--- a/gen.sh
+++ b/gen.sh
@@ -39,6 +39,7 @@ rule render-xmodmap
 rule analyze-heat
     command = lulua-analyze -l \$layout keyheatmap < \$in > \$out
 
+# XXX: add lulua-analyze combine here
 rule write-bbcarabic
     command = find \$in -type f | lulua-write bbcarabic \$layout > \$out
     pool = write
@@ -47,6 +48,10 @@ rule write-aljazeera
     command = find \$in -type f | lulua-write aljazeera \$layout > \$out
     pool = write
 
+rule write-epub
+    command = find \$in -type f | lulua-write epub \$layout | lulua-analyze combine > \$out
+    pool = write
+
 rule write-tanzil
     command = echo \$in | lulua-write text \$layout | lulua-analyze combine > \$out
     pool = write
@@ -98,13 +103,16 @@ build \$statsdir/${l}/bbcarabic.pickle: write-bbcarabic \$corpusdir/bbcarabic/ra
 build \$statsdir/${l}/aljazeera.pickle: write-aljazeera \$corpusdir/aljazeera/raw || \$statsdir/${l}
     layout = ${l}
 
+build \$statsdir/${l}/hindawi.pickle: write-epub \$corpusdir/hindawi/raw || \$statsdir/${l}
+    layout = ${l}
+
 build \$statsdir/${l}/tanzil.pickle: write-tanzil \$corpusdir/tanzil-quaran/plain.txt.lz || \$statsdir/${l}
     layout = ${l}
 
 build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l}
     layout = ${l}
 
-build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle || \$statsdir/${l}
+build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle || \$statsdir/${l}
 
 build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build
     layout = ${l}
diff --git a/lulua/text.py b/lulua/text.py
index 98c7824..b4b4b91 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -28,6 +28,8 @@ from functools import partial
 from multiprocessing import Process, Queue, cpu_count, current_process
 from subprocess import Popen, PIPE
 from tqdm import tqdm
+import ebooklib
+from ebooklib import epub
 
 import html5lib
 from html5lib.filters.base import Filter
@@ -137,20 +139,35 @@ def sourceHtml (selectFunc, item):
         walker = html5lib.getTreeWalker("etree")
         stream = walker (document)
         s = HTMLSerializer()
-        return ''.join (s.serialize(Select (stream, selectFunc)))
+        yield ''.join (s.serialize(Select (stream, selectFunc)))
+
+def sourceEpub (item):
+    """ epub reader """
+    book = epub.read_epub (item.rstrip ())
+    logging.debug (f'reading ebook {item}')
+    for item in book.get_items_of_type (ebooklib.ITEM_DOCUMENT):
+        logging.debug (f'got item {item.get_name ()}')
+        # XXX: in theory html5lib should be able to detect the encoding of
+        # bytes input itself, but it does not, so decode explicitly as UTF-8.
+        document = html5lib.parse (item.get_content ().decode ('utf-8'))
+        walker = html5lib.getTreeWalker("etree")
+        stream = walker (document)
+        s = HTMLSerializer()
+        yield ''.join (s.serialize (stream))
 
 def sourceText (item):
     with LzipFile (item.rstrip ()) as fd:
-        return fd.read ().decode ('utf-8')
+        yield fd.read ().decode ('utf-8')
 
 def sourceJson (item):
-    return json.loads (item)
+    yield json.loads (item)
 
 sources = dict(
     aljazeera=partial(sourceHtml, f['aljazeera']),
     bbcarabic=partial(sourceHtml, f['bbcarabic']),
     text=sourceText,
     json=sourceJson,
+    epub=sourceEpub,
     )
 
 charMap = {
@@ -184,6 +201,7 @@ def writeWorker (args, inq, outq):
         layout = defaultLayouts['null'].specialize (keyboard)
         w = Writer (layout)
         combined = dict ((cls.name, cls(w)) for cls in allStats)
+        itemsProcessed = 0
 
         while True:
             keyboard = defaultKeyboards[args.keyboard]
@@ -194,25 +212,30 @@ def writeWorker (args, inq, outq):
             if item is None:
                 break
 
-            # extract
-            text = sources[args.source] (item)
-            text = ''.join (map (lambda x: charMap.get (x, x), text))
-            # XXX sanity checks, disable
-            for c in charMap.keys ():
-                if c in text:
-                    #print (c, 'is in text', file=sys.stderr)
-                    assert False, c
-
-            # stats
-            stats = [cls(w) for cls in allStats]
-            for match, event in w.type (StringIO (text)):
+            # extract (a source may yield multiple texts per input item)
+            for text in  sources[args.source] (item):
+                text = ''.join (map (lambda x: charMap.get (x, x), text))
+                # XXX sanity checks, disable
+                for c in charMap.keys ():
+                    if c in text:
+                        #print (c, 'is in text', file=sys.stderr)
+                        assert False, c
+
+                # stats
+                stats = [cls(w) for cls in allStats]
+                for match, event in w.type (StringIO (text)):
+                    for s in stats:
+                        s.process (event)
+
                 for s in stats:
-                    s.process (event)
+                    combined[s.name].update (s)
 
-            for s in stats:
-                combined[s.name].update (s)
+            itemsProcessed += 1
 
-        outq.put (combined)
+        if itemsProcessed > 0:
+            outq.put (combined)
+        else:
+            outq.put (None)
     except Exception as e:
         # async exceptions
         outq.put (None)
@@ -222,6 +245,7 @@ def write ():
     """ Extract corpus source file, convert to plain text, map chars and create stats """
 
     parser = argparse.ArgumentParser(description='Import text and create stats.')
+    parser.add_argument('-v', '--verbose', action='store_true', help='Enable debugging output')
     parser.add_argument('-k', '--keyboard', metavar='KEYBOARD',
             default='ibmpc105', help='Physical keyboard name')
     parser.add_argument('-j', '--jobs', metavar='NUM',
@@ -231,7 +255,10 @@ def write ():
 
     args = parser.parse_args()
 
-    logging.basicConfig (level=logging.INFO)
+    if args.verbose:
+        logging.basicConfig (level=logging.DEBUG)
+    else:
+        logging.basicConfig (level=logging.INFO)
 
     # limit queue sizes to limit memory usage
     inq = Queue (args.jobs*2)
@@ -260,7 +287,9 @@ def write ():
     # every one of them will consume exactly one item and write one in return
     for w in workers:
         inq.put (None)
-        pickle.dump (outq.get (), sys.stdout.buffer, pickle.HIGHEST_PROTOCOL)
+        item = outq.get ()
+        if item is not None:
+            pickle.dump (item, sys.stdout.buffer, pickle.HIGHEST_PROTOCOL)
     assert outq.empty ()
     # and then we can kill them
     for w in workers:
diff --git a/setup.py b/setup.py
index 56c64a6..2e20067 100644
--- a/setup.py
+++ b/setup.py
@@ -38,6 +38,7 @@ setup(
         'bokeh',
         'tqdm',
         'html5lib',
+        'ebooklib',
     ],
     entry_points={
     'console_scripts': [
author	Lars-Dominik Braun <lars@6xq.net>	2019-10-03 17:23:53 +0200
committer	Lars-Dominik Braun <lars@6xq.net>	2019-10-03 17:23:53 +0200
commit	2d45ef655f8791037373ab83174fc6c3596227b0 (patch)
tree	a05d506928fcc16f8dfdddb860c6ce4c5193bfc4
parent	8048f6351fb4611134c2f6e2d9129ec025376914 (diff)
download	lulua-2d45ef655f8791037373ab83174fc6c3596227b0.tar.gz lulua-2d45ef655f8791037373ab83174fc6c3596227b0.tar.bz2 lulua-2d45ef655f8791037373ab83174fc6c3596227b0.zip