summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2019-10-03 17:23:53 +0200
committer: Lars-Dominik Braun <lars@6xq.net> 2019-10-03 17:23:53 +0200
commit: 2d45ef655f8791037373ab83174fc6c3596227b0 (patch)
tree: a05d506928fcc16f8dfdddb860c6ce4c5193bfc4
parent: 8048f6351fb4611134c2f6e2d9129ec025376914 (diff)
download: lulua-2d45ef655f8791037373ab83174fc6c3596227b0.tar.gz
lulua-2d45ef655f8791037373ab83174fc6c3596227b0.tar.bz2
lulua-2d45ef655f8791037373ab83174fc6c3596227b0.zip
text: Add epub reader and hindawi corpus
See issue #5.
-rw-r--r-- doc/index.html 6
-rwxr-xr-x gen.sh 10
-rw-r--r-- lulua/text.py 71
-rw-r--r-- setup.py 1
4 files changed, 64 insertions, 24 deletions
diff --git a/doc/index.html b/doc/index.html
index f9daf88..6749647 100644
--- a/doc/index.html
+++ b/doc/index.html
@@ -129,13 +129,15 @@
dump</a> of the <a href="https://ar.wikipedia.org/">Arabic
Wikipedia</a> as of July 2019, extracted using
<a href="https://github.com/attardi/wikiextractor/tree/3162bb6c3c9ebd2d15be507aa11d6fa818a454ac">wikiextractor</a>
- containing 857386 articles</li>
+ containing 857,386 articles</li>
+ <li>1,709 ebooks from <a
+ href="https://www.hindawi.org/books">hindawi.org</a></li>
<li>and a plain-text copy of the Quran from <a
href="http://tanzil.net/docs/download">tanzil.net</a> using the
options Simple Enhanced and Text (for inclusion of diacritics)</li>
</ul>
<p>
- summing up to roughly 1.5 billion characters.
+ summing up to roughly two billion characters.
<!-- -->
The plot below shows <bdo dir="ltr" lang="ar">ا ل ي م و ن</bdo> can be
considered the most frequently used letters in the Arabic language.
diff --git a/gen.sh b/gen.sh
index 77fbf81..0d7a066 100755
--- a/gen.sh
+++ b/gen.sh
@@ -39,6 +39,7 @@ rule render-xmodmap
rule analyze-heat
command = lulua-analyze -l \$layout keyheatmap < \$in > \$out
+# XXX: add lulua-analyze combine here
rule write-bbcarabic
command = find \$in -type f | lulua-write bbcarabic \$layout > \$out
pool = write
@@ -47,6 +48,10 @@ rule write-aljazeera
command = find \$in -type f | lulua-write aljazeera \$layout > \$out
pool = write
+rule write-epub
+ command = find \$in -type f | lulua-write epub \$layout | lulua-analyze combine > \$out
+ pool = write
+
rule write-tanzil
command = echo \$in | lulua-write text \$layout | lulua-analyze combine > \$out
pool = write
@@ -98,13 +103,16 @@ build \$statsdir/${l}/bbcarabic.pickle: write-bbcarabic \$corpusdir/bbcarabic/ra
build \$statsdir/${l}/aljazeera.pickle: write-aljazeera \$corpusdir/aljazeera/raw || \$statsdir/${l}
layout = ${l}
+build \$statsdir/${l}/hindawi.pickle: write-epub \$corpusdir/hindawi/raw || \$statsdir/${l}
+ layout = ${l}
+
build \$statsdir/${l}/tanzil.pickle: write-tanzil \$corpusdir/tanzil-quaran/plain.txt.lz || \$statsdir/${l}
layout = ${l}
build \$statsdir/${l}/arwiki.pickle: write-arwiki \$corpusdir/arwiki/arwiki-20190701-pages-articles.xml.bz2 || \$statsdir/${l}
layout = ${l}
-build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle || \$statsdir/${l}
+build \$statsdir/${l}/all.pickle: combine \$statsdir/${l}/bbcarabic.pickle \$statsdir/${l}/aljazeera.pickle \$statsdir/${l}/tanzil.pickle \$statsdir/${l}/arwiki.pickle \$statsdir/${l}/hindawi.pickle || \$statsdir/${l}
build \$docdir/_build/${l}.svg: render-svg || \$docdir/_build
layout = ${l}
diff --git a/lulua/text.py b/lulua/text.py
index 98c7824..b4b4b91 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -28,6 +28,8 @@ from functools import partial
from multiprocessing import Process, Queue, cpu_count, current_process
from subprocess import Popen, PIPE
from tqdm import tqdm
+import ebooklib
+from ebooklib import epub
import html5lib
from html5lib.filters.base import Filter
@@ -137,20 +139,35 @@ def sourceHtml (selectFunc, item):
walker = html5lib.getTreeWalker("etree")
stream = walker (document)
s = HTMLSerializer()
- return ''.join (s.serialize(Select (stream, selectFunc)))
+ yield ''.join (s.serialize(Select (stream, selectFunc)))
+
+def sourceEpub (item):
+ """ epub reader """
+ book = epub.read_epub (item.rstrip ())
+ logging.debug (f'reading ebook {item}')
+ for item in book.get_items_of_type (ebooklib.ITEM_DOCUMENT):
+ logging.debug (f'got item {item.get_name ()}')
+ # XXX: in theory html5lib should be able to detect the encoding of
+ # bytes(), but it does not.
+ document = html5lib.parse (item.get_content ().decode ('utf-8'))
+ walker = html5lib.getTreeWalker("etree")
+ stream = walker (document)
+ s = HTMLSerializer()
+ yield ''.join (s.serialize (stream))
def sourceText (item):
with LzipFile (item.rstrip ()) as fd:
- return fd.read ().decode ('utf-8')
+ yield fd.read ().decode ('utf-8')
def sourceJson (item):
- return json.loads (item)
+ yield json.loads (item)
sources = dict(
aljazeera=partial(sourceHtml, f['aljazeera']),
bbcarabic=partial(sourceHtml, f['bbcarabic']),
text=sourceText,
json=sourceJson,
+ epub=sourceEpub,
)
charMap = {
@@ -184,6 +201,7 @@ def writeWorker (args, inq, outq):
layout = defaultLayouts['null'].specialize (keyboard)
w = Writer (layout)
combined = dict ((cls.name, cls(w)) for cls in allStats)
+ itemsProcessed = 0
while True:
keyboard = defaultKeyboards[args.keyboard]
@@ -194,25 +212,30 @@ def writeWorker (args, inq, outq):
if item is None:
break
- # extract
- text = sources[args.source] (item)
- text = ''.join (map (lambda x: charMap.get (x, x), text))
- # XXX sanity checks, disable
- for c in charMap.keys ():
- if c in text:
- #print (c, 'is in text', file=sys.stderr)
- assert False, c
-
- # stats
- stats = [cls(w) for cls in allStats]
- for match, event in w.type (StringIO (text)):
+ # extract (can be multiple items per source)
+ for text in sources[args.source] (item):
+ text = ''.join (map (lambda x: charMap.get (x, x), text))
+ # XXX sanity checks, disable
+ for c in charMap.keys ():
+ if c in text:
+ #print (c, 'is in text', file=sys.stderr)
+ assert False, c
+
+ # stats
+ stats = [cls(w) for cls in allStats]
+ for match, event in w.type (StringIO (text)):
+ for s in stats:
+ s.process (event)
+
for s in stats:
- s.process (event)
+ combined[s.name].update (s)
- for s in stats:
- combined[s.name].update (s)
+ itemsProcessed += 1
- outq.put (combined)
+ if itemsProcessed > 0:
+ outq.put (combined)
+ else:
+ outq.put (None)
except Exception as e:
# async exceptions
outq.put (None)
@@ -222,6 +245,7 @@ def write ():
""" Extract corpus source file, convert to plain text, map chars and create stats """
parser = argparse.ArgumentParser(description='Import text and create stats.')
+ parser.add_argument('-v', '--verbose', action='store_true', help='Enable debugging output')
parser.add_argument('-k', '--keyboard', metavar='KEYBOARD',
default='ibmpc105', help='Physical keyboard name')
parser.add_argument('-j', '--jobs', metavar='NUM',
@@ -231,7 +255,10 @@ def write ():
args = parser.parse_args()
- logging.basicConfig (level=logging.INFO)
+ if args.verbose:
+ logging.basicConfig (level=logging.DEBUG)
+ else:
+ logging.basicConfig (level=logging.INFO)
# limit queue sizes to limit memory usage
inq = Queue (args.jobs*2)
@@ -260,7 +287,9 @@ def write ():
# every one of them will consume exactly one item and write one in return
for w in workers:
inq.put (None)
- pickle.dump (outq.get (), sys.stdout.buffer, pickle.HIGHEST_PROTOCOL)
+ item = outq.get ()
+ if item is not None:
+ pickle.dump (item, sys.stdout.buffer, pickle.HIGHEST_PROTOCOL)
assert outq.empty ()
# and then we can kill them
for w in workers:
diff --git a/setup.py b/setup.py
index 56c64a6..2e20067 100644
--- a/setup.py
+++ b/setup.py
@@ -38,6 +38,7 @@ setup(
'bokeh',
'tqdm',
'html5lib',
+ 'ebooklib',
],
entry_points={
'console_scripts': [