summary | refs | log | tree | commit | diff
path: root/lulua/text.py
diff options
context:
space:
mode:
author Lars-Dominik Braun <lars@6xq.net> 2019-10-03 17:23:53 +0200
committer Lars-Dominik Braun <lars@6xq.net> 2019-10-03 17:23:53 +0200
commit 2d45ef655f8791037373ab83174fc6c3596227b0 (patch)
tree a05d506928fcc16f8dfdddb860c6ce4c5193bfc4 /lulua/text.py
parent 8048f6351fb4611134c2f6e2d9129ec025376914 (diff)
download lulua-2d45ef655f8791037373ab83174fc6c3596227b0.tar.gz
lulua-2d45ef655f8791037373ab83174fc6c3596227b0.tar.bz2
lulua-2d45ef655f8791037373ab83174fc6c3596227b0.zip
text: Add epub reader and hindawi corpus
See issue #5.
Diffstat (limited to 'lulua/text.py')
-rw-r--r-- lulua/text.py 71
1 files changed, 50 insertions, 21 deletions
diff --git a/lulua/text.py b/lulua/text.py
index 98c7824..b4b4b91 100644
--- a/lulua/text.py
+++ b/lulua/text.py
@@ -28,6 +28,8 @@ from functools import partial
from multiprocessing import Process, Queue, cpu_count, current_process
from subprocess import Popen, PIPE
from tqdm import tqdm
+import ebooklib
+from ebooklib import epub
import html5lib
from html5lib.filters.base import Filter
@@ -137,20 +139,35 @@ def sourceHtml (selectFunc, item):
walker = html5lib.getTreeWalker("etree")
stream = walker (document)
s = HTMLSerializer()
- return ''.join (s.serialize(Select (stream, selectFunc)))
+ yield ''.join (s.serialize(Select (stream, selectFunc)))
+
+def sourceEpub (item):
+ """ epub reader """
+ book = epub.read_epub (item.rstrip ())
+ logging.debug (f'reading ebook {item}')
+ for item in book.get_items_of_type (ebooklib.ITEM_DOCUMENT):
+ logging.debug (f'got item {item.get_name ()}')
+ # XXX: in theory html5lib should be able to detect the encoding of
+ # bytes(), but it does not.
+ document = html5lib.parse (item.get_content ().decode ('utf-8'))
+ walker = html5lib.getTreeWalker("etree")
+ stream = walker (document)
+ s = HTMLSerializer()
+ yield ''.join (s.serialize (stream))
def sourceText (item):
with LzipFile (item.rstrip ()) as fd:
- return fd.read ().decode ('utf-8')
+ yield fd.read ().decode ('utf-8')
def sourceJson (item):
- return json.loads (item)
+ yield json.loads (item)
sources = dict(
aljazeera=partial(sourceHtml, f['aljazeera']),
bbcarabic=partial(sourceHtml, f['bbcarabic']),
text=sourceText,
json=sourceJson,
+ epub=sourceEpub,
)
charMap = {
@@ -184,6 +201,7 @@ def writeWorker (args, inq, outq):
layout = defaultLayouts['null'].specialize (keyboard)
w = Writer (layout)
combined = dict ((cls.name, cls(w)) for cls in allStats)
+ itemsProcessed = 0
while True:
keyboard = defaultKeyboards[args.keyboard]
@@ -194,25 +212,30 @@ def writeWorker (args, inq, outq):
if item is None:
break
- # extract
- text = sources[args.source] (item)
- text = ''.join (map (lambda x: charMap.get (x, x), text))
- # XXX sanity checks, disable
- for c in charMap.keys ():
- if c in text:
- #print (c, 'is in text', file=sys.stderr)
- assert False, c
-
- # stats
- stats = [cls(w) for cls in allStats]
- for match, event in w.type (StringIO (text)):
+ # extract (can be multiple items per source)
+ for text in sources[args.source] (item):
+ text = ''.join (map (lambda x: charMap.get (x, x), text))
+ # XXX sanity checks, disable
+ for c in charMap.keys ():
+ if c in text:
+ #print (c, 'is in text', file=sys.stderr)
+ assert False, c
+
+ # stats
+ stats = [cls(w) for cls in allStats]
+ for match, event in w.type (StringIO (text)):
+ for s in stats:
+ s.process (event)
+
for s in stats:
- s.process (event)
+ combined[s.name].update (s)
- for s in stats:
- combined[s.name].update (s)
+ itemsProcessed += 1
- outq.put (combined)
+ if itemsProcessed > 0:
+ outq.put (combined)
+ else:
+ outq.put (None)
except Exception as e:
# async exceptions
outq.put (None)
@@ -222,6 +245,7 @@ def write ():
""" Extract corpus source file, convert to plain text, map chars and create stats """
parser = argparse.ArgumentParser(description='Import text and create stats.')
+ parser.add_argument('-v', '--verbose', action='store_true', help='Enable debugging output')
parser.add_argument('-k', '--keyboard', metavar='KEYBOARD',
default='ibmpc105', help='Physical keyboard name')
parser.add_argument('-j', '--jobs', metavar='NUM',
@@ -231,7 +255,10 @@ def write ():
args = parser.parse_args()
- logging.basicConfig (level=logging.INFO)
+ if args.verbose:
+ logging.basicConfig (level=logging.DEBUG)
+ else:
+ logging.basicConfig (level=logging.INFO)
# limit queue sizes to limit memory usage
inq = Queue (args.jobs*2)
@@ -260,7 +287,9 @@ def write ():
# every one of them will consume exactly one item and write one in return
for w in workers:
inq.put (None)
- pickle.dump (outq.get (), sys.stdout.buffer, pickle.HIGHEST_PROTOCOL)
+ item = outq.get ()
+ if item is not None:
+ pickle.dump (item, sys.stdout.buffer, pickle.HIGHEST_PROTOCOL)
assert outq.empty ()
# and then we can kill them
for w in workers: