summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Lars-Dominik Braun <lars@6xq.net> 2016-09-29 17:56:36 +0200
committer: Lars-Dominik Braun <lars@6xq.net> 2016-09-29 17:56:36 +0200
commit: 94858048861376fbd1e100097759f2762afdaf1e (patch)
tree: 73ad390a623c14d9307185effee7c982a2a8e4cd
parent: d3fd76cb12f9080cc49c7d9cb1bfab5b2d574aef (diff)
download: eumel-tools-94858048861376fbd1e100097759f2762afdaf1e.zip
eumel-tools-94858048861376fbd1e100097759f2762afdaf1e.tar.gz
eumel-tools-94858048861376fbd1e100097759f2762afdaf1e.tar.bz2
Update character conversion map
Should contain EUMEL’s entire charset now
-rw-r--r-- eumel.py 202
1 files changed, 192 insertions, 10 deletions
diff --git a/eumel.py b/eumel.py
index 0434b35..a421e0a 100644
--- a/eumel.py
+++ b/eumel.py
@@ -6,15 +6,196 @@ EUMEL utility functions, including:
import logging
import codecs
-# EUMEL character map. See “Benutzerhandbuch 1.7”, page 107.
-# map eumel character to unicode codepoint
-eumel2unicodemap = dict (
- [(10, '\n'), (13, '\r')] +
- # first part is same as ascii
- [(i, chr (i)) for i in range (32, 126)] +
- [(126, '~')] +
- [(214, 'Ä'), (215, 'Ö'), (216, 'Ü'), (217, 'ä'), (218, 'ö'), (219, 'ü'), (220, 'k'), (221, '-'), (222, '#'), (223, ' ')] +
- [(251, 'ß')])
+# EUMEL character map (EUMEL character code -> unicode string). See “Benutzerhandbuch 1.7”,
+# page 107, and the file ZEICHENSATZ from the archive disk std.graphik.
+eumel2unicodemap = dict ([
+ # standard newlines
+ (10, '\n'),
+ (13, '\r'),
+ # mark start. Technically \15 and \14 would be an option here, but they do
+ # different things on different systems, so we just strip them (i.e. map
+ # them to the empty string).
+ (15, ''),
+ (14, ''), # mark end
+ # same as ascii
+ (32, ' '),
+ (33, '!'),
+ (34, '"'),
+ (35, '#'),
+ (36, '$'),
+ (37, '%'),
+ (38, '&'),
+ (39, "'"),
+ (40, '('),
+ (41, ')'),
+ (42, '*'),
+ (43, '+'),
+ (44, ','),
+ (45, '-'),
+ (46, '.'),
+ (47, '/'),
+ (48, '0'),
+ (49, '1'),
+ (50, '2'),
+ (51, '3'),
+ (52, '4'),
+ (53, '5'),
+ (54, '6'),
+ (55, '7'),
+ (56, '8'),
+ (57, '9'),
+ (58, ':'),
+ (59, ';'),
+ (60, '<'),
+ (61, '='),
+ (62, '>'),
+ (63, '?'),
+ # code 64 is the section sign (paragraph symbol) where ascii has '@'
+ (64, '§'),
+ # uppercase and lowercase letters plus the remaining punctuation, same as ascii
+ (65, 'A'),
+ (66, 'B'),
+ (67, 'C'),
+ (68, 'D'),
+ (69, 'E'),
+ (70, 'F'),
+ (71, 'G'),
+ (72, 'H'),
+ (73, 'I'),
+ (74, 'J'),
+ (75, 'K'),
+ (76, 'L'),
+ (77, 'M'),
+ (78, 'N'),
+ (79, 'O'),
+ (80, 'P'),
+ (81, 'Q'),
+ (82, 'R'),
+ (83, 'S'),
+ (84, 'T'),
+ (85, 'U'),
+ (86, 'V'),
+ (87, 'W'),
+ (88, 'X'),
+ (89, 'Y'),
+ (90, 'Z'),
+ (91, '['),
+ (92, '\\'),
+ (93, ']'),
+ (94, '^'),
+ (95, '_'),
+ (96, '`'),
+ (97, 'a'),
+ (98, 'b'),
+ (99, 'c'),
+ (100, 'd'),
+ (101, 'e'),
+ (102, 'f'),
+ (103, 'g'),
+ (104, 'h'),
+ (105, 'i'),
+ (106, 'j'),
+ (107, 'k'),
+ (108, 'l'),
+ (109, 'm'),
+ (110, 'n'),
+ (111, 'o'),
+ (112, 'p'),
+ (113, 'q'),
+ (114, 'r'),
+ (115, 's'),
+ (116, 't'),
+ (117, 'u'),
+ (118, 'v'),
+ (119, 'w'),
+ (120, 'x'),
+ (121, 'y'),
+ (122, 'z'),
+ (123, '{'),
+ (124, '|'),
+ (125, '}'),
+ (126, '~'),
+ # uppercase greek
+ (129, 'Α'),
+ (130, 'Β'),
+ (131, 'Γ'),
+ (132, 'Δ'),
+ (133, 'Ε'),
+ (134, 'Ζ'),
+ (135, 'Η'),
+ (136, 'Θ'),
+ (137, 'Ι'),
+ (138, 'Κ'),
+ (139, 'Λ'),
+ (140, 'Μ'),
+ (141, 'Ν'),
+ (142, 'Ξ'),
+ (143, 'Ο'),
+ (144, 'Π'),
+ (145, 'Ρ'),
+ (146, 'Σ'),
+ (147, 'Τ'),
+ (148, 'Υ'),
+ (149, 'Φ'),
+ (150, 'Χ'),
+ (151, 'Ψ'),
+ (152, 'Ω'),
+ # lowercase greek
+ (161, 'α'),
+ (162, 'β'),
+ (163, 'γ'),
+ (164, 'δ'),
+ (165, 'ε'),
+ (166, 'ζ'),
+ (167, 'η'),
+ (168, 'θ'),
+ (169, 'ι'),
+ (170, 'κ'),
+ (171, 'λ'),
+ (172, 'μ'),
+ (173, 'ν'),
+ (174, 'ξ'),
+ (175, 'ο'),
+ (176, 'π'),
+ (177, 'ρ'),
+ (178, 'ς'),
+ (179, 'σ'),
+ (180, 'τ'),
+ (181, 'υ'),
+ (182, 'φ'),
+ (183, 'χ'),
+ (184, 'ψ'),
+ (185, 'ω'),
+ # these seem to be combining diacritics; not sure how they work, though
+ # 192 looks like a cross; unclear what it represents, so it is left unmapped
+ (193, '\u0301'), # acute
+ (194, '\u0300'), # grave
+ (195, '\u0302'), # circumflex
+ (196, '\u0303'), # tilde
+ (197, '\u0304'), # macron
+ # 198: unknown, left unmapped
+ (199, '\u0307'), # dot above
+ (200, '\u0308'), # diaeresis
+ # 201: unknown, left unmapped
+ (202, '\u030a'), # ring above
+ (203, '\u0317'), # acute below
+ # 204: unknown, left unmapped
+ (205, '\u030a'), # ring above (again, perhaps for small letters?)
+ # 206: unknown, left unmapped
+ (207, '\u030c'), # caron
+ # german umlauts
+ (214, 'Ä'),
+ (215, 'Ö'),
+ (216, 'Ü'),
+ (217, 'ä'),
+ (218, 'ö'),
+ (219, 'ü'),
+ (220, 'k'), # manual says: hyphenation 'k' used when converting 'ck' to 'kk'
+ (221, '\u00ad'), # soft hyphen, inserted by eumel’s hyphenation program
+ (222, '\\#'), # printable hash (i.e. literal hash, not a printer/editor command), emitted as backslash + hash
+ (223, '\u00a0'), # protected (non-breaking) space
+ (251, 'ß'),
+ ])
def decode (input, errors='strict'):
ret = []
@@ -22,7 +203,7 @@ def decode (input, errors='strict'):
for pos in range (len (input)):
c = input[pos]
m = eumel2unicodemap.get (c, None)
- if m:
+ if m is not None:
ret.append (m)
else:
if errors == 'strict':
@@ -30,6 +211,7 @@ def decode (input, errors='strict'):
elif errors == 'ignore':
pass
elif errors == 'replace':
+ logging.debug ('replacing unknown symbol {} at position {}, context {}'.format (c, pos, input[pos-30:pos+30]))
ret.append ('\uFFFD')
else:
break