From c244fba75805ccb0a8f5e4edec4c6057b3acd48c Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Thu, 29 Sep 2016 17:56:36 +0200
Subject: Update character conversion map
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Should contain EUMEL’s entire charset now
---
Review note: the debug context slice in decode() is clamped with
max (0, pos-30); the unclamped start went negative for pos < 30 and
wrapped around to the end of the input, yielding an empty or unrelated
context in the log message.

 tools/eumel.py | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 192 insertions(+), 10 deletions(-)

diff --git a/tools/eumel.py b/tools/eumel.py
index 0434b35..a421e0a 100644
--- a/tools/eumel.py
+++ b/tools/eumel.py
@@ -6,15 +6,196 @@ EUMEL utility functions, including:
 import logging
 import codecs
 
-# EUMEL character map. See “Benutzerhandbuch 1.7”, page 107.
-# map eumel character to unicode codepoint
-eumel2unicodemap = dict (
-    [(10, '\n'), (13, '\r')] +
-    # first part is same as ascii
-    [(i, chr (i)) for i in range (32, 126)] +
-    [(126, '~')] +
-    [(214, 'Ä'), (215, 'Ö'), (216, 'Ü'), (217, 'ä'), (218, 'ö'), (219, 'ü'), (220, 'k'), (221, '-'), (222, '#'), (223, ' ')] +
-    [(251, 'ß')])
+# EUMEL character map. See “Benutzerhandbuch 1.7”, page 107 and file
+# ZEICHENSATZ from the archive disk std.graphik.
+eumel2unicodemap = dict ([
+    # standard newlines
+    (10, '\n'),
+    (13, '\r'),
+    # mark start. technically \15 and \14 would be a choice here, but they do
+    # different things on different systems and thus we’re just gonna strip
+    # them.
+    (15, ''),
+    (14, ''), # mark end
+    # same as ascii
+    (32, ' '),
+    (33, '!'),
+    (34, '"'),
+    (35, '#'),
+    (36, '$'),
+    (37, '%'),
+    (38, '&'),
+    (39, "'"),
+    (40, '('),
+    (41, ')'),
+    (42, '*'),
+    (43, '+'),
+    (44, ','),
+    (45, '-'),
+    (46, '.'),
+    (47, '/'),
+    (48, '0'),
+    (49, '1'),
+    (50, '2'),
+    (51, '3'),
+    (52, '4'),
+    (53, '5'),
+    (54, '6'),
+    (55, '7'),
+    (56, '8'),
+    (57, '9'),
+    (58, ':'),
+    (59, ';'),
+    (60, '<'),
+    (61, '='),
+    (62, '>'),
+    (63, '?'),
+    # then the paragraph symbol
+    (64, '§'),
+    # uppercase and lowercase letters from ascii
+    (65, 'A'),
+    (66, 'B'),
+    (67, 'C'),
+    (68, 'D'),
+    (69, 'E'),
+    (70, 'F'),
+    (71, 'G'),
+    (72, 'H'),
+    (73, 'I'),
+    (74, 'J'),
+    (75, 'K'),
+    (76, 'L'),
+    (77, 'M'),
+    (78, 'N'),
+    (79, 'O'),
+    (80, 'P'),
+    (81, 'Q'),
+    (82, 'R'),
+    (83, 'S'),
+    (84, 'T'),
+    (85, 'U'),
+    (86, 'V'),
+    (87, 'W'),
+    (88, 'X'),
+    (89, 'Y'),
+    (90, 'Z'),
+    (91, '['),
+    (92, '\\'),
+    (93, ']'),
+    (94, '^'),
+    (95, '_'),
+    (96, '`'),
+    (97, 'a'),
+    (98, 'b'),
+    (99, 'c'),
+    (100, 'd'),
+    (101, 'e'),
+    (102, 'f'),
+    (103, 'g'),
+    (104, 'h'),
+    (105, 'i'),
+    (106, 'j'),
+    (107, 'k'),
+    (108, 'l'),
+    (109, 'm'),
+    (110, 'n'),
+    (111, 'o'),
+    (112, 'p'),
+    (113, 'q'),
+    (114, 'r'),
+    (115, 's'),
+    (116, 't'),
+    (117, 'u'),
+    (118, 'v'),
+    (119, 'w'),
+    (120, 'x'),
+    (121, 'y'),
+    (122, 'z'),
+    (123, '{'),
+    (124, '|'),
+    (125, '}'),
+    (126, '~'),
+    # uppercase greek
+    (129, 'Α'),
+    (130, 'Β'),
+    (131, 'Γ'),
+    (132, 'Δ'),
+    (133, 'Ε'),
+    (134, 'Ζ'),
+    (135, 'Η'),
+    (136, 'Θ'),
+    (137, 'Ι'),
+    (138, 'Κ'),
+    (139, 'Λ'),
+    (140, 'Μ'),
+    (141, 'Ν'),
+    (142, 'Ξ'),
+    (143, 'Ο'),
+    (144, 'Π'),
+    (145, 'Ρ'),
+    (146, 'Σ'),
+    (147, 'Τ'),
+    (148, 'Υ'),
+    (149, 'Φ'),
+    (150, 'Χ'),
+    (151, 'Ψ'),
+    (152, 'Ω'),
+    # lowercase greek
+    (161, 'α'),
+    (162, 'β'),
+    (163, 'γ'),
+    (164, 'δ'),
+    (165, 'ε'),
+    (166, 'ζ'),
+    (167, 'η'),
+    (168, 'θ'),
+    (169, 'ι'),
+    (170, 'κ'),
+    (171, 'λ'),
+    (172, 'μ'),
+    (173, 'ν'),
+    (174, 'ξ'),
+    (175, 'ο'),
+    (176, 'π'),
+    (177, 'ρ'),
+    (178, 'ς'),
+    (179, 'σ'),
+    (180, 'τ'),
+    (181, 'υ'),
+    (182, 'φ'),
+    (183, 'χ'),
+    (184, 'ψ'),
+    (185, 'ω'),
+    # these seem to be combining diacritics, not sure how they work though
+    # 192 looks like a cross, dunno what it could be
+    (193, '\u0301'), # acute
+    (194, '\u0300'), # grave
+    (195, '\u0302'), # circumflex
+    (196, '\u0303'), # tilde
+    (197, '\u0304'), # macron
+    # 198: dunno
+    (199, '\u0307'), # dot above
+    (200, '\u0308'), # diaeresis
+    # 201: dunno
+    (202, '\u030a'), # ring above
+    (203, '\u0317'), # acute below
+    # 204: dunno
+    (205, '\u030a'), # ring above (again for small letters?)
+    # 206: dunno
+    (207, '\u030c'), # caron
+    # german umlauts
+    (214, 'Ä'),
+    (215, 'Ö'),
+    (216, 'Ü'),
+    (217, 'ä'),
+    (218, 'ö'),
+    (219, 'ü'),
+    (220, 'k'), # handbook says: hyphenation 'k' for the conversion of 'ck' into 'kk'
+    (221, '\u00ad'), # soft hyphen, inserted by eumel’s hyphenation program
+    (222, '\\#'), # printable hash (i.e. literal hash, not a printer/editor command)
+    (223, '\u00a0'), # protected space
+    (251, 'ß'),
+    ])
 
 def decode (input, errors='strict'):
     ret = []
@@ -22,7 +203,7 @@ def decode (input, errors='strict'):
     for pos in range (len (input)):
         c = input[pos]
         m = eumel2unicodemap.get (c, None)
-        if m:
+        if m is not None:
             ret.append (m)
         else:
             if errors == 'strict':
@@ -30,6 +211,7 @@ def decode (input, errors='strict'):
             elif errors == 'ignore':
                 pass
             elif errors == 'replace':
+                logging.debug ('replacing unknown symbol {} at position {}, context {}'.format (c, pos, input[max (0, pos-30):pos+30]))
                 ret.append ('\uFFFD')
             else:
                 break
-- 
cgit v1.2.3