class ZeichensatzDataspace(Dataspace):
    """
    Dataspace of the ZEICHENSATZ file from the graphics package.

    After the common dataspace header it reads a fixed table of 255 text
    entries into ``rows`` and then parses the heap.
    """

    # Dataspace type id this class accepts
    TYPE = 0x44c

    def __init__ (self, fd):
        """
        :param fd: Binary file object positioned at the start of the dataspace
        """
        super ().__init__ (fd)

        # The table always has exactly 255 entries; they are read strictly in
        # file order, the heap comes right after the last one.
        self.rows = table = []
        table.extend (self.parseText () for _ in range (255))
        self.parseHeap ()
class FileDataspace (Dataspace):
    """
    EUMEL’s FILE datatype

    Parses the file header and all rows from ``fd`` and stores the
    reconstructed text in ``self.text``. The sequence read first from the
    header is kept as ``self.used``.
    """

    # Dataspace type id this class accepts
    TYPE = 1003

    def __init__ (self, fd):
        """
        :param fd: Binary file object positioned at the start of the dataspace
        """
        Dataspace.__init__ (self, fd)

        # header of the BOUND LIST (aka TYPE FILE)
        # Only the first sequence is kept; the remaining header fields are
        # read solely to advance the file position.
        self.used = self.parseSequence ()
        self.parseInt (2)
        self.parseSequence ()
        self.parseSequence ()
        self.parseInt (7)
        # the header is expected to end at offset 0x38
        assert self.fd.tell () == 0x38

        rows = self.parseRows ()

        # heap is parsed after the rows and before the text is reconstructed
        # (presumably the row texts reference heap items -- TODO confirm)
        self.parseHeap ()

        self.text = self.reconstructText (rows)

    def parseSegment (self):
        """
        Read three integers and wrap them in a Segment
        """
        return Segment (*self.parseInt (3))

    def parseSequence (self):
        """
        Read five integers and wrap them in a Sequence
        """
        return Sequence (*self.parseInt (5))

    def parseRows (self):
        """
        Read rows until the end marker (24 bytes of 0xff) is found

        :return: List of Atom, in file order
        """
        rows = []
        # read lines
        while True:
            # check data
            data = self.fd.read (24)
            if data == 24*b'\xff':
                break
            # not the end marker: rewind the 24 bytes just peeked at
            self.skip (-24)
            # and parse it
            seg = self.parseSegment ()
            rowtype = self.parseInt ()
            text = self.parseText ()
            rows.append (Atom (seg, rowtype, text))
            logging.debug ('got row {} {}'.format (len (rows)-1, rows[-1]))
        return rows

    def reconstructText (self, rows):
        """
        Follow the chain of rows starting at ``self.used`` and join their
        line contents into a single string.

        :param rows: List of Atom as returned by parseRows
        :return: Text decoded with the 'eumel' codec (errors='replace'), lines
            joined by newline
        """
        # XXX: the original note here is truncated
        logging.debug ('Used first {}, last {}, starts at line {}, {} lines in total'.format (self.used.segmentbegin, self.used.segmentend, self.used.lineno, self.used.lines))
        chain = Chain (self.used, rows)
        chain.first ()
        firstrow = chain.pos
        lines = []
        visited = set ()
        while True:
            # NOTE(review): a revisited row is only logged, the loop goes on.
            # If the chain cycles without passing firstrow again this looks
            # like it never terminates -- confirm against Chain.
            if chain.pos in visited:
                logging.warning ('Row {} already has been used'.format (chain.pos))
            visited.add (chain.pos)

            r = chain.atom
            lbytes = bytes (r.line)
            # trailing 0xff bytes are treated as padding and cut off
            lbytesStripped = lbytes.rstrip (b'\xff')
            if len (lbytes) != len (lbytesStripped):
                logging.warning ('Line {} length incorrect. Is {}, should be {}, fixing. {}'.format (chain.lineno, r.line.length, len (lbytesStripped), lbytes))
                lbytes = lbytesStripped
            lines.append (lbytes)
            chain.next ()

            # chains are cyclic
            if chain.pos == firstrow:
                break
        return codecs.decode (b'\n'.join (lines), 'eumel', 'replace')
technically \15 and \14 would be a choice here, but they do - # different things on different systems and thus we’re just gonna strip - # them. - (15, ''), - (14, ''), # mark end - # same as ascii - (32, ' '), - (33, '!'), - (34, '"'), - (35, '#'), - (36, '$'), - (37, '%'), - (38, '&'), - (39, "'"), - (40, '('), - (41, ')'), - (42, '*'), - (43, '+'), - (44, ','), - (45, '-'), - (46, '.'), - (47, '/'), - (48, '0'), - (49, '1'), - (50, '2'), - (51, '3'), - (52, '4'), - (53, '5'), - (54, '6'), - (55, '7'), - (56, '8'), - (57, '9'), - (58, ':'), - (59, ';'), - (60, '<'), - (61, '='), - (62, '>'), - (63, '?'), - # then the paragraph symbol - (64, '§'), - # uppercase and lowercase letters from ascii - (65, 'A'), - (66, 'B'), - (67, 'C'), - (68, 'D'), - (69, 'E'), - (70, 'F'), - (71, 'G'), - (72, 'H'), - (73, 'I'), - (74, 'J'), - (75, 'K'), - (76, 'L'), - (77, 'M'), - (78, 'N'), - (79, 'O'), - (80, 'P'), - (81, 'Q'), - (82, 'R'), - (83, 'S'), - (84, 'T'), - (85, 'U'), - (86, 'V'), - (87, 'W'), - (88, 'X'), - (89, 'Y'), - (90, 'Z'), - (91, '['), - (92, '\\'), - (93, ']'), - (94, '^'), - (95, '_'), - (96, '`'), - (97, 'a'), - (98, 'b'), - (99, 'c'), - (100, 'd'), - (101, 'e'), - (102, 'f'), - (103, 'g'), - (104, 'h'), - (105, 'i'), - (106, 'j'), - (107, 'k'), - (108, 'l'), - (109, 'm'), - (110, 'n'), - (111, 'o'), - (112, 'p'), - (113, 'q'), - (114, 'r'), - (115, 's'), - (116, 't'), - (117, 'u'), - (118, 'v'), - (119, 'w'), - (120, 'x'), - (121, 'y'), - (122, 'z'), - (123, '{'), - (124, '|'), - (125, '}'), - (126, '~'), - # uppercase greek - (129, 'Α'), - (130, 'Β'), - (131, 'Γ'), - (132, 'Δ'), - (133, 'Ε'), - (134, 'Ζ'), - (135, 'Η'), - (136, 'Θ'), - (137, 'Ι'), - (138, 'Κ'), - (139, 'Λ'), - (140, 'Μ'), - (141, 'Ν'), - (142, 'Ξ'), - (143, 'Ο'), - (144, 'Π'), - (145, 'Ρ'), - (146, 'Σ'), - (147, 'Τ'), - (148, 'Υ'), - (149, 'Φ'), - (150, 'Χ'), - (151, 'Ψ'), - (152, 'Ω'), - # lowercase greek - (161, 'α'), - (162, 'β'), - (163, 'γ'), - (164, 'δ'), - (165, 'ε'), - (166, 'ζ'), - 
(167, 'η'), - (168, 'θ'), - (169, 'ι'), - (170, 'κ'), - (171, 'λ'), - (172, 'μ'), - (173, 'ν'), - (174, 'ξ'), - (175, 'ο'), - (176, 'π'), - (177, 'ρ'), - (178, 'ς'), - (179, 'σ'), - (180, 'τ'), - (181, 'υ'), - (182, 'φ'), - (183, 'χ'), - (184, 'ψ'), - (185, 'ω'), - # these seem to be combining diacritic, not sure how they work though - # 192 looks like a cross, dunno what it could be - (193, '\u0301'), # acute - (194, '\u0300'), # grave - (195, '\u0302'), # circumflex - (196, '\u0303'), # tilde - (197, '\u0304'), # macron - # 198: dunno - (199, '\u0307'), # dot above - (200, '\u0308'), # diaeresis - # 201: dunno - (202, '\u030a'), # ring above - (203, '\u0317'), # acute below - # 204: dunno - (205, '\u030a'), # ring above (again for small letters?) - # 206: dunno - (207, '\u030c'), # caron - # german umlauts - (214, 'Ä'), - (215, 'Ö'), - (216, 'Ü'), - (217, 'ä'), - (218, 'ö'), - (219, 'ü'), - (220, 'k'), # handbuch says: Trenn-'k' bei der Umwandlung von 'ck' in 'kk' - (221, '\u00ad'), # soft hyphen, inserted by eumel’s hyphenation program - (222, '\\#'), # printable hash (i.e. 
def decode (input, errors='strict'):
    """
    Stateless decoder of the 'eumel' codec: translate EUMEL character codes
    into a unicode string via ``eumel2unicodemap``.

    :param input: Bytes-like object; iterating it must yield integer codes
    :param errors: Handling of codes missing from the map. 'strict' raises,
        'ignore' drops the code, 'replace' emits U+FFFD. Any other value stops
        decoding at the first unmapped code.
    :return: (decoded string, number of input items consumed)
    :raises UnicodeError: Unmapped code and errors is 'strict'
    """
    ret = []
    # Number of input items fully handled so far. The codec protocol expects
    # a length here, not the index of the last item.
    consumed = 0
    for pos, c in enumerate (input):
        m = eumel2unicodemap.get (c)
        if m is not None:
            ret.append (m)
        elif errors == 'strict':
            raise UnicodeError ('unknown char {}'.format (c))
        elif errors == 'ignore':
            pass
        elif errors == 'replace':
            # Clamp the start of the context window: a negative start index
            # would count from the end of input and yield a wrong (usually
            # empty) context for the first 30 positions.
            logging.debug ('replacing unknown symbol {} at position {}, context {}'.format (c, pos, input[max (0, pos-30):pos+30]))
            ret.append ('\uFFFD')
        else:
            # unknown error handler: stop, the offending item is not consumed
            break
        consumed = pos+1
    return (''.join (ret), consumed)
do - base=`basename "$F"` - linear=`mktemp` - destdir="${base}.extracted" - echo "Extracting $F to $destdir" - $root/linearizeDisk.py "$F" "$linear" - $root/extractArchive.py -n -o "$destdir" "$linear" - pushd "$destdir" || continue - for G in ./*; do - echo "Converting $G to ${G}.txt" - $root/convertFileDs.py "$G" > "${G}.txt" || rm "${G}.txt" - done - popd - rm "$linear" -done - diff --git a/tools/extractArchive.py b/tools/extractArchive.py deleted file mode 100755 index f14a6b6..0000000 --- a/tools/extractArchive.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python3 - -""" -Extract linearized (see linearizeDisk.py) EUMEL archive disk. -""" - -import struct, sys, io, logging -import codecs -from eumel import Dataspace - -def take (it, n): - for i in range (n): - yield next (it) - -def parseEntry (blocks): - while True: - header = next (blocks) - unknown1, unknown2, length, unknown3 = struct.unpack (' {% endfor %}""") g = Graph() - result = g.parse ("index.ttl", format='turtle') + result = g.parse (sys.stdin, format='turtle') s = Namespace ("https://schema.org/") items = [] diff --git a/tools/linearizeDisk.py b/tools/linearizeDisk.py deleted file mode 100755 index 55f4b06..0000000 --- a/tools/linearizeDisk.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 - -""" -For some reason blocks in the bitsavers images are not in linear order, but -shuffled. Not sure why and if other disks are affected as well, but this script -reorders them. 
class RDFWalker:
    """
    Simple RDF graph walker

    Wraps one node of a graph and exposes its outgoing edges as attributes.
    For a walker ``w`` and a predicate name ``p`` that exists in namespace
    ``s``:

    - ``w.p`` is a walker for the first object of ``(node, s.p)`` or None
    - ``w.p_`` is the list of walkers for all objects of ``(node, s.p)``
    - ``w.a`` / ``w.a_`` do the same for ``rdf:type``
    - ``w._`` is a walker for the node this walker was reached from

    Walkers compare equal and hash by their node and order by ``str ()``.
    """

    def __init__ (self, g, s, n, path=None):
        """
        :param g: Graph
        :param s: Namespace
        :param n: Start node
        :param path: Nodes walked so far, most recent first. Defaults to an
            empty path.
        """
        self.g = g
        self.n = n
        self.s = s
        # A fresh list per instance instead of a shared mutable default
        self._path = [] if path is None else path

    def __getattr__ (self, k):
        """
        If k is underscore _, walk up tree one level, otherwise search for
        direct descendents and get first one. A trailing underscore returns
        all descendents as list instead.

        :raises AttributeError: For dunder names and for the walker's own
            attributes if they are not set
        :raises IndexError: For ``_`` if there is no level to walk up to
        """
        # __getattr__ only runs when regular lookup failed. Special names and
        # our own attributes must fail here. Otherwise probing a half-built
        # instance (copy, pickle) ends up in self.s -> __getattr__ ('s') ->
        # self.s -> ... and dies with RecursionError.
        if k.startswith ('__') or k in ('g', 's', 'n', '_path'):
            raise AttributeError (k)

        if k == '_':
            return RDFWalker (self.g, self.s, self._path[0], self._path[1:])

        yieldall = k.endswith ('_')
        if yieldall:
            k = k[:-1]

        if k == 'a':
            # Local import: keeps the class usable without a module-level
            # rdflib import
            from rdflib.namespace import RDF
            attr = RDF.type
        else:
            attr = getattr (self.s, k)

        ret = [RDFWalker (self.g, self.s, n, [self.n] + self._path) for n in self.g.objects (self.n, attr)]

        if yieldall:
            return ret
        elif not ret:
            return None
        else:
            return ret[0]

    def __eq__ (self, b):
        # Let Python try the reflected comparison instead of raising
        # AttributeError on objects that are not walkers
        if not isinstance (b, RDFWalker):
            return NotImplemented
        return self.n == b.n

    def __hash__ (self):
        # Defining __eq__ disables the inherited hash; restore it in a way
        # that is consistent with equality
        return hash (self.n)

    def __lt__ (self, b):
        return str (self) < str (b)

    def __str__ (self):
        return str (self.n)

    def __repr__ (self):
        return 'RDFWalker({!r})'.format (self.n)