From 12989393311cdca62f376bea6883ee36e8fa43ac Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun
Date: Sat, 17 Sep 2016 11:06:16 +0200
Subject: Add disk extraction tools

---
 convertFileDs.py  | 167 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 eumel.py          | 145 +++++++++++++++++++++++++++++++++++++++++++++++
 extractAll.sh     |  21 +++
 extractArchive.py | 104 ++++++++++++++++++++++++++++++++++
 linearizeDisk.py  |  49 ++++++++++++++++
 5 files changed, 486 insertions(+)
 create mode 100755 convertFileDs.py
 create mode 100644 eumel.py
 create mode 100755 extractAll.sh
 create mode 100755 extractArchive.py
 create mode 100755 linearizeDisk.py

diff --git a/convertFileDs.py b/convertFileDs.py
new file mode 100755
index 0000000..c4037db
--- /dev/null
+++ b/convertFileDs.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+
+"""
+Convert EUMEL FILE dataspace into a plain text file.
+
+Since there are no “files” in EUMEL we’re dealing with the editor’s in-memory
+datastructure here. See EUMEL packet “file handling”.
+"""
+
+import struct, copy, codecs, logging
+from collections import namedtuple
+from eumel import Dataspace, DataspaceTypeMismatch
+
+Segment = namedtuple ('Segment', ['succ', 'pred', 'end'])
+Sequence = namedtuple ('Sequence', ['index', 'segmentbegin', 'segmentend', 'lineno', 'lines'])
+Atom = namedtuple ('Atom', ['seg', 'type', 'line'])
+
+class Chain:
+    """
+    A chain is a cyclic datastructure, pointing to segments. Segments contain
+    one or more rows, which in turn reference a single line’s text.
+    """
+    def __init__ (self, sequence, rows):
+        self.lineno = sequence.lineno
+        # current atom
+        self.pos = sequence.index
+        # current segment
+        self.segpos = sequence.segmentbegin
+        self.rows = rows
+
+    def next (self):
+        atom = self.rows[self.segpos]
+        if self.pos == atom.seg.end:
+            # move to next segment
+            self.pos = atom.seg.succ
+            self.segpos = atom.seg.succ
+        else:
+            # just use the next atom in this segment
+            self.pos += 1
+        self.lineno += 1
+
+    def prev (self):
+        # backwards is a little more involved: seg.pred points to the *first* segment row
+        logging.debug ('prev at pos {} seg {} line {}'.format (self.pos, self.segpos, self.lineno))
+        if self.pos == self.segpos:
+            # get previous segment
+            atom = self.rows[self.segpos]
+            self.segpos = atom.seg.pred
+            atom = self.rows[self.segpos]
+            self.pos = atom.seg.end
+        else:
+            self.pos -= 1
+        self.lineno -= 1
+
+    def first (self):
+        """
+        Seek to first line
+        """
+        while self.lineno > 1:
+            self.prev ()
+
+    @property
+    def atom (self):
+        """
+        Get atom at current position
+        """
+        return self.rows[self.pos]
+
+class FileDataspace (Dataspace):
+    """
+    EUMEL’s FILE datatype
+    """
+
+    TYPE = 1003
+
+    def __init__ (self, fd):
+        Dataspace.__init__ (self, fd)
+
+        # header of the BOUND LIST (aka TYPE FILE)
+        self.used = self.parseSequence ()
+        self.parseInt (2)
+        self.parseSequence ()
+        self.parseSequence ()
+        self.parseInt (7)
+        assert self.fd.tell () == 0x38
+
+        rows = self.parseRows ()
+
+        self.parseHeap ()
+
+        self.text = self.reconstructText (rows)
+
+    def parseSegment (self):
+        return Segment (*self.parseInt (3))
+
+    def parseSequence (self):
+        return Sequence (*self.parseInt (5))
+
+    def parseRows (self):
+        rows = []
+        # read lines
+        while True:
+            # check data
+            data = self.fd.read (24)
+            if data == 24*b'\xff':
+                break
+            self.skip (-24)
+            # and parse it
+            seg = self.parseSegment ()
+            rowtype = self.parseInt ()
+            text = self.parseText ()
+            rows.append (Atom (seg, rowtype, text))
+            logging.debug ('got row {} {}'.format (len (rows)-1, rows[-1]))
+        return rows
+
+    def reconstructText (self, rows):
+        # XXX: use
+        logging.debug ('Used first {}, last {}, starts at line {}, {} lines in total'.format (self.used.segmentbegin, self.used.segmentend, self.used.lineno, self.used.lines))
+        chain = Chain (self.used, rows)
+        chain.first ()
+        firstrow = chain.pos
+        lines = []
+        visited = set ()
+        while True:
+            if chain.pos in visited:
+                logging.warning ('Row {} already has been used'.format (chain.pos))
+            visited.add (chain.pos)
+
+            r = chain.atom
+            lbytes = bytes (r.line)
+            lbytesStripped = lbytes.rstrip (b'\xff')
+            if len (lbytes) != len (lbytesStripped):
+                logging.warning ('Line {} length incorrect. Is {}, should be {}, fixing. {}'.format (chain.lineno, r.line.length, len (lbytesStripped), lbytes))
+                lbytes = lbytesStripped
+            lines.append (lbytes)
+            chain.next ()
+
+            # chains are cyclic
+            if chain.pos == firstrow:
+                break
+        return codecs.decode (b'\n'.join (lines), 'eumel', 'replace')
+
+if __name__ == '__main__':
+    import sys, os, codecs, logging
+    import argparse, sys
+
+    parser = argparse.ArgumentParser(description='Convert EUMEL FILE dataspace into plain text file.')
+    parser.add_argument ('-v', '--verbose', help='Enable debugging messages', action='store_true')
+    parser.add_argument ('file', help='Input file')
+    args = parser.parse_args ()
+
+    if args.verbose:
+        logging.basicConfig (level=logging.DEBUG)
+    else:
+        logging.basicConfig (level=logging.WARNING)
+
+    with open (args.file, 'rb') as fd:
+        try:
+            ds = FileDataspace (fd)
+            linecount = len (ds.text.splitlines ())
+            if linecount != ds.used.lines:
+                logging.warning ('Got {} lines, but should have been {}'.format (linecount, ds.used.lines))
+            print (ds.text)
+        except DataspaceTypeMismatch:
+            logging.error ('Not a text file, cannot convert')
+            sys.exit (1)
+
diff --git a/eumel.py b/eumel.py
new file mode 100644
index 0000000..0434b35
--- /dev/null
+++ b/eumel.py
@@ -0,0 +1,145 @@
+"""
+EUMEL utility functions, including:
+
+"""
+
+import logging
+import codecs
+
+# EUMEL character map. See “Benutzerhandbuch 1.7”, page 107.
+# map eumel character to unicode codepoint
+eumel2unicodemap = dict (
+    [(10, '\n'), (13, '\r')] +
+    # first part is same as ascii
+    [(i, chr (i)) for i in range (32, 126)] +
+    [(126, '~')] +
+    [(214, 'Ä'), (215, 'Ö'), (216, 'Ü'), (217, 'ä'), (218, 'ö'), (219, 'ü'), (220, 'k'), (221, '-'), (222, '#'), (223, ' ')] +
+    [(251, 'ß')])
+
+def decode (input, errors='strict'):
+    ret = []
+    # second tuple element is the number of input bytes consumed
+    for pos in range (len (input)):
+        c = input[pos]
+        m = eumel2unicodemap.get (c, None)
+        if m:
+            ret.append (m)
+        else:
+            if errors == 'strict':
+                raise UnicodeError ('unknown char {}'.format (c))
+            elif errors == 'ignore':
+                pass
+            elif errors == 'replace':
+                ret.append ('\uFFFD')
+            else:
+                return (''.join (ret), pos)
+    return (''.join (ret), len (input))
+
+def lookup (name):
+    if name == 'eumel':
+        return codecs.CodecInfo(None, decode)
+    return None
+
+codecs.register (lookup)
+
+# Dataspace utilities
+import struct, os
+
+class DataspaceTypeMismatch (ValueError):
+    pass
+
+class Dataspace:
+    # Expected type
+    TYPE = None
+
+    def __init__ (self, fd):
+        self.fd = fd
+        self.lastaddr, self.firstaddr, self.type, _ = self._parseHeader ()
+        if self.TYPE is not None and self.type != self.TYPE:
+            raise DataspaceTypeMismatch ()
+        self.heap = {}
+
+    def _parseHeader (self):
+        """
+        :return: (last heap address, first heap address, dataspace type, unknown)
+        """
+        buf = self.fd.read (8)
+        return struct.unpack (''.format (self.address, self.length)
+
+    @property
+    def item (self):
+        if self._item:
+            return self._item
+        elif self.address in self.heap:
+            self._item = self.heap[self.address]
+            return self._item
+        else:
+            raise HeapReferenceUnresolved (self.address, self.length)
+
+class HeapReferenceUnresolved (Exception):
+    def __init__ (self, address, length):
+        Exception.__init__ (self, 'addr: {:x}, len: {}'.format (address, length))
+
+# Machine constants
+intsize = 2
+pagesize = 512
+
diff --git a/extractAll.sh b/extractAll.sh
new file mode 100755
index 0000000..6139475
--- /dev/null
+++ b/extractAll.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+root=`dirname "$0"`
+root=`realpath "$root"`
+
+while read -r F; do
+    base=`basename "$F"`
+    linear=`mktemp`
+    destdir="${base}.extracted"
+    echo "Extracting $F to $destdir"
+    "$root/linearizeDisk.py" "$F" "$linear"
+    "$root/extractArchive.py" -o "$destdir" "$linear"
+    pushd "$destdir" || continue
+    for G in ./*; do
+        echo "Converting $G to ${G}.txt"
+        "$root/convertFileDs.py" "$G" > "${G}.txt" || rm "${G}.txt"
+    done
+    popd
+    rm "$linear"
+done
+
diff --git a/extractArchive.py b/extractArchive.py
new file mode 100755
index 0000000..2e66879
--- /dev/null
+++ b/extractArchive.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+"""
+Extract linearized (see linearizeDisk.py) EUMEL archive disk.
+"""
+
+import struct, sys, io, logging
+import codecs
+from eumel import Dataspace
+
+def take (it, n):
+    for i, item in zip (range (n), it):
+        yield item
+
+def parseEntry (blocks):
+    while True:
+        header = next (blocks)
+        unknown1, unknown2, length, unknown3 = struct.unpack ('