#!/usr/bin/env python3
# Origin: commit 12989393311cdca62f376bea6883ee36e8fa43ac by Lars-Dominik Braun,
# Sat, 17 Sep 2016 11:06:16 +0200, "Add disk extraction tools" (convertFileDs.py).

"""
Convert EUMEL FILE dataspace into a plain text file.

Since there are no “files” in EUMEL we’re dealing with the editor’s in-memory
datastructure here. See EUMEL packet “file handling”.
"""

# All imports live at module level: Chain and FileDataspace use logging and
# codecs, so importing them only under the __main__ guard made this module
# unusable as a library (NameError on first log call).
import argparse
import codecs
import copy
import logging
import os
import struct
import sys
from collections import namedtuple

from eumel import Dataspace, DataspaceTypeMismatch

# Link information stored with every row: successor segment, predecessor
# segment and the index of the last row belonging to the segment.
Segment = namedtuple ('Segment', ['succ', 'pred', 'end'])
# Five-integer header record describing a chain of rows.
Sequence = namedtuple ('Sequence', ['index', 'segmentbegin', 'segmentend', 'lineno', 'lines'])
# One parsed row: its segment links, a type word and the line’s text object.
Atom = namedtuple ('Atom', ['seg', 'type', 'line'])

class Chain:
    """
    A chain is a cyclic datastructure, pointing to segments. Segments contain
    one or more rows, which in turn reference a single line’s text.

    The cursor consists of ``pos`` (index of the current row), ``segpos``
    (index of the first row of the current segment) and ``lineno`` (line
    number of the current row).
    """
    def __init__ (self, sequence, rows):
        """
        Create a cursor positioned as described by `sequence` (a Sequence)
        over `rows` (an indexable collection of Atom).
        """
        self.lineno = sequence.lineno
        # current atom
        self.pos = sequence.index
        # current segment
        self.segpos = sequence.segmentbegin
        self.rows = rows

    def next (self):
        """
        Advance the cursor by one line.
        """
        atom = self.rows[self.segpos]
        if self.pos == atom.seg.end:
            # move to next segment
            self.pos = atom.seg.succ
            self.segpos = atom.seg.succ
        else:
            # just use the next atom in this segment
            self.pos += 1
        self.lineno += 1

    def prev (self):
        """
        Move the cursor back by one line.
        """
        # backwards is a little more involved: seg.pred points to the *first* segment row
        logging.debug ('prev at pos %s seg %s line %s', self.pos, self.segpos, self.lineno)
        if self.pos == self.segpos:
            # get previous segment
            atom = self.rows[self.segpos]
            self.segpos = atom.seg.pred
            atom = self.rows[self.segpos]
            self.pos = atom.seg.end
        else:
            self.pos -= 1
        self.lineno -= 1

    def first (self):
        """
        Seek to first line
        """
        while self.lineno > 1:
            self.prev ()

    @property
    def atom (self):
        """
        Get atom at current position
        """
        return self.rows[self.pos]

class FileDataspace (Dataspace):
    """
    EUMEL’s FILE datatype

    After construction ``used`` holds the first Sequence of the header and
    ``text`` holds the decoded contents, lines joined by newlines.
    """

    TYPE = 1003

    def __init__ (self, fd):
        """
        Parse the FILE dataspace readable from the binary file object `fd`.
        """
        Dataspace.__init__ (self, fd)

        # header of the BOUND LIST (aka TYPE FILE); only the first sequence is
        # used, the remaining header fields are read and discarded
        self.used = self.parseSequence ()
        self.parseInt (2)
        self.parseSequence ()
        self.parseSequence ()
        self.parseInt (7)
        # sanity check: the row table is expected to start at this offset
        assert self.fd.tell () == 0x38

        rows = self.parseRows ()

        self.parseHeap ()

        self.text = self.reconstructText (rows)

    def parseSegment (self):
        """
        Read three integers and return them as a Segment.
        """
        return Segment (*self.parseInt (3))

    def parseSequence (self):
        """
        Read five integers and return them as a Sequence.
        """
        return Sequence (*self.parseInt (5))

    def parseRows (self):
        """
        Read rows until a record consisting of 24 0xff bytes (or the end of
        the input) is reached and return them as a list of Atom.
        """
        recordSize = 24
        rows = []
        # read lines
        while True:
            # check data
            data = self.fd.read (recordSize)
            if len (data) < recordSize:
                # Truncated input. Rewinding a full record from here would
                # misalign the parser, so stop instead.
                logging.warning ('Unexpected end of data after {} rows'.format (len (rows)))
                break
            if data == recordSize*b'\xff':
                break
            # NOTE(review): assumes skip() seeks relative to the current
            # position, i.e. this rewinds the record just peeked at.
            self.skip (-recordSize)
            # and parse it
            seg = self.parseSegment ()
            rowtype = self.parseInt ()
            text = self.parseText ()
            rows.append (Atom (seg, rowtype, text))
            logging.debug ('got row %s %s', len (rows)-1, rows[-1])
        return rows

    def reconstructText (self, rows):
        """
        Walk the used chain from its first line once around and return the
        decoded text, lines joined by newlines.

        Trailing 0xff bytes are stripped from every line. Rows reached more
        than once are reported; a chain that provably never returns to its
        first row is cut off instead of being followed forever.
        """
        # XXX: use
        logging.debug ('Used first %s, last %s, starts at line %s, %s lines in total', self.used.segmentbegin, self.used.segmentend, self.used.lineno, self.used.lines)
        chain = Chain (self.used, rows)
        chain.first ()
        firstrow = chain.pos
        lines = []
        visited = set ()
        # Chain.next() depends only on (pos, segpos), so seeing the same pair
        # twice before reaching firstrow means the walk can never terminate.
        states = set ()
        while True:
            state = (chain.pos, chain.segpos)
            if state in states:
                logging.warning ('Chain does not return to row {}, stopping at row {}'.format (firstrow, chain.pos))
                break
            states.add (state)

            if chain.pos in visited:
                logging.warning ('Row {} already has been used'.format (chain.pos))
            visited.add (chain.pos)

            r = chain.atom
            lbytes = bytes (r.line)
            lbytesStripped = lbytes.rstrip (b'\xff')
            if len (lbytes) != len (lbytesStripped):
                logging.warning ('Line %s length incorrect. Is %s, should be %s, fixing. %s', chain.lineno, r.line.length, len (lbytesStripped), lbytes)
                lbytes = lbytesStripped
            lines.append (lbytes)
            chain.next ()

            # chains are cyclic
            if chain.pos == firstrow:
                break
        # NOTE(review): the 'eumel' codec is presumably registered by the
        # eumel module imported above -- confirm.
        return codecs.decode (b'\n'.join (lines), 'eumel', 'replace')

def main ():
    """
    Command line entry point: convert the given file and print its text.

    Exits with status 1 if the input is not a FILE dataspace.
    """
    parser = argparse.ArgumentParser(description='Convert EUMEL FILE dataspace into plain text file.')
    parser.add_argument ('-v', '--verbose', help='Enable debugging messages', action='store_true')
    parser.add_argument ('file', help='Input file')
    args = parser.parse_args ()

    if args.verbose:
        logging.basicConfig (level=logging.DEBUG)
    else:
        logging.basicConfig (level=logging.WARNING)

    with open (args.file, 'rb') as fd:
        try:
            ds = FileDataspace (fd)
        except DataspaceTypeMismatch:
            logging.error ('Not a text file, cannot convert')
            sys.exit (1)
        else:
            linecount = len (ds.text.splitlines ())
            if linecount != ds.used.lines:
                logging.warning ('Got {} lines, but should have been {}'.format (linecount, ds.used.lines))
            print (ds.text)

if __name__ == '__main__':
    main ()