From 12989393311cdca62f376bea6883ee36e8fa43ac Mon Sep 17 00:00:00 2001
From: Lars-Dominik Braun <lars@6xq.net>
Date: Sat, 17 Sep 2016 11:06:16 +0200
Subject: Add disk extraction tools

---
 convertFileDs.py | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100755 convertFileDs.py

(limited to 'convertFileDs.py')

diff --git a/convertFileDs.py b/convertFileDs.py
new file mode 100755
index 0000000..c4037db
--- /dev/null
+++ b/convertFileDs.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+
+"""
+Convert EUMEL FILE dataspace into a plain text file.
+
+Since there are no “files” in EUMEL we’re dealing with the editor’s in-memory
+datastructure here. See EUMEL packet “file handling”.
+"""
+
+import struct, copy, codecs, logging
+from collections import namedtuple
+from eumel import Dataspace, DataspaceTypeMismatch
+
+Segment = namedtuple ('Segment', 'succ pred end')  # first row of next/previous segment, last row of this one
+Sequence = namedtuple ('Sequence', 'index segmentbegin segmentend lineno lines')  # cursor into a chain plus its line count
+Atom = namedtuple ('Atom', 'seg type line')  # one row: segment links, row type, line text
+
+class Chain:
+    """
+    Cursor over a cyclic chain of segments. The first row of a segment
+    carries its links (succ, pred, end); each row references one line’s text.
+    """
+    def __init__ (self, sequence, rows):
+        """Start at the position stored in sequence, walking over rows."""
+        self.rows = rows
+        # row the cursor currently points at
+        self.pos = sequence.index
+        # first row of the segment containing pos
+        self.segpos = sequence.segmentbegin
+        self.lineno = sequence.lineno
+
+    def next (self):
+        """
+        Advance by one line, entering the successor segment if needed.
+        """
+        links = self.rows[self.segpos].seg
+        if self.pos != links.end:
+            # still inside the current segment
+            self.pos += 1
+        else:
+            # last row reached, continue at the head of the next segment
+            self.pos = self.segpos = links.succ
+        self.lineno += 1
+
+    def prev (self):
+        """
+        Step back by one line. seg.pred points to the *first* row of the
+        previous segment, so its last row is looked up through seg.end.
+        """
+        logging.debug ('prev at pos {} seg {} line {}'.format (self.pos, self.segpos, self.lineno))
+        if self.pos != self.segpos:
+            self.pos -= 1
+        else:
+            # at the head of a segment: jump to the tail of its predecessor
+            self.segpos = self.rows[self.segpos].seg.pred
+            self.pos = self.rows[self.segpos].seg.end
+        self.lineno -= 1
+
+    def first (self):
+        """Rewind to line 1; a no-op if lineno is already 1 or less."""
+        while self.lineno > 1:
+            self.prev ()
+
+    @property
+    def atom (self):
+        """Row at the current position."""
+        return self.rows[self.pos]
+
+class FileDataspace (Dataspace):
+    """
+    EUMEL’s FILE datatype; text holds the decoded contents after parsing.
+    """
+
+    TYPE = 1003
+
+    def __init__ (self, fd):
+        """Parse header, rows and heap from fd and reconstruct the text."""
+        Dataspace.__init__ (self, fd)
+        # header of the BOUND LIST (aka TYPE FILE), only the first sequence is kept
+        self.used = self.parseSequence ()
+        self.parseInt (2)
+        self.parseSequence ()
+        self.parseSequence ()
+        self.parseInt (7)
+        assert self.fd.tell () == 0x38
+        rows = self.parseRows ()
+        self.parseHeap ()
+        self.text = self.reconstructText (rows)
+
+    def parseSegment (self):
+        """Build a Segment from three parseInt values."""
+        return Segment (*self.parseInt (3))
+
+    def parseSequence (self):
+        """Build a Sequence from five parseInt values."""
+        return Sequence (*self.parseInt (5))
+
+    def parseRows (self):
+        """Read rows up to and including the terminator of 24 0xff bytes."""
+        rows = []
+        while True:
+            # peek at the next 24 bytes; step back (skip -24) unless it is the terminator
+            data = self.fd.read (24)
+            if data == 24*b'\xff':
+                break
+            self.skip (-24)
+            seg = self.parseSegment ()
+            rowtype = self.parseInt ()
+            text = self.parseText ()
+            rows.append (Atom (seg, rowtype, text))
+            logging.debug ('got row {} {}'.format (len (rows)-1, rows[-1]))
+        return rows
+
+    def reconstructText (self, rows):
+        """Walk the chain once from its first line and return the decoded text;
+        a walk that repeats a (pos, segpos) state is abandoned with an error."""
+        logging.debug ('Used first {}, last {}, starts at line {}, {} lines in total'.format (self.used.segmentbegin, self.used.segmentend, self.used.lineno, self.used.lines))
+        chain = Chain (self.used, rows)
+        chain.first ()
+        firstrow = chain.pos
+        lines, visited, states = [], set (), set ()
+        while True:
+            if chain.pos in visited:
+                logging.warning ('Row {} already has been used'.format (chain.pos))
+            visited.add (chain.pos)
+            # next () depends only on pos and segpos, so a repeated pair would loop forever
+            if (chain.pos, chain.segpos) in states:
+                logging.error ('Chain does not lead back to row {}, giving up'.format (firstrow))
+                break
+            states.add ((chain.pos, chain.segpos))
+            r = chain.atom
+            lbytes = bytes (r.line)
+            lbytesStripped = lbytes.rstrip (b'\xff')
+            if len (lbytes) != len (lbytesStripped):
+                logging.warning ('Line {} length incorrect. Is {}, should be {}, fixing. {}'.format (chain.lineno, r.line.length, len (lbytesStripped), lbytes))
+                lbytes = lbytesStripped
+            lines.append (lbytes)
+            chain.next ()
+            if chain.pos == firstrow:  # chains are cyclic
+                break
+        return codecs.decode (b'\n'.join (lines), 'eumel', 'replace')
+
+if __name__ == '__main__':
+    # codecs and logging imported here become module globals, which the classes above use
+    import argparse, sys
+    import sys, os, codecs, logging
+
+    cli = argparse.ArgumentParser(description='Convert EUMEL FILE dataspace into plain text file.')
+    cli.add_argument ('-v', '--verbose', help='Enable debugging messages', action='store_true')
+    cli.add_argument ('file', help='Input file')
+    options = cli.parse_args ()
+
+    # debugging output only on request, warnings and errors always
+    loglevel = logging.DEBUG if options.verbose else logging.WARNING
+    logging.basicConfig (level=loglevel)
+
+    with open (options.file, 'rb') as fd:
+        try:
+            ds = FileDataspace (fd)
+            found, expected = len (ds.text.splitlines ()), ds.used.lines
+            if found != expected:
+                logging.warning ('Got {} lines, but should have been {}'.format (found, expected))
+            print (ds.text)
+        except DataspaceTypeMismatch:
+            logging.error ('Not a text file, cannot convert')
+            sys.exit (1)
+
-- 
cgit v1.2.3