summaryrefslogtreecommitdiff
path: root/convertFileDs.py
diff options
context:
space:
mode:
authorLars-Dominik Braun <lars@6xq.net>2016-09-17 11:06:16 +0200
committerLars-Dominik Braun <lars@6xq.net>2016-09-17 11:06:16 +0200
commit12989393311cdca62f376bea6883ee36e8fa43ac (patch)
treeadeb4f42250bfaa887b08539d98c27b26935bcef /convertFileDs.py
downloadeumel-tools-12989393311cdca62f376bea6883ee36e8fa43ac.tar.gz
eumel-tools-12989393311cdca62f376bea6883ee36e8fa43ac.tar.bz2
eumel-tools-12989393311cdca62f376bea6883ee36e8fa43ac.zip
Add disk extraction tools
Diffstat (limited to 'convertFileDs.py')
-rwxr-xr-xconvertFileDs.py167
1 files changed, 167 insertions, 0 deletions
diff --git a/convertFileDs.py b/convertFileDs.py
new file mode 100755
index 0000000..c4037db
--- /dev/null
+++ b/convertFileDs.py
@@ -0,0 +1,167 @@
+#!/usr/bin/env python3
+
+"""
+Convert EUMEL FILE dataspace into a plain text file.
+
+Since there are no “files” in EUMEL we’re dealing with the editor’s in-memory
+datastructure here. See EUMEL packet “file handling”.
+"""
+
+import struct, copy
+from collections import namedtuple
+from eumel import Dataspace, DataspaceTypeMismatch
+
+Segment = namedtuple ('Segment', ['succ', 'pred', 'end'])
+Sequence = namedtuple ('Sequence', ['index', 'segmentbegin', 'segmentend', 'lineno', 'lines'])
+Atom = namedtuple ('Atom', ['seg', 'type', 'line'])
+
+class Chain:
+ """
+ A chain is a cyclic datastructure, pointing to segments. Segments contain
+ one or more rows, which in turn reference a single line’s text.
+ """
+ def __init__ (self, sequence, rows):
+ self.lineno = sequence.lineno
+ # current atom
+ self.pos = sequence.index
+ # current segment
+ self.segpos = sequence.segmentbegin
+ self.rows = rows
+
+ def next (self):
+ atom = self.rows[self.segpos]
+ if self.pos == atom.seg.end:
+ # move to next segment
+ self.pos = atom.seg.succ
+ self.segpos = atom.seg.succ
+ else:
+ # just use the next atom in this segment
+ self.pos += 1
+ self.lineno += 1
+
+ def prev (self):
+ # backwards is a little more involved: seg.pred points to the *first* segment row
+ logging.debug ('prev at pos {} seg {} line {}'.format (self.pos, self.segpos, self.lineno))
+ if self.pos == self.segpos:
+ # get previous segment
+ atom = self.rows[self.segpos]
+ self.segpos = atom.seg.pred
+ atom = self.rows[self.segpos]
+ self.pos = atom.seg.end
+ else:
+ self.pos -= 1
+ self.lineno -= 1
+
+ def first (self):
+ """
+ Seek to first line
+ """
+ while self.lineno > 1:
+ self.prev ()
+
+ @property
+ def atom (self):
+ """
+ Get atom at current position
+ """
+ return self.rows[self.pos]
+
+class FileDataspace (Dataspace):
+ """
+ EUMEL’s FILE datatype
+ """
+
+ TYPE = 1003
+
+ def __init__ (self, fd):
+ Dataspace.__init__ (self, fd)
+
+ # header of the BOUND LIST (aka TYPE FILE)
+ self.used = self.parseSequence ()
+ self.parseInt (2)
+ self.parseSequence ()
+ self.parseSequence ()
+ self.parseInt (7)
+ assert self.fd.tell () == 0x38
+
+ rows = self.parseRows ()
+
+ self.parseHeap ()
+
+ self.text = self.reconstructText (rows)
+
+ def parseSegment (self):
+ return Segment (*self.parseInt (3))
+
+ def parseSequence (self):
+ return Sequence (*self.parseInt (5))
+
+ def parseRows (self):
+ rows = []
+ # read lines
+ while True:
+ # check data
+ data = self.fd.read (24)
+ if data == 24*b'\xff':
+ break
+ self.skip (-24)
+ # and parse it
+ seg = self.parseSegment ()
+ rowtype = self.parseInt ()
+ text = self.parseText ()
+ rows.append (Atom (seg, rowtype, text))
+ logging.debug ('got row {} {}'.format (len (rows)-1, rows[-1]))
+ return rows
+
+ def reconstructText (self, rows):
+ # XXX: use
+ logging.debug ('Used first {}, last {}, starts at line {}, {} lines in total'.format (self.used.segmentbegin, self.used.segmentend, self.used.lineno, self.used.lines))
+ chain = Chain (self.used, rows)
+ chain.first ()
+ firstrow = chain.pos
+ lines = []
+ visited = set ()
+ while True:
+ if chain.pos in visited:
+ logging.warning ('Row {} already has been used'.format (chain.pos))
+ visited.add (chain.pos)
+
+ r = chain.atom
+ lbytes = bytes (r.line)
+ lbytesStripped = lbytes.rstrip (b'\xff')
+ if len (lbytes) != len (lbytesStripped):
+ logging.warning ('Line {} length incorrect. Is {}, should be {}, fixing. {}'.format (chain.lineno, r.line.length, len (lbytesStripped), lbytes))
+ lbytes = lbytesStripped
+ lines.append (lbytes)
+ chain.next ()
+
+ # chains are cyclic
+ if chain.pos == firstrow:
+ break
+ return codecs.decode (b'\n'.join (lines), 'eumel', 'replace')
+
+if __name__ == '__main__':
+ import sys, os, codecs, logging
+ import argparse, sys
+
+ parser = argparse.ArgumentParser(description='Convert EUMEL FILE dataspace into plain text file.')
+ parser.add_argument ('-v', '--verbose', help='Enable debugging messages', action='store_true')
+ parser.add_argument ('file', help='Input file')
+ args = parser.parse_args ()
+
+ if args.verbose:
+ logging.basicConfig (level=logging.DEBUG)
+ else:
+ logging.basicConfig (level=logging.WARNING)
+
+ with open (args.file, 'rb') as fd:
+ try:
+ ds = FileDataspace (fd)
+ linecount = len (ds.text.splitlines ())
+ if linecount != ds.used.lines:
+ logging.warning ('Got {} lines, but should have been {}'.format (linecount, ds.used.lines))
+ print (ds.text)
+ except DataspaceTypeMismatch:
+ logging.error ('Not a text file, cannot convert')
+ sys.exit (1)
+