#!/usr/bin/env python3

"""
Convert EUMEL FILE dataspace into a plain text file.

Since there are no “files” in EUMEL we’re dealing with the editor’s in-memory
datastructure here. See EUMEL packet “file handling”.
"""

# Provenance: recovered from a deletion patch (as rendered by cgit v1.2.3).
#   commit  a6d474471dddc2d7a187a66358aafcb86235ca69
#   author  Lars-Dominik Braun
#   date    Sat, 21 Jan 2017 11:24:56 +0100
#   subject Restructure git -- Move tools into separate repo, split TTL file.
# That commit deleted tools/convertFileDs.py (mode 100755, 167 lines,
# blob c4037db).

import argparse
import codecs
import copy
import logging
import os
import struct
import sys
from collections import namedtuple

from eumel import Dataspace, DataspaceTypeMismatch

# Link fields stored in front of every row: successor segment, predecessor
# segment and the last row of the segment (all used as indices into the row
# list by Chain).
Segment = namedtuple ('Segment', ['succ', 'pred', 'end'])
# Five-integer header record; ``index`` is the current row, ``lineno`` its
# line number and ``lines`` the total line count (see Chain and the script
# entry point for how these are used).
Sequence = namedtuple ('Sequence', ['index', 'segmentbegin', 'segmentend', 'lineno', 'lines'])
# One row: segment links, a row type and the line's text object.
Atom = namedtuple ('Atom', ['seg', 'type', 'line'])

class Chain:
    """
    A chain is a cyclic datastructure, pointing to segments. Segments contain
    one or more rows, which in turn reference a single line’s text.

    The chain is a cursor over ``rows``: ``pos`` is the index of the current
    row, ``segpos`` the index of the first row of the current segment and
    ``lineno`` the line number of the current row.
    """
    def __init__ (self, sequence, rows):
        """
        Start at the position described by ``sequence`` (a Sequence) within
        ``rows`` (a list of Atom).
        """
        self.lineno = sequence.lineno
        # current atom
        self.pos = sequence.index
        # current segment
        self.segpos = sequence.segmentbegin
        self.rows = rows

    def next (self):
        """
        Advance to the following line.
        """
        atom = self.rows[self.segpos]
        if self.pos == atom.seg.end:
            # move to next segment
            self.pos = atom.seg.succ
            self.segpos = atom.seg.succ
        else:
            # just use the next atom in this segment
            self.pos += 1
        self.lineno += 1

    def prev (self):
        """
        Step back to the preceding line.
        """
        # backwards is a little more involved: seg.pred points to the *first* segment row
        logging.debug ('prev at pos {} seg {} line {}'.format (self.pos, self.segpos, self.lineno))
        if self.pos == self.segpos:
            # get previous segment
            atom = self.rows[self.segpos]
            self.segpos = atom.seg.pred
            # the previous segment's own header tells us where it ends
            atom = self.rows[self.segpos]
            self.pos = atom.seg.end
        else:
            self.pos -= 1
        self.lineno -= 1

    def first (self):
        """
        Seek to first line
        """
        while self.lineno > 1:
            self.prev ()

    @property
    def atom (self):
        """
        Get atom at current position
        """
        return self.rows[self.pos]

class FileDataspace (Dataspace):
    """
    EUMEL’s FILE datatype

    After construction ``used`` holds the first Sequence record of the header
    and ``text`` the decoded file contents as a single string.
    """

    # Dataspace type id of a FILE; presumably checked by Dataspace, which
    # would then raise DataspaceTypeMismatch -- TODO confirm in eumel module.
    TYPE = 1003

    def __init__ (self, fd):
        """
        Parse the dataspace readable from the binary file object ``fd``.
        """
        Dataspace.__init__ (self, fd)

        # header of the BOUND LIST (aka TYPE FILE)
        self.used = self.parseSequence ()
        self.parseInt (2)
        self.parseSequence ()
        self.parseSequence ()
        self.parseInt (7)
        # sanity check: the row table is expected to start at offset 0x38
        assert self.fd.tell () == 0x38

        rows = self.parseRows ()

        self.parseHeap ()

        self.text = self.reconstructText (rows)

    def parseSegment (self):
        """
        Read three integers and return them as a Segment.
        """
        return Segment (*self.parseInt (3))

    def parseSequence (self):
        """
        Read five integers and return them as a Sequence.
        """
        return Sequence (*self.parseInt (5))

    def parseRows (self):
        """
        Read row records until the all-0xff terminator record and return them
        as a list of Atom.
        """
        rows = []
        # read lines
        while True:
            # check data
            data = self.fd.read (24)
            if data == 24*b'\xff':
                break
            if len (data) < 24:
                # Short read means end of file; seeking back 24 bytes from
                # here would re-parse earlier data, so stop instead.
                logging.warning ('Row table not terminated, file truncated after {} rows'.format (len (rows)))
                break
            self.skip (-24)
            # and parse it
            seg = self.parseSegment ()
            rowtype = self.parseInt ()
            text = self.parseText ()
            rows.append (Atom (seg, rowtype, text))
            logging.debug ('got row {} {}'.format (len (rows)-1, rows[-1]))
        return rows

    def reconstructText (self, rows):
        """
        Walk the chain described by ``self.used`` from its first line and
        return the text of all lines joined by newlines and decoded with the
        'eumel' codec (presumably registered by the eumel package -- TODO
        confirm).
        """
        # XXX: use (unfinished note in the original)
        logging.debug ('Used first {}, last {}, starts at line {}, {} lines in total'.format (self.used.segmentbegin, self.used.segmentend, self.used.lineno, self.used.lines))
        chain = Chain (self.used, rows)
        chain.first ()
        firstrow = chain.pos
        lines = []
        visited = set ()
        # Chain.next depends only on (pos, segpos); seeing the same pair twice
        # means the walk can never get back to firstrow.
        states = set ()
        while True:
            state = (chain.pos, chain.segpos)
            if state in states:
                logging.error ('Chain loops without returning to row {}, giving up'.format (firstrow))
                break
            states.add (state)

            if chain.pos in visited:
                logging.warning ('Row {} already has been used'.format (chain.pos))
            visited.add (chain.pos)

            r = chain.atom
            lbytes = bytes (r.line)
            # trailing 0xff bytes are padding, not text
            lbytesStripped = lbytes.rstrip (b'\xff')
            if len (lbytes) != len (lbytesStripped):
                logging.warning ('Line {} length incorrect. Is {}, should be {}, fixing. {}'.format (chain.lineno, r.line.length, len (lbytesStripped), lbytes))
                lbytes = lbytesStripped
            lines.append (lbytes)
            chain.next ()

            # chains are cyclic
            if chain.pos == firstrow:
                break
        return codecs.decode (b'\n'.join (lines), 'eumel', 'replace')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Convert EUMEL FILE dataspace into plain text file.')
    parser.add_argument ('-v', '--verbose', help='Enable debugging messages', action='store_true')
    parser.add_argument ('file', help='Input file')
    args = parser.parse_args ()

    if args.verbose:
        logging.basicConfig (level=logging.DEBUG)
    else:
        logging.basicConfig (level=logging.WARNING)

    with open (args.file, 'rb') as fd:
        try:
            ds = FileDataspace (fd)
            # cross-check the reconstructed text against the header's count
            linecount = len (ds.text.splitlines ())
            if linecount != ds.used.lines:
                logging.warning ('Got {} lines, but should have been {}'.format (linecount, ds.used.lines))
            print (ds.text)
        except DataspaceTypeMismatch:
            logging.error ('Not a text file, cannot convert')
            sys.exit (1)