convertFileDs.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167

#!/usr/bin/env python3

"""
Convert EUMEL FILE dataspace into a plain text file.

Since there are no “files” in EUMEL we’re dealing with the editor’s in-memory
datastructure here. See EUMEL packet “file handling”.
"""

import codecs
import copy
import logging
import struct
from collections import namedtuple

from eumel import Dataspace, DataspaceTypeMismatch

# Record types used while decoding the row table. Field order matters: the
# parser fills them positionally from the integers it reads.

# Per-row segment links: successor, predecessor and the segment's end row.
Segment = namedtuple ('Segment', 'succ pred end')
# Five-integer sequence record from the dataspace header.
Sequence = namedtuple ('Sequence', 'index segmentbegin segmentend lineno lines')
# One entry of the row table: segment links, row type and the line's text.
Atom = namedtuple ('Atom', 'seg type line')

class Chain:
    """
    Cursor over the cyclic chain of segments stored in the row table.

    Every segment spans one or more consecutive rows; the segment links
    (succ, pred, end) are read from the *first* row of the segment. The
    cursor keeps track of the current row, the first row of the segment it
    is in and the current line number.
    """
    def __init__ (self, sequence, rows):
        """
        Position the cursor as described by sequence (needs the attributes
        index, segmentbegin and lineno) on the row table rows.
        """
        self.rows = rows
        # index of the first row of the current segment
        self.segpos = sequence.segmentbegin
        # index of the current atom
        self.pos = sequence.index
        self.lineno = sequence.lineno

    def next (self):
        """
        Advance the cursor by one line
        """
        segment = self.rows[self.segpos].seg
        if self.pos != segment.end:
            # still inside the segment, rows are consecutive
            self.pos += 1
        else:
            # last row of the segment, continue at the successor’s first row
            self.pos = self.segpos = segment.succ
        self.lineno += 1

    def prev (self):
        """
        Move the cursor back by one line
        """
        logging.debug ('prev at pos {} seg {} line {}'.format (self.pos, self.segpos, self.lineno))
        if self.pos != self.segpos:
            # not at the segment’s first row, simply step back
            self.pos -= 1
        else:
            # seg.pred names the *first* row of the previous segment, so the
            # row to land on is that segment’s end
            self.segpos = self.rows[self.segpos].seg.pred
            self.pos = self.rows[self.segpos].seg.end
        self.lineno -= 1

    def first (self):
        """
        Rewind until the line number is 1
        """
        while 1 < self.lineno:
            self.prev ()

    @property
    def atom (self):
        """
        Row the cursor currently points at
        """
        return self.rows[self.pos]

class FileDataspace (Dataspace):
    """
    EUMEL’s FILE datatype

    Constructing an instance parses the whole dataspace from fd. Afterwards

    used
        holds the first Sequence record of the header and
    text
        holds the reconstructed file contents as a decoded string, lines
        separated by a newline.
    """

    TYPE = 1003

    def __init__ (self, fd):
        Dataspace.__init__ (self, fd)

        # header of the BOUND LIST (aka TYPE FILE); only the first sequence
        # is kept, the remaining header fields are read and discarded
        self.used = self.parseSequence ()
        self.parseInt (2)
        self.parseSequence ()
        self.parseSequence ()
        self.parseInt (7)
        # the row table is expected to start right after the header
        assert self.fd.tell () == 0x38

        rows = self.parseRows ()

        self.parseHeap ()

        self.text = self.reconstructText (rows)

    def parseSegment (self):
        """
        Read three integers and return them as Segment
        """
        return Segment (*self.parseInt (3))

    def parseSequence (self):
        """
        Read five integers and return them as Sequence
        """
        return Sequence (*self.parseInt (5))

    def parseRows (self):
        """
        Read the row table and return it as list of Atom.

        Reading stops at a chunk consisting only of 0xff bytes, or, with a
        warning, if the input ends before such a chunk was found.
        """
        # number of bytes inspected per row
        rowsize = 24
        terminator = rowsize*b'\xff'
        rows = []
        # read lines
        while True:
            # check data
            data = self.fd.read (rowsize)
            if data == terminator:
                break
            if len (data) < rowsize:
                # Truncated input: rewinding by a full row would misalign the
                # parser (or never make progress), so stop here instead.
                logging.warning ('Unexpected end of data after {} rows'.format (len (rows)))
                break
            self.skip (-rowsize)
            # and parse it
            seg = self.parseSegment ()
            rowtype = self.parseInt ()
            text = self.parseText ()
            rows.append (Atom (seg, rowtype, text))
            logging.debug ('got row {} {}'.format (len (rows)-1, rows[-1]))
        return rows

    def reconstructText (self, rows):
        """
        Walk the chain described by self.used through rows, starting at its
        first line, and return the decoded text.

        The walk ends when the chain arrives at its first row again. If the
        chain is damaged and can never get back there, the walk is aborted
        with an error message and the lines collected so far are returned.
        """
        # XXX: use
        logging.debug ('Used first {}, last {}, starts at line {}, {} lines in total'.format (self.used.segmentbegin, self.used.segmentend, self.used.lineno, self.used.lines))
        chain = Chain (self.used, rows)
        chain.first ()
        firstrow = chain.pos
        lines = []
        visited = set ()
        # (row, segment) pairs seen so far. Chain.next() depends on nothing
        # but these two values, so a repeated pair means the walk is periodic
        # and will never arrive at firstrow.
        states = set ()
        while True:
            state = (chain.pos, chain.segpos)
            if state in states:
                logging.error ('Chain loops at row {} without returning to row {}, giving up'.format (chain.pos, firstrow))
                break
            states.add (state)

            if chain.pos in visited:
                logging.warning ('Row {} already has been used'.format (chain.pos))
            visited.add (chain.pos)

            r = chain.atom
            lbytes = bytes (r.line)
            # trailing 0xff bytes are not part of the line
            lbytesStripped = lbytes.rstrip (b'\xff')
            if len (lbytes) != len (lbytesStripped):
                logging.warning ('Line {} length incorrect. Is {}, should be {}, fixing. {}'.format (chain.lineno, r.line.length, len (lbytesStripped), lbytes))
                lbytes = lbytesStripped
            lines.append (lbytes)
            chain.next ()

            # chains are cyclic
            if chain.pos == firstrow:
                break
        return codecs.decode (b'\n'.join (lines), 'eumel', 'replace')

if __name__ == '__main__':
    # Command line entry point: convert one FILE dataspace and print its text.
    import argparse
    import codecs
    import logging
    import os
    import sys

    parser = argparse.ArgumentParser(description='Convert EUMEL FILE dataspace into plain text file.')
    parser.add_argument ('-v', '--verbose', help='Enable debugging messages', action='store_true')
    parser.add_argument ('file', help='Input file')
    args = parser.parse_args ()

    logging.basicConfig (level=logging.DEBUG if args.verbose else logging.WARNING)

    with open (args.file, 'rb') as fd:
        try:
            ds = FileDataspace (fd)
        except DataspaceTypeMismatch:
            logging.error ('Not a text file, cannot convert')
            sys.exit (1)
        else:
            # The text was built by joining rows with '\n', so count exactly
            # those separators. str.splitlines() would drop a trailing empty
            # line and also split on other characters (\r, \x0c, …), causing
            # bogus mismatch warnings.
            linecount = len (ds.text.split ('\n'))
            if linecount != ds.used.lines:
                logging.warning ('Got {} lines, but should have been {}'.format (linecount, ds.used.lines))
            print (ds.text)