1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
|
#!/usr/bin/env python3
"""
Convert EUMEL FILE dataspace into a plain text file.
Since there are no “files” in EUMEL we’re dealing with the editor’s in-memory
datastructure here. See EUMEL packet “file handling”.
"""
import codecs
import copy
import logging
import struct
from collections import namedtuple

from eumel import Dataspace, DataspaceTypeMismatch
# Segment links stored with a row: row index of the following segment, row
# index of the preceding segment, row index of the segment's last row.
Segment = namedtuple ('Segment', 'succ pred end')
# Sequence header: current row index, first/last segment, current line number
# and total number of lines.
Sequence = namedtuple ('Sequence', 'index segmentbegin segmentend lineno lines')
# One row of the file: its segment links, a row type and the line's text.
Atom = namedtuple ('Atom', 'seg type line')
class Chain:
    """
    Cursor over the cyclic chain of rows that makes up a file.

    Rows are grouped into segments; the segment links (succ, pred, end) are
    read from the row at the segment's first index. Each row references the
    text of a single line.
    """

    def __init__ (self, sequence, rows):
        self.rows = rows
        # index of the first row of the current segment
        self.segpos = sequence.segmentbegin
        # index of the current row (atom)
        self.pos = sequence.index
        # line number belonging to the current row
        self.lineno = sequence.lineno

    def next (self):
        """
        Advance the cursor by one line
        """
        segment = self.rows[self.segpos].seg
        self.lineno += 1
        if self.pos != segment.end:
            # still inside the current segment
            self.pos += 1
            return
        # last row of the segment reached, continue with its successor
        self.pos = self.segpos = segment.succ

    def prev (self):
        """
        Move the cursor back by one line
        """
        logging.debug ('prev at pos {} seg {} line {}'.format (self.pos, self.segpos, self.lineno))
        if self.pos != self.segpos:
            self.pos -= 1
        else:
            # seg.pred names the *first* row of the previous segment, so the
            # row to land on is that segment's end
            predecessor = self.rows[self.segpos].seg.pred
            self.segpos = predecessor
            self.pos = self.rows[predecessor].seg.end
        self.lineno -= 1

    def first (self):
        """
        Seek to first line
        """
        while True:
            if self.lineno <= 1:
                return
            self.prev ()

    @property
    def atom (self):
        """
        Row at the current cursor position
        """
        current = self.pos
        return self.rows[current]
class FileDataspace (Dataspace):
    """
    EUMEL’s FILE datatype

    The constructor reads the header, the row table and the heap from the
    given file object and stores the reconstructed text in ``self.text``.
    ``self.used`` is the first Sequence of the header; it is the starting
    point for walking the row chain.
    """

    # NOTE(review): presumably compared against the dataspace's type by the
    # Dataspace base class -- confirm in eumel.Dataspace
    TYPE = 1003

    def __init__ (self, fd):
        Dataspace.__init__ (self, fd)

        # header of the BOUND LIST (aka TYPE FILE)
        self.used = self.parseSequence ()
        self.parseInt (2)
        self.parseSequence ()
        self.parseSequence ()
        self.parseInt (7)
        # the row table is expected to start right after the header
        assert self.fd.tell () == 0x38

        rows = self.parseRows ()
        self.parseHeap ()

        self.text = self.reconstructText (rows)

    def parseSegment (self):
        """
        Read three integers and return them as Segment
        """
        return Segment (*self.parseInt (3))

    def parseSequence (self):
        """
        Read five integers and return them as Sequence
        """
        return Sequence (*self.parseInt (5))

    def parseRows (self):
        """
        Read rows until a block of 24 0xff bytes is found and return them as
        list of Atom.
        """
        rows = []
        # read lines
        while True:
            # check data
            # NOTE(review): a short read (truncated input) is not detected
            # here; the rewind below always assumes 24 bytes were consumed
            data = self.fd.read (24)
            if data == 24*b'\xff':
                break
            self.skip (-24)
            # and parse it
            seg = self.parseSegment ()
            rowtype = self.parseInt ()
            text = self.parseText ()
            rows.append (Atom (seg, rowtype, text))
            logging.debug ('got row {} {}'.format (len (rows)-1, rows[-1]))
        return rows

    def reconstructText (self, rows):
        """
        Walk the row chain from its first line and return the decoded text.

        Trailing 0xff bytes are stripped from every line. Lines are joined
        with newlines and decoded with the 'eumel' codec; undecodable bytes
        are replaced. A chain that cycles without returning to its first row
        is cut off with an error message instead of looping forever.
        """
        # XXX: segmentend and lines of self.used are only logged, the
        # traversal itself does not use them
        logging.debug ('Used first {}, last {}, starts at line {}, {} lines in total'.format (self.used.segmentbegin, self.used.segmentend, self.used.lineno, self.used.lines))
        chain = Chain (self.used, rows)
        chain.first ()
        firstrow = chain.pos
        lines = []
        visited = set ()
        # Complete cursor states seen so far. Chain.next() depends only on
        # (pos, segpos), so a repeated state can never reach firstrow again.
        states = set ()
        while True:
            state = (chain.pos, chain.segpos)
            if state in states:
                logging.error ('Chain loops at row {} without returning to first row {}, aborting'.format (chain.pos, firstrow))
                break
            states.add (state)

            if chain.pos in visited:
                logging.warning ('Row {} already has been used'.format (chain.pos))
            visited.add (chain.pos)

            r = chain.atom
            lbytes = bytes (r.line)
            lbytesStripped = lbytes.rstrip (b'\xff')
            if len (lbytes) != len (lbytesStripped):
                # NOTE(review): assumes the object returned by parseText has
                # a length attribute -- confirm in eumel.Dataspace
                logging.warning ('Line {} length incorrect. Is {}, should be {}, fixing. {}'.format (chain.lineno, r.line.length, len (lbytesStripped), lbytes))
                lbytes = lbytesStripped
            lines.append (lbytes)

            chain.next ()
            # chains are cyclic
            if chain.pos == firstrow:
                break
        return codecs.decode (b'\n'.join (lines), 'eumel', 'replace')
if __name__ == '__main__':
    # Command line entry point: print the text of a FILE dataspace to stdout.
    import argparse
    import codecs
    import logging
    import os
    import sys

    cli = argparse.ArgumentParser (description='Convert EUMEL FILE dataspace into plain text file.')
    cli.add_argument ('-v', '--verbose', help='Enable debugging messages', action='store_true')
    cli.add_argument ('file', help='Input file')
    options = cli.parse_args ()

    # debugging output only on request
    loglevel = logging.DEBUG if options.verbose else logging.WARNING
    logging.basicConfig (level=loglevel)

    with open (options.file, 'rb') as infile:
        try:
            dataspace = FileDataspace (infile)
            expected = dataspace.used.lines
            found = len (dataspace.text.splitlines ())
            # the header records how many lines the file should contain
            if found != expected:
                logging.warning ('Got {} lines, but should have been {}'.format (found, expected))
            print (dataspace.text)
        except DataspaceTypeMismatch:
            logging.error ('Not a text file, cannot convert')
            sys.exit (1)
|