6 files changed, 452 insertions, 3 deletions
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..4aecec7
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,16 @@
+EUMEL-python
+============
+
+Tools for dealing with EUMEL_ data structures and files, mostly written in Python.
+
+.. _EUMEL: https://6xq.net/eumel/
+
+``elan.py``
+    is a lexer for pygments and used to highlight the packages found
+    `here <https://6xq.net/eumel/src/>`__.
+``extractAll.sh``
+    bulk-extracts all archive disk images whose paths are read from stdin. It
+    also converts text dataspaces to text files usable with modern computers.
+
+    Calls ``convertCharset.py``, ``convertFileDs.py``, ``extractArchive.py``
+    and ``linearizeDisk.py``.
diff --git a/convertFileDs.py b/convertFileDs.py
index 89e0cab..d5c234b 100755
--- a/convertFileDs.py
+++ b/convertFileDs.py
@@ -9,7 +9,7 @@ datastructure here. See EUMEL packet “file handling”.
 
 import struct, copy
 from collections import namedtuple
-from eumel import Dataspace, DataspaceTypeMismatch, HeapReferenceUnresolved
+from eumel import Dataspace, DataspaceTypeMismatch, HeapReferenceUnresolved, pagesize
 
 Segment = namedtuple ('Segment', ['succ', 'pred', 'end'])
 Sequence = namedtuple ('Sequence', ['index', 'segmentbegin', 'segmentend', 'lineno', 'lines'])
@@ -77,12 +77,13 @@ class FileDataspace (Dataspace):
         Dataspace.__init__ (self, fd)
 
         # header of the BOUND LIST (aka TYPE FILE)
+        start = fd.tell ()
         self.used = self.parseSequence ()
         self.parseInt (2)
         self.parseSequence ()
         self.parseSequence ()
         self.parseInt (7)
-        assert self.fd.tell () == 0x38
+        assert self.fd.tell ()-start == 0x30
 
         rows = self.parseRows ()
 
@@ -150,6 +151,7 @@ if __name__ == '__main__':
     
     parser = argparse.ArgumentParser(description='Convert EUMEL FILE dataspace into plain text file.')
     parser.add_argument ('-v', '--verbose', help='Enable debugging messages', action='store_true')
+    parser.add_argument ('-s', '--skip', metavar='PAGES', type=int, default=0, help='Skip pages at the beginning of the file')
     parser.add_argument ('file', help='Input file')
     args = parser.parse_args ()
 
@@ -160,6 +162,7 @@ if __name__ == '__main__':
 
     with open (args.file, 'rb') as fd:
         try:
+            fd.seek (args.skip*pagesize)
             ds = FileDataspace (fd)
             linecount = len (ds.text.splitlines ())
             if linecount != ds.used.lines:
diff --git a/elan.py b/elan.py
new file mode 100644
index 0000000..beeb9f8
--- /dev/null
+++ b/elan.py
@@ -0,0 +1,135 @@
+"""
+pygments lexer for Elementary Language (ELAN)
+
+- Rainer Hahn, Peter Stock: ELAN Handbuch. 1979.
+- Rainer Hahn, Dietmar Heinrichs, Peter Heyderhoff: EUMEL Benutzerhandbuch Version 1.7. 1984.
+"""
+
+from pygments.lexer import RegexLexer, bygroups, include, words
+from pygments.token import *
+
+__all__ = ['ElanLexer']
+
+def uppercaseWords (l):
+    """ Match only the uppercase words provided in l as whole words, i.e. FOR
+    must not match inside FORMAT. """
+    notUpperBefore = r'(?<![A-Z])'
+    notUpperAfter = r'(?![A-Z])'
+    return words (l, prefix=notUpperBefore, suffix=notUpperAfter)
+
+class ElanLexer(RegexLexer):
+    """Pygments lexer for ELAN: keywords, operators, nestable comments, strings, refinements."""
+    name = 'ELAN'
+    aliases = ['elan']
+    filenames = ['*.elan']
+
+    tokens = {
+        'root': [
+            include('comment'),
+            # strings
+            (r'"', String.Double, 'string'),
+            # numbers. float before integer (first match wins). lookbehind, because identifiers may contain numbers too
+            (r'[-+]?\d+\.\d+(E[+-]?\d+)?', Number.Float),
+            (r'([-+]|(?<![a-z]))\d+', Number.Integer),
+            # keywords
+            (uppercaseWords ((
+                # not sure
+                'CONCR',
+                # if-then-else
+                'IF', 'THEN', 'ELSE', 'ELIF', 'ENDIF', 'END IF',
+                # found in the wild:
+                'FI',
+                # select statement
+                'SELECT', 'OF', 'CASE', 'OTHERWISE', 'ENDSELECT', 'END SELECT',
+                # loops
+                'FOR', 'FROM', 'DOWNTO', 'UPTO', 'WHILE', 'REPEAT', 'UNTIL',
+                'ENDREPEAT', 'END REPEAT',
+                # found in the wild:
+                'REP', 'PER', 'END REP',
+                # return statements
+                'LEAVE', 'WITH',
+                )), Keyword.Reserved),
+            (uppercaseWords ((
+                # type declaration
+                'TYPE',
+                # shorthand declaration
+                'LET',
+                )), Keyword.Declaration),
+            (uppercaseWords ((
+                # proper packet
+                'DEFINES',
+                )), Keyword.Namespace),
+            (uppercaseWords (('VAR', 'CONST', 'BOUND')), Name.Attribute),
+            (uppercaseWords (('BOOL', 'INT', 'REAL', 'TEXT', 'STRUCT', 'ROW',
+            'DATASPACE')), Keyword.Type),
+            # truth values
+            (uppercaseWords (('TRUE', 'FALSE')), Name.Builtin),
+            # semi-builtin functions/operators, see Benutzerhandbuch pp. 329
+            # "Standardpakete"
+            (uppercaseWords ((
+                # boolean
+                'NOT', 'AND', 'OR', 'XOR',
+                # text
+                'CAT', 'LENGTH', 'TIMESOUT',
+                # math
+                'DECR', 'DIV', 'INCR', 'MOD', 'SUB',
+            )), Operator),
+            # and the same with symbols. '-' is last in the character classes, so it is literal and not a range
+            (words ((
+                # assignments
+                ':=', '::',
+                # comparison
+                '=', '<>', '<=', '>=', '<', '>',
+                # math
+                '**', '*','+', '-', '/',
+                ), prefix=r'(?<![:=<>*+/-])', suffix=r'(?![:=<>*+/-])'), Operator),
+            # packets, function and operators
+            # no space required between keyword and identifier
+            # XXX comments may be allowed between keyword and name
+            (r'((?:END\s*)?PACKET)([^A-Za-z]*)([a-z][a-z0-9 ]+)',
+                    bygroups (Keyword.Declaration, Text, Name.Namespace)),
+            (r'((?:END\s*)?PROC)([^A-Za-z]*)([a-z][a-z0-9 ]+)',
+                    bygroups (Keyword.Declaration, Text, Name.Function)),
+            (r'((?:END\s*)?OP)([^A-Za-z]*)([^a-z0-9 (;]+)',
+                    bygroups (Keyword.Declaration, Text, Name.Function)),
+            # Refinements
+            (r'\.(?![a-z])', Text, 'refinement'),
+            (r'.', Text),
+        ],
+        'comment': [
+            (r'\(\*', Comment, 'comment-inside1'),
+            (r'\{', Comment, 'comment-inside2'),
+            (r'#\(', Comment, 'comment-inside3'),
+        ],
+        'comment-inside1': [
+            # comment can be nested
+            include('comment'),
+            (r'\*\)', Comment, '#pop'),
+            (r'(.|\n)', Comment),
+        ],
+        'comment-inside2': [
+            # comment can be nested
+            include('comment'),
+            (r'\}', Comment, '#pop'),
+            (r'(.|\n)', Comment),
+        ],
+        'comment-inside3': [
+            # comment can be nested
+            include('comment'),
+            (r'#\)', Comment, '#pop'),
+            (r'(.|\n)', Comment),
+        ],
+        'string': [
+            # "" equals '\"', "12" is '\12'
+            (r'"[0-9]*"', String.Escape),
+            (r'"', String.Double, '#pop'),
+            (r'.', String.Double),
+        ],
+        'refinement': [
+            include('comment'),
+            (r'\s+', Text),
+            (r'([a-z][a-z0-9 ]*)(:\s+)', bygroups(Name.Label, Text), '#pop'),
+            (r'', Text, '#pop'),
+        ]
+    }
+
diff --git a/elan.vim b/elan.vim
new file mode 100644
index 0000000..bd0f4c8
--- /dev/null
+++ b/elan.vim
@@ -0,0 +1,51 @@
+" Vim syntax file
+" Copy to ~/.vim/syntax/ and enable with :set filetype=elan
+" Language: ELAN
+" Maintainer: Lars-Dominik Braun <lars+eumel@6xq.net>
+" Latest Revision: 2019-02-07
+
+if exists("b:current_syntax")
+  finish
+endif
+
+syn keyword elanStatement PROC ENDPROC OP PACKET ENDPACKET LEAVE WITH END LET DEFINES
+syn keyword elanConditional IF ELSE FI THEN SELECT OF ELIF
+syn keyword elanRepeat FOR FROM UPTO REP PER WHILE UNTIL
+syn keyword elanBoolean TRUE FALSE
+syn keyword elanType DATASPACE INT TEXT BOOL THESAURUS FILE REAL
+syn match   elanOperator      ":="
+syn match   elanOperator      "::"
+syn match   elanOperator      "\*"
+syn match   elanOperator      "<>"
+syn keyword elanOperator AND OR CAND COR NOT XOR
+syn keyword elanOperator DIV MUL ISUB INCR DECR MOD SUB LENGTH CAT LIKE CONTAINS
+syn keyword elanStorageClass VAR CONST BOUND ROW
+syn keyword elanStructure STRUCT TYPE
+syn keyword elanLabel CASE OTHERWISE
+syn match   elanNumber		"-\=\<\d\+\>"
+syn match	elanFloat		"\d\+\.\d\+"
+
+syn region elanComment	start=+(\*+  end=+\*)+
+" XXX: tried to fix strings containing numbers that are not escapes, like "2"; the escape match below is commented out (presumably for that reason).
+syn region elanString start=+"+rs=s+1 end=+"+re=e-1 contains=elanStringEscape
+"syn match  elanStringEscape	contained +"[0-9]\+"+
+
+
+hi def link elanBoolean		Boolean
+hi def link elanConditional	Conditional
+hi def link elanRepeat		Repeat
+hi def link elanType		Type
+hi def link elanComment		Comment
+hi def link elanOperator	Operator
+hi def link elanString		String
+hi def link elanStringEscape	Special
+hi def link elanStorageClass	StorageClass
+hi def link elanStructure		Structure
+hi def link elanLabel		Label
+hi def link elanStatement Statement
+hi def link elanNumber Number
+hi def link elanFloat Float
+
+let b:current_syntax = "elan"
+
+
diff --git a/extractAll.sh b/extractAll.sh
index 8b8649f..5870e1f 100755
--- a/extractAll.sh
+++ b/extractAll.sh
@@ -13,7 +13,7 @@ while read -r F; do
 	pushd "$destdir" || continue
 	for G in ./*; do
 		echo "Converting $G to ${G}.txt"
-		$root/convertFileDs.py "$G" > "${G}.txt" || rm "${G}.txt"
+		$root/convertFileDs.py "$G" > "${G}.txt" && touch -r "${G}" "${G}.txt" && rm "${G}" || rm "${G}.txt"
 	done
 	popd
 	rm "$linear"
diff --git a/extractHintergrund.py b/extractHintergrund.py
new file mode 100755
index 0000000..5795d8d
--- /dev/null
+++ b/extractHintergrund.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+
+"""
+Extract EUMEL Hintergrund floppy disk image. Known to work only with version
+1.8 images.
+"""
+
+import os, logging
+from enum import IntEnum, unique
+from operator import attrgetter
+
+from eumel import pagesize
+
+from construct import Struct, Const, Padding, PaddedString, Int8ul, Int16ul, \
+        Int24ul, Int32ul, Flag, Computed, this, Array, BitStruct, Bitwise, \
+        BitsInteger, Embedded, Nibble, Sequence, Enum
+
+hgIdent = Struct(
+    "signature" / Const(b"EUMEL-"),
+    "version" / PaddedString(6, "ascii"),
+    Padding(1),
+    "isShutup" / Int8ul * "true if value is 0", # XXX
+    "bootCount" / Int16ul,
+    Padding(0x24) * "undocumented",
+    "_hgblocks2" / Int16ul,
+    Padding(0x50) * "unknown/undocumented",
+    "_hgblocks" / Int16ul,
+    "_plusident" / Int16ul,
+    "isPlus" / Computed(lambda ctx: ctx._hgblocks == 1 and ctx._plusident == 0), # lambda: `and` on `this` is decided at definition time
+    "blocks" / Computed(lambda ctx: ctx._hgblocks if ctx.isPlus else ctx._hgblocks2), # XXX: this is not correct
+    ) * "First block of Hintergrund"
+
+blockref = Struct( # four byte block reference; a value of 0xffffff is treated as "no block" by the code below
+    "value" / Int24ul, # block number; multiplied by pagesize to get the offset in the image
+    "control" / Int8ul, # not interpreted by this script
+    )
+
+anchor = Struct(
+    Const(b"\xff"*4),
+    "akttab" / blockref,
+    "clorX" / blockref,
+    Const(b"\xff"*4*3),
+    "taskRoot" / blockref, # block holding the block table with one entry per task
+    Const(b"\xff"*4),
+    ) * "System anchor block"
+
+assert pagesize//blockref.sizeof() == 128 # a page must hold 128 block references
+blockTable = Array(pagesize//blockref.sizeof(), blockref)
+
+# XXX: skip const. Parse result is [const, refs]; only the 14 refs (index 1) are used
+segmentTable = Sequence (Const (2*blockref.sizeof ()*b'\xff'), Array (14, blockref))
+
+drinfo = Struct(
+    "count" / blockref * "Number of blocks/pages allocated",
+    "blocks" / Array(3, blockref) * "Direct block references for page 1, 2 and 3",
+    "blockTables" / Array (2, blockref) * "Block references to block tables",
+    "segmentTables" / Array (2, blockref) * "Block references to segment tables, which refer to block tables",
+    ) * "Dataspace descriptor"
+
+# Layout of the Leitblock (pcb). See src/devel/misc/unknown/src/XSTATUS.ELA
+# EUMEL’s pcb function returns the 16 bit word at position (0x1e+2*<id>)%0x40
+# e.g. module is pcb(23) → (0x1e+2*23)%0x40 = 0x0c, its offset in this struct
+pcb = Struct(
+    "wstate" / Int32ul,
+    "millis" / Int8ul,
+    "unknown" / BitStruct (
+        "unused" / Flag, # bit 7
+        Padding(6),
+        "comflag" / Flag, # bit 0
+        ),
+    "status" / Int8ul,
+    "statusflags" / Int8ul * "unknown status flags",
+    "pricnt" / Int8ul,
+    "_icount" / Int16ul,
+    "flags" / BitStruct( # XXX: embedding BitStruct is not possible
+        "iserror" / Flag, # bit 7
+        "disablestop" / Flag, # bit 6
+        Padding(1),
+        "arith" / Flag, # bit 4
+        Padding(2),
+        "_codesegment" / BitsInteger(2), # bits 0…1
+        ),
+    "icount" / Computed(this._icount | (this.flags._codesegment<<16)), # XXX: byte-swapping 18 bit int is not possible? is codesegment low/high bits of icount?
+    "module" / Int16ul,
+    "pbase" / Int8ul,
+    "c8k" / Int8ul,
+    "lbase" / Int16ul,
+    "ltop" / Int16ul,
+    "lsTop" / Int16ul,
+    "heap" / BitStruct( # XXX: is this just a 16 bit pointer?
+        "top" / BitsInteger(12), # XXX: incorrect byte order
+        "segment" / Nibble, # bit 0…3
+        ),
+    Padding(4),
+    "priclk" / Int8ul,
+    "priv" / Int8ul,
+    Padding(2),
+    "linenr" / Int16ul, # ↓ See library/entwurf-systemdokumentation-1982.djvu section 2.4.13 (page 29)
+    "errorline" / Int16ul,
+    "errorcode" / Int16ul,
+    "channel" / Int16ul,
+    Padding(2), # XXX: sure about this padding?
+    "prio" / Int16ul,
+    "msgcode" / Int16ul,
+    "msgds" / Int16ul,
+    "taskid" / Int16ul,
+    "version" / Int16ul,
+    "fromid" / Int32ul,
+    Padding(8) * "unknown",
+    Padding(64) * "usually ff",
+    ) * "Leitblock"
+assert pcb.sizeof() == 4*drinfo.sizeof(), (pcb.sizeof(), drinfo.sizeof()) # a pcb takes the place of the first four drinfo slots
+
+class CpuType (IntEnum): # known values of the "cputype" field of urladerlink
+    Z80 = 1
+    INTEL8088 = 3
+    M68K = 1024
+
+urladerlink = Struct ( # parsed at image offset 0x1400 by the main script
+    "signature" / Const(b'EUMEL' + b' '*11), # "EUMEL" padded with spaces to 16 bytes
+    "blocks" / Int16ul,
+    "hgver" / Int16ul,
+    "cputype" / Enum (Int16ul, CpuType), # 16 bit value decoded through CpuType
+    "urver" / Int16ul,
+    Padding (2),
+    "shdvermin" / Int16ul,
+    "shdvermax" / Int16ul,
+    ) * "Urlader Linkleiste"
+
+def copyblock (block, infd, outfd):
+    """ Append one page to outfd: block number `block` of infd, or 0xff filler for the empty reference 0xffffff """
+    if block != 0xffffff:
+        logging.debug (f'copying block {block}@{block*pagesize:x}h')
+        infd.seek (block*pagesize, os.SEEK_SET)
+        page = infd.read (pagesize)
+        assert len (page) == pagesize
+    else:
+        logging.debug (f'copying empty block')
+        page = b'\xff'*pagesize
+    count = outfd.write (page)
+    assert count == pagesize
+
+def copyBlockTable (block, infd, outfd, skip=0):
+    """ Copy every page referenced by block table `block` of infd to outfd, leaving out the first `skip` entries; the empty reference 0xffffff yields 0xff filler """
+    if block != 0xffffff:
+        logging.debug (f'copying block table {block}@{block*pagesize:x}h, skipping {skip}')
+        infd.seek (block*pagesize, os.SEEK_SET)
+        for i, refl2 in enumerate (blockTable.parse_stream (infd)):
+            if i >= skip:
+                copyblock (refl2.value, infd, outfd)
+    else:
+        logging.debug (f'copying empty block table')
+        outfd.write (b'\xff'*(pagesize*(blockTable.sizeof()//blockref.sizeof()-skip)))
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Extract EUMEL Hintergrund.')
+    parser.add_argument('-v', '--verbose', action='store_true', help='Verbose debugging output')
+    parser.add_argument('input', metavar='FILE', type=argparse.FileType('rb'), help='Input file')
+
+    args = parser.parse_args()
+
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    with args.input as fd:
+        # ident (first block) and Urlader link, both only logged
+        logging.info (hgIdent.parse_stream (fd))
+        fd.seek (0x1400, os.SEEK_SET)
+        logging.info (urladerlink.parse_stream (fd))
+
+        fd.seek (pagesize)
+        a = anchor.parse_stream (fd)
+
+        # task root (level 1)
+        fd.seek (a.taskRoot.value*pagesize)
+        taskRoot = blockTable.parse_stream (fd)
+
+        # task dataspaces(?) (level 2)
+        for taskid, taskref in enumerate (taskRoot):
+            if taskref.value == 0xffffff:
+                continue
+            logging.info (f'task {taskid} is at {taskref.value} 0x{taskref.value*pagesize:x}')
+
+            fd.seek (taskref.value*pagesize)
+            dataspaces = blockTable.parse_stream (fd)
+
+            for dsidhigh, dsref in enumerate (dataspaces):
+                if dsref.value == 0xffffff:
+                    continue
+                logging.info (f'\ttaskid {taskid} dsid {dsidhigh<<4} is at {dsref.value} 0x{dsref.value*pagesize:x}')
+
+                # pcb and drinfo (level 3); the first block starts with a pcb in place of drinfo 0…3
+                fd.seek (dsref.value*pagesize)
+                drinfoStart = 0
+                if dsidhigh == 0:
+                    p = pcb.parse_stream (fd)
+                    logging.info (f'\t+pcb taskid {p.taskid} version {p.version} icount {p.icount:x} arith {p.flags.arith} disablestop {p.flags.disablestop} iserror {p.flags.iserror} pbase {p.pbase:x} module {p.module}')
+                    drinfoStart = 4
+                logging.info (f'\t\tdrinfo starting at {fd.tell():x}')
+                for dsidlow in range (drinfoStart, 16):
+                    dsid = dsidlow | dsidhigh << 4
+                    d = drinfo.parse_stream (fd)
+                    if d.count.value != 0xffffff and d.count.value != 0:
+                        # pbt (page block table) 1/2 contain block refs for pages 0…127 and 128…256
+                        # pst (page segment table) 1/2 contain block refs to page block tables for pages > 256
+                        logging.info (f'\t\tdrinfo {dsid} #{d.count.value} @ {[x.value for x in d.blocks]}, ind {[x.value for x in d.blockTables]}, ind2 {[x.value for x in d.segmentTables]}')
+
+                        pos = fd.tell () # copying seeks around in fd; remember where the next drinfo starts
+                        with open (f'{taskid:04d}_{dsid:04d}.ds', 'wb') as outfd:
+                            os.ftruncate (outfd.fileno(), 0)
+
+                            # the first page of a dataspace is used by the OS
+                            # and not stored to the Hintergrund
+                            outfd.seek (pagesize)
+
+                            # get the first three pages
+                            for ref in d.blocks:
+                                copyblock (ref.value, fd, outfd)
+
+                            # indirect block refs (level 4a)
+                            assert len (d.blockTables) == 2
+                            # first four entries of first table are empty and must not be written!
+                            copyBlockTable (d.blockTables[0].value, fd, outfd, 4)
+                            copyBlockTable (d.blockTables[1].value, fd, outfd)
+
+                            # segment tables (level 4b)
+                            for segref in d.segmentTables:
+                                if segref.value != 0xffffff:
+                                    fd.seek (segref.value*pagesize, os.SEEK_SET)
+                                    segtbl = segmentTable.parse_stream (fd)
+                                    for ref in segtbl[1]:
+                                        copyBlockTable (ref.value, fd, outfd)
+                                else:
+                                    outfd.write((14*128*pagesize)*b'\xff')
+
+                            # 2*128 pages through block table, 2 segment tables with 14 refs to block tables each
+                            expectedSize = (2*128+2*14*128)*pagesize
+                            assert outfd.tell() == expectedSize, (outfd.tell(), expectedSize)
+                        fd.seek (pos, os.SEEK_SET) # rewind only after copying; an empty drinfo leaves fd at the next one already
+