import io
import re
import struct
import sys

# Guard the interpreter version with an explicit check: a bare `assert`
# would be silently stripped under `python -O`, letting unsupported
# interpreters run on and fail later in obscure ways.
if sys.hexversion < 0x3050300:
    raise RuntimeError("Invalid Python version, must be 3.5.3+")
# Keyword table for MSX-BASIC single-byte tokens: tokenize() maps the entry
# at index k to the token byte 0x80 + k (see the tok_translate construction
# there). Empty strings mark unassigned token values. The entries after
# "POINT" are operators/punctuation that also tokenize to single bytes.
tokens1MSX = [
    "", "END", "FOR", "NEXT", "DATA", "INPUT", "DIM", "READ", "LET",
    "GOTO", "RUN", "IF", "RESTORE", "GOSUB", "RETURN", "REM", "STOP",
    "PRINT", "CLEAR", "LIST", "NEW", "ON", "WAIT", "DEF", "POKE", "CONT",
    "CSAVE", "CLOAD", "OUT", "LPRINT", "LLIST", "CLS", "WIDTH", "ELSE",
    "TRON", "TROFF", "SWAP", "ERASE", "ERROR", "RESUME", "DELETE",
    "AUTO", "RENUM", "DEFSTR", "DEFINT", "DEFSNG", "DEFDBL", "LINE",
    "OPEN", "FIELD", "GET", "PUT", "CLOSE", "LOAD", "MERGE", "FILES",
    "LSET", "RSET", "SAVE", "LFILES", "CIRCLE", "COLOR", "DRAW", "PAINT",
    "BEEP", "PLAY", "PSET", "PRESET", "SOUND", "SCREEN", "VPOKE",
    "SPRITE", "VDP", "BASE", "CALL", "TIME", "KEY", "MAX", "MOTOR",
    "BLOAD", "BSAVE", "DSKO$", "SET", "NAME", "KILL", "IPL", "COPY", "CMD",
    "LOCATE", "TO", "THEN", "TAB(", "STEP", "USR", "FN", "SPC(", "NOT",
    "ERL", "ERR", "STRING$", "USING", "INSTR", "'", "VARPTR", "CSRLIN",
    "ATTR$", "DSKI$", "OFF", "INKEY$", "POINT", ">", "=", "<", "+", "-", "*",
    "/", "^", "AND", "OR", "XOR", "EQV", "IMP", "MOD", "\\", "", "", "",
]
# Keyword table for MSX-BASIC two-byte tokens (mostly functions): tokenize()
# maps the entry at index k to the byte pair 0xFF, 0x80 + k. Empty strings
# mark unassigned token values; the table is padded to 128 entries.
tokens2 = [
    "", "LEFT$", "RIGHT$", "MID$", "SGN", "INT", "ABS", "SQR", "RND", "SIN",
    "LOG", "EXP", "COS", "TAN", "ATN", "FRE", "INP", "POS", "LEN", "STR$",
    "VAL", "ASC", "CHR$", "PEEK", "VPEEK", "SPACE$", "OCT$", "HEX$",
    "LPOS", "BIN$", "CINT", "CSNG", "CDBL", "FIX", "STICK", "STRIG", "PDL",
    "PAD", "DSKF", "FPOS", "CVI", "CVS", "CVD", "EOF", "LOC", "LOF", "MKI$",
    "MKS$", "MKD$", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
]
- trans = (
- '\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F'
- '\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F'
- ' !"#$%&\'()*+,-./0123456789:;<=>?'
- '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_'
- '`abcdefghijklmnopqrstuvwxyz{|}~\x7F'
- 'ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒ'
- 'áíóúñѪº¿⌐¬½¼¡«»ÃãĨĩÕõŨũIJij¾∽◊‰¶§'
- '▂▚▆���▎▞�����������▘▗▝▖�Δ‡ω█▄▌▐▀'
- 'αßΓπΣσµτΦΘΩδ∞φε∩≡±≥≤⌠⌡÷≈°∙·√ⁿ²■�'
- )
- ModeFloat = 0
- ModeUint = 1
- ModeLiteral = 2
- msx_s2b = {}
- msx_b2s = {}
- for k, v in enumerate(trans):
- b = bytes((k,))
- msx_s2b[v] = b
- msx_b2s[b] = v
- def encode(s):
- return b''.join([msx_s2b[c] for c in s])
- class BasicError(Exception):
- pass
- def num2bytes(s, tok_mode):
- """Translate a decimal number string to floating point"""
- assert tok_mode != ModeLiteral
- assert b'e' not in s, "Lower-case exponent in number"
-
- s = s.replace(b' ', b'').lstrip(b'0')
-
- if tok_mode == ModeUint:
- return struct.pack('<BH', 0x0E, int(s))
-
- suffix = s[-1:]
- if suffix in {b'%', b'!', b'#'}:
- s = s[:-1]
- else:
- suffix = b''
-
- point = s.find(b'.')
- s = s.replace(b'.', b'')
- ep = s.upper().find(b'E')
- exp = 0
- has_exp = ep != -1
- if not has_exp:
- ep = len(s)
- else:
- tmp = s[ep + 1:]
- s = s[:ep]
- if tmp not in {b'', b'+', b'-'}:
- exp = int(tmp, 10)
- has_point = point != -1
- if point == -1:
- point = ep
- exp += point + 0x40
-
- orig_len = len(s)
- s = s.lstrip(b'0')
- exp -= orig_len - len(s)
- point -= orig_len - len(s)
-
- s += b'0' * (14 - len(s))
-
- nint = 0
- if point > 0:
-
- nint = int(s[:point])
- if point <= 14 and (s[point:14] == b'9' * (14 - point)
- and s[14:15] >= b'5'):
- nint += 1
- if suffix == b'%' or (not has_point and not has_exp and suffix == b''
- and 0 <= nint <= 32767):
- if not (0 <= nint <= 32767):
- raise BasicError("Overflow")
- if nint < 10:
- return struct.pack('<B', 0x11 + nint)
- if nint < 256:
- return struct.pack('<BB', 0x0F, nint)
- return struct.pack('<Bh', 0x1C, nint)
- n = int(s[:14])
- if suffix == b'!' or (n % 100000000 == 0 and suffix != b'#'):
-
- n = (n + 50000000) // 100000000
- if n > 999999:
-
-
-
- n //= 10
- assert 0 <= n <= 999999
- if n == 0: exp = 0
- if exp > 127 or exp < 0:
- raise BasicError("Overflow")
- return struct.pack('>BL', 0x1D, int(str(n), 16) + exp * 0x1000000)
- if s[14:15] >= b'5':
- n += 1
- if n > 99999999999999:
- exp += 1
- n //= 10
- assert 0 <= n <= 99999999999999
- if n == 0: exp = 0
- if exp > 127 or exp < 0:
- raise BasicError("Overflow")
- return struct.pack('>BLL', 0x1F,
- int(str(n // 100000000), 16) + exp * 0x1000000,
- int(str(n % 100000000), 16))
def tokenize(src, use_cr=False, type_mode=False, remove_spaces=False):
    """Tokenize MSX-BASIC ASCII source into the binary in-memory format.

    Returns the tokenized program image as loaded at address 0x8001
    (without the leading 0xFF file marker byte).

    Options:
    type_mode: True if it should behave like typing in a program; False if
        it should behave like LOAD. When True:
        - Spaces at EOL are removed.
        - Entering line numbers >= 65530 raises Syntax Error.
    use_cr: True to split lines at CR and ignore LF (like the real thing),
        False to split lines at LF and ignore CR (multiplatform).
    remove_spaces: True to "minify" the lines by removing all spaces between
        tokens.

    Raises BasicError for BASIC-level errors (direct statement in file,
    overflow, syntax error, undefined line number).
    """
    src = encode(src)
    LeadingASCII = re.compile(b'^[A-Z]+')

    # Map keyword text -> token bytes. tokens1MSX entries are single-byte
    # tokens 0x80+k; tokens2 entries are two-byte tokens 0xFF, 0x80+k.
    tok_translate = {}
    for k, v in enumerate(tokens1MSX):
        v = encode(v)
        assert k < 128 and v not in tok_translate
        if v != b'':
            tok_translate[v] = bytes((k + 128,))
    for k, v in enumerate(tokens2):
        v = encode(v)
        assert k < 128 and v not in tok_translate
        if v != b'':
            tok_translate[v] = bytes((255, k + 128))

    # "'" is stored as ":" + REM token + "'" token.
    tok_translate[b"'"] = b':' + tok_translate[b'REM'] + tok_translate[b"'"]
    # "?" is shorthand for PRINT.
    tok_translate[b'?'] = tok_translate[b'PRINT']
    # "GO TO" (with a space) is accepted as GOTO.
    tok_translate[b'GO TO'] = tok_translate[b'GOTO']

    # Build the keyword alternation, longest first so that e.g. "INPUT"
    # wins over "INP".
    L = []
    for v in tokens1MSX + tokens2 + ['GO TO']:
        v = encode(v)
        if LeadingASCII.search(v):
            L.append(v)
    L.sort(key=len, reverse=True)
    L = [re.escape(v) for v in L]

    # Decimal number in expression context; digits may have embedded
    # spaces, and an optional type suffix or exponent may follow.
    decimal = (
        br'|(?P<dec>(?:'
        br'(?:[0-9][0-9\ ]*)?\.(?:[0-9\ ]*[0-9])?'
        br'|(?:[0-9](?:[0-9\ ]*[0-9])?)'
        br')'
        br'(?:'
        br'\ *[%!\#]'
        br'|\ *E\ *[-+]?(?:[0-9\ ]*[0-9])?'
        br')?)'
    )

    # Unsigned integer in line-number context, capped at 65529.
    uint = (br'|(?P<dec>'
            br'(?:0[0\ ]*)?'
            br'(?:0'
            br'|[1-5](?:\ *[0-9]){,4}'
            br'|6\ *[0-4](?:\ *[0-9]){,3}'
            br'|6\ *5\ *[0-4](?:\ *[0-9]){,2}'
            br'|6\ *5\ *5\ *[0-2](?:\ *[0-9])?'
            br'|[6-9](?:\ *[0-9]){,3}'
            br'))'
            )

    # Master tokenizer template; %s is replaced by one of the number
    # sub-patterns above depending on the current mode.
    tokenizer = (br'(?:'
                 br"(?P<rem>(?:REM|').*)"
                 br'|(?P<call>(?:CALL|_)[^:(]*)'
                 br'|(?P<kw>' + br'|'.join(L) + br')'
                 br'|[A-Z](?:\ *[0-9.])*'
                 br'|&H(?P<hex>[0-9A-F]*)'
                 br'|&O(?P<oct>[0-7]*)'
                 br'%s'
                 br'|(?P<str>"(?:[^"]*)(?:"|$))'
                 b'|(?P<del>[\x80-\xFF])'
                 br'|.'
                 br')'
                 )
    tokenizer_uint = re.compile(tokenizer % uint, re.I)
    tokenizer_float = re.compile(tokenizer % decimal, re.I)
    tokenizer_literal = re.compile(tokenizer % b'|(?P<dec>(?!))', re.I)
    del L, uint, decimal, LeadingASCII, tokenizer

    # Skips blanks/control/high bytes and captures one optional digit; used
    # to detect extra digits after a complete (maximal) line number.
    leadingdigit = re.compile(br'[\ \x01-\x1F\x80-\xFF]*(?:([0-9]?))')
    # A NUL truncates the rest of the line.
    trunc_at_null = re.compile(br'\x00.*', re.S)
    # Strips junk from CALL statement names.
    # NOTE(review): the character class range starts at '0' (0x30);
    # presumably \x00 was intended — behavior preserved as-is, confirm.
    call_strip = re.compile(br'[^0-\x7F\ (]*')

    buf = io.BytesIO()

    # A Ctrl-Z truncates the file (CP/M-style EOF marker).
    src = re.sub(b'\x1A.*', b'', src, flags=re.S)
    if use_cr:
        src = src.split(b'\r')
        ignore = b'\n'
    else:
        src = src.split(b'\n')
        ignore = b'\r'

    # Tokenized lines, keyed by line number (so later lines replace
    # earlier ones with the same number, like the real interpreter).
    PRGLines = {}
    for line in src:
        line = trunc_at_null.sub(b'', line.replace(ignore, b'').lstrip(b' '))
        if type_mode: line = line.rstrip(b' ')
        if line == b'':
            continue
        g = tokenizer_uint.match(line)
        # Every line in a file must start with a line number.
        if not g.group('dec'):
            raise BasicError("Direct statement in file")
        p = g.end()
        linenum = int(g.group().replace(b' ', b''))
        if linenum != 0:
            # Swallow one space separating the number from the statement.
            # NOTE(review): the original layout skips this for line 0 —
            # confirm that is the intended behavior.
            if line[p:p+1] == b' ':
                p += 1
        g = leadingdigit.match(line, p)
        if type_mode:
            # An extra digit here means the typed line number was >= 65530
            # (the uint pattern stops matching at 65529).
            if g.group(1): raise BasicError("Syntax error")
        if not g.group(1) and g.end(0) == len(line):
            # Nothing but blanks after the line number: delete the line.
            if linenum not in PRGLines:
                raise BasicError("Undefined line number")
            del PRGLines[linenum]
            continue

        lbuf = io.BytesIO()
        tok_mode = ModeFloat
        while True:
            nextch = line[p:p+1]
            # A colon or a letter ends line-number context.
            if tok_mode == ModeUint and (nextch == b':'
                                         or b'A' <= nextch <= b'Z'
                                         or b'a' <= nextch <= b'z'):
                tok_mode = ModeFloat
            if p == len(line): break
            tokenizer = tokenizer_float
            if tok_mode == ModeUint:
                tokenizer = tokenizer_uint
            elif tok_mode == ModeLiteral:
                tokenizer = tokenizer_literal
            g = tokenizer.match(line, p)
            assert g, "No match in tokenizer for %r" % line[p:p+1]
            match = g.group().upper()
            p = g.end()

            if match in {b'\x01', b'\x02', b'\x03', b'\x04', b'\x05',
                         b'\x06', b'\x07', b'\x08'}:
                # \x01-\x08 prefix an extended grapheme in the MSX
                # encoding; drop the prefix and the token that follows it.
                g = tokenizer.match(line, p)
                p = g.end()
                continue
            if tok_mode != ModeUint and match in {b'\x0B', b'\x0C', b'\x0E',
                    b'\x0F', b'\x10', b'\x11', b'\x12', b'\x13', b'\x14',
                    b'\x15', b'\x16', b'\x17', b'\x18', b'\x19', b'\x1B',
                    b'\x1C', b'\x1D', b'\x1E', b'\x1F'}:
                # Other control codes act as a plain space separator.
                lbuf.write(b' ')
                tok_mode = ModeFloat
                continue

            if g.group('rem'):
                # Comment: token + verbatim (original-case) tail.
                if g.group().startswith(b"'"):
                    lbuf.write(tok_translate[b"'"])
                    lbuf.write(g.group()[1:])
                else:
                    lbuf.write(tok_translate[b'REM'])
                    lbuf.write(g.group()[3:])
            elif g.group('call'):
                # CALL/underscore extended statement.
                match = call_strip.sub(b'', match)
                if match.startswith(b'_'):
                    lbuf.write(match)
                else:
                    lbuf.write(tok_translate[b'CALL'])
                    lbuf.write(match[4:])
            elif g.group('kw'):
                # Keyword; jump keywords switch to line-number mode.
                if match in {b'GOTO', b'GO TO', b'GOSUB', b'THEN', b'ELSE'}:
                    tok_mode = ModeUint
                lbuf.write(tok_translate[match])
            elif g.group('hex') is not None:
                # &H hexadecimal constant: 0x0C + uint16.
                lbuf.write(b'\x0C')
                num = int(g.group('hex'), 16) if g.group('hex') != b'' else 0
                if num > 65535:
                    raise BasicError("Overflow")
                lbuf.write(struct.pack('<H', num))
            elif g.group('oct') is not None:
                # &O octal constant: 0x0B + uint16.
                lbuf.write(b'\x0B')
                num = int(g.group('oct'), 8) if g.group('oct') != b'' else 0
                if num > 65535:
                    raise BasicError("Overflow")
                lbuf.write(struct.pack('<H', num))
            elif g.group('dec'):
                # Decimal constant; encoding depends on the current mode.
                lbuf.write(num2bytes(match, tok_mode))
            elif g.group('str'):
                # String literal, copied verbatim.
                lbuf.write(g.group())
            elif g.group('del'):
                # Bytes >= 0x80 in the input are dropped.
                pass
            else:
                if remove_spaces and match == b' ':
                    continue
                # A lone letter/digit starts (or continues) an identifier:
                # suppress number recognition until it ends.
                if len(match) == 1 and (b'A' <= match <= b'Z'
                                        or b'0' <= match <= b'9'):
                    tok_mode = ModeLiteral
                elif tok_mode == ModeLiteral:
                    tok_mode = ModeFloat
                if match in tok_translate:
                    lbuf.write(tok_translate[match])
                else:
                    lbuf.write(match)
        lbuf.write(b'\0')
        PRGLines[linenum] = lbuf.getvalue()

    # Assemble the memory image: each line is <link><linenum><body>\0 where
    # link is the address of the next line (program loads at 0x8001); a
    # zero link word terminates the program.
    addr = 0x8001
    for linenum in sorted(PRGLines.keys()):
        line = PRGLines[linenum]
        addr += len(line) + 4
        buf.write(struct.pack('<HH', addr, linenum))
        buf.write(line)
    buf.write(b'\0\0')
    return buf.getvalue()
def main():
    """Command-line entry point.

    Usage: argv[1] = input ASCII BASIC file, argv[2] = output tokenized
    file, optional argv[3] = '1' to split input lines at CR instead of LF.
    """
    # newline='' keeps CR/LF bytes intact so tokenize() can do its own
    # line splitting.
    with open(sys.argv[1], 'r', newline='') as f:
        lines = f.read()
    out = tokenize(lines, use_cr=len(sys.argv) > 3 and sys.argv[3] == '1')
    # 0xFF is the tokenized-BASIC file marker byte.
    with open(sys.argv[2], 'wb') as f:
        f.write(b'\xFF' + out)


if __name__ == '__main__':
    main()