assemble.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598
  1. """
  2. Copyright (C) 2018 Alyssa Rosenzweig
  3. Copyright (c) 2013 Connor Abbott (connor@abbott.cx)
  4. Permission is hereby granted, free of charge, to any person obtaining a copy
  5. of this software and associated documentation files (the "Software"), to deal
  6. in the Software without restriction, including without limitation the rights
  7. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8. copies of the Software, and to permit persons to whom the Software is
  9. furnished to do so, subject to the following conditions:
  10. The above copyright notice and this permission notice shall be included in
  11. all copies or substantial portions of the Software.
  12. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  13. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  14. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  15. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  16. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  17. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  18. THE SOFTWARE.
  19. """
  20. import sys
  21. import pprint
  22. import struct
  23. program = []
  24. # Definitions from cwabbott's tools
  25. t6xx_alu_ops = {
  26. "fadd": 0x10,
  27. "fmul": 0x14,
  28. "fmin": 0x28,
  29. "fmax": 0x2C,
  30. "fmov": 0x30,
  31. "ffloor": 0x36,
  32. "fceil": 0x37,
  33. "fdot3": 0x3C,
  34. "fdot3r": 0x3D,
  35. "fdot4": 0x3E,
  36. "freduce": 0x3F,
  37. "iadd": 0x40,
  38. "isub": 0x46,
  39. "imul": 0x58,
  40. "imov": 0x7B,
  41. "feq": 0x80,
  42. "fne": 0x81,
  43. "flt": 0x82,
  44. "fle": 0x83,
  45. "f2i": 0x99,
  46. "ieq": 0xA0,
  47. "ine": 0xA1,
  48. "ilt": 0xA4,
  49. "ile": 0xA5,
  50. "ball": 0xA9,
  51. "bany": 0xB1,
  52. "i2f": 0xB8,
  53. "csel": 0xC5,
  54. "fatan_pt2": 0xE8,
  55. "frcp": 0xF0,
  56. "frsqrt": 0xF2,
  57. "fsqrt": 0xF3,
  58. "fexp2": 0xF4,
  59. "flog2": 0xF5,
  60. "fsin": 0xF6,
  61. "fcos": 0xF7,
  62. "fatan2_pt1": 0xF9,
  63. }
  64. t6xx_alu_bits = {
  65. "vmul": 17,
  66. "sadd": 19,
  67. "vadd": 21,
  68. "smul": 23,
  69. "lut": 25,
  70. "fb": 26,
  71. "branch": 27,
  72. "constants": 32
  73. }
  74. t6xx_alu_size_bits = {
  75. "vmul": 48,
  76. "sadd": 32,
  77. "vadd": 48,
  78. "smul": 32,
  79. "lut": 48,
  80. "fb": 16,
  81. "branch": 48
  82. }
  83. t6xx_outmod = {
  84. "none": 0,
  85. "pos": 1,
  86. "int": 2,
  87. "sat": 3
  88. }
  89. t6xx_reg_mode = {
  90. "half": 1,
  91. "full": 2
  92. }
  93. t6xx_dest_override = {
  94. "lower": 0,
  95. "upper": 1,
  96. "none": 2
  97. }
  98. t6xx_load_store_ops = {
  99. "ld_st_noop": 0x03,
  100. "ld_attr_16": 0x95,
  101. "ld_attr_32": 0x94,
  102. "ld_vary_16": 0x99,
  103. "ld_vary_32": 0x98,
  104. "ld_uniform_16": 0xAC,
  105. "ld_uniform_32": 0xB0,
  106. "st_vary_16": 0xD5,
  107. "st_vary_32": 0xD4
  108. }
  109. t6xx_tag = {
  110. "texture": 0x3,
  111. "load_store": 0x5,
  112. "alu4": 0x8,
  113. "alu8": 0x9,
  114. "alu12": 0xA,
  115. "alu16": 0xB,
  116. }
  117. def is_tag_alu(tag):
  118. return (tag >= t6xx_tag["alu4"]) and (tag <= t6xx_tag["alu16"])
  119. # Just an enum
  120. ALU = 0
  121. LDST = 1
  122. TEXTURE = 2
  123. # Constant types supported, mapping the constant prefix to the Python format
  124. # string and the coercion function
  125. constant_types = {
  126. "f": ("f", float),
  127. "h": ("e", float),
  128. "i": ("i", int),
  129. "s": ("h", int)
  130. }
  131. # TODO: Synthesise fbwrite stuff better
  132. fbwrite_op = 0x7
  133. # TODO: What else?
  134. texture_op = {
  135. "normal": 0x11,
  136. "texelfetch": 0x14
  137. }
  138. texture_fmt = {
  139. "2d": 0x02,
  140. "3d": 0x03
  141. }
  142. with open(sys.argv[1], "r") as f:
  143. for ln in f:
  144. space = ln.strip().split(" ")
  145. instruction = space[0]
  146. rest = " ".join(space[1:])
  147. arguments = [s.strip() for s in rest.split(",")]
  148. program += [(instruction, arguments)]
  149. swizzle_component = {
  150. "x": 0,
  151. "y": 1,
  152. "z": 2,
  153. "w": 3
  154. }
  155. def decode_reg_name(reg_name):
  156. ireg = 0
  157. upper = False
  158. half = False
  159. if reg_name[0] == 'r':
  160. ireg = int(reg_name[1:])
  161. elif reg_name[0] == 'h':
  162. rreg = int(reg_name[2:])
  163. # Decode half-register into its full register's half
  164. ireg = rreg >> 1
  165. upper = rreg & 1
  166. half = True
  167. else:
  168. # Special case for load/store addresses
  169. ireg = int(reg_name)
  170. return (ireg, half, upper)
  171. def standard_swizzle_from_parts(swizzle_parts):
  172. swizzle_s = swizzle_parts[1] if len(swizzle_parts) > 1 else "xyzw"
  173. swizzle = 0
  174. for (i, c) in enumerate(swizzle_s):
  175. swizzle |= swizzle_component[c] << (2 * i)
  176. return swizzle
  177. def mask_from_parts(mask_parts, large_mask):
  178. mask_s = mask_parts[1] if len(mask_parts) > 1 else "xyzw"
  179. if large_mask:
  180. mask = sum([(3 << (2*swizzle_component[c]) if c in mask_s else 0) for c in "xyzw"])
  181. else:
  182. mask = sum([(1 << swizzle_component[c] if c in mask_s else 0) for c in "xyzw"])
  183. return (mask, mask_s)
  184. def decode_reg(reg):
  185. if reg[0] == "#":
  186. # Not actually a register, instead an immediate float
  187. return (True, struct.unpack("H", struct.pack("e", float(reg[1:])))[0], 0, 0, 0, 0)
  188. # Function call syntax used in abs() modifier
  189. if reg[-1] == ')':
  190. reg = reg[:-1]
  191. swizzle_parts = reg.split(".")
  192. reg_name = swizzle_parts[0]
  193. modifiers = 0
  194. if reg_name[0] == '-':
  195. modifiers |= 2
  196. reg_name = reg_name[1:]
  197. if reg_name[0] == 'a':
  198. modifiers |= 1
  199. reg_name = reg_name[len("abs("):]
  200. (ireg, half, upper) = decode_reg_name(reg_name)
  201. return (False, ireg, standard_swizzle_from_parts(swizzle_parts), half, upper, modifiers)
  202. def decode_masked_reg(reg, large_mask):
  203. mask_parts = reg.split(".")
  204. reg_name = mask_parts[0]
  205. (ireg, half, upper) = decode_reg_name(reg_name)
  206. (mask, mask_s) = mask_from_parts(mask_parts, large_mask)
  207. component = max([0] + [swizzle_component[c] for c in "xyzw" if c in mask_s])
  208. return (ireg, mask, component, half, upper)
  209. # TODO: Fill these in XXX
  210. # Texture pipeline registers in r28-r29
  211. TEXTURE_BASE = 28
  212. def decode_texture_reg_number(reg):
  213. r = reg.split(".")[0]
  214. if r[0] == "r":
  215. return (True, int(r[1:]) - TEXTURE_BASE, 0)
  216. else:
  217. no = int(r[2:])
  218. return (False, (no >> 1) - TEXTURE_BASE, no & 1)
  219. def decode_texture_reg(reg):
  220. (full, select, upper) = decode_texture_reg_number(reg)
  221. # Swizzle mandatory for texture registers, afaict
  222. swizzle = reg.split(".")[1]
  223. swizzleL = swizzle_component[swizzle[0]]
  224. swizzleR = swizzle_component[swizzle[1]]
  225. return (full, select, upper, swizzleR, swizzleL)
  226. def decode_texture_out_reg(reg):
  227. (full, select, upper) = decode_texture_reg_number(reg)
  228. (mask, _) = mask_from_parts(reg.split("."), False)
  229. return (full, select, upper, mask)
  230. instruction_stream = []
  231. for p in program:
  232. ins = p[0]
  233. arguments = p[1]
  234. # for ALU, fbwrite, texture
  235. family = ins_mod = ins.split(".")[0]
  236. ins_op = (ins + ".").split(".")[1]
  237. ins_outmod = ("ins" + "." + ".").split(".")[2]
  238. if len(ins_outmod) == 0:
  239. ins_outmod = "none"
  240. out_mod = t6xx_outmod[ins_outmod]
  241. if ins in t6xx_load_store_ops:
  242. op = t6xx_load_store_ops[ins]
  243. (reg, mask, component, half, upper) = decode_masked_reg(p[1][0], False)
  244. (immediate, address, swizzle, half, upper, modifiers) = decode_reg(p[1][1])
  245. unknown = int(p[1][2], 16)
  246. b = (op << 0) | (reg << 8) | (mask << 13) | (swizzle << 17) | (unknown << 25) | (address << 51)
  247. instruction_stream += [(LDST, b)]
  248. elif ins_op in t6xx_alu_ops:
  249. op = t6xx_alu_ops[ins_op]
  250. (reg_out, mask, out_component, half0, upper0) = decode_masked_reg(p[1][0], True)
  251. (_, reg_in1, swizzle1, half1, upper1, mod1) = decode_reg(p[1][1])
  252. (immediate, reg_in2, swizzle2, half2, upper2, mod2) = decode_reg(p[1][2])
  253. if immediate:
  254. register_word = (reg_in1 << 0) | ((reg_in2 >> 11) << 5) | (reg_out << 10) | (1 << 15)
  255. else:
  256. register_word = (reg_in1 << 0) | (reg_in2 << 5) | (reg_out << 10)
  257. if ins_mod in ["vadd", "vmul", "lut"]:
  258. io_mode = t6xx_reg_mode["half" if half0 else "full"]
  259. repsel = 0
  260. i1half = half1
  261. i2block = 0
  262. output_override = 2 # NORMAL, TODO
  263. wr_mask = 0
  264. if half0:
  265. # TODO: half actually
  266. repsel = 2 * upper1
  267. else:
  268. repsel = upper1
  269. if half0:
  270. # Rare case...
  271. (_, halfmask, _, _, _) = decode_masked_reg(p[1][0], False)
  272. wr_mask = halfmask
  273. else:
  274. wr_mask = mask
  275. if immediate:
  276. # Inline constant: lower 11 bits
  277. i2block = ((reg_in2 & 0xFF) << 3) | ((reg_in2 >> 8) & 0x7)
  278. else:
  279. if half0:
  280. # TODO: replicate input 2 if half
  281. pass
  282. else:
  283. # TODO: half selection
  284. i2block = upper2 | (half2 << 2)
  285. i2block |= swizzle2 << 3
  286. instruction_word = (op << 0) | (io_mode << 8) | (mod1 << 10) | (repsel << 12) | (i1half << 14) | (swizzle1 << 15) | (mod2 << 23) | (i2block << 25) | (output_override << 36) | (out_mod << 38) | (wr_mask << 40)
  287. elif ins_mod in ["sadd", "smul"]:
  288. # TODO: What are these?
  289. unknown2 = 0
  290. unknown3 = 0
  291. i1comp_block = 0
  292. if half1:
  293. i1comp_block = swizzle1 | (upper1 << 2)
  294. else:
  295. i1comp_block = swizzle1 << 1
  296. i2block = 0
  297. if immediate:
  298. # Inline constant is splattered in a... bizarre way
  299. i2block = (((reg_in2 >> 9) & 3) << 0) | (((reg_in2 >> 8) & 1) << 2) | (((reg_in2 >> 5) & 7) << 3) | (((reg_in2 >> 0) & 15) << 6)
  300. else:
  301. # TODO: half register
  302. swizzle2 = (swizzle2 << 1) & 0x1F
  303. i2block = (mod2 << 0) | ((not half2) << 2) | (swizzle2 << 3) | (unknown2 << 5)
  304. outcomp_block = 0
  305. if True:
  306. outcomp_block = out_component << 1
  307. else:
  308. # TODO: half register
  309. pass
  310. instruction_word = (op << 0) | (mod1 << 8) | ((not half1) << 10) | (i1comp_block << 11) | (i2block << 14) | (unknown3 << 25) | (out_mod << 26) | ((not half0) << 28) | (outcomp_block) << 29
  311. else:
  312. instruction_word = op
  313. instruction_stream += [(ALU, ins_mod, register_word, instruction_word)]
  314. elif family == "texture":
  315. # Texture ops use long series of modifiers to describe their needed
  316. # capabilities, seperated by dots. Decode them here
  317. parts = ins.split(".")
  318. # First few modifiers are fixed, like an instruction name
  319. tex_op = parts[1]
  320. tex_fmt = parts[2]
  321. # The remaining are variable, but strictly ordered
  322. parts = parts[3:]
  323. op = texture_op[tex_op]
  324. # Some bits are defined directly in the modifier list
  325. shadow = "shadow" in parts
  326. cont = "cont" in parts
  327. last = "last" in parts
  328. has_filter = "raw" not in parts
  329. # The remaining need order preserved since they have their own arguments
  330. argument_parts = [part for part in parts if part not in ["shadow", "cont", "last", "raw"]]
  331. bias_lod = 0
  332. for argument, part in zip(argument_parts, arguments[4:]):
  333. if argument == "bias":
  334. bias_lod = int(float(part) * 256)
  335. else:
  336. print("Unknown argument: " + str(argument))
  337. fmt = texture_fmt[tex_fmt]
  338. has_offset = 0
  339. magic1 = 1 # IDEK
  340. texture_handle = int(arguments[1][len("texture"):])
  341. sampler_parts = arguments[2].split(".")
  342. sampler_handle = int(sampler_parts[0][len("sampler"):])
  343. swizzle0 = standard_swizzle_from_parts(sampler_parts)
  344. (full0, select0, upper0, mask0) = decode_texture_out_reg(arguments[0])
  345. (full1, select1, upper1, swizzleR1, swizzleL1) = decode_texture_reg(arguments[3])
  346. tex = (op << 0) | (shadow << 6) | (cont << 8) | (last << 9) | (fmt << 10) | (has_offset << 15) | (has_filter << 16) | (select1 << 17) | (upper1 << 18) | (full1 << 20) | (swizzleR1 << 21) | (swizzleL1 << 23) | (full0 << 29) | (magic1 << 30) | (select0 << 32) | (upper0 << 33) | (mask0 << 34) | (swizzle0 << 40) | (bias_lod << 72) | (texture_handle << 88) | (sampler_handle << 104)
  347. instruction_stream += [(TEXTURE, tex)]
  348. elif family == "fb":
  349. if ins_op == "write":
  350. # TODO: What does this argument mean, exactly?
  351. magic = int(p[1][0], 16)
  352. fb = fbwrite_op | (magic << 3)
  353. instruction_stream += [(ALU, "fb", None, fb)]
  354. else:
  355. print("Unknown fb op: " + str(ins_op))
  356. elif ins[1:] == "constants":
  357. if ins[0] not in constant_types:
  358. print("Unknown constant type " + str(constant_type))
  359. break
  360. (fmt, cast) = constant_types[ins[0]]
  361. encoded = [struct.pack(fmt, cast(f)) for f in p[1]]
  362. consts = bytearray()
  363. for c in encoded:
  364. consts += c
  365. # consts must be exactly 4 quadwords, so pad with zeroes if necessary
  366. consts += bytes(4*4 - len(consts))
  367. instruction_stream += [(ALU, "constants", consts)]
  368. # Emit from instruction stream
  369. instructions = []
  370. index = 0
  371. while index < len(instruction_stream):
  372. output_stream = bytearray()
  373. ins = instruction_stream[index]
  374. tag = ins[0]
  375. can_prefetch = index + 1 < len(instruction_stream)
  376. succeeding = None
  377. if tag == LDST:
  378. succeeding = instruction_stream[index + 1] if can_prefetch else None
  379. parta = ins[1]
  380. partb = None
  381. if succeeding and succeeding[0] == LDST:
  382. partb = succeeding[1]
  383. index += 1
  384. else:
  385. partb = parta
  386. parta = t6xx_load_store_ops["ld_st_noop"]
  387. tag8 = t6xx_tag["load_store"]
  388. ins = (partb << 68) | (parta << 8) | tag8
  389. output_stream += (ins.to_bytes(16, "little"))
  390. elif tag == TEXTURE:
  391. tag8 = t6xx_tag["texture"]
  392. ins = (ins[1] << 8) | tag8
  393. output_stream += (ins.to_bytes(16, "little"))
  394. elif tag == ALU:
  395. # TODO: Combining ALU ops
  396. emit_size = 4 # 32-bit tag always emitted
  397. tag = 0
  398. register_words = bytearray()
  399. body_words = bytearray()
  400. constant_words = None
  401. last_alu_bit = 0
  402. # Iterate through while there are ALU tags in strictly ascending order
  403. while index < len(instruction_stream) and instruction_stream[index][0] == ALU and t6xx_alu_bits[instruction_stream[index][1]] > last_alu_bit:
  404. ins = instruction_stream[index]
  405. bit = t6xx_alu_bits[ins[1]]
  406. last_alu_bit = bit
  407. if ins[1] == "constants":
  408. constant_words = ins[2]
  409. else:
  410. # Flag for the used part of the GPU
  411. tag |= 1 << bit
  412. # 16-bit register word, if present
  413. if ins[2] is not None:
  414. register_words += (ins[2].to_bytes(2, "little"))
  415. emit_size += 2
  416. size = int(t6xx_alu_size_bits[ins[1]] / 8)
  417. body_words += (ins[3].to_bytes(size, "little"))
  418. emit_size += size
  419. index += 1
  420. index -= 1 # fix off by one, from later loop increment
  421. # Pad to nearest multiple of 4 words
  422. padding = (16 - (emit_size & 15)) if (emit_size & 15) else 0
  423. emit_size += padding
  424. # emit_size includes constants
  425. if constant_words:
  426. emit_size += len(constant_words)
  427. # Calculate tag given size
  428. words = emit_size >> 2
  429. tag |= t6xx_tag["alu" + str(words)]
  430. # Actually emit, now that we can
  431. output_stream += tag.to_bytes(4, "little")
  432. output_stream += register_words
  433. output_stream += body_words
  434. output_stream += bytes(padding)
  435. if constant_words:
  436. output_stream += constant_words
  437. instructions += [output_stream]
  438. index += 1
  439. # Assmebly over; just emit tags at this point
  440. binary = bytearray()
  441. for (idx, ins) in enumerate(instructions):
  442. # Instruction prefetch
  443. tag = 0
  444. if idx + 1 < len(instructions):
  445. tag = instructions[idx + 1][0] & 0xF
  446. # Check for ALU special case
  447. if is_tag_alu(tag) and idx + 2 == len(instructions):
  448. tag = 1
  449. else:
  450. # Instruction stream over
  451. tag = 1
  452. ins[0] |= tag << 4
  453. binary += ins
  454. pprint.pprint(program)
  455. with open(sys.argv[2], "wb") as f:
  456. f.write(binary)