// License: GPLv3 Copyright: 2024, Kovid Goyal, <kovid at kovidgoyal.net>

//go:build ignore

// See https://www.quasilyte.dev/blog/post/go-asm-complementary-reference/
// for differences between AT&T and Go assembly

package main

import (
	"bytes"
	"fmt"
	"go/types"
	"io"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"unsafe"
)

var _ = fmt.Print

type Register struct {
	Name       string
	Size       int
	Restricted bool
}

func (r Register) String() string            { return r.Name }
func (r Register) ARMFullWidth() string      { return fmt.Sprintf("%s.B%d", r, r.Size/8) }
func (r Register) AddressInRegister() string { return fmt.Sprintf("(%s)", r) }

type Arch string

const (
	X86   Arch = "386"
	AMD64 Arch = "amd64"
	ARM64 Arch = "arm64"
)

type ISA struct {
	Bits                       int
	Goarch                     Arch
	Registers                  []Register
	UsedRegisters              map[Register]bool
	Sizes                      types.Sizes
	GeneralPurposeRegisterSize int
	HasSIMD                    bool
}
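
// ByteSlice is a sentinel BasicKind value, well outside the range used by
// go/types, that marks function parameters whose Go type is []byte.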
const ByteSlice types.BasicKind = 100001

func (isa *ISA) NativeAdd() string {
	if isa.Goarch == ARM64 {
		return "ADD"
	}
	if isa.GeneralPurposeRegisterSize == 32 {
		return "ADDL"
	}
	return "ADDQ"
}

func (isa *ISA) NativeSubtract() string {
	if isa.Goarch == ARM64 {
		return "SUB"
	}
	if isa.GeneralPurposeRegisterSize == 32 {
		return "SUBL"
	}
	return "SUBQ"
}

func (isa *ISA) add_regs(size int, names ...string) {
	for _, r := range names {
		isa.Registers = append(isa.Registers, Register{r, size, false})
	}
}

func (ans *ISA) add_x86_regs() {
	ans.add_regs(ans.GeneralPurposeRegisterSize, `AX`, `BX`, `DX`, `SI`, `DI`, `BP`)
	if ans.GeneralPurposeRegisterSize == 64 {
		ans.add_regs(ans.GeneralPurposeRegisterSize, `R8`, `R9`, `R10`, `R11`, `R12`, `R13`, `R14`, `R15`)
	}
	// CX is used for shift and rotate instructions
	ans.Registers = append(ans.Registers, Register{`CX`, ans.GeneralPurposeRegisterSize, true})
	// SP is the stack pointer used by the Go runtime
	ans.Registers = append(ans.Registers, Register{`SP`, ans.GeneralPurposeRegisterSize, true})
	ans.add_regs(128, `X0`, `X1`, `X2`, `X3`, `X4`, `X5`, `X6`, `X7`, `X8`, `X9`, `X10`, `X11`, `X12`, `X13`, `X14`, `X15`)
	if ans.Goarch == AMD64 {
		ans.add_regs(256,
			`Y0`, `Y1`, `Y2`, `Y3`, `Y4`, `Y5`, `Y6`, `Y7`, `Y8`, `Y9`, `Y10`, `Y11`, `Y12`, `Y13`, `Y14`, `Y15`)
	}
}

func Createi386ISA(bits int) ISA {
	ans := ISA{
		Bits:                       bits,
		GeneralPurposeRegisterSize: 32,
		Goarch:                     X86,
		Sizes:                      types.SizesFor(runtime.Compiler, string(X86)),
		HasSIMD:                    bits == 128,
	}
	ans.add_x86_regs()
	return ans
}

func CreateAMD64ISA(bits int) ISA {
	ans := ISA{
		Bits:                       bits,
		GeneralPurposeRegisterSize: 64,
		Goarch:                     AMD64,
		Sizes:                      types.SizesFor(runtime.Compiler, string(AMD64)),
		HasSIMD:                    true,
	}
	ans.add_x86_regs()
	return ans
}

func CreateARM64ISA(bits int) ISA {
	ans := ISA{
		Bits:                       bits,
		Goarch:                     ARM64,
		GeneralPurposeRegisterSize: 64,
		Sizes:                      types.SizesFor(runtime.Compiler, string(ARM64)),
		HasSIMD:                    bits == 128,
	}
	ans.add_regs(ans.GeneralPurposeRegisterSize,
		`R0`, `R1`, `R2`, `R3`, `R4`, `R5`, `R6`, `R7`, `R8`, `R9`, `R10`, `R11`, `R12`, `R13`, `R14`, `R15`)
	ans.add_regs(128,
		`V0`, `V1`, `V2`, `V3`, `V4`, `V5`, `V6`, `V7`, `V8`, `V9`, `V10`, `V11`, `V12`, `V13`, `V14`, `V15`,
		`V16`, `V17`, `V18`, `V19`, `V20`, `V21`, `V22`, `V23`, `V24`, `V25`, `V26`, `V27`, `V28`, `V29`, `V30`, `V31`,
	)
	return ans
}

func AsVar(s types.BasicKind, name string) *types.Var {
	var t types.Type
	switch s {
	case ByteSlice:
		t = types.NewSlice(types.Typ[types.Byte])
	default:
		t = types.Typ[s]
	}
	return types.NewParam(0, nil, name, t)
}

type FunctionParam struct {
	Name string
	Type types.BasicKind
}

type Function struct {
	Name                        string
	Desc                        string
	Params, Returns             []FunctionParam
	UsedRegisters               map[Register]bool
	Size                        int
	ISA                         ISA
	ParamOffsets, ReturnOffsets []int
	Instructions                []string
	Used256BitReg               bool
}
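
// Reg returns an unused, unrestricted general purpose register and marks it
// as in use. It panics when no such register remains.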
func (f *Function) Reg() Register {
	for _, r := range f.ISA.Registers {
		if !r.Restricted && r.Size == f.ISA.GeneralPurposeRegisterSize && !f.UsedRegisters[r] {
			f.UsedRegisters[r] = true
			return r
		}
	}
	b := []string{}
	for _, r := range f.ISA.Registers {
		if !r.Restricted && r.Size == f.ISA.GeneralPurposeRegisterSize {
			b = append(b, r.Name)
		}
	}
	panic(fmt.Sprint("No available general purpose registers, used registers: ", strings.Join(b, ", ")))
}

func (f *Function) RegForShifts() Register {
	if f.ISA.Goarch == ARM64 {
		return f.Reg()
	}
	for _, r := range f.ISA.Registers {
		if r.Name == "CX" {
			if f.UsedRegisters[r] {
				panic("The register for shifts is already used")
			}
			return r
		}
	}
	panic("No register for shifts found")
}
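
// Vec returns an unused vector register of the requested bit width
// (defaulting to the ISA width) and marks it as in use.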
func (f *Function) Vec(size ...int) Register {
	szq := f.ISA.Bits
	if len(size) > 0 {
		szq = size[0]
	}
	if f.ISA.Goarch == ARM64 {
		for _, r := range f.ISA.Registers {
			if r.Size == szq && !r.Restricted && !f.UsedRegisters[r] {
				f.UsedRegisters[r] = true
				if r.Size > 128 {
					f.Used256BitReg = true
				}
				return r
			}
		}
	} else {
		// In Intel's crazy architecture AVX registers and SSE registers are the same hardware register so changing
		// one can change the other. Sigh.
		used := make(map[uint32]bool, len(f.UsedRegisters))
		for r, is_used := range f.UsedRegisters {
			if is_used && r.Size > f.ISA.GeneralPurposeRegisterSize {
				used[r.ARMId()] = true
			}
		}
		for _, r := range f.ISA.Registers {
			if r.Size == szq && !r.Restricted && !used[r.ARMId()] {
				f.UsedRegisters[r] = true
				if r.Size > 128 {
					f.Used256BitReg = true
				}
				return r
			}
		}
	}
	panic("No available vector registers")
}

func (f *Function) ReleaseReg(r ...Register) {
	for _, x := range r {
		f.UsedRegisters[x] = false
	}
}
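
// instr appends a single assembly instruction to the function body. The
// first item is the mnemonic; subsequent items are operands, which are
// separated by commas in the emitted text.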
func (f *Function) instr(items ...any) {
	sarr := make([]string, len(items))
	for i, val := range items {
		format := "%s"
		if i > 0 && i < len(items)-1 {
			format = "%s,"
		}
		sarr[i] = fmt.Sprintf(format, val)
	}
	f.Instructions = append(f.Instructions, "\t"+strings.Join(sarr, " "))
}

func (f *Function) MemLoadForBasicType(t types.BasicKind) string {
	if f.ISA.Goarch == ARM64 {
		switch t {
		case types.Uint8:
			return "MOVBU"
		case types.Int8:
			return "MOVB"
		case types.Uint16:
			return "MOVHU"
		case types.Int16:
			return "MOVH"
		case types.Uint32:
			return "MOVWU"
		case types.Int32:
			return "MOVW"
		case types.Uint64, types.Uintptr, ByteSlice, types.String, types.Uint, types.Int64, types.Int:
			return `MOVD`
		}
	} else {
		if f.ISA.GeneralPurposeRegisterSize == 32 {
			switch t {
			case types.Uint8:
				return "MOVBLZX"
			case types.Int8:
				return "MOVBLSX"
			case types.Uint16:
				return "MOVWLZX"
			case types.Int16:
				return "MOVWLSX"
			case types.Uint32, types.Uintptr, types.Int32, ByteSlice, types.String, types.Int, types.Uint:
				return "MOVL"
			}
		} else {
			switch t {
			case types.Uint8:
				return "MOVBQZX"
			case types.Int8:
				return "MOVBQSX"
			case types.Uint16:
				return "MOVWQZX"
			case types.Int16:
				return "MOVWQSX"
			case types.Uint32:
				return "MOVLQZX"
			case types.Int32:
				return "MOVLQSX"
			case types.Int64, types.Uint64, types.Uintptr, ByteSlice, types.String, types.Int, types.Uint:
				return "MOVQ"
			}
		}
	}
	panic(fmt.Sprint("Unknown type: ", t))
}

func (f *Function) LoadUnsignedBytesFromMemory(addr string, n int, dest Register) {
	defer f.AddTrailingComment(dest, "=", n, "byte(s) from the memory pointed to by", addr)
	// note that register sizes are in bits while n is in bytes
	switch n {
	case 1:
		if dest.Size != f.ISA.GeneralPurposeRegisterSize {
			panic(fmt.Sprintf("cannot load %d bytes into vector register", n))
		}
		f.instr(f.MemLoadForBasicType(types.Byte), addr, dest)
	case 2:
		if dest.Size != f.ISA.GeneralPurposeRegisterSize {
			panic(fmt.Sprintf("cannot load %d bytes into vector register", n))
		}
		f.instr(f.MemLoadForBasicType(types.Uint16), addr, dest)
	case 4:
		if dest.Size != f.ISA.GeneralPurposeRegisterSize {
			panic(fmt.Sprintf("cannot load %d bytes into vector register", n))
		}
		f.instr(f.MemLoadForBasicType(types.Uint32), addr, dest)
	case 8:
		if dest.Size != f.ISA.GeneralPurposeRegisterSize {
			panic(fmt.Sprintf("cannot load %d bytes into vector register", n))
		}
		if n*8 > f.ISA.GeneralPurposeRegisterSize {
			panic(fmt.Sprintf("cannot load %d bytes into %d bit register", n, dest.Size))
		}
		f.instr(f.MemLoadForBasicType(types.Uint64), addr, dest)
	default:
		if n*8 != f.ISA.GeneralPurposeRegisterSize {
			panic(fmt.Sprintf("cannot load %d bytes into %d bit register", n, dest.Size))
		}
		f.instr(f.MemLoadForBasicType(types.Uintptr), addr, dest)
	}
}

func (f *Function) LoadParam(p string) Register {
	r := f.Reg()
	for i, q := range f.Params {
		if q.Name == p {
			offset := f.ParamOffsets[i]
			mov := f.MemLoadForBasicType(q.Type)
			f.instr(mov, fmt.Sprintf("%s+%d(FP)", q.Name, offset), r)
			f.AddTrailingComment("load the function parameter", p, "into", r)
		}
	}
	return r
}

func (f *Function) set_return_value(offset int, q FunctionParam, val any) {
	mov := f.MemLoadForBasicType(q.Type)
	vr := val_repr_for_arithmetic(val)
	defer f.AddTrailingComment("save the value:", val, "to the function return parameter:", q.Name)
	if f.ISA.Goarch == ARM64 && strings.HasPrefix(vr, `$`) {
		// no way to store an immediate value into a memory address
		temp := f.Reg()
		f.SetRegisterTo(temp, val)
		defer f.ReleaseReg(temp)
		vr = temp.Name
	}
	f.instr(mov, vr, fmt.Sprintf("%s+%d(FP)", q.Name, offset))
}

func (f *Function) SetReturnValue(p string, val any) {
	for i, q := range f.Returns {
		if q.Name == p {
			f.set_return_value(f.ReturnOffsets[i], q, val)
			break
		}
	}
}

func (f *Function) CountTrailingZeros(r, ans Register) {
	if r.Size == f.ISA.GeneralPurposeRegisterSize {
		if f.ISA.Goarch == ARM64 {
			f.instr("RBIT", r, r)
			f.AddTrailingComment("reverse the bits")
			f.instr("CLZ", r, ans)
			f.AddTrailingComment(ans, "= number of leading zeros in", r)
		} else {
			f.instr("BSFL", r, ans)
			f.AddTrailingComment(ans, "= number of trailing zeros in", r)
		}
	} else {
		panic("cannot count trailing zeros in a vector register")
	}
}

func (f *Function) Comment(x ...any) {
	f.Instructions = append(f.Instructions, space_join("\t//", x...))
}
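
// The Go assembler has no mnemonics for some NEON instructions, so they are
// emitted as raw WORD directives. The two helpers below encode shrn.8b
// (shift right narrow by 4 bits) and cmgt.16b (signed greater-than compare).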
func shrn8b_immediate4(a, b Register) uint32 {
	return (0x0f0c84 << 8) | (a.ARMId()<<5 | b.ARMId())
}

func encode_cmgt16b(a, b, dest Register) (ans uint32) {
	return 0x271<<21 | b.ARMId()<<16 | 0xd<<10 | a.ARMId()<<5 | dest.ARMId()
}
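
// MaskForCountDestructive converts the per-byte match vector in vec into a
// bitmask in the general purpose register ans: PMOVMSKB (1 bit per byte) on
// Intel, the shrn narrowing trick (4 bits per byte) on ARM. vec is clobbered.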
func (f *Function) MaskForCountDestructive(vec, ans Register) {
	// vec is clobbered by this function
	f.Comment("Count the number of bytes to the first 0xff byte and put the result in", ans)
	if f.ISA.Goarch == ARM64 {
		// See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
		f.Comment("Go assembler doesn't support the shrn instruction, below we have: shrn.8b", vec, vec, "#4")
		f.Comment("It is shifting right by four bits in every 16 bit word and truncating to 8 bits storing the result in the lower 64 bits of", vec)
		f.instr("WORD", fmt.Sprintf("$0x%x", shrn8b_immediate4(vec, vec)))
		f.instr("FMOVD", "F"+vec.Name[1:], ans)
		f.AddTrailingComment("Extract the lower 64 bits from", vec, "and put them into", ans)
	} else {
		if f.ISA.Bits == 128 {
			f.instr("PMOVMSKB", vec, ans)
		} else {
			f.instr("VPMOVMSKB", vec, ans)
		}
		f.AddTrailingComment(ans, "= mask of the highest bit in every byte in", vec)
	}
}

func (f *Function) shift_self(right bool, self, amt any) {
	op := ""
	if right {
		op = "SHRQ"
		if f.ISA.Goarch == ARM64 {
			op = "LSR"
		}
	} else {
		op = "SHLQ"
		if f.ISA.Goarch == ARM64 {
			op = "LSL"
		}
	}
	switch v := amt.(type) {
	case Register:
		if f.ISA.Goarch != ARM64 && v.Name != "CX" {
			panic("On Intel only the CX register can be used for shifts")
		}
		f.instr(op, v, self)
	default:
		f.instr(op, val_repr_for_arithmetic(v), self)
	}
}

func (f *Function) ShiftSelfRight(self, amt any) {
	f.shift_self(true, self, amt)
}

func (f *Function) ShiftSelfLeft(self, amt any) {
	f.shift_self(false, self, amt)
}

func (f *Function) ShiftMaskRightDestructive(mask, amt any) {
	// The amt register is clobbered by this function
	switch n := amt.(type) {
	case Register:
		if f.ISA.Goarch == ARM64 {
			f.Comment("The mask has 4 bits per byte, so multiply", n, "by 4")
			f.ShiftSelfLeft(n, 2)
		}
		f.ShiftSelfRight(mask, n)
	case int:
		if f.ISA.Goarch == ARM64 {
			n <<= 2
		}
		f.ShiftSelfRight(mask, n)
	default:
		panic(fmt.Sprintf("Cannot shift by: %v", amt))
	}
}

func (f *Function) CountLeadingZeroBytesInMask(src, ans Register) {
	if f.ISA.Goarch == ARM64 {
		f.CountTrailingZeros(src, ans)
		f.instr("UBFX", "$2", ans, "$30", ans)
		f.AddTrailingComment(ans, ">>= 2 (divide by 4)")
	} else {
		f.CountTrailingZeros(src, ans)
	}
}

func (f *Function) CountBytesToFirstMatchDestructive(vec, ans Register) {
	// vec is clobbered by this function
	f.Comment("Count the number of bytes to the first 0xff byte and put the result in", ans)
	f.MaskForCountDestructive(vec, ans)
	f.CountLeadingZeroBytesInMask(ans, ans)
	f.BlankLine()
}

func (f *Function) LoadParamLen(p string) Register {
	r := f.Reg()
	for i, q := range f.Params {
		if q.Name == p {
			offset := f.ParamOffsets[i]
			if q.Type == ByteSlice || q.Type == types.String {
				offset += int(f.ISA.Sizes.Sizeof(types.Typ[types.Uintptr]))
			}
			mov := f.MemLoadForBasicType(q.Type)
			f.instr(mov, fmt.Sprintf("%s_len+%d(FP)", q.Name, offset), r)
			f.AddTrailingComment("load the length of the function parameter", q.Name, "into", r)
			break
		}
	}
	return r
}

func (f *Function) unaligned_move() string {
	switch f.ISA.Goarch {
	case X86, AMD64:
		if f.ISA.Bits == 128 {
			return "MOVOU"
		}
		return "VMOVDQU"
	default:
		panic("Unknown arch: " + string(f.ISA.Goarch))
	}
}

func (f *Function) aligned_move() string {
	switch f.ISA.Goarch {
	case X86, AMD64:
		if f.ISA.Bits == 128 {
			return "MOVOA"
		}
		return "VMOVDQA"
	default:
		panic("Unknown arch: " + string(f.ISA.Goarch))
	}
}

func (f *Function) LoadPointerUnaligned(register_containing_pointer_value Register, dest Register) {
	addr := register_containing_pointer_value.AddressInRegister()
	if f.ISA.Goarch == ARM64 {
		f.instr(`VLD1`, addr, "["+dest.ARMFullWidth()+"]")
	} else {
		f.instr(f.unaligned_move(), addr, dest)
	}
	f.AddTrailingComment("load memory from the address in", register_containing_pointer_value, "to", dest)
}

func (f *Function) LoadPointerAligned(register_containing_pointer_value Register, dest Register) {
	addr := register_containing_pointer_value.AddressInRegister()
	if f.ISA.Goarch == ARM64 {
		f.instr(`VLD1`, addr, "["+dest.ARMFullWidth()+"]")
	} else {
		f.instr(f.aligned_move(), addr, dest)
	}
	f.AddTrailingComment("load memory from the address in", register_containing_pointer_value, "to", dest)
}

func (f *Function) StoreUnalignedToPointer(vec, register_containing_pointer_value Register) {
	if f.ISA.Goarch == ARM64 {
		f.instr(`VST1`, "["+vec.ARMFullWidth()+"]", fmt.Sprintf("(%s)", register_containing_pointer_value))
	} else {
		f.instr(f.unaligned_move(), vec, fmt.Sprintf("(%s)", register_containing_pointer_value))
	}
	f.AddTrailingComment("store the value of", vec, "into the memory whose address is in:", register_containing_pointer_value)
}

func (f *Function) test_if_zero(a Register) {
	if f.ISA.Goarch == ARM64 {
		f.instr("AND", a, a, a)
	}
	switch a.Size {
	case 32:
		f.instr("TESTL", a, a)
	case 64:
		f.instr("TESTQ", a, a)
	case 128:
		f.instr("PTEST", a, a)
	default:
		f.instr("VPTEST", a, a)
	}
	f.AddTrailingComment("test if", a, "is zero")
}

func (f *Function) JumpTo(label string) {
	f.instr("JMP", label)
	f.AddTrailingComment("jump to:", label)
}
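
// jump_on_zero_check branches to label depending on whether a is zero. On
// ARM64 a vector register is first reduced to 64 bits by ORing its two
// halves together so that CBZ/CBNZ can be used.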
func (f *Function) jump_on_zero_check(a Register, label string, on_zero bool) {
	if f.ISA.Goarch == ARM64 {
		if a.Size > f.ISA.GeneralPurposeRegisterSize {
			temp := f.Vec()
			defer f.ReleaseReg(temp)
			f.instr("VDUP", a.Name+".D[1]", temp.Name+".D2")
			f.AddTrailingComment(`duplicate the upper 64 bits of`, a, "into the lower and upper 64 bits of", temp)
			f.Or(a, temp, temp)
			a = f.Reg()
			defer f.ReleaseReg(a)
			f.instr("FMOVD", "F"+temp.Name[1:], a)
			f.AddTrailingComment(a, "= lower 64 bits of", temp)
		}
		if on_zero {
			f.instr("CBZ", a, label)
		} else {
			f.instr("CBNZ", a, label)
		}
	} else {
		f.test_if_zero(a)
		if on_zero {
			f.instr("JZ", label)
		} else {
			f.instr("JNZ", label)
		}
	}
}

func (f *Function) JumpIfZero(a Register, label string) {
	f.jump_on_zero_check(a, label, true)
	f.AddTrailingComment("jump to:", label, "if", a, "is zero")
}

func (f *Function) JumpIfNonZero(a Register, label string) {
	f.jump_on_zero_check(a, label, false)
	f.AddTrailingComment("jump to:", label, "if", a, "is non-zero")
}

func (f *Function) compare(a, b Register) {
	if f.ISA.Goarch == ARM64 {
		f.instr("CMP", b, a)
	} else {
		if a.Size == 32 {
			f.instr("CMPL", a, b)
		} else {
			f.instr("CMPQ", a, b)
		}
	}
	f.AddTrailingComment("compare", a, "to", b)
}

func (f *Function) JumpIfLessThan(a, b Register, label string) {
	f.compare(a, b)
	if f.ISA.Goarch == ARM64 {
		f.instr("BLT", label)
	} else {
		f.instr("JLT", label)
	}
	f.AddTrailingComment("jump to:", label, "if", a, "<", b)
}

func (f *Function) JumpIfLessThanOrEqual(a, b Register, label string) {
	f.compare(a, b)
	if f.ISA.Goarch == ARM64 {
		f.instr("BLE", label)
	} else {
		f.instr("JLE", label)
	}
	f.AddTrailingComment("jump to:", label, "if", a, "<=", b)
}

func (f *Function) JumpIfEqual(a, b Register, label string) {
	f.compare(a, b)
	if f.ISA.Goarch == ARM64 {
		f.instr("BEQ", label)
	} else {
		f.instr("JE", label)
	}
	f.AddTrailingComment("jump to:", label, "if", a, "==", b)
}

func (f *Function) Or(a, b, dest Register) {
	if f.ISA.Goarch == ARM64 {
		f.instr("VORR", a.ARMFullWidth(), b.ARMFullWidth(), dest.ARMFullWidth())
	} else {
		if f.ISA.Bits == 128 {
			switch dest.Name {
			case b.Name:
				f.instr("POR", a, b)
			case a.Name:
				f.instr("POR", b, a)
			default:
				f.CopyRegister(b, dest)
				f.instr("POR", a, dest)
			}
		} else {
			f.instr("VPOR", a, b, dest)
		}
	}
	f.AddTrailingComment(dest, "=", a, "|", b, "(bitwise)")
}

func (f *Function) And(a, b, dest Register) {
	if f.ISA.Goarch == ARM64 {
		f.instr("VAND", a.ARMFullWidth(), b.ARMFullWidth(), dest.ARMFullWidth())
	} else {
		if f.ISA.Bits == 128 {
			switch dest.Name {
			case b.Name:
				f.instr("PAND", a, b)
			case a.Name:
				f.instr("PAND", b, a)
			default:
				f.CopyRegister(b, dest)
				f.instr("PAND", a, dest)
			}
		} else {
			f.instr("VPAND", a, b, dest)
		}
	}
	f.AddTrailingComment(dest, "=", a, "&", b, "(bitwise)")
}

func (f *Function) ClearRegisterToZero(r Register) {
	defer func() { f.AddTrailingComment("set", r, "to zero") }()
	if f.ISA.Goarch == ARM64 {
		if r.Size == f.ISA.GeneralPurposeRegisterSize {
			f.instr(f.MemLoadForBasicType(types.Int32), val_repr_for_arithmetic(0), r)
		} else {
			f.instr("VMOVI", "$0", r.ARMFullWidth())
		}
		return
	}
	switch r.Size {
	case 128:
		f.instr("PXOR", r, r)
	case f.ISA.GeneralPurposeRegisterSize:
		if r.Size == 32 {
			f.instr("XORL", r, r)
		} else {
			f.instr("XORQ", r, r)
		}
	case 256:
		f.instr("VPXOR", r, r, r)
	}
}

func (f *Function) AllOnesRegister(r Register) {
	switch r.Size {
	default:
		f.CmpEqEpi8(r, r, r)
	case f.ISA.GeneralPurposeRegisterSize:
		if f.ISA.Goarch == ARM64 {
			f.instr("MOVD", "$-1", r)
		} else {
			switch r.Size {
			case 32:
				f.instr("MOVL", "$0xFFFFFFFF", r)
			case 64:
				f.instr("MOVQ", "$0xFFFFFFFFFFFFFFFF", r)
			}
		}
		f.AddTrailingComment(r, "= all ones")
	}
}

func (f *Function) CopyRegister(a, ans Register) {
	if a.Size != ans.Size {
		panic("Can only copy registers of equal sizes")
	}
	if a.Size > f.ISA.GeneralPurposeRegisterSize {
		if f.ISA.Goarch == ARM64 {
			// ORR of a register with itself is the canonical full width vector move
			f.instr("VORR", a.ARMFullWidth(), a.ARMFullWidth(), ans.ARMFullWidth())
		} else {
			f.instr(f.aligned_move(), a, ans)
		}
	} else {
		if f.ISA.Goarch == ARM64 {
			f.instr("MOVD", a, ans)
		} else {
			if a.Size == 32 {
				f.instr("MOVL", a, ans)
			} else {
				f.instr("MOVQ", a, ans)
			}
		}
	}
	f.AddTrailingComment(ans, "=", a)
}

func (f *Function) SetRegisterTo(self Register, val any) {
	switch v := val.(type) {
	case Register:
		f.CopyRegister(v, self)
	case int:
		if self.Size != f.ISA.GeneralPurposeRegisterSize {
			panic("TODO: Cannot yet set constant values in vector registers")
		}
		switch v {
		case 0:
			f.ClearRegisterToZero(self)
		case -1:
			f.AllOnesRegister(self)
		default:
			if f.ISA.Goarch == ARM64 {
				f.instr("MOVD", val_repr_for_arithmetic(v), self)
			} else {
				f.instr("MOVL", val_repr_for_arithmetic(v), self)
			}
			f.AddTrailingComment(self, "=", v)
		}
	case string:
		f.instr(f.MemLoadForBasicType(types.Uintptr), v, self)
		f.AddTrailingComment(self, "=", self.Size/8, "bytes at the address", v)
	default:
		panic(fmt.Sprintf("cannot set register to value: %#v", val))
	}
}
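
// ARMId returns the numeric part of a register name ("V3" -> 3). It is also
// used for Intel X/Y registers, whose names have the same letter-number shape.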
func (r Register) ARMId() uint32 {
	num, err := strconv.Atoi(r.Name[1:])
	if err != nil {
		panic(err)
	}
	return uint32(num)
}

func (f *Function) cmp(a, b, ans Register, op, c_rep string) {
	if a.Size != b.Size || a.Size != ans.Size {
		panic("Can only compare registers of equal sizes")
	}
	if f.ISA.Goarch == ARM64 {
		if op == "EQ" {
			f.instr("VCMEQ", a.ARMFullWidth(), b.ARMFullWidth(), ans.ARMFullWidth())
		} else {
			f.instr("WORD", fmt.Sprintf("$0x%x", encode_cmgt16b(a, b, ans)))
		}
	} else {
		fop := `PCMP` + op + "B"
		if f.ISA.Bits == 128 {
			if op == "EQ" {
				switch ans.Name {
				case a.Name:
					f.instr(fop, b, ans)
				case b.Name:
					f.instr(fop, a, ans)
				default:
					f.CopyRegister(a, ans)
					f.instr(fop, b, ans)
				}
			} else {
				// order matters, we want destination aka 2nd arg to be both a and ans
				switch ans.Name {
				case a.Name:
					f.instr(fop, b, a)
				case b.Name:
					vec := f.Vec(a.Size)
					f.CopyRegister(a, vec)
					f.instr(fop, b, vec)
					f.CopyRegister(vec, b)
					f.ReleaseReg(vec)
				default:
					f.CopyRegister(a, ans)
					f.instr(fop, b, ans)
				}
			}
		} else {
			f.instr("V"+fop, b, a, ans)
		}
	}
	f.AddTrailingComment(ans, "= 0xff on every byte where", a.Name+"[n]", c_rep, b.Name+"[n] and zero elsewhere")
}

func (f *Function) CmpGtEpi8(a, b, ans Register) {
	f.cmp(a, b, ans, "GT", ">")
}

func (f *Function) CmpLtEpi8(a, b, ans Register) {
	f.cmp(b, a, ans, "GT", ">")
}

func (f *Function) CmpEqEpi8(a, b, ans Register) {
	f.cmp(a, b, ans, "EQ", "==")
}
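
// Set1Epi8 broadcasts a single byte into every byte of the vector register
// vec. val may be an int constant, a general purpose register holding the
// byte, or the name of a function parameter to load it from.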
func (f *Function) Set1Epi8(val any, vec Register) {
	if vec.Size != 128 && vec.Size != 256 {
		panic("Set1Epi8 only works on vector registers")
	}
	do_shuffle_load := func(r Register) {
		f.instr("MOVL", r, vec)
		shuffle_mask := f.Vec()
		f.ClearRegisterToZero(shuffle_mask)
		f.instr("PSHUFB", shuffle_mask, vec)
		f.ReleaseReg(shuffle_mask)
	}
	switch v := val.(type) {
	default:
		panic("unknown type for set1_epi8")
	case int:
		switch v {
		case 0:
			f.ClearRegisterToZero(vec)
			return
		case -1:
			f.AllOnesRegister(vec)
			return
		}
		r := f.Reg()
		defer f.ReleaseReg(r)
		f.SetRegisterTo(r, v)
		f.Set1Epi8(r, vec)
	case Register:
		f.Comment("Set all bytes of", vec, "to the lowest byte in", v)
		if v.Size != f.ISA.GeneralPurposeRegisterSize {
			panic("Can only set1_epi8 from a general purpose register")
		}
		if f.ISA.Goarch == ARM64 {
			f.instr("VMOV", v, vec.ARMFullWidth())
		} else {
			switch vec.Size {
			case 128:
				do_shuffle_load(v)
			case 256:
				temp := f.Vec(128)
				defer f.ReleaseReg(temp)
				f.instr("VMOVD", v, temp)
				f.instr("VPBROADCASTB", temp, vec)
			}
		}
		defer f.Comment()
	case string:
		f.Comment("Set all bytes of", vec, "to the first byte in", v)
		if f.ISA.Goarch == ARM64 {
			r := f.LoadParam(v)
			f.instr("VMOV", r, vec.ARMFullWidth())
			f.ReleaseReg(r)
			return
		}
		switch vec.Size {
		case 128:
			r := f.LoadParam(v)
			defer f.ReleaseReg(r)
			do_shuffle_load(r)
		case 256:
			f.instr("VPBROADCASTB", f.ParamPos(v), vec)
		}
		defer f.Comment()
	}
}

func (isa *ISA) structsize(vs []*types.Var) int64 {
	n := len(vs)
	if n == 0 {
		return 0
	}
	offsets := isa.Sizes.Offsetsof(vs)
	return offsets[n-1] + isa.Sizes.Sizeof(vs[n-1].Type())
}

func tuplevars(params []FunctionParam) []*types.Var {
	vars := make([]*types.Var, len(params))
	for i, p := range params {
		vars[i] = AsVar(p.Type, p.Name)
	}
	return vars
}
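
// NewFunction creates a Function for the given ISA, using go/types to compute
// the stack frame layout: parameter and return offsets mirror how Go lays out
// arguments on the stack, with a trailing sentinel var used to find the
// padded end of the parameter block.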
func NewFunction(isa ISA, name, description string, params, returns []FunctionParam) *Function {
	name = fmt.Sprintf("%s_%d", name, isa.Bits)
	ans := Function{Name: name, Desc: description, Params: params, Returns: returns, ISA: isa}
	vars := tuplevars(params)
	vars = append(vars, types.NewParam(0, nil, "sentinel", types.Typ[types.Uint64]))
	offsets := isa.Sizes.Offsetsof(vars)
	n := len(params)
	paramssize := int(offsets[n])
	ans.ParamOffsets = make([]int, n)
	ans.Size = paramssize
	for i := range ans.ParamOffsets {
		ans.ParamOffsets[i] = int(offsets[i])
	}
	if len(returns) > 0 {
		vars = tuplevars(returns)
		offsets = isa.Sizes.Offsetsof(vars)
		ans.ReturnOffsets = make([]int, len(offsets))
		for i, off := range offsets {
			ans.ReturnOffsets[i] = paramssize + int(off)
		}
		ans.Size += int(isa.structsize(vars))
	}
	return &ans
}

func (s *Function) ParamPos(name string) string {
	for n, i := range s.Params {
		if i.Name == name {
			return fmt.Sprintf("%s+%d(FP)", i.Name, s.ParamOffsets[n])
		}
	}
	panic(fmt.Errorf("Unknown parameter: %s", name))
}

func (s *Function) print_signature(w io.Writer) {
	fmt.Fprintf(w, "func %s(", s.Name)
	print_p := func(p FunctionParam) {
		var tname string
		if p.Type == ByteSlice {
			tname = "[]byte"
		} else {
			tname = types.Universe.Lookup(types.Typ[p.Type].String()).String()
		}
		tname, _ = strings.CutPrefix(tname, "type ")
		fmt.Fprintf(w, "%s %s", p.Name, tname)
	}
	for i, p := range s.Params {
		if i > 0 {
			fmt.Fprint(w, ", ")
		}
		print_p(p)
	}
	fmt.Fprint(w, ")")
	if len(s.Returns) == 0 {
		return
	}
	fmt.Fprint(w, " (")
	for i, p := range s.Returns {
		if i > 0 {
			fmt.Fprint(w, ", ")
		}
		print_p(p)
	}
	fmt.Fprint(w, ")")
}

func (s *Function) OutputStub(w io.Writer) {
	if s.Desc != "" {
		fmt.Fprintln(w, "// "+s.Desc)
		fmt.Fprintln(w, "//")
	}
	if s.ISA.HasSIMD {
		fmt.Fprintln(w, "//go:noescape")
	}
	s.print_signature(w)
	if s.ISA.HasSIMD {
		fmt.Fprintln(w)
	} else {
		fmt.Fprintln(w, "{")
		fmt.Fprintln(w, "panic(\"No SIMD implementations for this function\")")
		fmt.Fprintln(w, "}")
	}
	fmt.Fprintln(w)
}

func (s *Function) BlankLine() { s.Instructions = append(s.Instructions, "") }

func (s *Function) Return() {
	if s.Used256BitReg {
		s.instr("VZEROUPPER")
		s.AddTrailingComment("zero upper bits of AVX registers to avoid dependencies when switching between SSE and AVX code")
	}
	s.instr("RET")
	s.AddTrailingComment("return from function")
	s.BlankLine()
}

func (s *Function) end_function() {
	amt := 16
	if s.Used256BitReg {
		amt = 32
	}
	s.instr(fmt.Sprintf("PCALIGN $%d\n", amt))
	s.Return()
}

func (s *Function) Label(name string) {
	s.Instructions = append(s.Instructions, name+":")
	s.AddTrailingComment("jump target")
}

func space_join(prefix string, x ...any) string {
	b := strings.Builder{}
	if prefix != "" {
		b.WriteString(prefix)
		b.WriteByte(' ')
	}
	for _, x := range x {
		b.WriteString(fmt.Sprint(x))
		b.WriteByte(' ')
	}
	return b.String()
}

func (s *Function) AddTrailingComment(x ...any) {
	s.Instructions[len(s.Instructions)-1] += space_join(" //", x...)
}
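
// val_repr_for_arithmetic renders a value as an assembly operand: ints become
// immediates ($-5 or $0x20), strings and Stringers (such as Register) are
// used as-is.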
func val_repr_for_arithmetic(val any) (ans string) {
	switch v := val.(type) {
	case int:
		if v < 0 {
			return fmt.Sprintf("$%d", v)
		}
		return fmt.Sprintf("$0x%x", v)
	case string:
		return v
	case fmt.Stringer:
		return v.String()
	default:
		return fmt.Sprint(val)
	}
}

func (f *Function) AndSelf(self Register, val any) {
	switch f.ISA.Goarch {
	case ARM64:
		f.instr("AND", val_repr_for_arithmetic(val), self)
	case AMD64:
		f.instr("ANDQ", val_repr_for_arithmetic(val), self)
	case X86:
		f.instr("ANDL", val_repr_for_arithmetic(val), self)
	default:
		panic("Unknown architecture for AND")
	}
	f.AddTrailingComment(self, "&=", val)
}

func (f *Function) NegateSelf(self Register) {
	if f.ISA.Goarch == ARM64 {
		f.instr("NEG", self, self)
	} else {
		f.instr("NEGQ", self)
	}
	f.AddTrailingComment(self, "*= -1")
}

func (f *Function) AddToSelf(self Register, val any) {
	f.instr(f.ISA.NativeAdd(), val_repr_for_arithmetic(val), self)
	f.AddTrailingComment(self, "+=", val)
}

func (f *Function) SubtractFromSelf(self Register, val any) {
	f.instr(f.ISA.NativeSubtract(), val_repr_for_arithmetic(val), self)
	f.AddTrailingComment(self, "-=", val)
}

func (s *Function) SetRegisterToOffset(dest Register, base_register Register, constant_offset int, offset_register Register) {
	if s.ISA.Goarch == ARM64 {
		s.SetRegisterTo(dest, constant_offset)
		s.AddToSelf(dest, base_register)
		s.AddToSelf(dest, offset_register)
	} else {
		addr := fmt.Sprintf("%d(%s)(%s*1)", constant_offset, base_register, offset_register)
		s.instr(s.ISA.LEA(), addr, dest)
		s.AddTrailingComment(dest, "=", base_register, "+", offset_register, "+", constant_offset)
	}
}

func (s *Function) OutputASM(w io.Writer) {
	if !s.ISA.HasSIMD {
		return
	}
	fmt.Fprint(w, "// ")
	s.print_signature(w)
	fmt.Fprintf(w, "\nTEXT ·%s(SB), NOSPLIT|TOPFRAME|NOFRAME, $0-%d\n", s.Name, s.Size)
	has_trailing_return := false
	for _, i := range s.Instructions {
		if len(i) == 0 {
			continue
		}
		if strings.HasPrefix(i, "\tRET ") {
			has_trailing_return = true
		} else {
			has_trailing_return = false
		}
	}
	if !has_trailing_return {
		s.Return()
	}
	for _, i := range s.Instructions {
		fmt.Fprintln(w, i)
	}
	fmt.Fprintln(w)
}

type State struct {
	ISA                           ISA
	ActiveFunction                *Function
	ASMOutput, StubOutput         strings.Builder
	TestASMOutput, TestStubOutput strings.Builder
}

var package_name = "simdstring"

func NewState(isa ISA, build_tags ...string) *State {
	ans := &State{ISA: isa}
	if len(build_tags) == 0 {
		build_tags = append(build_tags, string(isa.Goarch))
	}
	build_tag := func(w io.Writer, is_test bool) {
		fmt.Fprintf(w, "//go:build %s\n", strings.Join(build_tags, " "))
	}
	asm := func(w io.Writer) {
		fmt.Fprintln(w, "// Generated by generate.go do not edit")
		fmt.Fprintln(w, "// vim: ft=goasm")
		build_tag(w, w == &ans.TestASMOutput)
		fmt.Fprintln(w, "\n#include \"go_asm.h\"")
		fmt.Fprintln(w, "#include \"textflag.h\"")
		fmt.Fprintln(w)
	}
	asm(&ans.ASMOutput)
	asm(&ans.TestASMOutput)
	stub := func(w io.Writer) {
		fmt.Fprintln(w, "// Generated by generate.go do not edit")
		build_tag(w, w == &ans.TestStubOutput)
		fmt.Fprintln(w, "\npackage "+package_name)
		fmt.Fprintln(w)
	}
	stub(&ans.StubOutput)
	stub(&ans.TestStubOutput)
	return ans
}

func (s *State) OutputFunction() {
	if s.ActiveFunction == nil {
		return
	}
	if strings.HasPrefix(s.ActiveFunction.Name, "test_") {
		s.ActiveFunction.OutputASM(&s.TestASMOutput)
		s.ActiveFunction.OutputStub(&s.TestStubOutput)
	} else {
		s.ActiveFunction.OutputASM(&s.ASMOutput)
		s.ActiveFunction.OutputStub(&s.StubOutput)
	}
	s.ActiveFunction = nil
}

func (s *State) NewFunction(name, description string, params, returns []FunctionParam) *Function {
	s.OutputFunction()
	s.ActiveFunction = NewFunction(s.ISA, name, description, params, returns)
	s.ActiveFunction.UsedRegisters = make(map[Register]bool)
	return s.ActiveFunction
}

func (f *Function) load_vec_from_param(param string) Register {
	src := f.LoadParam(param)
	vec := f.Vec()
	f.LoadPointerUnaligned(src, vec)
	f.ReleaseReg(src)
	return vec
}

func (f *Function) store_vec_in_param(vec Register, param string) {
	ans := f.LoadParam(param)
	f.StoreUnalignedToPointer(vec, ans)
	f.ReleaseReg(ans)
}

func (s *State) test_load() {
	f := s.NewFunction("test_load_asm", "Test loading of vector register", []FunctionParam{{"src", ByteSlice}, {"ans", ByteSlice}}, nil)
	if !s.ISA.HasSIMD {
		return
	}
	vec := f.load_vec_from_param("src")
	f.store_vec_in_param(vec, `ans`)
}

func (s *State) test_set1_epi8() {
	f := s.NewFunction("test_set1_epi8_asm", "Test broadcast of byte into vector", []FunctionParam{{"b", types.Byte}, {"ans", ByteSlice}}, nil)
	if !s.ISA.HasSIMD {
		return
	}
	vec := f.Vec()
	r := f.LoadParam("b")
	q := f.Reg()
	f.SetRegisterTo(q, int(' '))
	f.JumpIfEqual(r, q, "space")
	f.SetRegisterTo(q, 11)
	f.JumpIfEqual(r, q, "eleven")
	f.Set1Epi8("b", vec)
	f.store_vec_in_param(vec, `ans`)
	f.Return()
	f.Label("space")
	f.Set1Epi8(int(' '), vec)
	f.store_vec_in_param(vec, `ans`)
	f.Return()
	f.Label("eleven")
	f.Set1Epi8(-1, vec)
	f.store_vec_in_param(vec, `ans`)
	f.Return()
}

func (s *State) test_cmpeq_epi8() {
	f := s.NewFunction("test_cmpeq_epi8_asm", "Test byte comparison of two vectors", []FunctionParam{{"a", ByteSlice}, {"b", ByteSlice}, {"ans", ByteSlice}}, nil)
	if !s.ISA.HasSIMD {
		return
	}
	a := f.load_vec_from_param("a")
	b := f.load_vec_from_param("b")
	f.CmpEqEpi8(a, b, a)
	f.store_vec_in_param(a, "ans")
}

func (s *State) test_cmplt_epi8() {
	f := s.NewFunction(
		"test_cmplt_epi8_asm", "Test byte comparison of two vectors", []FunctionParam{{"a", ByteSlice}, {"b", ByteSlice}, {"which", types.Int}, {"ans", ByteSlice}}, nil)
	if !s.ISA.HasSIMD {
		return
	}
	which := f.LoadParam("which")
	a := f.load_vec_from_param("a")
	b := f.load_vec_from_param("b")
	r := f.Reg()
	f.SetRegisterTo(r, 1)
	f.JumpIfEqual(which, r, "one")
	f.SetRegisterTo(r, 2)
	f.JumpIfEqual(which, r, "two")
	ans := f.Vec()
	f.CmpLtEpi8(a, b, ans)
	f.store_vec_in_param(ans, "ans")
	f.Return()
	f.Label("one")
	f.CmpLtEpi8(a, b, a)
	f.store_vec_in_param(a, "ans")
	f.Return()
	f.Label("two")
	f.CmpLtEpi8(a, b, b)
	f.store_vec_in_param(b, "ans")
	f.Return()
}

func (s *State) test_or() {
	f := s.NewFunction("test_or_asm", "Test OR of two vectors", []FunctionParam{{"a", ByteSlice}, {"b", ByteSlice}, {"ans", ByteSlice}}, nil)
	if !s.ISA.HasSIMD {
		return
	}
	a := f.load_vec_from_param("a")
	b := f.load_vec_from_param("b")
	f.Or(a, b, a)
	f.store_vec_in_param(a, "ans")
}

func (s *State) test_jump_if_zero() {
	f := s.NewFunction("test_jump_if_zero_asm", "Test jump on zero register", []FunctionParam{{"a", ByteSlice}}, []FunctionParam{{"ans", types.Int}})
	if !s.ISA.HasSIMD {
		return
	}
	a := f.load_vec_from_param("a")
	f.JumpIfZero(a, "zero")
	f.SetReturnValue("ans", 1)
	f.Return()
	f.Label("zero")
	f.SetReturnValue("ans", 0)
}

func (s *State) test_count_to_match() {
	f := s.NewFunction("test_count_to_match_asm", "Test counting bytes to first match", []FunctionParam{{"a", ByteSlice}, {"b", types.Byte}}, []FunctionParam{{"ans", types.Int}})
	if !s.ISA.HasSIMD {
		return
	}
	a := f.load_vec_from_param("a")
	b := f.Vec()
	f.Set1Epi8("b", b)
	f.CmpEqEpi8(a, b, b)
	f.JumpIfZero(b, "fail")
	res := f.Reg()
	f.CountBytesToFirstMatchDestructive(b, res)
	f.SetReturnValue("ans", res)
	f.Return()
	f.Label("fail")
	f.SetReturnValue("ans", -1)
}

func (isa *ISA) LEA() string {
	if isa.GeneralPurposeRegisterSize == 32 {
		return "LEAL"
	}
	return "LEAQ"
}
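
// index_func emits the body shared by all the index functions: align the data
// pointer down to the vector width, do one aligned load and shift the
// resulting mask to discard bytes before the start of the data, then loop
// over aligned blocks calling test_bytes on each, and finally convert the
// match mask into an index, returning -1 when nothing matches.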
func (s *State) index_func(f *Function, test_bytes func(bytes_to_test, test_ans Register)) {
	pos := f.Reg()
	test_ans := f.Vec()
	bytes_to_test := f.Vec()
	data_start := f.LoadParam(`data`)
	limit := f.LoadParamLen(`data`)
	f.JumpIfZero(limit, "fail")
	f.AddToSelf(limit, data_start)
	mask := f.Reg()
	vecsz := f.ISA.Bits / 8
	f.CopyRegister(data_start, pos)
	func() {
		unaligned_bytes := f.RegForShifts()
		defer f.ReleaseReg(unaligned_bytes)
		f.CopyRegister(data_start, unaligned_bytes)
		f.AndSelf(unaligned_bytes, vecsz-1)
		f.SubtractFromSelf(pos, unaligned_bytes)
		f.Comment(fmt.Sprintf("%s is now aligned to a %d byte boundary so loading from it is safe", pos, vecsz))
		f.LoadPointerAligned(pos, bytes_to_test)
		test_bytes(bytes_to_test, test_ans)
		f.MaskForCountDestructive(test_ans, mask)
		f.Comment("We need to shift out the possible extra bytes at the start of the string caused by the unaligned read")
		f.ShiftMaskRightDestructive(mask, unaligned_bytes)
		f.JumpIfZero(mask, "loop_start")
		f.CopyRegister(data_start, pos)
		f.JumpTo("byte_found_in_mask")
	}()
	f.Comment("Now loop over aligned blocks")
	f.Label("loop_start")
	f.AddToSelf(pos, vecsz)
	f.JumpIfLessThanOrEqual(limit, pos, "fail")
	f.LoadPointerAligned(pos, bytes_to_test)
	test_bytes(bytes_to_test, test_ans)
	f.JumpIfNonZero(test_ans, "byte_found_in_vec")
	f.JumpTo("loop_start")
	f.Label("byte_found_in_vec")
	f.MaskForCountDestructive(test_ans, mask)
	f.Comment("Get the result from", mask, "and return it")
	f.Label("byte_found_in_mask")
	f.CountLeadingZeroBytesInMask(mask, mask)
	f.AddToSelf(mask, pos)
	f.JumpIfLessThanOrEqual(limit, mask, "fail")
	f.SubtractFromSelf(mask, data_start)
	f.SetReturnValue("ans", mask)
	f.Return()
	f.Label("fail")
	f.SetReturnValue("ans", -1)
	f.Return()
}

func (s *State) indexbyte2_body(f *Function) {
	b1 := f.Vec()
	b2 := f.Vec()
	f.Set1Epi8("b1", b1)
	f.Set1Epi8("b2", b2)
	test_bytes := func(bytes_to_test, test_ans Register) {
		f.CmpEqEpi8(bytes_to_test, b1, test_ans)
		f.CmpEqEpi8(bytes_to_test, b2, bytes_to_test)
		f.Or(test_ans, bytes_to_test, test_ans)
	}
	s.index_func(f, test_bytes)
}

func (s *State) indexbyte2() {
	f := s.NewFunction("index_byte2_asm", "Find the index of either of two bytes", []FunctionParam{{"data", ByteSlice}, {"b1", types.Byte}, {"b2", types.Byte}}, []FunctionParam{{"ans", types.Int}})
	if s.ISA.HasSIMD {
		s.indexbyte2_body(f)
	}
	f = s.NewFunction("index_byte2_string_asm", "Find the index of either of two bytes", []FunctionParam{{"data", types.String}, {"b1", types.Byte}, {"b2", types.Byte}}, []FunctionParam{{"ans", types.Int}})
	if s.ISA.HasSIMD {
		s.indexbyte2_body(f)
	}
}

func (s *State) indexbyte_body(f *Function) {
	b := f.Vec()
	f.Set1Epi8("b", b)
	test_bytes := func(bytes_to_test, test_ans Register) {
		f.CmpEqEpi8(bytes_to_test, b, test_ans)
	}
	s.index_func(f, test_bytes)
}

func (s *State) indexbyte() {
	f := s.NewFunction("index_byte_asm", "Find the index of a byte", []FunctionParam{{"data", ByteSlice}, {"b", types.Byte}}, []FunctionParam{{"ans", types.Int}})
	if s.ISA.HasSIMD {
		s.indexbyte_body(f)
	}
	f = s.NewFunction("index_byte_string_asm", "Find the index of a byte", []FunctionParam{{"data", types.String}, {"b", types.Byte}}, []FunctionParam{{"ans", types.Int}})
	if s.ISA.HasSIMD {
		s.indexbyte_body(f)
	}
}

func (s *State) indexc0_body(f *Function) {
	lower := f.Vec()
	upper := f.Vec()
	del := f.Vec()
	f.Set1Epi8(-1, lower)
	f.Set1Epi8(int(' '), upper)
	f.Set1Epi8(0x7f, del)
	test_bytes := func(bytes_to_test, test_ans Register) {
		temp := f.Vec()
		defer f.ReleaseReg(temp)
		f.CmpEqEpi8(bytes_to_test, del, test_ans)
		f.CmpLtEpi8(bytes_to_test, upper, temp)
		f.CmpGtEpi8(bytes_to_test, lower, bytes_to_test)
		f.And(temp, bytes_to_test, bytes_to_test)
		f.Or(test_ans, bytes_to_test, test_ans)
	}
	s.index_func(f, test_bytes)
}

func (s *State) indexc0() {
	f := s.NewFunction("index_c0_asm", "Find the index of a C0 control code", []FunctionParam{{"data", ByteSlice}}, []FunctionParam{{"ans", types.Int}})
	if s.ISA.HasSIMD {
		s.indexc0_body(f)
	}
	f = s.NewFunction("index_c0_string_asm", "Find the index of a C0 control code", []FunctionParam{{"data", types.String}}, []FunctionParam{{"ans", types.Int}})
	if s.ISA.HasSIMD {
		s.indexc0_body(f)
	}
}

func (s *State) Generate() {
	s.test_load()
	s.test_set1_epi8()
	s.test_cmpeq_epi8()
	s.test_cmplt_epi8()
	s.test_or()
	s.test_jump_if_zero()
	s.test_count_to_match()
	s.indexbyte2()
	s.indexc0()
	s.indexbyte()
	s.OutputFunction()
}

// CLI {{{

func exit(msg any) {
	fmt.Fprintf(os.Stderr, "%s\n", msg)
	os.Exit(1)
}
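
// write_file writes text to name, leaving the file untouched when the
// contents are already identical, so that timestamps only change when the
// generated output does.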
func write_file(name, text string) {
	b := unsafe.Slice(unsafe.StringData(text), len(text))
	if existing, err := os.ReadFile(name); err == nil && bytes.Equal(existing, b) {
		return
	}
	if err := os.WriteFile(name, b, 0664); err != nil {
		exit(err)
	}
}

func do_one(s *State) {
	s.Generate()
	if s.ISA.HasSIMD {
		write_file(fmt.Sprintf("asm_%d_%s_generated.s", s.ISA.Bits, s.ISA.Goarch), s.ASMOutput.String())
		write_file(fmt.Sprintf("asm_%d_%s_generated_test.s", s.ISA.Bits, s.ISA.Goarch), s.TestASMOutput.String())
	}
	write_file(fmt.Sprintf("asm_%d_%s_generated.go", s.ISA.Bits, s.ISA.Goarch), s.StubOutput.String())
	write_file(fmt.Sprintf("asm_%d_%s_generated_test.go", s.ISA.Bits, s.ISA.Goarch), s.TestStubOutput.String())
}

func create_isa(arch Arch, bits int) ISA {
	switch arch {
	case AMD64:
		return CreateAMD64ISA(bits)
	case ARM64:
		return CreateARM64ISA(bits)
	}
	panic("Unknown ISA arch")
}

func main() {
	output_dir, err := os.Getwd()
	if err != nil {
		exit(err)
	}
	if len(os.Args) > 1 {
		if output_dir, err = filepath.Abs(os.Args[len(os.Args)-1]); err != nil {
			exit(err)
		}
	}
	if err = os.MkdirAll(output_dir, 0755); err != nil {
		exit(err)
	}
	if err = os.Chdir(output_dir); err != nil {
		exit(err)
	}
	package_name = filepath.Base(output_dir)
	simd_arches := []Arch{AMD64, ARM64}
	a := make([]string, len(simd_arches))
	for i, arch := range simd_arches {
		a[i] = string(arch)
	}
	no_simd_build_tag := fmt.Sprintf("!(%s)", strings.Join(a, "||"))
	for _, bits := range []int{128, 256} {
		for _, arch := range simd_arches {
			s := NewState(create_isa(arch, bits))
			fmt.Fprintf(&s.StubOutput, "const HasSIMD%dCode = %#v\n", bits, s.ISA.HasSIMD)
			do_one(s)
		}
		s := NewState(CreateAMD64ISA(bits), no_simd_build_tag)
		s.ISA.HasSIMD = false
		fmt.Fprintf(&s.StubOutput, "const HasSIMD%dCode = false\n", bits)
		s.Generate()
		write_file(fmt.Sprintf("asm_other_%d_generated.go", bits), s.StubOutput.String())
		write_file(fmt.Sprintf("asm_other_%d_generated_test.go", bits), s.TestStubOutput.String())
	}
}

// }}}