gfp_amd64.s

// +build amd64,!generic
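
// Arithmetic on 256-bit field elements held as four 64-bit little-endian
// limbs. The global ·p2 referenced below is assumed to hold the field
// modulus p in the same four-limb form; it is defined on the Go side of the
// package, not in this file.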

// storeBlock writes the four 64-bit limbs a0..a3 to the 32-byte block at r.
#define storeBlock(a0,a1,a2,a3, r) \
	MOVQ a0, 0+r \
	MOVQ a1, 8+r \
	MOVQ a2, 16+r \
	MOVQ a3, 24+r

// loadBlock reads the 32-byte block at r into the four 64-bit limbs a0..a3.
#define loadBlock(r, a0,a1,a2,a3) \
	MOVQ 0+r, a0 \
	MOVQ 8+r, a1 \
	MOVQ 16+r, a2 \
	MOVQ 24+r, a3
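
// gfpCarry conditionally subtracts the modulus from the five-word value
// a4:a3..a0 (four limbs plus a carry word): it computes b = a - p and, if
// the subtraction does not borrow, keeps b; otherwise a was already below p
// and is left unchanged. Only the low four limbs a0..a3 receive the result.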
#define gfpCarry(a0,a1,a2,a3,a4, b0,b1,b2,b3,b4) \
	\ // b = a-p
	MOVQ a0, b0 \
	MOVQ a1, b1 \
	MOVQ a2, b2 \
	MOVQ a3, b3 \
	MOVQ a4, b4 \
	\
	SUBQ ·p2+0(SB), b0 \
	SBBQ ·p2+8(SB), b1 \
	SBBQ ·p2+16(SB), b2 \
	SBBQ ·p2+24(SB), b3 \
	SBBQ $0, b4 \
	\
	\ // if b is negative then return a
	\ // else return b
	CMOVQCC b0, a0 \
	CMOVQCC b1, a1 \
	CMOVQCC b2, a2 \
	CMOVQCC b3, a3

#include "mul_amd64.h"
#include "mul_bmi2_amd64.h"
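
// gfpNeg computes c = -a mod p. The $0-16 frame matches a Go declaration of
// roughly the form func gfpNeg(c, a *gfP) (a sketch; the real declaration
// lives elsewhere in the package). It loads p from ·p2, subtracts a
// limb-wise, and reduces the result with gfpCarry.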
TEXT ·gfpNeg(SB),0,$0-16
	MOVQ ·p2+0(SB), R8
	MOVQ ·p2+8(SB), R9
	MOVQ ·p2+16(SB), R10
	MOVQ ·p2+24(SB), R11

	MOVQ a+8(FP), DI
	SUBQ 0(DI), R8
	SBBQ 8(DI), R9
	SBBQ 16(DI), R10
	SBBQ 24(DI), R11

	MOVQ $0, AX
	gfpCarry(R8,R9,R10,R11,AX, R12,R13,R14,R15,BX)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET
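
// gfpAdd computes c = a + b mod p ($0-24 suggests func gfpAdd(c, a, b *gfP),
// again only a sketch of the Go-side declaration). The limb-wise add
// captures the final carry in R12 so gfpCarry can reduce the full
// five-word sum.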
TEXT ·gfpAdd(SB),0,$0-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	loadBlock(0(DI), R8,R9,R10,R11)
	MOVQ $0, R12

	ADDQ 0(SI), R8
	ADCQ 8(SI), R9
	ADCQ 16(SI), R10
	ADCQ 24(SI), R11
	ADCQ $0, R12

	gfpCarry(R8,R9,R10,R11,R12, R13,R14,R15,AX,BX)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET
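
// gfpSub computes c = a - b mod p. It pre-loads p from ·p2, subtracts b from
// a limb-wise, and uses CMOVQCC to zero the p registers when no borrow
// occurred, so the final add puts back a - b + p only on underflow.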
TEXT ·gfpSub(SB),0,$0-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	loadBlock(0(DI), R8,R9,R10,R11)

	MOVQ ·p2+0(SB), R12
	MOVQ ·p2+8(SB), R13
	MOVQ ·p2+16(SB), R14
	MOVQ ·p2+24(SB), R15
	MOVQ $0, AX

	SUBQ 0(SI), R8
	SBBQ 8(SI), R9
	SBBQ 16(SI), R10
	SBBQ 24(SI), R11

	CMOVQCC AX, R12
	CMOVQCC AX, R13
	CMOVQCC AX, R14
	CMOVQCC AX, R15

	ADDQ R12, R8
	ADCQ R13, R9
	ADCQ R14, R10
	ADCQ R15, R11

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET
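
// gfpMul computes c = a * b mod p using the mul/mulBMI2 macros from the
// included headers. mulBMI2 appears to leave the 512-bit product in R8-R15
// (it is stored to the 160-byte stack frame right after), while mul writes
// it directly to the stack; gfpReduce/gfpReduceBMI2 are assumed to perform
// the modular (Montgomery-style) reduction, leaving the result in R12-R15
// for the final store.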
TEXT ·gfpMul(SB),0,$160-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	// Jump to a slightly different implementation if MULX isn't supported.
	CMPB runtime·support_bmi2(SB), $0
	JE   nobmi2Mul

	mulBMI2(0(DI),8(DI),16(DI),24(DI), 0(SI))
	storeBlock( R8, R9,R10,R11, 0(SP))
	storeBlock(R12,R13,R14,R15, 32(SP))
	gfpReduceBMI2()
	JMP end

nobmi2Mul:
	mul(0(DI),8(DI),16(DI),24(DI), 0(SI), 0(SP))
	gfpReduce(0(SP))

end:
	MOVQ c+0(FP), DI
	storeBlock(R12,R13,R14,R15, 0(DI))
	RET
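
// For reference, Go-side declarations matching the argument offsets and
// frame sizes above would look roughly like the following sketch; the real
// declarations and the gfP type live elsewhere in the package:
//
//	//go:noescape
//	func gfpNeg(c, a *gfP)
//
//	//go:noescape
//	func gfpAdd(c, a, b *gfP)
//
//	//go:noescape
//	func gfpSub(c, a, b *gfP)
//
//	//go:noescape
//	func gfpMul(c, a, b *gfP)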