// mul_bmi2_amd64.h
//
// Go assembler macros built on the BMI2 MULXQ instruction:
// mulBMI2 (4x4-word multiplication) and gfpReduceBMI2 (reduction step).
  1. #define mulBMI2(a0,a1,a2,a3, rb) \
  2. MOVQ a0, DX \
  3. MOVQ $0, R13 \
  4. MULXQ 0+rb, R8, R9 \
  5. MULXQ 8+rb, AX, R10 \
  6. ADDQ AX, R9 \
  7. MULXQ 16+rb, AX, R11 \
  8. ADCQ AX, R10 \
  9. MULXQ 24+rb, AX, R12 \
  10. ADCQ AX, R11 \
  11. ADCQ $0, R12 \
  12. ADCQ $0, R13 \
  13. \
  14. MOVQ a1, DX \
  15. MOVQ $0, R14 \
  16. MULXQ 0+rb, AX, BX \
  17. ADDQ AX, R9 \
  18. ADCQ BX, R10 \
  19. MULXQ 16+rb, AX, BX \
  20. ADCQ AX, R11 \
  21. ADCQ BX, R12 \
  22. ADCQ $0, R13 \
  23. MULXQ 8+rb, AX, BX \
  24. ADDQ AX, R10 \
  25. ADCQ BX, R11 \
  26. MULXQ 24+rb, AX, BX \
  27. ADCQ AX, R12 \
  28. ADCQ BX, R13 \
  29. ADCQ $0, R14 \
  30. \
  31. MOVQ a2, DX \
  32. MOVQ $0, R15 \
  33. MULXQ 0+rb, AX, BX \
  34. ADDQ AX, R10 \
  35. ADCQ BX, R11 \
  36. MULXQ 16+rb, AX, BX \
  37. ADCQ AX, R12 \
  38. ADCQ BX, R13 \
  39. ADCQ $0, R14 \
  40. MULXQ 8+rb, AX, BX \
  41. ADDQ AX, R11 \
  42. ADCQ BX, R12 \
  43. MULXQ 24+rb, AX, BX \
  44. ADCQ AX, R13 \
  45. ADCQ BX, R14 \
  46. ADCQ $0, R15 \
  47. \
  48. MOVQ a3, DX \
  49. MULXQ 0+rb, AX, BX \
  50. ADDQ AX, R11 \
  51. ADCQ BX, R12 \
  52. MULXQ 16+rb, AX, BX \
  53. ADCQ AX, R13 \
  54. ADCQ BX, R14 \
  55. ADCQ $0, R15 \
  56. MULXQ 8+rb, AX, BX \
  57. ADDQ AX, R12 \
  58. ADCQ BX, R13 \
  59. MULXQ 24+rb, AX, BX \
  60. ADCQ AX, R14 \
  61. ADCQ BX, R15
  62. #define gfpReduceBMI2() \
  63. \ // m = (T * N') mod R, store m in R8:R9:R10:R11
  64. MOVQ ·np+0(SB), DX \
  65. MULXQ 0(SP), R8, R9 \
  66. MULXQ 8(SP), AX, R10 \
  67. ADDQ AX, R9 \
  68. MULXQ 16(SP), AX, R11 \
  69. ADCQ AX, R10 \
  70. MULXQ 24(SP), AX, BX \
  71. ADCQ AX, R11 \
  72. \
  73. MOVQ ·np+8(SB), DX \
  74. MULXQ 0(SP), AX, BX \
  75. ADDQ AX, R9 \
  76. ADCQ BX, R10 \
  77. MULXQ 16(SP), AX, BX \
  78. ADCQ AX, R11 \
  79. MULXQ 8(SP), AX, BX \
  80. ADDQ AX, R10 \
  81. ADCQ BX, R11 \
  82. \
  83. MOVQ ·np+16(SB), DX \
  84. MULXQ 0(SP), AX, BX \
  85. ADDQ AX, R10 \
  86. ADCQ BX, R11 \
  87. MULXQ 8(SP), AX, BX \
  88. ADDQ AX, R11 \
  89. \
  90. MOVQ ·np+24(SB), DX \
  91. MULXQ 0(SP), AX, BX \
  92. ADDQ AX, R11 \
  93. \
  94. storeBlock(R8,R9,R10,R11, 64(SP)) \
  95. \
  96. \ // m * N
  97. mulBMI2(·p2+0(SB),·p2+8(SB),·p2+16(SB),·p2+24(SB), 64(SP)) \
  98. \
  99. \ // Add the 512-bit intermediate to m*N
  100. MOVQ $0, AX \
  101. ADDQ 0(SP), R8 \
  102. ADCQ 8(SP), R9 \
  103. ADCQ 16(SP), R10 \
  104. ADCQ 24(SP), R11 \
  105. ADCQ 32(SP), R12 \
  106. ADCQ 40(SP), R13 \
  107. ADCQ 48(SP), R14 \
  108. ADCQ 56(SP), R15 \
  109. ADCQ $0, AX \
  110. \
  111. gfpCarry(R12,R13,R14,R15,AX, R8,R9,R10,R11,BX)