123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130 |
- // +build amd64,!generic
- // storeBlock writes four 64-bit words to consecutive 8-byte slots of the
- // memory operand dst: w0 at offset 0, w1 at 8, w2 at 16 and w3 at 24.
- #define storeBlock(w0,w1,w2,w3, dst) \
- MOVQ w0, 0+dst \
- MOVQ w1, 8+dst \
- MOVQ w2, 16+dst \
- MOVQ w3, 24+dst
- // loadBlock reads four consecutive 64-bit words from the memory operand src
- // (offsets 0, 8, 16 and 24) into w0..w3, in ascending offset order.
- #define loadBlock(src, w0,w1,w2,w3) \
- MOVQ 0+src, w0 \
- MOVQ 8+src, w1 \
- MOVQ 16+src, w2 \
- MOVQ 24+src, w3
- #define gfpCarry(a0,a1,a2,a3,a4, b0,b1,b2,b3,b4) \ // conditionally subtract p2 from the five-word value a4:a0 (a0 least significant); b0..b4 are scratch and the flags are clobbered
- \ // b = a - p2: copy a into the scratch words, then subtract with borrow
- MOVQ a0, b0 \
- MOVQ a1, b1 \
- MOVQ a2, b2 \
- MOVQ a3, b3 \
- MOVQ a4, b4 \
- \
- SUBQ ·p2+0(SB), b0 \ // p2 is read as four 64-bit words at ·p2+0 .. ·p2+24
- SBBQ ·p2+8(SB), b1 \
- SBBQ ·p2+16(SB), b2 \
- SBBQ ·p2+24(SB), b3 \
- SBBQ $0, b4 \ // propagate the borrow through the top word; CF is now set iff a < p2
- \
- \ // if the subtraction borrowed (b is negative), keep a
- \ // otherwise replace the low four words of a with b; a4 is never written
- CMOVQCC b0, a0 \ // CC = carry clear, i.e. no borrow
- CMOVQCC b1, a1 \
- CMOVQCC b2, a2 \
- CMOVQCC b3, a3
- #include "mul_amd64.h"
- #include "mul_bmi2_amd64.h"
- // gfpNeg computes p2 - a, applies one conditional subtraction of p2 (so that
- // a == 0 yields 0 rather than p2), and stores the four-word result to c.
- //
- // Arguments (16 bytes, no local frame): c+0(FP) is the destination pointer,
- // a+8(FP) the source pointer. Each points at four 64-bit words, least
- // significant first. The code assumes a <= p2; this is not checked.
- //
- // Registers: R8-R11 carry the result; R12, R13, R14, CX and BX are scratch for
- // gfpCarry. R15 is deliberately not used: when assembling for dynamic linking
- // the Go toolchain uses R15 as the temporary for global accesses such as
- // ·p2(SB), so a value kept there across gfpCarry would be overwritten.
- TEXT ·gfpNeg(SB),0,$0-16
- MOVQ ·p2+0(SB), R8 // R11:R8 = p2
- MOVQ ·p2+8(SB), R9
- MOVQ ·p2+16(SB), R10
- MOVQ ·p2+24(SB), R11
- MOVQ a+8(FP), DI // DI = a
- SUBQ 0(DI), R8 // R11:R8 = p2 - a
- SBBQ 8(DI), R9
- SBBQ 16(DI), R10
- SBBQ 24(DI), R11
- MOVQ $0, AX // fifth (top) word handed to gfpCarry is zero
- gfpCarry(R8,R9,R10,R11,AX, R12,R13,R14,CX,BX)
- MOVQ c+0(FP), DI // DI = c
- storeBlock(R8,R9,R10,R11, 0(DI))
- RET
- // gfpAdd computes a + b as a five-word sum, subtracts p2 once if the sum is
- // not smaller than p2, and stores the low four words to c.
- //
- // Arguments (24 bytes, no local frame): c+0(FP) is the destination pointer,
- // a+8(FP) and b+16(FP) the operand pointers. Each points at four 64-bit
- // words, least significant first.
- //
- // Registers: R8-R11 carry the sum and R12 its carry-out; R13, R14, CX, AX and
- // BX are scratch for gfpCarry. R15 is deliberately not used: when assembling
- // for dynamic linking the Go toolchain uses R15 as the temporary for global
- // accesses such as ·p2(SB), so a value kept there across gfpCarry would be
- // overwritten.
- TEXT ·gfpAdd(SB),0,$0-24
- MOVQ a+8(FP), DI // DI = a
- MOVQ b+16(FP), SI // SI = b
- loadBlock(0(DI), R8,R9,R10,R11)
- MOVQ $0, R12 // cleared before the add chain so it can collect the carry-out
- ADDQ 0(SI), R8 // R12:R11:...:R8 = a + b
- ADCQ 8(SI), R9
- ADCQ 16(SI), R10
- ADCQ 24(SI), R11
- ADCQ $0, R12
- gfpCarry(R8,R9,R10,R11,R12, R13,R14,CX,AX,BX)
- MOVQ c+0(FP), DI // DI = c
- storeBlock(R8,R9,R10,R11, 0(DI))
- RET
- // gfpSub computes a - b and, if that subtraction borrowed, adds p2 back
- // before storing the four-word result to c.
- //
- // Arguments (24 bytes, no local frame): c+0(FP) is the destination pointer,
- // a+8(FP) and b+16(FP) the operand pointers. Each points at four 64-bit
- // words, least significant first.
- //
- // Registers: R8-R11 carry the difference; R12, R13, R14 and CX hold the
- // correction term (p2 or zero); AX is a zero constant. R15 is deliberately not
- // used: when assembling for dynamic linking the Go toolchain reserves R15 as
- // the temporary for global accesses such as ·p2(SB).
- TEXT ·gfpSub(SB),0,$0-24
- MOVQ a+8(FP), DI // DI = a
- MOVQ b+16(FP), SI // SI = b
- loadBlock(0(DI), R8,R9,R10,R11)
- MOVQ ·p2+0(SB), R12 // CX:R14:R13:R12 = p2
- MOVQ ·p2+8(SB), R13
- MOVQ ·p2+16(SB), R14
- MOVQ ·p2+24(SB), CX
- MOVQ $0, AX // zero used to cancel the correction term below
- SUBQ 0(SI), R8 // R11:R8 = a - b; CF is set afterwards iff a < b
- SBBQ 8(SI), R9
- SBBQ 16(SI), R10
- SBBQ 24(SI), R11
- CMOVQCC AX, R12 // no borrow: the correction term becomes zero
- CMOVQCC AX, R13
- CMOVQCC AX, R14
- CMOVQCC AX, CX
- ADDQ R12, R8 // add p2 (after a borrow) or zero
- ADCQ R13, R9
- ADCQ R14, R10
- ADCQ CX, R11
- MOVQ c+0(FP), DI // DI = c
- storeBlock(R8,R9,R10,R11, 0(DI))
- RET
- TEXT ·gfpMul(SB),0,$160-24 // gfpMul(c, a, b): pointers at c+0(FP), a+8(FP), b+16(FP); 160-byte local frame; result words are stored to c from R12..R15
- MOVQ a+8(FP), DI // DI = a
- MOVQ b+16(FP), SI // SI = b
- // Take the non-MULX path below when the BMI2 flag is zero.
- CMPB runtime·support_bmi2(SB), $0 // NOTE(review): reads a runtime-internal symbol; confirm it exists in the Go release being targeted
- JE nobmi2Mul
- mulBMI2(0(DI),8(DI),16(DI),24(DI), 0(SI)) // macro not defined in this chunk; presumably leaves the eight-word product in R8..R15 (inferred from the two stores that follow) - confirm
- storeBlock( R8, R9,R10,R11, 0(SP)) // spill R8..R11 to 0..31(SP)
- storeBlock(R12,R13,R14,R15, 32(SP)) // spill R12..R15 to 32..63(SP); NOTE(review): R15 is the toolchain's temporary for global accesses under dynamic linking - confirm this path is safe in that mode
- gfpReduceBMI2() // macro not defined in this chunk; presumably reduces the spilled product and leaves the result in R12..R15 - confirm
- JMP end
- nobmi2Mul:
- mul(0(DI),8(DI),16(DI),24(DI), 0(SI), 0(SP)) // macro not defined in this chunk; receives the four words of a, the base of b and the stack buffer at 0(SP)
- gfpReduce(0(SP)) // macro not defined in this chunk; presumably leaves the reduced result in R12..R15 like the BMI2 path - confirm
- end:
- MOVQ c+0(FP), DI // DI = c
- storeBlock(R12,R13,R14,R15, 0(DI)) // both paths arrive here with the result in R12..R15
- RET
|