123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182 |
- #define mul(a0,a1,a2,a3, rb, stack) \ // 4x4-word schoolbook multiply: (a3:a2:a1:a0) * (words at 0..24+rb) -> 8 words at 0..56+stack, least significant word first; directly uses AX, DX, R8-R12 and the flags
- MOVQ a0, AX \ // row 0: a0 * (words at rb) -> R12:R11:R10:R9:R8
- MULQ 0+rb \ // DX:AX = AX * word at 0+rb
- MOVQ AX, R8 \
- MOVQ DX, R9 \
- MOVQ a0, AX \ // MULQ overwrote AX, so the multiplier is reloaded before every product
- MULQ 8+rb \
- ADDQ AX, R9 \ // low half joins the previous product's high half
- ADCQ $0, DX \ // carry goes into the new high half (a 64x64 high half is at most 2^64-2, so this cannot overflow)
- MOVQ DX, R10 \
- MOVQ a0, AX \
- MULQ 16+rb \
- ADDQ AX, R10 \
- ADCQ $0, DX \
- MOVQ DX, R11 \
- MOVQ a0, AX \
- MULQ 24+rb \
- ADDQ AX, R11 \
- ADCQ $0, DX \
- MOVQ DX, R12 \
- \ // row 0 starts the result: five words go to 0..32+stack (assumes storeBlock, defined elsewhere, writes its four registers to consecutive words - confirm)
- storeBlock(R8,R9,R10,R11, 0+stack) \
- MOVQ R12, 32+stack \
- \ // row 1: a1 * (words at rb), same register pattern as row 0
- MOVQ a1, AX \
- MULQ 0+rb \
- MOVQ AX, R8 \
- MOVQ DX, R9 \
- MOVQ a1, AX \
- MULQ 8+rb \
- ADDQ AX, R9 \
- ADCQ $0, DX \
- MOVQ DX, R10 \
- MOVQ a1, AX \
- MULQ 16+rb \
- ADDQ AX, R10 \
- ADCQ $0, DX \
- MOVQ DX, R11 \
- MOVQ a1, AX \
- MULQ 24+rb \
- ADDQ AX, R11 \
- ADCQ $0, DX \
- MOVQ DX, R12 \
- \ // row 1 carries one word more weight: add the running words at 8..32+stack, carry into R12, write back to 8..40+stack
- ADDQ 8+stack, R8 \
- ADCQ 16+stack, R9 \
- ADCQ 24+stack, R10 \
- ADCQ 32+stack, R11 \
- ADCQ $0, R12 \
- storeBlock(R8,R9,R10,R11, 8+stack) \
- MOVQ R12, 40+stack \
- \ // row 2: a2 * (words at rb)
- MOVQ a2, AX \
- MULQ 0+rb \
- MOVQ AX, R8 \
- MOVQ DX, R9 \
- MOVQ a2, AX \
- MULQ 8+rb \
- ADDQ AX, R9 \
- ADCQ $0, DX \
- MOVQ DX, R10 \
- MOVQ a2, AX \
- MULQ 16+rb \
- ADDQ AX, R10 \
- ADCQ $0, DX \
- MOVQ DX, R11 \
- MOVQ a2, AX \
- MULQ 24+rb \
- ADDQ AX, R11 \
- ADCQ $0, DX \
- MOVQ DX, R12 \
- \ // add the running words at 16..40+stack, write back to 16..48+stack
- ADDQ 16+stack, R8 \
- ADCQ 24+stack, R9 \
- ADCQ 32+stack, R10 \
- ADCQ 40+stack, R11 \
- ADCQ $0, R12 \
- storeBlock(R8,R9,R10,R11, 16+stack) \
- MOVQ R12, 48+stack \
- \ // row 3: a3 * (words at rb)
- MOVQ a3, AX \
- MULQ 0+rb \
- MOVQ AX, R8 \
- MOVQ DX, R9 \
- MOVQ a3, AX \
- MULQ 8+rb \
- ADDQ AX, R9 \
- ADCQ $0, DX \
- MOVQ DX, R10 \
- MOVQ a3, AX \
- MULQ 16+rb \
- ADDQ AX, R10 \
- ADCQ $0, DX \
- MOVQ DX, R11 \
- MOVQ a3, AX \
- MULQ 24+rb \
- ADDQ AX, R11 \
- ADCQ $0, DX \
- MOVQ DX, R12 \
- \ // add the running words at 24..48+stack, write back to 24..56+stack
- ADDQ 24+stack, R8 \
- ADCQ 32+stack, R9 \
- ADCQ 40+stack, R10 \
- ADCQ 48+stack, R11 \
- ADCQ $0, R12 \
- storeBlock(R8,R9,R10,R11, 24+stack) \
- MOVQ R12, 56+stack // top word of the 8-word product
- #define gfpReduce(stack) \ // reduction step on the 8-word value T at 0..56+stack: m = T*N' mod 2^256, then T + m*N, then gfpCarry; uses 64..152+stack as scratch and AX, DX, R8-R15 (BX is handed to gfpCarry)
- \ // m = (T * N') mod R, store m in R8:R9:R10:R11. N' is read from ·np; only four result words are kept, so only the words of T at 0..24+stack take part.
- MOVQ ·np+0(SB), AX \ // np[0] * T[0..3] fills words 0..3
- MULQ 0+stack \
- MOVQ AX, R8 \
- MOVQ DX, R9 \
- MOVQ ·np+0(SB), AX \
- MULQ 8+stack \
- ADDQ AX, R9 \
- ADCQ $0, DX \
- MOVQ DX, R10 \
- MOVQ ·np+0(SB), AX \
- MULQ 16+stack \
- ADDQ AX, R10 \
- ADCQ $0, DX \
- MOVQ DX, R11 \
- MOVQ ·np+0(SB), AX \
- MULQ 24+stack \
- ADDQ AX, R11 \ // the high half (DX) would land in word 4, so it is dropped
- \ // np[1] * T[0..2] -> R14:R13:R12, which lines up with words 1..3
- MOVQ ·np+8(SB), AX \
- MULQ 0+stack \
- MOVQ AX, R12 \
- MOVQ DX, R13 \
- MOVQ ·np+8(SB), AX \
- MULQ 8+stack \
- ADDQ AX, R13 \
- ADCQ $0, DX \
- MOVQ DX, R14 \
- MOVQ ·np+8(SB), AX \
- MULQ 16+stack \
- ADDQ AX, R14 \ // DX again falls outside the four kept words
- \ // add that row into words 1..3; the carry out of word 3 is discarded
- ADDQ R12, R9 \
- ADCQ R13, R10 \
- ADCQ R14, R11 \
- \ // np[2] * T[0..1] -> R13:R12, which lines up with words 2..3
- MOVQ ·np+16(SB), AX \
- MULQ 0+stack \
- MOVQ AX, R12 \
- MOVQ DX, R13 \
- MOVQ ·np+16(SB), AX \
- MULQ 8+stack \
- ADDQ AX, R13 \
- \ // add that row into words 2..3
- ADDQ R12, R10 \
- ADCQ R13, R11 \
- \ // np[3] * T[0]: only its low half reaches word 3
- MOVQ ·np+24(SB), AX \
- MULQ 0+stack \
- ADDQ AX, R11 \
- \ // park m at 64..88+stack so that mul can read it from memory (storeBlock is defined elsewhere)
- storeBlock(R8,R9,R10,R11, 64+stack) \
- \
- \ // m * N: the words of N are read from ·p2 (defined elsewhere), m from 64+stack; the 8-word product is written to 96..152+stack
- mul(·p2+0(SB),·p2+8(SB),·p2+16(SB),·p2+24(SB), 64+stack, 96+stack) \
- \
- \ // Add the 512-bit intermediate (T, still at 0..56+stack) to m*N
- loadBlock(96+stack, R8,R9,R10,R11) \ // low four words of m*N (loadBlock is defined elsewhere; assumed to read consecutive words)
- loadBlock(128+stack, R12,R13,R14,R15) \ // high four words of m*N
- \ // AX is cleared first so it can collect the carry out of the 8-word addition
- MOVQ $0, AX \
- ADDQ 0+stack, R8 \
- ADCQ 8+stack, R9 \
- ADCQ 16+stack, R10 \
- ADCQ 24+stack, R11 \
- ADCQ 32+stack, R12 \
- ADCQ 40+stack, R13 \
- ADCQ 48+stack, R14 \
- ADCQ 56+stack, R15 \
- ADCQ $0, AX \ // AX = carry out of the top word
- \ // R15:R14:R13:R12 plus the carry in AX are the upper half of the sum. NOTE(review): gfpCarry is defined elsewhere; presumably it performs the final conditional subtraction of the modulus using R8-R11 and BX as scratch - confirm against its definition
- gfpCarry(R12,R13,R14,R15,AX, R8,R9,R10,R11,BX)
|