123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121 |
- .section .text..SHmedia32,"ax"
- .align 2
- .global __udivdi3
- __udivdi3:
- shlri r3,1,r4
- nsb r4,r22
- shlld r3,r22,r6
- shlri r6,49,r5
- movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */
- sub r21,r5,r1
- mmulfx.w r1,r1,r4
- mshflo.w r1,r63,r1
- sub r63,r22,r20 // r63 == 64 % 64
- mmulfx.w r5,r4,r4
- pta large_divisor,tr0
- addi r20,32,r9
- msub.w r1,r4,r1
- madd.w r1,r1,r1
- mmulfx.w r1,r1,r4
- shlri r6,32,r7
- bgt/u r9,r63,tr0 // large_divisor
- mmulfx.w r5,r4,r4
- shlri r2,32+14,r19
- addi r22,-31,r0
- msub.w r1,r4,r1
- mulu.l r1,r7,r4
- addi r1,-3,r5
- mulu.l r5,r19,r5
- sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
- shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
- the case may be, %0000000000000000 000.11111111111, still */
- muls.l r1,r4,r4 /* leaving at least one sign bit. */
- mulu.l r5,r3,r8
- mshalds.l r1,r21,r1
- shari r4,26,r4
- shlld r8,r0,r8
- add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
- sub r2,r8,r2
- /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */
- shlri r2,22,r21
- mulu.l r21,r1,r21
- shlld r5,r0,r8
- addi r20,30-22,r0
- shlrd r21,r0,r21
- mulu.l r21,r3,r5
- add r8,r21,r8
- mcmpgt.l r21,r63,r21 // See Note 1
- addi r20,30,r0
- mshfhi.l r63,r21,r21
- sub r2,r5,r2
- andc r2,r21,r2
- /* small divisor: need a third divide step */
- mulu.l r2,r1,r7
- ptabs r18,tr0
- addi r2,1,r2
- shlrd r7,r0,r7
- mulu.l r7,r3,r5
- add r8,r7,r8
- sub r2,r3,r2
- cmpgt r2,r5,r5
- add r8,r5,r2
- /* could test r3 here to check for divide by zero. */
- blink tr0,r63
- large_divisor:
- mmulfx.w r5,r4,r4
- shlrd r2,r9,r25
- shlri r25,32,r8
- msub.w r1,r4,r1
- mulu.l r1,r7,r4
- addi r1,-3,r5
- mulu.l r5,r8,r5
- sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
- shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
- the case may be, %0000000000000000 000.11111111111, still */
- muls.l r1,r4,r4 /* leaving at least one sign bit. */
- shlri r5,14-1,r8
- mulu.l r8,r7,r5
- mshalds.l r1,r21,r1
- shari r4,26,r4
- add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
- sub r25,r5,r25
- /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */
- shlri r25,22,r21
- mulu.l r21,r1,r21
- pta no_lo_adj,tr0
- addi r22,32,r0
- shlri r21,40,r21
- mulu.l r21,r7,r5
- add r8,r21,r8
- shlld r2,r0,r2
- sub r25,r5,r25
- bgtu/u r7,r25,tr0 // no_lo_adj
- addi r8,1,r8
- sub r25,r7,r25
- no_lo_adj:
- mextr4 r2,r25,r2
- /* large_divisor: only needs a few adjustments. */
- mulu.l r8,r6,r5
- ptabs r18,tr0
- /* bubble */
- cmpgtu r5,r2,r5
- sub r8,r5,r2
- blink tr0,r63
-
- /* Note 1: To shift the result of the second divide stage so that the result
- always fits into 32 bits, yet we still reduce the rest sufficiently
- would require a lot of instructions to do the shifts just right. Using
- the full 64 bit shift result to multiply with the divisor would require
- four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
- Fortunately, if the upper 32 bits of the shift result are nonzero, we
- know that the rest after taking this partial result into account will
- fit into 32 bits. So we just clear the upper 32 bits of the rest if the
- upper 32 bits of the partial result are nonzero. */
|