123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304 |
- .global _rsqrt_inaccurate # every entry point is exported twice: with a leading underscore and without
- .global rsqrt_inaccurate
- .global _rsqrt_full
- .global rsqrt_full
- .global _rsqrt_full_gpr
- .global rsqrt_full_gpr
- .global _rsqrt_full_nb
- .global rsqrt_full_nb
- .global _rsqrt_full_nb2
- .global rsqrt_full_nb2
- .global _rsqrt_full_nb_gpr
- .global rsqrt_full_nb_gpr
- .global _rsqrt_newton
- .global rsqrt_newton
- .global _rsqrt_hack
- .global rsqrt_hack
- .global _rsqrt_fallback # only the underscore form; used by the slow paths, no definition is visible in this part of the file
- .text # the constants below live in the code section next to the functions that read them
- .intel_syntax noprefix # Intel operand order (dst, src), registers written without '%'
- .align 16 # every constant below is 4 longs = 16 bytes, so each stays 16-byte aligned for the 128-bit memory operands (pand/por/paddd/movaps)
- min_pos_denorm: # NOTE: despite the name, 0x00800000 is the smallest positive *normal* float32 (2^-126); used as the "needs slow path" threshold
- .long 0x00800000,0,0,0 # value in lane 0, upper three lanes zero (same layout for all constants here)
- penultimate_bit: # bit 15, the lowest bit kept by top_mask; ORed into the input
- .long 0x00008000,0,0,0
- ultimate_bit: # bit 14, half of the lowest kept bit; added before masking to round to nearest
- .long 0x00004000,0,0,0
- top_mask: # keeps bits 31..15 (sign, 8 exponent bits, top 8 mantissa bits) and clears bits 14..0
- .long 0xFFFF8000,0,0,0
- one: # 1.0f
- .long 0x3f800000,0,0,0
- half: # 0.5f
- .long 0x3f000000,0,0,0
- one_point_five: # 1.5f
- .long 0x3fc00000,0,0,0
- magic1: # integer bias added to (input << 8) in rsqrt_hack
- .long 0x60000000,0,0,0
- magic2: # signed threshold the biased value is compared against in rsqrt_hack
- .long 0x3c000000,0,0,0
- magic3: # base integer correction added to the rsqrtps estimate bits in rsqrt_hack
- .long 0x000047ff,0,0,0
- _rsqrt_inaccurate: # underscore-prefixed alias of the entry point below
- rsqrt_inaccurate: # in: edi = float32 bit pattern; out: eax = bit pattern of the raw rsqrtss estimate (no masking, rounding or special-case handling)
- movd xmm0, edi # input bits -> lane 0, upper lanes zeroed
- rsqrtss xmm0, xmm0 # hardware approximation of 1/sqrt(x)
- movd eax, xmm0 # return the estimate bits unchanged
- ret
- _rsqrt_full: # underscore-prefixed alias of the entry point below
- rsqrt_full: # in: edi = float32 bits; out: eax = 1/sqrt via sqrtss+divss, rounded at bit 15; special inputs branch to rsqrt_full_bad
- movd xmm0, edi # input bits -> lane 0, upper lanes zeroed
- pand xmm0, [rip + top_mask] # clear input bits 14..0
- por xmm0, [rip + penultimate_bit] # force bit 15 to 1
- vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] # lane 0 = all-ones when NOT (x > 0x00800000 as float); unordered (NaN) also yields all-ones
- ptest xmm1, xmm1 # ZF=0 iff the compare mask is set
- jnz rsqrt_full_bad # slow path; edi still holds the unmodified input
- sqrtss xmm0, xmm0 # sqrt(x)
- movd xmm1, [rip + one] # 1.0f
- divss xmm1, xmm0 # 1.0 / sqrt(x)
- paddd xmm1, [rip + ultimate_bit] # integer add of half the lowest kept bit ...
- pand xmm1, [rip + top_mask] # ... then clear bits 14..0: round-to-nearest at bit 15
- movd eax, xmm1 # result bits
- ret
- _rsqrt_full_gpr: # underscore-prefixed alias of the entry point below
- rsqrt_full_gpr: # same computation as rsqrt_full, but input masking and output rounding are done in eax instead of with SSE memory operands
- movd eax, xmm0 # Emulate regalloc mov (eax is overwritten by the next instruction)
- mov eax, edi # in: edi = float32 bits
- and eax, 0xFFFF8000 # clear input bits 14..0 (same value as top_mask)
- or eax, 0x00008000 # force bit 15 to 1 (same value as penultimate_bit)
- movd xmm0, eax # masked input -> lane 0, upper lanes zeroed
- vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] # lane 0 = all-ones when NOT (x > 0x00800000 as float), including NaN
- ptest xmm1, xmm1 # ZF=0 iff the compare mask is set
- jnz rsqrt_full_bad # slow path; edi still holds the unmodified input
- sqrtss xmm0, xmm0 # sqrt(x)
- movd xmm1, [rip + one] # 1.0f
- divss xmm1, xmm0 # 1.0 / sqrt(x)
- movd eax, xmm1 # result bits to GPR for rounding
- add eax, 0x00004000 # add half of the lowest kept bit ...
- and eax, 0xffff8000 # ... then clear bits 14..0: round-to-nearest at bit 15
- movd xmm0, eax # Emulate regalloc mov (the result also stays in eax)
- ret
- _rsqrt_full_nb2: # underscore-prefixed alias of the entry point below
- rsqrt_full_nb2: # like rsqrt_full_nb, but the range check is a single ucomiss + flag branch instead of a compare mask + ptest
- movd xmm0, edi # in: edi = float32 bits
- pand xmm0, [rip + top_mask] # clear input bits 14..0
- por xmm0, [rip + penultimate_bit] # force bit 15 to 1
- ucomiss xmm0, [rip + min_pos_denorm] # compare with 0x00800000 as float; unordered sets ZF=PF=CF=1
- jna rsqrt_full_bad_new1 # taken when x <= threshold or unordered (NaN); edi still holds the unmodified input
- sqrtss xmm0, xmm0 # sqrt(x)
- movd xmm1, [rip + one] # 1.0f
- divss xmm1, xmm0 # 1.0 / sqrt(x)
- paddd xmm1, [rip + ultimate_bit] # integer add of half the lowest kept bit ...
- pand xmm1, [rip + top_mask] # ... then clear bits 14..0: round-to-nearest at bit 15
- movd eax, xmm1 # out: eax = result bits
- ret
- _rsqrt_full_nb: # underscore-prefixed alias of the entry point below
- rsqrt_full_nb: # same fast path as rsqrt_full, but special inputs branch to rsqrt_full_bad_new1 instead of rsqrt_full_bad
- movd xmm0, edi # in: edi = float32 bits
- pand xmm0, [rip + top_mask] # clear input bits 14..0
- por xmm0, [rip + penultimate_bit] # force bit 15 to 1
- vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] # lane 0 = all-ones when NOT (x > 0x00800000 as float), including NaN
- ptest xmm1, xmm1 # ZF=0 iff the compare mask is set
- jnz rsqrt_full_bad_new1 # slow path; edi still holds the unmodified input
- sqrtss xmm0, xmm0 # sqrt(x)
- movd xmm1, [rip + one] # 1.0f
- divss xmm1, xmm0 # 1.0 / sqrt(x)
- paddd xmm1, [rip + ultimate_bit] # integer add of half the lowest kept bit ...
- pand xmm1, [rip + top_mask] # ... then clear bits 14..0: round-to-nearest at bit 15
- movd eax, xmm1 # out: eax = result bits
- ret
- # Slow path shared by rsqrt_full_nb and rsqrt_full_nb2 (reached by a jump, not a call).
- # In:  edi = original float32 bit pattern (the fast paths never modify edi).
- # Out: eax = result bit pattern:
- #   edi < 0x00800000 unsigned (+0 and positive denormals) -> whatever _rsqrt_fallback returns
- #   rsqrtss result is not NaN                             -> the rsqrtss result bits
- #   input is NaN                                          -> input with the quiet bit (0x00400000) set
- #   input is not NaN but the rsqrtss result is            -> 0x7FC00000
- rsqrt_full_bad_new1:
- cmp edi, 0x00800000
- jb rsqrt_full_bad_new_fallback1 # unsigned compare: only +0 and positive denormals are below
- movd xmm0, edi
- rsqrtss xmm1, xmm0
- ucomiss xmm1, xmm1 # PF=1 iff the estimate is NaN
- jp rsqrt_full_bad_new1_nan
- movd eax, xmm1
- ret
- rsqrt_full_bad_new_fallback1:
- # Tail call. rsp is unchanged since function entry, which is exactly the
- # state the callee expects on entry; a `call` from here would hand the
- # callee a stack that is off by 8 from the required 16-byte alignment.
- # edi still carries the argument and the callee's eax is returned as-is.
- jmp _rsqrt_fallback
- rsqrt_full_bad_new1_nan:
- ucomiss xmm0, xmm0 # PF=1 iff the input itself is NaN
- jp rsqrt_full_bad_new1_nan_ret
- mov eax, 0x7FC00000 # non-NaN input that produced a NaN estimate
- ret
- rsqrt_full_bad_new1_nan_ret:
- # NaN input: return it quieted, the same result rsqrt_full_nan produces.
- # (eax was never written on this path before, so garbage was returned.)
- mov eax, edi
- or eax, 0x00400000
- ret
- _rsqrt_full_nb_gpr: # underscore-prefixed alias of the entry point below
- rsqrt_full_nb_gpr: # same fast path as rsqrt_full_gpr, but special inputs branch to rsqrt_full_bad_new2 instead of rsqrt_full_bad
- movd eax, xmm0 # Emulate regalloc mov (eax is overwritten by the next instruction)
- mov eax, edi # in: edi = float32 bits
- and eax, 0xFFFF8000 # clear input bits 14..0 (same value as top_mask)
- or eax, 0x00008000 # force bit 15 to 1 (same value as penultimate_bit)
- movd xmm0, eax # masked input -> lane 0, upper lanes zeroed
- vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] # lane 0 = all-ones when NOT (x > 0x00800000 as float), including NaN
- ptest xmm1, xmm1 # ZF=0 iff the compare mask is set
- jnz rsqrt_full_bad_new2 # slow path; edi still holds the unmodified input
- sqrtss xmm0, xmm0 # sqrt(x)
- movd xmm1, [rip + one] # 1.0f
- divss xmm1, xmm0 # 1.0 / sqrt(x)
- movd eax, xmm1 # result bits to GPR for rounding
- add eax, 0x00004000 # add half of the lowest kept bit ...
- and eax, 0xffff8000 # ... then clear bits 14..0: round-to-nearest at bit 15
- movd xmm0, eax # Emulate regalloc mov (the result also stays in eax)
- ret
- # Slow path for rsqrt_full_nb_gpr (reached by a jump, not a call).
- # In:  edi = original float32 bit pattern (the fast path never modifies edi).
- # Out: eax = result bit pattern:
- #   edi < 0x00800000 unsigned (+0 and positive denormals) -> whatever _rsqrt_fallback returns
- #   sign bit of edi set                                   -> 0x7FC00000
- #   otherwise                                             -> the rsqrtss result bits
- # NOTE(review): every sign-bit-set input returns 0x7FC00000 here, including
- # -0.0 and negative NaNs, which rsqrt_full_bad maps to edi|0x7F800000 and
- # edi|0x00400000 respectively. Confirm this difference is intended.
- rsqrt_full_bad_new2:
- cmp edi, 0x00800000
- jb rsqrt_full_bad_new_fallback2 # unsigned compare: only +0 and positive denormals are below
- movd xmm0, edi
- rsqrtss xmm1, xmm0
- test edi, edi
- js rsqrt_full_bad_new2_nan # sign bit set
- movd eax, xmm1
- ret
- rsqrt_full_bad_new_fallback2:
- # Tail call. rsp is unchanged since function entry, which is exactly the
- # state the callee expects on entry; a `call` from here would hand the
- # callee a stack that is off by 8 from the required 16-byte alignment.
- # edi still carries the argument and the callee's eax is returned as-is.
- jmp _rsqrt_fallback
- rsqrt_full_bad_new2_nan:
- mov eax, 0x7FC00000
- ret
- # Slow path for rsqrt_full, rsqrt_full_gpr, rsqrt_newton and rsqrt_hack
- # (reached by a jump, not a call).
- # In:  edi = original float32 bit pattern (the fast paths never modify edi).
- # Out: eax = result bit pattern:
- #   NaN                   -> input with the quiet bit (0x00400000) set
- #   compares equal to 0.0 -> edi | 0x7F800000 (infinity carrying the input's sign bit)
- #   compares below 0.0    -> 0x7FC00000
- #   +infinity             -> 0
- #   anything else         -> whatever _rsqrt_fallback returns
- rsqrt_full_bad:
- xorps xmm1, xmm1 # 0.0f to compare against
- movd xmm0, edi
- ucomiss xmm0, xmm1 # unordered sets ZF=PF=CF=1, so PF must be tested first
- jp rsqrt_full_nan
- je rsqrt_full_zero
- jc rsqrt_full_neg # CF=1 with PF=0: input < 0
- cmp edi, 0x7F800000
- je rsqrt_full_inf
- # TODO: Full Denormal Implementation
- # Tail call. rsp is unchanged since function entry, which is exactly the
- # state the callee expects on entry; a `call` from here would hand the
- # callee a stack that is off by 8 from the required 16-byte alignment.
- # edi still carries the argument and the callee's eax is returned as-is.
- jmp _rsqrt_fallback
- rsqrt_full_neg:
- mov eax, 0x7FC00000
- ret
- rsqrt_full_inf:
- xor eax, eax
- ret
- rsqrt_full_nan:
- mov eax, edi
- or eax, 0x00400000
- ret
- rsqrt_full_zero:
- mov eax, edi
- or eax, 0x7F800000
- ret
- _rsqrt_newton: # underscore-prefixed alias of the entry point below
- rsqrt_newton: # in: edi = float32 bits; out: eax = hardware rsqrt estimate refined by one Newton-Raphson step, rounded at bit 15
- movd xmm0, edi # input bits -> lane 0, upper lanes zeroed
- pand xmm0, [rip + top_mask] # clear input bits 14..0
- por xmm0, [rip + penultimate_bit] # force bit 15 to 1
- vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] # lane 0 = all-ones when NOT (x > 0x00800000 as float), including NaN
- ptest xmm1, xmm1 # ZF=0 iff the compare mask is set
- jnz rsqrt_full_bad # slow path; edi still holds the unmodified input
- rsqrtps xmm1, xmm0 # y0 = hardware estimate (packed form; only lane 0 is consumed below)
- mulss xmm0, [rip + half] # 0.5 * x
- vmulss xmm2, xmm1, xmm1 # y0 * y0
- mulss xmm2, xmm0 # 0.5 * x * y0^2
- movaps xmm0, [rip + one_point_five] # 1.5f; aligned 16-byte load of the whole constant
- subss xmm0, xmm2 # 1.5 - 0.5 * x * y0^2
- mulss xmm0, xmm1 # y1 = y0 * (1.5 - 0.5 * x * y0^2)
- paddd xmm0, [rip + ultimate_bit] # integer add of half the lowest kept bit ...
- pand xmm0, [rip + top_mask] # ... then clear bits 14..0: round-to-nearest at bit 15
- movd eax, xmm0 # result bits
- ret
- _rsqrt_hack: # underscore-prefixed alias of the entry point below
- rsqrt_hack: # in: edi = float32 bits; out: eax = rsqrtps estimate bits plus an input-dependent integer correction, truncated to bits 31..15
- movd xmm9, edi # keep the unmasked input bits in xmm9 for the correction step
- vpand xmm0, xmm9, [rip + top_mask] # xmm0 = input with bits 14..0 cleared
- por xmm0, [rip + penultimate_bit] # force bit 15 to 1
- # detect NaNs, negatives, zeros, denormals and infinities
- vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm] # lane 0 = all-ones when NOT (x > 0x00800000 as float), including NaN
- ptest xmm1, xmm1 # ZF=0 iff the compare mask is set
- jnz rsqrt_full_bad # slow path; edi still holds the unmodified input
- # calculate x64 estimate
- rsqrtps xmm0, xmm0 # hardware estimate of 1/sqrt of the masked input
- # calculate correction factor
- vpslld xmm1, xmm9, 8 # shift out sign and upper exponent bits: bit 31 = input bit 23 (lowest exponent bit), bits 30..8 = mantissa
- vpsrad xmm2, xmm1, 31 # all-ones when input bit 23 is set, else zero
- paddd xmm1, [rip + magic1] # bias the shifted value by 0x60000000 (wraps modulo 2^32)
- pcmpgtd xmm1, [rip + magic2] # all-ones when the biased value > 0x3c000000 as signed int
- pxor xmm1, xmm2 # invert that mask when input bit 23 is set
- movaps xmm2, [rip + magic3] # base correction 0x47ff
- psubd xmm2, xmm1 # subtracting a -1 mask adds 1: correction is 0x47ff or 0x4800
- # correct x64 estimate
- paddd xmm0, xmm2 # integer add of the correction to the estimate bits
- pand xmm0, [rip + top_mask] # clear bits 14..0
- movd eax, xmm0 # result bits
- ret
|