# rsqrt_test_fn.s (5.2 KB)
  # ---------------------------------------------------------------------
  # Exported symbols.
  # Each routine is published under its plain name and under a
  # leading-underscore alias (presumably so the same object links on
  # toolchains that prepend '_' to C symbols -- confirm against the build).
  # _rsqrt_fallback is only declared here; no definition is visible in
  # this part of the file.  The bad-input paths transfer control to it.
  # ---------------------------------------------------------------------
  .global _rsqrt_inaccurate
  .global rsqrt_inaccurate
  .global _rsqrt_full
  .global rsqrt_full
  .global _rsqrt_full_gpr
  .global rsqrt_full_gpr
  .global _rsqrt_full_nb
  .global rsqrt_full_nb
  .global _rsqrt_full_nb2
  .global rsqrt_full_nb2
  .global _rsqrt_full_nb_gpr
  .global rsqrt_full_nb_gpr
  .global _rsqrt_newton
  .global rsqrt_newton
  .global _rsqrt_hack
  .global rsqrt_hack
  .global _rsqrt_fallback

  .text
  .intel_syntax noprefix

  # ---------------------------------------------------------------------
  # Constant table.
  # Every entry is one 32-bit value padded with three zero dwords, so each
  # label stays 16-byte aligned after the .align below.  That alignment is
  # required by the 128-bit memory operands used on these labels
  # (pand / por / paddd / movaps); the zero padding keeps lanes 1..3 inert.
  # ---------------------------------------------------------------------
  .align 16
  min_pos_denorm:                         # 0x00800000: compare threshold.  NOTE: despite the
          .long   0x00800000, 0, 0, 0     # name, this bit pattern is the smallest positive
                                          # *normal* binary32 value.
  penultimate_bit:                        # bit 15: OR-ed into the truncated input
          .long   0x00008000, 0, 0, 0
  ultimate_bit:                           # bit 14: added before masking to round at bit 15
          .long   0x00004000, 0, 0, 0
  top_mask:                               # keeps bits 31..15 (top 17 bits)
          .long   0xFFFF8000, 0, 0, 0
  one:                                    # 1.0f
          .long   0x3f800000, 0, 0, 0
  half:                                   # 0.5f
          .long   0x3f000000, 0, 0, 0
  one_point_five:                         # 1.5f
          .long   0x3fc00000, 0, 0, 0
  magic1:                                 # integer bias used by rsqrt_hack's correction step
          .long   0x60000000, 0, 0, 0
  magic2:                                 # signed-compare threshold used by rsqrt_hack
          .long   0x3c000000, 0, 0, 0
  magic3:                                 # base integer correction used by rsqrt_hack
          .long   0x000047ff, 0, 0, 0
  # ---------------------------------------------------------------------
  # rsqrt_inaccurate -- raw hardware reciprocal-square-root estimate.
  # In:    edi  = 32-bit pattern, interpreted as a binary32 value
  # Out:   eax  = bit pattern produced by RSQRTSS for that value
  # Clobb: xmm0
  # No input filtering and no rounding of the result is performed.
  # ---------------------------------------------------------------------
  _rsqrt_inaccurate:
  rsqrt_inaccurate:
          movd    xmm0, edi               # bits -> scalar lane 0
          rsqrtss xmm0, xmm0              # approximate 1/sqrt(x)
          movd    eax, xmm0               # lane 0 -> return register
          ret
  # ---------------------------------------------------------------------
  # rsqrt_full -- 1/sqrt(x) via SQRTSS + DIVSS on a truncated input,
  #               all-SSE data path.
  # In:    edi  = binary32 bit pattern
  # Out:   eax  = result bit pattern, rounded to the top 17 bits
  # Clobb: xmm0, xmm1, flags
  # Inputs that fail the range check are handed to rsqrt_full_bad.
  # ---------------------------------------------------------------------
  _rsqrt_full:
  rsqrt_full:
          movd    xmm0, edi
          pand    xmm0, [rip + top_mask]          # keep bits 31..15 of the input
          por     xmm0, [rip + penultimate_bit]   # force bit 15 on
          # xmm1 lane 0 = all-ones when !(x > threshold) or the compare is
          # unordered (NGT_UQ predicate); any set bit means "bad input".
          vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
          ptest   xmm1, xmm1
          jnz     rsqrt_full_bad

          sqrtss  xmm0, xmm0                      # sqrt(x)
          movd    xmm1, [rip + one]
          divss   xmm1, xmm0                      # 1.0 / sqrt(x)

          paddd   xmm1, [rip + ultimate_bit]      # integer add of bit 14 ...
          pand    xmm1, [rip + top_mask]          # ... then drop bits 14..0: round at bit 15
          movd    eax, xmm1
          ret
  # ---------------------------------------------------------------------
  # rsqrt_full_gpr -- same computation as the SQRTSS + DIVSS variant, but
  #                   the masking and rounding are done in a general
  #                   purpose register instead of with SSE integer ops.
  # In:    edi  = binary32 bit pattern
  # Out:   eax  = result bit pattern, rounded to the top 17 bits
  #        (xmm0 lane 0 also receives the result, see below)
  # Clobb: xmm0, xmm1, flags
  # The two moves tagged "Emulate regalloc mov" do not affect the result;
  # per their original annotation they stand in for register-allocator
  # copies and are kept on purpose.
  # ---------------------------------------------------------------------
  _rsqrt_full_gpr:
  rsqrt_full_gpr:
          movd    eax, xmm0                       # Emulate regalloc mov
          mov     eax, edi
          and     eax, 0xFFFF8000                 # keep bits 31..15
          or      eax, 0x00008000                 # force bit 15 on
          movd    xmm0, eax
          # Lane 0 of xmm1 is all-ones when !(x > threshold) or unordered.
          vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
          ptest   xmm1, xmm1
          jnz     rsqrt_full_bad

          sqrtss  xmm0, xmm0                      # sqrt(x)
          movd    xmm1, [rip + one]
          divss   xmm1, xmm0                      # 1.0 / sqrt(x)

          movd    eax, xmm1
          add     eax, 0x00004000                 # add bit 14 ...
          and     eax, 0xffff8000                 # ... and truncate: round at bit 15
          movd    xmm0, eax                       # Emulate regalloc mov
          ret
  # ---------------------------------------------------------------------
  # rsqrt_full_nb2 -- SQRTSS + DIVSS variant whose range check is a single
  #                   UCOMISS + branch, with rsqrt_full_bad_new1 as the
  #                   bad-input handler.
  # In:    edi  = binary32 bit pattern
  # Out:   eax  = result bit pattern, rounded to the top 17 bits
  # Clobb: xmm0, xmm1, flags
  # ---------------------------------------------------------------------
  _rsqrt_full_nb2:
  rsqrt_full_nb2:
          movd    xmm0, edi
          pand    xmm0, [rip + top_mask]          # keep bits 31..15
          por     xmm0, [rip + penultimate_bit]   # force bit 15 on
          ucomiss xmm0, [rip + min_pos_denorm]
          jna     rsqrt_full_bad_new1             # taken when x <= threshold or unordered

          sqrtss  xmm0, xmm0                      # sqrt(x)
          movd    xmm1, [rip + one]
          divss   xmm1, xmm0                      # 1.0 / sqrt(x)

          paddd   xmm1, [rip + ultimate_bit]      # add bit 14 ...
          pand    xmm1, [rip + top_mask]          # ... and truncate: round at bit 15
          movd    eax, xmm1
          ret
  # ---------------------------------------------------------------------
  # rsqrt_full_nb -- SQRTSS + DIVSS variant using the compare-mask + PTEST
  #                  range check, with rsqrt_full_bad_new1 as the
  #                  bad-input handler.
  # In:    edi  = binary32 bit pattern
  # Out:   eax  = result bit pattern, rounded to the top 17 bits
  # Clobb: xmm0, xmm1, flags
  # ---------------------------------------------------------------------
  _rsqrt_full_nb:
  rsqrt_full_nb:
          movd    xmm0, edi
          pand    xmm0, [rip + top_mask]          # keep bits 31..15
          por     xmm0, [rip + penultimate_bit]   # force bit 15 on
          # Lane 0 of xmm1 is all-ones when !(x > threshold) or unordered.
          vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
          ptest   xmm1, xmm1
          jnz     rsqrt_full_bad_new1

          sqrtss  xmm0, xmm0                      # sqrt(x)
          movd    xmm1, [rip + one]
          divss   xmm1, xmm0                      # 1.0 / sqrt(x)

          paddd   xmm1, [rip + ultimate_bit]      # add bit 14 ...
          pand    xmm1, [rip + top_mask]          # ... and truncate: round at bit 15
          movd    eax, xmm1
          ret
  # ---------------------------------------------------------------------
  # rsqrt_full_bad_new1 -- bad-input handler for rsqrt_full_nb and
  #                        rsqrt_full_nb2 (reached by jump, not by call).
  # In:    edi  = original, unmodified binary32 bit pattern
  # Out:   eax  = result bit pattern
  # Clobb: xmm0, xmm1, flags (plus whatever _rsqrt_fallback clobbers)
  #
  # Dispatch:
  #   edi <  0x00800000 (unsigned) -> tail call to _rsqrt_fallback
  #   RSQRTSS result is not NaN    -> return the RSQRTSS result
  #   result NaN, input not NaN    -> return 0x7FC00000
  #   result NaN, input NaN        -> return the RSQRTSS result
  #
  # Fixes relative to the previous revision:
  #   * The NaN-input exit returned without ever writing eax, so the
  #     caller received whatever happened to be in the register.  It now
  #     returns the RSQRTSS result held in xmm1.
  #   * "call _rsqrt_fallback / ret" issued the call with rsp % 16 == 8
  #     (only the return address is on the stack here), which breaks the
  #     16-byte call-site alignment rule.  A tail-call jmp leaves the stack
  #     exactly as it was on entry and returns the fallback's result
  #     straight to our caller.
  # ---------------------------------------------------------------------
  rsqrt_full_bad_new1:
          cmp     edi, 0x00800000
          jb      rsqrt_full_bad_new_fallback1    # unsigned compare on the raw bits
          movd    xmm0, edi
          rsqrtss xmm1, xmm0
          ucomiss xmm1, xmm1                      # PF=1 only if the estimate is NaN
          jp      rsqrt_full_bad_new1_nan
          movd    eax, xmm1
          ret

  rsqrt_full_bad_new_fallback1:
          jmp     _rsqrt_fallback                 # tail call; edi still holds the input

  rsqrt_full_bad_new1_nan:
          ucomiss xmm0, xmm0                      # PF=1 only if the *input* is NaN
          jp      rsqrt_full_bad_new1_nan_ret
          mov     eax, 0x7FC00000                 # non-NaN input gave NaN: fixed quiet-NaN pattern
          ret

  rsqrt_full_bad_new1_nan_ret:
          movd    eax, xmm1                       # NaN input: hand back the RSQRTSS NaN result
          ret
  # ---------------------------------------------------------------------
  # rsqrt_full_nb_gpr -- general-purpose-register flavour of the
  #                      SQRTSS + DIVSS computation, with
  #                      rsqrt_full_bad_new2 as the bad-input handler.
  # In:    edi  = binary32 bit pattern
  # Out:   eax  = result bit pattern, rounded to the top 17 bits
  #        (xmm0 lane 0 also receives the result on the fast path)
  # Clobb: xmm0, xmm1, flags
  # The moves tagged "Emulate regalloc mov" do not change the result and
  # are kept on purpose, as their original annotation indicates.
  # ---------------------------------------------------------------------
  _rsqrt_full_nb_gpr:
  rsqrt_full_nb_gpr:
          movd    eax, xmm0                       # Emulate regalloc mov
          mov     eax, edi
          and     eax, 0xFFFF8000                 # keep bits 31..15
          or      eax, 0x00008000                 # force bit 15 on
          movd    xmm0, eax
          # Lane 0 of xmm1 is all-ones when !(x > threshold) or unordered.
          vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
          ptest   xmm1, xmm1
          jnz     rsqrt_full_bad_new2

          sqrtss  xmm0, xmm0                      # sqrt(x)
          movd    xmm1, [rip + one]
          divss   xmm1, xmm0                      # 1.0 / sqrt(x)

          movd    eax, xmm1
          add     eax, 0x00004000                 # add bit 14 ...
          and     eax, 0xffff8000                 # ... and truncate: round at bit 15
          movd    xmm0, eax                       # Emulate regalloc mov
          ret
  # ---------------------------------------------------------------------
  # rsqrt_full_bad_new2 -- bad-input handler for rsqrt_full_nb_gpr
  #                        (reached by jump, not by call).
  # In:    edi  = original, unmodified binary32 bit pattern
  # Out:   eax  = result bit pattern
  # Clobb: xmm0, xmm1, flags (plus whatever _rsqrt_fallback clobbers)
  #
  # Dispatch:
  #   edi <  0x00800000 (unsigned) -> tail call to _rsqrt_fallback
  #   sign bit of edi set          -> return 0x7FC00000
  #   otherwise                    -> return the RSQRTSS result
  # NOTE(review): every sign-bit-set pattern, including -0.0 and NaNs
  # with the sign bit set, yields 0x7FC00000 here.  That is kept as-is;
  # confirm it is the intended contract for this variant.
  #
  # Fix relative to the previous revision: "call _rsqrt_fallback / ret"
  # issued the call with rsp % 16 == 8 (only the return address is on the
  # stack here), which breaks the 16-byte call-site alignment rule.  A
  # tail-call jmp keeps the entry-time stack layout and returns the
  # fallback's result directly to our caller.
  # ---------------------------------------------------------------------
  rsqrt_full_bad_new2:
          cmp     edi, 0x00800000
          jb      rsqrt_full_bad_new_fallback2    # unsigned compare on the raw bits
          movd    xmm0, edi
          rsqrtss xmm1, xmm0
          test    edi, edi
          js      rsqrt_full_bad_new2_nan         # sign bit set
          movd    eax, xmm1
          ret

  rsqrt_full_bad_new_fallback2:
          jmp     _rsqrt_fallback                 # tail call; edi still holds the input

  rsqrt_full_bad_new2_nan:
          mov     eax, 0x7FC00000                 # fixed quiet-NaN pattern
          ret
  # ---------------------------------------------------------------------
  # rsqrt_full_bad -- bad-input handler shared by rsqrt_full,
  #                   rsqrt_full_gpr, rsqrt_newton and rsqrt_hack
  #                   (reached by jump, not by call).
  # In:    edi  = original, unmodified binary32 bit pattern
  # Out:   eax  = result bit pattern
  # Clobb: xmm0, xmm1, flags (plus whatever _rsqrt_fallback clobbers)
  #
  # Dispatch on UCOMISS(x, 0.0):
  #   unordered (NaN)      -> input with bit 22 set (quieted NaN)
  #   equal to zero        -> input OR 0x7F800000
  #   below zero           -> 0x7FC00000
  #   edi == 0x7F800000    -> 0
  #   anything else        -> tail call to _rsqrt_fallback
  #
  # Fix relative to the previous revision: "call _rsqrt_fallback / ret"
  # issued the call with rsp % 16 == 8 (only the return address is on the
  # stack here), which breaks the 16-byte call-site alignment rule.  A
  # tail-call jmp keeps the entry-time stack layout and returns the
  # fallback's result directly to our caller.
  # ---------------------------------------------------------------------
  rsqrt_full_bad:
          xorps   xmm1, xmm1                      # xmm1 = +0.0
          movd    xmm0, edi
          ucomiss xmm0, xmm1
          jp      rsqrt_full_nan                  # PF=1: unordered
          je      rsqrt_full_zero                 # ZF=1: compares equal to zero
          jc      rsqrt_full_neg                  # CF=1: x < 0
          cmp     edi, 0x7F800000
          je      rsqrt_full_inf
          # TODO: Full Denormal Implementation
          jmp     _rsqrt_fallback                 # tail call; edi still holds the input

  rsqrt_full_neg:
          mov     eax, 0x7FC00000                 # fixed quiet-NaN pattern
          ret

  rsqrt_full_inf:
          xor     eax, eax                        # bit pattern 0
          ret

  rsqrt_full_nan:
          mov     eax, edi
          or      eax, 0x00400000                 # set bit 22 (quiet bit), keep the rest
          ret

  rsqrt_full_zero:
          mov     eax, edi
          or      eax, 0x7F800000                 # set all exponent bits, keep the sign
          ret
  # ---------------------------------------------------------------------
  # rsqrt_newton -- hardware estimate refined by one Newton-Raphson step:
  #                     y1 = y0 * (1.5 - (0.5 * x) * y0 * y0)
  # In:    edi  = binary32 bit pattern
  # Out:   eax  = result bit pattern, rounded to the top 17 bits
  # Clobb: xmm0, xmm1, xmm2, flags
  # Inputs that fail the range check are handed to rsqrt_full_bad.
  # ---------------------------------------------------------------------
  _rsqrt_newton:
  rsqrt_newton:
          movd    xmm0, edi
          pand    xmm0, [rip + top_mask]          # keep bits 31..15
          por     xmm0, [rip + penultimate_bit]   # force bit 15 on
          # Lane 0 of xmm1 is all-ones when !(x > threshold) or unordered.
          vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
          ptest   xmm1, xmm1
          jnz     rsqrt_full_bad

          rsqrtps xmm1, xmm0                      # y0 = estimate (only lane 0 is used below)
          mulss   xmm0, [rip + half]              # 0.5 * x
          vmulss  xmm2, xmm1, xmm1                # y0 * y0
          mulss   xmm2, xmm0                      # 0.5 * x * y0 * y0
          movaps  xmm0, [rip + one_point_five]
          subss   xmm0, xmm2                      # 1.5 - 0.5 * x * y0 * y0
          mulss   xmm0, xmm1                      # y1

          paddd   xmm0, [rip + ultimate_bit]      # add bit 14 ...
          pand    xmm0, [rip + top_mask]          # ... and truncate: round at bit 15
          movd    eax, xmm0
          ret
  # ---------------------------------------------------------------------
  # rsqrt_hack -- hardware estimate plus an integer correction term that
  #               is derived from the raw input bits.
  # In:    edi  = binary32 bit pattern
  # Out:   eax  = corrected estimate, truncated to the top 17 bits
  # Clobb: xmm0, xmm1, xmm2, xmm9, flags
  # Inputs that fail the range check are handed to rsqrt_full_bad.
  # ---------------------------------------------------------------------
  _rsqrt_hack:
  rsqrt_hack:
          movd    xmm9, edi                       # xmm9 keeps the untouched input bits
          vpand   xmm0, xmm9, [rip + top_mask]    # keep bits 31..15
          por     xmm0, [rip + penultimate_bit]   # force bit 15 on

          # Reject NaNs, negatives, zeros, denormals and infinities.
          vcmpngt_uqss xmm1, xmm0, [rip + min_pos_denorm]
          ptest   xmm1, xmm1
          jnz     rsqrt_full_bad

          # Hardware reciprocal-square-root estimate.
          rsqrtps xmm0, xmm0

          # Correction term: magic3 or magic3 + 1, selected from the input bits.
          vpslld  xmm1, xmm9, 8                   # input bit 23 moves up to bit 31
          vpsrad  xmm2, xmm1, 31                  # 0 or -1, replicated from that bit
          paddd   xmm1, [rip + magic1]
          pcmpgtd xmm1, [rip + magic2]            # signed compare -> 0 or -1
          pxor    xmm1, xmm2                      # combine the two selectors
          movaps  xmm2, [rip + magic3]
          psubd   xmm2, xmm1                      # subtracting -1 adds one

          # Apply the correction and drop bits 14..0.
          paddd   xmm0, xmm2
          pand    xmm0, [rip + top_mask]
          movd    eax, xmm0
          ret