ev6-divide.S 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Alpha division.
 */
/*
 * The Alpha chip doesn't provide hardware division, so we have to do it
 * by hand. The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divq/__remq: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divl/__reml: signed 32-bit
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved.
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache. Besides, I don't know the copyright status
 * of the DEC code.
 */
/*
 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus/quotient
 *	$3, $4 - scratch (tmp1, tmp2)
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient/modulus
 *	$28 - compare status
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */
  55. #define halt .long 0
  56. /*
  57. * Select function type and registers
  58. */
  59. #define mask $0
  60. #define divisor $1
  61. #define compare $28
  62. #define tmp1 $3
  63. #define tmp2 $4
  64. #ifdef DIV
  65. #define DIV_ONLY(x,y...) x,##y
  66. #define MOD_ONLY(x,y...)
  67. #define func(x) __div##x
  68. #define modulus $2
  69. #define quotient $27
  70. #define GETSIGN(x) xor $24,$25,x
  71. #define STACK 48
  72. #else
  73. #define DIV_ONLY(x,y...)
  74. #define MOD_ONLY(x,y...) x,##y
  75. #define func(x) __rem##x
  76. #define modulus $27
  77. #define quotient $2
  78. #define GETSIGN(x) bis $24,$24,x
  79. #define STACK 32
  80. #endif
  81. /*
  82. * For 32-bit operations, we need to extend to 64-bit
  83. */
  84. #ifdef INTSIZE
  85. #define ufunction func(lu)
  86. #define sfunction func(l)
  87. #define LONGIFY(x) zapnot x,15,x
  88. #define SLONGIFY(x) addl x,0,x
  89. #else
  90. #define ufunction func(qu)
  91. #define sfunction func(q)
  92. #define LONGIFY(x)
  93. #define SLONGIFY(x)
  94. #endif
  95. .set noat
  96. .align 4
  97. .globl ufunction
  98. .ent ufunction
  99. ufunction:
  100. subq $30,STACK,$30 # E :
  101. .frame $30,STACK,$23
  102. .prologue 0
  103. 7: stq $1, 0($30) # L :
  104. bis $25,$25,divisor # E :
  105. stq $2, 8($30) # L : L U L U
  106. bis $24,$24,modulus # E :
  107. stq $0,16($30) # L :
  108. bis $31,$31,quotient # E :
  109. LONGIFY(divisor) # E : U L L U
  110. stq tmp1,24($30) # L :
  111. LONGIFY(modulus) # E :
  112. bis $31,1,mask # E :
  113. DIV_ONLY(stq tmp2,32($30)) # L : L U U L
  114. beq divisor, 9f /* div by zero */
  115. /*
  116. * In spite of the DIV_ONLY being either a non-instruction
  117. * or an actual stq, the addition of the .align directive
  118. * below ensures that label 1 is going to be nicely aligned
  119. */
  120. .align 4
  121. #ifdef INTSIZE
  122. /*
  123. * shift divisor left, using 3-bit shifts for
  124. * 32-bit divides as we can't overflow. Three-bit
  125. * shifts will result in looping three times less
  126. * here, but can result in two loops more later.
  127. * Thus using a large shift isn't worth it (and
  128. * s8add pairs better than a sll..)
  129. */
  130. 1: cmpult divisor,modulus,compare # E :
  131. s8addq divisor,$31,divisor # E :
  132. s8addq mask,$31,mask # E :
  133. bne compare,1b # U : U L U L
  134. #else
  135. 1: cmpult divisor,modulus,compare # E :
  136. nop # E :
  137. nop # E :
  138. blt divisor, 2f # U : U L U L
  139. addq divisor,divisor,divisor # E :
  140. addq mask,mask,mask # E :
  141. unop # E :
  142. bne compare,1b # U : U L U L
  143. #endif
  144. /* ok, start to go right again.. */
  145. 2:
  146. /*
  147. * Keep things nicely bundled... use a nop instead of not
  148. * having an instruction for DIV_ONLY
  149. */
  150. #ifdef DIV
  151. DIV_ONLY(addq quotient,mask,tmp2) # E :
  152. #else
  153. nop # E :
  154. #endif
  155. srl mask,1,mask # U :
  156. cmpule divisor,modulus,compare # E :
  157. subq modulus,divisor,tmp1 # E :
  158. #ifdef DIV
  159. DIV_ONLY(cmovne compare,tmp2,quotient) # E : Latency 2, extra map slot
  160. nop # E : as part of the cmovne
  161. srl divisor,1,divisor # U :
  162. nop # E : L U L U
  163. nop # E :
  164. cmovne compare,tmp1,modulus # E : Latency 2, extra map slot
  165. nop # E : as part of the cmovne
  166. bne mask,2b # U : U L U L
  167. #else
  168. srl divisor,1,divisor # U :
  169. cmovne compare,tmp1,modulus # E : Latency 2, extra map slot
  170. nop # E : as part of the cmovne
  171. bne mask,2b # U : U L L U
  172. #endif
  173. 9: ldq $1, 0($30) # L :
  174. ldq $2, 8($30) # L :
  175. nop # E :
  176. nop # E : U U L L
  177. ldq $0,16($30) # L :
  178. ldq tmp1,24($30) # L :
  179. nop # E :
  180. nop # E :
  181. #ifdef DIV
  182. DIV_ONLY(ldq tmp2,32($30)) # L :
  183. #else
  184. nop # E :
  185. #endif
  186. addq $30,STACK,$30 # E :
  187. ret $31,($23),1 # L0 : L U U L
  188. .end ufunction
/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 *	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 */
  199. .align 4
  200. .globl sfunction
  201. .ent sfunction
  202. sfunction:
  203. subq $30,STACK,$30 # E :
  204. .frame $30,STACK,$23
  205. .prologue 0
  206. bis $24,$25,$28 # E :
  207. SLONGIFY($28) # E :
  208. bge $28,7b # U :
  209. stq $24,0($30) # L :
  210. subq $31,$24,$28 # E :
  211. stq $25,8($30) # L :
  212. nop # E : U L U L
  213. cmovlt $24,$28,$24 /* abs($24) */ # E : Latency 2, extra map slot
  214. nop # E : as part of the cmov
  215. stq $23,16($30) # L :
  216. subq $31,$25,$28 # E : U L U L
  217. stq tmp1,24($30) # L :
  218. cmovlt $25,$28,$25 /* abs($25) */ # E : Latency 2, extra map slot
  219. nop # E :
  220. bsr $23,ufunction # L0: L U L U
  221. ldq $24,0($30) # L :
  222. ldq $25,8($30) # L :
  223. GETSIGN($28) # E :
  224. subq $31,$27,tmp1 # E : U U L L
  225. SLONGIFY($28) # E :
  226. ldq $23,16($30) # L :
  227. cmovlt $28,tmp1,$27 # E : Latency 2, extra map slot
  228. nop # E : U L L U : as part of the cmov
  229. ldq tmp1,24($30) # L :
  230. nop # E : as part of the cmov
  231. addq $30,STACK,$30 # E :
  232. ret $31,($23),1 # L0 : L U U L
  233. .end sfunction