/* ev6-memcpy.S (6.1 KB) */

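/*
 * arch/alpha/lib/ev6-memcpy.S
 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Reasonably optimized memcpy() routine for the Alpha 21264
 *
 *	- the bulk of the data is moved with quadword accesses: ldq/stq
 *	  when source and destination share the same alignment mod 8,
 *	  ldq_u plus extql/extqh merging when they do not; head and tail
 *	  bytes are moved one at a time with ldbu/stb
 *	- NOTE(review): this header used to say "uses bcmpge to compare
 *	  8 bytes in parallel"; no cmpbge instruction appears in this
 *	  routine, so that remark looks inherited from another file
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 *
 * Temp usage notes:
 *	$0	- return value (copy of the incoming dest pointer)
 *	$1-$7	- scratch ($4 is the running dest pointer on the
 *		  misaligned path, $7 the wh64 hint address)
 *	$16-$18	- dest, src and count; all three are modified
 */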
  /*
   * void *memcpy(void *dest, const void *src, size_t n)
   *
   * In:    $16 = dest, $17 = src, $18 = byte count, $26 = return address
   * Out:   $0  = the dest pointer as it was passed in
   * Clobb: $1-$7, $16, $17, $18
   * Stack: none (.frame declares a zero-size frame; nothing is saved)
   *
   * The count is tested with signed branches (ble/blt/bge), so a count of
   * zero, or one with bit 63 set, returns without copying anything.
   *
   * Strategy:
   *   1. src and dest equal mod 8:
   *        - copy single bytes until both are 0mod8
   *        - if more than 127 bytes remain, copy single quads until dest is
   *          0mod64, then run the 64-bytes-per-trip unrolled loop, which
   *          issues a wh64 write hint one block ahead
   *        - copy the remaining whole quads, then the remaining bytes
   *   2. src and dest differ mod 8 ($misaligned):
   *        - copy single bytes until dest is 0mod8
   *        - build each aligned store from two ldq_u loads merged with
   *          extql/extqh, then copy the remaining bytes
   *
   * Layout: the entry point is 16-byte aligned and the instructions are
   * written in groups of four (blank-line separated), padded with nops, to
   * match the E/U/L slotting annotations. Do not reorder instructions or
   * drop nops without re-checking the schedule.
   */
	.set noreorder
	.set noat

	.align	4
	.globl memcpy
	.ent memcpy
memcpy:
	.frame $30,0,$26,0
	.prologue 0

	mov	$16, $0			# E : copy dest to return
	ble	$18, $nomoredata	# U : done with the copy?
	xor	$16, $17, $1		# E : are source and dest alignments the same?
	and	$1, 7, $1		# E : are they the same mod 8?

	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
	/* source and dest are same mod 8 address */
	and	$16, 7, $1		# E : Are both 0mod8?
	beq	$1, $both_0mod8		# U : Yes
	nop				# E :

	/*
	 * source and dest are same misalignment.  move a byte at a time
	 * until a 0mod8 alignment for both is reached.
	 * At least one byte more to move
	 */
$head_align:
	ldbu	$1, 0($17)		# L : grab a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	stb	$1, 0($16)		# L :

	addq	$16, 1, $16		# E : dest++
	and	$16, 7, $1		# E : Are we at 0mod8 yet?
	ble	$18, $nomoredata	# U : done with the copy?
	bne	$1, $head_align		# U :

$both_0mod8:
	cmple	$18, 127, $1		# E : Can we unroll the loop?
	bne	$1, $no_unroll		# U :
	and	$16, 63, $1		# E : get mod64 alignment
	beq	$1, $do_unroll		# U : no single quads to fiddle

	/*
	 * Only reached with count > 127, so the at most seven quads needed
	 * to bring dest up to a 64-byte boundary are always available.
	 */
$single_head_quad:
	ldq	$1, 0($17)		# L : get 8 bytes
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store
	addq	$16, 8, $16		# E : dest += 8
	and	$16, 63, $1		# E : get mod64 alignment
	bne	$1, $single_head_quad	# U : still not fully aligned

$do_unroll:
	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
	bne	$1, $tail_quads		# U : Nope
	nop				# E :

	/*
	 * One trip copies 64 bytes as two 32-byte halves.  $7 is the address
	 * handed to wh64.  It normally runs 64 bytes ahead of dest; when fewer
	 * than 192 bytes remained at the top of the trip, the cmovlt replaces
	 * it with $1, which is the address where the next trip starts storing.
	 */
$unroll_body:
	wh64	($7)			# L1 : memory subsystem hint: 64 bytes at
					# ($7) are about to be over-written
	ldq	$6, 0($17)		# L0 : bytes 0..7
	nop				# E :
	nop				# E :

	ldq	$4, 8($17)		# L : bytes 8..15
	ldq	$5, 16($17)		# L : bytes 16..23
	addq	$7, 64, $7		# E : Update next wh64 address
	nop				# E :

	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 64, $1		# E : fallback value for wh64
	nop				# E :
	nop				# E :

	addq	$17, 32, $17		# E : src += 32 bytes
	stq	$6, 0($16)		# L : bytes 0..7
	nop				# E :
	nop				# E :

	stq	$4, 8($16)		# L : bytes 8..15
	stq	$5, 16($16)		# L : bytes 16..23
	subq	$18, 192, $2		# E : At least two more trips to go?
	nop				# E :

	stq	$3, 24($16)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32 bytes
	nop				# E :
	nop				# E :

	ldq	$6, 0($17)		# L : bytes 0..7
	ldq	$4, 8($17)		# L : bytes 8..15
	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
					# fallback wh64 address if < 2 more trips
	nop				# E :

	ldq	$5, 16($17)		# L : bytes 16..23
	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32
	subq	$18, 64, $18		# E : count -= 64

	addq	$17, 32, $17		# E : src += 32
	stq	$6, -32($16)		# L : bytes 0..7
	stq	$4, -24($16)		# L : bytes 8..15
	cmple	$18, 63, $1		# E : At least one more trip?

	stq	$5, -16($16)		# L : bytes 16..23
	stq	$3, -8($16)		# L : bytes 24..31
	nop				# E :
	beq	$1, $unroll_body

	/*
	 * The alignment directive sits ahead of the labels so that the branch
	 * targets can never resolve into alignment padding.
	 */
	.align	4
$tail_quads:
$no_unroll:
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $less_than_8	# U : Nope
	nop				# E :
	nop				# E :

	/* count is kept biased by -8 here: count >= 0 means a quad remains */
$move_a_quad:
	ldq	$1, 0($17)		# L : fetch 8
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store 8
	addq	$16, 8, $16		# E : dest += 8
	bge	$18, $move_a_quad	# U :
	nop				# E :

	.align	4
$less_than_8:
	addq	$18, 8, $18		# E : add back for trailing bytes
	ble	$18, $nomoredata	# U : All-done
	nop				# E :
	nop				# E :

	/* Trailing bytes */
$tail_bytes:
	subq	$18, 1, $18		# E : count--
	ldbu	$1, 0($17)		# L : fetch a byte
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($16)		# L : store a byte
	addq	$16, 1, $16		# E : dest++
	bgt	$18, $tail_bytes	# U : more to be done?
	nop				# E :

	/* branching to exit takes 3 extra cycles, so replicate exit here */
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	/*
	 * src and dest differ mod 8.  $16 still holds the incoming dest here,
	 * so $0 == $16; from this point $4 is the running dest pointer and
	 * $16 is reused as a load temporary.
	 */
$misaligned:
	mov	$0, $4			# E : dest temp
	and	$0, 7, $1		# E : dest alignment mod8
	beq	$1, $dest_0mod8		# U : life doesnt totally suck
	nop

$aligndest:
	ble	$18, $nomoredata	# U :
	ldbu	$1, 0($17)		# L : fetch a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++

	stb	$1, 0($4)		# L : store it
	addq	$4, 1, $4		# E : dest++
	and	$4, 7, $1		# E : dest 0mod8 yet?
	bne	$1, $aligndest		# U : go until we are aligned.

	/* Source has unknown alignment, but dest is known to be 0mod8 */
$dest_0mod8:
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $misalign_tail	# U : Nope
	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
	nop				# E :

	/*
	 * $3 holds the aligned quad containing the byte at src, $16 receives
	 * the following aligned quad; the two extracts are OR-ed together to
	 * form the eight source bytes stored at dest.
	 */
$mis_quad:
	ldq_u	$16, 8($17)		# L : Fetch next 8
	extql	$3, $17, $3		# U : masking
	extqh	$16, $17, $1		# U : masking
	bis	$3, $1, $1		# E : merged bytes to store

	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	stq	$1, 0($4)		# L : store 8 (aligned)
	mov	$16, $3			# E : "rotate" source data

	addq	$4, 8, $4		# E : dest += 8
	bge	$18, $mis_quad		# U : More quads to move
	nop
	nop

$misalign_tail:
	addq	$18, 8, $18		# E : account for tail stuff
	ble	$18, $nomoredata	# U :
	nop
	nop

$misalign_byte:
	ldbu	$1, 0($17)		# L : fetch 1
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($4)		# L : store
	addq	$4, 1, $4		# E : dest++
	bgt	$18, $misalign_byte	# U : more to go?
	nop

$nomoredata:
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	.end memcpy

	/* For backwards module compatibility. */
	__memcpy = memcpy
	.globl __memcpy