copy_user_memcpy.S 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. ! SPDX-License-Identifier: GPL-2.0
  2. !
  3. ! Fast SH memcpy
  4. !
  5. ! by Toshiyasu Morita (tm@netcom.com)
  6. ! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
  7. ! SH5 code Copyright 2002 SuperH Ltd.
  8. !
  9. ! Entry: ARG0: destination pointer
  10. ! ARG1: source pointer
  11. ! ARG2: byte count
  12. !
  13. ! Exit: RESULT: destination pointer
  14. ! any other registers in the range r0-r7: trashed
  15. !
  16. ! Notes: Usually one wants to do small reads and write a longword, but
  17. ! unfortunately it is difficult in some cases to concatenate bytes
  18. ! into a longword on the SH, so this does a longword read and small
  19. ! writes.
  20. !
  21. ! This implementation makes two assumptions about how it is called:
  22. !
  23. ! 1.: If the byte count is nonzero, the address of the last byte to be
  24. ! copied is unsigned greater than the address of the first byte to
  25. ! be copied. This could be easily swapped for a signed comparison,
  26. ! but the algorithm used needs some comparison.
  27. !
  28. ! 2.: When there are two or three bytes in the last word of an 11-or-more
  29. ! bytes memory chunk to be copied, the rest of the word can be read
  30. ! without side effects.
  31. ! This could be easily changed by increasing the minimum size of
  32. ! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
  33. ! however, this would cost a few extra cycles on average.
  34. ! For SHmedia, the assumption is that any quadword can be read in its
  35. ! entirety if at least one byte is included in the copy.
  36. /* Imported into Linux kernel by Richard Curnow. This is used to implement the
  37. __copy_user function in the general case, so it has to be a distinct
  38. function from intra-kernel memcpy to allow for exception fix-ups in the
  39. event that the user pointer is bad somewhere in the copy (e.g. due to
  40. running off the end of the vma).
  41. Note, this algorithm will be slightly wasteful in the case where the source
  42. and destination pointers are equally aligned, because the stlo/sthi pairs
  43. could then be merged back into single stores. If there are a lot of cache
  44. misses, this is probably offset by the stall lengths on the preloads.
  45. */
  46. /* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
  47. * erratum. The first two prefetches are nop-ed out to avoid upsetting the
  48. * instruction counts used in the jump address calculation.
  49. * */
  50. .section .text..SHmedia32,"ax"
  51. .little
  52. .balign 32
  53. .global copy_user_memcpy /* entry point */
  54. .global copy_user_memcpy_end /* label placed after the last instruction of the routine */
  55. copy_user_memcpy: /* r2 = destination, r3 = source, r4 = byte count; r2 is never written, so it still holds the destination on exit */
  56. #define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1 /* unaligned quadword load as two halves: D0 from P+O, D1 from P+O+7 */
  57. #define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1 /* unaligned quadword store; not invoked in the code below */
  58. #define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1 /* unaligned longword load as two halves: D0 from P+O, D1 from P+O+3 */
  59. #define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1 /* unaligned longword store; not invoked in the code below */
  60. nop ! ld.b r3,0,r63 ! TAKum03020
  61. pta/l Large,tr0 /* tr0 = Large, the path for counts of 25 or more */
  62. movi 25,r0
  63. bgeu/u r4,r0,tr0 /* count >= 25 (unsigned): go to Large */
  64. nsb r4,r0 /* size class of the count: 63 for 0, 62 for 1, 61 for 2..3, 60 for 4..7, 59 for 8..15, 58 for 16..24 */
  65. shlli r0,5,r0 /* times 32: every size class owns a 32-byte (8-instruction) slot starting at L1 */
  66. movi (L1-L0+63*32 + 1) & 0xffff,r1 /* distance from L0 to slot 0 (L1), biased by 63*32; the +1 presumably marks the target as SHmedia code - confirm */
  67. sub r1, r0, r0 /* r0 = offset from L0 to the slot for this size class */
  68. L0: ptrel r0,tr0 /* tr0 = L0 + r0, i.e. the selected slot */
  69. add r2,r4,r5 /* r5 = destination end (one past the last byte) */
  70. ptabs r18,tr1 /* tr1 = address held in r18; every small-copy path exits through it (presumably the return address - confirm) */
  71. add r3,r4,r6 /* r6 = source end (one past the last byte) */
  72. blink tr0,r63 /* jump into the slot table; r63 as link register discards the link value */
  73. /* Rearranged to make cut2 safe */
  74. .balign 8
  75. L4_7: /* 4..7 byte memcpy cntd. */
  76. stlo.l r2, 0, r0 /* complete the store of the first longword at the destination start */
  77. or r6, r7, r6 /* merge the two halves of the last source longword */
  78. sthi.l r5, -1, r6 /* store it so that it ends on the last destination byte */
  79. stlo.l r5, -4, r6
  80. blink tr1,r63 /* exit through tr1 */
  81. .balign 8
  82. L1: /* 0 byte memcpy */
  83. nop
  84. blink tr1,r63 /* nothing to copy: exit through tr1 */
  85. nop /* four padding nops: together with the two L2_3 instructions they fill slot 0 to 8 instructions */
  86. nop
  87. nop
  88. nop
  89. L2_3: /* 2 or 3 byte memcpy cntd. */
  90. st.b r5,-1,r6 /* write the last byte, loaded into r6 by the 2-or-3-byte slot */
  91. blink tr1,r63
  92. /* 1 byte memcpy (slot 1: three instructions here, the five at L8_15 complete the slot) */
  93. ld.b r3,0,r0
  94. st.b r2,0,r0
  95. blink tr1,r63
  96. L8_15: /* 8..15 byte memcpy cntd. */
  97. stlo.q r2, 0, r0 /* complete the store of the first quadword at the destination start */
  98. or r6, r7, r6 /* merge the two halves of the last source quadword */
  99. sthi.q r5, -1, r6 /* store it so that it ends on the last destination byte */
  100. stlo.q r5, -8, r6
  101. blink tr1,r63 /* exit through tr1 */
  102. /* 2 or 3 byte memcpy (slot 2, exactly 8 instructions) */
  103. ld.b r3,0,r0
  104. nop ! ld.b r2,0,r63 ! TAKum03020
  105. ld.b r3,1,r1
  106. st.b r2,0,r0
  107. pta/l L2_3,tr0
  108. ld.b r6,-1,r6 /* r6 = last source byte (the same byte as r1 when the count is 2) */
  109. st.b r2,1,r1
  110. blink tr0, r63 /* continue at L2_3 */
  111. /* 4 .. 7 byte memcpy (slot 3, exactly 8 instructions) */
  112. LDUAL (r3, 0, r0, r1) /* first source longword, as two halves */
  113. pta L4_7, tr0
  114. ldlo.l r6, -4, r7 /* last source longword, halves in r7 and r6; overlaps the first one when count < 8 */
  115. or r0, r1, r0
  116. sthi.l r2, 3, r0
  117. ldhi.l r6, -1, r6
  118. blink tr0, r63 /* continue at L4_7 */
  119. /* 8 .. 15 byte memcpy (slot 4, exactly 8 instructions) */
  120. LDUAQ (r3, 0, r0, r1) /* first source quadword, as two halves */
  121. pta L8_15, tr0
  122. ldlo.q r6, -8, r7 /* last source quadword, halves in r7 and r6; overlaps the first one when count < 16 */
  123. or r0, r1, r0
  124. sthi.q r2, 7, r0
  125. ldhi.q r6, -1, r6
  126. blink tr0, r63 /* continue at L8_15 */
  127. /* 16 .. 24 byte memcpy (slot 5, the last slot, so it may run past 8 instructions) */
  128. LDUAQ (r3, 0, r0, r1) /* source bytes 0..7 */
  129. LDUAQ (r3, 8, r8, r9) /* source bytes 8..15 */
  130. or r0, r1, r0
  131. sthi.q r2, 7, r0
  132. or r8, r9, r8
  133. sthi.q r2, 15, r8
  134. ldlo.q r6, -8, r7 /* last 8 source bytes, halves in r7 and r6; overlaps bytes 8..15 when count < 24 */
  135. ldhi.q r6, -1, r6
  136. stlo.q r2, 8, r8
  137. stlo.q r2, 0, r0
  138. or r6, r7, r6
  139. sthi.q r5, -1, r6 /* final 8 bytes, ending on the last destination byte */
  140. stlo.q r5, -8, r6
  141. blink tr1,r63 /* exit through tr1 */
  142. Large: /* reached from the entry code when count >= 25; r2 = destination, r3 = source, r4 = count */
  143. ! ld.b r2, 0, r63 ! TAKum03020
  144. pta/l Loop_ua, tr1 /* tr1 = Loop_ua (replaced by the exit address again in the tail) */
  145. ori r3, -8, r7 /* r7 = (r3 & 7) - 8, a value in -8..-1 */
  146. sub r2, r7, r22 /* r22 = destination address whose matching source address is the next 8-byte boundary above r3 */
  147. sub r3, r2, r6 /* r6 = source - destination, so r22 + r6 is the source address matching r22 */
  148. add r2, r4, r5
  149. ldlo.q r3, 0, r0 /* first (possibly partial) source quadword */
  150. addi r5, -16, r5 /* r5 = destination end - 16 */
  151. movi 64+8, r27 ! could subtract r7 from that.
  152. stlo.q r2, 0, r0 /* write the head bytes at the destination start */
  153. sthi.q r2, 7, r0
  154. ldx.q r22, r6, r0 /* r0 = aligned source quadword destined for r22; stays pending until stored */
  155. bgtu/l r27, r4, tr1 /* count < 72: skip the 32-byte loop and go straight to Loop_ua */
  156. addi r5, -48, r27 /* r27 = destination end - 64, the bound for Loop_line */
  157. pta/l Loop_line, tr0
  158. addi r6, 64, r36 /* only referenced by the commented-out prefetch at the top of Loop_line */
  159. addi r6, -24, r19 /* r19/r20/r21: source offsets for the quadwords 24, 16 and 8 bytes below r22 */
  160. addi r6, -16, r20
  161. addi r6, -8, r21
  162. Loop_line: /* copies 32 bytes per iteration; r0 carries one pending quadword across iterations */
  163. ! ldx.q r22, r36, r63 ! TAKum03020
  164. alloco r22, 32 /* followed by synco, per the TAKum03020 note in the file header */
  165. synco
  166. addi r22, 32, r22 /* advance the destination cursor by 32 */
  167. ldx.q r22, r19, r23 /* load the next three source quadwords */
  168. sthi.q r22, -25, r0 /* store the pending quadword at r22 - 32 (high part here, low part below) */
  169. ldx.q r22, r20, r24
  170. ldx.q r22, r21, r25
  171. stlo.q r22, -32, r0
  172. ldx.q r22, r6, r0 /* new pending quadword for the next iteration */
  173. sthi.q r22, -17, r23 /* store the three quadwords at r22 - 24, r22 - 16 and r22 - 8 */
  174. sthi.q r22, -9, r24
  175. sthi.q r22, -1, r25
  176. stlo.q r22, -24, r23
  177. stlo.q r22, -16, r24
  178. stlo.q r22, -8, r25
  179. bgeu r27, r22, tr0 /* loop while r22 <= r27 (unsigned) */
  180. Loop_ua: /* copies 8 bytes per iteration */
  181. addi r22, 8, r22
  182. sthi.q r22, -1, r0 /* store the pending quadword at r22 - 8 */
  183. stlo.q r22, -8, r0
  184. ldx.q r22, r6, r0 /* load the next pending quadword */
  185. bgtu/l r5, r22, tr1 /* loop while r22 < r5 (unsigned); tr1 still points at Loop_ua here */
  186. add r3, r4, r7 /* r7 = source end */
  187. ldlo.q r7, -8, r1 /* last 8 source bytes, halves in r1 and r7 */
  188. sthi.q r22, 7, r0 /* store the final pending quadword at r22 */
  189. ldhi.q r7, -1, r7
  190. ptabs r18,tr1 /* tr1 = address held in r18 for the exit branch (presumably the return address - confirm) */
  191. stlo.q r22, 0, r0
  192. or r1, r7, r1
  193. sthi.q r5, 15, r1 /* r5 + 16 is the destination end, so this pair writes the final 8 destination bytes */
  194. stlo.q r5, 8, r1
  195. blink tr1, r63 /* exit through tr1 */
  196. copy_user_memcpy_end: /* end marker exported with .global; presumably lets fault fix-up code bound the routine - confirm */
  197. nop