csum_copy.S 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315
  1. /* SPDX-License-Identifier: GPL-2.0 */
  2. /* csum_copy.S: Checksum+copy code for sparc64
  3. *
  4. * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
  5. */
  6. #include <asm/export.h>
  /* Spare global register used by the code in this file: %g7 when
   * __KERNEL__ is defined, %g5 otherwise.
   */
  7. #ifdef __KERNEL__
  8. #define GLOBAL_SPARE %g7
  9. #else
  10. #define GLOBAL_SPARE %g5
  11. #endif
  /* Every macro below is defined only when it is not already defined,
   * so an earlier definition takes precedence.  The defaults for EX_LD,
   * EX_ST and EX_RETVAL expand to their argument unchanged.
   * NOTE(review): presumably a file that includes this one predefines
   * them to attach fault handling to the wrapped accesses -- confirm
   * against the including files.
   */
  12. #ifndef EX_LD
  13. #define EX_LD(x) x
  14. #endif
  15. #ifndef EX_ST
  16. #define EX_ST(x) x
  17. #endif
  18. #ifndef EX_RETVAL
  19. #define EX_RETVAL(x) x
  20. #endif
  /* Default LOAD expands to "type [addr], dest". */
  21. #ifndef LOAD
  22. #define LOAD(type,addr,dest) type [addr], dest
  23. #endif
  /* Default STORE expands to "type src, [addr]". */
  24. #ifndef STORE
  25. #define STORE(type,src,addr) type src, [addr]
  26. #endif
  /* Name given to the routine defined below unless overridden. */
  27. #ifndef FUNC_NAME
  28. #define FUNC_NAME csum_partial_copy_nocheck
  29. #endif
  /* Declare %g2 and %g3 to the assembler as scratch registers; both
   * are written by the code in this file.
   */
  30. .register %g2, #scratch
  31. .register %g3, #scratch
  32. .text
  /* Alignment fixup, entered from FUNC_NAME through "bne,pn %icc, 90b"
   * when the low two bits of src (%o0) are not both zero.  On that path
   * (src ^ dst) & 3 has already tested as zero, so dst (%o1) has the
   * same low two bits as src.  Copies at most one byte and one halfword,
   * seeding the sum in %o4, then continues at 80 (src now 4-byte
   * aligned) or at 60 (fewer than two bytes remain).
   *
   * The condition codes tested by the first "be" were set by
   * "andcc %o0, 0x1, GLOBAL_SPARE" in the delay slot of the branch that
   * jumps here.  The instruction after each branch is its delay slot.
   */
  33. 90:
  34. /* We checked for zero length already, so there must be
  35. * at least one byte.
  36. */
  37. be,pt %icc, 1f /* (src & 1) == 0: no leading byte to copy */
  38. nop /* (delay slot) */
  39. EX_LD(LOAD(ldub, %o0 + 0x00, %o4)) /* %o4 was cleared at entry, so this byte seeds the sum */
  40. add %o0, 1, %o0 /* src += 1 */
  41. sub %o2, 1, %o2 /* len -= 1 */
  42. EX_ST(STORE(stb, %o4, %o1 + 0x00))
  43. add %o1, 1, %o1 /* dst += 1 */
  44. 1: andcc %o0, 0x2, %g0 /* is bit 1 of src set? */
  45. be,pn %icc, 80f /* no: src is 4-byte aligned, go to the main path */
  46. cmp %o2, 2 /* (delay slot, not annulled: runs on both paths) */
  47. blu,pn %icc, 60f /* fewer than 2 bytes left: go to the tail code */
  48. nop /* (delay slot) */
  49. EX_LD(LOAD(lduh, %o0 + 0x00, %o5)) /* copy one halfword to reach 4-byte alignment */
  50. add %o0, 2, %o0 /* src += 2 */
  51. sub %o2, 2, %o2 /* len -= 2 */
  52. EX_ST(STORE(sth, %o5, %o1 + 0x00))
  53. add %o1, 2, %o1 /* dst += 2 */
  54. ba,pt %xcc, 80f
  55. add %o5, %o4, %o4 /* (delay slot) add the halfword into the sum */
  /* FUNC_NAME: copy len bytes from src to dst and checksum them.
   *
   * In:  %o0 = src, %o1 = dst, %o2 = len, %o3 = incoming sum
   * Out: %o0 = incoming sum plus the sum of the copied data, with the
   *      carry added back in, zero-extended from 32 bits
   *
   * This part handles src and dst that agree in their low two address
   * bits; otherwise control goes to label 95.  %o4 accumulates the sum,
   * %o5/%g1/%g2 are temporaries, %g3 counts bytes for the copy loops and
   * GLOBAL_SPARE holds src & 1 so the result can be byte-swapped at the
   * end.
   *
   * The instruction after each branch is its delay slot.
   */
  56. .globl FUNC_NAME
  57. .type FUNC_NAME,#function
  58. EXPORT_SYMBOL(FUNC_NAME)
  59. FUNC_NAME: /* %o0=src, %o1=dst, %o2=len, %o3=sum */
  60. LOAD(prefetch, %o0 + 0x000, #n_reads)
  61. xor %o0, %o1, %g1 /* %g1 = src ^ dst */
  62. clr %o4 /* running sum = 0 */
  63. andcc %g1, 0x3, %g0 /* do src and dst differ in their low two bits? */
  64. bne,pn %icc, 95f /* yes: take the byte-store path at 95 */
  65. LOAD(prefetch, %o0 + 0x040, #n_reads) /* (delay slot) */
  66. brz,pn %o2, 70f /* len == 0: return the incoming sum */
  67. andcc %o0, 0x3, %g0 /* (delay slot) is src 4-byte aligned? */
  68. /* We "remember" whether the lowest bit in the address
  69. * was set in GLOBAL_SPARE. Because if it is, we have to swap
  70. * upper and lower 8 bit fields of the sum we calculate.
  71. */
  72. bne,pn %icc, 90b /* not aligned: fix up at 90, which rejoins at 80 or 60 */
  73. andcc %o0, 0x1, GLOBAL_SPARE /* (delay slot, runs on both paths) GLOBAL_SPARE = src & 1 */
  74. 80:
  75. LOAD(prefetch, %o0 + 0x080, #n_reads)
  76. andncc %o2, 0x3f, %g3 /* %g3 = len with the low six bits cleared (bytes in whole 64-byte chunks) */
  77. LOAD(prefetch, %o0 + 0x0c0, #n_reads)
  78. sub %o2, %g3, %o2 /* %o2 = len & 0x3f, left for the word loop and tail */
  79. brz,pn %g3, 2f /* no whole 64-byte chunk: skip the unrolled loop */
  80. LOAD(prefetch, %o0 + 0x100, #n_reads) /* (delay slot) */
  81. /* So that we don't need to use the non-pairing
  82. * add-with-carry instructions we accumulate 32-bit
  83. * values into a 64-bit register. At the end of the
  84. * loop we fold it down to 32-bits and so on.
  85. */
  86. ba,pt %xcc, 1f
  87. LOAD(prefetch, %o0 + 0x140, #n_reads) /* (delay slot) */
  88. .align 32
  /* Unrolled loop: 16 word loads and stores (64 bytes) per iteration,
   * rotating the loaded data through %o5, %g1 and %g2.
   */
  89. 1: EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
  90. EX_LD(LOAD(lduw, %o0 + 0x04, %g1))
  91. EX_LD(LOAD(lduw, %o0 + 0x08, %g2))
  92. add %o4, %o5, %o4
  93. EX_ST(STORE(stw, %o5, %o1 + 0x00))
  94. EX_LD(LOAD(lduw, %o0 + 0x0c, %o5))
  95. add %o4, %g1, %o4
  96. EX_ST(STORE(stw, %g1, %o1 + 0x04))
  97. EX_LD(LOAD(lduw, %o0 + 0x10, %g1))
  98. add %o4, %g2, %o4
  99. EX_ST(STORE(stw, %g2, %o1 + 0x08))
  100. EX_LD(LOAD(lduw, %o0 + 0x14, %g2))
  101. add %o4, %o5, %o4
  102. EX_ST(STORE(stw, %o5, %o1 + 0x0c))
  103. EX_LD(LOAD(lduw, %o0 + 0x18, %o5))
  104. add %o4, %g1, %o4
  105. EX_ST(STORE(stw, %g1, %o1 + 0x10))
  106. EX_LD(LOAD(lduw, %o0 + 0x1c, %g1))
  107. add %o4, %g2, %o4
  108. EX_ST(STORE(stw, %g2, %o1 + 0x14))
  109. EX_LD(LOAD(lduw, %o0 + 0x20, %g2))
  110. add %o4, %o5, %o4
  111. EX_ST(STORE(stw, %o5, %o1 + 0x18))
  112. EX_LD(LOAD(lduw, %o0 + 0x24, %o5))
  113. add %o4, %g1, %o4
  114. EX_ST(STORE(stw, %g1, %o1 + 0x1c))
  115. EX_LD(LOAD(lduw, %o0 + 0x28, %g1))
  116. add %o4, %g2, %o4
  117. EX_ST(STORE(stw, %g2, %o1 + 0x20))
  118. EX_LD(LOAD(lduw, %o0 + 0x2c, %g2))
  119. add %o4, %o5, %o4
  120. EX_ST(STORE(stw, %o5, %o1 + 0x24))
  121. EX_LD(LOAD(lduw, %o0 + 0x30, %o5))
  122. add %o4, %g1, %o4
  123. EX_ST(STORE(stw, %g1, %o1 + 0x28))
  124. EX_LD(LOAD(lduw, %o0 + 0x34, %g1))
  125. add %o4, %g2, %o4
  126. EX_ST(STORE(stw, %g2, %o1 + 0x2c))
  127. EX_LD(LOAD(lduw, %o0 + 0x38, %g2))
  128. add %o4, %o5, %o4
  129. EX_ST(STORE(stw, %o5, %o1 + 0x30))
  130. EX_LD(LOAD(lduw, %o0 + 0x3c, %o5))
  131. add %o4, %g1, %o4
  132. EX_ST(STORE(stw, %g1, %o1 + 0x34))
  133. LOAD(prefetch, %o0 + 0x180, #n_reads)
  134. add %o4, %g2, %o4
  135. EX_ST(STORE(stw, %g2, %o1 + 0x38))
  136. subcc %g3, 0x40, %g3 /* sets the condition codes tested by bne below */
  137. add %o0, 0x40, %o0 /* src += 64 */
  138. add %o4, %o5, %o4
  139. EX_ST(STORE(stw, %o5, %o1 + 0x3c))
  140. bne,pt %icc, 1b /* loop while %g3 != 0 */
  141. add %o1, 0x40, %o1 /* (delay slot) dst += 64 */
  142. 2: and %o2, 0x3c, %g3 /* %g3 = bytes left in whole 32-bit words */
  143. brz,pn %g3, 2f /* none: skip the word loop */
  144. sub %o2, %g3, %o2 /* (delay slot, runs on both paths) %o2 = bytes left after the words */
  145. 1: EX_LD(LOAD(lduw, %o0 + 0x00, %o5)) /* one word per iteration */
  146. subcc %g3, 0x4, %g3 /* sets the condition codes tested by bne below */
  147. add %o0, 0x4, %o0 /* src += 4 */
  148. add %o4, %o5, %o4
  149. EX_ST(STORE(stw, %o5, %o1 + 0x00))
  150. bne,pt %icc, 1b /* loop while %g3 != 0 */
  151. add %o1, 0x4, %o1 /* (delay slot) dst += 4 */
  152. 2:
  153. /* fold 64-->32 */
  154. srlx %o4, 32, %o5 /* %o5 = upper 32 bits of the sum */
  155. srl %o4, 0, %o4 /* keep only the lower 32 bits in %o4 */
  156. add %o4, %o5, %o4
  157. srlx %o4, 32, %o5 /* second pass absorbs a carry out of bit 31 */
  158. srl %o4, 0, %o4
  159. add %o4, %o5, %o4
  160. /* fold 32-->16 */
  161. sethi %hi(0xffff0000), %g1 /* %g1 = 0xffff0000 */
  162. srl %o4, 16, %o5 /* %o5 = bits 31:16 */
  163. andn %o4, %g1, %g2 /* %g2 = bits 15:0 */
  164. add %o5, %g2, %o4
  165. srl %o4, 16, %o5 /* second pass absorbs a carry out of bit 15 */
  166. andn %o4, %g1, %g2
  167. add %o5, %g2, %o4
  168. 60:
  169. /* %o4 has the 16-bit sum we have calculated so-far. */
  170. cmp %o2, 2
  171. blu,pt %icc, 1f /* fewer than 2 bytes left: skip the halfword */
  172. nop /* (delay slot) */
  173. EX_LD(LOAD(lduh, %o0 + 0x00, %o5)) /* trailing halfword */
  174. sub %o2, 2, %o2 /* len -= 2 */
  175. add %o0, 2, %o0 /* src += 2 */
  176. add %o4, %o5, %o4
  177. EX_ST(STORE(sth, %o5, %o1 + 0x00))
  178. add %o1, 0x2, %o1 /* dst += 2 */
  179. 1: brz,pt %o2, 1f /* nothing left: skip the byte */
  180. nop /* (delay slot) */
  181. EX_LD(LOAD(ldub, %o0 + 0x00, %o5)) /* trailing byte */
  182. sub %o2, 1, %o2
  183. add %o0, 1, %o0
  184. EX_ST(STORE(stb, %o5, %o1 + 0x00))
  185. sllx %o5, 8, %o5 /* the trailing byte is added shifted left by 8 */
  186. add %o1, 1, %o1
  187. add %o4, %o5, %o4
  188. 1:
  189. /* fold 32-->16 */
  190. sethi %hi(0xffff0000), %g1 /* %g1 = 0xffff0000 */
  191. srl %o4, 16, %o5
  192. andn %o4, %g1, %g2
  193. add %o5, %g2, %o4
  194. srl %o4, 16, %o5
  195. andn %o4, %g1, %g2
  196. add %o5, %g2, %o4
  197. 1: brz,pt GLOBAL_SPARE, 1f /* GLOBAL_SPARE == 0: src started on an even address, no swap */
  198. nop /* (delay slot) */
  199. /* We started with an odd byte, byte-swap the result. */
  200. srl %o4, 8, %o5 /* %o5 = sum >> 8 */
  201. and %o4, 0xff, %g1 /* %g1 = low byte of the sum */
  202. sll %g1, 8, %g1
  203. or %o5, %g1, %o4
  204. 1: addcc %o3, %o4, %o3 /* add our sum to the incoming sum, setting the carry */
  205. addc %g0, %o3, %o3 /* add that carry back in */
  206. 70:
  207. retl
  208. srl %o3, 0, %o0 /* (delay slot) return the low 32 bits of %o3, zero-extended */
  /* Path entered from FUNC_NAME when src (%o0) and dst (%o1) differ in
   * their low two address bits.  src is brought up to 4-byte alignment
   * and read with byte, halfword and word loads, while dst is written
   * one byte at a time.  The sum is accumulated in GLOBAL_SPARE, %o5
   * remembers src & 1 for the final byte swap, %g1 counts halfwords and
   * then words, and the low bits of %o2 select the trailing halfword
   * and byte.
   *
   * The instruction after each branch is its delay slot.  On branches
   * written with ",a" the delay slot is annulled, i.e. it executes only
   * when the branch is taken.
   */
  209. 95: mov 0, GLOBAL_SPARE /* running sum = 0 */
  210. brlez,pn %o2, 4f /* len <= 0: nothing to copy */
  211. andcc %o0, 1, %o5 /* (delay slot) %o5 = src & 1 */
  212. be,a,pt %icc, 1f /* src is even: no leading byte */
  213. srl %o2, 1, %g1 /* (annulled slot) %g1 = len / 2 */
  214. sub %o2, 1, %o2 /* len -= 1 */
  215. EX_LD(LOAD(ldub, %o0, GLOBAL_SPARE)) /* the leading byte seeds the sum */
  216. add %o0, 1, %o0 /* src += 1 */
  217. EX_ST(STORE(stb, GLOBAL_SPARE, %o1))
  218. srl %o2, 1, %g1 /* %g1 = len / 2 */
  219. add %o1, 1, %o1 /* dst += 1 */
  220. 1: brz,a,pn %g1, 3f /* no whole halfword left: go to the trailing-byte check */
  221. andcc %o2, 1, %g0 /* (annulled slot) sets the flags tested at 3 */
  222. andcc %o0, 2, %g0 /* is bit 1 of src set? */
  223. be,a,pt %icc, 1f /* no: src is 4-byte aligned */
  224. srl %g1, 1, %g1 /* (annulled slot) %g1 = word count */
  225. EX_LD(LOAD(lduh, %o0, %o4)) /* copy one halfword to reach 4-byte alignment */
  226. sub %o2, 2, %o2 /* len -= 2 */
  227. srl %o4, 8, %g2 /* %g2 = high byte of the halfword */
  228. sub %g1, 1, %g1 /* one halfword consumed */
  229. EX_ST(STORE(stb, %g2, %o1))
  230. add %o4, GLOBAL_SPARE, GLOBAL_SPARE
  231. EX_ST(STORE(stb, %o4, %o1 + 1)) /* low byte */
  232. add %o0, 2, %o0 /* src += 2 */
  233. srl %g1, 1, %g1 /* %g1 = word count */
  234. add %o1, 2, %o1 /* dst += 2 */
  235. 1: brz,a,pn %g1, 2f /* no whole word left */
  236. andcc %o2, 2, %g0 /* (annulled slot) sets the flags tested at 2 */
  237. EX_LD(LOAD(lduw, %o0, %o4)) /* first word; later words are loaded in the loop branch's delay slot */
  238. 5: srl %o4, 24, %g2 /* store the four bytes of the word, most significant first */
  239. srl %o4, 16, %g3
  240. EX_ST(STORE(stb, %g2, %o1))
  241. srl %o4, 8, %g2
  242. EX_ST(STORE(stb, %g3, %o1 + 1))
  243. add %o0, 4, %o0 /* src += 4 */
  244. EX_ST(STORE(stb, %g2, %o1 + 2))
  245. addcc %o4, GLOBAL_SPARE, GLOBAL_SPARE /* sum += word, setting the carry */
  246. EX_ST(STORE(stb, %o4, %o1 + 3))
  247. addc GLOBAL_SPARE, %g0, GLOBAL_SPARE /* add the carry back in */
  248. add %o1, 4, %o1 /* dst += 4 */
  249. subcc %g1, 1, %g1
  250. bne,a,pt %icc, 5b /* loop while words remain */
  251. EX_LD(LOAD(lduw, %o0, %o4)) /* (annulled slot) load the next word only when looping */
  252. sll GLOBAL_SPARE, 16, %g2
  253. srl GLOBAL_SPARE, 16, GLOBAL_SPARE /* GLOBAL_SPARE = bits 31:16 of the sum */
  254. srl %g2, 16, %g2 /* %g2 = bits 15:0 of the sum */
  255. andcc %o2, 2, %g0 /* sets the flags tested at 2 */
  256. add %g2, GLOBAL_SPARE, GLOBAL_SPARE /* sum = low half + high half */
  257. 2: be,a,pt %icc, 3f /* (len & 2) == 0: no trailing halfword */
  258. andcc %o2, 1, %g0 /* (annulled slot) sets the flags tested at 3 */
  259. EX_LD(LOAD(lduh, %o0, %o4)) /* trailing halfword */
  260. andcc %o2, 1, %g0 /* sets the flags tested at 3 */
  261. srl %o4, 8, %g2 /* %g2 = high byte */
  262. add %o0, 2, %o0 /* src += 2 */
  263. EX_ST(STORE(stb, %g2, %o1))
  264. add GLOBAL_SPARE, %o4, GLOBAL_SPARE
  265. EX_ST(STORE(stb, %o4, %o1 + 1)) /* low byte */
  266. add %o1, 2, %o1 /* dst += 2 */
  267. 3: be,a,pt %icc, 1f /* (len & 1) == 0: no trailing byte */
  268. sll GLOBAL_SPARE, 16, %o4 /* (annulled slot) %o4 = sum << 16 */
  269. EX_LD(LOAD(ldub, %o0, %g2)) /* trailing byte */
  270. sll %g2, 8, %o4 /* the trailing byte is added shifted left by 8 */
  271. EX_ST(STORE(stb, %g2, %o1))
  272. add GLOBAL_SPARE, %o4, GLOBAL_SPARE
  273. sll GLOBAL_SPARE, 16, %o4 /* %o4 = sum << 16 */
  274. 1: addcc %o4, GLOBAL_SPARE, GLOBAL_SPARE /* (sum << 16) + sum, setting the carry */
  275. srl GLOBAL_SPARE, 16, %o4 /* %o4 = bits 31:16 of that result */
  276. addc %g0, %o4, GLOBAL_SPARE /* folded sum = %o4 + carry */
  277. brz,pt %o5, 4f /* src started on an even address: no swap */
  278. srl GLOBAL_SPARE, 8, %o4 /* (delay slot, runs on both paths) */
  279. and GLOBAL_SPARE, 0xff, %g2 /* %g2 = low byte of the sum */
  280. and %o4, 0xff, %o4 /* %o4 = second byte of the sum */
  281. sll %g2, 8, %g2
  282. or %g2, %o4, GLOBAL_SPARE /* the two bytes, swapped */
  283. 4: addcc %o3, GLOBAL_SPARE, %o3 /* add our sum to the incoming sum, setting the carry */
  284. addc %g0, %o3, %o0 /* add that carry back in; result goes to %o0 */
  285. retl
  286. srl %o0, 0, %o0 /* (delay slot) return the low 32 bits, zero-extended */
  287. .size FUNC_NAME, .-FUNC_NAME