/* SPDX-License-Identifier: GPL-2.0 */
/* U3memcpy.S: UltraSparc-III optimized memcpy.
 *
 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE	%g5
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif
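
/* The overridable wrappers above let this body be shared: the kernel's
 * U3copy_{from,to}_user wrappers redefine EX_LD/EX_ST and friends with
 * exception-table-annotated accesses, while the bare defaults expand to
 * plain instructions.  For example, with the defaults,
 *
 *	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U3_retl_o2_plus_GS_plus_0x10)
 *
 * expands to just "ldx [%o1 + 0x08], %g1"; the second (fault handler)
 * argument is discarded.
 */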
#ifndef FUNC_NAME
#define FUNC_NAME	U3memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	/* Special/non-trivial issues of this code:
	 *
	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
	 * 2) Only low 32 FPU registers are used so that only the
	 *    lower half of the FPU register set is dirtied by this
	 *    code.  This is especially important in the kernel.
	 * 3) This code never prefetches cachelines past the end
	 *    of the source buffer.
	 */
	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
__restore_fp:
	VISExitHalf
	retl
	 nop
ENTRY(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
	add	%g1, 1, %g1
	add	%g2, %g1, %g2
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
ENTRY(U3_retl_o2_plus_g2_fp)
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_fp)
ENTRY(U3_retl_o2_plus_g2_plus_8_fp)
	add	%g2, 8, %g2
	ba,pt	%xcc, __restore_fp
	 add	%o2, %g2, %o0
ENDPROC(U3_retl_o2_plus_g2_plus_8_fp)
ENTRY(U3_retl_o2)
	retl
	 mov	%o2, %o0
ENDPROC(U3_retl_o2)
ENTRY(U3_retl_o2_plus_1)
	retl
	 add	%o2, 1, %o0
ENDPROC(U3_retl_o2_plus_1)
ENTRY(U3_retl_o2_plus_4)
	retl
	 add	%o2, 4, %o0
ENDPROC(U3_retl_o2_plus_4)
ENTRY(U3_retl_o2_plus_8)
	retl
	 add	%o2, 8, %o0
ENDPROC(U3_retl_o2_plus_8)
ENTRY(U3_retl_o2_plus_g1_plus_1)
	add	%g1, 1, %g1
	retl
	 add	%o2, %g1, %o0
ENDPROC(U3_retl_o2_plus_g1_plus_1)
ENTRY(U3_retl_o2_fp)
	ba,pt	%xcc, __restore_fp
	 mov	%o2, %o0
ENDPROC(U3_retl_o2_fp)
ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	sll	%o3, 6, %o3
	add	%o3, 0x80, %o3
	ba,pt	%xcc, __restore_fp
	 add	%o2, %o3, %o0
ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
ENTRY(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	sll	%o3, 6, %o3
	add	%o3, 0x40, %o3
	ba,pt	%xcc, __restore_fp
	 add	%o2, %o3, %o0
ENDPROC(U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
ENTRY(U3_retl_o2_plus_GS_plus_0x10)
	add	GLOBAL_SPARE, 0x10, GLOBAL_SPARE
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_plus_GS_plus_0x10)
ENTRY(U3_retl_o2_plus_GS_plus_0x08)
	add	GLOBAL_SPARE, 0x08, GLOBAL_SPARE
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_plus_GS_plus_0x08)
ENTRY(U3_retl_o2_and_7_plus_GS)
	and	%o2, 7, %o2
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_and_7_plus_GS)
ENTRY(U3_retl_o2_and_7_plus_GS_plus_8)
	add	GLOBAL_SPARE, 8, GLOBAL_SPARE
	and	%o2, 7, %o2
	retl
	 add	%o2, GLOBAL_SPARE, %o0
ENDPROC(U3_retl_o2_and_7_plus_GS_plus_8)
#endif
	.align		64

	/* The cheetah's flexible spine, oversized liver, enlarged heart,
	 * slender muscular body, and claws make it the swiftest hunter
	 * in Africa and the fastest animal on land.  Can reach speeds
	 * of up to 2.4GB per second.
	 */

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	/* software trap 5 "Range Check" if len >= 0x80000000 */
	tne		%xcc, 5
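	/* Worked example of the check above: for len = 0x80000000,
	 * "srlx %o2, 31" leaves %g2 = 1, the compare sets the
	 * not-equal condition, and tne raises software trap 5.  Any
	 * length below 2GB leaves %g2 = 0 and falls through.
	 */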
	PREAMBLE
	mov		%o0, %o4

	/* if len == 0 */
	cmp		%o2, 0
	be,pn		%XCC, end_return
	 or		%o0, %o1, %o3

	/* if len < 16 */
	cmp		%o2, 16
	blu,a,pn	%XCC, less_than_16
	 or		%o3, %o2, %o3

	/* if len < 192 */
	cmp		%o2, (3 * 64)
	blu,pt		%XCC, less_than_192
	 andcc		%o3, 0x7, %g0

	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf

	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the
	 * number of bytes to copy to make 'dst' 64-byte aligned.  We
	 * pre-subtract this from 'len'.
	 */
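	/* Worked example: if dst & 0x3f = 0x05, %g2 becomes
	 * abs(0x05 - 0x40) = 0x3b, i.e. 59 head bytes to the next
	 * 64-byte boundary.  The byte loop at 1: copies the low-three-
	 * bits part (%g1 = 3); the remaining 0x38 bytes left in %g2
	 * are then moved by the doubleword loop further down.
	 */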
	 sub		%o0, %o1, GLOBAL_SPARE
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2

	andcc		%g2, 0x7, %g1
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2

1:	subcc		%g1, 0x1, %g1
	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3), U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
	EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE), U3_retl_o2_plus_g2_plus_g1_plus_1_fp)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, GLOBAL_SPARE, %o0
2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

	EX_LD_FP(LOAD(ldd, %o1, %f4), U3_retl_o2_plus_g2_fp)
1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6), U3_retl_o2_plus_g2_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST_FP(STORE(std, %f0, %o0), U3_retl_o2_plus_g2_plus_8_fp)
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4), U3_retl_o2_plus_g2_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f2
	EX_ST_FP(STORE(std, %f2, %o0), U3_retl_o2_plus_g2_plus_8_fp)
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0
3:	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	andn		%o2, (0x40 - 1), GLOBAL_SPARE
	LOAD(prefetch, %o1 + 0x080, #one_read)
	LOAD(prefetch, %o1 + 0x0c0, #one_read)
	LOAD(prefetch, %o1 + 0x100, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0), U3_retl_o2_fp)
	LOAD(prefetch, %o1 + 0x140, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_fp)
	LOAD(prefetch, %o1 + 0x180, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_fp)
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_fp)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_fp)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_fp)
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_fp)
	faligndata	%f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_fp)
	faligndata	%f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_fp)

	subcc		GLOBAL_SPARE, 0x80, GLOBAL_SPARE
	add		%o1, 0x40, %o1
	bgu,pt		%XCC, 1f
	 srl		GLOBAL_SPARE, 6, %o3
	ba,pt		%xcc, 2f
	 nop

	.align		64
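	/* Main 64-bytes-per-iteration loop.  It is software pipelined:
	 * each pass issues the ldd loads for the next block while the
	 * faligndata merges and the 64-byte block store (stda via
	 * STORE_BLK) drain the previous one, so loads, VIS alignment
	 * ops, and the block store all overlap.
	 */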
1:
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	faligndata	%f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f0, %f2, %f16
	add		%o0, 0x40, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	subcc		%o3, 0x01, %o3
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	faligndata	%f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f10, %f12, %f26
	bg,pt		%XCC, 1b
	 add		%o1, 0x40, %o1
	/* Finally we copy the last full 64-byte block. */
2:
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	faligndata	%f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x80_fp)
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	faligndata	%f8, %f10, %f24
	cmp		%g1, 0
	be,pt		%XCC, 1f
	 add		%o0, 0x40, %o0
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
1:	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0), U3_retl_o2_plus_o3_sll_6_plus_0x40_fp)
	add		%o0, 0x40, %o0
	add		%o1, 0x40, %o1
	membar		#Sync
	/* Now we copy the (len modulo 64) bytes at the end.
	 * Note how we borrow the %f0 loaded above.
	 *
	 * Also notice how this code is careful not to perform a
	 * load past the end of the src buffer.
	 */
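	/* Worked example for the tail: with len & 0x3f = 0x1d (29),
	 * %g2 = 0x18 is pre-decremented to 0x10, so the std loop below
	 * moves two doublewords.  Its ldd looks one doubleword ahead,
	 * and stopping 8 bytes early is what keeps that look-ahead
	 * load inside the src buffer; the remaining %o2 = 13 bytes are
	 * finished by the 8/4/2/1 ladder after VISExitHalf.
	 */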
	and		%o2, 0x3f, %o2
	andcc		%o2, 0x38, %g2
	be,pn		%XCC, 2f
	 subcc		%g2, 0x8, %g2
	be,pn		%XCC, 2f
	 cmp		%g1, 0

	sub		%o2, %g2, %o2
	be,a,pt		%XCC, 1f
	 EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0), U3_retl_o2_plus_g2_fp)
1:	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2), U3_retl_o2_plus_g2_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f0, %f2, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8_fp)
	be,pn		%XCC, 2f
	 add		%o0, 0x8, %o0
	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0), U3_retl_o2_plus_g2_fp)
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f2, %f0, %f8
	EX_ST_FP(STORE(std, %f8, %o0), U3_retl_o2_plus_g2_plus_8_fp)
	bne,pn		%XCC, 1b
	 add		%o0, 0x8, %o0
	/* If anything is left, we copy it one byte at a time.
	 * Note that %g1 is (src & 0x7) saved above before the
	 * alignaddr was performed.
	 */
2:
	cmp		%o2, 0
	add		%o1, %g1, %o1
	VISExitHalf
	be,pn		%XCC, end_return
	 sub		%o0, %o1, %o3

	andcc		%g1, 0x7, %g0
	bne,pn		%icc, 90f
	 andcc		%o2, 0x8, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x8, %o1
	sub		%o2, 8, %o2

1:	andcc		%o2, 0x4, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x4, %o1
	sub		%o2, 4, %o2

1:	andcc		%o2, 0x2, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduh, %o1, %o5), U3_retl_o2)
	EX_ST(STORE(sth, %o5, %o1 + %o3), U3_retl_o2)
	add		%o1, 0x2, %o1
	sub		%o2, 2, %o2

1:	andcc		%o2, 0x1, %g0
	be,pt		%icc, end_return
	 nop
	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2)
	ba,pt		%xcc, end_return
	 EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2)
	.align		64
	/* 16 <= len < 192 */
less_than_192:
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3
72:
	andn		%o2, 0xf, GLOBAL_SPARE
	and		%o2, 0xf, %o2
1:	subcc		GLOBAL_SPARE, 0x10, GLOBAL_SPARE
	EX_LD(LOAD(ldx, %o1 + 0x00, %o5), U3_retl_o2_plus_GS_plus_0x10)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1), U3_retl_o2_plus_GS_plus_0x10)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x10)
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3), U3_retl_o2_plus_GS_plus_0x08)
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5), U3_retl_o2_plus_8)
	EX_ST(STORE(stx, %o5, %o1 + %o3), U3_retl_o2_plus_8)
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5), U3_retl_o2_plus_4)
	EX_ST(STORE(stw, %o5, %o1 + %o3), U3_retl_o2_plus_4)
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, end_return
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5), U3_retl_o2_plus_g1_plus_1)
	EX_ST(STORE(stb, %o5, %o1 + %o3), U3_retl_o2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b
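	/* dst is now 8-byte aligned but src is not: copy one doubleword
	 * per iteration by shifting adjacent source doublewords
	 * together.  E.g. with src & 0x7 = 3, %g1 = 24 and %o3 = 40,
	 * so each stored doubleword is (current << 24) | (next >> 40).
	 */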
8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2), U3_retl_o2)
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, GLOBAL_SPARE
	sllx		%g2, %g1, %g2

1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3), U3_retl_o2_and_7_plus_GS)
	subcc		GLOBAL_SPARE, 0x8, GLOBAL_SPARE
	add		%o1, 0x8, %o1
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0), U3_retl_o2_and_7_plus_GS_plus_8)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, end_return
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3
	.align		64
	/* 0 < len < 16 */
less_than_16:
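	/* %o3 here is dst | src | len.  If any of the three has a low
	 * bit set (e.g. dst = 0x1000, src = 0x2002, len = 5 gives
	 * %o3 & 0x3 = 0x3), unaligned word accesses would trap, so we
	 * fall back to the byte-at-a-time loop at 90:.
	 */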
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1), U3_retl_o2_plus_4)
	EX_ST(STORE(stw, %g1, %o1 + %o3), U3_retl_o2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

end_return:
	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		32
90:
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1), U3_retl_o2_plus_1)
	EX_ST(STORE(stb, %g1, %o1 + %o3), U3_retl_o2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o4), %o0

	.size		FUNC_NAME, .-FUNC_NAME