/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <variant/core.h>

	.macro	src_b	r, w0, w1
#ifdef __XTENSA_EB__
	src	\r, \w0, \w1
#else
	src	\r, \w1, \w0
#endif
	.endm

	.macro	ssa8	r
#ifdef __XTENSA_EB__
	ssa8b	\r
#else
	ssa8l	\r
#endif
	.endm
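
/*
 * Taken together, ssa8 and src_b implement an endianness-independent
 * "extract the unaligned word starting at this address" step: SSA8 sets
 * the shift-amount register from the low two address bits, and SRC
 * funnel-shifts a pair of aligned words by that amount.  A hedged C
 * sketch of the little-endian case (the helper name below is made up
 * for illustration and is not part of this file):
 *
 *	static unsigned int unaligned_word_le(unsigned int w0,
 *					      unsigned int w1,
 *					      unsigned long addr)
 *	{
 *		unsigned int shift = (addr & 3) * 8;	// ssa8l
 *
 *		if (shift == 0)
 *			return w0;			// already aligned
 *		return (w0 >> shift) | (w1 << (32 - shift));	// src
 *	}
 */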

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
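
/*
 * A hedged C sketch of the flow just described (illustrative only;
 * the helper name and the byte-wise inner loop are stand-ins for the
 * word-sized code below and are not part of this file):
 *
 *	void *memcpy_sketch(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		// align dst with 1- and 2-byte copies (very short copies
 *		// are instead done entirely byte by byte)
 *		while (len && ((unsigned long)d & 3)) {
 *			*d++ = *s++;
 *			len--;
 *		}
 *		// main loop: 16 bytes per iteration; the assembly uses
 *		// four l32i/s32i pairs, or the SSA8/SRC shifting copy
 *		// when src is not word aligned
 *		while (len >= 16) {
 *			unsigned int i;
 *
 *			for (i = 0; i < 16; i++)
 *				d[i] = s[i];
 *			d += 16;
 *			s += 16;
 *			len -= 16;
 *		}
 *		// conditional 8/4/2/1-byte tails
 *		while (len--)
 *			*d++ = *s++;
 *		return dst;
 *	}
 */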

	.text

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte	# continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	retw

/*
 * Destination is unaligned
 */
	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3,  0
	addi	a3, a3,  1
	addi	a4, a4, -1
	s8i	a6, a5,  0
	addi	a5, a5,  1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	addi	a4, a4, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

	.align	4
	.global	memcpy
	.type	memcpy,@function
memcpy:

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
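	# The loads and stores in .Loop1 below are interleaved (each word
	# is loaded one step before it is stored) so that, on a simple
	# pipeline, the stores need not wait out the load-use latency.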
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1	# continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	retw
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

/*
 * Destination is aligned, Source is unaligned
 */
	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	ssa8	a3		# set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT	1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
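	# The shifting copy below is software-pipelined: a6 always holds
	# the most recently loaded aligned word, and each src_b merges it
	# with the next word to produce one unaligned word of output,
	# roughly dst_word[i] = SRC(src_word[i], src_word[i+1]) with the
	# shift amount set once by ssa8 above.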
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16
	src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2	# continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	retw
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	retw
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

/*
 * void bcopy(const void *src, void *dest, size_t n);
 */
	.align	4
	.global	bcopy
	.type	bcopy,@function
bcopy:
	entry	sp, 16		# minimal stack frame
	# a2=src, a3=dst, a4=len
	mov	a5, a3
	mov	a3, a2
	mov	a2, a5
	j	.Lmovecommon	# go to common code for memmove+bcopy
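
/*
 * In other words, bcopy() simply swaps its first two arguments and
 * shares the memmove() path.  A hedged C equivalent (illustrative
 * only):
 *
 *	void bcopy(const void *src, void *dst, size_t n)
 *	{
 *		memmove(dst, src, n);
 *	}
 */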

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If end of source doesn't overlap destination then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
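
/*
 * A hedged C sketch of the forward/backward decision made in
 * .Lmovecommon below (illustrative only; not part of the build):
 *
 *	void *memmove_sketch(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		// One unsigned compare covers both "dst is below src" and
 *		// "dst is above src but the regions do not overlap", so
 *		// only a truly overlapping forward copy goes backwards.
 *		if ((unsigned long)d - (unsigned long)s >= len)
 *			return memcpy(dst, src, len);
 *
 *		while (len--)		// copy backwards, from the end
 *			d[len] = s[len];
 *		return dst;
 *	}
 */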

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte	# continue loop if
					# $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	retw

/*
 * Destination is unaligned
 */
	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

	.align	4
	.global	memmove
	.type	memmove,@function
memmove:

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon

	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3,  8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3,  4
	s32i	a6, a5,  8
	l32i	a6, a3,  0
	s32i	a7, a5,  4
	s32i	a6, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .backLoop1	# continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a5, a5, -8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3,  0
	addi	a5, a5, -4
	s32i	a6, a5,  0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3,  0
	addi	a5, a5, -2
	s16i	a6, a5,  0
	bbsi.l	a4, 0, .Lback5
	retw
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	retw

/*
 * Destination is aligned, Source is unaligned
 */
	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
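	# Mirror image of .Loop2 above: a6 again carries the last word
	# loaded, but src_b takes its operands in the opposite order
	# (newly loaded lower word first) because the copy walks
	# backwards through both buffers.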
.backLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3,  8
	addi	a5, a5, -16
	src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3,  4
	src_b	a7, a8, a7
	s32i	a7, a5,  8
	l32i	a6, a3,  0
	src_b	a8, a9, a8
	s32i	a8, a5,  4
	src_b	a9, a6, a9
	s32i	a9, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .backLoop2	# continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3,  4
	l32i	a8, a3,  0
	addi	a5, a5, -8
	src_b	a6, a7, a6
	s32i	a6, a5,  4
	src_b	a7, a8, a7
	s32i	a7, a5,  0
	mov	a6, a8
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3,  0
	addi	a5, a5, -4
	src_b	a6, a7, a6
	s32i	a6, a5,  0
	mov	a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	retw
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	bbsi.l	a4, 0, .Lback15
	retw
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */