octeon-memcpy.S

/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2
/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
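
/*
 * In register terms (an informal summary, not a formal prototype): both
 * entry points take a0 = dst, a1 = src, a2 = len.  memcpy returns dst in
 * v0, while __copy_user reports its result by leaving the number of
 * uncopied bytes (0 on success) in a2/len.
 */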
/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
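
/*
 * Concretely (see l_exc below): if a load faults at address F, the handler
 * reports len = AT - F uncopied bytes, and because (dst - src) never
 * changes, dst + (F - src) is the first destination byte that was not
 * written and therefore still needs to be cleared.
 */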
#define EXC(inst_reg,addr,handler)              \
9:      inst_reg, addr;                         \
        .section __ex_table,"a";                \
        PTR     9b, handler;                    \
        .previous
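
/*
 * Illustrative expansion (assuming the 64-bit definitions below, where LOAD
 * is ld and UNIT(0) is 0):  EXC(LOAD t0, UNIT(0)(src), l_exc) becomes
 *
 * 9:      ld      t0, 0(src)
 *         .section __ex_table,"a"
 *         PTR     9b, l_exc
 *         .previous
 *
 * i.e. every potentially faulting access gets an __ex_table entry that
 * points the exception handler at the named fixup label.
 */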
/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#define LOAD    ld
#define LOADL   ldl
#define LOADR   ldr
#define STOREL  sdl
#define STORER  sdr
#define STORE   sd
#define ADD     daddu
#define SUB     dsubu
#define SRL     dsrl
#define SRA     dsra
#define SLL     dsll
#define SLLV    dsllv
#define SRLV    dsrlv
#define NBYTES  8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the registers from the
 * n64 ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0      $8
#define t1      $9
#define t2      $10
#define t3      $11
#define t4      $12
#define t5      $13
#define t6      $14
#define t7      $15

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
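
/*
 * For example, with NBYTES = 8: FIRST(1) = 8 and REST(1) = 15 are the
 * offsets of the first and last byte of unit 1; these are the addresses the
 * LDFIRST/LDREST (ldl/ldr) pairs below operate on, and ADDRMASK (= 7) masks
 * out the sub-doubleword bits of an address.
 */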
        .text
        .set    noreorder
        .set    noat

/*
 * t7 is used as a flag to note inatomic mode.
 */
LEAF(__copy_user_inatomic)
        b       __copy_user_common
         li     t7, 1
        END(__copy_user_inatomic)

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
        .align  5
LEAF(memcpy)                                    /* a0=dst a1=src a2=len */
        move    v0, dst                         /* return value */
__memcpy:
FEXPORT(__copy_user)
        li      t7, 0                           /* not inatomic */
__copy_user_common:
        /*
         * Note: dst & src may be unaligned, len may be 0
         * Temps
         */
        #
        # Octeon doesn't care if the destination is unaligned.  The hardware
        # can fix it faster than we can special case the assembly.
        #
        pref    0, 0(src)
        sltu    t0, len, NBYTES         # Check if < 1 word
        bnez    t0, copy_bytes_checklen
         and    t0, src, ADDRMASK       # Check if src unaligned
        bnez    t0, src_unaligned
         sltu   t0, len, 4*NBYTES       # Check if < 4 words
        bnez    t0, less_than_4units
         sltu   t0, len, 8*NBYTES       # Check if < 8 words
        bnez    t0, less_than_8units
         sltu   t0, len, 16*NBYTES      # Check if < 16 words
        bnez    t0, cleanup_both_aligned
         sltu   t0, len, 128+1          # Check if len < 129
        bnez    t0, 1f                  # Skip prefetch if len is too short
         sltu   t0, len, 256+1          # Check if len < 257
        bnez    t0, 1f                  # Skip prefetch if len is too short
         pref   0, 128(src)             # We must not prefetch invalid addresses
        #
        # This is where we loop if there is more than 128 bytes left
2:      pref    0, 256(src)             # We must not prefetch invalid addresses
        #
        # This is where we loop if we can't prefetch anymore
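        #
        # Each pass through the loop below moves 16 doublewords (128 bytes).
        # src and dst are bumped by 16*NBYTES halfway through, so the second
        # half of the pass uses negative UNIT offsets; this keeps src and dst
        # incremented in tandem for the exception handlers (hence
        # l_exc_copy_rewind16 below).
        #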
1:
EXC(    LOAD    t0, UNIT(0)(src),  l_exc)
EXC(    LOAD    t1, UNIT(1)(src),  l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src),  l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src),  l_exc_copy)
        SUB     len, len, 16*NBYTES
EXC(    STORE   t0, UNIT(0)(dst),  s_exc_p16u)
EXC(    STORE   t1, UNIT(1)(dst),  s_exc_p15u)
EXC(    STORE   t2, UNIT(2)(dst),  s_exc_p14u)
EXC(    STORE   t3, UNIT(3)(dst),  s_exc_p13u)
EXC(    LOAD    t0, UNIT(4)(src),  l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src),  l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src),  l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src),  l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst),  s_exc_p12u)
EXC(    STORE   t1, UNIT(5)(dst),  s_exc_p11u)
EXC(    STORE   t2, UNIT(6)(dst),  s_exc_p10u)
        ADD     src, src, 16*NBYTES
EXC(    STORE   t3, UNIT(7)(dst),  s_exc_p9u)
        ADD     dst, dst, 16*NBYTES
EXC(    LOAD    t0, UNIT(-8)(src), l_exc_copy_rewind16)
EXC(    LOAD    t1, UNIT(-7)(src), l_exc_copy_rewind16)
EXC(    LOAD    t2, UNIT(-6)(src), l_exc_copy_rewind16)
EXC(    LOAD    t3, UNIT(-5)(src), l_exc_copy_rewind16)
EXC(    STORE   t0, UNIT(-8)(dst), s_exc_p8u)
EXC(    STORE   t1, UNIT(-7)(dst), s_exc_p7u)
EXC(    STORE   t2, UNIT(-6)(dst), s_exc_p6u)
EXC(    STORE   t3, UNIT(-5)(dst), s_exc_p5u)
EXC(    LOAD    t0, UNIT(-4)(src), l_exc_copy_rewind16)
EXC(    LOAD    t1, UNIT(-3)(src), l_exc_copy_rewind16)
EXC(    LOAD    t2, UNIT(-2)(src), l_exc_copy_rewind16)
EXC(    LOAD    t3, UNIT(-1)(src), l_exc_copy_rewind16)
EXC(    STORE   t0, UNIT(-4)(dst), s_exc_p4u)
EXC(    STORE   t1, UNIT(-3)(dst), s_exc_p3u)
EXC(    STORE   t2, UNIT(-2)(dst), s_exc_p2u)
EXC(    STORE   t3, UNIT(-1)(dst), s_exc_p1u)
        sltu    t0, len, 256+1          # See if we can prefetch more
        beqz    t0, 2b
         sltu   t0, len, 128            # See if we can loop once more
        beqz    t0, 1b
         nop
        #
        # Jump here if there are less than 16*NBYTES left.
        #
cleanup_both_aligned:
        beqz    len, done
         sltu   t0, len, 8*NBYTES
        bnez    t0, less_than_8units
         nop
EXC(    LOAD    t0, UNIT(0)(src), l_exc)
EXC(    LOAD    t1, UNIT(1)(src), l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src), l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src), l_exc_copy)
        SUB     len, len, 8*NBYTES
EXC(    STORE   t0, UNIT(0)(dst), s_exc_p8u)
EXC(    STORE   t1, UNIT(1)(dst), s_exc_p7u)
EXC(    STORE   t2, UNIT(2)(dst), s_exc_p6u)
EXC(    STORE   t3, UNIT(3)(dst), s_exc_p5u)
EXC(    LOAD    t0, UNIT(4)(src), l_exc_copy)
EXC(    LOAD    t1, UNIT(5)(src), l_exc_copy)
EXC(    LOAD    t2, UNIT(6)(src), l_exc_copy)
EXC(    LOAD    t3, UNIT(7)(src), l_exc_copy)
EXC(    STORE   t0, UNIT(4)(dst), s_exc_p4u)
EXC(    STORE   t1, UNIT(5)(dst), s_exc_p3u)
EXC(    STORE   t2, UNIT(6)(dst), s_exc_p2u)
EXC(    STORE   t3, UNIT(7)(dst), s_exc_p1u)
        ADD     src, src, 8*NBYTES
        beqz    len, done
         ADD    dst, dst, 8*NBYTES
        #
        # Jump here if there are less than 8*NBYTES left.
        #
less_than_8units:
        sltu    t0, len, 4*NBYTES
        bnez    t0, less_than_4units
         nop
EXC(    LOAD    t0, UNIT(0)(src), l_exc)
EXC(    LOAD    t1, UNIT(1)(src), l_exc_copy)
EXC(    LOAD    t2, UNIT(2)(src), l_exc_copy)
EXC(    LOAD    t3, UNIT(3)(src), l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst), s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst), s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst), s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst), s_exc_p1u)
        ADD     src, src, 4*NBYTES
        beqz    len, done
         ADD    dst, dst, 4*NBYTES
        #
        # Jump here if there are less than 4*NBYTES left.  This means
        # we may need to copy up to 3 NBYTES words.
        #
less_than_4units:
        sltu    t0, len, 1*NBYTES
        bnez    t0, copy_bytes_checklen
         nop
        #
        # 1) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src), l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst), s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
         ADD    dst, dst, NBYTES
        #
        # 2) Copy NBYTES, then check length again
        #
EXC(    LOAD    t0, 0(src), l_exc)
        SUB     len, len, NBYTES
        sltu    t1, len, 8
EXC(    STORE   t0, 0(dst), s_exc_p1u)
        ADD     src, src, NBYTES
        bnez    t1, copy_bytes_checklen
         ADD    dst, dst, NBYTES
        #
        # 3) Copy NBYTES, then check length again
        #
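        # (src and dst are advanced before the branch below, and the store
        #  sits in the branch delay slot, hence the -8(dst) offset.)
        #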
EXC(    LOAD    t0, 0(src), l_exc)
        SUB     len, len, NBYTES
        ADD     src, src, NBYTES
        ADD     dst, dst, NBYTES
        b       copy_bytes_checklen
EXC(     STORE  t0, -8(dst), s_exc_p1u)

src_unaligned:
#define rem t8
        SRL     t0, len, LOG_NBYTES+2   # +2 for 4 units/iter
        beqz    t0, cleanup_src_unaligned
         and    rem, len, (4*NBYTES-1)  # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
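/*
 * Each LDFIRST/LDREST pair below reads FIRST(n) and REST(n), the first and
 * last byte of unit n, so the ldl/ldr (or ldr/ldl, depending on endianness)
 * pair assembles the 8 unaligned source bytes of that unit into one register
 * before it is written out with a normal STORE.
 */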
EXC(    LDFIRST t0, FIRST(0)(src), l_exc)
EXC(    LDFIRST t1, FIRST(1)(src), l_exc_copy)
        SUB     len, len, 4*NBYTES
EXC(    LDREST  t0, REST(0)(src),  l_exc_copy)
EXC(    LDREST  t1, REST(1)(src),  l_exc_copy)
EXC(    LDFIRST t2, FIRST(2)(src), l_exc_copy)
EXC(    LDFIRST t3, FIRST(3)(src), l_exc_copy)
EXC(    LDREST  t2, REST(2)(src),  l_exc_copy)
EXC(    LDREST  t3, REST(3)(src),  l_exc_copy)
        ADD     src, src, 4*NBYTES
EXC(    STORE   t0, UNIT(0)(dst), s_exc_p4u)
EXC(    STORE   t1, UNIT(1)(dst), s_exc_p3u)
EXC(    STORE   t2, UNIT(2)(dst), s_exc_p2u)
EXC(    STORE   t3, UNIT(3)(dst), s_exc_p1u)
        bne     len, rem, 1b
         ADD    dst, dst, 4*NBYTES

cleanup_src_unaligned:
        beqz    len, done
         and    rem, len, NBYTES-1      # rem = len % NBYTES
        beq     rem, len, copy_bytes
         nop
1:
EXC(    LDFIRST t0, FIRST(0)(src), l_exc)
EXC(    LDREST  t0, REST(0)(src),  l_exc_copy)
        SUB     len, len, NBYTES
EXC(    STORE   t0, 0(dst), s_exc_p1u)
        ADD     src, src, NBYTES
        bne     len, rem, 1b
         ADD    dst, dst, NBYTES

copy_bytes_checklen:
        beqz    len, done
         nop
copy_bytes:
        /* 0 < len < NBYTES */
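        /*
         * Copy the remaining 1..7 bytes one at a time.  COPY_BYTE(N) copies
         * the byte at offset N and returns through "done" as soon as len
         * reaches zero; after the six copies below only offset NBYTES-2 can
         * be left, and it is handled explicitly.
         */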
#define COPY_BYTE(N)                    \
EXC(    lb      t0, N(src), l_exc);     \
        SUB     len, len, 1;            \
        beqz    len, done;              \
EXC(     sb     t0, N(dst), s_exc_p1)

COPY_BYTE(0)
COPY_BYTE(1)
COPY_BYTE(2)
COPY_BYTE(3)
COPY_BYTE(4)
COPY_BYTE(5)

EXC(    lb      t0, NBYTES-2(src), l_exc)
        SUB     len, len, 1
        jr      ra
EXC(     sb     t0, NBYTES-2(dst), s_exc_p1)
done:
        jr      ra
         nop
        END(memcpy)

l_exc_copy_rewind16:
        /* Rewind src and dst by 16*NBYTES for l_exc_copy */
        SUB     src, src, 16*NBYTES
        SUB     dst, dst, 16*NBYTES
l_exc_copy:
        /*
         * Copy bytes from src until faulting load address (or until a
         * lb faults)
         *
         * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
         * may be more than a byte beyond the last address.
         * Hence, the lb below may get an exception.
         *
         * Assumes src < THREAD_BUADDR($28)
         */
        LOAD    t0, TI_TASK($28)
        LOAD    t0, THREAD_BUADDR(t0)
1:
EXC(    lb      t1, 0(src), l_exc)
        ADD     src, src, 1
        sb      t1, 0(dst)              # can't fault -- we're copy_from_user
        bne     src, t0, 1b
         ADD    dst, dst, 1
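        /*
         * src now equals the first bad address, so fall through to l_exc to
         * compute the uncopied byte count and (unless inatomic) clear the
         * rest of the destination.
         */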
l_exc:
        LOAD    t0, TI_TASK($28)
        LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
        SUB     len, AT, t0             # len = number of uncopied bytes
        bnez    t7, 2f                  /* Skip the zeroing out part if inatomic */
        /*
         * Here's where we rely on src and dst being incremented in tandem,
         * see (3) above.
         * dst += (fault addr - src) to put dst at first byte to clear
         */
        ADD     dst, t0                 # compute start address of the bytes to clear
        SUB     dst, src
        /*
         * Clear len bytes starting at dst.  Can't call __bzero because it
         * might modify len.  An inefficient loop for these rare times...
         */
        beqz    len, done
         SUB    src, len, 1
1:      sb      zero, 0(dst)
        ADD     dst, dst, 1
        bnez    src, 1b
         SUB    src, src, 1
2:      jr      ra
         nop

#define SEXC(n)                         \
s_exc_p ## n ## u:                      \
        jr      ra;                     \
         ADD    len, len, n*NBYTES
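
/*
 * Illustrative expansion:  SEXC(4) emits
 *
 * s_exc_p4u:
 *         jr      ra
 *          ADD    len, len, 4*NBYTES
 *
 * i.e. a store fault with 4 units not yet written adds those bytes back to
 * len, so len again reports an upper bound on the uncopied bytes.  In this
 * scheme store faults only occur on the copy_to_user path, so nothing needs
 * to be cleared.
 */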
SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
        jr      ra
         ADD    len, len, 1
s_exc:
        jr      ra
         nop
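
/*
 * memmove: when the regions do not overlap (src >= dst + len or
 * dst >= src + len) the work is handed to memcpy; otherwise execution
 * falls through to __rmemcpy, which picks a safe copy direction.
 */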
        .align  5
LEAF(memmove)
        ADD     t0, a0, a2
        ADD     t1, a1, a2
        sltu    t0, a1, t0              # dst + len <= src -> memcpy
        sltu    t1, a0, t1              # dst >= src + len -> memcpy
        and     t0, t1
        beqz    t0, __memcpy
         move   v0, a0                  /* return value */
        beqz    a2, r_out
        END(memmove)

        /* fall through to __rmemcpy */
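/*
 * __rmemcpy copies one byte at a time: forwards when src >= dst, otherwise
 * backwards from the end of both buffers, so overlapping regions are safe.
 */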
LEAF(__rmemcpy)                                 /* a0=dst a1=src a2=len */
        sltu    t0, a1, a0
        beqz    t0, r_end_bytes_up              # src >= dst
         nop
        ADD     a0, a2                          # dst = dst + len
        ADD     a1, a2                          # src = src + len

r_end_bytes:
        lb      t0, -1(a1)
        SUB     a2, a2, 0x1
        sb      t0, -1(a0)
        SUB     a1, a1, 0x1
        bnez    a2, r_end_bytes
         SUB    a0, a0, 0x1

r_out:
        jr      ra
         move   a2, zero

r_end_bytes_up:
        lb      t0, (a1)
        SUB     a2, a2, 0x1
        sb      t0, (a0)
        ADD     a1, a1, 0x1
        bnez    a2, r_end_bytes_up
         ADD    a0, a0, 0x1

        jr      ra
         move   a2, zero
        END(__rmemcpy)