/* memcpy_mck.S — IA-64 (McKinley) optimized memcpy/__copy_user */
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Itanium 2-optimized version of memcpy and copy_user function
 *
 * Inputs:
 *	in0:	destination address
 *	in1:	source address
 *	in2:	number of bytes to copy
 * Output:
 *	for memcpy:    return dest
 *	for copy_user: return 0 if success,
 *	               or number of byte NOT copied if error occurred.
 *
 * Copyright (C) 2002 Intel Corp.
 * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
 */
#include <asm/asmmacro.h>
#include <asm/page.h>
#include <asm/export.h>

/* EK is identical to EX: both register the instruction in the exception table. */
#define EK(y...) EX(y)

/* McKinley specific optimization */

/* Symbolic names for the scratch/saved registers used throughout. */
#define retval		r8	// return value register (IA-64 ABI)
#define saved_pfs	r31	// saved ar.pfs across alloc
#define saved_lc	r10	// saved ar.lc (loop counter)
#define saved_pr	r11	// saved predicate registers
#define saved_in0	r14	// copy of dest argument
#define saved_in1	r15	// copy of src argument
#define saved_in2	r16	// copy of length argument

#define src0		r2	// even-index source pointer
#define src1		r3	// odd-index source pointer
#define dst0		r17	// even-index dest pointer
#define dst1		r18	// odd-index dest pointer
#define cnt		r9	// generic loop count

/* r19-r30 are temp for each code section */
#define PREFETCH_DIST	8	// prefetch-ahead distance, in 128-byte lines
#define src_pre_mem	r19	// src prefetch pointer (from memory)
#define dst_pre_mem	r20	// dst prefetch pointer (from memory)
#define src_pre_l2	r21	// src prefetch pointer (from L2)
#define dst_pre_l2	r22	// dst prefetch pointer (from L2)
#define t1		r23
#define t2		r24
#define t3		r25
#define t4		r26
#define t5		t1	// alias!
#define t6		t2	// alias!
#define t7		t3	// alias!
#define n8		r27
#define t9		t5	// alias!
#define t10		t4	// alias!
#define t11		t7	// alias!
#define t12		t6	// alias!
#define t14		t10	// alias!
#define t13		r28
#define t15		r29
#define tmp		r30

/* defines for long_copy block: software-pipeline stage indices */
#define A	0			// stage that loads from memory
#define B	(PREFETCH_DIST)		// stage that stores the prefetched line
#define C	(B + PREFETCH_DIST)	// stage that loads from L2
#define D	(C + 1)			// stage that does the main copy
#define N	(D + 1)			// total number of pipeline stages
#define Nrot	((N + 7) & ~7)		// rotating-register count (multiple of 8)

/* alias: stacked input argument registers */
#define in0	r32
#define in1	r33
#define in2	r34
/*
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * Computes the alignment of both pointers, tags the call as memcpy
 * (f6 == 0; tested by fcmp in .ex_handler_short), sets the return
 * value to dest, and falls into the shared copy engine.
 */
GLOBAL_ENTRY(memcpy)
	and	r28=0x7,in0		// dest alignment within 8 bytes
	and	r29=0x7,in1		// src alignment within 8 bytes
	mov	f6=f0			// f6 = 0 marks this as memcpy (vs copy_user)
	mov	retval=in0		// memcpy returns its dest argument
	br.cond.sptk .common_code
	;;
END(memcpy)
EXPORT_SYMBOL(memcpy)
/*
 * unsigned long __copy_user(void *dest, const void *src, unsigned long n)
 *
 * Returns 0 on success, or the number of bytes NOT copied on fault.
 * Saves the original arguments (needed by the exception handlers to
 * compute copy progress), tags the call as copy_user (f6 == 1), and
 * joins the shared copy engine at .common_code.
 */
GLOBAL_ENTRY(__copy_user)
	.prologue
// check dest alignment
	and	r28=0x7,in0		// dest alignment within 8 bytes
	and	r29=0x7,in1		// src alignment within 8 bytes
	mov	f6=f1			// f6 != 0 marks this as copy_user
	mov	saved_in0=in0		// save dest pointer
	mov	saved_in1=in1		// save src pointer
	mov	retval=r0		// initialize return value (0 = success)
	;;
.common_code:
	// Dispatch on size and alignment; fall through when both pointers
	// are already 8-byte aligned and the copy is >= 8 bytes.
	cmp.gt	p15,p0=8,in2		// check for small size (< 8 bytes)
	cmp.ne	p13,p0=0,r28		// check dest alignment
	cmp.ne	p14,p0=0,r29		// check src alignment
	add	src0=0,in1
	sub	r30=8,r28		// distance to 8-byte boundary, for .align_dest
	mov	saved_in2=in2		// save len
	;;
	add	dst0=0,in0
	add	dst1=1,in0		// dest odd index
	cmp.le	p6,p0 = 1,r30		// for .align_dest
(p15)	br.cond.dpnt .memcpy_short	// < 8 bytes: byte-at-a-time copy
(p13)	br.cond.dpnt .align_dest	// dest unaligned: crawl to 8-byte boundary
(p14)	br.cond.dpnt .unaligned_src	// dest aligned, src not: shrp-based copy
	;;
// both dest and src are aligned on 8-byte boundary
.aligned_src:
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,3,Nrot-3,0,Nrot	// rotating regs for .long_copy
	.save pr, saved_pr
	mov	saved_pr=pr
	shr.u	cnt=in2,7		// this much cache line (128-byte lines)
	;;
	cmp.lt	p6,p0=2*PREFETCH_DIST,cnt	// big enough for the pipelined loop?
	cmp.lt	p7,p8=1,cnt
	.save ar.lc, saved_lc
	mov	saved_lc=ar.lc
	.body
	add	cnt=-1,cnt
	add	src_pre_mem=0,in1	// prefetch src pointer
	add	dst_pre_mem=0,in0	// prefetch dest pointer
	;;
(p7)	mov	ar.lc=cnt		// prefetch count
(p8)	mov	ar.lc=r0
(p6)	br.cond.dpnt .long_copy		// large copy: use software-pipelined loop
	;;

	// Prefetch the whole region up front, one 128-byte line per iteration.
.prefetch:
	lfetch.fault	  [src_pre_mem], 128
	lfetch.fault.excl [dst_pre_mem], 128
	br.cloop.dptk.few .prefetch
	;;

	// 32 bytes per iteration via two interleaved ld8/st8 streams.
.medium_copy:
	and	tmp=31,in2		// copy length after iteration
	shr.u	r29=in2,5		// number of 32-byte iteration
	add	dst1=8,dst0		// 2nd dest pointer
	;;
	add	cnt=-1,r29		// ctop iteration adjustment
	cmp.eq	p10,p0=r29,r0		// do we really need to loop?
	add	src1=8,src0		// 2nd src pointer
	cmp.le	p6,p0=8,tmp
	;;
	cmp.le	p7,p0=16,tmp
	mov	ar.lc=cnt		// loop setup
	cmp.eq	p16,p17 = r0,r0		// p16=1, p17=0: prime rotating predicates
	mov	ar.ec=2			// 2-stage pipeline: load stage, store stage
(p10)	br.dpnt.few .aligned_src_tail
	;;
	TEXT_ALIGN(32)
1:
EX(.ex_handler, (p16)	ld8	r34=[src0],16)
EK(.ex_handler, (p16)	ld8	r38=[src1],16)
EX(.ex_handler, (p17)	st8	[dst0]=r33,16)	// store data loaded one stage earlier
EK(.ex_handler, (p17)	st8	[dst1]=r37,16)
	;;
EX(.ex_handler, (p16)	ld8	r32=[src0],16)
EK(.ex_handler, (p16)	ld8	r36=[src1],16)
EX(.ex_handler, (p16)	st8	[dst0]=r34,16)
EK(.ex_handler, (p16)	st8	[dst1]=r38,16)
	br.ctop.dptk.few 1b
	;;

	// Handle the 0-31 bytes left after the 32-byte loop: up to three
	// 8-byte moves, then fall to .memcpy_short for the final 0-7 bytes.
.aligned_src_tail:
EX(.ex_handler, (p6)	ld8	t1=[src0])
	mov	ar.lc=saved_lc
	mov	ar.pfs=saved_pfs
EX(.ex_hndlr_s, (p7)	ld8	t2=[src1],8)
	cmp.le	p8,p0=24,tmp
	and	r21=-8,tmp		// bytes handled by the ld8/st8 pairs below
	;;
EX(.ex_hndlr_s, (p8)	ld8	t3=[src1])
EX(.ex_handler, (p6)	st8	[dst0]=t1)	// store byte 1
	and	in2=7,tmp		// remaining length
EX(.ex_hndlr_d, (p7)	st8	[dst1]=t2,8)	// store byte 2
	add	src0=src0,r21		// setting up src pointer
	add	dst0=dst0,r21		// setting up dest pointer
	;;
EX(.ex_handler, (p8)	st8	[dst1]=t3)	// store byte 3
	mov	pr=saved_pr,-1
	br.dptk.many .memcpy_short
	;;
/* code taken from copy_page_mck */
/*
 * Software-pipelined bulk copy, one 128-byte line per .line_copy pass.
 * Stage indices (A, B, C, D — see the #defines near the top) spread the
 * memory prefetch, L2 prefetch, and actual copy across the rotating
 * predicate/register files so all four memory ports stay busy.
 */
.long_copy:
	.rotr v[2*PREFETCH_DIST]	// rotating data regs for the prefetch stream
	.rotp p[N]			// rotating predicates, one per pipeline stage
	mov	src_pre_mem = src0
	mov	pr.rot = 0x10000	// p16 = 1, all other rotating predicates = 0
	mov	ar.ec = 1		// special unrolled loop
	mov	dst_pre_mem = dst0
	add	src_pre_l2 = 8*8, src0
	add	dst_pre_l2 = 8*8, dst0
	;;
	add	src0 = 8, src_pre_mem	// first t1 src
	mov	ar.lc = 2*PREFETCH_DIST - 1
	shr.u	cnt=in2,7		// number of lines
	add	src1 = 3*8, src_pre_mem	// first t3 src
	add	dst0 = 8, dst_pre_mem	// first t1 dst
	add	dst1 = 3*8, dst_pre_mem	// first t3 dst
	;;
	and	tmp=127,in2		// remaining bytes after this block
	add	cnt = -(2*PREFETCH_DIST) - 1, cnt
// same as .line_copy loop, but with all predicated-off instructions removed:
.prefetch_loop:
EX(.ex_hndlr_lcpy_1, (p[A])	ld8 v[A] = [src_pre_mem], 128)	// M0
EK(.ex_hndlr_lcpy_1, (p[B])	st8 [dst_pre_mem] = v[B], 128)	// M2
	br.ctop.sptk .prefetch_loop
	;;
	cmp.eq	p16, p0 = r0, r0	// reset p16 to 1
	mov	ar.lc = cnt
	mov	ar.ec = N		// # of stages in pipeline
	;;
.line_copy:
EX(.ex_handler,      (p[D])	ld8	t2 = [src0], 3*8)		// M0
EK(.ex_handler,      (p[D])	ld8	t4 = [src1], 3*8)		// M1
EX(.ex_handler_lcpy, (p[B])	st8 [dst_pre_mem] = v[B], 128)	// M2 prefetch dst from memory
EK(.ex_handler_lcpy, (p[D])	st8 [dst_pre_l2] = n8, 128)	// M3 prefetch dst from L2
	;;
EX(.ex_handler_lcpy, (p[A])	ld8	v[A] = [src_pre_mem], 128)	// M0 prefetch src from memory
EK(.ex_handler_lcpy, (p[C])	ld8	n8 = [src_pre_l2], 128)	// M1 prefetch src from L2
EX(.ex_handler,      (p[D])	st8	[dst0] = t1, 8)		// M2
EK(.ex_handler,      (p[D])	st8	[dst1] = t3, 8)		// M3
	;;
EX(.ex_handler,      (p[D])	ld8	t5 = [src0], 8)
EK(.ex_handler,      (p[D])	ld8	t7 = [src1], 3*8)
EX(.ex_handler,      (p[D])	st8	[dst0] = t2, 3*8)
EK(.ex_handler,      (p[D])	st8	[dst1] = t4, 3*8)
	;;
EX(.ex_handler,      (p[D])	ld8	t6 = [src0], 3*8)
EK(.ex_handler,      (p[D])	ld8	t10 = [src1], 8)
EX(.ex_handler,      (p[D])	st8	[dst0] = t5, 8)
EK(.ex_handler,      (p[D])	st8	[dst1] = t7, 3*8)
	;;
EX(.ex_handler,      (p[D])	ld8	t9 = [src0], 3*8)
EK(.ex_handler,      (p[D])	ld8	t11 = [src1], 3*8)
EX(.ex_handler,      (p[D])	st8	[dst0] = t6, 3*8)
EK(.ex_handler,      (p[D])	st8	[dst1] = t10, 8)
	;;
EX(.ex_handler,      (p[D])	ld8	t12 = [src0], 8)
EK(.ex_handler,      (p[D])	ld8	t14 = [src1], 8)
EX(.ex_handler,      (p[D])	st8	[dst0] = t9, 3*8)
EK(.ex_handler,      (p[D])	st8	[dst1] = t11, 3*8)
	;;
EX(.ex_handler,      (p[D])	ld8	t13 = [src0], 4*8)
EK(.ex_handler,      (p[D])	ld8	t15 = [src1], 4*8)
EX(.ex_handler,      (p[D])	st8	[dst0] = t12, 8)
EK(.ex_handler,      (p[D])	st8	[dst1] = t14, 8)
	;;
EX(.ex_handler,      (p[C])	ld8	t1 = [src0], 8)
EK(.ex_handler,      (p[C])	ld8	t3 = [src1], 8)
EX(.ex_handler,      (p[D])	st8	[dst0] = t13, 4*8)
EK(.ex_handler,      (p[D])	st8	[dst1] = t15, 4*8)
	br.ctop.sptk .line_copy
	;;
	// Pointers ended up 8 bytes past the copied region; back them out
	// and let .medium_copy handle the remaining < 128 bytes.
	add	dst0=-8,dst0
	add	src0=-8,src0
	mov	in2=tmp
	.restore sp
	br.sptk.many .medium_copy
	;;
#define BLOCK_SIZE	128*32	// process at most 4 KB per .4k_block pass
#define blocksize	r23
#define curlen		r24

// dest is on 8-byte boundary, src is not. We need to do
// ld8-ld8, shrp, then st8. Max 8 byte copy per cycle.
.unaligned_src:
	.prologue
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,3,5,0,8
	.save ar.lc, saved_lc
	mov	saved_lc=ar.lc
	.save pr, saved_pr
	mov	saved_pr=pr
	.body
.4k_block:
	// Copy in BLOCK_SIZE chunks; .unaligned_src_tail loops back here
	// while more than one block remains.
	mov	saved_in0=dst0		// need to save all input arguments
	mov	saved_in2=in2
	mov	blocksize=BLOCK_SIZE
	;;
	cmp.lt	p6,p7=blocksize,in2
	mov	saved_in1=src0
	;;
(p6)	mov	in2=blocksize		// clamp this pass to one block
	;;
	shr.u	r21=in2,7		// this much cache line
	shr.u	r22=in2,4		// number of 16-byte iteration
	and	curlen=15,in2		// copy length after iteration
	and	r30=7,src0		// source alignment (shift amount / 8)
	;;
	cmp.lt	p7,p8=1,r21
	add	cnt=-1,r21
	;;
	add	src_pre_mem=0,src0	// prefetch src pointer
	add	dst_pre_mem=0,dst0	// prefetch dest pointer
	and	src0=-8,src0		// 1st src pointer, rounded down to 8 bytes
(p7)	mov	ar.lc = cnt
(p8)	mov	ar.lc = r0
	;;
	TEXT_ALIGN(32)
1:	lfetch.fault	  [src_pre_mem], 128
	lfetch.fault.excl [dst_pre_mem], 128
	br.cloop.dptk.few 1b
	;;
	shladd	dst1=r22,3,dst0		// 2nd dest pointer
	shladd	src1=r22,3,src0		// 2nd src pointer
	cmp.eq	p8,p9=r22,r0		// do we really need to loop?
	cmp.le	p6,p7=8,curlen;		// have at least 8 byte remaining?
	add	cnt=-1,r22		// ctop iteration adjustment
	;;
EX(.ex_handler, (p9)	ld8	r33=[src0],8)	// loop primer
EK(.ex_handler, (p9)	ld8	r37=[src1],8)
(p8)	br.dpnt.few .noloop
	;;
// The jump address is calculated based on src alignment. The COPYU
// macro below need to confine its size to power of two, so an entry
// can be caulated using shl instead of an expensive multiply. The
// size is then hard coded by the following #define to match the
// actual size. This make it somewhat tedious when COPYU macro gets
// changed and this need to be adjusted to match.
#define LOOP_SIZE 6	// log2 of one COPYU expansion in bytes (64)
1:
	mov	r29=ip			// jmp_table thread
	mov	ar.lc=cnt
	;;
	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
	shl	r28=r30, LOOP_SIZE	// jmp_table thread: alignment * entry size
	mov	ar.ec=2			// loop setup
	;;
	add	r29=r29,r28		// jmp_table thread
	cmp.eq	p16,p17=r0,r0
	;;
	mov	b6=r29			// jmp_table thread
	;;
	br.cond.sptk.few b6		// dispatch into COPYU(shift) for this alignment
// for 8-15 byte case
// We will skip the loop, but need to replicate the side effect
// that the loop produces.
.noloop:
EX(.ex_handler, (p6)	ld8	r37=[src1],8)
	add	src0=8,src0
(p6)	shl	r25=r30,3		// src misalignment in bits
	;;
EX(.ex_handler, (p6)	ld8	r27=[src1])
(p6)	shr.u	r28=r37,r25
(p6)	sub	r26=64,r25
	;;
(p6)	shl	r27=r27,r26
	;;
(p6)	or	r21=r28,r27		// assemble one aligned 8-byte chunk in r21

.unaligned_src_tail:
/* check if we have more than blocksize to copy, if so go back */
	cmp.gt	p8,p0=saved_in2,blocksize
	;;
(p8)	add	dst0=saved_in0,blocksize
(p8)	add	src0=saved_in1,blocksize
(p8)	sub	in2=saved_in2,blocksize
(p8)	br.dpnt .4k_block
	;;

/* we have up to 15 byte to copy in the tail.
 * part of work is already done in the jump table code
 * we are at the following state.
 * src side:
 *
 *   xxxxxx xx                   <----- r21 has xxxxxxxx already
 * -------- -------- --------
 * 0        8        16
 *          ^
 *          |
 *          src1
 *
 * dst
 * -------- -------- --------
 * ^
 * |
 * dst1
 */
EX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// more than 8 byte to copy
(p6)	add	curlen=-8,curlen	// update length
	mov	ar.pfs=saved_pfs
	;;
	mov	ar.lc=saved_lc
	mov	pr=saved_pr,-1
	mov	in2=curlen	// remaining length (0-7), finished by .memcpy_short
	mov	dst0=dst1	// dest pointer
	add	src0=src1,r30	// forward by src alignment
	;;
// 7 byte or smaller.
// Straight-line byte copy: two interleaved ld1/st1 streams, with an
// early br.ret after each size threshold so exactly in2 bytes move.
.memcpy_short:
	cmp.le	p8,p9   = 1,in2
	cmp.le	p10,p11 = 2,in2
	cmp.le	p12,p13 = 3,in2
	cmp.le	p14,p15 = 4,in2
	add	src1=1,src0		// second src pointer
	add	dst1=1,dst0		// second dest pointer
	;;

EX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
EK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
(p9)	br.ret.dpnt rp			// 0 byte copy
	;;

EX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
EK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
(p11)	br.ret.dpnt rp			// 1 byte copy

EX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
EK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
(p13)	br.ret.dpnt rp			// 2 byte copy
	;;

	cmp.le	p6,p7   = 5,in2
	cmp.le	p8,p9   = 6,in2
	cmp.le	p10,p11 = 7,in2

EX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
EK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
(p15)	br.ret.dpnt rp			// 3 byte copy
	;;

EX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
EK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
(p7)	br.ret.dpnt rp			// 4 byte copy
	;;

EX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
EK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
(p9)	br.ret.dptk rp			// 5 byte copy

EX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
(p11)	br.ret.dptk rp			// 6 byte copy
	;;

EX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
	br.ret.dptk rp			// done all cases
/* Align dest to nearest 8-byte boundary. We know we have at
 * least 7 bytes to copy, enough to crawl to 8-byte boundary.
 * Actual number of byte to crawl depend on the dest alignment.
 * 7 byte or less is taken care at .memcpy_short
 *
 * src0 - source even index
 * src1 - source odd index
 * dst0 - dest even index
 * dst1 - dest odd index
 * r30  - distance to 8-byte boundary (p6 for >=1 was set at .common_code)
 *
 * After the crawl, re-dispatches on the (now matching or mismatching)
 * src alignment: r28==r29 means src is also aligned afterwards.
 */
.align_dest:
	add	src1=1,in1		// source odd index
	cmp.le	p7,p0 = 2,r30		// for .align_dest
	cmp.le	p8,p0 = 3,r30		// for .align_dest
EX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
	cmp.le	p9,p0 = 4,r30		// for .align_dest
	cmp.le	p10,p0 = 5,r30
	;;
EX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
EK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
	cmp.le	p11,p0 = 6,r30
EX(.ex_handler_short, (p6)	st1	[dst0] = t1,2)
	cmp.le	p12,p0 = 7,r30
	;;
EX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
EK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
EX(.ex_handler_short, (p7)	st1	[dst1] = t2,2)
EK(.ex_handler_short, (p8)	st1	[dst0] = t3,2)
	;;
EX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
EK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
	cmp.eq	p6,p7=r28,r29		// same alignment? then src is aligned too
EX(.ex_handler_short, (p9)	st1	[dst1] = t4,2)
EK(.ex_handler_short, (p10)	st1	[dst0] = t5,2)
	sub	in2=in2,r30		// r30 bytes consumed by the crawl
	;;
EX(.ex_handler_short, (p11)	st1	[dst1] = t6,2)
EK(.ex_handler_short, (p12)	st1	[dst0] = t7)
	add	dst0=in0,r30		// setup arguments
	add	src0=in1,r30
(p6)	br.cond.dptk .aligned_src
(p7)	br.cond.dpnt .unaligned_src
	;;
/* main loop body in jump table format */
/*
 * COPYU(shift): copy loop for a source misaligned by shift/8 bytes.
 * Two ld8 streams feed shrp (funnel shift) to re-align data, then two
 * st8 streams write it out — 16 bytes per rotation. Each expansion
 * must stay exactly 2^LOOP_SIZE bytes so the dispatcher can index the
 * table with a shift (see the comment above #define LOOP_SIZE).
 */
#define COPYU(shift)									\
1:											\
EX(.ex_handler,  (p16)	ld8	r32=[src0],8);		/* 1 */				\
EK(.ex_handler,  (p16)	ld8	r36=[src1],8);						\
		 (p17)	shrp	r35=r33,r34,shift;;	/* 1 */				\
EX(.ex_handler,  (p6)	ld8	r22=[src1]);	/* common, prime for tail section */	\
		 nop.m	0;								\
		 (p16)	shrp	r38=r36,r37,shift;					\
EX(.ex_handler,  (p17)	st8	[dst0]=r35,8);		/* 1 */				\
EK(.ex_handler,  (p17)	st8	[dst1]=r39,8);						\
		 br.ctop.dptk.few 1b;;							\
		 (p7)	add	src1=-8,src1;	/* back out for <8 byte case */		\
		 shrp	r21=r22,r38,shift;	/* speculative work */			\
		 br.sptk.few .unaligned_src_tail /* branch out of jump table */		\
		 ;;

	TEXT_ALIGN(32)
.jump_table:
	COPYU(8)	// unaligned cases
.jmp1:
	COPYU(16)
	COPYU(24)
	COPYU(32)
	COPYU(40)
	COPYU(48)
	COPYU(56)
/* The pipeline-stage names are done with; reuse A-D for the handlers. */
#undef A
#undef B
#undef C
#undef D

/*
 * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
 * instruction failed in the bundle. The exception algorithm is that we
 * first figure out the faulting address, then detect if there is any
 * progress made on the copy, if so, redo the copy from last known copied
 * location up to the faulting address (exclusive). In the copy_from_user
 * case, remaining byte in kernel buffer will be zeroed.
 *
 * Take copy_from_user as an example, in the code there are multiple loads
 * in a bundle and those multiple loads could span over two pages, the
 * faulting address is calculated as page_round_down(max(src0, src1)).
 * This is based on knowledge that if we can access one byte in a page, we
 * can access any byte in that page.
 *
 * predicate used in the exception handler:
 * p6-p7: direction
 * p10-p11: src faulting addr calculation
 * p12-p13: dst faulting addr calculation
 */
#define A	r19	// len up to the faulting address
#define B	r20	// len copied so far
#define C	r21	// len still to be copied (A - B)
#define D	r22	// len past the fault, left alone
#define F	r28	// faulting address rounded down to page boundary

#define saved_retval		loc0
#define saved_rtlink		loc1
#define saved_pfs_stack		loc2
/*
 * Fault recovery. Each entry point normalizes src0/src1 and dst0/dst1
 * to the state .ex_handler_short expects, which then computes how many
 * bytes were NOT copied, retries the copyable prefix via a recursive
 * __copy_user call, and returns the uncopied count in r8.
 */

// Fault in .aligned_src_tail after src0 had not yet been advanced.
.ex_hndlr_s:
	add	src0=8,src0
	br.sptk .ex_handler
	;;
// Fault in .aligned_src_tail after dst0 had not yet been advanced.
.ex_hndlr_d:
	add	dst0=8,dst0
	br.sptk .ex_handler
	;;
// Fault in .prefetch_loop: only the prefetch pointers are live, so
// reconstruct src0/dst0 from the saved originals.
.ex_hndlr_lcpy_1:
	mov	src1=src_pre_mem
	mov	dst1=dst_pre_mem
	cmp.gtu	p10,p11=src_pre_mem,saved_in1
	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
	;;
(p10)	add	src0=8,saved_in1
(p11)	mov	src0=saved_in1
(p12)	add	dst0=8,saved_in0
(p13)	mov	dst0=saved_in0
	br.sptk	.ex_handler

.ex_handler_lcpy:
	// in line_copy block, the preload addresses should always ahead
	// of the other two src/dst pointers. Furthermore, src1/dst1 should
	// always ahead of src0/dst0.
	mov	src1=src_pre_mem
	mov	dst1=dst_pre_mem

.ex_handler:
	mov	pr=saved_pr,-1		// first restore pr, lc, and pfs
	mov	ar.lc=saved_lc
	mov	ar.pfs=saved_pfs
	;;

.ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs
	cmp.ltu	p6,p7=saved_in0, saved_in1	// get the copy direction
	cmp.ltu	p10,p11=src0,src1
	cmp.ltu	p12,p13=dst0,dst1
	fcmp.eq	p8,p0=f6,f0		// is it memcpy?
	mov	tmp = dst0
	;;
(p11)	mov	src1 = src0		// pick the larger of the two
(p13)	mov	dst0 = dst1		// make dst0 the smaller one
(p13)	mov	dst1 = tmp		// and dst1 the larger one
	;;
(p6)	dep	F = r0,dst1,0,PAGE_SHIFT	// usr dst round down to page boundary
(p7)	dep	F = r0,src1,0,PAGE_SHIFT	// usr src round down to page boundary
	;;
(p6)	cmp.le	p14,p0=dst0,saved_in0	// no progress has been made on store
(p7)	cmp.le	p14,p0=src0,saved_in1	// no progress has been made on load
	mov	retval=saved_in2	// no progress: nothing copied, return full len
(p8)	ld1	tmp=[src1]		// force an oops for memcpy call
(p8)	st1	[dst1]=r0		// force an oops for memcpy call
(p14)	br.ret.sptk.many rp

/*
 * The remaining byte to copy is calculated as:
 *
 * A =	(faulting_addr - orig_src)	-> len to faulting ld address
 *	or
 *	(faulting_addr - orig_dst)	-> len to faulting st address
 * B =	(cur_dst - orig_dst)		-> len copied so far
 * C =	A - B				-> len need to be copied
 * D =	orig_len - A			-> len need to be left along
 */
(p6)	sub	A = F, saved_in0
(p7)	sub	A = F, saved_in1
	clrrrb				// reset rotating register base before alloc
	;;
	alloc	saved_pfs_stack=ar.pfs,3,3,3,0
	cmp.lt	p8,p0=A,r0
	sub	B = dst0, saved_in0	// how many byte copied so far
	;;
(p8)	mov	A = 0;			// A shouldn't be negative, cap it
	;;
	sub	C = A, B
	sub	D = saved_in2, A
	;;
	cmp.gt	p8,p0=C,r0		// more than 1 byte?
	mov	r8=0
	mov	saved_retval = D
	mov	saved_rtlink = b0

	add	out0=saved_in0, B
	add	out1=saved_in1, B
	mov	out2=C
(p8)	br.call.sptk.few b0=__copy_user	// recursive call

	;;
	add	saved_retval=saved_retval,r8	// above might return non-zero value
	;;

	mov	retval=saved_retval
	mov	ar.pfs=saved_pfs_stack
	mov	b0=saved_rtlink
	br.ret.sptk.many rp

/* end of McKinley specific optimization */
END(__copy_user)
EXPORT_SYMBOL(__copy_user)