/*
 * Itanium 2-optimized version of memcpy and copy_user function
 *
 * Inputs:
 *	in0:	destination address
 *	in1:	source address
 *	in2:	number of bytes to copy
 * Output:
 *	for memcpy:    return dest
 *	for copy_user: return 0 if success,
 *		       or number of bytes NOT copied if error occurred.
 *
 * Copyright (C) 2002 Intel Corp.
 * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
 */
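/*
 * For reference, the C-level contracts implemented here are roughly
 * (a sketch; the exact prototypes live in the ia64 headers):
 *
 *	void *memcpy(void *dst, const void *src, size_t len);
 *	unsigned long __copy_user(void *to, const void *from,
 *				  unsigned long len);	// returns bytes NOT copied
 */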
#include <asm/asmmacro.h>
#include <asm/page.h>
#include <asm/export.h>

#define EK(y...) EX(y)

/* McKinley specific optimization */

#define retval		r8
#define saved_pfs	r31
#define saved_lc	r10
#define saved_pr	r11
#define saved_in0	r14
#define saved_in1	r15
#define saved_in2	r16

#define src0		r2
#define src1		r3
#define dst0		r17
#define dst1		r18
#define cnt		r9

/* r19-r30 are temp for each code section */
#define PREFETCH_DIST	8
#define src_pre_mem	r19
#define dst_pre_mem	r20
#define src_pre_l2	r21
#define dst_pre_l2	r22
#define t1		r23
#define t2		r24
#define t3		r25
#define t4		r26
#define t5		t1	// alias!
#define t6		t2	// alias!
#define t7		t3	// alias!
#define n8		r27
#define t9		t5	// alias!
#define t10		t4	// alias!
#define t11		t7	// alias!
#define t12		t6	// alias!
#define t14		t10	// alias!
#define t13		r28
#define t15		r29
#define tmp		r30

/* defines for long_copy block */
#define A	0
#define B	(PREFETCH_DIST)
#define C	(B + PREFETCH_DIST)
#define D	(C + 1)
#define N	(D + 1)
#define Nrot	((N + 7) & ~7)
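/*
 * A..D above are stage indices into the rotating predicates p[] of the
 * software-pipelined .long_copy loop below.  Roughly: a cache line is
 * load-prefetched from memory at stage A, the matching prefetch store
 * issues PREFETCH_DIST iterations later at stage B, the L2 prefetch
 * load at stage C, and the actual 128-byte line copy at stage D.
 * N is the pipeline depth; Nrot rounds it up to the multiple of 8
 * that alloc requires for rotating registers.
 */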
/* alias */
#define in0		r32
#define in1		r33
#define in2		r34

GLOBAL_ENTRY(memcpy)
	and	r28=0x7,in0
	and	r29=0x7,in1
	mov	f6=f0
	mov	retval=in0
	br.cond.sptk .common_code
	;;
END(memcpy)
EXPORT_SYMBOL(memcpy)

GLOBAL_ENTRY(__copy_user)
	.prologue
// check dest alignment
	and	r28=0x7,in0
	and	r29=0x7,in1
	mov	f6=f1
	mov	saved_in0=in0	// save dest pointer
	mov	saved_in1=in1	// save src pointer
	mov	retval=r0	// initialize return value
	;;
.common_code:
	cmp.gt	p15,p0=8,in2	// check for small size
	cmp.ne	p13,p0=0,r28	// check dest alignment
	cmp.ne	p14,p0=0,r29	// check src alignment
	add	src0=0,in1
	sub	r30=8,r28	// for .align_dest
	mov	saved_in2=in2	// save len
	;;
	add	dst0=0,in0
	add	dst1=1,in0	// dest odd index
	cmp.le	p6,p0=1,r30	// for .align_dest
(p15)	br.cond.dpnt .memcpy_short
(p13)	br.cond.dpnt .align_dest
(p14)	br.cond.dpnt .unaligned_src
	;;

// both dest and src are aligned on 8-byte boundary
.aligned_src:
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
	.save pr, saved_pr
	mov	saved_pr=pr
	shr.u	cnt=in2,7	// this many cache lines
	;;
	cmp.lt	p6,p0=2*PREFETCH_DIST,cnt
	cmp.lt	p7,p8=1,cnt
	.save ar.lc, saved_lc
	mov	saved_lc=ar.lc
	.body
	add	cnt=-1,cnt
	add	src_pre_mem=0,in1	// prefetch src pointer
	add	dst_pre_mem=0,in0	// prefetch dest pointer
	;;
(p7)	mov	ar.lc=cnt	// prefetch count
(p8)	mov	ar.lc=r0
(p6)	br.cond.dpnt .long_copy
	;;

.prefetch:
	lfetch.fault	  [src_pre_mem], 128
	lfetch.fault.excl [dst_pre_mem], 128
	br.cloop.dptk.few .prefetch
	;;

.medium_copy:
	and	tmp=31,in2	// remaining length after the 32-byte loop
	shr.u	r29=in2,5	// number of 32-byte iterations
	add	dst1=8,dst0	// 2nd dest pointer
	;;
	add	cnt=-1,r29	// ctop iteration adjustment
	cmp.eq	p10,p0=r29,r0	// do we really need to loop?
	add	src1=8,src0	// 2nd src pointer
	cmp.le	p6,p0=8,tmp
	;;
	cmp.le	p7,p0=16,tmp
	mov	ar.lc=cnt	// loop setup
	cmp.eq	p16,p17=r0,r0
	mov	ar.ec=2
(p10)	br.dpnt.few .aligned_src_tail
	;;
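// A note on the modulo-scheduled loop below (ar.ec=2): p16 gates the
// first stage and p17 the epilogue stage.  The first bundle stores
// r33/r37, i.e. the values loaded into r32/r36 one rotation earlier,
// while the second bundle stores r34/r38 loaded just above in the
// same iteration.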
	TEXT_ALIGN(32)
1:
EX(.ex_handler, (p16)	ld8	r34=[src0],16)
EK(.ex_handler, (p16)	ld8	r38=[src1],16)
EX(.ex_handler, (p17)	st8	[dst0]=r33,16)
EK(.ex_handler, (p17)	st8	[dst1]=r37,16)
	;;
EX(.ex_handler, (p16)	ld8	r32=[src0],16)
EK(.ex_handler, (p16)	ld8	r36=[src1],16)
EX(.ex_handler, (p16)	st8	[dst0]=r34,16)
EK(.ex_handler, (p16)	st8	[dst1]=r38,16)
	br.ctop.dptk.few 1b
	;;

.aligned_src_tail:
EX(.ex_handler, (p6)	ld8	t1=[src0])
	mov	ar.lc=saved_lc
	mov	ar.pfs=saved_pfs
EX(.ex_hndlr_s, (p7)	ld8	t2=[src1],8)
	cmp.le	p8,p0=24,tmp
	and	r21=-8,tmp
	;;
EX(.ex_hndlr_s, (p8)	ld8	t3=[src1])
EX(.ex_handler, (p6)	st8	[dst0]=t1)	// store byte 1
	and	in2=7,tmp	// remaining length
EX(.ex_hndlr_d, (p7)	st8	[dst1]=t2,8)	// store byte 2
	add	src0=src0,r21	// setting up src pointer
	add	dst0=dst0,r21	// setting up dest pointer
	;;
EX(.ex_handler, (p8)	st8	[dst1]=t3)	// store byte 3
	mov	pr=saved_pr,-1
	br.dptk.many .memcpy_short
	;;

/* code taken from copy_page_mck */
.long_copy:
	.rotr v[2*PREFETCH_DIST]
	.rotp p[N]
	mov	src_pre_mem = src0
	mov	pr.rot = 0x10000
	mov	ar.ec = 1	// special unrolled loop
	mov	dst_pre_mem = dst0
	add	src_pre_l2 = 8*8, src0
	add	dst_pre_l2 = 8*8, dst0
	;;
	add	src0 = 8, src_pre_mem		// first t1 src
	mov	ar.lc = 2*PREFETCH_DIST - 1
	shr.u	cnt=in2,7			// number of lines
	add	src1 = 3*8, src_pre_mem		// first t3 src
	add	dst0 = 8, dst_pre_mem		// first t1 dst
	add	dst1 = 3*8, dst_pre_mem		// first t3 dst
	;;
	and	tmp=127,in2			// remaining bytes after this block
	add	cnt = -(2*PREFETCH_DIST) - 1, cnt
// same as .line_copy loop, but with all predicated-off instructions removed:
.prefetch_loop:
EX(.ex_hndlr_lcpy_1, (p[A])	ld8 v[A] = [src_pre_mem], 128)	// M0
EK(.ex_hndlr_lcpy_1, (p[B])	st8 [dst_pre_mem] = v[B], 128)	// M2
	br.ctop.sptk .prefetch_loop
	;;
	cmp.eq	p16, p0 = r0, r0	// reset p16 to 1
	mov	ar.lc = cnt
	mov	ar.ec = N		// # of stages in pipeline
	;;
.line_copy:
EX(.ex_handler,	(p[D])	ld8	t2 = [src0], 3*8)		// M0
EK(.ex_handler,	(p[D])	ld8	t4 = [src1], 3*8)		// M1
EX(.ex_handler_lcpy,	(p[B])	st8 [dst_pre_mem] = v[B], 128)	// M2 prefetch dst from memory
EK(.ex_handler_lcpy,	(p[D])	st8 [dst_pre_l2] = n8, 128)	// M3 prefetch dst from L2
	;;
EX(.ex_handler_lcpy,	(p[A])	ld8 v[A] = [src_pre_mem], 128)	// M0 prefetch src from memory
EK(.ex_handler_lcpy,	(p[C])	ld8 n8 = [src_pre_l2], 128)	// M1 prefetch src from L2
EX(.ex_handler,	(p[D])	st8	[dst0] = t1, 8)			// M2
EK(.ex_handler,	(p[D])	st8	[dst1] = t3, 8)			// M3
	;;
EX(.ex_handler,	(p[D])	ld8	t5 = [src0], 8)
EK(.ex_handler,	(p[D])	ld8	t7 = [src1], 3*8)
EX(.ex_handler,	(p[D])	st8	[dst0] = t2, 3*8)
EK(.ex_handler,	(p[D])	st8	[dst1] = t4, 3*8)
	;;
EX(.ex_handler,	(p[D])	ld8	t6 = [src0], 3*8)
EK(.ex_handler,	(p[D])	ld8	t10 = [src1], 8)
EX(.ex_handler,	(p[D])	st8	[dst0] = t5, 8)
EK(.ex_handler,	(p[D])	st8	[dst1] = t7, 3*8)
	;;
EX(.ex_handler,	(p[D])	ld8	t9 = [src0], 3*8)
EK(.ex_handler,	(p[D])	ld8	t11 = [src1], 3*8)
EX(.ex_handler,	(p[D])	st8	[dst0] = t6, 3*8)
EK(.ex_handler,	(p[D])	st8	[dst1] = t10, 8)
	;;
EX(.ex_handler,	(p[D])	ld8	t12 = [src0], 8)
EK(.ex_handler,	(p[D])	ld8	t14 = [src1], 8)
EX(.ex_handler,	(p[D])	st8	[dst0] = t9, 3*8)
EK(.ex_handler,	(p[D])	st8	[dst1] = t11, 3*8)
	;;
EX(.ex_handler,	(p[D])	ld8	t13 = [src0], 4*8)
EK(.ex_handler,	(p[D])	ld8	t15 = [src1], 4*8)
EX(.ex_handler,	(p[D])	st8	[dst0] = t12, 8)
EK(.ex_handler,	(p[D])	st8	[dst1] = t14, 8)
	;;
EX(.ex_handler,	(p[C])	ld8	t1 = [src0], 8)
EK(.ex_handler,	(p[C])	ld8	t3 = [src1], 8)
EX(.ex_handler,	(p[D])	st8	[dst0] = t13, 4*8)
EK(.ex_handler,	(p[D])	st8	[dst1] = t15, 4*8)
	br.ctop.sptk .line_copy
	;;

	add	dst0=-8,dst0
	add	src0=-8,src0
	mov	in2=tmp
	.restore sp
	br.sptk.many .medium_copy
	;;

#define BLOCK_SIZE	128*32
#define blocksize	r23
#define curlen		r24
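// BLOCK_SIZE = 128*32 = 4 KB: the unaligned path processes at most one
// such block per pass through .4k_block (re-priming the lfetch loop
// each time), then loops back from .unaligned_src_tail if more remains.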
// dest is on 8-byte boundary, src is not. We need to do
// ld8-ld8, shrp, then st8.  Max 8 bytes copied per cycle.
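// Illustration (hypothetical values): for a source 3 bytes past an
// 8-byte boundary, shift = 24 and each output word is assembled from
// two adjacent aligned words w0, w1 as
//	dst = (w0 >> 24) | (w1 << 40)
// which is what a single "shrp dst=w1,w0,24" computes.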
.unaligned_src:
	.prologue
	.save ar.pfs, saved_pfs
	alloc	saved_pfs=ar.pfs,3,5,0,8
	.save ar.lc, saved_lc
	mov	saved_lc=ar.lc
	.save pr, saved_pr
	mov	saved_pr=pr
	.body
.4k_block:
	mov	saved_in0=dst0	// need to save all input arguments
	mov	saved_in2=in2
	mov	blocksize=BLOCK_SIZE
	;;
	cmp.lt	p6,p7=blocksize,in2
	mov	saved_in1=src0
	;;
(p6)	mov	in2=blocksize
	;;
	shr.u	r21=in2,7	// this many cache lines
	shr.u	r22=in2,4	// number of 16-byte iterations
	and	curlen=15,in2	// copy length after iteration
	and	r30=7,src0	// source alignment
	;;
	cmp.lt	p7,p8=1,r21
	add	cnt=-1,r21
	;;
	add	src_pre_mem=0,src0	// prefetch src pointer
	add	dst_pre_mem=0,dst0	// prefetch dest pointer
	and	src0=-8,src0		// 1st src pointer
(p7)	mov	ar.lc = cnt
(p8)	mov	ar.lc = r0
	;;
	TEXT_ALIGN(32)
1:	lfetch.fault	  [src_pre_mem], 128
	lfetch.fault.excl [dst_pre_mem], 128
	br.cloop.dptk.few 1b
	;;

	shladd	dst1=r22,3,dst0	// 2nd dest pointer
	shladd	src1=r22,3,src0	// 2nd src pointer
	cmp.eq	p8,p9=r22,r0	// do we really need to loop?
	cmp.le	p6,p7=8,curlen;	// have at least 8 bytes remaining?
	add	cnt=-1,r22	// ctop iteration adjustment
	;;
EX(.ex_handler, (p9)	ld8	r33=[src0],8)	// loop primer
EK(.ex_handler, (p9)	ld8	r37=[src1],8)
(p8)	br.dpnt.few .noloop
	;;

// The jump address is calculated based on src alignment. The COPYU
// macro below needs to confine its size to a power of two, so an entry
// can be calculated using shl instead of an expensive multiply. The
// size is then hard-coded by the following #define to match the
// actual size. This makes it somewhat tedious when the COPYU macro
// gets changed and this needs to be adjusted to match.
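// Worked example (hypothetical): each COPYU entry occupies
// 2^LOOP_SIZE = 64 bytes.  For src alignment r30 = 3, shl yields
// 3 << 6 = 192, so r29 = .jump_table - 64 + 192 = .jump_table + 128,
// i.e. the third entry, COPYU(24) -- the one that shifts by 3 bytes.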
#define LOOP_SIZE 6
1:
	mov	r29=ip		// jmp_table thread
	mov	ar.lc=cnt
	;;
	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
	shl	r28=r30, LOOP_SIZE	// jmp_table thread
	mov	ar.ec=2		// loop setup
	;;
	add	r29=r29,r28	// jmp_table thread
	cmp.eq	p16,p17=r0,r0
	;;
	mov	b6=r29		// jmp_table thread
	;;
	br.cond.sptk.few b6

// for 8-15 byte case
// We will skip the loop, but need to replicate the side effects
// that the loop produces.
.noloop:
EX(.ex_handler, (p6)	ld8	r37=[src1],8)
	add	src0=8,src0
(p6)	shl	r25=r30,3
	;;
EX(.ex_handler, (p6)	ld8	r27=[src1])
(p6)	shr.u	r28=r37,r25
(p6)	sub	r26=64,r25
	;;
(p6)	shl	r27=r27,r26
	;;
(p6)	or	r21=r28,r27

.unaligned_src_tail:
/* check if we have more than blocksize to copy, if so go back */
	cmp.gt	p8,p0=saved_in2,blocksize
	;;
(p8)	add	dst0=saved_in0,blocksize
(p8)	add	src0=saved_in1,blocksize
(p8)	sub	in2=saved_in2,blocksize
(p8)	br.dpnt .4k_block
	;;

/* we have up to 15 bytes to copy in the tail.
 * part of the work is already done in the jump table code;
 * we are in the following state.
 * src side:
 *
 *   xxxxxx xx                        <----- r21 has xxxxxxxx already
 * -------- -------- --------
 * 0        8        16
 *          ^
 *          |
 *          src1
 *
 * dst
 * -------- -------- --------
 * ^
 * |
 * dst1
 */
EX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// more than 8 bytes to copy
(p6)	add	curlen=-8,curlen	// update length
	mov	ar.pfs=saved_pfs
	;;
	mov	ar.lc=saved_lc
	mov	pr=saved_pr,-1
	mov	in2=curlen	// remaining length
	mov	dst0=dst1	// dest pointer
	add	src0=src1,r30	// forward by src alignment
	;;

// 7 bytes or fewer.
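// Two pointer pairs (src0/src1, dst0/dst1) walk the even and odd byte
// positions in parallel, each advancing by 2, so two ld1/st1 pairs can
// issue per cycle; predicates p6..p15 gate each possible length 0..7.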
.memcpy_short:
	cmp.le	p8,p9   = 1,in2
	cmp.le	p10,p11 = 2,in2
	cmp.le	p12,p13 = 3,in2
	cmp.le	p14,p15 = 4,in2
	add	src1=1,src0	// second src pointer
	add	dst1=1,dst0	// second dest pointer
	;;
EX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
EK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
(p9)	br.ret.dpnt rp		// 0 byte copy
	;;
EX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
EK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
(p11)	br.ret.dpnt rp		// 1 byte copy

EX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
EK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
(p13)	br.ret.dpnt rp		// 2 byte copy
	;;
	cmp.le	p6,p7   = 5,in2
	cmp.le	p8,p9   = 6,in2
	cmp.le	p10,p11 = 7,in2
EX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
EK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
(p15)	br.ret.dpnt rp		// 3 byte copy
	;;
EX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
EK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
(p7)	br.ret.dpnt rp		// 4 byte copy
	;;
EX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
EK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
(p9)	br.ret.dptk rp		// 5 byte copy

EX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
(p11)	br.ret.dptk rp		// 6 byte copy
	;;
EX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
	br.ret.dptk rp		// done all cases

/* Align dest to nearest 8-byte boundary. We know we have at
 * least 7 bytes to copy, enough to crawl to an 8-byte boundary.
 * The actual number of bytes to crawl depends on the dest alignment.
 * 7 bytes or fewer are taken care of at .memcpy_short.
 *
 * src0 - source even index
 * src1 - source odd index
 * dst0 - dest even index
 * dst1 - dest odd index
 * r30  - distance to 8-byte boundary
 */
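/* Example (hypothetical): a dest ending in ...5 gives r28 = 5 and
 * r30 = 8 - 5 = 3, so three bytes are crawled (p6..p8 on) before
 * re-dispatching to .aligned_src or .unaligned_src.
 */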
.align_dest:
	add	src1=1,in1	// source odd index
	cmp.le	p7,p0 = 2,r30	// for .align_dest
	cmp.le	p8,p0 = 3,r30	// for .align_dest
EX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
	cmp.le	p9,p0 = 4,r30	// for .align_dest
	cmp.le	p10,p0 = 5,r30
	;;
EX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
EK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
	cmp.le	p11,p0 = 6,r30
EX(.ex_handler_short, (p6)	st1	[dst0]=t1,2)
	cmp.le	p12,p0 = 7,r30
	;;
EX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
EK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
EX(.ex_handler_short, (p7)	st1	[dst1]=t2,2)
EK(.ex_handler_short, (p8)	st1	[dst0]=t3,2)
	;;
EX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
EK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
	cmp.eq	p6,p7=r28,r29
EX(.ex_handler_short, (p9)	st1	[dst1]=t4,2)
EK(.ex_handler_short, (p10)	st1	[dst0]=t5,2)
	sub	in2=in2,r30
	;;
EX(.ex_handler_short, (p11)	st1	[dst1]=t6,2)
EK(.ex_handler_short, (p12)	st1	[dst0]=t7)
	add	dst0=in0,r30	// setup arguments
	add	src0=in1,r30
(p6)	br.cond.dptk .aligned_src
(p7)	br.cond.dpnt .unaligned_src
	;;

/* main loop body in jump table format */
#define COPYU(shift)								\
1:										\
EX(.ex_handler, (p16)	ld8	r32=[src0],8);		/* 1 */			\
EK(.ex_handler, (p16)	ld8	r36=[src1],8);					\
		(p17)	shrp	r35=r33,r34,shift;;	/* 1 */			\
EX(.ex_handler, (p6)	ld8	r22=[src1]);	/* common, prime for tail section */ \
		nop.m	0;							\
		(p16)	shrp	r38=r36,r37,shift;				\
EX(.ex_handler, (p17)	st8	[dst0]=r35,8);		/* 1 */			\
EK(.ex_handler, (p17)	st8	[dst1]=r39,8);					\
		br.ctop.dptk.few 1b;;						\
		(p7)	add	src1=-8,src1;	/* back out for <8 byte case */	\
		shrp	r21=r22,r38,shift;	/* speculative work */		\
		br.sptk.few .unaligned_src_tail	/* branch out of jump table */	\
		;;

	TEXT_ALIGN(32)
.jump_table:
	COPYU(8)	// unaligned cases
.jmp1:
	COPYU(16)
	COPYU(24)
	COPYU(32)
	COPYU(40)
	COPYU(48)
	COPYU(56)

#undef A
#undef B
#undef C
#undef D

/*
 * Due to lack of local tag support in the gcc 2.x assembler, it is not clear
 * which instruction failed in the bundle. The exception algorithm is that we
 * first figure out the faulting address, then detect if there is any
 * progress made on the copy; if so, redo the copy from the last known copied
 * location up to the faulting address (exclusive). In the copy_from_user
 * case, the remaining bytes in the kernel buffer will be zeroed.
 *
 * Take copy_from_user as an example: there are multiple loads in a
 * bundle, and those loads could span two pages, so the faulting address
 * is calculated as page_round_down(max(src0, src1)).  This is based on
 * the knowledge that if we can access one byte in a page, we can access
 * any byte in that page.
 *
 * predicates used in the exception handler:
 * p6-p7:   direction
 * p10-p11: src faulting addr calculation
 * p12-p13: dst faulting addr calculation
 */
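/*
 * Roughly, in C (a sketch of the recovery flow, not the exact
 * register-level code below):
 *
 *	fault = page_round_down(max(ptr0, ptr1));  // on the faulting side
 *	A = fault - orig;	// bytes up to the faulting page
 *	B = cur_dst - orig_dst;	// bytes already copied
 *	C = A - B;		// bytes still safely copyable
 *	D = orig_len - A;	// bytes left uncopied
 *	if (C > 0)
 *		__copy_user(orig_dst + B, orig_src + B, C);
 *	if (copying_from_user && D > 0)
 *		memset(orig_dst + A, 0, D);	// zero kernel buffer tail
 *	return D;		// bytes NOT copied
 */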
#define A	r19
#define B	r20
#define C	r21
#define D	r22
#define F	r28

#define memset_arg0	r32
#define memset_arg2	r33

#define saved_retval		loc0
#define saved_rtlink		loc1
#define saved_pfs_stack		loc2

.ex_hndlr_s:
	add	src0=8,src0
	br.sptk .ex_handler
	;;
.ex_hndlr_d:
	add	dst0=8,dst0
	br.sptk .ex_handler
	;;
.ex_hndlr_lcpy_1:
	mov	src1=src_pre_mem
	mov	dst1=dst_pre_mem
	cmp.gtu	p10,p11=src_pre_mem,saved_in1
	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
	;;
(p10)	add	src0=8,saved_in1
(p11)	mov	src0=saved_in1
(p12)	add	dst0=8,saved_in0
(p13)	mov	dst0=saved_in0
	br.sptk .ex_handler

.ex_handler_lcpy:
	// in the line_copy block, the preload addresses should always be
	// ahead of the other two src/dst pointers.  Furthermore, src1/dst1
	// should always be ahead of src0/dst0.
	mov	src1=src_pre_mem
	mov	dst1=dst_pre_mem
.ex_handler:
	mov	pr=saved_pr,-1		// first restore pr, lc, and pfs
	mov	ar.lc=saved_lc
	mov	ar.pfs=saved_pfs
	;;
.ex_handler_short: // fault occurred in a section that didn't change pr, lc, pfs
	cmp.ltu	p6,p7=saved_in0, saved_in1	// get the copy direction
	cmp.ltu	p10,p11=src0,src1
	cmp.ltu	p12,p13=dst0,dst1
	fcmp.eq	p8,p0=f6,f0	// is it memcpy?
	mov	tmp = dst0
	;;
(p11)	mov	src1 = src0	// pick the larger of the two
(p13)	mov	dst0 = dst1	// make dst0 the smaller one
(p13)	mov	dst1 = tmp	// and dst1 the larger one
	;;
(p6)	dep	F = r0,dst1,0,PAGE_SHIFT	// usr dst rounded down to page boundary
(p7)	dep	F = r0,src1,0,PAGE_SHIFT	// usr src rounded down to page boundary
	;;
(p6)	cmp.le	p14,p0=dst0,saved_in0	// no progress has been made on store
(p7)	cmp.le	p14,p0=src0,saved_in1	// no progress has been made on load
	mov	retval=saved_in2
(p8)	ld1	tmp=[src1]		// force an oops for memcpy call
(p8)	st1	[dst1]=r0		// force an oops for memcpy call
(p14)	br.ret.sptk.many rp

/*
 * The remaining bytes to copy are calculated as:
 *
 * A = (faulting_addr - orig_src)	-> len to faulting ld address
 *	or
 *     (faulting_addr - orig_dst)	-> len to faulting st address
 * B = (cur_dst - orig_dst)		-> len copied so far
 * C = A - B				-> len that needs to be copied
 * D = orig_len - A			-> len that needs to be zeroed
 */
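/* Worked example (hypothetical numbers): orig_len = 100, A = 64
 * (fault page boundary 64 bytes past the source), B = 48 already
 * stored: C = 16 bytes are re-copied below and D = 36 is zeroed
 * (copy_from_user case) and returned as the uncopied count.
 */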
(p6)	sub	A = F, saved_in0
(p7)	sub	A = F, saved_in1
	clrrrb
	;;
	alloc	saved_pfs_stack=ar.pfs,3,3,3,0
	cmp.lt	p8,p0=A,r0
	sub	B = dst0, saved_in0	// how many bytes copied so far
	;;
(p8)	mov	A = 0;			// A shouldn't be negative, cap it
	;;
	sub	C = A, B
	sub	D = saved_in2, A
	;;
	cmp.gt	p8,p0=C,r0		// anything left to copy?
	add	memset_arg0=saved_in0, A
(p6)	mov	memset_arg2=0		// copy_to_user should not call memset
(p7)	mov	memset_arg2=D		// copy_from_user needs to have kbuf zeroed
	mov	r8=0
	mov	saved_retval = D
	mov	saved_rtlink = b0

	add	out0=saved_in0, B
	add	out1=saved_in1, B
	mov	out2=C
(p8)	br.call.sptk.few b0=__copy_user	// recursive call
	;;

	add	saved_retval=saved_retval,r8	// above might return a non-zero value
	cmp.gt	p8,p0=memset_arg2,r0		// anything left to zero?
	mov	out0=memset_arg0		// *s
	mov	out1=r0				// c
	mov	out2=memset_arg2		// n
(p8)	br.call.sptk.few b0=memset
	;;

	mov	retval=saved_retval
	mov	ar.pfs=saved_pfs_stack
	mov	b0=saved_rtlink
	br.ret.sptk.many rp

/* end of McKinley specific optimization */
END(__copy_user)
EXPORT_SYMBOL(__copy_user)