copy_user.S

/* SPDX-License-Identifier: GPL-2.0 */
/*
 *
 * Optimized version of the copy_user() routine.
 * It is used to copy data across the kernel/user boundary.
 *
 * The source and destination are always on opposite sides of
 * the boundary. When reading from user space we must catch
 * faults on loads. When writing to user space we must catch
 * errors on stores. Note that because of the nature of the copy
 * we don't need to worry about overlapping regions.
 *
 *
 * Inputs:
 *	in0	address of destination buffer
 *	in1	address of source buffer
 *	in2	number of bytes to copy
 *
 * Outputs:
 *	ret0	0 in case of success. The number of bytes NOT copied in
 *		case of error.
 *
 * Copyright (C) 2000-2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *
 * Fixme:
 *	- handle the case where we have more than 16 bytes and the alignments
 *	  are different.
 *	- more benchmarking
 *	- fix extraneous stop bit introduced by the EX() macro.
 */
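//
// Added illustration (editor's sketch, not the kernel's actual C interface):
// the contract implemented below corresponds roughly to the following C,
// where copy_user_sketch() and user_access_faults() are made-up names used
// only for this example:
//
//	/* returns the number of bytes NOT copied; 0 on success */
//	static unsigned long copy_user_sketch(char *dst, const char *src,
//					      unsigned long len)
//	{
//		unsigned long done;
//
//		for (done = 0; done < len; done++) {
//			if (user_access_faults(src + done) ||
//			    user_access_faults(dst + done))
//				break;		/* fault on the user-space side */
//			dst[done] = src[done];
//		}
//		return len - done;
//	}
//
// The assembly below does the same job, but 8 or 16 bytes at a time and with
// the fault checks provided by the EX() exception-table annotations.
//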
#include <asm/asmmacro.h>
#include <asm/export.h>
//
// Tuneable parameters
//
#define COPY_BREAK	16	// we do byte copy below (must be >=16)
#define PIPE_DEPTH	21	// pipe depth
#define EPI		p[PIPE_DEPTH-1]
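//
// Added note (editor's reading of the code, not original commentary): the
// .rotp p[PIPE_DEPTH] directive below makes p[0] an alias for p16, so EPI is
// the last rotating predicate.  In the br.ctop loops a value loaded under
// (p16) is stored under (EPI) only PIPE_DEPTH-1 rotations later, which is
// why the fault handlers must first drain the values still sitting in
// val1[]/val2[] before zeroing the rest of the destination.
//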
//
// arguments
//
#define dst		in0
#define src		in1
#define len		in2
//
// local registers
//
#define t1		r2	// rshift in bytes
#define t2		r3	// lshift in bytes
#define rshift		r14	// right shift in bits
#define lshift		r15	// left shift in bits
#define word1		r16
#define word2		r17
#define cnt		r18
#define len2		r19
#define saved_lc	r20
#define saved_pr	r21
#define tmp		r22
#define val		r23
#define src1		r24
#define dst1		r25
#define src2		r26
#define dst2		r27
#define len1		r28
#define enddst		r29
#define endsrc		r30
#define saved_pfs	r31
GLOBAL_ENTRY(__copy_user)
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
	.rotp p[PIPE_DEPTH]
	adds len2=-1,len	// br.ctop is repeat/until
	mov ret0=r0
	;;			// RAW of cfm when len=0
	cmp.eq p8,p0=r0,len	// check for zero length
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc	// preserve ar.lc (slow)
(p8)	br.ret.spnt.many rp	// empty memcpy()
	;;
	add enddst=dst,len	// first byte after end of destination
	add endsrc=src,len	// first byte after end of source
	.save pr, saved_pr
	mov saved_pr=pr		// preserve predicates
	.body
	mov dst1=dst		// copy because of rotation
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false
	mov src1=src		// copy because of rotation
	mov ar.lc=len2		// initialize lc for small count
	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy
	xor tmp=src,dst		// same alignment test prepare
(p10)	br.cond.dptk .long_copy_user
	;;			// RAW pr.rot/p16 ?
//
// Now we do the byte by byte loop with software pipeline
//
// p7 is necessarily false by now
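//
// Added illustration (editor's sketch): this loop is the software-pipelined
// equivalent of
//	while (len--) *dst1++ = *src1++;	/* one byte per iteration */
// with the st1 trailing the matching ld1 by PIPE_DEPTH-1 iterations, so a
// faulting ld1 can leave up to PIPE_DEPTH-1 already-loaded bytes that still
// have to be stored (handled in .failure_in_pipe1).
//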
1:
	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 1b
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.pfs=saved_pfs	// restore ar.ec
	br.ret.sptk.many rp	// end of short memcpy
//
// Not 8-byte aligned
//
.diff_align_copy_user:
// At this point we know we have more than 16 bytes to copy
// and also that src and dest do _not_ have the same alignment.
	and src2=0x7,src1	// src offset
	and dst2=0x7,dst1	// dst offset
	;;
// The basic idea is that we copy byte-by-byte at the head so
// that we can reach 8-byte alignment for both src1 and dst1.
// Then copy the body using software pipelined 8-byte copy,
// shifting the two back-to-back words right and left, then copy
// the tail by copying byte-by-byte.
//
// Fault handling. If the byte-by-byte copy at the head fails on the
// load, then restart and finish the pipeline by copying zeros
// to the dst1. Then copy zeros for the rest of dst1.
// If the 8-byte software pipeline fails on the load, do the same as
// failure_in3 does. If the byte-by-byte copy at the tail fails, it is
// handled simply by failure_in_pipe1.
//
// In the p14 case the source has more bytes available in its first
// word (by the shifted part), whereas in the p15 case we must copy
// some bytes from the 2nd word of the source to fill the tail of the
// 1st word of the destination.
//
//
// Optimization. If dst1 is 8-byte aligned (quite common), we don't need
// to copy the head to dst1 before starting the 8-byte copy software pipeline.
// We know src1 is not 8-byte aligned in this case.
//
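//
// Added worked example (editor's illustration, the numbers are assumed):
// suppose dst1 is already 8-byte aligned and src2 = 2.  The fall-through
// path below computes t1 = 8 - src2 = 6 and t2 = src2 = 2, hence rshift = 16
// and lshift = 48.  On little-endian IA-64 the shrp loop then builds each
// destination word roughly as
//	dst_word = (src_word[k] >> 16) | (src_word[k+1] << 48);
// i.e. 6 bytes from the current source word plus 2 bytes from the next one.
//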
	cmp.eq p14,p15=r0,dst2
(p15)	br.cond.spnt 1f
	;;
	sub t1=8,src2
	mov t2=src2
	;;
	shl rshift=t2,3
	sub len1=len,t1		// set len1
	;;
	sub lshift=64,rshift
	;;
	br.cond.spnt .word_copy_user
	;;
1:
	cmp.leu p14,p15=src2,dst2
	sub t1=dst2,src2
	;;
	.pred.rel "mutex", p14, p15
(p14)	sub word1=8,src2	// (8 - src offset)
(p15)	sub t1=r0,t1		// absolute value
(p15)	sub word1=8,dst2	// (8 - dst offset)
	;;
// For the case p14, we don't need to copy the shifted part to
// the 1st word of destination.
	sub t2=8,t1
(p14)	sub word1=word1,t1
	;;
	sub len1=len,word1	// resulting len
(p15)	shl rshift=t1,3		// in bits
(p14)	shl rshift=t2,3
	;;
(p14)	sub len1=len1,t1
	adds cnt=-1,word1
	;;
	sub lshift=64,rshift
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false
	mov ar.lc=cnt
	;;
2:
	EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 2b
	;;
	clrrrb
	;;
.word_copy_user:
	cmp.gtu p9,p0=16,len1
(p9)	br.cond.spnt 4f		// if (16 > len1) skip 8-byte copy
	;;
	shr.u cnt=len1,3	// number of 64-bit words
	;;
	adds cnt=-1,cnt
	;;
	.pred.rel "mutex", p14, p15
(p14)	sub src1=src1,t2
(p15)	sub src1=src1,t1
//
// Now both src1 and dst1 point to an 8-byte aligned address. And
// we have more than 8 bytes to copy.
//
	mov ar.lc=cnt
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false
	;;
3:
//
// The pipeline consists of 3 stages:
// 1 (p16):	Load a word from src1
// 2 (EPI_1):	Shift right pair, saving to tmp
// 3 (EPI):	Store tmp to dst1
//
// To make it simple, use at least 2 (p16) loops to set up val1[n]
// because we need 2 back-to-back val1[] to get tmp.
// Note that this implies EPI_2 must be p18 or greater.
//
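//
// Added note on shrp (editor's illustration of the standard IA-64 semantics):
// "shrp r1=r2,r3,count" concatenates r2 (upper half) and r3 (lower half) into
// a 128-bit value, shifts it right by the immediate count and keeps the low
// 64 bits.  For 0 < count < 64 this is equivalent to
//	r1 = (r3 >> count) | (r2 << (64 - count));
// which is exactly the merge of two adjacent misaligned source words into one
// destination-aligned word performed by the BODY() loops below.
//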
#define EPI_1 p[PIPE_DEPTH-2]
#define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift
#define CASE(pred, shift) \
(pred)	br.cond.spnt .copy_user_bit##shift
#define BODY(rshift) \
.copy_user_bit##rshift: \
1: \
	EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \
(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
	EX(3f,(p16) ld8 val1[1]=[src1],8); \
(p16)	mov val1[0]=r0; \
	br.ctop.dptk 1b; \
	;; \
	br.cond.sptk.many .diff_align_do_tail; \
2: \
(EPI)	st8 [dst1]=tmp,8; \
(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
3: \
(p16)	mov val1[1]=r0; \
(p16)	mov val1[0]=r0; \
	br.ctop.dptk 2b; \
	;; \
	br.cond.sptk.many .failure_in2
//
// Since 'shrp' takes its shift count as an immediate (it shifts the 128-bit
// concatenation of its two source registers), we need to provide the 7
// possible non-zero byte shifts as separate cases below.
//
	SWITCH(p6, 8)
	SWITCH(p7, 16)
	SWITCH(p8, 24)
	SWITCH(p9, 32)
	SWITCH(p10, 40)
	SWITCH(p11, 48)
	SWITCH(p12, 56)
	;;
	CASE(p6, 8)
	CASE(p7, 16)
	CASE(p8, 24)
	CASE(p9, 32)
	CASE(p10, 40)
	CASE(p11, 48)
	CASE(p12, 56)
	;;
	BODY(8)
	BODY(16)
	BODY(24)
	BODY(32)
	BODY(40)
	BODY(48)
	BODY(56)
	;;
.diff_align_do_tail:
	.pred.rel "mutex", p14, p15
(p14)	sub src1=src1,t1
(p14)	adds dst1=-8,dst1
(p15)	sub dst1=dst1,t1
	;;
4:
// Tail correction.
//
// The problem with this pipelined loop is that the last word is not
// loaded and thus part of the last word written is not correct.
// To fix that, we simply copy the tail byte by byte.
	sub len1=endsrc,src1,1
	clrrrb
	;;
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false
	mov ar.lc=len1
	;;
5:
	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 5b
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp
//
// Beginning of long memcpy (i.e. > 16 bytes)
//
.long_copy_user:
	tbit.nz p6,p7=src1,0	// odd alignment
	and tmp=7,tmp
	;;
	cmp.eq p10,p8=r0,tmp
	mov len1=len		// copy because of rotation
(p8)	br.cond.dpnt .diff_align_copy_user
	;;
// At this point we know we have more than 16 bytes to copy
// and also that both src and dest have the same alignment
// which may not be the one we want. So for now we must move
// forward slowly until we reach 16-byte alignment: no need to
// worry about reaching the end of the buffer.
//
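//
// Added worked example (editor's illustration, the address is assumed): if
// src1 ends in ...1011b, bit 0 is set so the ld1 below fires (src1 then ends
// in ...1100b), bit 1 is now clear so the ld2 is skipped, bit 2 is set so
// the ld4 fires, and bit 3 decides the final ld8.  After at most
// 1 + 2 + 4 + 8 = 15 head bytes src1 is 16-byte aligned (and dst1, which
// shares the low three address bits, is at least 8-byte aligned).
//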
	EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
(p6)	adds len1=-1,len1;;
	tbit.nz p7,p0=src1,1
	;;
	EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
(p7)	adds len1=-2,len1;;
	tbit.nz p8,p0=src1,2
	;;
//
// Stop bit not required after ld4 because if we fail on ld4
// we have never executed the ld1, therefore st1 is not executed.
//
	EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
	;;
	EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
	tbit.nz p9,p0=src1,3
	;;
//
// Stop bit not required after ld8 because if we fail on ld8
// we have never executed the ld2, therefore st2 is not executed.
//
	EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
	EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
(p8)	adds len1=-4,len1
	;;
	EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
(p9)	adds len1=-8,len1;;
	shr.u cnt=len1,4	// number of 128-bit (2x64-bit) words
	;;
	EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
	tbit.nz p6,p0=len1,3
	cmp.eq p7,p0=r0,cnt
	adds tmp=-1,cnt		// br.ctop is repeat/until
(p7)	br.cond.dpnt .dotail	// we have less than 16 bytes left
	;;
	adds src2=8,src1
	adds dst2=8,dst1
	mov ar.lc=tmp
	;;
//
// 16 bytes/iteration
//
2:
	EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
(p16)	ld8 val2[0]=[src2],16
	EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
	br.ctop.dptk 2b
	;;			// RAW on src1 when fall through from loop
//
// Tail correction based on len only
//
// No matter where we come from (loop or test) the src1 pointer
// is 16-byte aligned AND we have less than 16 bytes to copy.
//
.dotail:
	EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
	tbit.nz p7,p0=len1,2
	;;
	EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
	tbit.nz p8,p0=len1,1
	;;
	EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
	tbit.nz p9,p0=len1,0
	;;
	EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
	;;
	EX(.failure_in1,(p9) ld1 val2[1]=[src1])	// only 1 byte left
	mov ar.lc=saved_lc
	;;
	EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
	mov pr=saved_pr,0xffffffffffff0000
	;;
	EX(.failure_out, (p8) st2 [dst1]=val2[0],2)
	mov ar.pfs=saved_pfs
	;;
	EX(.failure_out, (p9) st1 [dst1]=val2[1])
	br.ret.sptk.many rp
//
// Here we handle the case where the byte by byte copy fails
// on the load.
// Several factors make the zeroing of the rest of the buffer kind of
// tricky:
// - the pipeline: loads/stores are not in sync (pipeline)
//
//   In the same loop iteration, the dst1 pointer does not directly
//   reflect where the faulty load was.
//
// - pipeline effect
//   When you get a fault on a load, you may have valid data from
//   previous loads not yet stored, still in transit. Such data must be
//   stored normally before moving on to zeroing the rest.
//
// - single/multi dispersal independence.
//
// solution:
// - we don't disrupt the pipeline, i.e. data in transit in
//   the software pipeline will eventually be moved to memory.
//   We simply replace the load with a simple mov and keep the
//   pipeline going. We can't really do this inline because
//   p16 is always reset to 1 when lc > 0.
//
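//
// Added note (editor's reading of the recovery below): the handler keeps the
// br.ctop loop running with the load replaced by "mov val1[0]=r0", so the
// stores first drain the up to PIPE_DEPTH-1 bytes that were already loaded
// and then write zeros, while ret0 = endsrc - src1 counts every byte that
// was never fetched from user space.
//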
.failure_in_pipe1:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
1:
(p16)	mov val1[0]=r0
(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
	br.ctop.dptk 1b
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp
//
// This is the case where the byte by byte copy fails on the load
// when we copy the head. We need to finish the pipeline and copy
// zeros for the rest of the destination. Since this happens
// at the top we still need to fill the body and tail.
.failure_in_pipe2:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
2:
(p16)	mov val1[0]=r0
(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
	br.ctop.dptk 2b
	;;
	sub len=enddst,dst1,1	// precompute len
	br.cond.dptk.many .failure_in1bis
	;;
//
// Here we handle the head & tail part when we check for alignment.
// The following code handles only the load failures. The
// main difficulty comes from the fact that loads/stores are
// scheduled. So when you fail on a load, the stores corresponding
// to previous successful loads must be executed.
//
// However some simplifications are possible given the way
// things work.
//
// 1) HEAD
// Theory of operation:
//
//	Page A                   | Page B
//	-------------------------|---------
//	                        1|8 x
//	                      1 2|8 x
//	                        4|8 x
//	                      1 4|8 x
//	                      2 4|8 x
//	                    1 2 4|8 x
//	                         |1
//	                         |2 x
//	                         |4 x
//
// page_size >= 4k (2^12). (x means 4, 2, 1)
// Here we suppose Page A exists and Page B does not.
//
// As we move towards eight-byte alignment we may encounter faults.
// The numbers on each page show the size of the load (current alignment).
//
// Key point:
// - if you fail on 1, 2, 4 then you have never executed any smaller
//   size loads, e.g. failing on ld4 means no ld1 nor ld2 was executed
//   before.
//
// This allows us to simplify the cleanup code, because basically you
// only have to worry about "pending" stores in the case of a failing
// ld8(). Given the way the code is written today, this means worrying
// only about st2 and st4. There we can use the information encapsulated
// in the predicates.
//
// Other key point:
// - if you fail on the ld8 in the head, it means you went straight
//   to it, i.e. you were 8-byte aligned within a nonexistent page.
//   Again this comes from the fact that if you crossed just for the ld8 then
//   you are 8-byte aligned but also 16-byte aligned, therefore you would
//   either go for the 16-byte copy loop OR the ld8 in the tail part.
//   The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
//   because it would mean you had 15 bytes to copy, in which case you
//   would have defaulted to the byte by byte copy.
//
//
// 2) TAIL
// Here we know we have less than 16 bytes AND we are either 8- or 16-byte
// aligned.
//
// Key point:
// This means that we either:
//	- are right on a page boundary
//	OR
//	- are at more than 16 bytes from a page boundary with
//	  at most 15 bytes to copy: no chance of crossing.
//
// This allows us to assume that if we fail on a load we haven't possibly
// executed any of the previous (tail) ones, so we don't need to do
// any stores. For instance, if we fail on ld2, this means we had
// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
//
// This means that we are in a situation similar to a fault in the
// head part. That's nice!
//
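//
// Added example (editor's illustration, the numbers are assumed): with
// len1 = 3 left in the tail and src1 sitting at the start of a missing page,
// bits 3 and 2 of len1 are clear so neither the ld8 nor the ld4 runs; the
// ld2 is the first access and faults with nothing pending, so .failure_in1
// simply zeroes the 3 remaining destination bytes and returns ret0 = 3.
//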
.failure_in1:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
	sub len=endsrc,src1,1
//
// we know that ret0 can never be zero at this point
// because we failed while trying to do a load, i.e. there is still
// some work to do.
// The failure_in1bis and length problem is taken care of at the
// calling side.
//
	;;
.failure_in1bis:		// from (.failure_in3)
	mov ar.lc=len		// Continue with a stupid byte store.
	;;
5:
	st1 [dst1]=r0,1
	br.cloop.dptk 5b
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp
//
// Here we simply restart the loop but instead
// of doing loads we fill the pipeline with zeroes.
// We can't simply store r0 because we may have valid
// data in transit in the pipeline.
// ar.lc and ar.ec are set up correctly at this point.
//
// we MUST use src1/endsrc here and not dst1/enddst because
// of the pipeline effect.
//
.failure_in3:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
	;;
2:
(p16)	mov val1[0]=r0
(p16)	mov val2[0]=r0
(EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16
(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
	br.ctop.dptk 2b
	;;
	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
	sub len=enddst,dst1,1		// precompute len
(p6)	br.cond.dptk .failure_in1bis
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp
.failure_in2:
	sub ret0=endsrc,src1
	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
	sub len=enddst,dst1,1		// precompute len
(p6)	br.cond.dptk .failure_in1bis
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp
//
// handling of failures on stores: that's the easy part
//
.failure_out:
	sub ret0=enddst,dst1
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp
END(__copy_user)
EXPORT_SYMBOL(__copy_user)