csum-copy_64.S 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. /*
  2. * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
  3. *
  4. * This file is subject to the terms and conditions of the GNU General Public
  5. * License. See the file COPYING in the main directory of this archive
  6. * for more details. No warranty for anything given at all.
  7. */
  8. #include <linux/linkage.h>
  9. #include <asm/errno.h>
  10. #include <asm/asm.h>
  11. /*
  12. * Checksum copy with exception handling.
  13. * On exceptions *src_err_ptr or *dst_err_ptr is set to -EFAULT; zeroing of
  14. * the destination is left to the wrappers (see below).
  15. *
  16. * Input
  17. * rdi source
  18. * rsi destination
  19. * edx len (32bit)
  20. * ecx sum (32bit)
  21. * r8 src_err_ptr (int)
  22. * r9 dst_err_ptr (int)
  23. *
  24. * Output
  25. * eax 32bit sum (the 64bit accumulator folded down). Undefined in case of exception.
  26. *
  27. * Wrappers need to take care of valid exception sum and zeroing.
  28. * They also should align source or destination to 8 bytes.
  29. */
  /*
   * Exception-table annotation helpers.
   *
   * Each macro drops a numeric local label at the point of use and hands
   * that label, together with a fixup label, to _ASM_EXTABLE.  Invoke the
   * macro on the line directly above the one instruction it annotates.
   */

  /* Annotate a read from the source buffer; fixup label is .Lbad_source. */
  .macro source
  100:
          _ASM_EXTABLE(100b, .Lbad_source)
  .endm

  /* Annotate a write to the destination buffer; fixup label is .Lbad_dest. */
  .macro dest
  200:
          _ASM_EXTABLE(200b, .Lbad_dest)
  .endm

  /*
   * Annotate an instruction whose fixup label is chosen by the caller.
   * L defaults to .Lignore when no argument is given.
   */
  .macro ignore L=.Lignore
  300:
          _ASM_EXTABLE(300b, \L)
  .endm
  /*
   * csum_partial_copy_generic
   *
   * Copies len bytes from source to destination while accumulating an
   * add-with-carry checksum of the copied data on top of the initial sum.
   *
   * In:   rdi = source, rsi = destination, edx = len, ecx = initial sum,
   *       r8  = src_err_ptr, r9 = dst_err_ptr (a NULL pointer is skipped)
   * Out:  eax = sum folded to 32 bits; not meaningful after a fault
   *
   * Stack frame, 7*8 bytes:
   *       0*8 src_err_ptr   1*8 dst_err_ptr   2*8 rbx   3*8 r12
   *       4*8 r14           5*8 r13           6*8 rbp
   *
   * Flag discipline: every adc chain below relies on CF being carried from
   * one iteration to the next, and each loop branch relies on ZF from the
   * dec.  Only mov, lea and dec (which leaves CF alone) may sit between the
   * adc instructions and the jnz.  Do not reorder.
   */
  ENTRY(csum_partial_copy_generic)
  .Lignore:                                       /* default fixup label named by the ignore macro */
          subq    $7*8, %rsp
          movq    %rbx, 2*8(%rsp)
          movq    %r12, 3*8(%rsp)
          movq    %r14, 4*8(%rsp)
          movq    %r13, 5*8(%rsp)
          movq    %rbp, 6*8(%rsp)

          movq    %r8, (%rsp)                     /* src_err_ptr, read back by .Lbad_source */
          movq    %r9, 1*8(%rsp)                  /* dst_err_ptr, read back by .Lbad_dest */

          movl    %ecx, %eax                      /* rax = initial sum, zero-extended */
          movl    %edx, %ecx                      /* rcx = length */

          xorl    %r9d, %r9d                      /* r9 = 0, used to add in a pending carry */
          movq    %rcx, %r12

          shrq    $6, %r12                        /* r12 = number of whole 64 byte blocks */
          jz      .Lhandle_tail                   /* < 64 */

          clc

          /* main loop. copy and checksum in 64 byte blocks */
          /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
          /* r11: temp3, rdx: temp4, r12 loopcnt */
          /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
          .p2align 4
  .Lloop:
          source
          movq    (%rdi), %rbx
          source
          movq    8(%rdi), %r8
          source
          movq    16(%rdi), %r11
          source
          movq    24(%rdi), %rdx

          source
          movq    32(%rdi), %r10
          source
          movq    40(%rdi), %rbp
          source
          movq    48(%rdi), %r14
          source
          movq    56(%rdi), %r13

          ignore 2f                               /* fixup label for the prefetch is 2: just below */
          prefetcht0 5*64(%rdi)
  2:
          adcq    %rbx, %rax
          adcq    %r8, %rax
          adcq    %r11, %rax
          adcq    %rdx, %rax
          adcq    %r10, %rax
          adcq    %rbp, %rax
          adcq    %r14, %rax
          adcq    %r13, %rax

          decl    %r12d                           /* sets ZF for the jnz below, leaves CF alone */

          dest
          movq    %rbx, (%rsi)
          dest
          movq    %r8, 8(%rsi)
          dest
          movq    %r11, 16(%rsi)
          dest
          movq    %rdx, 24(%rsi)

          dest
          movq    %r10, 32(%rsi)
          dest
          movq    %rbp, 40(%rsi)
          dest
          movq    %r14, 48(%rsi)
          dest
          movq    %r13, 56(%rsi)

          leaq    64(%rdi), %rdi                  /* lea: advance without touching flags */
          leaq    64(%rsi), %rsi

          jnz     .Lloop

          adcq    %r9, %rax                       /* add in carry */

          /* do last up to 56 bytes */
  .Lhandle_tail:
          /* ecx: count */
          movl    %ecx, %r10d                     /* r10d keeps the full length for the later tails */
          andl    $63, %ecx
          shrl    $3, %ecx                        /* ecx = remaining 8 byte words */
          jz      .Lfold
          clc
          .p2align 4
  .Lloop_8:
          source
          movq    (%rdi), %rbx
          adcq    %rbx, %rax
          decl    %ecx
          dest
          movq    %rbx, (%rsi)
          leaq    8(%rsi), %rsi                   /* preserve carry */
          leaq    8(%rdi), %rdi
          jnz     .Lloop_8
          adcq    %r9, %rax                       /* add in carry */

  .Lfold:
          /* reduce checksum to 32bits */
          movl    %eax, %ebx
          shrq    $32, %rax
          addl    %ebx, %eax
          adcl    %r9d, %eax

          /* do last up to 6 bytes */
  .Lhandle_7:
          movl    %r10d, %ecx
          andl    $7, %ecx
          shrl    $1, %ecx                        /* ecx = remaining 2 byte words */
          jz      .Lhandle_1
          xorl    %ebx, %ebx                      /* upper bits of ebx stay zero under the movw loads */
          clc
          .p2align 4
  .Lloop_1:
          source
          movw    (%rdi), %bx
          adcl    %ebx, %eax
          decl    %ecx
          dest
          movw    %bx, (%rsi)
          leaq    2(%rdi), %rdi
          leaq    2(%rsi), %rsi
          jnz     .Lloop_1
          adcl    %r9d, %eax                      /* add in carry */

          /* handle last odd byte */
  .Lhandle_1:
          testb   $1, %r10b
          jz      .Lende
          xorl    %ebx, %ebx
          source
          movb    (%rdi), %bl
          dest
          movb    %bl, (%rsi)
          addl    %ebx, %eax
          adcl    %r9d, %eax                      /* carry */

          /* common exit: restore the saved registers and drop the frame */
  .Lende:
          movq    2*8(%rsp), %rbx
          movq    3*8(%rsp), %r12
          movq    4*8(%rsp), %r14
          movq    5*8(%rsp), %r13
          movq    6*8(%rsp), %rbp
          addq    $7*8, %rsp
          ret

          /* Exception handlers. Very simple, zeroing is done in the wrappers */
  .Lbad_source:
          movq    (%rsp), %rax                    /* saved src_err_ptr */
          testq   %rax, %rax
          jz      .Lende                          /* NULL: nothing to report */
          movl    $-EFAULT, (%rax)
          jmp     .Lende

  .Lbad_dest:
          movq    8(%rsp), %rax                   /* saved dst_err_ptr */
          testq   %rax, %rax
          jz      .Lende                          /* NULL: nothing to report */
          movl    $-EFAULT, (%rax)
          jmp     .Lende
  ENDPROC(csum_partial_copy_generic)