memcpy_64.S 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. /* Copyright 2002 Andi Kleen */
  2. #include <linux/linkage.h>
  3. #include <asm/cpufeature.h>
  4. #include <asm/alternative-asm.h>
  5. /*
  6. * We build a jump to memcpy_orig by default which gets NOPped out on
  7. * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
  8. * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
  9. * to a jmp to memcpy_erms, which does the copy with REP MOVSB.
  10. */
  .weak memcpy

  /*
   * memcpy / __memcpy - copy a block of memory.
   *
   * In:   rdi = destination
   *       rsi = source
   *       rdx = byte count
   * Out:  rax = destination as passed in
   *
   * Both names share one entry point; memcpy is declared weak above.
   *
   * The ALTERNATIVE_2 site picks the implementation: the default
   * instruction is a jump to memcpy_orig; for X86_FEATURE_REP_GOOD the
   * replacement is empty, so execution falls through into the REP MOVSQ
   * copy below; for X86_FEATURE_ERMS the replacement is a jump to
   * memcpy_erms.
   */
  ENTRY(__memcpy)
  ENTRY(memcpy)
          ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
                        "jmp memcpy_erms", X86_FEATURE_ERMS

          movq    %rdx, %rcx
          movq    %rdi, %rax              /* return value, saved before REP advances rdi */
          shrq    $3, %rcx                /* rcx = count / 8: whole 8-byte words */
          andl    $7, %edx                /* edx = count % 8: leftover bytes */
          rep movsq                       /* copy the words */
          movl    %edx, %ecx
          rep movsb                       /* copy the leftover 0..7 bytes */
          ret
  ENDPROC(memcpy)
  ENDPROC(__memcpy)
  /*
   * memcpy_erms - copy a block with a single REP MOVSB.
   *
   * This is the variant memcpy jumps to when X86_FEATURE_ERMS (enhanced
   * fast string operations) is set.  It is simpler than the word-then-byte
   * copy in memcpy and is the one to use whenever it is available.
   *
   * In:   rdi = destination
   *       rsi = source
   *       rdx = byte count
   * Out:  rax = destination as passed in
   */
  ENTRY(memcpy_erms)
          movq    %rdx, %rcx              /* REP count = byte count */
          movq    %rdi, %rax              /* return value, saved before REP advances rdi */
          rep movsb
          ret
  ENDPROC(memcpy_erms)
  /*
   * memcpy_orig - copy a block with unrolled 64-bit register moves.
   *
   * In:   rdi = destination
   *       rsi = source
   *       rdx = byte count
   * Out:  rax = destination as passed in
   * Uses: rcx, r8-r11 as scratch; rdi, rsi and rdx may be modified.
   *
   * Counts of 32 or more are moved in 32-byte blocks, walking either up
   * or down through the buffers; the 0..31 bytes that remain are finished
   * by the tail code.  Each tail case loads all of its data before it
   * stores anything.
   */
  ENTRY(memcpy_orig)
          movq    %rdi, %rax              /* return value */

          cmpq    $32, %rdx
          jb      .Ltail                  /* less than one block: tail only */

          /*
           * Pick the copy direction by checking whether a false memory
           * dependence could occur.  The test is a signed compare of the
           * low address bytes only: source low byte < destination low
           * byte selects the backward copy.
           */
          cmpb    %dil, %sil
          jl      .Lcopy_bwd

          subq    $32, %rdx               /* bias the count by one block */
  .Lcopy_fwd_loop:
          /*
           * Move one block of 4 x 8 bytes.  The borrow from this subq is
           * what the jae at the bottom tests; movq and leaq leave the
           * flags untouched in between.
           */
          subq    $32, %rdx
          movq    0(%rsi), %r8
          movq    8(%rsi), %r9
          movq    16(%rsi), %r10
          movq    24(%rsi), %r11
          leaq    32(%rsi), %rsi
          movq    %r8, 0(%rdi)
          movq    %r9, 8(%rdi)
          movq    %r10, 16(%rdi)
          movq    %r11, 24(%rdi)
          leaq    32(%rdi), %rdi
          jae     .Lcopy_fwd_loop
          addl    $32, %edx               /* remove the bias: edx = bytes left, 0..31 */
          jmp     .Ltail

  .Lcopy_bwd:
          /*
           * Move both pointers to one past the end of their buffers and
           * bias the count by one block, as in the forward case.
           */
          addq    %rdx, %rsi
          addq    %rdx, %rdi
          subq    $32, %rdx
          /*
           * At most 3 ALU operations issue in one cycle, so pad with NOPs
           * to start the loop on a fresh 16-byte chunk.
           */
          .p2align 4
  .Lcopy_bwd_loop:
          subq    $32, %rdx               /* borrow is tested by jae below */
          movq    -8(%rsi), %r8
          movq    -16(%rsi), %r9
          movq    -24(%rsi), %r10
          movq    -32(%rsi), %r11
          leaq    -32(%rsi), %rsi
          movq    %r8, -8(%rdi)
          movq    %r9, -16(%rdi)
          movq    %r10, -24(%rdi)
          movq    %r11, -32(%rdi)
          leaq    -32(%rdi), %rdi
          jae     .Lcopy_bwd_loop
          /*
           * What is left is the head of the buffers: remove the bias and
           * step both pointers back to the start.
           */
          addl    $32, %edx               /* edx = bytes left, 0..31 */
          subq    %rdx, %rsi
          subq    %rdx, %rdi

  .Ltail:
          cmpl    $16, %edx
          jb      .Ltail_lt16
          /*
           * 16..31 bytes: copy the first 16 and the last 16 bytes; the two
           * halves overlap when the count is below 32.
           */
          movq    0(%rsi), %r8
          movq    8(%rsi), %r9
          movq    -16(%rsi,%rdx), %r10
          movq    -8(%rsi,%rdx), %r11
          movq    %r8, 0(%rdi)
          movq    %r9, 8(%rdi)
          movq    %r10, -16(%rdi,%rdx)
          movq    %r11, -8(%rdi,%rdx)
          ret

          .p2align 4
  .Ltail_lt16:
          cmpl    $8, %edx
          jb      .Ltail_lt8
          /*
           * 8..15 bytes: first 8 and last 8 bytes.
           */
          movq    0(%rsi), %r8
          movq    -8(%rsi,%rdx), %r9
          movq    %r8, 0(%rdi)
          movq    %r9, -8(%rdi,%rdx)
          ret

          .p2align 4
  .Ltail_lt8:
          cmpl    $4, %edx
          jb      .Ltail_lt4
          /*
           * 4..7 bytes: first 4 and last 4 bytes.
           */
          movl    (%rsi), %ecx
          movl    -4(%rsi,%rdx), %r8d
          movl    %ecx, (%rdi)
          movl    %r8d, -4(%rdi,%rdx)
          ret

          .p2align 4
  .Ltail_lt4:
          subl    $1, %edx                /* edx = count - 1 */
          jb      .Lret                   /* borrow: count was 0 */
          /*
           * 1..3 bytes.  The jz still sees the flags from the subl above,
           * because movzbl does not change them: ZF means the count was 1.
           */
          movzbl  (%rsi), %ecx
          jz      .Ltail_1byte
          movzbq  1(%rsi), %r8
          movzbq  (%rsi,%rdx), %r9        /* last byte, at index count - 1 */
          movb    %r8b, 1(%rdi)
          movb    %r9b, (%rdi,%rdx)
  .Ltail_1byte:
          movb    %cl, (%rdi)

  .Lret:
          ret
  ENDPROC(memcpy_orig)