memset_64.S

/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>
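
/*
 * Declared weak so that a strong definition elsewhere (e.g. an
 * instrumented memset) can override this one at link time.
 */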
.weak memset

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string to get better performance than the original function. The code is
 * simpler and shorter than the original function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
ENTRY(memset)
ENTRY(__memset)
	/*
	 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
	 * to use it when possible. If not available, use fast string instructions.
	 *
	 * Otherwise, use original memset function.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS
	movq %rdi,%r9
	movq %rdx,%rcx
	andl $7,%edx
	shrq $3,%rcx
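	/* %rcx = number of 8-byte words to store, %edx = 0..7 trailing bytes */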
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq  %rsi,%rax
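	/* e.g. a value of 0xab becomes 0xabababababababab in %rax */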
	rep stosq
	movl %edx,%ecx
	rep stosb
	movq %r9,%rax
	ret
ENDPROC(memset)
ENDPROC(__memset)
EXPORT_SYMBOL(memset)
EXPORT_SYMBOL(__memset)

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
ENTRY(memset_erms)
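	/*
	 * With ERMS, a single rep stosb is efficient for the whole length,
	 * so the count does not need to be split into words and tail bytes.
	 */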
	movq %rdi,%r9
	movb %sil,%al
	movq %rdx,%rcx
	rep stosb
	movq %r9,%rax
	ret
ENDPROC(memset_erms)

ENTRY(memset_orig)
	movq %rdi,%r10

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq  %rcx,%rax

	/* align dst */
	movl  %edi,%r9d
	andl  $7,%r9d
	jnz   .Lbad_alignment
.Lafter_bad_alignment:

	movq  %rdx,%rcx
	shrq  $6,%rcx
	jz    .Lhandle_tail
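
	/* main loop: store 64 bytes (eight qwords) per iteration */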
	.p2align 4
.Lloop_64:
	decq  %rcx
	movq  %rax,(%rdi)
	movq  %rax,8(%rdi)
	movq  %rax,16(%rdi)
	movq  %rax,24(%rdi)
	movq  %rax,32(%rdi)
	movq  %rax,40(%rdi)
	movq  %rax,48(%rdi)
	movq  %rax,56(%rdi)
	leaq  64(%rdi),%rdi
	jnz   .Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl	%edx,%ecx
	andl	$63&(~7),%ecx
	jz	.Lhandle_7
	shrl	$3,%ecx
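	/* %ecx = 0..7 whole 8-byte words left after the 64-byte loop */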
	.p2align 4
.Lloop_8:
	decl	%ecx
	movq	%rax,(%rdi)
	leaq	8(%rdi),%rdi
	jnz	.Lloop_8
.Lhandle_7:
	andl	$7,%edx
	jz	.Lende
	.p2align 4
.Lloop_1:
	decl	%edx
	movb	%al,(%rdi)
	leaq	1(%rdi),%rdi
	jnz	.Lloop_1

.Lende:
	movq	%r10,%rax
	ret

.Lbad_alignment:
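	/*
	 * dst is not 8-byte aligned: if more than 7 bytes remain, store one
	 * unaligned qword to cover the first 8 - (dst & 7) bytes, then advance
	 * dst to the next 8-byte boundary and shrink the count to match.
	 */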
	cmpq $7,%rdx
	jbe	.Lhandle_7
	movq %rax,(%rdi)	/* unaligned store */
	movq $8,%r8
	subq %r9,%r8
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment

.Lfinal:
ENDPROC(memset_orig)