memset_64.S 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. /* Copyright 2002 Andi Kleen, SuSE Labs */
  2. #include <linux/linkage.h>
  3. #include <asm/cpufeature.h>
  4. #include <asm/alternative-asm.h>
  5. .weak memset
  6. /*
  7. * ISO C memset - set a memory block to a byte value. This function uses fast
  8. * string to get better performance than the original function. The code is
  9. * simpler and shorter than the original function as well.
  10. *
  11. * rdi destination
  12. * rsi value (char)
  13. * rdx count (bytes)
  14. *
  15. * rax original destination
  16. */
  ENTRY(memset)
  ENTRY(__memset)
          /*
           * Implementation selection, patched in at this point:
           *   - CPUs with enhanced REP MOVSB/STOSB (X86_FEATURE_ERMS)
           *     jump to memset_erms, which is preferred when available;
           *   - otherwise CPUs with good fast-string support
           *     (X86_FEATURE_REP_GOOD) get an empty replacement and
           *     fall through to the rep stosq/stosb code below;
           *   - all others keep the default jump to memset_orig.
           */
          ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
                        "jmp memset_erms", X86_FEATURE_ERMS

          movq    %rdi, %r9               /* keep dst: the string stores advance rdi */
          movq    %rdx, %rcx
          andl    $7, %edx                /* edx = count & 7, the trailing bytes */
          shrq    $3, %rcx                /* rcx = count / 8, the whole qwords */

          /* Replicate the fill byte into all eight bytes of rax. */
          movzbl  %sil, %esi
          movabs  $0x0101010101010101, %rax
          imulq   %rsi, %rax

          rep stosq                       /* bulk fill, eight bytes per store */
          movl    %edx, %ecx
          rep stosb                       /* remaining 0..7 bytes */

          movq    %r9, %rax               /* return the original destination */
          ret
  ENDPROC(memset)
  ENDPROC(__memset)
  42. /*
  43. * ISO C memset - set a memory block to a byte value. This function uses
  44. * enhanced rep stosb to override the fast string function.
  45. * The code is simpler and shorter than the fast string function as well.
  46. *
  47. * rdi destination
  48. * rsi value (char)
  49. * rdx count (bytes)
  50. *
  51. * rax original destination
  52. */
  ENTRY(memset_erms)
          movq    %rdi, %r9               /* keep dst: rep stosb advances rdi */
          movb    %sil, %al               /* al = fill byte, the value stosb stores */
          movq    %rdx, %rcx              /* rcx = byte count for the rep prefix */
          rep stosb
          movq    %r9, %rax               /* return the original destination */
          ret
  ENDPROC(memset_erms)
  /*
   * memset_orig - set a memory block to a byte value using plain
   * movq/movb stores only (no string instructions).
   *
   * rdi   destination
   * rsi   value (only the low byte, sil, is used)
   * rdx   count (bytes)
   *
   * rax   original destination
   *
   * Also writes rcx, rdx, rdi, r8, r9, r10 and the flags.
   */
  ENTRY(memset_orig)
          movq    %rdi, %r10              /* r10 = dst, handed back in rax at .Lende */

          /* Replicate the fill byte into all eight bytes of rax. */
          movzbl  %sil, %ecx
          movabs  $0x0101010101010101, %rax
          imulq   %rcx, %rax

          /* r9d = dst & 7; nonzero means dst is not 8-byte aligned. */
          movl    %edi, %r9d
          andl    $7, %r9d
          jnz     .Lbad_alignment
  .Lafter_bad_alignment:

          movq    %rdx, %rcx
          shrq    $6, %rcx                /* rcx = number of whole 64-byte chunks */
          jz      .Lhandle_tail

          .p2align 4
  .Lloop_64:
          decq    %rcx                    /* ZF from here feeds the jnz; movq/leaq keep flags */
          movq    %rax, (%rdi)
          movq    %rax, 8(%rdi)
          movq    %rax, 16(%rdi)
          movq    %rax, 24(%rdi)
          movq    %rax, 32(%rdi)
          movq    %rax, 40(%rdi)
          movq    %rax, 48(%rdi)
          movq    %rax, 56(%rdi)
          leaq    64(%rdi), %rdi
          jnz     .Lloop_64

          /*
           * Handle the tail in loops. The loops should be faster than
           * hard to predict jump tables.
           */
          .p2align 4
  .Lhandle_tail:
          movl    %edx, %ecx
          andl    $63&(~7), %ecx          /* bytes of the last <64 that form whole qwords */
          jz      .Lhandle_7
          shrl    $3, %ecx                /* convert to a qword count */
          .p2align 4
  .Lloop_8:
          decl    %ecx
          movq    %rax, (%rdi)
          leaq    8(%rdi), %rdi
          jnz     .Lloop_8

  .Lhandle_7:
          andl    $7, %edx                /* edx = trailing bytes, 0..7 */
          jz      .Lende
          .p2align 4
  .Lloop_1:
          decl    %edx
          movb    %al, (%rdi)
          leaq    1(%rdi), %rdi
          jnz     .Lloop_1

  .Lende:
          movq    %r10, %rax              /* return the original destination */
          ret

  .Lbad_alignment:
          cmpq    $7, %rdx
          jbe     .Lhandle_7              /* 7 bytes or fewer: byte loop only */
          movq    %rax, (%rdi)            /* unaligned store */
          movq    $8, %r8
          subq    %r9, %r8                /* r8 = 8 - (dst & 7), bytes up to the next 8-byte boundary */
          addq    %r8, %rdi               /* resume at the aligned address ... */
          subq    %r8, %rdx               /* ... with the count reduced to match */
          jmp     .Lafter_bad_alignment
  ENDPROC(memset_orig)