# arcfour-amd64.S

/*
** RC4 implementation optimized for AMD64.
**
** Author: Marc Bevand <bevand_m (at) epita.fr>
** Licence: I hereby disclaim the copyright on this code and place it
** in the public domain.
**
** The throughput achieved by this code is about 320 MBytes/sec, on
** a 1.8 GHz AMD Opteron (rev C0) processor.
**
** 2013/12/20 <jussi.kivilinna@iki.fi>:
**  - Integrated to libgcrypt
**  - 4.18 cycles/byte on Intel i5-4570
*/
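
# For reference, a rough C sketch of the RC4 byte round this routine
# implements (illustrative only, not code from libgcrypt; the names d,
# x, y, tx, ty follow the register comments below, and the u32-per-entry
# S-box layout is inferred from the (%rbp,%reg,4) addressing):
#
#   static unsigned char rc4_round (u32 d[256], u32 *x, u32 *y)
#   {
#     u32 tx, ty;
#     *x = (*x + 1) & 0xff;
#     tx = d[*x];
#     *y = (*y + tx) & 0xff;
#     ty = d[*y];
#     d[*x] = ty;               /* swap d[x] and d[y] */
#     d[*y] = tx;
#     return (unsigned char)d[(tx + ty) & 0xff];
#   }
#
# Each generated keystream byte is xor-ed with one input byte; the main
# loop below batches 8 such rounds per iteration.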
#ifdef __x86_64__
#include <config.h>
#if defined(USE_ARCFOUR) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

#include "asm-common-amd64.h"

.text
.align 16
.globl _gcry_arcfour_amd64
ELF(.type _gcry_arcfour_amd64,@function)
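# C prototype (as called from libgcrypt's arcfour.c; the parameter names
# used here are illustrative):
#   void _gcry_arcfour_amd64 (ARCFOUR_context *key, size_t len,
#                             const byte *in, byte *out);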
_gcry_arcfour_amd64:
        CFI_STARTPROC()
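        # ENTER_SYSV_FUNC_PARAMS_0_4 comes from asm-common-amd64.h; it is
        # presumably a no-op on SysV targets, while on WIN64 it is expected
        # to move the Microsoft-ABI argument registers into the SysV ones
        # (%rdi, %rsi, %rdx, %rcx) assumed by the code below.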
        ENTER_SYSV_FUNC_PARAMS_0_4
        push    %rbp
        CFI_PUSH(%rbp)
        push    %rbx
        CFI_PUSH(%rbx)
        mov     %rdi, %rbp              # key = ARG(key)
        mov     %rsi, %rbx              # rbx = ARG(len)
        mov     %rdx, %rsi              # in = ARG(in)
        mov     %rcx, %rdi              # out = ARG(out)
        mov     (4*256)(%rbp), %ecx     # x = key->x
        mov     (4*256+4)(%rbp), %edx   # y = key->y
        inc     %rcx                    # x++
        and     $255, %rcx              # x &= 0xff
        lea     -8(%rbx,%rsi), %rbx     # rbx = in+len-8
        mov     %rbx, %r9               # tmp = in+len-8
        mov     (%rbp,%rcx,4), %eax     # tx = d[x]
        cmp     %rsi, %rbx              # cmp in with in+len-8
        jl      .Lend                   # jump if (in+len-8 < in)

.Lstart:
        add     $8, %rsi                # increment in
        add     $8, %rdi                # increment out

        # generate the next 8 bytes of the rc4 stream into %r8
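        # Each of the 8 iterations below performs one RC4 round and shifts
        # the new keystream byte into the low byte of %r8, so after the
        # loop %r8 holds 8 keystream bytes (the oldest in its most
        # significant byte).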
        mov     $8, %r11                # byte counter
1:      add     %al, %dl                # y += tx
        mov     (%rbp,%rdx,4), %ebx     # ty = d[y]
        mov     %ebx, (%rbp,%rcx,4)     # d[x] = ty
        add     %al, %bl                # val = ty + tx
        mov     %eax, (%rbp,%rdx,4)     # d[y] = tx
        inc     %cl                     # x++            (NEXT ROUND)
        mov     (%rbp,%rcx,4), %eax     # tx = d[x]      (NEXT ROUND)
        shl     $8, %r8
        movb    (%rbp,%rbx,4), %r8b     # val = d[val]
        dec     %r11b
        jnz     1b

        # xor 8 bytes
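        # The first keystream byte of this block ended up in the most
        # significant byte of %r8, so bswap brings the register into
        # memory order before the 8-byte xor against the input.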
        bswap   %r8
        xor     -8(%rsi), %r8
        cmp     %r9, %rsi               # cmp in+len-8 with in
        mov     %r8, -8(%rdi)
        jle     .Lstart                 # jump if (in <= in+len-8)

.Lend:
        add     $8, %r9                 # tmp = in+len

        # handle the last bytes, one by one
1:      cmp     %rsi, %r9               # cmp in with in+len
        jle     .Lfinished              # jump if (in+len <= in)
        add     %al, %dl                # y += tx
        mov     (%rbp,%rdx,4), %ebx     # ty = d[y]
        mov     %ebx, (%rbp,%rcx,4)     # d[x] = ty
        add     %al, %bl                # val = ty + tx
        mov     %eax, (%rbp,%rdx,4)     # d[y] = tx
        inc     %cl                     # x++            (NEXT ROUND)
        mov     (%rbp,%rcx,4), %eax     # tx = d[x]      (NEXT ROUND)
        movb    (%rbp,%rbx,4), %r8b     # val = d[val]
        xor     (%rsi), %r8b            # xor 1 byte
        movb    %r8b, (%rdi)
        inc     %rsi                    # in++
        inc     %rdi                    # out++
        jmp     1b

.Lfinished:
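        # x runs one round ahead (it was pre-incremented at entry and
        # after each round so that d[x] could be loaded early), so undo
        # the extra increment before storing the indices back.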
        dec     %rcx                    # x--
        movb    %cl, (4*256)(%rbp)      # key->x = x
        movb    %dl, (4*256+4)(%rbp)    # key->y = y
        pop     %rbx
        CFI_POP(%rbx)
        pop     %rbp
        CFI_POP(%rbp)
        EXIT_SYSV_FUNC
        ret_spec_stop
        CFI_ENDPROC()
.L__gcry_arcfour_amd64_end:
ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64)

#endif
#endif