falkhash.asm 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. ;;; https://github.com/gamozolabs/falkhash
  2. ;;; nasm -f elf64 -o falkhash-elf64.o falkhash.asm
  3. ;;; nasm -f macho64 -o falkhash-macho64.o falkhash.asm
  ; Assemble everything that follows as 64-bit code.
  bits 64

  ; Explicit section selection is left disabled; the original per-format
  ; directives are kept here for reference.
  ;%ifdef ELF
  ;section .code
  ;%endif
  ;%ifdef MACHO64
  ;section .text
  ;%endif
  ; XMMPUSH xmmN
  ; Push a full 128-bit XMM register onto the stack: reserve 16 bytes, then
  ; store the register with an unaligned move, so rsp needs no particular
  ; alignment. Clobbers flags (sub).
  %macro XMMPUSH 1
          sub     rsp, 16
          movdqu  oword [rsp], %1
  %endmacro
  ; XMMPOP xmmN
  ; Pop a full 128-bit XMM register from the stack: reload the register with
  ; an unaligned move, then release its 16-byte slot. Clobbers flags (add).
  %macro XMMPOP 1
          movdqu  %1, oword [rsp]
          add     rsp, 16
  %endmacro
  ; CHUNK_SIZE is the number of input bytes consumed per iteration of the
  ; falkhash core loop, and the size of the stack scratch buffer used to pad
  ; the final partial chunk.
  ;
  ; A chunk_size of 0x50 is ideal for AMD fam 15h platforms, which is what this
  ; was optimized and designed for. If you change this value, you have to
  ; manually add/remove movdqus and aesencs from the core loop. This must be
  ; divisible by 16; the check below turns a violation into a build error
  ; instead of a silently mis-sized padding fill.
  %define CHUNK_SIZE 0x50

  %if (CHUNK_SIZE & 0xF) != 0
  %error "CHUNK_SIZE must be divisible by 16"
  %endif
  ;-----------------------------------------------------------------------
  ; falkhash - hash a byte buffer into 128 bits using AESENC rounds.
  ;
  ; In:    rdi = pointer to data
  ;        rsi = length of data in bytes
  ;        edx = 32-bit seed (bits 63:32 of rdx are ignored)
  ; Out:   xmm5 = 128-bit hash
  ;
  ; All non-output GP registers are preserved, conforming to the falkos ABI.
  ; All non-output XMM registers are also preserved. Flags are clobbered.
  ;
  ; Assumes the direction flag is clear on entry (rep stosq / rep movsb must
  ; run upward). Uses 5*8 + 5*16 + CHUNK_SIZE bytes of stack, plus 8 more
  ; transiently while padding the last chunk.
  ;
  ; The input is consumed in CHUNK_SIZE-byte chunks. A final partial chunk
  ; (including a zero-length input) is copied to a stack buffer and padded
  ; with 0xFF bytes up to CHUNK_SIZE.
  ;-----------------------------------------------------------------------
  falkhash:
          push    rax
          push    rcx
          push    rdi
          push    rsi
          push    rbp

          XMMPUSH xmm0
          XMMPUSH xmm1
          XMMPUSH xmm2
          XMMPUSH xmm3
          XMMPUSH xmm4

          ; Scratch buffer for the padded last chunk lives at [rsp].
          sub     rsp, CHUNK_SIZE

          ; rbp = length + seed. The seed is documented as 32-bit, so it is
          ; zero-extended from edx; whatever the caller left in the upper half
          ; of rdx must not leak into the hash.
          mov     ebp, edx
          add     rbp, rsi

          ; Place the length+seed for both the low and high 64-bits into xmm5,
          ; our hash output.
          pinsrq  xmm5, rbp, 0
          pinsrq  xmm5, rbp, 1

  .lewp:
          ; If we have less than a chunk, copy the partial chunk to the stack.
          ; Unsigned compare: rsi is a byte count.
          cmp     rsi, CHUNK_SIZE
          jb      short .pad_last_chunk

  .continue:
          ; Read 5 pieces (5 * 16 = CHUNK_SIZE bytes) from memory into xmms
          movdqu  xmm0, [rdi + 0x00]
          movdqu  xmm1, [rdi + 0x10]
          movdqu  xmm2, [rdi + 0x20]
          movdqu  xmm3, [rdi + 0x30]
          movdqu  xmm4, [rdi + 0x40]

          ; Mix all pieces into xmm0
          aesenc  xmm0, xmm1
          aesenc  xmm0, xmm2
          aesenc  xmm0, xmm3
          aesenc  xmm0, xmm4

          ; Finalize xmm0 by mixing with itself
          aesenc  xmm0, xmm0

          ; Mix in xmm0 to the hash
          aesenc  xmm5, xmm0

          ; Go to the next chunk, fall through if we're done.
          add     rdi, CHUNK_SIZE
          sub     rsi, CHUNK_SIZE
          jnz     short .lewp
          jmp     short .done

  .pad_last_chunk:
          ; Fill the stack buffer with 0xff's, this is our padding. rdi is
          ; needed by stosq, so the data pointer is parked on the stack; the
          ; buffer is therefore at [rsp + 8] until the pop.
          push    rdi
          lea     rdi, [rsp + 8]
          mov     rax, -1
          mov     ecx, (CHUNK_SIZE / 8)
          rep     stosq
          pop     rdi

          ; Copy the remainder of data (rsi bytes, possibly zero) over the
          ; start of the padding.
          mov     rcx, rsi
          mov     rsi, rdi
          mov     rdi, rsp
          rep     movsb

          ; Make our data now come from the stack, and set the size to one
          ; chunk so the loop exits after hashing it.
          mov     rdi, rsp
          mov     rsi, CHUNK_SIZE
          jmp     short .continue

  .done:
          ; Finalize the hash. This is required at least once to pass
          ; Combination 0x8000000 and Combination 0x0000001. Need more than 1 to
          ; pass the Seed tests. We do 4 because they're pretty much free.
          ; Maybe we should actually use the seed better? Nah, more finalizing!
          aesenc  xmm5, xmm5
          aesenc  xmm5, xmm5
          aesenc  xmm5, xmm5
          aesenc  xmm5, xmm5

          ; Release the scratch buffer and restore everything in reverse order.
          add     rsp, CHUNK_SIZE

          XMMPOP  xmm4
          XMMPOP  xmm3
          XMMPOP  xmm2
          XMMPOP  xmm1
          XMMPOP  xmm0

          pop     rbp
          pop     rsi
          pop     rdi
          pop     rcx
          pop     rax
          ret
  ;-----------------------------------------------------------------------
  ; _falkhash_test - wrapper that runs falkhash and stores the 128-bit
  ; result to memory instead of returning it in xmm5.
  ;
  ; Default register mapping:
  ;   rdi -> pointer to data
  ;   rsi -> len
  ;   edx -> 32-bit seed
  ;   rcx <> pointer to caller allocated 128-bit hash destination
  ;
  ; When assembled with WIN defined, arguments arrive as
  ;   rcx -> pointer to data, rdx -> len, r8 -> seed, r9 -> hash destination
  ; and are shuffled into the mapping above before the call.
  ;
  ; All non-output GP registers are preserved, conforming to the falkos ABI.
  ; All XMM registers are preserved. The destination is written with an
  ; unaligned store.
  ;-----------------------------------------------------------------------
  global _falkhash_test
  _falkhash_test:
          ; Save everything the argument shuffle or the call may overwrite.
          push    rcx
          push    rdx
          push    rsi
          push    rdi
          XMMPUSH xmm5

  %ifdef WIN
          ; Translate from windows to linux calling convention. Each source
          ; register is read before it is overwritten.
          mov     rdi, rcx
          mov     rcx, r9
          mov     rsi, rdx
          mov     rdx, r8
  %endif

          call    falkhash

          ; Store the hash into the hash destination
          movdqu  [rcx], xmm5

          XMMPOP  xmm5
          pop     rdi
          pop     rsi
          pop     rdx
          pop     rcx
          ret
  ;-----------------------------------------------------------------------
  ; rdtsc64 - read the time-stamp counter as a single 64-bit value.
  ;
  ; Out:   rax <- 64-bit rdtsc value
  ;
  ; rdx is preserved; flags are clobbered.
  ;-----------------------------------------------------------------------
  global rdtsc64
  rdtsc64:
          push    rdx                     ; rdtsc overwrites rdx
          rdtsc                           ; counter arrives split as edx:eax
          shl     rdx, 32                 ; move the high half into bits 63:32
          or      rax, rdx                ; rax = (high << 32) | low
          pop     rdx
          ret