;;; falkhash -- https://github.com/gamozolabs/falkhash
;;;
;;; Build:
;;;   nasm -f elf64   -o falkhash-elf64.o   falkhash.asm
;;;   nasm -f macho64 -o falkhash-macho64.o falkhash.asm
; Assemble everything in this file as 64-bit code.
bits 64

; No section is selected explicitly, so the assembler's default section is
; used. The per-format section directives below are kept disabled.
;%ifdef ELF
;section .code
;%endif
;%ifdef MACHO64
;section .text
;%endif
;-----------------------------------------------------------------------
; XMMPUSH <xmmreg>
; Push a full 128-bit XMM register: reserve 16 bytes below rsp, then store
; the register there with an unaligned move, so rsp needs no particular
; alignment. Changes rsp and the arithmetic flags (from the sub).
;-----------------------------------------------------------------------
%macro XMMPUSH 1
        sub     rsp, 16
        movdqu  oword [rsp], %1
%endmacro
;-----------------------------------------------------------------------
; XMMPOP <xmmreg>
; Pop a full 128-bit XMM register: reload it from the top of the stack with
; an unaligned move, then release the 16 bytes. Changes rsp and the
; arithmetic flags (from the add).
;-----------------------------------------------------------------------
%macro XMMPOP 1
        movdqu  %1, oword [rsp]
        add     rsp, 16
%endmacro
; Number of input bytes consumed per iteration of the core loop: five 16-byte
; XMM lanes, i.e. 0x50. Per the original author this size was chosen for, and
; tuned on, AMD family 15h parts.
;
; The core loop is hand-unrolled for exactly this size: changing the value
; means adding or removing the matching movdqu/aesenc pairs by hand. The
; value must stay a multiple of 16.
%define CHUNK_SIZE (5 * 0x10)
;-----------------------------------------------------------------------
; falkhash -- 128-bit hash of a byte buffer built from AES rounds
;
; ABI:   custom register convention (the "falkos ABI" of the original notes)
; In:    rdi  = pointer to data
;        rsi  = length of data in bytes
;        edx  = 32-bit seed (bits 63:32 of rdx are ignored)
; Out:   xmm5 = 128-bit hash
; Saved: all GP registers and xmm0-xmm4 are restored before returning;
;        only xmm5 and the arithmetic flags change.
; Stack: CHUNK_SIZE bytes of scratch on top of the register save area.
; Notes: - Uses aesenc (AES-NI) and pinsrq (SSE4.1).
;        - Uses rep stosq / rep movsb for the final partial chunk, so the
;          direction flag is assumed to be clear on entry.
;        - A final chunk shorter than CHUNK_SIZE is padded with 0xff bytes.
;          A length of 0 therefore hashes one chunk made entirely of 0xff.
;-----------------------------------------------------------------------
falkhash:
        push    rax
        push    rcx
        push    rdi
        push    rsi
        push    rbp
        XMMPUSH xmm0
        XMMPUSH xmm1
        XMMPUSH xmm2
        XMMPUSH xmm3
        XMMPUSH xmm4
        sub     rsp, CHUNK_SIZE         ; scratch buffer for a padded last chunk

        ; rbp = length + seed. The seed is only 32 bits wide, so it is
        ; zero-extended first (a write to ebp clears bits 63:32 of rbp).
        ; Adding the full rdx would let whatever the caller left in the
        ; upper half of rdx change the hash.
        mov     ebp, edx
        add     rbp, rsi

        ; Initial hash state: length+seed in both 64-bit halves of xmm5.
        pinsrq  xmm5, rbp, 0
        pinsrq  xmm5, rbp, 1

.next_chunk:
        ; rsi = bytes remaining. Fewer than a full chunk left (including
        ; the len == 0 case on the first pass) goes through the pad path.
        cmp     rsi, CHUNK_SIZE
        jb      short .pad_last_chunk

.mix_chunk:
        ; Here rdi points at CHUNK_SIZE readable bytes and rsi >= CHUNK_SIZE.
        ; Load the five 16-byte pieces of this chunk.
        movdqu  xmm0, [rdi + 0x00]
        movdqu  xmm1, [rdi + 0x10]
        movdqu  xmm2, [rdi + 0x20]
        movdqu  xmm3, [rdi + 0x30]
        movdqu  xmm4, [rdi + 0x40]

        ; Fold all five pieces into xmm0.
        aesenc  xmm0, xmm1
        aesenc  xmm0, xmm2
        aesenc  xmm0, xmm3
        aesenc  xmm0, xmm4

        ; Finalize the chunk by mixing xmm0 with itself...
        aesenc  xmm0, xmm0

        ; ...and fold it into the running hash.
        aesenc  xmm5, xmm0

        ; Advance to the next chunk; fall through once nothing is left.
        add     rdi, CHUNK_SIZE
        sub     rsi, CHUNK_SIZE
        jnz     short .next_chunk
        jmp     short .done

.pad_last_chunk:
        ; Build the final chunk in the scratch buffer at [rsp]: fill it with
        ; 0xff padding, then overlay the rsi remaining input bytes.
        ; rbp is free now that xmm5 has been seeded, so it parks the data
        ; pointer while rdi drives the fill.
        mov     rbp, rdi
        mov     rdi, rsp
        mov     rax, -1
        mov     ecx, (CHUNK_SIZE / 8)
        rep stosq

        ; Copy the remaining data over the start of the padding.
        mov     rcx, rsi
        mov     rsi, rbp
        mov     rdi, rsp
        rep movsb

        ; Hash the padded buffer as one full chunk. rsi = CHUNK_SIZE makes
        ; the loop's sub reach zero and exit after this pass.
        mov     rdi, rsp
        mov     rsi, CHUNK_SIZE
        jmp     short .mix_chunk

.done:
        ; Finalize the hash. Per the original author's notes, at least one
        ; round is needed to pass the "Combination 0x8000000" and
        ; "Combination 0x0000001" tests and more than one to pass the Seed
        ; tests; four rounds are used because they are nearly free.
        aesenc  xmm5, xmm5
        aesenc  xmm5, xmm5
        aesenc  xmm5, xmm5
        aesenc  xmm5, xmm5

        add     rsp, CHUNK_SIZE
        XMMPOP  xmm4
        XMMPOP  xmm3
        XMMPOP  xmm2
        XMMPOP  xmm1
        XMMPOP  xmm0
        pop     rbp
        pop     rsi
        pop     rdi
        pop     rcx
        pop     rax
        ret
;-----------------------------------------------------------------------
; _falkhash_test -- C-callable wrapper that stores the hash to memory
;
; In:    rdi = pointer to data
;        rsi = length in bytes
;        edx = 32-bit seed
;        rcx = pointer to a caller-allocated 16-byte hash destination
;        (when assembled with WIN defined, the arguments arrive in
;         rcx, rdx, r8, r9 instead and are shuffled into the registers above)
; Out:   the 128-bit hash is written to the destination (unaligned store)
; Saved: rcx, rdx, rsi, rdi and xmm5 are saved and restored here; falkhash
;        itself restores every other register it touches.
; NOTE(review): the exported symbol carries a leading underscore, which is
;        the Mach-O C naming convention -- confirm how ELF/Win64 C callers
;        are expected to reference it.
;-----------------------------------------------------------------------
global _falkhash_test
_falkhash_test:
        push    rcx
        push    rdx
        push    rsi
        push    rdi
        XMMPUSH xmm5                    ; falkhash returns its result in xmm5

%ifdef WIN
        ; Move the Windows x64 argument registers into the registers
        ; falkhash expects. rcx must be read before it is overwritten.
        mov     rdi, rcx                ; data
        mov     rsi, rdx                ; len
        mov     rdx, r8                 ; seed
        mov     rcx, r9                 ; hash destination
%endif

        call    falkhash

        ; rcx still holds the destination: falkhash preserves it.
        movdqu  [rcx], xmm5

        XMMPOP  xmm5
        pop     rdi
        pop     rsi
        pop     rdx
        pop     rcx
        ret
;-----------------------------------------------------------------------
; rdtsc64 -- read the time-stamp counter as one 64-bit value
;
; In:    nothing
; Out:   rax = 64-bit rdtsc value
; Saved: rdx is saved and restored; the arithmetic flags change.
;-----------------------------------------------------------------------
global rdtsc64
rdtsc64:
        push    rdx                     ; rdtsc overwrites rdx

        rdtsc                           ; counter -> edx (high half) : eax (low half)
        shl     rdx, 32                 ; move the high half into bits 63:32
        or      rax, rdx                ; rax = (high << 32) | low

        pop     rdx
        ret
|