/*
 * Buffer submit code for multi buffer SHA512 algorithm
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2016 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 *     Megha Dey <megha.dey@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2016 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
  /*
   * Kernel helper headers plus the multi-buffer manager layout.  The _xxx
   * field offsets, _LANE_DATA_size and the STS_* status values used in this
   * file are not defined here; presumably they come from
   * sha512_mb_mgr_datastruct.S -- confirm there.
   */
#include <linux/linkage.h>
#include <asm/frame.h>
#include "sha512_mb_mgr_datastruct.S"

/* Block-processing routine called by the submit function; defined elsewhere. */
.extern sha512_x4_avx2

/*
 * Register aliases.  Several names share one physical register; the code
 * below only uses such names in phases that do not overlap.  Aliases marked
 * "unused" are not referenced anywhere in this file.
 *
 * Only C-style comments may follow a #define on the same line: they are
 * stripped by the preprocessor, whereas a '#' comment would become part of
 * the macro body.
 */

/* The two incoming arguments (first two integer argument registers, SysV). */
#define arg1		%rdi
#define arg2		%rsi

#define idx		%rdx	/* lane index taken from the minimum lens entry */
#define last_len	%rdx	/* unused */

#define size_offset	%rcx	/* unused */
#define tmp2		%rcx	/* unused */

/* Common definitions */
#define state		arg1	/* manager state pointer */
#define job		arg2	/* job being submitted */
#define len2		arg2	/* minimum length; reuses arg2 after the last use of job */
#define p2		arg2	/* unused */

#define p		%r11	/* data buffer pointer read from the job */
#define start_offset	%r11	/* unused */

#define unused_lanes	%rbx	/* packed list of free lane numbers, one per byte */

#define job_rax		%rax	/* return value: completed job or NULL */
#define len		%rax	/* unused */

#define lane		%r12	/* lane number assigned to the new job */
#define tmp3		%r12	/* unused */
#define lens3		%r12	/* copy of _lens entry 3 */

#define extra_blocks	%r8	/* unused */
#define lens0		%r8	/* copy of _lens entry 0 */

#define tmp		%r9	/* unused */
#define lens1		%r9	/* copy of _lens entry 1 */

#define lane_data	%r10	/* address of one lane's entry in the _ldata array */
#define lens2		%r10	/* copy of _lens entry 2 */

#define DWORD_len	%eax	/* 32-bit job length, copied into the lens slot */
  /*
   * JOB *sha512_mb_mgr_submit_avx2(MB_MGR *state, JOB *job)
   *
   * Places one job into the next free lane of the multi-buffer manager.
   * When the remaining unused-lane list is anything other than 0xFF after
   * the insertion, the function returns NULL without hashing.  Otherwise it
   * finds the lane with the smallest length, runs sha512_x4_avx2 for that
   * length (skipped when the length is zero), releases that lane and
   * returns its job with the digest copied back and status STS_COMPLETED.
   *
   * ABI:   System V AMD64
   * In:    %rdi (arg1) = state
   *        %rsi (arg2) = job
   * Out:   %rax = completed job, or NULL
   * Saved: %rbx and %r12 are pushed on entry and popped on exit
   * Clobb: %rdx, %rsi, %r8-%r11, %xmm0-%xmm3, flags, plus whatever
   *        sha512_x4_avx2 clobbers (not visible from this file)
   */
ENTRY(sha512_mb_mgr_submit_avx2)
	FRAME_BEGIN
	push	%rbx
	push	%r12

	# Pop the next free lane number off the byte-packed unused-lane list.
	mov	_unused_lanes(state), unused_lanes
	movzbq	%bl, lane			# %bl is the low byte of %rbx
	shr	$8, unused_lanes
	imul	$_LANE_DATA_size, lane, lane_data
	movl	$STS_BEING_PROCESSED, _status(job)
	lea	_ldata(state, lane_data), lane_data	# state + _ldata + lane * _LANE_DATA_size
	mov	unused_lanes, _unused_lanes(state)
	movl	_len(job), DWORD_len

	mov	job, _job_in_lane(lane_data)
	# The length goes into the upper dword of the 8-byte lens slot.
	movl	DWORD_len, _lens+4(state, lane, 8)

	# Load digest words from result_digest
	vmovdqu	_result_digest+0*16(job), %xmm0
	vmovdqu	_result_digest+1*16(job), %xmm1
	vmovdqu	_result_digest+2*16(job), %xmm2
	vmovdqu	_result_digest+3*16(job), %xmm3

	# Scatter the eight 64-bit words: word k of this lane is stored at
	# _args_digest + k*32 + lane*8.
	vmovq	%xmm0, _args_digest(state, lane, 8)
	vpextrq	$1, %xmm0, _args_digest+1*32(state, lane, 8)
	vmovq	%xmm1, _args_digest+2*32(state, lane, 8)
	vpextrq	$1, %xmm1, _args_digest+3*32(state, lane, 8)
	vmovq	%xmm2, _args_digest+4*32(state, lane, 8)
	vpextrq	$1, %xmm2, _args_digest+5*32(state, lane, 8)
	vmovq	%xmm3, _args_digest+6*32(state, lane, 8)
	vpextrq	$1, %xmm3, _args_digest+7*32(state, lane, 8)

	mov	_buffer(job), p
	mov	p, _args_data_ptr(state, lane, 8)

	# Only continue to the hashing step when the remaining list is exactly
	# 0xFF (presumably the marker for no free lanes left -- confirm
	# against the manager init code).
	cmp	$0xFF, unused_lanes
	jne	.Lreturn_null

	# Find min length
	mov	_lens+0*8(state), lens0
	mov	lens0, idx
	mov	_lens+1*8(state), lens1
	cmp	idx, lens1
	cmovb	lens1, idx
	mov	_lens+2*8(state), lens2
	cmp	idx, lens2
	cmovb	lens2, idx
	mov	_lens+3*8(state), lens3
	cmp	idx, lens3
	cmovb	lens3, idx
	mov	idx, len2
	and	$0xF, idx			# keep the low 4 bits: lane index
	and	$~0xFF, len2			# clear the low byte; ZF set when nothing else remains
	jz	.Llen_is_0

	# Subtract the minimum from every lane.  The low byte of len2 is
	# already zero, so the low byte of each lens entry is left untouched.
	sub	len2, lens0
	sub	len2, lens1
	sub	len2, lens2
	sub	len2, lens3
	shr	$32, len2			# move the length down from the upper dword
	mov	lens0, _lens + 0*8(state)
	mov	lens1, _lens + 1*8(state)
	mov	lens2, _lens + 2*8(state)
	mov	lens3, _lens + 3*8(state)

	# "state" and "args" are the same address, arg1
	# len is arg2
	call	sha512_x4_avx2
	# state and idx are intact (relied upon below; a property of the
	# callee that cannot be checked from this file)

.Llen_is_0:
	# process completed job "idx"
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data
	mov	_job_in_lane(lane_data), job_rax
	mov	_unused_lanes(state), unused_lanes
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)
	# Push the freed lane number back onto the unused-lane list.
	shl	$8, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	# Set the length of the freed lane to all ones (presumably so that an
	# idle lane never wins the minimum search -- confirm).
	movl	$0xFFFFFFFF, _lens+4(state, idx, 8)

	# Gather the eight digest words of lane idx and store them in the job.
	vmovq	_args_digest+0*32(state, idx, 8), %xmm0
	vpinsrq	$1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
	vmovq	_args_digest+2*32(state, idx, 8), %xmm1
	vpinsrq	$1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
	vmovq	_args_digest+4*32(state, idx, 8), %xmm2
	vpinsrq	$1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
	vmovq	_args_digest+6*32(state, idx, 8), %xmm3
	vpinsrq	$1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3

	vmovdqu	%xmm0, _result_digest + 0*16(job_rax)
	vmovdqu	%xmm1, _result_digest + 1*16(job_rax)
	vmovdqu	%xmm2, _result_digest + 2*16(job_rax)
	vmovdqu	%xmm3, _result_digest + 3*16(job_rax)

.Lreturn:
	pop	%r12
	pop	%rbx
	FRAME_END
	ret

.Lreturn_null:
	# No job completed by this call: return NULL through the shared epilogue.
	xor	job_rax, job_rax
	jmp	.Lreturn
ENDPROC(sha512_mb_mgr_submit_avx2)
  /* UNUSED?
   Nothing in the code above references H0-H7.  The values are 32-bit and
   match the SHA-256 initial hash words (equivalently, the upper 32 bits of
   the SHA-512 initial hash words).

.section	.rodata.cst16, "aM", @progbits, 16
.align 16
H0:	.int 0x6a09e667
H1:	.int 0xbb67ae85
H2:	.int 0x3c6ef372
H3:	.int 0xa54ff53a
H4:	.int 0x510e527f
H5:	.int 0x9b05688c
H6:	.int 0x1f83d9ab
H7:	.int 0x5be0cd19
*/