sha1_mb_mgr_submit_avx2.S 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. /*
  2. * Buffer submit code for multi buffer SHA1 algorithm
  3. *
  4. * This file is provided under a dual BSD/GPLv2 license. When using or
  5. * redistributing this file, you may do so under either license.
  6. *
  7. * GPL LICENSE SUMMARY
  8. *
  9. * Copyright(c) 2014 Intel Corporation.
  10. *
  11. * This program is free software; you can redistribute it and/or modify
  12. * it under the terms of version 2 of the GNU General Public License as
  13. * published by the Free Software Foundation.
  14. *
  15. * This program is distributed in the hope that it will be useful, but
  16. * WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. * General Public License for more details.
  19. *
  20. * Contact Information:
  21. * James Guilford <james.guilford@intel.com>
  22. * Tim Chen <tim.c.chen@linux.intel.com>
  23. *
  24. * BSD LICENSE
  25. *
  26. * Copyright(c) 2014 Intel Corporation.
  27. *
  28. * Redistribution and use in source and binary forms, with or without
  29. * modification, are permitted provided that the following conditions
  30. * are met:
  31. *
  32. * * Redistributions of source code must retain the above copyright
  33. * notice, this list of conditions and the following disclaimer.
  34. * * Redistributions in binary form must reproduce the above copyright
  35. * notice, this list of conditions and the following disclaimer in
  36. * the documentation and/or other materials provided with the
  37. * distribution.
  38. * * Neither the name of Intel Corporation nor the names of its
  39. * contributors may be used to endorse or promote products derived
  40. * from this software without specific prior written permission.
  41. *
  42. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  43. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  44. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  45. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  46. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  47. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  48. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  49. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  50. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  51. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  52. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  53. */
  54. #include <linux/linkage.h>
  55. #include <asm/frame.h>
  56. #include "sha1_mb_mgr_datastruct.S"
  # External routine called by this file (see the call in
  # sha1_mb_mgr_submit_avx2).
  .extern sha1_x8_avx2

  # LINUX register definitions
  arg1            = %rdi
  arg2            = %rsi
  size_offset     = %rcx
  tmp2            = %rcx
  extra_blocks    = %rdx

  # Common definitions
  #define state   arg1
  #define job     %rsi
  #define len2    arg2
  #define p2      arg2

  # Several of the names below alias the same register.

  # idx must be a register not clobbered by sha1_x8_avx2
  idx             = %r8
  DWORD_idx       = %r8d
  last_len        = %r8

  p               = %r11
  start_offset    = %r11

  unused_lanes    = %rbx
  BYTE_unused_lanes = %bl

  job_rax         = %rax
  len             = %rax
  DWORD_len       = %eax

  lane            = %r12
  tmp3            = %r12

  tmp             = %r9
  DWORD_tmp       = %r9d

  lane_data       = %r10
  /*
   * JOB *sha1_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job)
   *
   *   arg 1 : %rdi : state
   *   arg 2 : %rsi : job
   *   ret   : %rax : pointer to a job whose status was just set to
   *                  STS_COMPLETED, or 0 when the job was only queued
   *
   * The job is parked in the lane whose index is the low nibble of
   * _unused_lanes(state); that nibble is then shifted out.  Each _lens
   * entry is stored as (length << 4) | lane, so a single unsigned minimum
   * yields both the shortest length and the lane that owns it.
   *
   * While the remaining unused_lanes value differs from 0xF the routine
   * returns 0 straight away.  Otherwise the shortest length is subtracted
   * from every lane, sha1_x8_avx2 is called with that length (the call is
   * skipped when the length is 0), and the job in the shortest lane is
   * handed back with its digest copied out of the lane-interleaved
   * _args_digest area.
   *
   * NOTE(review): 0xF is presumably the end-of-list marker placed in
   * unused_lanes by the manager init code, i.e. "no free lane left";
   * confirm against that code.
   *
   * %rbx and %r12 are saved and restored here.  The code after the call
   * relies on sha1_x8_avx2 leaving state (%rdi) and idx (%r8) untouched.
   */
  ENTRY(sha1_mb_mgr_submit_avx2)
          FRAME_BEGIN
          push    %rbx
          push    %r12

          # Pop a free lane index off the nibble-packed unused_lanes word
          mov     _unused_lanes(state), unused_lanes
          mov     unused_lanes, lane
          and     $0xF, lane
          shr     $4, unused_lanes
          imul    $_LANE_DATA_size, lane, lane_data
          movl    $STS_BEING_PROCESSED, _status(job)
          lea     _ldata(state, lane_data), lane_data
          mov     unused_lanes, _unused_lanes(state)
          movl    _len(job), DWORD_len

          mov     job, _job_in_lane(lane_data)
          shl     $4, len
          or      lane, len                       # lens entry = (len << 4) | lane

          movl    DWORD_len, _lens(state, lane, 4)

          # Load digest words from result_digest; word i of a lane lives at
          # _args_digest + i*32 + lane*4
          vmovdqu _result_digest(job), %xmm0
          movl    _result_digest+1*16(job), DWORD_tmp
          vmovd   %xmm0, _args_digest(state, lane, 4)
          vpextrd $1, %xmm0, _args_digest+1*32(state, lane, 4)
          vpextrd $2, %xmm0, _args_digest+2*32(state, lane, 4)
          vpextrd $3, %xmm0, _args_digest+3*32(state, lane, 4)
          movl    DWORD_tmp, _args_digest+4*32(state, lane, 4)

          mov     _buffer(job), p
          mov     p, _args_data_ptr(state, lane, 8)

          # Hash only when the remaining unused_lanes value is exactly 0xF
          cmp     $0xF, unused_lanes
          jne     .Lreturn_null

          # Find min length
          vmovdqa _lens(state), %xmm0
          vmovdqa _lens+1*16(state), %xmm1

          vpminud %xmm1, %xmm0, %xmm2             # xmm2 has {D,C,B,A}
          vpalignr $8, %xmm2, %xmm3, %xmm3        # xmm3 has {x,x,D,C}
          vpminud %xmm3, %xmm2, %xmm2             # xmm2 has {x,x,E,F}
          vpalignr $4, %xmm2, %xmm3, %xmm3        # xmm3 has {x,x,x,E}
          vpminud %xmm3, %xmm2, %xmm2             # xmm2 has min value in low dword

          # Split the minimum into lane index (low nibble) and length
          vmovd   %xmm2, DWORD_idx
          mov     idx, len2
          and     $0xF, idx
          shr     $4, len2                        # ZF from this shift drives the jz
          jz      .Llen_is_0

          # Subtract the minimum length from all eight lanes, leaving the
          # lane nibble of every entry as it was
          vpand   clear_low_nibble(%rip), %xmm2, %xmm2
          vpshufd $0, %xmm2, %xmm2

          vpsubd  %xmm2, %xmm0, %xmm0
          vpsubd  %xmm2, %xmm1, %xmm1

          vmovdqa %xmm0, _lens + 0*16(state)
          vmovdqa %xmm1, _lens + 1*16(state)

          # "state" and "args" are the same address, arg1
          # len is arg2
          call    sha1_x8_avx2

          # state and idx are intact

  .Llen_is_0:
          # process completed job "idx"
          imul    $_LANE_DATA_size, idx, lane_data
          lea     _ldata(state, lane_data), lane_data

          mov     _job_in_lane(lane_data), job_rax
          mov     _unused_lanes(state), unused_lanes
          movq    $0, _job_in_lane(lane_data)
          movl    $STS_COMPLETED, _status(job_rax)

          # Push the freed lane index back onto unused_lanes
          shl     $4, unused_lanes
          or      idx, unused_lanes
          mov     unused_lanes, _unused_lanes(state)

          # An all-ones length keeps the empty lane out of the next unsigned min
          movl    $0xFFFFFFFF, _lens(state, idx, 4)

          # Gather the five digest words of lane idx back into the job
          vmovd   _args_digest(state, idx, 4), %xmm0
          vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
          vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
          vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
          movl    _args_digest+4*32(state, idx, 4), DWORD_tmp

          vmovdqu %xmm0, _result_digest(job_rax)
          movl    DWORD_tmp, _result_digest+1*16(job_rax)

  .Lreturn:
          pop     %r12
          pop     %rbx
          FRAME_END
          ret

  .Lreturn_null:
          xor     job_rax, job_rax                # no job to hand back
          jmp     .Lreturn
  ENDPROC(sha1_mb_mgr_submit_avx2)
  /*
   * 16-byte mask whose lowest dword is 0xFFFFFFF0 and whose upper three
   * dwords are zero.  The submit routine ANDs it with the minimum _lens
   * entry to strip the lane index held in the low nibble before that value
   * is broadcast and subtracted from every lane.
   */
  .section .rodata.cst16.clear_low_nibble, "aM", @progbits, 16
  .p2align 4
  clear_low_nibble:
          .long   0xFFFFFFF0, 0x00000000, 0x00000000, 0x00000000