/* sha256_mb_mgr_flush_avx2.S */
  1. /*
  2. * Flush routine for SHA256 multibuffer
  3. *
  4. * This file is provided under a dual BSD/GPLv2 license. When using or
  5. * redistributing this file, you may do so under either license.
  6. *
  7. * GPL LICENSE SUMMARY
  8. *
  9. * Copyright(c) 2016 Intel Corporation.
  10. *
  11. * This program is free software; you can redistribute it and/or modify
  12. * it under the terms of version 2 of the GNU General Public License as
  13. * published by the Free Software Foundation.
  14. *
  15. * This program is distributed in the hope that it will be useful, but
  16. * WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. * General Public License for more details.
  19. *
  20. * Contact Information:
  21. * Megha Dey <megha.dey@linux.intel.com>
  22. *
  23. * BSD LICENSE
  24. *
  25. * Copyright(c) 2016 Intel Corporation.
  26. *
  27. * Redistribution and use in source and binary forms, with or without
  28. * modification, are permitted provided that the following conditions
  29. * are met:
  30. *
  31. * * Redistributions of source code must retain the above copyright
  32. * notice, this list of conditions and the following disclaimer.
  33. * * Redistributions in binary form must reproduce the above copyright
  34. * notice, this list of conditions and the following disclaimer in
  35. * the documentation and/or other materials provided with the
  36. * distribution.
  37. * * Neither the name of Intel Corporation nor the names of its
  38. * contributors may be used to endorse or promote products derived
  39. * from this software without specific prior written permission.
  40. *
  41. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  42. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  43. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  44. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  45. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  46. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  47. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  48. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  49. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  50. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  51. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  52. */
  53. #include <linux/linkage.h>
  54. #include <asm/frame.h>
  55. #include "sha256_mb_mgr_datastruct.S"
  56. .extern sha256_x8_avx2
  57. #LINUX register definitions
  58. #define arg1 %rdi
  59. #define arg2 %rsi
  60. # Common register definitions
  61. #define state arg1
  62. #define job arg2
  63. #define len2 arg2
  64. # idx must be a register not clobberred by sha1_mult
  65. #define idx %r8
  66. #define DWORD_idx %r8d
  67. #define unused_lanes %rbx
  68. #define lane_data %rbx
  69. #define tmp2 %rbx
  70. #define tmp2_w %ebx
  71. #define job_rax %rax
  72. #define tmp1 %rax
  73. #define size_offset %rax
  74. #define tmp %rax
  75. #define start_offset %rax
  76. #define tmp3 %arg1
  77. #define extra_blocks %arg2
  78. #define p %arg2
  79. .macro LABEL prefix n
  80. \prefix\n\():
  81. .endm
  82. .macro JNE_SKIP i
  83. jne skip_\i
  84. .endm
  85. .altmacro
  86. .macro SET_OFFSET _offset
  87. offset = \_offset
  88. .endm
  89. .noaltmacro
# JOB_SHA256* sha256_mb_mgr_flush_avx2(MB_MGR *state)
# arg 1 : rdi : state  (SysV arg1 — see the register defines above;
#                       the old "rcx" note was a Win64 leftover)
#
# Flush: pick any lane that still holds a job, point every empty lane
# at that lane's data (so all 8 lanes of sha256_x8_avx2 see valid
# pointers), hash for the minimum remaining length, then detach and
# return the job of the lane that completed.  Returns NULL if all
# lanes are empty.
ENTRY(sha256_mb_mgr_flush_avx2)
	FRAME_BEGIN
	push	%rbx				# %rbx (unused_lanes/lane_data) is callee-saved

	# If bit (32+3) is set, then all lanes are empty
	mov	_unused_lanes(state), unused_lanes
	bt	$32+3, unused_lanes
	jc	return_null

	# find a lane with a non-null job: probe lanes 1..7 and cmovne
	# the lane number in from the rodata constants; idx stays 0 if
	# only lane 0 is occupied
	xor	idx, idx
	offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	one(%rip), idx
	offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	two(%rip), idx
	offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	three(%rip), idx
	offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	four(%rip), idx
	offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	five(%rip), idx
	offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	six(%rip), idx
	offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	seven(%rip), idx

	# copy idx to empty lanes
copy_lane_data:
	offset = (_args + _data_ptr)
	mov	offset(state,idx,8), tmp	# tmp = live lane's data pointer

	# Unrolled over all 8 lanes: every lane with a NULL job gets the
	# live lane's data pointer and the sentinel length 0xFFFFFFFF so
	# it can never win the minimum search below.
	I = 0
.rep 8
	offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
.altmacro
	JNE_SKIP %I
	offset = (_args + _data_ptr + 8*I)
	mov	tmp, offset(state)
	offset = (_lens + 4*I)
	movl	$0xFFFFFFFF, offset(state)	# sentinel: lane is empty
LABEL skip_ %I
	I = (I+1)
.noaltmacro
.endr

	# Find min length: tournament minimum over the 8 dword lens
	vmovdqu	_lens+0*16(state), %xmm0
	vmovdqu	_lens+1*16(state), %xmm1

	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min val in low dword

	# a len word encodes (length << 4) | lane_index
	vmovd	%xmm2, DWORD_idx
	mov	idx, len2
	and	$0xF, idx			# idx  = lane holding the minimum
	shr	$4, len2			# len2 = its remaining length
	jz	len_is_0			# already complete: skip hashing

	# subtract the min length from every lane's length field
	vpand	clear_low_nibble(%rip), %xmm2, %xmm2
	vpshufd	$0, %xmm2, %xmm2		# broadcast (min << 4) to all dwords
	vpsubd	%xmm2, %xmm0, %xmm0
	vpsubd	%xmm2, %xmm1, %xmm1

	vmovdqu	%xmm0, _lens+0*16(state)
	vmovdqu	%xmm1, _lens+1*16(state)

	# "state" and "args" are the same address, arg1
	# len is arg2
	call	sha256_x8_avx2
	# state and idx are intact

len_is_0:
	# process completed job "idx": detach the job, mark it complete,
	# and push the lane index back onto the 4-bit unused-lanes stack
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)
	mov	_unused_lanes(state), unused_lanes
	shl	$4, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	movl	$0xFFFFFFFF, _lens(state,idx,4)	# mark the freed lane empty

	# Gather the 8 digest dwords of lane idx: _args_digest rows are
	# 32 bytes apart (8 lanes x 4 bytes), so one lane's digest is a
	# strided column.
	vmovd	_args_digest(state , idx, 4) , %xmm0
	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1

	vmovdqu	%xmm0, _result_digest(job_rax)
	offset = (_result_digest + 1*16)
	vmovdqu	%xmm1, offset(job_rax)

return:
	pop	%rbx
	FRAME_END
	ret

return_null:
	xor	job_rax, job_rax		# return NULL (no job available)
	jmp	return
ENDPROC(sha256_mb_mgr_flush_avx2)
##############################################################################

.align 16
# JOB_SHA256* sha256_mb_mgr_get_comp_job_avx2(MB_MGR *state)
# arg 1 : rdi : state  (SysV arg1)
#
# Return an already-completed job (one whose remaining length is 0)
# without doing any hashing, or NULL if no lane has finished.
ENTRY(sha256_mb_mgr_get_comp_job_avx2)
	push	%rbx				# %rbx (unused_lanes/lane_data) is callee-saved

	## if bit 32+3 is set, then all lanes are empty
	mov	_unused_lanes(state), unused_lanes
	bt	$(32+3), unused_lanes
	jc	.return_null

	# Find min length: tournament minimum over the 8 dword lens
	vmovdqu	_lens(state), %xmm0
	vmovdqu	_lens+1*16(state), %xmm1

	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min val in low dword

	vmovd	%xmm2, DWORD_idx
	# a len word encodes (length << 4) | lane_index; any bit above
	# the low nibble means even the shortest lane is unfinished, so
	# there is no completed job to hand back
	test	$~0xF, idx
	jnz	.return_null

	# process completed job "idx": detach the job, mark it complete,
	# and push the lane index back onto the 4-bit unused-lanes stack
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)
	mov	_unused_lanes(state), unused_lanes
	shl	$4, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	movl	$0xFFFFFFFF, _lens(state, idx, 4)	# mark the freed lane empty

	# Gather lane idx's digest column (_args_digest rows are 32
	# bytes apart: 8 lanes x 4 bytes)
	vmovd	_args_digest(state, idx, 4), %xmm0
	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	vmovd	_args_digest+4*32(state, idx, 4), %xmm1
	vpinsrd	$1, _args_digest+5*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$2, _args_digest+6*32(state, idx, 4), %xmm1, %xmm1
	vpinsrd	$3, _args_digest+7*32(state, idx, 4), %xmm1, %xmm1

	vmovdqu	%xmm0, _result_digest(job_rax)
	offset = (_result_digest + 1*16)
	vmovdqu	%xmm1, offset(job_rax)

	pop	%rbx
	ret

.return_null:
	xor	job_rax, job_rax		# no completed job: return NULL
	pop	%rbx
	ret
ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
# Mask that clears the low (lane-index) nibble of the packed minimum
# length before it is broadcast and subtracted from every lane's len.
.section	.rodata.cst16.clear_low_nibble, "aM", @progbits, 16
.align 16
clear_low_nibble:
.octa	0x000000000000000000000000FFFFFFF0

# Lane-number constants 1..7, loaded via the cmovne chain when the
# flush routine scans for a lane that still holds a job.
.section	.rodata.cst8, "aM", @progbits, 8
.align 8
one:
.quad	1
two:
.quad	2
three:
.quad	3
four:
.quad	4
five:
.quad	5
six:
.quad	6
seven:
.quad	7