/* sha512_mb_mgr_flush_avx2.S */
  1. /*
  2. * Flush routine for SHA512 multibuffer
  3. *
  4. * This file is provided under a dual BSD/GPLv2 license. When using or
  5. * redistributing this file, you may do so under either license.
  6. *
  7. * GPL LICENSE SUMMARY
  8. *
  9. * Copyright(c) 2016 Intel Corporation.
  10. *
  11. * This program is free software; you can redistribute it and/or modify
  12. * it under the terms of version 2 of the GNU General Public License as
  13. * published by the Free Software Foundation.
  14. *
  15. * This program is distributed in the hope that it will be useful, but
  16. * WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. * General Public License for more details.
  19. *
  20. * Contact Information:
  21. * Megha Dey <megha.dey@linux.intel.com>
  22. *
  23. * BSD LICENSE
  24. *
  25. * Copyright(c) 2016 Intel Corporation.
  26. *
  27. * Redistribution and use in source and binary forms, with or without
  28. * modification, are permitted provided that the following conditions
  29. * are met:
  30. *
  31. * * Redistributions of source code must retain the above copyright
  32. * notice, this list of conditions and the following disclaimer.
  33. * * Redistributions in binary form must reproduce the above copyright
  34. * notice, this list of conditions and the following disclaimer in
  35. * the documentation and/or other materials provided with the
  36. * distribution.
  37. * * Neither the name of Intel Corporation nor the names of its
  38. * contributors may be used to endorse or promote products derived
  39. * from this software without specific prior written permission.
  40. *
  41. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  42. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  43. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  44. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  45. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  46. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  47. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  48. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  49. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  50. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  51. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  52. */
  53. #include <linux/linkage.h>
  54. #include <asm/frame.h>
  55. #include "sha512_mb_mgr_datastruct.S"
  56. .extern sha512_x4_avx2
  /*
   * Register aliases.
   *
   * Several names below expand to the same physical register, so at most
   * one alias per register can hold a live value at any time.
   */

  /* LINUX register definitions */
  #define arg1            %rdi
  #define arg2            %rsi

  /* idx needs to be other than arg1, arg2, rbx, r12 */
  #define idx             %rdx

  /* Common definitions: aliases of arg1 (%rdi) */
  #define state           arg1
  #define tmp3            arg1

  /* Aliases of arg2 (%rsi) */
  #define job             arg2
  #define len2            arg2
  #define extra_blocks    arg2
  #define p               arg2

  /* Aliases of %rbx */
  #define unused_lanes    %rbx
  #define lane_data       %rbx
  #define tmp2            %rbx

  /* Aliases of %rax */
  #define job_rax         %rax
  #define tmp1            %rax
  #define size_offset     %rax
  #define tmp             %rax
  #define start_offset    %rax

  /* Aliases of %r8 */
  #define tmp4            %r8
  #define lens0           %r8

  /* One register per remaining _lens entry */
  #define lens1           %r9
  #define lens2           %r10
  #define lens3           %r11
  /*
   * LABEL prefix, n
   *
   * Defines a label whose name is \prefix immediately followed by \n at
   * the current location.  The empty \() only terminates the parameter
   * name so that the trailing colon is not read as part of it.
   */
  .macro LABEL prefix, n
  \prefix\n\():
  .endm
  /*
   * JNE_SKIP i
   *
   * Emits a "jne" to the label skip_<i>; the target label itself has to
   * be defined by the user of this macro.
   */
  .macro JNE_SKIP i
          jne     skip_\i
  .endm
  /*
   * SET_OFFSET _offset
   *
   * Assigns \_offset to the assembler symbol "offset".  The definition is
   * bracketed by .altmacro/.noaltmacro exactly as before.  The macro is
   * not invoked anywhere in the visible part of this file.
   */
  .altmacro
  .macro SET_OFFSET _offset
          .set    offset, \_offset
  .endm
  .noaltmacro
  /*
   * JOB *sha512_mb_mgr_flush_avx2(MB_MGR *state)
   *
   * In:   arg1 (%rdi) = state
   * Out:  %rax = job removed from its lane, with _status set to
   *              STS_COMPLETED and its digest copied to _result_digest,
   *              or NULL when bit 32+7 of _unused_lanes is set (the
   *              condition the original comment describes as "all lanes
   *              are empty").
   *
   * %rbx is saved on entry and restored on exit.  %rax, %rdx, %rsi,
   * %r8-%r11 and %xmm0-%xmm3 are written here; anything else
   * sha512_x4_avx2 modifies is not visible from this file.
   *
   * Outline:
   *   1. pick a lane index "idx" that holds a job,
   *   2. point every job-less lane at lane idx's data and give it a
   *      _lens entry whose upper dword is 0xFFFFFFFF,
   *   3. find the unsigned minimum of the four 64-bit _lens entries,
   *   4. if the part of that minimum above the low byte is non-zero,
   *      subtract it from every entry and call sha512_x4_avx2,
   *   5. retire the job in lane idx and return it.
   */
  ENTRY(sha512_mb_mgr_flush_avx2)
          FRAME_BEGIN
          push    %rbx

          /* Bit 32+7 of _unused_lanes set: return NULL. */
          mov     _unused_lanes(state), unused_lanes
          bt      $32+7, unused_lanes
          jc      .Lflush_return_null

          /*
           * Find a lane with a non-null job.  Lanes 1, 2 and 3 are tested
           * in that order and each hit overwrites idx, so the highest
           * numbered one wins; idx stays 0 when none of them has a job.
           * The constants come from memory (one/two/three) because they
           * are used as cmovne source operands.
           */
          xor     idx, idx
          offset = (_ldata + 1*_LANE_DATA_size + _job_in_lane)
          cmpq    $0, offset(state)
          cmovne  one(%rip), idx
          offset = (_ldata + 2*_LANE_DATA_size + _job_in_lane)
          cmpq    $0, offset(state)
          cmovne  two(%rip), idx
          offset = (_ldata + 3*_LANE_DATA_size + _job_in_lane)
          cmpq    $0, offset(state)
          cmovne  three(%rip), idx

          /* tmp = data pointer of lane idx */
          offset = (_args + _data_ptr)
          mov     offset(state,idx,8), tmp

          /*
           * For every lane I without a job: store tmp as its data pointer
           * and write 0xFFFFFFFF to the upper dword of its _lens entry,
           * which makes that entry compare high in the unsigned minimum
           * search below.  The numeric label is local to each repetition.
           */
          I = 0
  .rep 4
          offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
          cmpq    $0, offset(state)
          jne     1f
          offset = (_args + _data_ptr + 8*I)
          mov     tmp, offset(state)
          offset = (_lens + 8*I + 4)
          movl    $0xFFFFFFFF, offset(state)
  1:
          I = (I+1)
  .endr

          /* Find min length: unsigned minimum of the four _lens entries. */
          mov     _lens + 0*8(state), lens0
          mov     lens0, idx
          mov     _lens + 1*8(state), lens1
          cmp     idx, lens1
          cmovb   lens1, idx
          mov     _lens + 2*8(state), lens2
          cmp     idx, lens2
          cmovb   lens2, idx
          mov     _lens + 3*8(state), lens3
          cmp     idx, lens3
          cmovb   lens3, idx

          /*
           * Split the minimum: the low four bits become the lane index,
           * everything above the low byte stays in len2.  ZF from the
           * second "and" means there is nothing left to hash.
           */
          mov     idx, len2
          and     $0xF, idx
          and     $~0xFF, len2
          jz      .Lflush_len_is_0

          /* Subtract the minimum from every entry and store them back. */
          sub     len2, lens0
          sub     len2, lens1
          sub     len2, lens2
          sub     len2, lens3
          shr     $32, len2
          mov     lens0, _lens + 0*8(state)
          mov     lens1, _lens + 1*8(state)
          mov     lens2, _lens + 2*8(state)
          mov     lens3, _lens + 3*8(state)

          /*
           * "state" and "args" are the same address, arg1; len is arg2.
           * state (%rdi) and idx (%rdx) are used again below without
           * being reloaded, so this relies on sha512_x4_avx2 leaving them
           * intact, as the original comment states.
           */
          call    sha512_x4_avx2

  .Lflush_len_is_0:
          /* Process completed job "idx": lane_data = &state->ldata[idx] */
          imul    $_LANE_DATA_size, idx, lane_data
          lea     _ldata(state, lane_data), lane_data

          /* Detach the job from its lane and mark it completed. */
          mov     _job_in_lane(lane_data), job_rax
          movq    $0, _job_in_lane(lane_data)
          movl    $STS_COMPLETED, _status(job_rax)

          /* Shift idx into the low byte of _unused_lanes. */
          mov     _unused_lanes(state), unused_lanes
          shl     $8, unused_lanes
          or      idx, unused_lanes
          mov     unused_lanes, _unused_lanes(state)

          /* Upper dword of the retired lane's _lens entry = 0xFFFFFFFF. */
          movl    $0xFFFFFFFF, _lens+4(state, idx, 8)

          /*
           * Gather the eight 64-bit digest words of lane idx.  Word k is
           * read from _args_digest + k*32 + idx*8, i.e. 32-byte rows with
           * one 8-byte column per lane.
           */
          vmovq   _args_digest+0*32(state, idx, 8), %xmm0
          vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
          vmovq   _args_digest+2*32(state, idx, 8), %xmm1
          vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
          vmovq   _args_digest+4*32(state, idx, 8), %xmm2
          vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
          vmovq   _args_digest+6*32(state, idx, 8), %xmm3
          vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3

          /* Store them as 64 contiguous bytes in the job. */
          vmovdqu %xmm0, _result_digest(job_rax)
          vmovdqu %xmm1, _result_digest+1*16(job_rax)
          vmovdqu %xmm2, _result_digest+2*16(job_rax)
          vmovdqu %xmm3, _result_digest+3*16(job_rax)

  .Lflush_return:
          pop     %rbx
          FRAME_END
          ret

  .Lflush_return_null:
          xor     job_rax, job_rax
          jmp     .Lflush_return
  ENDPROC(sha512_mb_mgr_flush_avx2)
  /*
   * sha512_mb_mgr_get_comp_job_avx2
   *
   * In:   arg1 (%rdi) = state
   * Out:  %rax = job taken from the lane whose 64-bit _lens entry is the
   *              unsigned minimum, when that entry has no bit set above
   *              its low four bits; NULL otherwise, and also NULL when
   *              bit 32+7 of _unused_lanes is set.
   *
   * Nothing is called from here.  %rbx is saved and restored; %rax, %rdx,
   * %r8-%r11 and %xmm0-%xmm3 are overwritten.
   */
          .balign 16
  ENTRY(sha512_mb_mgr_get_comp_job_avx2)
          push    %rbx

          /* Bit 32+7 of _unused_lanes set: return NULL. */
          mov     _unused_lanes(state), unused_lanes
          bt      $(32+7), unused_lanes
          jc      .Lcomp_no_job

          /* Unsigned minimum of the four _lens entries, kept in idx. */
          mov     _lens+0*8(state), lens0
          mov     lens0, idx
          mov     _lens+1*8(state), lens1
          cmp     idx, lens1
          cmovb   lens1, idx
          mov     _lens+2*8(state), lens2
          cmp     idx, lens2
          cmovb   lens2, idx
          mov     _lens+3*8(state), lens3
          cmp     idx, lens3
          cmovb   lens3, idx

          /*
           * Any bit above the low four still set: return NULL.  After a
           * passing test the mask below no longer changes idx.
           */
          test    $~0xF, idx
          jnz     .Lcomp_no_job
          and     $0xF, idx

          /* Process completed job "idx": lane_data = &state->ldata[idx] */
          imul    $_LANE_DATA_size, idx, lane_data
          lea     _ldata(state, lane_data), lane_data

          /* Detach the job from its lane and mark it completed. */
          mov     _job_in_lane(lane_data), job_rax
          movq    $0, _job_in_lane(lane_data)
          movl    $STS_COMPLETED, _status(job_rax)

          /* Shift idx into the low byte of _unused_lanes. */
          mov     _unused_lanes(state), unused_lanes
          shl     $8, unused_lanes
          or      idx, unused_lanes
          mov     unused_lanes, _unused_lanes(state)

          /* Upper dword of the retired lane's _lens entry = 0xFFFFFFFF. */
          movl    $0xFFFFFFFF, _lens+4(state, idx, 8)

          /*
           * Collect digest words 0..7 of lane idx (word k is read from
           * _args_digest + k*32 + idx*8), two per xmm register.
           */
          vmovq   _args_digest+0*32(state, idx, 8), %xmm0
          vpinsrq $1, _args_digest+1*32(state, idx, 8), %xmm0, %xmm0
          vmovq   _args_digest+2*32(state, idx, 8), %xmm1
          vpinsrq $1, _args_digest+3*32(state, idx, 8), %xmm1, %xmm1
          vmovq   _args_digest+4*32(state, idx, 8), %xmm2
          vpinsrq $1, _args_digest+5*32(state, idx, 8), %xmm2, %xmm2
          vmovq   _args_digest+6*32(state, idx, 8), %xmm3
          vpinsrq $1, _args_digest+7*32(state, idx, 8), %xmm3, %xmm3

          /* Write the 64 digest bytes into the job. */
          vmovdqu %xmm0, _result_digest+0*16(job_rax)
          vmovdqu %xmm1, _result_digest+1*16(job_rax)
          vmovdqu %xmm2, _result_digest+2*16(job_rax)
          vmovdqu %xmm3, _result_digest+3*16(job_rax)

          pop     %rbx
          ret

  .Lcomp_no_job:
          xor     job_rax, job_rax
          pop     %rbx
          ret
  ENDPROC(sha512_mb_mgr_get_comp_job_avx2)
  /*
   * 64-bit constants 1, 2 and 3, each in its own 8-byte mergeable
   * read-only section.  sha512_mb_mgr_flush_avx2 reads them as the
   * memory source operands of its cmovne instructions.
   */
          .section .rodata.cst8.one, "aM", @progbits, 8
          .balign 8
  one:
          .quad   1

          .section .rodata.cst8.two, "aM", @progbits, 8
          .balign 8
  two:
          .quad   2

          .section .rodata.cst8.three, "aM", @progbits, 8
          .balign 8
  three:
          .quad   3