/*
 * Multi-buffer SHA1 algorithm hash compute routine
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 *	James Guilford <james.guilford@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <linux/linkage.h>
#include "sha1_mb_mgr_datastruct.S"

## code to compute eight-lane (x8) SHA1 using AVX2 (256-bit ymm registers)
## outer calling routine takes care of save and restore of XMM registers
##
## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
##
## Linux clobbers:    rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
## Linux preserves:   rdi rbp r8
##
## clobbers ymm0-15
# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
# "transpose" data in {r0...r7} using temps {t0...t1}
# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
#
# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
#
.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
	# process top half (r0..r3) {a...d}
	vshufps	$0x44, \r1, \r0, \t0	# t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
	vshufps	$0xEE, \r1, \r0, \r0	# r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
	vshufps	$0x44, \r3, \r2, \t1	# t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
	vshufps	$0xEE, \r3, \r2, \r2	# r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
	vshufps	$0xDD, \t1, \t0, \r3	# r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
	vshufps	$0x88, \r2, \r0, \r1	# r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
	vshufps	$0xDD, \r2, \r0, \r0	# r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
	vshufps	$0x88, \t1, \t0, \t0	# t0 = {d4 c4 b4 a4 d0 c0 b0 a0}

	# use r2 in place of t0
	# process bottom half (r4..r7) {e...h}
	vshufps	$0x44, \r5, \r4, \r2	# r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
	vshufps	$0xEE, \r5, \r4, \r4	# r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
	vshufps	$0x44, \r7, \r6, \t1	# t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
	vshufps	$0xEE, \r7, \r6, \r6	# r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
	vshufps	$0xDD, \t1, \r2, \r7	# r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
	vshufps	$0x88, \r6, \r4, \r5	# r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
	vshufps	$0xDD, \r6, \r4, \r4	# r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
	vshufps	$0x88, \t1, \r2, \t1	# t1 = {h4 g4 f4 e4 h0 g0 f0 e0}

	vperm2f128	$0x13, \r1, \r5, \r6	# h6...a6
	vperm2f128	$0x02, \r1, \r5, \r2	# h2...a2
	vperm2f128	$0x13, \r3, \r7, \r5	# h5...a5
	vperm2f128	$0x02, \r3, \r7, \r1	# h1...a1
	vperm2f128	$0x13, \r0, \r4, \r7	# h7...a7
	vperm2f128	$0x02, \r0, \r4, \r3	# h3...a3
	vperm2f128	$0x13, \t0, \t1, \r4	# h4...a4
	vperm2f128	$0x02, \t0, \t1, \r0	# h0...a0
.endm
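
/*
 * For reference only (not assembled): the net effect of TRANSPOSE8 on eight
 * ymm rows of eight 32-bit lanes is a plain 8x8 transpose, i.e. lane i of
 * input row j becomes lane j of output row i.  A scalar C sketch of the same
 * operation:
 *
 *	void transpose8(uint32_t out[8][8], const uint32_t in[8][8])
 *	{
 *		int i, j;
 *
 *		for (i = 0; i < 8; i++)
 *			for (j = 0; j < 8; j++)
 *				out[i][j] = in[j][i];
 *	}
 */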
##
## Magic functions defined in FIPS 180-1
##
# macro MAGIC_F0 F,B,C,D,T	## F = (D ^ (B & (C ^ D)))
.macro MAGIC_F0 regF regB regC regD regT
	vpxor	\regD, \regC, \regF
	vpand	\regB, \regF, \regF
	vpxor	\regD, \regF, \regF
.endm

# macro MAGIC_F1 F,B,C,D,T	## F = (B ^ C ^ D)
.macro MAGIC_F1 regF regB regC regD regT
	vpxor	\regC, \regD, \regF
	vpxor	\regB, \regF, \regF
.endm

# macro MAGIC_F2 F,B,C,D,T	## F = ((B & C) | (B & D) | (C & D))
.macro MAGIC_F2 regF regB regC regD regT
	vpor	\regC, \regB, \regF
	vpand	\regC, \regB, \regT
	vpand	\regD, \regF, \regF
	vpor	\regT, \regF, \regF
.endm

# macro MAGIC_F3 F,B,C,D,T	## F = (B ^ C ^ D)
.macro MAGIC_F3 regF regB regC regD regT
	MAGIC_F1 \regF,\regB,\regC,\regD,\regT
.endm
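
/*
 * For reference only (not assembled): per 32-bit lane, the macros above
 * compute the standard FIPS 180-1 round functions.  A C sketch, assuming
 * uint32_t operands:
 *
 *	uint32_t f0(uint32_t b, uint32_t c, uint32_t d)	// rounds  0-19 (Ch)
 *	{
 *		return d ^ (b & (c ^ d));
 *	}
 *
 *	uint32_t f1(uint32_t b, uint32_t c, uint32_t d)	// rounds 20-39, 60-79
 *	{
 *		return b ^ c ^ d;
 *	}
 *
 *	uint32_t f2(uint32_t b, uint32_t c, uint32_t d)	// rounds 40-59 (Maj)
 *	{
 *		return (b & c) | (b & d) | (c & d);
 *	}
 *
 * MAGIC_F2 uses the equivalent form ((b | c) & d) | (b & c), which needs
 * only four vector instructions.
 */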
# PROLD reg, imm, tmp
# rotate each 32-bit lane of \reg left by \imm (destructive; clobbers \tmp)
.macro PROLD reg imm tmp
	vpsrld	$(32-\imm), \reg, \tmp
	vpslld	$\imm, \reg, \reg
	vpor	\tmp, \reg, \reg
.endm

# PROLD_nd reg, imm, tmp, src
# non-destructive variant: \reg = \src rotated left by \imm (clobbers \tmp)
.macro PROLD_nd reg imm tmp src
	vpsrld	$(32-\imm), \src, \tmp
	vpslld	$\imm, \src, \reg
	vpor	\tmp, \reg, \reg
.endm
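
/*
 * For reference only (not assembled): both PROLD variants implement a 32-bit
 * rotate-left on every lane, i.e. the scalar
 *
 *	static inline uint32_t rol32(uint32_t x, unsigned int n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 * (well defined here since the macros are only used with n = 5 and n = 30).
 */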
.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
	vpaddd	\immCNT, \regE, \regE
	vpaddd	\memW*32(%rsp), \regE, \regE
	PROLD_nd	\regT, 5, \regF, \regA
	vpaddd	\regT, \regE, \regE
	\MAGIC	\regF, \regB, \regC, \regD, \regT
	PROLD	\regB, 30, \regT
	vpaddd	\regF, \regE, \regE
.endm
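
/*
 * For reference only (not assembled): per lane, one SHA1_STEP_00_15 followed
 * by ROTATE_ARGS performs the classic SHA-1 round for i < 16, using rol32()
 * from the sketch above:
 *
 *	e += rol32(a, 5) + f(b, c, d) + K + W[i];
 *	b  = rol32(b, 30);
 *	// then (a, b, c, d, e) <- (e, a, b, c, d) by register renaming
 */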
.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
	vpaddd	\immCNT, \regE, \regE

	offset = ((\memW - 14) & 15) * 32
	vmovdqu	offset(%rsp), W14
	vpxor	W14, W16, W16
	offset = ((\memW - 8) & 15) * 32
	vpxor	offset(%rsp), W16, W16
	offset = ((\memW - 3) & 15) * 32
	vpxor	offset(%rsp), W16, W16

	vpsrld	$(32-1), W16, \regF
	vpslld	$1, W16, W16
	vpor	W16, \regF, \regF

	ROTATE_W

	offset = ((\memW - 0) & 15) * 32
	vmovdqu	\regF, offset(%rsp)
	vpaddd	\regF, \regE, \regE
	PROLD_nd	\regT, 5, \regF, \regA
	vpaddd	\regT, \regE, \regE
	\MAGIC	\regF,\regB,\regC,\regD,\regT	## FUN = MAGIC_Fi(B,C,D)
	PROLD	\regB, 30, \regT
	vpaddd	\regF, \regE, \regE
.endm
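
/*
 * For reference only (not assembled): per lane, SHA1_STEP_16_79 first extends
 * the message schedule in the 16-entry circular buffer kept on the stack and
 * then performs the same round as above:
 *
 *	w = rol32(W[(i-3) & 15] ^ W[(i-8) & 15] ^ W[(i-14) & 15] ^ W[(i-16) & 15], 1);
 *	W[i & 15] = w;
 *	e += rol32(a, 5) + f(b, c, d) + K + w;
 *	b  = rol32(b, 30);
 *
 * The W16/W15/W14 names are rotated each step so that W[i-16] is already in a
 * register, leaving one load (W[i-14]) and two memory-operand xors per step.
 */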
########################################################################
########################################################################
########################################################################

## FRAMESZ plus pushes must be an odd multiple of 8
YMM_SAVE = (15-15)*32
FRAMESZ = 32*16 + YMM_SAVE
_YMM  =   FRAMESZ - YMM_SAVE

#define VMOVPS	vmovups

IDX  = %rax
inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = %rcx
arg1 = %rdi
arg2 = %rsi
RSP_SAVE = %rdx
# ymm0	A
# ymm1	B
# ymm2	C
# ymm3	D
# ymm4	E
# ymm5	F	AA
# ymm6	T0	BB
# ymm7	T1	CC
# ymm8	T2	DD
# ymm9	T3	EE
# ymm10	T4	TMP
# ymm11	T5	FUN
# ymm12	T6	K
# ymm13	T7	W14
# ymm14	T8	W15
# ymm15	T9	W16

A  = %ymm0
B  = %ymm1
C  = %ymm2
D  = %ymm3
E  = %ymm4
F  = %ymm5
T0 = %ymm6
T1 = %ymm7
T2 = %ymm8
T3 = %ymm9
T4 = %ymm10
T5 = %ymm11
T6 = %ymm12
T7 = %ymm13
T8 = %ymm14
T9 = %ymm15

AA  = %ymm5
BB  = %ymm6
CC  = %ymm7
DD  = %ymm8
EE  = %ymm9
TMP = %ymm10
FUN = %ymm11
K   = %ymm12
W14 = %ymm13
W15 = %ymm14
W16 = %ymm15
.macro ROTATE_ARGS
 TMP_ = E
 E = D
 D = C
 C = B
 B = A
 A = TMP_
.endm

.macro ROTATE_W
 TMP_ = W16
 W16  = W15
 W15  = W14
 W14  = TMP_
.endm
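
# ROTATE_ARGS and ROTATE_W rotate the symbolic register names at assembly
# time instead of moving data.  Each round leaves the new working value in
# the register currently named E and rotates B in place; renaming the
# registers (A <- old E, B <- old A, C <- old B, D <- old C, E <- old D)
# then gives the usual SHA-1 variable shuffle with zero register moves.
# Likewise ROTATE_W renames W16/W15/W14 to walk the 16-entry message
# schedule kept on the stack.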
# 8 streams x 5 32bit words per digest x 4 bytes per word
#define DIGEST_SIZE	(8*5*4)

.align 32

# void sha1_x8_avx2(void **input_data, UINT128 *digest, UINT32 size)
# arg 1 : rdi : pointer to args structure holding the transposed digests and
#               an array[8] of pointers to input data
# arg 2 : rsi : size (in blocks) ;; assumed to be >= 1
#
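/*
 * For reference only (not assembled): based on how this routine indexes arg1,
 * the args block defined in sha1_mb_mgr_datastruct.S is used as roughly the
 * following layout (field offsets come from that file, names here are
 * illustrative):
 *
 *	offset 0:          uint32_t digest[5][8]   - digest word i of all 8 lanes
 *	offset _data_ptr:  const u8 *data_ptr[8]   - one input pointer per lane
 */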
ENTRY(sha1_x8_avx2)

	# save callee-saved clobbered registers to comply with C function ABI
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	# save rsp
	mov	%rsp, RSP_SAVE
	sub	$FRAMESZ, %rsp

	# align rsp to 32 Bytes
	and	$~0x1F, %rsp

	## Initialize digests
	vmovdqu	0*32(arg1), A
	vmovdqu	1*32(arg1), B
	vmovdqu	2*32(arg1), C
	vmovdqu	3*32(arg1), D
	vmovdqu	4*32(arg1), E

	## transpose input onto stack
	mov	_data_ptr+0*8(arg1),inp0
	mov	_data_ptr+1*8(arg1),inp1
	mov	_data_ptr+2*8(arg1),inp2
	mov	_data_ptr+3*8(arg1),inp3
	mov	_data_ptr+4*8(arg1),inp4
	mov	_data_ptr+5*8(arg1),inp5
	mov	_data_ptr+6*8(arg1),inp6
	mov	_data_ptr+7*8(arg1),inp7

	xor	IDX, IDX
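
# Main block loop: each iteration consumes one 64-byte block from each of the
# eight lanes - two 32-byte loads per lane, an 8x8 dword transpose, and a byte
# swap - leaving the 16 message words of all lanes interleaved on the stack
# (one 32-byte row per word index), then runs the 80 SHA-1 rounds.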
lloop:
	vmovdqu	PSHUFFLE_BYTE_FLIP_MASK(%rip), F
	I=0
.rep 2
	VMOVPS	(inp0, IDX), T0
	VMOVPS	(inp1, IDX), T1
	VMOVPS	(inp2, IDX), T2
	VMOVPS	(inp3, IDX), T3
	VMOVPS	(inp4, IDX), T4
	VMOVPS	(inp5, IDX), T5
	VMOVPS	(inp6, IDX), T6
	VMOVPS	(inp7, IDX), T7

	TRANSPOSE8	T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
	vpshufb	F, T0, T0
	vmovdqu	T0, (I*8)*32(%rsp)
	vpshufb	F, T1, T1
	vmovdqu	T1, (I*8+1)*32(%rsp)
	vpshufb	F, T2, T2
	vmovdqu	T2, (I*8+2)*32(%rsp)
	vpshufb	F, T3, T3
	vmovdqu	T3, (I*8+3)*32(%rsp)
	vpshufb	F, T4, T4
	vmovdqu	T4, (I*8+4)*32(%rsp)
	vpshufb	F, T5, T5
	vmovdqu	T5, (I*8+5)*32(%rsp)
	vpshufb	F, T6, T6
	vmovdqu	T6, (I*8+6)*32(%rsp)
	vpshufb	F, T7, T7
	vmovdqu	T7, (I*8+7)*32(%rsp)

	add	$32, IDX
	I = (I+1)
.endr
	# save old digests
	vmovdqu	A,AA
	vmovdqu	B,BB
	vmovdqu	C,CC
	vmovdqu	D,DD
	vmovdqu	E,EE

##
## perform 0-79 steps
##
	vmovdqu	K00_19(%rip), K
## do rounds 0...15
	I = 0
.rep 16
	SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 16...19
	vmovdqu	((16 - 16) & 15) * 32 (%rsp), W16
	vmovdqu	((16 - 15) & 15) * 32 (%rsp), W15
.rep 4
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 20...39
	vmovdqu	K20_39(%rip), K
.rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 40...59
	vmovdqu	K40_59(%rip), K
.rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 60...79
	vmovdqu	K60_79(%rip), K
.rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
	ROTATE_ARGS
	I = (I+1)
.endr
	# add the saved digests back in (per-block feed-forward)
	vpaddd	AA,A,A
	vpaddd	BB,B,B
	vpaddd	CC,C,C
	vpaddd	DD,D,D
	vpaddd	EE,E,E

	sub	$1, arg2
	jne	lloop

	# write out digests
	vmovdqu	A, 0*32(arg1)
	vmovdqu	B, 1*32(arg1)
	vmovdqu	C, 2*32(arg1)
	vmovdqu	D, 3*32(arg1)
	vmovdqu	E, 4*32(arg1)

	# update input pointers (IDX = 64 bytes * number of blocks consumed)
	add	IDX, inp0
	add	IDX, inp1
	add	IDX, inp2
	add	IDX, inp3
	add	IDX, inp4
	add	IDX, inp5
	add	IDX, inp6
	add	IDX, inp7
	mov	inp0, _data_ptr + 0*8(arg1)
	mov	inp1, _data_ptr + 1*8(arg1)
	mov	inp2, _data_ptr + 2*8(arg1)
	mov	inp3, _data_ptr + 3*8(arg1)
	mov	inp4, _data_ptr + 4*8(arg1)
	mov	inp5, _data_ptr + 5*8(arg1)
	mov	inp6, _data_ptr + 6*8(arg1)
	mov	inp7, _data_ptr + 7*8(arg1)

	################
	## Postamble

	mov	RSP_SAVE, %rsp

	# restore callee-saved clobbered registers
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12

	ret
ENDPROC(sha1_x8_avx2)
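
# SHA-1 round constants, replicated across all eight 32-bit lanes of a ymm
# register.  The values are the standard FIPS 180-1 constants
# (floor(2^30 * sqrt(2)), sqrt(3), sqrt(5) and sqrt(10), respectively).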
.section .rodata.cst32.K00_19, "aM", @progbits, 32
.align 32
K00_19:
.octa 0x5A8279995A8279995A8279995A827999
.octa 0x5A8279995A8279995A8279995A827999

.section .rodata.cst32.K20_39, "aM", @progbits, 32
.align 32
K20_39:
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1

.section .rodata.cst32.K40_59, "aM", @progbits, 32
.align 32
K40_59:
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC

.section .rodata.cst32.K60_79, "aM", @progbits, 32
.align 32
K60_79:
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6

# byte-swap mask for vpshufb: reverses the bytes within each 32-bit lane,
# converting the big-endian message words to the CPU's little-endian order
.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203