sha1_ssse3_asm.S 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558
  1. /*
  2. * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
  3. * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
  4. * processors. CPUs supporting Intel(R) AVX extensions will get an additional
  5. * boost.
  6. *
  7. * This work was inspired by the vectorized implementation of Dean Gaudet.
  8. * Additional information on it can be found at:
  9. * http://www.arctic.org/~dean/crypto/sha1.html
  10. *
  11. * It was improved upon with more efficient vectorization of the message
  12. * scheduling. This implementation has also been optimized for all current and
  13. * several future generations of Intel CPUs.
  14. *
  15. * See this article for more information about the implementation details:
  16. * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
  17. *
  18. * Copyright (C) 2010, Intel Corp.
  19. * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
  20. * Ronen Zohar <ronen.zohar@intel.com>
  21. *
  22. * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
  23. * Author: Mathias Krause <minipli@googlemail.com>
  24. *
  25. * This program is free software; you can redistribute it and/or modify
  26. * it under the terms of the GNU General Public License as published by
  27. * the Free Software Foundation; either version 2 of the License, or
  28. * (at your option) any later version.
  29. */
  30. #include <linux/linkage.h>
  /* Incoming arguments (first three integer argument registers). */
  #define CTX             %rdi    /* arg1 */
  #define BUF             %rsi    /* arg2 */
  #define CNT             %rdx    /* arg3 */

  /*
   * Scalar state. REG_B, REG_C and REG_E are the low halves of the argument
   * registers above, so the arguments are copied to HASH_PTR, BUFFER_PTR and
   * BUFFER_END before the working variables are loaded.
   */
  #define REG_A           %ecx
  #define REG_B           %esi
  #define REG_C           %edi
  #define REG_D           %r12d
  #define REG_E           %edx

  #define REG_T1          %eax
  #define REG_T2          %ebx

  /* Pointers that stay live for the whole function. */
  #define K_BASE          %r8
  #define HASH_PTR        %r9
  #define BUFFER_PTR      %r10
  #define BUFFER_END      %r11

  /* Vector temporaries. */
  #define W_TMP1          %xmm0
  #define W_TMP2          %xmm9

  /* Eight vector registers holding the most recent 32 w[i] values. */
  #define W0              %xmm1
  #define W4              %xmm2
  #define W8              %xmm3
  #define W12             %xmm4
  #define W16             %xmm5
  #define W20             %xmm6
  #define W24             %xmm7
  #define W28             %xmm8

  /* Shuffle control used to byte-swap each dword of the input. */
  #define XMM_SHUFB_BSWAP %xmm10

  /*
   * WK(t) addresses the pre-calculated w[t]+K value for round t. The 16
   * four-byte slots (64 bytes at %rsp) are reused as a circular buffer.
   */
  #define WK(t)           (((t) & 15) * 4)(%rsp)

  /* Number of rounds by which the w[i]+K pre-calculation runs ahead. */
  #define W_PRECALC_AHEAD 16
  /*
   * SHA1_VECTOR_ASM - emit a complete SHA-1 transform function.
   *
   * param name: symbol name of the function to generate
   *
   * Register arguments of the generated function:
   *   CTX - pointer to the five 32-bit hash words, updated in place
   *   BUF - pointer to the input data
   *   CNT - number of 64-byte blocks to process
   *
   * A CNT of zero returns immediately without touching CTX or BUF. The main
   * body only tests for the end of the input after a block has been
   * processed, so it must never be entered with nothing to hash.
   *
   * NOTE(review): CNT is used as a full 64-bit value. Callers that declare
   * the count as a 32-bit integer rely on the upper half of the register
   * being zero - confirm against the C prototype.
   *
   * A 64-byte, 16-byte aligned workspace for the w[i]+K values is carved out
   * of the stack and zeroed again before returning. %rbx, %r12 and %rbp are
   * saved and restored; %rbp keeps the pre-alignment stack pointer.
   *
   * xmm_mov and the W_PRECALC_* macros in effect at the point of invocation
   * select the instruction flavour of the generated code.
   */
  .macro SHA1_VECTOR_ASM name
  ENTRY(\name)
      test    CNT, CNT                /* no blocks: nothing to do */
      jz      .Ldone_\name

      push    %rbx
      push    %r12
      push    %rbp
      mov     %rsp, %rbp

      sub     $64, %rsp               /* allocate workspace */
      and     $~15, %rsp              /* align stack */

      mov     CTX, HASH_PTR
      mov     BUF, BUFFER_PTR

      shl     $6, CNT                 /* multiply by 64 */
      add     BUF, CNT
      mov     CNT, BUFFER_END         /* first byte past the input */

      lea     K_XMM_AR(%rip), K_BASE
      xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

      SHA1_PIPELINED_MAIN_BODY

      /* cleanup workspace: 8 quadwords = 64 bytes of zeroes */
      mov     $8, %ecx
      mov     %rsp, %rdi
      xor     %eax, %eax
      rep stosq

      mov     %rbp, %rsp              /* deallocate workspace */
      pop     %rbp
      pop     %r12
      pop     %rbx
  .Ldone_\name:
      ret
  ENDPROC(\name)
  .endm
  /*
   * RR_X5 - ten SHA-1 rounds: five RR pairs with the working-variable names
   * rotated the way consecutive RR invocations require.
   *
   * param F:     round function macro (F1..F4)
   * param round: index of the first of the ten rounds
   */
  .macro RR_X5 F, round
      RR \F,A,B,C,D,E,\round
      RR \F,D,E,A,B,C,(\round+2)
      RR \F,B,C,D,E,A,(\round+4)
      RR \F,E,A,B,C,D,(\round+6)
      RR \F,C,D,E,A,B,(\round+8)
  .endm

  /*
   * SHA1_PIPELINED_MAIN_BODY - the block loop: 80 rounds per 64-byte block.
   *
   * Loads the five hash words from HASH_PTR, issues the w[i]+K pre-calculation
   * for the first W_PRECALC_AHEAD rounds, then loops over the input. The
   * pre-calculation for the first rounds of the following block is emitted
   * while the last rounds of the current block execute.
   *
   * After round 59 BUFFER_PTR advances by one block. Once it reaches
   * BUFFER_END it is replaced by K_BASE, so the look-ahead loads read the
   * constant table instead of running past the input; K_BASE also serves as
   * the loop-exit marker. The end test sits at the bottom of the loop, so at
   * least one block is always consumed.
   */
  .macro SHA1_PIPELINED_MAIN_BODY
      INIT_REGALLOC

      mov     (HASH_PTR), A
      mov     4(HASH_PTR), B
      mov     8(HASH_PTR), C
      mov     12(HASH_PTR), D
      mov     16(HASH_PTR), E

      .set i, 0
      .rept W_PRECALC_AHEAD
          W_PRECALC i
          .set i, (i+1)
      .endr

      .align 4
  1:
      RR_X5 F1, 0
      RR_X5 F1, 10

      RR_X5 F2, 20
      RR_X5 F2, 30

      RR_X5 F3, 40
      RR_X5 F3, 50

      add     $64, BUFFER_PTR             /* move to the next 64-byte block */
      cmp     BUFFER_END, BUFFER_PTR      /* if the current is the last one use */
      cmovae  K_BASE, BUFFER_PTR          /* dummy source to avoid buffer overrun */

      RR_X5 F4, 60
      RR_X5 F4, 70

      UPDATE_HASH   (HASH_PTR), A
      UPDATE_HASH  4(HASH_PTR), B
      UPDATE_HASH  8(HASH_PTR), C
      UPDATE_HASH 12(HASH_PTR), D
      UPDATE_HASH 16(HASH_PTR), E

      RESTORE_RENAMED_REGS
      cmp     K_BASE, BUFFER_PTR          /* K_BASE means, we reached the end */
      jne     1b
  .endm
  /*
   * INIT_REGALLOC - bind the symbolic names A..E, T1 and T2 to their initial
   * physical registers. The round macros later re-bind these names instead
   * of moving values between registers.
   */
  .macro INIT_REGALLOC
      /* working variables */
      .set A,  REG_A
      .set B,  REG_B
      .set C,  REG_C
      .set D,  REG_D
      .set E,  REG_E
      /* temporaries */
      .set T1, REG_T1
      .set T2, REG_T2
  .endm
  /*
   * RESTORE_RENAMED_REGS - copy the values currently named B, D, A and E back
   * into the registers those names started out with (REG_B, REG_D, REG_A,
   * REG_E). No move is emitted for C.
   *
   * order is important (REG_C is where it should be)
   * NOTE(review): presumably the sequence is chosen so that no source
   * register is overwritten before it has been read - keep it as is.
   */
  .macro RESTORE_RENAMED_REGS
      mov     B, REG_B
      mov     D, REG_D
      mov     A, REG_A
      mov     E, REG_E
  .endm
  /*
   * SWAP_REG_NAMES - exchange the registers that two symbolic names refer to.
   * Purely an assembly-time operation: no instruction is emitted.
   *
   * param a, b: the symbol names to swap
   */
  .macro SWAP_REG_NAMES a, b
      .set _SWAP_TMP, \a
      .set \a, \b
      .set \b, _SWAP_TMP
  .endm
  /*
   * F1 - T1 = ((c ^ d) & b) ^ d
   *
   * The value of c is first copied to the T1 register and the names c and T1
   * are swapped, so afterwards c names the untouched copy while T1 names the
   * register holding the result.
   */
  .macro F1 b, c, d
      mov     \c, T1
      SWAP_REG_NAMES \c, T1
      xor     \d, T1
      and     \b, T1
      xor     \d, T1
  .endm
  /*
   * F2 - T1 = b ^ c ^ d
   *
   * The value of d is first copied to the T1 register and the names d and T1
   * are swapped, so afterwards d names the untouched copy while T1 names the
   * register holding the result.
   */
  .macro F2 b, c, d
      mov     \d, T1
      SWAP_REG_NAMES \d, T1
      xor     \c, T1
      xor     \b, T1
  .endm
  /*
   * F3 - T1 = ((b | c) & d) | (b & c)
   *
   * The value of c is copied to the T1 register and the names c and T1 are
   * swapped before the computation. T2 is used as a second temporary.
   */
  .macro F3 b, c ,d
      mov     \c, T1
      SWAP_REG_NAMES \c, T1
      mov     \b, T2
      or      \b, T1                  /* T1 = b | c */
      and     \c, T2                  /* T2 = b & c */
      and     \d, T1                  /* T1 = (b | c) & d */
      or      T2, T1
  .endm
  /* F4 - same function as F2: T1 = b ^ c ^ d */
  .macro F4 b, c, d
      F2 \b, \c, \d
  .endm
  /*
   * UPDATE_HASH - add a stored hash word to a working variable and write the
   * sum back: val += hash; hash = val
   *
   * param hash: memory operand of the hash word
   * param val:  register holding the working variable
   */
  .macro UPDATE_HASH hash, val
      add     \hash, \val
      mov     \val, \hash
  .endm
  /*
   * RR - two SHA-1 rounds back to back, interleaved with two W_PRECALC steps
   * for the rounds W_PRECALC_AHEAD ahead.
   *
   * param F:       round function macro (F1..F4); leaves its result in T1
   * param a..e:    names of the working variables
   * param round:   index of the first of the two rounds
   *
   * In pseudo code (rol = rotate left), in order of effect:
   *   e += wk[round]     + F(b, c, d);   b = rol(b, 30);
   *   d += wk[round + 1] + F(a, b, c);
   *   e += rol(a, 5);                    a = rol(a, 30);
   *   d += rol(e, 5);
   *
   * a is rotated by 5 for the addition and then rotated right by 7, which
   * leaves it rotated left by 30 in total. The rotate of e is done on a
   * register whose name is swapped with T1, so e itself stays unrotated.
   */
  .macro RR F, a, b, c, d, e, round
      add     WK(\round), \e
      \F      \b, \c, \d              /* t1 = F(b, c, d); */
      W_PRECALC (\round + W_PRECALC_AHEAD)
      rol     $30, \b
      add     T1, \e
      add     WK(\round + 1), \d

      \F      \a, \b, \c
      W_PRECALC (\round + W_PRECALC_AHEAD + 1)
      rol     $5, \a
      add     \a, \e
      add     T1, \d
      ror     $7, \a                  /* (a <<r 5) >>r 7) => a <<r 30) */

      mov     \e, T1
      SWAP_REG_NAMES \e, T1

      rol     $5, T1
      add     T1, \d

      /*
       * write: a, b
       * rotate: a<=d, b<=e, c<=a, d<=b, e<=c
       */
  .endm
  /*
   * W_PRECALC - emit one step of the w[i]+K pre-calculation for round r.
   *
   * Sets the assembly-time symbols i (round index) and K_XMM (byte offset of
   * the round constant row inside the table at K_BASE: 0, 16, 32 or 48 for
   * each band of 20 rounds), then dispatches on i:
   *   r < 16, or 80 <= r < 80 + W_PRECALC_AHEAD:
   *              i is reduced modulo 80 (the values past 79 belong to the
   *              next block); at i == 0 the W register naming is reset;
   *              then W_PRECALC_00_15
   *   i < 32:    W_PRECALC_16_31
   *   i < 80:    W_PRECALC_32_79
   * For r >= 80, K_XMM is left as it was.
   */
  .macro W_PRECALC r
      .set i, \r

      .if (i < 20)
          .set K_XMM, 0
      .elseif (i < 40)
          .set K_XMM, 16
      .elseif (i < 60)
          .set K_XMM, 32
      .elseif (i < 80)
          .set K_XMM, 48
      .endif

      .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
          .set i, ((\r) % 80)         /* pre-compute for the next iteration */
          .if (i == 0)
              W_PRECALC_RESET
          .endif
          W_PRECALC_00_15
      .elseif (i<32)
          W_PRECALC_16_31
      .elseif (i < 80)                /* rounds 32-79 */
          W_PRECALC_32_79
      .endif
  .endm
  /*
   * W_PRECALC_RESET - bind the sliding-window names W and W_minus_04 ..
   * W_minus_32 to their initial vector registers. W_minus_32 starts out as
   * an alias of W.
   */
  .macro W_PRECALC_RESET
      .set W,          W0
      .set W_minus_04, W4
      .set W_minus_08, W8
      .set W_minus_12, W12
      .set W_minus_16, W16
      .set W_minus_20, W20
      .set W_minus_24, W24
      .set W_minus_28, W28
      .set W_minus_32, W
  .endm
  /*
   * W_PRECALC_ROTATE - slide the window by four w[] values: every W_minus_NN
   * name takes over the register of its newer neighbour, and W takes over the
   * register that held the oldest vector. Assembly-time only; the statement
   * order matters because each .set reads a name the next one overwrites.
   */
  .macro W_PRECALC_ROTATE
      .set W_minus_32, W_minus_28
      .set W_minus_28, W_minus_24
      .set W_minus_24, W_minus_20
      .set W_minus_20, W_minus_16
      .set W_minus_16, W_minus_12
      .set W_minus_12, W_minus_08
      .set W_minus_08, W_minus_04
      .set W_minus_04, W
      .set W,          W_minus_32
  .endm
  /*
   * W_PRECALC_SSSE3 - when invoked, defines W_PRECALC_00_15, W_PRECALC_16_31
   * and W_PRECALC_32_79 as wrappers around the SSE-encoded implementations
   * defined alongside them.
   *
   * Every implementation is spread over four consecutive values of i: the
   * phase (i & 3) selects which slice of one 4-wide vector computation is
   * emitted, and the last phase stores w+K into the WK slots and rotates the
   * W register names.
   */
  .macro W_PRECALC_SSSE3

  .macro W_PRECALC_00_15
      W_PRECALC_00_15_SSSE3
  .endm

  .macro W_PRECALC_16_31
      W_PRECALC_16_31_SSSE3
  .endm

  .macro W_PRECALC_32_79
      W_PRECALC_32_79_SSSE3
  .endm

  /*
   * message scheduling pre-compute for rounds 0-15:
   * load 16 input bytes, byte-swap each dword, add the round constants
   */
  .macro W_PRECALC_00_15_SSSE3
      .if ((i & 3) == 0)
          movdqu  (i*4)(BUFFER_PTR), W_TMP1
      .elseif ((i & 3) == 1)
          pshufb  XMM_SHUFB_BSWAP, W_TMP1
          movdqa  W_TMP1, W
      .elseif ((i & 3) == 2)
          paddd   (K_BASE), W_TMP1
      .elseif ((i & 3) == 3)
          movdqa  W_TMP1, WK(i&~3)
          W_PRECALC_ROTATE
      .endif
  .endm

  /*
   * message scheduling pre-compute for rounds 16-31
   *
   * - calculating last 32 w[i] values in 8 XMM registers
   * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
   *   instruction
   *
   * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3]
   * dependency, but improves for 32-79
   *
   * blended scheduling of vector and scalar instruction streams, one 4-wide
   * vector iteration / 4 scalar rounds
   */
  .macro W_PRECALC_16_31_SSSE3
      .if ((i & 3) == 0)
          movdqa  W_minus_12, W
          palignr $8, W_minus_16, W       /* w[i-14] */
          movdqa  W_minus_04, W_TMP1
          psrldq  $4, W_TMP1              /* w[i-3] */
          pxor    W_minus_08, W
      .elseif ((i & 3) == 1)
          pxor    W_minus_16, W_TMP1
          pxor    W_TMP1, W
          movdqa  W, W_TMP2
          movdqa  W, W_TMP1
          pslldq  $12, W_TMP2
      .elseif ((i & 3) == 2)
          psrld   $31, W
          pslld   $1, W_TMP1
          por     W, W_TMP1
          movdqa  W_TMP2, W
          psrld   $30, W_TMP2
          pslld   $2, W
      .elseif ((i & 3) == 3)
          pxor    W, W_TMP1
          pxor    W_TMP2, W_TMP1
          movdqa  W_TMP1, W
          paddd   K_XMM(K_BASE), W_TMP1
          movdqa  W_TMP1, WK(i&~3)
          W_PRECALC_ROTATE
      .endif
  .endm

  /*
   * message scheduling pre-compute for rounds 32-79
   *
   * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
   * instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
   * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
   */
  .macro W_PRECALC_32_79_SSSE3
      .if ((i & 3) == 0)
          movdqa  W_minus_04, W_TMP1
          pxor    W_minus_28, W           /* W is W_minus_32 before xor */
          palignr $8, W_minus_08, W_TMP1
      .elseif ((i & 3) == 1)
          pxor    W_minus_16, W
          pxor    W_TMP1, W
          movdqa  W, W_TMP1
      .elseif ((i & 3) == 2)
          psrld   $30, W
          pslld   $2, W_TMP1
          por     W, W_TMP1
      .elseif ((i & 3) == 3)
          movdqa  W_TMP1, W
          paddd   K_XMM(K_BASE), W_TMP1
          movdqa  W_TMP1, WK(i&~3)
          W_PRECALC_ROTATE
      .endif
  .endm

  .endm           /* W_PRECALC_SSSE3 */
  /* The four SHA-1 round constants, one per band of 20 rounds. */
  #define K1      0x5a827999
  #define K2      0x6ed9eba1
  #define K3      0x8f1bbcdc
  #define K4      0xca62c1d6

  .section .rodata
  .align 16

  /*
   * Round constants, each replicated across one 16-byte row so a single
   * vector add applies it to four w[] values. K_XMM selects the row.
   */
  K_XMM_AR:
      .long   K1, K1, K1, K1
      .long   K2, K2, K2, K2
      .long   K3, K3, K3, K3
      .long   K4, K4, K4, K4

  /*
   * Byte-shuffle control that reverses the byte order within each dword.
   * Written out byte by byte; identical to the little-endian dwords
   * 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f.
   */
  BSWAP_SHUFB_CTL:
      .byte    3,  2,  1,  0
      .byte    7,  6,  5,  4
      .byte   11, 10,  9,  8
      .byte   15, 14, 13, 12
  .section .text

  /* Install the SSE-encoded message-schedule macros. */
  W_PRECALC_SSSE3

  /* xmm_mov - unaligned 16-byte move, SSE encoding. */
  .macro xmm_mov a, b
      movdqu  \a,\b
  .endm

  /*
   * SSSE3 optimized implementation:
   *  extern "C" void sha1_transform_ssse3(u32 *digest, const char *data,
   *                                       unsigned int rounds);
   *
   * digest: the five 32-bit hash words, read and updated in place
   * data:   the input, a whole number of 64-byte blocks
   * rounds: number of 64-byte blocks in data
   *
   * The function takes exactly these three arguments (CTX, BUF, CNT); there
   * is no separate workspace pointer - the workspace is on the stack.
   */
  SHA1_VECTOR_ASM     sha1_transform_ssse3
  403. #ifdef CONFIG_AS_AVX
  /*
   * W_PRECALC_AVX - when invoked, replaces W_PRECALC_00_15, W_PRECALC_16_31
   * and W_PRECALC_32_79 (which must already exist, they are purged first)
   * with wrappers around the VEX-encoded implementations defined alongside
   * them.
   *
   * As in the SSE flavour, each implementation is spread over four
   * consecutive values of i, the phase (i & 3) selecting the slice to emit.
   * The three-operand forms remove the register copies the SSE code needs.
   */
  .macro W_PRECALC_AVX

  .purgem W_PRECALC_00_15
  .macro W_PRECALC_00_15
      W_PRECALC_00_15_AVX
  .endm

  .purgem W_PRECALC_16_31
  .macro W_PRECALC_16_31
      W_PRECALC_16_31_AVX
  .endm

  .purgem W_PRECALC_32_79
  .macro W_PRECALC_32_79
      W_PRECALC_32_79_AVX
  .endm

  /*
   * rounds 0-15: load 16 input bytes, byte-swap each dword into W, add the
   * round constants, store w+K
   */
  .macro W_PRECALC_00_15_AVX
      .if ((i & 3) == 0)
          vmovdqu (i*4)(BUFFER_PTR), W_TMP1
      .elseif ((i & 3) == 1)
          vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
      .elseif ((i & 3) == 2)
          vpaddd  (K_BASE), W, W_TMP1
      .elseif ((i & 3) == 3)
          vmovdqa W_TMP1, WK(i&~3)
          W_PRECALC_ROTATE
      .endif
  .endm

  /* rounds 16-31: same computation as the SSE flavour, VEX encoded */
  .macro W_PRECALC_16_31_AVX
      .if ((i & 3) == 0)
          vpalignr $8, W_minus_16, W_minus_12, W  /* w[i-14] */
          vpsrldq $4, W_minus_04, W_TMP1          /* w[i-3] */
          vpxor   W_minus_08, W, W
          vpxor   W_minus_16, W_TMP1, W_TMP1
      .elseif ((i & 3) == 1)
          vpxor   W_TMP1, W, W
          vpslldq $12, W, W_TMP2
          vpslld  $1, W, W_TMP1
      .elseif ((i & 3) == 2)
          vpsrld  $31, W, W
          vpor    W, W_TMP1, W_TMP1
          vpslld  $2, W_TMP2, W
          vpsrld  $30, W_TMP2, W_TMP2
      .elseif ((i & 3) == 3)
          vpxor   W, W_TMP1, W_TMP1
          vpxor   W_TMP2, W_TMP1, W
          vpaddd  K_XMM(K_BASE), W, W_TMP1
          vmovdqu W_TMP1, WK(i&~3)
          W_PRECALC_ROTATE
      .endif
  .endm

  /* rounds 32-79: xor of four older vectors, rotated left by 2 */
  .macro W_PRECALC_32_79_AVX
      .if ((i & 3) == 0)
          vpalignr $8, W_minus_08, W_minus_04, W_TMP1
          vpxor   W_minus_28, W, W        /* W is W_minus_32 before xor */
      .elseif ((i & 3) == 1)
          vpxor   W_minus_16, W_TMP1, W_TMP1
          vpxor   W_TMP1, W, W
      .elseif ((i & 3) == 2)
          vpslld  $2, W, W_TMP1
          vpsrld  $30, W, W
          vpor    W, W_TMP1, W
      .elseif ((i & 3) == 3)
          vpaddd  K_XMM(K_BASE), W, W_TMP1
          vmovdqu W_TMP1, WK(i&~3)
          W_PRECALC_ROTATE
      .endif
  .endm

  .endm           /* W_PRECALC_AVX */
  /* Switch the message-schedule macros to their VEX-encoded versions. */
  W_PRECALC_AVX

  /* xmm_mov - unaligned 16-byte move, VEX encoding (replaces the SSE one). */
  .purgem xmm_mov
  .macro xmm_mov a, b
      vmovdqu \a,\b
  .endm

  /*
   * AVX optimized implementation:
   *  extern "C" void sha1_transform_avx(u32 *digest, const char *data,
   *                                     unsigned int rounds);
   *
   * digest: the five 32-bit hash words, read and updated in place
   * data:   the input, a whole number of 64-byte blocks
   * rounds: number of 64-byte blocks in data
   *
   * The function takes exactly these three arguments (CTX, BUF, CNT); there
   * is no separate workspace pointer - the workspace is on the stack.
   */
  SHA1_VECTOR_ASM     sha1_transform_avx
  480. #endif