/* sha1-avx-bmi2-amd64.S - Intel AVX/BMI2 accelerated SHA-1 transform function
 * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * Based on sha1.c:
 *  Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Intel SSSE3 accelerated SHA-1 implementation based on white paper:
 *  "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
 *  http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
 */
#ifdef __x86_64__
#include <config.h>

#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
    defined(HAVE_GCC_INLINE_ASM_BMI2) && \
    defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1)

#include "asm-common-amd64.h"

/* Context structure */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16

/* Constants */

SECTION_RODATA

ELF(.type _sha1_avx_bmi2_consts,@object)
_sha1_avx_bmi2_consts:

.align 16
.Lbswap_shufb_ctl:
  .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f

.LK1:  .long 0x5A827999
.LK2:  .long 0x6ED9EBA1
.LK3:  .long 0x8F1BBCDC
.LK4:  .long 0xCA62C1D6
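
/* Note: .Lbswap_shufb_ctl is a vpshufb control mask that byte-reverses each
 * 32-bit word, turning the little-endian message loads into the big-endian
 * word order SHA-1 expects.  .LK1-.LK4 are the SHA-1 round constants for
 * rounds 0-19, 20-39, 40-59 and 60-79; each is broadcast to all four lanes
 * of an xmm register (K1-K4) at run time. */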

/* Register macros */

#define RSTATE %r8
#define RDATA %r9
#define ROLDSTACK %r10
#define RNBLKS %r11

#define a %esi
#define b %edi
#define c %ebp
#define d %edx
#define e %ecx
#define ne %ebx

#define RT0 %eax
#define RT1 %r12d

#define Wtmp0 %xmm0
#define Wtmp1 %xmm1

#define W0 %xmm2
#define W1 %xmm3
#define W2 %xmm4
#define W3 %xmm5
#define W4 %xmm6
#define W5 %xmm7
#define W6 %xmm8
#define W7 %xmm9

#define BSWAP_REG %xmm10

#define K1 %xmm11
#define K2 %xmm12
#define K3 %xmm13
#define K4 %xmm14

/* Round function macros. */

#define WK(i) (((i) & 15) * 4)(%rsp)

#define R_F1(a,b,c,d,e,i) \
  movl c, RT0; \
  andn d, b, RT1; \
  addl WK(i), e; \
  andl b, RT0; \
  rorxl $2, b, b; \
  addl RT1, e; \
  addl ne, a; \
  leal (RT0,e), ne; \
  rorxl $27, a, e;

#define R_F2(a,b,c,d,e,i) \
  movl c, RT0; \
  addl WK(i), e; \
  xorl b, RT0; \
  rorxl $2, b, b; \
  xorl d, RT0; \
  addl ne, a; \
  leal (RT0,e), ne; \
  rorxl $27, a, e;

#define R_F3(a,b,c,d,e,i) \
  movl c, RT0; \
  movl b, RT1; \
  addl WK(i), e; \
  xorl b, RT0; \
  andl c, RT1; \
  andl d, RT0; \
  addl RT1, e; \
  rorxl $2, b, b; \
  addl ne, a; \
  leal (RT0,e), ne; \
  rorxl $27, a, e;

#define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i)

#define R(a,b,c,d,e,f,i) \
  R_##f(a,b,c,d,e,i)
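
/* How the round macros work (informational note):
 *
 * The standard SHA-1 round is e' = rol(a,5) + F(b,c,d) + e + W[i] + K and
 * b' = rol(b,30).  Here the update is split across two rounds: each macro
 * leaves the partial sum F(b,c,d) + e + W[i] + K in `ne` ("next e") and
 * places rol(a,5) (rorxl $27 = rotate left by 5) in the register that plays
 * the `a` role of the following round, whose `addl ne, a` completes the sum.
 * rorxl $2 performs the rol(b,30).
 *
 * F1 (rounds 0-19) computes Ch(b,c,d) as (b & c) + (~b & d): andn provides
 * ~b & d, and since the two terms never share a set bit their sum equals
 * the usual XOR/OR form of Ch.  F2/F4 compute the parity b ^ c ^ d, and F3
 * computes Maj(b,c,d) as (b & c) + ((b ^ c) & d), again a sum of bitwise
 * disjoint terms.  Expressing F as an addition lets it be folded into the
 * running sum with plain addl/leal; rorxl is non-destructive and leaves the
 * flags untouched, which helps scheduling. */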

/* Input expansion macros. */

#define W_PRECALC_00_15_0(i, W, tmp0) \
  vmovdqu (4*(i))(RDATA), tmp0;

#define W_PRECALC_00_15_1(i, W, tmp0) \
  vpshufb BSWAP_REG, tmp0, W;

#define W_PRECALC_00_15_2(i, W, tmp0, K) \
  vpaddd K, W, tmp0;

#define W_PRECALC_00_15_3(i, W, tmp0) \
  vmovdqa tmp0, WK(i&~3);

#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
  vpalignr $8, W_m16, W_m12, W; \
  vpsrldq $4, W_m04, tmp0; \
  vpxor W_m08, W, W;

#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
  vpxor W_m16, tmp0, tmp0; \
  vpxor tmp0, W, W; \
  vpslld $1, W, tmp0; \
  vpslldq $12, W, tmp1; \
  vpsrld $31, W, W;

#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
  vpor W, tmp0, tmp0; \
  vpsrld $30, tmp1, W; \
  vpslld $2, tmp1, tmp1;

#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1, K) \
  vpxor W, tmp0, tmp0; \
  vpxor tmp1, tmp0, W; \
  vpaddd K, W, tmp0; \
  vmovdqa tmp0, WK((i)&~3);

#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
  vpxor W_m28, W, W; \
  vpalignr $8, W_m08, W_m04, tmp0;

#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
  vpxor W_m16, W, W; \
  vpxor tmp0, W, W;

#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
  vpsrld $30, W, tmp0; \
  vpslld $2, W, W;

#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0, K) \
  vpor W, tmp0, W; \
  vpaddd K, W, tmp0; \
  vmovdqa tmp0, WK((i)&~3);
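
/* Message schedule, four words per xmm register (informational note):
 *
 * Rounds 0-15: load 16 message bytes, byte-swap them to big endian, add the
 * round constant and park W[i]+K on the stack at WK(i).
 *
 * Rounds 16-31: W[i] = rol(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1).
 * Computing four W values at once is awkward because W[i-3] falls inside the
 * group being computed; the fourth lane is first produced with that term
 * treated as zero, then patched by XORing in rol-by-2 of lane 0 (the
 * vpslldq $12 / vpsrld $30 / vpslld $2 sequence), as in the Intel white
 * paper referenced above.
 *
 * Rounds 32-79: applying the recurrence to itself gives the equivalent form
 * W[i] = rol(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2), whose inputs all lie
 * at least four words back, so four lanes are produced with no intra-group
 * fix-up. */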

.text

/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * unsigned int
 * _gcry_sha1_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data,
 *                                      size_t nblks)
 */
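/* Note: the unsigned int return value appears to be the number of stack
 * bytes the caller still has to burn; this variant wipes its own stack
 * scratch area before returning (see .Lend below) and therefore returns 0. */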
.globl _gcry_sha1_transform_amd64_avx_bmi2
ELF(.type _gcry_sha1_transform_amd64_avx_bmi2,@function)

.align 16
_gcry_sha1_transform_amd64_avx_bmi2:
  /* input:
   *   %rdi: ctx, CTX
   *   %rsi: data (64*nblks bytes)
   *   %rdx: nblks
   */
  CFI_STARTPROC();

  xorl %eax, %eax;
  cmpq $0, %rdx;
  jz .Lret;

  vzeroupper;

  movq %rdx, RNBLKS;
  movq %rdi, RSTATE;
  movq %rsi, RDATA;
  pushq %rbx;
  CFI_PUSH(%rbx);
  pushq %rbp;
  CFI_PUSH(%rbp);
  pushq %r12;
  CFI_PUSH(%r12);

  movq %rsp, ROLDSTACK;
  CFI_DEF_CFA_REGISTER(ROLDSTACK);

  subq $(16*4), %rsp;
  andq $(~31), %rsp;
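  /* 16*4 = 64 bytes of scratch for the W[i]+K words (WK(0..15)), aligned
   * to 32 bytes so the aligned vmovdqa accesses on it are valid; the
   * original stack pointer is kept in ROLDSTACK. */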

  /* Get the values of the chaining variables. */
  movl state_h0(RSTATE), a;
  movl state_h1(RSTATE), b;
  movl state_h2(RSTATE), c;
  movl state_h3(RSTATE), d;
  movl state_h4(RSTATE), e;
  xorl ne, ne;

  vmovdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG;
  vpbroadcastd .LK1 rRIP, K1;
  vpbroadcastd .LK2 rRIP, K2;
  vpbroadcastd .LK3 rRIP, K3;
  vpbroadcastd .LK4 rRIP, K4;
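  /* `ne` starts at zero because the first round's `addl ne, a` has no
   * pending partial sum to fold in; the broadcast K1-K4 registers let
   * vpaddd add the round constant to four message words at once. */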

  /* Precalc 0-15. */
  W_PRECALC_00_15_0(0, W0, Wtmp0);
  W_PRECALC_00_15_1(1, W0, Wtmp0);
  W_PRECALC_00_15_2(2, W0, Wtmp0, K1);
  W_PRECALC_00_15_3(3, W0, Wtmp0);
  W_PRECALC_00_15_0(4, W7, Wtmp0);
  W_PRECALC_00_15_1(5, W7, Wtmp0);
  W_PRECALC_00_15_2(6, W7, Wtmp0, K1);
  W_PRECALC_00_15_3(7, W7, Wtmp0);
  W_PRECALC_00_15_0(8, W6, Wtmp0);
  W_PRECALC_00_15_1(9, W6, Wtmp0);
  W_PRECALC_00_15_2(10, W6, Wtmp0, K1);
  W_PRECALC_00_15_3(11, W6, Wtmp0);
  W_PRECALC_00_15_0(12, W5, Wtmp0);
  W_PRECALC_00_15_1(13, W5, Wtmp0);
  W_PRECALC_00_15_2(14, W5, Wtmp0, K1);
  W_PRECALC_00_15_3(15, W5, Wtmp0);

.align 8
.Loop:
  addq $64, RDATA;

  /* Transform 0-15 + Precalc 16-31. */
  R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
  R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
  R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
  R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1);
  R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
  R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
  R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
  R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2);
  R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
  R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
  R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
  R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2);
  R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
  R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
  R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
  R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2);

  /* Transform 16-63 + Precalc 32-79. */
  R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
  R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
  R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
  R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K2);
  R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
  R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
  R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
  R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K2);
  R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
  R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
  R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
  R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K3);
  R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
  R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
  R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
  R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K3);
  R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
  R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
  R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
  R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0, K3);
  R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
  R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
  R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
  R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0, K3);
  R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
  R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
  R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
  R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0, K3);
  R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
  R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
  R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
  R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0, K4);
  R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
  R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
  R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
  R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K4);
  R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
  R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
  R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
  R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K4);
  R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
  R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
  R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
  R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K4);
  R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
  R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
  R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
  R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K4);

  decq RNBLKS;
  jz .Lend;

  /* Transform 64-79 + Precalc 0-15 of next block. */
  R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0);
  R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0);
  R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0, K1);
  R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0);
  R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0);
  R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0);
  R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0, K1);
  R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0);
  R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0);
  R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0);
  R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0, K1);
  R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0);
  R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0);
  R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0);
  R( c, d, e, a, b, F4, 78 );
  addl state_h0(RSTATE), a;   W_PRECALC_00_15_2(14, W5, Wtmp0, K1);
  R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0);
  addl ne, a;
  xorl ne, ne;
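  /* The h0 addition above is interleaved with the final rounds; `a` then
   * still owes the partial sum left in `ne` by round 79, so it is added
   * here and `ne` is reset for the next block. */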

  /* Update the chaining variables. */
  addl state_h3(RSTATE), d;
  addl state_h2(RSTATE), c;
  addl state_h1(RSTATE), b;
  addl state_h4(RSTATE), e;

  movl d, state_h3(RSTATE);
  movl c, state_h2(RSTATE);
  movl b, state_h1(RSTATE);
  movl a, state_h0(RSTATE);
  movl e, state_h4(RSTATE);

  jmp .Loop;

.align 16
.Lend:
  vzeroall;

  /* Transform 64-79 + burn stack */
  R( b, c, d, e, a, F4, 64 );
  R( a, b, c, d, e, F4, 65 );
  R( e, a, b, c, d, F4, 66 );
  R( d, e, a, b, c, F4, 67 );
  R( c, d, e, a, b, F4, 68 );
  R( b, c, d, e, a, F4, 69 );
  R( a, b, c, d, e, F4, 70 );
  R( e, a, b, c, d, F4, 71 );
  R( d, e, a, b, c, F4, 72 );
  R( c, d, e, a, b, F4, 73 );
  R( b, c, d, e, a, F4, 74 );
  R( a, b, c, d, e, F4, 75 );
  R( e, a, b, c, d, F4, 76 ); vmovdqa %xmm0, (0*16)(%rsp);
  R( d, e, a, b, c, F4, 77 ); vmovdqa %xmm0, (1*16)(%rsp);
  R( c, d, e, a, b, F4, 78 ); vmovdqa %xmm0, (2*16)(%rsp);
  addl state_h0(RSTATE), a;
  R( b, c, d, e, a, F4, 79 );
  addl ne, a;
  xorl ne, ne;

  /* 16*4/16-1 = 3 */
  vmovdqa %xmm0, (3*16)(%rsp);
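  /* vzeroall above zeroed all vector registers, so the four 16-byte stores
   * of %xmm0 wipe the entire 16*4-byte WK area on the stack ("16*4/16-1 = 3"
   * is the index of its last 16-byte chunk); no message-derived data is left
   * behind, consistent with returning 0 as the burn-stack amount below. */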

  /* Update the chaining variables. */
  addl state_h3(RSTATE), d;
  addl state_h2(RSTATE), c;
  addl state_h1(RSTATE), b;
  addl state_h4(RSTATE), e;

  movl d, state_h3(RSTATE);
  movl c, state_h2(RSTATE);
  movl b, state_h1(RSTATE);
  movl a, state_h0(RSTATE);
  movl e, state_h4(RSTATE);

  movq ROLDSTACK, %rsp;
  CFI_REGISTER(ROLDSTACK, %rsp);
  CFI_DEF_CFA_REGISTER(%rsp);

  popq %r12;
  CFI_POP(%r12);
  popq %rbp;
  CFI_POP(%rbp);
  popq %rbx;
  CFI_POP(%rbx);

  /* stack already burned */
  xorl %eax, %eax;

.Lret:
  ret_spec_stop;
  CFI_ENDPROC();
ELF(.size _gcry_sha1_transform_amd64_avx_bmi2,
    .-_gcry_sha1_transform_amd64_avx_bmi2;)

#endif
#endif