sha1-ssse3-amd64.S

/* sha1-ssse3-amd64.S - Intel SSSE3 accelerated SHA-1 transform function
 * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * Based on sha1.c:
 *  Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Intel SSSE3 accelerated SHA-1 implementation based on white paper:
 *  "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
 *  http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
 */
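
/*
 * Reference recurrence (informal sketch for readers of this file; see
 * FIPS 180 for the authoritative definition).  Per 64-byte block:
 *
 *   W[t] = M[t]                                          for t = 0..15
 *   W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)   for t = 16..79
 *
 *   tmp = rol(a, 5) + f_t(b, c, d) + e + K_t + W[t];
 *   e = d; d = c; c = rol(b, 30); b = a; a = tmp;
 *
 * The code below keeps this recurrence but computes four W[t] values at a
 * time in XMM registers, as described in the white paper referenced above.
 */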
#ifdef __x86_64__
#include <config.h>

#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
    defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1)

#include "asm-common-amd64.h"

/* Context structure */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16

/* Constants */

SECTION_RODATA

ELF(.type _sha1_ssse3_consts,@object)
_sha1_ssse3_consts:

#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6

.align 16
.LK_XMM:
.LK1:   .long K1, K1, K1, K1
.LK2:   .long K2, K2, K2, K2
.LK3:   .long K3, K3, K3, K3
.LK4:   .long K4, K4, K4, K4

.Lbswap_shufb_ctl:
        .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
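
/* K1..K4 are the standard SHA-1 round constants for rounds 0-19, 20-39,
 * 40-59 and 60-79.  Each constant is replicated into all four 32-bit lanes
 * so that a single paddd adds it to four schedule words at once.
 * .Lbswap_shufb_ctl is the pshufb control mask that byte-swaps each 32-bit
 * word, converting the big-endian message words to little-endian. */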
/* Register macros */

#define RSTATE    %r8
#define RDATA     %r9
#define ROLDSTACK %r10
#define RNBLKS    %r11

#define a %eax
#define b %ebx
#define c %ecx
#define d %edx
#define e %edi

#define RT0 %esi
#define RT1 %ebp

#define Wtmp0 %xmm0
#define Wtmp1 %xmm1

#define W0 %xmm2
#define W1 %xmm3
#define W2 %xmm4
#define W3 %xmm5
#define W4 %xmm6
#define W5 %xmm7
#define W6 %xmm8
#define W7 %xmm9

#define BSWAP_REG %xmm10

/* Round function macros. */

#define WK(i) (((i) & 15) * 4)(%rsp)

#define R_F1(a,b,c,d,e,i) \
        movl c, RT0; \
        addl WK(i), e; \
        xorl d, RT0; \
        movl a, RT1; \
        andl b, RT0; \
        roll $30, b; \
        xorl d, RT0; \
        leal (RT0,e), e; \
        roll $5, RT1; \
        addl RT1, e;

#define R_F2(a,b,c,d,e,i) \
        movl c, RT0; \
        addl WK(i), e; \
        xorl b, RT0; \
        roll $30, b; \
        xorl d, RT0; \
        movl a, RT1; \
        leal (RT0,e), e; \
        roll $5, RT1; \
        addl RT1, e;

#define R_F3(a,b,c,d,e,i) \
        movl c, RT0; \
        movl b, RT1; \
        xorl b, RT0; \
        andl c, RT1; \
        andl d, RT0; \
        addl RT1, e; \
        addl WK(i), e; \
        roll $30, b; \
        movl a, RT1; \
        leal (RT0,e), e; \
        roll $5, RT1; \
        addl RT1, e;

#define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i)

#define R(a,b,c,d,e,f,i) \
        R_##f(a,b,c,d,e,i)
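
/* Each R_F* macro performs one SHA-1 round: e += rol(a, 5) + f(b, c, d)
 * + WK(i), then rotates b left by 30 (WK(i) already holds W[i] + K).
 * The boolean functions are computed in reduced form:
 *   F1 (rounds  0-19): d ^ (b & (c ^ d))        == (b & c) | (~b & d)
 *   F2 (rounds 20-39): b ^ c ^ d
 *   F3 (rounds 40-59): (b & c) + (d & (b ^ c))  == majority(b, c, d)
 *   F4 (rounds 60-79): same as F2
 */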
/* Input expansion macros. */

#define W_PRECALC_00_15_0(i, W, tmp0) \
        movdqu (4*(i))(RDATA), tmp0;

#define W_PRECALC_00_15_1(i, W, tmp0) \
        pshufb BSWAP_REG, tmp0; \
        movdqa tmp0, W;

#define W_PRECALC_00_15_2(i, W, tmp0) \
        paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0;

#define W_PRECALC_00_15_3(i, W, tmp0) \
        movdqa tmp0, WK(i&~3);

#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
        movdqa W_m12, W; \
        palignr $8, W_m16, W; \
        movdqa W_m04, tmp0; \
        psrldq $4, tmp0; \
        pxor W_m08, W;

#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
        pxor W_m16, tmp0; \
        pxor tmp0, W; \
        movdqa W, tmp1; \
        movdqa W, tmp0; \
        pslldq $12, tmp1;

#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
        psrld $31, W; \
        pslld $1, tmp0; \
        por W, tmp0; \
        movdqa tmp1, W; \
        psrld $30, tmp1; \
        pslld $2, W;

#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
        pxor W, tmp0; \
        pxor tmp1, tmp0; \
        movdqa tmp0, W; \
        paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \
        movdqa tmp0, WK((i)&~3);

#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
        movdqa W_m04, tmp0; \
        pxor W_m28, W; \
        palignr $8, W_m08, tmp0;

#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
        pxor W_m16, W; \
        pxor tmp0, W; \
        movdqa W, tmp0;

#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
        psrld $30, W; \
        pslld $2, tmp0; \
        por W, tmp0;

#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
        movdqa tmp0, W; \
        paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \
        movdqa tmp0, WK((i)&~3);
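
/* W_PRECALC_00_15_* load four message words, byte-swap them and store
 * W + K into the stack slot WK(i).  W_PRECALC_16_31_* evaluate the
 * rol-by-1 recurrence for four words at once; since W[i+3] depends on
 * W[i], the fourth lane is patched up by the pslldq/psrld/pslld steps.
 * W_PRECALC_32_79_* use the equivalent recurrence
 *   W[t] = rol(W[t-6] ^ W[t-16] ^ W[t-28] ^ W[t-32], 2)
 * (the rol-by-1 form applied to itself), which has no dependency inside a
 * group of four words and therefore vectorizes directly. */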
#define CLEAR_REG(reg) pxor reg, reg;


.text

/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * unsigned int
 * _gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data,
 *                                   size_t nblks)
 */
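/*
 * Note: the unsigned int return value (%eax) is always zero here; the
 * "stack already burned" comment before the final xor suggests it reports
 * how much stack the caller would still need to wipe, which is nothing
 * because the W + K scratch area is cleared at .Lend.
 */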
.globl _gcry_sha1_transform_amd64_ssse3
ELF(.type _gcry_sha1_transform_amd64_ssse3,@function)
.align 16
_gcry_sha1_transform_amd64_ssse3:
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: data (64*nblks bytes)
         *      %rdx: nblks
         */
        CFI_STARTPROC();

        xorl %eax, %eax;
        cmpq $0, %rdx;
        jz .Lret;

        movq %rdx, RNBLKS;
        movq %rdi, RSTATE;
        movq %rsi, RDATA;
        pushq %rbx;
        CFI_PUSH(%rbx);
        pushq %rbp;
        CFI_PUSH(%rbp);

        movq %rsp, ROLDSTACK;
        CFI_DEF_CFA_REGISTER(ROLDSTACK);
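        /* Reserve a 64-byte (16 * 4) scratch area on the stack, aligned down
         * to 32 bytes; it serves as a 16-entry ring buffer for the
         * precomputed W + K values (see the WK(i) macro above). */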
        subq $(16*4), %rsp;
        andq $(~31), %rsp;

        /* Get the values of the chaining variables. */
        movl state_h0(RSTATE), a;
        movl state_h1(RSTATE), b;
        movl state_h2(RSTATE), c;
        movl state_h3(RSTATE), d;
        movl state_h4(RSTATE), e;

        movdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG;

        /* Precalc 0-15. */
        W_PRECALC_00_15_0(0, W0, Wtmp0);
        W_PRECALC_00_15_1(1, W0, Wtmp0);
        W_PRECALC_00_15_2(2, W0, Wtmp0);
        W_PRECALC_00_15_3(3, W0, Wtmp0);
        W_PRECALC_00_15_0(4, W7, Wtmp0);
        W_PRECALC_00_15_1(5, W7, Wtmp0);
        W_PRECALC_00_15_2(6, W7, Wtmp0);
        W_PRECALC_00_15_3(7, W7, Wtmp0);
        W_PRECALC_00_15_0(8, W6, Wtmp0);
        W_PRECALC_00_15_1(9, W6, Wtmp0);
        W_PRECALC_00_15_2(10, W6, Wtmp0);
        W_PRECALC_00_15_3(11, W6, Wtmp0);
        W_PRECALC_00_15_0(12, W5, Wtmp0);
        W_PRECALC_00_15_1(13, W5, Wtmp0);
        W_PRECALC_00_15_2(14, W5, Wtmp0);
        W_PRECALC_00_15_3(15, W5, Wtmp0);
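        /* At this point W0, W7, W6 and W5 hold the byte-swapped message words
         * 0-15 of the first block and their W + K values are already staged
         * on the stack. */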
.align 8
.Loop:
        addq $64, RDATA;

        /* Transform 0-15 + Precalc 16-31. */
        R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
        R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
        R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
        R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
        R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
        R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
        R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
        R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
        R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
        R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
        R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
        R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
        R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
        R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
        R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
        R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);

        /* Transform 16-63 + Precalc 32-79. */
        R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
        R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
        R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
        R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
        R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
        R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
        R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
        R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
        R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
        R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
        R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
        R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
        R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
        R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
        R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
        R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
        R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
        R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
        R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
        R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
        R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
        R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
        R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
        R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
        R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
        R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
        R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
        R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
        R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
        R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
        R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
        R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
        R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
        R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
        R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
        R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
        R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
        R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
        R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
        R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
        R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
        R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
        R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
        R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
        R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
        R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
        R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
        R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);

        decq RNBLKS;
        jz .Lend;

        /* Transform 64-79 + Precalc 0-15 of next block. */
        R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0);
        R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0);
        R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0);
        R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0);
        R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0);
        R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0);
        R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0);
        R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0);
        R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0);
        R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0);
        R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0);
        R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0);
        R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0);
        R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0);
        R( c, d, e, a, b, F4, 78 );
        addl state_h0(RSTATE), a;   W_PRECALC_00_15_2(14, W5, Wtmp0);
        R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0);

        /* Update the chaining variables. */
        addl state_h3(RSTATE), d;
        addl state_h2(RSTATE), c;
        addl state_h1(RSTATE), b;
        addl state_h4(RSTATE), e;

        movl d, state_h3(RSTATE);
        movl c, state_h2(RSTATE);
        movl b, state_h1(RSTATE);
        movl a, state_h0(RSTATE);
        movl e, state_h4(RSTATE);

        jmp .Loop;

.align 16
.Lend:
        /* Transform 64-79 + Clear XMM registers + Burn stack. */
        R( b, c, d, e, a, F4, 64 ); CLEAR_REG(BSWAP_REG);
        R( a, b, c, d, e, F4, 65 ); CLEAR_REG(Wtmp0);
        R( e, a, b, c, d, F4, 66 ); CLEAR_REG(Wtmp1);
        R( d, e, a, b, c, F4, 67 ); CLEAR_REG(W0);
        R( c, d, e, a, b, F4, 68 ); CLEAR_REG(W1);
        R( b, c, d, e, a, F4, 69 ); CLEAR_REG(W2);
        R( a, b, c, d, e, F4, 70 ); CLEAR_REG(W3);
        R( e, a, b, c, d, F4, 71 ); CLEAR_REG(W4);
        R( d, e, a, b, c, F4, 72 ); CLEAR_REG(W5);
        R( c, d, e, a, b, F4, 73 ); CLEAR_REG(W6);
        R( b, c, d, e, a, F4, 74 ); CLEAR_REG(W7);
        R( a, b, c, d, e, F4, 75 );
        R( e, a, b, c, d, F4, 76 ); movdqa Wtmp0, (0*16)(%rsp);
        R( d, e, a, b, c, F4, 77 ); movdqa Wtmp0, (1*16)(%rsp);
        R( c, d, e, a, b, F4, 78 ); movdqa Wtmp0, (2*16)(%rsp);
        addl state_h0(RSTATE), a;
        R( b, c, d, e, a, F4, 79 );

        /* 16*4/16-1 = 3 */
        movdqa Wtmp0, (3*16)(%rsp);
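        /* Wtmp0 was cleared above, so the four 16-byte stores (slots 0 to
         * 16*4/16-1 = 3) overwrite the whole W + K scratch area with zeros. */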
        /* Update the chaining variables. */
        addl state_h3(RSTATE), d;
        addl state_h2(RSTATE), c;
        addl state_h1(RSTATE), b;
        addl state_h4(RSTATE), e;

        movl d, state_h3(RSTATE);
        movl c, state_h2(RSTATE);
        movl b, state_h1(RSTATE);
        movl a, state_h0(RSTATE);
        movl e, state_h4(RSTATE);

        movq ROLDSTACK, %rsp;
        CFI_REGISTER(ROLDSTACK, %rsp);
        CFI_DEF_CFA_REGISTER(%rsp);

        popq %rbp;
        CFI_POP(%rbp);
        popq %rbx;
        CFI_POP(%rbx);

        /* stack already burned */
        xorl %eax, %eax;

.Lret:
        ret_spec_stop;
        CFI_ENDPROC();
ELF(.size _gcry_sha1_transform_amd64_ssse3,
    .-_gcry_sha1_transform_amd64_ssse3;)

#endif
#endif