sha256-avx-amd64.S
/*
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
*/
/*
 * Conversion to GAS assembly and integration to libgcrypt
 * by Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * Note: Based on the SSSE3 implementation.
 */

#ifdef __x86_64
#include <config.h>
#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA256)

#include "asm-common-amd64.h"

.intel_syntax noprefix

#define VMOVDQ vmovdqu /* assume buffers not aligned */

#define ROR(p1, p2) \
        /* shld is faster than ror on Intel Sandybridge */ \
        shld p1, p1, (32 - p2);
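
/*
 * With identical source and destination operands, `shld p1, p1, (32 - n)`
 * shifts p1 left by (32 - n) while refilling from its own top bits, i.e. it
 * rotates; rotating left by (32 - n) is the same as rotating right by n.
 * A minimal C sketch of that identity (illustrative only, valid for
 * 0 < n < 32):
 *
 *   static inline uint32_t ror32(uint32_t x, unsigned int n)
 *   {
 *       return (x << (32 - n)) | (x >> n);   // rotate right by n
 *   }
 */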
/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros*/

/* addm [mem], reg
 * Add reg to mem using reg-mem add and store */
#define addm(p1, p2) \
        add p2, p1; \
        mov p1, p2;

/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

/* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
 * Load xmm with mem and byte swap each dword */
#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
        VMOVDQ p1, p2; \
        vpshufb p1, p1, p3;

/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

#define X0 xmm4
#define X1 xmm5
#define X2 xmm6
#define X3 xmm7

#define XTMP0 xmm0
#define XTMP1 xmm1
#define XTMP2 xmm2
#define XTMP3 xmm3
#define XTMP4 xmm8
#define XFER xmm9

#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
#define BYTE_FLIP_MASK xmm12

#define NUM_BLKS rdx /* 3rd arg */
#define CTX rsi /* 2nd arg */
#define INP rdi /* 1st arg */

#define SRND rdi /* clobbers INP */
#define c ecx
#define d r8d
#define e edx

#define TBL rbp
#define a eax
#define b ebx
#define f r9d
#define g r10d
#define h r11d

#define y0 r13d
#define y1 r14d
#define y2 r15d

#define _INP_END_SIZE 8
#define _INP_SIZE 8
#define _XFER_SIZE 8
#define _XMM_SAVE_SIZE 0
/* STACK_SIZE plus pushes must be an odd multiple of 8 */
#define _ALIGN_SIZE 8

#define _INP_END 0
#define _INP (_INP_END + _INP_END_SIZE)
#define _XFER (_INP + _INP_SIZE)
#define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE)
#define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE)
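
/*
 * Resulting stack frame layout (summary derived from the sizes above, added
 * for readability):
 *
 *   [rsp +  0]  _INP_END   end-of-input pointer (8 bytes)
 *   [rsp +  8]  _INP       saved input pointer; SRND clobbers INP (8 bytes)
 *   [rsp + 16]  _XFER      K[t]+W[t] words for the current four rounds,
 *                          written as one 16-byte store (_XFER_SIZE plus
 *                          _ALIGN_SIZE)
 *   [rsp + 32]  top of frame: STACK_SIZE = 32 (_XMM_SAVE_SIZE is 0)
 *
 * The 32-byte frame plus the five pushes below keep rsp 16-byte aligned,
 * so the vmovdqa stores to _XFER are legal.
 */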
#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        /* compute s0 four at a time and s1 two at a time */; \
        /* compute W[-16] + W[-7] 4 at a time */; \
        mov y0, e /* y0 = e */; \
        ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
        mov y1, a /* y1 = a */; \
        vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */; \
        ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
        xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
        mov y2, f /* y2 = f */; \
        ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        xor y1, a /* y1 = a ^ (a >> (22-13)) */; \
        xor y2, g /* y2 = f^g */; \
        vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16] */; \
        xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        and y2, e /* y2 = (f^g)&e */; \
        ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        /* compute s0 */; \
        vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */; \
        xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        ROR( y0, 6) /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
        ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        add y2, y0 /* y2 = S1 + CH */; \
        add y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \
        mov y0, a /* y0 = a */; \
        add h, y2 /* h = h + S1 + CH + k + w */; \
        mov y2, a /* y2 = a */; \
        vpslld XTMP2, XTMP1, (32-7); \
        or y0, c /* y0 = a|c */; \
        add d, h /* d = d + h + S1 + CH + k + w */; \
        and y2, c /* y2 = a&c */; \
        vpsrld XTMP3, XTMP1, 7; \
        and y0, b /* y0 = (a|c)&b */; \
        add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
        vpor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 */; \
        or y0, y2 /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        mov y0, e /* y0 = e */; \
        mov y1, a /* y1 = a */; \
        ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
        xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
        mov y2, f /* y2 = f */; \
        ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
        vpslld XTMP2, XTMP1, (32-18); \
        xor y1, a /* y1 = a ^ (a >> (22-13)) */; \
        ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        xor y2, g /* y2 = f^g */; \
        vpsrld XTMP4, XTMP1, 18; \
        ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        and y2, e /* y2 = (f^g)&e */; \
        ROR( y0, 6) /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        vpxor XTMP4, XTMP4, XTMP3; \
        xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
        vpsrld XTMP1, XTMP1, 3 /* XTMP1 = W[-15] >> 3 */; \
        add y2, y0 /* y2 = S1 + CH */; \
        add y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \
        ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        vpxor XTMP1, XTMP1, XTMP2 /* XTMP1 = (W[-15] >> 3) ^ (W[-15] << (32-18)) */; \
        mov y0, a /* y0 = a */; \
        add h, y2 /* h = h + S1 + CH + k + w */; \
        mov y2, a /* y2 = a */; \
        vpxor XTMP1, XTMP1, XTMP4 /* XTMP1 = s0 */; \
        or y0, c /* y0 = a|c */; \
        add d, h /* d = d + h + S1 + CH + k + w */; \
        and y2, c /* y2 = a&c */; \
        /* compute low s1 */; \
        vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */; \
        and y0, b /* y0 = (a|c)&b */; \
        add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
        vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */; \
        or y0, y2 /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        mov y0, e /* y0 = e */; \
        mov y1, a /* y1 = a */; \
        ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
        xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
        ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
        mov y2, f /* y2 = f */; \
        xor y1, a /* y1 = a ^ (a >> (22-13)) */; \
        ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        vpsrlq XTMP3, XTMP2, 17 /* XTMP3 = W[-2] ror 17 {xBxA} */; \
        xor y2, g /* y2 = f^g */; \
        vpsrlq XTMP4, XTMP2, 19 /* XTMP4 = W[-2] ror 19 {xBxA} */; \
        xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        and y2, e /* y2 = (f^g)&e */; \
        vpsrld XTMP2, XTMP2, 10 /* XTMP2 = W[-2] >> 10 {BBAA} */; \
        ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
        ROR( y0, 6) /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        vpxor XTMP2, XTMP2, XTMP3; \
        add y2, y0 /* y2 = S1 + CH */; \
        ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        add y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \
        vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */; \
        mov y0, a /* y0 = a */; \
        add h, y2 /* h = h + S1 + CH + k + w */; \
        mov y2, a /* y2 = a */; \
        vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */; \
        or y0, c /* y0 = a|c */; \
        add d, h /* d = d + h + S1 + CH + k + w */; \
        and y2, c /* y2 = a&c */; \
        vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */; \
        and y0, b /* y0 = (a|c)&b */; \
        add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
        /* compute high s1 */; \
        vpshufd XTMP2, XTMP0, 0b01010000 /* XTMP2 = W[-2] {DDCC} */; \
        or y0, y2 /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        mov y0, e /* y0 = e */; \
        ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
        mov y1, a /* y1 = a */; \
        ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
        xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
        mov y2, f /* y2 = f */; \
        ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        vpsrlq XTMP3, XTMP2, 17 /* XTMP3 = W[-2] ror 17 {xDxC} */; \
        xor y1, a /* y1 = a ^ (a >> (22-13)) */; \
        xor y2, g /* y2 = f^g */; \
        vpsrlq X0, XTMP2, 19 /* X0 = W[-2] ror 19 {xDxC} */; \
        xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        and y2, e /* y2 = (f^g)&e */; \
        ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        vpsrld XTMP2, XTMP2, 10 /* XTMP2 = W[-2] >> 10 {DDCC} */; \
        xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        ROR( y0, 6) /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
        vpxor XTMP2, XTMP2, XTMP3; \
        ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        add y2, y0 /* y2 = S1 + CH */; \
        add y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \
        vpxor X0, X0, XTMP2 /* X0 = s1 {xDxC} */; \
        mov y0, a /* y0 = a */; \
        add h, y2 /* h = h + S1 + CH + k + w */; \
        mov y2, a /* y2 = a */; \
        vpshufb X0, X0, SHUF_DC00 /* X0 = s1 {DC00} */; \
        or y0, c /* y0 = a|c */; \
        add d, h /* d = d + h + S1 + CH + k + w */; \
        and y2, c /* y2 = a&c */; \
        vpaddd X0, X0, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */; \
        and y0, b /* y0 = (a|c)&b */; \
        add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
        or y0, y2 /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
        FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
        FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
        FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);
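
/*
 * For reference: the scalar form of the message expansion that the four
 * macros above interleave with the rounds (FIPS 180-4; a minimal C sketch,
 * illustrative only, with ror32() as sketched next to the ROR macro):
 *
 *   s0   = ror32(w[i-15], 7) ^ ror32(w[i-15], 18) ^ (w[i-15] >> 3);
 *   s1   = ror32(w[i-2], 17) ^ ror32(w[i-2], 19) ^ (w[i-2] >> 10);
 *   w[i] = w[i-16] + s0 + w[i-7] + s1;
 *
 * Each FOUR_ROUNDS_AND_SCHED invocation produces four new w[i] values:
 * s0 is computed on all four lanes with dword shifts, while s1 is computed
 * two lanes at a time in qword lanes ({xBxA}, then {xDxC}) and recombined
 * via SHUF_00BA/SHUF_DC00.
 */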
/* input is [rsp + _XFER + i1 * 4] */
#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
        mov y0, e /* y0 = e */; \
        ROR( y0, (25-11)) /* y0 = e >> (25-11) */; \
        mov y1, a /* y1 = a */; \
        xor y0, e /* y0 = e ^ (e >> (25-11)) */; \
        ROR( y1, (22-13)) /* y1 = a >> (22-13) */; \
        mov y2, f /* y2 = f */; \
        xor y1, a /* y1 = a ^ (a >> (22-13)) */; \
        ROR( y0, (11-6)) /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        xor y2, g /* y2 = f^g */; \
        xor y0, e /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        ROR( y1, (13-2)) /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        and y2, e /* y2 = (f^g)&e */; \
        xor y1, a /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        ROR( y0, 6) /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        xor y2, g /* y2 = CH = ((f^g)&e)^g */; \
        add y2, y0 /* y2 = S1 + CH */; \
        ROR( y1, 2) /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        add y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \
        mov y0, a /* y0 = a */; \
        add h, y2 /* h = h + S1 + CH + k + w */; \
        mov y2, a /* y2 = a */; \
        or y0, c /* y0 = a|c */; \
        add d, h /* d = d + h + S1 + CH + k + w */; \
        and y2, c /* y2 = a&c */; \
        and y0, b /* y0 = (a|c)&b */; \
        add h, y1 /* h = h + S1 + CH + k + w + S0 */; \
        or y0, y2 /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea h, [h + y0] /* h = h + S1 + CH + k + w + S0 + MAJ */
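
/*
 * Scalar equivalent of one round as computed by DO_ROUND and by the round
 * portions of FOUR_ROUNDS_AND_SCHED (a minimal C sketch, illustrative only;
 * k_plus_w stands for the K[t]+W[t] word staged at [rsp + _XFER]):
 *
 *   S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
 *   ch  = (e & f) ^ (~e & g);              // same as ((f ^ g) & e) ^ g
 *   t1  = h + S1 + ch + k_plus_w;
 *   S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
 *   maj = (a & b) ^ (a & c) ^ (b & c);     // same as ((a | c) & b) | (a & c)
 *   d  += t1;
 *   h   = t1 + S0 + maj;
 *
 * The usual rotation of the working variables a..h between rounds is done
 * by permuting the macro arguments rather than by moving registers.
 */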
/*
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
*/
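
/*
 * Big-picture structure of the routine below, as a minimal C sketch of the
 * standard SHA-256 outer loop (illustrative only; sha256_compress is a
 * hypothetical name standing for the .Loop1/.Loop2 body):
 *
 *   while (num_blks--) {
 *       uint32_t work[8];
 *       memcpy(work, digest, sizeof(work));    // a..h from the digest
 *       sha256_compress(work, input_data);     // 64 rounds over one 64-byte block
 *       for (int i = 0; i < 8; i++)
 *           digest[i] += work[i];              // the addm() feedback below
 *       input_data = (const uint8_t *)input_data + 64;
 *   }
 */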
.text
.globl _gcry_sha256_transform_amd64_avx
ELF(.type _gcry_sha256_transform_amd64_avx,@function;)
.align 16
_gcry_sha256_transform_amd64_avx:
        CFI_STARTPROC()
        vzeroupper

        push rbx
        CFI_PUSH(rbx)
        push rbp
        CFI_PUSH(rbp)
        push r13
        CFI_PUSH(r13)
        push r14
        CFI_PUSH(r14)
        push r15
        CFI_PUSH(r15)

        sub rsp, STACK_SIZE
        CFI_ADJUST_CFA_OFFSET(STACK_SIZE);

        shl NUM_BLKS, 6 /* convert to bytes */
        jz .Ldone_hash
        add NUM_BLKS, INP /* pointer to end of data */
        mov [rsp + _INP_END], NUM_BLKS

        /* load initial digest */
        mov a, [4*0 + CTX]
        mov b, [4*1 + CTX]
        mov c, [4*2 + CTX]
        mov d, [4*3 + CTX]
        mov e, [4*4 + CTX]
        mov f, [4*5 + CTX]
        mov g, [4*6 + CTX]
        mov h, [4*7 + CTX]

        vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
        vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP]
        vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP]

.Loop0:
        lea TBL, [.LK256 ADD_RIP]

        /* byte swap first 16 dwords */
        COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
        COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
        COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
        COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)

        mov [rsp + _INP], INP

        /* schedule 48 input dwords, by doing 3 rounds of 16 each */
        mov SRND, 3
.align 16
.Loop1:
        vpaddd XFER, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)

        vpaddd XFER, X1, [TBL + 1*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)

        vpaddd XFER, X2, [TBL + 2*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)

        vpaddd XFER, X3, [TBL + 3*16]
        vmovdqa [rsp + _XFER], XFER
        add TBL, 4*16
        FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)

        sub SRND, 1
        jne .Loop1

        mov SRND, 2
.Loop2:
        vpaddd X0, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], X0
        DO_ROUND(0, a, b, c, d, e, f, g, h)
        DO_ROUND(1, h, a, b, c, d, e, f, g)
        DO_ROUND(2, g, h, a, b, c, d, e, f)
        DO_ROUND(3, f, g, h, a, b, c, d, e)

        vpaddd X1, X1, [TBL + 1*16]
        vmovdqa [rsp + _XFER], X1
        add TBL, 2*16
        DO_ROUND(0, e, f, g, h, a, b, c, d)
        DO_ROUND(1, d, e, f, g, h, a, b, c)
        DO_ROUND(2, c, d, e, f, g, h, a, b)
        DO_ROUND(3, b, c, d, e, f, g, h, a)

        vmovdqa X0, X2
        vmovdqa X1, X3

        sub SRND, 1
        jne .Loop2

        addm([4*0 + CTX], a)
        addm([4*1 + CTX], b)
        addm([4*2 + CTX], c)
        addm([4*3 + CTX], d)
        addm([4*4 + CTX], e)
        addm([4*5 + CTX], f)
        addm([4*6 + CTX], g)
        addm([4*7 + CTX], h)

        mov INP, [rsp + _INP]
        add INP, 64
        cmp INP, [rsp + _INP_END]
        jne .Loop0

.Ldone_hash:
        vzeroall
        /* burn the K+W words left at _XFER (XFER is zero after vzeroall) */
        vmovdqa [rsp + _XFER], XFER
        xor eax, eax

        add rsp, STACK_SIZE
        CFI_ADJUST_CFA_OFFSET(-STACK_SIZE);

        pop r15
        CFI_POP(r15)
        pop r14
        CFI_POP(r14)
        pop r13
        CFI_POP(r13)
        pop rbp
        CFI_POP(rbp)
        pop rbx
        CFI_POP(rbx)
        ret_spec_stop
        CFI_ENDPROC()


SECTION_RODATA

ELF(.type _sha256_avx_consts,@object)
_sha256_avx_consts:
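
/* SHA-256 round constants K[0..63] (FIPS 180-4): the first 32 bits of the
 * fractional parts of the cube roots of the first 64 prime numbers. */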
.align 16
.LK256:
        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

/* byte-swap mask: reverse the bytes of each dword (big-endian message words
 * to little-endian) */
.LPSHUFFLE_BYTE_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203

/* shuffle xBxA -> 00BA */
.L_SHUF_00BA: .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

/* shuffle xDxC -> DC00 */
.L_SHUF_DC00: .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF

#endif
#endif