/* sha256-ssse3-amd64.S */

/*
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright (c) 2012, Intel Corporation
;
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are
; met:
;
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
;
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in the
;   documentation and/or other materials provided with the
;   distribution.
;
; * Neither the name of the Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived from
;   this software without specific prior written permission.
;
;
; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
; The paper is expected to be released roughly at the end of April, 2012
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
*/
/*
 * Conversion to GAS assembly and integration to libgcrypt
 * by Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * Note: the original implementation was named SHA256-SSE4; however, only
 * SSSE3 is required.
 */
#ifdef __x86_64
#include <config.h>
#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA256)

#include "asm-common-amd64.h"

.intel_syntax noprefix

#define MOVDQ movdqu /* assume buffers not aligned */

/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros */

/* addm [mem], reg
 * Add reg to mem using reg-mem add and store */
#define addm(p1, p2) \
        add p2, p1; \
        mov p1, p2;

/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

/* COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
 * Load xmm with mem and byte swap each dword */
#define COPY_XMM_AND_BSWAP(p1, p2, p3) \
        MOVDQ p1, p2; \
        pshufb p1, p3;

/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;*/

#define X0 xmm4
#define X1 xmm5
#define X2 xmm6
#define X3 xmm7

#define XTMP0 xmm0
#define XTMP1 xmm1
#define XTMP2 xmm2
#define XTMP3 xmm3
#define XTMP4 xmm8
#define XFER  xmm9

#define SHUF_00BA xmm10 /* shuffle xBxA -> 00BA */
#define SHUF_DC00 xmm11 /* shuffle xDxC -> DC00 */
#define BYTE_FLIP_MASK xmm12

#define NUM_BLKS rdx /* 3rd arg */
#define CTX rsi      /* 2nd arg */
#define INP rdi      /* 1st arg */

#define SRND rdi /* clobbers INP */
#define c ecx
#define d r8d
#define e edx

#define TBL rbp
#define a eax
#define b ebx
#define f r9d
#define g r10d
#define h r11d

#define y0 r13d
#define y1 r14d
#define y2 r15d

#define _INP_END_SIZE  8
#define _INP_SIZE      8
#define _XFER_SIZE     8
#define _XMM_SAVE_SIZE 0
/* STACK_SIZE plus pushes must be an odd multiple of 8 */
#define _ALIGN_SIZE    8

#define _INP_END  0
#define _INP      (_INP_END + _INP_END_SIZE)
#define _XFER     (_INP + _INP_SIZE)
#define _XMM_SAVE (_XFER + _XFER_SIZE + _ALIGN_SIZE)
#define STACK_SIZE (_XMM_SAVE + _XMM_SAVE_SIZE)
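
/* Worked check of the layout above (added for clarity, not from the original
 * sources): STACK_SIZE = 8 + 8 + 8 + 8 + 0 = 32 bytes.  The function pushes
 * five registers (40 bytes) on top of the 8-byte return address, so after
 * "sub rsp, STACK_SIZE" the displacement from the caller's 16-byte aligned
 * stack pointer is 8 + 40 + 32 = 80, a multiple of 16.  That keeps
 * rsp + _XFER (= rsp + 16) 16-byte aligned, which the movdqa stores to
 * [rsp + _XFER] rely on; hence the rule that STACK_SIZE plus the pushes
 * must be an odd multiple of 8. */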
#define FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        /* compute s0 four at a time and s1 two at a time */; \
        /* compute W[-16] + W[-7] 4 at a time */; \
        movdqa  XTMP0, X3; \
        mov     y0, e           /* y0 = e */; \
        ror     y0, (25-11)     /* y0 = e >> (25-11) */; \
        mov     y1, a           /* y1 = a */; \
        palignr XTMP0, X2, 4    /* XTMP0 = W[-7] */; \
        ror     y1, (22-13)     /* y1 = a >> (22-13) */; \
        xor     y0, e           /* y0 = e ^ (e >> (25-11)) */; \
        mov     y2, f           /* y2 = f */; \
        ror     y0, (11-6)      /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        movdqa  XTMP1, X1; \
        xor     y1, a           /* y1 = a ^ (a >> (22-13)) */; \
        xor     y2, g           /* y2 = f^g */; \
        paddd   XTMP0, X0       /* XTMP0 = W[-7] + W[-16] */; \
        xor     y0, e           /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        and     y2, e           /* y2 = (f^g)&e */; \
        ror     y1, (13-2)      /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        /* compute s0 */; \
        palignr XTMP1, X0, 4    /* XTMP1 = W[-15] */; \
        xor     y1, a           /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        ror     y0, 6           /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        xor     y2, g           /* y2 = CH = ((f^g)&e)^g */; \
        movdqa  XTMP2, XTMP1    /* XTMP2 = W[-15] */; \
        ror     y1, 2           /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        add     y2, y0          /* y2 = S1 + CH */; \
        add     y2, [rsp + _XFER + 0*4] /* y2 = k + w + S1 + CH */; \
        movdqa  XTMP3, XTMP1    /* XTMP3 = W[-15] */; \
        mov     y0, a           /* y0 = a */; \
        add     h, y2           /* h = h + S1 + CH + k + w */; \
        mov     y2, a           /* y2 = a */; \
        pslld   XTMP1, (32-7); \
        or      y0, c           /* y0 = a|c */; \
        add     d, h            /* d = d + h + S1 + CH + k + w */; \
        and     y2, c           /* y2 = a&c */; \
        psrld   XTMP2, 7; \
        and     y0, b           /* y0 = (a|c)&b */; \
        add     h, y1           /* h = h + S1 + CH + k + w + S0 */; \
        por     XTMP1, XTMP2    /* XTMP1 = W[-15] ror 7 */; \
        or      y0, y2          /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea     h, [h + y0]     /* h = h + S1 + CH + k + w + S0 + MAJ */

#define FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        movdqa  XTMP2, XTMP3    /* XTMP2 = W[-15] */; \
        mov     y0, e           /* y0 = e */; \
        mov     y1, a           /* y1 = a */; \
        movdqa  XTMP4, XTMP3    /* XTMP4 = W[-15] */; \
        ror     y0, (25-11)     /* y0 = e >> (25-11) */; \
        xor     y0, e           /* y0 = e ^ (e >> (25-11)) */; \
        mov     y2, f           /* y2 = f */; \
        ror     y1, (22-13)     /* y1 = a >> (22-13) */; \
        pslld   XTMP3, (32-18); \
        xor     y1, a           /* y1 = a ^ (a >> (22-13)) */; \
        ror     y0, (11-6)      /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        xor     y2, g           /* y2 = f^g */; \
        psrld   XTMP2, 18; \
        ror     y1, (13-2)      /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        xor     y0, e           /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        and     y2, e           /* y2 = (f^g)&e */; \
        ror     y0, 6           /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        pxor    XTMP1, XTMP3; \
        xor     y1, a           /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        xor     y2, g           /* y2 = CH = ((f^g)&e)^g */; \
        psrld   XTMP4, 3        /* XTMP4 = W[-15] >> 3 */; \
        add     y2, y0          /* y2 = S1 + CH */; \
        add     y2, [rsp + _XFER + 1*4] /* y2 = k + w + S1 + CH */; \
        ror     y1, 2           /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        pxor    XTMP1, XTMP2    /* XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 */; \
        mov     y0, a           /* y0 = a */; \
        add     h, y2           /* h = h + S1 + CH + k + w */; \
        mov     y2, a           /* y2 = a */; \
        pxor    XTMP1, XTMP4    /* XTMP1 = s0 */; \
        or      y0, c           /* y0 = a|c */; \
        add     d, h            /* d = d + h + S1 + CH + k + w */; \
        and     y2, c           /* y2 = a&c */; \
        /* compute low s1 */; \
        pshufd  XTMP2, X3, 0b11111010   /* XTMP2 = W[-2] {BBAA} */; \
        and     y0, b           /* y0 = (a|c)&b */; \
        add     h, y1           /* h = h + S1 + CH + k + w + S0 */; \
        paddd   XTMP0, XTMP1    /* XTMP0 = W[-16] + W[-7] + s0 */; \
        or      y0, y2          /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea     h, [h + y0]     /* h = h + S1 + CH + k + w + S0 + MAJ */

#define FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        movdqa  XTMP3, XTMP2    /* XTMP3 = W[-2] {BBAA} */; \
        mov     y0, e           /* y0 = e */; \
        mov     y1, a           /* y1 = a */; \
        ror     y0, (25-11)     /* y0 = e >> (25-11) */; \
        movdqa  XTMP4, XTMP2    /* XTMP4 = W[-2] {BBAA} */; \
        xor     y0, e           /* y0 = e ^ (e >> (25-11)) */; \
        ror     y1, (22-13)     /* y1 = a >> (22-13) */; \
        mov     y2, f           /* y2 = f */; \
        xor     y1, a           /* y1 = a ^ (a >> (22-13)) */; \
        ror     y0, (11-6)      /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        psrlq   XTMP2, 17       /* XTMP2 = W[-2] ror 17 {xBxA} */; \
        xor     y2, g           /* y2 = f^g */; \
        psrlq   XTMP3, 19       /* XTMP3 = W[-2] ror 19 {xBxA} */; \
        xor     y0, e           /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        and     y2, e           /* y2 = (f^g)&e */; \
        psrld   XTMP4, 10       /* XTMP4 = W[-2] >> 10 {BBAA} */; \
        ror     y1, (13-2)      /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        xor     y1, a           /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        xor     y2, g           /* y2 = CH = ((f^g)&e)^g */; \
        ror     y0, 6           /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        pxor    XTMP2, XTMP3; \
        add     y2, y0          /* y2 = S1 + CH */; \
        ror     y1, 2           /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        add     y2, [rsp + _XFER + 2*4] /* y2 = k + w + S1 + CH */; \
        pxor    XTMP4, XTMP2    /* XTMP4 = s1 {xBxA} */; \
        mov     y0, a           /* y0 = a */; \
        add     h, y2           /* h = h + S1 + CH + k + w */; \
        mov     y2, a           /* y2 = a */; \
        pshufb  XTMP4, SHUF_00BA        /* XTMP4 = s1 {00BA} */; \
        or      y0, c           /* y0 = a|c */; \
        add     d, h            /* d = d + h + S1 + CH + k + w */; \
        and     y2, c           /* y2 = a&c */; \
        paddd   XTMP0, XTMP4    /* XTMP0 = {..., ..., W[1], W[0]} */; \
        and     y0, b           /* y0 = (a|c)&b */; \
        add     h, y1           /* h = h + S1 + CH + k + w + S0 */; \
        /* compute high s1 */; \
        pshufd  XTMP2, XTMP0, 0b01010000        /* XTMP2 = W[-2] {DDCC} */; \
        or      y0, y2          /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea     h, [h + y0]     /* h = h + S1 + CH + k + w + S0 + MAJ */

#define FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        movdqa  XTMP3, XTMP2    /* XTMP3 = W[-2] {DDCC} */; \
        mov     y0, e           /* y0 = e */; \
        ror     y0, (25-11)     /* y0 = e >> (25-11) */; \
        mov     y1, a           /* y1 = a */; \
        movdqa  X0, XTMP2       /* X0 = W[-2] {DDCC} */; \
        ror     y1, (22-13)     /* y1 = a >> (22-13) */; \
        xor     y0, e           /* y0 = e ^ (e >> (25-11)) */; \
        mov     y2, f           /* y2 = f */; \
        ror     y0, (11-6)      /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        psrlq   XTMP2, 17       /* XTMP2 = W[-2] ror 17 {xDxC} */; \
        xor     y1, a           /* y1 = a ^ (a >> (22-13)) */; \
        xor     y2, g           /* y2 = f^g */; \
        psrlq   XTMP3, 19       /* XTMP3 = W[-2] ror 19 {xDxC} */; \
        xor     y0, e           /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        and     y2, e           /* y2 = (f^g)&e */; \
        ror     y1, (13-2)      /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        psrld   X0, 10          /* X0 = W[-2] >> 10 {DDCC} */; \
        xor     y1, a           /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        ror     y0, 6           /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        xor     y2, g           /* y2 = CH = ((f^g)&e)^g */; \
        pxor    XTMP2, XTMP3; \
        ror     y1, 2           /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        add     y2, y0          /* y2 = S1 + CH */; \
        add     y2, [rsp + _XFER + 3*4] /* y2 = k + w + S1 + CH */; \
        pxor    X0, XTMP2       /* X0 = s1 {xDxC} */; \
        mov     y0, a           /* y0 = a */; \
        add     h, y2           /* h = h + S1 + CH + k + w */; \
        mov     y2, a           /* y2 = a */; \
        pshufb  X0, SHUF_DC00   /* X0 = s1 {DC00} */; \
        or      y0, c           /* y0 = a|c */; \
        add     d, h            /* d = d + h + S1 + CH + k + w */; \
        and     y2, c           /* y2 = a&c */; \
        paddd   X0, XTMP0       /* X0 = {W[3], W[2], W[1], W[0]} */; \
        and     y0, b           /* y0 = (a|c)&b */; \
        add     h, y1           /* h = h + S1 + CH + k + w + S0 */; \
        or      y0, y2          /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea     h, [h + y0]     /* h = h + S1 + CH + k + w + S0 + MAJ */

#define FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h) \
        FOUR_ROUNDS_AND_SCHED_0(X0, X1, X2, X3, a, b, c, d, e, f, g, h); \
        FOUR_ROUNDS_AND_SCHED_1(X0, X1, X2, X3, h, a, b, c, d, e, f, g); \
        FOUR_ROUNDS_AND_SCHED_2(X0, X1, X2, X3, g, h, a, b, c, d, e, f); \
        FOUR_ROUNDS_AND_SCHED_3(X0, X1, X2, X3, f, g, h, a, b, c, d, e);
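
/* For reference: the FOUR_ROUNDS_AND_SCHED_* macros above interleave four
 * rounds of the compression function with the SSSE3 message-schedule update
 * that produces four new W values (one XMM register) per call.  A rough
 * scalar C sketch of that schedule step follows; it is illustrative only
 * (names such as rotr32 and sha256_sched are made up here) and is compiled
 * out by the preprocessor. */
#if 0
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned int r)
{
  return (x >> r) | (x << (32 - r));
}

/* W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16], with W kept in a
 * 16-entry rolling window, just as X0..X3 hold it 4 dwords per register. */
static uint32_t sha256_sched(const uint32_t w[16], unsigned int t)
{
  uint32_t w15 = w[(t - 15) % 16];
  uint32_t w2  = w[(t - 2) % 16];
  uint32_t s0  = rotr32(w15, 7) ^ rotr32(w15, 18) ^ (w15 >> 3);
  uint32_t s1  = rotr32(w2, 17) ^ rotr32(w2, 19) ^ (w2 >> 10);

  return w[(t - 16) % 16] + s0 + w[(t - 7) % 16] + s1;
}
#endif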
/* input is [rsp + _XFER + i1 * 4] */
#define DO_ROUND(i1, a, b, c, d, e, f, g, h) \
        mov     y0, e           /* y0 = e */; \
        ror     y0, (25-11)     /* y0 = e >> (25-11) */; \
        mov     y1, a           /* y1 = a */; \
        xor     y0, e           /* y0 = e ^ (e >> (25-11)) */; \
        ror     y1, (22-13)     /* y1 = a >> (22-13) */; \
        mov     y2, f           /* y2 = f */; \
        xor     y1, a           /* y1 = a ^ (a >> (22-13)) */; \
        ror     y0, (11-6)      /* y0 = (e >> (11-6)) ^ (e >> (25-6)) */; \
        xor     y2, g           /* y2 = f^g */; \
        xor     y0, e           /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) */; \
        ror     y1, (13-2)      /* y1 = (a >> (13-2)) ^ (a >> (22-2)) */; \
        and     y2, e           /* y2 = (f^g)&e */; \
        xor     y1, a           /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) */; \
        ror     y0, 6           /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) */; \
        xor     y2, g           /* y2 = CH = ((f^g)&e)^g */; \
        add     y2, y0          /* y2 = S1 + CH */; \
        ror     y1, 2           /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) */; \
        add     y2, [rsp + _XFER + i1 * 4] /* y2 = k + w + S1 + CH */; \
        mov     y0, a           /* y0 = a */; \
        add     h, y2           /* h = h + S1 + CH + k + w */; \
        mov     y2, a           /* y2 = a */; \
        or      y0, c           /* y0 = a|c */; \
        add     d, h            /* d = d + h + S1 + CH + k + w */; \
        and     y2, c           /* y2 = a&c */; \
        and     y0, b           /* y0 = (a|c)&b */; \
        add     h, y1           /* h = h + S1 + CH + k + w + S0 */; \
        or      y0, y2          /* y0 = MAJ = ((a|c)&b)|(a&c) */; \
        lea     h, [h + y0]     /* h = h + S1 + CH + k + w + S0 + MAJ */
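
/* For reference: one plain SHA-256 round as computed by DO_ROUND above (the
 * working variables are rotated by argument order instead of being moved).
 * Rough, compiled-out C sketch; round_step and rotr32 are illustrative names
 * and not part of libgcrypt. */
#if 0
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned int r)
{
  return (x >> r) | (x << (32 - r));
}

/* st[0..7] = a..h; k_plus_w is K[t] + W[t], as stored at [rsp + _XFER]. */
static void round_step(uint32_t st[8], uint32_t k_plus_w)
{
  uint32_t a = st[0], b = st[1], c = st[2], d = st[3];
  uint32_t e = st[4], f = st[5], g = st[6], h = st[7];

  uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
  uint32_t CH  = ((f ^ g) & e) ^ g;         /* == (e&f) ^ (~e&g) */
  uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
  uint32_t MAJ = ((a | c) & b) | (a & c);   /* == (a&b) ^ (a&c) ^ (b&c) */
  uint32_t t1  = h + S1 + CH + k_plus_w;

  st[7] = g;  st[6] = f;  st[5] = e;  st[4] = d + t1;
  st[3] = c;  st[2] = b;  st[1] = a;  st[0] = t1 + S0 + MAJ;
}
#endif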

/*
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : Num blocks
*/
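
/* The symbol actually exported below is _gcry_sha256_transform_amd64_ssse3;
 * the prototype above is the original Intel one with the same argument
 * order.  A hedged sketch of how the C side might declare and call it (the
 * exact declaration lives in sha256.c; the state/nblks names here are
 * illustrative).  Note that eax is cleared before returning (see the
 * "xor eax, eax" near the end of the function). */
#if 0
unsigned int _gcry_sha256_transform_amd64_ssse3(const void *input_data,
                                                uint32_t state[8],
                                                size_t num_blks);

/* Example (hypothetical caller): process nblks 64-byte blocks, updating
 * the eight-word state in place.
 *
 *   burn = _gcry_sha256_transform_amd64_ssse3(data, state, nblks);
 */
#endif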
.text
.globl _gcry_sha256_transform_amd64_ssse3
ELF(.type _gcry_sha256_transform_amd64_ssse3,@function;)
.align 16
_gcry_sha256_transform_amd64_ssse3:
        CFI_STARTPROC()
        push    rbx
        CFI_PUSH(rbx)
        push    rbp
        CFI_PUSH(rbp)
        push    r13
        CFI_PUSH(r13)
        push    r14
        CFI_PUSH(r14)
        push    r15
        CFI_PUSH(r15)

        sub     rsp, STACK_SIZE
        CFI_ADJUST_CFA_OFFSET(STACK_SIZE);

        shl     NUM_BLKS, 6     /* convert to bytes */
        jz      .Ldone_hash
        add     NUM_BLKS, INP   /* pointer to end of data */
        mov     [rsp + _INP_END], NUM_BLKS

        /* load initial digest */
        mov     a, [4*0 + CTX]
        mov     b, [4*1 + CTX]
        mov     c, [4*2 + CTX]
        mov     d, [4*3 + CTX]
        mov     e, [4*4 + CTX]
        mov     f, [4*5 + CTX]
        mov     g, [4*6 + CTX]
        mov     h, [4*7 + CTX]

        movdqa  BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP]
        movdqa  SHUF_00BA, [.L_SHUF_00BA ADD_RIP]
        movdqa  SHUF_DC00, [.L_SHUF_DC00 ADD_RIP]

.Loop0:
        lea     TBL, [.LK256 ADD_RIP]

        /* byte swap first 16 dwords */
        COPY_XMM_AND_BSWAP(X0, [INP + 0*16], BYTE_FLIP_MASK)
        COPY_XMM_AND_BSWAP(X1, [INP + 1*16], BYTE_FLIP_MASK)
        COPY_XMM_AND_BSWAP(X2, [INP + 2*16], BYTE_FLIP_MASK)
        COPY_XMM_AND_BSWAP(X3, [INP + 3*16], BYTE_FLIP_MASK)

        mov     [rsp + _INP], INP

        /* schedule 48 input dwords, by doing 3 rounds of 16 each */
        mov     SRND, 3
.align 16
.Loop1:
        movdqa  XFER, [TBL + 0*16]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED(X0, X1, X2, X3, a, b, c, d, e, f, g, h)

        movdqa  XFER, [TBL + 1*16]
        paddd   XFER, X1
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED(X1, X2, X3, X0, e, f, g, h, a, b, c, d)

        movdqa  XFER, [TBL + 2*16]
        paddd   XFER, X2
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED(X2, X3, X0, X1, a, b, c, d, e, f, g, h)

        movdqa  XFER, [TBL + 3*16]
        paddd   XFER, X3
        movdqa  [rsp + _XFER], XFER
        add     TBL, 4*16
        FOUR_ROUNDS_AND_SCHED(X3, X0, X1, X2, e, f, g, h, a, b, c, d)

        sub     SRND, 1
        jne     .Loop1

        /* final 16 rounds: no more scheduling, just add K and do the rounds */
        mov     SRND, 2
.Loop2:
        paddd   X0, [TBL + 0*16]
        movdqa  [rsp + _XFER], X0
        DO_ROUND(0, a, b, c, d, e, f, g, h)
        DO_ROUND(1, h, a, b, c, d, e, f, g)
        DO_ROUND(2, g, h, a, b, c, d, e, f)
        DO_ROUND(3, f, g, h, a, b, c, d, e)

        paddd   X1, [TBL + 1*16]
        movdqa  [rsp + _XFER], X1
        add     TBL, 2*16
        DO_ROUND(0, e, f, g, h, a, b, c, d)
        DO_ROUND(1, d, e, f, g, h, a, b, c)
        DO_ROUND(2, c, d, e, f, g, h, a, b)
        DO_ROUND(3, b, c, d, e, f, g, h, a)

        movdqa  X0, X2
        movdqa  X1, X3

        sub     SRND, 1
        jne     .Loop2

        /* add the working variables back into the digest */
        addm([4*0 + CTX],a)
        addm([4*1 + CTX],b)
        addm([4*2 + CTX],c)
        addm([4*3 + CTX],d)
        addm([4*4 + CTX],e)
        addm([4*5 + CTX],f)
        addm([4*6 + CTX],g)
        addm([4*7 + CTX],h)

        mov     INP, [rsp + _INP]
        add     INP, 64
        cmp     INP, [rsp + _INP_END]
        jne     .Loop0

        /* clear the xmm registers used above */
        pxor    xmm0, xmm0
        pxor    xmm1, xmm1
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        pxor    xmm4, xmm4
        pxor    xmm5, xmm5
        pxor    xmm6, xmm6
        pxor    xmm7, xmm7
        pxor    xmm8, xmm8
        pxor    xmm9, xmm9
        pxor    xmm10, xmm10
        pxor    xmm11, xmm11
        pxor    xmm12, xmm12

.Ldone_hash:
        /* clear the XFER slot on the stack and return zero */
        pxor    XFER, XFER
        movdqa  [rsp + _XFER], XFER
        xor     eax, eax

        add     rsp, STACK_SIZE
        CFI_ADJUST_CFA_OFFSET(-STACK_SIZE);

        pop     r15
        CFI_POP(r15)
        pop     r14
        CFI_POP(r14)
        pop     r13
        CFI_POP(r13)
        pop     rbp
        CFI_POP(rbp)
        pop     rbx
        CFI_POP(rbx)
        ret_spec_stop
        CFI_ENDPROC()

SECTION_RODATA

ELF(.type _sha256_ssse3_consts,@object)
_sha256_ssse3_consts:

.align 16
.LK256:
        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.LPSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203

/* shuffle xBxA -> 00BA */
.L_SHUF_00BA:
        .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

/* shuffle xDxC -> DC00 */
.L_SHUF_DC00:
        .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF

#endif
#endif