/* blake2s-amd64-avx.S  -  AVX implementation of BLAKE2s
 *
 * Copyright (C) 2018 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
/* The code is based on the public-domain/CC0 BLAKE2 reference implementation
 * by Samuel Neves, at https://github.com/BLAKE2/BLAKE2/tree/master/sse
 * Copyright 2012, Samuel Neves <sneves@dei.uc.pt>
 */
#ifdef __x86_64
#include <config.h>
#if defined(HAVE_GCC_INLINE_ASM_AVX) && \
   (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))

#include "asm-common-amd64.h"

/* register macros */
#define RSTATE  %rdi
#define RINBLKS %rsi
#define RNBLKS  %rdx
#define RIV     %rcx

/* state structure */
#define STATE_H 0
#define STATE_T (STATE_H + 8 * 4)
#define STATE_F (STATE_T + 2 * 4)
/* vector registers */
#define ROW1  %xmm0
#define ROW2  %xmm1
#define ROW3  %xmm2
#define ROW4  %xmm3
#define TMP1  %xmm4
#define TMP1x %xmm4
#define R16   %xmm5
#define R8    %xmm6

#define MA1   %xmm8
#define MA2   %xmm9
#define MA3   %xmm10
#define MA4   %xmm11

#define MB1   %xmm12
#define MB2   %xmm13
#define MB3   %xmm14
#define MB4   %xmm15
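
/* ROW1..ROW4 hold the 4x4 BLAKE2s working state, one row of four 32-bit
 * words per register.  TMP1 is scratch for the shift-based rotations,
 * R16/R8 hold the vpshufb masks for the 16- and 8-bit rotations, and
 * MA1..MA4 / MB1..MB4 hold the gathered message words for the two rounds
 * kept in flight at a time. */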

/**********************************************************************
  blake2s/AVX
 **********************************************************************/
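
/* GATHER_MSG loads the 16 message words of the current block in the order
 * given by the round's sigma permutation: m1/m2 receive the words consumed
 * by the column step (G1/G2), m3/m4 those consumed by the diagonal step.
 * Plain AVX has no gather instruction, so each register is filled with a
 * vmovd followed by three vpinsrd. */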
#define GATHER_MSG(m1, m2, m3, m4, \
                   s0, s1, s2, s3, s4, s5, s6, s7, s8, \
                   s9, s10, s11, s12, s13, s14, s15) \
        vmovd (s0)*4(RINBLKS), m1; \
        vmovd (s1)*4(RINBLKS), m2; \
        vmovd (s8)*4(RINBLKS), m3; \
        vmovd (s9)*4(RINBLKS), m4; \
        vpinsrd $1, (s2)*4(RINBLKS), m1, m1; \
        vpinsrd $1, (s3)*4(RINBLKS), m2, m2; \
        vpinsrd $1, (s10)*4(RINBLKS), m3, m3; \
        vpinsrd $1, (s11)*4(RINBLKS), m4, m4; \
        vpinsrd $2, (s4)*4(RINBLKS), m1, m1; \
        vpinsrd $2, (s5)*4(RINBLKS), m2, m2; \
        vpinsrd $2, (s12)*4(RINBLKS), m3, m3; \
        vpinsrd $2, (s13)*4(RINBLKS), m4, m4; \
        vpinsrd $3, (s6)*4(RINBLKS), m1, m1; \
        vpinsrd $3, (s7)*4(RINBLKS), m2, m2; \
        vpinsrd $3, (s14)*4(RINBLKS), m3, m3; \
        vpinsrd $3, (s15)*4(RINBLKS), m4, m4;
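
/* LOAD_MSG_0..LOAD_MSG_9 encode the BLAKE2s sigma message schedule, i.e.
 * the word permutation used by each of the ten rounds. */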
#define LOAD_MSG_0(m1, m2, m3, m4) \
        GATHER_MSG(m1, m2, m3, m4, \
                   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
#define LOAD_MSG_1(m1, m2, m3, m4) \
        GATHER_MSG(m1, m2, m3, m4, \
                   14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3)
#define LOAD_MSG_2(m1, m2, m3, m4) \
        GATHER_MSG(m1, m2, m3, m4, \
                   11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4)
#define LOAD_MSG_3(m1, m2, m3, m4) \
        GATHER_MSG(m1, m2, m3, m4, \
                   7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8)
#define LOAD_MSG_4(m1, m2, m3, m4) \
        GATHER_MSG(m1, m2, m3, m4, \
                   9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13)
#define LOAD_MSG_5(m1, m2, m3, m4) \
        GATHER_MSG(m1, m2, m3, m4, \
                   2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9)
#define LOAD_MSG_6(m1, m2, m3, m4) \
        GATHER_MSG(m1, m2, m3, m4, \
                   12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11)
#define LOAD_MSG_7(m1, m2, m3, m4) \
        GATHER_MSG(m1, m2, m3, m4, \
                   13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10)
#define LOAD_MSG_8(m1, m2, m3, m4) \
        GATHER_MSG(m1, m2, m3, m4, \
                   6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5)
#define LOAD_MSG_9(m1, m2, m3, m4) \
        GATHER_MSG(m1, m2, m3, m4, \
                   10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0)

#define LOAD_MSG(r, m1, m2, m3, m4) LOAD_MSG_##r(m1, m2, m3, m4)
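
/* 32-bit right rotations of every lane.  Rotations by 16 and by 8 are done
 * with a single vpshufb byte shuffle; rotations by 12 and by 7 use the
 * generic shift-right/shift-left/xor sequence. */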
#define ROR_16(in, out) vpshufb R16, in, out;

#define ROR_8(in, out)  vpshufb R8, in, out;

#define ROR_12(in, out) \
        vpsrld $12, in, TMP1; \
        vpslld $(32 - 12), in, out; \
        vpxor TMP1, out, out;

#define ROR_7(in, out) \
        vpsrld $7, in, TMP1; \
        vpslld $(32 - 7), in, out; \
        vpxor TMP1, out, out;
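
/* G is one half of the BLAKE2s mixing function, applied to all four
 * columns (or diagonals) at once:
 *   r1 += m + r2;  r4 = (r4 ^ r1) >>> A;  r3 += r4;  r2 = (r2 ^ r3) >>> B;
 * G1 (rotations 16/12) followed by G2 (rotations 8/7) is one full G. */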
#define G(r1, r2, r3, r4, m, ROR_A, ROR_B) \
        vpaddd m, r1, r1; \
        vpaddd r2, r1, r1; \
        vpxor r1, r4, r4; \
        ROR_A(r4, r4); \
        vpaddd r4, r3, r3; \
        vpxor r3, r2, r2; \
        ROR_B(r2, r2);

#define G1(r1, r2, r3, r4, m) \
        G(r1, r2, r3, r4, m, ROR_16, ROR_12);

#define G2(r1, r2, r3, r4, m) \
        G(r1, r2, r3, r4, m, ROR_8, ROR_7);
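
/* MM_SHUFFLE builds the immediate for vpshufd.  DIAGONALIZE rotates rows
 * 2, 3 and 4 by one, two and three lane positions so that the diagonals of
 * the 4x4 state line up as columns; UNDIAGONALIZE rotates them back. */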
#define MM_SHUFFLE(z,y,x,w) \
        (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

#define DIAGONALIZE(r1, r2, r3, r4) \
        vpshufd $MM_SHUFFLE(0,3,2,1), r2, r2; \
        vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
        vpshufd $MM_SHUFFLE(2,1,0,3), r4, r4;

#define UNDIAGONALIZE(r1, r2, r3, r4) \
        vpshufd $MM_SHUFFLE(2,1,0,3), r2, r2; \
        vpshufd $MM_SHUFFLE(1,0,3,2), r3, r3; \
        vpshufd $MM_SHUFFLE(0,3,2,1), r4, r4;
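
/* One BLAKE2s round: mix the columns (G1/G2 with message words m1/m2),
 * rotate the rows so the diagonals become columns, mix those (m3/m4),
 * then rotate back. */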
#define ROUND(r, m1, m2, m3, m4) \
        G1(ROW1, ROW2, ROW3, ROW4, m1); \
        G2(ROW1, ROW2, ROW3, ROW4, m2); \
        DIAGONALIZE(ROW1, ROW2, ROW3, ROW4); \
        G1(ROW1, ROW2, ROW3, ROW4, m3); \
        G2(ROW1, ROW2, ROW3, ROW4, m4); \
        UNDIAGONALIZE(ROW1, ROW2, ROW3, ROW4);
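
/* Read-only data: the BLAKE2s IV and the vpshufb masks that rotate each
 * 32-bit lane right by 16 and by 8 bits. */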
SECTION_RODATA

.align 16
ELF(.type _blake2s_avx_data,@object;)
_blake2s_avx_data:
.Liv:
        .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
        .long 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
.Lshuf_ror16:
        .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
.Lshuf_ror8:
        .byte 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12

.text
.align 64
.globl _gcry_blake2s_transform_amd64_avx
ELF(.type _gcry_blake2s_transform_amd64_avx,@function;)

_gcry_blake2s_transform_amd64_avx:
        /* input:
         *      %rdi: state
         *      %rsi: blks
         *      %rdx: num_blks
         */
        CFI_STARTPROC();
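
        /* Setup: advance the 64-bit counter t by the block size before
         * compressing, load the rotation masks and the IV, and form the
         * working state: ROW1/ROW2 = chaining value h[0..7], ROW3 = IV[0..3],
         * ROW4 = IV[4..7] ^ {t, f} words.  The message words for rounds 0
         * and 1 are gathered up front; the remaining gathers are interleaved
         * with the rounds inside the loop. */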
        vzeroupper;

        addq $64, (STATE_T + 0)(RSTATE);

        vmovdqa .Lshuf_ror16 rRIP, R16;
        vmovdqa .Lshuf_ror8 rRIP, R8;

        vmovdqa .Liv+(0 * 4) rRIP, ROW3;
        vmovdqa .Liv+(4 * 4) rRIP, ROW4;

        vmovdqu (STATE_H + 0 * 4)(RSTATE), ROW1;
        vmovdqu (STATE_H + 4 * 4)(RSTATE), ROW2;

        vpxor (STATE_T)(RSTATE), ROW4, ROW4;

        LOAD_MSG(0, MA1, MA2, MA3, MA4);
        LOAD_MSG(1, MB1, MB2, MB3, MB4);
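
        /* Main loop: one iteration compresses one 64-byte block.  Each
         * ROUND for round r is immediately followed by the message gather
         * for round r+2, keeping two rounds' worth of message words
         * (MA* for even rounds, MB* for odd rounds) in flight. */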
.Loop:
        ROUND(0, MA1, MA2, MA3, MA4);
        LOAD_MSG(2, MA1, MA2, MA3, MA4);
        ROUND(1, MB1, MB2, MB3, MB4);
        LOAD_MSG(3, MB1, MB2, MB3, MB4);
        ROUND(2, MA1, MA2, MA3, MA4);
        LOAD_MSG(4, MA1, MA2, MA3, MA4);
        ROUND(3, MB1, MB2, MB3, MB4);
        LOAD_MSG(5, MB1, MB2, MB3, MB4);
        ROUND(4, MA1, MA2, MA3, MA4);
        LOAD_MSG(6, MA1, MA2, MA3, MA4);
        ROUND(5, MB1, MB2, MB3, MB4);
        LOAD_MSG(7, MB1, MB2, MB3, MB4);
        ROUND(6, MA1, MA2, MA3, MA4);
        LOAD_MSG(8, MA1, MA2, MA3, MA4);
        ROUND(7, MB1, MB2, MB3, MB4);
        LOAD_MSG(9, MB1, MB2, MB3, MB4);
        sub $1, RNBLKS;
        jz .Loop_end;

        lea 64(RINBLKS), RINBLKS;
        addq $64, (STATE_T + 0)(RSTATE);

        ROUND(8, MA1, MA2, MA3, MA4);
        LOAD_MSG(0, MA1, MA2, MA3, MA4);
        ROUND(9, MB1, MB2, MB3, MB4);
        LOAD_MSG(1, MB1, MB2, MB3, MB4);

        vpxor ROW3, ROW1, ROW1;
        vpxor ROW4, ROW2, ROW2;
        vmovdqa .Liv+(0 * 4) rRIP, ROW3;
        vmovdqa .Liv+(4 * 4) rRIP, ROW4;
        vpxor (STATE_H + 0 * 4)(RSTATE), ROW1, ROW1;
        vpxor (STATE_H + 4 * 4)(RSTATE), ROW2, ROW2;
        vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE);
        vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);
        vpxor (STATE_T)(RSTATE), ROW4, ROW4;
        jmp .Loop;
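
        /* Last block: finish rounds 8 and 9, fold the working state back
         * into the chaining value (h ^= v[0..7] ^ v[8..15]) and store it.
         * xor %eax, %eax returns 0 in %eax (presumably the stack-burn count
         * expected by the caller; no stack is used here), and vzeroall
         * clears the message and state words from the vector registers. */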
.Loop_end:
        ROUND(8, MA1, MA2, MA3, MA4);
        ROUND(9, MB1, MB2, MB3, MB4);

        vpxor ROW3, ROW1, ROW1;
        vpxor ROW4, ROW2, ROW2;
        vpxor (STATE_H + 0 * 4)(RSTATE), ROW1, ROW1;
        vpxor (STATE_H + 4 * 4)(RSTATE), ROW2, ROW2;

        vmovdqu ROW1, (STATE_H + 0 * 4)(RSTATE);
        vmovdqu ROW2, (STATE_H + 4 * 4)(RSTATE);

        xor %eax, %eax;
        vzeroall;
        ret_spec_stop;
        CFI_ENDPROC();
ELF(.size _gcry_blake2s_transform_amd64_avx,
    .-_gcry_blake2s_transform_amd64_avx;)

#endif /*defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS)*/
#endif /*__x86_64*/