/*
 * Shared glue code for 128bit block ciphers, AVX2 assembler macros
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */
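
/*
 * load_16way: load 16 consecutive 128-bit blocks from (src) into the eight
 * 256-bit registers x0..x7, two blocks per ymm register, using unaligned
 * loads.
 */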
#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
        vmovdqu (0*32)(src), x0; \
        vmovdqu (1*32)(src), x1; \
        vmovdqu (2*32)(src), x2; \
        vmovdqu (3*32)(src), x3; \
        vmovdqu (4*32)(src), x4; \
        vmovdqu (5*32)(src), x5; \
        vmovdqu (6*32)(src), x6; \
        vmovdqu (7*32)(src), x7;
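
/*
 * store_16way: the inverse of load_16way; store x0..x7 back out as 16
 * consecutive 128-bit blocks at (dst).
 */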
#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
        vmovdqu x0, (0*32)(dst); \
        vmovdqu x1, (1*32)(dst); \
        vmovdqu x2, (2*32)(dst); \
        vmovdqu x3, (3*32)(dst); \
        vmovdqu x4, (4*32)(dst); \
        vmovdqu x5, (5*32)(dst); \
        vmovdqu x6, (6*32)(dst); \
        vmovdqu x7, (7*32)(dst);
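
/*
 * store_cbc_16way: CBC output step for the 16-way decryption path.  Each
 * decrypted block is xored with the preceding ciphertext block read from
 * (src).  The first block is xored with zero here (its chaining value, the
 * IV, is presumably applied by the C-side glue code), which is why t0 is
 * cleared and only its upper lane is filled with the first ciphertext
 * block.  The result is written out with store_16way.
 */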
#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
        vpxor t0, t0, t0; \
        vinserti128 $1, (src), t0, t0; \
        vpxor t0, x0, x0; \
        vpxor (0*32+16)(src), x1, x1; \
        vpxor (1*32+16)(src), x2, x2; \
        vpxor (2*32+16)(src), x3, x3; \
        vpxor (3*32+16)(src), x4, x4; \
        vpxor (4*32+16)(src), x5, x5; \
        vpxor (5*32+16)(src), x6, x6; \
        vpxor (6*32+16)(src), x7, x7; \
        store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
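
/*
 * inc_le128: increment the 128-bit little-endian value in each lane of x
 * by one.  minus_one must hold -1 in the low 64-bit half of each lane; the
 * vpcmpeqq/vpslldq/vpsubq sequence propagates the carry from the low qword
 * into the high qword without needing a full 128-bit add.
 */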
#define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
        vpsubq minus_one, x, x; \
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;
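
/*
 * add2_le128: add two to the 128-bit little-endian counter in each lane of
 * x.  A carry into the high qword is needed when the low qword was -1 or
 * -2, hence the two compares against minus_one and minus_two.
 */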
#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
        vpcmpeqq minus_one, x, tmp1; \
        vpcmpeqq minus_two, x, tmp2; \
        vpsubq minus_two, x, x; \
        vpor tmp2, tmp1, tmp1; \
        vpslldq $8, tmp1, tmp1; \
        vpsubq tmp1, x, x;
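
/*
 * load_ctr_16way: build 16 CTR keystream input blocks in x0..x7 from the
 * counter at (iv) and write the incremented counter back.  The counter is
 * kept in little-endian form (as handed over by the C-side glue) so that
 * inc_le128/add2_le128 can do the arithmetic; each constructed block is
 * byteswapped with the bswap mask into the big-endian layout the cipher
 * input expects.
 */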
#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
                       t1x, t2, t2x, t3, t3x, t4, t5) \
        vpcmpeqd t0, t0, t0; \
        vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
        vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */ \
        \
        /* load IV and byteswap */ \
        vmovdqu (iv), t2x; \
        vmovdqa t2x, t3x; \
        inc_le128(t2x, t0x, t1x); \
        vbroadcasti128 bswap, t1; \
        vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
        vpshufb t1, t2, x0; \
        \
        /* construct IVs */ \
        add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
        vpshufb t1, t2, x1; \
        add2_le128(t2, t0, t4, t3, t5); \
        vpshufb t1, t2, x2; \
        add2_le128(t2, t0, t4, t3, t5); \
        vpshufb t1, t2, x3; \
        add2_le128(t2, t0, t4, t3, t5); \
        vpshufb t1, t2, x4; \
        add2_le128(t2, t0, t4, t3, t5); \
        vpshufb t1, t2, x5; \
        add2_le128(t2, t0, t4, t3, t5); \
        vpshufb t1, t2, x6; \
        add2_le128(t2, t0, t4, t3, t5); \
        vpshufb t1, t2, x7; \
        vextracti128 $1, t2, t2x; \
        inc_le128(t2x, t0x, t3x); \
        vmovdqu t2x, (iv);
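
/*
 * store_ctr_16way: xor the 16 keystream blocks in x0..x7 with the source
 * data at (src) and write the result to (dst).
 */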
#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
        vpxor (0*32)(src), x0, x0; \
        vpxor (1*32)(src), x1, x1; \
        vpxor (2*32)(src), x2, x2; \
        vpxor (3*32)(src), x3, x3; \
        vpxor (4*32)(src), x4, x4; \
        vpxor (5*32)(src), x5, x5; \
        vpxor (6*32)(src), x6, x6; \
        vpxor (7*32)(src), x7, x7; \
        store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
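
/*
 * gf128mul_x_ble: multiply the 128-bit XTS tweak in each lane of iv by x
 * in GF(2^128), operating on the little-endian ("ble") tweak layout.  The
 * doubling is done per 64-bit half with vpaddq; the mask, combined with
 * the sign bits extracted by vpsrad and rearranged by vpshufd, supplies
 * both the cross-qword carry and the 0x87 reduction constant.
 */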
#define gf128mul_x_ble(iv, mask, tmp) \
        vpsrad $31, iv, tmp; \
        vpaddq iv, iv, iv; \
        vpshufd $0x13, tmp, tmp; \
        vpand mask, tmp, tmp; \
        vpxor tmp, iv, iv;
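
/*
 * gf128mul_x2_ble: multiply the XTS tweak in each lane of iv by x^2, i.e.
 * two doublings folded into a single step.  The two mask operands supply
 * the carry-propagation and reduction constants for the combined shift;
 * tmp0 and tmp1 hold the intermediate sign bits.
 */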
#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
        vpsrad $31, iv, tmp0; \
        vpaddq iv, iv, tmp1; \
        vpsllq $2, iv, iv; \
        vpshufd $0x13, tmp0, tmp0; \
        vpsrad $31, tmp1, tmp1; \
        vpand mask2, tmp0, tmp0; \
        vpshufd $0x13, tmp1, tmp1; \
        vpxor tmp0, iv, iv; \
        vpand mask1, tmp1, tmp1; \
        vpxor tmp1, iv, iv;
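
/*
 * load_xts_16way: XTS input step.  Starting from the tweak at (iv),
 * generate the 16 consecutive tweaks, xor them into the 16 source blocks
 * loaded from (src) to form x0..x7, and stash the tweaks at (dst) so that
 * store_xts_16way can reuse them after the cipher has run.  The tweak
 * following the last block is written back to (iv).
 */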
#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
                       tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
                       xts_gf128mul_and_shl1_mask_0, \
                       xts_gf128mul_and_shl1_mask_1) \
        vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
        \
        /* load IV and construct second IV */ \
        vmovdqu (iv), tivx; \
        vmovdqa tivx, t0x; \
        gf128mul_x_ble(tivx, t1x, t2x); \
        vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
        vinserti128 $1, tivx, t0, tiv; \
        vpxor (0*32)(src), tiv, x0; \
        vmovdqu tiv, (0*32)(dst); \
        \
        /* construct and store IVs, also xor with source */ \
        gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
        vpxor (1*32)(src), tiv, x1; \
        vmovdqu tiv, (1*32)(dst); \
        \
        gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
        vpxor (2*32)(src), tiv, x2; \
        vmovdqu tiv, (2*32)(dst); \
        \
        gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
        vpxor (3*32)(src), tiv, x3; \
        vmovdqu tiv, (3*32)(dst); \
        \
        gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
        vpxor (4*32)(src), tiv, x4; \
        vmovdqu tiv, (4*32)(dst); \
        \
        gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
        vpxor (5*32)(src), tiv, x5; \
        vmovdqu tiv, (5*32)(dst); \
        \
        gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
        vpxor (6*32)(src), tiv, x6; \
        vmovdqu tiv, (6*32)(dst); \
        \
        gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
        vpxor (7*32)(src), tiv, x7; \
        vmovdqu tiv, (7*32)(dst); \
        \
        vextracti128 $1, tiv, tivx; \
        gf128mul_x_ble(tivx, t1x, t2x); \
        vmovdqu tivx, (iv);
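
/*
 * store_xts_16way: XTS output step.  xor the cipher output in x0..x7 with
 * the tweaks that load_xts_16way left at (dst), then overwrite (dst) with
 * the final 16 blocks.
 */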
#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
        vpxor (0*32)(dst), x0, x0; \
        vpxor (1*32)(dst), x1, x1; \
        vpxor (2*32)(dst), x2, x2; \
        vpxor (3*32)(dst), x3, x3; \
        vpxor (4*32)(dst), x4, x4; \
        vpxor (5*32)(dst), x5, x5; \
        vpxor (6*32)(dst), x6, x6; \
        vpxor (7*32)(dst), x7, x7; \
        store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
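
/*
 * Usage sketch (illustrative only, not part of this file): a cipher
 * implementation that includes these macros and provides a 16-block
 * parallel encryption routine might wire up an ECB helper roughly like
 * the following.  The register names RA1..RD2 and the symbol
 * __cipher_enc_blk16 are placeholders, not anything defined here.
 *
 *      ENTRY(cipher_ecb_enc_16way)
 *              // %rdi: ctx, %rsi: dst, %rdx: src
 *              vzeroupper;
 *              load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 *              call __cipher_enc_blk16;
 *              store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 *              vzeroupper;
 *              ret;
 *      ENDPROC(cipher_ecb_enc_16way)
 */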