/* serpent-avx2-amd64.S - AVX2 implementation of Serpent cipher
 *
 * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#ifdef __x86_64
#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) && \
    defined(ENABLE_AVX2_SUPPORT)

#include "asm-common-amd64.h"

/* struct serpent_context: */
#define ctx_keys 0

/* register macros */
#define CTX %rdi

/* vector registers */
#define RA0 %ymm0
#define RA1 %ymm1
#define RA2 %ymm2
#define RA3 %ymm3
#define RA4 %ymm4

#define RB0 %ymm5
#define RB1 %ymm6
#define RB2 %ymm7
#define RB3 %ymm8
#define RB4 %ymm9

#define RNOT %ymm10
#define RTMP0 %ymm11
#define RTMP1 %ymm12
#define RTMP2 %ymm13
#define RTMP3 %ymm14
#define RTMP4 %ymm15

#define RNOTx %xmm10
#define RTMP0x %xmm11
#define RTMP1x %xmm12
#define RTMP2x %xmm13
#define RTMP3x %xmm14
#define RTMP4x %xmm15

/**********************************************************************
  helper macros
 **********************************************************************/

/* vector 32-bit rotation to left */
#define vec_rol(reg, nleft, tmp) \
        vpslld $(nleft), reg, tmp; \
        vpsrld $(32 - (nleft)), reg, reg; \
        vpor tmp, reg, reg;

/* vector 32-bit rotation to right */
#define vec_ror(reg, nright, tmp) \
        vec_rol(reg, 32 - nright, tmp)
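
/* For reference, a minimal C sketch (editorial, not part of the original
 * source; rol32/ror32 are illustrative names) of the per-lane rotation these
 * macros implement:
 *
 *   uint32_t rol32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }
 *   uint32_t ror32(uint32_t x, unsigned n) { return rol32(x, 32 - n); }
 *
 * vec_rol applies rol32 to each of the eight 32-bit lanes of a ymm register,
 * recombining the two shifted halves with vpor via the tmp register.
 */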

/* 4x4 32-bit integer matrix transpose */
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x1, x0, x0; \
        \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x2; \
        \
        vpunpckhqdq t1, x0, x1; \
        vpunpcklqdq t1, x0, x0; \
        \
        vpunpckhqdq x2, t2, x3; \
        vpunpcklqdq x2, t2, x2;
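
/* Layout note (editorial): the unpack instructions operate independently on
 * the two 128-bit lanes of each ymm register, so this is a per-lane 4x4
 * transpose.  Applied to the loaded input, where each ymm register holds two
 * consecutive 128-bit blocks, it gathers the same 32-bit state word of eight
 * different blocks into one register, which is the layout the bitsliced
 * S-box macros below operate on.
 */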

/**********************************************************************
  16-way serpent
 **********************************************************************/

/*
 * These are the S-Boxes of Serpent from the following research paper.
 *
 *  D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
 *  (New York, New York, USA), pp. 317–329, National Institute of Standards
 *  and Technology, 2000.
 *
 * The paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
 */
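
/* Editorial note: each SBOXn/SBOXn_INVERSE macro below is a bitsliced
 * realisation of the corresponding 4-bit Serpent S-box, expressed as a
 * sequence of logical operations on the state registers r0..r3 with r4 as
 * scratch.  The S-box leaves its output in a permuted register order; the
 * permutation is not undone here, it is carried forward through the
 * `na*'/`nb*' arguments of the ROUND macros.
 */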

#define SBOX0(r0, r1, r2, r3, r4) \
        vpxor r0, r3, r3; vmovdqa r1, r4; \
        vpand r3, r1, r1; vpxor r2, r4, r4; \
        vpxor r0, r1, r1; vpor r3, r0, r0; \
        vpxor r4, r0, r0; vpxor r3, r4, r4; \
        vpxor r2, r3, r3; vpor r1, r2, r2; \
        vpxor r4, r2, r2; vpxor RNOT, r4, r4; \
        vpor r1, r4, r4; vpxor r3, r1, r1; \
        vpxor r4, r1, r1; vpor r0, r3, r3; \
        vpxor r3, r1, r1; vpxor r3, r4, r4;

#define SBOX0_INVERSE(r0, r1, r2, r3, r4) \
        vpxor RNOT, r2, r2; vmovdqa r1, r4; \
        vpor r0, r1, r1; vpxor RNOT, r4, r4; \
        vpxor r2, r1, r1; vpor r4, r2, r2; \
        vpxor r3, r1, r1; vpxor r4, r0, r0; \
        vpxor r0, r2, r2; vpand r3, r0, r0; \
        vpxor r0, r4, r4; vpor r1, r0, r0; \
        vpxor r2, r0, r0; vpxor r4, r3, r3; \
        vpxor r1, r2, r2; vpxor r0, r3, r3; \
        vpxor r1, r3, r3; \
        vpand r3, r2, r2; \
        vpxor r2, r4, r4;

#define SBOX1(r0, r1, r2, r3, r4) \
        vpxor RNOT, r0, r0; vpxor RNOT, r2, r2; \
        vmovdqa r0, r4; vpand r1, r0, r0; \
        vpxor r0, r2, r2; vpor r3, r0, r0; \
        vpxor r2, r3, r3; vpxor r0, r1, r1; \
        vpxor r4, r0, r0; vpor r1, r4, r4; \
        vpxor r3, r1, r1; vpor r0, r2, r2; \
        vpand r4, r2, r2; vpxor r1, r0, r0; \
        vpand r2, r1, r1; \
        vpxor r0, r1, r1; vpand r2, r0, r0; \
        vpxor r4, r0, r0;

#define SBOX1_INVERSE(r0, r1, r2, r3, r4) \
        vmovdqa r1, r4; vpxor r3, r1, r1; \
        vpand r1, r3, r3; vpxor r2, r4, r4; \
        vpxor r0, r3, r3; vpor r1, r0, r0; \
        vpxor r3, r2, r2; vpxor r4, r0, r0; \
        vpor r2, r0, r0; vpxor r3, r1, r1; \
        vpxor r1, r0, r0; vpor r3, r1, r1; \
        vpxor r0, r1, r1; vpxor RNOT, r4, r4; \
        vpxor r1, r4, r4; vpor r0, r1, r1; \
        vpxor r0, r1, r1; \
        vpor r4, r1, r1; \
        vpxor r1, r3, r3;

#define SBOX2(r0, r1, r2, r3, r4) \
        vmovdqa r0, r4; vpand r2, r0, r0; \
        vpxor r3, r0, r0; vpxor r1, r2, r2; \
        vpxor r0, r2, r2; vpor r4, r3, r3; \
        vpxor r1, r3, r3; vpxor r2, r4, r4; \
        vmovdqa r3, r1; vpor r4, r3, r3; \
        vpxor r0, r3, r3; vpand r1, r0, r0; \
        vpxor r0, r4, r4; vpxor r3, r1, r1; \
        vpxor r4, r1, r1; vpxor RNOT, r4, r4;

#define SBOX2_INVERSE(r0, r1, r2, r3, r4) \
        vpxor r3, r2, r2; vpxor r0, r3, r3; \
        vmovdqa r3, r4; vpand r2, r3, r3; \
        vpxor r1, r3, r3; vpor r2, r1, r1; \
        vpxor r4, r1, r1; vpand r3, r4, r4; \
        vpxor r3, r2, r2; vpand r0, r4, r4; \
        vpxor r2, r4, r4; vpand r1, r2, r2; \
        vpor r0, r2, r2; vpxor RNOT, r3, r3; \
        vpxor r3, r2, r2; vpxor r3, r0, r0; \
        vpand r1, r0, r0; vpxor r4, r3, r3; \
        vpxor r0, r3, r3;

#define SBOX3(r0, r1, r2, r3, r4) \
        vmovdqa r0, r4; vpor r3, r0, r0; \
        vpxor r1, r3, r3; vpand r4, r1, r1; \
        vpxor r2, r4, r4; vpxor r3, r2, r2; \
        vpand r0, r3, r3; vpor r1, r4, r4; \
        vpxor r4, r3, r3; vpxor r1, r0, r0; \
        vpand r0, r4, r4; vpxor r3, r1, r1; \
        vpxor r2, r4, r4; vpor r0, r1, r1; \
        vpxor r2, r1, r1; vpxor r3, r0, r0; \
        vmovdqa r1, r2; vpor r3, r1, r1; \
        vpxor r0, r1, r1;

#define SBOX3_INVERSE(r0, r1, r2, r3, r4) \
        vmovdqa r2, r4; vpxor r1, r2, r2; \
        vpxor r2, r0, r0; vpand r2, r4, r4; \
        vpxor r0, r4, r4; vpand r1, r0, r0; \
        vpxor r3, r1, r1; vpor r4, r3, r3; \
        vpxor r3, r2, r2; vpxor r3, r0, r0; \
        vpxor r4, r1, r1; vpand r2, r3, r3; \
        vpxor r1, r3, r3; vpxor r0, r1, r1; \
        vpor r2, r1, r1; vpxor r3, r0, r0; \
        vpxor r4, r1, r1; \
        vpxor r1, r0, r0;

#define SBOX4(r0, r1, r2, r3, r4) \
        vpxor r3, r1, r1; vpxor RNOT, r3, r3; \
        vpxor r3, r2, r2; vpxor r0, r3, r3; \
        vmovdqa r1, r4; vpand r3, r1, r1; \
        vpxor r2, r1, r1; vpxor r3, r4, r4; \
        vpxor r4, r0, r0; vpand r4, r2, r2; \
        vpxor r0, r2, r2; vpand r1, r0, r0; \
        vpxor r0, r3, r3; vpor r1, r4, r4; \
        vpxor r0, r4, r4; vpor r3, r0, r0; \
        vpxor r2, r0, r0; vpand r3, r2, r2; \
        vpxor RNOT, r0, r0; vpxor r2, r4, r4;

#define SBOX4_INVERSE(r0, r1, r2, r3, r4) \
        vmovdqa r2, r4; vpand r3, r2, r2; \
        vpxor r1, r2, r2; vpor r3, r1, r1; \
        vpand r0, r1, r1; vpxor r2, r4, r4; \
        vpxor r1, r4, r4; vpand r2, r1, r1; \
        vpxor RNOT, r0, r0; vpxor r4, r3, r3; \
        vpxor r3, r1, r1; vpand r0, r3, r3; \
        vpxor r2, r3, r3; vpxor r1, r0, r0; \
        vpand r0, r2, r2; vpxor r0, r3, r3; \
        vpxor r4, r2, r2; \
        vpor r3, r2, r2; vpxor r0, r3, r3; \
        vpxor r1, r2, r2;

#define SBOX5(r0, r1, r2, r3, r4) \
        vpxor r1, r0, r0; vpxor r3, r1, r1; \
        vpxor RNOT, r3, r3; vmovdqa r1, r4; \
        vpand r0, r1, r1; vpxor r3, r2, r2; \
        vpxor r2, r1, r1; vpor r4, r2, r2; \
        vpxor r3, r4, r4; vpand r1, r3, r3; \
        vpxor r0, r3, r3; vpxor r1, r4, r4; \
        vpxor r2, r4, r4; vpxor r0, r2, r2; \
        vpand r3, r0, r0; vpxor RNOT, r2, r2; \
        vpxor r4, r0, r0; vpor r3, r4, r4; \
        vpxor r4, r2, r2;

#define SBOX5_INVERSE(r0, r1, r2, r3, r4) \
        vpxor RNOT, r1, r1; vmovdqa r3, r4; \
        vpxor r1, r2, r2; vpor r0, r3, r3; \
        vpxor r2, r3, r3; vpor r1, r2, r2; \
        vpand r0, r2, r2; vpxor r3, r4, r4; \
        vpxor r4, r2, r2; vpor r0, r4, r4; \
        vpxor r1, r4, r4; vpand r2, r1, r1; \
        vpxor r3, r1, r1; vpxor r2, r4, r4; \
        vpand r4, r3, r3; vpxor r1, r4, r4; \
        vpxor r4, r3, r3; vpxor RNOT, r4, r4; \
        vpxor r0, r3, r3;

#define SBOX6(r0, r1, r2, r3, r4) \
        vpxor RNOT, r2, r2; vmovdqa r3, r4; \
        vpand r0, r3, r3; vpxor r4, r0, r0; \
        vpxor r2, r3, r3; vpor r4, r2, r2; \
        vpxor r3, r1, r1; vpxor r0, r2, r2; \
        vpor r1, r0, r0; vpxor r1, r2, r2; \
        vpxor r0, r4, r4; vpor r3, r0, r0; \
        vpxor r2, r0, r0; vpxor r3, r4, r4; \
        vpxor r0, r4, r4; vpxor RNOT, r3, r3; \
        vpand r4, r2, r2; \
        vpxor r3, r2, r2;

#define SBOX6_INVERSE(r0, r1, r2, r3, r4) \
        vpxor r2, r0, r0; vmovdqa r2, r4; \
        vpand r0, r2, r2; vpxor r3, r4, r4; \
        vpxor RNOT, r2, r2; vpxor r1, r3, r3; \
        vpxor r3, r2, r2; vpor r0, r4, r4; \
        vpxor r2, r0, r0; vpxor r4, r3, r3; \
        vpxor r1, r4, r4; vpand r3, r1, r1; \
        vpxor r0, r1, r1; vpxor r3, r0, r0; \
        vpor r2, r0, r0; vpxor r1, r3, r3; \
        vpxor r0, r4, r4;

#define SBOX7(r0, r1, r2, r3, r4) \
        vmovdqa r1, r4; vpor r2, r1, r1; \
        vpxor r3, r1, r1; vpxor r2, r4, r4; \
        vpxor r1, r2, r2; vpor r4, r3, r3; \
        vpand r0, r3, r3; vpxor r2, r4, r4; \
        vpxor r1, r3, r3; vpor r4, r1, r1; \
        vpxor r0, r1, r1; vpor r4, r0, r0; \
        vpxor r2, r0, r0; vpxor r4, r1, r1; \
        vpxor r1, r2, r2; vpand r0, r1, r1; \
        vpxor r4, r1, r1; vpxor RNOT, r2, r2; \
        vpor r0, r2, r2; \
        vpxor r2, r4, r4;

#define SBOX7_INVERSE(r0, r1, r2, r3, r4) \
        vmovdqa r2, r4; vpxor r0, r2, r2; \
        vpand r3, r0, r0; vpor r3, r4, r4; \
        vpxor RNOT, r2, r2; vpxor r1, r3, r3; \
        vpor r0, r1, r1; vpxor r2, r0, r0; \
        vpand r4, r2, r2; vpand r4, r3, r3; \
        vpxor r2, r1, r1; vpxor r0, r2, r2; \
        vpor r2, r0, r0; vpxor r1, r4, r4; \
        vpxor r3, r0, r0; vpxor r4, r3, r3; \
        vpor r0, r4, r4; vpxor r2, r3, r3; \
        vpxor r2, r4, r4;

/* Apply SBOX number WHICH to the block. */
#define SBOX(which, r0, r1, r2, r3, r4) \
        SBOX##which (r0, r1, r2, r3, r4)

/* Apply inverse SBOX number WHICH to the block. */
#define SBOX_INVERSE(which, r0, r1, r2, r3, r4) \
        SBOX##which##_INVERSE (r0, r1, r2, r3, r4)

/* XOR round key into block state in r0,r1,r2,r3. r4 used as temporary. */
#define BLOCK_XOR_KEY(r0, r1, r2, r3, r4, round) \
        vpbroadcastd (ctx_keys + (round) * 16 + 0 * 4)(CTX), r4; \
        vpxor r4, r0, r0; \
        vpbroadcastd (ctx_keys + (round) * 16 + 1 * 4)(CTX), r4; \
        vpxor r4, r1, r1; \
        vpbroadcastd (ctx_keys + (round) * 16 + 2 * 4)(CTX), r4; \
        vpxor r4, r2, r2; \
        vpbroadcastd (ctx_keys + (round) * 16 + 3 * 4)(CTX), r4; \
        vpxor r4, r3, r3;
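
/* For reference, a C-style sketch (editorial; array names are illustrative)
 * of what BLOCK_XOR_KEY does: each 32-bit word of the round's subkey is
 * broadcast to all eight lanes and XORed into the matching state word of all
 * eight blocks held in one register group:
 *
 *   for (int word = 0; word < 4; word++)
 *     for (int blk = 0; blk < 8; blk++)
 *       state[word][blk] ^= keys[round][word];
 */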

/* Apply the linear transformation to BLOCK. */
#define LINEAR_TRANSFORMATION(r0, r1, r2, r3, r4) \
        vec_rol(r0, 13, r4); \
        vec_rol(r2, 3, r4); \
        vpxor r0, r1, r1; \
        vpxor r2, r1, r1; \
        vpslld $3, r0, r4; \
        vpxor r2, r3, r3; \
        vpxor r4, r3, r3; \
        vec_rol(r1, 1, r4); \
        vec_rol(r3, 7, r4); \
        vpxor r1, r0, r0; \
        vpxor r3, r0, r0; \
        vpslld $7, r1, r4; \
        vpxor r3, r2, r2; \
        vpxor r4, r2, r2; \
        vec_rol(r0, 5, r4); \
        vec_rol(r2, 22, r4);
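
/* For reference, a scalar C sketch (editorial) of the Serpent linear
 * transformation applied above to every block in parallel, with rol32 as
 * sketched next to vec_rol and x0..x3 standing for r0..r3:
 *
 *   x0 = rol32(x0, 13);  x2 = rol32(x2, 3);
 *   x1 ^= x0 ^ x2;       x3 ^= x2 ^ (x0 << 3);
 *   x1 = rol32(x1, 1);   x3 = rol32(x3, 7);
 *   x0 ^= x1 ^ x3;       x2 ^= x3 ^ (x1 << 7);
 *   x0 = rol32(x0, 5);   x2 = rol32(x2, 22);
 */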

/* Apply the inverse linear transformation to BLOCK. */
#define LINEAR_TRANSFORMATION_INVERSE(r0, r1, r2, r3, r4) \
        vec_ror(r2, 22, r4); \
        vec_ror(r0, 5, r4); \
        vpslld $7, r1, r4; \
        vpxor r3, r2, r2; \
        vpxor r4, r2, r2; \
        vpxor r1, r0, r0; \
        vpxor r3, r0, r0; \
        vec_ror(r3, 7, r4); \
        vec_ror(r1, 1, r4); \
        vpslld $3, r0, r4; \
        vpxor r2, r3, r3; \
        vpxor r4, r3, r3; \
        vpxor r0, r1, r1; \
        vpxor r2, r1, r1; \
        vec_ror(r2, 3, r4); \
        vec_ror(r0, 13, r4);
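
/* Editorial note: this is the forward transformation run backwards, with each
 * rotate-left replaced by the matching rotate-right and the XOR/shift steps
 * undone in reverse order.
 */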

/* Apply a Serpent round to sixteen parallel blocks, using the subkey selected
   by `round'.  The `na*'/`nb*' arguments name the same registers in the order
   the S-box leaves its output. */
#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
              b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
        BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
        SBOX (which, a0, a1, a2, a3, a4); \
        BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
        SBOX (which, b0, b1, b2, b3, b4); \
        LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4); \
        LINEAR_TRANSFORMATION (nb0, nb1, nb2, nb3, nb4);

/* Apply the last Serpent round to sixteen parallel blocks: subkey `round',
   the S-box, and the final key mixing with subkey `round'+1 in place of the
   linear transformation. */
#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
                   b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
        BLOCK_XOR_KEY (a0, a1, a2, a3, a4, round); \
        SBOX (which, a0, a1, a2, a3, a4); \
        BLOCK_XOR_KEY (b0, b1, b2, b3, b4, round); \
        SBOX (which, b0, b1, b2, b3, b4); \
        BLOCK_XOR_KEY (na0, na1, na2, na3, na4, ((round) + 1)); \
        BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, ((round) + 1));

/* Apply an inverse Serpent round to sixteen parallel blocks: undo the linear
   transformation, apply the inverse S-box and XOR in subkey `round'. */
#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
                      na0, na1, na2, na3, na4, \
                      b0, b1, b2, b3, b4, \
                      nb0, nb1, nb2, nb3, nb4) \
        LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4); \
        LINEAR_TRANSFORMATION_INVERSE (b0, b1, b2, b3, b4); \
        SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
        BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
        SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
        BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);

/* Apply the first inverse Serpent round to sixteen parallel blocks: undo the
   final key mixing with subkey `round'+1, then apply the inverse S-box and
   XOR in subkey `round'. */
#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
                            na0, na1, na2, na3, na4, \
                            b0, b1, b2, b3, b4, \
                            nb0, nb1, nb2, nb3, nb4) \
        BLOCK_XOR_KEY (a0, a1, a2, a3, a4, ((round) + 1)); \
        BLOCK_XOR_KEY (b0, b1, b2, b3, b4, ((round) + 1)); \
        SBOX_INVERSE (which, a0, a1, a2, a3, a4); \
        BLOCK_XOR_KEY (na0, na1, na2, na3, na4, round); \
        SBOX_INVERSE (which, b0, b1, b2, b3, b4); \
        BLOCK_XOR_KEY (nb0, nb1, nb2, nb3, nb4, round);

.text

.align 16
ELF(.type __serpent_enc_blk16,@function;)
__serpent_enc_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
         *                                              plaintext blocks
         * output:
         *      RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: sixteen parallel
         *                                              ciphertext blocks
         */
        CFI_STARTPROC();

        vpcmpeqd RNOT, RNOT, RNOT;

        transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
        transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);

        ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
                RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
        ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
                RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
        ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
                RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
        ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
                RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
        ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
                RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
        ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
                RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
        ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
                RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
        ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
                RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
        ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
                RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
        ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
                RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
        ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
                RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
        ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
                RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
        ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
                RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
        ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
                RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
        ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
                RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
        ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
                RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
        ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
                RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
        ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
                RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
        ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
                RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
        ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
                RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
        ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
                RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
        ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
                RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
        ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
                RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
        ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
                RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
        ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
                RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
        ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
                RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
        ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
                RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
        ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
                RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
        ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
                RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
        ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
                RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
        ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
                RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
        ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
                RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);

        transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1);
        transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1);

        ret_spec_stop;
        CFI_ENDPROC();
ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;)

.align 16
ELF(.type __serpent_dec_blk16,@function;)
__serpent_dec_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
         *                                              ciphertext blocks
         * output:
         *      RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
         *                                              plaintext blocks
         */
        CFI_STARTPROC();

        vpcmpeqd RNOT, RNOT, RNOT;

        transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
        transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);

        ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
                RA3, RA0, RA1, RA4, RA2,
                RB0, RB1, RB2, RB3, RB4,
                RB3, RB0, RB1, RB4, RB2);
        ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
                RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
        ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
                RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
        ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
                RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
        ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
                RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
        ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
                RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
        ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
                RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
        ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
                RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
        ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
                RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
        ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
                RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
        ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
                RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
        ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
                RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
        ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
                RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
        ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
                RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
        ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
                RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
        ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
                RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
        ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
                RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
        ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
                RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
        ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
                RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
        ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
                RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
        ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
                RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
        ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
                RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
        ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
                RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
        ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
                RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
        ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
                RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
        ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
                RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
        ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
                RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
        ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
                RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
        ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
                RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
        ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
                RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
        ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
                RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
        ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
                RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);

        transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1);
        transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1);

        ret_spec_stop;
        CFI_ENDPROC();
ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;)

.align 16
.globl _gcry_serpent_avx2_blk16
ELF(.type _gcry_serpent_avx2_blk16,@function;)
_gcry_serpent_avx2_blk16:
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %ecx: encrypt
         */
        CFI_STARTPROC();

        vmovdqu (0 * 32)(%rdx), RA0;
        vmovdqu (1 * 32)(%rdx), RA1;
        vmovdqu (2 * 32)(%rdx), RA2;
        vmovdqu (3 * 32)(%rdx), RA3;
        vmovdqu (4 * 32)(%rdx), RB0;
        vmovdqu (5 * 32)(%rdx), RB1;
        vmovdqu (6 * 32)(%rdx), RB2;
        vmovdqu (7 * 32)(%rdx), RB3;

        testl %ecx, %ecx;
        jz .Lblk16_dec;
        call __serpent_enc_blk16;
        vmovdqu RA4, (0 * 32)(%rsi);
        vmovdqu RA1, (1 * 32)(%rsi);
        vmovdqu RA2, (2 * 32)(%rsi);
        vmovdqu RA0, (3 * 32)(%rsi);
        vmovdqu RB4, (4 * 32)(%rsi);
        vmovdqu RB1, (5 * 32)(%rsi);
        vmovdqu RB2, (6 * 32)(%rsi);
        vmovdqu RB0, (7 * 32)(%rsi);
        jmp .Lblk16_end;

.Lblk16_dec:
        call __serpent_dec_blk16;
        vmovdqu RA0, (0 * 32)(%rsi);
        vmovdqu RA1, (1 * 32)(%rsi);
        vmovdqu RA2, (2 * 32)(%rsi);
        vmovdqu RA3, (3 * 32)(%rsi);
        vmovdqu RB0, (4 * 32)(%rsi);
        vmovdqu RB1, (5 * 32)(%rsi);
        vmovdqu RB2, (6 * 32)(%rsi);
        vmovdqu RB3, (7 * 32)(%rsi);

.Lblk16_end:
        vzeroall;
        ret_spec_stop;
        CFI_ENDPROC();
ELF(.size _gcry_serpent_avx2_blk16,.-_gcry_serpent_avx2_blk16;)

#define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
        vpsubq minus_one, x, x; \
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;
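
/* Editorial sketch: with `minus_one' holding { -1, 0 } in each 128-bit lane
 * (as prepared in the CTR function below), this increments a little-endian
 * 128-bit counter per lane.  Roughly, per lane:
 *
 *   carry = (lo == 0xffffffffffffffffULL);   // vpcmpeqq mask, kept in tmp
 *   lo   += 1;                               // vpsubq: x - (-1)
 *   hi   += carry;                           // tmp shifted to the high qword
 *                                            // and subtracted as 0 or -1
 */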

.align 16
.globl _gcry_serpent_avx2_ctr_enc
ELF(.type _gcry_serpent_avx2_ctr_enc,@function;)
_gcry_serpent_avx2_ctr_enc:
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv (big endian, 128bit)
         */
        CFI_STARTPROC();

        movq 8(%rcx), %rax;
        bswapq %rax;

        vzeroupper;

        vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
        vpcmpeqd RNOT, RNOT, RNOT;
        vpsrldq $8, RNOT, RNOT;   /* ab: -1:0 ; cd: -1:0 */
        vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */

        /* load IV and byteswap */
        vmovdqu (%rcx), RTMP4x;
        vpshufb RTMP3x, RTMP4x, RTMP4x;
        vmovdqa RTMP4x, RTMP0x;
        inc_le128(RTMP4x, RNOTx, RTMP1x);
        vinserti128 $1, RTMP4x, RTMP0, RTMP0;
        vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */

        /* check need for handling 64-bit overflow and carry */
        cmpq $(0xffffffffffffffff - 16), %rax;
        ja .Lhandle_ctr_carry;
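
        /* Editorial note: %rax holds the low 64 bits of the big-endian
         * counter.  If adding 16 to it cannot wrap past 64 bits, none of the
         * 16 counter values carries into the high half, so the fast path
         * below simply adds 2 to the low qword of each 128-bit lane per step
         * (vpsubq of -2); otherwise the slow path uses inc_le128, which
         * propagates the carry. */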

        /* construct IVs */
        vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
        vpshufb RTMP3, RTMP0, RA1;
        vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
        vpshufb RTMP3, RTMP0, RA2;
        vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
        vpshufb RTMP3, RTMP0, RA3;
        vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
        vpshufb RTMP3, RTMP0, RB0;
        vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
        vpshufb RTMP3, RTMP0, RB1;
        vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
        vpshufb RTMP3, RTMP0, RB2;
        vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
        vpshufb RTMP3, RTMP0, RB3;
        vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
        vpshufb RTMP3x, RTMP0x, RTMP0x;
        jmp .Lctr_carry_done;

.Lhandle_ctr_carry:
        /* construct IVs */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
        inc_le128(RTMP0, RNOT, RTMP1);
        inc_le128(RTMP0, RNOT, RTMP1);
        vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
        inc_le128(RTMP0, RNOT, RTMP1);
        vextracti128 $1, RTMP0, RTMP0x;
        vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */

.align 4
.Lctr_carry_done:
        /* store new IV */
        vmovdqu RTMP0x, (%rcx);

        call __serpent_enc_blk16;

        vpxor (0 * 32)(%rdx), RA4, RA4;
        vpxor (1 * 32)(%rdx), RA1, RA1;
        vpxor (2 * 32)(%rdx), RA2, RA2;
        vpxor (3 * 32)(%rdx), RA0, RA0;
        vpxor (4 * 32)(%rdx), RB4, RB4;
        vpxor (5 * 32)(%rdx), RB1, RB1;
        vpxor (6 * 32)(%rdx), RB2, RB2;
        vpxor (7 * 32)(%rdx), RB0, RB0;

        vmovdqu RA4, (0 * 32)(%rsi);
        vmovdqu RA1, (1 * 32)(%rsi);
        vmovdqu RA2, (2 * 32)(%rsi);
        vmovdqu RA0, (3 * 32)(%rsi);
        vmovdqu RB4, (4 * 32)(%rsi);
        vmovdqu RB1, (5 * 32)(%rsi);
        vmovdqu RB2, (6 * 32)(%rsi);
        vmovdqu RB0, (7 * 32)(%rsi);

        vzeroall;
        ret_spec_stop;
        CFI_ENDPROC();
ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;)

.align 16
.globl _gcry_serpent_avx2_cbc_dec
ELF(.type _gcry_serpent_avx2_cbc_dec,@function;)
_gcry_serpent_avx2_cbc_dec:
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv
         */
        CFI_STARTPROC();

        vzeroupper;

        vmovdqu (0 * 32)(%rdx), RA0;
        vmovdqu (1 * 32)(%rdx), RA1;
        vmovdqu (2 * 32)(%rdx), RA2;
        vmovdqu (3 * 32)(%rdx), RA3;
        vmovdqu (4 * 32)(%rdx), RB0;
        vmovdqu (5 * 32)(%rdx), RB1;
        vmovdqu (6 * 32)(%rdx), RB2;
        vmovdqu (7 * 32)(%rdx), RB3;

        call __serpent_dec_blk16;

        vmovdqu (%rcx), RNOTx;
        vinserti128 $1, (%rdx), RNOT, RNOT;
        vpxor RNOT, RA0, RA0;
        vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
        vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
        vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
        vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
        vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
        vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
        vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
        vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
        vmovdqu RNOTx, (%rcx); /* store new IV */

        vmovdqu RA0, (0 * 32)(%rsi);
        vmovdqu RA1, (1 * 32)(%rsi);
        vmovdqu RA2, (2 * 32)(%rsi);
        vmovdqu RA3, (3 * 32)(%rsi);
        vmovdqu RB0, (4 * 32)(%rsi);
        vmovdqu RB1, (5 * 32)(%rsi);
        vmovdqu RB2, (6 * 32)(%rsi);
        vmovdqu RB3, (7 * 32)(%rsi);

        vzeroall;
        ret_spec_stop;
        CFI_ENDPROC();
ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;)

.align 16
.globl _gcry_serpent_avx2_cfb_dec
ELF(.type _gcry_serpent_avx2_cfb_dec,@function;)
_gcry_serpent_avx2_cfb_dec:
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: iv
         */
        CFI_STARTPROC();

        vzeroupper;

        /* Load input */
        vmovdqu (%rcx), RNOTx;
        vinserti128 $1, (%rdx), RNOT, RA0;
        vmovdqu (0 * 32 + 16)(%rdx), RA1;
        vmovdqu (1 * 32 + 16)(%rdx), RA2;
        vmovdqu (2 * 32 + 16)(%rdx), RA3;
        vmovdqu (3 * 32 + 16)(%rdx), RB0;
        vmovdqu (4 * 32 + 16)(%rdx), RB1;
        vmovdqu (5 * 32 + 16)(%rdx), RB2;
        vmovdqu (6 * 32 + 16)(%rdx), RB3;

        /* Update IV */
        vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
        vmovdqu RNOTx, (%rcx);

        call __serpent_enc_blk16;

        vpxor (0 * 32)(%rdx), RA4, RA4;
        vpxor (1 * 32)(%rdx), RA1, RA1;
        vpxor (2 * 32)(%rdx), RA2, RA2;
        vpxor (3 * 32)(%rdx), RA0, RA0;
        vpxor (4 * 32)(%rdx), RB4, RB4;
        vpxor (5 * 32)(%rdx), RB1, RB1;
        vpxor (6 * 32)(%rdx), RB2, RB2;
        vpxor (7 * 32)(%rdx), RB0, RB0;

        vmovdqu RA4, (0 * 32)(%rsi);
        vmovdqu RA1, (1 * 32)(%rsi);
        vmovdqu RA2, (2 * 32)(%rsi);
        vmovdqu RA0, (3 * 32)(%rsi);
        vmovdqu RB4, (4 * 32)(%rsi);
        vmovdqu RB1, (5 * 32)(%rsi);
        vmovdqu RB2, (6 * 32)(%rsi);
        vmovdqu RB0, (7 * 32)(%rsi);

        vzeroall;
        ret_spec_stop;
        CFI_ENDPROC();
ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;)

.align 16
.globl _gcry_serpent_avx2_ocb_enc
ELF(.type _gcry_serpent_avx2_ocb_enc,@function;)
_gcry_serpent_avx2_ocb_enc:
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: offset
         *      %r8 : checksum
         *      %r9 : L pointers (void *L[16])
         */
        CFI_STARTPROC();

        vzeroupper;

        subq $(4 * 8), %rsp;
        CFI_ADJUST_CFA_OFFSET(4 * 8);

        movq %r10, (0 * 8)(%rsp);
        movq %r11, (1 * 8)(%rsp);
        movq %r12, (2 * 8)(%rsp);
        movq %r13, (3 * 8)(%rsp);
        CFI_REL_OFFSET(%r10, 0 * 8);
        CFI_REL_OFFSET(%r11, 1 * 8);
        CFI_REL_OFFSET(%r12, 2 * 8);
        CFI_REL_OFFSET(%r13, 3 * 8);

        vmovdqu (%rcx), RTMP0x;
        vmovdqu (%r8), RTMP1x;

        /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
        /* Checksum_i = Checksum_{i-1} xor P_i */
        /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */

#define OCB_INPUT(n, l0reg, l1reg, yreg) \
        vmovdqu (n * 32)(%rdx), yreg; \
        vpxor (l0reg), RTMP0x, RNOTx; \
        vpxor (l1reg), RNOTx, RTMP0x; \
        vinserti128 $1, RTMP0x, RNOT, RNOT; \
        vpxor yreg, RTMP1, RTMP1; \
        vpxor yreg, RNOT, yreg; \
        vmovdqu RNOT, (n * 32)(%rsi);
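
/* Editorial note on OCB_INPUT: each invocation handles two blocks.  l0reg and
 * l1reg point at the L values for those blocks; the running offset in RTMP0x
 * is advanced twice and the two per-block offsets are packed into the lanes
 * of RNOT.  The plaintext is folded into the checksum (RTMP1), the offset is
 * XORed into the plaintext before encryption, and the offsets themselves are
 * stashed in the destination buffer so they can be XORed with the cipher
 * output after the __serpent_enc_blk16 call.
 */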

        movq (0 * 8)(%r9), %r10;
        movq (1 * 8)(%r9), %r11;
        movq (2 * 8)(%r9), %r12;
        movq (3 * 8)(%r9), %r13;
        OCB_INPUT(0, %r10, %r11, RA0);
        OCB_INPUT(1, %r12, %r13, RA1);
        movq (4 * 8)(%r9), %r10;
        movq (5 * 8)(%r9), %r11;
        movq (6 * 8)(%r9), %r12;
        movq (7 * 8)(%r9), %r13;
        OCB_INPUT(2, %r10, %r11, RA2);
        OCB_INPUT(3, %r12, %r13, RA3);
        movq (8 * 8)(%r9), %r10;
        movq (9 * 8)(%r9), %r11;
        movq (10 * 8)(%r9), %r12;
        movq (11 * 8)(%r9), %r13;
        OCB_INPUT(4, %r10, %r11, RB0);
        OCB_INPUT(5, %r12, %r13, RB1);
        movq (12 * 8)(%r9), %r10;
        movq (13 * 8)(%r9), %r11;
        movq (14 * 8)(%r9), %r12;
        movq (15 * 8)(%r9), %r13;
        OCB_INPUT(6, %r10, %r11, RB2);
        OCB_INPUT(7, %r12, %r13, RB3);
#undef OCB_INPUT

        vextracti128 $1, RTMP1, RNOTx;
        vmovdqu RTMP0x, (%rcx);
        vpxor RNOTx, RTMP1x, RTMP1x;
        vmovdqu RTMP1x, (%r8);

        movq (0 * 8)(%rsp), %r10;
        movq (1 * 8)(%rsp), %r11;
        movq (2 * 8)(%rsp), %r12;
        movq (3 * 8)(%rsp), %r13;
        CFI_RESTORE(%r10);
        CFI_RESTORE(%r11);
        CFI_RESTORE(%r12);
        CFI_RESTORE(%r13);

        call __serpent_enc_blk16;

        addq $(4 * 8), %rsp;
        CFI_ADJUST_CFA_OFFSET(-4 * 8);

        vpxor (0 * 32)(%rsi), RA4, RA4;
        vpxor (1 * 32)(%rsi), RA1, RA1;
        vpxor (2 * 32)(%rsi), RA2, RA2;
        vpxor (3 * 32)(%rsi), RA0, RA0;
        vpxor (4 * 32)(%rsi), RB4, RB4;
        vpxor (5 * 32)(%rsi), RB1, RB1;
        vpxor (6 * 32)(%rsi), RB2, RB2;
        vpxor (7 * 32)(%rsi), RB0, RB0;

        vmovdqu RA4, (0 * 32)(%rsi);
        vmovdqu RA1, (1 * 32)(%rsi);
        vmovdqu RA2, (2 * 32)(%rsi);
        vmovdqu RA0, (3 * 32)(%rsi);
        vmovdqu RB4, (4 * 32)(%rsi);
        vmovdqu RB1, (5 * 32)(%rsi);
        vmovdqu RB2, (6 * 32)(%rsi);
        vmovdqu RB0, (7 * 32)(%rsi);

        vzeroall;
        ret_spec_stop;
        CFI_ENDPROC();
ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;)

.align 16
.globl _gcry_serpent_avx2_ocb_dec
ELF(.type _gcry_serpent_avx2_ocb_dec,@function;)
_gcry_serpent_avx2_ocb_dec:
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (16 blocks)
         *      %rdx: src (16 blocks)
         *      %rcx: offset
         *      %r8 : checksum
         *      %r9 : L pointers (void *L[16])
         */
        CFI_STARTPROC();

        vzeroupper;

        subq $(4 * 8), %rsp;
        CFI_ADJUST_CFA_OFFSET(4 * 8);

        movq %r10, (0 * 8)(%rsp);
        movq %r11, (1 * 8)(%rsp);
        movq %r12, (2 * 8)(%rsp);
        movq %r13, (3 * 8)(%rsp);
        CFI_REL_OFFSET(%r10, 0 * 8);
        CFI_REL_OFFSET(%r11, 1 * 8);
        CFI_REL_OFFSET(%r12, 2 * 8);
        CFI_REL_OFFSET(%r13, 3 * 8);

        vmovdqu (%rcx), RTMP0x;

        /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
        /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */

#define OCB_INPUT(n, l0reg, l1reg, yreg) \
        vmovdqu (n * 32)(%rdx), yreg; \
        vpxor (l0reg), RTMP0x, RNOTx; \
        vpxor (l1reg), RNOTx, RTMP0x; \
        vinserti128 $1, RTMP0x, RNOT, RNOT; \
        vpxor yreg, RNOT, yreg; \
        vmovdqu RNOT, (n * 32)(%rsi);

        movq (0 * 8)(%r9), %r10;
        movq (1 * 8)(%r9), %r11;
        movq (2 * 8)(%r9), %r12;
        movq (3 * 8)(%r9), %r13;
        OCB_INPUT(0, %r10, %r11, RA0);
        OCB_INPUT(1, %r12, %r13, RA1);
        movq (4 * 8)(%r9), %r10;
        movq (5 * 8)(%r9), %r11;
        movq (6 * 8)(%r9), %r12;
        movq (7 * 8)(%r9), %r13;
        OCB_INPUT(2, %r10, %r11, RA2);
        OCB_INPUT(3, %r12, %r13, RA3);
        movq (8 * 8)(%r9), %r10;
        movq (9 * 8)(%r9), %r11;
        movq (10 * 8)(%r9), %r12;
        movq (11 * 8)(%r9), %r13;
        OCB_INPUT(4, %r10, %r11, RB0);
        OCB_INPUT(5, %r12, %r13, RB1);
        movq (12 * 8)(%r9), %r10;
        movq (13 * 8)(%r9), %r11;
        movq (14 * 8)(%r9), %r12;
        movq (15 * 8)(%r9), %r13;
        OCB_INPUT(6, %r10, %r11, RB2);
        OCB_INPUT(7, %r12, %r13, RB3);
#undef OCB_INPUT

        vmovdqu RTMP0x, (%rcx);

        movq (0 * 8)(%rsp), %r10;
        movq (1 * 8)(%rsp), %r11;
        movq (2 * 8)(%rsp), %r12;
        movq (3 * 8)(%rsp), %r13;
        CFI_RESTORE(%r10);
        CFI_RESTORE(%r11);
        CFI_RESTORE(%r12);
        CFI_RESTORE(%r13);

        call __serpent_dec_blk16;

        addq $(4 * 8), %rsp;
        CFI_ADJUST_CFA_OFFSET(-4 * 8);

        vmovdqu (%r8), RTMP1x;

        vpxor (0 * 32)(%rsi), RA0, RA0;
        vpxor (1 * 32)(%rsi), RA1, RA1;
        vpxor (2 * 32)(%rsi), RA2, RA2;
        vpxor (3 * 32)(%rsi), RA3, RA3;
        vpxor (4 * 32)(%rsi), RB0, RB0;
        vpxor (5 * 32)(%rsi), RB1, RB1;
        vpxor (6 * 32)(%rsi), RB2, RB2;
        vpxor (7 * 32)(%rsi), RB3, RB3;

        /* Checksum_i = Checksum_{i-1} xor P_i */

        vmovdqu RA0, (0 * 32)(%rsi);
        vpxor RA0, RTMP1, RTMP1;
        vmovdqu RA1, (1 * 32)(%rsi);
        vpxor RA1, RTMP1, RTMP1;
        vmovdqu RA2, (2 * 32)(%rsi);
        vpxor RA2, RTMP1, RTMP1;
        vmovdqu RA3, (3 * 32)(%rsi);
        vpxor RA3, RTMP1, RTMP1;
        vmovdqu RB0, (4 * 32)(%rsi);
        vpxor RB0, RTMP1, RTMP1;
        vmovdqu RB1, (5 * 32)(%rsi);
        vpxor RB1, RTMP1, RTMP1;
        vmovdqu RB2, (6 * 32)(%rsi);
        vpxor RB2, RTMP1, RTMP1;
        vmovdqu RB3, (7 * 32)(%rsi);
        vpxor RB3, RTMP1, RTMP1;

        vextracti128 $1, RTMP1, RNOTx;
        vpxor RNOTx, RTMP1x, RTMP1x;
        vmovdqu RTMP1x, (%r8);

        vzeroall;
        ret_spec_stop;
        CFI_ENDPROC();
ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;)

.align 16
.globl _gcry_serpent_avx2_ocb_auth
ELF(.type _gcry_serpent_avx2_ocb_auth,@function;)
_gcry_serpent_avx2_ocb_auth:
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: abuf (16 blocks)
         *      %rdx: offset
         *      %rcx: checksum
         *      %r8 : L pointers (void *L[16])
         */
        CFI_STARTPROC();

        vzeroupper;

        subq $(4 * 8), %rsp;
        CFI_ADJUST_CFA_OFFSET(4 * 8);

        movq %r10, (0 * 8)(%rsp);
        movq %r11, (1 * 8)(%rsp);
        movq %r12, (2 * 8)(%rsp);
        movq %r13, (3 * 8)(%rsp);
        CFI_REL_OFFSET(%r10, 0 * 8);
        CFI_REL_OFFSET(%r11, 1 * 8);
        CFI_REL_OFFSET(%r12, 2 * 8);
        CFI_REL_OFFSET(%r13, 3 * 8);

        vmovdqu (%rdx), RTMP0x;

        /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
        /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */

#define OCB_INPUT(n, l0reg, l1reg, yreg) \
        vmovdqu (n * 32)(%rsi), yreg; \
        vpxor (l0reg), RTMP0x, RNOTx; \
        vpxor (l1reg), RNOTx, RTMP0x; \
        vinserti128 $1, RTMP0x, RNOT, RNOT; \
        vpxor yreg, RNOT, yreg;

        movq (0 * 8)(%r8), %r10;
        movq (1 * 8)(%r8), %r11;
        movq (2 * 8)(%r8), %r12;
        movq (3 * 8)(%r8), %r13;
        OCB_INPUT(0, %r10, %r11, RA0);
        OCB_INPUT(1, %r12, %r13, RA1);
        movq (4 * 8)(%r8), %r10;
        movq (5 * 8)(%r8), %r11;
        movq (6 * 8)(%r8), %r12;
        movq (7 * 8)(%r8), %r13;
        OCB_INPUT(2, %r10, %r11, RA2);
        OCB_INPUT(3, %r12, %r13, RA3);
        movq (8 * 8)(%r8), %r10;
        movq (9 * 8)(%r8), %r11;
        movq (10 * 8)(%r8), %r12;
        movq (11 * 8)(%r8), %r13;
        OCB_INPUT(4, %r10, %r11, RB0);
        OCB_INPUT(5, %r12, %r13, RB1);
        movq (12 * 8)(%r8), %r10;
        movq (13 * 8)(%r8), %r11;
        movq (14 * 8)(%r8), %r12;
        movq (15 * 8)(%r8), %r13;
        OCB_INPUT(6, %r10, %r11, RB2);
        OCB_INPUT(7, %r12, %r13, RB3);
#undef OCB_INPUT

        vmovdqu RTMP0x, (%rdx);

        movq (0 * 8)(%rsp), %r10;
        movq (1 * 8)(%rsp), %r11;
        movq (2 * 8)(%rsp), %r12;
        movq (3 * 8)(%rsp), %r13;
        CFI_RESTORE(%r10);
        CFI_RESTORE(%r11);
        CFI_RESTORE(%r12);
        CFI_RESTORE(%r13);

        call __serpent_enc_blk16;

        addq $(4 * 8), %rsp;
        CFI_ADJUST_CFA_OFFSET(-4 * 8);

        vpxor RA4, RB4, RA4;
        vpxor RA1, RB1, RA1;
        vpxor RA2, RB2, RA2;
        vpxor RA0, RB0, RA0;

        vpxor RA4, RA1, RA1;
        vpxor RA2, RA0, RA0;

        vpxor RA1, RA0, RTMP1;

        vextracti128 $1, RTMP1, RNOTx;
        vpxor (%rcx), RTMP1x, RTMP1x;
        vpxor RNOTx, RTMP1x, RTMP1x;
        vmovdqu RTMP1x, (%rcx);

        vzeroall;
        ret_spec_stop;
        CFI_ENDPROC();
ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;)

SECTION_RODATA

ELF(.type _serpent_avx2_consts,@object)
_serpent_avx2_consts:

/* For CTR-mode IV byteswap */
.align 16
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

#endif /*defined(USE_SERPENT) && defined(ENABLE_AVX2_SUPPORT)*/
#endif /*__x86_64*/