serpent-avx512-x86.c

/* serpent-avx512-x86.c - AVX512 implementation of Serpent cipher
 *
 * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#if defined(__x86_64) || defined(__i386)
#if defined(HAVE_COMPATIBLE_CC_X86_AVX512_INTRINSICS) && \
    defined(USE_SERPENT) && defined(ENABLE_AVX512_SUPPORT)

#include <immintrin.h>
#include <string.h>
#include <stdio.h>

#include "g10lib.h"
#include "types.h"
#include "cipher.h"
#include "bithelp.h"
#include "bufhelp.h"
#include "cipher-internal.h"
#include "bulkhelp.h"

#define ALWAYS_INLINE inline __attribute__((always_inline))
#define NO_INLINE __attribute__((noinline))

/* Number of rounds per Serpent encrypt/decrypt operation. */
#define ROUNDS 32

/* Serpent works on 128 bit blocks. */
typedef unsigned int serpent_block_t[4];

/* The key schedule consists of 33 128 bit subkeys. */
typedef unsigned int serpent_subkeys_t[ROUNDS + 1][4];

#define vpunpckhdq(a, b, o)  ((o) = _mm512_unpackhi_epi32((b), (a)))
#define vpunpckldq(a, b, o)  ((o) = _mm512_unpacklo_epi32((b), (a)))
#define vpunpckhqdq(a, b, o) ((o) = _mm512_unpackhi_epi64((b), (a)))
#define vpunpcklqdq(a, b, o) ((o) = _mm512_unpacklo_epi64((b), (a)))
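
/* Note: these wrappers take their operands in assembly (AT&T) order,
 * i.e. (src2, src1, dst), which is why the intrinsic arguments appear
 * swapped. */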

#define vpbroadcastd(v) _mm512_set1_epi32(v)

#define vrol(x, s) _mm512_rol_epi32((x), (s))
#define vror(x, s) _mm512_ror_epi32((x), (s))
#define vshl(x, s) _mm512_slli_epi32((x), (s))

/* 4x4 32-bit integer matrix transpose */
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
  vpunpckhdq(x1, x0, t2); \
  vpunpckldq(x1, x0, x0); \
  \
  vpunpckldq(x3, x2, t1); \
  vpunpckhdq(x3, x2, x2); \
  \
  vpunpckhqdq(t1, x0, x1); \
  vpunpcklqdq(t1, x0, x0); \
  \
  vpunpckhqdq(x2, t2, x3); \
  vpunpcklqdq(x2, t2, x2);
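
/* The unpack operations above work within 128-bit lanes, so running this
 * transpose over four registers that each hold four 128-bit blocks leaves
 * each output register holding the same 32-bit state word of all 16 blocks.
 * That is the layout the S-box and linear-transformation macros below
 * operate on. */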

/*
 * These are the S-Boxes of Serpent from following research paper.
 *
 *  D. A. Osvik, “Speeding up Serpent,” in Third AES Candidate Conference,
 *   (New York, New York, USA), p. 317–329, National Institute of Standards and
 *   Technology, 2000.
 *
 * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
 *
 * --
 *
 * The following logic gets heavily optimized by the compiler to use the
 * AVX512F 'vpternlogq' instruction.  This gives a larger performance
 * increase than would be expected from simply widening the vectors from
 * AVX2/256-bit to AVX512/512-bit.
 *
 */
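
/* For example, a consecutive pair such as `r0 |= r3; r0 ^= r4;' is a single
 * three-input Boolean function, which the compiler can emit as one
 * vpternlogd/vpternlogq instead of a separate OR and XOR. */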

#define SBOX0(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r3 ^= r0; r4 = r1; \
    r1 &= r3; r4 ^= r2; \
    r1 ^= r0; r0 |= r3; \
    r0 ^= r4; r4 ^= r3; \
    r3 ^= r2; r2 |= r1; \
    r2 ^= r4; r4 = ~r4; \
    r4 |= r1; r1 ^= r3; \
    r1 ^= r4; r3 |= r0; \
    r1 ^= r3; r4 ^= r3; \
    \
    w = r1; x = r4; y = r2; z = r0; \
  }

#define SBOX0_INVERSE(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r2 = ~r2; r4 = r1; \
    r1 |= r0; r4 = ~r4; \
    r1 ^= r2; r2 |= r4; \
    r1 ^= r3; r0 ^= r4; \
    r2 ^= r0; r0 &= r3; \
    r4 ^= r0; r0 |= r1; \
    r0 ^= r2; r3 ^= r4; \
    r2 ^= r1; r3 ^= r0; \
    r3 ^= r1; \
    r2 &= r3; \
    r4 ^= r2; \
    \
    w = r0; x = r4; y = r1; z = r3; \
  }

#define SBOX1(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r0 = ~r0; r2 = ~r2; \
    r4 = r0; r0 &= r1; \
    r2 ^= r0; r0 |= r3; \
    r3 ^= r2; r1 ^= r0; \
    r0 ^= r4; r4 |= r1; \
    r1 ^= r3; r2 |= r0; \
    r2 &= r4; r0 ^= r1; \
    r1 &= r2; \
    r1 ^= r0; r0 &= r2; \
    r0 ^= r4; \
    \
    w = r2; x = r0; y = r3; z = r1; \
  }

#define SBOX1_INVERSE(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r4 = r1; r1 ^= r3; \
    r3 &= r1; r4 ^= r2; \
    r3 ^= r0; r0 |= r1; \
    r2 ^= r3; r0 ^= r4; \
    r0 |= r2; r1 ^= r3; \
    r0 ^= r1; r1 |= r3; \
    r1 ^= r0; r4 = ~r4; \
    r4 ^= r1; r1 |= r0; \
    r1 ^= r0; \
    r1 |= r4; \
    r3 ^= r1; \
    \
    w = r4; x = r0; y = r3; z = r2; \
  }

#define SBOX2(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r4 = r0; r0 &= r2; \
    r0 ^= r3; r2 ^= r1; \
    r2 ^= r0; r3 |= r4; \
    r3 ^= r1; r4 ^= r2; \
    r1 = r3; r3 |= r4; \
    r3 ^= r0; r0 &= r1; \
    r4 ^= r0; r1 ^= r3; \
    r1 ^= r4; r4 = ~r4; \
    \
    w = r2; x = r3; y = r1; z = r4; \
  }

#define SBOX2_INVERSE(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r2 ^= r3; r3 ^= r0; \
    r4 = r3; r3 &= r2; \
    r3 ^= r1; r1 |= r2; \
    r1 ^= r4; r4 &= r3; \
    r2 ^= r3; r4 &= r0; \
    r4 ^= r2; r2 &= r1; \
    r2 |= r0; r3 = ~r3; \
    r2 ^= r3; r0 ^= r3; \
    r0 &= r1; r3 ^= r4; \
    r3 ^= r0; \
    \
    w = r1; x = r4; y = r2; z = r3; \
  }

#define SBOX3(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r4 = r0; r0 |= r3; \
    r3 ^= r1; r1 &= r4; \
    r4 ^= r2; r2 ^= r3; \
    r3 &= r0; r4 |= r1; \
    r3 ^= r4; r0 ^= r1; \
    r4 &= r0; r1 ^= r3; \
    r4 ^= r2; r1 |= r0; \
    r1 ^= r2; r0 ^= r3; \
    r2 = r1; r1 |= r3; \
    r1 ^= r0; \
    \
    w = r1; x = r2; y = r3; z = r4; \
  }

#define SBOX3_INVERSE(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r4 = r2; r2 ^= r1; \
    r0 ^= r2; r4 &= r2; \
    r4 ^= r0; r0 &= r1; \
    r1 ^= r3; r3 |= r4; \
    r2 ^= r3; r0 ^= r3; \
    r1 ^= r4; r3 &= r2; \
    r3 ^= r1; r1 ^= r0; \
    r1 |= r2; r0 ^= r3; \
    r1 ^= r4; \
    r0 ^= r1; \
    \
    w = r2; x = r1; y = r3; z = r0; \
  }

#define SBOX4(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r1 ^= r3; r3 = ~r3; \
    r2 ^= r3; r3 ^= r0; \
    r4 = r1; r1 &= r3; \
    r1 ^= r2; r4 ^= r3; \
    r0 ^= r4; r2 &= r4; \
    r2 ^= r0; r0 &= r1; \
    r3 ^= r0; r4 |= r1; \
    r4 ^= r0; r0 |= r3; \
    r0 ^= r2; r2 &= r3; \
    r0 = ~r0; r4 ^= r2; \
    \
    w = r1; x = r4; y = r0; z = r3; \
  }

#define SBOX4_INVERSE(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r4 = r2; r2 &= r3; \
    r2 ^= r1; r1 |= r3; \
    r1 &= r0; r4 ^= r2; \
    r4 ^= r1; r1 &= r2; \
    r0 = ~r0; r3 ^= r4; \
    r1 ^= r3; r3 &= r0; \
    r3 ^= r2; r0 ^= r1; \
    r2 &= r0; r3 ^= r0; \
    r2 ^= r4; \
    r2 |= r3; r3 ^= r0; \
    r2 ^= r1; \
    \
    w = r0; x = r3; y = r2; z = r4; \
  }

#define SBOX5(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r0 ^= r1; r1 ^= r3; \
    r3 = ~r3; r4 = r1; \
    r1 &= r0; r2 ^= r3; \
    r1 ^= r2; r2 |= r4; \
    r4 ^= r3; r3 &= r1; \
    r3 ^= r0; r4 ^= r1; \
    r4 ^= r2; r2 ^= r0; \
    r0 &= r3; r2 = ~r2; \
    r0 ^= r4; r4 |= r3; \
    r2 ^= r4; \
    \
    w = r1; x = r3; y = r0; z = r2; \
  }

#define SBOX5_INVERSE(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r1 = ~r1; r4 = r3; \
    r2 ^= r1; r3 |= r0; \
    r3 ^= r2; r2 |= r1; \
    r2 &= r0; r4 ^= r3; \
    r2 ^= r4; r4 |= r0; \
    r4 ^= r1; r1 &= r2; \
    r1 ^= r3; r4 ^= r2; \
    r3 &= r4; r4 ^= r1; \
    r3 ^= r4; r4 = ~r4; \
    r3 ^= r0; \
    \
    w = r1; x = r4; y = r3; z = r2; \
  }

#define SBOX6(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r2 = ~r2; r4 = r3; \
    r3 &= r0; r0 ^= r4; \
    r3 ^= r2; r2 |= r4; \
    r1 ^= r3; r2 ^= r0; \
    r0 |= r1; r2 ^= r1; \
    r4 ^= r0; r0 |= r3; \
    r0 ^= r2; r4 ^= r3; \
    r4 ^= r0; r3 = ~r3; \
    r2 &= r4; \
    r2 ^= r3; \
    \
    w = r0; x = r1; y = r4; z = r2; \
  }

#define SBOX6_INVERSE(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r0 ^= r2; r4 = r2; \
    r2 &= r0; r4 ^= r3; \
    r2 = ~r2; r3 ^= r1; \
    r2 ^= r3; r4 |= r0; \
    r0 ^= r2; r3 ^= r4; \
    r4 ^= r1; r1 &= r3; \
    r1 ^= r0; r0 ^= r3; \
    r0 |= r2; r3 ^= r1; \
    r4 ^= r0; \
    \
    w = r1; x = r2; y = r4; z = r3; \
  }

#define SBOX7(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r4 = r1; r1 |= r2; \
    r1 ^= r3; r4 ^= r2; \
    r2 ^= r1; r3 |= r4; \
    r3 &= r0; r4 ^= r2; \
    r3 ^= r1; r1 |= r4; \
    r1 ^= r0; r0 |= r4; \
    r0 ^= r2; r1 ^= r4; \
    r2 ^= r1; r1 &= r0; \
    r1 ^= r4; r2 = ~r2; \
    r2 |= r0; \
    r4 ^= r2; \
    \
    w = r4; x = r3; y = r1; z = r0; \
  }

#define SBOX7_INVERSE(r0, r1, r2, r3, w, x, y, z) \
  { \
    __m512i r4; \
    \
    r4 = r2; r2 ^= r0; \
    r0 &= r3; r4 |= r3; \
    r2 = ~r2; r3 ^= r1; \
    r1 |= r0; r0 ^= r2; \
    r2 &= r4; r3 &= r4; \
    r1 ^= r2; r2 ^= r0; \
    r0 |= r2; r4 ^= r1; \
    r0 ^= r3; r3 ^= r4; \
    r4 |= r0; r3 ^= r2; \
    r4 ^= r2; \
    \
    w = r3; x = r0; y = r1; z = r4; \
  }

/* XOR the round key RKEY (each word broadcast to all blocks) into BLOCK0. */
#define BLOCK_XOR_KEY(block0, rkey) \
  { \
    block0[0] ^= vpbroadcastd(rkey[0]); \
    block0[1] ^= vpbroadcastd(rkey[1]); \
    block0[2] ^= vpbroadcastd(rkey[2]); \
    block0[3] ^= vpbroadcastd(rkey[3]); \
  }

/* Copy BLOCK_SRC to BLOCK_DST. */
#define BLOCK_COPY(block_dst, block_src) \
  { \
    block_dst[0] = block_src[0]; \
    block_dst[1] = block_src[1]; \
    block_dst[2] = block_src[2]; \
    block_dst[3] = block_src[3]; \
  }

/* Apply SBOX number WHICH to the block found in ARRAY0, writing the
   output to the block found in ARRAY1. */
#define SBOX(which, array0, array1) \
  SBOX##which (array0[0], array0[1], array0[2], array0[3], \
               array1[0], array1[1], array1[2], array1[3]);

/* Apply inverse SBOX number WHICH to the block found in ARRAY0, writing
   the output to the block found in ARRAY1. */
#define SBOX_INVERSE(which, array0, array1) \
  SBOX##which##_INVERSE (array0[0], array0[1], array0[2], array0[3], \
                         array1[0], array1[1], array1[2], array1[3]);

/* Apply the linear transformation to BLOCK. */
#define LINEAR_TRANSFORMATION(block) \
  { \
    block[0] = vrol (block[0], 13); \
    block[2] = vrol (block[2], 3); \
    block[1] = block[1] ^ block[0] ^ block[2]; \
    block[3] = block[3] ^ block[2] ^ vshl(block[0], 3); \
    block[1] = vrol (block[1], 1); \
    block[3] = vrol (block[3], 7); \
    block[0] = block[0] ^ block[1] ^ block[3]; \
    block[2] = block[2] ^ block[3] ^ vshl(block[1], 7); \
    block[0] = vrol (block[0], 5); \
    block[2] = vrol (block[2], 22); \
  }

/* Apply the inverse linear transformation to BLOCK. */
#define LINEAR_TRANSFORMATION_INVERSE(block) \
  { \
    block[2] = vror (block[2], 22); \
    block[0] = vror (block[0], 5); \
    block[2] = block[2] ^ block[3] ^ vshl(block[1], 7); \
    block[0] = block[0] ^ block[1] ^ block[3]; \
    block[3] = vror (block[3], 7); \
    block[1] = vror (block[1], 1); \
    block[3] = block[3] ^ block[2] ^ vshl(block[0], 3); \
    block[1] = block[1] ^ block[0] ^ block[2]; \
    block[2] = vror (block[2], 3); \
    block[0] = vror (block[0], 13); \
  }
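
/* Note: the inverse performs the steps of LINEAR_TRANSFORMATION in reverse
   order, with each rotate-left replaced by the corresponding rotate-right. */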

/* Apply a Serpent round to BLOCK, using the SBOX number WHICH and the
   subkeys contained in SUBKEYS.  Use BLOCK_TMP as temporary storage.
   The round number is taken from `round', which the caller increments. */
#define ROUND(which, subkeys, block, block_tmp) \
  { \
    BLOCK_XOR_KEY (block, subkeys[round]); \
    SBOX (which, block, block_tmp); \
    LINEAR_TRANSFORMATION (block_tmp); \
    BLOCK_COPY (block, block_tmp); \
  }

/* Apply the last Serpent round to BLOCK, using the SBOX number WHICH
   and the subkeys contained in SUBKEYS.  Use BLOCK_TMP as temporary
   storage.  The result will be stored in BLOCK_TMP.  The round number
   is taken from `round'. */
#define ROUND_LAST(which, subkeys, block, block_tmp) \
  { \
    BLOCK_XOR_KEY (block, subkeys[round]); \
    SBOX (which, block, block_tmp); \
    BLOCK_XOR_KEY (block_tmp, subkeys[round+1]); \
  }

/* Apply an inverse Serpent round to BLOCK, using the SBOX number
   WHICH and the subkeys contained in SUBKEY.  Use BLOCK_TMP as
   temporary storage.  The round number is taken from `round', which
   the caller decrements. */
#define ROUND_INVERSE(which, subkey, block, block_tmp) \
  { \
    LINEAR_TRANSFORMATION_INVERSE (block); \
    SBOX_INVERSE (which, block, block_tmp); \
    BLOCK_XOR_KEY (block_tmp, subkey[round]); \
    BLOCK_COPY (block, block_tmp); \
  }

/* Apply the first inverse Serpent round to BLOCK, using the SBOX number
   WHICH and the subkeys contained in SUBKEYS.  Use BLOCK_TMP as temporary
   storage.  The result will be stored in BLOCK_TMP.  The round numbers
   are taken from `round'. */
#define ROUND_FIRST_INVERSE(which, subkeys, block, block_tmp) \
  { \
    BLOCK_XOR_KEY (block, subkeys[round]); \
    SBOX_INVERSE (which, block, block_tmp); \
    BLOCK_XOR_KEY (block_tmp, subkeys[round-1]); \
  }

static ALWAYS_INLINE void
serpent_encrypt_internal_avx512 (const serpent_subkeys_t keys,
                                 const __m512i vin[8], __m512i vout[8])
{
  __m512i b[4];
  __m512i c[4];
  __m512i b_next[4];
  __m512i c_next[4];
  int round = 0;

  b_next[0] = vin[0];
  b_next[1] = vin[1];
  b_next[2] = vin[2];
  b_next[3] = vin[3];
  c_next[0] = vin[4];
  c_next[1] = vin[5];
  c_next[2] = vin[6];
  c_next[3] = vin[7];
  transpose_4x4 (b_next[0], b_next[1], b_next[2], b_next[3], b[0], b[1], b[2]);
  transpose_4x4 (c_next[0], c_next[1], c_next[2], c_next[3], c[0], c[1], c[2]);
  b[0] = b_next[0];
  b[1] = b_next[1];
  b[2] = b_next[2];
  b[3] = b_next[3];
  c[0] = c_next[0];
  c[1] = c_next[1];
  c[2] = c_next[2];
  c[3] = c_next[3];
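
  /* Rounds 0..30 use ROUND below; round 31 is handled by ROUND_LAST, which
     applies the final subkey instead of the linear transformation.  The two
     16-block halves `b' and `c' go through every round back-to-back. */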
  while (1)
    {
      ROUND (0, keys, b, b_next); ROUND (0, keys, c, c_next); round++;
      ROUND (1, keys, b, b_next); ROUND (1, keys, c, c_next); round++;
      ROUND (2, keys, b, b_next); ROUND (2, keys, c, c_next); round++;
      ROUND (3, keys, b, b_next); ROUND (3, keys, c, c_next); round++;
      ROUND (4, keys, b, b_next); ROUND (4, keys, c, c_next); round++;
      ROUND (5, keys, b, b_next); ROUND (5, keys, c, c_next); round++;
      ROUND (6, keys, b, b_next); ROUND (6, keys, c, c_next); round++;
      if (round >= ROUNDS - 1)
        break;
      ROUND (7, keys, b, b_next); ROUND (7, keys, c, c_next); round++;
    }

  ROUND_LAST (7, keys, b, b_next); ROUND_LAST (7, keys, c, c_next);

  transpose_4x4 (b_next[0], b_next[1], b_next[2], b_next[3], b[0], b[1], b[2]);
  transpose_4x4 (c_next[0], c_next[1], c_next[2], c_next[3], c[0], c[1], c[2]);
  vout[0] = b_next[0];
  vout[1] = b_next[1];
  vout[2] = b_next[2];
  vout[3] = b_next[3];
  vout[4] = c_next[0];
  vout[5] = c_next[1];
  vout[6] = c_next[2];
  vout[7] = c_next[3];
}

static ALWAYS_INLINE void
serpent_decrypt_internal_avx512 (const serpent_subkeys_t keys,
                                 const __m512i vin[8], __m512i vout[8])
{
  __m512i b[4];
  __m512i c[4];
  __m512i b_next[4];
  __m512i c_next[4];
  int round = ROUNDS;

  b_next[0] = vin[0];
  b_next[1] = vin[1];
  b_next[2] = vin[2];
  b_next[3] = vin[3];
  c_next[0] = vin[4];
  c_next[1] = vin[5];
  c_next[2] = vin[6];
  c_next[3] = vin[7];
  transpose_4x4 (b_next[0], b_next[1], b_next[2], b_next[3], b[0], b[1], b[2]);
  transpose_4x4 (c_next[0], c_next[1], c_next[2], c_next[3], c[0], c[1], c[2]);

  ROUND_FIRST_INVERSE (7, keys, b_next, b); ROUND_FIRST_INVERSE (7, keys, c_next, c);
  round -= 2;
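
  /* ROUND_FIRST_INVERSE above undid round 31; the loop below undoes rounds
     30 down to 0, mirroring the encryption loop with `round' counting down. */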
  while (1)
    {
      ROUND_INVERSE (6, keys, b, b_next); ROUND_INVERSE (6, keys, c, c_next); round--;
      ROUND_INVERSE (5, keys, b, b_next); ROUND_INVERSE (5, keys, c, c_next); round--;
      ROUND_INVERSE (4, keys, b, b_next); ROUND_INVERSE (4, keys, c, c_next); round--;
      ROUND_INVERSE (3, keys, b, b_next); ROUND_INVERSE (3, keys, c, c_next); round--;
      ROUND_INVERSE (2, keys, b, b_next); ROUND_INVERSE (2, keys, c, c_next); round--;
      ROUND_INVERSE (1, keys, b, b_next); ROUND_INVERSE (1, keys, c, c_next); round--;
      ROUND_INVERSE (0, keys, b, b_next); ROUND_INVERSE (0, keys, c, c_next); round--;
      if (round <= 0)
        break;
      ROUND_INVERSE (7, keys, b, b_next); ROUND_INVERSE (7, keys, c, c_next); round--;
    }

  transpose_4x4 (b_next[0], b_next[1], b_next[2], b_next[3], b[0], b[1], b[2]);
  transpose_4x4 (c_next[0], c_next[1], c_next[2], c_next[3], c[0], c[1], c[2]);
  vout[0] = b_next[0];
  vout[1] = b_next[1];
  vout[2] = b_next[2];
  vout[3] = b_next[3];
  vout[4] = c_next[0];
  vout[5] = c_next[1];
  vout[6] = c_next[2];
  vout[7] = c_next[3];
}

enum crypt_mode_e
{
  ECB_ENC = 0,
  ECB_DEC,
  CBC_DEC,
  CFB_DEC,
  CTR_ENC,
  OCB_ENC,
  OCB_DEC
};
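
/* Generate 32 consecutive counter blocks into VIN and advance the big-endian
   counter in CTR by 32. */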
static ALWAYS_INLINE void
ctr_generate(unsigned char *ctr, __m512i vin[8])
{
  const unsigned int blocksize = 16;
  unsigned char ctr_low = ctr[15];
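
  /* ctr[15] is the least significant byte of the big-endian counter.  If
     adding 32 cannot carry out of that byte, all 32 counter values can be
     formed by adding the block index directly to byte 15 of the broadcast
     counter (fast path); otherwise fall back to per-block addition with
     full carry handling. */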
  if (ctr_low + 32 <= 256)
    {
      const __m512i add0123 = _mm512_set_epi64(3LL << 56, 0,
                                               2LL << 56, 0,
                                               1LL << 56, 0,
                                               0LL << 56, 0);
      const __m512i add4444 = _mm512_set_epi64(4LL << 56, 0,
                                               4LL << 56, 0,
                                               4LL << 56, 0,
                                               4LL << 56, 0);
      const __m512i add4567 = _mm512_add_epi32(add0123, add4444);
      const __m512i add8888 = _mm512_add_epi32(add4444, add4444);

      // Fast path without carry handling.
      __m512i vctr =
        _mm512_broadcast_i32x4(_mm_loadu_si128((const void *)ctr));

      cipher_block_add(ctr, 32, blocksize);
      vin[0] = _mm512_add_epi32(vctr, add0123);
      vin[1] = _mm512_add_epi32(vctr, add4567);
      vin[2] = _mm512_add_epi32(vin[0], add8888);
      vin[3] = _mm512_add_epi32(vin[1], add8888);
      vin[4] = _mm512_add_epi32(vin[2], add8888);
      vin[5] = _mm512_add_epi32(vin[3], add8888);
      vin[6] = _mm512_add_epi32(vin[4], add8888);
      vin[7] = _mm512_add_epi32(vin[5], add8888);
    }
  else
    {
      // Slow path.
      u32 blocks[4][blocksize / sizeof(u32)];

      cipher_block_cpy(blocks[0], ctr, blocksize);
      cipher_block_cpy(blocks[1], ctr, blocksize);
      cipher_block_cpy(blocks[2], ctr, blocksize);
      cipher_block_cpy(blocks[3], ctr, blocksize);
      cipher_block_add(ctr, 32, blocksize);
      cipher_block_add(blocks[1], 1, blocksize);
      cipher_block_add(blocks[2], 2, blocksize);
      cipher_block_add(blocks[3], 3, blocksize);
      vin[0] = _mm512_loadu_epi32 (blocks);
      cipher_block_add(blocks[0], 4, blocksize);
      cipher_block_add(blocks[1], 4, blocksize);
      cipher_block_add(blocks[2], 4, blocksize);
      cipher_block_add(blocks[3], 4, blocksize);
      vin[1] = _mm512_loadu_epi32 (blocks);
      cipher_block_add(blocks[0], 4, blocksize);
      cipher_block_add(blocks[1], 4, blocksize);
      cipher_block_add(blocks[2], 4, blocksize);
      cipher_block_add(blocks[3], 4, blocksize);
      vin[2] = _mm512_loadu_epi32 (blocks);
      cipher_block_add(blocks[0], 4, blocksize);
      cipher_block_add(blocks[1], 4, blocksize);
      cipher_block_add(blocks[2], 4, blocksize);
      cipher_block_add(blocks[3], 4, blocksize);
      vin[3] = _mm512_loadu_epi32 (blocks);
      cipher_block_add(blocks[0], 4, blocksize);
      cipher_block_add(blocks[1], 4, blocksize);
      cipher_block_add(blocks[2], 4, blocksize);
      cipher_block_add(blocks[3], 4, blocksize);
      vin[4] = _mm512_loadu_epi32 (blocks);
      cipher_block_add(blocks[0], 4, blocksize);
      cipher_block_add(blocks[1], 4, blocksize);
      cipher_block_add(blocks[2], 4, blocksize);
      cipher_block_add(blocks[3], 4, blocksize);
      vin[5] = _mm512_loadu_epi32 (blocks);
      cipher_block_add(blocks[0], 4, blocksize);
      cipher_block_add(blocks[1], 4, blocksize);
      cipher_block_add(blocks[2], 4, blocksize);
      cipher_block_add(blocks[3], 4, blocksize);
      vin[6] = _mm512_loadu_epi32 (blocks);
      cipher_block_add(blocks[0], 4, blocksize);
      cipher_block_add(blocks[1], 4, blocksize);
      cipher_block_add(blocks[2], 4, blocksize);
      cipher_block_add(blocks[3], 4, blocksize);
      vin[7] = _mm512_loadu_epi32 (blocks);

      wipememory(blocks, sizeof(blocks));
    }
}
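
/* Prepare one 512-bit chunk (four blocks) of OCB input: derive the next four
   offsets from *VOFFSET and L, store them to OUTPUT (they are XORed back in
   after the cipher call), accumulate the plaintext checksum when VCHECKSUM
   is non-NULL, and return the offset-whitened input data. */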
static ALWAYS_INLINE __m512i
ocb_input(__m512i *vchecksum, __m128i *voffset, const unsigned char *input,
          unsigned char *output, const ocb_L_uintptr_t L[4])
{
  __m128i L0 = _mm_loadu_si128((const void *)(uintptr_t)L[0]);
  __m128i L1 = _mm_loadu_si128((const void *)(uintptr_t)L[1]);
  __m128i L2 = _mm_loadu_si128((const void *)(uintptr_t)L[2]);
  __m128i L3 = _mm_loadu_si128((const void *)(uintptr_t)L[3]);
  __m512i vin = _mm512_loadu_epi32 (input);
  __m512i voffsets;

  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
  /* Checksum_i = Checksum_{i-1} xor P_i */
  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */

  if (vchecksum)
    *vchecksum ^= _mm512_loadu_epi32 (input);

  *voffset ^= L0;
  voffsets = _mm512_castsi128_si512(*voffset);
  *voffset ^= L1;
  voffsets = _mm512_inserti32x4(voffsets, *voffset, 1);
  *voffset ^= L2;
  voffsets = _mm512_inserti32x4(voffsets, *voffset, 2);
  *voffset ^= L3;
  voffsets = _mm512_inserti32x4(voffsets, *voffset, 3);
  _mm512_storeu_epi32 (output, voffsets);

  return vin ^ voffsets;
}

static NO_INLINE void
serpent_avx512_blk32(const void *c, unsigned char *output,
                     const unsigned char *input, int mode,
                     unsigned char *iv, unsigned char *checksum,
                     const ocb_L_uintptr_t Ls[32])
{
  __m512i vin[8];
  __m512i vout[8];
  int encrypt = 1;

  asm volatile ("vpxor %%ymm0, %%ymm0, %%ymm0;\n\t"
                "vpopcntb %%zmm0, %%zmm6;\n\t" /* spec stop for old AVX512 CPUs */
                "vpxor %%ymm6, %%ymm6, %%ymm6;\n\t"
                :
                : "m"(*input), "m"(*output)
                : "xmm6", "xmm0", "memory", "cc");

  // Input handling
  switch (mode)
    {
      default:
      case CBC_DEC:
      case ECB_DEC:
        encrypt = 0;
        /* fall through */
      case ECB_ENC:
        vin[0] = _mm512_loadu_epi32 (input + 0 * 64);
        vin[1] = _mm512_loadu_epi32 (input + 1 * 64);
        vin[2] = _mm512_loadu_epi32 (input + 2 * 64);
        vin[3] = _mm512_loadu_epi32 (input + 3 * 64);
        vin[4] = _mm512_loadu_epi32 (input + 4 * 64);
        vin[5] = _mm512_loadu_epi32 (input + 5 * 64);
        vin[6] = _mm512_loadu_epi32 (input + 6 * 64);
        vin[7] = _mm512_loadu_epi32 (input + 7 * 64);
        break;

      case CFB_DEC:
        {
          __m128i viv;
          vin[0] = _mm512_maskz_loadu_epi32(_cvtu32_mask16(0xfff0),
                                            input - 1 * 64 + 48)
                   ^ _mm512_maskz_loadu_epi32(_cvtu32_mask16(0x000f), iv);
          vin[1] = _mm512_loadu_epi32(input + 0 * 64 + 48);
          vin[2] = _mm512_loadu_epi32(input + 1 * 64 + 48);
          vin[3] = _mm512_loadu_epi32(input + 2 * 64 + 48);
          vin[4] = _mm512_loadu_epi32(input + 3 * 64 + 48);
          vin[5] = _mm512_loadu_epi32(input + 4 * 64 + 48);
          vin[6] = _mm512_loadu_epi32(input + 5 * 64 + 48);
          vin[7] = _mm512_loadu_epi32(input + 6 * 64 + 48);
          viv = _mm_loadu_si128((const void *)(input + 7 * 64 + 48));
          _mm_storeu_si128((void *)iv, viv);
          break;
        }

      case CTR_ENC:
        ctr_generate(iv, vin);
        break;

      case OCB_ENC:
        {
          const ocb_L_uintptr_t *L = Ls;
          __m512i vchecksum = _mm512_setzero_epi32();
          __m128i vchecksum128 = _mm_loadu_si128((const void *)checksum);
          __m128i voffset = _mm_loadu_si128((const void *)iv);
          vin[0] = ocb_input(&vchecksum, &voffset, input + 0 * 64, output + 0 * 64, L); L += 4;
          vin[1] = ocb_input(&vchecksum, &voffset, input + 1 * 64, output + 1 * 64, L); L += 4;
          vin[2] = ocb_input(&vchecksum, &voffset, input + 2 * 64, output + 2 * 64, L); L += 4;
          vin[3] = ocb_input(&vchecksum, &voffset, input + 3 * 64, output + 3 * 64, L); L += 4;
          vin[4] = ocb_input(&vchecksum, &voffset, input + 4 * 64, output + 4 * 64, L); L += 4;
          vin[5] = ocb_input(&vchecksum, &voffset, input + 5 * 64, output + 5 * 64, L); L += 4;
          vin[6] = ocb_input(&vchecksum, &voffset, input + 6 * 64, output + 6 * 64, L); L += 4;
          vin[7] = ocb_input(&vchecksum, &voffset, input + 7 * 64, output + 7 * 64, L);
          vchecksum128 ^= _mm512_extracti32x4_epi32(vchecksum, 0)
                          ^ _mm512_extracti32x4_epi32(vchecksum, 1)
                          ^ _mm512_extracti32x4_epi32(vchecksum, 2)
                          ^ _mm512_extracti32x4_epi32(vchecksum, 3);
          _mm_storeu_si128((void *)checksum, vchecksum128);
          _mm_storeu_si128((void *)iv, voffset);
          break;
        }

      case OCB_DEC:
        {
          const ocb_L_uintptr_t *L = Ls;
          __m128i voffset = _mm_loadu_si128((const void *)iv);
          encrypt = 0;
          vin[0] = ocb_input(NULL, &voffset, input + 0 * 64, output + 0 * 64, L); L += 4;
          vin[1] = ocb_input(NULL, &voffset, input + 1 * 64, output + 1 * 64, L); L += 4;
          vin[2] = ocb_input(NULL, &voffset, input + 2 * 64, output + 2 * 64, L); L += 4;
          vin[3] = ocb_input(NULL, &voffset, input + 3 * 64, output + 3 * 64, L); L += 4;
          vin[4] = ocb_input(NULL, &voffset, input + 4 * 64, output + 4 * 64, L); L += 4;
          vin[5] = ocb_input(NULL, &voffset, input + 5 * 64, output + 5 * 64, L); L += 4;
          vin[6] = ocb_input(NULL, &voffset, input + 6 * 64, output + 6 * 64, L); L += 4;
          vin[7] = ocb_input(NULL, &voffset, input + 7 * 64, output + 7 * 64, L);
          _mm_storeu_si128((void *)iv, voffset);
          break;
        }
    }

  if (encrypt)
    serpent_encrypt_internal_avx512(c, vin, vout);
  else
    serpent_decrypt_internal_avx512(c, vin, vout);
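
  // Output handling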
  switch (mode)
    {
      case CTR_ENC:
      case CFB_DEC:
        vout[0] ^= _mm512_loadu_epi32 (input + 0 * 64);
        vout[1] ^= _mm512_loadu_epi32 (input + 1 * 64);
        vout[2] ^= _mm512_loadu_epi32 (input + 2 * 64);
        vout[3] ^= _mm512_loadu_epi32 (input + 3 * 64);
        vout[4] ^= _mm512_loadu_epi32 (input + 4 * 64);
        vout[5] ^= _mm512_loadu_epi32 (input + 5 * 64);
        vout[6] ^= _mm512_loadu_epi32 (input + 6 * 64);
        vout[7] ^= _mm512_loadu_epi32 (input + 7 * 64);
        /* fall through */
      default:
      case ECB_DEC:
      case ECB_ENC:
        _mm512_storeu_epi32 (output + 0 * 64, vout[0]);
        _mm512_storeu_epi32 (output + 1 * 64, vout[1]);
        _mm512_storeu_epi32 (output + 2 * 64, vout[2]);
        _mm512_storeu_epi32 (output + 3 * 64, vout[3]);
        _mm512_storeu_epi32 (output + 4 * 64, vout[4]);
        _mm512_storeu_epi32 (output + 5 * 64, vout[5]);
        _mm512_storeu_epi32 (output + 6 * 64, vout[6]);
        _mm512_storeu_epi32 (output + 7 * 64, vout[7]);
        break;

      case CBC_DEC:
        {
          __m128i viv;
          vout[0] ^= _mm512_maskz_loadu_epi32(_cvtu32_mask16(0xfff0),
                                              input - 1 * 64 + 48)
                     ^ _mm512_maskz_loadu_epi32(_cvtu32_mask16(0x000f), iv);
          vout[1] ^= _mm512_loadu_epi32(input + 0 * 64 + 48);
          vout[2] ^= _mm512_loadu_epi32(input + 1 * 64 + 48);
          vout[3] ^= _mm512_loadu_epi32(input + 2 * 64 + 48);
          vout[4] ^= _mm512_loadu_epi32(input + 3 * 64 + 48);
          vout[5] ^= _mm512_loadu_epi32(input + 4 * 64 + 48);
          vout[6] ^= _mm512_loadu_epi32(input + 5 * 64 + 48);
          vout[7] ^= _mm512_loadu_epi32(input + 6 * 64 + 48);
          viv = _mm_loadu_si128((const void *)(input + 7 * 64 + 48));
          _mm_storeu_si128((void *)iv, viv);

          _mm512_storeu_epi32 (output + 0 * 64, vout[0]);
          _mm512_storeu_epi32 (output + 1 * 64, vout[1]);
          _mm512_storeu_epi32 (output + 2 * 64, vout[2]);
          _mm512_storeu_epi32 (output + 3 * 64, vout[3]);
          _mm512_storeu_epi32 (output + 4 * 64, vout[4]);
          _mm512_storeu_epi32 (output + 5 * 64, vout[5]);
          _mm512_storeu_epi32 (output + 6 * 64, vout[6]);
          _mm512_storeu_epi32 (output + 7 * 64, vout[7]);
          break;
        }

      case OCB_ENC:
        vout[0] ^= _mm512_loadu_epi32 (output + 0 * 64);
        vout[1] ^= _mm512_loadu_epi32 (output + 1 * 64);
        vout[2] ^= _mm512_loadu_epi32 (output + 2 * 64);
        vout[3] ^= _mm512_loadu_epi32 (output + 3 * 64);
        vout[4] ^= _mm512_loadu_epi32 (output + 4 * 64);
        vout[5] ^= _mm512_loadu_epi32 (output + 5 * 64);
        vout[6] ^= _mm512_loadu_epi32 (output + 6 * 64);
        vout[7] ^= _mm512_loadu_epi32 (output + 7 * 64);
        _mm512_storeu_epi32 (output + 0 * 64, vout[0]);
        _mm512_storeu_epi32 (output + 1 * 64, vout[1]);
        _mm512_storeu_epi32 (output + 2 * 64, vout[2]);
        _mm512_storeu_epi32 (output + 3 * 64, vout[3]);
        _mm512_storeu_epi32 (output + 4 * 64, vout[4]);
        _mm512_storeu_epi32 (output + 5 * 64, vout[5]);
        _mm512_storeu_epi32 (output + 6 * 64, vout[6]);
        _mm512_storeu_epi32 (output + 7 * 64, vout[7]);
        break;

      case OCB_DEC:
        {
          __m512i vchecksum = _mm512_setzero_epi32();
          __m128i vchecksum128 = _mm_loadu_si128((const void *)checksum);
          vout[0] ^= _mm512_loadu_epi32 (output + 0 * 64);
          vout[1] ^= _mm512_loadu_epi32 (output + 1 * 64);
          vout[2] ^= _mm512_loadu_epi32 (output + 2 * 64);
          vout[3] ^= _mm512_loadu_epi32 (output + 3 * 64);
          vout[4] ^= _mm512_loadu_epi32 (output + 4 * 64);
          vout[5] ^= _mm512_loadu_epi32 (output + 5 * 64);
          vout[6] ^= _mm512_loadu_epi32 (output + 6 * 64);
          vout[7] ^= _mm512_loadu_epi32 (output + 7 * 64);
          vchecksum ^= vout[0];
          vchecksum ^= vout[1];
          vchecksum ^= vout[2];
          vchecksum ^= vout[3];
          vchecksum ^= vout[4];
          vchecksum ^= vout[5];
          vchecksum ^= vout[6];
          vchecksum ^= vout[7];
          _mm512_storeu_epi32 (output + 0 * 64, vout[0]);
          _mm512_storeu_epi32 (output + 1 * 64, vout[1]);
          _mm512_storeu_epi32 (output + 2 * 64, vout[2]);
          _mm512_storeu_epi32 (output + 3 * 64, vout[3]);
          _mm512_storeu_epi32 (output + 4 * 64, vout[4]);
          _mm512_storeu_epi32 (output + 5 * 64, vout[5]);
          _mm512_storeu_epi32 (output + 6 * 64, vout[6]);
          _mm512_storeu_epi32 (output + 7 * 64, vout[7]);
          vchecksum128 ^= _mm512_extracti32x4_epi32(vchecksum, 0)
                          ^ _mm512_extracti32x4_epi32(vchecksum, 1)
                          ^ _mm512_extracti32x4_epi32(vchecksum, 2)
                          ^ _mm512_extracti32x4_epi32(vchecksum, 3);
          _mm_storeu_si128((void *)checksum, vchecksum128);
          break;
        }
    }

  _mm256_zeroall();
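
  /* _mm256_zeroall() does not touch zmm16..zmm31, so on x86-64 those are
     cleared explicitly below (they are not architecturally available in
     32-bit mode). */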
#ifdef __x86_64__
  asm volatile (
#define CLEAR(mm) "vpxord %%" #mm ", %%" #mm ", %%" #mm ";\n\t"
                CLEAR(ymm16) CLEAR(ymm17) CLEAR(ymm18) CLEAR(ymm19)
                CLEAR(ymm20) CLEAR(ymm21) CLEAR(ymm22) CLEAR(ymm23)
                CLEAR(ymm24) CLEAR(ymm25) CLEAR(ymm26) CLEAR(ymm27)
                CLEAR(ymm28) CLEAR(ymm29) CLEAR(ymm30) CLEAR(ymm31)
#undef CLEAR
                :
                : "m"(*input), "m"(*output)
                : "xmm16", "xmm17", "xmm18", "xmm19",
                  "xmm20", "xmm21", "xmm22", "xmm23",
                  "xmm24", "xmm25", "xmm26", "xmm27",
                  "xmm28", "xmm29", "xmm30", "xmm31",
                  "memory", "cc");
#endif
}
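
/* Exported bulk functions.  Each call processes exactly 32 blocks
   (512 bytes). */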
void
_gcry_serpent_avx512_blk32(const void *ctx, unsigned char *out,
                           const unsigned char *in, int encrypt)
{
  serpent_avx512_blk32 (ctx, out, in, encrypt ? ECB_ENC : ECB_DEC,
                        NULL, NULL, NULL);
}

void
_gcry_serpent_avx512_cbc_dec(const void *ctx, unsigned char *out,
                             const unsigned char *in, unsigned char *iv)
{
  serpent_avx512_blk32 (ctx, out, in, CBC_DEC, iv, NULL, NULL);
}

void
_gcry_serpent_avx512_cfb_dec(const void *ctx, unsigned char *out,
                             const unsigned char *in, unsigned char *iv)
{
  serpent_avx512_blk32 (ctx, out, in, CFB_DEC, iv, NULL, NULL);
}

void
_gcry_serpent_avx512_ctr_enc(const void *ctx, unsigned char *out,
                             const unsigned char *in, unsigned char *iv)
{
  serpent_avx512_blk32 (ctx, out, in, CTR_ENC, iv, NULL, NULL);
}

void
_gcry_serpent_avx512_ocb_crypt(const void *ctx, unsigned char *out,
                               const unsigned char *in, unsigned char *offset,
                               unsigned char *checksum,
                               const ocb_L_uintptr_t Ls[32], int encrypt)
{
  serpent_avx512_blk32 (ctx, out, in, encrypt ? OCB_ENC : OCB_DEC, offset,
                        checksum, Ls);
}

#endif /*defined(USE_SERPENT) && defined(ENABLE_AVX512_SUPPORT)*/
#endif /*__x86_64 || __i386*/