/* sm4-ppc.c - PowerPC implementation of SM4 cipher
 *
 * Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
    defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
    defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
    !defined(WORDS_BIGENDIAN) && (__GNUC__ >= 4)

#include <altivec.h>
#include "bufhelp.h"

typedef vector unsigned char vector16x_u8;
typedef vector unsigned int vector4x_u32;
typedef vector unsigned long long vector2x_u64;

#ifdef HAVE_GCC_ATTRIBUTE_OPTIMIZE
# define FUNC_ATTR_OPT __attribute__((optimize("-O2")))
#else
# define FUNC_ATTR_OPT
#endif

#if defined(__clang__) && defined(HAVE_CLANG_ATTRIBUTE_PPC_TARGET)
# define FUNC_ATTR_TARGET_P8 __attribute__((target("arch=pwr8")))
# define FUNC_ATTR_TARGET_P9 __attribute__((target("arch=pwr9")))
# define HAVE_FUNC_ATTR_TARGET 1
#elif defined(HAVE_GCC_ATTRIBUTE_PPC_TARGET)
# define FUNC_ATTR_TARGET_P8 __attribute__((target("cpu=power8")))
# define FUNC_ATTR_TARGET_P9 __attribute__((target("cpu=power9")))
# define HAVE_FUNC_ATTR_TARGET 1
#else
# define FUNC_ATTR_TARGET_P8
# define FUNC_ATTR_TARGET_P9
# undef HAVE_FUNC_ATTR_TARGET
#endif

#define ALWAYS_INLINE inline __attribute__((always_inline))
#define NO_INLINE __attribute__((noinline))
#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))

#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE

#ifdef __clang__
/* clang has mismatching prototype for vec_sbox_be. */
static ASM_FUNC_ATTR_INLINE vector16x_u8
asm_sbox_be(vector16x_u8 b)
{
  vector16x_u8 o;
  __asm__ ("vsbox %0, %1\n\t" : "=v" (o) : "v" (b));
  return o;
}
#undef vec_sbox_be
#define vec_sbox_be asm_sbox_be
#endif /* __clang__ */
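
/* Transpose a 4x4 matrix of 32-bit words held in vectors x0..x3, using t1
 * and t2 as scratch.  After the transpose, x0 holds word 0 of each input
 * vector, x1 holds word 1, and so on. */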

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
  t2 = (vector4x_u32)vec_mergel((vector4x_u32)x0, (vector4x_u32)x1); \
  x0 = (vector4x_u32)vec_mergeh((vector4x_u32)x0, (vector4x_u32)x1); \
  \
  t1 = (vector4x_u32)vec_mergeh((vector4x_u32)x2, (vector4x_u32)x3); \
  x2 = (vector4x_u32)vec_mergel((vector4x_u32)x2, (vector4x_u32)x3); \
  \
  x1 = (vector4x_u32)vec_mergel((vector2x_u64)x0, (vector2x_u64)t1); \
  x0 = (vector4x_u32)vec_mergeh((vector2x_u64)x0, (vector2x_u64)t1); \
  \
  x3 = (vector4x_u32)vec_mergel((vector2x_u64)t2, (vector2x_u64)x2); \
  x2 = (vector4x_u32)vec_mergeh((vector2x_u64)t2, (vector2x_u64)x2);
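
/* Apply an 8-bit to 8-bit mapping to every byte of 'x' using two 16-entry
 * nibble lookup tables: 'lo_t' is indexed by the low nibble, 'hi_t' by the
 * high nibble, and the two results are XORed together.  This is what lets
 * bytes be moved between the SM4 and AES S-box affine domains so that the
 * AES S-box instruction (vsbox) can compute the SM4 S-box. */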

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) ({ \
  tmp0 = x & mask4bit; \
  x = (vector4x_u32)((vector16x_u8)x >> 4); \
  \
  tmp0 = (vector4x_u32)vec_perm((vector16x_u8)lo_t, (vector16x_u8)lo_t, \
                                (vector16x_u8)tmp0); \
  x = (vector4x_u32)vec_perm((vector16x_u8)hi_t, (vector16x_u8)hi_t, \
                             (vector16x_u8)x); \
  x = x ^ tmp0; \
})
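
/* One SM4 round applied to four blocks at a time (ROUND4), or to two groups
 * of four blocks sharing the same round key (ROUND8).  Each round XORs three
 * state words with the round key, runs the bytes through the S-box (vsbox
 * bracketed by the pre/post affine transforms) and applies SM4's linear
 * diffusion L(x) = x ^ rol(x, 2) ^ rol(x, 10) ^ rol(x, 18) ^ rol(x, 24). */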

#define GET_RKEY(round) vec_splat(r4keys, round)

#define ROUND4(round, s0, s1, s2, s3) ({ \
  vector4x_u32 rkey = GET_RKEY(round); \
  vector4x_u32 rx0 = rkey ^ s1 ^ s2 ^ s3; \
  filter_8bit(rx0, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \
  rx0 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx0); \
  filter_8bit(rx0, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \
  s0 ^= rx0 ^ vec_rl(rx0, rotate2) ^ vec_rl(rx0, rotate10) ^ \
        vec_rl(rx0, rotate18) ^ vec_rl(rx0, rotate24); \
})

#define ROUND8(round, s0, s1, s2, s3, r0, r1, r2, r3) ({ \
  vector4x_u32 rkey = GET_RKEY(round); \
  vector4x_u32 rx0 = rkey ^ s1 ^ s2 ^ s3; \
  vector4x_u32 rx1 = rkey ^ r1 ^ r2 ^ r3; \
  filter_8bit(rx0, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \
  filter_8bit(rx1, pre_tf_lo_s, pre_tf_hi_s, mask_0f, tmp0); \
  rx0 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx0); \
  rx1 = (vector4x_u32)vec_sbox_be((vector16x_u8)rx1); \
  filter_8bit(rx0, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \
  filter_8bit(rx1, post_tf_lo_s, post_tf_hi_s, mask_0f, tmp0); \
  s0 ^= rx0 ^ vec_rl(rx0, rotate2) ^ vec_rl(rx0, rotate10) ^ \
        vec_rl(rx0, rotate18) ^ vec_rl(rx0, rotate24); \
  r0 ^= rx1 ^ vec_rl(rx1, rotate2) ^ vec_rl(rx1, rotate10) ^ \
        vec_rl(rx1, rotate18) ^ vec_rl(rx1, rotate24); \
})
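
/* mask_0f isolates the low nibble of each byte for filter_8bit.  The pre_tf
 * and post_tf pairs are the nibble lookup tables for the affine transforms
 * applied before and after vsbox, and rotate2/rotate10/rotate18/rotate24
 * hold the rotation amounts of SM4's linear transform L. */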

static const vector4x_u32 mask_0f =
  { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f };
static const vector2x_u64 pre_tf_lo_s =
  { 0x9096E3E575730600ULL, 0xC6C0B5B323255056ULL };
static const vector2x_u64 pre_tf_hi_s =
  { 0xE341AA08EA48A301ULL, 0xF153B81AF85AB113ULL };
static const vector2x_u64 post_tf_lo_s =
  { 0x6F53C6FA95A93C00ULL, 0xD9E5704C231F8AB6ULL };
static const vector2x_u64 post_tf_hi_s =
  { 0x9A4635E9479BE834ULL, 0x25F98A56F824578BULL };
static const vector4x_u32 rotate2 = { 2, 2, 2, 2 };
static const vector4x_u32 rotate10 = { 10, 10, 10, 10 };
static const vector4x_u32 rotate18 = { 18, 18, 18, 18 };
static const vector4x_u32 rotate24 = { 24, 24, 24, 24 };
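
/* Process 16 blocks with the 32-entry round key schedule in 'rk'.  Blocks
 * are loaded as four groups of four, transposed so that the four SM4 state
 * words of each group sit in separate vectors, run through all 32 rounds,
 * and transposed back before storing.  The same code path serves encryption
 * and decryption; the caller is expected to pass the round keys in the
 * appropriate order. */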

static ASM_FUNC_ATTR_INLINE void
sm4_ppc_crypt_blk16(u32 *rk, byte *out, const byte *in)
{
  vector4x_u32 ra0, ra1, ra2, ra3;
  vector4x_u32 rb0, rb1, rb2, rb3;
  vector4x_u32 rc0, rc1, rc2, rc3;
  vector4x_u32 rd0, rd1, rd2, rd3;
  vector4x_u32 tmp0, tmp1;
  u32 *rk_end;

  ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
  ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
  ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
  ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
  rb0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16));
  rb1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16));
  rb2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16));
  rb3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16));
  in += 8 * 16;
  rc0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
  rc1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
  rc2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
  rc3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
  rd0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16));
  rd1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16));
  rd2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16));
  rd3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16));

  transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1);
  transpose_4x4(rb0, rb1, rb2, rb3, tmp0, tmp1);
  transpose_4x4(rc0, rc1, rc2, rc3, tmp0, tmp1);
  transpose_4x4(rd0, rd1, rd2, rd3, tmp0, tmp1);

  for (rk_end = rk + 32; rk < rk_end; rk += 4)
    {
      vector4x_u32 r4keys = vec_xl(0, rk);
      ROUND8(0, ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3);
      ROUND8(0, rc0, rc1, rc2, rc3, rd0, rd1, rd2, rd3);
      ROUND8(1, ra1, ra2, ra3, ra0, rb1, rb2, rb3, rb0);
      ROUND8(1, rc1, rc2, rc3, rc0, rd1, rd2, rd3, rd0);
      ROUND8(2, ra2, ra3, ra0, ra1, rb2, rb3, rb0, rb1);
      ROUND8(2, rc2, rc3, rc0, rc1, rd2, rd3, rd0, rd1);
      ROUND8(3, ra3, ra0, ra1, ra2, rb3, rb0, rb1, rb2);
      ROUND8(3, rc3, rc0, rc1, rc2, rd3, rd0, rd1, rd2);
    }

  transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1);
  transpose_4x4(rb3, rb2, rb1, rb0, tmp0, tmp1);
  transpose_4x4(rc3, rc2, rc1, rc0, tmp0, tmp1);
  transpose_4x4(rd3, rd2, rd1, rd0, tmp0, tmp1);

  vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16);
  vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16);
  vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16);
  vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16);
  vec_xst((vector16x_u8)vec_revb(rb3), 0, out + 4 * 16);
  vec_xst((vector16x_u8)vec_revb(rb2), 0, out + 5 * 16);
  vec_xst((vector16x_u8)vec_revb(rb1), 0, out + 6 * 16);
  vec_xst((vector16x_u8)vec_revb(rb0), 0, out + 7 * 16);
  out += 8 * 16;
  vec_xst((vector16x_u8)vec_revb(rc3), 0, out + 0 * 16);
  vec_xst((vector16x_u8)vec_revb(rc2), 0, out + 1 * 16);
  vec_xst((vector16x_u8)vec_revb(rc1), 0, out + 2 * 16);
  vec_xst((vector16x_u8)vec_revb(rc0), 0, out + 3 * 16);
  vec_xst((vector16x_u8)vec_revb(rd3), 0, out + 4 * 16);
  vec_xst((vector16x_u8)vec_revb(rd2), 0, out + 5 * 16);
  vec_xst((vector16x_u8)vec_revb(rd1), 0, out + 6 * 16);
  vec_xst((vector16x_u8)vec_revb(rd0), 0, out + 7 * 16);
}
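
/* Same as sm4_ppc_crypt_blk16, but for exactly eight blocks processed as two
 * groups of four. */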

static ASM_FUNC_ATTR_INLINE void
sm4_ppc_crypt_blk8(u32 *rk, byte *out, const byte *in)
{
  vector4x_u32 ra0, ra1, ra2, ra3;
  vector4x_u32 rb0, rb1, rb2, rb3;
  vector4x_u32 tmp0, tmp1;
  u32 *rk_end;

  ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
  ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
  ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
  ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));
  rb0 = vec_revb((vector4x_u32)vec_xl(0, in + 4 * 16));
  rb1 = vec_revb((vector4x_u32)vec_xl(0, in + 5 * 16));
  rb2 = vec_revb((vector4x_u32)vec_xl(0, in + 6 * 16));
  rb3 = vec_revb((vector4x_u32)vec_xl(0, in + 7 * 16));

  transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1);
  transpose_4x4(rb0, rb1, rb2, rb3, tmp0, tmp1);

  for (rk_end = rk + 32; rk < rk_end; rk += 4)
    {
      vector4x_u32 r4keys = vec_xl(0, rk);
      ROUND8(0, ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3);
      ROUND8(1, ra1, ra2, ra3, ra0, rb1, rb2, rb3, rb0);
      ROUND8(2, ra2, ra3, ra0, ra1, rb2, rb3, rb0, rb1);
      ROUND8(3, ra3, ra0, ra1, ra2, rb3, rb0, rb1, rb2);
    }

  transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1);
  transpose_4x4(rb3, rb2, rb1, rb0, tmp0, tmp1);

  vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16);
  vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16);
  vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16);
  vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16);
  vec_xst((vector16x_u8)vec_revb(rb3), 0, out + 4 * 16);
  vec_xst((vector16x_u8)vec_revb(rb2), 0, out + 5 * 16);
  vec_xst((vector16x_u8)vec_revb(rb1), 0, out + 6 * 16);
  vec_xst((vector16x_u8)vec_revb(rb0), 0, out + 7 * 16);
}
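
/* Process one to four blocks.  Missing inputs are padded by duplicating
 * block 0 so that the 4-way code path can always run; only the requested
 * number of output blocks is stored. */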

static ASM_FUNC_ATTR_INLINE void
sm4_ppc_crypt_blk1_4(u32 *rk, byte *out, const byte *in, size_t nblks)
{
  vector4x_u32 ra0, ra1, ra2, ra3;
  vector4x_u32 tmp0, tmp1;
  u32 *rk_end;

  ra0 = vec_revb((vector4x_u32)vec_xl(0, in + 0 * 16));
  ra1 = ra0;
  ra2 = ra0;
  ra3 = ra0;
  if (LIKELY(nblks > 1))
    ra1 = vec_revb((vector4x_u32)vec_xl(0, in + 1 * 16));
  if (LIKELY(nblks > 2))
    ra2 = vec_revb((vector4x_u32)vec_xl(0, in + 2 * 16));
  if (LIKELY(nblks > 3))
    ra3 = vec_revb((vector4x_u32)vec_xl(0, in + 3 * 16));

  transpose_4x4(ra0, ra1, ra2, ra3, tmp0, tmp1);

  for (rk_end = rk + 32; rk < rk_end; rk += 4)
    {
      vector4x_u32 r4keys = vec_xl(0, rk);
      ROUND4(0, ra0, ra1, ra2, ra3);
      ROUND4(1, ra1, ra2, ra3, ra0);
      ROUND4(2, ra2, ra3, ra0, ra1);
      ROUND4(3, ra3, ra0, ra1, ra2);
    }

  transpose_4x4(ra3, ra2, ra1, ra0, tmp0, tmp1);

  vec_xst((vector16x_u8)vec_revb(ra3), 0, out + 0 * 16);
  if (LIKELY(nblks > 1))
    vec_xst((vector16x_u8)vec_revb(ra2), 0, out + 1 * 16);
  if (LIKELY(nblks > 2))
    vec_xst((vector16x_u8)vec_revb(ra1), 0, out + 2 * 16);
  if (LIKELY(nblks > 3))
    vec_xst((vector16x_u8)vec_revb(ra0), 0, out + 3 * 16);
}
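
/* Dispatch 1..16 blocks to the widest suitable code path: 16 blocks in one
 * go, then batches of 8, then the 1..4 block tail. */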

static ASM_FUNC_ATTR_INLINE void
sm4_ppc_crypt_blk1_16(u32 *rk, byte *out, const byte *in, size_t nblks)
{
  if (nblks >= 16)
    {
      sm4_ppc_crypt_blk16(rk, out, in);
      return;
    }

  while (nblks >= 8)
    {
      sm4_ppc_crypt_blk8(rk, out, in);
      in += 8 * 16;
      out += 8 * 16;
      nblks -= 8;
    }

  while (nblks)
    {
      size_t currblks = nblks > 4 ? 4 : nblks;
      sm4_ppc_crypt_blk1_4(rk, out, in, currblks);
      in += currblks * 16;
      out += currblks * 16;
      nblks -= currblks;
    }
}

ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P8 void
_gcry_sm4_ppc8le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
                               size_t nblks)
{
  sm4_ppc_crypt_blk1_16(rk, out, in, nblks);
}

ASM_FUNC_ATTR_NOINLINE FUNC_ATTR_TARGET_P9 void
_gcry_sm4_ppc9le_crypt_blk1_16(u32 *rk, byte *out, const byte *in,
                               size_t nblks)
{
#ifdef HAVE_FUNC_ATTR_TARGET
  /* Inline for POWER9 target optimization. */
  sm4_ppc_crypt_blk1_16(rk, out, in, nblks);
#else
  /* Target selection is not available; just call the other noinline
   * function. */
  _gcry_sm4_ppc8le_crypt_blk1_16(rk, out, in, nblks);
#endif
}

#endif /* ENABLE_PPC_CRYPTO_SUPPORT */
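
/*
 * Usage sketch (illustrative only, not part of this file).  The exported
 * functions take the 32-word SM4 round key schedule that libgcrypt's generic
 * SM4 code expands elsewhere, plus contiguous 16-byte blocks.  The names
 * 'round_keys', 'plain' and 'cipher' below are hypothetical:
 *
 *   u32 round_keys[32];       // filled in by the SM4 key schedule
 *   byte plain[16 * 16];      // up to sixteen 16-byte blocks
 *   byte cipher[16 * 16];
 *
 *   // Encrypt 16 blocks on a little-endian POWER8 (or later) CPU.
 *   _gcry_sm4_ppc8le_crypt_blk1_16(round_keys, cipher, plain, 16);
 */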