/* crc-ppc.c - POWER8 vpmsum accelerated CRC implementation
 * Copyright (C) 2019-2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <https://www.gnu.org/licenses/>.
 * SPDX-License-Identifier: LGPL-2.1-or-later
 *
 */

#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "g10lib.h"

#include "bithelp.h"
#include "bufhelp.h"


#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && \
    defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
    defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && \
    __GNUC__ >= 4

#include <altivec.h>
#include "bufhelp.h"


#define ALWAYS_INLINE inline __attribute__((always_inline))
#define NO_INLINE __attribute__((noinline))
#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))

#define ASM_FUNC_ATTR          NO_INSTRUMENT_FUNCTION
#define ASM_FUNC_ATTR_INLINE   ASM_FUNC_ATTR ALWAYS_INLINE
#define ASM_FUNC_ATTR_NOINLINE ASM_FUNC_ATTR NO_INLINE

#define ALIGNED_64 __attribute__ ((aligned (64)))


typedef vector unsigned char vector16x_u8;
typedef vector unsigned int vector4x_u32;
typedef vector unsigned long long vector2x_u64;

/* Constants structure for generic reflected/non-reflected CRC32 PMULL
 * functions. */
struct crc32_consts_s
{
  /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
  unsigned long long k[6];
  /* my_p: { floor(x^64 / P(x)), P(x) } */
  unsigned long long my_p[2];
};

/* PMULL constants for CRC32 and CRC32RFC1510. */
static const struct crc32_consts_s crc32_consts ALIGNED_64 =
{
  { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
    U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
    U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
    U64_C(0x163cd6124), 0                   /* y = 2 */
  },
  { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
    U64_C(0x1f7011641), U64_C(0x1db710641)
  }
};

/* PMULL constants for CRC24RFC2440 (polynomial multiplied with x⁸). */
static const struct crc32_consts_s crc24rfc2440_consts ALIGNED_64 =
{
  { /* k[6] = x^(32*y) mod P(x) << 32 */
    U64_C(0x08289a00) << 32, U64_C(0x74b44a00) << 32, /* y = { 17, 15 } */
    U64_C(0xc4b14d00) << 32, U64_C(0xfd7e0c00) << 32, /* y = { 5, 3 } */
    U64_C(0xd9fe8c00) << 32, 0                        /* y = 2 */
  },
  { /* my_p[2] = { floor(x^64 / P(x)), P(x) } */
    U64_C(0x1f845fe24), U64_C(0x1864cfb00)
  }
};
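
/* Note on the primitive used below: the POWER8 'vpmsumd' instruction
 * carry-less multiplies each 64-bit half of its two vector operands and
 * XORs the two 128-bit products together.  In effect, every fold step in
 * the bulk loops computes
 *   acc = vpmsumd(acc, k) ^ next_block
 * over GF(2)[x], using the k constants above to advance the accumulator
 * past the freshly consumed input. */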
static ASM_FUNC_ATTR_INLINE vector2x_u64
asm_vpmsumd(vector2x_u64 a, vector2x_u64 b)
{
  __asm__("vpmsumd %0, %1, %2"
          : "=v" (a)
          : "v" (a), "v" (b));
  return a;
}

static ASM_FUNC_ATTR_INLINE vector2x_u64
asm_swap_u64(vector2x_u64 a)
{
  __asm__("xxswapd %x0, %x1"
          : "=wa" (a)
          : "wa" (a));
  return a;
}

static ASM_FUNC_ATTR_INLINE vector4x_u32
vec_sld_u32(vector4x_u32 a, vector4x_u32 b, unsigned int idx)
{
  return vec_sld (a, b, (4 * idx) & 15);
}
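
/* The tables below drive the partial-fold step: crc32_partial_fold_input_mask
 * zeroes the bytes of the final, partially filled 16-byte block that were
 * already processed, while the two shuffle tables shift the accumulated CRC
 * by the remaining length so that the last 1..15 input bytes can be folded
 * like a full block.  They are indexed by loading 16 bytes at an offset
 * derived from the remaining length. */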
static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_64 =
  {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  };

static const byte crc32_shuf_shift[3 * 16] ALIGNED_64 =
  {
    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
    0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
    0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
  };

static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_64 =
  {
    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
  };

static const vector16x_u8 bswap_const ALIGNED_64 =
  { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };


#define CRC_VEC_SWAP(v) ({ vector2x_u64 __vecu64 = (v); \
                           vec_perm(__vecu64, __vecu64, bswap_const); })
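
/* Endianness abstraction: the CRC_VEC_U64_LOAD* macros give the bulk code a
 * consistent view of the 64-bit lanes on both big- and little-endian builds.
 * They are built on 'lxvd2x' loads, combined with 'xxswapd' or a byte
 * permute where the element or byte order needs fixing up. */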
#ifdef WORDS_BIGENDIAN
# define CRC_VEC_U64_DEF(lo, hi) { (hi), (lo) }
# define CRC_VEC_U64_LOAD(offs, ptr) \
          asm_swap_u64(asm_vec_u64_load(offs, ptr))
# define CRC_VEC_U64_LOAD_LE(offs, ptr) \
          CRC_VEC_SWAP(asm_vec_u64_load(offs, ptr))
# define CRC_VEC_U64_LOAD_BE(offs, ptr) \
          asm_vec_u64_load(offs, ptr)
# define CRC_VEC_SWAP_TO_LE(v) CRC_VEC_SWAP(v)
# define CRC_VEC_SWAP_TO_BE(v) (v)
# define VEC_U64_LO 1
# define VEC_U64_HI 0

static ASM_FUNC_ATTR_INLINE vector2x_u64
asm_vec_u64_load(unsigned long offset, const void *ptr)
{
  vector2x_u64 vecu64;
#if __GNUC__ >= 4
  if (__builtin_constant_p (offset) && offset == 0)
    __asm__ volatile ("lxvd2x %x0,0,%1\n\t"
                      : "=wa" (vecu64)
                      : "r" ((uintptr_t)ptr)
                      : "memory");
  else
#endif
    __asm__ volatile ("lxvd2x %x0,%1,%2\n\t"
                      : "=wa" (vecu64)
                      : "r" (offset), "r" ((uintptr_t)ptr)
                      : "memory", "r0");
  return vecu64;
}

#else
# define CRC_VEC_U64_DEF(lo, hi) { (lo), (hi) }
# define CRC_VEC_U64_LOAD(offs, ptr) asm_vec_u64_load_le(offs, ptr)
# define CRC_VEC_U64_LOAD_LE(offs, ptr) asm_vec_u64_load_le(offs, ptr)
# define CRC_VEC_U64_LOAD_BE(offs, ptr) asm_vec_u64_load_be(offs, ptr)
# define CRC_VEC_SWAP_TO_LE(v) (v)
# define CRC_VEC_SWAP_TO_BE(v) CRC_VEC_SWAP(v)
# define VEC_U64_LO 0
# define VEC_U64_HI 1

static ASM_FUNC_ATTR_INLINE vector2x_u64
asm_vec_u64_load_le(unsigned long offset, const void *ptr)
{
  vector2x_u64 vecu64;
#if __GNUC__ >= 4
  if (__builtin_constant_p (offset) && offset == 0)
    __asm__ volatile ("lxvd2x %x0,0,%1\n\t"
                      : "=wa" (vecu64)
                      : "r" ((uintptr_t)ptr)
                      : "memory");
  else
#endif
    __asm__ volatile ("lxvd2x %x0,%1,%2\n\t"
                      : "=wa" (vecu64)
                      : "r" (offset), "r" ((uintptr_t)ptr)
                      : "memory", "r0");
  return asm_swap_u64(vecu64);
}

static ASM_FUNC_ATTR_INLINE vector2x_u64
asm_vec_u64_load_be(unsigned int offset, const void *ptr)
{
  static const vector16x_u8 vec_load_le_const =
    { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 };
  vector2x_u64 vecu64;

#if __GNUC__ >= 4
  if (__builtin_constant_p (offset) && offset == 0)
    __asm__ ("lxvd2x %%vs32,0,%1\n\t"
             "vperm %0,%%v0,%%v0,%2\n\t"
             : "=v" (vecu64)
             : "r" ((uintptr_t)(ptr)), "v" (vec_load_le_const)
             : "memory", "v0");
  else
#endif
    __asm__ ("lxvd2x %%vs32,%1,%2\n\t"
             "vperm %0,%%v0,%%v0,%3\n\t"
             : "=v" (vecu64)
             : "r" (offset), "r" ((uintptr_t)(ptr)),
               "v" (vec_load_le_const)
             : "memory", "r0", "v0");

  return vecu64;
}
#endif
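
/* Bulk processing (inlen >= 16 bytes) proceeds in the stages marked below:
 * fold four 16-byte blocks in parallel while enough input remains, collapse
 * the four accumulators into one, fold the remaining full blocks one at a
 * time, handle a trailing partial block with the mask/shuffle tables, and
 * finally reduce the 128-bit remainder to 32 bits with two reduction steps
 * followed by a Barrett reduction using my_p. */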
static ASM_FUNC_ATTR_INLINE void
crc32r_ppc8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
                     const struct crc32_consts_s *consts)
{
  vector4x_u32 zero = { 0, 0, 0, 0 };
  vector2x_u64 low_64bit_mask = CRC_VEC_U64_DEF((u64)-1, 0);
  vector2x_u64 low_32bit_mask = CRC_VEC_U64_DEF((u32)-1, 0);
  vector2x_u64 my_p = CRC_VEC_U64_LOAD(0, &consts->my_p[0]);
  vector2x_u64 k1k2 = CRC_VEC_U64_LOAD(0, &consts->k[1 - 1]);
  vector2x_u64 k3k4 = CRC_VEC_U64_LOAD(0, &consts->k[3 - 1]);
  vector2x_u64 k4lo = CRC_VEC_U64_DEF(k3k4[VEC_U64_HI], 0);
  vector2x_u64 k5lo = CRC_VEC_U64_LOAD(0, &consts->k[5 - 1]);
  vector2x_u64 crc = CRC_VEC_U64_DEF(*pcrc, 0);
  vector2x_u64 crc0, crc1, crc2, crc3;
  vector2x_u64 v0;

  if (inlen >= 8 * 16)
    {
      crc0 = CRC_VEC_U64_LOAD_LE(0 * 16, inbuf);
      crc0 ^= crc;
      crc1 = CRC_VEC_U64_LOAD_LE(1 * 16, inbuf);
      crc2 = CRC_VEC_U64_LOAD_LE(2 * 16, inbuf);
      crc3 = CRC_VEC_U64_LOAD_LE(3 * 16, inbuf);

      inbuf += 4 * 16;
      inlen -= 4 * 16;

      /* Fold by 4. */
      while (inlen >= 4 * 16)
        {
          v0 = CRC_VEC_U64_LOAD_LE(0 * 16, inbuf);
          crc0 = asm_vpmsumd(crc0, k1k2) ^ v0;

          v0 = CRC_VEC_U64_LOAD_LE(1 * 16, inbuf);
          crc1 = asm_vpmsumd(crc1, k1k2) ^ v0;

          v0 = CRC_VEC_U64_LOAD_LE(2 * 16, inbuf);
          crc2 = asm_vpmsumd(crc2, k1k2) ^ v0;

          v0 = CRC_VEC_U64_LOAD_LE(3 * 16, inbuf);
          crc3 = asm_vpmsumd(crc3, k1k2) ^ v0;

          inbuf += 4 * 16;
          inlen -= 4 * 16;
        }

      /* Fold 4 to 1. */
      crc1 ^= asm_vpmsumd(crc0, k3k4);
      crc2 ^= asm_vpmsumd(crc1, k3k4);
      crc3 ^= asm_vpmsumd(crc2, k3k4);
      crc = crc3;
    }
  else
    {
      v0 = CRC_VEC_U64_LOAD_LE(0, inbuf);
      crc ^= v0;

      inbuf += 16;
      inlen -= 16;
    }

  /* Fold by 1. */
  while (inlen >= 16)
    {
      v0 = CRC_VEC_U64_LOAD_LE(0, inbuf);

      crc = asm_vpmsumd(k3k4, crc);
      crc ^= v0;

      inbuf += 16;
      inlen -= 16;
    }

  /* Partial fold. */
  if (inlen)
    {
      /* Load last input and add padding zeros. */
      vector2x_u64 mask = CRC_VEC_U64_LOAD_LE(inlen, crc32_partial_fold_input_mask);
      vector2x_u64 shl_shuf = CRC_VEC_U64_LOAD_LE(inlen, crc32_refl_shuf_shift);
      vector2x_u64 shr_shuf = CRC_VEC_U64_LOAD_LE(inlen + 16, crc32_refl_shuf_shift);

      v0 = CRC_VEC_U64_LOAD_LE(inlen - 16, inbuf);
      v0 &= mask;

      crc = CRC_VEC_SWAP_TO_LE(crc);
      v0 |= (vector2x_u64)vec_perm((vector16x_u8)crc, (vector16x_u8)zero,
                                   (vector16x_u8)shr_shuf);
      crc = (vector2x_u64)vec_perm((vector16x_u8)crc, (vector16x_u8)zero,
                                   (vector16x_u8)shl_shuf);
      crc = asm_vpmsumd(k3k4, crc);
      crc ^= v0;

      inbuf += inlen;
      inlen -= inlen;
    }

  /* Final fold. */

  /* reduce 128-bits to 96-bits */
  v0 = asm_swap_u64(crc);
  v0 &= low_64bit_mask;
  crc = asm_vpmsumd(k4lo, crc);
  crc ^= v0;

  /* reduce 96-bits to 64-bits */
  v0 = (vector2x_u64)vec_sld_u32((vector4x_u32)crc,
                                 (vector4x_u32)crc, 3);  /* [x0][x3][x2][x1] */
  v0 &= low_64bit_mask;                                  /* [00][00][x2][x1] */
  crc = crc & low_32bit_mask;                            /* [00][00][00][x0] */
  crc = v0 ^ asm_vpmsumd(k5lo, crc);                     /* [00][00][xx][xx] */

  /* barrett reduction */
  v0 = crc << 32;                                        /* [00][00][x0][00] */
  v0 = asm_vpmsumd(my_p, v0);
  v0 = asm_swap_u64(v0);
  v0 = asm_vpmsumd(my_p, v0);
  crc = (vector2x_u64)vec_sld_u32((vector4x_u32)crc,
                                  zero, 1);              /* [00][x1][x0][00] */
  crc ^= v0;

  *pcrc = (u32)crc[VEC_U64_HI];
}
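
/* crc32r_ppc8_ce_reduction_4 folds one 32-bit chunk of input (already XORed
 * with the current CRC) down to a new 32-bit CRC value.  In effect this is a
 * Barrett reduction of the chunk using my_p, keeping only the 32 bits that
 * matter.  It is the building block for the sub-16-byte path below. */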
static ASM_FUNC_ATTR_INLINE u32
crc32r_ppc8_ce_reduction_4 (u32 data, u32 crc,
                            const struct crc32_consts_s *consts)
{
  vector4x_u32 zero = { 0, 0, 0, 0 };
  vector2x_u64 my_p = CRC_VEC_U64_LOAD(0, &consts->my_p[0]);
  vector2x_u64 v0 = CRC_VEC_U64_DEF((u64)data, 0);

  v0 = asm_vpmsumd(v0, my_p);                          /* [00][00][xx][xx] */
  v0 = (vector2x_u64)vec_sld_u32((vector4x_u32)v0,
                                 zero, 3);             /* [x0][00][00][00] */
  v0 = (vector2x_u64)vec_sld_u32((vector4x_u32)v0,
                                 (vector4x_u32)v0, 3); /* [00][x0][00][00] */
  v0 = asm_vpmsumd(v0, my_p);                          /* [00][00][xx][xx] */

  return (v0[VEC_U64_LO] >> 32) ^ crc;
}

static ASM_FUNC_ATTR_INLINE void
crc32r_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
                     const struct crc32_consts_s *consts)
{
  u32 crc = *pcrc;
  u32 data;

  while (inlen >= 4)
    {
      data = buf_get_le32(inbuf);
      data ^= crc;

      inlen -= 4;
      inbuf += 4;

      crc = crc32r_ppc8_ce_reduction_4 (data, 0, consts);
    }

  switch (inlen)
    {
    case 0:
      break;
    case 1:
      data = inbuf[0];
      data ^= crc;
      data <<= 24;
      crc >>= 8;
      crc = crc32r_ppc8_ce_reduction_4 (data, crc, consts);
      break;
    case 2:
      data = inbuf[0] << 0;
      data |= inbuf[1] << 8;
      data ^= crc;
      data <<= 16;
      crc >>= 16;
      crc = crc32r_ppc8_ce_reduction_4 (data, crc, consts);
      break;
    case 3:
      data = inbuf[0] << 0;
      data |= inbuf[1] << 8;
      data |= inbuf[2] << 16;
      data ^= crc;
      data <<= 8;
      crc >>= 24;
      crc = crc32r_ppc8_ce_reduction_4 (data, crc, consts);
      break;
    }

  *pcrc = crc;
}
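
/* The functions below are the non-reflected counterparts of the ones above.
 * They follow the same fold/reduce structure but work with byte-swapped
 * lanes, and are used for CRC24RFC2440, whose constants are premultiplied by
 * x^8 so that the 24-bit CRC can reuse the 32-bit machinery. */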
static ASM_FUNC_ATTR_INLINE void
crc32_ppc8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
                    const struct crc32_consts_s *consts)
{
  vector4x_u32 zero = { 0, 0, 0, 0 };
  vector2x_u64 low_96bit_mask = CRC_VEC_U64_DEF(~0, ~((u64)(u32)-1 << 32));
  vector2x_u64 p_my = asm_swap_u64(CRC_VEC_U64_LOAD(0, &consts->my_p[0]));
  vector2x_u64 p_my_lo, p_my_hi;
  vector2x_u64 k2k1 = asm_swap_u64(CRC_VEC_U64_LOAD(0, &consts->k[1 - 1]));
  vector2x_u64 k4k3 = asm_swap_u64(CRC_VEC_U64_LOAD(0, &consts->k[3 - 1]));
  vector2x_u64 k4hi = CRC_VEC_U64_DEF(0, consts->k[4 - 1]);
  vector2x_u64 k5hi = CRC_VEC_U64_DEF(0, consts->k[5 - 1]);
  vector2x_u64 crc = CRC_VEC_U64_DEF(0, _gcry_bswap64(*pcrc));
  vector2x_u64 crc0, crc1, crc2, crc3;
  vector2x_u64 v0;

  if (inlen >= 8 * 16)
    {
      crc0 = CRC_VEC_U64_LOAD_BE(0 * 16, inbuf);
      crc0 ^= crc;
      crc1 = CRC_VEC_U64_LOAD_BE(1 * 16, inbuf);
      crc2 = CRC_VEC_U64_LOAD_BE(2 * 16, inbuf);
      crc3 = CRC_VEC_U64_LOAD_BE(3 * 16, inbuf);

      inbuf += 4 * 16;
      inlen -= 4 * 16;

      /* Fold by 4. */
      while (inlen >= 4 * 16)
        {
          v0 = CRC_VEC_U64_LOAD_BE(0 * 16, inbuf);
          crc0 = asm_vpmsumd(crc0, k2k1) ^ v0;

          v0 = CRC_VEC_U64_LOAD_BE(1 * 16, inbuf);
          crc1 = asm_vpmsumd(crc1, k2k1) ^ v0;

          v0 = CRC_VEC_U64_LOAD_BE(2 * 16, inbuf);
          crc2 = asm_vpmsumd(crc2, k2k1) ^ v0;

          v0 = CRC_VEC_U64_LOAD_BE(3 * 16, inbuf);
          crc3 = asm_vpmsumd(crc3, k2k1) ^ v0;

          inbuf += 4 * 16;
          inlen -= 4 * 16;
        }

      /* Fold 4 to 1. */
      crc1 ^= asm_vpmsumd(crc0, k4k3);
      crc2 ^= asm_vpmsumd(crc1, k4k3);
      crc3 ^= asm_vpmsumd(crc2, k4k3);
      crc = crc3;
    }
  else
    {
      v0 = CRC_VEC_U64_LOAD_BE(0, inbuf);
      crc ^= v0;

      inbuf += 16;
      inlen -= 16;
    }

  /* Fold by 1. */
  while (inlen >= 16)
    {
      v0 = CRC_VEC_U64_LOAD_BE(0, inbuf);

      crc = asm_vpmsumd(k4k3, crc);
      crc ^= v0;

      inbuf += 16;
      inlen -= 16;
    }

  /* Partial fold. */
  if (inlen)
    {
      /* Load last input and add padding zeros. */
      vector2x_u64 mask = CRC_VEC_U64_LOAD_LE(inlen, crc32_partial_fold_input_mask);
      vector2x_u64 shl_shuf = CRC_VEC_U64_LOAD_LE(32 - inlen, crc32_refl_shuf_shift);
      vector2x_u64 shr_shuf = CRC_VEC_U64_LOAD_LE(inlen + 16, crc32_shuf_shift);

      v0 = CRC_VEC_U64_LOAD_LE(inlen - 16, inbuf);
      v0 &= mask;

      crc = CRC_VEC_SWAP_TO_LE(crc);
      crc2 = (vector2x_u64)vec_perm((vector16x_u8)crc, (vector16x_u8)zero,
                                    (vector16x_u8)shr_shuf);
      v0 |= crc2;
      v0 = CRC_VEC_SWAP(v0);
      crc = (vector2x_u64)vec_perm((vector16x_u8)crc, (vector16x_u8)zero,
                                   (vector16x_u8)shl_shuf);
      crc = asm_vpmsumd(k4k3, crc);
      crc ^= v0;

      inbuf += inlen;
      inlen -= inlen;
    }

  /* Final fold. */

  /* reduce 128-bits to 96-bits */
  v0 = (vector2x_u64)vec_sld_u32((vector4x_u32)crc,
                                 (vector4x_u32)zero, 2);
  crc = asm_vpmsumd(k4hi, crc);
  crc ^= v0; /* bottom 32-bit are zero */

  /* reduce 96-bits to 64-bits */
  v0 = crc & low_96bit_mask;    /* [00][x2][x1][00] */
  crc >>= 32;                   /* [00][x3][00][x0] */
  crc = asm_vpmsumd(k5hi, crc); /* [00][xx][xx][00] */
  crc ^= v0;                    /* top and bottom 32-bit are zero */

  /* barrett reduction */
  p_my_hi = p_my;
  p_my_lo = p_my;
  p_my_hi[VEC_U64_LO] = 0;
  p_my_lo[VEC_U64_HI] = 0;
  v0 = crc >> 32;                  /* [00][00][00][x1] */
  crc = asm_vpmsumd(p_my_hi, crc); /* [00][xx][xx][xx] */
  crc = (vector2x_u64)vec_sld_u32((vector4x_u32)crc,
                                  (vector4x_u32)crc, 3); /* [x0][00][x2][x1] */
  crc = asm_vpmsumd(p_my_lo, crc); /* [00][xx][xx][xx] */
  crc ^= v0;

  *pcrc = _gcry_bswap32(crc[VEC_U64_LO]);
}
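
/* Non-reflected counterpart of crc32r_ppc8_ce_reduction_4: a Barrett-style
 * reduction of one 32-bit chunk using my_p, with the result byte-swapped
 * back into the caller's CRC byte order. */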
static ASM_FUNC_ATTR_INLINE u32
crc32_ppc8_ce_reduction_4 (u32 data, u32 crc,
                           const struct crc32_consts_s *consts)
{
  vector2x_u64 my_p = CRC_VEC_U64_LOAD(0, &consts->my_p[0]);
  vector2x_u64 v0 = CRC_VEC_U64_DEF((u64)data << 32, 0);

  v0 = asm_vpmsumd(v0, my_p); /* [00][x1][x0][00] */
  v0[VEC_U64_LO] = 0;         /* [00][x1][00][00] */
  v0 = asm_vpmsumd(v0, my_p); /* [00][00][xx][xx] */

  return _gcry_bswap32(v0[VEC_U64_LO]) ^ crc;
}

static ASM_FUNC_ATTR_INLINE void
crc32_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
                    const struct crc32_consts_s *consts)
{
  u32 crc = *pcrc;
  u32 data;

  while (inlen >= 4)
    {
      data = buf_get_le32(inbuf);
      data ^= crc;
      data = _gcry_bswap32(data);

      inlen -= 4;
      inbuf += 4;

      crc = crc32_ppc8_ce_reduction_4 (data, 0, consts);
    }

  switch (inlen)
    {
    case 0:
      break;
    case 1:
      data = inbuf[0];
      data ^= crc;
      data = data & 0xffU;
      crc = crc >> 8;
      crc = crc32_ppc8_ce_reduction_4 (data, crc, consts);
      break;
    case 2:
      data = inbuf[0] << 0;
      data |= inbuf[1] << 8;
      data ^= crc;
      data = _gcry_bswap32(data << 16);
      crc = crc >> 16;
      crc = crc32_ppc8_ce_reduction_4 (data, crc, consts);
      break;
    case 3:
      data = inbuf[0] << 0;
      data |= inbuf[1] << 8;
      data |= inbuf[2] << 16;
      data ^= crc;
      data = _gcry_bswap32(data << 8);
      crc = crc >> 24;
      crc = crc32_ppc8_ce_reduction_4 (data, crc, consts);
      break;
    }

  *pcrc = crc;
}
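
/* Public entry points.  _gcry_crc32_ppc8_vpmsum computes the reflected
 * CRC-32 (the CRC32/CRC32RFC1510 constants above); _gcry_crc24rfc2440_ppc8_vpmsum
 * computes the OpenPGP CRC-24 via the non-reflected code path, using the
 * constants premultiplied by x^8.  Buffers shorter than 16 bytes are routed
 * to the *_less_than_16 helpers. */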
void ASM_FUNC_ATTR
_gcry_crc32_ppc8_vpmsum (u32 *pcrc, const byte *inbuf, size_t inlen)
{
  const struct crc32_consts_s *consts = &crc32_consts;

  if (!inlen)
    return;

  if (inlen >= 16)
    crc32r_ppc8_ce_bulk (pcrc, inbuf, inlen, consts);
  else
    crc32r_less_than_16 (pcrc, inbuf, inlen, consts);
}

void ASM_FUNC_ATTR
_gcry_crc24rfc2440_ppc8_vpmsum (u32 *pcrc, const byte *inbuf, size_t inlen)
{
  const struct crc32_consts_s *consts = &crc24rfc2440_consts;

  if (!inlen)
    return;

  /* Note: *pcrc in input endian. */

  if (inlen >= 16)
    crc32_ppc8_ce_bulk (pcrc, inbuf, inlen, consts);
  else
    crc32_less_than_16 (pcrc, inbuf, inlen, consts);
}

#endif