clmul.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. /* SPDX-License-Identifier: MIT
  2. *
  3. * Permission is hereby granted, free of charge, to any person
  4. * obtaining a copy of this software and associated documentation
  5. * files (the "Software"), to deal in the Software without
  6. * restriction, including without limitation the rights to use, copy,
  7. * modify, merge, publish, distribute, sublicense, and/or sell copies
  8. * of the Software, and to permit persons to whom the Software is
  9. * furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice shall be
  12. * included in all copies or substantial portions of the Software.
  13. *
  14. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  15. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  16. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  17. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  18. * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  19. * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  20. * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. * SOFTWARE.
  22. *
  23. * Copyright:
  24. * 2020 Evan Nemerson <evan@nemerson.com>
  25. * 2016 Thomas Pornin <pornin@bolet.org>
  26. */
  27. /* The portable version is based on the implementation in BearSSL,
  28. * which is MIT licensed, constant-time / branch-free, and documented
  29. * at https://www.bearssl.org/constanttime.html (specifically, we use
  30. * the implementation from ghash_ctmul64.c). */
  31. #if !defined(SIMDE_X86_CLMUL_H)
  32. #define SIMDE_X86_CLMUL_H
  33. #include "avx512/set.h"
  34. #include "avx512/setzero.h"
  35. #if !defined(SIMDE_X86_PCLMUL_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
  36. # define SIMDE_X86_PCLMUL_ENABLE_NATIVE_ALIASES
  37. #endif
  38. HEDLEY_DIAGNOSTIC_PUSH
  39. SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
  40. SIMDE_BEGIN_DECLS_
  41. SIMDE_FUNCTION_ATTRIBUTES
  42. uint64_t
  43. simde_x_clmul_u64(uint64_t x, uint64_t y) {
  44. uint64_t x0, x1, x2, x3;
  45. uint64_t y0, y1, y2, y3;
  46. uint64_t z0, z1, z2, z3;
  47. x0 = x & UINT64_C(0x1111111111111111);
  48. x1 = x & UINT64_C(0x2222222222222222);
  49. x2 = x & UINT64_C(0x4444444444444444);
  50. x3 = x & UINT64_C(0x8888888888888888);
  51. y0 = y & UINT64_C(0x1111111111111111);
  52. y1 = y & UINT64_C(0x2222222222222222);
  53. y2 = y & UINT64_C(0x4444444444444444);
  54. y3 = y & UINT64_C(0x8888888888888888);
  55. z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1);
  56. z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2);
  57. z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3);
  58. z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0);
  59. z0 &= UINT64_C(0x1111111111111111);
  60. z1 &= UINT64_C(0x2222222222222222);
  61. z2 &= UINT64_C(0x4444444444444444);
  62. z3 &= UINT64_C(0x8888888888888888);
  63. return z0 | z1 | z2 | z3;
  64. }
  65. static uint64_t
  66. simde_x_bitreverse_u64(uint64_t v) {
  67. #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
  68. uint8x8_t bytes = vreinterpret_u8_u64(vmov_n_u64(v));
  69. bytes = vrbit_u8(bytes);
  70. bytes = vrev64_u8(bytes);
  71. return vget_lane_u64(vreinterpret_u64_u8(bytes), 0);
  72. #elif defined(SIMDE_X86_GFNI_NATIVE)
  73. /* I don't think there is (or likely will ever be) a CPU with GFNI
  74. * but not pclmulq, but this may be useful for things other than
  75. * _mm_clmulepi64_si128. */
  76. __m128i vec = _mm_cvtsi64_si128(HEDLEY_STATIC_CAST(int64_t, v));
  77. /* Reverse bits within each byte */
  78. vec = _mm_gf2p8affine_epi64_epi8(vec, _mm_cvtsi64_si128(HEDLEY_STATIC_CAST(int64_t, UINT64_C(0x8040201008040201))), 0);
  79. /* Reverse bytes */
  80. #if defined(SIMDE_X86_SSSE3_NATIVE)
  81. vec = _mm_shuffle_epi8(vec, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
  82. #else
  83. vec = _mm_or_si128(_mm_slli_epi16(vec, 8), _mm_srli_epi16(vec, 8));
  84. vec = _mm_shufflelo_epi16(vec, _MM_SHUFFLE(0, 1, 2, 3));
  85. vec = _mm_shufflehi_epi16(vec, _MM_SHUFFLE(0, 1, 2, 3));
  86. #endif
  87. return HEDLEY_STATIC_CAST(uint64_t, _mm_cvtsi128_si64(vec));
  88. #elif HEDLEY_HAS_BUILTIN(__builtin_bitreverse64)
  89. return __builtin_bitreverse64(v);
  90. #else
  91. v = ((v >> 1) & UINT64_C(0x5555555555555555)) | ((v & UINT64_C(0x5555555555555555)) << 1);
  92. v = ((v >> 2) & UINT64_C(0x3333333333333333)) | ((v & UINT64_C(0x3333333333333333)) << 2);
  93. v = ((v >> 4) & UINT64_C(0x0F0F0F0F0F0F0F0F)) | ((v & UINT64_C(0x0F0F0F0F0F0F0F0F)) << 4);
  94. v = ((v >> 8) & UINT64_C(0x00FF00FF00FF00FF)) | ((v & UINT64_C(0x00FF00FF00FF00FF)) << 8);
  95. v = ((v >> 16) & UINT64_C(0x0000FFFF0000FFFF)) | ((v & UINT64_C(0x0000FFFF0000FFFF)) << 16);
  96. return (v >> 32) | (v << 32);
  97. #endif
  98. }
  99. SIMDE_FUNCTION_ATTRIBUTES
  100. simde__m128i
  101. simde_mm_clmulepi64_si128 (simde__m128i a, simde__m128i b, const int imm8)
  102. SIMDE_REQUIRE_CONSTANT(imm8) {
  103. simde__m128i_private
  104. a_ = simde__m128i_to_private(a),
  105. b_ = simde__m128i_to_private(b),
  106. r_;
  107. #if SIMDE_NATURAL_VECTOR_SIZE_GE(128)
  108. #if defined(SIMDE_SHUFFLE_VECTOR_)
  109. switch (imm8 & 0x11) {
  110. case 0x00:
  111. b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, b_.u64, 0, 0);
  112. a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, a_.u64, 0, 0);
  113. break;
  114. case 0x01:
  115. b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, b_.u64, 0, 0);
  116. a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, a_.u64, 1, 1);
  117. break;
  118. case 0x10:
  119. b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, b_.u64, 1, 1);
  120. a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, a_.u64, 0, 0);
  121. break;
  122. case 0x11:
  123. b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, b_.u64, 1, 1);
  124. a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, a_.u64, 1, 1);
  125. break;
  126. }
  127. #else
  128. {
  129. const uint64_t A = a_.u64[(imm8 ) & 1];
  130. const uint64_t B = b_.u64[(imm8 >> 4) & 1];
  131. SIMDE_VECTORIZE
  132. for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
  133. a_.u64[i] = A;
  134. b_.u64[i] = B;
  135. }
  136. }
  137. #endif
  138. simde__m128i_private reversed_;
  139. {
  140. #if defined(SIMDE_SHUFFLE_VECTOR_)
  141. reversed_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, b_.u64, 1, 3);
  142. #else
  143. reversed_.u64[0] = a_.u64[1];
  144. reversed_.u64[1] = b_.u64[1];
  145. #endif
  146. SIMDE_VECTORIZE
  147. for (size_t i = 0 ; i < (sizeof(reversed_.u64) / sizeof(reversed_.u64[0])) ; i++) {
  148. reversed_.u64[i] = simde_x_bitreverse_u64(reversed_.u64[i]);
  149. }
  150. }
  151. #if defined(SIMDE_SHUFFLE_VECTOR_)
  152. a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, reversed_.u64, 0, 2);
  153. b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, reversed_.u64, 1, 3);
  154. #else
  155. a_.u64[1] = reversed_.u64[0];
  156. b_.u64[1] = reversed_.u64[1];
  157. #endif
  158. SIMDE_VECTORIZE
  159. for (size_t i = 0 ; i < (sizeof(reversed_.u64) / sizeof(reversed_.u64[0])) ; i++) {
  160. r_.u64[i] = simde_x_clmul_u64(a_.u64[i], b_.u64[i]);
  161. }
  162. r_.u64[1] = simde_x_bitreverse_u64(r_.u64[1]) >> 1;
  163. #else
  164. r_.u64[0] = simde_x_clmul_u64( a_.u64[imm8 & 1], b_.u64[(imm8 >> 4) & 1]);
  165. r_.u64[1] = simde_x_bitreverse_u64(simde_x_clmul_u64(simde_x_bitreverse_u64(a_.u64[imm8 & 1]), simde_x_bitreverse_u64(b_.u64[(imm8 >> 4) & 1]))) >> 1;
  166. #endif
  167. return simde__m128i_from_private(r_);
  168. }
  169. #if defined(SIMDE_X86_PCLMUL_NATIVE)
  170. #if defined(HEDLEY_MCST_LCC_VERSION)
  171. #define simde_mm_clmulepi64_si128(a, b, imm8) (__extension__ ({ \
  172. SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS \
  173. _mm_clmulepi64_si128((a), (b), (imm8)); \
  174. SIMDE_LCC_REVERT_DEPRECATED_WARNINGS \
  175. }))
  176. #else
  177. #define simde_mm_clmulepi64_si128(a, b, imm8) _mm_clmulepi64_si128(a, b, imm8)
  178. #endif
  179. #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) && !defined(__clang__)
  180. #define simde_mm_clmulepi64_si128(a, b, imm8) \
  181. simde__m128i_from_neon_u64( \
  182. vreinterpretq_u64_p128( \
  183. vmull_p64( \
  184. vgetq_lane_p64(vreinterpretq_p64_u64(simde__m128i_to_neon_u64(a)), (imm8 ) & 1), \
  185. vgetq_lane_p64(vreinterpretq_p64_u64(simde__m128i_to_neon_u64(b)), (imm8 >> 4) & 1) \
  186. ) \
  187. ) \
  188. )
  189. #endif
  190. #if defined(SIMDE_X86_PCLMUL_ENABLE_NATIVE_ALIASES)
  191. #undef _mm_clmulepi64_si128
  192. #define _mm_clmulepi64_si128(a, b, imm8) simde_mm_clmulepi64_si128(a, b, imm8)
  193. #endif
  194. SIMDE_FUNCTION_ATTRIBUTES
  195. simde__m256i
  196. simde_mm256_clmulepi64_epi128 (simde__m256i a, simde__m256i b, const int imm8)
  197. SIMDE_REQUIRE_CONSTANT(imm8) {
  198. simde__m256i_private
  199. a_ = simde__m256i_to_private(a),
  200. b_ = simde__m256i_to_private(b),
  201. r_;
  202. simde__m128i_private a_lo_, b_lo_, r_lo_, a_hi_, b_hi_, r_hi_;
  203. #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION)
  204. switch (imm8 & 0x01) {
  205. case 0x00:
  206. a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 0, 2);
  207. break;
  208. case 0x01:
  209. a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 1, 3);
  210. break;
  211. }
  212. switch (imm8 & 0x10) {
  213. case 0x00:
  214. b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 0, 2);
  215. break;
  216. case 0x10:
  217. b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 1, 3);
  218. break;
  219. }
  220. #else
  221. a_lo_.u64[0] = a_.u64[((imm8 >> 0) & 1) + 0];
  222. a_lo_.u64[1] = a_.u64[((imm8 >> 0) & 1) + 2];
  223. b_lo_.u64[0] = b_.u64[((imm8 >> 4) & 1) + 0];
  224. b_lo_.u64[1] = b_.u64[((imm8 >> 4) & 1) + 2];
  225. #endif
  226. SIMDE_VECTORIZE
  227. for (size_t i = 0 ; i < (sizeof(r_hi_.u64) / sizeof(r_hi_.u64[0])) ; i++) {
  228. a_hi_.u64[i] = simde_x_bitreverse_u64(a_lo_.u64[i]);
  229. b_hi_.u64[i] = simde_x_bitreverse_u64(b_lo_.u64[i]);
  230. r_lo_.u64[i] = simde_x_clmul_u64(a_lo_.u64[i], b_lo_.u64[i]);
  231. r_hi_.u64[i] = simde_x_clmul_u64(a_hi_.u64[i], b_hi_.u64[i]);
  232. r_hi_.u64[i] = simde_x_bitreverse_u64(r_hi_.u64[i]) >> 1;
  233. }
  234. #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION)
  235. r_.u64 = __builtin_shufflevector(r_lo_.u64, r_hi_.u64, 0, 2, 1, 3);
  236. #elif defined(SIMDE_SHUFFLE_VECTOR_)
  237. r_ = simde__m256i_to_private(simde_mm256_set_m128i(simde__m128i_from_private(r_hi_), simde__m128i_from_private(r_lo_)));
  238. r_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 32, r_.u64, r_.u64, 0, 2, 1, 3);
  239. #else
  240. r_.u64[0] = r_lo_.u64[0];
  241. r_.u64[1] = r_hi_.u64[0];
  242. r_.u64[2] = r_lo_.u64[1];
  243. r_.u64[3] = r_hi_.u64[1];
  244. #endif
  245. return simde__m256i_from_private(r_);
  246. }
  247. #if defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  248. #define simde_mm256_clmulepi64_epi128(a, b, imm8) _mm256_clmulepi64_epi128(a, b, imm8)
  249. #endif
  250. #if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  251. #undef _mm256_clmulepi64_epi128
  252. #define _mm256_clmulepi64_epi128(a, b, imm8) simde_mm256_clmulepi64_epi128(a, b, imm8)
  253. #endif
  254. SIMDE_FUNCTION_ATTRIBUTES
  255. simde__m512i
  256. simde_mm512_clmulepi64_epi128 (simde__m512i a, simde__m512i b, const int imm8)
  257. SIMDE_REQUIRE_CONSTANT(imm8) {
  258. simde__m512i_private
  259. a_ = simde__m512i_to_private(a),
  260. b_ = simde__m512i_to_private(b),
  261. r_;
  262. #if defined(HEDLEY_MSVC_VERSION)
  263. r_ = simde__m512i_to_private(simde_mm512_setzero_si512());
  264. #endif
  265. #if SIMDE_NATURAL_VECTOR_SIZE_LE(256)
  266. switch (imm8 & 0x11) {
  267. case 0x00:
  268. r_.m256i[0] = simde_mm256_clmulepi64_epi128(a_.m256i[0], b_.m256i[0], 0x00);
  269. r_.m256i[1] = simde_mm256_clmulepi64_epi128(a_.m256i[1], b_.m256i[1], 0x00);
  270. break;
  271. case 0x01:
  272. r_.m256i[0] = simde_mm256_clmulepi64_epi128(a_.m256i[0], b_.m256i[0], 0x01);
  273. r_.m256i[1] = simde_mm256_clmulepi64_epi128(a_.m256i[1], b_.m256i[1], 0x01);
  274. break;
  275. case 0x10:
  276. r_.m256i[0] = simde_mm256_clmulepi64_epi128(a_.m256i[0], b_.m256i[0], 0x10);
  277. r_.m256i[1] = simde_mm256_clmulepi64_epi128(a_.m256i[1], b_.m256i[1], 0x10);
  278. break;
  279. case 0x11:
  280. r_.m256i[0] = simde_mm256_clmulepi64_epi128(a_.m256i[0], b_.m256i[0], 0x11);
  281. r_.m256i[1] = simde_mm256_clmulepi64_epi128(a_.m256i[1], b_.m256i[1], 0x11);
  282. break;
  283. }
  284. #else
  285. simde__m256i_private a_lo_, b_lo_, r_lo_, a_hi_, b_hi_, r_hi_;
  286. #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION)
  287. switch (imm8 & 0x01) {
  288. case 0x00:
  289. a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 0, 2, 4, 6);
  290. break;
  291. case 0x01:
  292. a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 1, 3, 5, 7);
  293. break;
  294. }
  295. switch (imm8 & 0x10) {
  296. case 0x00:
  297. b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 0, 2, 4, 6);
  298. break;
  299. case 0x10:
  300. b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 1, 3, 5, 7);
  301. break;
  302. }
  303. #else
  304. a_lo_.u64[0] = a_.u64[((imm8 >> 0) & 1) + 0];
  305. a_lo_.u64[1] = a_.u64[((imm8 >> 0) & 1) + 2];
  306. a_lo_.u64[2] = a_.u64[((imm8 >> 0) & 1) + 4];
  307. a_lo_.u64[3] = a_.u64[((imm8 >> 0) & 1) + 6];
  308. b_lo_.u64[0] = b_.u64[((imm8 >> 4) & 1) + 0];
  309. b_lo_.u64[1] = b_.u64[((imm8 >> 4) & 1) + 2];
  310. b_lo_.u64[2] = b_.u64[((imm8 >> 4) & 1) + 4];
  311. b_lo_.u64[3] = b_.u64[((imm8 >> 4) & 1) + 6];
  312. #endif
  313. SIMDE_VECTORIZE
  314. for (size_t i = 0 ; i < (sizeof(r_hi_.u64) / sizeof(r_hi_.u64[0])) ; i++) {
  315. a_hi_.u64[i] = simde_x_bitreverse_u64(a_lo_.u64[i]);
  316. b_hi_.u64[i] = simde_x_bitreverse_u64(b_lo_.u64[i]);
  317. r_lo_.u64[i] = simde_x_clmul_u64(a_lo_.u64[i], b_lo_.u64[i]);
  318. r_hi_.u64[i] = simde_x_clmul_u64(a_hi_.u64[i], b_hi_.u64[i]);
  319. r_hi_.u64[i] = simde_x_bitreverse_u64(r_hi_.u64[i]) >> 1;
  320. }
  321. #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION)
  322. r_.u64 = __builtin_shufflevector(r_lo_.u64, r_hi_.u64, 0, 4, 1, 5, 2, 6, 3, 7);
  323. #else
  324. r_.u64[0] = r_lo_.u64[0];
  325. r_.u64[1] = r_hi_.u64[0];
  326. r_.u64[2] = r_lo_.u64[1];
  327. r_.u64[3] = r_hi_.u64[1];
  328. r_.u64[4] = r_lo_.u64[2];
  329. r_.u64[5] = r_hi_.u64[2];
  330. r_.u64[6] = r_lo_.u64[3];
  331. r_.u64[7] = r_hi_.u64[3];
  332. #endif
  333. #endif
  334. return simde__m512i_from_private(r_);
  335. }
  336. #if defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
  337. #define simde_mm512_clmulepi64_epi128(a, b, imm8) _mm512_clmulepi64_epi128(a, b, imm8)
  338. #endif
  339. #if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  340. #undef _mm512_clmulepi64_epi128
  341. #define _mm512_clmulepi64_epi128(a, b, imm8) simde_mm512_clmulepi64_epi128(a, b, imm8)
  342. #endif
  343. SIMDE_END_DECLS_
  344. HEDLEY_DIAGNOSTIC_POP
  345. #endif /* !defined(SIMDE_X86_CLMUL_H) */