util_simd.h

/*
 * Copyright 2011-2013 Intel Corporation
 * Modifications Copyright 2014, Blender Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef __UTIL_SIMD_TYPES_H__
#define __UTIL_SIMD_TYPES_H__

#ifndef __KERNEL_GPU__

#  include <limits>

#  include "util/util_defines.h"

/* SSE intrinsics headers.
 *
 * We assume the __KERNEL_SSEX__ flags have been defined at this point. */
#  ifndef FREE_WINDOWS64
#    ifdef _MSC_VER
#      include <intrin.h>
#    elif (defined(__x86_64__) || defined(__i386__))
#      include <x86intrin.h>
#    endif
#  else
/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
 * Since we can't avoid including <windows.h>, better only include that. */
#    include "util/util_windows.h"
#  endif

#  if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
#    define SIMD_SET_FLUSH_TO_ZERO \
      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#  else
#    define SIMD_SET_FLUSH_TO_ZERO
#  endif
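
/* Usage sketch (illustrative, not defined in this header): MXCSR flags are
 * per-thread, so SIMD_SET_FLUSH_TO_ZERO is meant to be invoked once at the
 * start of each worker thread before FP-heavy work, e.g. in a hypothetical
 * thread entry point:
 *
 *   void render_thread_run()
 *   {
 *     SIMD_SET_FLUSH_TO_ZERO;  // enable flush-to-zero and denormals-are-zero
 *     // ... per-thread rendering work ...
 *   }
 */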
CCL_NAMESPACE_BEGIN

#  ifdef __KERNEL_SSE2__

extern const __m128 _mm_lookupmask_ps[16];

/* Special Types */

static struct TrueTy {
  __forceinline operator bool() const
  {
    return true;
  }
} True ccl_maybe_unused;

static struct FalseTy {
  __forceinline operator bool() const
  {
    return false;
  }
} False ccl_maybe_unused;

static struct NegInfTy {
  __forceinline operator float() const
  {
    return -std::numeric_limits<float>::infinity();
  }
  __forceinline operator int() const
  {
    return std::numeric_limits<int>::min();
  }
} neg_inf ccl_maybe_unused;

static struct PosInfTy {
  __forceinline operator float() const
  {
    return std::numeric_limits<float>::infinity();
  }
  __forceinline operator int() const
  {
    return std::numeric_limits<int>::max();
  }
} inf ccl_maybe_unused, pos_inf ccl_maybe_unused;
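
/* Usage sketch (illustrative): the sentinels above convert implicitly to both
 * float and int, so callers can write type-agnostic initializations, e.g. in
 * hypothetical caller code:
 *
 *   float best_t = pos_inf;    // +infinity
 *   int best_index = neg_inf;  // INT_MIN
 *   bool done = False;         // plain false
 */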
/* Intrinsics Functions */

#    if defined(__BMI__) && defined(__GNUC__)
#      ifndef _tzcnt_u32
#        define _tzcnt_u32 __tzcnt_u32
#      endif
#      ifndef _tzcnt_u64
#        define _tzcnt_u64 __tzcnt_u64
#      endif
#    endif

#    if defined(__LZCNT__)
#      define _lzcnt_u32 __lzcnt32
#      define _lzcnt_u64 __lzcnt64
#    endif

#    if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)

__forceinline int __popcnt(int in)
{
  return _mm_popcnt_u32(in);
}

#      if !defined(_MSC_VER)
__forceinline unsigned int __popcnt(unsigned int in)
{
  return _mm_popcnt_u32(in);
}
#      endif

#      if defined(__KERNEL_64_BIT__)
__forceinline long long __popcnt(long long in)
{
  return _mm_popcnt_u64(in);
}
__forceinline size_t __popcnt(size_t in)
{
  return _mm_popcnt_u64(in);
}
#      endif
__forceinline int __bsf(int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  unsigned long r = 0;
  _BitScanForward(&r, v);
  return r;
#      endif
}

__forceinline unsigned int __bsf(unsigned int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  unsigned long r = 0;
  _BitScanForward(&r, v);
  return r;
#      endif
}

__forceinline int __bsr(int v)
{
  unsigned long r = 0;
  _BitScanReverse(&r, v);
  return r;
}

__forceinline int __btc(int v, int i)
{
  long r = v;
  _bittestandcomplement(&r, i);
  return r;
}

__forceinline int __bts(int v, int i)
{
  long r = v;
  _bittestandset(&r, i);
  return r;
}

__forceinline int __btr(int v, int i)
{
  long r = v;
  _bittestandreset(&r, i);
  return r;
}

__forceinline int bitscan(int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  return __bsf(v);
#      endif
}

__forceinline int clz(const int x)
{
#      if defined(__KERNEL_AVX2__)
  return _lzcnt_u32(x);
#      else
  if (UNLIKELY(x == 0))
    return 32;
  return 31 - __bsr(x);
#      endif
}

__forceinline int __bscf(int &v)
{
  int i = __bsf(v);
  v &= v - 1;
  return i;
}

__forceinline unsigned int __bscf(unsigned int &v)
{
  unsigned int i = __bsf(v);
  v &= v - 1;
  return i;
}
#      if defined(__KERNEL_64_BIT__)

__forceinline size_t __bsf(size_t v)
{
#        if defined(__KERNEL_AVX2__)
  return _tzcnt_u64(v);
#        else
  unsigned long r = 0;
  _BitScanForward64(&r, v);
  return r;
#        endif
}

__forceinline size_t __bsr(size_t v)
{
  unsigned long r = 0;
  _BitScanReverse64(&r, v);
  return r;
}

__forceinline size_t __btc(size_t v, size_t i)
{
  size_t r = v;
  _bittestandcomplement64((__int64 *)&r, i);
  return r;
}

__forceinline size_t __bts(size_t v, size_t i)
{
  __int64 r = v;
  _bittestandset64(&r, i);
  return r;
}

__forceinline size_t __btr(size_t v, size_t i)
{
  __int64 r = v;
  _bittestandreset64(&r, i);
  return r;
}

__forceinline size_t bitscan(size_t v)
{
#        if defined(__KERNEL_AVX2__)
#          if defined(__KERNEL_64_BIT__)
  return _tzcnt_u64(v);
#          else
  return _tzcnt_u32(v);
#          endif
#        else
  return __bsf(v);
#        endif
}

__forceinline size_t __bscf(size_t &v)
{
  size_t i = __bsf(v);
  v &= v - 1;
  return i;
}

#      endif /* __KERNEL_64_BIT__ */
#    else /* _WIN32 */

__forceinline unsigned int __popcnt(unsigned int in)
{
  int r = 0;
  asm("popcnt %1,%0" : "=r"(r) : "r"(in));
  return r;
}

__forceinline int __bsf(int v)
{
  int r = 0;
  asm("bsf %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline int __bsr(int v)
{
  int r = 0;
  asm("bsr %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline int __btc(int v, int i)
{
  int r = 0;
  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline int __bts(int v, int i)
{
  int r = 0;
  asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline int __btr(int v, int i)
{
  int r = 0;
  asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

#      if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
          !(defined(__ILP32__) && defined(__x86_64__))
__forceinline size_t __bsf(size_t v)
{
  size_t r = 0;
  asm("bsf %1,%0" : "=r"(r) : "r"(v));
  return r;
}
#      endif

__forceinline unsigned int __bsf(unsigned int v)
{
  unsigned int r = 0;
  asm("bsf %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline size_t __bsr(size_t v)
{
  size_t r = 0;
  asm("bsr %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline size_t __btc(size_t v, size_t i)
{
  size_t r = 0;
  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline size_t __bts(size_t v, size_t i)
{
  size_t r = 0;
  asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline size_t __btr(size_t v, size_t i)
{
  size_t r = 0;
  asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline int bitscan(int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  return __bsf(v);
#      endif
}

__forceinline unsigned int bitscan(unsigned int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  return __bsf(v);
#      endif
}

#      if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
          !(defined(__ILP32__) && defined(__x86_64__))
__forceinline size_t bitscan(size_t v)
{
#        if defined(__KERNEL_AVX2__)
#          if defined(__KERNEL_64_BIT__)
  return _tzcnt_u64(v);
#          else
  return _tzcnt_u32(v);
#          endif
#        else
  return __bsf(v);
#        endif
}
#      endif

__forceinline int clz(const int x)
{
#      if defined(__KERNEL_AVX2__)
  return _lzcnt_u32(x);
#      else
  if (UNLIKELY(x == 0))
    return 32;
  return 31 - __bsr(x);
#      endif
}

__forceinline int __bscf(int &v)
{
  int i = bitscan(v);
#      if defined(__KERNEL_AVX2__)
  v &= v - 1;
#      else
  v = __btc(v, i);
#      endif
  return i;
}

__forceinline unsigned int __bscf(unsigned int &v)
{
  unsigned int i = bitscan(v);
  v &= v - 1;
  return i;
}

#      if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
          !(defined(__ILP32__) && defined(__x86_64__))
__forceinline size_t __bscf(size_t &v)
{
  size_t i = bitscan(v);
#        if defined(__KERNEL_AVX2__)
  v &= v - 1;
#        else
  v = __btc(v, i);
#        endif
  return i;
}
#      endif

#    endif /* _WIN32 */
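
/* Usage sketch (illustrative): bitscan() returns the index of the lowest set
 * bit and __bscf() additionally clears it, which is the usual pattern for
 * iterating over an active-lane mask. Assuming a hypothetical comparison
 * result `cmp` and a hypothetical per-lane function:
 *
 *   unsigned int mask = (unsigned int)_mm_movemask_ps(cmp);
 *   while (mask != 0) {
 *     const unsigned int lane = __bscf(mask);  // lowest set bit, then cleared
 *     process_lane(lane);                      // hypothetical per-lane work
 *   }
 */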
/* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test
 * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
 * platforms when compiling code outside the kernel. */
#    if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))

/* Emulation of SSE4 functions with SSE2 */

#      define _MM_FROUND_TO_NEAREST_INT 0x00
#      define _MM_FROUND_TO_NEG_INF 0x01
#      define _MM_FROUND_TO_POS_INF 0x02
#      define _MM_FROUND_TO_ZERO 0x03
#      define _MM_FROUND_CUR_DIRECTION 0x04

#      undef _mm_blendv_ps
#      define _mm_blendv_ps _mm_blendv_ps_emu
__forceinline __m128 _mm_blendv_ps_emu(__m128 value, __m128 input, __m128 mask)
{
  __m128i isignmask = _mm_set1_epi32(0x80000000);
  __m128 signmask = _mm_castsi128_ps(isignmask);
  __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
  __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
  __m128 cmpmask = _mm_castsi128_ps(icmpmask);
  return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
}

#      undef _mm_blend_ps
#      define _mm_blend_ps _mm_blend_ps_emu
__forceinline __m128 _mm_blend_ps_emu(__m128 value, __m128 input, const int mask)
{
  assert(mask < 0x10);
  return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
}

#      undef _mm_blendv_epi8
#      define _mm_blendv_epi8 _mm_blendv_epi8_emu
__forceinline __m128i _mm_blendv_epi8_emu(__m128i value, __m128i input, __m128i mask)
{
  return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
}

#      undef _mm_min_epi32
#      define _mm_min_epi32 _mm_min_epi32_emu
__forceinline __m128i _mm_min_epi32_emu(__m128i value, __m128i input)
{
  return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
}

#      undef _mm_max_epi32
#      define _mm_max_epi32 _mm_max_epi32_emu
__forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
{
  return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
}

#      undef _mm_extract_epi32
#      define _mm_extract_epi32 _mm_extract_epi32_emu
__forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
{
  switch (index) {
    case 0:
      return _mm_cvtsi128_si32(input);
    case 1:
      return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
    case 2:
      return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
    case 3:
      return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3)));
    default:
      assert(false);
      return 0;
  }
}

#      undef _mm_insert_epi32
#      define _mm_insert_epi32 _mm_insert_epi32_emu
__forceinline __m128i _mm_insert_epi32_emu(__m128i value, int input, const int index)
{
  assert(index >= 0 && index < 4);
  ((int *)&value)[index] = input;
  return value;
}

#      undef _mm_insert_ps
#      define _mm_insert_ps _mm_insert_ps_emu
__forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input, const int index)
{
  assert(index < 0x100);
  ((float *)&value)[(index >> 4) & 0x3] = ((float *)&input)[index >> 6];
  return _mm_andnot_ps(_mm_lookupmask_ps[index & 0xf], value);
}

#      undef _mm_round_ps
#      define _mm_round_ps _mm_round_ps_emu
__forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
{
  switch (flags) {
    case _MM_FROUND_TO_NEAREST_INT:
      return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
    case _MM_FROUND_TO_NEG_INF:
      return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
    case _MM_FROUND_TO_POS_INF:
      return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(0.5f))));
    case _MM_FROUND_TO_ZERO:
      return _mm_cvtepi32_ps(_mm_cvttps_epi32(value));
  }
  return value;
}

#    endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
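
/* Usage sketch (illustrative): with the emulation above, kernel code can use
 * the SSE4.1 names unconditionally; on SSE2-only builds they resolve to the
 * *_emu fallbacks, e.g.:
 *
 *   __m128 a = _mm_set1_ps(1.0f);
 *   __m128 b = _mm_set1_ps(2.0f);
 *   __m128 m = _mm_cmplt_ps(a, b);                      // all lanes selected
 *   __m128 r = _mm_blendv_ps(a, b, m);                  // picks b where mask set
 *   __m128 f = _mm_round_ps(b, _MM_FROUND_TO_NEG_INF);  // round down
 */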
#  else /* __KERNEL_SSE2__ */

/* This section is for utility functions which operate on non-register data
 * and which might be used from non-vectorized code.
 */

ccl_device_inline int bitscan(int value)
{
  assert(value != 0);
  int bit = 0;
  while ((value & (1 << bit)) == 0) {
    ++bit;
  }
  return bit;
}

ccl_device_inline int __bsr(int value)
{
  assert(value != 0);
  int bit = 0;
  while (value >>= 1) {
    ++bit;
  }
  return bit;
}

#  endif /* __KERNEL_SSE2__ */
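
/* Usage sketch (illustrative): the scalar fallbacks above mirror the SIMD
 * versions for builds without __KERNEL_SSE2__, e.g.:
 *
 *   int lowest = bitscan(40);  // 40 = 0b101000 -> 3 (lowest set bit)
 *   int highest = __bsr(40);   //               -> 5 (highest set bit)
 */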
/* quiet unused define warnings */
#  if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
      defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* do nothing */
#  endif

CCL_NAMESPACE_END

#endif /* __KERNEL_GPU__ */

#endif /* __UTIL_SIMD_TYPES_H__ */