util_sseb.h 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312
  1. /*
  2. * Copyright 2011-2013 Intel Corporation
  3. * Modifications Copyright 2014, Blender Foundation.
  4. *
  5. * Licensed under the Apache License, Version 2.0(the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. #ifndef __UTIL_SSEB_H__
  18. #define __UTIL_SSEB_H__
  19. CCL_NAMESPACE_BEGIN
  20. #ifdef __KERNEL_SSE2__
  21. struct ssei;
  22. struct ssef;
  23. /*! 4-wide SSE bool type. */
  24. struct sseb {
  25. typedef sseb Mask; // mask type
  26. typedef ssei Int; // int type
  27. typedef ssef Float; // float type
  28. enum { size = 4 }; // number of SIMD elements
  29. union {
  30. __m128 m128;
  31. int32_t v[4];
  32. }; // data
  33. ////////////////////////////////////////////////////////////////////////////////
  34. /// Constructors, Assignment & Cast Operators
  35. ////////////////////////////////////////////////////////////////////////////////
  36. __forceinline sseb()
  37. {
  38. }
  39. __forceinline sseb(const sseb &other)
  40. {
  41. m128 = other.m128;
  42. }
  43. __forceinline sseb &operator=(const sseb &other)
  44. {
  45. m128 = other.m128;
  46. return *this;
  47. }
  48. __forceinline sseb(const __m128 input) : m128(input)
  49. {
  50. }
  51. __forceinline operator const __m128 &(void)const
  52. {
  53. return m128;
  54. }
  55. __forceinline operator const __m128i(void) const
  56. {
  57. return _mm_castps_si128(m128);
  58. }
  59. __forceinline operator const __m128d(void) const
  60. {
  61. return _mm_castps_pd(m128);
  62. }
  63. __forceinline sseb(bool a)
  64. : m128(_mm_lookupmask_ps[(size_t(a) << 3) | (size_t(a) << 2) | (size_t(a) << 1) | size_t(a)])
  65. {
  66. }
  67. __forceinline sseb(bool a, bool b)
  68. : m128(_mm_lookupmask_ps[(size_t(b) << 3) | (size_t(a) << 2) | (size_t(b) << 1) | size_t(a)])
  69. {
  70. }
  71. __forceinline sseb(bool a, bool b, bool c, bool d)
  72. : m128(_mm_lookupmask_ps[(size_t(d) << 3) | (size_t(c) << 2) | (size_t(b) << 1) | size_t(a)])
  73. {
  74. }
  75. __forceinline sseb(int mask)
  76. {
  77. assert(mask >= 0 && mask < 16);
  78. m128 = _mm_lookupmask_ps[mask];
  79. }
  80. ////////////////////////////////////////////////////////////////////////////////
  81. /// Constants
  82. ////////////////////////////////////////////////////////////////////////////////
  83. __forceinline sseb(FalseTy) : m128(_mm_setzero_ps())
  84. {
  85. }
  86. __forceinline sseb(TrueTy)
  87. : m128(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())))
  88. {
  89. }
  90. ////////////////////////////////////////////////////////////////////////////////
  91. /// Array Access
  92. ////////////////////////////////////////////////////////////////////////////////
  93. __forceinline bool operator[](const size_t i) const
  94. {
  95. assert(i < 4);
  96. return (_mm_movemask_ps(m128) >> i) & 1;
  97. }
  98. __forceinline int32_t &operator[](const size_t i)
  99. {
  100. assert(i < 4);
  101. return v[i];
  102. }
  103. };
  104. ////////////////////////////////////////////////////////////////////////////////
  105. /// Unary Operators
  106. ////////////////////////////////////////////////////////////////////////////////
  107. __forceinline const sseb operator!(const sseb &a)
  108. {
  109. return _mm_xor_ps(a, sseb(True));
  110. }
  111. ////////////////////////////////////////////////////////////////////////////////
  112. /// Binary Operators
  113. ////////////////////////////////////////////////////////////////////////////////
  114. __forceinline const sseb operator&(const sseb &a, const sseb &b)
  115. {
  116. return _mm_and_ps(a, b);
  117. }
  118. __forceinline const sseb operator|(const sseb &a, const sseb &b)
  119. {
  120. return _mm_or_ps(a, b);
  121. }
  122. __forceinline const sseb operator^(const sseb &a, const sseb &b)
  123. {
  124. return _mm_xor_ps(a, b);
  125. }
  126. ////////////////////////////////////////////////////////////////////////////////
  127. /// Assignment Operators
  128. ////////////////////////////////////////////////////////////////////////////////
  129. __forceinline const sseb operator&=(sseb &a, const sseb &b)
  130. {
  131. return a = a & b;
  132. }
  133. __forceinline const sseb operator|=(sseb &a, const sseb &b)
  134. {
  135. return a = a | b;
  136. }
  137. __forceinline const sseb operator^=(sseb &a, const sseb &b)
  138. {
  139. return a = a ^ b;
  140. }
  141. ////////////////////////////////////////////////////////////////////////////////
  142. /// Comparison Operators + Select
  143. ////////////////////////////////////////////////////////////////////////////////
  144. __forceinline const sseb operator!=(const sseb &a, const sseb &b)
  145. {
  146. return _mm_xor_ps(a, b);
  147. }
  148. __forceinline const sseb operator==(const sseb &a, const sseb &b)
  149. {
  150. return _mm_castsi128_ps(_mm_cmpeq_epi32(a, b));
  151. }
  152. __forceinline const sseb select(const sseb &m, const sseb &t, const sseb &f)
  153. {
  154. # if defined(__KERNEL_SSE41__)
  155. return _mm_blendv_ps(f, t, m);
  156. # else
  157. return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
  158. # endif
  159. }
  160. ////////////////////////////////////////////////////////////////////////////////
  161. /// Movement/Shifting/Shuffling Functions
  162. ////////////////////////////////////////////////////////////////////////////////
  163. __forceinline const sseb unpacklo(const sseb &a, const sseb &b)
  164. {
  165. return _mm_unpacklo_ps(a, b);
  166. }
  167. __forceinline const sseb unpackhi(const sseb &a, const sseb &b)
  168. {
  169. return _mm_unpackhi_ps(a, b);
  170. }
  171. template<size_t i0, size_t i1, size_t i2, size_t i3>
  172. __forceinline const sseb shuffle(const sseb &a)
  173. {
  174. return _mm_castsi128_ps(_mm_shuffle_epi32(a, _MM_SHUFFLE(i3, i2, i1, i0)));
  175. }
  176. template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a)
  177. {
  178. return _mm_movelh_ps(a, a);
  179. }
  180. template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a)
  181. {
  182. return _mm_movehl_ps(a, a);
  183. }
  184. template<size_t i0, size_t i1, size_t i2, size_t i3>
  185. __forceinline const sseb shuffle(const sseb &a, const sseb &b)
  186. {
  187. return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
  188. }
  189. template<> __forceinline const sseb shuffle<0, 1, 0, 1>(const sseb &a, const sseb &b)
  190. {
  191. return _mm_movelh_ps(a, b);
  192. }
  193. template<> __forceinline const sseb shuffle<2, 3, 2, 3>(const sseb &a, const sseb &b)
  194. {
  195. return _mm_movehl_ps(b, a);
  196. }
  197. # if defined(__KERNEL_SSE3__)
  198. template<> __forceinline const sseb shuffle<0, 0, 2, 2>(const sseb &a)
  199. {
  200. return _mm_moveldup_ps(a);
  201. }
  202. template<> __forceinline const sseb shuffle<1, 1, 3, 3>(const sseb &a)
  203. {
  204. return _mm_movehdup_ps(a);
  205. }
  206. # endif
  207. # if defined(__KERNEL_SSE41__)
  208. template<size_t dst, size_t src, size_t clr>
  209. __forceinline const sseb insert(const sseb &a, const sseb &b)
  210. {
  211. return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
  212. }
  213. template<size_t dst, size_t src> __forceinline const sseb insert(const sseb &a, const sseb &b)
  214. {
  215. return insert<dst, src, 0>(a, b);
  216. }
  217. template<size_t dst> __forceinline const sseb insert(const sseb &a, const bool b)
  218. {
  219. return insert<dst, 0>(a, sseb(b));
  220. }
  221. # endif
  222. ////////////////////////////////////////////////////////////////////////////////
  223. /// Reduction Operations
  224. ////////////////////////////////////////////////////////////////////////////////
  225. # if defined(__KERNEL_SSE41__)
  226. __forceinline size_t popcnt(const sseb &a)
  227. {
  228. return __popcnt(_mm_movemask_ps(a));
  229. }
  230. # else
  231. __forceinline size_t popcnt(const sseb &a)
  232. {
  233. return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]);
  234. }
  235. # endif
  236. __forceinline bool reduce_and(const sseb &a)
  237. {
  238. return _mm_movemask_ps(a) == 0xf;
  239. }
  240. __forceinline bool reduce_or(const sseb &a)
  241. {
  242. return _mm_movemask_ps(a) != 0x0;
  243. }
  244. __forceinline bool all(const sseb &b)
  245. {
  246. return _mm_movemask_ps(b) == 0xf;
  247. }
  248. __forceinline bool any(const sseb &b)
  249. {
  250. return _mm_movemask_ps(b) != 0x0;
  251. }
  252. __forceinline bool none(const sseb &b)
  253. {
  254. return _mm_movemask_ps(b) == 0x0;
  255. }
  256. __forceinline size_t movemask(const sseb &a)
  257. {
  258. return _mm_movemask_ps(a);
  259. }
  260. ////////////////////////////////////////////////////////////////////////////////
  261. /// Debug Functions
  262. ////////////////////////////////////////////////////////////////////////////////
  263. ccl_device_inline void print_sseb(const char *label, const sseb &a)
  264. {
  265. printf("%s: %d %d %d %d\n", label, a[0], a[1], a[2], a[3]);
  266. }
  267. #endif
  268. CCL_NAMESPACE_END
  269. #endif