util_avxb.h 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
  1. /*
  2. * Copyright 2011-2013 Intel Corporation
  3. * Modifications Copyright 2014, Blender Foundation.
  4. *
  5. * Licensed under the Apache License, Version 2.0(the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. #ifndef __UTIL_AVXB_H__
  18. # define __UTIL_AVXB_H__
  19. CCL_NAMESPACE_BEGIN
  20. struct avxf;
/*! 8-wide AVX bool type. */
  22. struct avxb {
  23. typedef avxb Mask; // mask type
  24. typedef avxf Float; // float type
  25. enum { size = 8 }; // number of SIMD elements
  26. union {
  27. __m256 m256;
  28. int32_t v[8];
  29. }; // data
  30. ////////////////////////////////////////////////////////////////////////////////
  31. /// Constructors, Assignment & Cast Operators
  32. ////////////////////////////////////////////////////////////////////////////////
  33. __forceinline avxb()
  34. {
  35. }
  36. __forceinline avxb(const avxb &other)
  37. {
  38. m256 = other.m256;
  39. }
  40. __forceinline avxb &operator=(const avxb &other)
  41. {
  42. m256 = other.m256;
  43. return *this;
  44. }
  45. __forceinline avxb(const __m256 input) : m256(input)
  46. {
  47. }
  48. __forceinline operator const __m256 &(void)const
  49. {
  50. return m256;
  51. }
  52. __forceinline operator const __m256i(void) const
  53. {
  54. return _mm256_castps_si256(m256);
  55. }
  56. __forceinline operator const __m256d(void) const
  57. {
  58. return _mm256_castps_pd(m256);
  59. }
  60. ////////////////////////////////////////////////////////////////////////////////
  61. /// Constants
  62. ////////////////////////////////////////////////////////////////////////////////
  63. __forceinline avxb(FalseTy) : m256(_mm256_setzero_ps())
  64. {
  65. }
  66. __forceinline avxb(TrueTy) : m256(_mm256_castsi256_ps(_mm256_set1_epi32(-1)))
  67. {
  68. }
  69. ////////////////////////////////////////////////////////////////////////////////
  70. /// Array Access
  71. ////////////////////////////////////////////////////////////////////////////////
  72. __forceinline bool operator[](const size_t i) const
  73. {
  74. assert(i < 8);
  75. return (_mm256_movemask_ps(m256) >> i) & 1;
  76. }
  77. __forceinline int32_t &operator[](const size_t i)
  78. {
  79. assert(i < 8);
  80. return v[i];
  81. }
  82. };
  83. ////////////////////////////////////////////////////////////////////////////////
  84. /// Unary Operators
  85. ////////////////////////////////////////////////////////////////////////////////
  86. __forceinline const avxb operator!(const avxb &a)
  87. {
  88. return _mm256_xor_ps(a, avxb(True));
  89. }
  90. ////////////////////////////////////////////////////////////////////////////////
  91. /// Binary Operators
  92. ////////////////////////////////////////////////////////////////////////////////
  93. __forceinline const avxb operator&(const avxb &a, const avxb &b)
  94. {
  95. return _mm256_and_ps(a, b);
  96. }
  97. __forceinline const avxb operator|(const avxb &a, const avxb &b)
  98. {
  99. return _mm256_or_ps(a, b);
  100. }
  101. __forceinline const avxb operator^(const avxb &a, const avxb &b)
  102. {
  103. return _mm256_xor_ps(a, b);
  104. }
  105. ////////////////////////////////////////////////////////////////////////////////
  106. /// Assignment Operators
  107. ////////////////////////////////////////////////////////////////////////////////
  108. __forceinline const avxb operator&=(avxb &a, const avxb &b)
  109. {
  110. return a = a & b;
  111. }
  112. __forceinline const avxb operator|=(avxb &a, const avxb &b)
  113. {
  114. return a = a | b;
  115. }
  116. __forceinline const avxb operator^=(avxb &a, const avxb &b)
  117. {
  118. return a = a ^ b;
  119. }
  120. ////////////////////////////////////////////////////////////////////////////////
  121. /// Comparison Operators + Select
  122. ////////////////////////////////////////////////////////////////////////////////
  123. __forceinline const avxb operator!=(const avxb &a, const avxb &b)
  124. {
  125. return _mm256_xor_ps(a, b);
  126. }
  127. __forceinline const avxb operator==(const avxb &a, const avxb &b)
  128. {
  129. # ifdef __KERNEL_AVX2__
  130. return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b));
  131. # else
  132. __m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0));
  133. __m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
  134. __m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0));
  135. __m128i b_hi = _mm_castps_si128(_mm256_extractf128_ps(b, 1));
  136. __m128i c_lo = _mm_cmpeq_epi32(a_lo, b_lo);
  137. __m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi);
  138. __m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1);
  139. return _mm256_castsi256_ps(result);
  140. # endif
  141. }
  142. __forceinline const avxb select(const avxb &m, const avxb &t, const avxb &f)
  143. {
  144. # if defined(__KERNEL_SSE41__)
  145. return _mm256_blendv_ps(f, t, m);
  146. # else
  147. return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
  148. # endif
  149. }
  150. ////////////////////////////////////////////////////////////////////////////////
  151. /// Movement/Shifting/Shuffling Functions
  152. ////////////////////////////////////////////////////////////////////////////////
  153. __forceinline const avxb unpacklo(const avxb &a, const avxb &b)
  154. {
  155. return _mm256_unpacklo_ps(a, b);
  156. }
  157. __forceinline const avxb unpackhi(const avxb &a, const avxb &b)
  158. {
  159. return _mm256_unpackhi_ps(a, b);
  160. }
  161. ////////////////////////////////////////////////////////////////////////////////
  162. /// Reduction Operations
  163. ////////////////////////////////////////////////////////////////////////////////
  164. # if defined(__KERNEL_SSE41__)
  165. __forceinline size_t popcnt(const avxb &a)
  166. {
  167. return __popcnt(_mm256_movemask_ps(a));
  168. }
  169. # else
  170. __forceinline size_t popcnt(const avxb &a)
  171. {
  172. return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) +
  173. bool(a[7]);
  174. }
  175. # endif
  176. __forceinline bool reduce_and(const avxb &a)
  177. {
  178. return _mm256_movemask_ps(a) == 0xf;
  179. }
  180. __forceinline bool reduce_or(const avxb &a)
  181. {
  182. return _mm256_movemask_ps(a) != 0x0;
  183. }
  184. __forceinline bool all(const avxb &b)
  185. {
  186. return _mm256_movemask_ps(b) == 0xf;
  187. }
  188. __forceinline bool any(const avxb &b)
  189. {
  190. return _mm256_movemask_ps(b) != 0x0;
  191. }
  192. __forceinline bool none(const avxb &b)
  193. {
  194. return _mm256_movemask_ps(b) == 0x0;
  195. }
  196. __forceinline size_t movemask(const avxb &a)
  197. {
  198. return _mm256_movemask_ps(a);
  199. }
  200. ////////////////////////////////////////////////////////////////////////////////
  201. /// Debug Functions
  202. ////////////////////////////////////////////////////////////////////////////////
  203. ccl_device_inline void print_avxb(const char *label, const avxb &a)
  204. {
  205. printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
  206. }
  207. #endif
  208. CCL_NAMESPACE_END
  209. //#endif