util_avxf.h 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. /*
  2. * Copyright 2016 Intel Corporation
  3. *
  4. * Licensed under the Apache License, Version 2.0(the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef __UTIL_AVXF_H__
  17. # define __UTIL_AVXF_H__
  18. CCL_NAMESPACE_BEGIN
  19. struct avxb;
/* avxf: thin wrapper around one AVX 256-bit register holding 8 floats. */
struct avxf {
  typedef avxf Float; /* Element/vector typedef used by templated SIMD code. */
  enum { size = 8 }; /* Number of SIMD elements. */
  /* The register, also viewable as 8 floats or as 8 ints (bit reinterpretation,
   * used e.g. by the int constructors below and by operator[] / sign tricks). */
  union {
    __m256 m256;
    float f[8];
    int i[8];
  };
  /* Default constructor deliberately leaves the register uninitialized
   * (avoids a redundant store in hot SIMD code). */
  __forceinline avxf()
  {
  }
  /* Copy constructor: plain register copy. */
  __forceinline avxf(const avxf &other)
  {
    m256 = other.m256;
  }
  /* Copy assignment: plain register copy. */
  __forceinline avxf &operator=(const avxf &other)
  {
    m256 = other.m256;
    return *this;
  }
  /* Implicit conversion from the raw AVX float register. */
  __forceinline avxf(const __m256 a) : m256(a)
  {
  }
  /* Implicit conversion from the raw AVX integer register (bit cast, no
   * int->float value conversion). */
  __forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps(a))
  {
  }
  /* Implicit conversions back to the raw register, so intrinsics accept avxf
   * directly. */
  __forceinline operator const __m256 &() const
  {
    return m256;
  }
  __forceinline operator __m256 &()
  {
    return m256;
  }
  /* Broadcast a single scalar to all 8 lanes. */
  __forceinline avxf(float a) : m256(_mm256_set1_ps(a))
  {
  }
  /* Broadcast `high32x4` into lanes 4-7 and `low32x4` into lanes 0-3. */
  __forceinline avxf(float high32x4, float low32x4)
      : m256(_mm256_set_ps(
            high32x4, high32x4, high32x4, high32x4, low32x4, low32x4, low32x4, low32x4))
  {
  }
  /* Replicate four floats into both 128-bit halves (a0 lands in lanes 0 and 4). */
  __forceinline avxf(float a3, float a2, float a1, float a0)
      : m256(_mm256_set_ps(a3, a2, a1, a0, a3, a2, a1, a0))
  {
  }
  /* All 8 lanes explicitly; a0 is lane 0, a7 is lane 7 (matching _mm256_set_ps). */
  __forceinline avxf(
      float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0)
      : m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0))
  {
  }
  /* Replicate a float3 (x,y,z,w) into both halves.
   * NOTE(review): assumes the 4-float SSE float3 layout with a `w` member. */
  __forceinline avxf(float3 a) : m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x))
  {
  }
  /* Integer constructors store the int *bit patterns* (no value conversion);
   * the four-int form replicates into both halves like the float one. */
  __forceinline avxf(int a3, int a2, int a1, int a0)
  {
    const __m256i foo = _mm256_set_epi32(a3, a2, a1, a0, a3, a2, a1, a0);
    m256 = _mm256_castsi256_ps(foo);
  }
  __forceinline avxf(int a7, int a6, int a5, int a4, int a3, int a2, int a1, int a0)
  {
    const __m256i foo = _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
    m256 = _mm256_castsi256_ps(foo);
  }
  /* Concatenate two SSE registers: `a` becomes lanes 0-3, `b` lanes 4-7. */
  __forceinline avxf(__m128 a, __m128 b)
  {
    const __m256 foo = _mm256_castps128_ps256(a);
    m256 = _mm256_insertf128_ps(foo, b, 1);
  }
  /* Bounds-asserted element access through the union's float view. */
  __forceinline const float &operator[](const size_t i) const
  {
    assert(i < 8);
    return f[i];
  }
  __forceinline float &operator[](const size_t i)
  {
    assert(i < 8);
    return f[i];
  }
};
  100. __forceinline avxf cross(const avxf &a, const avxf &b)
  101. {
  102. avxf r(0.0,
  103. a[4] * b[5] - a[5] * b[4],
  104. a[6] * b[4] - a[4] * b[6],
  105. a[5] * b[6] - a[6] * b[5],
  106. 0.0,
  107. a[0] * b[1] - a[1] * b[0],
  108. a[2] * b[0] - a[0] * b[2],
  109. a[1] * b[2] - a[2] * b[1]);
  110. return r;
  111. }
  112. __forceinline void dot3(const avxf &a, const avxf &b, float &den, float &den2)
  113. {
  114. const avxf t = _mm256_mul_ps(a.m256, b.m256);
  115. den = ((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2];
  116. den2 = ((float *)&t)[4] + ((float *)&t)[5] + ((float *)&t)[6];
  117. }
  118. ////////////////////////////////////////////////////////////////////////////////
  119. /// Unary Operators
  120. ////////////////////////////////////////////////////////////////////////////////
  121. __forceinline const avxf mm256_sqrt(const avxf &a)
  122. {
  123. return _mm256_sqrt_ps(a.m256);
  124. }
  125. ////////////////////////////////////////////////////////////////////////////////
  126. /// Binary Operators
  127. ////////////////////////////////////////////////////////////////////////////////
  128. __forceinline const avxf operator+(const avxf &a, const avxf &b)
  129. {
  130. return _mm256_add_ps(a.m256, b.m256);
  131. }
  132. __forceinline const avxf operator+(const avxf &a, const float &b)
  133. {
  134. return a + avxf(b);
  135. }
  136. __forceinline const avxf operator+(const float &a, const avxf &b)
  137. {
  138. return avxf(a) + b;
  139. }
  140. __forceinline const avxf operator-(const avxf &a, const avxf &b)
  141. {
  142. return _mm256_sub_ps(a.m256, b.m256);
  143. }
  144. __forceinline const avxf operator-(const avxf &a, const float &b)
  145. {
  146. return a - avxf(b);
  147. }
  148. __forceinline const avxf operator-(const float &a, const avxf &b)
  149. {
  150. return avxf(a) - b;
  151. }
  152. __forceinline const avxf operator*(const avxf &a, const avxf &b)
  153. {
  154. return _mm256_mul_ps(a.m256, b.m256);
  155. }
  156. __forceinline const avxf operator*(const avxf &a, const float &b)
  157. {
  158. return a * avxf(b);
  159. }
  160. __forceinline const avxf operator*(const float &a, const avxf &b)
  161. {
  162. return avxf(a) * b;
  163. }
  164. __forceinline const avxf operator/(const avxf &a, const avxf &b)
  165. {
  166. return _mm256_div_ps(a.m256, b.m256);
  167. }
  168. __forceinline const avxf operator/(const avxf &a, const float &b)
  169. {
  170. return a / avxf(b);
  171. }
  172. __forceinline const avxf operator/(const float &a, const avxf &b)
  173. {
  174. return avxf(a) / b;
  175. }
  176. __forceinline const avxf operator|(const avxf &a, const avxf &b)
  177. {
  178. return _mm256_or_ps(a.m256, b.m256);
  179. }
  180. __forceinline const avxf operator^(const avxf &a, const avxf &b)
  181. {
  182. return _mm256_xor_ps(a.m256, b.m256);
  183. }
  184. __forceinline const avxf operator&(const avxf &a, const avxf &b)
  185. {
  186. return _mm256_and_ps(a.m256, b.m256);
  187. }
  188. __forceinline const avxf max(const avxf &a, const avxf &b)
  189. {
  190. return _mm256_max_ps(a.m256, b.m256);
  191. }
  192. __forceinline const avxf min(const avxf &a, const avxf &b)
  193. {
  194. return _mm256_min_ps(a.m256, b.m256);
  195. }
  196. ////////////////////////////////////////////////////////////////////////////////
  197. /// Movement/Shifting/Shuffling Functions
  198. ////////////////////////////////////////////////////////////////////////////////
/* Runtime-variable shuffle.
 * NOTE(review): _mm256_permutevar_ps permutes within each 128-bit half only
 * (it uses 2 index bits per element); elements cannot cross between the low
 * and high halves. Confirm callers only need within-half shuffles. */
__forceinline const avxf shuffle(const avxf &a, const __m256i &shuf)
{
  return _mm256_permutevar_ps(a, shuf);
}
/* Compile-time 8-index shuffle. Same within-128-bit-half limitation as above:
 * only the low 2 bits of each index are honored by the intrinsic. */
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
__forceinline const avxf shuffle(const avxf &a)
{
  return _mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
}
/* Two-source 4-index shuffle, applied to both 128-bit halves: in each half,
 * picks elements i0,i1 from `a` and i2,i3 from `b` (per _mm256_shuffle_ps). */
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const avxf shuffle(const avxf &a, const avxf &b)
{
  return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
}
/* Single-source 4-index shuffle: both halves of `a` rearranged the same way. */
template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const avxf shuffle(const avxf &a)
{
  return shuffle<i0, i1, i2, i3>(a, a);
}
/* Broadcast element i0 of each source within each half. */
template<size_t i0> __forceinline const avxf shuffle(const avxf &a, const avxf &b)
{
  return shuffle<i0, i0, i0, i0>(a, b);
}
/* Broadcast element i0 of `a` within each half. */
template<size_t i0> __forceinline const avxf shuffle(const avxf &a)
{
  return shuffle<i0>(a, a);
}
/* Full 8-wide permutation (indices may cross the 128-bit halves, unlike
 * shuffle): result lane k = a[ik]. Uses the AVX2 cross-lane intrinsic when
 * available, otherwise spills to memory and gathers with scalar loads. */
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
__forceinline const avxf permute(const avxf &a)
{
#  ifdef __KERNEL_AVX2__
  return _mm256_permutevar8x32_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
#  else
  float temp[8];
  _mm256_storeu_ps((float *)&temp, a);
  /* avxf ctor takes lane 7 first, so temp[i7] lands in lane 7, temp[i0] in lane 0. */
  return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
#  endif
}
  237. template<int S0, int S1, int S2, int S3, int S4, int S5, int S6, int S7>
  238. ccl_device_inline const avxf set_sign_bit(const avxf &a)
  239. {
  240. return a ^ avxf(S7 << 31, S6 << 31, S5 << 31, S4 << 31, S3 << 31, S2 << 31, S1 << 31, S0 << 31);
  241. }
/* Compile-time per-lane select between `a` and `b` via _mm256_blend_ps.
 * The immediate is built as S7<<0 | ... | S0<<7, so S7 controls lane 0 and S0
 * controls lane 7 (arguments given high-lane first, matching the avxf(a7..a0)
 * constructor convention). A set bit selects the lane from `b`. */
template<size_t S0, size_t S1, size_t S2, size_t S3, size_t S4, size_t S5, size_t S6, size_t S7>
ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
{
  return _mm256_blend_ps(
      a, b, S7 << 0 | S6 << 1 | S5 << 2 | S4 << 3 | S3 << 4 | S2 << 5 | S1 << 6 | S0 << 7);
}
/* 4-argument form: the same selection pattern applied to both 128-bit halves. */
template<size_t S0, size_t S1, size_t S2, size_t S3>
ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
{
  return blend<S0, S1, S2, S3, S0, S1, S2, S3>(a, b);
}
  253. //#if defined(__KERNEL_SSE41__)
  254. __forceinline avxf maxi(const avxf &a, const avxf &b)
  255. {
  256. const avxf ci = _mm256_max_ps(a, b);
  257. return ci;
  258. }
  259. __forceinline avxf mini(const avxf &a, const avxf &b)
  260. {
  261. const avxf ci = _mm256_min_ps(a, b);
  262. return ci;
  263. }
  264. //#endif
  265. ////////////////////////////////////////////////////////////////////////////////
  266. /// Ternary Operators
  267. ////////////////////////////////////////////////////////////////////////////////
/* a*b + c. Fused (single rounding) on AVX2; the fallback rounds twice, so
 * results can differ in the last ulp between the two paths. */
__forceinline const avxf madd(const avxf &a, const avxf &b, const avxf &c)
{
#  ifdef __KERNEL_AVX2__
  return _mm256_fmadd_ps(a, b, c);
#  else
  return c + (a * b);
#  endif
}
/* -(a*b) + c, i.e. c - a*b. Same fused/unfused rounding caveat as madd. */
__forceinline const avxf nmadd(const avxf &a, const avxf &b, const avxf &c)
{
#  ifdef __KERNEL_AVX2__
  return _mm256_fnmadd_ps(a, b, c);
#  else
  return c - (a * b);
#  endif
}
/* a*b - c. Same fused/unfused rounding caveat as madd. */
__forceinline const avxf msub(const avxf &a, const avxf &b, const avxf &c)
{
#  ifdef __KERNEL_AVX2__
  return _mm256_fmsub_ps(a, b, c);
#  else
  return (a * b) - c;
#  endif
}
  292. ////////////////////////////////////////////////////////////////////////////////
  293. /// Comparison Operators
  294. ////////////////////////////////////////////////////////////////////////////////
/* Per-lane ordered (non-signaling on NaN per _CMP_LE_OS) less-or-equal
 * comparison. Each result lane is all-ones or all-zeros, wrapped in the
 * boolean mask type avxb (declared above, defined elsewhere). */
__forceinline const avxb operator<=(const avxf &a, const avxf &b)
{
  return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
}
  299. #endif
  300. #ifndef _mm256_set_m128
  301. # define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \
  302. _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1)
  303. #endif
  304. #define _mm256_loadu2_m128(/* float const* */ hiaddr, /* float const* */ loaddr) \
  305. _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))
  306. CCL_NAMESPACE_END