/*
 * Copyright 2011-2013 Intel Corporation
 * Modifications Copyright 2014, Blender Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
- #ifndef __UTIL_SIMD_TYPES_H__
- #define __UTIL_SIMD_TYPES_H__
- #ifndef __KERNEL_GPU__
- # include <limits>
- # include "util/util_defines.h"
- /* SSE Intrinsics includes
- *
- * We assume __KERNEL_SSEX__ flags to have been defined at this point */
- /* SSE intrinsics headers */
- # ifndef FREE_WINDOWS64
- # ifdef _MSC_VER
- # include <intrin.h>
- # elif (defined(__x86_64__) || defined(__i386__))
- # include <x86intrin.h>
- # endif
- # else
- /* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
- * Since we can't avoid including <windows.h>, better only include that */
- # include "util/util_windows.h"
- # endif
- # if defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86)
- # define SIMD_SET_FLUSH_TO_ZERO \
- _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
- _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
- # else
- # define SIMD_SET_FLUSH_TO_ZERO
- # endif
- CCL_NAMESPACE_BEGIN
- # ifdef __KERNEL_SSE2__
- extern const __m128 _mm_lookupmask_ps[16];
- /* Special Types */
- static struct TrueTy {
- __forceinline operator bool() const
- {
- return true;
- }
- } True ccl_maybe_unused;
- static struct FalseTy {
- __forceinline operator bool() const
- {
- return false;
- }
- } False ccl_maybe_unused;
- static struct NegInfTy {
- __forceinline operator float() const
- {
- return -std::numeric_limits<float>::infinity();
- }
- __forceinline operator int() const
- {
- return std::numeric_limits<int>::min();
- }
- } neg_inf ccl_maybe_unused;
- static struct PosInfTy {
- __forceinline operator float() const
- {
- return std::numeric_limits<float>::infinity();
- }
- __forceinline operator int() const
- {
- return std::numeric_limits<int>::max();
- }
- } inf ccl_maybe_unused, pos_inf ccl_maybe_unused;
- /* Intrinsics Functions */
- # if defined(__BMI__) && defined(__GNUC__)
- # ifndef _tzcnt_u32
- # define _tzcnt_u32 __tzcnt_u32
- # endif
- # ifndef _tzcnt_u64
- # define _tzcnt_u64 __tzcnt_u64
- # endif
- # endif
- # if defined(__LZCNT__)
- # define _lzcnt_u32 __lzcnt32
- # define _lzcnt_u64 __lzcnt64
- # endif
- # if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
- __forceinline int __popcnt(int in)
- {
- return _mm_popcnt_u32(in);
- }
- # if !defined(_MSC_VER)
- __forceinline unsigned int __popcnt(unsigned int in)
- {
- return _mm_popcnt_u32(in);
- }
- # endif
- # if defined(__KERNEL_64_BIT__)
- __forceinline long long __popcnt(long long in)
- {
- return _mm_popcnt_u64(in);
- }
- __forceinline size_t __popcnt(size_t in)
- {
- return _mm_popcnt_u64(in);
- }
- # endif
- __forceinline int __bsf(int v)
- {
- # if defined(__KERNEL_AVX2__)
- return _tzcnt_u32(v);
- # else
- unsigned long r = 0;
- _BitScanForward(&r, v);
- return r;
- # endif
- }
- __forceinline unsigned int __bsf(unsigned int v)
- {
- # if defined(__KERNEL_AVX2__)
- return _tzcnt_u32(v);
- # else
- unsigned long r = 0;
- _BitScanForward(&r, v);
- return r;
- # endif
- }
- __forceinline int __bsr(int v)
- {
- unsigned long r = 0;
- _BitScanReverse(&r, v);
- return r;
- }
- __forceinline int __btc(int v, int i)
- {
- long r = v;
- _bittestandcomplement(&r, i);
- return r;
- }
- __forceinline int __bts(int v, int i)
- {
- long r = v;
- _bittestandset(&r, i);
- return r;
- }
- __forceinline int __btr(int v, int i)
- {
- long r = v;
- _bittestandreset(&r, i);
- return r;
- }
- __forceinline int bitscan(int v)
- {
- # if defined(__KERNEL_AVX2__)
- return _tzcnt_u32(v);
- # else
- return __bsf(v);
- # endif
- }
- __forceinline int clz(const int x)
- {
- # if defined(__KERNEL_AVX2__)
- return _lzcnt_u32(x);
- # else
- if (UNLIKELY(x == 0))
- return 32;
- return 31 - __bsr(x);
- # endif
- }
- __forceinline int __bscf(int &v)
- {
- int i = __bsf(v);
- v &= v - 1;
- return i;
- }
- __forceinline unsigned int __bscf(unsigned int &v)
- {
- unsigned int i = __bsf(v);
- v &= v - 1;
- return i;
- }
- # if defined(__KERNEL_64_BIT__)
- __forceinline size_t __bsf(size_t v)
- {
- # if defined(__KERNEL_AVX2__)
- return _tzcnt_u64(v);
- # else
- unsigned long r = 0;
- _BitScanForward64(&r, v);
- return r;
- # endif
- }
- __forceinline size_t __bsr(size_t v)
- {
- unsigned long r = 0;
- _BitScanReverse64(&r, v);
- return r;
- }
- __forceinline size_t __btc(size_t v, size_t i)
- {
- size_t r = v;
- _bittestandcomplement64((__int64 *)&r, i);
- return r;
- }
- __forceinline size_t __bts(size_t v, size_t i)
- {
- __int64 r = v;
- _bittestandset64(&r, i);
- return r;
- }
- __forceinline size_t __btr(size_t v, size_t i)
- {
- __int64 r = v;
- _bittestandreset64(&r, i);
- return r;
- }
- __forceinline size_t bitscan(size_t v)
- {
- # if defined(__KERNEL_AVX2__)
- # if defined(__KERNEL_64_BIT__)
- return _tzcnt_u64(v);
- # else
- return _tzcnt_u32(v);
- # endif
- # else
- return __bsf(v);
- # endif
- }
- __forceinline size_t __bscf(size_t &v)
- {
- size_t i = __bsf(v);
- v &= v - 1;
- return i;
- }
- # endif /* __KERNEL_64_BIT__ */
- # else /* _WIN32 */
- __forceinline unsigned int __popcnt(unsigned int in)
- {
- int r = 0;
- asm("popcnt %1,%0" : "=r"(r) : "r"(in));
- return r;
- }
- __forceinline int __bsf(int v)
- {
- int r = 0;
- asm("bsf %1,%0" : "=r"(r) : "r"(v));
- return r;
- }
- __forceinline int __bsr(int v)
- {
- int r = 0;
- asm("bsr %1,%0" : "=r"(r) : "r"(v));
- return r;
- }
- __forceinline int __btc(int v, int i)
- {
- int r = 0;
- asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
- }
- __forceinline int __bts(int v, int i)
- {
- int r = 0;
- asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
- }
- __forceinline int __btr(int v, int i)
- {
- int r = 0;
- asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
- }
- # if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
- !(defined(__ILP32__) && defined(__x86_64__))
- __forceinline size_t __bsf(size_t v)
- {
- size_t r = 0;
- asm("bsf %1,%0" : "=r"(r) : "r"(v));
- return r;
- }
- # endif
- __forceinline unsigned int __bsf(unsigned int v)
- {
- unsigned int r = 0;
- asm("bsf %1,%0" : "=r"(r) : "r"(v));
- return r;
- }
- __forceinline size_t __bsr(size_t v)
- {
- size_t r = 0;
- asm("bsr %1,%0" : "=r"(r) : "r"(v));
- return r;
- }
- __forceinline size_t __btc(size_t v, size_t i)
- {
- size_t r = 0;
- asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
- }
- __forceinline size_t __bts(size_t v, size_t i)
- {
- size_t r = 0;
- asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
- }
- __forceinline size_t __btr(size_t v, size_t i)
- {
- size_t r = 0;
- asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
- return r;
- }
- __forceinline int bitscan(int v)
- {
- # if defined(__KERNEL_AVX2__)
- return _tzcnt_u32(v);
- # else
- return __bsf(v);
- # endif
- }
- __forceinline unsigned int bitscan(unsigned int v)
- {
- # if defined(__KERNEL_AVX2__)
- return _tzcnt_u32(v);
- # else
- return __bsf(v);
- # endif
- }
- # if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
- !(defined(__ILP32__) && defined(__x86_64__))
- __forceinline size_t bitscan(size_t v)
- {
- # if defined(__KERNEL_AVX2__)
- # if defined(__KERNEL_64_BIT__)
- return _tzcnt_u64(v);
- # else
- return _tzcnt_u32(v);
- # endif
- # else
- return __bsf(v);
- # endif
- }
- # endif
- __forceinline int clz(const int x)
- {
- # if defined(__KERNEL_AVX2__)
- return _lzcnt_u32(x);
- # else
- if (UNLIKELY(x == 0))
- return 32;
- return 31 - __bsr(x);
- # endif
- }
- __forceinline int __bscf(int &v)
- {
- int i = bitscan(v);
- # if defined(__KERNEL_AVX2__)
- v &= v - 1;
- # else
- v = __btc(v, i);
- # endif
- return i;
- }
- __forceinline unsigned int __bscf(unsigned int &v)
- {
- unsigned int i = bitscan(v);
- v &= v - 1;
- return i;
- }
- # if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
- !(defined(__ILP32__) && defined(__x86_64__))
- __forceinline size_t __bscf(size_t &v)
- {
- size_t i = bitscan(v);
- # if defined(__KERNEL_AVX2__)
- v &= v - 1;
- # else
- v = __btc(v, i);
- # endif
- return i;
- }
- # endif
- # endif /* _WIN32 */
- /* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test
- * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
- * platforms when compiling code outside the kernel. */
- # if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
- /* Emulation of SSE4 functions with SSE2 */
- # define _MM_FROUND_TO_NEAREST_INT 0x00
- # define _MM_FROUND_TO_NEG_INF 0x01
- # define _MM_FROUND_TO_POS_INF 0x02
- # define _MM_FROUND_TO_ZERO 0x03
- # define _MM_FROUND_CUR_DIRECTION 0x04
- # undef _mm_blendv_ps
- # define _mm_blendv_ps _mm_blendv_ps_emu
- __forceinline __m128 _mm_blendv_ps_emu(__m128 value, __m128 input, __m128 mask)
- {
- __m128i isignmask = _mm_set1_epi32(0x80000000);
- __m128 signmask = _mm_castsi128_ps(isignmask);
- __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
- __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
- __m128 cmpmask = _mm_castsi128_ps(icmpmask);
- return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
- }
- # undef _mm_blend_ps
- # define _mm_blend_ps _mm_blend_ps_emu
- __forceinline __m128 _mm_blend_ps_emu(__m128 value, __m128 input, const int mask)
- {
- assert(mask < 0x10);
- return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
- }
- # undef _mm_blendv_epi8
- # define _mm_blendv_epi8 _mm_blendv_epi8_emu
- __forceinline __m128i _mm_blendv_epi8_emu(__m128i value, __m128i input, __m128i mask)
- {
- return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
- }
- # undef _mm_min_epi32
- # define _mm_min_epi32 _mm_min_epi32_emu
- __forceinline __m128i _mm_min_epi32_emu(__m128i value, __m128i input)
- {
- return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
- }
- # undef _mm_max_epi32
- # define _mm_max_epi32 _mm_max_epi32_emu
- __forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
- {
- return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
- }
- # undef _mm_extract_epi32
- # define _mm_extract_epi32 _mm_extract_epi32_emu
- __forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
- {
- switch (index) {
- case 0:
- return _mm_cvtsi128_si32(input);
- case 1:
- return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
- case 2:
- return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
- case 3:
- return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3)));
- default:
- assert(false);
- return 0;
- }
- }
- # undef _mm_insert_epi32
- # define _mm_insert_epi32 _mm_insert_epi32_emu
- __forceinline __m128i _mm_insert_epi32_emu(__m128i value, int input, const int index)
- {
- assert(index >= 0 && index < 4);
- ((int *)&value)[index] = input;
- return value;
- }
- # undef _mm_insert_ps
- # define _mm_insert_ps _mm_insert_ps_emu
- __forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input, const int index)
- {
- assert(index < 0x100);
- ((float *)&value)[(index >> 4) & 0x3] = ((float *)&input)[index >> 6];
- return _mm_andnot_ps(_mm_lookupmask_ps[index & 0xf], value);
- }
- # undef _mm_round_ps
- # define _mm_round_ps _mm_round_ps_emu
- __forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
- {
- switch (flags) {
- case _MM_FROUND_TO_NEAREST_INT:
- return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
- case _MM_FROUND_TO_NEG_INF:
- return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
- case _MM_FROUND_TO_POS_INF:
- return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(0.5f))));
- case _MM_FROUND_TO_ZERO:
- return _mm_cvtepi32_ps(_mm_cvttps_epi32(value));
- }
- return value;
- }
- # endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */
- # else /* __KERNEL_SSE2__ */
- /* This section is for utility functions which operates on non-register data
- * which might be used from a non-vectorized code.
- */
- ccl_device_inline int bitscan(int value)
- {
- assert(value != 0);
- int bit = 0;
- while ((value & (1 << bit)) == 0) {
- ++bit;
- }
- return bit;
- }
- ccl_device_inline int __bsr(int value)
- {
- assert(value != 0);
- int bit = 0;
- while (value >>= 1) {
- ++bit;
- }
- return bit;
- }
- # endif /* __KERNEL_SSE2__ */
- /* quiet unused define warnings */
- # if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
- defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
- /* do nothing */
- # endif
- CCL_NAMESPACE_END
- #endif /* __KERNEL_GPU__ */
- #endif /* __UTIL_SIMD_TYPES_H__ */