123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160 |
- /* ==========================================================================
- * Copyright (c) 2022 SuperTuxKart-Team
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to permit
- * persons to whom the Software is furnished to do so, subject to the
- * following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
- * NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- * ==========================================================================
- */
- #ifndef HEADER_SIMD_WRAPPER_HPP
- #define HEADER_SIMD_WRAPPER_HPP
// Standard headers for the aligned allocation helpers in this file:
// malloc/free/size_t (<stdlib.h>) and memcpy (<string.h>).
#include <stdlib.h>
#include <string.h>
- #include <simde/simde-arch.h>
// Instruction-set selection.
// x86 / x86-64 branch: each block below includes one native intrinsics header
// and defines the matching CPU_*_SUPPORT macro to (1) when the compiler
// predefines the corresponding feature macro, or when the corresponding
// CPU_ENABLE_* macro evaluates to non-zero. The CPU_ENABLE_* macros are not
// defined in this file.
- #if defined(SIMDE_ARCH_AMD64) || defined(SIMDE_ARCH_X86)
- // Native SSE
- #if __MMX__ || CPU_ENABLE_MMX
- #include <mmintrin.h>
- #define CPU_MMX_SUPPORT (1)
- #endif
- #if __SSE__ || defined(_M_X64) || ( defined(_M_IX86_FP) && ( _M_IX86_FP >= 1 ) ) || CPU_ENABLE_SSE
- #include <xmmintrin.h>
- #define CPU_SSE_SUPPORT (1)
- #endif
- #if __SSE2__ || defined(_M_X64) || ( defined(_M_IX86_FP) && ( _M_IX86_FP >= 2 ) ) || CPU_ENABLE_SSE2
- #include <emmintrin.h>
- #define CPU_SSE2_SUPPORT (1)
- #endif
// SSE3, SSSE3 and SSE4.1 are additionally enabled whenever __AVX__ is set;
// SSE4.2 below is not.
- #if __SSE3__ || __AVX__ || CPU_ENABLE_SSE3
- #include <pmmintrin.h>
- #define CPU_SSE3_SUPPORT (1)
- #endif
- #if __SSSE3__ || __AVX__ || CPU_ENABLE_SSSE3
- #include <tmmintrin.h>
- #define CPU_SSSE3_SUPPORT (1)
- #endif
- #if __SSE4_1__ || __AVX__ || CPU_ENABLE_SSE4_1
- #include <smmintrin.h>
- #define CPU_SSE4_1_SUPPORT (1)
- #endif
- #if __SSE4_2__ || CPU_ENABLE_SSE4_2
- #include <nmmintrin.h>
- #define CPU_SSE4_2_SUPPORT (1)
- #endif
// ARM NEON branch: every CPU_*_SUPPORT macro up to SSE4.2 is defined
// unconditionally and the SIMDe SSE4.2 header is included instead of the
// native x86 headers.
- #elif defined(SIMDE_ARCH_ARM_NEON)
- // We only enable compile time SSE* to Neon for now because it's easy to test
- // Enable up to SSE4.2 because after that (starting from AVX) it has few
- // native conversion, which will use the slower C99 fallback
- #define CPU_MMX_SUPPORT (1)
- #define CPU_SSE_SUPPORT (1)
- #define CPU_SSE2_SUPPORT (1)
- #define CPU_SSE3_SUPPORT (1)
- #define CPU_SSSE3_SUPPORT (1)
- #define CPU_SSE4_1_SUPPORT (1)
- #define CPU_SSE4_2_SUPPORT (1)
- #if defined(_MSC_VER) && defined(__cplusplus)
- // Fix math related functions missing in msvc
- #include <cmath>
- #endif
// Must be defined before the SIMDe include to have any effect on it.
// NOTE(review): per SIMDe's documentation this exposes the native _mm_* names
// as aliases of the simde_* implementations - confirm against the bundled
// SIMDe version.
- #define SIMDE_ENABLE_NATIVE_ALIASES
- #include "simde/x86/sse4.2.h"
- #endif
// Fallback definitions: each _MM_* name below is defined only if no earlier
// header already provided it as a macro. The SIMDE_MM_* replacements are not
// defined in this file; presumably they come from the SIMDe header included
// in the NEON branch above, so they only resolve when that header was pulled
// in - verify before using these names on other configurations.
- #ifndef _MM_FROUND_TO_NEG_INF
- #define _MM_FROUND_TO_NEG_INF SIMDE_MM_FROUND_TO_NEG_INF
- #endif
- #ifndef _MM_FROUND_NO_EXC
- #define _MM_FROUND_NO_EXC SIMDE_MM_FROUND_NO_EXC
- #endif
// Self-referential definition: a macro that expands to its own name is not
// expanded again, so uses of _MM_SET_ROUNDING_MODE still reach whatever
// function of that name is declared, while the name becomes visible to
// #ifdef / #ifndef checks.
// NOTE(review): presumably this targets toolchains whose <xmmintrin.h>
// provides _MM_SET_ROUNDING_MODE as a function rather than a macro - confirm
// before replacing this with a SIMDE_* alias.
- #ifndef _MM_SET_ROUNDING_MODE
- #define _MM_SET_ROUNDING_MODE _MM_SET_ROUNDING_MODE
- #endif
- #ifndef _MM_ROUND_NEAREST
- #define _MM_ROUND_NEAREST SIMDE_MM_ROUND_NEAREST
- #endif
- #ifndef _MM_ROUND_UP
- #define _MM_ROUND_UP SIMDE_MM_ROUND_UP
- #endif
- #ifndef _MM_ROUND_DOWN
- #define _MM_ROUND_DOWN SIMDE_MM_ROUND_DOWN
- #endif
- // Utilities for aligned allocation
/**
 * Allocates `bytes` bytes of storage whose address is a multiple of
 * `alignment`.
 *
 * Layout of the underlying malloc() block:
 *
 *   [ padding | book-keeping word | user data (bytes) | slack ]
 *   ^ malloc() result             ^ returned pointer
 *
 * The book-keeping word (sizeof(size_t) bytes directly in front of the
 * returned pointer) holds the address returned by malloc(), which is what
 * simd_aligned_free() reads back to release the block.
 *
 * @param alignment Required alignment in bytes. Any non-zero value is
 *                  accepted; it does not have to be a power of two.
 * @param bytes     Number of usable bytes requested.
 * @return The aligned pointer, or NULL when alignment is 0, when the total
 *         allocation size would overflow size_t, or when malloc() fails.
 *         Release the result with simd_aligned_free(), never with free().
 */
static inline void* simd_aligned_alloc(size_t alignment, size_t bytes)
{
    const size_t size_max = (size_t)-1;

    // A zero alignment would make the modulo below divide by zero.
    if (alignment == 0)
    {
        return NULL;
    }

    // Overhead is one book-keeping word plus padding. The pointer is moved
    // forward by at most `alignment` bytes, so a single `alignment` of
    // padding is already sufficient; the second one is kept as trailing
    // slack so the allocation size stays what this function always used.
    // Both additions are checked so the size can never wrap around.
    if (alignment > (size_max - sizeof(size_t)) / 2)
    {
        return NULL;
    }
    const size_t overhead = (2 * alignment) + sizeof(size_t);
    if (bytes > size_max - overhead)
    {
        return NULL;
    }

    char* const block_start = (char*)malloc(bytes + overhead);
    if (!block_start)
    {
        return NULL;
    }

    // Reserve room for the book-keeping word, then step forward to the next
    // multiple of `alignment`. An already aligned address is still advanced
    // by one full `alignment`, exactly as before.
    char* aligned = block_start + sizeof(size_t);
    aligned += alignment - (((size_t)aligned) % alignment);

    // The slot in front of `aligned` is only size_t-aligned when `alignment`
    // is a multiple of the alignment of size_t, so it is written with
    // memcpy rather than through a size_t pointer.
    const size_t start_address = (size_t)block_start;
    memcpy(aligned - sizeof(size_t), &start_address, sizeof(start_address));

    return aligned;
}
/**
 * Releases memory obtained from simd_aligned_alloc().
 *
 * The sizeof(size_t) bytes directly in front of `raw_data` are taken to be
 * the book-keeping word written by simd_aligned_alloc(): the address that
 * malloc() originally returned. That address is what gets passed to free().
 *
 * @param raw_data Pointer previously returned by simd_aligned_alloc(), or
 *                 NULL (in which case nothing happens). Passing any other
 *                 pointer is invalid because no book-keeping word precedes
 *                 it.
 */
static inline void simd_aligned_free(void* raw_data)
{
    if (!raw_data)
    {
        return;
    }

    // The book-keeping slot is not guaranteed to be size_t-aligned (it sits
    // right below a pointer aligned to a caller-chosen value), so it is read
    // with memcpy rather than through a size_t pointer.
    size_t start_address;
    memcpy(&start_address, (const char*)raw_data - sizeof(size_t),
           sizeof(start_address));

    free((void*)start_address);
}
- #endif
|