simd_wrapper.h 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. /* ==========================================================================
  2. * Copyright (c) 2022 SuperTuxKart-Team
  3. *
  4. * Permission is hereby granted, free of charge, to any person obtaining a
  5. * copy of this software and associated documentation files (the
  6. * "Software"), to deal in the Software without restriction, including
  7. * without limitation the rights to use, copy, modify, merge, publish,
  8. * distribute, sublicense, and/or sell copies of the Software, and to permit
  9. * persons to whom the Software is furnished to do so, subject to the
  10. * following conditions:
  11. *
  12. * The above copyright notice and this permission notice shall be included
  13. * in all copies or substantial portions of the Software.
  14. *
  15. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  16. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
  18. * NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
  19. * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20. * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21. * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22. * ==========================================================================
  23. */
  24. #ifndef HEADER_SIMD_WRAPPER_HPP
  25. #define HEADER_SIMD_WRAPPER_HPP
  #include <stddef.h>
  #include <stdint.h>
  #include <stdlib.h>
  #include <string.h>

  #include <simde/simde-arch.h>
  27. #if defined(SIMDE_ARCH_AMD64) || defined(SIMDE_ARCH_X86)
  28. // Native SSE
  29. #if __MMX__ || CPU_ENABLE_MMX
  30. #include <mmintrin.h>
  31. #define CPU_MMX_SUPPORT (1)
  32. #endif
  33. #if __SSE__ || defined(_M_X64) || ( defined(_M_IX86_FP) && ( _M_IX86_FP >= 1 ) ) || CPU_ENABLE_SSE
  34. #include <xmmintrin.h>
  35. #define CPU_SSE_SUPPORT (1)
  36. #endif
  37. #if __SSE2__ || defined(_M_X64) || ( defined(_M_IX86_FP) && ( _M_IX86_FP >= 2 ) ) || CPU_ENABLE_SSE2
  38. #include <emmintrin.h>
  39. #define CPU_SSE2_SUPPORT (1)
  40. #endif
  41. #if __SSE3__ || __AVX__ || CPU_ENABLE_SSE3
  42. #include <pmmintrin.h>
  43. #define CPU_SSE3_SUPPORT (1)
  44. #endif
  45. #if __SSSE3__ || __AVX__ || CPU_ENABLE_SSSE3
  46. #include <tmmintrin.h>
  47. #define CPU_SSSE3_SUPPORT (1)
  48. #endif
  49. #if __SSE4_1__ || __AVX__ || CPU_ENABLE_SSE4_1
  50. #include <smmintrin.h>
  51. #define CPU_SSE4_1_SUPPORT (1)
  52. #endif
  53. #if __SSE4_2__ || CPU_ENABLE_SSE4_2
  54. #include <nmmintrin.h>
  55. #define CPU_SSE4_2_SUPPORT (1)
  56. #endif
  57. #elif defined(SIMDE_ARCH_ARM_NEON)
  // For now SSE* intrinsics are only translated to NEON at compile time,
  // because that path is easy to test. Support stops at SSE4.2: from AVX
  // onwards few intrinsics have a native NEON conversion, so they would
  // use the slower C99 fallback.
  61. #define CPU_MMX_SUPPORT (1)
  62. #define CPU_SSE_SUPPORT (1)
  63. #define CPU_SSE2_SUPPORT (1)
  64. #define CPU_SSE3_SUPPORT (1)
  65. #define CPU_SSSE3_SUPPORT (1)
  66. #define CPU_SSE4_1_SUPPORT (1)
  67. #define CPU_SSE4_2_SUPPORT (1)
  68. #if defined(_MSC_VER) && defined(__cplusplus)
  69. // Fix math related functions missing in msvc
  70. #include <cmath>
  71. #endif
  72. #define SIMDE_ENABLE_NATIVE_ALIASES
  73. #include "simde/x86/sse4.2.h"
  74. #endif
  75. #ifndef _MM_FROUND_TO_NEG_INF
  76. #define _MM_FROUND_TO_NEG_INF SIMDE_MM_FROUND_TO_NEG_INF
  77. #endif
  78. #ifndef _MM_FROUND_NO_EXC
  79. #define _MM_FROUND_NO_EXC SIMDE_MM_FROUND_NO_EXC
  80. #endif
  // Fallback for when no included header provides _MM_SET_ROUNDING_MODE:
  // forward to the SIMDe implementation, in the same way as the other _MM_*
  // fallbacks in this file. Defining the macro as itself would leave every
  // use pointing at an undeclared identifier.
  #ifndef _MM_SET_ROUNDING_MODE
  #define _MM_SET_ROUNDING_MODE SIMDE_MM_SET_ROUNDING_MODE
  #endif
  84. #ifndef _MM_ROUND_NEAREST
  85. #define _MM_ROUND_NEAREST SIMDE_MM_ROUND_NEAREST
  86. #endif
  87. #ifndef _MM_ROUND_UP
  88. #define _MM_ROUND_UP SIMDE_MM_ROUND_UP
  89. #endif
  90. #ifndef _MM_ROUND_DOWN
  91. #define _MM_ROUND_DOWN SIMDE_MM_ROUND_DOWN
  92. #endif
  93. // Utilities for aligned allocation
  // Allocates `bytes` usable bytes whose address is a multiple of `alignment`.
  //
  // Layout of the underlying malloc block:
  //     [ padding | size_t book-keeping | user data ]
  // The book-keeping word sits directly before the returned pointer and holds
  // the address originally returned by malloc, so that simd_aligned_free()
  // can recover it.
  //
  // alignment: required alignment in bytes; any non-zero value is accepted
  //            (it does not have to be a power of two).
  // bytes:     number of usable bytes requested.
  //
  // Returns the aligned pointer (release it with simd_aligned_free()), or
  // NULL when alignment is 0, when the size computation would overflow, or
  // when malloc fails.
  //
  // Declared static so that the header also links when compiled as C, where
  // a plain inline definition provides no external definition.
  static inline void* simd_aligned_alloc(size_t alignment, size_t bytes)
  {
      // A zero alignment is meaningless and would divide by zero below.
      if (alignment == 0)
          return NULL;

      // After reserving the book-keeping word, at most (alignment - 1) bytes
      // have to be skipped to reach an aligned address.
      const size_t max_padding = alignment - 1;
      if (max_padding > SIZE_MAX - sizeof(size_t))
          return NULL;
      const size_t overhead = max_padding + sizeof(size_t);
      // Refuse requests whose total size would wrap around.
      if (bytes > SIZE_MAX - overhead)
          return NULL;

      // The cast is kept so that the header also compiles as C++.
      char* const block = (char*)malloc(bytes + overhead);
      if (!block)
          return NULL;

      // Skip the book-keeping slot, then round up to the next aligned address.
      char* aligned = block + sizeof(size_t);
      const size_t misalignment = (size_t)((uintptr_t)aligned % alignment);
      if (misalignment != 0)
          aligned += alignment - misalignment;

      // Store the malloc address with memcpy: when alignment is not a
      // multiple of size_t's own alignment, the slot is not suitably aligned
      // for a direct size_t store.
      const size_t block_address = (size_t)(uintptr_t)block;
      memcpy(aligned - sizeof(size_t), &block_address, sizeof(block_address));
      return aligned;
  }
  // Releases memory obtained from simd_aligned_alloc(). Passing NULL is a
  // no-op.
  //
  // raw_data must have the simd_aligned_alloc() layout: the sizeof(size_t)
  // bytes directly before it hold the address originally returned by malloc,
  // and that address is what gets passed to free().
  //
  // Declared static so that the header also links when compiled as C, where
  // a plain inline definition provides no external definition.
  static inline void simd_aligned_free(void* raw_data)
  {
      if (!raw_data)
          return;

      // Read the book-keeping word with memcpy: the slot is only as aligned
      // as the user pointer, which may be less than size_t requires.
      size_t block_address;
      memcpy(&block_address, (const char*)raw_data - sizeof(size_t),
             sizeof(block_address));

      free((void*)(uintptr_t)block_address);
  }
  136. #endif