vector.h 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. /* s2 - portable, adaptive, gracefully degrading SIMD
  2. Copyright (C) 2018 Ariadne Devos
  3. This program is free software: you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation, either version 3 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program. If not, see <http://www.gnu.org/licenses/>. */
  13. #ifndef _sHT_VECTOR_H
  14. #define _sHT_VECTOR_H
  15. /** Portable vector processing (SIMD)
  16. Many(1) modern processors can do arithmetic on short arrays faster(2)
  17. and more efficient(3), if they are somewhat assisted by using special
  18. registers and alignment. How large the arrays must be, varies with the
  19. extension. (These ‘arrays’ are called *vectors*.)
  20. As the vector size increases, performance goes up(4). So it would be
  21. a good idea to take advantage of higher sizes, when available. This
  22. header-only module helps you to do this, portably.
  23. When SIMD is not available, this falls back to scalar mode.
  24. (1) At least, Intel and AMD chips, which were quite popular on
  25. laptops in 2018
  26. (2) Acceleration Through Stream Computing, Ulrich Drepper (Red Hat)
  27. 2009-09-04, slide 11, 16, 26, 29, 32
  28. (3) Intel® 64 and IA-32 Architectures Optimization Reference
  29. Manual, 13.5.1.2 Vectorization (page 13), June 2016.
  30. Supposedly requires you to license patents to Intel in certain
  31. circumstances (What? Don't buy Intel processors!).
  32. (4) I assume this, because otherwise, only complexity would raise.
  33. * How to use
  34. Write sHT_target_@var{size}, before the function definition, where
  35. @var{size} is the number of bytes in the vector. It can be 2, 4, 8,
  36. 16, 32 or 64, depending on the host system.
  37. For efficiency and correctness, data to be processed with SIMD should
  38. be aligned to @var{sHT_vector_align}.
  39. The number of elements in a vector can be computed by dividing its size
  40. by the element size. In particular, @var{sHT_size_bytes} expands to the
  41. number of bytes in a @var{size_t} and @var{sHT_size_lanes} calculates the
  42. number of lanes (entries) in a vector from its size, checking that the
  43. vector is large enough. (Support other types as needed.)
  44. @var{sHT_size_order} is the binary logarithm of @var{sHT_size_bytes}.
  45. @var{sHT_size_seq} produces a vector iterating the natural numbers
  46. ({0, 1, 2, ...}).
  47. * Internals
  48. @var{sHT_single_vector} is defined if there is only one vector size to use,
  49. in which case it is the binary logarithm of the size. */
  50. #ifndef __has_attribute
  51. # define __has_attribute(a) 0
  52. #endif
  53. /* gcc ignores unknown attributes, make sure the 'target' attribute
  54. isn't silently ignored. */
  55. #if __has_attribute(target)
  56. #elif (__GNUC__ > 4) || ((__GNUC__ == 4) && ((__GNUC_MINOR__ > 9) || ((__GNUC_MINOR__ == 9) && (__GNUC_PATCHLEVEL__ >= 4))))
  57. #elif defined(__GNUC__)
  58. # error s2 uses the attribute 'target', but that has only been introduced in GCC 4.9.4
  59. #else
  60. # error s2 uses the GCC attribute 'target', which is not supported by your compiler
  61. #endif
  62. #include <stddef.h>
  63. #include <stdint.h>
  64. /* These definitions assume that sizeof(T) is a correct alignment
  65. for any vector type T. Here is a proof:
  66. The C standard allows for forming an array of T. Let the array's
  67. length be two, and let P be a pointer to its first element.
  68. According to the standard (at least, I hopeso), P is correctly aligned,
  69. and so is P + 1.
  70. The statements about alignment can be stated this way:
  71. P = 0 (mod _Alignof(T))
  72. P + 1 = 0 (mod _Alignof(T))
  73. The following is also true:
  74. P = (char *) P
  75. P + 1 = (char *) P + sizeof(T)
  76. Filling in the latter values in the former:
  77. (char *) P = 0 (mod _Alignof(T))
  78. (char *) P + sizeof(T) = 0 (mod _Alignof(T))
  79. Subtracting the first from the second:
  80. sizeof(T) = 0 (mod _Alignof(T))
  81. This is equivalent to (1):
  82. _Alignof(T) | sizeof(T) (divides)
  83. Then, proof that if a pointer Q is sizeof(T) aligned,
  84. it also is _Alignof(T) aligned.
  85. Q = 0 (mod sizeof(T))
  86. ==>
  87. sizeof(T) | Q
  88. ==> (introduce (1))
  89. _Alignof(T) | sizeof(T)
  90. sizeof(T) | Q
  91. ==> cut (transitivity of divisibility)
  92. _Alignof(T) | Q
  93. ==>
  94. Q = 0 (mod _Alignof(T))
  95. Q.E.D */
  96. #if defined(__x86_64__)
  97. /* x86-64 requires SSE2 to be supported,
  98. so don't generate a scalar fallback. */
  99. # define sHT_target_16 __attribute__((target ("sse2")))
  100. # define sHT_target_32 __attribute__((target ("avx2")))
  101. # define sHT_target_64 __attribute__((target ("avx512f")))
  102. # define sHT_vector_align 64
  103. #elif defined(__x86__)
  104. # define sHT_target_4
  105. /* AMD has something called 3DNow, but that's for floating-point,
  106. which s2 doesn't do. */
  107. # define sHT_target_8 __attribute__((target ("mmx")))
  108. # define sHT_target_16 __attribute__((target ("sse2")))
  109. # define sHT_target_32 __attribute__((target ("avx2")))
  110. # define sHT_target_64 __attribute__((target ("avx512f")))
  111. # define sHT_vector_align 64
  112. #elif defined(__aarch64__)
  113. /* Aarch64 requires NEON to be supported. */
  114. # define sHT_single_vector 4
  115. # define sHT_target_16
  116. # define sHT_vector_align 16
  117. #elif defined(__arm__)
  118. # define sHT_target_4
  119. # define sHT_target_16 __attribute__((target ("fpu=neon")))
  120. # define sHT_vector_align 16
  121. #elif defined(__ia64__)
  122. /* IA 64 has 32 static general purpose registers!
  123. (and 96 with a register window (?)). Do some unrolling
  124. instead of using SIMD (which doesn't seem to exist). */
  125. # define sHT_single_vector 4
  126. # define sHT_target_16
  127. # define sHT_vector_align 8
  128. /* Fallback to scalar processing. Guess the width of the
  129. general-purpose registers. */
  130. #elif defined(UINT16_MAX) && !defined(UINT32_MAX)
  131. # define sHT_single_vector 1
  132. # define sHT_target_2
  133. # define sHT_vector_align 2
  134. #elif defined(UINT32_MAX) && !defined(UINT64_MAX)
  135. # define sHT_single_vector 2
  136. # define sHT_target_4
  137. # define sHT_vector_align 4
  138. #elif defined(UINT64_MAX) && !defined(UINT128_MAX)
  139. # define sHT_single_vector 3
  140. # define sHT_target_8
  141. # define sHT_vector_align 8
  142. #elif defined(UINT128_MAX) && !defined(UINT256_MAX)
  143. # define sHT_single_vector 4
  144. # define sHT_target_16
  145. # define sHT_vector_align 2
  146. #else
  147. # error 256-bit systems are not yet supported
  148. #endif
  149. #if UINT16_MAX == SIZE_MAX
  150. # define sHT_size_order 1
  151. # define sHT_size_bytes 2
  152. #elif UINT32_MAX == SIZE_MAX
  153. # define sHT_size_order 2
  154. # define sHT_size_bytes 4
  155. #elif UINT64_MAX == SIZE_MAX
  156. # define sHT_size_order 3
  157. # define sHT_size_bytes 8
  158. #elif UINT128_MAX == SIZE_MAX
  159. # define sHT_size_order 4
  160. # define sHT_size_bytes 16
  161. #else
  162. # error 256-bit systems are not yet supported
  163. #endif
  164. #ifndef sHT_single_vector
  165. /** Detect SIMD capabilities
  166. This sets the SIMD capability flags, that will depend
  167. upon the advertised microarchitecture and compile-time
  168. architecture.
  169. (The flags might speculatively be incorrect, but that should only
  170. cause a speculative invalid instruction trap, which should be innocent.) */
  171. void
  172. sHT_detect_simd(void);
  173. /** Choose a vectorized implementation
  174. Consider a non-empty sequence of vector sizes N, where
  175. sHT_target_N is defined and 1 << @var{elem_order} divides N.
  176. Find an index into the sequence, optimised for performance.
  177. (Typically, the largest possible vector size.)
  178. Precondition: 2**@var{elem_order} divides some sHT_target_N for some
  179. N (a power of two). (True for all standard integer types, whose size
  180. is a power of two.
  181. The index depends only upon the SIMD capability flags and
  182. compile-time host architecture.
  183. @var{elem_order} must be a compile-time constant.
  184. (E.g. @var{sHT_size_order}).
  185. Precondition: SIMD capabilities have been detected
  186. (e.g. with @var{sHT_detect_simd}). */
  187. size_t
  188. sHT_select_simd(int elem_order);
  189. #else
  190. static inline void
  191. sHT_detect_simd(void)
  192. {
  193. }
  194. #define sHT_select_simd(elem_order) \
  195. ((void) sizeof(struct { int: -((elem_order) <= sHT_single_vector); }), (size_t) 0)
  196. #endif
  197. /* gcc ignores unknown attributes, make sure the 'vector_size' attribute
  198. isn't silently ignored. */
  199. #if __has_attribute(vector_size)
  200. #elif (__GNUC__ > 4) || ((__GNUC__ == 4) && ((__GNUC_MINOR__ > 0) || ((__GNUC_MINOR__ == 0) && (__GNUC_PATCHLEVEL__ >= 4))))
  201. #elif defined(__GNUC__)
  202. # error s2 uses the attribute 'vector_size', but that has only been introduced in GCC 4.0.4
  203. #else
  204. # error s2 uses the GCC attribute 'vector_size', which is not supported by your compiler
  205. #endif
  206. /** A vector type of size @var{bytes}, containing @var{size_t} */
  207. #define sHT_size_vec(bytes) \
  208. __attribute__((vector_size(bytes))) size_t
  209. /** Calculate the number of lanes in a vector of @var{size_t}
  210. @var{bytes}: the size of the vector, in bytes
  211. This fails to compile if the vector cannot be filled with
  212. @var{size_t}. */
  213. #define sHT_size_lanes(bytes) \
  214. /* If @var{bytes} doesn't divide @var{sHT_size_bytes}, the struct
  215. would have a negative-size bitfield, which isn't allowed. It is said
  216. that this trick has been performed in the Linux kernel */ \
  217. ((void) sizeof(struct { int: -(int) ((bytes) % sHT_size_bytes); }), \
  218. sHT_size_bytes/(bytes))
  219. /** Return a vector of @var{size_t} enumerating the natural numbers
  220. @var{bytes}: the size of the vector, in bytes
  221. This returns { 0, 1, 2, ... }. */
  222. #define sHT_size_seq(bytes) \
  223. /* This produces a lot of warnings, because
  224. not all entries are used.
  225. TODO: patch a -Wno-vector-excess into GCC */ \
  226. ((sHT_size_vec(bytes)) \
  227. { 0, 1, 2, 3, 4, 5, 6, 7, \
  228. 8, 9, 10, 11, 12, 13, 14, 15, \
  229. 16, 17, 18, 19, 20, 21, 22, 23, \
  230. 24, 25, 26, 27, 28, 29, 30, 31, })
  231. #endif