vector.h 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. // SPDX-License-Identifier: GPL-3.0-or-later
  2. // Copyright © 2018-2019 Ariadne Devos
  3. /* sHT -- portable, adaptive, gracefully degrading SIMD */
  4. #ifndef _sHT_VECTOR_H
  5. #define _sHT_VECTOR_H
  6. /** Portable vector processing (SIMD)
  7. Many(1) modern processors can do arithmetic on short arrays faster(2)
  8. and more efficient(3), if they are somewhat assisted by using special
  9. registers and alignment. How large the arrays must be, varies with the
  10. extension. (These ‘arrays’ are called *vectors*.)
  11. As the vector size increases, performance goes up(4). So it would be
  12. a good idea to take advantage of higher sizes, when available. This
  13. header-only module helps you to do this, portably.
  14. When SIMD is not available, this falls back to scalar mode.
  15. (1) At least, Intel and AMD chips, which were quite popular on
  16. laptops in 2018
  17. (2) Acceleration Through Stream Computing, Ulrich Drepper (Red Hat)
  18. 2009-09-04, slide 11, 16, 26, 29, 32
  19. (3) Intel® 64 and IA-32 Architectures Optimization Reference
  20. Manual, 13.5.1.2 Vectorization (page 13), June 2016.
  21. Supposedly requires you to license patents to Intel in certain
  22. circumstances (What? Don't buy Intel processors!).
  23. (4) I assume this, because otherwise, only complexity would raise.
  24. * How to use
  25. Write sHT_target_@var{size}, before the function definition, where
  26. @var{size} is the number of bytes in the vector. It can be 2, 4, 8,
  27. 16, 32 or 64, depending on the host system.
  28. For efficiency and correctness, data to be processed with SIMD should
  29. be aligned to @var{sHT_vector_align}.
  30. The number of elements in a vector can be computed by dividing its size
  31. by the element size. In particular, @var{sHT_size_bytes} expands to the
  32. number of bytes in a @var{size_t} and @var{sHT_size_lanes} calculates the
  33. number of lanes (entries) in a vector from its size, checking that the
  34. vector is large enough. (Support other types as needed.)
  35. @var{sHT_size_order} is the binary logarithm of @var{sHT_size_bytes}.
  36. @var{sHT_size_seq} produces a vector iterating the natural numbers
  37. ({0, 1, 2, ...}).
  38. * Internals
  39. @var{sHT_single_vector} is defined if there is only one vector size to use,
  40. in which case it is the binary logarithm of the size. */
  41. #ifndef __has_attribute
  42. # define __has_attribute(a) 0
  43. #endif
  44. /* gcc ignores unknown attributes, make sure the 'target' attribute
  45. isn't silently ignored. */
  46. #if __has_attribute(target)
  47. #elif (__GNUC__ > 4) || ((__GNUC__ == 4) && ((__GNUC_MINOR__ > 9) || ((__GNUC_MINOR__ == 9) && (__GNUC_PATCHLEVEL__ >= 4))))
  48. #elif defined(__GNUC__)
  49. # error s2 uses the attribute 'target', but that has only been introduced in GCC 4.9.4
  50. #else
  51. # error s2 uses the GCC attribute 'target', which is not supported by your compiler
  52. #endif
  53. #include <stddef.h>
  54. #include <stdint.h>
  55. /* These definitions assume that sizeof(T) is a correct alignment
  56. for any vector type T. Here is a proof:
  57. The C standard allows for forming an array of T. Let the array's
  58. length be two, and let P be a pointer to its first element.
  59. According to the standard (at least, I hopeso), P is correctly aligned,
  60. and so is P + 1.
  61. The statements about alignment can be stated this way:
  62. P = 0 (mod _Alignof(T))
  63. P + 1 = 0 (mod _Alignof(T))
  64. The following is also true:
  65. P = (char *) P
  66. P + 1 = (char *) P + sizeof(T)
  67. Filling in the latter values in the former:
  68. (char *) P = 0 (mod _Alignof(T))
  69. (char *) P + sizeof(T) = 0 (mod _Alignof(T))
  70. Subtracting the first from the second:
  71. sizeof(T) = 0 (mod _Alignof(T))
  72. This is equivalent to (1):
  73. _Alignof(T) | sizeof(T) (divides)
  74. Then, proof that if a pointer Q is sizeof(T) aligned,
  75. it also is _Alignof(T) aligned.
  76. Q = 0 (mod sizeof(T))
  77. ==>
  78. sizeof(T) | Q
  79. ==> (introduce (1))
  80. _Alignof(T) | sizeof(T)
  81. sizeof(T) | Q
  82. ==> cut (transitivity of divisibility)
  83. _Alignof(T) | Q
  84. ==>
  85. Q = 0 (mod _Alignof(T))
  86. Q.E.D */
  87. #if defined(__x86_64__)
  88. /* x86-64 requires SSE2 to be supported,
  89. so don't generate a scalar fallback. */
  90. # define sHT_target_16 __attribute__((target ("sse2")))
  91. # define sHT_target_32 __attribute__((target ("avx2")))
  92. # define sHT_target_64 __attribute__((target ("avx512f")))
  93. # define sHT_vector_align 64
  94. #elif defined(__x86__)
  95. # define sHT_target_4
  96. /* AMD has something called 3DNow, but that's for floating-point,
  97. which s2 doesn't do. */
  98. # define sHT_target_8 __attribute__((target ("mmx")))
  99. # define sHT_target_16 __attribute__((target ("sse2")))
  100. # define sHT_target_32 __attribute__((target ("avx2")))
  101. # define sHT_target_64 __attribute__((target ("avx512f")))
  102. # define sHT_vector_align 64
  103. #elif defined(__aarch64__)
  104. /* Aarch64 requires NEON to be supported. */
  105. # define sHT_single_vector 4
  106. # define sHT_target_16
  107. # define sHT_vector_align 16
  108. #elif defined(__arm__)
  109. # define sHT_target_4
  110. # define sHT_target_16 __attribute__((target ("fpu=neon")))
  111. # define sHT_vector_align 16
  112. #elif defined(__ia64__)
  113. /* IA 64 has 32 static general purpose registers!
  114. (and 96 with a register window (?)). Do some unrolling
  115. instead of using SIMD (which doesn't seem to exist). */
  116. # define sHT_single_vector 4
  117. # define sHT_target_16
  118. # define sHT_vector_align 8
  119. /* Fallback to scalar processing. Guess the width of the
  120. general-purpose registers. */
  121. #elif defined(UINT16_MAX) && !defined(UINT32_MAX)
  122. # define sHT_single_vector 1
  123. # define sHT_target_2
  124. # define sHT_vector_align 2
  125. #elif defined(UINT32_MAX) && !defined(UINT64_MAX)
  126. # define sHT_single_vector 2
  127. # define sHT_target_4
  128. # define sHT_vector_align 4
  129. #elif defined(UINT64_MAX) && !defined(UINT128_MAX)
  130. # define sHT_single_vector 3
  131. # define sHT_target_8
  132. # define sHT_vector_align 8
  133. #elif defined(UINT128_MAX) && !defined(UINT256_MAX)
  134. # define sHT_single_vector 4
  135. # define sHT_target_16
  136. # define sHT_vector_align 2
  137. #else
  138. # error 256-bit systems are not yet supported
  139. #endif
  140. #if UINT16_MAX == SIZE_MAX
  141. # define sHT_size_order 1
  142. # define sHT_size_bytes 2
  143. #elif UINT32_MAX == SIZE_MAX
  144. # define sHT_size_order 2
  145. # define sHT_size_bytes 4
  146. #elif UINT64_MAX == SIZE_MAX
  147. # define sHT_size_order 3
  148. # define sHT_size_bytes 8
  149. #elif UINT128_MAX == SIZE_MAX
  150. # define sHT_size_order 4
  151. # define sHT_size_bytes 16
  152. #else
  153. # error 256-bit systems are not yet supported
  154. #endif
  155. #ifndef sHT_single_vector
  156. /** Detect SIMD capabilities
  157. This sets the SIMD capability flags, that will depend
  158. upon the advertised microarchitecture and compile-time
  159. architecture.
  160. (The flags might speculatively be incorrect, but that should only
  161. cause a speculative invalid instruction trap, which should be innocent.) */
  162. void
  163. sHT_detect_simd(void);
  164. /** Choose a vectorized implementation
  165. Consider a non-empty sequence of vector sizes N, where
  166. sHT_target_N is defined and 1 << @var{elem_order} divides N.
  167. Find an index into the sequence, optimised for performance.
  168. (Typically, the largest possible vector size.)
  169. Precondition: 2**@var{elem_order} divides some sHT_target_N for some
  170. N (a power of two). (True for all standard integer types, whose size
  171. is a power of two.
  172. The index depends only upon the SIMD capability flags and
  173. compile-time host architecture.
  174. @var{elem_order} must be a compile-time constant.
  175. (E.g. @var{sHT_size_order}).
  176. Precondition: SIMD capabilities have been detected
  177. (e.g. with @var{sHT_detect_simd}). */
  178. size_t
  179. sHT_select_simd(int elem_order);
  180. #else
  181. static inline void
  182. sHT_detect_simd(void)
  183. {
  184. }
  185. #define sHT_select_simd(elem_order) \
  186. ((void) sizeof(struct { int: -((elem_order) <= sHT_single_vector); }), (size_t) 0)
  187. #endif
  188. /* gcc ignores unknown attributes, make sure the 'vector_size' attribute
  189. isn't silently ignored. */
  190. #if __has_attribute(vector_size)
  191. #elif (__GNUC__ > 4) || ((__GNUC__ == 4) && ((__GNUC_MINOR__ > 0) || ((__GNUC_MINOR__ == 0) && (__GNUC_PATCHLEVEL__ >= 4))))
  192. #elif defined(__GNUC__)
  193. # error s2 uses the attribute 'vector_size', but that has only been introduced in GCC 4.0.4
  194. #else
  195. # error s2 uses the GCC attribute 'vector_size', which is not supported by your compiler
  196. #endif
  197. /** A vector type of size @var{bytes}, containing @var{size_t} */
  198. #define sHT_size_vec(bytes) \
  199. __attribute__((vector_size(bytes))) size_t
  200. /** Calculate the number of lanes in a vector of @var{size_t}
  201. @var{bytes}: the size of the vector, in bytes
  202. This fails to compile if the vector cannot be filled with
  203. @var{size_t}. */
  204. #define sHT_size_lanes(bytes) \
  205. /* If @var{bytes} doesn't divide @var{sHT_size_bytes}, the struct
  206. would have a negative-size bitfield, which isn't allowed. It is said
  207. that this trick has been performed in the Linux kernel */ \
  208. ((void) sizeof(struct { int: -(int) ((bytes) % sHT_size_bytes); }), \
  209. sHT_size_bytes/(bytes))
  210. /** Return a vector of @var{size_t} enumerating the natural numbers
  211. @var{bytes}: the size of the vector, in bytes
  212. This returns { 0, 1, 2, ... }. */
  213. #define sHT_size_seq(bytes) \
  214. /* This produces a lot of warnings, because
  215. not all entries are used.
  216. TODO: patch a -Wno-vector-excess into GCC */ \
  217. ((sHT_size_vec(bytes)) \
  218. { 0, 1, 2, 3, 4, 5, 6, 7, \
  219. 8, 9, 10, 11, 12, 13, 14, 15, \
  220. 16, 17, 18, 19, 20, 21, 22, 23, \
  221. 24, 25, 26, 27, 28, 29, 30, 31, })
  222. #endif