// SPDX-License-Identifier: GPL-3.0-or-later
// Copyright © 2018-2019 Ariadne Devos
/* sHT -- portable, adaptive, gracefully degrading SIMD */
/* NOTE(review): identifiers beginning with '_' + uppercase are reserved for
   the implementation (C11 7.1.3); consider renaming the guard to sHT_VECTOR_H. */
#ifndef _sHT_VECTOR_H
#define _sHT_VECTOR_H
/** Portable vector processing (SIMD)

   Many(1) modern processors can do arithmetic on short arrays faster(2)
   and more efficiently(3), if they are somewhat assisted by using special
   registers and alignment. How large the arrays must be varies with the
   extension. (These ‘arrays’ are called *vectors*.)

   As the vector size increases, performance goes up(4). So it would be
   a good idea to take advantage of higher sizes, when available. This
   header-only module helps you to do this, portably.

   When SIMD is not available, this falls back to scalar mode.

   (1) At least, Intel and AMD chips, which were quite popular on
       laptops in 2018
   (2) Acceleration Through Stream Computing, Ulrich Drepper (Red Hat)
       2009-09-04, slide 11, 16, 26, 29, 32
   (3) Intel® 64 and IA-32 Architectures Optimization Reference
       Manual, 13.5.1.2 Vectorization (page 13), June 2016.
       Supposedly requires you to license patents to Intel in certain
       circumstances (What? Don't buy Intel processors!).
   (4) I assume this, because otherwise, only complexity would increase.

   * How to use

   Write sHT_target_@var{size} before the function definition, where
   @var{size} is the number of bytes in the vector. It can be 2, 4, 8,
   16, 32 or 64, depending on the host system.

   For efficiency and correctness, data to be processed with SIMD should
   be aligned to @var{sHT_vector_align}.

   The number of elements in a vector can be computed by dividing its size
   by the element size. In particular, @var{sHT_size_bytes} expands to the
   number of bytes in a @var{size_t} and @var{sHT_size_lanes} calculates the
   number of lanes (entries) in a vector from its size, checking that the
   vector is large enough. (Support other types as needed.)

   @var{sHT_size_order} is the binary logarithm of @var{sHT_size_bytes}.
   @var{sHT_size_seq} produces a vector iterating the natural numbers
   ({0, 1, 2, ...}).

   * Internals

   @var{sHT_single_vector} is defined if there is only one vector size to use,
   in which case it is the binary logarithm of the size. */
/* Pre-C23 compilers may not provide __has_attribute; make it expand to 0
   so the explicit GCC version checks below take over. */
#ifndef __has_attribute
# define __has_attribute(a) 0
#endif
/* gcc ignores unknown attributes; make sure the 'target' attribute
   isn't silently ignored: either __has_attribute reports it, or the
   GCC version is at least 4.9.4, where 'target' is known to exist.
   Otherwise, fail the build loudly. */
#if __has_attribute(target)
#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && ((__GNUC_MINOR__ > 9) || ((__GNUC_MINOR__ == 9) && (__GNUC_PATCHLEVEL__ >= 4))))
#elif defined(__GNUC__)
# error s2 uses the attribute 'target', but that has only been introduced in GCC 4.9.4
#else
# error s2 uses the GCC attribute 'target', which is not supported by your compiler
#endif
#include <stddef.h>
#include <stdint.h>
/* These definitions assume that sizeof(T) is a correct alignment
   for any vector type T. Here is a proof:

   The C standard allows for forming an array of T. Let the array's
   length be two, and let P be a pointer to its first element.
   According to the standard (at least, I hope so), P is correctly aligned,
   and so is P + 1.

   The statements about alignment can be stated this way:
     P = 0     (mod _Alignof(T))
     P + 1 = 0 (mod _Alignof(T))
   The following is also true:
     P + 1 = (char *) P + sizeof(T)
   Filling in the latter value in the former:
     (char *) P = 0             (mod _Alignof(T))
     (char *) P + sizeof(T) = 0 (mod _Alignof(T))
   Subtracting the first from the second:
     sizeof(T) = 0 (mod _Alignof(T))
   This is equivalent to (1):
     _Alignof(T) | sizeof(T)   (| is ‘divides’)

   Then, proof that if a pointer Q is sizeof(T)-aligned,
   it also is _Alignof(T)-aligned:
     Q = 0 (mod sizeof(T))
   ==>
     sizeof(T) | Q
   ==> (introduce (1))
     _Alignof(T) | sizeof(T)
     sizeof(T) | Q
   ==> cut (transitivity of divisibility)
     _Alignof(T) | Q
   ==>
     Q = 0 (mod _Alignof(T))
   Q.E.D. */
/* Per-architecture selection of available vector sizes.
   Each supported size N gets an sHT_target_N macro (expanding to the
   'target' attribute that enables the matching ISA extension, or to
   nothing when the baseline ISA already provides it), and
   sHT_vector_align is set to the largest vector size, so data aligned
   to it is correctly aligned for every enabled vector type. */
#if defined(__x86_64__)
/* x86-64 requires SSE2 to be supported,
   so don't generate a scalar fallback. */
# define sHT_target_16 __attribute__((target ("sse2")))
# define sHT_target_32 __attribute__((target ("avx2")))
# define sHT_target_64 __attribute__((target ("avx512f")))
# define sHT_vector_align 64
#elif defined(__i386__) || defined(__x86__)
/* Bug fix: GCC predefines __i386__ for 32-bit x86, not __x86__;
   __x86__ is kept for other compilers that may define it. */
# define sHT_target_4
/* AMD has something called 3DNow, but that's for floating-point,
   which s2 doesn't do. */
# define sHT_target_8 __attribute__((target ("mmx")))
# define sHT_target_16 __attribute__((target ("sse2")))
# define sHT_target_32 __attribute__((target ("avx2")))
# define sHT_target_64 __attribute__((target ("avx512f")))
# define sHT_vector_align 64
#elif defined(__aarch64__)
/* AArch64 requires NEON to be supported. */
# define sHT_single_vector 4
# define sHT_target_16
# define sHT_vector_align 16
#elif defined(__arm__)
# define sHT_target_4
# define sHT_target_16 __attribute__((target ("fpu=neon")))
# define sHT_vector_align 16
#elif defined(__ia64__)
/* IA-64 has 32 static general-purpose registers!
   (and 96 with a register window (?)). Do some unrolling
   instead of using SIMD (which doesn't seem to exist). */
# define sHT_single_vector 4
# define sHT_target_16
# define sHT_vector_align 8
/* Fallback to scalar processing. Guess the width of the
   general-purpose registers from the widest uintN_t that
   <stdint.h> advertises. */
#elif defined(UINT16_MAX) && !defined(UINT32_MAX)
# define sHT_single_vector 1
# define sHT_target_2
# define sHT_vector_align 2
#elif defined(UINT32_MAX) && !defined(UINT64_MAX)
# define sHT_single_vector 2
# define sHT_target_4
# define sHT_vector_align 4
#elif defined(UINT64_MAX) && !defined(UINT128_MAX)
# define sHT_single_vector 3
# define sHT_target_8
# define sHT_vector_align 8
#elif defined(UINT128_MAX) && !defined(UINT256_MAX)
# define sHT_single_vector 4
# define sHT_target_16
/* Bug fix: a 16-byte vector needs 16-byte alignment (this used to say 2,
   breaking the every-branch invariant align == largest vector size). */
# define sHT_vector_align 16
#else
# error 256-bit systems are not yet supported
#endif
/* Describe size_t as preprocessor constants, derived from SIZE_MAX so
   they are usable in #if conditions and bitfield widths:
   sHT_size_bytes is sizeof(size_t), sHT_size_order its binary logarithm. */
#if UINT16_MAX == SIZE_MAX
# define sHT_size_order 1
# define sHT_size_bytes 2
#elif UINT32_MAX == SIZE_MAX
# define sHT_size_order 2
# define sHT_size_bytes 4
#elif UINT64_MAX == SIZE_MAX
# define sHT_size_order 3
# define sHT_size_bytes 8
#elif UINT128_MAX == SIZE_MAX
# define sHT_size_order 4
# define sHT_size_bytes 16
#else
# error 256-bit systems are not yet supported
#endif
#ifndef sHT_single_vector
/** Detect SIMD capabilities
   This sets the SIMD capability flags, that will depend
   upon the advertised microarchitecture and compile-time
   architecture.
   (The flags might speculatively be incorrect, but that should only
   cause a speculative invalid instruction trap, which should be innocent.) */
void
sHT_detect_simd(void);
/** Choose a vectorized implementation
   Consider a non-empty sequence of vector sizes N, where
   sHT_target_N is defined and 1 << @var{elem_order} divides N.
   Find an index into the sequence, optimised for performance.
   (Typically, the largest possible vector size.)
   Precondition: 2**@var{elem_order} divides sHT_target_N for some
   N (a power of two). (True for all standard integer types whose size
   is a power of two.)
   The index depends only upon the SIMD capability flags and
   compile-time host architecture.
   @var{elem_order} must be a compile-time constant
   (e.g. @var{sHT_size_order}).
   Precondition: SIMD capabilities have been detected
   (e.g. with @var{sHT_detect_simd}). */
size_t
sHT_select_simd(int elem_order);
#else
/* Only one vector size exists on this architecture, so there is
   nothing to detect at runtime and selection always yields index 0. */
static inline void
sHT_detect_simd(void)
{
}
/* Compile-time check: the element (2**elem_order bytes) must fit in the
   single vector (2**sHT_single_vector bytes). A violation makes the
   unnamed bitfield width negative, which is a compile error.
   Bug fix: the comparison was inverted (<=), so it rejected exactly the
   valid elem_orders and accepted the invalid ones. */
#define sHT_select_simd(elem_order) \
	((void) sizeof(struct { int: -((elem_order) > sHT_single_vector); }), (size_t) 0)
#endif
/* gcc ignores unknown attributes; make sure the 'vector_size' attribute
   isn't silently ignored: either __has_attribute reports it, or the
   GCC version is at least 4.0.4, where 'vector_size' is known to exist. */
#if __has_attribute(vector_size)
#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && ((__GNUC_MINOR__ > 0) || ((__GNUC_MINOR__ == 0) && (__GNUC_PATCHLEVEL__ >= 4))))
#elif defined(__GNUC__)
# error s2 uses the attribute 'vector_size', but that has only been introduced in GCC 4.0.4
#else
# error s2 uses the GCC attribute 'vector_size', which is not supported by your compiler
#endif
/** A vector type of size @var{bytes}, containing @var{size_t}
   NOTE(review): GCC's vector_size expects the size to be a multiple of
   the element size — callers should pass one of the sHT_target sizes. */
#define sHT_size_vec(bytes) \
	__attribute__((vector_size(bytes))) size_t
/** Calculate the number of lanes in a vector of @var{size_t}
   @var{bytes}: the size of the vector, in bytes
   Expands to (bytes / sizeof(size_t)).
   This fails to compile if the vector cannot be filled with
   @var{size_t}. */
#define sHT_size_lanes(bytes) \
	/* If @var{sHT_size_bytes} doesn't divide @var{bytes}, the struct
	   would have a negative-width bitfield, which isn't allowed. It is
	   said that this trick has been performed in the Linux kernel. */ \
	((void) sizeof(struct { int: -(int) ((bytes) % sHT_size_bytes); }), \
	 /* Bug fix: the division was inverted (element size / vector size),
	    which truncates to 0 for every real vector. */ \
	 (bytes)/sHT_size_bytes)
/** Return a vector of @var{size_t} enumerating the natural numbers
   @var{bytes}: the size of the vector, in bytes
   This returns { 0, 1, 2, ... }, truncated to the vector's lane count. */
#define sHT_size_seq(bytes) \
	/* 32 initialisers cover the widest case (a 64-byte vector of
	   2-byte size_t). Smaller vectors make GCC warn about the excess
	   entries, which it then discards.
	   TODO: patch a -Wno-vector-excess into GCC */ \
	((sHT_size_vec(bytes)) \
	{ 0, 1, 2, 3, 4, 5, 6, 7, \
	8, 9, 10, 11, 12, 13, 14, 15, \
	16, 17, 18, 19, 20, 21, 22, 23, \
	24, 25, 26, 27, 28, 29, 30, 31, })
#endif