- /*
- * Copyright (c) 2016-2017 Positive Technologies, https://www.ptsecurity.com,
- * Fast Positive Hash.
- *
- * Portions Copyright (c) 2010-2017 Leonid Yuriev <leo@yuriev.ru>,
- * The 1Hippeus project (t1h).
- *
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgement in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
- /*
- * t1ha = { Fast Positive Hash, aka "Позитивный Хэш" }
- * by [Positive Technologies](https://www.ptsecurity.ru)
- *
- * Briefly, it is a 64-bit Hash Function:
- * 1. Created for 64-bit little-endian platforms, predominantly for x86_64,
- * but it runs without penalty on any 64-bit CPU.
- * 2. In most cases up to 15% faster than City64, xxHash, mum-hash, metro-hash
- * and all others that do not use specific hardware tricks.
- * 3. Not suitable for cryptography.
- *
- * ACKNOWLEDGEMENT:
- * The t1ha was originally developed by Leonid Yuriev (Леонид Юрьев)
- * for The 1Hippeus project - zerocopy messaging in the spirit of Sparta!
- */
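- /*
- * Usage sketch (illustrative): every variant in this file shares the
- * (data, len, seed) signature declared in t1ha.h, e.g.
- *
- *   uint64_t h = t1ha(buffer, length, seed);
- *
- * t1ha_local (or the t1ha_local_ptr fallback at the end of this file)
- * selects the best variant for the host CPU at load time.
- */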
- #include "t1ha.h"
- #include <string.h> /* for memcpy() */
- #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \
- !defined(__ORDER_BIG_ENDIAN__)
- #ifndef _MSC_VER
- #include <sys/param.h> /* for endianness */
- #endif
- #if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN)
- #define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN
- #define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN
- #define __BYTE_ORDER__ __BYTE_ORDER
- #else
- #define __ORDER_LITTLE_ENDIAN__ 1234
- #define __ORDER_BIG_ENDIAN__ 4321
- #if defined(__LITTLE_ENDIAN__) || defined(_LITTLE_ENDIAN) || \
- defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || \
- defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \
- defined(__i386) || defined(__x86_64__) || defined(_M_IX86) || \
- defined(_M_X64) || defined(i386) || defined(_X86_) || defined(__i386__) || \
- defined(_X86_64_) || defined(_M_ARM) || defined(__e2k__)
- #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
- #elif defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN) || defined(__ARMEB__) || \
- defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(__MIPSEB__) || \
- defined(_MIPSEB) || defined(__MIPSEB)
- #define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__
- #else
- #error __BYTE_ORDER__ should be defined.
- #endif
- #endif
- #endif
- #if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ && \
- __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
- #error Unsupported byte order.
- #endif
- #if !defined(UNALIGNED_OK)
- #if defined(__i386) || defined(__x86_64__) || defined(_M_IX86) || \
- defined(_M_X64) || defined(i386) || defined(_X86_) || defined(__i386__) || \
- defined(_X86_64_)
- #define UNALIGNED_OK 1
- #else
- #define UNALIGNED_OK 0
- #endif
- #endif
- #ifndef __has_builtin
- #define __has_builtin(x) (0)
- #endif
- #ifndef __has_attribute
- #define __has_attribute(x) (0)
- #endif
- #if __GNUC_PREREQ(4, 4) || defined(__clang__)
- #if defined(__i386__) || defined(__x86_64__)
- #include <cpuid.h>
- #include <x86intrin.h>
- #endif
- #define likely(cond) __builtin_expect(!!(cond), 1)
- #define unlikely(cond) __builtin_expect(!!(cond), 0)
- #if __GNUC_PREREQ(4, 5) || defined(__clang__)
- #define unreachable() __builtin_unreachable()
- #endif
- #define bswap64(v) __builtin_bswap64(v)
- #define bswap32(v) __builtin_bswap32(v)
- #if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
- #define bswap16(v) __builtin_bswap16(v)
- #endif
- #if __GNUC_PREREQ(4, 3) || __has_attribute(unused)
- #define maybe_unused __attribute__((unused))
- #endif
- #elif defined(_MSC_VER)
- #if _MSC_FULL_VER < 190024215
- #if _MSC_FULL_VER < 180040629 && defined(_M_IX86)
- #error Please use Visual Studio 2015 (MSC 19.0) or newer for a 32-bit target.
- #else
- #pragma message( \
- "It is recommended to use Visual Studio 2015 (MSC 19.0) or newer.")
- #endif
- #endif
- #pragma warning( \
- disable : 4710) /* C4710: 'mux64': function not inlined */
- #pragma warning( \
- disable : 4711) /* C4711: function 'x86_cpu_features' selected for \
- automatic inline expansion */
- #include <intrin.h>
- #include <stdlib.h>
- #define likely(cond) (cond)
- #define unlikely(cond) (cond)
- #define unreachable() __assume(0)
- #define bswap64(v) _byteswap_uint64(v)
- #define bswap32(v) _byteswap_ulong(v)
- #define bswap16(v) _byteswap_ushort(v)
- #define rot64(v, s) _rotr64(v, s)
- #define rot32(v, s) _rotr(v, s)
- #define __inline __forceinline
- #if defined(_M_ARM64) || defined(_M_X64)
- #pragma intrinsic(_umul128)
- #define mul_64x64_128(a, b, ph) _umul128(a, b, ph)
- #pragma intrinsic(__umulh)
- #define mul_64x64_high(a, b) __umulh(a, b)
- #endif
- #if defined(_M_IX86)
- #pragma intrinsic(__emulu)
- #define mul_32x32_64(a, b) __emulu(a, b)
- #elif defined(_M_ARM)
- #define mul_32x32_64(a, b) _arm_umull(a, b)
- #endif
- #endif /* Compiler */
- #ifndef likely
- #define likely(cond) (cond)
- #endif
- #ifndef unlikely
- #define unlikely(cond) (cond)
- #endif
- #ifndef maybe_unused
- #define maybe_unused
- #endif
- #ifndef unreachable
- #define unreachable() \
- do { \
- } while (1)
- #endif
- #ifndef bswap64
- static __inline uint64_t bswap64(uint64_t v) {
- return v << 56 | v >> 56 | ((v << 40) & 0x00ff000000000000ull) |
- ((v << 24) & 0x0000ff0000000000ull) |
- ((v << 8) & 0x000000ff00000000ull) |
- ((v >> 8) & 0x00000000ff000000ull) |
- ((v >> 24) & 0x0000000000ff0000ull) |
- ((v >> 40) & 0x000000000000ff00ull);
- }
- #endif /* bswap64 */
- #ifndef bswap32
- static __inline uint32_t bswap32(uint32_t v) {
- return v << 24 | v >> 24 | ((v << 8) & 0x00ff0000) | ((v >> 8) & 0x0000ff00);
- }
- #endif /* bswap32 */
- #ifndef bswap16
- static __inline uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; }
- #endif /* bswap16 */
- #ifndef rot64
- static __inline uint64_t rot64(uint64_t v, unsigned s) {
- return (v >> s) | (v << (64 - s));
- }
- #endif /* rot64 */
- #ifndef rot32
- static __inline uint32_t rot32(uint32_t v, unsigned s) {
- return (v >> s) | (v << (32 - s));
- }
- #endif /* rot32 */
- #ifndef mul_32x32_64
- static __inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) {
- return a * (uint64_t)b;
- }
- #endif /* mul_32x32_64 */
- /***************************************************************************/
- static __inline uint64_t fetch64_le(const void *v) {
- #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- return *(const uint64_t *)v;
- #else
- return bswap64(*(const uint64_t *)v);
- #endif
- }
- static __inline uint32_t fetch32_le(const void *v) {
- #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- return *(const uint32_t *)v;
- #else
- return bswap32(*(const uint32_t *)v);
- #endif
- }
- static __inline uint16_t fetch16_le(const void *v) {
- #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- return *(const uint16_t *)v;
- #else
- return bswap16(*(const uint16_t *)v);
- #endif
- }
- static __inline uint64_t tail64_le(const void *v, size_t tail) {
- const uint8_t *p = (const uint8_t *)v;
- uint64_t r = 0;
- switch (tail & 7) {
- #if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- /* For most CPUs this code is better when copying
- * for alignment or byte reordering is not needed. */
- case 0:
- return fetch64_le(p);
- case 7:
- r = (uint64_t)p[6] << 8;
- case 6:
- r += p[5];
- r <<= 8;
- case 5:
- r += p[4];
- r <<= 32;
- case 4:
- return r + fetch32_le(p);
- case 3:
- r = (uint64_t)p[2] << 16;
- case 2:
- return r + fetch16_le(p);
- case 1:
- return p[0];
- #else
- /* For most CPUs this code is better than
- * copying for alignment and/or byte reordering. */
- case 0:
- r = p[7] << 8;
- case 7:
- r += p[6];
- r <<= 8;
- case 6:
- r += p[5];
- r <<= 8;
- case 5:
- r += p[4];
- r <<= 8;
- case 4:
- r += p[3];
- r <<= 8;
- case 3:
- r += p[2];
- r <<= 8;
- case 2:
- r += p[1];
- r <<= 8;
- case 1:
- return r + p[0];
- #endif
- }
- unreachable();
- }
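- /* Example: for p = {0x01, 0x02, 0x03} and tail = 3, either branch
- * above yields 0x030201, i.e. the trailing 1..8 bytes are always
- * read as a little-endian integer regardless of host byte order. */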
- /* 'magic' primes */
- static const uint64_t p0 = 17048867929148541611ull;
- static const uint64_t p1 = 9386433910765580089ull;
- static const uint64_t p2 = 15343884574428479051ull;
- static const uint64_t p3 = 13662985319504319857ull;
- static const uint64_t p4 = 11242949449147999147ull;
- static const uint64_t p5 = 13862205317416547141ull;
- static const uint64_t p6 = 14653293970879851569ull;
- /* rotations */
- static const unsigned s0 = 41;
- static const unsigned s1 = 17;
- static const unsigned s2 = 31;
- /* xor-mul-xor mixer */
- static __inline uint64_t mix(uint64_t v, uint64_t p) {
- v *= p;
- return v ^ rot64(v, s0);
- }
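- /* 64-bit addition that reports the carry-out: 1 on wrap-around, else 0 */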
- static maybe_unused __inline unsigned add_with_carry(uint64_t *sum,
- uint64_t addend) {
- *sum += addend;
- return *sum < addend;
- }
- /* xor high and low parts of full 128-bit product */
- static __inline uint64_t mux64(uint64_t v, uint64_t p) {
- #ifdef __SIZEOF_INT128__
- __uint128_t r = (__uint128_t)v * (__uint128_t)p;
- /* modern GCC can optimize this nicely */
- return r ^ (r >> 64);
- #elif defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128
- __uint128 r = (__uint128)v * (__uint128)p;
- return r ^ (r >> 64);
- #elif defined(mul_64x64_128)
- uint64_t l, h;
- l = mul_64x64_128(v, p, &h);
- return l ^ h;
- #elif defined(mul_64x64_high)
- uint64_t l, h;
- l = v * p;
- h = mul_64x64_high(v, p);
- return l ^ h;
- #else
- /* performs 64x64 to 128 bit multiplication */
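- /* Schoolbook decomposition: with v = vh * 2^32 + vl and
- * p = ph * 2^32 + pl, the full product is
- * v * p = hh * 2^64 + (lh + hl) * 2^32 + ll, where ll = vl * pl,
- * lh = vh * pl, hl = ph * vl and hh = vh * ph; the carries from
- * folding lh and hl into the low half are accumulated into the
- * high half via add_with_carry(). */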
- uint64_t ll = mul_32x32_64((uint32_t)v, (uint32_t)p);
- uint64_t lh = mul_32x32_64(v >> 32, (uint32_t)p);
- uint64_t hl = mul_32x32_64(p >> 32, (uint32_t)v);
- uint64_t hh =
- mul_32x32_64(v >> 32, p >> 32) + (lh >> 32) + (hl >> 32) +
- /* A few simplifications are possible here for 32-bit
- * architectures, but they would break compatibility with
- * the original 64-bit version. That would be a bad idea,
- * since a 32-bit t1ha would still be (relatively) slow,
- * and yet incompatible. */
- add_with_carry(&ll, lh << 32) + add_with_carry(&ll, hl << 32);
- return hh ^ ll;
- #endif
- }
- uint64_t t1ha(const void *data, size_t len, uint64_t seed) {
- uint64_t a = seed;
- uint64_t b = len;
- const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK;
- uint64_t align[4];
- if (unlikely(len > 32)) {
- uint64_t c = rot64(len, s1) + seed;
- uint64_t d = len ^ rot64(seed, s1);
- const void *detent = (const uint8_t *)data + len - 31;
- do {
- const uint64_t *v = (const uint64_t *)data;
- if (unlikely(need_align))
- v = (const uint64_t *)memcpy(&align, v, 32);
- uint64_t w0 = fetch64_le(v + 0);
- uint64_t w1 = fetch64_le(v + 1);
- uint64_t w2 = fetch64_le(v + 2);
- uint64_t w3 = fetch64_le(v + 3);
- uint64_t d02 = w0 ^ rot64(w2 + d, s1);
- uint64_t c13 = w1 ^ rot64(w3 + c, s1);
- c += a ^ rot64(w0, s0);
- d -= b ^ rot64(w1, s2);
- a ^= p1 * (d02 + w3);
- b ^= p0 * (c13 + w2);
- data = (const uint64_t *)data + 4;
- } while (likely(data < detent));
- a ^= p6 * (rot64(c, s1) + d);
- b ^= p5 * (c + rot64(d, s1));
- len &= 31;
- }
- const uint64_t *v = (const uint64_t *)data;
- if (unlikely(need_align) && len > 8)
- v = (const uint64_t *)memcpy(&align, v, len);
- switch (len) {
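- /* Consume the remaining 0..32 bytes: each branch reads one 64-bit
- * word and deliberately falls through to the next, while `default`
- * covers lengths of 25..32; all variants below share this tail
- * pattern. */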
- default:
- b += mux64(fetch64_le(v++), p4);
- case 24:
- case 23:
- case 22:
- case 21:
- case 20:
- case 19:
- case 18:
- case 17:
- a += mux64(fetch64_le(v++), p3);
- case 16:
- case 15:
- case 14:
- case 13:
- case 12:
- case 11:
- case 10:
- case 9:
- b += mux64(fetch64_le(v++), p2);
- case 8:
- case 7:
- case 6:
- case 5:
- case 4:
- case 3:
- case 2:
- case 1:
- a += mux64(tail64_le(v, len), p1);
- case 0:
- return mux64(rot64(a + b, s1), p4) + mix(a ^ b, p0);
- }
- }
- /***************************************************************************/
- static maybe_unused __inline uint64_t fetch64_be(const void *v) {
- #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- return *(const uint64_t *)v;
- #else
- return bswap64(*(const uint64_t *)v);
- #endif
- }
- static maybe_unused __inline uint32_t fetch32_be(const void *v) {
- #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- return *(const uint32_t *)v;
- #else
- return bswap32(*(const uint32_t *)v);
- #endif
- }
- static maybe_unused __inline uint16_t fetch16_be(const void *v) {
- #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- return *(const uint16_t *)v;
- #else
- return bswap16(*(const uint16_t *)v);
- #endif
- }
- static maybe_unused __inline uint64_t tail64_be(const void *v, size_t tail) {
- const uint8_t *p = (const uint8_t *)v;
- switch (tail & 7) {
- #if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- /* For most CPUs this code is better when copying
- * for alignment or byte reordering is not needed. */
- case 1:
- return p[0];
- case 2:
- return fetch16_be(p);
- case 3:
- return (uint32_t)fetch16_be(p) << 8 | p[2];
- case 4:
- return fetch32_be(p);
- case 5:
- return (uint64_t)fetch32_be(p) << 8 | p[4];
- case 6:
- return (uint64_t)fetch32_be(p) << 16 | fetch16_be(p + 4);
- case 7:
- return (uint64_t)fetch32_be(p) << 24 | (uint32_t)fetch16_be(p + 4) << 8 |
- p[6];
- case 0:
- return fetch64_be(p);
- #else
- /* For most CPUs this code is better than
- * copying for alignment and/or byte reordering. */
- case 1:
- return p[0];
- case 2:
- return p[1] | (uint32_t)p[0] << 8;
- case 3:
- return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16;
- case 4:
- return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 |
- (uint32_t)p[0] << 24;
- case 5:
- return p[4] | (uint32_t)p[3] << 8 | (uint32_t)p[2] << 16 |
- (uint32_t)p[1] << 24 | (uint64_t)p[0] << 32;
- case 6:
- return p[5] | (uint32_t)p[4] << 8 | (uint32_t)p[3] << 16 |
- (uint32_t)p[2] << 24 | (uint64_t)p[1] << 32 | (uint64_t)p[0] << 40;
- case 7:
- return p[6] | (uint32_t)p[5] << 8 | (uint32_t)p[4] << 16 |
- (uint32_t)p[3] << 24 | (uint64_t)p[2] << 32 | (uint64_t)p[1] << 40 |
- (uint64_t)p[0] << 48;
- case 0:
- return p[7] | (uint32_t)p[6] << 8 | (uint32_t)p[5] << 16 |
- (uint32_t)p[4] << 24 | (uint64_t)p[3] << 32 | (uint64_t)p[2] << 40 |
- (uint64_t)p[1] << 48 | (uint64_t)p[0] << 56;
- #endif
- }
- unreachable();
- }
- uint64_t t1ha_64be(const void *data, size_t len, uint64_t seed) {
- uint64_t a = seed;
- uint64_t b = len;
- const int need_align = (((uintptr_t)data) & 7) != 0 && !UNALIGNED_OK;
- uint64_t align[4];
- if (unlikely(len > 32)) {
- uint64_t c = rot64(len, s1) + seed;
- uint64_t d = len ^ rot64(seed, s1);
- const void *detent = (const uint8_t *)data + len - 31;
- do {
- const uint64_t *v = (const uint64_t *)data;
- if (unlikely(need_align))
- v = (const uint64_t *)memcpy(&align, v, 32);
- uint64_t w0 = fetch64_be(v + 0);
- uint64_t w1 = fetch64_be(v + 1);
- uint64_t w2 = fetch64_be(v + 2);
- uint64_t w3 = fetch64_be(v + 3);
- uint64_t d02 = w0 ^ rot64(w2 + d, s1);
- uint64_t c13 = w1 ^ rot64(w3 + c, s1);
- c += a ^ rot64(w0, s0);
- d -= b ^ rot64(w1, s2);
- a ^= p1 * (d02 + w3);
- b ^= p0 * (c13 + w2);
- data = (const uint64_t *)data + 4;
- } while (likely(data < detent));
- a ^= p6 * (rot64(c, s1) + d);
- b ^= p5 * (c + rot64(d, s1));
- len &= 31;
- }
- const uint64_t *v = (const uint64_t *)data;
- if (unlikely(need_align) && len > 8)
- v = (const uint64_t *)memcpy(&align, v, len);
- switch (len) {
- default:
- b += mux64(fetch64_be(v++), p4);
- case 24:
- case 23:
- case 22:
- case 21:
- case 20:
- case 19:
- case 18:
- case 17:
- a += mux64(fetch64_be(v++), p3);
- case 16:
- case 15:
- case 14:
- case 13:
- case 12:
- case 11:
- case 10:
- case 9:
- b += mux64(fetch64_be(v++), p2);
- case 8:
- case 7:
- case 6:
- case 5:
- case 4:
- case 3:
- case 2:
- case 1:
- a += mux64(tail64_be(v, len), p1);
- case 0:
- return mux64(rot64(a + b, s1), p4) + mix(a ^ b, p0);
- }
- }
- /***************************************************************************/
- static __inline uint32_t tail32_le(const void *v, size_t tail) {
- const uint8_t *p = (const uint8_t *)v;
- uint32_t r = 0;
- switch (tail & 3) {
- #if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- /* For most CPUs this code is better when copying
- * for alignment or byte reordering is not needed. */
- case 0:
- return fetch32_le(p);
- case 3:
- r = (uint32_t)p[2] << 16;
- case 2:
- return r + fetch16_le(p);
- case 1:
- return p[0];
- #else
- /* For most CPUs this code is better than
- * copying for alignment and/or byte reordering. */
- case 0:
- r += p[3];
- r <<= 8;
- case 3:
- r += p[2];
- r <<= 8;
- case 2:
- r += p[1];
- r <<= 8;
- case 1:
- return r + p[0];
- #endif
- }
- unreachable();
- }
- static __inline uint32_t tail32_be(const void *v, size_t tail) {
- const uint8_t *p = (const uint8_t *)v;
- switch (tail & 3) {
- #if UNALIGNED_OK && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- /* For most CPUs this code is better when copying
- * for alignment or byte reordering is not needed. */
- case 1:
- return p[0];
- case 2:
- return fetch16_be(p);
- case 3:
- return fetch16_be(p) << 8 | p[2];
- case 0:
- return fetch32_be(p);
- #else
- /* For most CPUs this code is better than
- * copying for alignment and/or byte reordering. */
- case 1:
- return p[0];
- case 2:
- return p[1] | (uint32_t)p[0] << 8;
- case 3:
- return p[2] | (uint32_t)p[1] << 8 | (uint32_t)p[0] << 16;
- case 0:
- return p[3] | (uint32_t)p[2] << 8 | (uint32_t)p[1] << 16 |
- (uint32_t)p[0] << 24;
- #endif
- }
- unreachable();
- }
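- /* final avalanche for the 32-bit variants: merge both 32-bit state
- * halves into one 64-bit value, then multiply and xor-shift */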
- static __inline uint64_t remix32(uint32_t a, uint32_t b) {
- a ^= rot32(b, 13);
- uint64_t l = a | (uint64_t)b << 32;
- l *= p0;
- l ^= l >> 41;
- return l;
- }
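- /* 32-bit mixer: the 64-bit product (b + v) * p perturbs *a with its
- * low half and folds its high half back into *b */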
- static __inline void mixup32(uint32_t *a, uint32_t *b, uint32_t v, uint32_t p) {
- uint64_t l = mul_32x32_64(*b + v, p);
- *a ^= (uint32_t)l;
- *b += (uint32_t)(l >> 32);
- }
- /* 32-bit 'magic' primes */
- static const uint32_t q0 = 0x92D78269;
- static const uint32_t q1 = 0xCA9B4735;
- static const uint32_t q2 = 0xA4ABA1C3;
- static const uint32_t q3 = 0xF6499843;
- static const uint32_t q4 = 0x86F0FD61;
- static const uint32_t q5 = 0xCA2DA6FB;
- static const uint32_t q6 = 0xC4BB3575;
- uint64_t t1ha_32le(const void *data, size_t len, uint64_t seed) {
- uint32_t a = rot32((uint32_t)len, s1) + (uint32_t)seed;
- uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32);
- const int need_align = (((uintptr_t)data) & 3) != 0 && !UNALIGNED_OK;
- uint32_t align[4];
- if (unlikely(len > 16)) {
- uint32_t c = ~a;
- uint32_t d = rot32(b, 5);
- const void *detent = (const uint8_t *)data + len - 15;
- do {
- const uint32_t *v = (const uint32_t *)data;
- if (unlikely(need_align))
- v = (const uint32_t *)memcpy(&align, v, 16);
- uint32_t w0 = fetch32_le(v + 0);
- uint32_t w1 = fetch32_le(v + 1);
- uint32_t w2 = fetch32_le(v + 2);
- uint32_t w3 = fetch32_le(v + 3);
- uint32_t c02 = w0 ^ rot32(w2 + c, 11);
- uint32_t d13 = w1 + rot32(w3 + d, s1);
- c ^= rot32(b + w1, 7);
- d ^= rot32(a + w0, 3);
- b = q1 * (c02 + w3);
- a = q0 * (d13 ^ w2);
- data = (const uint32_t *)data + 4;
- } while (likely(data < detent));
- c += a;
- d += b;
- a ^= q6 * (rot32(c, 16) + d);
- b ^= q5 * (c + rot32(d, 16));
- len &= 15;
- }
- const uint8_t *v = (const uint8_t *)data;
- if (unlikely(need_align) && len > 4)
- v = (const uint8_t *)memcpy(&align, v, len);
- switch (len) {
- default:
- mixup32(&a, &b, fetch32_le(v), q4);
- v += 4;
- case 12:
- case 11:
- case 10:
- case 9:
- mixup32(&b, &a, fetch32_le(v), q3);
- v += 4;
- case 8:
- case 7:
- case 6:
- case 5:
- mixup32(&a, &b, fetch32_le(v), q2);
- v += 4;
- case 4:
- case 3:
- case 2:
- case 1:
- mixup32(&b, &a, tail32_le(v, len), q1);
- case 0:
- return remix32(a, b);
- }
- }
- uint64_t t1ha_32be(const void *data, size_t len, uint64_t seed) {
- uint32_t a = rot32((uint32_t)len, s1) + (uint32_t)seed;
- uint32_t b = (uint32_t)len ^ (uint32_t)(seed >> 32);
- const int need_align = (((uintptr_t)data) & 3) != 0 && !UNALIGNED_OK;
- uint32_t align[4];
- if (unlikely(len > 16)) {
- uint32_t c = ~a;
- uint32_t d = rot32(b, 5);
- const void *detent = (const uint8_t *)data + len - 15;
- do {
- const uint32_t *v = (const uint32_t *)data;
- if (unlikely(need_align))
- v = (const uint32_t *)memcpy(&align, v, 16);
- uint32_t w0 = fetch32_be(v + 0);
- uint32_t w1 = fetch32_be(v + 1);
- uint32_t w2 = fetch32_be(v + 2);
- uint32_t w3 = fetch32_be(v + 3);
- uint32_t c02 = w0 ^ rot32(w2 + c, 11);
- uint32_t d13 = w1 + rot32(w3 + d, s1);
- c ^= rot32(b + w1, 7);
- d ^= rot32(a + w0, 3);
- b = q1 * (c02 + w3);
- a = q0 * (d13 ^ w2);
- data = (const uint32_t *)data + 4;
- } while (likely(data < detent));
- c += a;
- d += b;
- a ^= q6 * (rot32(c, 16) + d);
- b ^= q5 * (c + rot32(d, 16));
- len &= 15;
- }
- const uint8_t *v = (const uint8_t *)data;
- if (unlikely(need_align) && len > 4)
- v = (const uint8_t *)memcpy(&align, v, len);
- switch (len) {
- default:
- mixup32(&a, &b, fetch32_be(v), q4);
- v += 4;
- case 12:
- case 11:
- case 10:
- case 9:
- mixup32(&b, &a, fetch32_be(v), q3);
- v += 4;
- case 8:
- case 7:
- case 6:
- case 5:
- mixup32(&a, &b, fetch32_be(v), q2);
- v += 4;
- case 4:
- case 3:
- case 2:
- case 1:
- mixup32(&b, &a, tail32_be(v, len), q1);
- case 0:
- return remix32(a, b);
- }
- }
- /***************************************************************************/
- #if (defined(__x86_64__) && (defined(__SSE4_2__) || __GNUC_PREREQ(4, 4) || \
- __has_attribute(target))) || \
- defined(_M_X64) || defined(_X86_64_)
- #include <nmmintrin.h>
- uint64_t
- #if __GNUC_PREREQ(4, 4) || __has_attribute(target)
- __attribute__((target("sse4.2")))
- #endif
- t1ha_ia32crc(const void *data, size_t len, uint64_t seed) {
- uint64_t a = seed;
- uint64_t b = len;
- const uint64_t *v = (const uint64_t *)data;
- if (unlikely(len > 32)) {
- const void *detent = (const uint8_t *)data + len - 31;
- uint32_t x = (uint32_t)b;
- uint32_t y = (uint32_t)a;
- uint32_t z = (uint32_t)(~a ^ b);
- do {
- uint32_t t = (uint32_t)a + x;
- a = rot64(a, 17) + p0 * v[0];
- x += (uint32_t)_mm_crc32_u64(y, v[1]);
- y += (uint32_t)_mm_crc32_u64(z, v[2]);
- z += (uint32_t)_mm_crc32_u64(t, v[3]);
- v += 4;
- } while (likely(detent > (const void *)v));
- a ^= x * p5 + y * p6 + z;
- b = x + y * p5 + z * p6;
- len &= 31;
- }
- switch (len) {
- default:
- b += mux64(*v++, p4);
- case 24:
- case 23:
- case 22:
- case 21:
- case 20:
- case 19:
- case 18:
- case 17:
- a += mux64(*v++, p3);
- case 16:
- case 15:
- case 14:
- case 13:
- case 12:
- case 11:
- case 10:
- case 9:
- b += mux64(*v++, p2);
- case 8:
- case 7:
- case 6:
- case 5:
- case 4:
- case 3:
- case 2:
- case 1:
- a += mux64(tail64_le(v, len), p1);
- case 0:
- return mux64(rot64(a + b, s1), p4) + mix(a ^ b, p0);
- }
- }
- #endif /* __x86_64__ */
- /***************************************************************************/
- #if defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) || \
- defined(i386) || defined(_X86_) || defined(__i386__) || defined(_X86_64_)
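- /* returns ECX of CPUID leaf 1 (the feature-flag register),
- * or 0 when leaf 1 is not supported */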
- static uint32_t x86_cpu_features(void) {
- #ifdef __GNUC__
- uint32_t eax, ebx, ecx, edx;
- if (__get_cpuid_max(0, NULL) < 1)
- return 0;
- __cpuid_count(1, 0, eax, ebx, ecx, edx);
- return ecx;
- #elif defined(_MSC_VER)
- int info[4];
- __cpuid(info, 0);
- if (info[0] < 1)
- return 0;
- __cpuidex(info, 1, 0);
- return info[2];
- #else
- return 0;
- #endif
- }
- #endif
- #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
- defined(_M_X64) || defined(i386) || defined(_X86_) || defined(_X86_64_)
- #include <emmintrin.h>
- #include <smmintrin.h>
- #include <wmmintrin.h>
- #if defined(__x86_64__) && defined(__ELF__) && \
- (__GNUC_PREREQ(4, 6) || __has_attribute(ifunc)) && __has_attribute(target)
- uint64_t t1ha_ia32aes(const void *data, size_t len, uint64_t seed)
- __attribute__((ifunc("t1ha_aes_resolve")));
- static uint64_t t1ha_ia32aes_avx(const void *data, size_t len, uint64_t seed);
- static uint64_t t1ha_ia32aes_noavx(const void *data, size_t len, uint64_t seed);
- static uint64_t (*t1ha_aes_resolve(void))(const void *, size_t, uint64_t) {
- uint32_t features = x86_cpu_features();
- if ((features & 0x01A000000) == 0x01A000000)
- return t1ha_ia32aes_avx;
- return t1ha_ia32aes_noavx;
- }
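- /* Note: the resolver above tests ECX bits 25 (AES-NI), 27 (OSXSAVE)
- * and 28 (AVX), so the AVX variant is chosen only when the OS
- * actually saves the AVX state. */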
- static uint64_t __attribute__((target("aes,avx")))
- t1ha_ia32aes_avx(const void *data, size_t len, uint64_t seed) {
- uint64_t a = seed;
- uint64_t b = len;
- if (unlikely(len > 32)) {
- __m128i x = _mm_set_epi64x(a, b);
- __m128i y = _mm_aesenc_si128(x, _mm_set_epi64x(p0, p1));
- const __m128i *v = (const __m128i *)data;
- /* round down to whole 16-byte blocks; the size_t cast keeps
- * the mask 64-bit wide on LLP64 targets */
- const __m128i *const detent =
- (const __m128i *)((const uint8_t *)data + (len & ~(size_t)15));
- data = detent;
- if (len & 16) {
- x = _mm_add_epi64(x, _mm_loadu_si128(v++));
- y = _mm_aesenc_si128(x, y);
- }
- len &= 15;
- if (v + 7 < detent) {
- __m128i salt = y;
- do {
- __m128i t = _mm_aesenc_si128(_mm_loadu_si128(v++), salt);
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- salt = _mm_add_epi64(salt, _mm_set_epi64x(p2, p3));
- t = _mm_aesenc_si128(x, t);
- x = _mm_add_epi64(y, x);
- y = t;
- } while (v + 7 < detent);
- }
- while (v < detent) {
- __m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++));
- __m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++));
- x = _mm_aesdec_si128(x, v0y);
- y = _mm_aesdec_si128(y, v1x);
- }
- x = _mm_add_epi64(_mm_aesdec_si128(x, _mm_aesenc_si128(y, x)), y);
- #if defined(__x86_64__) || defined(_M_X64)
- a = _mm_cvtsi128_si64(x);
- #if defined(__SSE4_1__)
- b = _mm_extract_epi64(x, 1);
- #else
- b = _mm_cvtsi128_si64(_mm_unpackhi_epi64(x, x));
- #endif
- #else
- a = (uint32_t)_mm_cvtsi128_si32(x);
- #if defined(__SSE4_1__)
- a |= (uint64_t)_mm_extract_epi32(x, 1) << 32;
- b = (uint32_t)_mm_extract_epi32(x, 2) |
- (uint64_t)_mm_extract_epi32(x, 3) << 32;
- #else
- a |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32;
- x = _mm_unpackhi_epi64(x, x);
- b = (uint32_t)_mm_cvtsi128_si32(x);
- b |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32;
- #endif
- #endif
- }
- const uint64_t *v = (const uint64_t *)data;
- switch (len) {
- default:
- b += mux64(*v++, p4);
- case 24:
- case 23:
- case 22:
- case 21:
- case 20:
- case 19:
- case 18:
- case 17:
- a += mux64(*v++, p3);
- case 16:
- case 15:
- case 14:
- case 13:
- case 12:
- case 11:
- case 10:
- case 9:
- b += mux64(*v++, p2);
- case 8:
- case 7:
- case 6:
- case 5:
- case 4:
- case 3:
- case 2:
- case 1:
- a += mux64(tail64_le(v, len), p1);
- case 0:
- return mux64(rot64(a + b, s1), p4) + mix(a ^ b, p0);
- }
- }
- static uint64_t
- #if __GNUC_PREREQ(4, 4) || __has_attribute(target)
- __attribute__((target("aes")))
- #endif
- t1ha_ia32aes_noavx(const void *data, size_t len, uint64_t seed) {
- #else /* ELF && ifunc */
- uint64_t
- #if __GNUC_PREREQ(4, 4) || __has_attribute(target)
- __attribute__((target("aes")))
- #endif
- t1ha_ia32aes(const void *data, size_t len, uint64_t seed) {
- #endif
- uint64_t a = seed;
- uint64_t b = len;
- if (unlikely(len > 32)) {
- __m128i x = _mm_set_epi64x(a, b);
- __m128i y = _mm_aesenc_si128(x, _mm_set_epi64x(p0, p1));
- const __m128i *v = (const __m128i *)data;
- /* round down to whole 16-byte blocks; the size_t cast keeps
- * the mask 64-bit wide on LLP64 targets */
- const __m128i *const detent =
- (const __m128i *)((const uint8_t *)data + (len & ~(size_t)15));
- data = detent;
- if (len & 16) {
- x = _mm_add_epi64(x, _mm_loadu_si128(v++));
- y = _mm_aesenc_si128(x, y);
- }
- len &= 15;
- if (v + 7 < detent) {
- __m128i salt = y;
- do {
- __m128i t = _mm_aesenc_si128(_mm_loadu_si128(v++), salt);
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- t = _mm_aesdec_si128(t, _mm_loadu_si128(v++));
- salt = _mm_add_epi64(salt, _mm_set_epi64x(p2, p3));
- t = _mm_aesenc_si128(x, t);
- x = _mm_add_epi64(y, x);
- y = t;
- } while (v + 7 < detent);
- }
- while (v < detent) {
- __m128i v0y = _mm_add_epi64(y, _mm_loadu_si128(v++));
- __m128i v1x = _mm_sub_epi64(x, _mm_loadu_si128(v++));
- x = _mm_aesdec_si128(x, v0y);
- y = _mm_aesdec_si128(y, v1x);
- }
- x = _mm_add_epi64(_mm_aesdec_si128(x, _mm_aesenc_si128(y, x)), y);
- #if defined(__x86_64__) || defined(_M_X64)
- a = _mm_cvtsi128_si64(x);
- #if defined(__SSE4_1__)
- b = _mm_extract_epi64(x, 1);
- #else
- b = _mm_cvtsi128_si64(_mm_unpackhi_epi64(x, x));
- #endif
- #else
- a = (uint32_t)_mm_cvtsi128_si32(x);
- #if defined(__SSE4_1__)
- a |= (uint64_t)_mm_extract_epi32(x, 1) << 32;
- b = (uint32_t)_mm_extract_epi32(x, 2) |
- (uint64_t)_mm_extract_epi32(x, 3) << 32;
- #else
- a |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32;
- x = _mm_unpackhi_epi64(x, x);
- b = (uint32_t)_mm_cvtsi128_si32(x);
- b |= (uint64_t)_mm_cvtsi128_si32(_mm_shuffle_epi32(x, 1)) << 32;
- #endif
- #endif
- }
- const uint64_t *v = (const uint64_t *)data;
- switch (len) {
- default:
- b += mux64(*v++, p4);
- case 24:
- case 23:
- case 22:
- case 21:
- case 20:
- case 19:
- case 18:
- case 17:
- a += mux64(*v++, p3);
- case 16:
- case 15:
- case 14:
- case 13:
- case 12:
- case 11:
- case 10:
- case 9:
- b += mux64(*v++, p2);
- case 8:
- case 7:
- case 6:
- case 5:
- case 4:
- case 3:
- case 2:
- case 1:
- a += mux64(tail64_le(v, len), p1);
- case 0:
- return mux64(rot64(a + b, s1), p4) + mix(a ^ b, p0);
- }
- }
- #endif /* __i386__ || __x86_64__ */
- /***************************************************************************/
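- /* Runtime dispatch: prefer AES-NI (ECX bit 25), then SSE4.2 CRC32C
- * (ECX bit 20) on x86; otherwise fall back to the portable 64- or
- * 32-bit generic code, depending on the native word size. */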
- static uint64_t (*t1ha_local_resolve(void))(const void *, size_t, uint64_t) {
- #if defined(__x86_64) || defined(_M_IX86) || defined(_M_X64) || \
- defined(i386) || defined(_X86_) || defined(__i386__) || defined(_X86_64_)
- uint32_t features = x86_cpu_features();
- if (features & (1l << 25))
- return t1ha_ia32aes;
- #if defined(__x86_64) || defined(_M_X64) || defined(_X86_64_)
- if (features & (1l << 20))
- return t1ha_ia32crc;
- #endif
- #endif /* x86 */
- #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
- return (sizeof(long) >= 8) ? t1ha_64be : t1ha_32be;
- #else
- /* t1ha() above is the 64-bit little-endian variant */
- return (sizeof(long) >= 8) ? t1ha : t1ha_32le;
- #endif
- }
- #if defined(__ELF__) && (__GNUC_PREREQ(4, 6) || __has_attribute(ifunc))
- uint64_t t1ha_local(const void *data, size_t len, uint64_t seed)
- __attribute__((ifunc("t1ha_local_resolve")));
- #elif __GNUC_PREREQ(4, 0) || __has_attribute(constructor)
- uint64_t (*t1ha_local_ptr)(const void *, size_t, uint64_t);
- static void __attribute__((constructor)) t1ha_local_init(void) {
- t1ha_local_ptr = t1ha_local_resolve();
- }
- #else /* ELF && ifunc */
- static uint64_t t1ha_local_proxy(const void *data, size_t len, uint64_t seed) {
- t1ha_local_ptr = t1ha_local_resolve();
- return t1ha_local_ptr(data, len, seed);
- }
- uint64_t (*t1ha_local_ptr)(const void *, size_t, uint64_t) = t1ha_local_proxy;
- #endif
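- /*
- * Minimal usage sketch (illustrative only): T1HA_EXAMPLE is a macro
- * introduced here for demonstration and is not part of the library.
- * Build with e.g. `cc -DT1HA_EXAMPLE t1ha.c` to try the hash by hand.
- */
- #ifdef T1HA_EXAMPLE
- #include <stdio.h>
- int main(void) {
- static const char msg[] = "hello, world";
- /* hashing the same buffer with two different seeds yields two
- * unrelated 64-bit values */
- printf("seed=0:  %016llx\n",
- (unsigned long long)t1ha(msg, sizeof(msg) - 1, 0));
- printf("seed=42: %016llx\n",
- (unsigned long long)t1ha(msg, sizeof(msg) - 1, 42));
- return 0;
- }
- #endif /* T1HA_EXAMPLE */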