siphash_sse2.c 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. #include "siphash_impl.h"
  2. /* 0,2,1,3 */
  3. static const packedelem64 siphash_init[2] = {
  4. {{0x736f6d6570736575ull,0x6c7967656e657261ull}},
  5. {{0x646f72616e646f6dull,0x7465646279746573ull}}
  6. };
  7. static const packedelem64 siphash_final = {
  8. {0x0000000000000000ull,0x00000000000000ffull}
  9. };
  10. uint64_t
  11. siphash(const unsigned char key[16], const unsigned char *m, size_t len) {
  12. xmmi k,v02,v20,v13,v11,v33,mi;
  13. uint64_t last7;
  14. uint32_t lo, hi;
  15. size_t i, blocks;
  16. k = _mm_loadu_si128((xmmi *)(key + 0));
  17. v02 = siphash_init[0].v;
  18. v13 = siphash_init[1].v;
  19. v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
  20. v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));
  21. last7 = (uint64_t)(len & 0xff) << 56;
  22. #define sipcompress() \
  23. v11 = v13; \
  24. v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \
  25. v11 = _mm_or_si128(_mm_slli_epi64(v11, 13), _mm_srli_epi64(v11, 64-13)); \
  26. v02 = _mm_add_epi64(v02, v13); \
  27. v33 = _mm_or_si128(_mm_slli_epi64(v33, 16), _mm_srli_epi64(v33, 64-16)); \
  28. v13 = _mm_unpacklo_epi64(v11, v33); \
  29. v13 = _mm_xor_si128(v13, v02); \
  30. v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
  31. v11 = v13; \
  32. v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \
  33. v11 = _mm_or_si128(_mm_slli_epi64(v11, 17), _mm_srli_epi64(v11, 64-17)); \
  34. v20 = _mm_add_epi64(v20, v13); \
  35. v33 = _mm_or_si128(_mm_slli_epi64(v33, 21), _mm_srli_epi64(v33, 64-21)); \
  36. v13 = _mm_unpacklo_epi64(v11, v33); \
  37. v13 = _mm_unpacklo_epi64(v11, v33); \
  38. v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \
  39. v13 = _mm_xor_si128(v13, v20);
  40. for (i = 0, blocks = (len & ~7); i < blocks; i += 8) {
  41. mi = _mm_loadl_epi64((xmmi *)(m + i));
  42. v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  43. sipcompress()
  44. sipcompress()
  45. v02 = _mm_xor_si128(v02, mi);
  46. }
  47. switch (len - blocks) {
  48. case 7: last7 |= (uint64_t)m[i + 6] << 48;
  49. case 6: last7 |= (uint64_t)m[i + 5] << 40;
  50. case 5: last7 |= (uint64_t)m[i + 4] << 32;
  51. case 4: last7 |= (uint64_t)m[i + 3] << 24;
  52. case 3: last7 |= (uint64_t)m[i + 2] << 16;
  53. case 2: last7 |= (uint64_t)m[i + 1] << 8;
  54. case 1: last7 |= (uint64_t)m[i + 0] ;
  55. case 0:
  56. default:;
  57. };
  58. mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),_mm_cvtsi32_si128((uint32_t)(last7 >> 32)));
  59. v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  60. sipcompress()
  61. sipcompress()
  62. v02 = _mm_xor_si128(v02, mi);
  63. v02 = _mm_xor_si128(v02, siphash_final.v);
  64. sipcompress()
  65. sipcompress()
  66. sipcompress()
  67. sipcompress()
  68. v02 = _mm_xor_si128(v02, v13);
  69. v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
  70. lo = _mm_cvtsi128_si32(v02);
  71. hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4));
  72. return ((uint64_t)hi << 32) | lo;
  73. }
  74. uint64_t
  75. siphash13(const unsigned char key[16], const unsigned char *m, size_t len) {
  76. xmmi k,v02,v20,v13,v11,v33,mi;
  77. uint64_t last7;
  78. uint32_t lo, hi;
  79. size_t i, blocks;
  80. k = _mm_loadu_si128((xmmi *)(key + 0));
  81. v02 = siphash_init[0].v;
  82. v13 = siphash_init[1].v;
  83. v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
  84. v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));
  85. last7 = (uint64_t)(len & 0xff) << 56;
  86. for (i = 0, blocks = (len & ~7); i < blocks; i += 8) {
  87. mi = _mm_loadl_epi64((xmmi *)(m + i));
  88. v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  89. sipcompress()
  90. v02 = _mm_xor_si128(v02, mi);
  91. }
  92. switch (len - blocks) {
  93. case 7: last7 |= (uint64_t)m[i + 6] << 48;
  94. case 6: last7 |= (uint64_t)m[i + 5] << 40;
  95. case 5: last7 |= (uint64_t)m[i + 4] << 32;
  96. case 4: last7 |= (uint64_t)m[i + 3] << 24;
  97. case 3: last7 |= (uint64_t)m[i + 2] << 16;
  98. case 2: last7 |= (uint64_t)m[i + 1] << 8;
  99. case 1: last7 |= (uint64_t)m[i + 0] ;
  100. case 0:
  101. default:;
  102. };
  103. mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),_mm_cvtsi32_si128((uint32_t)(last7 >> 32)));
  104. v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  105. sipcompress()
  106. v02 = _mm_xor_si128(v02, mi);
  107. v02 = _mm_xor_si128(v02, siphash_final.v);
  108. sipcompress()
  109. sipcompress()
  110. sipcompress()
  111. v02 = _mm_xor_si128(v02, v13);
  112. v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
  113. lo = _mm_cvtsi128_si32(v02);
  114. hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4));
  115. return ((uint64_t)hi << 32) | lo;
  116. }
  117. #include "halfsiphash.c"