/* siphash_ssse3.c */
#include "siphash_impl.h"
  2. /* 0,2,1,3 */
  3. static const packedelem64 siphash_init[2] = {
  4. {{0x736f6d6570736575ull,0x6c7967656e657261ull}},
  5. {{0x646f72616e646f6dull,0x7465646279746573ull}}
  6. };
  7. static const packedelem64 siphash_final = {
  8. {0x0000000000000000ull,0x00000000000000ffull}
  9. };
  10. static const packedelem8 siphash_rot16v3 = {
  11. {14,15,8,9,10,11,12,13,8,9,10,11,12,13,14,15}
  12. };
  13. uint64_t
  14. siphash(const unsigned char key[16], const unsigned char *m, size_t len) {
  15. xmmi k,v02,v20,v13,v11,v33,mi;
  16. uint64_t last7;
  17. uint32_t lo, hi;
  18. size_t i, blocks;
  19. k = _mm_loadu_si128((xmmi *)(key + 0));
  20. v02 = siphash_init[0].v;
  21. v13 = siphash_init[1].v;
  22. v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
  23. v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));
  24. last7 = (uint64_t)(len & 0xff) << 56;
  25. #define sipcompress() \
  26. v11 = v13; \
  27. v33 = v13; \
  28. v11 = _mm_or_si128(_mm_slli_epi64(v11, 13), _mm_srli_epi64(v11, 64-13)); \
  29. v02 = _mm_add_epi64(v02, v13); \
  30. v33 = _mm_shuffle_epi8(v33, siphash_rot16v3.v); \
  31. v13 = _mm_unpacklo_epi64(v11, v33); \
  32. v13 = _mm_xor_si128(v13, v02); \
  33. v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
  34. v11 = v13; \
  35. v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \
  36. v11 = _mm_or_si128(_mm_slli_epi64(v11, 17), _mm_srli_epi64(v11, 64-17)); \
  37. v20 = _mm_add_epi64(v20, v13); \
  38. v33 = _mm_or_si128(_mm_slli_epi64(v33, 21), _mm_srli_epi64(v33, 64-21)); \
  39. v13 = _mm_unpacklo_epi64(v11, v33); \
  40. v13 = _mm_unpacklo_epi64(v11, v33); \
  41. v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \
  42. v13 = _mm_xor_si128(v13, v20);
  43. for (i = 0, blocks = (len & ~7); i < blocks; i += 8) {
  44. mi = _mm_loadl_epi64((xmmi *)(m + i));
  45. v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  46. sipcompress()
  47. sipcompress()
  48. v02 = _mm_xor_si128(v02, mi);
  49. }
  50. switch (len - blocks) {
  51. case 7: last7 |= (uint64_t)m[i + 6] << 48;
  52. case 6: last7 |= (uint64_t)m[i + 5] << 40;
  53. case 5: last7 |= (uint64_t)m[i + 4] << 32;
  54. case 4: last7 |= (uint64_t)m[i + 3] << 24;
  55. case 3: last7 |= (uint64_t)m[i + 2] << 16;
  56. case 2: last7 |= (uint64_t)m[i + 1] << 8;
  57. case 1: last7 |= (uint64_t)m[i + 0] ;
  58. case 0:
  59. default:;
  60. };
  61. mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),_mm_cvtsi32_si128((uint32_t)(last7 >> 32)));
  62. v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  63. sipcompress()
  64. sipcompress()
  65. v02 = _mm_xor_si128(v02, mi);
  66. v02 = _mm_xor_si128(v02, siphash_final.v);
  67. sipcompress()
  68. sipcompress()
  69. sipcompress()
  70. sipcompress()
  71. v02 = _mm_xor_si128(v02, v13);
  72. v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
  73. lo = _mm_cvtsi128_si32(v02);
  74. hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4));
  75. return ((uint64_t)hi << 32) | lo;
  76. }
  77. uint64_t
  78. siphash13(const unsigned char key[16], const unsigned char *m, size_t len) {
  79. xmmi k,v02,v20,v13,v11,v33,mi;
  80. uint64_t last7;
  81. uint32_t lo, hi;
  82. size_t i, blocks;
  83. k = _mm_loadu_si128((xmmi *)(key + 0));
  84. v02 = siphash_init[0].v;
  85. v13 = siphash_init[1].v;
  86. v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
  87. v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));
  88. last7 = (uint64_t)(len & 0xff) << 56;
  89. for (i = 0, blocks = (len & ~7); i < blocks; i += 8) {
  90. mi = _mm_loadl_epi64((xmmi *)(m + i));
  91. v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  92. sipcompress()
  93. v02 = _mm_xor_si128(v02, mi);
  94. }
  95. switch (len - blocks) {
  96. case 7: last7 |= (uint64_t)m[i + 6] << 48;
  97. case 6: last7 |= (uint64_t)m[i + 5] << 40;
  98. case 5: last7 |= (uint64_t)m[i + 4] << 32;
  99. case 4: last7 |= (uint64_t)m[i + 3] << 24;
  100. case 3: last7 |= (uint64_t)m[i + 2] << 16;
  101. case 2: last7 |= (uint64_t)m[i + 1] << 8;
  102. case 1: last7 |= (uint64_t)m[i + 0] ;
  103. case 0:
  104. default:;
  105. };
  106. mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),_mm_cvtsi32_si128((uint32_t)(last7 >> 32)));
  107. v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  108. sipcompress()
  109. v02 = _mm_xor_si128(v02, mi);
  110. v02 = _mm_xor_si128(v02, siphash_final.v);
  111. sipcompress()
  112. sipcompress()
  113. sipcompress()
  114. v02 = _mm_xor_si128(v02, v13);
  115. v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
  116. lo = _mm_cvtsi128_si32(v02);
  117. hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4));
  118. return ((uint64_t)hi << 32) | lo;
  119. }
  120. #include "halfsiphash.c"
  121. /* slower */
  122. #if 0
  123. uint32_t
  124. halfsiphash(const unsigned char key[16], const unsigned char *m, size_t len) {
  125. xmmi k,v02,v20,v13,v11,v33,mi;
  126. uint32_t last7;
  127. uint32_t lo, hi;
  128. size_t i, blocks;
  129. k = _mm_loadu_si128((xmmi *)(key + 0));
  130. v02 = siphash_init[0].v;
  131. v13 = siphash_init[1].v;
  132. v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
  133. v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));
  134. last7 = (len & 0xff) << 24;
  135. for (i = 0, blocks = (len & ~3); i < blocks; i += 4) {
  136. mi = _mm_loadl_epi64((xmmi *)(m + i));
  137. v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  138. sipcompress()
  139. sipcompress()
  140. v02 = _mm_xor_si128(v02, mi);
  141. }
  142. switch (len - blocks) {
  143. case 3: last7 |= (uint32_t)m[i + 2] << 16;
  144. case 2: last7 |= (uint32_t)m[i + 1] << 8;
  145. case 1: last7 |= (uint32_t)m[i + 0] ;
  146. case 0:
  147. default:;
  148. };
  149. mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128(last7),_mm_cvtsi32_si128(0));
  150. v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  151. sipcompress()
  152. sipcompress()
  153. v02 = _mm_xor_si128(v02, mi);
  154. v02 = _mm_xor_si128(v02, siphash_final.v);
  155. sipcompress()
  156. sipcompress()
  157. sipcompress()
  158. sipcompress()
  159. v02 = _mm_xor_si128(v02, v13);
  160. v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
  161. lo = _mm_cvtsi128_si32(v02);
  162. return lo;
  163. }
  164. #endif