compare_win.cc 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
  1. /*
  2. * Copyright 2012 The LibYuv Project Authors. All rights reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include "libyuv/basic_types.h"
  11. #include "libyuv/compare_row.h"
  12. #include "libyuv/row.h"
  13. #if defined(_MSC_VER)
  14. #include <intrin.h> // For __popcnt
  15. #endif
  16. #ifdef __cplusplus
  17. namespace libyuv {
  18. extern "C" {
  19. #endif
  20. // This module is for 32-bit x86 builds with Visual C++ or clang-cl.
  21. #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
  22. uint32_t HammingDistance_SSE42(const uint8_t* src_a,
  23. const uint8_t* src_b,
  24. int count) {
  25. uint32_t diff = 0u;
  26. int i;
  27. for (i = 0; i < count - 3; i += 4) {
  28. uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT
  29. src_a += 4;
  30. src_b += 4;
  31. diff += __popcnt(x);
  32. }
  33. return diff;
  34. }
  // Sum of squared differences between the bytes of src_a and src_b (SSE2).
  // Every pass of the loop reads 16 bytes from each input, and the loop test
  // sits at the bottom, so one pass always runs and a count that is not a
  // multiple of 16 still causes a full 16-byte read on the last pass.
  // Partial sums are kept in four 32-bit lanes of xmm0 and folded at the end.
  // Naked function: the arguments are read directly from the stack via esp and
  // the result is returned in eax.
  35. __declspec(naked) uint32_t
  36. SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
  37. __asm {
  38. mov eax, [esp + 4] // src_a
  39. mov edx, [esp + 8] // src_b
  40. mov ecx, [esp + 12] // count
  41. pxor xmm0, xmm0 // sum
  42. pxor xmm5, xmm5 // constant 0 for unpck
  43. wloop:
  44. movdqu xmm1, [eax] // 16 bytes of src_a
  45. lea eax, [eax + 16]
  46. movdqu xmm2, [edx] // 16 bytes of src_b
  47. lea edx, [edx + 16]
  48. movdqa xmm3, xmm1 // abs trick
  49. psubusb xmm1, xmm2 // saturating a - b: 0 where b >= a
  50. psubusb xmm2, xmm3 // saturating b - a: 0 where a >= b
  51. por xmm1, xmm2 // |a - b| per byte
  52. movdqa xmm2, xmm1
  53. punpcklbw xmm1, xmm5 // low 8 differences widened to u16
  54. punpckhbw xmm2, xmm5 // high 8 differences widened to u16
  55. pmaddwd xmm1, xmm1 // square + add adjacent pairs to u32
  56. pmaddwd xmm2, xmm2
  57. paddd xmm0, xmm1
  58. paddd xmm0, xmm2
  59. sub ecx, 16
  60. jg wloop
  // Horizontal add of the four u32 lanes of xmm0 into lane 0.
  61. pshufd xmm1, xmm0, 0xee // lanes 3, 2 onto 1, 0
  62. paddd xmm0, xmm1
  63. pshufd xmm1, xmm0, 0x01 // lane 1 onto 0
  64. paddd xmm0, xmm1
  65. movd eax, xmm0 // return sum
  66. ret
  67. }
  68. }
  69. // Visual C 2012 required for AVX2.
  70. #if _MSC_VER >= 1700
  71. // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
  72. #pragma warning(disable : 4752)
  // Sum of squared differences between the bytes of src_a and src_b (AVX2).
  // Every pass of the loop reads 32 bytes from each input, and the loop test
  // sits at the bottom, so one pass always runs and a count that is not a
  // multiple of 32 still causes a full 32-byte read on the last pass.
  // Partial sums are kept in eight 32-bit lanes of ymm0 and folded at the end.
  // Naked function: the arguments are read directly from the stack via esp and
  // the result is returned in eax.
  73. __declspec(naked) uint32_t
  74. SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
  75. __asm {
  76. mov eax, [esp + 4] // src_a
  77. mov edx, [esp + 8] // src_b
  78. mov ecx, [esp + 12] // count
  79. vpxor ymm0, ymm0, ymm0 // sum
  80. vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
  // edx becomes the byte offset from src_a to src_b, so [eax + edx] addresses
  // src_b and only eax has to be advanced inside the loop.
  81. sub edx, eax
  82. wloop:
  83. vmovdqu ymm1, [eax] // 32 bytes of src_a
  84. vmovdqu ymm2, [eax + edx] // 32 bytes of src_b
  85. lea eax, [eax + 32]
  86. vpsubusb ymm3, ymm1, ymm2 // abs difference trick
  87. vpsubusb ymm2, ymm2, ymm1
  88. vpor ymm1, ymm2, ymm3 // |a - b| per byte
  // The unpacks work per 128-bit lane, so element order changes; that is
  // harmless here because everything is summed.
  89. vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
  90. vpunpckhbw ymm1, ymm1, ymm5
  91. vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
  92. vpmaddwd ymm1, ymm1, ymm1
  93. vpaddd ymm0, ymm0, ymm1
  94. vpaddd ymm0, ymm0, ymm2
  95. sub ecx, 32
  96. jg wloop
  // Horizontal add of the eight u32 lanes of ymm0 into lane 0.
  97. vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
  98. vpaddd ymm0, ymm0, ymm1
  99. vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
  100. vpaddd ymm0, ymm0, ymm1
  101. vpermq ymm1, ymm0, 0x02 // high + low lane.
  102. vpaddd ymm0, ymm0, ymm1
  103. vmovd eax, xmm0 // return sum
  104. vzeroupper // clear the upper halves of the ymm registers before returning
  105. ret
  106. }
  107. }
  108. #endif // _MSC_VER >= 1700
  109. uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
  110. uvec32 kHashMul0 = {
  111. 0x0c3525e1, // 33 ^ 15
  112. 0xa3476dc1, // 33 ^ 14
  113. 0x3b4039a1, // 33 ^ 13
  114. 0x4f5f0981, // 33 ^ 12
  115. };
  116. uvec32 kHashMul1 = {
  117. 0x30f35d61, // 33 ^ 11
  118. 0x855cb541, // 33 ^ 10
  119. 0x040a9121, // 33 ^ 9
  120. 0x747c7101, // 33 ^ 8
  121. };
  122. uvec32 kHashMul2 = {
  123. 0xec41d4e1, // 33 ^ 7
  124. 0x4cfa3cc1, // 33 ^ 6
  125. 0x025528a1, // 33 ^ 5
  126. 0x00121881, // 33 ^ 4
  127. };
  128. uvec32 kHashMul3 = {
  129. 0x00008c61, // 33 ^ 3
  130. 0x00000441, // 33 ^ 2
  131. 0x00000021, // 33 ^ 1
  132. 0x00000001, // 33 ^ 0
  133. };
  // DJB2 hash of src, starting from seed, processed 16 bytes per pass (SSE4.1).
  // For each group of bytes b0..b15 the pass computes
  //   hash = hash * 33^16 + b0 * 33^15 + ... + b15 * 33^0   (mod 2^32)
  // using the kHash16x33 / kHashMul0..3 tables.
  // The loop test sits at the bottom, so one pass always runs and a count that
  // is not a multiple of 16 still causes a full 16-byte read on the last pass.
  // Naked function: the arguments are read directly from the stack via esp and
  // the result is returned in eax.
  // NOTE(review): the movdqa table loads need 16-byte aligned operands; this
  // presumably relies on how uvec32 is declared - confirm in its header.
  134. __declspec(naked) uint32_t
  135. HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
  136. __asm {
  137. mov eax, [esp + 4] // src
  138. mov ecx, [esp + 8] // count
  139. movd xmm0, [esp + 12] // seed
  140. pxor xmm7, xmm7 // constant 0 for unpck
  141. movdqa xmm6, xmmword ptr kHash16x33
  142. wloop:
  143. movdqu xmm1, [eax] // src[0-15]
  144. lea eax, [eax + 16]
  // Only dword 0 of xmm0 carries the hash. The other dwords may pick up
  // leftovers from the reduction below, but kHash16x33 is zero there, so this
  // multiply clears them again on the next pass.
  145. pmulld xmm0, xmm6 // hash *= 33 ^ 16
  146. movdqa xmm5, xmmword ptr kHashMul0
  147. movdqa xmm2, xmm1
  148. punpcklbw xmm2, xmm7 // src[0-7]
  149. movdqa xmm3, xmm2
  150. punpcklwd xmm3, xmm7 // src[0-3]
  151. pmulld xmm3, xmm5 // src[0-3] * 33 ^ (15..12)
  152. movdqa xmm5, xmmword ptr kHashMul1
  153. movdqa xmm4, xmm2
  154. punpckhwd xmm4, xmm7 // src[4-7]
  155. pmulld xmm4, xmm5 // src[4-7] * 33 ^ (11..8)
  156. movdqa xmm5, xmmword ptr kHashMul2
  157. punpckhbw xmm1, xmm7 // src[8-15]
  158. movdqa xmm2, xmm1
  159. punpcklwd xmm2, xmm7 // src[8-11]
  160. pmulld xmm2, xmm5 // src[8-11] * 33 ^ (7..4)
  161. movdqa xmm5, xmmword ptr kHashMul3
  162. punpckhwd xmm1, xmm7 // src[12-15]
  163. pmulld xmm1, xmm5 // src[12-15] * 33 ^ (3..0)
  164. paddd xmm3, xmm4 // add 16 results
  165. paddd xmm1, xmm2
  166. paddd xmm1, xmm3
  167. pshufd xmm2, xmm1, 0x0e // upper 2 dwords
  168. paddd xmm1, xmm2
  169. pshufd xmm2, xmm1, 0x01 // dword 1 onto dword 0
  170. paddd xmm1, xmm2
  171. paddd xmm0, xmm1 // hash += weighted byte sum
  172. sub ecx, 16
  173. jg wloop
  174. movd eax, xmm0 // return hash
  175. ret
  176. }
  177. }
  178. // Visual C 2012 required for AVX2.
  179. #if _MSC_VER >= 1700
  // DJB2 hash of src, starting from seed, processed 16 bytes per pass using
  // VEX-encoded instructions on 128-bit xmm registers.
  // For each group of bytes b0..b15 the pass computes
  //   hash = hash * 33^16 + b0 * 33^15 + ... + b15 * 33^0   (mod 2^32)
  // using the kHash16x33 / kHashMul0..3 tables.
  // The loop test sits at the bottom, so one pass always runs and a count that
  // is not a multiple of 16 still causes a full 16-byte read on the last pass.
  // Naked function: the arguments are read directly from the stack via esp and
  // the result is returned in eax.
  180. __declspec(naked) uint32_t
  181. HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
  182. __asm {
  183. mov eax, [esp + 4] // src
  184. mov ecx, [esp + 8] // count
  185. vmovd xmm0, [esp + 12] // seed
  186. wloop:
  // Each vpmovzxbd loads 4 bytes and zero-extends them to 4 dwords, so no
  // separate unpack steps are needed.
  187. vpmovzxbd xmm3, [eax] // src[0-3]
  // Only dword 0 of xmm0 carries the hash. The other dwords may pick up
  // leftovers from the reduction below, but kHash16x33 is zero there, so this
  // multiply clears them again on the next pass.
  188. vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
  189. vpmovzxbd xmm4, [eax + 4] // src[4-7]
  190. vpmulld xmm3, xmm3, xmmword ptr kHashMul0
  191. vpmovzxbd xmm2, [eax + 8] // src[8-11]
  192. vpmulld xmm4, xmm4, xmmword ptr kHashMul1
  193. vpmovzxbd xmm1, [eax + 12] // src[12-15]
  194. vpmulld xmm2, xmm2, xmmword ptr kHashMul2
  195. lea eax, [eax + 16]
  196. vpmulld xmm1, xmm1, xmmword ptr kHashMul3
  197. vpaddd xmm3, xmm3, xmm4 // add 16 results
  198. vpaddd xmm1, xmm1, xmm2
  199. vpaddd xmm1, xmm1, xmm3
  200. vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
  201. vpaddd xmm1, xmm1,xmm2
  202. vpshufd xmm2, xmm1, 0x01 // dword 1 onto dword 0
  203. vpaddd xmm1, xmm1, xmm2
  204. vpaddd xmm0, xmm0, xmm1 // hash += weighted byte sum
  205. sub ecx, 16
  206. jg wloop
  207. vmovd eax, xmm0 // return hash
  208. vzeroupper // clear the upper halves of the ymm registers before returning
  209. ret
  210. }
  211. }
  212. #endif // _MSC_VER >= 1700
  213. #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
  214. #ifdef __cplusplus
  215. } // extern "C"
  216. } // namespace libyuv
  217. #endif