compare_gcc.cc

/*
 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"
#include "libyuv/compare_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

#if defined(__x86_64__)
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
  uint64_t diff = 0u;

  asm volatile(
      // Zero the four partial sums (%3 plus r8/r9/r10).
      "xor %3,%3 \n"
      "xor %%r8,%%r8 \n"
      "xor %%r9,%%r9 \n"
      "xor %%r10,%%r10 \n"

      // Process 32 bytes per loop.
      LABELALIGN
      "1: \n"
      "mov (%0),%%rcx \n"
      "mov 0x8(%0),%%rdx \n"
      "xor (%1),%%rcx \n"
      "xor 0x8(%1),%%rdx \n"
      "popcnt %%rcx,%%rcx \n"
      "popcnt %%rdx,%%rdx \n"
      "mov 0x10(%0),%%rsi \n"
      "mov 0x18(%0),%%rdi \n"
      "xor 0x10(%1),%%rsi \n"
      "xor 0x18(%1),%%rdi \n"
      "popcnt %%rsi,%%rsi \n"
      "popcnt %%rdi,%%rdi \n"
      "add $0x20,%0 \n"
      "add $0x20,%1 \n"
      "add %%rcx,%3 \n"
      "add %%rdx,%%r8 \n"
      "add %%rsi,%%r9 \n"
      "add %%rdi,%%r10 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"

      // Combine the four partial sums.
      "add %%r8, %3 \n"
      "add %%r9, %3 \n"
      "add %%r10, %3 \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "=r"(diff)    // %3
      :
      : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");

  return static_cast<uint32_t>(diff);
}
#else
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
  uint32_t diff = 0u;

  asm volatile(
      // Process 16 bytes per loop.
      LABELALIGN
      "1: \n"
      "mov (%0),%%ecx \n"
      "mov 0x4(%0),%%edx \n"
      "xor (%1),%%ecx \n"
      "xor 0x4(%1),%%edx \n"
      "popcnt %%ecx,%%ecx \n"
      "add %%ecx,%3 \n"
      "popcnt %%edx,%%edx \n"
      "add %%edx,%3 \n"
      "mov 0x8(%0),%%ecx \n"
      "mov 0xc(%0),%%edx \n"
      "xor 0x8(%1),%%ecx \n"
      "xor 0xc(%1),%%edx \n"
      "popcnt %%ecx,%%ecx \n"
      "add %%ecx,%3 \n"
      "popcnt %%edx,%%edx \n"
      "add %%edx,%3 \n"
      "add $0x10,%0 \n"
      "add $0x10,%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "+r"(diff)    // %3
      :
      : "memory", "cc", "ecx", "edx");

  return diff;
}
#endif
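
// For reference, a hedged scalar sketch of what both SSE42 variants above
// compute: the Hamming distance is the count of differing bits, obtained by
// XOR-ing the buffers and popcounting the result. Illustrative only; the
// function name is not part of the upstream file (libyuv's portable fallback
// lives in compare_common.cc), and __builtin_popcount assumes a GCC-compatible
// compiler, which this module already requires.
static inline uint32_t HammingDistanceScalarSketch(const uint8_t* src_a,
                                                   const uint8_t* src_b,
                                                   int count) {
  uint32_t diff = 0u;
  for (int i = 0; i < count; ++i) {
    // Popcount of the XOR gives the number of bits that differ in this byte.
    diff += static_cast<uint32_t>(__builtin_popcount(src_a[i] ^ src_b[i]));
  }
  return diff;
}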
static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
                                 15, 15, 15, 15, 15, 15, 15, 15};
static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};

uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
  uint32_t diff = 0u;

  asm volatile(
      "movdqa %4,%%xmm2 \n"
      "movdqa %5,%%xmm3 \n"
      "pxor %%xmm0,%%xmm0 \n"
      "pxor %%xmm1,%%xmm1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqa (%0),%%xmm4 \n"
      "movdqa 0x10(%0), %%xmm5 \n"
      "pxor (%0,%1), %%xmm4 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "pand %%xmm2,%%xmm6 \n"
      "psrlw $0x4,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "pshufb %%xmm6,%%xmm7 \n"
      "pand %%xmm2,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm6 \n"
      "pshufb %%xmm4,%%xmm6 \n"
      "paddb %%xmm7,%%xmm6 \n"
      "pxor 0x10(%0,%1),%%xmm5 \n"
      "add $0x20,%0 \n"
      "movdqa %%xmm5,%%xmm4 \n"
      "pand %%xmm2,%%xmm5 \n"
      "psrlw $0x4,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "pshufb %%xmm5,%%xmm7 \n"
      "pand %%xmm2,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm5 \n"
      "pshufb %%xmm4,%%xmm5 \n"
      "paddb %%xmm7,%%xmm5 \n"
      "paddb %%xmm5,%%xmm6 \n"
      "psadbw %%xmm1,%%xmm6 \n"
      "paddd %%xmm6,%%xmm0 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"

      "pshufd $0xaa,%%xmm0,%%xmm1 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "movd %%xmm0, %3 \n"
      : "+r"(src_a),       // %0
        "+r"(src_b),       // %1
        "+r"(count),       // %2
        "=r"(diff)         // %3
      : "m"(kNibbleMask),  // %4
        "m"(kBitCount)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");

  return diff;
}
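
// The SSSE3 kernel above (and the AVX2 variant below) counts bits without
// popcnt: each byte of the XOR is split into its low and high nibble, each
// nibble indexes the 16-entry kBitCount table via pshufb, and psadbw then
// horizontally sums the per-byte counts. A hedged scalar sketch of that
// table-lookup idea (illustrative only; not part of the upstream file):
static inline uint32_t HammingDistanceNibbleTableSketch(const uint8_t* src_a,
                                                        const uint8_t* src_b,
                                                        int count) {
  static const uint8_t kBitCountTable[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                             1, 2, 2, 3, 2, 3, 3, 4};
  uint32_t diff = 0u;
  for (int i = 0; i < count; ++i) {
    uint8_t x = src_a[i] ^ src_b[i];
    // Low-nibble popcount plus high-nibble popcount equals the byte popcount.
    diff += kBitCountTable[x & 15] + kBitCountTable[x >> 4];
  }
  return diff;
}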
#ifdef HAS_HAMMINGDISTANCE_AVX2
uint32_t HammingDistance_AVX2(const uint8_t* src_a,
                              const uint8_t* src_b,
                              int count) {
  uint32_t diff = 0u;

  asm volatile(
      "vbroadcastf128 %4,%%ymm2 \n"
      "vbroadcastf128 %5,%%ymm3 \n"
      "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
      "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "vmovdqa (%0),%%ymm4 \n"
      "vmovdqa 0x20(%0), %%ymm5 \n"
      "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
      "vpand %%ymm2,%%ymm4,%%ymm6 \n"
      "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
      "vpand %%ymm2,%%ymm4,%%ymm4 \n"
      "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
      "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
      "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
      "add $0x40,%0 \n"
      "vpand %%ymm2,%%ymm4,%%ymm5 \n"
      "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
      "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
      "vpand %%ymm2,%%ymm4,%%ymm4 \n"
      "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
      "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
      "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
      "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
      "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
      "sub $0x40,%2 \n"
      "jg 1b \n"

      "vpermq $0xb1,%%ymm0,%%ymm1 \n"
      "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xaa,%%ymm0,%%ymm1 \n"
      "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovd %%xmm0, %3 \n"
      "vzeroupper \n"
      : "+r"(src_a),       // %0
        "+r"(src_b),       // %1
        "+r"(count),       // %2
        "=r"(diff)         // %3
      : "m"(kNibbleMask),  // %4
        "m"(kBitCount)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");

  return diff;
}
#endif  // HAS_HAMMINGDISTANCE_AVX2
uint32_t SumSquareError_SSE2(const uint8_t* src_a,
                             const uint8_t* src_b,
                             int count) {
  uint32_t sse;

  asm volatile(
      "pxor %%xmm0,%%xmm0 \n"
      "pxor %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "movdqu (%1),%%xmm2 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "psubusb %%xmm2,%%xmm1 \n"
      "psubusb %%xmm3,%%xmm2 \n"
      "por %%xmm2,%%xmm1 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm5,%%xmm1 \n"
      "punpckhbw %%xmm5,%%xmm2 \n"
      "pmaddwd %%xmm1,%%xmm1 \n"
      "pmaddwd %%xmm2,%%xmm2 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "paddd %%xmm2,%%xmm0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"

      "pshufd $0xee,%%xmm0,%%xmm1 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "pshufd $0x1,%%xmm0,%%xmm1 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "movd %%xmm0,%3 \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "=g"(sse)     // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");

  return sse;
}
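
// A hedged scalar sketch of the computation above (illustrative only, not
// part of the upstream file): the kernel accumulates the sum of squared byte
// differences. In the asm, the absolute difference is formed with two
// saturating subtractions (psubusb) OR'd together, widened to 16 bits, then
// pmaddwd squares and pairwise-adds into 32-bit accumulators.
static inline uint32_t SumSquareErrorScalarSketch(const uint8_t* src_a,
                                                  const uint8_t* src_b,
                                                  int count) {
  uint32_t sse = 0u;
  for (int i = 0; i < count; ++i) {
    int d = src_a[i] - src_b[i];
    sse += static_cast<uint32_t>(d * d);  // accumulate squared difference
  }
  return sse;
}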
static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
static const uvec32 kHashMul0 = {
    0x0c3525e1,  // 33 ^ 15
    0xa3476dc1,  // 33 ^ 14
    0x3b4039a1,  // 33 ^ 13
    0x4f5f0981,  // 33 ^ 12
};
static const uvec32 kHashMul1 = {
    0x30f35d61,  // 33 ^ 11
    0x855cb541,  // 33 ^ 10
    0x040a9121,  // 33 ^ 9
    0x747c7101,  // 33 ^ 8
};
static const uvec32 kHashMul2 = {
    0xec41d4e1,  // 33 ^ 7
    0x4cfa3cc1,  // 33 ^ 6
    0x025528a1,  // 33 ^ 5
    0x00121881,  // 33 ^ 4
};
static const uvec32 kHashMul3 = {
    0x00008c61,  // 33 ^ 3
    0x00000441,  // 33 ^ 2
    0x00000021,  // 33 ^ 1
    0x00000001,  // 33 ^ 0
};
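
// The constants above let the vector code below fold 16 bytes into the
// djb2-style hash in one step: for a 16-byte block,
//   hash' = hash * 33^16 + sum(src[i] * 33^(15 - i)),
// which is exactly 16 applications of hash = hash * 33 + src[i]. A hedged
// scalar sketch of the recurrence the constants unroll (illustrative only;
// not part of the upstream file):
static inline uint32_t HashDjb2ScalarSketch(const uint8_t* src,
                                            int count,
                                            uint32_t seed) {
  uint32_t hash = seed;
  for (int i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];  // djb2 step: multiply by 33, add next byte
  }
  return hash;
}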
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash;

  asm volatile(
      "movd %2,%%xmm0 \n"
      "pxor %%xmm7,%%xmm7 \n"
      "movdqa %4,%%xmm6 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "pmulld %%xmm6,%%xmm0 \n"
      "movdqa %5,%%xmm5 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm7,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm7,%%xmm3 \n"
      "pmulld %%xmm5,%%xmm3 \n"
      "movdqa %6,%%xmm5 \n"
      "movdqa %%xmm2,%%xmm4 \n"
      "punpckhwd %%xmm7,%%xmm4 \n"
      "pmulld %%xmm5,%%xmm4 \n"
      "movdqa %7,%%xmm5 \n"
      "punpckhbw %%xmm7,%%xmm1 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklwd %%xmm7,%%xmm2 \n"
      "pmulld %%xmm5,%%xmm2 \n"
      "movdqa %8,%%xmm5 \n"
      "punpckhwd %%xmm7,%%xmm1 \n"
      "pmulld %%xmm5,%%xmm1 \n"
      "paddd %%xmm4,%%xmm3 \n"
      "paddd %%xmm2,%%xmm1 \n"
      "paddd %%xmm3,%%xmm1 \n"
      "pshufd $0xe,%%xmm1,%%xmm2 \n"
      "paddd %%xmm2,%%xmm1 \n"
      "pshufd $0x1,%%xmm1,%%xmm2 \n"
      "paddd %%xmm2,%%xmm1 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "sub $0x10,%1 \n"
      "jg 1b \n"

      "movd %%xmm0,%3 \n"
      : "+r"(src),        // %0
        "+r"(count),      // %1
        "+rm"(seed),      // %2
        "=g"(hash)        // %3
      : "m"(kHash16x33),  // %4
        "m"(kHashMul0),   // %5
        "m"(kHashMul1),   // %6
        "m"(kHashMul2),   // %7
        "m"(kHashMul3)    // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");

  return hash;
}
#endif  // defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))
#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif