// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of methods for lossless decoder
//
// Author: Skal (pascal.massimino@gmail.com)

#include "./dsp.h"

#if defined(WEBP_USE_SSE2)

#include "./common_sse2.h"
#include "./lossless.h"
#include "./lossless_common.h"
#include <assert.h>
#include <emmintrin.h>

//------------------------------------------------------------------------------
// Predictor Transform

static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
                                                   uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i V1 = _mm_add_epi16(C0, C1);
  const __m128i V2 = _mm_sub_epi16(V1, C2);
  const __m128i b = _mm_packus_epi16(V2, V2);
  const uint32_t output = _mm_cvtsi128_si32(b);
  return output;
}

static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
                                                   uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i avg = _mm_add_epi16(C1, C0);
  const __m128i A0 = _mm_srli_epi16(avg, 1);
  const __m128i A1 = _mm_sub_epi16(A0, B0);
  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
  const __m128i A3 = _mm_srai_epi16(A2, 1);
  const __m128i A4 = _mm_add_epi16(A0, A3);
  const __m128i A5 = _mm_packus_epi16(A4, A4);
  const uint32_t output = _mm_cvtsi128_si32(A5);
  return output;
}
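
// ClampedAddSubtractHalf() computes, per channel, clamp(a + (a - b) / 2) with
// a = (c0 + c1) >> 1 and b = c2. The BgtA correction mimics C's truncation
// toward zero for the signed division by two (an arithmetic shift alone would
// round toward minus infinity), and _mm_packus_epi16 provides the final
// clamping to [0, 255].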

static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb;
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_cvtsi32_si128(a);
  const __m128i B0 = _mm_cvtsi32_si128(b);
  const __m128i C0 = _mm_cvtsi32_si128(c);
  const __m128i AC0 = _mm_subs_epu8(A0, C0);
  const __m128i CA0 = _mm_subs_epu8(C0, A0);
  const __m128i BC0 = _mm_subs_epu8(B0, C0);
  const __m128i CB0 = _mm_subs_epu8(C0, B0);
  const __m128i AC = _mm_or_si128(AC0, CA0);
  const __m128i BC = _mm_or_si128(BC0, CB0);
  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
  const __m128i diff = _mm_sub_epi16(pb, pa);
  {
    int16_t out[8];
    _mm_storeu_si128((__m128i*)out, diff);
    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  }
  return (pa_minus_pb <= 0) ? a : b;
}
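
// Select() returns whichever of 'a' and 'b' lies closer (in summed absolute
// channel difference) to the per-channel estimate a + b - c, with ties going
// to 'a'. Since |(a + b - c) - a| = |b - c| and |(a + b - c) - b| = |a - c|,
// only the two distances to 'c' are needed; each |x - c| is built from a pair
// of saturated byte subtractions, of which at most one is non-zero.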

static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
                                       const __m128i* const a1,
                                       __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}
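
// The identity above turns the rounding-up average of _mm_avg_epu8 into the
// rounding-down average the predictors need. E.g. for a = 5, b = 6:
// _mm_avg_epu8 gives (5 + 6 + 1) >> 1 = 6, (5 ^ 6) & 1 = 1, and 6 - 1 = 5,
// which matches (5 + 6) >> 1.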

static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
                                        __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i A0 = _mm_cvtsi32_si128(a0);
  const __m128i A1 = _mm_cvtsi32_si128(a1);
  const __m128i avg1 = _mm_avg_epu8(A0, A1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}

static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(A1, A0);
  return _mm_srli_epi16(sum, 1);
}

static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
  __m128i output;
  Average2_uint32(a0, a1, &output);
  return _mm_cvtsi128_si32(output);
}

static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i avg1 = Average2_uint32_16(a0, a2);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(avg1, A1);
  const __m128i avg2 = _mm_srli_epi16(sum, 1);
  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  const uint32_t output = _mm_cvtsi128_si32(A2);
  return output;
}

static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
                                     uint32_t a2, uint32_t a3) {
  const __m128i avg1 = Average2_uint32_16(a0, a1);
  const __m128i avg2 = Average2_uint32_16(a2, a3);
  const __m128i sum = _mm_add_epi16(avg2, avg1);
  const __m128i avg3 = _mm_srli_epi16(sum, 1);
  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
  const uint32_t output = _mm_cvtsi128_si32(A0);
  return output;
}

static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average3(left, top[0], top[1]);
  return pred;
}
static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(left, top[-1]);
  return pred;
}
static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(left, top[0]);
  return pred;
}
static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(top[-1], top[0]);
  (void)left;
  return pred;
}
static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(top[0], top[1]);
  (void)left;
  return pred;
}
static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
  return pred;
}
static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Select(top[0], left, top[-1]);
  return pred;
}
static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
  return pred;
}
static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
  return pred;
}

// Batch versions of those functions.

// Predictor0: ARGB_BLACK.
static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  const __m128i black = _mm_set1_epi32(ARGB_BLACK);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i res = _mm_add_epi8(src, black);
    _mm_storeu_si128((__m128i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
  }
}
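
// In the 'add' direction, the residual in[] plus the prediction gives the
// reconstructed pixel, modulo 256 per channel. For predictor 0 the prediction
// is the constant ARGB_BLACK (opaque black, 0xff000000), so only the alpha
// byte of each residual is offset by 0xff.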

// Predictor1: left.
static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  __m128i prev = _mm_set1_epi32(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // a | b | c | d
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    // 0 | a | b | c
    const __m128i shift0 = _mm_slli_si128(src, 4);
    // a | a + b | b + c | c + d
    const __m128i sum0 = _mm_add_epi8(src, shift0);
    // 0 | 0 | a | a + b
    const __m128i shift1 = _mm_slli_si128(sum0, 8);
    // a | a + b | a + b + c | a + b + c + d
    const __m128i sum1 = _mm_add_epi8(sum0, shift1);
    const __m128i res = _mm_add_epi8(sum1, prev);
    _mm_storeu_si128((__m128i*)&out[i], res);
    // replicate prev output on the four lanes
    prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
  }
}
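
// The two shift-and-add steps above build a per-byte prefix sum (a log-step
// inclusive scan) of the four residuals, so adding the previous output 'prev'
// to every lane yields out[i + k] = prev + in[i] + ... + in[i + k] for
// k = 0..3, all modulo 256 per channel.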

// Macro that adds 32-bit integers from IN using mod 256 arithmetic
// per 8 bit channel.
#define GENERATE_PREDICTOR_1(X, IN)                                           \
static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
                                   int num_pixels, uint32_t* out) {           \
  int i;                                                                      \
  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
    const __m128i other = _mm_loadu_si128((const __m128i*)&(IN));             \
    const __m128i res = _mm_add_epi8(src, other);                             \
    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
  }                                                                           \
  if (i != num_pixels) {                                                      \
    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
  }                                                                           \
}

// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1

// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 5 to 7.
GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)
GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)
GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)

#define GENERATE_PREDICTOR_2(X, IN)                                           \
static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
                                   int num_pixels, uint32_t* out) {           \
  int i;                                                                      \
  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
    const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN));            \
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);             \
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
    __m128i avg, res;                                                         \
    Average2_m128i(&T, &Tother, &avg);                                        \
    res = _mm_add_epi8(avg, src);                                             \
    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
  }                                                                           \
  if (i != num_pixels) {                                                      \
    VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
  }                                                                           \
}

// Predictor8: average TL T.
GENERATE_PREDICTOR_2(8, upper[i - 1])
// Predictor9: average T TR.
GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2

// Predictor10: average of (average of (L,TL), average of (T, TR)).
static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i, j;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
    __m128i avgTTR;
    Average2_m128i(&T, &TR, &avgTTR);
    for (j = 0; j < 4; ++j) {
      __m128i avgLTL, avg;
      Average2_m128i(&L, &TL, &avgLTL);
      Average2_m128i(&avgTTR, &avgLTL, &avg);
      L = _mm_add_epi8(avg, src);
      out[i + j] = _mm_cvtsi128_si32(L);
      // Rotate the pre-computed values for the next iteration.
      avgTTR = _mm_srli_si128(avgTTR, 4);
      TL = _mm_srli_si128(TL, 4);
      src = _mm_srli_si128(src, 4);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
  }
}

// Predictor11: select.
static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
                            __m128i* const out) {
  // We can unpack with any value on the upper 32 bits, provided it's the same
  // on both operands (so that their sum of abs diff is zero). Here we use *A.
  const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
  const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
  const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
  const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
  const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
  const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
  *out = _mm_packs_epi32(s_lo, s_hi);
}

static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i, j;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa;
    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
    for (j = 0; j < 4; ++j) {
      const __m128i L_lo = _mm_unpacklo_epi32(L, L);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
      const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  // pb = sum |L-TL|
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);       // pred = (L > T)? L : T
      L = _mm_add_epi8(src, pred);
      out[i + j] = _mm_cvtsi128_si32(L);
      // Shift the pre-computed value for the next iteration.
      T = _mm_srli_si128(T, 4);
      TL = _mm_srli_si128(TL, 4);
      src = _mm_srli_si128(src, 4);
      pa = _mm_srli_si128(pa, 4);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}

// Predictor12: ClampedAddSubtractFull.
#define DO_PRED12(DIFF, LANE, OUT)                            \
do {                                                          \
  const __m128i all = _mm_add_epi16(L, (DIFF));               \
  const __m128i alls = _mm_packus_epi16(all, all);            \
  const __m128i res = _mm_add_epi8(src, alls);                \
  out[i + (OUT)] = _mm_cvtsi128_si32(res);                    \
  L = _mm_unpacklo_epi8(res, zero);                           \
  /* Shift the pre-computed value for the next iteration.*/   \
  if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8);          \
  src = _mm_srli_si128(src, 4);                               \
} while (0)
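
// DO_PRED12 reconstructs one pixel: DIFF holds (T - TL) for two pixels as
// eight 16-bit lanes, and only its low half is added to the previous output L
// before saturating back to bytes. After the LANE == 0 step, DIFF is shifted
// down by 8 bytes so the next invocation consumes the second pixel's
// differences.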

static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  const __m128i L8 = _mm_cvtsi32_si128(out[-1]);
  __m128i L = _mm_unpacklo_epi8(L8, zero);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // Load 4 pixels at a time.
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
    const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
    const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
    __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
    __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
    DO_PRED12(diff_lo, 0, 0);
    DO_PRED12(diff_lo, 1, 1);
    DO_PRED12(diff_hi, 0, 2);
    DO_PRED12(diff_hi, 1, 3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED12

// Due to averages with integers, values cannot be accumulated in parallel for
// predictor 13.
GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)

//------------------------------------------------------------------------------
// Subtract-Green Transform

static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
                                 uint32_t* dst) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);     // argb
    const __m128i A = _mm_srli_epi16(in, 8);                 // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_add_epi8(in, C);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // fallthrough and finish off with plain-C
  if (i != num_pixels) {
    VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
  }
}
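
// The shuffles above broadcast each pixel's green byte into its red and blue
// byte positions, so a single _mm_add_epi8 undoes the subtract-green transform
// for all four pixels at once. E.g. the pixel 0xff204060 (A=ff, R=20, G=40,
// B=60) becomes 0xff6040a0, with the per-byte additions wrapping modulo 256.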

//------------------------------------------------------------------------------
// Color Transform

static void TransformColorInverse(const VP8LMultipliers* const m,
                                  const uint32_t* const src, int num_pixels,
                                  uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
  const __m128i mults_rb = _mm_set_epi16(
      CST(green_to_red_), CST(green_to_blue_),
      CST(green_to_red_), CST(green_to_blue_),
      CST(green_to_red_), CST(green_to_blue_),
      CST(green_to_red_), CST(green_to_blue_));
  const __m128i mults_b2 = _mm_set_epi16(
      CST(red_to_blue_), 0, CST(red_to_blue_), 0,
      CST(red_to_blue_), 0, CST(red_to_blue_), 0);
#undef CST
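  // Each CST() constant is the signed 8-bit multiplier scaled by 8, and the
  // green (resp. red) samples sit in the high byte of their 16-bit lane, i.e.
  // scaled by 256. _mm_mulhi_epi16 keeps the top 16 bits of the 32-bit
  // product, so the net effect is (multiplier * sample) >> 5, matching the
  // per-channel ColorTransformDelta() used by the C fallback.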
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);       // a  0  g  0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);     // x dr  x db1
    const __m128i E = _mm_add_epi8(in, D);              // x r'  x  b'
    const __m128i F = _mm_slli_epi16(E, 8);             // r' 0  b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);     // x db2  0  0
    const __m128i H = _mm_srli_epi32(G, 8);             // 0  x db2  0
    const __m128i I = _mm_add_epi8(H, F);               // r' x  b'' 0
    const __m128i J = _mm_srli_epi16(I, 8);             // 0  r'  0  b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------
// Color-space conversion functions

static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
                             uint8_t* dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 32) {
    // Load the BGRA buffers.
    __m128i in0 = _mm_loadu_si128(in + 0);
    __m128i in1 = _mm_loadu_si128(in + 1);
    __m128i in2 = _mm_loadu_si128(in + 2);
    __m128i in3 = _mm_loadu_si128(in + 3);
    __m128i in4 = _mm_loadu_si128(in + 4);
    __m128i in5 = _mm_loadu_si128(in + 5);
    __m128i in6 = _mm_loadu_si128(in + 6);
    __m128i in7 = _mm_loadu_si128(in + 7);
    VP8L32bToPlanar(&in0, &in1, &in2, &in3);
    VP8L32bToPlanar(&in4, &in5, &in6, &in7);
    // At this point, in1/in5 contains red only, in2/in6 green only ...
    // Pack the colors in 24b RGB.
    VP8PlanarTo24b(&in1, &in5, &in2, &in6, &in3, &in7);
    _mm_storeu_si128(out + 0, in1);
    _mm_storeu_si128(out + 1, in5);
    _mm_storeu_si128(out + 2, in2);
    _mm_storeu_si128(out + 3, in6);
    _mm_storeu_si128(out + 4, in3);
    _mm_storeu_si128(out + 5, in7);
    in += 8;
    out += 6;
    num_pixels -= 32;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
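
// Each iteration above consumes 8 loads (32 BGRA pixels, 128 bytes) and emits
// 6 stores (96 bytes of packed 24-bit RGB); the planar-split and 24-bit
// packing helpers come from common_sse2.h.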

static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);    // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);    // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);    // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);    // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);   // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);   // r0...r7 | b0...b7
    const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0);    // r0g0r1g1 ... r6g6r7g7
    const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0);    // b0a0b1a1 ... b6a6b7a7
    const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0);  // rgba0|rgba1...
    const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0);  // rgba4|rgba5...
    _mm_storeu_si128(out++, rgba0);
    _mm_storeu_si128(out++, rgba4);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

static void ConvertBGRAToRGBA4444(const uint32_t* src,
                                  int num_pixels, uint8_t* dst) {
  const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
  const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);    // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);    // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);    // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);    // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);   // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);   // r0...r7 | b0...b7
    const __m128i ga1 = _mm_srli_epi16(ga0, 4);         // g0-|g1-|...|a6-|a7-
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);  // -r0|-r1|...|-b6|-a7
    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);  // g0-|g1-|...|a6-|a7-
    const __m128i rgba0 = _mm_or_si128(ga2, rb1);       // rg0..rg7 | ba0..ba7
    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);     // ba0..ba7 | 0
#ifdef WEBP_SWAP_16BIT_CSP
    const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

static void ConvertBGRAToRGB565(const uint32_t* src,
                                int num_pixels, uint8_t* dst) {
  const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
  const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);    // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);    // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);    // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);    // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);   // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);   // r0...r7 | b0...b7
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8);  // -r0..-r7|-b0..-b7
    const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
    const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07);  // g0-...g7-|xx (3b)
    const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
    const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0);  // -g0...-g7|xx (3b)
    const __m128i b0 = _mm_srli_si128(rb1, 8);           // -b0...-b7|0
    const __m128i rg1 = _mm_or_si128(rb1, g_lo2);        // gr0...gr7|xx
    const __m128i b1 = _mm_srli_epi16(b0, 3);
    const __m128i gb1 = _mm_or_si128(b1, g_hi2);         // bg0...bg7|xx
#ifdef WEBP_SWAP_16BIT_CSP
    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);    // rggb0...rggb7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);    // bgrb0...bgrb7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

static void ConvertBGRAToBGR(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
  const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
  const __m128i* in = (const __m128i*)src;
  const uint8_t* const end = dst + num_pixels * 3;
  // the last storel_epi64 below writes 8 bytes starting at offset 18
  while (dst + 26 <= end) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i a0l = _mm_and_si128(bgra0, mask_l);   // bgr0|0|bgr0|0
    const __m128i a4l = _mm_and_si128(bgra4, mask_l);   // bgr0|0|bgr0|0
    const __m128i a0h = _mm_and_si128(bgra0, mask_h);   // 0|bgr0|0|bgr0
    const __m128i a4h = _mm_and_si128(bgra4, mask_h);   // 0|bgr0|0|bgr0
    const __m128i b0h = _mm_srli_epi64(a0h, 8);         // 000b|gr00|000b|gr00
    const __m128i b4h = _mm_srli_epi64(a4h, 8);         // 000b|gr00|000b|gr00
    const __m128i c0 = _mm_or_si128(a0l, b0h);          // rgbrgb00|rgbrgb00
    const __m128i c4 = _mm_or_si128(a4l, b4h);          // rgbrgb00|rgbrgb00
    const __m128i c2 = _mm_srli_si128(c0, 8);
    const __m128i c6 = _mm_srli_si128(c4, 8);
    _mm_storel_epi64((__m128i*)(dst +  0), c0);
    _mm_storel_epi64((__m128i*)(dst +  6), c2);
    _mm_storel_epi64((__m128i*)(dst + 12), c4);
    _mm_storel_epi64((__m128i*)(dst + 18), c6);
    dst += 24;
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitSSE2(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
  VP8LPredictors[5] = Predictor5_SSE2;
  VP8LPredictors[6] = Predictor6_SSE2;
  VP8LPredictors[7] = Predictor7_SSE2;
  VP8LPredictors[8] = Predictor8_SSE2;
  VP8LPredictors[9] = Predictor9_SSE2;
  VP8LPredictors[10] = Predictor10_SSE2;
  VP8LPredictors[11] = Predictor11_SSE2;
  VP8LPredictors[12] = Predictor12_SSE2;
  VP8LPredictors[13] = Predictor13_SSE2;

  VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;
  VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;
  VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;
  VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;
  VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;
  VP8LPredictorsAdd[5] = PredictorAdd5_SSE2;
  VP8LPredictorsAdd[6] = PredictorAdd6_SSE2;
  VP8LPredictorsAdd[7] = PredictorAdd7_SSE2;
  VP8LPredictorsAdd[8] = PredictorAdd8_SSE2;
  VP8LPredictorsAdd[9] = PredictorAdd9_SSE2;
  VP8LPredictorsAdd[10] = PredictorAdd10_SSE2;
  VP8LPredictorsAdd[11] = PredictorAdd11_SSE2;
  VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
  VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;

  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
  VP8LTransformColorInverse = TransformColorInverse;

  VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
  VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
  VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
  VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
}
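
// This initializer only rebinds the function pointers declared in dsp.h; it is
// expected to run once, typically from the generic VP8LDspInit() dispatcher
// after the runtime CPU check reports SSE2 support, before any of the pointers
// above are used.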

#else  // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(VP8LDspInitSSE2)

#endif  // WEBP_USE_SSE2