// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA version of encoder dsp functions.
//
// Author: Prashant Patil (prashant.patil@imgtec.com)

#include "./dsp.h"

#if defined(WEBP_USE_MSA)

#include <stdlib.h>
#include "./msa_macro.h"
#include "../enc/vp8i_enc.h"

//------------------------------------------------------------------------------
// Transforms

#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
  v4i32 a1_m, b1_m, c1_m, d1_m;                                     \
  const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091);              \
  const v4i32 sinpi8sqrt2 = __msa_fill_w(35468);                    \
  v4i32 c_tmp1_m = in1 * sinpi8sqrt2;                               \
  v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1;                         \
  v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1;                         \
  v4i32 d_tmp2_m = in3 * sinpi8sqrt2;                               \
                                                                    \
  ADDSUB2(in0, in2, a1_m, b1_m);                                    \
  SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16);                               \
  c_tmp2_m = c_tmp2_m + in3;                                        \
  c1_m = c_tmp1_m - c_tmp2_m;                                       \
  SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16);                               \
  d_tmp1_m = d_tmp1_m + in1;                                        \
  d1_m = d_tmp1_m + d_tmp2_m;                                       \
  BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);      \
} while (0)
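
// The two multipliers above are the VP8 inverse-DCT constants in 16.16
// fixed point:
//   20091 ~= (sqrt(2) * cos(pi / 8) - 1) * 65536
//   35468 ~=  sqrt(2) * sin(pi / 8)      * 65536
// IDCT_1D_W performs one 1-D pass; ITransformOne below runs it on the rows,
// transposes, runs it on the columns, rounds with a >> 3 (SRARI) and adds
// the result back onto the prediction 'ref' with clamping to [0, 255].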

static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
                                      uint8_t* dst) {
  v8i16 input0, input1;
  v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
  v4i32 res0, res1, res2, res3;
  v16i8 dest0, dest1, dest2, dest3;
  const v16i8 zero = { 0 };

  LD_SH2(in, 8, input0, input1);
  UNPCK_SH_SW(input0, in0, in1);
  UNPCK_SH_SW(input1, in2, in3);
  IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
  TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
  IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
  SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
  TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
  LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);
  ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
             res0, res1, res2, res3);
  ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
             res0, res1, res2, res3);
  ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
  CLIP_SW4_0_255(res0, res1, res2, res3);
  PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
  res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
  ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
}

static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                       int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two) {
    ITransformOne(ref + 4, in + 16, dst + 4);
  }
}

static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  uint64_t out0, out1, out2, out3;
  uint32_t in0, in1, in2, in3;
  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  v8i16 t0, t1, t2, t3;
  v16u8 srcl0, srcl1, src0, src1;
  const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
  const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
  const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
  const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
  const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };

  LW4(src, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src0);
  LW4(ref, BPS, in0, in1, in2, in3);
  INSERT_W4_UB(in0, in1, in2, in3, src1);
  ILVRL_B2_UB(src0, src1, srcl0, srcl1);
  HSUB_UB2_SH(srcl0, srcl1, t0, t1);
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  t0 = SRLI_H(t0, 3);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  FILL_W2_SW(1812, 937, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 9);
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
  ADDSUB2(t2, t3, t0, t1);
  VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
  tmp0 = __msa_hadd_s_w(t3, t3);
  tmp2 = __msa_hsub_s_w(t3, t3);
  ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
  SRAI_W2_SW(tmp0, tmp2, 4);
  FILL_W2_SW(12000, 51000, tmp1, tmp3);
  DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
  SRAI_W2_SW(tmp1, tmp3, 16);
  UNPCK_R_SH_SW(t1, tmp4);
  tmp5 = __msa_ceqi_w(tmp4, 0);
  tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
  tmp5 = __msa_fill_w(1);
  tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
  tmp1 += tmp5;
  PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
  out0 = __msa_copy_s_d((v2i64)t0, 0);
  out1 = __msa_copy_s_d((v2i64)t0, 1);
  out2 = __msa_copy_s_d((v2i64)t1, 0);
  out3 = __msa_copy_s_d((v2i64)t1, 1);
  SD4(out0, out1, out2, out3, out, 8);
}
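
// For reference, the portable C forward transform computes, per row of the
// 4x4 difference block d[] = src[] - ref[]:
//   a0 = d0 + d3;  a1 = d1 + d2;  a2 = d1 - d2;  a3 = d0 - d3;
//   tmp[0] = (a0 + a1) * 8;
//   tmp[1] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
//   tmp[2] = (a0 - a1) * 8;
//   tmp[3] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
// and then repeats the butterfly down the columns with rounders (+7) >> 4
// and (+12000 / +51000) >> 16. The __msa_ceqi_w/nor/and sequence above adds
// the final '+ (a3 != 0)' correction to out[4..7].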

static void FTransformWHT(const int16_t* in, int16_t* out) {
  v8i16 in0 = { 0 };
  v8i16 in1 = { 0 };
  v8i16 tmp0, tmp1, tmp2, tmp3;
  v8i16 out0, out1;
  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };

  in0 = __msa_insert_h(in0, 0, in[  0]);
  in0 = __msa_insert_h(in0, 1, in[ 64]);
  in0 = __msa_insert_h(in0, 2, in[128]);
  in0 = __msa_insert_h(in0, 3, in[192]);
  in0 = __msa_insert_h(in0, 4, in[ 16]);
  in0 = __msa_insert_h(in0, 5, in[ 80]);
  in0 = __msa_insert_h(in0, 6, in[144]);
  in0 = __msa_insert_h(in0, 7, in[208]);
  in1 = __msa_insert_h(in1, 0, in[ 48]);
  in1 = __msa_insert_h(in1, 1, in[112]);
  in1 = __msa_insert_h(in1, 2, in[176]);
  in1 = __msa_insert_h(in1, 3, in[240]);
  in1 = __msa_insert_h(in1, 4, in[ 32]);
  in1 = __msa_insert_h(in1, 5, in[ 96]);
  in1 = __msa_insert_h(in1, 6, in[160]);
  in1 = __msa_insert_h(in1, 7, in[224]);
  ADDSUB2(in0, in1, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
  ADDSUB2(in0, in1, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  ADDSUB2(tmp2, tmp3, out0, out1);
  SRAI_H2_SH(out0, out1, 1);
  ST_SH2(out0, out1, out, 8);
}
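
// The inputs here are the DC terms of the sixteen luma 4x4 blocks, spaced
// 16 coefficients apart (in[0], in[16], ..., in[240], gathered out of order
// above); the output is their 4x4 Walsh-Hadamard transform with a final
// >> 1, matching the portable C version of FTransformWHT.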

static int TTransform(const uint8_t* in, const uint16_t* w) {
  int sum;
  uint32_t in0_m, in1_m, in2_m, in3_m;
  v16i8 src0;
  v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
  v4i32 dst0, dst1;
  const v16i8 zero = { 0 };
  const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
  const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
  const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };

  LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);
  INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
  ILVRL_B2_SH(zero, src0, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
  ADDSUB2(in0, in1, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
  ADDSUB2(in0, in1, tmp0, tmp1);
  VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
  ADDSUB2(tmp2, tmp3, tmp0, tmp1);
  tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);
  tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);
  LD_SH2(w, 8, tmp2, tmp3);
  DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);
  dst0 = dst0 + dst1;
  sum = HADD_SW_S32(dst0);
  return sum;
}

static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  const int sum1 = TTransform(a, w);
  const int sum2 = TTransform(b, w);
  return abs(sum2 - sum1) >> 5;
}

static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
                      const uint16_t* const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4(a + x + y, b + x + y, w);
    }
  }
  return D;
}
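
// TTransform returns the sum of absolute values of the 4x4 Hadamard
// transform of a block, weighted by w[]. Disto4x4 is the (>> 5 scaled)
// difference of that measure between the two blocks, and Disto16x16 simply
// accumulates it over the sixteen 4x4 sub-blocks of a macroblock.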

//------------------------------------------------------------------------------
// Histogram

static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      v8i16 coeff0, coeff1;
      const v8i16 zero = { 0 };
      const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);
      LD_SH2(&out[0], 8, coeff0, coeff1);
      coeff0 = __msa_add_a_h(coeff0, zero);
      coeff1 = __msa_add_a_h(coeff1, zero);
      SRAI_H2_SH(coeff0, coeff1, 3);
      coeff0 = __msa_min_s_h(coeff0, max_coeff_thr);
      coeff1 = __msa_min_s_h(coeff1, max_coeff_thr);
      ST_SH2(coeff0, coeff1, &out[0], 8);
      for (k = 0; k < 16; ++k) {
        ++distribution[out[k]];
      }
    }
  }
  VP8SetHistogramData(distribution, histo);
}

//------------------------------------------------------------------------------
// Intra predictions

// luma 4x4 prediction

#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
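
// In the 4x4 predictors below, 'top' points into the boundary samples so
// that top[0], top[1], ... are the pixels above the block, top[-1] is the
// top-left corner and top[-2]..top[-5] are the left column (I, J, K, L),
// which is why the horizontal modes read negative offsets.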

static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {    // vertical
  const uint64_t val_m = LD(top - 1);
  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
  const v16u8 B = SLDI_UB(A, A, 1);
  const v16u8 C = SLDI_UB(A, A, 2);
  const v16u8 AC = __msa_ave_u_b(A, C);
  const v16u8 B2 = __msa_ave_u_b(B, B);
  const v16u8 R = __msa_aver_u_b(AC, B2);
  const uint32_t out = __msa_copy_s_w((v4i32)R, 0);
  SW4(out, out, out, out, dst, BPS);
}

static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {    // horizontal
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}

static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
  uint32_t dc = 4;
  int i;
  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
  dc >>= 3;
  dc = dc | (dc << 8) | (dc << 16) | (dc << 24);
  SW4(dc, dc, dc, dc, dst, BPS);
}

static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
  const uint64_t val_m = LD(top - 5);
  const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
  const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
  const v16u8 B = SLDI_UB(A, A, 1);
  const v16u8 C = SLDI_UB(A, A, 2);
  const v16u8 AC = __msa_ave_u_b(A, C);
  const v16u8 B2 = __msa_ave_u_b(B, B);
  const v16u8 R0 = __msa_aver_u_b(AC, B2);
  const v16u8 R1 = SLDI_UB(R0, R0, 1);
  const v16u8 R2 = SLDI_UB(R1, R1, 1);
  const v16u8 R3 = SLDI_UB(R2, R2, 1);
  const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
  const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
  const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
  const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
  SW4(val3, val2, val1, val0, dst, BPS);
}

static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
  const uint64_t val_m = LD(top);
  const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
  const v16u8 B = SLDI_UB(A, A, 1);
  const v16u8 C1 = SLDI_UB(A, A, 2);
  const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
  const v16u8 AC = __msa_ave_u_b(A, C);
  const v16u8 B2 = __msa_ave_u_b(B, B);
  const v16u8 R0 = __msa_aver_u_b(AC, B2);
  const v16u8 R1 = SLDI_UB(R0, R0, 1);
  const v16u8 R2 = SLDI_UB(R1, R1, 1);
  const v16u8 R3 = SLDI_UB(R2, R2, 1);
  const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
  const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
  const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
  const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
  SW4(val0, val1, val2, val3, dst, BPS);
}

static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  DST(0, 0) = DST(1, 2) = AVG2(X, A);
  DST(1, 0) = DST(2, 2) = AVG2(A, B);
  DST(2, 0) = DST(3, 2) = AVG2(B, C);
  DST(3, 0)             = AVG2(C, D);
  DST(0, 3)             = AVG3(K, J, I);
  DST(0, 2)             = AVG3(J, I, X);
  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
  DST(3, 1)             = AVG3(B, C, D);
}

static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  const int E = top[4];
  const int F = top[5];
  const int G = top[6];
  const int H = top[7];
  DST(0, 0)             = AVG2(A, B);
  DST(1, 0) = DST(0, 2) = AVG2(B, C);
  DST(2, 0) = DST(1, 2) = AVG2(C, D);
  DST(3, 0) = DST(2, 2) = AVG2(D, E);
  DST(0, 1)             = AVG3(A, B, C);
  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
              DST(3, 2) = AVG3(E, F, G);
              DST(3, 3) = AVG3(F, G, H);
}

static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  DST(0, 0)             = AVG2(I, J);
  DST(2, 0) = DST(0, 1) = AVG2(J, K);
  DST(2, 1) = DST(0, 2) = AVG2(K, L);
  DST(1, 0)             = AVG3(I, J, K);
  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
  DST(3, 2) = DST(2, 2) =
      DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}

static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  DST(0, 0) = DST(2, 1) = AVG2(I, X);
  DST(0, 1) = DST(2, 2) = AVG2(J, I);
  DST(0, 2) = DST(2, 3) = AVG2(K, J);
  DST(0, 3)             = AVG2(L, K);
  DST(3, 0)             = AVG3(A, B, C);
  DST(2, 0)             = AVG3(X, A, B);
  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
  DST(1, 3)             = AVG3(L, K, J);
}

static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
  const v16i8 zero = { 0 };
  const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
  const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
  const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]);
  const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]);
  const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]);
  const v16u8 T1 = LD_UB(top);
  const v8i16 T  = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
  const v8i16 d  = T - TL;
  v8i16 r0, r1, r2, r3;
  ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
  CLIP_SH4_0_255(r0, r1, r2, r3);
  PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
}

#undef DST
#undef AVG3
#undef AVG2

static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
  HE4(I4HE4 + dst, top);
  RD4(I4RD4 + dst, top);
  VR4(I4VR4 + dst, top);
  LD4(I4LD4 + dst, top);
  VL4(I4VL4 + dst, top);
  HD4(I4HD4 + dst, top);
  HU4(I4HU4 + dst, top);
}
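
// I4DC4, I4TM4, ... are byte offsets into the shared prediction buffer
// (declared in vp8i_enc.h), so the ten 4x4 prediction modes end up stored
// side by side in 'dst' for the mode-selection pass.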

// luma 16x16 prediction

#define STORE16x16(out, dst) do {                                        \
  ST_UB8(out, out, out, out, out, out, out, out, dst + 0 * BPS, BPS);    \
  ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);    \
} while (0)

static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
  if (top != NULL) {
    const v16u8 out = LD_UB(top);
    STORE16x16(out, dst);
  } else {
    const v16u8 out = (v16u8)__msa_fill_b(0x7f);
    STORE16x16(out, dst);
  }
}

static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
                                            const uint8_t* left) {
  if (left != NULL) {
    int j;
    for (j = 0; j < 16; j += 4) {
      const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
      const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
      const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
      const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
      ST_UB4(L0, L1, L2, L3, dst, BPS);
      dst += 4 * BPS;
      left += 4;
    }
  } else {
    const v16u8 out = (v16u8)__msa_fill_b(0x81);
    STORE16x16(out, dst);
  }
}

static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
                                        const uint8_t* top) {
  if (left != NULL) {
    if (top != NULL) {
      int j;
      v8i16 d1, d2;
      const v16i8 zero = { 0 };
      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
      const v16u8 T = LD_UB(top);
      ILVRL_B2_SH(zero, T, d1, d2);
      SUB2(d1, TL, d2, TL, d1, d2);
      for (j = 0; j < 16; j += 4) {
        v16i8 t0, t1, t2, t3;
        v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
        const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
        const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
        const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
        const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
        ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
        ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
        CLIP_SH4_0_255(r0, r1, r2, r3);
        CLIP_SH4_0_255(r4, r5, r6, r7);
        PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
        ST_SB4(t0, t1, t2, t3, dst, BPS);
        dst += 4 * BPS;
      }
    } else {
      HorizontalPred16x16(dst, left);
    }
  } else {
    if (top != NULL) {
      VerticalPred16x16(dst, top);
    } else {
      const v16u8 out = (v16u8)__msa_fill_b(0x81);
      STORE16x16(out, dst);
    }
  }
}

static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
                                    const uint8_t* top) {
  int DC;
  v16u8 out;
  if (top != NULL && left != NULL) {
    const v16u8 rtop = LD_UB(top);
    const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
    const v16u8 rleft = LD_UB(left);
    const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
    const v8u16 dctemp = dctop + dcleft;
    DC = HADD_UH_U32(dctemp);
    DC = (DC + 16) >> 5;
  } else if (left != NULL) {   // left but no top
    const v16u8 rleft = LD_UB(left);
    const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
    DC = HADD_UH_U32(dcleft);
    DC = (DC + DC + 16) >> 5;
  } else if (top != NULL) {   // top but no left
    const v16u8 rtop = LD_UB(top);
    const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
    DC = HADD_UH_U32(dctop);
    DC = (DC + DC + 16) >> 5;
  } else {   // no top, no left, nothing.
    DC = 0x80;
  }
  out = (v16u8)__msa_fill_b(DC);
  STORE16x16(out, dst);
}

static void Intra16Preds(uint8_t* dst,
                         const uint8_t* left, const uint8_t* top) {
  DCMode16x16(I16DC16 + dst, left, top);
  VerticalPred16x16(I16VE16 + dst, top);
  HorizontalPred16x16(I16HE16 + dst, left);
  TrueMotion16x16(I16TM16 + dst, left, top);
}

// Chroma 8x8 prediction

#define CALC_DC8(in, out) do {                              \
  const v8u16 temp0 = __msa_hadd_u_h(in, in);               \
  const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);         \
  const v2i64 temp2 = (v2i64)__msa_hadd_u_d(temp1, temp1);  \
  const v2i64 temp3 = __msa_splati_d(temp2, 1);             \
  const v2i64 temp4 = temp3 + temp2;                        \
  const v16i8 temp5 = (v16i8)__msa_srari_d(temp4, 4);       \
  const v2i64 temp6 = (v2i64)__msa_splati_b(temp5, 0);      \
  out = __msa_copy_s_d(temp6, 0);                           \
} while (0)

#define STORE8x8(out, dst) do {                 \
  SD4(out, out, out, out, dst + 0 * BPS, BPS);  \
  SD4(out, out, out, out, dst + 4 * BPS, BPS);  \
} while (0)

static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
  if (top != NULL) {
    const uint64_t out = LD(top);
    STORE8x8(out, dst);
  } else {
    const uint64_t out = 0x7f7f7f7f7f7f7f7fULL;
    STORE8x8(out, dst);
  }
}

static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
  if (left != NULL) {
    int j;
    for (j = 0; j < 8; j += 4) {
      const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
      const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
      const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
      const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
      const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);
      const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
      const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
      const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
      SD4(out0, out1, out2, out3, dst, BPS);
      dst += 4 * BPS;
      left += 4;
    }
  } else {
    const uint64_t out = 0x8181818181818181ULL;
    STORE8x8(out, dst);
  }
}

static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
                                      const uint8_t* top) {
  if (left != NULL) {
    if (top != NULL) {
      int j;
      const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
      const v16u8 T1 = LD_UB(top);
      const v16i8 zero = { 0 };
      const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
      const v8i16 d = T - TL;
      for (j = 0; j < 8; j += 4) {
        uint64_t out0, out1, out2, out3;
        v16i8 t0, t1;
        v8i16 r0 = (v8i16)__msa_fill_h(left[j + 0]);
        v8i16 r1 = (v8i16)__msa_fill_h(left[j + 1]);
        v8i16 r2 = (v8i16)__msa_fill_h(left[j + 2]);
        v8i16 r3 = (v8i16)__msa_fill_h(left[j + 3]);
        ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
        CLIP_SH4_0_255(r0, r1, r2, r3);
        PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
        out0 = __msa_copy_s_d((v2i64)t0, 0);
        out1 = __msa_copy_s_d((v2i64)t0, 1);
        out2 = __msa_copy_s_d((v2i64)t1, 0);
        out3 = __msa_copy_s_d((v2i64)t1, 1);
        SD4(out0, out1, out2, out3, dst, BPS);
        dst += 4 * BPS;
      }
    } else {
      HorizontalPred8x8(dst, left);
    }
  } else {
    if (top != NULL) {
      VerticalPred8x8(dst, top);
    } else {
      const uint64_t out = 0x8181818181818181ULL;
      STORE8x8(out, dst);
    }
  }
}

static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
                                  const uint8_t* top) {
  uint64_t out;
  v16u8 src;
  if (top != NULL && left != NULL) {
    const uint64_t left_m = LD(left);
    const uint64_t top_m = LD(top);
    INSERT_D2_UB(left_m, top_m, src);
    CALC_DC8(src, out);
  } else if (left != NULL) {   // left but no top
    const uint64_t left_m = LD(left);
    INSERT_D2_UB(left_m, left_m, src);
    CALC_DC8(src, out);
  } else if (top != NULL) {   // top but no left
    const uint64_t top_m = LD(top);
    INSERT_D2_UB(top_m, top_m, src);
    CALC_DC8(src, out);
  } else {   // no top, no left, nothing.
    src = (v16u8)__msa_fill_b(0x80);
    out = __msa_copy_s_d((v2i64)src, 0);
  }
  STORE8x8(out, dst);
}

static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
                             const uint8_t* top) {
  // U block
  DCMode8x8(C8DC8 + dst, left, top);
  VerticalPred8x8(C8VE8 + dst, top);
  HorizontalPred8x8(C8HE8 + dst, left);
  TrueMotion8x8(C8TM8 + dst, left, top);
  // V block
  dst += 8;
  if (top != NULL) top += 8;
  if (left != NULL) left += 16;
  DCMode8x8(C8DC8 + dst, left, top);
  VerticalPred8x8(C8VE8 + dst, top);
  HorizontalPred8x8(C8HE8 + dst, left);
  TrueMotion8x8(C8TM8 + dst, left, top);
}

//------------------------------------------------------------------------------
// Metric

#define PACK_DOTP_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
  v16u8 tmp0, tmp1;                                                        \
  v8i16 tmp2, tmp3;                                                        \
  ILVRL_B2_UB(in0, in1, tmp0, tmp1);                                       \
  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                     \
  DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1);                         \
  ILVRL_B2_UB(in2, in3, tmp0, tmp1);                                       \
  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                     \
  DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                         \
} while (0)

#define PACK_DPADD_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \
  v16u8 tmp0, tmp1;                                                        \
  v8i16 tmp2, tmp3;                                                        \
  ILVRL_B2_UB(in0, in1, tmp0, tmp1);                                       \
  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                     \
  DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1);                        \
  ILVRL_B2_UB(in2, in3, tmp0, tmp1);                                       \
  HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3);                                     \
  DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3);                        \
} while (0)

static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v4i32 out0, out1, out2, out3;

  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
  PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
  PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
  PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
  PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
  a += 8 * BPS;
  b += 8 * BPS;
  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
  PACK_DPADD_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
  PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
  PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
  PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
  out0 += out1;
  out2 += out3;
  out0 += out2;
  sum = HADD_SW_S32(out0);
  return sum;
}

static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v4i32 out0, out1, out2, out3;

  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
  PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
  PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
  PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
  PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
  out0 += out1;
  out2 += out3;
  out0 += out2;
  sum = HADD_SW_S32(out0);
  return sum;
}

static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  uint32_t sum;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 t0, t1, t2, t3;
  v4i32 out0, out1, out2, out3;

  LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
  LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
  ILVR_B4_UB(src0, src1, src2, src3, ref0, ref1, ref2, ref3, t0, t1, t2, t3);
  PACK_DOTP_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
  ILVR_B4_UB(src4, src5, src6, src7, ref4, ref5, ref6, ref7, t0, t1, t2, t3);
  PACK_DPADD_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
  out0 += out1;
  out2 += out3;
  out0 += out2;
  sum = HADD_SW_S32(out0);
  return sum;
}

static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  uint32_t sum = 0;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src, ref, tmp0, tmp1;
  v8i16 diff0, diff1;
  v4i32 out0, out1;

  LW4(a, BPS, src0, src1, src2, src3);
  LW4(b, BPS, ref0, ref1, ref2, ref3);
  INSERT_W4_UB(src0, src1, src2, src3, src);
  INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
  ILVRL_B2_UB(src, ref, tmp0, tmp1);
  HSUB_UB2_SH(tmp0, tmp1, diff0, diff1);
  DOTP_SH2_SW(diff0, diff1, diff0, diff1, out0, out1);
  out0 += out1;
  sum = HADD_SW_S32(out0);
  return sum;
}

//------------------------------------------------------------------------------
// Quantization

static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         const VP8Matrix* const mtx) {
  int sum;
  v8i16 in0, in1, sh0, sh1, out0, out1;
  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
  v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3;
  const v8i16 zero = { 0 };
  const v8i16 zigzag0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
  const v8i16 zigzag1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
  const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL);

  LD_SH2(&in[0], 8, in0, in1);
  LD_SH2(&mtx->sharpen_[0], 8, sh0, sh1);
  tmp4 = __msa_add_a_h(in0, zero);
  tmp5 = __msa_add_a_h(in1, zero);
  ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1);
  ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3);
  HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3);
  sign0 = (in0 < zero);
  sign1 = (in1 < zero);                           // sign
  LD_SH2(&mtx->iq_[0], 8, tmp0, tmp1);            // iq
  ILVRL_H2_SW(zero, tmp0, t0, t1);
  ILVRL_H2_SW(zero, tmp1, t2, t3);
  LD_SW4(&mtx->bias_[0], 4, b0, b1, b2, b3);      // bias
  MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3);
  ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3);
  SRAI_W4_SW(b0, b1, b2, b3, 17);
  PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3);
  tmp0 = (tmp2 > maxlevel);
  tmp1 = (tmp3 > maxlevel);
  tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
  tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
  SUB2(0, tmp2, 0, tmp3, tmp0, tmp1);
  tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
  tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
  LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3);   // zthresh
  t0 = (s0 > t0);
  t1 = (s1 > t1);
  t2 = (s2 > t2);
  t3 = (s3 > t3);
  PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1);
  tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0);
  tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1);
  LD_SH2(&mtx->q_[0], 8, tmp0, tmp1);
  MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1);
  VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1);
  ST_SH2(in0, in1, &in[0], 8);
  ST_SH2(out0, out1, &out[0], 8);
  out0 = __msa_add_a_h(out0, out1);
  sum = HADD_SH_S32(out0);
  return (sum > 0);
}
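
// Per coefficient this follows the scalar quantizer: with
//   coeff = abs(in[j]) + mtx->sharpen_[j]
// the level is (coeff * mtx->iq_[j] + mtx->bias_[j]) >> 17, clamped to
// MAX_LEVEL, sign-restored, and forced to zero unless coeff exceeds
// mtx->zthresh_[j]. 'in' receives the dequantized values level * q_[j],
// 'out' the zig-zag-ordered levels, and the return value just reports
// whether any level is non-zero.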

static int Quantize2Blocks(int16_t in[32], int16_t out[32],
                           const VP8Matrix* const mtx) {
  int nz;
  nz  = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8EncDspInitMSA(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;
  VP8FTransformWHT = FTransformWHT;

  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
  VP8CollectHistogram = CollectHistogram;

  VP8EncPredLuma4 = Intra4Preds;
  VP8EncPredLuma16 = Intra16Preds;
  VP8EncPredChroma8 = IntraChromaPreds;

  VP8SSE16x16 = SSE16x16;
  VP8SSE16x8 = SSE16x8;
  VP8SSE8x8 = SSE8x8;
  VP8SSE4x4 = SSE4x4;

  VP8EncQuantizeBlock = QuantizeBlock;
  VP8EncQuantize2Blocks = Quantize2Blocks;
  VP8EncQuantizeBlockWHT = QuantizeBlock;
}
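
// This initializer is not called directly by the encoder: VP8EncDspInit()
// installs the portable C functions first and then, roughly as sketched
// below, overrides them when the CPU reports MSA support.
//
//   #if defined(WEBP_USE_MSA)
//     if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kMSA)) {
//       VP8EncDspInitMSA();
//     }
//   #endif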

#else  // !WEBP_USE_MSA

WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)

#endif  // WEBP_USE_MSA