enc.c 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932
  1. // Copyright 2011 Google Inc. All Rights Reserved.
  2. //
  3. // Use of this source code is governed by a BSD-style license
  4. // that can be found in the COPYING file in the root of the source
  5. // tree. An additional intellectual property rights grant can be found
  6. // in the file PATENTS. All contributing project authors may
  7. // be found in the AUTHORS file in the root of the source tree.
  8. // -----------------------------------------------------------------------------
  9. //
  10. // Speed-critical encoding functions.
  11. //
  12. // Author: Skal (pascal.massimino@gmail.com)
  13. #include <assert.h>
  14. #include <stdlib.h> // for abs()
  15. #include "./dsp.h"
  16. #include "../enc/vp8i_enc.h"
  17. static WEBP_INLINE uint8_t clip_8b(int v) {
  18. return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
  19. }
  20. static WEBP_INLINE int clip_max(int v, int max) {
  21. return (v > max) ? max : v;
  22. }
  23. //------------------------------------------------------------------------------
  24. // Compute susceptibility based on DCT-coeff histograms:
  25. // the higher, the "easier" the macroblock is to compress.
// Pixel offsets (stride BPS) of the 4x4 sub-blocks inside a macroblock:
// 16 luma blocks in raster order, then 4 U and 4 V chroma blocks.
const int VP8DspScan[16 + 4 + 4] = {
  // Luma
  0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
  0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
  0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
  0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
  0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS,    // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS   // V
};
  35. // general-purpose util function
  36. void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
  37. VP8Histogram* const histo) {
  38. int max_value = 0, last_non_zero = 1;
  39. int k;
  40. for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
  41. const int value = distribution[k];
  42. if (value > 0) {
  43. if (value > max_value) max_value = value;
  44. last_non_zero = k;
  45. }
  46. }
  47. histo->max_value = max_value;
  48. histo->last_non_zero = last_non_zero;
  49. }
  50. static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
  51. int start_block, int end_block,
  52. VP8Histogram* const histo) {
  53. int j;
  54. int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  55. for (j = start_block; j < end_block; ++j) {
  56. int k;
  57. int16_t out[16];
  58. VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
  59. // Convert coefficients to bin.
  60. for (k = 0; k < 16; ++k) {
  61. const int v = abs(out[k]) >> 3;
  62. const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
  63. ++distribution[clipped_value];
  64. }
  65. }
  66. VP8SetHistogramData(distribution, histo);
  67. }
  68. //------------------------------------------------------------------------------
// run-time tables (~4k)
// clip1[255 + v] == clip_8b(v) for v in [-255, 510]; used by the
// TrueMotion predictors to avoid per-pixel clamping.
static uint8_t clip1[255 + 510 + 1];  // clips [-255,510] to [0,255]
// We declare this variable 'volatile' to prevent instruction reordering
// and make sure it's set to true _last_ (so as to be thread-safe)
static volatile int tables_ok = 0;
// Populates clip1[] on first call; subsequent calls are no-ops.
static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
  if (!tables_ok) {
    int i;
    for (i = -255; i <= 255 + 255; ++i) {
      clip1[255 + i] = clip_8b(i);
    }
    tables_ok = 1;  // set last, see note above
  }
}
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Adds the reconstructed residual (v >> 3) to the prediction sample and
// stores the clipped result into dst.
#define STORE(x, y, v) \
  dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))

// 16-bit fixed-point multiplier constants used by the inverse transform.
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
#define MUL(a, b) (((a) * (b)) >> 16)

// Inverse 4x4 transform of 'in'; the result is added to the prediction
// 'ref' and written (clipped) to 'dst'. Both ref and dst have stride BPS.
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
                                      uint8_t* dst) {
  int C[4 * 4], *tmp;
  int i;
  tmp = C;
  for (i = 0; i < 4; ++i) {    // vertical pass
    const int a = in[0] + in[8];
    const int b = in[0] - in[8];
    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
    tmp[0] = a + d;
    tmp[1] = b + c;
    tmp[2] = b - c;
    tmp[3] = a - d;
    tmp += 4;
    in++;                      // next column of coefficients
  }
  tmp = C;
  for (i = 0; i < 4; ++i) {    // horizontal pass
    const int dc = tmp[0] + 4; // +4: rounding for the final >> 3 in STORE
    const int a = dc + tmp[8];
    const int b = dc - tmp[8];
    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
    STORE(0, i, a + d);
    STORE(1, i, b + c);
    STORE(2, i, b - c);
    STORE(3, i, a - d);
    tmp++;
  }
}
  121. static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
  122. int do_two) {
  123. ITransformOne(ref, in, dst);
  124. if (do_two) {
  125. ITransformOne(ref + 4, in + 16, dst + 4);
  126. }
  127. }
  128. static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  129. int i;
  130. int tmp[16];
  131. for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
  132. const int d0 = src[0] - ref[0]; // 9bit dynamic range ([-255,255])
  133. const int d1 = src[1] - ref[1];
  134. const int d2 = src[2] - ref[2];
  135. const int d3 = src[3] - ref[3];
  136. const int a0 = (d0 + d3); // 10b [-510,510]
  137. const int a1 = (d1 + d2);
  138. const int a2 = (d1 - d2);
  139. const int a3 = (d0 - d3);
  140. tmp[0 + i * 4] = (a0 + a1) * 8; // 14b [-8160,8160]
  141. tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9; // [-7536,7542]
  142. tmp[2 + i * 4] = (a0 - a1) * 8;
  143. tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9;
  144. }
  145. for (i = 0; i < 4; ++i) {
  146. const int a0 = (tmp[0 + i] + tmp[12 + i]); // 15b
  147. const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
  148. const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
  149. const int a3 = (tmp[0 + i] - tmp[12 + i]);
  150. out[0 + i] = (a0 + a1 + 7) >> 4; // 12b
  151. out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
  152. out[8 + i] = (a0 - a1 + 7) >> 4;
  153. out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
  154. }
  155. }
  156. static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  157. VP8FTransform(src, ref, out);
  158. VP8FTransform(src + 4, ref + 4, out + 16);
  159. }
  160. static void FTransformWHT(const int16_t* in, int16_t* out) {
  161. // input is 12b signed
  162. int32_t tmp[16];
  163. int i;
  164. for (i = 0; i < 4; ++i, in += 64) {
  165. const int a0 = (in[0 * 16] + in[2 * 16]); // 13b
  166. const int a1 = (in[1 * 16] + in[3 * 16]);
  167. const int a2 = (in[1 * 16] - in[3 * 16]);
  168. const int a3 = (in[0 * 16] - in[2 * 16]);
  169. tmp[0 + i * 4] = a0 + a1; // 14b
  170. tmp[1 + i * 4] = a3 + a2;
  171. tmp[2 + i * 4] = a3 - a2;
  172. tmp[3 + i * 4] = a0 - a1;
  173. }
  174. for (i = 0; i < 4; ++i) {
  175. const int a0 = (tmp[0 + i] + tmp[8 + i]); // 15b
  176. const int a1 = (tmp[4 + i] + tmp[12+ i]);
  177. const int a2 = (tmp[4 + i] - tmp[12+ i]);
  178. const int a3 = (tmp[0 + i] - tmp[8 + i]);
  179. const int b0 = a0 + a1; // 16b
  180. const int b1 = a3 + a2;
  181. const int b2 = a3 - a2;
  182. const int b3 = a0 - a1;
  183. out[ 0 + i] = b0 >> 1; // 15b
  184. out[ 4 + i] = b1 >> 1;
  185. out[ 8 + i] = b2 >> 1;
  186. out[12 + i] = b3 >> 1;
  187. }
  188. }
  189. #undef MUL
  190. #undef STORE
  191. //------------------------------------------------------------------------------
  192. // Intra predictions
  193. static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  194. int j;
  195. for (j = 0; j < size; ++j) {
  196. memset(dst + j * BPS, value, size);
  197. }
  198. }
  199. static WEBP_INLINE void VerticalPred(uint8_t* dst,
  200. const uint8_t* top, int size) {
  201. int j;
  202. if (top != NULL) {
  203. for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
  204. } else {
  205. Fill(dst, 127, size);
  206. }
  207. }
  208. static WEBP_INLINE void HorizontalPred(uint8_t* dst,
  209. const uint8_t* left, int size) {
  210. if (left != NULL) {
  211. int j;
  212. for (j = 0; j < size; ++j) {
  213. memset(dst + j * BPS, left[j], size);
  214. }
  215. } else {
  216. Fill(dst, 129, size);
  217. }
  218. }
  219. static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
  220. const uint8_t* top, int size) {
  221. int y;
  222. if (left != NULL) {
  223. if (top != NULL) {
  224. const uint8_t* const clip = clip1 + 255 - left[-1];
  225. for (y = 0; y < size; ++y) {
  226. const uint8_t* const clip_table = clip + left[y];
  227. int x;
  228. for (x = 0; x < size; ++x) {
  229. dst[x] = clip_table[top[x]];
  230. }
  231. dst += BPS;
  232. }
  233. } else {
  234. HorizontalPred(dst, left, size);
  235. }
  236. } else {
  237. // true motion without left samples (hence: with default 129 value)
  238. // is equivalent to VE prediction where you just copy the top samples.
  239. // Note that if top samples are not available, the default value is
  240. // then 129, and not 127 as in the VerticalPred case.
  241. if (top != NULL) {
  242. VerticalPred(dst, top, size);
  243. } else {
  244. Fill(dst, 129, size);
  245. }
  246. }
  247. }
  248. static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
  249. const uint8_t* top,
  250. int size, int round, int shift) {
  251. int DC = 0;
  252. int j;
  253. if (top != NULL) {
  254. for (j = 0; j < size; ++j) DC += top[j];
  255. if (left != NULL) { // top and left present
  256. for (j = 0; j < size; ++j) DC += left[j];
  257. } else { // top, but no left
  258. DC += DC;
  259. }
  260. DC = (DC + round) >> shift;
  261. } else if (left != NULL) { // left but no top
  262. for (j = 0; j < size; ++j) DC += left[j];
  263. DC += DC;
  264. DC = (DC + round) >> shift;
  265. } else { // no top, no left, nothing.
  266. DC = 0x80;
  267. }
  268. Fill(dst, DC, size);
  269. }
  270. //------------------------------------------------------------------------------
  271. // Chroma 8x8 prediction (paragraph 12.2)
// Computes the four 8x8 chroma prediction modes (DC/VE/HE/TM) for the U
// block then the V block, each stored at its C8*8 offset within 'dst'.
static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
                             const uint8_t* top) {
  // U block
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
  HorizontalPred(C8HE8 + dst, left, 8);
  TrueMotion(C8TM8 + dst, left, top, 8);
  // V block
  dst += 8;
  if (top != NULL) top += 8;     // V top samples follow U's 8 samples
  if (left != NULL) left += 16;  // V left samples are 16 bytes after U's
  DCMode(C8DC8 + dst, left, top, 8, 8, 4);
  VerticalPred(C8VE8 + dst, top, 8);
  HorizontalPred(C8HE8 + dst, left, 8);
  TrueMotion(C8TM8 + dst, left, top, 8);
}
  288. //------------------------------------------------------------------------------
  289. // luma 16x16 prediction (paragraph 12.3)
// Computes the four 16x16 luma prediction modes (DC/VE/HE/TM), each
// stored at its I16*16 offset within 'dst'.
static void Intra16Preds(uint8_t* dst,
                         const uint8_t* left, const uint8_t* top) {
  DCMode(I16DC16 + dst, left, top, 16, 16, 5);
  VerticalPred(I16VE16 + dst, top, 16);
  HorizontalPred(I16HE16 + dst, left, 16);
  TrueMotion(I16TM16 + dst, left, top, 16);
}
  297. //------------------------------------------------------------------------------
  298. // luma 4x4 prediction
  299. #define DST(x, y) dst[(x) + (y) * BPS]
  300. #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
  301. #define AVG2(a, b) (((a) + (b) + 1) >> 1)
  302. static void VE4(uint8_t* dst, const uint8_t* top) { // vertical
  303. const uint8_t vals[4] = {
  304. AVG3(top[-1], top[0], top[1]),
  305. AVG3(top[ 0], top[1], top[2]),
  306. AVG3(top[ 1], top[2], top[3]),
  307. AVG3(top[ 2], top[3], top[4])
  308. };
  309. int i;
  310. for (i = 0; i < 4; ++i) {
  311. memcpy(dst + i * BPS, vals, 4);
  312. }
  313. }
// Horizontal 4x4 prediction: each row is a 3-tap smoothing of the left
// samples (X = top-left at top[-1], I..L = left column at top[-2..-5]).
static void HE4(uint8_t* dst, const uint8_t* top) {   // horizontal
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  // 0x01010101U * v replicates the byte 'v' across the 32-bit word.
  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}
  325. static void DC4(uint8_t* dst, const uint8_t* top) {
  326. uint32_t dc = 4;
  327. int i;
  328. for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
  329. Fill(dst, dc >> 3, 4);
  330. }
// Down-right diagonal 4x4 prediction. Samples: X = top-left, I..L = left
// column (top[-2..-5]), A..D = top row (top[0..3]). Each anti-diagonal of
// the block shares one smoothed value.
static void RD4(uint8_t* dst, const uint8_t* top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  DST(0, 3) = AVG3(J, K, L);
  DST(0, 2) = DST(1, 3) = AVG3(I, J, K);
  DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J);
  DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
  DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X);
  DST(2, 0) = DST(3, 1) = AVG3(C, B, A);
  DST(3, 0) = AVG3(D, C, B);
}
// Down-left diagonal 4x4 prediction. A..H are the top and top-right
// samples (top[0..7]); each diagonal shares one smoothed value.
static void LD4(uint8_t* dst, const uint8_t* top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  const int E = top[4];
  const int F = top[5];
  const int G = top[6];
  const int H = top[7];
  DST(0, 0) = AVG3(A, B, C);
  DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
  DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
  DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
  DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
  DST(3, 3) = AVG3(G, H, H);   // H is repeated for the last tap
}
// Vertical-right 4x4 prediction. X = top-left, I..K = left samples,
// A..D = top row; mixes 2-tap (AVG2) and 3-tap (AVG3) interpolation.
static void VR4(uint8_t* dst, const uint8_t* top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  DST(0, 0) = DST(1, 2) = AVG2(X, A);
  DST(1, 0) = DST(2, 2) = AVG2(A, B);
  DST(2, 0) = DST(3, 2) = AVG2(B, C);
  DST(3, 0) = AVG2(C, D);
  DST(0, 3) = AVG3(K, J, I);
  DST(0, 2) = AVG3(J, I, X);
  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
  DST(3, 1) = AVG3(B, C, D);
}
// Vertical-left 4x4 prediction. A..H are the top and top-right samples;
// mixes 2-tap (AVG2) and 3-tap (AVG3) interpolation.
static void VL4(uint8_t* dst, const uint8_t* top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  const int E = top[4];
  const int F = top[5];
  const int G = top[6];
  const int H = top[7];
  DST(0, 0) = AVG2(A, B);
  DST(1, 0) = DST(0, 2) = AVG2(B, C);
  DST(2, 0) = DST(1, 2) = AVG2(C, D);
  DST(3, 0) = DST(2, 2) = AVG2(D, E);
  DST(0, 1) = AVG3(A, B, C);
  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
  DST(3, 2) = AVG3(E, F, G);
  DST(3, 3) = AVG3(F, G, H);
}
// Horizontal-up 4x4 prediction. I..L are the left samples (top[-2..-5]);
// the bottom-right region saturates to the last left sample L.
static void HU4(uint8_t* dst, const uint8_t* top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  DST(0, 0) = AVG2(I, J);
  DST(2, 0) = DST(0, 1) = AVG2(J, K);
  DST(2, 1) = DST(0, 2) = AVG2(K, L);
  DST(1, 0) = AVG3(I, J, K);
  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
  DST(3, 2) = DST(2, 2) =
  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
// Horizontal-down 4x4 prediction. X = top-left, I..L = left samples,
// A..C = top row; mixes 2-tap (AVG2) and 3-tap (AVG3) interpolation.
static void HD4(uint8_t* dst, const uint8_t* top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  DST(0, 0) = DST(2, 1) = AVG2(I, X);
  DST(0, 1) = DST(2, 2) = AVG2(J, I);
  DST(0, 2) = DST(2, 3) = AVG2(K, J);
  DST(0, 3) = AVG2(L, K);
  DST(3, 0) = AVG3(A, B, C);
  DST(2, 0) = AVG3(X, A, B);
  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
  DST(1, 3) = AVG3(L, K, J);
}
  440. static void TM4(uint8_t* dst, const uint8_t* top) {
  441. int x, y;
  442. const uint8_t* const clip = clip1 + 255 - top[-1];
  443. for (y = 0; y < 4; ++y) {
  444. const uint8_t* const clip_table = clip + top[-2 - y];
  445. for (x = 0; x < 4; ++x) {
  446. dst[x] = clip_table[top[x]];
  447. }
  448. dst += BPS;
  449. }
  450. }
  451. #undef DST
  452. #undef AVG3
  453. #undef AVG2
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
// Computes all ten 4x4 luma prediction modes, each stored at its
// I4*4 offset within 'dst'.
static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
  DC4(I4DC4 + dst, top);
  TM4(I4TM4 + dst, top);
  VE4(I4VE4 + dst, top);
  HE4(I4HE4 + dst, top);
  RD4(I4RD4 + dst, top);
  VR4(I4VR4 + dst, top);
  LD4(I4LD4 + dst, top);
  VL4(I4VL4 + dst, top);
  HD4(I4HD4 + dst, top);
  HU4(I4HU4 + dst, top);
}
  468. //------------------------------------------------------------------------------
  469. // Metric
  470. static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
  471. int w, int h) {
  472. int count = 0;
  473. int y, x;
  474. for (y = 0; y < h; ++y) {
  475. for (x = 0; x < w; ++x) {
  476. const int diff = (int)a[x] - b[x];
  477. count += diff * diff;
  478. }
  479. a += BPS;
  480. b += BPS;
  481. }
  482. return count;
  483. }
  484. static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  485. return GetSSE(a, b, 16, 16);
  486. }
  487. static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  488. return GetSSE(a, b, 16, 8);
  489. }
  490. static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  491. return GetSSE(a, b, 8, 8);
  492. }
  493. static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  494. return GetSSE(a, b, 4, 4);
  495. }
  496. static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
  497. int k, x, y;
  498. for (k = 0; k < 4; ++k) {
  499. uint32_t avg = 0;
  500. for (y = 0; y < 4; ++y) {
  501. for (x = 0; x < 4; ++x) {
  502. avg += ref[x + y * BPS];
  503. }
  504. }
  505. dc[k] = avg;
  506. ref += 4; // go to next 4x4 block.
  507. }
  508. }
  509. //------------------------------------------------------------------------------
  510. // Texture distortion
  511. //
  512. // We try to match the spectral content (weighted) between source and
  513. // reconstructed samples.
  514. // Hadamard transform
  515. // Returns the weighted sum of the absolute value of transformed coefficients.
  516. // w[] contains a row-major 4 by 4 symmetric matrix.
  517. static int TTransform(const uint8_t* in, const uint16_t* w) {
  518. int sum = 0;
  519. int tmp[16];
  520. int i;
  521. // horizontal pass
  522. for (i = 0; i < 4; ++i, in += BPS) {
  523. const int a0 = in[0] + in[2];
  524. const int a1 = in[1] + in[3];
  525. const int a2 = in[1] - in[3];
  526. const int a3 = in[0] - in[2];
  527. tmp[0 + i * 4] = a0 + a1;
  528. tmp[1 + i * 4] = a3 + a2;
  529. tmp[2 + i * 4] = a3 - a2;
  530. tmp[3 + i * 4] = a0 - a1;
  531. }
  532. // vertical pass
  533. for (i = 0; i < 4; ++i, ++w) {
  534. const int a0 = tmp[0 + i] + tmp[8 + i];
  535. const int a1 = tmp[4 + i] + tmp[12+ i];
  536. const int a2 = tmp[4 + i] - tmp[12+ i];
  537. const int a3 = tmp[0 + i] - tmp[8 + i];
  538. const int b0 = a0 + a1;
  539. const int b1 = a3 + a2;
  540. const int b2 = a3 - a2;
  541. const int b3 = a0 - a1;
  542. sum += w[ 0] * abs(b0);
  543. sum += w[ 4] * abs(b1);
  544. sum += w[ 8] * abs(b2);
  545. sum += w[12] * abs(b3);
  546. }
  547. return sum;
  548. }
  549. static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
  550. const uint16_t* const w) {
  551. const int sum1 = TTransform(a, w);
  552. const int sum2 = TTransform(b, w);
  553. return abs(sum2 - sum1) >> 5;
  554. }
  555. static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
  556. const uint16_t* const w) {
  557. int D = 0;
  558. int x, y;
  559. for (y = 0; y < 16 * BPS; y += 4 * BPS) {
  560. for (x = 0; x < 16; x += 4) {
  561. D += Disto4x4(a + x + y, b + x + y, w);
  562. }
  563. }
  564. return D;
  565. }
  566. //------------------------------------------------------------------------------
  567. // Quantization
  568. //
// kZigzag[n] is the natural (raster) position of the n-th coefficient in
// zigzag scan order.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
// Simple quantization
// Quantizes in[] in zigzag order into out[], and overwrites in[] with the
// dequantized (reconstruction) values. Returns 1 if any level is non-zero.
static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         const VP8Matrix* const mtx) {
  int last = -1;   // zigzag index of the last non-zero level
  int n;
  for (n = 0; n < 16; ++n) {
    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
    // Work on the magnitude, boosted by the per-coeff sharpening term.
    const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    if (coeff > mtx->zthresh_[j]) {    // above the zero-threshold?
      const uint32_t Q = mtx->q_[j];
      const uint32_t iQ = mtx->iq_[j];
      const uint32_t B = mtx->bias_[j];
      int level = QUANTDIV(coeff, iQ, B);
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
      in[j] = level * (int)Q;    // dequantized value for reconstruction
      out[n] = level;
      if (level) last = n;
    } else {    // below threshold: coefficient is dropped
      out[n] = 0;
      in[j] = 0;
    }
  }
  return (last >= 0);
}
  598. static int Quantize2Blocks(int16_t in[32], int16_t out[32],
  599. const VP8Matrix* const mtx) {
  600. int nz;
  601. nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
  602. nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
  603. return nz;
  604. }
  605. //------------------------------------------------------------------------------
  606. // Block copy
  607. static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
  608. int y;
  609. for (y = 0; y < h; ++y) {
  610. memcpy(dst, src, w);
  611. src += BPS;
  612. dst += BPS;
  613. }
  614. }
  615. static void Copy4x4(const uint8_t* src, uint8_t* dst) {
  616. Copy(src, dst, 4, 4);
  617. }
  618. static void Copy16x8(const uint8_t* src, uint8_t* dst) {
  619. Copy(src, dst, 16, 8);
  620. }
  621. //------------------------------------------------------------------------------
  622. // SSIM / PSNR
// hat-shaped filter. Sum of coefficients is equal to 16.
static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
  1, 2, 3, 4, 3, 2, 1
};
static const uint32_t kWeightSum = 16 * 16;   // sum{kWeight}^2

// Evaluates the SSIM score from accumulated weighted moments:
// xm/ym = weighted sums, xxm/xym/yym = weighted second-order sums,
// with N the total accumulated weight.
static WEBP_INLINE double SSIMCalculation(
    const VP8DistoStats* const stats, uint32_t N  /* num samples */) {
  const uint32_t w2 = N * N;
  const uint32_t C1 = 20 * w2;       // stabilization constants, scaled by N^2
  const uint32_t C2 = 60 * w2;
  const uint32_t C3 = 8 * 8 * w2;    // 'dark' limit ~= 6
  const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
  const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
  if (xmxm + ymym >= C3) {
    const int64_t xmym = (int64_t)stats->xm * stats->ym;
    const int64_t sxy = (int64_t)stats->xym * N - xmym;    // can be negative
    const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
    const uint64_t syy = (uint64_t)stats->yym * N - ymym;
    // we descale by 8 to prevent overflow during the fnum/fden multiply.
    const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
    const uint64_t den_S = (sxx + syy + C2) >> 8;
    const uint64_t fnum = (2 * xmym + C1) * num_S;
    const uint64_t fden = (xmxm + ymym + C1) * den_S;
    const double r = (double)fnum / fden;
    assert(r >= 0. && r <= 1.0);
    return r;
  }
  return 1.;   // area is too dark to contribute meaningfully
}
// SSIM for a full (unclipped) kernel window: the total weight is the
// constant kWeightSum.
double VP8SSIMFromStats(const VP8DistoStats* const stats) {
  return SSIMCalculation(stats, kWeightSum);
}
// SSIM for a border-clipped window: use the weight actually accumulated
// in stats->w.
double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
  return SSIMCalculation(stats, stats->w);
}
  658. static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
  659. const uint8_t* src2, int stride2,
  660. int xo, int yo, int W, int H) {
  661. VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
  662. const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
  663. const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
  664. : yo + VP8_SSIM_KERNEL;
  665. const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
  666. const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
  667. : xo + VP8_SSIM_KERNEL;
  668. int x, y;
  669. src1 += ymin * stride1;
  670. src2 += ymin * stride2;
  671. for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
  672. for (x = xmin; x <= xmax; ++x) {
  673. const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
  674. * kWeight[VP8_SSIM_KERNEL + y - yo];
  675. const uint32_t s1 = src1[x];
  676. const uint32_t s2 = src2[x];
  677. stats.w += w;
  678. stats.xm += w * s1;
  679. stats.ym += w * s2;
  680. stats.xxm += w * s1 * s1;
  681. stats.xym += w * s1 * s2;
  682. stats.yym += w * s2 * s2;
  683. }
  684. }
  685. return VP8SSIMFromStatsClipped(&stats);
  686. }
  687. static double SSIMGet_C(const uint8_t* src1, int stride1,
  688. const uint8_t* src2, int stride2) {
  689. VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
  690. int x, y;
  691. for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
  692. for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
  693. const uint32_t w = kWeight[x] * kWeight[y];
  694. const uint32_t s1 = src1[x];
  695. const uint32_t s2 = src2[x];
  696. stats.xm += w * s1;
  697. stats.ym += w * s2;
  698. stats.xxm += w * s1 * s1;
  699. stats.xym += w * s1 * s2;
  700. stats.yym += w * s2 * s2;
  701. }
  702. }
  703. return VP8SSIMFromStats(&stats);
  704. }
  705. //------------------------------------------------------------------------------
  706. static uint32_t AccumulateSSE(const uint8_t* src1,
  707. const uint8_t* src2, int len) {
  708. int i;
  709. uint32_t sse2 = 0;
  710. assert(len <= 65535); // to ensure that accumulation fits within uint32_t
  711. for (i = 0; i < len; ++i) {
  712. const int32_t diff = src1[i] - src2[i];
  713. sse2 += diff * diff;
  714. }
  715. return sse2;
  716. }
  717. //------------------------------------------------------------------------------
// SSIM entry points, bound by VP8SSIMDspInit().
VP8SSIMGetFunc VP8SSIMGet;
VP8SSIMGetClippedFunc VP8SSIMGetClipped;
VP8AccumulateSSEFunc VP8AccumulateSSE;
extern void VP8SSIMDspInitSSE2(void);
// Sentinel: points to itself initially, so the first call to
// VP8SSIMDspInit() never matches VP8GetCPUInfo and always initializes.
static volatile VP8CPUInfo ssim_last_cpuinfo_used =
    (VP8CPUInfo)&ssim_last_cpuinfo_used;
// Binds the SSIM function pointers: C fallbacks first, then SSE2
// overrides if the CPU supports it. Idempotent per CPU-info provider.
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
  if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;   // already done
  VP8SSIMGetClipped = SSIMGetClipped_C;
  VP8SSIMGet = SSIMGet_C;
  VP8AccumulateSSE = AccumulateSSE;
  if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8SSIMDspInitSSE2();
    }
#endif
  }
  ssim_last_cpuinfo_used = VP8GetCPUInfo;
}
  738. //------------------------------------------------------------------------------
  739. // Initialization
  740. // Speed-critical function pointers. We have to initialize them to the default
  741. // implementations within VP8EncDspInit().
// Hot-path function pointers, bound to an implementation by VP8EncDspInit().
VP8CHisto VP8CollectHistogram;
// transforms
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
VP8Fdct VP8FTransform2;
VP8WHT VP8FTransformWHT;
// intra predictions
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
VP8IntraPreds VP8EncPredChroma8;
// metrics
VP8Metric VP8SSE16x16;
VP8Metric VP8SSE8x8;
VP8Metric VP8SSE16x8;
VP8Metric VP8SSE4x4;
VP8WMetric VP8TDisto4x4;
VP8WMetric VP8TDisto16x16;
VP8MeanMetric VP8Mean16x4;
// quantization
VP8QuantizeBlock VP8EncQuantizeBlock;
VP8Quantize2Blocks VP8EncQuantize2Blocks;
VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
// block copy
VP8BlockCopy VP8Copy4x4;
VP8BlockCopy VP8Copy16x8;
// Per-architecture initializers, defined in separate translation units.
extern void VP8EncDspInitSSE2(void);
extern void VP8EncDspInitSSE41(void);
extern void VP8EncDspInitAVX2(void);
extern void VP8EncDspInitNEON(void);
extern void VP8EncDspInitMIPS32(void);
extern void VP8EncDspInitMIPSdspR2(void);
extern void VP8EncDspInitMSA(void);
// Sentinel: points to itself initially, so the first call to
// VP8EncDspInit() never matches VP8GetCPUInfo and always initializes.
static volatile VP8CPUInfo enc_last_cpuinfo_used =
    (VP8CPUInfo)&enc_last_cpuinfo_used;
// Binds all encoder DSP function pointers: installs the C fallbacks,
// then lets each available SIMD/arch variant override them. Idempotent
// per CPU-info provider.
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
  if (enc_last_cpuinfo_used == VP8GetCPUInfo) return;   // already done
  VP8DspInit();  // common inverse transforms
  InitTables();  // build the clip1[] lookup table
  // default C implementations
  VP8CollectHistogram = CollectHistogram;
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;
  VP8FTransform2 = FTransform2;
  VP8FTransformWHT = FTransformWHT;
  VP8EncPredLuma4 = Intra4Preds;
  VP8EncPredLuma16 = Intra16Preds;
  VP8EncPredChroma8 = IntraChromaPreds;
  VP8SSE16x16 = SSE16x16;
  VP8SSE8x8 = SSE8x8;
  VP8SSE16x8 = SSE16x8;
  VP8SSE4x4 = SSE4x4;
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
  VP8Mean16x4 = Mean16x4;
  VP8EncQuantizeBlock = QuantizeBlock;
  VP8EncQuantize2Blocks = Quantize2Blocks;
  VP8EncQuantizeBlockWHT = QuantizeBlock;   // WHT reuses the same quantizer
  VP8Copy4x4 = Copy4x4;
  VP8Copy16x8 = Copy16x8;
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8EncDspInitSSE2();
#if defined(WEBP_USE_SSE41)
      // SSE4.1 builds on the SSE2 bindings, hence the nesting.
      if (VP8GetCPUInfo(kSSE4_1)) {
        VP8EncDspInitSSE41();
      }
#endif
    }
#endif
#if defined(WEBP_USE_AVX2)
    if (VP8GetCPUInfo(kAVX2)) {
      VP8EncDspInitAVX2();
    }
#endif
#if defined(WEBP_USE_NEON)
    if (VP8GetCPUInfo(kNEON)) {
      VP8EncDspInitNEON();
    }
#endif
#if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8EncDspInitMIPS32();
    }
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8EncDspInitMIPSdspR2();
    }
#endif
#if defined(WEBP_USE_MSA)
    if (VP8GetCPUInfo(kMSA)) {
      VP8EncDspInitMSA();
    }
#endif
  }
  enc_last_cpuinfo_used = VP8GetCPUInfo;
}