lossless_enc_mips32.c

// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of lossless functions
//
// Author(s): Djordje Pesut    (djordje.pesut@imgtec.com)
//            Jovan Zelincevic (jovan.zelincevic@imgtec.com)

#include "./dsp.h"
#include "./lossless.h"
#include "./lossless_common.h"

#if defined(WEBP_USE_MIPS32)

#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

static float FastSLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y, correction;
    const int c24 = 24;
    const float v_f = (float)v;
    uint32_t temp;

    // Xf = 256 = 2^8
    // log_cnt is index of leading one in upper 24 bits
    __asm__ volatile(
      "clz    %[log_cnt], %[v]                    \n\t"
      "addiu  %[y],       $zero,      1           \n\t"
      "subu   %[log_cnt], %[c24],     %[log_cnt]  \n\t"
      "sllv   %[y],       %[y],       %[log_cnt]  \n\t"
      "srlv   %[temp],    %[v],       %[log_cnt]  \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
    // Xf = floor(Xf) * (1 + (v % y) / v)
    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
    // The correction factor: log(1 + d) ~ d; for very small d values, so
    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
    // LOG_2_RECIPROCAL ~ 23/16
    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
    correction = (23 * (v & (y - 1))) >> 4;
    return v_f * (kLog2Table[temp] + log_cnt) + correction;
  } else {
    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
  }
}
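
// Plain-C sketch of the asm block above (ours, for reference; Clz() stands
// for a count-leading-zeros helper, i.e. what the MIPS 'clz' opcode
// computes):
//   log_cnt = 24 - Clz(v);    // (31 - Clz(v)) is the leading-one position
//   y       = 1u << log_cnt;  // y = 2^log_cnt
//   temp    = v >> log_cnt;   // in [128, 255] for v in the asserted range,
//                             // so a valid kLog2Table index
// FastLog2Slow() below reuses the same normalization.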

static float FastLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y;
    const int c24 = 24;
    double log_2;
    uint32_t temp;

    __asm__ volatile(
      "clz    %[log_cnt], %[v]                    \n\t"
      "addiu  %[y],       $zero,      1           \n\t"
      "subu   %[log_cnt], %[c24],     %[log_cnt]  \n\t"
      "sllv   %[y],       %[y],       %[log_cnt]  \n\t"
      "srlv   %[temp],    %[v],       %[log_cnt]  \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    log_2 = kLog2Table[temp] + log_cnt;
    if (v >= APPROX_LOG_MAX) {
      // Since the division is still expensive, add this correction factor
      // only for large values of 'v'.
      const uint32_t correction = (23 * (v & (y - 1))) >> 4;
      log_2 += (double)correction / v;
    }
    return (float)log_2;
  } else {
    return (float)(LOG_2_RECIPROCAL * log((double)v));
  }
}

// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pop = &population[4];
//   const uint32_t* LoopEnd = &population[length];
//   while (pop != LoopEnd) {
//     ++i;
//     cost += i * *pop;
//     cost += i * *(pop + 1);
//     pop += 2;
//   }
//   return (double)cost;
static double ExtraCost(const uint32_t* const population, int length) {
  int i, temp0, temp1;
  const uint32_t* pop = &population[4];
  const uint32_t* const LoopEnd = &population[length];

  __asm__ volatile(
    "mult   $zero,    $zero                  \n\t"
    "xor    %[i],     %[i],       %[i]       \n\t"
    "beq    %[pop],   %[LoopEnd], 2f         \n\t"
  "1:                                        \n\t"
    "lw     %[temp0], 0(%[pop])              \n\t"
    "lw     %[temp1], 4(%[pop])              \n\t"
    "addiu  %[i],     %[i],       1          \n\t"
    "addiu  %[pop],   %[pop],     8          \n\t"
    "madd   %[i],     %[temp0]               \n\t"
    "madd   %[i],     %[temp1]               \n\t"
    "bne    %[pop],   %[LoopEnd], 1b         \n\t"
  "2:                                        \n\t"
    "mfhi   %[temp0]                         \n\t"
    "mflo   %[temp1]                         \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [i]"=&r"(i), [pop]"+r"(pop)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );

  // madd accumulated the products into the 64-bit HI/LO pair; reassemble it.
  return (double)((int64_t)temp0 << 32 | temp1);
}

// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pX = &X[4];
//   const uint32_t* pY = &Y[4];
//   const uint32_t* LoopEnd = &X[length];
//   while (pX != LoopEnd) {
//     const uint32_t xy0 = *pX + *pY;
//     const uint32_t xy1 = *(pX + 1) + *(pY + 1);
//     ++i;
//     cost += i * xy0;
//     cost += i * xy1;
//     pX += 2;
//     pY += 2;
//   }
//   return (double)cost;
static double ExtraCostCombined(const uint32_t* const X,
                                const uint32_t* const Y, int length) {
  int i, temp0, temp1, temp2, temp3;
  const uint32_t* pX = &X[4];
  const uint32_t* pY = &Y[4];
  const uint32_t* const LoopEnd = &X[length];

  __asm__ volatile(
    "mult   $zero,    $zero                  \n\t"
    "xor    %[i],     %[i],       %[i]       \n\t"
    "beq    %[pX],    %[LoopEnd], 2f         \n\t"
  "1:                                        \n\t"
    "lw     %[temp0], 0(%[pX])               \n\t"
    "lw     %[temp1], 0(%[pY])               \n\t"
    "lw     %[temp2], 4(%[pX])               \n\t"
    "lw     %[temp3], 4(%[pY])               \n\t"
    "addiu  %[i],     %[i],       1          \n\t"
    "addu   %[temp0], %[temp0],   %[temp1]   \n\t"
    "addu   %[temp2], %[temp2],   %[temp3]   \n\t"
    "addiu  %[pX],    %[pX],      8          \n\t"
    "addiu  %[pY],    %[pY],      8          \n\t"
    "madd   %[i],     %[temp0]               \n\t"
    "madd   %[i],     %[temp2]               \n\t"
    "bne    %[pX],    %[LoopEnd], 1b         \n\t"
  "2:                                        \n\t"
    "mfhi   %[temp0]                         \n\t"
    "mflo   %[temp1]                         \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
      [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );

  return (double)((int64_t)temp0 << 32 | temp1);
}

#define HUFFMAN_COST_PASS                                 \
  __asm__ volatile(                                       \
    "sll    %[temp1],  %[temp0],    3           \n\t"     \
    "addiu  %[temp3],  %[streak],   -3          \n\t"     \
    "addu   %[temp2],  %[pstreaks], %[temp1]    \n\t"     \
    "blez   %[temp3],  1f                       \n\t"     \
    "srl    %[temp1],  %[temp1],    1           \n\t"     \
    "addu   %[temp3],  %[pcnts],    %[temp1]    \n\t"     \
    "lw     %[temp0],  4(%[temp2])              \n\t"     \
    "lw     %[temp1],  0(%[temp3])              \n\t"     \
    "addu   %[temp0],  %[temp0],    %[streak]   \n\t"     \
    "addiu  %[temp1],  %[temp1],    1           \n\t"     \
    "sw     %[temp0],  4(%[temp2])              \n\t"     \
    "sw     %[temp1],  0(%[temp3])              \n\t"     \
    "b      2f                                  \n\t"     \
  "1:                                           \n\t"     \
    "lw     %[temp0],  0(%[temp2])              \n\t"     \
    "addu   %[temp0],  %[temp0],    %[streak]   \n\t"     \
    "sw     %[temp0],  0(%[temp2])              \n\t"     \
  "2:                                           \n\t"     \
    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),           \
      [temp3]"=&r"(temp3), [temp0]"+r"(temp0)             \
    : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts),         \
      [streak]"r"(streak)                                 \
    : "memory"                                            \
  );
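
// Plain-C sketch of HUFFMAN_COST_PASS (ours, for readability). On entry
// 'temp0' is 0 or 1 depending on whether the previous value was non-zero:
//   if (streak > 3) {
//     stats->streaks[temp0][1] += streak;  // long-streak bucket
//     stats->counts[temp0] += 1;
//   } else {
//     stats->streaks[temp0][0] += streak;  // short-streak bucket
//   }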

// Returns the various RLE counts
static WEBP_INLINE void GetEntropyUnrefinedHelper(
    uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
  int* const pstreaks = &stats->streaks[0][0];
  int* const pcnts = &stats->counts[0];
  int temp0, temp1, temp2, temp3;
  const int streak = i - *i_prev;

  // Gather info for the bit entropy.
  if (*val_prev != 0) {
    bit_entropy->sum += (*val_prev) * streak;
    bit_entropy->nonzeros += streak;
    bit_entropy->nonzero_code = *i_prev;
    bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
    if (bit_entropy->max_val < *val_prev) {
      bit_entropy->max_val = *val_prev;
    }
  }

  // Gather info for the Huffman cost.
  temp0 = (*val_prev != 0);
  HUFFMAN_COST_PASS

  *val_prev = val;
  *i_prev = i;
}

static void GetEntropyUnrefined(const uint32_t X[], int length,
                                VP8LBitEntropy* const bit_entropy,
                                VP8LStreaks* const stats) {
  int i;
  int i_prev = 0;
  uint32_t x_prev = X[0];

  memset(stats, 0, sizeof(*stats));
  VP8LBitEntropyInit(bit_entropy);

  for (i = 1; i < length; ++i) {
    const uint32_t x = X[i];
    if (x != x_prev) {
      GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
    }
  }
  GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);

  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}

static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
                                        int length,
                                        VP8LBitEntropy* const bit_entropy,
                                        VP8LStreaks* const stats) {
  int i = 1;
  int i_prev = 0;
  uint32_t xy_prev = X[0] + Y[0];

  memset(stats, 0, sizeof(*stats));
  VP8LBitEntropyInit(bit_entropy);

  for (i = 1; i < length; ++i) {
    const uint32_t xy = X[i] + Y[i];
    if (xy != xy_prev) {
      GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
    }
  }
  GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);

  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}

#define ASM_START                                         \
  __asm__ volatile(                                       \
    ".set   push                                \n\t"     \
    ".set   at                                  \n\t"     \
    ".set   macro                               \n\t"     \
  "1:                                           \n\t"

// P2 = P0 + P1
// A..D - offsets
// E - temp variable to tell macro if pointer should be incremented
// literal_ and successive histograms could be unaligned
// so we must use ulw and usw
#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)             \
    "ulw    %[temp0],   " #A "(%[" #P0 "])      \n\t"     \
    "ulw    %[temp1],   " #B "(%[" #P0 "])      \n\t"     \
    "ulw    %[temp2],   " #C "(%[" #P0 "])      \n\t"     \
    "ulw    %[temp3],   " #D "(%[" #P0 "])      \n\t"     \
    "ulw    %[temp4],   " #A "(%[" #P1 "])      \n\t"     \
    "ulw    %[temp5],   " #B "(%[" #P1 "])      \n\t"     \
    "ulw    %[temp6],   " #C "(%[" #P1 "])      \n\t"     \
    "ulw    %[temp7],   " #D "(%[" #P1 "])      \n\t"     \
    "addu   %[temp4],   %[temp4],   %[temp0]    \n\t"     \
    "addu   %[temp5],   %[temp5],   %[temp1]    \n\t"     \
    "addu   %[temp6],   %[temp6],   %[temp2]    \n\t"     \
    "addu   %[temp7],   %[temp7],   %[temp3]    \n\t"     \
    "addiu  %[" #P0 "], %[" #P0 "], 16          \n\t"     \
  ".if " #E " == 1                              \n\t"     \
    "addiu  %[" #P1 "], %[" #P1 "], 16          \n\t"     \
  ".endif                                       \n\t"     \
    "usw    %[temp4],   " #A "(%[" #P2 "])      \n\t"     \
    "usw    %[temp5],   " #B "(%[" #P2 "])      \n\t"     \
    "usw    %[temp6],   " #C "(%[" #P2 "])      \n\t"     \
    "usw    %[temp7],   " #D "(%[" #P2 "])      \n\t"     \
    "addiu  %[" #P2 "], %[" #P2 "], 16          \n\t"     \
    "bne    %[" #P0 "], %[LoopEnd], 1b          \n\t"     \
    ".set   pop                                 \n\t"

#define ASM_END_COMMON_0                                  \
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),           \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),           \
      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),           \
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),           \
      [pa]"+r"(pa), [pout]"+r"(pout)

#define ASM_END_COMMON_1                                  \
    : [LoopEnd]"r"(LoopEnd)                               \
    : "memory", "at"                                      \
  );

#define ASM_END_0                                         \
    ASM_END_COMMON_0                                      \
    , [pb]"+r"(pb)                                        \
    ASM_END_COMMON_1

#define ASM_END_1                                         \
    ASM_END_COMMON_0                                      \
    ASM_END_COMMON_1

#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE) do {            \
  const uint32_t* pa = (const uint32_t*)(A);                    \
  const uint32_t* pb = (const uint32_t*)(B);                    \
  uint32_t* pout = (uint32_t*)(OUT);                            \
  const uint32_t* const LoopEnd = pa + (SIZE);                  \
  assert((SIZE) % 4 == 0);                                      \
  ASM_START                                                     \
  ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)                      \
  ASM_END_0                                                     \
  if ((EXTRA_SIZE) > 0) {                                       \
    const int last = (EXTRA_SIZE);                              \
    int i;                                                      \
    for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i];         \
  }                                                             \
} while (0)

#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE) do {            \
  const uint32_t* pa = (const uint32_t*)(A);                    \
  uint32_t* pout = (uint32_t*)(OUT);                            \
  const uint32_t* const LoopEnd = pa + (SIZE);                  \
  assert((SIZE) % 4 == 0);                                      \
  ASM_START                                                     \
  ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)                    \
  ASM_END_1                                                     \
  if ((EXTRA_SIZE) > 0) {                                       \
    const int last = (EXTRA_SIZE);                              \
    int i;                                                      \
    for (i = 0; i < last; ++i) pout[i] += pa[i];                \
  }                                                             \
} while (0)
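
// Plain-C sketch of what ADD_VECTOR expands to (ours, for reference). The
// asm loop consumes SIZE words four at a time and leaves pa/pout advanced
// past them, so the scalar tail above indexes from 0; flattened out it is:
//   for (i = 0; i < (SIZE); ++i) out[i] = a[i] + b[i];
//   for (i = 0; i < (EXTRA_SIZE); ++i)
//     out[(SIZE) + i] = a[(SIZE) + i] + b[(SIZE) + i];
// ADD_VECTOR_EQ is the in-place variant: out[i] += a[i].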

static void HistogramAdd(const VP8LHistogram* const a,
                         const VP8LHistogram* const b,
                         VP8LHistogram* const out) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
                             - (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
  assert(a->palette_code_bits_ == b->palette_code_bits_);

  if (b != out) {
    ADD_VECTOR(a->literal_, b->literal_, out->literal_,
               NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR(a->distance_, b->distance_, out->distance_,
               NUM_DISTANCE_CODES, 0);
    ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  } else {
    ADD_VECTOR_EQ(a->literal_, out->literal_,
                  NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0);
    ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  }
}

#undef ADD_VECTOR_EQ
#undef ADD_VECTOR
#undef ASM_END_1
#undef ASM_END_0
#undef ASM_END_COMMON_1
#undef ASM_END_COMMON_0
#undef ADD_TO_OUT
#undef ASM_START

//------------------------------------------------------------------------------
// Entry point

extern void VP8LEncDspInitMIPS32(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
  VP8LFastSLog2Slow = FastSLog2Slow;
  VP8LFastLog2Slow = FastLog2Slow;
  VP8LExtraCost = ExtraCost;
  VP8LExtraCostCombined = ExtraCostCombined;
  VP8LGetEntropyUnrefined = GetEntropyUnrefined;
  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
  VP8LHistogramAdd = HistogramAdd;
}

#else  // !WEBP_USE_MIPS32

WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)

#endif  // WEBP_USE_MIPS32