dec_mips_dsp_r2.c 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995
  1. // Copyright 2014 Google Inc. All Rights Reserved.
  2. //
  3. // Use of this source code is governed by a BSD-style license
  4. // that can be found in the COPYING file in the root of the source
  5. // tree. An additional intellectual property rights grant can be found
  6. // in the file PATENTS. All contributing project authors may
  7. // be found in the AUTHORS file in the root of the source tree.
  8. // -----------------------------------------------------------------------------
  9. //
  10. // MIPS version of dsp functions
  11. //
  12. // Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
  13. // Jovan Zelincevic (jovan.zelincevic@imgtec.com)
  14. #include "./dsp.h"
  15. #if defined(WEBP_USE_MIPS_DSP_R2)
  16. #include "./mips_macro.h"
  // Fixed-point multipliers for the inverse transforms below. MUL() discards 16
  // fractional bits, so numerically kC1 / 65536 ~= 1.30656 and
  // kC2 / 65536 ~= 0.54120.
  // NOTE(review): these look like sqrt(2)*cos(pi/8) and sqrt(2)*sin(pi/8) --
  // confirm against the VP8 specification.
  17. static const int kC1 = 20091 + (1 << 16);
  18. static const int kC2 = 35468;
  // Multiplies a by b in plain int arithmetic and drops the low 16 bits of the
  // product. Both arguments are parenthesized; each is evaluated exactly once.
  19. #define MUL(a, b) (((a) * (b)) >> 16)
  // Inverse transform for a block whose only coefficient read is in[0].
  //   in:  coefficient array; only in[0] is loaded (the "lh" below).
  //   dst: pixel block that is both read and written, rows BPS bytes apart.
  // NOTE(review): LOAD_WITH_OFFSET_X4, CONVERT_2_BYTES_TO_HALF,
  // STORE_SAT_SUM_X2 and OUTPUT_EARLY_CLOBBER_REGS_10 are defined in
  // mips_macro.h, which is not visible here. From their names and arguments
  // they presumably load four rows of dst, widen the bytes to halfwords, add
  // the correction with saturation and store the rows back -- confirm there.
  20. static void TransformDC(const int16_t* in, uint8_t* dst) {
  21. int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
  22. __asm__ volatile (
  23. LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
  24. 0, 0, 0, 0,
  25. 0, 1, 2, 3,
  26. BPS)
  // temp5 = in[0] + 4 (rounding bias), copied into both halfwords, then each
  // halfword is arithmetically shifted right by 3: two copies of (in[0] + 4) >> 3.
  27. "lh %[temp5], 0(%[in]) \n\t"
  28. "addiu %[temp5], %[temp5], 4 \n\t"
  29. "ins %[temp5], %[temp5], 16, 16 \n\t"
  30. "shra.ph %[temp5], %[temp5], 3 \n\t"
  31. CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
  32. temp3, temp1, temp2, temp3, temp4)
  // The same temp5 value is passed as every one of the eight addend operands.
  33. STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
  34. temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
  35. dst, 0, 1, 2, 3, BPS)
  36. OUTPUT_EARLY_CLOBBER_REGS_10()
  37. : [in]"r"(in), [dst]"r"(dst)
  // "memory": the asm writes through dst, which is only an input operand.
  38. : "memory"
  39. );
  40. }
  41. static void TransformAC3(const int16_t* in, uint8_t* dst) {
  42. const int a = in[0] + 4;
  43. int c4 = MUL(in[4], kC2);
  44. const int d4 = MUL(in[4], kC1);
  45. const int c1 = MUL(in[1], kC2);
  46. const int d1 = MUL(in[1], kC1);
  47. int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  48. int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
  49. __asm__ volatile (
  50. "ins %[c4], %[d4], 16, 16 \n\t"
  51. "replv.ph %[temp1], %[a] \n\t"
  52. "replv.ph %[temp4], %[d1] \n\t"
  53. ADD_SUB_HALVES(temp2, temp3, temp1, c4)
  54. "replv.ph %[temp5], %[c1] \n\t"
  55. SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
  56. temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
  57. LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
  58. 0, 0, 0, 0,
  59. 0, 1, 2, 3,
  60. BPS)
  61. CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
  62. temp11, temp17, temp3, temp5, temp11, temp12)
  63. PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
  64. temp4, temp7, temp6, temp10, temp9)
  65. STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
  66. temp17, temp12, temp18, temp1, temp8, temp2, temp4,
  67. temp7, temp6, dst, 0, 1, 2, 3, BPS)
  68. OUTPUT_EARLY_CLOBBER_REGS_18(),
  69. [c4]"+&r"(c4)
  70. : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
  71. : "memory"
  72. );
  73. }
  // Full inverse transform of one block of coefficients, added into dst.
  //   in:  coefficient array; the asm reads byte offsets 0..31, i.e. in[0..15].
  //   dst: pixel block that is read and written in place, rows BPS bytes apart.
  // kC1 and kC2 are supplied as register operands for the MUL_SHIFT_SUM macro.
  // NOTE(review): LOAD_IN_X2, ADD_SUB_HALVES, MUL_SHIFT_SUM, INSERT_HALF_X2,
  // SRA_16, SHIFT_R_SUM_X2, PACK_2_HALVES_TO_WORD, LOAD_WITH_OFFSET_X4,
  // CONVERT_2_BYTES_TO_HALF, STORE_SAT_SUM_X2 and OUTPUT_EARLY_CLOBBER_REGS_18
  // are defined in mips_macro.h (not visible here); the per-step descriptions
  // below are limited to what the operands show.
  74. static void TransformOne(const int16_t* in, uint8_t* dst) {
  75. int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  76. int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
  77. __asm__ volatile (
  // First pass. Each unaligned word load fetches two adjacent coefficients:
  // offset 0 -> in[0], in[1]; offset 16 -> in[8], in[9].
  78. "ulw %[temp1], 0(%[in]) \n\t"
  79. "ulw %[temp2], 16(%[in]) \n\t"
  80. LOAD_IN_X2(temp5, temp6, 24, 26)
  81. ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
  82. LOAD_IN_X2(temp1, temp2, 8, 10)
  83. MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
  84. temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
  85. temp13, temp11, temp14, temp12)
  86. INSERT_HALF_X2(temp8, temp7, temp10, temp9)
  // Offset 4 -> in[2], in[3]; offset 20 -> in[10], in[11].
  87. "ulw %[temp17], 4(%[in]) \n\t"
  88. "ulw %[temp18], 20(%[in]) \n\t"
  89. ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
  90. ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
  91. ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
  92. LOAD_IN_X2(temp17, temp18, 12, 14)
  93. LOAD_IN_X2(temp9, temp10, 28, 30)
  94. MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
  95. temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
  96. temp15, temp4, temp16, temp17)
  97. INSERT_HALF_X2(temp11, temp12, temp13, temp14)
  98. ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
  99. ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
  // Second pass over the results of the first one.
  100. // horizontal
  101. SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
  102. INSERT_HALF_X2(temp1, temp6, temp5, temp2)
  103. SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
  // temp2 = 4 in both halfwords; it is added to temp1 and temp6 just below as
  // the rounding bias.
  104. "repl.ph %[temp2], 0x4 \n\t"
  105. INSERT_HALF_X2(temp3, temp8, temp17, temp4)
  106. "addq.ph %[temp1], %[temp1], %[temp2] \n\t"
  107. "addq.ph %[temp6], %[temp6], %[temp2] \n\t"
  108. ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
  109. ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
  110. MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
  111. temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
  112. temp6, temp17, temp8, temp18)
  113. MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
  114. temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
  115. temp18, temp12, temp17, temp16)
  116. INSERT_HALF_X2(temp1, temp3, temp9, temp13)
  117. INSERT_HALF_X2(temp6, temp8, temp11, temp15)
  118. SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
  119. temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
  120. temp6)
  121. PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
  122. temp16, temp11, temp10, temp15, temp14)
  // Read the existing pixels from dst (row indices 0..3, scaled by BPS) ...
  123. LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
  124. 0, 0, 0, 0,
  125. 0, 1, 2, 3,
  126. BPS)
  127. CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
  128. temp11, temp10, temp11, temp14, temp15)
  // ... and write the combined result back to the same rows.
  129. STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
  130. temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
  131. dst, 0, 1, 2, 3, BPS)
  132. OUTPUT_EARLY_CLOBBER_REGS_18()
  133. : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
  // "hi"/"lo" are listed as clobbered in addition to memory.
  // NOTE(review): presumably because MUL_SHIFT_SUM uses multiply instructions
  // that write them -- confirm in mips_macro.h.
  134. : "memory", "hi", "lo"
  135. );
  136. }
  // Runs TransformOne() on the block at (in, dst) and, when do_two is non-zero,
  // on a second block whose coefficients start 16 entries further in `in` and
  // whose pixels start 4 bytes further in `dst`.
  static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
    const int num_blocks = do_two ? 2 : 1;
    int n;
    for (n = 0; n < num_blocks; ++n) {
      TransformOne(in + 16 * n, dst + 4 * n);
    }
  }
  // Edge filter that may rewrite up to six pixels around each filtered position.
  //   p:          first position to filter; taps are read at p[-4*hstride] ..
  //               p[3*hstride].
  //   hstride:    byte distance between neighbouring taps (across the edge).
  //   vstride:    byte distance from one filtered position to the next.
  //   size:       number of positions. The counter is decremented at the top
  //               and tested at the bottom, so the body runs at least once.
  //   thresh:     edge limit, used as thresh2 = 2 * thresh + 1.
  //   ithresh:    limit on the absolute difference of adjacent taps.
  //   hev_thresh: threshold above which |p1 - p0| or |q1 - q0| selects the
  //               two-pixel update instead of the six-pixel one.
  // Tap names used in the comments: p3 p2 p1 p0 | q0 q1 q2 q3, with q0 = p[0].
  // ".set noreorder" is active, so the instruction after each branch (written
  // with a leading space) is a delay slot that executes whether or not the
  // branch is taken.
  // NOTE(review): VP8kclip1 is declared elsewhere; it is indexed with possibly
  // negative values and presumably clamps to [0, 255] -- confirm.
  143. static WEBP_INLINE void FilterLoop26(uint8_t* p,
  144. int hstride, int vstride, int size,
  145. int thresh, int ithresh, int hev_thresh) {
  146. const int thresh2 = 2 * thresh + 1;
  147. int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
  148. int temp10, temp11, temp12, temp13, temp14, temp15;
  149. __asm__ volatile (
  150. ".set push \n\t"
  151. ".set noreorder \n\t"
  152. "1: \n\t"
  // Build the offsets -h, 2h, -2h, 3h, -3h, -4h (h = hstride) and load the taps:
  // temp11=p3 temp8=p2 temp9=p1 temp10=p0 temp7=q0 temp12=q1 temp13=q2 temp14=q3.
  153. "negu %[temp1], %[hstride] \n\t"
  154. "addiu %[size], %[size], -1 \n\t"
  155. "sll %[temp2], %[hstride], 1 \n\t"
  156. "sll %[temp3], %[temp1], 1 \n\t"
  157. "addu %[temp4], %[temp2], %[hstride] \n\t"
  158. "addu %[temp5], %[temp3], %[temp1] \n\t"
  159. "lbu %[temp7], 0(%[p]) \n\t"
  160. "sll %[temp6], %[temp3], 1 \n\t"
  161. "lbux %[temp8], %[temp5](%[p]) \n\t"
  162. "lbux %[temp9], %[temp3](%[p]) \n\t"
  163. "lbux %[temp10], %[temp1](%[p]) \n\t"
  164. "lbux %[temp11], %[temp6](%[p]) \n\t"
  165. "lbux %[temp12], %[hstride](%[p]) \n\t"
  166. "lbux %[temp13], %[temp2](%[p]) \n\t"
  167. "lbux %[temp14], %[temp4](%[p]) \n\t"
  // Edge test: skip this position if 4 * |p0 - q0| + |p1 - q1| > thresh2.
  // Kept for later: temp1 = q0 - p0, temp2 = p1 - q1, temp6 = 2 * (q0 - p0).
  168. "subu %[temp1], %[temp10], %[temp7] \n\t"
  169. "subu %[temp2], %[temp9], %[temp12] \n\t"
  170. "absq_s.w %[temp3], %[temp1] \n\t"
  171. "absq_s.w %[temp4], %[temp2] \n\t"
  172. "negu %[temp1], %[temp1] \n\t"
  173. "sll %[temp3], %[temp3], 2 \n\t"
  174. "addu %[temp15], %[temp3], %[temp4] \n\t"
  175. "subu %[temp3], %[temp15], %[thresh2] \n\t"
  176. "sll %[temp6], %[temp1], 1 \n\t"
  177. "bgtz %[temp3], 3f \n\t"
  178. " subu %[temp4], %[temp11], %[temp8] \n\t"
  // Interior tests: skip if any adjacent-tap difference exceeds ithresh.
  // |p3 - p2|; meanwhile temp2 = (p1 - q1) << 24 with saturation.
  179. "absq_s.w %[temp4], %[temp4] \n\t"
  180. "shll_s.w %[temp2], %[temp2], 24 \n\t"
  181. "subu %[temp4], %[temp4], %[ithresh] \n\t"
  182. "bgtz %[temp4], 3f \n\t"
  183. " subu %[temp3], %[temp8], %[temp9] \n\t"
  // |p2 - p1|
  184. "absq_s.w %[temp3], %[temp3] \n\t"
  185. "subu %[temp3], %[temp3], %[ithresh] \n\t"
  186. "bgtz %[temp3], 3f \n\t"
  187. " subu %[temp5], %[temp9], %[temp10] \n\t"
  // |p1 - p0|; a second copy stays in temp5 for the hev_thresh comparison.
  188. "absq_s.w %[temp3], %[temp5] \n\t"
  189. "absq_s.w %[temp5], %[temp5] \n\t"
  190. "subu %[temp3], %[temp3], %[ithresh] \n\t"
  191. "bgtz %[temp3], 3f \n\t"
  192. " subu %[temp3], %[temp14], %[temp13] \n\t"
  // |q3 - q2|; temp5 = (hev_thresh < |p1 - p0|)
  193. "absq_s.w %[temp3], %[temp3] \n\t"
  194. "slt %[temp5], %[hev_thresh], %[temp5] \n\t"
  195. "subu %[temp3], %[temp3], %[ithresh] \n\t"
  196. "bgtz %[temp3], 3f \n\t"
  197. " subu %[temp3], %[temp13], %[temp12] \n\t"
  // |q2 - q1|; temp4 = p1 - q1 clamped to [-128, 127] (saturating << 24, >> 24).
  198. "absq_s.w %[temp3], %[temp3] \n\t"
  199. "sra %[temp4], %[temp2], 24 \n\t"
  200. "subu %[temp3], %[temp3], %[ithresh] \n\t"
  201. "bgtz %[temp3], 3f \n\t"
  202. " subu %[temp15], %[temp12], %[temp7] \n\t"
  // |q1 - q0|; the delay slot sets temp15 = (hev_thresh < |q1 - q0|).
  203. "absq_s.w %[temp3], %[temp15] \n\t"
  204. "absq_s.w %[temp15], %[temp15] \n\t"
  205. "subu %[temp3], %[temp3], %[ithresh] \n\t"
  206. "bgtz %[temp3], 3f \n\t"
  207. " slt %[temp15], %[hev_thresh], %[temp15] \n\t"
  // temp5 = a = 3 * (q0 - p0) + clamp(p1 - q1); temp2 = either hev flag set.
  // No flag set -> six-pixel update at label 4. The delay slot computes
  // temp1 = (a + 4) >> 3 (shra_r.w rounds).
  208. "addu %[temp3], %[temp6], %[temp1] \n\t"
  209. "or %[temp2], %[temp5], %[temp15] \n\t"
  210. "addu %[temp5], %[temp4], %[temp3] \n\t"
  211. "beqz %[temp2], 4f \n\t"
  212. " shra_r.w %[temp1], %[temp5], 3 \n\t"
  // Two-pixel update: temp2 = (a + 3) >> 3; both deltas are clamped to
  // [-16, 15] (saturating << 27, >> 27). Then p0 += temp2 and q0 -= temp1,
  // each passed through the VP8kclip1 lookup before being stored.
  213. "addiu %[temp2], %[temp5], 3 \n\t"
  214. "sra %[temp2], %[temp2], 3 \n\t"
  215. "shll_s.w %[temp1], %[temp1], 27 \n\t"
  216. "shll_s.w %[temp2], %[temp2], 27 \n\t"
  217. "subu %[temp3], %[p], %[hstride] \n\t"
  218. "sra %[temp1], %[temp1], 27 \n\t"
  219. "sra %[temp2], %[temp2], 27 \n\t"
  220. "subu %[temp1], %[temp7], %[temp1] \n\t"
  221. "addu %[temp2], %[temp10], %[temp2] \n\t"
  222. "lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t"
  223. "lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t"
  224. "sb %[temp2], 0(%[temp3]) \n\t"
  225. "j 3f \n\t"
  226. " sb %[temp1], 0(%[p]) \n\t"
  227. "4: \n\t"
  // Six-pixel update: temp6 = a clamped to [-128, 127]; temp2 = 9a,
  // temp3 = 18a, temp4 = 27a, each turned into (x + 63) >> 7.
  // temp14/temp11/temp15 = addresses of p0/p1/p2.
  228. "shll_s.w %[temp5], %[temp5], 24 \n\t"
  229. "subu %[temp14], %[p], %[hstride] \n\t"
  230. "subu %[temp11], %[temp14], %[hstride] \n\t"
  231. "sra %[temp6], %[temp5], 24 \n\t"
  232. "sll %[temp1], %[temp6], 3 \n\t"
  233. "subu %[temp15], %[temp11], %[hstride] \n\t"
  234. "addu %[temp2], %[temp6], %[temp1] \n\t"
  235. "sll %[temp3], %[temp2], 1 \n\t"
  236. "addu %[temp4], %[temp3], %[temp2] \n\t"
  237. "addiu %[temp2], %[temp2], 63 \n\t"
  238. "addiu %[temp3], %[temp3], 63 \n\t"
  239. "addiu %[temp4], %[temp4], 63 \n\t"
  240. "sra %[temp2], %[temp2], 7 \n\t"
  241. "sra %[temp3], %[temp3], 7 \n\t"
  242. "sra %[temp4], %[temp4], 7 \n\t"
  // p2 += temp2, p1 += temp3, p0 += temp4, q0 -= temp4, q1 -= temp3,
  // q2 -= temp2; temp10/temp12 = addresses of q1/q2.
  243. "addu %[temp1], %[temp8], %[temp2] \n\t"
  244. "addu %[temp5], %[temp9], %[temp3] \n\t"
  245. "addu %[temp6], %[temp10], %[temp4] \n\t"
  246. "subu %[temp8], %[temp7], %[temp4] \n\t"
  247. "subu %[temp7], %[temp12], %[temp3] \n\t"
  248. "addu %[temp10], %[p], %[hstride] \n\t"
  249. "subu %[temp9], %[temp13], %[temp2] \n\t"
  250. "addu %[temp12], %[temp10], %[hstride] \n\t"
  // Map every result through VP8kclip1, then store p2, p1, p0, q0, q1, q2.
  251. "lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t"
  252. "lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t"
  253. "lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t"
  254. "lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t"
  255. "lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t"
  256. "lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t"
  257. "sb %[temp2], 0(%[temp15]) \n\t"
  258. "sb %[temp3], 0(%[temp11]) \n\t"
  259. "sb %[temp4], 0(%[temp14]) \n\t"
  260. "sb %[temp5], 0(%[p]) \n\t"
  261. "sb %[temp6], 0(%[temp10]) \n\t"
  262. "sb %[temp8], 0(%[temp12]) \n\t"
  263. "3: \n\t"
  // Next position: loop while size > 0; p += vstride happens in the delay slot.
  264. "bgtz %[size], 1b \n\t"
  265. " addu %[p], %[p], %[vstride] \n\t"
  266. ".set pop \n\t"
  // All temporaries are early-clobber outputs; size and p are read-write.
  267. : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
  268. [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
  269. [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
  270. [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
  271. [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
  272. [size]"+&r"(size), [p]"+&r"(p)
  273. : [hstride]"r"(hstride), [thresh2]"r"(thresh2),
  274. [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
  275. [VP8kclip1]"r"(VP8kclip1)
  276. : "memory"
  277. );
  278. }
  // Edge filter that may rewrite up to four pixels around each filtered position.
  //   p:          first position to filter; taps are read at p[-4*hstride] ..
  //               p[3*hstride].
  //   hstride:    byte distance between neighbouring taps (across the edge).
  //   vstride:    byte distance from one filtered position to the next.
  //   size:       number of positions. A negative size skips the loop entirely;
  //               otherwise the body runs once before size is first tested.
  //   thresh:     edge limit, used as thresh2 = 2 * thresh + 1.
  //   ithresh:    limit on the absolute difference of adjacent taps.
  //   hev_thresh: threshold above which |p1 - p0| or |q1 - q0| selects the
  //               two-pixel update instead of the four-pixel one.
  // Operand names follow the tap layout p3 p2 p1 p0 | q0 q1 q2 q3, q0 = p[0].
  // Several of them (p3, q3, step1) are reused as scratch once their tap value
  // is no longer needed. ".set noreorder" is active: the instruction after each
  // branch (leading space) is a delay slot and always executes.
  // NOTE(review): VP8kclip1 is declared elsewhere; it is indexed with possibly
  // negative values and presumably clamps to [0, 255] -- confirm.
  279. static WEBP_INLINE void FilterLoop24(uint8_t* p,
  280. int hstride, int vstride, int size,
  281. int thresh, int ithresh, int hev_thresh) {
  282. int p0, q0, p1, q1, p2, q2, p3, q3;
  283. int step1, step2, temp1, temp2, temp3, temp4;
  284. uint8_t* pTemp0;
  285. uint8_t* pTemp1;
  286. const int thresh2 = 2 * thresh + 1;
  287. __asm__ volatile (
  288. ".set push \n\t"
  289. ".set noreorder \n\t"
  290. "bltz %[size], 3f \n\t"
  291. " nop \n\t"
  292. "2: \n\t"
  // Load q0, p0, q1, p1 and run the edge test: skip this position when
  // 4 * |p0 - q0| + |p1 - q1| > thresh2. temp1 = p0 - q0 and temp3 = p1 - q1
  // are kept. step1 walks -h, -2h, -3h, ...; step2 walks 2h, 3h.
  293. "negu %[step1], %[hstride] \n\t"
  294. "lbu %[q0], 0(%[p]) \n\t"
  295. "lbux %[p0], %[step1](%[p]) \n\t"
  296. "subu %[step1], %[step1], %[hstride] \n\t"
  297. "lbux %[q1], %[hstride](%[p]) \n\t"
  298. "subu %[temp1], %[p0], %[q0] \n\t"
  299. "lbux %[p1], %[step1](%[p]) \n\t"
  300. "addu %[step2], %[hstride], %[hstride] \n\t"
  301. "absq_s.w %[temp2], %[temp1] \n\t"
  302. "subu %[temp3], %[p1], %[q1] \n\t"
  303. "absq_s.w %[temp4], %[temp3] \n\t"
  304. "sll %[temp2], %[temp2], 2 \n\t"
  305. "addu %[temp2], %[temp2], %[temp4] \n\t"
  306. "subu %[temp4], %[temp2], %[thresh2] \n\t"
  307. "subu %[step1], %[step1], %[hstride] \n\t"
  308. "bgtz %[temp4], 0f \n\t"
  309. " lbux %[p2], %[step1](%[p]) \n\t"
  // Load q2, p3, q3. Interior tests: skip when |p2 - p1| or |p3 - p2| exceeds
  // ithresh. temp1 is negated to q0 - p0 along the way.
  310. "subu %[step1], %[step1], %[hstride] \n\t"
  311. "lbux %[q2], %[step2](%[p]) \n\t"
  312. "lbux %[p3], %[step1](%[p]) \n\t"
  313. "subu %[temp4], %[p2], %[p1] \n\t"
  314. "addu %[step2], %[step2], %[hstride] \n\t"
  315. "subu %[temp2], %[p3], %[p2] \n\t"
  316. "absq_s.w %[temp4], %[temp4] \n\t"
  317. "absq_s.w %[temp2], %[temp2] \n\t"
  318. "lbux %[q3], %[step2](%[p]) \n\t"
  319. "subu %[temp4], %[temp4], %[ithresh] \n\t"
  320. "negu %[temp1], %[temp1] \n\t"
  321. "bgtz %[temp4], 0f \n\t"
  322. " subu %[temp2], %[temp2], %[ithresh] \n\t"
  // From here p3 is scratch: p3 = |p1 - p0|.
  323. "subu %[p3], %[p1], %[p0] \n\t"
  324. "bgtz %[temp2], 0f \n\t"
  325. " absq_s.w %[p3], %[p3] \n\t"
  // Skip when |p1 - p0| or |q3 - q2| exceeds ithresh. pTemp0 = address of p0;
  // step1 = 2 * (q0 - p0).
  326. "subu %[temp4], %[q3], %[q2] \n\t"
  327. "subu %[pTemp0], %[p], %[hstride] \n\t"
  328. "absq_s.w %[temp4], %[temp4] \n\t"
  329. "subu %[temp2], %[p3], %[ithresh] \n\t"
  330. "sll %[step1], %[temp1], 1 \n\t"
  331. "bgtz %[temp2], 0f \n\t"
  332. " subu %[temp4], %[temp4], %[ithresh] \n\t"
  333. "subu %[temp2], %[q2], %[q1] \n\t"
  334. "bgtz %[temp4], 0f \n\t"
  335. " absq_s.w %[temp2], %[temp2] \n\t"
  // From here q3 is scratch: q3 = |q1 - q0|. Skip when |q2 - q1| or |q1 - q0|
  // exceeds ithresh. temp1 becomes 3 * (q0 - p0).
  336. "subu %[q3], %[q1], %[q0] \n\t"
  337. "absq_s.w %[q3], %[q3] \n\t"
  338. "subu %[temp2], %[temp2], %[ithresh] \n\t"
  339. "addu %[temp1], %[temp1], %[step1] \n\t"
  340. "bgtz %[temp2], 0f \n\t"
  341. " subu %[temp4], %[q3], %[ithresh] \n\t"
  // p3 = (hev_thresh < |p1 - p0|), q3 = (hev_thresh < |q1 - q0|).
  342. "slt %[p3], %[hev_thresh], %[p3] \n\t"
  343. "bgtz %[temp4], 0f \n\t"
  344. " slt %[q3], %[hev_thresh], %[q3] \n\t"
  // Either flag set -> two-pixel update at label 1. The delay slot computes
  // temp2 = (temp1 + 4) >> 3 (shra_r.w rounds); it is recomputed at label 1.
  345. "or %[q3], %[q3], %[p3] \n\t"
  346. "bgtz %[q3], 1f \n\t"
  347. " shra_r.w %[temp2], %[temp1], 3 \n\t"
  // Four-pixel update with a = 3 * (q0 - p0):
  //   temp2 = (a + 4) >> 3 and temp1 = (a + 3) >> 3, both clamped to [-16, 15]
  //   (saturating << 27, >> 27); step1 = (temp2 + 1) >> 1.
  //   p0 += temp1, q0 -= temp2, p1 += step1, q1 -= step1, each mapped through
  //   VP8kclip1 and stored. pTemp1 = address of q1; pTemp0 moves from p0 to p1.
  348. "addiu %[temp1], %[temp1], 3 \n\t"
  349. "sra %[temp1], %[temp1], 3 \n\t"
  350. "shll_s.w %[temp2], %[temp2], 27 \n\t"
  351. "shll_s.w %[temp1], %[temp1], 27 \n\t"
  352. "addu %[pTemp1], %[p], %[hstride] \n\t"
  353. "sra %[temp2], %[temp2], 27 \n\t"
  354. "sra %[temp1], %[temp1], 27 \n\t"
  355. "addiu %[step1], %[temp2], 1 \n\t"
  356. "sra %[step1], %[step1], 1 \n\t"
  357. "addu %[p0], %[p0], %[temp1] \n\t"
  358. "addu %[p1], %[p1], %[step1] \n\t"
  359. "subu %[q0], %[q0], %[temp2] \n\t"
  360. "subu %[q1], %[q1], %[step1] \n\t"
  361. "lbux %[temp2], %[p0](%[VP8kclip1]) \n\t"
  362. "lbux %[temp3], %[q0](%[VP8kclip1]) \n\t"
  363. "lbux %[temp4], %[q1](%[VP8kclip1]) \n\t"
  364. "sb %[temp2], 0(%[pTemp0]) \n\t"
  365. "lbux %[temp1], %[p1](%[VP8kclip1]) \n\t"
  366. "subu %[pTemp0], %[pTemp0], %[hstride] \n\t"
  367. "sb %[temp3], 0(%[p]) \n\t"
  368. "sb %[temp4], 0(%[pTemp1]) \n\t"
  369. "j 0f \n\t"
  370. " sb %[temp1], 0(%[pTemp0]) \n\t"
  371. "1: \n\t"
  // Two-pixel update: temp3 = p1 - q1 clamped to [-128, 127];
  // a = 3 * (q0 - p0) + temp3; temp2 = (a + 4) >> 3 and temp1 = (a + 3) >> 3,
  // both clamped to [-16, 15]; p0 += temp1, q0 -= temp2, mapped through
  // VP8kclip1 and stored.
  372. "shll_s.w %[temp3], %[temp3], 24 \n\t"
  373. "sra %[temp3], %[temp3], 24 \n\t"
  374. "addu %[temp1], %[temp1], %[temp3] \n\t"
  375. "shra_r.w %[temp2], %[temp1], 3 \n\t"
  376. "addiu %[temp1], %[temp1], 3 \n\t"
  377. "shll_s.w %[temp2], %[temp2], 27 \n\t"
  378. "sra %[temp1], %[temp1], 3 \n\t"
  379. "shll_s.w %[temp1], %[temp1], 27 \n\t"
  380. "sra %[temp2], %[temp2], 27 \n\t"
  381. "sra %[temp1], %[temp1], 27 \n\t"
  382. "addu %[p0], %[p0], %[temp1] \n\t"
  383. "subu %[q0], %[q0], %[temp2] \n\t"
  384. "lbux %[temp1], %[p0](%[VP8kclip1]) \n\t"
  385. "lbux %[temp2], %[q0](%[VP8kclip1]) \n\t"
  386. "sb %[temp2], 0(%[p]) \n\t"
  387. "sb %[temp1], 0(%[pTemp0]) \n\t"
  388. "0: \n\t"
  // Next position: loop while size > 0; p += vstride happens in the delay slot.
  389. "subu %[size], %[size], 1 \n\t"
  390. "bgtz %[size], 2b \n\t"
  391. " addu %[p], %[p], %[vstride] \n\t"
  392. "3: \n\t"
  393. ".set pop \n\t"
  // All scratch operands are early-clobber outputs; p and size are read-write.
  394. : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
  395. [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
  396. [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
  397. [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
  398. [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
  399. [size]"+&r"(size)
  400. : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
  401. [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
  402. [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
  403. : "memory"
  404. );
  405. }
  // Macroblock-edge filters.
  // VFilter16: filters 16 consecutive bytes starting at p with FilterLoop26;
  // the taps of each filtered position are `stride` bytes apart.
  static void VFilter16(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
    const int tap_step = stride;  // distance between taps of one position
    const int next_step = 1;      // distance to the next filtered position
    const int count = 16;
    FilterLoop26(p, tap_step, next_step, count, thresh, ithresh, hev_thresh);
  }
  // HFilter16: filters 16 positions with FilterLoop26, one every `stride`
  // bytes starting at p; the taps of each position are adjacent bytes.
  static void HFilter16(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
    const int tap_step = 1;        // distance between taps of one position
    const int next_step = stride;  // distance to the next filtered position
    const int count = 16;
    FilterLoop26(p, tap_step, next_step, count, thresh, ithresh, hev_thresh);
  }
  // 8-pixels wide variants, for chroma filtering.
  // VFilter8: runs FilterLoop26 over 8 consecutive bytes of the u plane, then
  // of the v plane; taps are `stride` bytes apart.
  static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                       int thresh, int ithresh, int hev_thresh) {
    uint8_t* const planes[2] = { u, v };
    int i;
    for (i = 0; i < 2; ++i) {
      FilterLoop26(planes[i], stride, 1, 8, thresh, ithresh, hev_thresh);
    }
  }
  // HFilter8: runs FilterLoop26 over 8 positions of the u plane, then of the v
  // plane, one position every `stride` bytes; taps are adjacent bytes.
  static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                       int thresh, int ithresh, int hev_thresh) {
    uint8_t* const planes[2] = { u, v };
    int i;
    for (i = 0; i < 2; ++i) {
      FilterLoop26(planes[i], 1, stride, 8, thresh, ithresh, hev_thresh);
    }
  }
  // Filters on the three inner edges.
  // VFilter16i: calls FilterLoop24 (16 positions, taps `stride` bytes apart)
  // at p + 4 * stride, p + 8 * stride and p + 12 * stride, in that order.
  static void VFilter16i(uint8_t* p, int stride,
                         int thresh, int ithresh, int hev_thresh) {
    int edge;
    for (edge = 1; edge <= 3; ++edge) {
      uint8_t* const row = p + 4 * edge * stride;
      FilterLoop24(row, stride, 1, 16, thresh, ithresh, hev_thresh);
    }
  }
  // HFilter16i: calls FilterLoop24 (16 positions, one every `stride` bytes,
  // taps on adjacent bytes) at p + 4, p + 8 and p + 12, in that order.
  static void HFilter16i(uint8_t* p, int stride,
                         int thresh, int ithresh, int hev_thresh) {
    int edge;
    for (edge = 1; edge <= 3; ++edge) {
      uint8_t* const col = p + 4 * edge;
      FilterLoop24(col, 1, stride, 16, thresh, ithresh, hev_thresh);
    }
  }
  // VFilter8i: runs FilterLoop24 over 8 consecutive bytes starting 4 rows
  // (4 * stride bytes) into the u plane, then the same for the v plane.
  static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                        int thresh, int ithresh, int hev_thresh) {
    const int offset = 4 * stride;
    uint8_t* const planes[2] = { u, v };
    int i;
    for (i = 0; i < 2; ++i) {
      FilterLoop24(planes[i] + offset, stride, 1, 8,
                   thresh, ithresh, hev_thresh);
    }
  }
  // HFilter8i: runs FilterLoop24 over 8 positions (one every `stride` bytes)
  // starting 4 bytes into the u plane, then the same for the v plane.
  static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                        int thresh, int ithresh, int hev_thresh) {
    const int offset = 4;
    uint8_t* const planes[2] = { u, v };
    int i;
    for (i = 0; i < 2; ++i) {
      FilterLoop24(planes[i] + offset, 1, stride, 8,
                   thresh, ithresh, hev_thresh);
    }
  }
  453. #undef MUL
  454. //------------------------------------------------------------------------------
  455. // Simple In-loop filtering (Paragraph 15.2)
  // Simple filter applied to 16 consecutive bytes starting at p.
  //   p:      first position; taps are read at p[-2*stride], p[-stride], p[0]
  //           and p[stride] (called p1, p0, q0, q1 in the comments below).
  //   stride: byte distance between taps.
  //   thresh: edge limit, used as thresh2 = 2 * thresh + 1.
  // Only p0 and q0 are ever rewritten. The local pointer p1 tracks p - stride
  // and is the store address for p0.
  // ".set noreorder" is active: the instruction after each branch (leading
  // space) is a delay slot and always executes.
  // NOTE(review): VP8kclip1 is declared elsewhere; it is indexed with possibly
  // negative values and presumably clamps to [0, 255] -- confirm.
  456. static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  457. int i;
  458. const int thresh2 = 2 * thresh + 1;
  459. int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  460. uint8_t* p1 = p - stride;
  461. __asm__ volatile (
  462. ".set push \n\t"
  463. ".set noreorder \n\t"
  // i is the loop counter; it is initialised inside the asm (output-only).
  464. "li %[i], 16 \n\t"
  465. "0: \n\t"
  // temp0 = p1, temp1 = p0, temp2 = q0, temp3 = q1.
  466. "negu %[temp4], %[stride] \n\t"
  467. "sll %[temp5], %[temp4], 1 \n\t"
  468. "lbu %[temp2], 0(%[p]) \n\t"
  469. "lbux %[temp3], %[stride](%[p]) \n\t"
  470. "lbux %[temp1], %[temp4](%[p]) \n\t"
  471. "lbux %[temp0], %[temp5](%[p]) \n\t"
  // Skip this position when 4 * |p0 - q0| + |p1 - q1| > thresh2.
  // temp8 = q0 - p0, temp6 = p1 - q1. The counter is decremented in the
  // delay slot of the branch.
  472. "subu %[temp7], %[temp1], %[temp2] \n\t"
  473. "subu %[temp6], %[temp0], %[temp3] \n\t"
  474. "absq_s.w %[temp4], %[temp7] \n\t"
  475. "absq_s.w %[temp5], %[temp6] \n\t"
  476. "sll %[temp4], %[temp4], 2 \n\t"
  477. "subu %[temp5], %[temp5], %[thresh2] \n\t"
  478. "addu %[temp5], %[temp4], %[temp5] \n\t"
  479. "negu %[temp8], %[temp7] \n\t"
  480. "bgtz %[temp5], 1f \n\t"
  481. " addiu %[i], %[i], -1 \n\t"
  // temp3 = a = 3 * (q0 - p0) + (p1 - q1 clamped to [-128, 127]).
  482. "sll %[temp4], %[temp8], 1 \n\t"
  483. "shll_s.w %[temp5], %[temp6], 24 \n\t"
  484. "addu %[temp3], %[temp4], %[temp8] \n\t"
  485. "sra %[temp5], %[temp5], 24 \n\t"
  486. "addu %[temp3], %[temp3], %[temp5] \n\t"
  // temp0 = (a + 3) >> 3 and temp4 = (a + 4) >> 3, both clamped to [-16, 15].
  487. "addiu %[temp7], %[temp3], 3 \n\t"
  488. "sra %[temp7], %[temp7], 3 \n\t"
  489. "shra_r.w %[temp8], %[temp3], 3 \n\t"
  490. "shll_s.w %[temp0], %[temp7], 27 \n\t"
  491. "shll_s.w %[temp4], %[temp8], 27 \n\t"
  492. "sra %[temp0], %[temp0], 27 \n\t"
  493. "sra %[temp4], %[temp4], 27 \n\t"
  // p0 += temp0, q0 -= temp4; map both through VP8kclip1 and store.
  494. "addu %[temp7], %[temp1], %[temp0] \n\t"
  495. "subu %[temp2], %[temp2], %[temp4] \n\t"
  496. "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
  497. "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
  498. "sb %[temp3], 0(%[p1]) \n\t"
  499. "sb %[temp4], 0(%[p]) \n\t"
  500. "1: \n\t"
  // Advance both pointers by one byte and loop while i > 0.
  501. "addiu %[p1], %[p1], 1 \n\t"
  502. "bgtz %[i], 0b \n\t"
  503. " addiu %[p], %[p], 1 \n\t"
  504. " .set pop \n\t"
  505. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
  506. [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  507. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
  508. [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
  509. : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
  510. : "memory"
  511. );
  512. }
  // Emits four "lbu" (zero-extending byte load) instructions as adjacent string
  // literals, for pasting into an __asm__ template:
  //   TEMP0 = SRC[A + A1 * BPS]
  //   TEMP1 = SRC[B + B1 * BPS]
  //   TEMP2 = SRC[C + C1 * BPS]
  //   TEMP3 = SRC[D + D1 * BPS]
  // TEMPn and SRC are asm operand names; A..D1 are literal integers that are
  // stringized into the offset expression, and BPS is expanded through XSTR().
  // The last line carries no line-continuation backslash: with one, the macro
  // would absorb whatever line follows it (here, a function signature).
  #define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3, \
                       A, A1, B, B1, C, C1, D, D1, SRC) \
    "lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
    "lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
    "lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
    "lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
  // Simple filter applied to 16 positions, one every `stride` bytes from p.
  //   p:      first position; taps are the adjacent bytes p[-2], p[-1], p[0]
  //           and p[1] (called p1, p0, q0, q1 in the comments below).
  //   stride: byte distance between filtered positions.
  //   thresh: edge limit, used as thresh2 = 2 * thresh + 1.
  // Only p[-1] and p[0] are ever rewritten.
  // ".set noreorder" is active: the instruction after each branch (leading
  // space) is a delay slot and always executes.
  // NOTE(review): VP8kclip1 is declared elsewhere; it is indexed with possibly
  // negative values and presumably clamps to [0, 255] -- confirm.
  523. static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  524. int i;
  525. const int thresh2 = 2 * thresh + 1;
  526. int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
  527. __asm__ volatile (
  528. ".set push \n\t"
  529. ".set noreorder \n\t"
  // i is the loop counter; it is initialised inside the asm (output-only).
  530. "li %[i], 16 \n\t"
  531. "0: \n\t"
  // temp0 = p[-2], temp1 = p[-1], temp2 = p[0], temp3 = p[1] (the BPS
  // multipliers passed to the macro are all 0).
  532. LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
  // Skip this position when 4 * |p0 - q0| + |p1 - q1| > thresh2.
  // temp8 = q0 - p0, temp6 = p1 - q1. The counter is decremented in the
  // delay slot of the branch.
  533. "subu %[temp7], %[temp1], %[temp2] \n\t"
  534. "subu %[temp6], %[temp0], %[temp3] \n\t"
  535. "absq_s.w %[temp4], %[temp7] \n\t"
  536. "absq_s.w %[temp5], %[temp6] \n\t"
  537. "sll %[temp4], %[temp4], 2 \n\t"
  538. "addu %[temp5], %[temp4], %[temp5] \n\t"
  539. "subu %[temp5], %[temp5], %[thresh2] \n\t"
  540. "negu %[temp8], %[temp7] \n\t"
  541. "bgtz %[temp5], 1f \n\t"
  542. " addiu %[i], %[i], -1 \n\t"
  // temp3 = a = 3 * (q0 - p0) + (p1 - q1 clamped to [-128, 127]).
  543. "sll %[temp4], %[temp8], 1 \n\t"
  544. "shll_s.w %[temp5], %[temp6], 24 \n\t"
  545. "addu %[temp3], %[temp4], %[temp8] \n\t"
  546. "sra %[temp5], %[temp5], 24 \n\t"
  547. "addu %[temp3], %[temp3], %[temp5] \n\t"
  // temp0 = (a + 3) >> 3 and temp4 = (a + 4) >> 3, both clamped to [-16, 15].
  548. "addiu %[temp7], %[temp3], 3 \n\t"
  549. "sra %[temp7], %[temp7], 3 \n\t"
  550. "shra_r.w %[temp8], %[temp3], 3 \n\t"
  551. "shll_s.w %[temp0], %[temp7], 27 \n\t"
  552. "shll_s.w %[temp4], %[temp8], 27 \n\t"
  553. "sra %[temp0], %[temp0], 27 \n\t"
  554. "sra %[temp4], %[temp4], 27 \n\t"
  // p0 += temp0, q0 -= temp4; map both through VP8kclip1 and store.
  555. "addu %[temp7], %[temp1], %[temp0] \n\t"
  556. "subu %[temp2], %[temp2], %[temp4] \n\t"
  557. "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
  558. "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
  559. "sb %[temp3], -1(%[p]) \n\t"
  560. "sb %[temp4], 0(%[p]) \n\t"
  561. "1: \n\t"
  // Loop while i > 0; p += stride happens in the delay slot.
  562. "bgtz %[i], 0b \n\t"
  563. " addu %[p], %[p], %[stride] \n\t"
  564. ".set pop \n\t"
  565. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
  566. [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  567. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
  568. [p]"+&r"(p), [i]"=&r"(i)
  569. : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
  570. : "memory"
  571. );
  572. }
  // Applies SimpleVFilter16 at p + 4 * stride, p + 8 * stride and
  // p + 12 * stride, in that order.
  static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
    int edge;
    for (edge = 1; edge <= 3; ++edge) {
      uint8_t* const row = p + 4 * edge * stride;
      SimpleVFilter16(row, stride, thresh);
    }
  }
  // Applies SimpleHFilter16 at p + 4, p + 8 and p + 12, in that order.
  static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
    int edge;
    for (edge = 1; edge <= 3; ++edge) {
      uint8_t* const col = p + 4 * edge;
      SimpleHFilter16(col, stride, thresh);
    }
  }
  // Emits two "usw" (unaligned 32-bit store) instructions as adjacent string
  // literals, for pasting into an __asm__ template. Each store writes 4 bytes,
  // 8 bytes in total:
  587. // DST[A * BPS] = TEMP0
  588. // DST[B + C * BPS] = TEMP1
  // TEMP0, TEMP1 and DST are asm operand names; A, B and C are literal integers
  // stringized into the offset expression, with BPS expanded through XSTR().
  589. #define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST) \
  590. "usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t" \
  591. "usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"
  // 4x4 predictor that builds one 4-byte row from the six bytes
  // top[-1] .. top[4] (top = dst - BPS) and writes that same row to rows 0..3
  // of dst. Each output byte is a rounded (a + 2 * b + c + 2) >> 2 over three
  // neighbouring top bytes.
  // NOTE(review): which top bytes land in which halfword depends on byte
  // order; the per-lane reading assumes little-endian -- confirm for the
  // target.
  592. static void VE4(uint8_t* dst) { // vertical
  593. const uint8_t* top = dst - BPS;
  594. int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  595. __asm__ volatile (
  // temp0 = the four bytes top[-1..2]; temp1 = the two bytes top[3..4].
  596. "ulw %[temp0], -1(%[top]) \n\t"
  597. "ulh %[temp1], 3(%[top]) \n\t"
  // Widen the bytes to 16-bit lanes (two per register).
  598. "preceu.ph.qbr %[temp2], %[temp0] \n\t"
  599. "preceu.ph.qbl %[temp3], %[temp0] \n\t"
  600. "preceu.ph.qbr %[temp4], %[temp1] \n\t"
  // temp5/temp6 = lane pairs shifted by one byte position: the "centre" taps,
  // doubled by the shifts below.
  601. "packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
  602. "packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
  603. "shll.ph %[temp5], %[temp5], 1 \n\t"
  604. "shll.ph %[temp6], %[temp6], 1 \n\t"
  // Add the two outer taps to each doubled centre, then round-shift by 2.
  605. "addq.ph %[temp2], %[temp5], %[temp2] \n\t"
  606. "addq.ph %[temp6], %[temp6], %[temp4] \n\t"
  607. "addq.ph %[temp2], %[temp2], %[temp3] \n\t"
  608. "addq.ph %[temp6], %[temp6], %[temp3] \n\t"
  609. "shra_r.ph %[temp2], %[temp2], 2 \n\t"
  610. "shra_r.ph %[temp6], %[temp6], 2 \n\t"
  // Pack the four 16-bit results back into four bytes and store to all rows.
  611. "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
  612. STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
  613. STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
  614. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
  615. [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  616. [temp6]"=&r"(temp6)
  617. : [top]"r"(top), [dst]"r"(dst)
  618. : "memory"
  619. );
  620. }
  // 4x4 predictor that fills the block at dst with one value: the rounded
  // average, (sum + 4) >> 3, of the four bytes of the row above
  // (dst[-BPS .. -BPS + 3]) and the four bytes of the column to the left
  // (dst[-1 + k * BPS], k = 0..3).
  621. static void DC4(uint8_t* dst) { // DC
  622. int temp0, temp1, temp2, temp3, temp4;
  623. __asm__ volatile (
  // temp0 = the four top bytes; temp1..temp4 = the four left bytes.
  624. "ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t"
  625. LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
  // Pack the left bytes into one word so a single raddu.w.qb can sum them.
  626. "ins %[temp1], %[temp2], 8, 8 \n\t"
  627. "ins %[temp1], %[temp3], 16, 8 \n\t"
  628. "ins %[temp1], %[temp4], 24, 8 \n\t"
  // raddu.w.qb adds the four unsigned bytes of a word.
  629. "raddu.w.qb %[temp0], %[temp0] \n\t"
  630. "raddu.w.qb %[temp1], %[temp1] \n\t"
  631. "addu %[temp0], %[temp0], %[temp1] \n\t"
  // Rounded shift by 3, then replicate the resulting byte into all four lanes.
  632. "shra_r.w %[temp0], %[temp0], 3 \n\t"
  633. "replv.qb %[temp0], %[temp0] \n\t"
  // Write the replicated word to rows 0, 1, 2 and 3.
  634. STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
  635. STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
  636. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
  637. [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
  638. : [dst]"r"(dst)
  639. : "memory"
  640. );
  641. }
  // 4x4 predictor built from the left column (dst[-1 + k * BPS], k = 0..3),
  // the four bytes starting at the top-left corner (dst[-1 - BPS] ..
  // dst[2 - BPS]) and the byte dst[3 - BPS]. Rows 3 and 1 are stored first,
  // then rows 2 and 0.
  // NOTE(review): the arithmetic is a series of packed-halfword sums with
  // rounded >> 2 shifts, i.e. 3-tap (a + 2b + c + 2) >> 2 averages; the exact
  // lane assignment depends on byte order and was not traced here -- confirm
  // against the C reference before relying on details.
  642. static void RD4(uint8_t* dst) { // Down-right
  643. int temp0, temp1, temp2, temp3, temp4;
  644. int temp5, temp6, temp7, temp8;
  645. __asm__ volatile (
  // temp0..temp3 = left-column bytes for rows 0..3; temp7 = four bytes
  // starting at the top-left corner.
  646. LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
  647. "ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t"
  // Pair neighbouring left-column bytes into halfword lanes and widen the top
  // bytes (temp5 = low pair, temp4 = high pair of temp7).
  648. "ins %[temp1], %[temp0], 16, 16 \n\t"
  649. "preceu.ph.qbr %[temp5], %[temp7] \n\t"
  650. "ins %[temp2], %[temp1], 16, 16 \n\t"
  651. "preceu.ph.qbl %[temp4], %[temp7] \n\t"
  652. "ins %[temp3], %[temp2], 16, 16 \n\t"
  653. "shll.ph %[temp2], %[temp2], 1 \n\t"
  654. "addq.ph %[temp3], %[temp3], %[temp1] \n\t"
  655. "packrl.ph %[temp6], %[temp5], %[temp1] \n\t"
  656. "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
  657. "addq.ph %[temp1], %[temp1], %[temp5] \n\t"
  658. "shll.ph %[temp6], %[temp6], 1 \n\t"
  659. "addq.ph %[temp1], %[temp1], %[temp6] \n\t"
  660. "packrl.ph %[temp0], %[temp4], %[temp5] \n\t"
  661. "addq.ph %[temp8], %[temp5], %[temp4] \n\t"
  662. "shra_r.ph %[temp3], %[temp3], 2 \n\t"
  663. "shll.ph %[temp0], %[temp0], 1 \n\t"
  664. "shra_r.ph %[temp1], %[temp1], 2 \n\t"
  665. "addq.ph %[temp8], %[temp0], %[temp8] \n\t"
  // temp5 is reloaded with the byte dst[3 - BPS]; it replaces the low byte of
  // temp7 before the byte-sum (raddu.w.qb) and rounded >> 2 that produce temp4.
  666. "lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t"
  667. "precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t"
  668. "shra_r.ph %[temp8], %[temp8], 2 \n\t"
  669. "ins %[temp7], %[temp5], 0, 8 \n\t"
  670. "precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t"
  671. "raddu.w.qb %[temp4], %[temp7] \n\t"
  672. "precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t"
  673. "shra_r.w %[temp4], %[temp4], 2 \n\t"
  // temp2 -> row 3, temp6 -> row 1.
  674. STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
  // Shift each row word by one byte, bringing in the next predicted byte, to
  // form rows 2 and 0.
  675. "prepend %[temp2], %[temp8], 8 \n\t"
  676. "prepend %[temp6], %[temp4], 8 \n\t"
  // temp2 -> row 2, temp6 -> row 0.
  677. STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
  678. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
  679. [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  680. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
  681. : [dst]"r"(dst)
  682. : "memory"
  683. );
  684. }
  // Asm-template helper: two unaligned 32-bit loads (ulw), 8 bytes in total.
  //   OUT0 <- word at SRC + ROW0 * BPS
  //   OUT1 <- word at SRC + COL1 + ROW1 * BPS
  // OUT0, OUT1 and SRC are asm operand names; ROW0, COL1 and ROW1 are pasted
  // textually into the address expression, so they must be assembler
  // constants.
  #define LOAD_8_BYTES(OUT0, OUT1, ROW0, COL1, ROW1, SRC) \
    "ulw %[" #OUT0 "], " #ROW0 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
    "ulw %[" #OUT1 "], " #COL1 "+" #ROW1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
  690. static void LD4(uint8_t* dst) { // Down-Left
  691. int temp0, temp1, temp2, temp3, temp4;
  692. int temp5, temp6, temp7, temp8, temp9;
  693. __asm__ volatile (
  694. LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
  695. "preceu.ph.qbl %[temp2], %[temp0] \n\t"
  696. "preceu.ph.qbr %[temp3], %[temp0] \n\t"
  697. "preceu.ph.qbr %[temp4], %[temp1] \n\t"
  698. "preceu.ph.qbl %[temp5], %[temp1] \n\t"
  699. "packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
  700. "packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
  701. "packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
  702. "shll.ph %[temp6], %[temp6], 1 \n\t"
  703. "addq.ph %[temp9], %[temp2], %[temp6] \n\t"
  704. "shll.ph %[temp7], %[temp7], 1 \n\t"
  705. "addq.ph %[temp9], %[temp9], %[temp3] \n\t"
  706. "shll.ph %[temp8], %[temp8], 1 \n\t"
  707. "shra_r.ph %[temp9], %[temp9], 2 \n\t"
  708. "addq.ph %[temp3], %[temp4], %[temp7] \n\t"
  709. "addq.ph %[temp0], %[temp5], %[temp8] \n\t"
  710. "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
  711. "addq.ph %[temp0], %[temp0], %[temp4] \n\t"
  712. "shra_r.ph %[temp3], %[temp3], 2 \n\t"
  713. "shra_r.ph %[temp0], %[temp0], 2 \n\t"
  714. "srl %[temp1], %[temp1], 24 \n\t"
  715. "sll %[temp1], %[temp1], 1 \n\t"
  716. "raddu.w.qb %[temp5], %[temp5] \n\t"
  717. "precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t"
  718. "precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t"
  719. "addu %[temp1], %[temp1], %[temp5] \n\t"
  720. "shra_r.w %[temp1], %[temp1], 2 \n\t"
  721. STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
  722. "prepend %[temp9], %[temp0], 8 \n\t"
  723. "prepend %[temp3], %[temp1], 8 \n\t"
  724. STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
  725. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
  726. [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  727. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
  728. [temp9]"=&r"(temp9)
  729. : [dst]"r"(dst)
  730. : "memory"
  731. );
  732. }
  733. //------------------------------------------------------------------------------
  734. // Chroma
  735. static void DC8uv(uint8_t* dst) { // DC
  736. int temp0, temp1, temp2, temp3, temp4;
  737. int temp5, temp6, temp7, temp8, temp9;
  738. __asm__ volatile (
  739. LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
  740. LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
  741. LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
  742. "raddu.w.qb %[temp0], %[temp0] \n\t"
  743. "raddu.w.qb %[temp1], %[temp1] \n\t"
  744. "addu %[temp2], %[temp2], %[temp3] \n\t"
  745. "addu %[temp4], %[temp4], %[temp5] \n\t"
  746. "addu %[temp6], %[temp6], %[temp7] \n\t"
  747. "addu %[temp8], %[temp8], %[temp9] \n\t"
  748. "addu %[temp0], %[temp0], %[temp1] \n\t"
  749. "addu %[temp2], %[temp2], %[temp4] \n\t"
  750. "addu %[temp6], %[temp6], %[temp8] \n\t"
  751. "addu %[temp0], %[temp0], %[temp2] \n\t"
  752. "addu %[temp0], %[temp0], %[temp6] \n\t"
  753. "shra_r.w %[temp0], %[temp0], 4 \n\t"
  754. "replv.qb %[temp0], %[temp0] \n\t"
  755. STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
  756. STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
  757. STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
  758. STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
  759. STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
  760. STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
  761. STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
  762. STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
  763. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
  764. [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  765. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
  766. [temp9]"=&r"(temp9)
  767. : [dst]"r"(dst)
  768. : "memory"
  769. );
  770. }
  771. static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
  772. int temp0, temp1;
  773. __asm__ volatile (
  774. LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
  775. "raddu.w.qb %[temp0], %[temp0] \n\t"
  776. "raddu.w.qb %[temp1], %[temp1] \n\t"
  777. "addu %[temp0], %[temp0], %[temp1] \n\t"
  778. "shra_r.w %[temp0], %[temp0], 3 \n\t"
  779. "replv.qb %[temp0], %[temp0] \n\t"
  780. STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
  781. STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
  782. STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
  783. STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
  784. STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
  785. STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
  786. STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
  787. STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
  788. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
  789. : [dst]"r"(dst)
  790. : "memory"
  791. );
  792. }
  793. static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
  794. int temp0, temp1, temp2, temp3, temp4;
  795. int temp5, temp6, temp7, temp8;
  796. __asm__ volatile (
  797. LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
  798. LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
  799. "addu %[temp2], %[temp2], %[temp3] \n\t"
  800. "addu %[temp4], %[temp4], %[temp5] \n\t"
  801. "addu %[temp6], %[temp6], %[temp7] \n\t"
  802. "addu %[temp8], %[temp8], %[temp1] \n\t"
  803. "addu %[temp2], %[temp2], %[temp4] \n\t"
  804. "addu %[temp6], %[temp6], %[temp8] \n\t"
  805. "addu %[temp0], %[temp6], %[temp2] \n\t"
  806. "shra_r.w %[temp0], %[temp0], 3 \n\t"
  807. "replv.qb %[temp0], %[temp0] \n\t"
  808. STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
  809. STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
  810. STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
  811. STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
  812. STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
  813. STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
  814. STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
  815. STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
  816. : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
  817. [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
  818. [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
  819. : [dst]"r"(dst)
  820. : "memory"
  821. );
  822. }
  // The load/store asm-template helpers are not referenced by anything below
  // this point, so drop them here.
  #undef LOAD_4_BYTES
  #undef LOAD_8_BYTES
  #undef STORE_8_BYTES
  826. #define CLIPPING(SIZE) \
  827. "preceu.ph.qbl %[temp2], %[temp0] \n\t" \
  828. "preceu.ph.qbr %[temp0], %[temp0] \n\t" \
  829. ".if " #SIZE " == 8 \n\t" \
  830. "preceu.ph.qbl %[temp3], %[temp1] \n\t" \
  831. "preceu.ph.qbr %[temp1], %[temp1] \n\t" \
  832. ".endif \n\t" \
  833. "addu.ph %[temp2], %[temp2], %[dst_1] \n\t" \
  834. "addu.ph %[temp0], %[temp0], %[dst_1] \n\t" \
  835. ".if " #SIZE " == 8 \n\t" \
  836. "addu.ph %[temp3], %[temp3], %[dst_1] \n\t" \
  837. "addu.ph %[temp1], %[temp1], %[dst_1] \n\t" \
  838. ".endif \n\t" \
  839. "shll_s.ph %[temp2], %[temp2], 7 \n\t" \
  840. "shll_s.ph %[temp0], %[temp0], 7 \n\t" \
  841. ".if " #SIZE " == 8 \n\t" \
  842. "shll_s.ph %[temp3], %[temp3], 7 \n\t" \
  843. "shll_s.ph %[temp1], %[temp1], 7 \n\t" \
  844. ".endif \n\t" \
  845. "precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \
  846. ".if " #SIZE " == 8 \n\t" \
  847. "precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t" \
  848. ".endif \n\t"
  849. #define CLIP_8B_TO_DST(DST, TOP, SIZE) do { \
  850. int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1]; \
  851. int temp0, temp1, temp2, temp3; \
  852. __asm__ volatile ( \
  853. ".if " #SIZE " < 8 \n\t" \
  854. "ulw %[temp0], 0(%[top]) \n\t" \
  855. "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
  856. CLIPPING(4) \
  857. "usw %[temp0], 0(%[dst]) \n\t" \
  858. ".else \n\t" \
  859. "ulw %[temp0], 0(%[top]) \n\t" \
  860. "ulw %[temp1], 4(%[top]) \n\t" \
  861. "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
  862. CLIPPING(8) \
  863. "usw %[temp0], 0(%[dst]) \n\t" \
  864. "usw %[temp1], 4(%[dst]) \n\t" \
  865. ".if " #SIZE " == 16 \n\t" \
  866. "ulw %[temp0], 8(%[top]) \n\t" \
  867. "ulw %[temp1], 12(%[top]) \n\t" \
  868. CLIPPING(8) \
  869. "usw %[temp0], 8(%[dst]) \n\t" \
  870. "usw %[temp1], 12(%[dst]) \n\t" \
  871. ".endif \n\t" \
  872. ".endif \n\t" \
  873. : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
  874. [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
  875. : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST)) \
  876. : "memory" \
  877. ); \
  878. } while (0)
  879. #define CLIP_TO_DST(DST, SIZE) do { \
  880. int y; \
  881. const uint8_t* top = (DST) - BPS; \
  882. const int top_1 = ((int)top[-1] << 16) + top[-1]; \
  883. for (y = 0; y < (SIZE); ++y) { \
  884. CLIP_8B_TO_DST((DST), top, (SIZE)); \
  885. (DST) += BPS; \
  886. } \
  887. } while (0)
  888. #define TRUE_MOTION(DST, SIZE) \
  889. static void TrueMotion##SIZE(uint8_t* (DST)) { \
  890. CLIP_TO_DST((DST), (SIZE)); \
  891. }
  892. TRUE_MOTION(dst, 4)
  893. TRUE_MOTION(dst, 8)
  894. TRUE_MOTION(dst, 16)
  895. #undef TRUE_MOTION
  896. #undef CLIP_TO_DST
  897. #undef CLIP_8B_TO_DST
  898. #undef CLIPPING
  899. //------------------------------------------------------------------------------
  900. // Entry point
  901. extern void VP8DspInitMIPSdspR2(void);
  902. WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
  903. VP8TransformDC = TransformDC;
  904. VP8TransformAC3 = TransformAC3;
  905. VP8Transform = TransformTwo;
  906. VP8VFilter16 = VFilter16;
  907. VP8HFilter16 = HFilter16;
  908. VP8VFilter8 = VFilter8;
  909. VP8HFilter8 = HFilter8;
  910. VP8VFilter16i = VFilter16i;
  911. VP8HFilter16i = HFilter16i;
  912. VP8VFilter8i = VFilter8i;
  913. VP8HFilter8i = HFilter8i;
  914. VP8SimpleVFilter16 = SimpleVFilter16;
  915. VP8SimpleHFilter16 = SimpleHFilter16;
  916. VP8SimpleVFilter16i = SimpleVFilter16i;
  917. VP8SimpleHFilter16i = SimpleHFilter16i;
  918. VP8PredLuma4[0] = DC4;
  919. VP8PredLuma4[1] = TrueMotion4;
  920. VP8PredLuma4[2] = VE4;
  921. VP8PredLuma4[4] = RD4;
  922. VP8PredLuma4[6] = LD4;
  923. VP8PredChroma8[0] = DC8uv;
  924. VP8PredChroma8[1] = TrueMotion8;
  925. VP8PredChroma8[4] = DC8uvNoTop;
  926. VP8PredChroma8[5] = DC8uvNoLeft;
  927. VP8PredLuma16[1] = TrueMotion16;
  928. }
  929. #else // !WEBP_USE_MIPS_DSP_R2
  930. WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
  931. #endif // WEBP_USE_MIPS_DSP_R2