/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <math.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1);
    op[1] = WRAPLOW(b1);
    op[2] = WRAPLOW(c1);
    op[3] = WRAPLOW(d1);
    ip += 4;
    op += 4;
  }

  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
    ip++;
    dest++;
  }
}
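/* Usage sketch (a hypothetical caller, not part of this file): given the 16
 * dequantized lossless-mode coefficients in `coeffs` and the 4x4 prediction
 * block at `rec` with row stride `rec_stride`,
 *
 *   vpx_iwht4x4_16_add_c(coeffs, rec, rec_stride);
 *
 * applies the inverse Walsh-Hadamard transform and adds the residual to the
 * prediction in place, clipping each result to the 8-bit pixel range.
 */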
void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1);
  op[1] = op[2] = op[3] = WRAPLOW(e1);

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
    ip++;
    dest++;
  }
}
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;

  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1));
  step[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1));
  step[3] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  output[0] = WRAPLOW(step[0] + step[3]);
  output[1] = WRAPLOW(step[1] + step[2]);
  output[2] = WRAPLOW(step[1] - step[2]);
  output[3] = WRAPLOW(step[0] - step[3]);
}
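/* Note on the fixed-point arithmetic above (background, not new behavior):
 * the cospi_N_64 constants are cos(N * pi / 64) scaled by 2^14, e.g.
 * cospi_16_64 = 11585 ~= cos(pi/4) * 16384.  A product such as
 * input[0] * cospi_16_64 is therefore a Q14 value, and dct_const_round_shift()
 * rounds it back to integer precision by adding 2^13 and shifting right by 14.
 * Worked example: with input[0] = 100 and input[2] = 0,
 * temp1 = 100 * 11585 = 1158500, and
 * dct_const_round_shift(temp1) = (1158500 + 8192) >> 14 = 71 ~= 100 / sqrt(2).
 */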
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[4], temp_out[4];

  // Rows
  for (i = 0; i < 4; ++i) {
    idct4_c(input, outptr);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    idct4_c(temp_in, temp_out);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
    }
  }
}
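/* All of the NxN inverse DCT adds in this file share the structure above: a
 * 1-D inverse transform over each row into a scratch buffer, the same 1-D
 * transform over each column, then a final rounding right-shift that removes
 * the scaling accumulated by the forward/inverse transform pair before the
 * residual is added to the prediction.  The shift grows with block size:
 * 4 for 4x4 (above), 5 for 8x8, and 6 for 16x16 and 32x32.
 */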
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride) {
  int i;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = clip_pixel_add(dest[0], a1);
    dest[1] = clip_pixel_add(dest[1], a1);
    dest[2] = clip_pixel_add(dest[2], a1);
    dest[3] = clip_pixel_add(dest[3], a1);
    dest += dest_stride;
  }
}
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows.
  for (i = 0; i < 8; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 5);

  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}
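/* The _1_add variants are fast paths for blocks whose only non-zero
 * coefficient is the DC term.  Multiplying by cospi_16_64 twice applies the
 * 1/sqrt(2) DC gain of each 1-D pass (11585/16384 squared is almost exactly
 * 1/2), so after the size-dependent shift every pixel receives the same
 * constant a1 -- roughly input[0] / 64 for the 8x8 case above.
 */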
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = WRAPLOW(x0 - x2 + x3);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
  output[2] = WRAPLOW(dct_const_round_shift(s2));
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
}
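/* iadst4_c above and iadst8_c/iadst16_c below implement the inverse
 * asymmetric discrete sine transform used by VP9's hybrid transform types
 * (ADST_DCT, DCT_ADST, ADST_ADST); the sinpi_N_9 constants are scaled
 * sin(N * pi / 9) values in the same Q14 fixed-point format as the cospi
 * table.  The early return on an all-zero input row is worthwhile because
 * quantization frequently zeroes entire rows of coefficients.
 */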
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));
  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows.
  // Only the first 4 rows have non-zero coefficients.
  for (i = 0; i < 4; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1
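  // The index expressions below are compile-time constants (0/2 == 0,
  // 16/2 == 8, and so on).  They are written this way so the load order
  // visibly mirrors idct32_c, which reads input[0], input[16], input[8], ...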
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];
  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);
  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];
  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = WRAPLOW(step2[0] + step2[15]);
  output[1] = WRAPLOW(step2[1] + step2[14]);
  output[2] = WRAPLOW(step2[2] + step2[13]);
  output[3] = WRAPLOW(step2[3] + step2[12]);
  output[4] = WRAPLOW(step2[4] + step2[11]);
  output[5] = WRAPLOW(step2[5] + step2[10]);
  output[6] = WRAPLOW(step2[6] + step2[9]);
  output[7] = WRAPLOW(step2[7] + step2[8]);
  output[8] = WRAPLOW(step2[7] - step2[8]);
  output[9] = WRAPLOW(step2[6] - step2[9]);
  output[10] = WRAPLOW(step2[5] - step2[10]);
  output[11] = WRAPLOW(step2[4] - step2[11]);
  output[12] = WRAPLOW(step2[3] - step2[12]);
  output[13] = WRAPLOW(step2[2] - step2[13]);
  output[14] = WRAPLOW(step2[1] - step2[14]);
  output[15] = WRAPLOW(step2[0] - step2[15]);
}
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows.
  for (i = 0; i < 16; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns.
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
        | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);
  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero dct coefficients are in the
  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
  for (i = 0; i < 4; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns.
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}
void idct32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];
  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
  step1[31] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];
  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[16] = WRAPLOW(step1[16] + step1[17]);
  step2[17] = WRAPLOW(step1[16] - step1[17]);
  step2[18] = WRAPLOW(-step1[18] + step1[19]);
  step2[19] = WRAPLOW(step1[18] + step1[19]);
  step2[20] = WRAPLOW(step1[20] + step1[21]);
  step2[21] = WRAPLOW(step1[20] - step1[21]);
  step2[22] = WRAPLOW(-step1[22] + step1[23]);
  step2[23] = WRAPLOW(step1[22] + step1[23]);
  step2[24] = WRAPLOW(step1[24] + step1[25]);
  step2[25] = WRAPLOW(step1[24] - step1[25]);
  step2[26] = WRAPLOW(-step1[26] + step1[27]);
  step2[27] = WRAPLOW(step1[26] + step1[27]);
  step2[28] = WRAPLOW(step1[28] + step1[29]);
  step2[29] = WRAPLOW(step1[28] - step1[29]);
  step2[30] = WRAPLOW(-step1[30] + step1[31]);
  step2[31] = WRAPLOW(step1[30] + step1[31]);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);
  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);
  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[16] = WRAPLOW(step1[16] + step1[19]);
  step2[17] = WRAPLOW(step1[17] + step1[18]);
  step2[18] = WRAPLOW(step1[17] - step1[18]);
  step2[19] = WRAPLOW(step1[16] - step1[19]);
  step2[20] = WRAPLOW(-step1[20] + step1[23]);
  step2[21] = WRAPLOW(-step1[21] + step1[22]);
  step2[22] = WRAPLOW(step1[21] + step1[22]);
  step2[23] = WRAPLOW(step1[20] + step1[23]);
  step2[24] = WRAPLOW(step1[24] + step1[27]);
  step2[25] = WRAPLOW(step1[25] + step1[26]);
  step2[26] = WRAPLOW(step1[25] - step1[26]);
  step2[27] = WRAPLOW(step1[24] - step1[27]);
  step2[28] = WRAPLOW(-step1[28] + step1[31]);
  step2[29] = WRAPLOW(-step1[29] + step1[30]);
  step2[30] = WRAPLOW(step1[29] + step1[30]);
  step2[31] = WRAPLOW(step1[28] + step1[31]);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];
  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);
  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];
  step2[16] = WRAPLOW(step1[16] + step1[23]);
  step2[17] = WRAPLOW(step1[17] + step1[22]);
  step2[18] = WRAPLOW(step1[18] + step1[21]);
  step2[19] = WRAPLOW(step1[19] + step1[20]);
  step2[20] = WRAPLOW(step1[19] - step1[20]);
  step2[21] = WRAPLOW(step1[18] - step1[21]);
  step2[22] = WRAPLOW(step1[17] - step1[22]);
  step2[23] = WRAPLOW(step1[16] - step1[23]);
  step2[24] = WRAPLOW(-step1[24] + step1[31]);
  step2[25] = WRAPLOW(-step1[25] + step1[30]);
  step2[26] = WRAPLOW(-step1[26] + step1[29]);
  step2[27] = WRAPLOW(-step1[27] + step1[28]);
  step2[28] = WRAPLOW(step1[27] + step1[28]);
  step2[29] = WRAPLOW(step1[26] + step1[29]);
  step2[30] = WRAPLOW(step1[25] + step1[30]);
  step2[31] = WRAPLOW(step1[24] + step1[31]);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15]);
  step1[1] = WRAPLOW(step2[1] + step2[14]);
  step1[2] = WRAPLOW(step2[2] + step2[13]);
  step1[3] = WRAPLOW(step2[3] + step2[12]);
  step1[4] = WRAPLOW(step2[4] + step2[11]);
  step1[5] = WRAPLOW(step2[5] + step2[10]);
  step1[6] = WRAPLOW(step2[6] + step2[9]);
  step1[7] = WRAPLOW(step2[7] + step2[8]);
  step1[8] = WRAPLOW(step2[7] - step2[8]);
  step1[9] = WRAPLOW(step2[6] - step2[9]);
  step1[10] = WRAPLOW(step2[5] - step2[10]);
  step1[11] = WRAPLOW(step2[4] - step2[11]);
  step1[12] = WRAPLOW(step2[3] - step2[12]);
  step1[13] = WRAPLOW(step2[2] - step2[13]);
  step1[14] = WRAPLOW(step2[1] - step2[14]);
  step1[15] = WRAPLOW(step2[0] - step2[15]);
  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  output[0] = WRAPLOW(step1[0] + step1[31]);
  output[1] = WRAPLOW(step1[1] + step1[30]);
  output[2] = WRAPLOW(step1[2] + step1[29]);
  output[3] = WRAPLOW(step1[3] + step1[28]);
  output[4] = WRAPLOW(step1[4] + step1[27]);
  output[5] = WRAPLOW(step1[5] + step1[26]);
  output[6] = WRAPLOW(step1[6] + step1[25]);
  output[7] = WRAPLOW(step1[7] + step1[24]);
  output[8] = WRAPLOW(step1[8] + step1[23]);
  output[9] = WRAPLOW(step1[9] + step1[22]);
  output[10] = WRAPLOW(step1[10] + step1[21]);
  output[11] = WRAPLOW(step1[11] + step1[20]);
  output[12] = WRAPLOW(step1[12] + step1[19]);
  output[13] = WRAPLOW(step1[13] + step1[18]);
  output[14] = WRAPLOW(step1[14] + step1[17]);
  output[15] = WRAPLOW(step1[15] + step1[16]);
  output[16] = WRAPLOW(step1[15] - step1[16]);
  output[17] = WRAPLOW(step1[14] - step1[17]);
  output[18] = WRAPLOW(step1[13] - step1[18]);
  output[19] = WRAPLOW(step1[12] - step1[19]);
  output[20] = WRAPLOW(step1[11] - step1[20]);
  output[21] = WRAPLOW(step1[10] - step1[21]);
  output[22] = WRAPLOW(step1[9] - step1[22]);
  output[23] = WRAPLOW(step1[8] - step1[23]);
  output[24] = WRAPLOW(step1[7] - step1[24]);
  output[25] = WRAPLOW(step1[6] - step1[25]);
  output[26] = WRAPLOW(step1[5] - step1[26]);
  output[27] = WRAPLOW(step1[4] - step1[27]);
  output[28] = WRAPLOW(step1[3] - step1[28]);
  output[29] = WRAPLOW(step1[2] - step1[29]);
  output[30] = WRAPLOW(step1[1] - step1[30]);
  output[31] = WRAPLOW(step1[0] - step1[31]);
}
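/* The full 32x32 inverse below keeps a per-row fast path: zero_coeff[] holds
 * a binary OR-reduction tree over the 32 coefficients of a row (16, then 8,
 * 4, and 2 partial ORs), so a single test of zero_coeff[0] | zero_coeff[1]
 * decides whether the row is entirely zero.  All-zero rows skip idct32_c and
 * are written with memset instead, a common case after quantization.
 */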
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    int16_t zero_coeff[16];
    for (j = 0; j < 16; ++j)
      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  // Only the upper-left 16x16 block has non-zero coefficients.
  for (i = 0; i < 16; ++i) {
    idct32_c(input, outptr);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  // Only the upper-left 8x8 block has non-zero coefficients.
  for (i = 0; i < 8; ++i) {
    idct32_c(input, outptr);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest,
                           int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}
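
/* DC-only path, worked through: cospi_16_64 is 11585, roughly
   16384 * cos(pi/4), so each dct_const_round_shift(x * cospi_16_64)
   multiplies by about 0.7071. Applied twice, `out` is about input[0] / 2,
   and the final shift by 6 gives a1 of roughly input[0] / 128, added
   uniformly to all 32x32 pixels. (Approximate arithmetic for intuition;
   the code uses exact fixed-point rounding.) */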

#if CONFIG_VP9_HIGHBITDEPTH
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = HIGHBD_WRAPLOW(a1, bd);
    op[1] = HIGHBD_WRAPLOW(b1, bd);
    op[2] = HIGHBD_WRAPLOW(c1, bd);
    op[3] = HIGHBD_WRAPLOW(d1, bd);
    ip += 4;
    op += 4;
  }

  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0],
                                             HIGHBD_WRAPLOW(a1, bd), bd);
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1],
                                             HIGHBD_WRAPLOW(b1, bd), bd);
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2],
                                             HIGHBD_WRAPLOW(c1, bd), bd);
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3],
                                             HIGHBD_WRAPLOW(d1, bd), bd);
    ip++;
    dest++;
  }
}
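
/* The iWHT is the transform VP9 uses in lossless mode: the lifting
   sequence above (two adds, a shared (a1 - d1) >> 1, then paired
   subtractions) is exactly invertible in integer arithmetic, which is
   what "4-point reversible" means. Counting the 1-D pass: 7 additions
   and 1 shift per 4 samples; over the two passes that is 14 adds and
   2 shifts per 4 pixels, matching the advertised 3.5 adds and 0.5
   shifts per pixel. */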

void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                                int dest_stride, int bd) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  (void) bd;

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = HIGHBD_WRAPLOW(a1, bd);
  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = highbd_clip_pixel_add(dest[dest_stride * 0],
                                                  a1, bd);
    dest[dest_stride * 1] = highbd_clip_pixel_add(dest[dest_stride * 1],
                                                  e1, bd);
    dest[dest_stride * 2] = highbd_clip_pixel_add(dest[dest_stride * 2],
                                                  e1, bd);
    dest[dest_stride * 3] = highbd_clip_pixel_add(dest[dest_stride * 3],
                                                  e1, bd);
    ip++;
    dest++;
  }
}

void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);

  // stage 2
  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
}
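
/* Each temp1/temp2 pair above is a planar rotation in fixed point: with
   c and s being cos/sin of the same angle scaled by 2^14 (the cospi_*_64
   constants), (temp1, temp2) = (x*c - y*s, x*s + y*c), and
   highbd_dct_const_round_shift() divides the 2^14 scaling back out with
   rounding. HIGHBD_WRAPLOW then wraps the result into the range allowed
   by the bit depth `bd`, mirroring WRAPLOW in the 8-bit path. */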

void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[4], temp_out[4];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct4_c(input, outptr, bd);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    vpx_highbd_idct4_c(temp_in, temp_out, bd);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
    }
  }
}

void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int dest_stride, int bd) {
  int i;
  tran_high_t a1;
  tran_low_t out = HIGHBD_WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
    dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
    dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
    dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
    dest += dest_stride;
  }
}

void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);

  // stage 2 & stage 3 - even half
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  // stage 4
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
}
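
/* This is the classic even/odd decomposition: the even half (a 4-point
   IDCT over inputs 0, 2, 4, 6, pre-reordered into step1[0..3]) is
   delegated to vpx_highbd_idct4_c, while the odd half is computed from
   inputs 1, 3, 5, 7 in stages 1-3; stage 4 recombines the halves with a
   butterfly. The 16- and 32-point transforms below follow the same
   recursive structure, just written out inline. */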

void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows.
  for (i = 0; i < 8; ++i) {
    vpx_highbd_idct8_c(input, outptr, bd);
    input += 8;
    outptr += 8;
  }

  // Then transform columns.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    vpx_highbd_idct8_c(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    }
  }
}

void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int stride, int bd) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = HIGHBD_WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, 5);

  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}

void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  (void) bd;

  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
  output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
  output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
  output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
}
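
/* Sanity check on the dynamic-range comment above, assuming the usual
   2^14-scaled sinpi_*_9 constants (all below 16384): a 14-bit input times
   a 14-bit constant needs at most 28 bits, the s0 + s3 style additions add
   roughly one more bit, and highbd_dct_const_round_shift removes 14 of
   them, which is how the output lands near 15 bits. */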

void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  (void) bd;

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);
  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);

  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x4, bd);
  output[2] = HIGHBD_WRAPLOW(x6, bd);
  output[3] = HIGHBD_WRAPLOW(-x2, bd);
  output[4] = HIGHBD_WRAPLOW(x3, bd);
  output[5] = HIGHBD_WRAPLOW(-x7, bd);
  output[6] = HIGHBD_WRAPLOW(x5, bd);
  output[7] = HIGHBD_WRAPLOW(-x1, bd);
}

void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows.
  // Only the first 4 rows have non-zero coefficients.
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct8_c(input, outptr, bd);
    input += 8;
    outptr += 8;
  }

  // Then transform columns.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    vpx_highbd_idct8_c(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    }
  }
}

void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];
  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];
  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);

  // stage 6
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
}
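
/* The input[k/2] indexing in stage 1 is unusual but presumably deliberate:
   it keeps the load table textually parallel to the 32-point transform
   below (where the same slots read input[k]), so the bit-reversal style
   reordering can be compared across transform sizes at a glance. The
   compiler folds the constant divisions away. */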

void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows.
  for (i = 0; i < 16; ++i) {
    vpx_highbd_idct16_c(input, outptr, bd);
    input += 16;
    outptr += 16;
  }

  // Then transform columns.
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  (void) bd;

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 |
        x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
  x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
  x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
  x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
  x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
  x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
  x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
  x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
  x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);

  // stage 4
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);
  x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
  x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
  x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
  x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
  x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
  x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);

  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x8, bd);
  output[2] = HIGHBD_WRAPLOW(x12, bd);
  output[3] = HIGHBD_WRAPLOW(-x4, bd);
  output[4] = HIGHBD_WRAPLOW(x6, bd);
  output[5] = HIGHBD_WRAPLOW(x14, bd);
  output[6] = HIGHBD_WRAPLOW(x10, bd);
  output[7] = HIGHBD_WRAPLOW(x2, bd);
  output[8] = HIGHBD_WRAPLOW(x3, bd);
  output[9] = HIGHBD_WRAPLOW(x11, bd);
  output[10] = HIGHBD_WRAPLOW(x15, bd);
  output[11] = HIGHBD_WRAPLOW(x7, bd);
  output[12] = HIGHBD_WRAPLOW(x5, bd);
  output[13] = HIGHBD_WRAPLOW(-x13, bd);
  output[14] = HIGHBD_WRAPLOW(x9, bd);
  output[15] = HIGHBD_WRAPLOW(-x1, bd);
}

void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows. Since all non-zero dct coefficients are in the
  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct16_c(input, outptr, bd);
    input += 16;
    outptr += 16;
  }

  // Then transform columns.
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = HIGHBD_WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}

static void highbd_idct32_c(const tran_low_t *input,
                            tran_low_t *output, int bd) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];
  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];
  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
  step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
  step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
  step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
  step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
  step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
  step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
  step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
  step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
  step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
  step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
  step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
  step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
  step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
  step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
  step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
  step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
  step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
  step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
  step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
  step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
  step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
  step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
  step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
  step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
  step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
  step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);

  // stage 5
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];
  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];
  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
  step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
  step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
  step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
  step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
  step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
  step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
  step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
  step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
  step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
  step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
  step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
  step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
  step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);

  // stage 7
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
  step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
  step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
  step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
  step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
  step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
  step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
  output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
  output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
  output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
  output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
  output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
  output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
  output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
  output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
  output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
  output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
  output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
  output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
  output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
  output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
  output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
  output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
  output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
  output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
  output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
  output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
  output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
  output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
  output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
  output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
  output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
  output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
  output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
  output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
}
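
/* Unlike vpx_highbd_idct16_c, this 32-point helper is static: it is only
   reached through the vpx_highbd_idct32x32_* wrappers below, so it keeps
   file-local linkage. Its stage structure matches the 8-bit idct32_c
   one-for-one, with WRAPLOW and clip_pixel_add swapped for their
   bit-depth-aware counterparts. */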

void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
                                     int stride, int bd) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
  for (i = 0; i < 32; ++i) {
    tran_low_t zero_coeff[16];
    for (j = 0; j < 16; ++j)
      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      highbd_idct32_c(input, outptr, bd);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}
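
/* Note that zero_coeff is tran_low_t here: in the high-bit-depth build
   tran_low_t is wider than 16 bits, so ORing full-width coefficients is
   required for a correct all-zero test. The 8-bit path earlier in the
   file was adjusted to match for the same reason. */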

void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
  // Only the upper-left 8x8 block has non-zero coefficients.
  for (i = 0; i < 8; ++i) {
    highbd_idct32_c(input, outptr, bd);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  int i, j;
  int a1;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  tran_low_t out = HIGHBD_WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);

  out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}

#endif  // CONFIG_VP9_HIGHBITDEPTH