/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#define RECON_AND_STORE4X4(dest, in_x)                      \
  {                                                         \
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));   \
    d0 = _mm_unpacklo_epi8(d0, zero);                       \
    d0 = _mm_add_epi16(in_x, d0);                           \
    d0 = _mm_packus_epi16(d0, d0);                          \
    *(int *)(dest) = _mm_cvtsi128_si32(d0);                 \
  }
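
// Full 4x4 inverse DCT and add: a row pass and a column pass, each built on
// _mm_madd_epi16 against the interleaved cosine constants in 'cst', followed
// by rounding, a transpose, a final round/shift by 4, and reconstruction into
// 'dest'.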
void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16(
      (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
      (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = load_input_data(input);
  input2 = load_input_data(input + 8);

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column 2 and column 3, and then we get:
  // input2: column 1, column 0; input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column 2 and column 3, and then we get:
  // input2: column 1, column 0; input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // store input0
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store input1
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store input2
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // store input3
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}
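
// DC-only 4x4 case: input[0] is the only nonzero coefficient, so the 2-D
// transform reduces to two scalar multiplications by cospi_16_64, a rounding
// shift by 4, and adding the resulting constant to every destination pixel.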
void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
  RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
  RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
  RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
}
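
// Transpose a 4x4 block of 16-bit values held two rows per register:
// res[0] = rows 0 and 1, res[1] = rows 2 and 3. On return the registers hold
// the transposed block in the same two-rows-per-register layout.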
static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}
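
// One pass (transpose + 1-D 4-point IDCT) over the 4x4 block held in
// in[0..1]; calling it twice yields the full 2-D inverse transform.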
void idct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[3], v[2]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}
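
// One pass (transpose + 1-D 4-point inverse ADST) over the same
// two-register 4x4 layout as idct4_sse2.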
void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}
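
// Transpose an 8x8 block of 16-bit values (one row per register) using
// 16-, 32-, and 64-bit unpack steps.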
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, \
                         out0, out1, out2, out3) \
  { \
    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
  }

#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
  }

// Multiply elements by constants and add them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    tmp4 = _mm_madd_epi16(lo_1, cst2); \
    tmp5 = _mm_madd_epi16(hi_1, cst2); \
    tmp6 = _mm_madd_epi16(lo_1, cst3); \
    tmp7 = _mm_madd_epi16(hi_1, cst3); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    tmp4 = _mm_add_epi32(tmp4, rounding); \
    tmp5 = _mm_add_epi32(tmp5, rounding); \
    tmp6 = _mm_add_epi32(tmp6, rounding); \
    tmp7 = _mm_add_epi32(tmp7, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }

#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
  }
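
// One 4-stage 1-D 8-point IDCT butterfly over eight row registers. The
// stg1_*/stg2_* constants, the 'rounding' register, and the stp1_*/stp2_*/tmp*
// temporaries must be declared by the caller.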
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, \
              out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    /* Stage1 */ \
    { \
      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
      \
      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                             stg1_1, stg1_2, stg1_3, stp1_4, \
                             stp1_7, stp1_5, stp1_6) \
    } \
    \
    /* Stage2 */ \
    { \
      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
      \
      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                             stg2_1, stg2_2, stg2_3, stp2_0, \
                             stp2_1, stp2_2, stp2_3) \
      \
      stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
      stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
      stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
      stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
    } \
    \
    /* Stage3 */ \
    { \
      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
      \
      stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
      stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
      stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
      stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
      \
      tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
      tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
      tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
      tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
      \
      tmp0 = _mm_add_epi32(tmp0, rounding); \
      tmp1 = _mm_add_epi32(tmp1, rounding); \
      tmp2 = _mm_add_epi32(tmp2, rounding); \
      tmp3 = _mm_add_epi32(tmp3, rounding); \
      \
      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
      \
      stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
      stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    } \
    \
    /* Stage4 */ \
    out0 = _mm_adds_epi16(stp1_0, stp2_7); \
    out1 = _mm_adds_epi16(stp1_1, stp1_6); \
    out2 = _mm_adds_epi16(stp1_2, stp1_5); \
    out3 = _mm_adds_epi16(stp1_3, stp2_4); \
    out4 = _mm_subs_epi16(stp1_3, stp2_4); \
    out5 = _mm_subs_epi16(stp1_2, stp1_5); \
    out6 = _mm_subs_epi16(stp1_1, stp1_6); \
    out7 = _mm_subs_epi16(stp1_0, stp2_7); \
  }
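
// Full 8x8 inverse DCT and add: two passes (rows, then columns) of an 8x8
// transpose followed by the 1-D IDCT8 butterfly, then a final rounding shift
// by 5 and reconstruction into 'dest'.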
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);
  in4 = load_input_data(input + 8 * 4);
  in5 = load_input_data(input + 8 * 5);
  in6 = load_input_data(input + 8 * 6);
  in7 = load_input_data(input + 8 * 7);

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7,
                  in0, in1, in2, in3, in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
          in0, in1, in2, in3, in4, in5, in6, in7);
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}
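
// DC-only 8x8 case: compute the constant output value from input[0] and add
// it to all 64 destination pixels.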
void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest + 0 * stride, dc_value);
  RECON_AND_STORE(dest + 1 * stride, dc_value);
  RECON_AND_STORE(dest + 2 * stride, dc_value);
  RECON_AND_STORE(dest + 3 * stride, dc_value);
  RECON_AND_STORE(dest + 4 * stride, dc_value);
  RECON_AND_STORE(dest + 5 * stride, dc_value);
  RECON_AND_STORE(dest + 6 * stride, dc_value);
  RECON_AND_STORE(dest + 7 * stride, dc_value);
}
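
// Single pass (8x8 transpose + 1-D 8-point IDCT) over in[0..7]; two calls
// give the full 2-D transform.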
void idct8_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
                in0, in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
        in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
}
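
// 8-point inverse ADST over in[0..7]: transpose, reorder the rows into
// butterfly order, run three stages of multiply/add against the paired cosine
// constants, and write back with the odd-index outputs negated.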
void iadst8_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}
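
// 8x8 inverse DCT for blocks with at most 12 nonzero coefficients, all in the
// top-left 4x4 corner: the row pass works on four rows only, and the column
// pass runs a full IDCT8 with the remaining inputs treated as zero.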
void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);

  // 8x4 Transpose
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);

  // Stage1
  {
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, tmp2);
    stp1_5 = _mm_packs_epi32(tmp4, tmp6);
  }

  // Stage2
  {
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, tmp2);
    stp2_2 = _mm_packs_epi32(tmp6, tmp4);

    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);

    stp2_4 = tmp0;
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  {
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);

    stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
    stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, tmp2);
  }

  // Stage4
  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);

  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
        in0, in1, in2, in3, in4, in5, in6, in7);

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}
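
// Stages 2-6 of the 1-D 16-point IDCT over in[0..15]. The stg*_* constants,
// the 'rounding' register, and the stp1_*/stp2_*/tmp* temporaries must be
// declared by the caller, which also performs the remaining final stage.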
#define IDCT16 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }
  957. #define IDCT16_10 \
  958. /* Stage2 */ \
  959. { \
  960. const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
  961. const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
  962. const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
  963. const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
  964. \
  965. MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, \
  966. stg2_0, stg2_1, stg2_6, stg2_7, \
  967. stp1_8_0, stp1_15, stp1_11, stp1_12_0) \
  968. } \
  969. \
  970. /* Stage3 */ \
  971. { \
  972. const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
  973. const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
  974. \
  975. MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, \
  976. stg3_0, stg3_1, \
  977. stp2_4, stp2_7) \
  978. \
  979. stp1_9 = stp1_8_0; \
  980. stp1_10 = stp1_11; \
  981. \
  982. stp1_13 = stp1_12_0; \
  983. stp1_14 = stp1_15; \
  984. } \
  985. \
  986. /* Stage4 */ \
  987. { \
  988. const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
  989. const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
  990. \
  991. const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
  992. const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
  993. const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  994. const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  995. \
  996. MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, \
  997. stg4_0, stg4_1, \
  998. stp1_0, stp1_1) \
  999. stp2_5 = stp2_4; \
  1000. stp2_6 = stp2_7; \
  1001. \
  1002. MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
  1003. stg4_4, stg4_5, stg4_6, stg4_7, \
  1004. stp2_9, stp2_14, stp2_10, stp2_13) \
  1005. } \
  1006. \
  1007. /* Stage5 */ \
  1008. { \
  1009. const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
  1010. const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
  1011. \
  1012. stp1_2 = stp1_1; \
  1013. stp1_3 = stp1_0; \
  1014. \
  1015. tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
  1016. tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
  1017. tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
  1018. tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
  1019. \
  1020. tmp0 = _mm_add_epi32(tmp0, rounding); \
  1021. tmp1 = _mm_add_epi32(tmp1, rounding); \
  1022. tmp2 = _mm_add_epi32(tmp2, rounding); \
  1023. tmp3 = _mm_add_epi32(tmp3, rounding); \
  1024. \
  1025. tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
  1026. tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
  1027. tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
  1028. tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  1029. \
  1030. stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
  1031. stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  1032. \
  1033. stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
  1034. stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
  1035. stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
  1036. stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
  1037. \
  1038. stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
  1039. stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
  1040. stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
  1041. stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  1042. } \
  1043. \
  1044. /* Stage6 */ \
  1045. { \
  1046. const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  1047. const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  1048. const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
  1049. const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
  1050. \
  1051. stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
  1052. stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
  1053. stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
  1054. stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
  1055. stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
  1056. stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
  1057. stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
  1058. stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
  1059. \
  1060. MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
  1061. stg6_0, stg4_0, stg6_0, stg4_0, \
  1062. stp2_10, stp2_13, stp2_11, stp2_12) \
  1063. }
  1064. void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
  1065. int stride) {
  1066. const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  1067. const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  1068. const __m128i zero = _mm_setzero_si128();
  1069. const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  1070. const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  1071. const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  1072. const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  1073. const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  1074. const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  1075. const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  1076. const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
  1077. const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  1078. const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  1079. const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  1080. const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  1081. const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  1082. const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  1083. const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  1084. const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  1085. const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  1086. const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  1087. const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  1088. const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  1089. const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  1090. __m128i in[16], l[16], r[16], *curr1;
  1091. __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
  1092. stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
  1093. stp1_8_0, stp1_12_0;
  1094. __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
  1095. stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  1096. __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  1097. int i;
  1098. curr1 = l;
  1099. for (i = 0; i < 2; i++) {
  1100. // 1-D idct
  1101. // Load input data.
  1102. in[0] = load_input_data(input);
  1103. in[8] = load_input_data(input + 8 * 1);
  1104. in[1] = load_input_data(input + 8 * 2);
  1105. in[9] = load_input_data(input + 8 * 3);
  1106. in[2] = load_input_data(input + 8 * 4);
  1107. in[10] = load_input_data(input + 8 * 5);
  1108. in[3] = load_input_data(input + 8 * 6);
  1109. in[11] = load_input_data(input + 8 * 7);
  1110. in[4] = load_input_data(input + 8 * 8);
  1111. in[12] = load_input_data(input + 8 * 9);
  1112. in[5] = load_input_data(input + 8 * 10);
  1113. in[13] = load_input_data(input + 8 * 11);
  1114. in[6] = load_input_data(input + 8 * 12);
  1115. in[14] = load_input_data(input + 8 * 13);
  1116. in[7] = load_input_data(input + 8 * 14);
  1117. in[15] = load_input_data(input + 8 * 15);
  1118. array_transpose_8x8(in, in);
  1119. array_transpose_8x8(in + 8, in + 8);
  1120. IDCT16
  1121. // Stage7
  1122. curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
  1123. curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
  1124. curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
  1125. curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
  1126. curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
  1127. curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
  1128. curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
  1129. curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
  1130. curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
  1131. curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
  1132. curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
  1133. curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
  1134. curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
  1135. curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
  1136. curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
  1137. curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
  1138. curr1 = r;
  1139. input += 128;
  1140. }
  1141. for (i = 0; i < 2; i++) {
  1142. int j;
  1143. // 1-D idct
  1144. array_transpose_8x8(l + i * 8, in);
  1145. array_transpose_8x8(r + i * 8, in + 8);
  1146. IDCT16
  1147. // 2-D
  1148. in[0] = _mm_add_epi16(stp2_0, stp1_15);
  1149. in[1] = _mm_add_epi16(stp2_1, stp1_14);
  1150. in[2] = _mm_add_epi16(stp2_2, stp2_13);
  1151. in[3] = _mm_add_epi16(stp2_3, stp2_12);
  1152. in[4] = _mm_add_epi16(stp2_4, stp2_11);
  1153. in[5] = _mm_add_epi16(stp2_5, stp2_10);
  1154. in[6] = _mm_add_epi16(stp2_6, stp1_9);
  1155. in[7] = _mm_add_epi16(stp2_7, stp1_8);
  1156. in[8] = _mm_sub_epi16(stp2_7, stp1_8);
  1157. in[9] = _mm_sub_epi16(stp2_6, stp1_9);
  1158. in[10] = _mm_sub_epi16(stp2_5, stp2_10);
  1159. in[11] = _mm_sub_epi16(stp2_4, stp2_11);
  1160. in[12] = _mm_sub_epi16(stp2_3, stp2_12);
  1161. in[13] = _mm_sub_epi16(stp2_2, stp2_13);
  1162. in[14] = _mm_sub_epi16(stp2_1, stp1_14);
  1163. in[15] = _mm_sub_epi16(stp2_0, stp1_15);
  1164. for (j = 0; j < 16; ++j) {
  1165. // Final rounding and shift
  1166. in[j] = _mm_adds_epi16(in[j], final_rounding);
  1167. in[j] = _mm_srai_epi16(in[j], 6);
  1168. RECON_AND_STORE(dest + j * stride, in[j]);
  1169. }
  1170. dest += 8;
  1171. }
  1172. }
  1173. void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
  1174. int stride) {
  1175. __m128i dc_value;
  1176. const __m128i zero = _mm_setzero_si128();
  1177. int a, i;
  1178. a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  1179. a = (int)dct_const_round_shift(a * cospi_16_64);
  1180. a = ROUND_POWER_OF_TWO(a, 6);
  1181. dc_value = _mm_set1_epi16(a);
  1182. for (i = 0; i < 16; ++i) {
  1183. RECON_AND_STORE(dest + 0, dc_value);
  1184. RECON_AND_STORE(dest + 8, dc_value);
  1185. dest += stride;
  1186. }
  1187. }
  1188. static void iadst16_8col(__m128i *in) {
  1189. // perform 16x16 1-D ADST for 8 columns
  1190. __m128i s[16], x[16], u[32], v[32];
  1191. const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  1192. const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  1193. const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  1194. const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  1195. const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  1196. const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  1197. const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  1198. const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  1199. const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  1200. const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  1201. const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  1202. const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  1203. const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  1204. const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  1205. const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  1206. const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  1207. const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  1208. const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  1209. const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  1210. const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  1211. const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  1212. const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  1213. const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  1214. const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  1215. const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  1216. const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
  1217. const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  1218. const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  1219. const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  1220. const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  1221. const __m128i kZero = _mm_set1_epi16(0);
  1222. u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  1223. u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  1224. u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  1225. u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  1226. u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  1227. u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  1228. u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  1229. u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  1230. u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  1231. u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  1232. u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  1233. u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  1234. u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  1235. u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  1236. u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  1237. u[15] = _mm_unpackhi_epi16(in[1], in[14]);
  1238. v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  1239. v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  1240. v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  1241. v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  1242. v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  1243. v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  1244. v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  1245. v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  1246. v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  1247. v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  1248. v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  1249. v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  1250. v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  1251. v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  1252. v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  1253. v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  1254. v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  1255. v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  1256. v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  1257. v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  1258. v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  1259. v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  1260. v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  1261. v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  1262. v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  1263. v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  1264. v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  1265. v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  1266. v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  1267. v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  1268. v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  1269. v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
  1270. u[0] = _mm_add_epi32(v[0], v[16]);
  1271. u[1] = _mm_add_epi32(v[1], v[17]);
  1272. u[2] = _mm_add_epi32(v[2], v[18]);
  1273. u[3] = _mm_add_epi32(v[3], v[19]);
  1274. u[4] = _mm_add_epi32(v[4], v[20]);
  1275. u[5] = _mm_add_epi32(v[5], v[21]);
  1276. u[6] = _mm_add_epi32(v[6], v[22]);
  1277. u[7] = _mm_add_epi32(v[7], v[23]);
  1278. u[8] = _mm_add_epi32(v[8], v[24]);
  1279. u[9] = _mm_add_epi32(v[9], v[25]);
  1280. u[10] = _mm_add_epi32(v[10], v[26]);
  1281. u[11] = _mm_add_epi32(v[11], v[27]);
  1282. u[12] = _mm_add_epi32(v[12], v[28]);
  1283. u[13] = _mm_add_epi32(v[13], v[29]);
  1284. u[14] = _mm_add_epi32(v[14], v[30]);
  1285. u[15] = _mm_add_epi32(v[15], v[31]);
  1286. u[16] = _mm_sub_epi32(v[0], v[16]);
  1287. u[17] = _mm_sub_epi32(v[1], v[17]);
  1288. u[18] = _mm_sub_epi32(v[2], v[18]);
  1289. u[19] = _mm_sub_epi32(v[3], v[19]);
  1290. u[20] = _mm_sub_epi32(v[4], v[20]);
  1291. u[21] = _mm_sub_epi32(v[5], v[21]);
  1292. u[22] = _mm_sub_epi32(v[6], v[22]);
  1293. u[23] = _mm_sub_epi32(v[7], v[23]);
  1294. u[24] = _mm_sub_epi32(v[8], v[24]);
  1295. u[25] = _mm_sub_epi32(v[9], v[25]);
  1296. u[26] = _mm_sub_epi32(v[10], v[26]);
  1297. u[27] = _mm_sub_epi32(v[11], v[27]);
  1298. u[28] = _mm_sub_epi32(v[12], v[28]);
  1299. u[29] = _mm_sub_epi32(v[13], v[29]);
  1300. u[30] = _mm_sub_epi32(v[14], v[30]);
  1301. u[31] = _mm_sub_epi32(v[15], v[31]);
  1302. v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  1303. v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  1304. v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  1305. v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  1306. v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  1307. v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  1308. v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  1309. v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  1310. v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  1311. v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  1312. v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  1313. v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  1314. v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  1315. v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  1316. v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  1317. v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  1318. v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  1319. v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  1320. v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  1321. v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  1322. v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  1323. v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  1324. v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  1325. v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  1326. v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  1327. v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  1328. v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  1329. v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  1330. v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  1331. v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  1332. v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  1333. v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
  1334. u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  1335. u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  1336. u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  1337. u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  1338. u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  1339. u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  1340. u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  1341. u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  1342. u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  1343. u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  1344. u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  1345. u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  1346. u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  1347. u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  1348. u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  1349. u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  1350. u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  1351. u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  1352. u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  1353. u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  1354. u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  1355. u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  1356. u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  1357. u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  1358. u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  1359. u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  1360. u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  1361. u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  1362. u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  1363. u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  1364. u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  1365. u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
  1366. s[0] = _mm_packs_epi32(u[0], u[1]);
  1367. s[1] = _mm_packs_epi32(u[2], u[3]);
  1368. s[2] = _mm_packs_epi32(u[4], u[5]);
  1369. s[3] = _mm_packs_epi32(u[6], u[7]);
  1370. s[4] = _mm_packs_epi32(u[8], u[9]);
  1371. s[5] = _mm_packs_epi32(u[10], u[11]);
  1372. s[6] = _mm_packs_epi32(u[12], u[13]);
  1373. s[7] = _mm_packs_epi32(u[14], u[15]);
  1374. s[8] = _mm_packs_epi32(u[16], u[17]);
  1375. s[9] = _mm_packs_epi32(u[18], u[19]);
  1376. s[10] = _mm_packs_epi32(u[20], u[21]);
  1377. s[11] = _mm_packs_epi32(u[22], u[23]);
  1378. s[12] = _mm_packs_epi32(u[24], u[25]);
  1379. s[13] = _mm_packs_epi32(u[26], u[27]);
  1380. s[14] = _mm_packs_epi32(u[28], u[29]);
  1381. s[15] = _mm_packs_epi32(u[30], u[31]);
  1382. // stage 2
  1383. u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  1384. u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  1385. u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  1386. u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  1387. u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  1388. u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  1389. u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  1390. u[7] = _mm_unpackhi_epi16(s[14], s[15]);
  1391. v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  1392. v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  1393. v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  1394. v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  1395. v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  1396. v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  1397. v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  1398. v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  1399. v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  1400. v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  1401. v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  1402. v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  1403. v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  1404. v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  1405. v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  1406. v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
  1407. u[0] = _mm_add_epi32(v[0], v[8]);
  1408. u[1] = _mm_add_epi32(v[1], v[9]);
  1409. u[2] = _mm_add_epi32(v[2], v[10]);
  1410. u[3] = _mm_add_epi32(v[3], v[11]);
  1411. u[4] = _mm_add_epi32(v[4], v[12]);
  1412. u[5] = _mm_add_epi32(v[5], v[13]);
  1413. u[6] = _mm_add_epi32(v[6], v[14]);
  1414. u[7] = _mm_add_epi32(v[7], v[15]);
  1415. u[8] = _mm_sub_epi32(v[0], v[8]);
  1416. u[9] = _mm_sub_epi32(v[1], v[9]);
  1417. u[10] = _mm_sub_epi32(v[2], v[10]);
  1418. u[11] = _mm_sub_epi32(v[3], v[11]);
  1419. u[12] = _mm_sub_epi32(v[4], v[12]);
  1420. u[13] = _mm_sub_epi32(v[5], v[13]);
  1421. u[14] = _mm_sub_epi32(v[6], v[14]);
  1422. u[15] = _mm_sub_epi32(v[7], v[15]);
  1423. v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  1424. v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  1425. v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  1426. v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  1427. v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  1428. v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  1429. v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  1430. v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  1431. v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  1432. v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  1433. v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  1434. v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  1435. v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  1436. v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  1437. v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  1438. v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  1439. u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  1440. u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  1441. u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  1442. u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  1443. u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  1444. u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  1445. u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  1446. u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  1447. u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  1448. u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  1449. u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  1450. u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  1451. u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  1452. u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  1453. u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  1454. u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  1455. x[0] = _mm_add_epi16(s[0], s[4]);
  1456. x[1] = _mm_add_epi16(s[1], s[5]);
  1457. x[2] = _mm_add_epi16(s[2], s[6]);
  1458. x[3] = _mm_add_epi16(s[3], s[7]);
  1459. x[4] = _mm_sub_epi16(s[0], s[4]);
  1460. x[5] = _mm_sub_epi16(s[1], s[5]);
  1461. x[6] = _mm_sub_epi16(s[2], s[6]);
  1462. x[7] = _mm_sub_epi16(s[3], s[7]);
  1463. x[8] = _mm_packs_epi32(u[0], u[1]);
  1464. x[9] = _mm_packs_epi32(u[2], u[3]);
  1465. x[10] = _mm_packs_epi32(u[4], u[5]);
  1466. x[11] = _mm_packs_epi32(u[6], u[7]);
  1467. x[12] = _mm_packs_epi32(u[8], u[9]);
  1468. x[13] = _mm_packs_epi32(u[10], u[11]);
  1469. x[14] = _mm_packs_epi32(u[12], u[13]);
  1470. x[15] = _mm_packs_epi32(u[14], u[15]);
  1471. // stage 3
  1472. u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  1473. u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  1474. u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  1475. u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  1476. u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  1477. u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  1478. u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  1479. u[7] = _mm_unpackhi_epi16(x[14], x[15]);
  1480. v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  1481. v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  1482. v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  1483. v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  1484. v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  1485. v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  1486. v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  1487. v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  1488. v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  1489. v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  1490. v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  1491. v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  1492. v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  1493. v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  1494. v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  1495. v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
  1496. u[0] = _mm_add_epi32(v[0], v[4]);
  1497. u[1] = _mm_add_epi32(v[1], v[5]);
  1498. u[2] = _mm_add_epi32(v[2], v[6]);
  1499. u[3] = _mm_add_epi32(v[3], v[7]);
  1500. u[4] = _mm_sub_epi32(v[0], v[4]);
  1501. u[5] = _mm_sub_epi32(v[1], v[5]);
  1502. u[6] = _mm_sub_epi32(v[2], v[6]);
  1503. u[7] = _mm_sub_epi32(v[3], v[7]);
  1504. u[8] = _mm_add_epi32(v[8], v[12]);
  1505. u[9] = _mm_add_epi32(v[9], v[13]);
  1506. u[10] = _mm_add_epi32(v[10], v[14]);
  1507. u[11] = _mm_add_epi32(v[11], v[15]);
  1508. u[12] = _mm_sub_epi32(v[8], v[12]);
  1509. u[13] = _mm_sub_epi32(v[9], v[13]);
  1510. u[14] = _mm_sub_epi32(v[10], v[14]);
  1511. u[15] = _mm_sub_epi32(v[11], v[15]);
  1512. u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  1513. u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  1514. u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  1515. u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  1516. u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  1517. u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  1518. u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  1519. u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  1520. u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  1521. u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  1522. u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  1523. u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  1524. u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  1525. u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  1526. u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  1527. u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  1528. v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  1529. v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  1530. v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  1531. v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  1532. v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  1533. v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  1534. v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  1535. v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  1536. v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  1537. v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  1538. v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  1539. v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  1540. v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  1541. v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  1542. v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  1543. v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
  1544. s[0] = _mm_add_epi16(x[0], x[2]);
  1545. s[1] = _mm_add_epi16(x[1], x[3]);
  1546. s[2] = _mm_sub_epi16(x[0], x[2]);
  1547. s[3] = _mm_sub_epi16(x[1], x[3]);
  1548. s[4] = _mm_packs_epi32(v[0], v[1]);
  1549. s[5] = _mm_packs_epi32(v[2], v[3]);
  1550. s[6] = _mm_packs_epi32(v[4], v[5]);
  1551. s[7] = _mm_packs_epi32(v[6], v[7]);
  1552. s[8] = _mm_add_epi16(x[8], x[10]);
  1553. s[9] = _mm_add_epi16(x[9], x[11]);
  1554. s[10] = _mm_sub_epi16(x[8], x[10]);
  1555. s[11] = _mm_sub_epi16(x[9], x[11]);
  1556. s[12] = _mm_packs_epi32(v[8], v[9]);
  1557. s[13] = _mm_packs_epi32(v[10], v[11]);
  1558. s[14] = _mm_packs_epi32(v[12], v[13]);
  1559. s[15] = _mm_packs_epi32(v[14], v[15]);
  1560. // stage 4
  1561. u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  1562. u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  1563. u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  1564. u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  1565. u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  1566. u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  1567. u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  1568. u[7] = _mm_unpackhi_epi16(s[14], s[15]);
  1569. v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  1570. v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  1571. v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  1572. v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  1573. v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  1574. v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  1575. v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  1576. v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  1577. v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  1578. v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  1579. v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  1580. v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  1581. v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  1582. v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  1583. v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  1584. v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
  1585. u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  1586. u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  1587. u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  1588. u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  1589. u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  1590. u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  1591. u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  1592. u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  1593. u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  1594. u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  1595. u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  1596. u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  1597. u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  1598. u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  1599. u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  1600. u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
  1601. v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  1602. v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  1603. v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  1604. v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  1605. v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  1606. v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  1607. v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  1608. v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  1609. v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  1610. v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  1611. v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  1612. v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  1613. v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  1614. v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  1615. v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  1616. v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
  1617. in[0] = s[0];
  1618. in[1] = _mm_sub_epi16(kZero, s[8]);
  1619. in[2] = s[12];
  1620. in[3] = _mm_sub_epi16(kZero, s[4]);
  1621. in[4] = _mm_packs_epi32(v[4], v[5]);
  1622. in[5] = _mm_packs_epi32(v[12], v[13]);
  1623. in[6] = _mm_packs_epi32(v[8], v[9]);
  1624. in[7] = _mm_packs_epi32(v[0], v[1]);
  1625. in[8] = _mm_packs_epi32(v[2], v[3]);
  1626. in[9] = _mm_packs_epi32(v[10], v[11]);
  1627. in[10] = _mm_packs_epi32(v[14], v[15]);
  1628. in[11] = _mm_packs_epi32(v[6], v[7]);
  1629. in[12] = s[5];
  1630. in[13] = _mm_sub_epi16(kZero, s[13]);
  1631. in[14] = s[9];
  1632. in[15] = _mm_sub_epi16(kZero, s[1]);
  1633. }
  1634. static void idct16_8col(__m128i *in) {
  1635. const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  1636. const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  1637. const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  1638. const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  1639. const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  1640. const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  1641. const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  1642. const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  1643. const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  1644. const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  1645. const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  1646. const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  1647. const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  1648. const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  1649. const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  1650. const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  1651. const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  1652. const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  1653. const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  1654. const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  1655. const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  1656. __m128i v[16], u[16], s[16], t[16];
  1657. // stage 1
  1658. s[0] = in[0];
  1659. s[1] = in[8];
  1660. s[2] = in[4];
  1661. s[3] = in[12];
  1662. s[4] = in[2];
  1663. s[5] = in[10];
  1664. s[6] = in[6];
  1665. s[7] = in[14];
  1666. s[8] = in[1];
  1667. s[9] = in[9];
  1668. s[10] = in[5];
  1669. s[11] = in[13];
  1670. s[12] = in[3];
  1671. s[13] = in[11];
  1672. s[14] = in[7];
  1673. s[15] = in[15];
  1674. // stage 2
  1675. u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  1676. u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  1677. u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  1678. u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  1679. u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  1680. u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  1681. u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  1682. u[7] = _mm_unpackhi_epi16(s[11], s[12]);
  1683. v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  1684. v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  1685. v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  1686. v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  1687. v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  1688. v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  1689. v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  1690. v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  1691. v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  1692. v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  1693. v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  1694. v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  1695. v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  1696. v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  1697. v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  1698. v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
  1699. u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  1700. u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  1701. u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  1702. u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  1703. u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  1704. u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  1705. u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  1706. u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  1707. u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  1708. u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  1709. u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  1710. u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  1711. u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  1712. u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  1713. u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  1714. u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
  1715. u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  1716. u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  1717. u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  1718. u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  1719. u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  1720. u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  1721. u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  1722. u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  1723. u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  1724. u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  1725. u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  1726. u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  1727. u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  1728. u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  1729. u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  1730. u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
  1731. s[8] = _mm_packs_epi32(u[0], u[1]);
  1732. s[15] = _mm_packs_epi32(u[2], u[3]);
  1733. s[9] = _mm_packs_epi32(u[4], u[5]);
  1734. s[14] = _mm_packs_epi32(u[6], u[7]);
  1735. s[10] = _mm_packs_epi32(u[8], u[9]);
  1736. s[13] = _mm_packs_epi32(u[10], u[11]);
  1737. s[11] = _mm_packs_epi32(u[12], u[13]);
  1738. s[12] = _mm_packs_epi32(u[14], u[15]);
  1739. // stage 3
  1740. t[0] = s[0];
  1741. t[1] = s[1];
  1742. t[2] = s[2];
  1743. t[3] = s[3];
  1744. u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  1745. u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  1746. u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  1747. u[3] = _mm_unpackhi_epi16(s[5], s[6]);
  1748. v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  1749. v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  1750. v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  1751. v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  1752. v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  1753. v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  1754. v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  1755. v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  1756. u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  1757. u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  1758. u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  1759. u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  1760. u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  1761. u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  1762. u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  1763. u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  1764. u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  1765. u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  1766. u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  1767. u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  1768. u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  1769. u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  1770. u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  1771. u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  1772. t[4] = _mm_packs_epi32(u[0], u[1]);
  1773. t[7] = _mm_packs_epi32(u[2], u[3]);
  1774. t[5] = _mm_packs_epi32(u[4], u[5]);
  1775. t[6] = _mm_packs_epi32(u[6], u[7]);
  1776. t[8] = _mm_add_epi16(s[8], s[9]);
  1777. t[9] = _mm_sub_epi16(s[8], s[9]);
  1778. t[10] = _mm_sub_epi16(s[11], s[10]);
  1779. t[11] = _mm_add_epi16(s[10], s[11]);
  1780. t[12] = _mm_add_epi16(s[12], s[13]);
  1781. t[13] = _mm_sub_epi16(s[12], s[13]);
  1782. t[14] = _mm_sub_epi16(s[15], s[14]);
  1783. t[15] = _mm_add_epi16(s[14], s[15]);
  1784. // stage 4
  1785. u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  1786. u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  1787. u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  1788. u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  1789. u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  1790. u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  1791. u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  1792. u[7] = _mm_unpackhi_epi16(t[10], t[13]);
  1793. v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  1794. v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  1795. v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  1796. v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  1797. v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  1798. v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  1799. v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  1800. v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  1801. v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  1802. v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  1803. v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  1804. v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  1805. v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  1806. v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  1807. v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  1808. v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
  1809. u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  1810. u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  1811. u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  1812. u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  1813. u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  1814. u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  1815. u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  1816. u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  1817. u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  1818. u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  1819. u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  1820. u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  1821. u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  1822. u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  1823. u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  1824. u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
  1825. u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  1826. u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  1827. u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  1828. u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  1829. u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  1830. u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  1831. u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  1832. u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  1833. u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  1834. u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  1835. u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  1836. u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  1837. u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  1838. u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  1839. u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  1840. u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
  1841. s[0] = _mm_packs_epi32(u[0], u[1]);
  1842. s[1] = _mm_packs_epi32(u[2], u[3]);
  1843. s[2] = _mm_packs_epi32(u[4], u[5]);
  1844. s[3] = _mm_packs_epi32(u[6], u[7]);
  1845. s[4] = _mm_add_epi16(t[4], t[5]);
  1846. s[5] = _mm_sub_epi16(t[4], t[5]);
  1847. s[6] = _mm_sub_epi16(t[7], t[6]);
  1848. s[7] = _mm_add_epi16(t[6], t[7]);
  1849. s[8] = t[8];
  1850. s[15] = t[15];
  1851. s[9] = _mm_packs_epi32(u[8], u[9]);
  1852. s[14] = _mm_packs_epi32(u[10], u[11]);
  1853. s[10] = _mm_packs_epi32(u[12], u[13]);
  1854. s[13] = _mm_packs_epi32(u[14], u[15]);
  1855. s[11] = t[11];
  1856. s[12] = t[12];
  1857. // stage 5
  1858. t[0] = _mm_add_epi16(s[0], s[3]);
  1859. t[1] = _mm_add_epi16(s[1], s[2]);
  1860. t[2] = _mm_sub_epi16(s[1], s[2]);
  1861. t[3] = _mm_sub_epi16(s[0], s[3]);
  1862. t[4] = s[4];
  1863. t[7] = s[7];
  1864. u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  1865. u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  1866. v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  1867. v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  1868. v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  1869. v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  1870. u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  1871. u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  1872. u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  1873. u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  1874. u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  1875. u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  1876. u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  1877. u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  1878. t[5] = _mm_packs_epi32(u[0], u[1]);
  1879. t[6] = _mm_packs_epi32(u[2], u[3]);
  1880. t[8] = _mm_add_epi16(s[8], s[11]);
  1881. t[9] = _mm_add_epi16(s[9], s[10]);
  1882. t[10] = _mm_sub_epi16(s[9], s[10]);
  1883. t[11] = _mm_sub_epi16(s[8], s[11]);
  1884. t[12] = _mm_sub_epi16(s[15], s[12]);
  1885. t[13] = _mm_sub_epi16(s[14], s[13]);
  1886. t[14] = _mm_add_epi16(s[13], s[14]);
  1887. t[15] = _mm_add_epi16(s[12], s[15]);
  1888. // stage 6
  1889. s[0] = _mm_add_epi16(t[0], t[7]);
  1890. s[1] = _mm_add_epi16(t[1], t[6]);
  1891. s[2] = _mm_add_epi16(t[2], t[5]);
  1892. s[3] = _mm_add_epi16(t[3], t[4]);
  1893. s[4] = _mm_sub_epi16(t[3], t[4]);
  1894. s[5] = _mm_sub_epi16(t[2], t[5]);
  1895. s[6] = _mm_sub_epi16(t[1], t[6]);
  1896. s[7] = _mm_sub_epi16(t[0], t[7]);
  1897. s[8] = t[8];
  1898. s[9] = t[9];
  1899. u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  1900. u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  1901. u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  1902. u[3] = _mm_unpackhi_epi16(t[11], t[12]);
  1903. v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  1904. v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  1905. v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  1906. v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  1907. v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  1908. v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  1909. v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  1910. v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  1911. u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  1912. u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  1913. u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  1914. u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  1915. u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  1916. u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  1917. u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  1918. u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  1919. u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  1920. u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  1921. u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  1922. u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  1923. u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  1924. u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  1925. u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  1926. u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  1927. s[10] = _mm_packs_epi32(u[0], u[1]);
  1928. s[13] = _mm_packs_epi32(u[2], u[3]);
  1929. s[11] = _mm_packs_epi32(u[4], u[5]);
  1930. s[12] = _mm_packs_epi32(u[6], u[7]);
  1931. s[14] = t[14];
  1932. s[15] = t[15];
  1933. // stage 7
  1934. in[0] = _mm_add_epi16(s[0], s[15]);
  1935. in[1] = _mm_add_epi16(s[1], s[14]);
  1936. in[2] = _mm_add_epi16(s[2], s[13]);
  1937. in[3] = _mm_add_epi16(s[3], s[12]);
  1938. in[4] = _mm_add_epi16(s[4], s[11]);
  1939. in[5] = _mm_add_epi16(s[5], s[10]);
  1940. in[6] = _mm_add_epi16(s[6], s[9]);
  1941. in[7] = _mm_add_epi16(s[7], s[8]);
  1942. in[8] = _mm_sub_epi16(s[7], s[8]);
  1943. in[9] = _mm_sub_epi16(s[6], s[9]);
  1944. in[10] = _mm_sub_epi16(s[5], s[10]);
  1945. in[11] = _mm_sub_epi16(s[4], s[11]);
  1946. in[12] = _mm_sub_epi16(s[3], s[12]);
  1947. in[13] = _mm_sub_epi16(s[2], s[13]);
  1948. in[14] = _mm_sub_epi16(s[1], s[14]);
  1949. in[15] = _mm_sub_epi16(s[0], s[15]);
  1950. }
  1951. void idct16_sse2(__m128i *in0, __m128i *in1) {
  1952. array_transpose_16x16(in0, in1);
  1953. idct16_8col(in0);
  1954. idct16_8col(in1);
  1955. }
  1956. void iadst16_sse2(__m128i *in0, __m128i *in1) {
  1957. array_transpose_16x16(in0, in1);
  1958. iadst16_8col(in0);
  1959. iadst16_8col(in1);
  1960. }
  1961. void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
  1962. int stride) {
  1963. const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  1964. const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  1965. const __m128i zero = _mm_setzero_si128();
  1966. const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  1967. const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  1968. const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  1969. const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
  1970. const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  1971. const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  1972. const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  1973. const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  1974. const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  1975. const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  1976. const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  1977. const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  1978. const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  1979. __m128i in[16], l[16];
  1980. __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6,
  1981. stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
  1982. stp1_8_0, stp1_12_0;
  1983. __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
  1984. stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
  1985. __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  1986. int i;
  1987. // First 1-D inverse DCT
  1988. // Load input data.
  1989. in[0] = load_input_data(input);
  1990. in[1] = load_input_data(input + 8 * 2);
  1991. in[2] = load_input_data(input + 8 * 4);
  1992. in[3] = load_input_data(input + 8 * 6);
  1993. TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
  1994. // Stage2
  1995. {
  1996. const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
  1997. const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
  1998. tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
  1999. tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
  2000. tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
  2001. tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
  2002. tmp0 = _mm_add_epi32(tmp0, rounding);
  2003. tmp2 = _mm_add_epi32(tmp2, rounding);
  2004. tmp5 = _mm_add_epi32(tmp5, rounding);
  2005. tmp7 = _mm_add_epi32(tmp7, rounding);
  2006. tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
  2007. tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
  2008. tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
  2009. tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
  2010. stp2_8 = _mm_packs_epi32(tmp0, tmp2);
  2011. stp2_11 = _mm_packs_epi32(tmp5, tmp7);
  2012. }
  2013. // Stage3
  2014. {
  2015. const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
  2016. tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
  2017. tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
  2018. tmp0 = _mm_add_epi32(tmp0, rounding);
  2019. tmp2 = _mm_add_epi32(tmp2, rounding);
  2020. tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
  2021. tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
  2022. stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
  2023. stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
  2024. stp1_4 = _mm_packs_epi32(tmp0, tmp2);
  2025. }
  2026. // Stage4
  2027. {
  2028. const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
  2029. const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
  2030. const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
  2031. tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
  2032. tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
  2033. tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
  2034. tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
  2035. tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
  2036. tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
  2037. tmp0 = _mm_add_epi32(tmp0, rounding);
  2038. tmp2 = _mm_add_epi32(tmp2, rounding);
  2039. tmp1 = _mm_add_epi32(tmp1, rounding);
  2040. tmp3 = _mm_add_epi32(tmp3, rounding);
  2041. tmp5 = _mm_add_epi32(tmp5, rounding);
  2042. tmp7 = _mm_add_epi32(tmp7, rounding);
  2043. tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
  2044. tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
  2045. tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
  2046. tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
  2047. tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
  2048. tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
  2049. stp1_0 = _mm_packs_epi32(tmp0, tmp0);
  2050. stp1_1 = _mm_packs_epi32(tmp2, tmp2);
  2051. stp2_9 = _mm_packs_epi32(tmp1, tmp3);
  2052. stp2_10 = _mm_packs_epi32(tmp5, tmp7);
  2053. stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
  2054. }
  2055. // Stage5 and Stage6
  2056. {
  2057. tmp0 = _mm_add_epi16(stp2_8, stp2_11);
  2058. tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
  2059. tmp2 = _mm_add_epi16(stp2_9, stp2_10);
  2060. tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
  2061. stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
  2062. stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
  2063. stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
  2064. stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
  2065. stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
  2066. stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
  2067. stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
  2068. stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
  2069. }
  2070. // Stage6
  2071. {
  2072. const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
  2073. const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
  2074. const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
  2075. tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
  2076. tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
  2077. tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
  2078. tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
  2079. tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
  2080. tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
  2081. tmp1 = _mm_add_epi32(tmp1, rounding);
  2082. tmp3 = _mm_add_epi32(tmp3, rounding);
  2083. tmp0 = _mm_add_epi32(tmp0, rounding);
  2084. tmp2 = _mm_add_epi32(tmp2, rounding);
  2085. tmp4 = _mm_add_epi32(tmp4, rounding);
  2086. tmp6 = _mm_add_epi32(tmp6, rounding);
  2087. tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
  2088. tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
  2089. tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
  2090. tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
  2091. tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
  2092. tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
  2093. stp1_6 = _mm_packs_epi32(tmp3, tmp1);
  2094. stp2_10 = _mm_packs_epi32(tmp0, zero);
  2095. stp2_13 = _mm_packs_epi32(tmp2, zero);
  2096. stp2_11 = _mm_packs_epi32(tmp4, zero);
  2097. stp2_12 = _mm_packs_epi32(tmp6, zero);
  2098. tmp0 = _mm_add_epi16(stp1_0, stp1_4);
  2099. tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
  2100. tmp2 = _mm_add_epi16(stp1_1, stp1_6);
  2101. tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
  2102. stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
  2103. stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
  2104. stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
  2105. stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
  2106. stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
  2107. stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
  2108. stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
  2109. stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
  2110. }
  2111. // Stage7. Left 8x16 only.
  2112. l[0] = _mm_add_epi16(stp2_0, stp1_15);
  2113. l[1] = _mm_add_epi16(stp2_1, stp1_14);
  2114. l[2] = _mm_add_epi16(stp2_2, stp2_13);
  2115. l[3] = _mm_add_epi16(stp2_3, stp2_12);
  2116. l[4] = _mm_add_epi16(stp2_4, stp2_11);
  2117. l[5] = _mm_add_epi16(stp2_5, stp2_10);
  2118. l[6] = _mm_add_epi16(stp2_6, stp1_9);
  2119. l[7] = _mm_add_epi16(stp2_7, stp1_8);
  2120. l[8] = _mm_sub_epi16(stp2_7, stp1_8);
  2121. l[9] = _mm_sub_epi16(stp2_6, stp1_9);
  2122. l[10] = _mm_sub_epi16(stp2_5, stp2_10);
  2123. l[11] = _mm_sub_epi16(stp2_4, stp2_11);
  2124. l[12] = _mm_sub_epi16(stp2_3, stp2_12);
  2125. l[13] = _mm_sub_epi16(stp2_2, stp2_13);
  2126. l[14] = _mm_sub_epi16(stp2_1, stp1_14);
  2127. l[15] = _mm_sub_epi16(stp2_0, stp1_15);
  2128. // Second 1-D inverse transform, performed per 8x16 block
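// Each iteration transposes one half of the first-pass output and runs the
// reduced 16-point column transform on it, producing eight output columns.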
  2129. for (i = 0; i < 2; i++) {
  2130. int j;
  2131. array_transpose_4X8(l + 8 * i, in);
  2132. IDCT16_10
  2133. // Stage7
  2134. in[0] = _mm_add_epi16(stp2_0, stp1_15);
  2135. in[1] = _mm_add_epi16(stp2_1, stp1_14);
  2136. in[2] = _mm_add_epi16(stp2_2, stp2_13);
  2137. in[3] = _mm_add_epi16(stp2_3, stp2_12);
  2138. in[4] = _mm_add_epi16(stp2_4, stp2_11);
  2139. in[5] = _mm_add_epi16(stp2_5, stp2_10);
  2140. in[6] = _mm_add_epi16(stp2_6, stp1_9);
  2141. in[7] = _mm_add_epi16(stp2_7, stp1_8);
  2142. in[8] = _mm_sub_epi16(stp2_7, stp1_8);
  2143. in[9] = _mm_sub_epi16(stp2_6, stp1_9);
  2144. in[10] = _mm_sub_epi16(stp2_5, stp2_10);
  2145. in[11] = _mm_sub_epi16(stp2_4, stp2_11);
  2146. in[12] = _mm_sub_epi16(stp2_3, stp2_12);
  2147. in[13] = _mm_sub_epi16(stp2_2, stp2_13);
  2148. in[14] = _mm_sub_epi16(stp2_1, stp1_14);
  2149. in[15] = _mm_sub_epi16(stp2_0, stp1_15);
  2150. for (j = 0; j < 16; ++j) {
  2151. // Final rounding and shift
  2152. in[j] = _mm_adds_epi16(in[j], final_rounding);
  2153. in[j] = _mm_srai_epi16(in[j], 6);
  2154. RECON_AND_STORE(dest + j * stride, in[j]);
  2155. }
  2156. dest += 8;
  2157. }
  2158. }
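// LOAD_DQCOEFF reads eight 16-bit dequantized coefficients into an SSE2
// register and advances the input pointer past them.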
2159. #define LOAD_DQCOEFF(reg, input) \
2160. { \
2161. reg = load_input_data(input); \
2162. input += 8; \
2163. }
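// IDCT32_34: one pass of the 32-point inverse DCT for the reduced-coefficient
// case where only the upper-left 8x8 block of inputs can be non-zero. Inputs
// known to be zero are unpacked against zero, so several butterfly outputs
// collapse to plain copies of their stage inputs.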
  2164. #define IDCT32_34 \
  2165. /* Stage1 */ \
  2166. { \
2167. const __m128i zero = _mm_setzero_si128(); \
  2168. const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
  2169. const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
  2170. \
2171. const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
  2172. const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
  2173. \
  2174. const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
  2175. const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
  2176. \
  2177. const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
  2178. const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
  2179. \
  2180. MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
  2181. stg1_1, stp1_16, stp1_31); \
  2182. MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
  2183. stg1_7, stp1_19, stp1_28); \
  2184. MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
  2185. stg1_9, stp1_20, stp1_27); \
  2186. MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
  2187. stg1_15, stp1_23, stp1_24); \
  2188. } \
  2189. \
  2190. /* Stage2 */ \
  2191. { \
2192. const __m128i zero = _mm_setzero_si128(); \
  2193. const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
  2194. const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
  2195. \
  2196. const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
  2197. const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
  2198. \
  2199. MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
  2200. stg2_1, stp2_8, stp2_15); \
  2201. MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
  2202. stg2_7, stp2_11, stp2_12); \
  2203. \
  2204. stp2_16 = stp1_16; \
  2205. stp2_19 = stp1_19; \
  2206. \
  2207. stp2_20 = stp1_20; \
  2208. stp2_23 = stp1_23; \
  2209. \
  2210. stp2_24 = stp1_24; \
  2211. stp2_27 = stp1_27; \
  2212. \
  2213. stp2_28 = stp1_28; \
  2214. stp2_31 = stp1_31; \
  2215. } \
  2216. \
  2217. /* Stage3 */ \
  2218. { \
2219. const __m128i zero = _mm_setzero_si128(); \
  2220. const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
  2221. const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
  2222. \
  2223. const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
  2224. const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
  2225. const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
  2226. const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
  2227. \
  2228. const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
  2229. const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
  2230. const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2231. const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
  2232. \
  2233. MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
  2234. stg3_1, stp1_4, stp1_7); \
  2235. \
  2236. stp1_8 = stp2_8; \
  2237. stp1_11 = stp2_11; \
  2238. stp1_12 = stp2_12; \
  2239. stp1_15 = stp2_15; \
  2240. \
  2241. MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
  2242. stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
  2243. stp1_18, stp1_29) \
  2244. MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
  2245. stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
  2246. stp1_22, stp1_25) \
  2247. \
  2248. stp1_16 = stp2_16; \
  2249. stp1_31 = stp2_31; \
  2250. stp1_19 = stp2_19; \
  2251. stp1_20 = stp2_20; \
  2252. stp1_23 = stp2_23; \
  2253. stp1_24 = stp2_24; \
  2254. stp1_27 = stp2_27; \
  2255. stp1_28 = stp2_28; \
  2256. } \
  2257. \
  2258. /* Stage4 */ \
  2259. { \
2260. const __m128i zero = _mm_setzero_si128(); \
  2261. const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
  2262. const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
  2263. \
  2264. const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
  2265. const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
  2266. const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
  2267. const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
  2268. \
  2269. MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
  2270. stg4_1, stp2_0, stp2_1); \
  2271. \
  2272. stp2_4 = stp1_4; \
  2273. stp2_5 = stp1_4; \
  2274. stp2_6 = stp1_7; \
  2275. stp2_7 = stp1_7; \
  2276. \
  2277. MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
  2278. stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
  2279. stp2_10, stp2_13) \
  2280. \
  2281. stp2_8 = stp1_8; \
  2282. stp2_15 = stp1_15; \
  2283. stp2_11 = stp1_11; \
  2284. stp2_12 = stp1_12; \
  2285. \
  2286. stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
  2287. stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
  2288. stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
  2289. stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
  2290. stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
  2291. stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
  2292. stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
  2293. stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
  2294. \
  2295. stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
  2296. stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
  2297. stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
  2298. stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
  2299. stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
  2300. stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
  2301. stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
  2302. stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
  2303. } \
  2304. \
  2305. /* Stage5 */ \
  2306. { \
  2307. const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
  2308. const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
  2309. const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  2310. const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  2311. \
  2312. const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
  2313. const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
  2314. const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  2315. const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  2316. \
  2317. const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  2318. const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  2319. \
  2320. stp1_0 = stp2_0; \
  2321. stp1_1 = stp2_1; \
  2322. stp1_2 = stp2_1; \
  2323. stp1_3 = stp2_0; \
  2324. \
  2325. tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
  2326. tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
  2327. tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
  2328. tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
  2329. \
  2330. tmp0 = _mm_add_epi32(tmp0, rounding); \
  2331. tmp1 = _mm_add_epi32(tmp1, rounding); \
  2332. tmp2 = _mm_add_epi32(tmp2, rounding); \
  2333. tmp3 = _mm_add_epi32(tmp3, rounding); \
  2334. \
  2335. tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
  2336. tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
  2337. tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
  2338. tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  2339. \
  2340. stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
  2341. stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  2342. \
  2343. stp1_4 = stp2_4; \
  2344. stp1_7 = stp2_7; \
  2345. \
  2346. stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
  2347. stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
  2348. stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
  2349. stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
  2350. stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
  2351. stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
  2352. stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
  2353. stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
  2354. \
  2355. stp1_16 = stp2_16; \
  2356. stp1_17 = stp2_17; \
  2357. \
  2358. MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
  2359. stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
  2360. stp1_19, stp1_28) \
  2361. MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
  2362. stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
  2363. stp1_21, stp1_26) \
  2364. \
  2365. stp1_22 = stp2_22; \
  2366. stp1_23 = stp2_23; \
  2367. stp1_24 = stp2_24; \
  2368. stp1_25 = stp2_25; \
  2369. stp1_30 = stp2_30; \
  2370. stp1_31 = stp2_31; \
  2371. } \
  2372. \
  2373. /* Stage6 */ \
  2374. { \
  2375. const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  2376. const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  2377. const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
  2378. const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
  2379. \
  2380. stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
  2381. stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
  2382. stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
  2383. stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
  2384. stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
  2385. stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
  2386. stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
  2387. stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
  2388. \
  2389. stp2_8 = stp1_8; \
  2390. stp2_9 = stp1_9; \
  2391. stp2_14 = stp1_14; \
  2392. stp2_15 = stp1_15; \
  2393. \
  2394. MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
  2395. stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
  2396. stp2_13, stp2_11, stp2_12) \
  2397. \
  2398. stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
  2399. stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
  2400. stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
  2401. stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
  2402. stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
  2403. stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
  2404. stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
  2405. stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
  2406. \
  2407. stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
  2408. stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
  2409. stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
  2410. stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
  2411. stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
  2412. stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
  2413. stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
  2414. stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
  2415. } \
  2416. \
  2417. /* Stage7 */ \
  2418. { \
  2419. const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  2420. const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  2421. const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  2422. const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  2423. \
  2424. const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  2425. const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  2426. const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
  2427. const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
  2428. \
  2429. stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
  2430. stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
  2431. stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
  2432. stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
  2433. stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
  2434. stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
  2435. stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
  2436. stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
  2437. stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
  2438. stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
  2439. stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
  2440. stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
  2441. stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
  2442. stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
  2443. stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
  2444. stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
  2445. \
  2446. stp1_16 = stp2_16; \
  2447. stp1_17 = stp2_17; \
  2448. stp1_18 = stp2_18; \
  2449. stp1_19 = stp2_19; \
  2450. \
  2451. MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
  2452. stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
  2453. stp1_21, stp1_26) \
  2454. MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
  2455. stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
  2456. stp1_23, stp1_24) \
  2457. \
  2458. stp1_28 = stp2_28; \
  2459. stp1_29 = stp2_29; \
  2460. stp1_30 = stp2_30; \
  2461. stp1_31 = stp2_31; \
  2462. }
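// IDCT32: one full pass of the 32-point inverse DCT; all 32 input rows
// participate, so every stage performs its complete set of butterflies and
// rotations.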
  2463. #define IDCT32 \
  2464. /* Stage1 */ \
  2465. { \
  2466. const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
  2467. const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
  2468. const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
  2469. const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
  2470. \
  2471. const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
  2472. const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
2473. const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
  2474. const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
  2475. \
  2476. const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
  2477. const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
  2478. const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
  2479. const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
  2480. \
  2481. const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
  2482. const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
  2483. const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
  2484. const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
  2485. \
  2486. MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
  2487. stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
  2488. stp1_17, stp1_30) \
  2489. MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
  2490. stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
  2491. stp1_19, stp1_28) \
  2492. MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
  2493. stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
  2494. stp1_21, stp1_26) \
  2495. MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
  2496. stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
  2497. stp1_23, stp1_24) \
  2498. } \
  2499. \
  2500. /* Stage2 */ \
  2501. { \
  2502. const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
  2503. const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
  2504. const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
  2505. const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
  2506. \
  2507. const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
  2508. const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
  2509. const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
  2510. const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
  2511. \
  2512. MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
  2513. stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
  2514. stp2_14) \
  2515. MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
  2516. stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
  2517. stp2_11, stp2_12) \
  2518. \
  2519. stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
  2520. stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
  2521. stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
  2522. stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
  2523. \
  2524. stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
  2525. stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
  2526. stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
  2527. stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
  2528. \
  2529. stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
  2530. stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
  2531. stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
  2532. stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
  2533. \
  2534. stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
  2535. stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
  2536. stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
  2537. stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
  2538. } \
  2539. \
  2540. /* Stage3 */ \
  2541. { \
  2542. const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
  2543. const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
  2544. const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
  2545. const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
  2546. \
  2547. const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
  2548. const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
  2549. const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  2550. const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  2551. \
  2552. const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  2553. const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  2554. const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  2555. const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  2556. \
  2557. MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
  2558. stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
  2559. stp1_6) \
  2560. \
  2561. stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
  2562. stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
  2563. stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
  2564. stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
  2565. stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
  2566. stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
  2567. stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
  2568. stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  2569. \
  2570. MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
  2571. stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
  2572. stp1_18, stp1_29) \
  2573. MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
  2574. stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
  2575. stp1_22, stp1_25) \
  2576. \
  2577. stp1_16 = stp2_16; \
  2578. stp1_31 = stp2_31; \
  2579. stp1_19 = stp2_19; \
  2580. stp1_20 = stp2_20; \
  2581. stp1_23 = stp2_23; \
  2582. stp1_24 = stp2_24; \
  2583. stp1_27 = stp2_27; \
  2584. stp1_28 = stp2_28; \
  2585. } \
  2586. \
  2587. /* Stage4 */ \
  2588. { \
  2589. const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
  2590. const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
  2591. const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
  2592. const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
  2593. \
  2594. const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
  2595. const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
  2596. const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  2597. const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  2598. \
  2599. MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
  2600. stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
  2601. stp2_2, stp2_3) \
  2602. \
  2603. stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
  2604. stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
  2605. stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
  2606. stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
  2607. \
  2608. MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
  2609. stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
  2610. stp2_10, stp2_13) \
  2611. \
  2612. stp2_8 = stp1_8; \
  2613. stp2_15 = stp1_15; \
  2614. stp2_11 = stp1_11; \
  2615. stp2_12 = stp1_12; \
  2616. \
  2617. stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
  2618. stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
  2619. stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
  2620. stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
  2621. stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
  2622. stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
  2623. stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
  2624. stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
  2625. \
  2626. stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
  2627. stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
  2628. stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
  2629. stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
  2630. stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
  2631. stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
  2632. stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
  2633. stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
  2634. } \
  2635. \
  2636. /* Stage5 */ \
  2637. { \
  2638. const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
  2639. const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
  2640. const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
  2641. const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
  2642. \
  2643. const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
  2644. const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
  2645. const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  2646. const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  2647. \
  2648. const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  2649. const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  2650. \
  2651. stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
  2652. stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
  2653. stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
  2654. stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
  2655. \
  2656. tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
  2657. tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
  2658. tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
  2659. tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
  2660. \
  2661. tmp0 = _mm_add_epi32(tmp0, rounding); \
  2662. tmp1 = _mm_add_epi32(tmp1, rounding); \
  2663. tmp2 = _mm_add_epi32(tmp2, rounding); \
  2664. tmp3 = _mm_add_epi32(tmp3, rounding); \
  2665. \
  2666. tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
  2667. tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
  2668. tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
  2669. tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  2670. \
  2671. stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
  2672. stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  2673. \
  2674. stp1_4 = stp2_4; \
  2675. stp1_7 = stp2_7; \
  2676. \
  2677. stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
  2678. stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
  2679. stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
  2680. stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
  2681. stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
  2682. stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
  2683. stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
  2684. stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
  2685. \
  2686. stp1_16 = stp2_16; \
  2687. stp1_17 = stp2_17; \
  2688. \
  2689. MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
  2690. stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
  2691. stp1_19, stp1_28) \
  2692. MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
  2693. stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
  2694. stp1_21, stp1_26) \
  2695. \
  2696. stp1_22 = stp2_22; \
  2697. stp1_23 = stp2_23; \
  2698. stp1_24 = stp2_24; \
  2699. stp1_25 = stp2_25; \
  2700. stp1_30 = stp2_30; \
  2701. stp1_31 = stp2_31; \
  2702. } \
  2703. \
  2704. /* Stage6 */ \
  2705. { \
  2706. const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
  2707. const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
  2708. const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
  2709. const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
  2710. \
  2711. stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
  2712. stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
  2713. stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
  2714. stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
  2715. stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
  2716. stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
  2717. stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
  2718. stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
  2719. \
  2720. stp2_8 = stp1_8; \
  2721. stp2_9 = stp1_9; \
  2722. stp2_14 = stp1_14; \
  2723. stp2_15 = stp1_15; \
  2724. \
  2725. MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
  2726. stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
  2727. stp2_13, stp2_11, stp2_12) \
  2728. \
  2729. stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
  2730. stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
  2731. stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
  2732. stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
  2733. stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
  2734. stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
  2735. stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
  2736. stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
  2737. \
  2738. stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
  2739. stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
  2740. stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
  2741. stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
  2742. stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
  2743. stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
  2744. stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
  2745. stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
  2746. } \
  2747. \
  2748. /* Stage7 */ \
  2749. { \
  2750. const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
  2751. const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
  2752. const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
  2753. const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
  2754. \
  2755. const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
  2756. const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
  2757. const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
  2758. const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
  2759. \
  2760. stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
  2761. stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
  2762. stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
  2763. stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
  2764. stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
  2765. stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
  2766. stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
  2767. stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
  2768. stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
  2769. stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
  2770. stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
  2771. stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
  2772. stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
  2773. stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
  2774. stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
  2775. stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
  2776. \
  2777. stp1_16 = stp2_16; \
  2778. stp1_17 = stp2_17; \
  2779. stp1_18 = stp2_18; \
  2780. stp1_19 = stp2_19; \
  2781. \
  2782. MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
  2783. stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
  2784. stp1_21, stp1_26) \
  2785. MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
  2786. stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
  2787. stp1_23, stp1_24) \
  2788. \
  2789. stp1_28 = stp2_28; \
  2790. stp1_29 = stp2_29; \
  2791. stp1_30 = stp2_30; \
  2792. stp1_31 = stp2_31; \
  2793. }
2794. // Only the upper-left 8x8 block has non-zero coefficients.
  2795. void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
  2796. int stride) {
  2797. const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
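// 1 << 5 pairs with the arithmetic shift right by 6 in the final store loop,
// i.e. the output rounding is (x + 32) >> 6.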
2798. const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  2799. // idct constants for each stage
  2800. const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  2801. const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  2802. const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  2803. const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  2804. const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  2805. const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  2806. const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  2807. const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
  2808. const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  2809. const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  2810. const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  2811. const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
  2812. const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  2813. const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  2814. const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  2815. const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  2816. const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  2817. const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  2818. const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  2819. const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
  2820. const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  2821. const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  2822. const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  2823. const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  2824. const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  2825. const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  2826. __m128i in[32], col[32];
  2827. __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
  2828. stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
  2829. stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
  2830. stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
  2831. stp1_30, stp1_31;
  2832. __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
  2833. stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
  2834. stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
  2835. stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
  2836. stp2_30, stp2_31;
  2837. __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  2838. int i;
2839. // Load input data. Only the top-left 8x8 block needs to be loaded.
  2840. in[0] = load_input_data(input);
  2841. in[1] = load_input_data(input + 32);
  2842. in[2] = load_input_data(input + 64);
  2843. in[3] = load_input_data(input + 96);
  2844. in[4] = load_input_data(input + 128);
  2845. in[5] = load_input_data(input + 160);
  2846. in[6] = load_input_data(input + 192);
  2847. in[7] = load_input_data(input + 224);
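// Rows 8..31 carry no coefficients in this reduced path (everything non-zero
// lives in the top-left 8x8 block), so clear them before the transform.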
  2848. for (i = 8; i < 32; ++i) {
  2849. in[i] = _mm_setzero_si128();
  2850. }
  2851. array_transpose_8x8(in, in);
2852. // TODO(hkuang): The following transposes are unnecessary, but removing them
2853. // leads to a performance drop on some devices.
  2854. array_transpose_8x8(in + 8, in + 8);
  2855. array_transpose_8x8(in + 16, in + 16);
  2856. array_transpose_8x8(in + 24, in + 24);
  2857. IDCT32_34
2858. // 1-D: Store 32 intermediate results for each 8x32 block.
  2859. col[0] = _mm_add_epi16(stp1_0, stp1_31);
  2860. col[1] = _mm_add_epi16(stp1_1, stp1_30);
  2861. col[2] = _mm_add_epi16(stp1_2, stp1_29);
  2862. col[3] = _mm_add_epi16(stp1_3, stp1_28);
  2863. col[4] = _mm_add_epi16(stp1_4, stp1_27);
  2864. col[5] = _mm_add_epi16(stp1_5, stp1_26);
  2865. col[6] = _mm_add_epi16(stp1_6, stp1_25);
  2866. col[7] = _mm_add_epi16(stp1_7, stp1_24);
  2867. col[8] = _mm_add_epi16(stp1_8, stp1_23);
  2868. col[9] = _mm_add_epi16(stp1_9, stp1_22);
  2869. col[10] = _mm_add_epi16(stp1_10, stp1_21);
  2870. col[11] = _mm_add_epi16(stp1_11, stp1_20);
  2871. col[12] = _mm_add_epi16(stp1_12, stp1_19);
  2872. col[13] = _mm_add_epi16(stp1_13, stp1_18);
  2873. col[14] = _mm_add_epi16(stp1_14, stp1_17);
  2874. col[15] = _mm_add_epi16(stp1_15, stp1_16);
  2875. col[16] = _mm_sub_epi16(stp1_15, stp1_16);
  2876. col[17] = _mm_sub_epi16(stp1_14, stp1_17);
  2877. col[18] = _mm_sub_epi16(stp1_13, stp1_18);
  2878. col[19] = _mm_sub_epi16(stp1_12, stp1_19);
  2879. col[20] = _mm_sub_epi16(stp1_11, stp1_20);
  2880. col[21] = _mm_sub_epi16(stp1_10, stp1_21);
  2881. col[22] = _mm_sub_epi16(stp1_9, stp1_22);
  2882. col[23] = _mm_sub_epi16(stp1_8, stp1_23);
  2883. col[24] = _mm_sub_epi16(stp1_7, stp1_24);
  2884. col[25] = _mm_sub_epi16(stp1_6, stp1_25);
  2885. col[26] = _mm_sub_epi16(stp1_5, stp1_26);
  2886. col[27] = _mm_sub_epi16(stp1_4, stp1_27);
  2887. col[28] = _mm_sub_epi16(stp1_3, stp1_28);
  2888. col[29] = _mm_sub_epi16(stp1_2, stp1_29);
  2889. col[30] = _mm_sub_epi16(stp1_1, stp1_30);
  2890. col[31] = _mm_sub_epi16(stp1_0, stp1_31);
  2891. for (i = 0; i < 4; i++) {
  2892. int j;
  2893. const __m128i zero = _mm_setzero_si128();
2894. // Transpose the i-th 8x8 sub-block; after the first pass only the first
2894. // eight rows of each column are non-zero in this path.
  2895. array_transpose_8x8(col + i * 8, in);
  2896. IDCT32_34
2897. // 2-D: Calculate the results and store them to the destination.
  2898. in[0] = _mm_add_epi16(stp1_0, stp1_31);
  2899. in[1] = _mm_add_epi16(stp1_1, stp1_30);
  2900. in[2] = _mm_add_epi16(stp1_2, stp1_29);
  2901. in[3] = _mm_add_epi16(stp1_3, stp1_28);
  2902. in[4] = _mm_add_epi16(stp1_4, stp1_27);
  2903. in[5] = _mm_add_epi16(stp1_5, stp1_26);
  2904. in[6] = _mm_add_epi16(stp1_6, stp1_25);
  2905. in[7] = _mm_add_epi16(stp1_7, stp1_24);
  2906. in[8] = _mm_add_epi16(stp1_8, stp1_23);
  2907. in[9] = _mm_add_epi16(stp1_9, stp1_22);
  2908. in[10] = _mm_add_epi16(stp1_10, stp1_21);
  2909. in[11] = _mm_add_epi16(stp1_11, stp1_20);
  2910. in[12] = _mm_add_epi16(stp1_12, stp1_19);
  2911. in[13] = _mm_add_epi16(stp1_13, stp1_18);
  2912. in[14] = _mm_add_epi16(stp1_14, stp1_17);
  2913. in[15] = _mm_add_epi16(stp1_15, stp1_16);
  2914. in[16] = _mm_sub_epi16(stp1_15, stp1_16);
  2915. in[17] = _mm_sub_epi16(stp1_14, stp1_17);
  2916. in[18] = _mm_sub_epi16(stp1_13, stp1_18);
  2917. in[19] = _mm_sub_epi16(stp1_12, stp1_19);
  2918. in[20] = _mm_sub_epi16(stp1_11, stp1_20);
  2919. in[21] = _mm_sub_epi16(stp1_10, stp1_21);
  2920. in[22] = _mm_sub_epi16(stp1_9, stp1_22);
  2921. in[23] = _mm_sub_epi16(stp1_8, stp1_23);
  2922. in[24] = _mm_sub_epi16(stp1_7, stp1_24);
  2923. in[25] = _mm_sub_epi16(stp1_6, stp1_25);
  2924. in[26] = _mm_sub_epi16(stp1_5, stp1_26);
  2925. in[27] = _mm_sub_epi16(stp1_4, stp1_27);
  2926. in[28] = _mm_sub_epi16(stp1_3, stp1_28);
  2927. in[29] = _mm_sub_epi16(stp1_2, stp1_29);
  2928. in[30] = _mm_sub_epi16(stp1_1, stp1_30);
  2929. in[31] = _mm_sub_epi16(stp1_0, stp1_31);
  2930. for (j = 0; j < 32; ++j) {
  2931. // Final rounding and shift
  2932. in[j] = _mm_adds_epi16(in[j], final_rounding);
  2933. in[j] = _mm_srai_epi16(in[j], 6);
  2934. RECON_AND_STORE(dest + j * stride, in[j]);
  2935. }
  2936. dest += 8;
  2937. }
  2938. }
  2939. void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
  2940. int stride) {
  2941. const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  2942. const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  2943. const __m128i zero = _mm_setzero_si128();
  2944. // idct constants for each stage
  2945. const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  2946. const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  2947. const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  2948. const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  2949. const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  2950. const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  2951. const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  2952. const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  2953. const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  2954. const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  2955. const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  2956. const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  2957. const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  2958. const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  2959. const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  2960. const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
  2961. const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  2962. const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  2963. const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  2964. const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  2965. const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  2966. const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  2967. const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  2968. const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
  2969. const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  2970. const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  2971. const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  2972. const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  2973. const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  2974. const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  2975. const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  2976. const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  2977. const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  2978. const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
  2979. const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  2980. const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  2981. const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  2982. const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  2983. const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  2984. const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  2985. const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  2986. const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  2987. __m128i in[32], col[128], zero_idx[16];
  2988. __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
  2989. stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
  2990. stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
  2991. stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
  2992. stp1_30, stp1_31;
  2993. __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
  2994. stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
  2995. stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
  2996. stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
  2997. stp2_30, stp2_31;
  2998. __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  2999. int i, j, i32;
  3000. for (i = 0; i < 4; i++) {
  3001. i32 = (i << 5);
  3002. // First 1-D idct
  3003. // Load input data.
  3004. LOAD_DQCOEFF(in[0], input);
  3005. LOAD_DQCOEFF(in[8], input);
  3006. LOAD_DQCOEFF(in[16], input);
  3007. LOAD_DQCOEFF(in[24], input);
  3008. LOAD_DQCOEFF(in[1], input);
  3009. LOAD_DQCOEFF(in[9], input);
  3010. LOAD_DQCOEFF(in[17], input);
  3011. LOAD_DQCOEFF(in[25], input);
  3012. LOAD_DQCOEFF(in[2], input);
  3013. LOAD_DQCOEFF(in[10], input);
  3014. LOAD_DQCOEFF(in[18], input);
  3015. LOAD_DQCOEFF(in[26], input);
  3016. LOAD_DQCOEFF(in[3], input);
  3017. LOAD_DQCOEFF(in[11], input);
  3018. LOAD_DQCOEFF(in[19], input);
  3019. LOAD_DQCOEFF(in[27], input);
  3020. LOAD_DQCOEFF(in[4], input);
  3021. LOAD_DQCOEFF(in[12], input);
  3022. LOAD_DQCOEFF(in[20], input);
  3023. LOAD_DQCOEFF(in[28], input);
  3024. LOAD_DQCOEFF(in[5], input);
  3025. LOAD_DQCOEFF(in[13], input);
  3026. LOAD_DQCOEFF(in[21], input);
  3027. LOAD_DQCOEFF(in[29], input);
  3028. LOAD_DQCOEFF(in[6], input);
  3029. LOAD_DQCOEFF(in[14], input);
  3030. LOAD_DQCOEFF(in[22], input);
  3031. LOAD_DQCOEFF(in[30], input);
  3032. LOAD_DQCOEFF(in[7], input);
  3033. LOAD_DQCOEFF(in[15], input);
  3034. LOAD_DQCOEFF(in[23], input);
  3035. LOAD_DQCOEFF(in[31], input);
3036. // Check whether all 32 loaded rows are zero; if so, skip the transform for this 8x32 slice.
  3037. zero_idx[0] = _mm_or_si128(in[0], in[1]);
  3038. zero_idx[1] = _mm_or_si128(in[2], in[3]);
  3039. zero_idx[2] = _mm_or_si128(in[4], in[5]);
  3040. zero_idx[3] = _mm_or_si128(in[6], in[7]);
  3041. zero_idx[4] = _mm_or_si128(in[8], in[9]);
  3042. zero_idx[5] = _mm_or_si128(in[10], in[11]);
  3043. zero_idx[6] = _mm_or_si128(in[12], in[13]);
  3044. zero_idx[7] = _mm_or_si128(in[14], in[15]);
  3045. zero_idx[8] = _mm_or_si128(in[16], in[17]);
  3046. zero_idx[9] = _mm_or_si128(in[18], in[19]);
  3047. zero_idx[10] = _mm_or_si128(in[20], in[21]);
  3048. zero_idx[11] = _mm_or_si128(in[22], in[23]);
  3049. zero_idx[12] = _mm_or_si128(in[24], in[25]);
  3050. zero_idx[13] = _mm_or_si128(in[26], in[27]);
  3051. zero_idx[14] = _mm_or_si128(in[28], in[29]);
  3052. zero_idx[15] = _mm_or_si128(in[30], in[31]);
  3053. zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
  3054. zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
  3055. zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
  3056. zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
  3057. zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
  3058. zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
  3059. zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
  3060. zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
  3061. zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
  3062. zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
  3063. zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
  3064. zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
  3065. zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
  3066. zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
  3067. zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
  3068. if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
  3069. col[i32 + 0] = _mm_setzero_si128();
  3070. col[i32 + 1] = _mm_setzero_si128();
  3071. col[i32 + 2] = _mm_setzero_si128();
  3072. col[i32 + 3] = _mm_setzero_si128();
  3073. col[i32 + 4] = _mm_setzero_si128();
  3074. col[i32 + 5] = _mm_setzero_si128();
  3075. col[i32 + 6] = _mm_setzero_si128();
  3076. col[i32 + 7] = _mm_setzero_si128();
  3077. col[i32 + 8] = _mm_setzero_si128();
  3078. col[i32 + 9] = _mm_setzero_si128();
  3079. col[i32 + 10] = _mm_setzero_si128();
  3080. col[i32 + 11] = _mm_setzero_si128();
  3081. col[i32 + 12] = _mm_setzero_si128();
  3082. col[i32 + 13] = _mm_setzero_si128();
  3083. col[i32 + 14] = _mm_setzero_si128();
  3084. col[i32 + 15] = _mm_setzero_si128();
  3085. col[i32 + 16] = _mm_setzero_si128();
  3086. col[i32 + 17] = _mm_setzero_si128();
  3087. col[i32 + 18] = _mm_setzero_si128();
  3088. col[i32 + 19] = _mm_setzero_si128();
  3089. col[i32 + 20] = _mm_setzero_si128();
  3090. col[i32 + 21] = _mm_setzero_si128();
  3091. col[i32 + 22] = _mm_setzero_si128();
  3092. col[i32 + 23] = _mm_setzero_si128();
  3093. col[i32 + 24] = _mm_setzero_si128();
  3094. col[i32 + 25] = _mm_setzero_si128();
  3095. col[i32 + 26] = _mm_setzero_si128();
  3096. col[i32 + 27] = _mm_setzero_si128();
  3097. col[i32 + 28] = _mm_setzero_si128();
  3098. col[i32 + 29] = _mm_setzero_si128();
  3099. col[i32 + 30] = _mm_setzero_si128();
  3100. col[i32 + 31] = _mm_setzero_si128();
  3101. continue;
  3102. }
  3103. // Transpose 32x8 block to 8x32 block
  3104. array_transpose_8x8(in, in);
  3105. array_transpose_8x8(in + 8, in + 8);
  3106. array_transpose_8x8(in + 16, in + 16);
  3107. array_transpose_8x8(in + 24, in + 24);
  3108. IDCT32
3109. // 1-D: Store 32 intermediate results for each 8x32 block.
  3110. col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
  3111. col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
  3112. col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
  3113. col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
  3114. col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
  3115. col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
  3116. col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
  3117. col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
  3118. col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
  3119. col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
  3120. col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
  3121. col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
  3122. col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
  3123. col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
  3124. col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
  3125. col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
  3126. col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
  3127. col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
  3128. col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
  3129. col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
  3130. col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
  3131. col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
  3132. col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
  3133. col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
  3134. col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
  3135. col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
  3136. col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
  3137. col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
  3138. col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
  3139. col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
  3140. col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
  3141. col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
  3142. }
  3143. for (i = 0; i < 4; i++) {
  3144. // Second 1-D idct
  3145. j = i << 3;
  3146. // Transpose 32x8 block to 8x32 block
  3147. array_transpose_8x8(col + j, in);
  3148. array_transpose_8x8(col + j + 32, in + 8);
  3149. array_transpose_8x8(col + j + 64, in + 16);
  3150. array_transpose_8x8(col + j + 96, in + 24);
  3151. IDCT32
3152. // 2-D: Calculate the results and store them to the destination.
  3153. in[0] = _mm_add_epi16(stp1_0, stp1_31);
  3154. in[1] = _mm_add_epi16(stp1_1, stp1_30);
  3155. in[2] = _mm_add_epi16(stp1_2, stp1_29);
  3156. in[3] = _mm_add_epi16(stp1_3, stp1_28);
  3157. in[4] = _mm_add_epi16(stp1_4, stp1_27);
  3158. in[5] = _mm_add_epi16(stp1_5, stp1_26);
  3159. in[6] = _mm_add_epi16(stp1_6, stp1_25);
  3160. in[7] = _mm_add_epi16(stp1_7, stp1_24);
  3161. in[8] = _mm_add_epi16(stp1_8, stp1_23);
  3162. in[9] = _mm_add_epi16(stp1_9, stp1_22);
  3163. in[10] = _mm_add_epi16(stp1_10, stp1_21);
  3164. in[11] = _mm_add_epi16(stp1_11, stp1_20);
  3165. in[12] = _mm_add_epi16(stp1_12, stp1_19);
  3166. in[13] = _mm_add_epi16(stp1_13, stp1_18);
  3167. in[14] = _mm_add_epi16(stp1_14, stp1_17);
  3168. in[15] = _mm_add_epi16(stp1_15, stp1_16);
  3169. in[16] = _mm_sub_epi16(stp1_15, stp1_16);
  3170. in[17] = _mm_sub_epi16(stp1_14, stp1_17);
  3171. in[18] = _mm_sub_epi16(stp1_13, stp1_18);
  3172. in[19] = _mm_sub_epi16(stp1_12, stp1_19);
  3173. in[20] = _mm_sub_epi16(stp1_11, stp1_20);
  3174. in[21] = _mm_sub_epi16(stp1_10, stp1_21);
  3175. in[22] = _mm_sub_epi16(stp1_9, stp1_22);
  3176. in[23] = _mm_sub_epi16(stp1_8, stp1_23);
  3177. in[24] = _mm_sub_epi16(stp1_7, stp1_24);
  3178. in[25] = _mm_sub_epi16(stp1_6, stp1_25);
  3179. in[26] = _mm_sub_epi16(stp1_5, stp1_26);
  3180. in[27] = _mm_sub_epi16(stp1_4, stp1_27);
  3181. in[28] = _mm_sub_epi16(stp1_3, stp1_28);
  3182. in[29] = _mm_sub_epi16(stp1_2, stp1_29);
  3183. in[30] = _mm_sub_epi16(stp1_1, stp1_30);
  3184. in[31] = _mm_sub_epi16(stp1_0, stp1_31);
  3185. for (j = 0; j < 32; ++j) {
  3186. // Final rounding and shift
  3187. in[j] = _mm_adds_epi16(in[j], final_rounding);
  3188. in[j] = _mm_srai_epi16(in[j], 6);
  3189. RECON_AND_STORE(dest + j * stride, in[j]);
  3190. }
  3191. dest += 8;
  3192. }
  3193. }
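// For a DC-only block the 2-D inverse transform produces a constant plane:
// the DC coefficient is scaled by cospi_16_64 once per dimension (with the
// usual DCT rounding), rounded by 6 bits, and the resulting value is added
// to every pixel of the 32x32 destination block.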
  3194. void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
  3195. int stride) {
  3196. __m128i dc_value;
  3197. const __m128i zero = _mm_setzero_si128();
  3198. int a, j;
  3199. a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  3200. a = (int)dct_const_round_shift(a * cospi_16_64);
  3201. a = ROUND_POWER_OF_TWO(a, 6);
  3202. dc_value = _mm_set1_epi16(a);
  3203. for (j = 0; j < 32; ++j) {
  3204. RECON_AND_STORE(dest + 0 + j * stride, dc_value);
  3205. RECON_AND_STORE(dest + 8 + j * stride, dc_value);
  3206. RECON_AND_STORE(dest + 16 + j * stride, dc_value);
  3207. RECON_AND_STORE(dest + 24 + j * stride, dc_value);
  3208. }
  3209. }
  3210. #if CONFIG_VP9_HIGHBITDEPTH
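// Clamp each 16-bit lane to the valid pixel range [0, (1 << bd) - 1].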
  3211. static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
  3212. __m128i ubounded, retval;
  3213. const __m128i zero = _mm_set1_epi16(0);
  3214. const __m128i one = _mm_set1_epi16(1);
  3215. const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
  3216. ubounded = _mm_cmpgt_epi16(value, max);
  3217. retval = _mm_andnot_si128(ubounded, value);
  3218. ubounded = _mm_and_si128(ubounded, max);
  3219. retval = _mm_or_si128(retval, ubounded);
  3220. retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
  3221. return retval;
  3222. }
  3223. void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
  3224. int stride, int bd) {
  3225. tran_low_t out[4 * 4];
  3226. tran_low_t *outptr = out;
  3227. int i, j;
  3228. __m128i inptr[4];
  3229. __m128i sign_bits[2];
  3230. __m128i temp_mm, min_input, max_input;
  3231. int test;
  3232. uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  3233. int optimised_cols = 0;
  3234. const __m128i zero = _mm_set1_epi16(0);
  3235. const __m128i eight = _mm_set1_epi16(8);
  3236. const __m128i max = _mm_set1_epi16(12043);
  3237. const __m128i min = _mm_set1_epi16(-12043);
  3238. // Load input into __m128i
  3239. inptr[0] = _mm_loadu_si128((const __m128i *)input);
  3240. inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
  3241. inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
  3242. inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
  3243. // Pack to 16 bits
  3244. inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
  3245. inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
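// Range check: the 16-bit SSE2 path is only taken when every coefficient lies
// within +/-12043; otherwise the per-row C transform below is used instead,
// presumably because larger inputs could overflow the 16-bit arithmetic.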
  3246. max_input = _mm_max_epi16(inptr[0], inptr[1]);
  3247. min_input = _mm_min_epi16(inptr[0], inptr[1]);
  3248. max_input = _mm_cmpgt_epi16(max_input, max);
  3249. min_input = _mm_cmplt_epi16(min_input, min);
  3250. temp_mm = _mm_or_si128(max_input, min_input);
  3251. test = _mm_movemask_epi8(temp_mm);
  3252. if (!test) {
  3253. // Do the row transform
  3254. idct4_sse2(inptr);
  3255. // Check the min & max values
  3256. max_input = _mm_max_epi16(inptr[0], inptr[1]);
  3257. min_input = _mm_min_epi16(inptr[0], inptr[1]);
  3258. max_input = _mm_cmpgt_epi16(max_input, max);
  3259. min_input = _mm_cmplt_epi16(min_input, min);
  3260. temp_mm = _mm_or_si128(max_input, min_input);
  3261. test = _mm_movemask_epi8(temp_mm);
  3262. if (test) {
  3263. transpose_4x4(inptr);
  3264. sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
  3265. sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
  3266. inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
  3267. inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
  3268. inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
  3269. inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
  3270. _mm_storeu_si128((__m128i *)outptr, inptr[0]);
  3271. _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
  3272. _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
  3273. _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
  3274. } else {
  3275. // Set to use the optimised transform for the column
  3276. optimised_cols = 1;
  3277. }
  3278. } else {
  3279. // Run the un-optimised row transform
  3280. for (i = 0; i < 4; ++i) {
  3281. vpx_highbd_idct4_c(input, outptr, bd);
  3282. input += 4;
  3283. outptr += 4;
  3284. }
  3285. }
  3286. if (optimised_cols) {
  3287. idct4_sse2(inptr);
  3288. // Final round and shift
  3289. inptr[0] = _mm_add_epi16(inptr[0], eight);
  3290. inptr[1] = _mm_add_epi16(inptr[1], eight);
  3291. inptr[0] = _mm_srai_epi16(inptr[0], 4);
  3292. inptr[1] = _mm_srai_epi16(inptr[1], 4);

    // Reconstruction and Store
    {
      __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
      __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
      d0 = _mm_unpacklo_epi64(
          d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
      d2 = _mm_unpacklo_epi64(
          d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
      d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
      d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
      // store row 0
      _mm_storel_epi64((__m128i *)dest, d0);
      // store row 1
      d0 = _mm_srli_si128(d0, 8);
      _mm_storel_epi64((__m128i *)(dest + stride), d0);
      // store row 2
      _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
      // store row 3
      d2 = _mm_srli_si128(d2, 8);
      _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[4], temp_out[4];
    // Columns
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j)
        temp_in[j] = out[j * 4 + i];
      vpx_highbd_idct4_c(temp_in, temp_out, bd);
      for (j = 0; j < 4; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
      }
    }
  }
}

void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[8];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i sixteen = _mm_set1_epi16(16);
  const __m128i max = _mm_set1_epi16(6201);
  const __m128i min = _mm_set1_epi16(-6201);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits
  for (i = 0; i < 8; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 8; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);
  if (!test) {
    // Do the row transform
    idct8_sse2(inptr);

    // Find the min & max for the column transform
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 8; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      array_transpose_8x8(inptr, inptr);
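      // Each transposed 8-lane int16 row is widened to 32 bits and written as
      // two __m128i stores: lanes 0-3 at outptr + 8 * i, lanes 4-7 at
      // outptr + 8 * i + 4.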
      for (i = 0; i < 8; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 8; ++i) {
      vpx_highbd_idct8_c(input, outptr, bd);
      input += 8;
      outptr += 8;
    }
  }

  if (optimised_cols) {
    idct8_sse2(inptr);

    // Final round & shift, reconstruction and store
    {
      __m128i d[8];
      for (i = 0; i < 8; i++) {
        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        inptr[i] = _mm_srai_epi16(inptr[i], 5);
        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[8], temp_out[8];
    for (i = 0; i < 8; ++i) {
      for (j = 0; j < 8; ++j)
        temp_in[j] = out[j * 8 + i];
      vpx_highbd_idct8_c(temp_in, temp_out, bd);
      for (j = 0; j < 8; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
      }
    }
  }
}

void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[8];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i sixteen = _mm_set1_epi16(16);
  const __m128i max = _mm_set1_epi16(6201);
  const __m128i min = _mm_set1_epi16(-6201);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits
  for (i = 0; i < 8; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
  }
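
  // The "_10" variant is called when at most the first 10 coefficients are
  // non-zero, i.e. all non-zero values lie in the top-left 4x4 corner of the
  // block, which is why only the first 4 rows are checked below.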

  // Find the min & max for the row transform
  // Only the first 4 rows have non-zero coeffs
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 4; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);
  if (!test) {
    // Do the row transform
    idct8_sse2(inptr);

    // Find the min & max for the column transform
    // N.B. Only the first 4 cols contain non-zero coeffs
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 8; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      // Use the fact that only the first 4 rows contain non-zero coeffs
      array_transpose_4X8(inptr, inptr);
      for (i = 0; i < 4; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct8_c(input, outptr, bd);
      input += 8;
      outptr += 8;
    }
  }

  if (optimised_cols) {
    idct8_sse2(inptr);

    // Final round & shift, reconstruction and store
    {
      __m128i d[8];
      for (i = 0; i < 8; i++) {
        inptr[i] = _mm_add_epi16(inptr[i], sixteen);
        d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        inptr[i] = _mm_srai_epi16(inptr[i], 5);
        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[8], temp_out[8];
    for (i = 0; i < 8; ++i) {
      for (j = 0; j < 8; ++j)
        temp_in[j] = out[j * 8 + i];
      vpx_highbd_idct8_c(temp_in, temp_out, bd);
      for (j = 0; j < 8; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
      }
    }
  }
}

void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                       int stride, int bd) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);
  const __m128i max = _mm_set1_epi16(3155);
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }
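
  // Register layout: inptr[i] holds columns 0-7 of row i and inptr[i + 16]
  // holds columns 8-15, matching the two-halves interface of idct16_sse2().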

  // Find the min & max for the row transform
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 32; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);

  if (!test) {
    // Do the row transform
    idct16_sse2(inptr, inptr + 16);

    // Find the min & max for the column transform
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 32; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      array_transpose_16x16(inptr, inptr + 16);
      for (i = 0; i < 16; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 16; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }

  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);

    // Final round & shift, reconstruction and store
    {
      __m128i d[2];
      for (i = 0; i < 16; i++) {
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j)
        temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}

void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
                                      int stride, int bd) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j, test;
  __m128i inptr[32];
  __m128i min_input, max_input, temp1, temp2, sign_bits;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_set1_epi16(32);
  const __m128i max = _mm_set1_epi16(3155);
  const __m128i min = _mm_set1_epi16(-3155);
  int optimised_cols = 0;

  // Load input into __m128i & pack to 16 bits
  for (i = 0; i < 16; i++) {
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
    inptr[i] = _mm_packs_epi32(temp1, temp2);
    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
  }

  // Find the min & max for the row transform
  // Since all non-zero dct coefficients are in the upper-left 4x4 area,
  // we only need to consider the first 4 rows here.
  max_input = _mm_max_epi16(inptr[0], inptr[1]);
  min_input = _mm_min_epi16(inptr[0], inptr[1]);
  for (i = 2; i < 4; i++) {
    max_input = _mm_max_epi16(max_input, inptr[i]);
    min_input = _mm_min_epi16(min_input, inptr[i]);
  }
  max_input = _mm_cmpgt_epi16(max_input, max);
  min_input = _mm_cmplt_epi16(min_input, min);
  temp1 = _mm_or_si128(max_input, min_input);
  test = _mm_movemask_epi8(temp1);

  if (!test) {
    // Do the row transform (N.B. This transposes inptr)
    idct16_sse2(inptr, inptr + 16);

    // Find the min & max for the column transform
    // N.B. Only the first 4 cols contain non-zero coeffs
    max_input = _mm_max_epi16(inptr[0], inptr[1]);
    min_input = _mm_min_epi16(inptr[0], inptr[1]);
    for (i = 2; i < 16; i++) {
      max_input = _mm_max_epi16(max_input, inptr[i]);
      min_input = _mm_min_epi16(min_input, inptr[i]);
    }
    max_input = _mm_cmpgt_epi16(max_input, max);
    min_input = _mm_cmplt_epi16(min_input, min);
    temp1 = _mm_or_si128(max_input, min_input);
    test = _mm_movemask_epi8(temp1);

    if (test) {
      // Use the fact that only the first 4 rows contain non-zero coeffs
      array_transpose_8x8(inptr, inptr);
      array_transpose_8x8(inptr + 8, inptr + 16);
      for (i = 0; i < 4; i++) {
        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
      }
    } else {
      // Set to use the optimised transform for the column
      optimised_cols = 1;
    }
  } else {
    // Run the un-optimised row transform
    for (i = 0; i < 4; ++i) {
      vpx_highbd_idct16_c(input, outptr, bd);
      input += 16;
      outptr += 16;
    }
  }

  if (optimised_cols) {
    idct16_sse2(inptr, inptr + 16);

    // Final round & shift, reconstruction and store
    {
      __m128i d[2];
      for (i = 0; i < 16; i++) {
        inptr[i] = _mm_add_epi16(inptr[i], rounding);
        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
        inptr[i] = _mm_srai_epi16(inptr[i], 6);
        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
        // Store
        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
      }
    }
  } else {
    // Run the un-optimised column transform
    tran_low_t temp_in[16], temp_out[16];
    for (i = 0; i < 16; ++i) {
      for (j = 0; j < 16; ++j)
        temp_in[j] = out[j * 16 + i];
      vpx_highbd_idct16_c(temp_in, temp_out, bd);
      for (j = 0; j < 16; ++j) {
        dest[j * stride + i] = highbd_clip_pixel_add(
            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      }
    }
  }
}

#endif  // CONFIG_VP9_HIGHBITDEPTH