; inv_txfm_ssse3_x86_64.asm - SSSE3 inverse transforms (x86-64 only)
  1. ;
  2. ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. %include "third_party/x86inc/x86inc.asm"
  11. ; This file provides SSSE3 version of the inverse transformation. Part
  12. ; of the functions are originally derived from the ffmpeg project.
  13. ; Note that the current version applies to x86 64-bit only.
SECTION_RODATA
; DCT cosine constants (14-bit fixed point, cospi values), pre-doubled
; ("x2") so that pmulhrsw -- which computes (a*b + 0x4000) >> 15 --
; yields round(a * cospi / 2^14) in a single instruction.
; NOTE(review): 23170 = 11585*2, written out as a literal.
pw_11585x2: times 8 dw 23170
pw_m2404x2: times 8 dw -2404*2
pw_m4756x2: times 8 dw -4756*2
pw_m5520x2: times 8 dw -5520*2
pw_m8423x2: times 8 dw -8423*2
pw_m9102x2: times 8 dw -9102*2
pw_m10394x2: times 8 dw -10394*2
pw_m11003x2: times 8 dw -11003*2
pw_16364x2: times 8 dw 16364*2
pw_16305x2: times 8 dw 16305*2
pw_16207x2: times 8 dw 16207*2
pw_16069x2: times 8 dw 16069*2
pw_15893x2: times 8 dw 15893*2
pw_15679x2: times 8 dw 15679*2
pw_15426x2: times 8 dw 15426*2
pw_15137x2: times 8 dw 15137*2
pw_14811x2: times 8 dw 14811*2
pw_14449x2: times 8 dw 14449*2
pw_14053x2: times 8 dw 14053*2
pw_13623x2: times 8 dw 13623*2
pw_13160x2: times 8 dw 13160*2
pw_12665x2: times 8 dw 12665*2
pw_12140x2: times 8 dw 12140*2
; Double-underscore names keep all labels the same width for alignment.
pw__9760x2: times 8 dw 9760*2
pw__7723x2: times 8 dw 7723*2
pw__7005x2: times 8 dw 7005*2
pw__6270x2: times 8 dw 6270*2
pw__3981x2: times 8 dw 3981*2
pw__3196x2: times 8 dw 3196*2
pw__1606x2: times 8 dw 1606*2
pw___804x2: times 8 dw 804*2
pd_8192: times 4 dd 8192  ; 1 << 13: rounding bias for the >>14 in MUL_ADD_2X
pw_32: times 8 dw 32      ; rounding bias for the >>6 in RECON_AND_STORE
pw_16: times 8 dw 16      ; rounding bias for the >>5 in ADD_STORE_8P_2X
; Emit the three interleaved coefficient-pair tables consumed by
; BUTTERFLY_4X/BUTTERFLY_4Xmm via pmaddwd:
;   pw_%1_%2   = { %1, %2 } repeated
;   pw_m%2_%1  = { -%2, %1 } repeated
;   pw_m%1_m%2 = { -%1, -%2 } repeated
%macro TRANSFORM_COEFFS 2
pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1
pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
%endmacro
TRANSFORM_COEFFS 6270, 15137
TRANSFORM_COEFFS 3196, 16069
TRANSFORM_COEFFS 13623, 9102
; constants for 32x32_34
TRANSFORM_COEFFS 804, 16364
TRANSFORM_COEFFS 15426, 5520
TRANSFORM_COEFFS 3981, 15893
TRANSFORM_COEFFS 16207, 2404
TRANSFORM_COEFFS 1606, 16305
TRANSFORM_COEFFS 15679, 4756
TRANSFORM_COEFFS 11585, 11585
; constants for 32x32_1024
TRANSFORM_COEFFS 12140, 11003
TRANSFORM_COEFFS 7005, 14811
TRANSFORM_COEFFS 14053, 8423
TRANSFORM_COEFFS 9760, 13160
TRANSFORM_COEFFS 12665, 10394
TRANSFORM_COEFFS 7723, 14449
; "dpw" tables: low four words hold one constant, high four another,
; so one pmulhrsw applies two different scale factors to the two
; 4-coefficient halves of a register (used by the idct8x8_12 fast path).
%macro PAIR_PP_COEFFS 2
dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2
%endmacro
%macro PAIR_MP_COEFFS 2
dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2
%endmacro
%macro PAIR_MM_COEFFS 2
dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
%endmacro
PAIR_PP_COEFFS 30274, 12540
PAIR_PP_COEFFS 6392, 32138
PAIR_MP_COEFFS 18204, 27246
PAIR_PP_COEFFS 12540, 12540
PAIR_PP_COEFFS 30274, 30274
PAIR_PP_COEFFS 6392, 6392
PAIR_PP_COEFFS 32138, 32138
PAIR_MM_COEFFS 18204, 18204
PAIR_PP_COEFFS 27246, 27246
SECTION .text
%if ARCH_X86_64
; SUM_SUB %1, %2, %3: in-place word butterfly.
;   m%1 <- m%1 + m%2
;   m%2 <- old m%1 - old m%2   (via SWAP with scratch m%3)
; m%3 is clobbered. No rounding; saturating-free paddw/psubw, so callers
; must ensure intermediate values fit in int16.
%macro SUM_SUB 3
psubw m%3, m%1, m%2
paddw m%1, m%2
SWAP %2, %3
%endmacro
; butterfly operation
; MUL_ADD_2X: two packed 16x16->32 dot products of the word pairs in m%3
; against the coefficient tables %5/%6, each rounded by %4 (pd_8192 = 1<<13)
; and arithmetically shifted right by 14. Results left as packed dwords.
%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
pmaddwd m%1, m%3, %5
pmaddwd m%2, m%3, %6
paddd m%1, %4
paddd m%2, %4
psrad m%1, 14
psrad m%2, 14
%endmacro
; Rotation butterfly over 8 lanes. With a = m%1, b = m%2, c1 = %3, c2 = %4:
;   m%1 <- round((a*c1 - b*c2) >> 14)
;   m%2 <- round((a*c2 + b*c1) >> 14)
; The punpck*wd interleaves pair a/b words so pmaddwd (in MUL_ADD_2X)
; forms both dot products; packssdw re-narrows dwords to words.
; m%6, m%7 are clobbered as scratch; %5 is the rounding constant (pd_8192).
%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
punpckhwd m%6, m%2, m%1
MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4]
punpcklwd m%2, m%1
MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4]
packssdw m%1, m%7
packssdw m%2, m%6
%endmacro
; Same as BUTTERFLY_4X but with the second output negated
; (uses pw_m%3_m%4 instead of pw_%3_%4):
;   m%1 <-  round((a*c1 - b*c2) >> 14)
;   m%2 <- -round((a*c2 + b*c1) >> 14)
%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
punpckhwd m%6, m%2, m%1
MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4]
punpcklwd m%2, m%1
MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4]
packssdw m%1, m%7
packssdw m%2, m%6
%endmacro
; matrix transpose
; One interleave step at granularity %1 (wd/dq/qdq):
;   m%2 <- low-half interleave of (m%2, m%3)
;   m%3 <- high-half interleave   (via SWAP with scratch m%4)
%macro INTERLEAVE_2X 4
punpckh%1 m%4, m%2, m%3
punpckl%1 m%2, m%3
SWAP %3, %4
%endmacro
; Transpose the 8x8 int16 matrix held in registers %1..%8 (one row each),
; using %9 as scratch: three rounds of interleaves (word, dword, qword)
; followed by two SWAPs to restore row order.
%macro TRANSPOSE8X8 9
INTERLEAVE_2X wd, %1, %2, %9
INTERLEAVE_2X wd, %3, %4, %9
INTERLEAVE_2X wd, %5, %6, %9
INTERLEAVE_2X wd, %7, %8, %9
INTERLEAVE_2X dq, %1, %3, %9
INTERLEAVE_2X dq, %2, %4, %9
INTERLEAVE_2X dq, %5, %7, %9
INTERLEAVE_2X dq, %6, %8, %9
INTERLEAVE_2X qdq, %1, %5, %9
INTERLEAVE_2X qdq, %3, %7, %9
INTERLEAVE_2X qdq, %2, %6, %9
INTERLEAVE_2X qdq, %4, %8, %9
SWAP %2, %5
SWAP %4, %7
%endmacro
; One 8-point 1-D IDCT pass over rows held in m0..m7.
; Fixed register contract: m8 = pd_8192 (rounding), m12 = pw_11585x2
; (sqrt(2)/2 scale for the DC/mid butterflies), m9/m10 scratch.
; Output rows are left in m0..m7 (with the final SWAPs restoring order).
%macro IDCT8_1D 0
SUM_SUB 0, 4, 9
BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10
pmulhrsw m0, m12
pmulhrsw m4, m12
BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10
BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10
SUM_SUB 1, 5, 9
SUM_SUB 7, 3, 9
SUM_SUB 0, 6, 9
SUM_SUB 4, 2, 9
SUM_SUB 3, 5, 9
pmulhrsw m3, m12
pmulhrsw m5, m12
SUM_SUB 0, 7, 9
SUM_SUB 4, 3, 9
SUM_SUB 2, 5, 9
SUM_SUB 6, 1, 9
SWAP 3, 6
SWAP 1, 4
%endmacro
; This macro handles 8 pixels per line
; Rounds two result rows (+m11 = pw_16, >>5), adds them to two 8-pixel
; destination rows at [outputq] and [outputq + strideq] with unsigned
; saturation, and stores back. m%5 must be a zero register.
%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero
paddw m%1, m11
paddw m%2, m11
psraw m%1, 5
psraw m%2, 5
movh m%3, [outputq]
movh m%4, [outputq + strideq]
punpcklbw m%3, m%5
punpcklbw m%4, m%5
paddw m%3, m%1
paddw m%4, m%2
packuswb m%3, m%5
packuswb m%4, m%5
movh [outputq], m%3
movh [outputq + strideq], m%4
%endmacro
INIT_XMM ssse3
; full inverse 8x8 2D-DCT transform
; void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *output,
;                               int stride)
; Loads the 8x8 coefficient block (packing int32 pairs to int16 when
; CONFIG_VP9_HIGHBITDEPTH), runs transpose+IDCT8_1D twice (columns then
; rows), then rounds (>>5) and adds into the destination.
; m8 = pd_8192, m11 = pw_16, m12 = pw_11585x2 per the IDCT8_1D contract.
cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
mova m8, [pd_8192]
mova m11, [pw_16]
mova m12, [pw_11585x2]
lea r3, [2 * strideq]
%if CONFIG_VP9_HIGHBITDEPTH
; tran_low_t is int32: pack each pair of 16-byte halves to int16 rows.
mova m0, [inputq + 0]
packssdw m0, [inputq + 16]
mova m1, [inputq + 32]
packssdw m1, [inputq + 48]
mova m2, [inputq + 64]
packssdw m2, [inputq + 80]
mova m3, [inputq + 96]
packssdw m3, [inputq + 112]
mova m4, [inputq + 128]
packssdw m4, [inputq + 144]
mova m5, [inputq + 160]
packssdw m5, [inputq + 176]
mova m6, [inputq + 192]
packssdw m6, [inputq + 208]
mova m7, [inputq + 224]
packssdw m7, [inputq + 240]
%else
mova m0, [inputq + 0]
mova m1, [inputq + 16]
mova m2, [inputq + 32]
mova m3, [inputq + 48]
mova m4, [inputq + 64]
mova m5, [inputq + 80]
mova m6, [inputq + 96]
mova m7, [inputq + 112]
%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
IDCT8_1D
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
IDCT8_1D
; m12 (pw_11585x2) is no longer needed; reuse it as the zero register.
pxor m12, m12
ADD_STORE_8P_2X 0, 1, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 2, 3, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 4, 5, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 6, 7, 9, 10, 12
RET
; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
; void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *output,
;                               int stride)
; Fast path: only the top-left 4x4 of coefficients is non-zero, so the
; first pass works on 4 packed rows and uses the dpw_* tables to apply
; two scale factors per register (low/high qword halves).
cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
mova m8, [pd_8192]
mova m11, [pw_16]
mova m12, [pw_11585x2]
lea r3, [2 * strideq]
%if CONFIG_VP9_HIGHBITDEPTH
; Pack the four non-zero int32 rows down to int16.
mova m0, [inputq + 0]
packssdw m0, [inputq + 16]
mova m1, [inputq + 32]
packssdw m1, [inputq + 48]
mova m2, [inputq + 64]
packssdw m2, [inputq + 80]
mova m3, [inputq + 96]
packssdw m3, [inputq + 112]
%else
mova m0, [inputq + 0]
mova m1, [inputq + 16]
mova m2, [inputq + 32]
mova m3, [inputq + 48]
%endif
; 4x4 transpose of the low halves of rows 0-3.
punpcklwd m0, m1
punpcklwd m2, m3
punpckhdq m9, m0, m2
punpckldq m0, m2
SWAP 2, 9
; Duplicate each transposed row into both qword halves so one register
; carries two copies of the same 4 coefficients:
; m0 -> [0], [0]
; m1 -> [1], [1]
; m2 -> [2], [2]
; m3 -> [3], [3]
punpckhqdq m10, m0, m0
punpcklqdq m0, m0
punpckhqdq m9, m2, m2
punpcklqdq m2, m2
SWAP 1, 10
SWAP 3, 9
; First (column) pass: dpw tables scale the two halves by the two
; butterfly coefficients at once (30274 = 15137*2, 12540 = 6270*2, etc.).
pmulhrsw m0, m12
pmulhrsw m2, [dpw_30274_12540]
pmulhrsw m1, [dpw_6392_32138]
pmulhrsw m3, [dpw_m18204_27246]
SUM_SUB 0, 2, 9
SUM_SUB 1, 3, 9
punpcklqdq m9, m3, m3
punpckhqdq m5, m3, m9
SUM_SUB 3, 5, 9
punpckhqdq m5, m3
pmulhrsw m5, m12
punpckhqdq m9, m1, m5
punpcklqdq m1, m5
SWAP 5, 9
SUM_SUB 0, 5, 9
SUM_SUB 2, 1, 9
; Transpose the 8x4 intermediate back for the second (row) pass.
punpckhqdq m3, m0, m0
punpckhqdq m4, m1, m1
punpckhqdq m6, m5, m5
punpckhqdq m7, m2, m2
punpcklwd m0, m3
punpcklwd m7, m2
punpcklwd m1, m4
punpcklwd m6, m5
punpckhdq m4, m0, m7
punpckldq m0, m7
punpckhdq m10, m1, m6
punpckldq m5, m1, m6
punpckhqdq m1, m0, m5
punpcklqdq m0, m5
punpckhqdq m3, m4, m10
punpcklqdq m2, m4, m10
; Second pass: uniform dpw tables (same constant in both halves).
pmulhrsw m0, m12
pmulhrsw m6, m2, [dpw_30274_30274]
pmulhrsw m4, m2, [dpw_12540_12540]
pmulhrsw m7, m1, [dpw_32138_32138]
pmulhrsw m1, [dpw_6392_6392]
pmulhrsw m5, m3, [dpw_m18204_m18204]
pmulhrsw m3, [dpw_27246_27246]
mova m2, m0
SUM_SUB 0, 6, 9
SUM_SUB 2, 4, 9
SUM_SUB 1, 5, 9
SUM_SUB 7, 3, 9
SUM_SUB 3, 5, 9
pmulhrsw m3, m12
pmulhrsw m5, m12
SUM_SUB 0, 7, 9
SUM_SUB 2, 3, 9
SUM_SUB 4, 5, 9
SUM_SUB 6, 1, 9
SWAP 3, 6
SWAP 1, 2
SWAP 2, 4
; m12 no longer needed as a constant; reuse as the zero register.
pxor m12, m12
ADD_STORE_8P_2X 0, 1, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 2, 3, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 4, 5, 9, 10, 12
lea outputq, [outputq + r3]
ADD_STORE_8P_2X 6, 7, 9, 10, 12
RET
; Offsets of the 32 output rows within the scratch buffer. Each group of
; eight rows (0-7, 8-15, 16-23, 24-31) deliberately maps onto the SAME
; eight 16-byte slots: the IDCT32X32_* macros take four base offsets
; (%1..%4), one per group, which select the actual region for each group.
%define idx0 16 * 0
%define idx1 16 * 1
%define idx2 16 * 2
%define idx3 16 * 3
%define idx4 16 * 4
%define idx5 16 * 5
%define idx6 16 * 6
%define idx7 16 * 7
%define idx8 16 * 0
%define idx9 16 * 1
%define idx10 16 * 2
%define idx11 16 * 3
%define idx12 16 * 4
%define idx13 16 * 5
%define idx14 16 * 6
%define idx15 16 * 7
%define idx16 16 * 0
%define idx17 16 * 1
%define idx18 16 * 2
%define idx19 16 * 3
%define idx20 16 * 4
%define idx21 16 * 5
%define idx22 16 * 6
%define idx23 16 * 7
%define idx24 16 * 0
%define idx25 16 * 1
%define idx26 16 * 2
%define idx27 16 * 3
%define idx28 16 * 4
%define idx29 16 * 5
%define idx30 16 * 6
%define idx31 16 * 7
  360. ; FROM idct32x32_add_neon.asm
  361. ;
  362. ; Instead of doing the transforms stage by stage, it is done by loading
  363. ; some input values and doing as many stages as possible to minimize the
  364. ; storing/loading of intermediate results. To fit within registers, the
  365. ; final coefficients are cut into four blocks:
  366. ; BLOCK A: 16-19,28-31
  367. ; BLOCK B: 20-23,24-27
  368. ; BLOCK C: 8-11,12-15
  369. ; BLOCK D: 0-3,4-7
  370. ; Blocks A and C are straight calculation through the various stages. In
  371. ; block B, further calculations are performed using the results from
  372. ; block A. In block D, further calculations are performed using the results
  373. ; from block C and then the final calculations are done using results from
  374. ; block A and B which have been combined at the end of block B.
  375. ;
; IDCT32X32_34 %1, %2, %3, %4 -- one 8-column slice of the 32-point IDCT
; for blocks with only the first 34 coefficients non-zero (so only inputs
; 0-7 contribute; see the block diagram comment above).
; %1..%4: scratch-buffer base offsets (added to stp) for output row groups
;         0-7, 8-15, 16-23 and 24-31 respectively.
; Implicit contract: m0-m7 hold transposed input rows 0-7 on entry
; (blocks C/D reload what they need from [rsp + transposed_in]); m8 =
; pd_8192; m9/m10 scratch; r4 points at the transposed_in save area.
%macro IDCT32X32_34 4
; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m11, m1
pmulhrsw m1, [pw___804x2] ; stp1_16
; Spill even input rows for blocks C/D while registers are free.
mova [r4 + 0], m0
pmulhrsw m11, [pw_16364x2] ; stp2_31
mova [r4 + 16 * 2], m2
mova m12, m7
pmulhrsw m7, [pw_15426x2] ; stp1_28
mova [r4 + 16 * 4], m4
pmulhrsw m12, [pw_m5520x2] ; stp2_19
mova [r4 + 16 * 6], m6
; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m2, m1 ; stp1_16
mova m0, m11 ; stp1_31
mova m4, m7 ; stp1_28
mova m15, m12 ; stp1_19
; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
SUM_SUB 0, 15, 9 ; stp2_17, stp2_18
SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m6, m5
pmulhrsw m5, [pw__3981x2] ; stp1_20
mova [stp + %4 + idx28], m12
mova [stp + %4 + idx29], m15
pmulhrsw m6, [pw_15893x2] ; stp2_27
mova [stp + %4 + idx30], m2
mova m2, m3
pmulhrsw m3, [pw_m2404x2] ; stp1_23
mova [stp + %4 + idx31], m11
pmulhrsw m2, [pw_16207x2] ; stp2_24
; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m13, m5 ; stp1_20
mova m14, m6 ; stp1_27
mova m15, m3 ; stp1_23
mova m11, m2 ; stp1_24
; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
SUM_SUB 15, 14, 9 ; stp2_22, stp2_21
SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
SUM_SUB 11, 13, 9 ; stp2_25, stp2_26
; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 1, 3, 9 ; stp2_16, stp2_23
SUM_SUB 0, 15, 9 ; stp2_17, stp2_22
SUM_SUB 4, 14, 9 ; stp2_18, stp2_21
SUM_SUB 7, 5, 9 ; stp2_19, stp2_20
mova [stp + %3 + idx16], m1
mova [stp + %3 + idx17], m0
mova [stp + %3 + idx18], m4
mova [stp + %3 + idx19], m7
; Reload rows 28-31 spilled in block B stage 1, combine with block A.
mova m4, [stp + %4 + idx28]
mova m7, [stp + %4 + idx29]
mova m10, [stp + %4 + idx30]
mova m12, [stp + %4 + idx31]
SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
SUM_SUB 10, 11, 9 ; stp2_30, stp2_25
SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
mova [stp + %4 + idx28], m4
mova [stp + %4 + idx29], m7
mova [stp + %4 + idx30], m10
mova [stp + %4 + idx31], m12
; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; The pmulhrsw form below is cheaper but its SUM_SUB can overflow int16,
; so the wider BUTTERFLY_4X path is used instead.
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 6, 5, 9
pmulhrsw m6, m10 ; stp1_27
pmulhrsw m5, m10 ; stp1_20
SUM_SUB 13, 14, 9
pmulhrsw m13, m10 ; stp1_26
pmulhrsw m14, m10 ; stp1_21
SUM_SUB 11, 15, 9
pmulhrsw m11, m10 ; stp1_25
pmulhrsw m15, m10 ; stp1_22
SUM_SUB 2, 3, 9
pmulhrsw m2, m10 ; stp1_24
pmulhrsw m3, m10 ; stp1_23
%else
BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
SWAP 6, 5
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
SWAP 13, 14
BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
SWAP 11, 15
BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
SWAP 2, 3
%endif
mova [stp + %4 + idx24], m2
mova [stp + %4 + idx25], m11
mova [stp + %4 + idx26], m13
mova [stp + %4 + idx27], m6
; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m0, [rsp + transposed_in + 16 * 2]
mova m6, [rsp + transposed_in + 16 * 6]
mova m1, m0
pmulhrsw m0, [pw__1606x2] ; stp1_8
mova [stp + %3 + idx20], m5
mova [stp + %3 + idx21], m14
pmulhrsw m1, [pw_16305x2] ; stp2_15
mova [stp + %3 + idx22], m15
mova m7, m6
pmulhrsw m7, [pw_m4756x2] ; stp2_11
mova [stp + %3 + idx23], m3
pmulhrsw m6, [pw_15679x2] ; stp1_12
; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m3, m0 ; stp1_8
mova m2, m1 ; stp1_15
; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
mova m4, m7 ; stp1_11
mova m5, m6 ; stp1_12
BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 5, 4, 9
pmulhrsw m5, m10 ; stp1_13
pmulhrsw m4, m10 ; stp1_10
SUM_SUB 6, 7, 9
pmulhrsw m6, m10 ; stp1_12
pmulhrsw m7, m10 ; stp1_11
%else
BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
SWAP 5, 4
BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
SWAP 6, 7
%endif
; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova [stp + %2 + idx8], m0
mova [stp + %2 + idx9], m2
mova [stp + %2 + idx10], m4
mova [stp + %2 + idx11], m7
; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m11, [rsp + transposed_in + 16 * 4]
mova m12, m11
pmulhrsw m11, [pw__3196x2] ; stp1_4
pmulhrsw m12, [pw_16069x2] ; stp1_7
; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m0, [rsp + transposed_in + 16 * 0]
mova m10, [pw_11585x2]
pmulhrsw m0, m10 ; stp1_1
mova m14, m11 ; stp1_4
mova m13, m12 ; stp1_7
; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
SUM_SUB 13, 14, 9
pmulhrsw m13, m10 ; stp1_6
pmulhrsw m14, m10 ; stp1_5
%else
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
SWAP 13, 14
%endif
mova m7, m0 ; stp1_0 = stp1_1
mova m4, m0 ; stp1_1
mova m2, m7 ; stp1_0
; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
SUM_SUB 7, 13, 9 ; stp1_1, stp1_6
SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
SUM_SUB 4, 11, 9 ; stp1_3, stp1_4
; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 1, 9 ; stp1_0, stp1_15
SUM_SUB 7, 3, 9 ; stp1_1, stp1_14
SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
SUM_SUB 4, 6, 9 ; stp1_3, stp1_12
; 0-3, 28-31 final stage
mova m15, [stp + %4 + idx30]
mova m10, [stp + %4 + idx31]
SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
SUM_SUB 7, 15, 9 ; stp1_1, stp1_30
mova [stp + %1 + idx0], m0
mova [stp + %1 + idx1], m7
mova [stp + %4 + idx30], m15
mova [stp + %4 + idx31], m10
mova m7, [stp + %4 + idx28]
mova m0, [stp + %4 + idx29]
SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
SUM_SUB 4, 7, 9 ; stp1_3, stp1_28
mova [stp + %1 + idx2], m2
mova [stp + %1 + idx3], m4
mova [stp + %4 + idx28], m7
mova [stp + %4 + idx29], m0
; 12-15, 16-19 final stage
mova m0, [stp + %3 + idx16]
mova m7, [stp + %3 + idx17]
mova m2, [stp + %3 + idx18]
mova m4, [stp + %3 + idx19]
SUM_SUB 1, 0, 9 ; stp1_15, stp1_16
SUM_SUB 3, 7, 9 ; stp1_14, stp1_17
SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
SUM_SUB 6, 4, 9 ; stp1_12, stp1_19
mova [stp + %2 + idx12], m6
mova [stp + %2 + idx13], m5
mova [stp + %2 + idx14], m3
mova [stp + %2 + idx15], m1
mova [stp + %3 + idx16], m0
mova [stp + %3 + idx17], m7
mova [stp + %3 + idx18], m2
mova [stp + %3 + idx19], m4
mova m4, [stp + %2 + idx8]
mova m5, [stp + %2 + idx9]
mova m6, [stp + %2 + idx10]
mova m7, [stp + %2 + idx11]
SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
; 4-7, 24-27 final stage
mova m0, [stp + %4 + idx27]
mova m1, [stp + %4 + idx26]
mova m2, [stp + %4 + idx25]
mova m3, [stp + %4 + idx24]
SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
mova [stp + %4 + idx27], m0
mova [stp + %4 + idx26], m1
mova [stp + %4 + idx25], m2
mova [stp + %4 + idx24], m3
mova [stp + %1 + idx4], m11
mova [stp + %1 + idx5], m14
mova [stp + %1 + idx6], m13
mova [stp + %1 + idx7], m12
; 8-11, 20-23 final stage
mova m0, [stp + %3 + idx20]
mova m1, [stp + %3 + idx21]
mova m2, [stp + %3 + idx22]
mova m3, [stp + %3 + idx23]
SUM_SUB 7, 0, 9 ; stp1_11, stp_20
SUM_SUB 6, 1, 9 ; stp1_10, stp_21
SUM_SUB 5, 2, 9 ; stp1_9, stp_22
SUM_SUB 4, 3, 9 ; stp1_8, stp_23
mova [stp + %2 + idx8], m4
mova [stp + %2 + idx9], m5
mova [stp + %2 + idx10], m6
mova [stp + %2 + idx11], m7
mova [stp + %3 + idx20], m0
mova [stp + %3 + idx21], m1
mova [stp + %3 + idx22], m2
mova [stp + %3 + idx23], m3
%endmacro
; RECON_AND_STORE %1: final reconstruction for the 32x32 transforms.
; For each of the 32 destination rows: load four result vectors (one per
; 8-column pass, spaced 16*32 bytes apart at [rsp + %1]), round
; (+pw_32, >>6), add to the 32 destination pixels with unsigned
; saturation, and store. Clobbers m0-m8, m11, stp, r6; advances outputq.
%macro RECON_AND_STORE 1
mova m11, [pw_32]
lea stp, [rsp + %1]
mov r6, 32
pxor m8, m8
%%recon_and_store:
mova m0, [stp + 16 * 32 * 0]
mova m1, [stp + 16 * 32 * 1]
mova m2, [stp + 16 * 32 * 2]
mova m3, [stp + 16 * 32 * 3]
add stp, 16
paddw m0, m11
paddw m1, m11
paddw m2, m11
paddw m3, m11
psraw m0, 6
psraw m1, 6
psraw m2, 6
psraw m3, 6
movh m4, [outputq + 0]
movh m5, [outputq + 8]
movh m6, [outputq + 16]
movh m7, [outputq + 24]
punpcklbw m4, m8
punpcklbw m5, m8
punpcklbw m6, m8
punpcklbw m7, m8
paddw m0, m4
paddw m1, m5
paddw m2, m6
paddw m3, m7
packuswb m0, m1
packuswb m2, m3
mova [outputq + 0], m0
mova [outputq + 16], m2
lea outputq, [outputq + strideq]
dec r6
jnz %%recon_and_store
%endmacro
; Scratch-buffer layout (relative to rsp): four 16*32-byte pass-one/two
; regions plus one transposed_in region = 16*32*5 bytes total.
%define i32x32_size 16*32*5
%define pass_two_start 16*32*0
%define transposed_in 16*32*4
%define pass_one_start 16*32*0
%define stp r8
INIT_XMM ssse3
; void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *output,
;                                 int stride)
; 32x32 inverse DCT for blocks whose only non-zero coefficients lie in the
; first 34 positions: pass one runs IDCT32X32_34 once on the top-left 8x8
; of input; pass two runs it on each of the four 8-column groups, then
; RECON_AND_STORE adds the result into the destination.
cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride
mova m8, [pd_8192]
lea stp, [rsp + pass_one_start]
idct32x32_34:
mov r3, inputq
lea r4, [rsp + transposed_in]
idct32x32_34_transpose:
%if CONFIG_VP9_HIGHBITDEPTH
; int32 input: rows are 32*4 bytes apart; pack pairs down to int16.
mova m0, [r3 + 0]
packssdw m0, [r3 + 16]
mova m1, [r3 + 32 * 4]
packssdw m1, [r3 + 32 * 4 + 16]
mova m2, [r3 + 32 * 8]
packssdw m2, [r3 + 32 * 8 + 16]
mova m3, [r3 + 32 * 12]
packssdw m3, [r3 + 32 * 12 + 16]
mova m4, [r3 + 32 * 16]
packssdw m4, [r3 + 32 * 16 + 16]
mova m5, [r3 + 32 * 20]
packssdw m5, [r3 + 32 * 20 + 16]
mova m6, [r3 + 32 * 24]
packssdw m6, [r3 + 32 * 24 + 16]
mova m7, [r3 + 32 * 28]
packssdw m7, [r3 + 32 * 28 + 16]
%else
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
mova m3, [r3 + 16 * 12]
mova m4, [r3 + 16 * 16]
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
IDCT32X32_34 16*0, 16*32, 16*64, 16*96
; NOTE(review): this lea is dead -- stp is reloaded two instructions
; below before any use; kept byte-identical to upstream.
lea stp, [stp + 16 * 8]
mov r6, 4
lea stp, [rsp + pass_one_start]
lea r9, [rsp + pass_one_start]
idct32x32_34_2:
lea r4, [rsp + transposed_in]
mov r3, r9
idct32x32_34_transpose_2:
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 1]
mova m2, [r3 + 16 * 2]
mova m3, [r3 + 16 * 3]
mova m4, [r3 + 16 * 4]
mova m5, [r3 + 16 * 5]
mova m6, [r3 + 16 * 6]
mova m7, [r3 + 16 * 7]
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
IDCT32X32_34 16*0, 16*8, 16*16, 16*24
lea stp, [stp + 16 * 32]
add r9, 16 * 32
dec r6
jnz idct32x32_34_2
RECON_AND_STORE pass_two_start
RET
  748. %macro IDCT32X32_135 4
; One 8-column pass of the 32-point inverse DCT for eob <= 135 (nonzero
; coefficients confined to the top-left 16x16).  The upper-half inputs are
; known zero, so the stage-1/2 butterflies collapse to single pmulhrsw
; multiplies by pre-doubled cosine constants (pw_*x2).
; %1-%4: byte offsets (relative to stp) of the four output quarters for
; rows 0-7, 8-15, 16-23 and 24-31 respectively.
; Register conventions: m8 = pd_8192 rounding constant (live throughout);
; m9/m10 are scratch for SUM_SUB / BUTTERFLY_4X.
  749. ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  750. mova m1, [rsp + transposed_in + 16 * 1]
  751. mova m11, m1
  752. pmulhrsw m1, [pw___804x2] ; stp1_16
  753. pmulhrsw m11, [pw_16364x2] ; stp2_31
  754. mova m7, [rsp + transposed_in + 16 * 7]
  755. mova m12, m7
  756. pmulhrsw m7, [pw_15426x2] ; stp1_28
  757. pmulhrsw m12, [pw_m5520x2] ; stp2_19
  758. mova m3, [rsp + transposed_in + 16 * 9]
  759. mova m4, m3
  760. pmulhrsw m3, [pw__7005x2] ; stp1_18
  761. pmulhrsw m4, [pw_14811x2] ; stp2_29
  762. mova m0, [rsp + transposed_in + 16 * 15]
  763. mova m2, m0
  764. pmulhrsw m0, [pw_12140x2] ; stp1_30
  765. pmulhrsw m2, [pw_m11003x2] ; stp2_17
  766. ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  767. SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
  768. SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
  769. SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
  770. SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
  771. ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  772. BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
  773. BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
  774. ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  775. SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
  776. SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
  777. SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
  778. SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
  779. ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  780. BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
  781. BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
; Spill block-A results; registers are needed for block B.
  782. mova [stp + %3 + idx16], m1
  783. mova [stp + %3 + idx17], m0
  784. mova [stp + %3 + idx18], m4
  785. mova [stp + %3 + idx19], m7
  786. mova [stp + %4 + idx28], m12
  787. mova [stp + %4 + idx29], m3
  788. mova [stp + %4 + idx30], m2
  789. mova [stp + %4 + idx31], m11
  790. ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  791. mova m2, [rsp + transposed_in + 16 * 3]
  792. mova m3, m2
  793. pmulhrsw m3, [pw_m2404x2] ; stp1_23
  794. pmulhrsw m2, [pw_16207x2] ; stp2_24
  795. mova m5, [rsp + transposed_in + 16 * 5]
  796. mova m6, m5
  797. pmulhrsw m5, [pw__3981x2] ; stp1_20
  798. pmulhrsw m6, [pw_15893x2] ; stp2_27
  799. mova m14, [rsp + transposed_in + 16 * 11]
  800. mova m13, m14
  801. pmulhrsw m13, [pw_m8423x2] ; stp1_21
  802. pmulhrsw m14, [pw_14053x2] ; stp2_26
  803. mova m0, [rsp + transposed_in + 16 * 13]
  804. mova m1, m0
  805. pmulhrsw m0, [pw__9760x2] ; stp1_22
  806. pmulhrsw m1, [pw_13160x2] ; stp2_25
  807. ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  808. SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
  809. SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
  810. SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
  811. SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
  812. ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  813. BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
  814. BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
  815. ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  816. SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
  817. SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
  818. SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
  819. SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
  820. ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  821. BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
  822. BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
  823. ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Reload block-A 16-19, combine with block-B 20-23, spill back.
  824. mova m4, [stp + %3 + idx16]
  825. mova m7, [stp + %3 + idx17]
  826. mova m11, [stp + %3 + idx18]
  827. mova m12, [stp + %3 + idx19]
  828. SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
  829. SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
  830. SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
  831. SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
  832. mova [stp + %3 + idx16], m4
  833. mova [stp + %3 + idx17], m7
  834. mova [stp + %3 + idx18], m11
  835. mova [stp + %3 + idx19], m12
; Same for block-A 28-31 against block-B 24-27.
  836. mova m4, [stp + %4 + idx28]
  837. mova m7, [stp + %4 + idx29]
  838. mova m11, [stp + %4 + idx30]
  839. mova m12, [stp + %4 + idx31]
  840. SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
  841. SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
  842. SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
  843. SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
  844. mova [stp + %4 + idx28], m4
  845. mova [stp + %4 + idx29], m7
  846. mova [stp + %4 + idx30], m11
  847. mova [stp + %4 + idx31], m12
  848. ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; The cheap pmulhrsw form of (a +/- b) * cos(pi/4) can overflow int16, so
; the wider BUTTERFLY_4X path below is used instead.
  849. %if 0 ; overflow occurs in SUM_SUB when using test streams
  850. mova m10, [pw_11585x2]
  851. SUM_SUB 6, 5, 9
  852. pmulhrsw m6, m10 ; stp1_27
  853. pmulhrsw m5, m10 ; stp1_20
  854. SUM_SUB 13, 14, 9
  855. pmulhrsw m13, m10 ; stp1_26
  856. pmulhrsw m14, m10 ; stp1_21
  857. SUM_SUB 1, 0, 9
  858. pmulhrsw m1, m10 ; stp1_25
  859. pmulhrsw m0, m10 ; stp1_22
  860. SUM_SUB 2, 3, 9
  861. pmulhrsw m2, m10 ; stp1_24
  862. pmulhrsw m3, m10 ; stp1_23
  863. %else
  864. BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
  865. SWAP 6, 5
  866. BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
  867. SWAP 13, 14
  868. BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
  869. SWAP 1, 0
  870. BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
  871. SWAP 2, 3
  872. %endif
  873. mova [stp + %3 + idx20], m5
  874. mova [stp + %3 + idx21], m14
  875. mova [stp + %3 + idx22], m0
  876. mova [stp + %3 + idx23], m3
  877. mova [stp + %4 + idx24], m2
  878. mova [stp + %4 + idx25], m1
  879. mova [stp + %4 + idx26], m13
  880. mova [stp + %4 + idx27], m6
  881. ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  882. ;
  883. ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  884. mova m0, [rsp + transposed_in + 16 * 2]
  885. mova m1, m0
  886. pmulhrsw m0, [pw__1606x2] ; stp1_8
  887. pmulhrsw m1, [pw_16305x2] ; stp2_15
  888. mova m6, [rsp + transposed_in + 16 * 6]
  889. mova m7, m6
  890. pmulhrsw m7, [pw_m4756x2] ; stp2_11
  891. pmulhrsw m6, [pw_15679x2] ; stp1_12
  892. mova m4, [rsp + transposed_in + 16 * 10]
  893. mova m5, m4
  894. pmulhrsw m4, [pw__7723x2] ; stp1_10
  895. pmulhrsw m5, [pw_14449x2] ; stp2_13
  896. mova m2, [rsp + transposed_in + 16 * 14]
  897. mova m3, m2
  898. pmulhrsw m3, [pw_m10394x2] ; stp1_9
  899. pmulhrsw m2, [pw_12665x2] ; stp2_14
  900. ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  901. SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
  902. SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
  903. SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
  904. SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
  905. ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  906. BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
  907. BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
  908. ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  909. SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
  910. SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
  911. SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
  912. SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
  913. ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  914. %if 0 ; overflow occurs in SUM_SUB when using test streams
  915. mova m10, [pw_11585x2]
  916. SUM_SUB 5, 4, 9
  917. pmulhrsw m5, m10 ; stp1_13
  918. pmulhrsw m4, m10 ; stp1_10
  919. SUM_SUB 6, 7, 9
  920. pmulhrsw m6, m10 ; stp1_12
  921. pmulhrsw m7, m10 ; stp1_11
  922. %else
  923. BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
  924. SWAP 5, 4
  925. BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
  926. SWAP 6, 7
  927. %endif
  928. ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  929. mova [stp + %2 + idx8], m0
  930. mova [stp + %2 + idx9], m2
  931. mova [stp + %2 + idx10], m4
  932. mova [stp + %2 + idx11], m7
  933. mova [stp + %2 + idx12], m6
  934. mova [stp + %2 + idx13], m5
  935. mova [stp + %2 + idx14], m3
  936. mova [stp + %2 + idx15], m1
  937. ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  938. ;
  939. ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  940. ;
  941. ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  942. mova m11, [rsp + transposed_in + 16 * 4]
  943. mova m12, m11
  944. pmulhrsw m11, [pw__3196x2] ; stp1_4
  945. pmulhrsw m12, [pw_16069x2] ; stp1_7
  946. mova m13, [rsp + transposed_in + 16 * 12]
  947. mova m14, m13
  948. pmulhrsw m13, [pw_13623x2] ; stp1_6
  949. pmulhrsw m14, [pw_m9102x2] ; stp1_5
  950. ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  951. mova m0, [rsp + transposed_in + 16 * 0]
  952. mova m2, [rsp + transposed_in + 16 * 8]
  953. pmulhrsw m0, [pw_11585x2] ; stp1_1
  954. mova m3, m2
  955. pmulhrsw m2, [pw__6270x2] ; stp1_2
  956. pmulhrsw m3, [pw_15137x2] ; stp1_3
  957. SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
  958. SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
  959. ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  960. %if 0 ; overflow occurs in SUM_SUB when using test streams
  961. mova m10, [pw_11585x2]
  962. SUM_SUB 13, 14, 9
  963. pmulhrsw m13, m10 ; stp1_6
  964. pmulhrsw m14, m10 ; stp1_5
  965. %else
  966. BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
  967. SWAP 13, 14
  968. %endif
  969. mova m1, m0 ; stp1_0 = stp1_1
  970. SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
  971. SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
  972. ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  973. SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
  974. SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
  975. SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
  976. SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
  977. ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  978. mova m4, [stp + %2 + idx12]
  979. mova m5, [stp + %2 + idx13]
  980. mova m6, [stp + %2 + idx14]
  981. mova m7, [stp + %2 + idx15]
  982. SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
  983. SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
  984. SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
  985. SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
  986. ; 0-3, 28-31 final stage
  987. mova m10, [stp + %4 + idx31]
  988. mova m15, [stp + %4 + idx30]
  989. SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
  990. SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
  991. mova [stp + %1 + idx0], m0
  992. mova [stp + %1 + idx1], m1
  993. mova [stp + %4 + idx31], m10
  994. mova [stp + %4 + idx30], m15
  995. mova m0, [stp + %4 + idx29]
  996. mova m1, [stp + %4 + idx28]
  997. SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
  998. SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
  999. mova [stp + %1 + idx2], m2
  1000. mova [stp + %1 + idx3], m3
  1001. mova [stp + %4 + idx29], m0
  1002. mova [stp + %4 + idx28], m1
  1003. ; 12-15, 16-19 final stage
  1004. mova m0, [stp + %3 + idx16]
  1005. mova m1, [stp + %3 + idx17]
  1006. mova m2, [stp + %3 + idx18]
  1007. mova m3, [stp + %3 + idx19]
  1008. SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
  1009. SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
  1010. SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
  1011. SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
  1012. mova [stp + %2 + idx12], m4
  1013. mova [stp + %2 + idx13], m5
  1014. mova [stp + %2 + idx14], m6
  1015. mova [stp + %2 + idx15], m7
  1016. mova [stp + %3 + idx16], m0
  1017. mova [stp + %3 + idx17], m1
  1018. mova [stp + %3 + idx18], m2
  1019. mova [stp + %3 + idx19], m3
  1020. mova m4, [stp + %2 + idx8]
  1021. mova m5, [stp + %2 + idx9]
  1022. mova m6, [stp + %2 + idx10]
  1023. mova m7, [stp + %2 + idx11]
  1024. SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
  1025. SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
  1026. SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
  1027. SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
  1028. ; 4-7, 24-27 final stage
  1029. mova m3, [stp + %4 + idx24]
  1030. mova m2, [stp + %4 + idx25]
  1031. mova m1, [stp + %4 + idx26]
  1032. mova m0, [stp + %4 + idx27]
  1033. SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
  1034. SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
  1035. SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
  1036. SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
  1037. mova [stp + %4 + idx24], m3
  1038. mova [stp + %4 + idx25], m2
  1039. mova [stp + %4 + idx26], m1
  1040. mova [stp + %4 + idx27], m0
  1041. mova [stp + %1 + idx4], m11
  1042. mova [stp + %1 + idx5], m14
  1043. mova [stp + %1 + idx6], m13
  1044. mova [stp + %1 + idx7], m12
  1045. ; 8-11, 20-23 final stage
  1046. mova m0, [stp + %3 + idx20]
  1047. mova m1, [stp + %3 + idx21]
  1048. mova m2, [stp + %3 + idx22]
  1049. mova m3, [stp + %3 + idx23]
  1050. SUM_SUB 7, 0, 9 ; stp1_11, stp1_20
  1051. SUM_SUB 6, 1, 9 ; stp1_10, stp1_21
  1052. SUM_SUB 5, 2, 9 ; stp1_9, stp1_22
  1053. SUM_SUB 4, 3, 9 ; stp1_8, stp1_23
  1054. mova [stp + %2 + idx8], m4
  1055. mova [stp + %2 + idx9], m5
  1056. mova [stp + %2 + idx10], m6
  1057. mova [stp + %2 + idx11], m7
  1058. mova [stp + %3 + idx20], m0
  1059. mova [stp + %3 + idx21], m1
  1060. mova [stp + %3 + idx22], m2
  1061. mova [stp + %3 + idx23], m3
  1062. %endmacro
  1063. INIT_XMM ssse3
;------------------------------------------------------------------------------
; idct32x32_135_add(input, output, stride)
; 32x32 inverse DCT + reconstruction for blocks with nonzero coefficients in
; the top-left 16x16 (eob <= 135).  Pass 1 runs twice (r6) over 8-column
; slices; each slice transposes two 8x8 tiles (r7) into the staging area.
;------------------------------------------------------------------------------
  1064. cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
  1065. mova m8, [pd_8192] ; DCT rounding constant, kept live for BUTTERFLY_4X
  1066. mov r6, 2 ; pass 1: two 8-column slices cover 16 active columns
  1067. lea stp, [rsp + pass_one_start]
  1068. idct32x32_135:
  1069. mov r3, inputq
  1070. lea r4, [rsp + transposed_in]
  1071. mov r7, 2 ; transpose two 8x8 tiles per slice (16 active rows)
  1072. idct32x32_135_transpose:
  1073. %if CONFIG_VP9_HIGHBITDEPTH
; High-bitdepth coefficients are 32-bit; pack dword pairs to signed words.
  1074. mova m0, [r3 + 0]
  1075. packssdw m0, [r3 + 16]
  1076. mova m1, [r3 + 32 * 4]
  1077. packssdw m1, [r3 + 32 * 4 + 16]
  1078. mova m2, [r3 + 32 * 8]
  1079. packssdw m2, [r3 + 32 * 8 + 16]
  1080. mova m3, [r3 + 32 * 12]
  1081. packssdw m3, [r3 + 32 * 12 + 16]
  1082. mova m4, [r3 + 32 * 16]
  1083. packssdw m4, [r3 + 32 * 16 + 16]
  1084. mova m5, [r3 + 32 * 20]
  1085. packssdw m5, [r3 + 32 * 20 + 16]
  1086. mova m6, [r3 + 32 * 24]
  1087. packssdw m6, [r3 + 32 * 24 + 16]
  1088. mova m7, [r3 + 32 * 28]
  1089. packssdw m7, [r3 + 32 * 28 + 16]
  1090. %else
  1091. mova m0, [r3 + 0]
  1092. mova m1, [r3 + 16 * 4]
  1093. mova m2, [r3 + 16 * 8]
  1094. mova m3, [r3 + 16 * 12]
  1095. mova m4, [r3 + 16 * 16]
  1096. mova m5, [r3 + 16 * 20]
  1097. mova m6, [r3 + 16 * 24]
  1098. mova m7, [r3 + 16 * 28]
  1099. %endif
  1100. TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
; Stage the transposed tile for the IDCT32X32_135 macro to read.
  1101. mova [r4 + 0], m0
  1102. mova [r4 + 16 * 1], m1
  1103. mova [r4 + 16 * 2], m2
  1104. mova [r4 + 16 * 3], m3
  1105. mova [r4 + 16 * 4], m4
  1106. mova [r4 + 16 * 5], m5
  1107. mova [r4 + 16 * 6], m6
  1108. mova [r4 + 16 * 7], m7
  1109. %if CONFIG_VP9_HIGHBITDEPTH
  1110. add r3, 32
  1111. %else
  1112. add r3, 16
  1113. %endif
  1114. add r4, 16 * 8
  1115. dec r7
  1116. jne idct32x32_135_transpose
; Pass-1 output quarters land 16*32 bytes apart in the scratch buffer.
  1117. IDCT32X32_135 16*0, 16*32, 16*64, 16*96
  1118. lea stp, [stp + 16 * 8]
  1119. %if CONFIG_VP9_HIGHBITDEPTH
  1120. lea inputq, [inputq + 32 * 32]
  1121. %else
  1122. lea inputq, [inputq + 16 * 32]
  1123. %endif
  1124. dec r6
  1125. jnz idct32x32_135
; Pass 2: four 8-row groups, each re-transposed from the pass-1 output.
  1126. mov r6, 4
  1127. lea stp, [rsp + pass_one_start]
  1128. lea r9, [rsp + pass_one_start]
  1129. idct32x32_135_2:
  1130. lea r4, [rsp + transposed_in]
  1131. mov r3, r9
  1132. mov r7, 2
  1133. idct32x32_135_transpose_2:
  1134. mova m0, [r3 + 0]
  1135. mova m1, [r3 + 16 * 1]
  1136. mova m2, [r3 + 16 * 2]
  1137. mova m3, [r3 + 16 * 3]
  1138. mova m4, [r3 + 16 * 4]
  1139. mova m5, [r3 + 16 * 5]
  1140. mova m6, [r3 + 16 * 6]
  1141. mova m7, [r3 + 16 * 7]
  1142. TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
  1143. mova [r4 + 0], m0
  1144. mova [r4 + 16 * 1], m1
  1145. mova [r4 + 16 * 2], m2
  1146. mova [r4 + 16 * 3], m3
  1147. mova [r4 + 16 * 4], m4
  1148. mova [r4 + 16 * 5], m5
  1149. mova [r4 + 16 * 6], m6
  1150. mova [r4 + 16 * 7], m7
  1151. add r3, 16 * 8
  1152. add r4, 16 * 8
  1153. dec r7
  1154. jne idct32x32_135_transpose_2
; Pass-2 output quarters are contiguous 16*8-byte groups within one slice.
  1155. IDCT32X32_135 16*0, 16*8, 16*16, 16*24
  1156. lea stp, [stp + 16 * 32]
  1157. add r9, 16 * 32
  1158. dec r6
  1159. jnz idct32x32_135_2
  1160. RECON_AND_STORE pass_two_start
  1161. RET
  1162. %macro IDCT32X32_1024 4
; One 8-column pass of the full 32-point inverse DCT (all 1024 coefficients
; may be nonzero).  Unlike the _34/_135 variants, stage 1/2 use complete
; BUTTERFLY_4X rotations on input pairs rather than single multiplies.
; %1-%4: byte offsets (relative to stp) of the four output quarters for
; rows 0-7, 8-15, 16-23 and 24-31 respectively.
; Register conventions: m8 = pd_8192 rounding constant (live throughout);
; m9/m10 are scratch for SUM_SUB / BUTTERFLY_4X.
  1163. ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1164. mova m1, [rsp + transposed_in + 16 * 1]
  1165. mova m11, [rsp + transposed_in + 16 * 31]
  1166. BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31
  1167. mova m0, [rsp + transposed_in + 16 * 15]
  1168. mova m2, [rsp + transposed_in + 16 * 17]
  1169. BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30
  1170. mova m7, [rsp + transposed_in + 16 * 7]
  1171. mova m12, [rsp + transposed_in + 16 * 25]
  1172. BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28
  1173. mova m3, [rsp + transposed_in + 16 * 9]
  1174. mova m4, [rsp + transposed_in + 16 * 23]
  1175. BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29
  1176. ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1177. SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
  1178. SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
  1179. SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
  1180. SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
  1181. ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1182. BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
  1183. BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
  1184. ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1185. SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
  1186. SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
  1187. SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
  1188. SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
  1189. ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1190. BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
  1191. BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
; Spill block-A results; registers are needed for block B.
  1192. mova [stp + %3 + idx16], m1
  1193. mova [stp + %3 + idx17], m0
  1194. mova [stp + %3 + idx18], m4
  1195. mova [stp + %3 + idx19], m7
  1196. mova [stp + %4 + idx28], m12
  1197. mova [stp + %4 + idx29], m3
  1198. mova [stp + %4 + idx30], m2
  1199. mova [stp + %4 + idx31], m11
  1200. ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1201. mova m5, [rsp + transposed_in + 16 * 5]
  1202. mova m6, [rsp + transposed_in + 16 * 27]
  1203. BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27
  1204. mova m13, [rsp + transposed_in + 16 * 21]
  1205. mova m14, [rsp + transposed_in + 16 * 11]
  1206. BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26
  1207. mova m0, [rsp + transposed_in + 16 * 13]
  1208. mova m1, [rsp + transposed_in + 16 * 19]
  1209. BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25
  1210. mova m2, [rsp + transposed_in + 16 * 3]
  1211. mova m3, [rsp + transposed_in + 16 * 29]
  1212. BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24
  1213. ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1214. SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
  1215. SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
  1216. SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
  1217. SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
  1218. ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1219. BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
  1220. BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
  1221. ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1222. SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
  1223. SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
  1224. SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
  1225. SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
  1226. ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1227. BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
  1228. BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
  1229. ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Reload block-A 16-19, combine with block-B 20-23, spill back.
  1230. mova m4, [stp + %3 + idx16]
  1231. mova m7, [stp + %3 + idx17]
  1232. mova m11, [stp + %3 + idx18]
  1233. mova m12, [stp + %3 + idx19]
  1234. SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
  1235. SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
  1236. SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
  1237. SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
  1238. mova [stp + %3 + idx16], m4
  1239. mova [stp + %3 + idx17], m7
  1240. mova [stp + %3 + idx18], m11
  1241. mova [stp + %3 + idx19], m12
; Same for block-A 28-31 against block-B 24-27.
  1242. mova m4, [stp + %4 + idx28]
  1243. mova m7, [stp + %4 + idx29]
  1244. mova m11, [stp + %4 + idx30]
  1245. mova m12, [stp + %4 + idx31]
  1246. SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
  1247. SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
  1248. SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
  1249. SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
  1250. mova [stp + %4 + idx28], m4
  1251. mova [stp + %4 + idx29], m7
  1252. mova [stp + %4 + idx30], m11
  1253. mova [stp + %4 + idx31], m12
  1254. ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; The cheap pmulhrsw form of (a +/- b) * cos(pi/4) can overflow int16, so
; the wider BUTTERFLY_4X path below is used instead.
  1255. %if 0 ; overflow occurs in SUM_SUB when using test streams
  1256. mova m10, [pw_11585x2]
  1257. SUM_SUB 6, 5, 9
  1258. pmulhrsw m6, m10 ; stp1_27
  1259. pmulhrsw m5, m10 ; stp1_20
  1260. SUM_SUB 13, 14, 9
  1261. pmulhrsw m13, m10 ; stp1_26
  1262. pmulhrsw m14, m10 ; stp1_21
  1263. SUM_SUB 1, 0, 9
  1264. pmulhrsw m1, m10 ; stp1_25
  1265. pmulhrsw m0, m10 ; stp1_22
  1266. SUM_SUB 2, 3, 9
  1267. pmulhrsw m2, m10 ; stp1_24
  1268. pmulhrsw m3, m10 ; stp1_23
  1269. %else
  1270. BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
  1271. SWAP 6, 5
  1272. BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
  1273. SWAP 13, 14
  1274. BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
  1275. SWAP 1, 0
  1276. BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
  1277. SWAP 2, 3
  1278. %endif
  1279. mova [stp + %3 + idx20], m5
  1280. mova [stp + %3 + idx21], m14
  1281. mova [stp + %3 + idx22], m0
  1282. mova [stp + %3 + idx23], m3
  1283. mova [stp + %4 + idx24], m2
  1284. mova [stp + %4 + idx25], m1
  1285. mova [stp + %4 + idx26], m13
  1286. mova [stp + %4 + idx27], m6
  1287. ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1288. ;
  1289. ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1290. mova m0, [rsp + transposed_in + 16 * 2]
  1291. mova m1, [rsp + transposed_in + 16 * 30]
  1292. BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15
  1293. mova m2, [rsp + transposed_in + 16 * 14]
  1294. mova m3, [rsp + transposed_in + 16 * 18]
  1295. BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14
  1296. mova m4, [rsp + transposed_in + 16 * 10]
  1297. mova m5, [rsp + transposed_in + 16 * 22]
  1298. BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13
  1299. mova m6, [rsp + transposed_in + 16 * 6]
  1300. mova m7, [rsp + transposed_in + 16 * 26]
  1301. BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12
  1302. ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1303. SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
  1304. SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
  1305. SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
  1306. SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
  1307. ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1308. BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
  1309. BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
  1310. ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1311. SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
  1312. SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
  1313. SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
  1314. SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
  1315. ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1316. %if 0 ; overflow occurs in SUM_SUB when using test streams
  1317. mova m10, [pw_11585x2]
  1318. SUM_SUB 5, 4, 9
  1319. pmulhrsw m5, m10 ; stp1_13
  1320. pmulhrsw m4, m10 ; stp1_10
  1321. SUM_SUB 6, 7, 9
  1322. pmulhrsw m6, m10 ; stp1_12
  1323. pmulhrsw m7, m10 ; stp1_11
  1324. %else
  1325. BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
  1326. SWAP 5, 4
  1327. BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
  1328. SWAP 6, 7
  1329. %endif
  1330. ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1331. mova [stp + %2 + idx8], m0
  1332. mova [stp + %2 + idx9], m2
  1333. mova [stp + %2 + idx10], m4
  1334. mova [stp + %2 + idx11], m7
  1335. mova [stp + %2 + idx12], m6
  1336. mova [stp + %2 + idx13], m5
  1337. mova [stp + %2 + idx14], m3
  1338. mova [stp + %2 + idx15], m1
  1339. ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1340. ;
  1341. ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1342. ;
  1343. ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1344. mova m11, [rsp + transposed_in + 16 * 4]
  1345. mova m12, [rsp + transposed_in + 16 * 28]
  1346. BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7
  1347. mova m13, [rsp + transposed_in + 16 * 12]
  1348. mova m14, [rsp + transposed_in + 16 * 20]
  1349. BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6
  1350. ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1351. mova m0, [rsp + transposed_in + 16 * 0]
  1352. mova m1, [rsp + transposed_in + 16 * 16]
  1353. %if 0 ; overflow occurs in SUM_SUB when using test streams
  1354. mova m10, [pw_11585x2]
  1355. SUM_SUB 0, 1, 9
  1356. pmulhrsw m0, m10 ; stp1_1
  1357. pmulhrsw m1, m10 ; stp1_0
  1358. %else
  1359. BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0
  1360. SWAP 0, 1
  1361. %endif
  1362. mova m2, [rsp + transposed_in + 16 * 8]
  1363. mova m3, [rsp + transposed_in + 16 * 24]
  1364. BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3
; NOTE(review): m10 below is only consumed by the disabled %if 0 path in
; stage 5 and is clobbered by BUTTERFLY_4X in the live path — looks
; vestigial; confirm before removing.
  1365. mova m10, [pw_11585x2]
  1366. SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
  1367. SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
  1368. ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1369. %if 0 ; overflow occurs in SUM_SUB when using test streams
  1370. SUM_SUB 13, 14, 9
  1371. pmulhrsw m13, m10 ; stp1_6
  1372. pmulhrsw m14, m10 ; stp1_5
  1373. %else
  1374. BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
  1375. SWAP 13, 14
  1376. %endif
  1377. SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
  1378. SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
  1379. ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1380. SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
  1381. SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
  1382. SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
  1383. SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
  1384. ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  1385. mova m4, [stp + %2 + idx12]
  1386. mova m5, [stp + %2 + idx13]
  1387. mova m6, [stp + %2 + idx14]
  1388. mova m7, [stp + %2 + idx15]
  1389. SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
  1390. SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
  1391. SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
  1392. SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
  1393. ; 0-3, 28-31 final stage
  1394. mova m10, [stp + %4 + idx31]
  1395. mova m15, [stp + %4 + idx30]
  1396. SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
  1397. SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
  1398. mova [stp + %1 + idx0], m0
  1399. mova [stp + %1 + idx1], m1
  1400. mova [stp + %4 + idx31], m10
  1401. mova [stp + %4 + idx30], m15
  1402. mova m0, [stp + %4 + idx29]
  1403. mova m1, [stp + %4 + idx28]
  1404. SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
  1405. SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
  1406. mova [stp + %1 + idx2], m2
  1407. mova [stp + %1 + idx3], m3
  1408. mova [stp + %4 + idx29], m0
  1409. mova [stp + %4 + idx28], m1
  1410. ; 12-15, 16-19 final stage
  1411. mova m0, [stp + %3 + idx16]
  1412. mova m1, [stp + %3 + idx17]
  1413. mova m2, [stp + %3 + idx18]
  1414. mova m3, [stp + %3 + idx19]
  1415. SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
  1416. SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
  1417. SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
  1418. SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
  1419. mova [stp + %2 + idx12], m4
  1420. mova [stp + %2 + idx13], m5
  1421. mova [stp + %2 + idx14], m6
  1422. mova [stp + %2 + idx15], m7
  1423. mova [stp + %3 + idx16], m0
  1424. mova [stp + %3 + idx17], m1
  1425. mova [stp + %3 + idx18], m2
  1426. mova [stp + %3 + idx19], m3
  1427. mova m4, [stp + %2 + idx8]
  1428. mova m5, [stp + %2 + idx9]
  1429. mova m6, [stp + %2 + idx10]
  1430. mova m7, [stp + %2 + idx11]
  1431. SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
  1432. SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
  1433. SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
  1434. SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
  1435. ; 4-7, 24-27 final stage
  1436. mova m3, [stp + %4 + idx24]
  1437. mova m2, [stp + %4 + idx25]
  1438. mova m1, [stp + %4 + idx26]
  1439. mova m0, [stp + %4 + idx27]
  1440. SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
  1441. SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
  1442. SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
  1443. SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
  1444. mova [stp + %4 + idx24], m3
  1445. mova [stp + %4 + idx25], m2
  1446. mova [stp + %4 + idx26], m1
  1447. mova [stp + %4 + idx27], m0
  1448. mova [stp + %1 + idx4], m11
  1449. mova [stp + %1 + idx5], m14
  1450. mova [stp + %1 + idx6], m13
  1451. mova [stp + %1 + idx7], m12
  1452. ; 8-11, 20-23 final stage
  1453. mova m0, [stp + %3 + idx20]
  1454. mova m1, [stp + %3 + idx21]
  1455. mova m2, [stp + %3 + idx22]
  1456. mova m3, [stp + %3 + idx23]
  1457. SUM_SUB 7, 0, 9 ; stp1_11, stp1_20
  1458. SUM_SUB 6, 1, 9 ; stp1_10, stp1_21
  1459. SUM_SUB 5, 2, 9 ; stp1_9, stp1_22
  1460. SUM_SUB 4, 3, 9 ; stp1_8, stp1_23
  1461. mova [stp + %2 + idx8], m4
  1462. mova [stp + %2 + idx9], m5
  1463. mova [stp + %2 + idx10], m6
  1464. mova [stp + %2 + idx11], m7
  1465. mova [stp + %3 + idx20], m0
  1466. mova [stp + %3 + idx21], m1
  1467. mova [stp + %3 + idx22], m2
  1468. mova [stp + %3 + idx23], m3
  1469. %endmacro
  1470. INIT_XMM ssse3
;------------------------------------------------------------------------------
; idct32x32_1024_add(input, output, stride)
; Full 32x32 inverse DCT + reconstruction (all 1024 coefficients may be
; nonzero).  Pass 1 runs four times (r6) over 8-column slices; each slice
; transposes four 8x8 tiles (r7) into the staging area.
;------------------------------------------------------------------------------
  1471. cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
  1472. mova m8, [pd_8192] ; DCT rounding constant, kept live for BUTTERFLY_4X
  1473. mov r6, 4 ; pass 1: four 8-column slices cover all 32 columns
  1474. lea stp, [rsp + pass_one_start]
  1475. idct32x32_1024:
  1476. mov r3, inputq
  1477. lea r4, [rsp + transposed_in]
  1478. mov r7, 4 ; transpose four 8x8 tiles per slice (32 rows)
  1479. idct32x32_1024_transpose:
  1480. %if CONFIG_VP9_HIGHBITDEPTH
; High-bitdepth coefficients are 32-bit; pack dword pairs to signed words.
  1481. mova m0, [r3 + 0]
  1482. packssdw m0, [r3 + 16]
  1483. mova m1, [r3 + 32 * 4]
  1484. packssdw m1, [r3 + 32 * 4 + 16]
  1485. mova m2, [r3 + 32 * 8]
  1486. packssdw m2, [r3 + 32 * 8 + 16]
  1487. mova m3, [r3 + 32 * 12]
  1488. packssdw m3, [r3 + 32 * 12 + 16]
  1489. mova m4, [r3 + 32 * 16]
  1490. packssdw m4, [r3 + 32 * 16 + 16]
  1491. mova m5, [r3 + 32 * 20]
  1492. packssdw m5, [r3 + 32 * 20 + 16]
  1493. mova m6, [r3 + 32 * 24]
  1494. packssdw m6, [r3 + 32 * 24 + 16]
  1495. mova m7, [r3 + 32 * 28]
  1496. packssdw m7, [r3 + 32 * 28 + 16]
  1497. %else
  1498. mova m0, [r3 + 0]
  1499. mova m1, [r3 + 16 * 4]
  1500. mova m2, [r3 + 16 * 8]
  1501. mova m3, [r3 + 16 * 12]
  1502. mova m4, [r3 + 16 * 16]
  1503. mova m5, [r3 + 16 * 20]
  1504. mova m6, [r3 + 16 * 24]
  1505. mova m7, [r3 + 16 * 28]
  1506. %endif
  1507. TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
; Stage the transposed tile for the IDCT32X32_1024 macro to read.
  1508. mova [r4 + 0], m0
  1509. mova [r4 + 16 * 1], m1
  1510. mova [r4 + 16 * 2], m2
  1511. mova [r4 + 16 * 3], m3
  1512. mova [r4 + 16 * 4], m4
  1513. mova [r4 + 16 * 5], m5
  1514. mova [r4 + 16 * 6], m6
  1515. mova [r4 + 16 * 7], m7
  1516. %if CONFIG_VP9_HIGHBITDEPTH
  1517. add r3, 32
  1518. %else
  1519. add r3, 16
  1520. %endif
  1521. add r4, 16 * 8
  1522. dec r7
  1523. jne idct32x32_1024_transpose
; Pass-1 output quarters land 16*32 bytes apart in the scratch buffer.
  1524. IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
  1525. lea stp, [stp + 16 * 8]
  1526. %if CONFIG_VP9_HIGHBITDEPTH
  1527. lea inputq, [inputq + 32 * 32]
  1528. %else
  1529. lea inputq, [inputq + 16 * 32]
  1530. %endif
  1531. dec r6
  1532. jnz idct32x32_1024
; Pass 2: four 8-row groups, each re-transposed from the pass-1 output.
  1533. mov r6, 4
  1534. lea stp, [rsp + pass_one_start]
  1535. lea r9, [rsp + pass_one_start]
  1536. idct32x32_1024_2:
  1537. lea r4, [rsp + transposed_in]
  1538. mov r3, r9
  1539. mov r7, 4
  1540. idct32x32_1024_transpose_2:
  1541. mova m0, [r3 + 0]
  1542. mova m1, [r3 + 16 * 1]
  1543. mova m2, [r3 + 16 * 2]
  1544. mova m3, [r3 + 16 * 3]
  1545. mova m4, [r3 + 16 * 4]
  1546. mova m5, [r3 + 16 * 5]
  1547. mova m6, [r3 + 16 * 6]
  1548. mova m7, [r3 + 16 * 7]
  1549. TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
  1550. mova [r4 + 0], m0
  1551. mova [r4 + 16 * 1], m1
  1552. mova [r4 + 16 * 2], m2
  1553. mova [r4 + 16 * 3], m3
  1554. mova [r4 + 16 * 4], m4
  1555. mova [r4 + 16 * 5], m5
  1556. mova [r4 + 16 * 6], m6
  1557. mova [r4 + 16 * 7], m7
  1558. add r3, 16 * 8
  1559. add r4, 16 * 8
  1560. dec r7
  1561. jne idct32x32_1024_transpose_2
; Pass-2 output quarters are contiguous 16*8-byte groups within one slice.
  1562. IDCT32X32_1024 16*0, 16*8, 16*16, 16*24
  1563. lea stp, [stp + 16 * 32]
  1564. add r9, 16 * 32
  1565. dec r6
  1566. jnz idct32x32_1024_2
  1567. RECON_AND_STORE pass_two_start
  1568. RET
  1569. %endif