fft_common.h 76 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051
  /*
   * Copyright (c) 2018, Alliance for Open Media. All rights reserved
   *
   * This source code is subject to the terms of the BSD 2 Clause License and
   * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
   * was not distributed with this source code in the LICENSE file, you can
   * obtain it at www.aomedia.org/license/software. If the Alliance for Open
   * Media Patent License 1.0 was not distributed with this source code in the
   * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
   */
  11. #ifndef AOM_AOM_DSP_FFT_COMMON_H_
  12. #define AOM_AOM_DSP_FFT_COMMON_H_
  13. #ifdef __cplusplus
  14. extern "C" {
  15. #endif
  /*!\brief A function pointer for computing 1d fft and ifft.
   *
   * The function will point to an implementation for a specific transform
   * size, and may perform the transforms using vectorized instructions.
   *
   * For non-vectorized forward transforms of size n, the input and output
   * buffers will be size n. The output takes advantage of conjugate symmetry
   * and packs the results as:
   * [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where (r_{j}, i_{j}) is the
   * complex output for index j.
   *
   * An inverse transform will assume that the complex "input" is packed
   * similarly. Its output will be real.
   *
   * Non-vectorized transforms (e.g., on a single row) would use a stride = 1.
   *
   * Vectorized implementations are parallelized along the columns so that the
   * fft can be performed on multiple columns at a time. In such cases the data
   * block for input and output is typically square (n x n) and the stride will
   * correspond to the spacing between rows. At minimum, the input size must be
   * n x simd_vector_length.
   *
   * \param[in]  input   Input buffer. See above for size restrictions.
   * \param[out] output  Output buffer. See above for size restrictions.
   * \param[in]  stride  The spacing in number of elements between rows
   *                     (or elements)
   */
  typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
                                    int stride);

  // Declare some of the forward non-vectorized transforms which are used in
  // some of the vectorized implementations. All of them share the
  // aom_fft_1d_func_t signature.
  void aom_fft1d_4_float(const float *input, float *output, int stride);
  void aom_fft1d_8_float(const float *input, float *output, int stride);
  void aom_fft1d_16_float(const float *input, float *output, int stride);
  void aom_fft1d_32_float(const float *input, float *output, int stride);

  /*!\brief Function pointer for transposing a matrix of floats.
   *
   * \param[in]  input   Input buffer (size n x n)
   * \param[out] output  Output buffer (size n x n)
   * \param[in]  n       Extent of one dimension of the square matrix.
   */
  typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
                                           int n);

  /*!\brief Function pointer for re-arranging intermediate 2d transform
   * results.
   *
   * After re-arrangement, the real and imaginary components will be packed
   * tightly next to each other.
   *
   * \param[in]  input   Input buffer (size n x n)
   * \param[out] output  Output buffer (size 2 x n x n)
   * \param[in]  n       Extent of one dimension of the square matrix.
   */
  typedef void (*aom_fft_unpack_func_t)(const float *input, float *output,
                                        int n);

  /*!\brief Performs a 2d fft with the given functions.
   *
   * This generator function allows for multiple different implementations of
   * 2d fft with different vector operations, without having to redefine the
   * main body multiple times.
   *
   * \param[in]  input      Input buffer to run the transform on (size n x n)
   * \param[out] temp       Working buffer for computing the transform
   *                        (size n x n)
   * \param[out] output     Output buffer (size 2 x n x n)
   * \param[in]  n          Extent of one dimension of the square input block
   * \param[in]  tform      Forward transform function
   * \param[in]  transpose  Transpose function (for n x n matrix)
   * \param[in]  unpack     Unpack function used to massage outputs to correct
   *                        form
   * \param[in]  vec_size   Vector size (the transform is done vec_size units
   *                        at a time)
   */
  void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
                      aom_fft_1d_func_t tform,
                      aom_fft_transpose_func_t transpose,
                      aom_fft_unpack_func_t unpack, int vec_size);

  /*!\brief Perform a 2d inverse fft with the given helper functions
   *
   * \param[in]  input       Input buffer to run the transform on
   *                         (size 2 x n x n)
   * \param[out] temp        Working buffer for computations (size 2 x n x n)
   * \param[out] output      Output buffer (size n x n)
   * \param[in]  n           Extent of one dimension of the square output block
   * \param[in]  fft_single  Forward transform function (non vectorized)
   * \param[in]  fft_multi   Forward transform function (vectorized)
   * \param[in]  ifft_multi  Inverse transform function (vectorized)
   * \param[in]  transpose   Transpose function (for n x n matrix)
   * \param[in]  vec_size    Vector size (the transform is done vec_size
   *                         units at a time)
   */
  void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
                       aom_fft_1d_func_t fft_single,
                       aom_fft_1d_func_t fft_multi,
                       aom_fft_1d_func_t ifft_multi,
                       aom_fft_transpose_func_t transpose, int vec_size);
  102. #ifdef __cplusplus
  103. }
  104. #endif
  // The macros below define 1D fft/ifft for different data types and for
  // different simd vector intrinsic types.

  /* GEN_FFT_2: defines `ret aom_fft1d_2_<suffix>(const T *input, T *output,
   * int stride)`, a 2-point forward transform.
   *
   *   ret     return type (plus any specifiers) placed before the function name
   *   suffix  token pasted onto `aom_fft1d_2_` to form the function name
   *   T       element type of the input/output pointers
   *   T_VEC   type of the values produced by `load` and consumed by `store`
   *   load    load(ptr)        -> T_VEC
   *   store   store(ptr, val)  writes a T_VEC
   *
   * Element k is read from `input + k * stride` and written to
   * `output + k * stride`. The outputs are the sum and the difference of the
   * two inputs.
   *
   * Unlike the larger generators, this one applies the built-in `+` and `-`
   * operators to T_VEC directly, so T_VEC must support them.
   */
  #define GEN_FFT_2(ret, suffix, T, T_VEC, load, store)               \
    ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
      const T_VEC i0 = load(input + 0 * stride);                      \
      const T_VEC i1 = load(input + 1 * stride);                      \
      store(output + 0 * stride, i0 + i1);                            \
      store(output + 1 * stride, i0 - i1);                            \
    }
  /* GEN_FFT_4: defines `ret aom_fft1d_4_<suffix>(const T *input, T *output,
   * int stride)`, a 4-point forward transform of real input.
   *
   *   ret       return type (plus any specifiers) placed before the function name
   *   suffix    token pasted onto `aom_fft1d_4_` to form the function name
   *   T         element type of the input/output pointers
   *   T_VEC     type of the intermediate values
   *   load      load(ptr)        -> T_VEC
   *   store     store(ptr, val)  writes a T_VEC
   *   constant  constant(float literal) -> T_VEC
   *   add, sub  binary operations on T_VEC
   *
   * Element k is read from `input + k * stride` and written to
   * `output + k * stride`. The four outputs are, in order: r_0, r_1, r_2, i_1
   * (the packed layout described for aom_fft_1d_func_t).
   */
  #define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
    ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) {       \
      const T_VEC kWeight0 = constant(0.0f);                                \
      const T_VEC i0 = load(input + 0 * stride);                            \
      const T_VEC i1 = load(input + 1 * stride);                            \
      const T_VEC i2 = load(input + 2 * stride);                            \
      const T_VEC i3 = load(input + 3 * stride);                            \
      const T_VEC w0 = add(i0, i2);                                         \
      const T_VEC w1 = sub(i0, i2);                                         \
      const T_VEC w2 = add(i1, i3);                                         \
      const T_VEC w3 = sub(i1, i3);                                         \
      store(output + 0 * stride, add(w0, w2));                              \
      store(output + 1 * stride, w1);                                       \
      store(output + 2 * stride, sub(w0, w2));                              \
      store(output + 3 * stride, sub(kWeight0, w3));                        \
    }
  /* GEN_FFT_8: defines `ret aom_fft1d_8_<suffix>(const T *input, T *output,
   * int stride)`, an 8-point forward transform of real input.
   *
   *   ret       return type (plus any specifiers) placed before the function name
   *   suffix    token pasted onto `aom_fft1d_8_` to form the function name
   *   T         element type of the input/output pointers
   *   T_VEC     type of the intermediate values
   *   load      load(ptr)        -> T_VEC
   *   store     store(ptr, val)  writes a T_VEC
   *   constant  constant(float literal) -> T_VEC
   *   add, sub, mul  binary operations on T_VEC
   *
   * Element k is read from `input + k * stride` and written to
   * `output + k * stride`. Outputs 0..4 hold r_0..r_4 and outputs 5..7 hold
   * i_1..i_3 (the packed layout described for aom_fft_1d_func_t).
   * kWeight2 is approximately cos(pi/4).
   */
  #define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
                    mul)                                                    \
    ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) {       \
      const T_VEC kWeight0 = constant(0.0f);                                \
      const T_VEC kWeight2 = constant(0.707107f);                           \
      const T_VEC i0 = load(input + 0 * stride);                            \
      const T_VEC i1 = load(input + 1 * stride);                            \
      const T_VEC i2 = load(input + 2 * stride);                            \
      const T_VEC i3 = load(input + 3 * stride);                            \
      const T_VEC i4 = load(input + 4 * stride);                            \
      const T_VEC i5 = load(input + 5 * stride);                            \
      const T_VEC i6 = load(input + 6 * stride);                            \
      const T_VEC i7 = load(input + 7 * stride);                            \
      const T_VEC w0 = add(i0, i4);                                         \
      const T_VEC w1 = sub(i0, i4);                                         \
      const T_VEC w2 = add(i2, i6);                                         \
      const T_VEC w3 = sub(i2, i6);                                         \
      const T_VEC w4 = add(w0, w2);                                         \
      const T_VEC w5 = sub(w0, w2);                                         \
      const T_VEC w7 = add(i1, i5);                                         \
      const T_VEC w8 = sub(i1, i5);                                         \
      const T_VEC w9 = add(i3, i7);                                         \
      const T_VEC w10 = sub(i3, i7);                                        \
      const T_VEC w11 = add(w7, w9);                                        \
      const T_VEC w12 = sub(w7, w9);                                        \
      store(output + 0 * stride, add(w4, w11));                             \
      store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10))));     \
      store(output + 2 * stride, w5);                                       \
      store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10))));     \
      store(output + 4 * stride, sub(w4, w11));                             \
      store(output + 5 * stride,                                            \
            sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8))));           \
      store(output + 6 * stride, sub(kWeight0, w12));                       \
      store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8))));     \
    }
  /* GEN_FFT_16: defines `ret aom_fft1d_16_<suffix>(const T *input, T *output,
   * int stride)`, a 16-point forward transform of real input.
   *
   *   ret       return type (plus any specifiers) placed before the function name
   *   suffix    token pasted onto `aom_fft1d_16_` to form the function name
   *   T         element type of the input/output pointers
   *   T_VEC     type of the intermediate values
   *   load      load(ptr)        -> T_VEC
   *   store     store(ptr, val)  writes a T_VEC
   *   constant  constant(float literal) -> T_VEC
   *   add, sub, mul  binary operations on T_VEC
   *
   * Element k is read from `input + k * stride` and written to
   * `output + k * stride`. Outputs 0..8 hold r_0..r_8 and outputs 9..15 hold
   * i_1..i_7 (the packed layout described for aom_fft_1d_func_t).
   * Two-element temporaries hold a { real, imaginary } pair.
   * kWeight2/3/4 are approximately cos(pi/4), cos(pi/8) and sin(pi/8).
   */
  #define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
                     mul)                                                    \
    ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) {       \
      const T_VEC kWeight0 = constant(0.0f);                                 \
      const T_VEC kWeight2 = constant(0.707107f);                            \
      const T_VEC kWeight3 = constant(0.92388f);                             \
      const T_VEC kWeight4 = constant(0.382683f);                            \
      const T_VEC i0 = load(input + 0 * stride);                             \
      const T_VEC i1 = load(input + 1 * stride);                             \
      const T_VEC i2 = load(input + 2 * stride);                             \
      const T_VEC i3 = load(input + 3 * stride);                             \
      const T_VEC i4 = load(input + 4 * stride);                             \
      const T_VEC i5 = load(input + 5 * stride);                             \
      const T_VEC i6 = load(input + 6 * stride);                             \
      const T_VEC i7 = load(input + 7 * stride);                             \
      const T_VEC i8 = load(input + 8 * stride);                             \
      const T_VEC i9 = load(input + 9 * stride);                             \
      const T_VEC i10 = load(input + 10 * stride);                           \
      const T_VEC i11 = load(input + 11 * stride);                           \
      const T_VEC i12 = load(input + 12 * stride);                           \
      const T_VEC i13 = load(input + 13 * stride);                           \
      const T_VEC i14 = load(input + 14 * stride);                           \
      const T_VEC i15 = load(input + 15 * stride);                           \
      const T_VEC w0 = add(i0, i8);                                          \
      const T_VEC w1 = sub(i0, i8);                                          \
      const T_VEC w2 = add(i4, i12);                                         \
      const T_VEC w3 = sub(i4, i12);                                         \
      const T_VEC w4 = add(w0, w2);                                          \
      const T_VEC w5 = sub(w0, w2);                                          \
      const T_VEC w7 = add(i2, i10);                                         \
      const T_VEC w8 = sub(i2, i10);                                         \
      const T_VEC w9 = add(i6, i14);                                         \
      const T_VEC w10 = sub(i6, i14);                                        \
      const T_VEC w11 = add(w7, w9);                                         \
      const T_VEC w12 = sub(w7, w9);                                         \
      const T_VEC w14 = add(w4, w11);                                        \
      const T_VEC w15 = sub(w4, w11);                                        \
      const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),           \
                             sub(sub(kWeight0, w3),                          \
                                 mul(kWeight2, add(w10, w8))) };             \
      const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),           \
                             sub(w3, mul(kWeight2, add(w10, w8))) };         \
      const T_VEC w19 = add(i1, i9);                                         \
      const T_VEC w20 = sub(i1, i9);                                         \
      const T_VEC w21 = add(i5, i13);                                        \
      const T_VEC w22 = sub(i5, i13);                                        \
      const T_VEC w23 = add(w19, w21);                                       \
      const T_VEC w24 = sub(w19, w21);                                       \
      const T_VEC w26 = add(i3, i11);                                        \
      const T_VEC w27 = sub(i3, i11);                                        \
      const T_VEC w28 = add(i7, i15);                                        \
      const T_VEC w29 = sub(i7, i15);                                        \
      const T_VEC w30 = add(w26, w28);                                       \
      const T_VEC w31 = sub(w26, w28);                                       \
      const T_VEC w33 = add(w23, w30);                                       \
      const T_VEC w34 = sub(w23, w30);                                       \
      const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),         \
                             sub(sub(kWeight0, w22),                         \
                                 mul(kWeight2, add(w29, w27))) };            \
      const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),         \
                             sub(w22, mul(kWeight2, add(w29, w27))) };       \
      store(output + 0 * stride, add(w14, w33));                             \
      store(output + 1 * stride,                                             \
            add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
      store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31))));     \
      store(output + 3 * stride,                                             \
            add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
      store(output + 4 * stride, w15);                                       \
      store(output + 5 * stride,                                             \
            add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])),            \
                            mul(kWeight3, w37[1]))));                        \
      store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31))));     \
      store(output + 7 * stride,                                             \
            add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])),            \
                            mul(kWeight4, w35[1]))));                        \
      store(output + 8 * stride, sub(w14, w33));                             \
      store(output + 9 * stride,                                             \
            add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
      store(output + 10 * stride,                                            \
            sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24))));          \
      store(output + 11 * stride,                                            \
            add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
      store(output + 12 * stride, sub(kWeight0, w34));                       \
      store(output + 13 * stride,                                            \
            sub(sub(kWeight0, w18[1]),                                       \
                sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))));         \
      store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24))));   \
      store(output + 15 * stride,                                            \
            sub(sub(kWeight0, w16[1]),                                       \
                sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))));         \
    }
  /* GEN_FFT_32: defines `ret aom_fft1d_32_<suffix>(const T *input, T *output,
   * int stride)`, a 32-point forward transform of real input.
   *
   *   ret       return type (plus any specifiers) placed before the function name
   *   suffix    token pasted onto `aom_fft1d_32_` to form the function name
   *   T         element type of the input/output pointers
   *   T_VEC     type of the intermediate values
   *   load      load(ptr)        -> T_VEC
   *   store     store(ptr, val)  writes a T_VEC
   *   constant  constant(float literal) -> T_VEC
   *   add, sub, mul  binary operations on T_VEC
   *
   * Element k is read from `input + k * stride` and written to
   * `output + k * stride`. Outputs 0..16 hold r_0..r_16 and outputs 17..31
   * hold i_1..i_15 (the packed layout described for aom_fft_1d_func_t).
   * Two-element temporaries hold a { real, imaginary } pair.
   * kWeight2..kWeight8 are approximately cos(pi/4), cos(pi/8), sin(pi/8),
   * cos(pi/16), sin(pi/16), cos(3pi/16) and sin(3pi/16).
   */
  #define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub,   \
                     mul)                                                      \
    ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) {         \
      const T_VEC kWeight0 = constant(0.0f);                                   \
      const T_VEC kWeight2 = constant(0.707107f);                              \
      const T_VEC kWeight3 = constant(0.92388f);                               \
      const T_VEC kWeight4 = constant(0.382683f);                              \
      const T_VEC kWeight5 = constant(0.980785f);                              \
      const T_VEC kWeight6 = constant(0.19509f);                               \
      const T_VEC kWeight7 = constant(0.83147f);                               \
      const T_VEC kWeight8 = constant(0.55557f);                               \
      const T_VEC i0 = load(input + 0 * stride);                               \
      const T_VEC i1 = load(input + 1 * stride);                               \
      const T_VEC i2 = load(input + 2 * stride);                               \
      const T_VEC i3 = load(input + 3 * stride);                               \
      const T_VEC i4 = load(input + 4 * stride);                               \
      const T_VEC i5 = load(input + 5 * stride);                               \
      const T_VEC i6 = load(input + 6 * stride);                               \
      const T_VEC i7 = load(input + 7 * stride);                               \
      const T_VEC i8 = load(input + 8 * stride);                               \
      const T_VEC i9 = load(input + 9 * stride);                               \
      const T_VEC i10 = load(input + 10 * stride);                             \
      const T_VEC i11 = load(input + 11 * stride);                             \
      const T_VEC i12 = load(input + 12 * stride);                             \
      const T_VEC i13 = load(input + 13 * stride);                             \
      const T_VEC i14 = load(input + 14 * stride);                             \
      const T_VEC i15 = load(input + 15 * stride);                             \
      const T_VEC i16 = load(input + 16 * stride);                             \
      const T_VEC i17 = load(input + 17 * stride);                             \
      const T_VEC i18 = load(input + 18 * stride);                             \
      const T_VEC i19 = load(input + 19 * stride);                             \
      const T_VEC i20 = load(input + 20 * stride);                             \
      const T_VEC i21 = load(input + 21 * stride);                             \
      const T_VEC i22 = load(input + 22 * stride);                             \
      const T_VEC i23 = load(input + 23 * stride);                             \
      const T_VEC i24 = load(input + 24 * stride);                             \
      const T_VEC i25 = load(input + 25 * stride);                             \
      const T_VEC i26 = load(input + 26 * stride);                             \
      const T_VEC i27 = load(input + 27 * stride);                             \
      const T_VEC i28 = load(input + 28 * stride);                             \
      const T_VEC i29 = load(input + 29 * stride);                             \
      const T_VEC i30 = load(input + 30 * stride);                             \
      const T_VEC i31 = load(input + 31 * stride);                             \
      const T_VEC w0 = add(i0, i16);                                           \
      const T_VEC w1 = sub(i0, i16);                                           \
      const T_VEC w2 = add(i8, i24);                                           \
      const T_VEC w3 = sub(i8, i24);                                           \
      const T_VEC w4 = add(w0, w2);                                            \
      const T_VEC w5 = sub(w0, w2);                                            \
      const T_VEC w7 = add(i4, i20);                                           \
      const T_VEC w8 = sub(i4, i20);                                           \
      const T_VEC w9 = add(i12, i28);                                          \
      const T_VEC w10 = sub(i12, i28);                                         \
      const T_VEC w11 = add(w7, w9);                                           \
      const T_VEC w12 = sub(w7, w9);                                           \
      const T_VEC w14 = add(w4, w11);                                          \
      const T_VEC w15 = sub(w4, w11);                                          \
      const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),             \
                             sub(sub(kWeight0, w3),                            \
                                 mul(kWeight2, add(w10, w8))) };               \
      const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),             \
                             sub(w3, mul(kWeight2, add(w10, w8))) };           \
      const T_VEC w19 = add(i2, i18);                                          \
      const T_VEC w20 = sub(i2, i18);                                          \
      const T_VEC w21 = add(i10, i26);                                         \
      const T_VEC w22 = sub(i10, i26);                                         \
      const T_VEC w23 = add(w19, w21);                                         \
      const T_VEC w24 = sub(w19, w21);                                         \
      const T_VEC w26 = add(i6, i22);                                          \
      const T_VEC w27 = sub(i6, i22);                                          \
      const T_VEC w28 = add(i14, i30);                                         \
      const T_VEC w29 = sub(i14, i30);                                         \
      const T_VEC w30 = add(w26, w28);                                         \
      const T_VEC w31 = sub(w26, w28);                                         \
      const T_VEC w33 = add(w23, w30);                                         \
      const T_VEC w34 = sub(w23, w30);                                         \
      const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),           \
                             sub(sub(kWeight0, w22),                           \
                                 mul(kWeight2, add(w29, w27))) };              \
      const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),           \
                             sub(w22, mul(kWeight2, add(w29, w27))) };         \
      const T_VEC w38 = add(w14, w33);                                         \
      const T_VEC w39 = sub(w14, w33);                                         \
      const T_VEC w40[2] = {                                                   \
        add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))),        \
        add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))         \
      };                                                                       \
      const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))),            \
                             sub(sub(kWeight0, w12),                           \
                                 mul(kWeight2, add(w31, w24))) };              \
      const T_VEC w42[2] = {                                                   \
        add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))),        \
        add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))         \
      };                                                                       \
      const T_VEC w44[2] = {                                                   \
        add(w18[0],                                                            \
            sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
        sub(sub(kWeight0, w18[1]),                                             \
            sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))                 \
      };                                                                       \
      const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))),            \
                             sub(w12, mul(kWeight2, add(w31, w24))) };         \
      const T_VEC w46[2] = {                                                   \
        add(w16[0],                                                            \
            sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
        sub(sub(kWeight0, w16[1]),                                             \
            sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))                 \
      };                                                                       \
      const T_VEC w47 = add(i1, i17);                                          \
      const T_VEC w48 = sub(i1, i17);                                          \
      const T_VEC w49 = add(i9, i25);                                          \
      const T_VEC w50 = sub(i9, i25);                                          \
      const T_VEC w51 = add(w47, w49);                                         \
      const T_VEC w52 = sub(w47, w49);                                         \
      const T_VEC w54 = add(i5, i21);                                          \
      const T_VEC w55 = sub(i5, i21);                                          \
      const T_VEC w56 = add(i13, i29);                                         \
      const T_VEC w57 = sub(i13, i29);                                         \
      const T_VEC w58 = add(w54, w56);                                         \
      const T_VEC w59 = sub(w54, w56);                                         \
      const T_VEC w61 = add(w51, w58);                                         \
      const T_VEC w62 = sub(w51, w58);                                         \
      const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))),           \
                             sub(sub(kWeight0, w50),                           \
                                 mul(kWeight2, add(w57, w55))) };              \
      const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))),           \
                             sub(w50, mul(kWeight2, add(w57, w55))) };         \
      const T_VEC w66 = add(i3, i19);                                          \
      const T_VEC w67 = sub(i3, i19);                                          \
      const T_VEC w68 = add(i11, i27);                                         \
      const T_VEC w69 = sub(i11, i27);                                         \
      const T_VEC w70 = add(w66, w68);                                         \
      const T_VEC w71 = sub(w66, w68);                                         \
      const T_VEC w73 = add(i7, i23);                                          \
      const T_VEC w74 = sub(i7, i23);                                          \
      const T_VEC w75 = add(i15, i31);                                         \
      const T_VEC w76 = sub(i15, i31);                                         \
      const T_VEC w77 = add(w73, w75);                                         \
      const T_VEC w78 = sub(w73, w75);                                         \
      const T_VEC w80 = add(w70, w77);                                         \
      const T_VEC w81 = sub(w70, w77);                                         \
      const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))),           \
                             sub(sub(kWeight0, w69),                           \
                                 mul(kWeight2, add(w76, w74))) };              \
      const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))),           \
                             sub(w69, mul(kWeight2, add(w76, w74))) };         \
      const T_VEC w85 = add(w61, w80);                                         \
      const T_VEC w86 = sub(w61, w80);                                         \
      const T_VEC w87[2] = {                                                   \
        add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))),        \
        add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0])))         \
      };                                                                       \
      const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))),           \
                             sub(sub(kWeight0, w59),                           \
                                 mul(kWeight2, add(w78, w71))) };              \
      const T_VEC w89[2] = {                                                   \
        add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))),        \
        add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0])))         \
      };                                                                       \
      const T_VEC w91[2] = {                                                   \
        add(w65[0],                                                            \
            sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
        sub(sub(kWeight0, w65[1]),                                             \
            sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1])))                 \
      };                                                                       \
      const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))),           \
                             sub(w59, mul(kWeight2, add(w78, w71))) };         \
      const T_VEC w93[2] = {                                                   \
        add(w63[0],                                                            \
            sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
        sub(sub(kWeight0, w63[1]),                                             \
            sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1])))                 \
      };                                                                       \
      store(output + 0 * stride, add(w38, w85));                               \
      store(output + 1 * stride,                                               \
            add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1]))));   \
      store(output + 2 * stride,                                               \
            add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1]))));   \
      store(output + 3 * stride,                                               \
            add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1]))));   \
      store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81))));      \
      store(output + 5 * stride,                                               \
            add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1]))));   \
      store(output + 6 * stride,                                               \
            add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1]))));   \
      store(output + 7 * stride,                                               \
            add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1]))));   \
      store(output + 8 * stride, w39);                                         \
      store(output + 9 * stride,                                               \
            add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])),              \
                            mul(kWeight5, w93[1]))));                          \
      store(output + 10 * stride,                                              \
            add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])),              \
                            mul(kWeight3, w92[1]))));                          \
      store(output + 11 * stride,                                              \
            add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])),              \
                            mul(kWeight7, w91[1]))));                          \
      store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81))));     \
      store(output + 13 * stride,                                              \
            add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])),              \
                            mul(kWeight8, w89[1]))));                          \
      store(output + 14 * stride,                                              \
            add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])),              \
                            mul(kWeight4, w88[1]))));                          \
      store(output + 15 * stride,                                              \
            add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])),              \
                            mul(kWeight6, w87[1]))));                          \
      store(output + 16 * stride, sub(w38, w85));                              \
      store(output + 17 * stride,                                              \
            add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0]))));   \
      store(output + 18 * stride,                                              \
            add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0]))));   \
      store(output + 19 * stride,                                              \
            add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0]))));   \
      store(output + 20 * stride,                                              \
            sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62))));            \
      store(output + 21 * stride,                                              \
            add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0]))));   \
      store(output + 22 * stride,                                              \
            add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0]))));   \
      store(output + 23 * stride,                                              \
            add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0]))));   \
      store(output + 24 * stride, sub(kWeight0, w86));                         \
      store(output + 25 * stride,                                              \
            sub(sub(kWeight0, w46[1]),                                         \
                sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1]))));           \
      store(output + 26 * stride,                                              \
            sub(sub(kWeight0, w45[1]),                                         \
                sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1]))));           \
      store(output + 27 * stride,                                              \
            sub(sub(kWeight0, w44[1]),                                         \
                sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1]))));           \
      store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62))));     \
      store(output + 29 * stride,                                              \
            sub(sub(kWeight0, w42[1]),                                         \
                sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1]))));           \
      store(output + 30 * stride,                                              \
            sub(sub(kWeight0, w41[1]),                                         \
                sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1]))));           \
      store(output + 31 * stride,                                              \
            sub(sub(kWeight0, w40[1]),                                         \
                sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1]))));           \
    }
  /* GEN_IFFT_2: defines `ret aom_ifft1d_2_<suffix>(const T *input, T *output,
   * int stride)`, a 2-point inverse transform.
   *
   *   ret     return type (plus any specifiers) placed before the function name
   *   suffix  token pasted onto `aom_ifft1d_2_` to form the function name
   *   T       element type of the input/output pointers
   *   T_VEC   type of the values produced by `load` and consumed by `store`
   *   load    load(ptr)        -> T_VEC
   *   store   store(ptr, val)  writes a T_VEC
   *
   * Element k is read from `input + k * stride` and written to
   * `output + k * stride`. The outputs are the sum and the difference of the
   * two inputs; no 1/n scaling is applied.
   *
   * Like GEN_FFT_2, this applies the built-in `+` and `-` operators to T_VEC
   * directly, so T_VEC must support them.
   */
  #define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store)               \
    ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
      const T_VEC i0 = load(input + 0 * stride);                       \
      const T_VEC i1 = load(input + 1 * stride);                       \
      store(output + 0 * stride, i0 + i1);                             \
      store(output + 1 * stride, i0 - i1);                             \
    }
  /* GEN_IFFT_4: defines `ret aom_ifft1d_4_<suffix>(const T *input, T *output,
   * int stride)`, a 4-point inverse transform producing real output.
   *
   *   ret       return type (plus any specifiers) placed before the function name
   *   suffix    token pasted onto `aom_ifft1d_4_` to form the function name
   *   T         element type of the input/output pointers
   *   T_VEC     type of the intermediate values
   *   load      load(ptr)        -> T_VEC
   *   store     store(ptr, val)  writes a T_VEC
   *   constant  constant(float literal) -> T_VEC
   *   add, sub  binary operations on T_VEC
   *
   * Element k is read from `input + k * stride` and written to
   * `output + k * stride`. The input is read as r_0, r_1, r_2, i_1 (the packed
   * layout described for aom_fft_1d_func_t). No 1/n scaling is applied: a
   * forward transform followed by this one yields 4x the original samples.
   *
   * w4[1] and w5[0] are computed but never read.
   */
  #define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
    ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) {       \
      const T_VEC kWeight0 = constant(0.0f);                                 \
      const T_VEC i0 = load(input + 0 * stride);                             \
      const T_VEC i1 = load(input + 1 * stride);                             \
      const T_VEC i2 = load(input + 2 * stride);                             \
      const T_VEC i3 = load(input + 3 * stride);                             \
      const T_VEC w2 = add(i0, i2);                                          \
      const T_VEC w3 = sub(i0, i2);                                          \
      const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) };                      \
      const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) };       \
      store(output + 0 * stride, add(w2, w4[0]));                            \
      store(output + 1 * stride, add(w3, w5[1]));                            \
      store(output + 2 * stride, sub(w2, w4[0]));                            \
      store(output + 3 * stride, sub(w3, w5[1]));                            \
    }
  /* GEN_IFFT_8: defines `ret aom_ifft1d_8_<suffix>(const T *input, T *output,
   * int stride)`, an 8-point inverse transform producing real output.
   *
   *   ret       return type (plus any specifiers) placed before the function name
   *   suffix    token pasted onto `aom_ifft1d_8_` to form the function name
   *   T         element type of the input/output pointers
   *   T_VEC     type of the intermediate values
   *   load      load(ptr)        -> T_VEC
   *   store     store(ptr, val)  writes a T_VEC
   *   constant  constant(float literal) -> T_VEC
   *   add, sub, mul  binary operations on T_VEC
   *
   * Element k is read from `input + k * stride` and written to
   * `output + k * stride`. Inputs 0..4 are read as r_0..r_4 and inputs 5..7 as
   * i_1..i_3 (the packed layout described for aom_fft_1d_func_t). No 1/n
   * scaling is applied: a forward transform followed by this one yields 8x the
   * original samples.
   *
   * Two-element temporaries hold a { real, imaginary } pair; several of their
   * slots are computed but never read. kWeight2 is approximately cos(pi/4).
   */
  #define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
                     mul)                                                    \
    ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) {       \
      const T_VEC kWeight0 = constant(0.0f);                                 \
      const T_VEC kWeight2 = constant(0.707107f);                            \
      const T_VEC i0 = load(input + 0 * stride);                             \
      const T_VEC i1 = load(input + 1 * stride);                             \
      const T_VEC i2 = load(input + 2 * stride);                             \
      const T_VEC i3 = load(input + 3 * stride);                             \
      const T_VEC i4 = load(input + 4 * stride);                             \
      const T_VEC i5 = load(input + 5 * stride);                             \
      const T_VEC i6 = load(input + 6 * stride);                             \
      const T_VEC i7 = load(input + 7 * stride);                             \
      const T_VEC w6 = add(i0, i4);                                          \
      const T_VEC w7 = sub(i0, i4);                                          \
      const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) };                      \
      const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) };       \
      const T_VEC w10[2] = { add(w6, w8[0]), w8[1] };                        \
      const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) };         \
      const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) };         \
      const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] };                        \
      const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) };                     \
      const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) };      \
      const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) };                     \
      const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) };      \
      const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) };     \
      const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) };     \
      const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) };     \
      const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) };     \
      store(output + 0 * stride, add(w10[0], w18[0]));                       \
      store(output + 1 * stride,                                             \
            add(w12[0], mul(kWeight2, add(w20[0], w20[1]))));                \
      store(output + 2 * stride, add(w11[0], w19[1]));                       \
      store(output + 3 * stride,                                             \
            sub(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
      store(output + 4 * stride, sub(w10[0], w18[0]));                       \
      store(output + 5 * stride,                                             \
            add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])),            \
                            mul(kWeight2, w20[1]))));                        \
      store(output + 6 * stride, sub(w11[0], w19[1]));                       \
      store(output + 7 * stride,                                             \
            add(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
    }
  // GEN_IFFT_16 expands to the definition of
  //     ret aom_ifft1d_16_<suffix>(const T *input, T *output, int stride)
  // which, going by its name, is a 16-point 1-D inverse FFT kernel. The body
  // is completely unrolled: straight-line code, no loops and no branches.
  //
  // Macro parameters
  //   ret      - tokens placed in front of the function name (return type and
  //              any specifiers). The body contains no return statement, so
  //              this is presumably `void` plus qualifiers - TODO confirm at
  //              the instantiation sites.
  //   suffix   - token pasted after "aom_ifft1d_16_" to form the function name.
  //   T        - element type pointed to by `input` and `output`.
  //   T_VEC    - type of every constant and intermediate value.
  //   load     - load(const T *) yielding a T_VEC.
  //   store    - store(T *, T_VEC).
  //   constant - constant(float literal) yielding a T_VEC.
  //   add/sub/mul - two-operand T_VEC arithmetic.
  //
  // Generated function
  //   * Reads input + k * stride and writes output + k * stride for k = 0..15.
  //     All 16 loads are issued before the first store.
  //   * kWeight0 is 0 and is only used as sub(kWeight0, x), i.e. to negate x
  //     (no negate operation is passed to the macro).
  //   * kWeight2 = 0.707107 ~ cos(pi/4) = sin(pi/4),
  //     kWeight3 = 0.92388  ~ cos(pi/8),
  //     kWeight4 = 0.382683 ~ sin(pi/8).
  //   * Temporaries declared as w[2] appear to hold a {real, imaginary} pair
  //     (NOTE(review): inferred from how [0] and [1] are combined - confirm).
  //   * w16/w17 are built from add(i4, i4), sub(i12, i12) and sub(i4, i4),
  //     i.e. 2*i4, 0 and 0; this looks like an artefact of machine-generated
  //     code rather than a typo (TODO confirm against the generator).
  //   * Data flow:
  //       w14..w21  use inputs i0, i4, i8, i12 only;
  //       w22..w29  use inputs i2, i6, i10, i14 only;
  //       w30..w37  combine w18..w21 with w26..w29 (weights: kWeight2);
  //       w38..w53  use the odd-index inputs i1, i3, ..., i15;
  //       w54..w61  combine w42..w45 with w50..w53 (weights: kWeight2).
  //   * Outputs k and k + 8 (k = 0..7) are the sum and the difference of the
  //     same two terms, e.g. output 0 = w30[0] + w54[0] and
  //     output 8 = w30[0] - w54[0].
  //   * No 1/16 scaling is applied here: every mul() has one of the kWeight
  //     constants above as its first operand.
  //   * NOTE(review): the input appears to be a packed half-spectrum (i0 and
  //     i8 are used as scalars, the other inputs are paired up) - confirm the
  //     exact layout against the transform's documentation.
  564. #define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
  565. mul) \
  566. ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \
  567. const T_VEC kWeight0 = constant(0.0f); \
  568. const T_VEC kWeight2 = constant(0.707107f); \
  569. const T_VEC kWeight3 = constant(0.92388f); \
  570. const T_VEC kWeight4 = constant(0.382683f); \
  571. const T_VEC i0 = load(input + 0 * stride); \
  572. const T_VEC i1 = load(input + 1 * stride); \
  573. const T_VEC i2 = load(input + 2 * stride); \
  574. const T_VEC i3 = load(input + 3 * stride); \
  575. const T_VEC i4 = load(input + 4 * stride); \
  576. const T_VEC i5 = load(input + 5 * stride); \
  577. const T_VEC i6 = load(input + 6 * stride); \
  578. const T_VEC i7 = load(input + 7 * stride); \
  579. const T_VEC i8 = load(input + 8 * stride); \
  580. const T_VEC i9 = load(input + 9 * stride); \
  581. const T_VEC i10 = load(input + 10 * stride); \
  582. const T_VEC i11 = load(input + 11 * stride); \
  583. const T_VEC i12 = load(input + 12 * stride); \
  584. const T_VEC i13 = load(input + 13 * stride); \
  585. const T_VEC i14 = load(input + 14 * stride); \
  586. const T_VEC i15 = load(input + 15 * stride); \
  587. const T_VEC w14 = add(i0, i8); \
  588. const T_VEC w15 = sub(i0, i8); \
  589. const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \
  590. const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \
  591. const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \
  592. const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \
  593. const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \
  594. const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \
  595. const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \
  596. const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \
  597. const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \
  598. const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \
  599. const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) }; \
  600. const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \
  601. const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \
  602. const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \
  603. const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; \
  604. const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \
  605. const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \
  606. add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
  607. const T_VEC w33[2] = { add(w20[0], \
  608. sub(sub(kWeight0, mul(kWeight2, w28[0])), \
  609. mul(kWeight2, w28[1]))), \
  610. add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
  611. const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \
  612. const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \
  613. const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
  614. sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
  615. const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
  616. add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
  617. const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \
  618. const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \
  619. const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \
  620. const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \
  621. const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
  622. const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
  623. const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
  624. const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
  625. const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \
  626. const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \
  627. const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \
  628. const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \
  629. const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \
  630. const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \
  631. const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \
  632. const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \
  633. const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; \
  634. const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \
  635. const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \
  636. add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
  637. const T_VEC w57[2] = { add(w44[0], \
  638. sub(sub(kWeight0, mul(kWeight2, w52[0])), \
  639. mul(kWeight2, w52[1]))), \
  640. add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
  641. const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \
  642. const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \
  643. const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
  644. sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
  645. const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
  646. add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
  647. store(output + 0 * stride, add(w30[0], w54[0])); \
  648. store(output + 1 * stride, \
  649. add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \
  650. store(output + 2 * stride, \
  651. add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \
  652. store(output + 3 * stride, \
  653. add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \
  654. store(output + 4 * stride, add(w31[0], w55[1])); \
  655. store(output + 5 * stride, \
  656. sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
  657. store(output + 6 * stride, \
  658. sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
  659. store(output + 7 * stride, \
  660. sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
  661. store(output + 8 * stride, sub(w30[0], w54[0])); \
  662. store(output + 9 * stride, \
  663. add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \
  664. mul(kWeight4, w56[1])))); \
  665. store(output + 10 * stride, \
  666. add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \
  667. mul(kWeight2, w58[1])))); \
  668. store(output + 11 * stride, \
  669. add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \
  670. mul(kWeight3, w60[1])))); \
  671. store(output + 12 * stride, sub(w31[0], w55[1])); \
  672. store(output + 13 * stride, \
  673. add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
  674. store(output + 14 * stride, \
  675. add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
  676. store(output + 15 * stride, \
  677. add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
  678. }
  // GEN_IFFT_32 expands to the definition of
  //     ret aom_ifft1d_32_<suffix>(const T *input, T *output, int stride)
  // which, going by its name, is a 32-point 1-D inverse FFT kernel. The body
  // is completely unrolled: straight-line code, no loops and no branches.
  //
  // Macro parameters
  //   ret      - tokens placed in front of the function name (return type and
  //              any specifiers). The body contains no return statement, so
  //              this is presumably `void` plus qualifiers - TODO confirm at
  //              the instantiation sites.
  //   suffix   - token pasted after "aom_ifft1d_32_" to form the function name.
  //   T        - element type pointed to by `input` and `output`.
  //   T_VEC    - type of every constant and intermediate value.
  //   load     - load(const T *) yielding a T_VEC.
  //   store    - store(T *, T_VEC).
  //   constant - constant(float literal) yielding a T_VEC.
  //   add/sub/mul - two-operand T_VEC arithmetic.
  //
  // Generated function
  //   * Reads input + k * stride and writes output + k * stride for k = 0..31.
  //     All 32 loads are issued before the first store.
  //   * kWeight0 is 0 and is only used as sub(kWeight0, x), i.e. to negate x
  //     (no negate operation is passed to the macro).
  //   * kWeight2 = 0.707107 ~ cos(pi/4) = sin(pi/4),
  //     kWeight3 = 0.92388  ~ cos(pi/8),    kWeight4 = 0.382683 ~ sin(pi/8),
  //     kWeight5 = 0.980785 ~ cos(pi/16),   kWeight6 = 0.19509  ~ sin(pi/16),
  //     kWeight7 = 0.83147  ~ cos(3*pi/16), kWeight8 = 0.55557  ~ sin(3*pi/16).
  //   * Temporaries declared as w[2] appear to hold a {real, imaginary} pair
  //     (NOTE(review): inferred from how [0] and [1] are combined - confirm).
  //   * w32/w33 are built from add(i8, i8), sub(i24, i24) and sub(i8, i8),
  //     i.e. 2*i8, 0 and 0; this looks like an artefact of machine-generated
  //     code rather than a typo (TODO confirm against the generator).
  //   * Data flow:
  //       w30..w37    use inputs i0, i8, i16, i24 only;
  //       w38..w45    use inputs i4, i12, i20, i28 only;
  //       w46..w53    combine w34..w37 with w42..w45 (weights: kWeight2);
  //       w54..w69    use inputs i2, i6, i10, ..., i30 only;
  //       w70..w77    combine w58..w61 with w66..w69 (weights: kWeight2);
  //       w78..w93    combine w46..w53 with w70..w77 (weights: kWeight2..4);
  //       w94..w141   use the odd-index inputs i1, i3, ..., i31 and are
  //                   reduced the same way as w54..w77;
  //       w142..w157  combine w110..w117 with w134..w141 (weights:
  //                   kWeight2..4).
  //   * Outputs k and k + 16 (k = 0..15) are the sum and the difference of
  //     the same two terms, e.g. output 0 = w78[0] + w142[0] and
  //     output 16 = w78[0] - w142[0]; kWeight5..8 are used only in this
  //     final step.
  //   * No 1/32 scaling is applied here: every mul() has one of the kWeight
  //     constants above as its first operand.
  //   * NOTE(review): the input appears to be a packed half-spectrum (i0 and
  //     i16 are used as scalars, the other inputs are paired up) - confirm
  //     the exact layout against the transform's documentation.
  679. #define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
  680. mul) \
  681. ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \
  682. const T_VEC kWeight0 = constant(0.0f); \
  683. const T_VEC kWeight2 = constant(0.707107f); \
  684. const T_VEC kWeight3 = constant(0.92388f); \
  685. const T_VEC kWeight4 = constant(0.382683f); \
  686. const T_VEC kWeight5 = constant(0.980785f); \
  687. const T_VEC kWeight6 = constant(0.19509f); \
  688. const T_VEC kWeight7 = constant(0.83147f); \
  689. const T_VEC kWeight8 = constant(0.55557f); \
  690. const T_VEC i0 = load(input + 0 * stride); \
  691. const T_VEC i1 = load(input + 1 * stride); \
  692. const T_VEC i2 = load(input + 2 * stride); \
  693. const T_VEC i3 = load(input + 3 * stride); \
  694. const T_VEC i4 = load(input + 4 * stride); \
  695. const T_VEC i5 = load(input + 5 * stride); \
  696. const T_VEC i6 = load(input + 6 * stride); \
  697. const T_VEC i7 = load(input + 7 * stride); \
  698. const T_VEC i8 = load(input + 8 * stride); \
  699. const T_VEC i9 = load(input + 9 * stride); \
  700. const T_VEC i10 = load(input + 10 * stride); \
  701. const T_VEC i11 = load(input + 11 * stride); \
  702. const T_VEC i12 = load(input + 12 * stride); \
  703. const T_VEC i13 = load(input + 13 * stride); \
  704. const T_VEC i14 = load(input + 14 * stride); \
  705. const T_VEC i15 = load(input + 15 * stride); \
  706. const T_VEC i16 = load(input + 16 * stride); \
  707. const T_VEC i17 = load(input + 17 * stride); \
  708. const T_VEC i18 = load(input + 18 * stride); \
  709. const T_VEC i19 = load(input + 19 * stride); \
  710. const T_VEC i20 = load(input + 20 * stride); \
  711. const T_VEC i21 = load(input + 21 * stride); \
  712. const T_VEC i22 = load(input + 22 * stride); \
  713. const T_VEC i23 = load(input + 23 * stride); \
  714. const T_VEC i24 = load(input + 24 * stride); \
  715. const T_VEC i25 = load(input + 25 * stride); \
  716. const T_VEC i26 = load(input + 26 * stride); \
  717. const T_VEC i27 = load(input + 27 * stride); \
  718. const T_VEC i28 = load(input + 28 * stride); \
  719. const T_VEC i29 = load(input + 29 * stride); \
  720. const T_VEC i30 = load(input + 30 * stride); \
  721. const T_VEC i31 = load(input + 31 * stride); \
  722. const T_VEC w30 = add(i0, i16); \
  723. const T_VEC w31 = sub(i0, i16); \
  724. const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \
  725. const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \
  726. const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \
  727. const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \
  728. const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \
  729. const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \
  730. const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \
  731. const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \
  732. const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \
  733. const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \
  734. const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
  735. const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
  736. const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
  737. const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
  738. const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \
  739. const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \
  740. const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \
  741. add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \
  742. const T_VEC w49[2] = { add(w36[0], \
  743. sub(sub(kWeight0, mul(kWeight2, w44[0])), \
  744. mul(kWeight2, w44[1]))), \
  745. add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \
  746. const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \
  747. const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \
  748. const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
  749. sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
  750. const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
  751. add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
  752. const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \
  753. const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \
  754. const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \
  755. const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \
  756. const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \
  757. const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \
  758. const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \
  759. const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \
  760. const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \
  761. const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \
  762. const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \
  763. const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \
  764. const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \
  765. const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \
  766. const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \
  767. const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \
  768. const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \
  769. const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \
  770. const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \
  771. add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \
  772. const T_VEC w73[2] = { add(w60[0], \
  773. sub(sub(kWeight0, mul(kWeight2, w68[0])), \
  774. mul(kWeight2, w68[1]))), \
  775. add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \
  776. const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; \
  777. const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \
  778. const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
  779. sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
  780. const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
  781. add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
  782. const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \
  783. const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \
  784. const T_VEC w80[2] = { \
  785. add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \
  786. add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \
  787. }; \
  788. const T_VEC w81[2] = { \
  789. add(w48[0], \
  790. sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \
  791. add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \
  792. }; \
  793. const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \
  794. add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \
  795. const T_VEC w83[2] = { add(w50[0], \
  796. sub(sub(kWeight0, mul(kWeight2, w74[0])), \
  797. mul(kWeight2, w74[1]))), \
  798. add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \
  799. const T_VEC w84[2] = { \
  800. add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \
  801. add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \
  802. }; \
  803. const T_VEC w85[2] = { \
  804. add(w52[0], \
  805. sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \
  806. add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \
  807. }; \
  808. const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \
  809. const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \
  810. const T_VEC w88[2] = { \
  811. sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
  812. add(w49[1], \
  813. sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \
  814. }; \
  815. const T_VEC w89[2] = { \
  816. add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
  817. add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \
  818. }; \
  819. const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
  820. sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
  821. const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
  822. add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
  823. const T_VEC w92[2] = { \
  824. sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
  825. add(w53[1], \
  826. sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \
  827. }; \
  828. const T_VEC w93[2] = { \
  829. add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
  830. add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \
  831. }; \
  832. const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \
  833. const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \
  834. const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \
  835. const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \
  836. const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \
  837. const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \
  838. const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \
  839. const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \
  840. const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \
  841. const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \
  842. const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \
  843. const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \
  844. const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \
  845. const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \
  846. const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \
  847. const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \
  848. const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \
  849. const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \
  850. const T_VEC w112[2] = { \
  851. add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \
  852. add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \
  853. }; \
  854. const T_VEC w113[2] = { \
  855. add(w100[0], \
  856. sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
  857. add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \
  858. }; \
  859. const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \
  860. const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \
  861. const T_VEC w116[2] = { \
  862. sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
  863. sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
  864. }; \
  865. const T_VEC w117[2] = { \
  866. add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
  867. add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
  868. }; \
  869. const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \
  870. const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \
  871. const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \
  872. const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \
  873. const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \
  874. const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \
  875. const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \
  876. const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \
  877. const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \
  878. const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \
  879. const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \
  880. const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) }; \
  881. const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \
  882. const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \
  883. const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \
  884. const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \
  885. const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \
  886. const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \
  887. const T_VEC w136[2] = { \
  888. add(w124[0], mul(kWeight2, add(w132[0], w132[1]))), \
  889. add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \
  890. }; \
  891. const T_VEC w137[2] = { \
  892. add(w124[0], \
  893. sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
  894. add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \
  895. }; \
  896. const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \
  897. const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \
  898. const T_VEC w140[2] = { \
  899. sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
  900. sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
  901. }; \
  902. const T_VEC w141[2] = { \
  903. add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
  904. add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
  905. }; \
  906. const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \
  907. const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \
  908. const T_VEC w144[2] = { \
  909. add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \
  910. add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \
  911. }; \
  912. const T_VEC w145[2] = { \
  913. add(w112[0], \
  914. sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
  915. add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \
  916. }; \
  917. const T_VEC w146[2] = { \
  918. add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \
  919. add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \
  920. }; \
  921. const T_VEC w147[2] = { \
  922. add(w114[0], \
  923. sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
  924. add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \
  925. }; \
  926. const T_VEC w148[2] = { \
  927. add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \
  928. add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \
  929. }; \
  930. const T_VEC w149[2] = { \
  931. add(w116[0], \
  932. sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
  933. add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \
  934. }; \
  935. const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \
  936. const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \
  937. const T_VEC w152[2] = { \
  938. sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
  939. add(w113[1], \
  940. sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \
  941. }; \
  942. const T_VEC w153[2] = { \
  943. add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
  944. add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \
  945. }; \
  946. const T_VEC w154[2] = { \
  947. sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
  948. sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
  949. }; \
  950. const T_VEC w155[2] = { \
  951. add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
  952. add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
  953. }; \
  954. const T_VEC w156[2] = { \
  955. sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
  956. add(w117[1], \
  957. sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \
  958. }; \
  959. const T_VEC w157[2] = { \
  960. add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
  961. add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \
  962. }; \
  963. store(output + 0 * stride, add(w78[0], w142[0])); \
  964. store(output + 1 * stride, \
  965. add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \
  966. store(output + 2 * stride, \
  967. add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \
  968. store(output + 3 * stride, \
  969. add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \
  970. store(output + 4 * stride, \
  971. add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \
  972. store(output + 5 * stride, \
  973. add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \
  974. store(output + 6 * stride, \
  975. add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \
  976. store(output + 7 * stride, \
  977. add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \
  978. store(output + 8 * stride, add(w79[0], w143[1])); \
  979. store(output + 9 * stride, \
  980. sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
  981. store(output + 10 * stride, \
  982. sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
  983. store(output + 11 * stride, \
  984. sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
  985. store(output + 12 * stride, \
  986. sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
  987. store(output + 13 * stride, \
  988. sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
  989. store(output + 14 * stride, \
  990. sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
  991. store(output + 15 * stride, \
  992. sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
  993. store(output + 16 * stride, sub(w78[0], w142[0])); \
  994. store(output + 17 * stride, \
  995. add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \
  996. mul(kWeight6, w144[1])))); \
  997. store(output + 18 * stride, \
  998. add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \
  999. mul(kWeight4, w146[1])))); \
  1000. store(output + 19 * stride, \
  1001. add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \
  1002. mul(kWeight8, w148[1])))); \
  1003. store(output + 20 * stride, \
  1004. add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \
  1005. mul(kWeight2, w150[1])))); \
  1006. store(output + 21 * stride, \
  1007. add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \
  1008. mul(kWeight7, w152[1])))); \
  1009. store(output + 22 * stride, \
  1010. add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \
  1011. mul(kWeight3, w154[1])))); \
  1012. store(output + 23 * stride, \
  1013. add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \
  1014. mul(kWeight5, w156[1])))); \
  1015. store(output + 24 * stride, sub(w79[0], w143[1])); \
  1016. store(output + 25 * stride, \
  1017. add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
  1018. store(output + 26 * stride, \
  1019. add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
  1020. store(output + 27 * stride, \
  1021. add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
  1022. store(output + 28 * stride, \
  1023. add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
  1024. store(output + 29 * stride, \
  1025. add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
  1026. store(output + 30 * stride, \
  1027. add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
  1028. store(output + 31 * stride, \
  1029. add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
  1030. }
  1031. #endif // AOM_AOM_DSP_FFT_COMMON_H_