  1. /*
  2. * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <arm_neon.h>
  11. #include "vpx_ports/mem.h"
/* VP8 6-tap sub-pixel interpolation filters, indexed by the 1/8-pel
 * offset (0..7).  Each row holds six signed taps, zero-padded to eight
 * entries so a whole row can be loaded with one vld1_s8.  The taps of
 * every row sum to 128, so filter outputs are normalized with a
 * rounding shift right by 7 (see the vqrshrun_n_s16(..., 7) calls). */
static const int8_t vp8_sub_pel_filters[8][8] = {
    {0, 0, 128, 0, 0, 0, 0, 0},     /* note that 1/8 pel positions are */
    {0, -6, 123, 12, -1, 0, 0, 0},  /* just as per alpha -0.5 bicubic */
    {2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */
    {0, -9, 93, 50, -6, 0, 0, 0},
    {3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */
    {0, -6, 50, 93, -9, 0, 0, 0},
    {1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */
    {0, -1, 12, 123, -6, 0, 0, 0},
};
/* 6-tap sub-pixel prediction of an 8x4 block, NEON implementation.
 *
 * src_ptr             : source pixel at the block origin
 * src_pixels_per_line : source stride, in bytes
 * xoffset             : horizontal 1/8-pel position (0..7); selects the
 *                       first-pass (horizontal) filter; 0 = no h-filtering
 * yoffset             : vertical 1/8-pel position (0..7); selects the
 *                       second-pass (vertical) filter; 0 = no v-filtering
 * dst_ptr             : destination; 8 bytes are written per row, 4 rows
 * dst_pitch           : destination stride, in bytes
 *
 * The filter taps are stored signed; their magnitudes are taken with
 * vabs_s8 and the negative taps (indices 1 and 4) are applied with
 * vmlsl_u8 (multiply-subtract), the positive ones with vmlal_u8.  The
 * center-tap product is accumulated separately and combined with a
 * saturating add before the final rounding shift by 7. */
void vp8_sixtap_predict8x4_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    unsigned char *src;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
    uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
    uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
    int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
    uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;

    if (xoffset == 0) {  // secondpass_filter8x4_only
        // load second_pass filter and broadcast each of the six taps
        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
        d0s8 = vdup_lane_s8(dtmps8, 0);
        d1s8 = vdup_lane_s8(dtmps8, 1);
        d2s8 = vdup_lane_s8(dtmps8, 2);
        d3s8 = vdup_lane_s8(dtmps8, 3);
        d4s8 = vdup_lane_s8(dtmps8, 4);
        d5s8 = vdup_lane_s8(dtmps8, 5);
        // magnitudes only; the sign is realized via vmlsl/vmlal below
        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
        // load src data: 9 rows (4 output rows + 5 extra for the 6-tap
        // vertical window), starting 2 rows above the block origin
        src = src_ptr - src_pixels_per_line * 2;
        d22u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d23u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d24u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d25u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d26u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d27u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d28u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d29u8 = vld1_u8(src);
        src += src_pixels_per_line;
        d30u8 = vld1_u8(src);
        // vertical filter: taps 0,1,4,2,5 accumulate into q3..q6
        q3u16 = vmull_u8(d22u8, d0u8);
        q4u16 = vmull_u8(d23u8, d0u8);
        q5u16 = vmull_u8(d24u8, d0u8);
        q6u16 = vmull_u8(d25u8, d0u8);
        q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
        q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
        q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
        q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
        q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
        q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
        q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
        q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
        q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
        q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
        q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
        q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
        q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
        q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
        q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
        q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
        // center tap (index 3) kept separate, combined with saturation
        q7u16 = vmull_u8(d25u8, d3u8);
        q8u16 = vmull_u8(d26u8, d3u8);
        q9u16 = vmull_u8(d27u8, d3u8);
        q10u16 = vmull_u8(d28u8, d3u8);
        q3s16 = vreinterpretq_s16_u16(q3u16);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);
        q7s16 = vqaddq_s16(q7s16, q3s16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q9s16 = vqaddq_s16(q9s16, q5s16);
        q10s16 = vqaddq_s16(q10s16, q6s16);
        // rounding shift by 7 (taps sum to 128), narrow with unsigned saturation
        d6u8 = vqrshrun_n_s16(q7s16, 7);
        d7u8 = vqrshrun_n_s16(q8s16, 7);
        d8u8 = vqrshrun_n_s16(q9s16, 7);
        d9u8 = vqrshrun_n_s16(q10s16, 7);
        vst1_u8(dst_ptr, d6u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d7u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d8u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d9u8);
        return;
    }

    // load first_pass filter and broadcast each of the six taps
    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    // First pass (horizontal): 9 rows x 8 columns when a second pass
    // follows, otherwise just the 4 output rows.
    if (yoffset == 0)  // firstpass_filter8x4_only
        src = src_ptr - 2;  // only horizontal context needed
    else
        src = src_ptr - 2 - (src_pixels_per_line * 2);  // plus 2 rows above
    // rows 0..3: 16-byte loads give the 8 outputs plus 5 taps of context
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);
    // horizontal filter: vext selects the row shifted by 1,4,2,5,3 pixels
    q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
    q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
    q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
    q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
    q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    // center tap, combined with saturating add
    q3u16 = vmull_u8(d28u8, d3u8);
    q4u16 = vmull_u8(d29u8, d3u8);
    q5u16 = vmull_u8(d30u8, d3u8);
    q6u16 = vmull_u8(d31u8, d3u8);
    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);
    // first-pass results for rows 0..3 (d22..d25)
    d22u8 = vqrshrun_n_s16(q7s16, 7);
    d23u8 = vqrshrun_n_s16(q8s16, 7);
    d24u8 = vqrshrun_n_s16(q9s16, 7);
    d25u8 = vqrshrun_n_s16(q10s16, 7);
    if (yoffset == 0) {  // firstpass_filter8x4_only: no vertical pass
        vst1_u8(dst_ptr, d22u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d23u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d24u8);
        dst_ptr += dst_pitch;
        vst1_u8(dst_ptr, d25u8);
        return;
    }

    // First pass on the remaining 5 rows (vertical-filter context)
    src += src_pixels_per_line;
    q3u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q4u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q5u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q6u8 = vld1q_u8(src);
    src += src_pixels_per_line;
    q7u8 = vld1q_u8(src);
    q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
    q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
    q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
    q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
    q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
    q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
    q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
    q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
    q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
    q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
    q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
    q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
    q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
    q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
    q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
    q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
    d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
    d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
    d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
    d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
    d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
    q3u16 = vmull_u8(d27u8, d3u8);
    q4u16 = vmull_u8(d28u8, d3u8);
    q5u16 = vmull_u8(d29u8, d3u8);
    q6u16 = vmull_u8(d30u8, d3u8);
    q7u16 = vmull_u8(d31u8, d3u8);
    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q11s16 = vreinterpretq_s16_u16(q11u16);
    q12s16 = vreinterpretq_s16_u16(q12u16);
    q8s16 = vqaddq_s16(q8s16, q3s16);
    q9s16 = vqaddq_s16(q9s16, q4s16);
    q10s16 = vqaddq_s16(q10s16, q5s16);
    q11s16 = vqaddq_s16(q11s16, q6s16);
    q12s16 = vqaddq_s16(q12s16, q7s16);
    // first-pass results for rows 4..8 (d26..d30); together with
    // d22..d25 this gives the 9 intermediate rows for the vertical pass
    d26u8 = vqrshrun_n_s16(q8s16, 7);
    d27u8 = vqrshrun_n_s16(q9s16, 7);
    d28u8 = vqrshrun_n_s16(q10s16, 7);
    d29u8 = vqrshrun_n_s16(q11s16, 7);
    d30u8 = vqrshrun_n_s16(q12s16, 7);

    // Second pass (vertical): 8x4 output from the 9 intermediate rows
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    q3u16 = vmull_u8(d22u8, d0u8);
    q4u16 = vmull_u8(d23u8, d0u8);
    q5u16 = vmull_u8(d24u8, d0u8);
    q6u16 = vmull_u8(d25u8, d0u8);
    q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
    q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
    q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
    q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
    q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
    q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
    q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
    q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
    q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
    q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
    q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
    q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
    q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
    q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
    q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
    q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
    q7u16 = vmull_u8(d25u8, d3u8);
    q8u16 = vmull_u8(d26u8, d3u8);
    q9u16 = vmull_u8(d27u8, d3u8);
    q10u16 = vmull_u8(d28u8, d3u8);
    q3s16 = vreinterpretq_s16_u16(q3u16);
    q4s16 = vreinterpretq_s16_u16(q4u16);
    q5s16 = vreinterpretq_s16_u16(q5u16);
    q6s16 = vreinterpretq_s16_u16(q6u16);
    q7s16 = vreinterpretq_s16_u16(q7u16);
    q8s16 = vreinterpretq_s16_u16(q8u16);
    q9s16 = vreinterpretq_s16_u16(q9u16);
    q10s16 = vreinterpretq_s16_u16(q10u16);
    q7s16 = vqaddq_s16(q7s16, q3s16);
    q8s16 = vqaddq_s16(q8s16, q4s16);
    q9s16 = vqaddq_s16(q9s16, q5s16);
    q10s16 = vqaddq_s16(q10s16, q6s16);
    d6u8 = vqrshrun_n_s16(q7s16, 7);
    d7u8 = vqrshrun_n_s16(q8s16, 7);
    d8u8 = vqrshrun_n_s16(q9s16, 7);
    d9u8 = vqrshrun_n_s16(q10s16, 7);
    vst1_u8(dst_ptr, d6u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d7u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d8u8);
    dst_ptr += dst_pitch;
    vst1_u8(dst_ptr, d9u8);
    return;
}
  367. void vp8_sixtap_predict8x8_neon(
  368. unsigned char *src_ptr,
  369. int src_pixels_per_line,
  370. int xoffset,
  371. int yoffset,
  372. unsigned char *dst_ptr,
  373. int dst_pitch) {
  374. unsigned char *src, *tmpp;
  375. unsigned char tmp[64];
  376. int i;
  377. uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
  378. uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
  379. uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
  380. int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
  381. uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
  382. uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
  383. int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
  384. int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
  385. uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
  386. if (xoffset == 0) { // secondpass_filter8x8_only
  387. // load second_pass filter
  388. dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  389. d0s8 = vdup_lane_s8(dtmps8, 0);
  390. d1s8 = vdup_lane_s8(dtmps8, 1);
  391. d2s8 = vdup_lane_s8(dtmps8, 2);
  392. d3s8 = vdup_lane_s8(dtmps8, 3);
  393. d4s8 = vdup_lane_s8(dtmps8, 4);
  394. d5s8 = vdup_lane_s8(dtmps8, 5);
  395. d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  396. d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  397. d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  398. d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  399. d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  400. d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
  401. // load src data
  402. src = src_ptr - src_pixels_per_line * 2;
  403. d18u8 = vld1_u8(src);
  404. src += src_pixels_per_line;
  405. d19u8 = vld1_u8(src);
  406. src += src_pixels_per_line;
  407. d20u8 = vld1_u8(src);
  408. src += src_pixels_per_line;
  409. d21u8 = vld1_u8(src);
  410. src += src_pixels_per_line;
  411. d22u8 = vld1_u8(src);
  412. src += src_pixels_per_line;
  413. d23u8 = vld1_u8(src);
  414. src += src_pixels_per_line;
  415. d24u8 = vld1_u8(src);
  416. src += src_pixels_per_line;
  417. d25u8 = vld1_u8(src);
  418. src += src_pixels_per_line;
  419. d26u8 = vld1_u8(src);
  420. src += src_pixels_per_line;
  421. d27u8 = vld1_u8(src);
  422. src += src_pixels_per_line;
  423. d28u8 = vld1_u8(src);
  424. src += src_pixels_per_line;
  425. d29u8 = vld1_u8(src);
  426. src += src_pixels_per_line;
  427. d30u8 = vld1_u8(src);
  428. for (i = 2; i > 0; i--) {
  429. q3u16 = vmull_u8(d18u8, d0u8);
  430. q4u16 = vmull_u8(d19u8, d0u8);
  431. q5u16 = vmull_u8(d20u8, d0u8);
  432. q6u16 = vmull_u8(d21u8, d0u8);
  433. q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
  434. q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
  435. q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
  436. q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
  437. q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
  438. q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
  439. q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
  440. q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
  441. q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
  442. q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
  443. q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
  444. q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
  445. q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
  446. q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
  447. q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
  448. q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
  449. q7u16 = vmull_u8(d21u8, d3u8);
  450. q8u16 = vmull_u8(d22u8, d3u8);
  451. q9u16 = vmull_u8(d23u8, d3u8);
  452. q10u16 = vmull_u8(d24u8, d3u8);
  453. q3s16 = vreinterpretq_s16_u16(q3u16);
  454. q4s16 = vreinterpretq_s16_u16(q4u16);
  455. q5s16 = vreinterpretq_s16_u16(q5u16);
  456. q6s16 = vreinterpretq_s16_u16(q6u16);
  457. q7s16 = vreinterpretq_s16_u16(q7u16);
  458. q8s16 = vreinterpretq_s16_u16(q8u16);
  459. q9s16 = vreinterpretq_s16_u16(q9u16);
  460. q10s16 = vreinterpretq_s16_u16(q10u16);
  461. q7s16 = vqaddq_s16(q7s16, q3s16);
  462. q8s16 = vqaddq_s16(q8s16, q4s16);
  463. q9s16 = vqaddq_s16(q9s16, q5s16);
  464. q10s16 = vqaddq_s16(q10s16, q6s16);
  465. d6u8 = vqrshrun_n_s16(q7s16, 7);
  466. d7u8 = vqrshrun_n_s16(q8s16, 7);
  467. d8u8 = vqrshrun_n_s16(q9s16, 7);
  468. d9u8 = vqrshrun_n_s16(q10s16, 7);
  469. d18u8 = d22u8;
  470. d19u8 = d23u8;
  471. d20u8 = d24u8;
  472. d21u8 = d25u8;
  473. d22u8 = d26u8;
  474. d23u8 = d27u8;
  475. d24u8 = d28u8;
  476. d25u8 = d29u8;
  477. d26u8 = d30u8;
  478. vst1_u8(dst_ptr, d6u8);
  479. dst_ptr += dst_pitch;
  480. vst1_u8(dst_ptr, d7u8);
  481. dst_ptr += dst_pitch;
  482. vst1_u8(dst_ptr, d8u8);
  483. dst_ptr += dst_pitch;
  484. vst1_u8(dst_ptr, d9u8);
  485. dst_ptr += dst_pitch;
  486. }
  487. return;
  488. }
  489. // load first_pass filter
  490. dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
  491. d0s8 = vdup_lane_s8(dtmps8, 0);
  492. d1s8 = vdup_lane_s8(dtmps8, 1);
  493. d2s8 = vdup_lane_s8(dtmps8, 2);
  494. d3s8 = vdup_lane_s8(dtmps8, 3);
  495. d4s8 = vdup_lane_s8(dtmps8, 4);
  496. d5s8 = vdup_lane_s8(dtmps8, 5);
  497. d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  498. d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  499. d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  500. d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  501. d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  502. d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
  503. // First pass: output_height lines x output_width columns (9x4)
  504. if (yoffset == 0) // firstpass_filter4x4_only
  505. src = src_ptr - 2;
  506. else
  507. src = src_ptr - 2 - (src_pixels_per_line * 2);
  508. tmpp = tmp;
  509. for (i = 2; i > 0; i--) {
  510. q3u8 = vld1q_u8(src);
  511. src += src_pixels_per_line;
  512. q4u8 = vld1q_u8(src);
  513. src += src_pixels_per_line;
  514. q5u8 = vld1q_u8(src);
  515. src += src_pixels_per_line;
  516. q6u8 = vld1q_u8(src);
  517. src += src_pixels_per_line;
  518. __builtin_prefetch(src);
  519. __builtin_prefetch(src + src_pixels_per_line);
  520. __builtin_prefetch(src + src_pixels_per_line * 2);
  521. q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  522. q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  523. q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  524. q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
  525. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  526. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  527. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  528. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
  529. q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
  530. q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
  531. q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
  532. q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
  533. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  534. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  535. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  536. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
  537. q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
  538. q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
  539. q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
  540. q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
  541. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  542. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  543. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  544. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
  545. q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
  546. q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
  547. q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
  548. q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
  549. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  550. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  551. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  552. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
  553. q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
  554. q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
  555. q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
  556. q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
  557. d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  558. d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  559. d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  560. d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
  561. q3u16 = vmull_u8(d28u8, d3u8);
  562. q4u16 = vmull_u8(d29u8, d3u8);
  563. q5u16 = vmull_u8(d30u8, d3u8);
  564. q6u16 = vmull_u8(d31u8, d3u8);
  565. q3s16 = vreinterpretq_s16_u16(q3u16);
  566. q4s16 = vreinterpretq_s16_u16(q4u16);
  567. q5s16 = vreinterpretq_s16_u16(q5u16);
  568. q6s16 = vreinterpretq_s16_u16(q6u16);
  569. q7s16 = vreinterpretq_s16_u16(q7u16);
  570. q8s16 = vreinterpretq_s16_u16(q8u16);
  571. q9s16 = vreinterpretq_s16_u16(q9u16);
  572. q10s16 = vreinterpretq_s16_u16(q10u16);
  573. q7s16 = vqaddq_s16(q7s16, q3s16);
  574. q8s16 = vqaddq_s16(q8s16, q4s16);
  575. q9s16 = vqaddq_s16(q9s16, q5s16);
  576. q10s16 = vqaddq_s16(q10s16, q6s16);
  577. d22u8 = vqrshrun_n_s16(q7s16, 7);
  578. d23u8 = vqrshrun_n_s16(q8s16, 7);
  579. d24u8 = vqrshrun_n_s16(q9s16, 7);
  580. d25u8 = vqrshrun_n_s16(q10s16, 7);
  581. if (yoffset == 0) { // firstpass_filter8x4_only
  582. vst1_u8(dst_ptr, d22u8);
  583. dst_ptr += dst_pitch;
  584. vst1_u8(dst_ptr, d23u8);
  585. dst_ptr += dst_pitch;
  586. vst1_u8(dst_ptr, d24u8);
  587. dst_ptr += dst_pitch;
  588. vst1_u8(dst_ptr, d25u8);
  589. dst_ptr += dst_pitch;
  590. } else {
  591. vst1_u8(tmpp, d22u8);
  592. tmpp += 8;
  593. vst1_u8(tmpp, d23u8);
  594. tmpp += 8;
  595. vst1_u8(tmpp, d24u8);
  596. tmpp += 8;
  597. vst1_u8(tmpp, d25u8);
  598. tmpp += 8;
  599. }
  600. }
  601. if (yoffset == 0)
  602. return;
  603. // First Pass on rest 5-line data
  604. q3u8 = vld1q_u8(src);
  605. src += src_pixels_per_line;
  606. q4u8 = vld1q_u8(src);
  607. src += src_pixels_per_line;
  608. q5u8 = vld1q_u8(src);
  609. src += src_pixels_per_line;
  610. q6u8 = vld1q_u8(src);
  611. src += src_pixels_per_line;
  612. q7u8 = vld1q_u8(src);
  613. q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
  614. q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
  615. q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
  616. q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
  617. q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
  618. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
  619. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
  620. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
  621. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
  622. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
  623. q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
  624. q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
  625. q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
  626. q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
  627. q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
  628. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
  629. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
  630. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
  631. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
  632. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
  633. q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
  634. q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
  635. q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
  636. q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
  637. q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
  638. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
  639. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
  640. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
  641. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
  642. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
  643. q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
  644. q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
  645. q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
  646. q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
  647. q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
  648. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
  649. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
  650. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
  651. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
  652. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
  653. q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
  654. q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
  655. q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
  656. q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
  657. q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
  658. d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
  659. d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
  660. d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
  661. d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
  662. d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
  663. q3u16 = vmull_u8(d27u8, d3u8);
  664. q4u16 = vmull_u8(d28u8, d3u8);
  665. q5u16 = vmull_u8(d29u8, d3u8);
  666. q6u16 = vmull_u8(d30u8, d3u8);
  667. q7u16 = vmull_u8(d31u8, d3u8);
  668. q3s16 = vreinterpretq_s16_u16(q3u16);
  669. q4s16 = vreinterpretq_s16_u16(q4u16);
  670. q5s16 = vreinterpretq_s16_u16(q5u16);
  671. q6s16 = vreinterpretq_s16_u16(q6u16);
  672. q7s16 = vreinterpretq_s16_u16(q7u16);
  673. q8s16 = vreinterpretq_s16_u16(q8u16);
  674. q9s16 = vreinterpretq_s16_u16(q9u16);
  675. q10s16 = vreinterpretq_s16_u16(q10u16);
  676. q11s16 = vreinterpretq_s16_u16(q11u16);
  677. q12s16 = vreinterpretq_s16_u16(q12u16);
  678. q8s16 = vqaddq_s16(q8s16, q3s16);
  679. q9s16 = vqaddq_s16(q9s16, q4s16);
  680. q10s16 = vqaddq_s16(q10s16, q5s16);
  681. q11s16 = vqaddq_s16(q11s16, q6s16);
  682. q12s16 = vqaddq_s16(q12s16, q7s16);
  683. d26u8 = vqrshrun_n_s16(q8s16, 7);
  684. d27u8 = vqrshrun_n_s16(q9s16, 7);
  685. d28u8 = vqrshrun_n_s16(q10s16, 7);
  686. d29u8 = vqrshrun_n_s16(q11s16, 7);
  687. d30u8 = vqrshrun_n_s16(q12s16, 7);
  688. // Second pass: 8x8
  689. dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
  690. d0s8 = vdup_lane_s8(dtmps8, 0);
  691. d1s8 = vdup_lane_s8(dtmps8, 1);
  692. d2s8 = vdup_lane_s8(dtmps8, 2);
  693. d3s8 = vdup_lane_s8(dtmps8, 3);
  694. d4s8 = vdup_lane_s8(dtmps8, 4);
  695. d5s8 = vdup_lane_s8(dtmps8, 5);
  696. d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
  697. d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
  698. d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
  699. d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
  700. d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
  701. d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
  702. tmpp = tmp;
  703. q9u8 = vld1q_u8(tmpp);
  704. tmpp += 16;
  705. q10u8 = vld1q_u8(tmpp);
  706. tmpp += 16;
  707. q11u8 = vld1q_u8(tmpp);
  708. tmpp += 16;
  709. q12u8 = vld1q_u8(tmpp);
  710. d18u8 = vget_low_u8(q9u8);
  711. d19u8 = vget_high_u8(q9u8);
  712. d20u8 = vget_low_u8(q10u8);
  713. d21u8 = vget_high_u8(q10u8);
  714. d22u8 = vget_low_u8(q11u8);
  715. d23u8 = vget_high_u8(q11u8);
  716. d24u8 = vget_low_u8(q12u8);
  717. d25u8 = vget_high_u8(q12u8);
  718. for (i = 2; i > 0; i--) {
  719. q3u16 = vmull_u8(d18u8, d0u8);
  720. q4u16 = vmull_u8(d19u8, d0u8);
  721. q5u16 = vmull_u8(d20u8, d0u8);
  722. q6u16 = vmull_u8(d21u8, d0u8);
  723. q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
  724. q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
  725. q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
  726. q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
  727. q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
  728. q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
  729. q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
  730. q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
  731. q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
  732. q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
  733. q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
  734. q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
  735. q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
  736. q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
  737. q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
  738. q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
  739. q7u16 = vmull_u8(d21u8, d3u8);
  740. q8u16 = vmull_u8(d22u8, d3u8);
  741. q9u16 = vmull_u8(d23u8, d3u8);
  742. q10u16 = vmull_u8(d24u8, d3u8);
  743. q3s16 = vreinterpretq_s16_u16(q3u16);
  744. q4s16 = vreinterpretq_s16_u16(q4u16);
  745. q5s16 = vreinterpretq_s16_u16(q5u16);
  746. q6s16 = vreinterpretq_s16_u16(q6u16);
  747. q7s16 = vreinterpretq_s16_u16(q7u16);
  748. q8s16 = vreinterpretq_s16_u16(q8u16);
  749. q9s16 = vreinterpretq_s16_u16(q9u16);
  750. q10s16 = vreinterpretq_s16_u16(q10u16);
  751. q7s16 = vqaddq_s16(q7s16, q3s16);
  752. q8s16 = vqaddq_s16(q8s16, q4s16);
  753. q9s16 = vqaddq_s16(q9s16, q5s16);
  754. q10s16 = vqaddq_s16(q10s16, q6s16);
  755. d6u8 = vqrshrun_n_s16(q7s16, 7);
  756. d7u8 = vqrshrun_n_s16(q8s16, 7);
  757. d8u8 = vqrshrun_n_s16(q9s16, 7);
  758. d9u8 = vqrshrun_n_s16(q10s16, 7);
  759. d18u8 = d22u8;
  760. d19u8 = d23u8;
  761. d20u8 = d24u8;
  762. d21u8 = d25u8;
  763. d22u8 = d26u8;
  764. d23u8 = d27u8;
  765. d24u8 = d28u8;
  766. d25u8 = d29u8;
  767. d26u8 = d30u8;
  768. vst1_u8(dst_ptr, d6u8);
  769. dst_ptr += dst_pitch;
  770. vst1_u8(dst_ptr, d7u8);
  771. dst_ptr += dst_pitch;
  772. vst1_u8(dst_ptr, d8u8);
  773. dst_ptr += dst_pitch;
  774. vst1_u8(dst_ptr, d9u8);
  775. dst_ptr += dst_pitch;
  776. }
  777. return;
  778. }
/* 16x16 six-tap sub-pixel prediction (NEON).
 *
 * Filters a 16x16 block from src_ptr into dst_ptr using the VP8 six-tap
 * interpolation filters selected by xoffset (horizontal) and yoffset
 * (vertical). An offset of 0 means no filtering is needed in that
 * direction, so three paths exist:
 *   - xoffset == 0: vertical-only second pass, straight from src to dst;
 *   - yoffset == 0: horizontal-only first pass, straight from src to dst;
 *   - both nonzero: horizontal pass into tmp[] (21 rows x 16 cols = 336
 *     bytes: 16 output rows plus 5 rows of vertical-filter context),
 *     then a vertical pass from tmp[] to dst.
 *
 * src_pixels_per_line / dst_pitch are the row strides of the source and
 * destination buffers. vp8_sub_pel_filters is declared elsewhere; each
 * entry is read as 8 signed bytes (vld1_s8), of which taps 0..5 are used.
 */
void vp8_sixtap_predict16x16_neon(
        unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        unsigned char *dst_ptr,
        int dst_pitch) {
    unsigned char *src, *src_tmp, *dst, *tmpp;
    unsigned char tmp[336];  // 21 rows x 16 cols intermediate buffer
    int i, j;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
    uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
    uint8x8_t d28u8, d29u8, d30u8, d31u8;
    int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
    uint8x16_t q3u8, q4u8;
    uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
    uint16x8_t q11u16, q12u16, q13u16, q15u16;
    int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
    int16x8_t q11s16, q12s16, q13s16, q15s16;

    if (xoffset == 0) {  // vertical-only filtering (second pass only)
        // load second_pass filter.  Taps are stored signed; the taps
        // applied with vmlsl below are the negative ones, so absolute
        // values are broadcast here and the sign is expressed by
        // multiply-subtract vs multiply-add.
        dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
        d0s8 = vdup_lane_s8(dtmps8, 0);
        d1s8 = vdup_lane_s8(dtmps8, 1);
        d2s8 = vdup_lane_s8(dtmps8, 2);
        d3s8 = vdup_lane_s8(dtmps8, 3);
        d4s8 = vdup_lane_s8(dtmps8, 4);
        d5s8 = vdup_lane_s8(dtmps8, 5);
        d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
        d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
        d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
        d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
        d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
        d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
        // load src data: start 2 rows above the block for filter context,
        // and process the 16-wide block as two 8-wide columns (i = 0, 1).
        src_tmp = src_ptr - src_pixels_per_line * 2;
        for (i = 0; i < 2; i++) {
            src = src_tmp + i * 8;
            dst = dst_ptr + i * 8;
            // prime the 5-row sliding window d18..d22
            d18u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d19u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d20u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d21u8 = vld1_u8(src);
            src += src_pixels_per_line;
            d22u8 = vld1_u8(src);
            src += src_pixels_per_line;
            // 4 iterations x 4 output rows = 16 rows
            for (j = 0; j < 4; j++) {
                d23u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d24u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d25u8 = vld1_u8(src);
                src += src_pixels_per_line;
                d26u8 = vld1_u8(src);
                src += src_pixels_per_line;
                // six-tap vertical MAC for rows j*4 .. j*4+3;
                // q3..q6 accumulate taps 0,1,4,2,5 and q7..q10 hold the
                // center tap-3 product, saturating-added at the end.
                q3u16 = vmull_u8(d18u8, d0u8);
                q4u16 = vmull_u8(d19u8, d0u8);
                q5u16 = vmull_u8(d20u8, d0u8);
                q6u16 = vmull_u8(d21u8, d0u8);
                q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
                q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
                q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
                q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
                q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
                q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
                q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
                q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
                q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
                q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
                q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
                q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
                q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
                q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
                q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
                q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
                q7u16 = vmull_u8(d21u8, d3u8);
                q8u16 = vmull_u8(d22u8, d3u8);
                q9u16 = vmull_u8(d23u8, d3u8);
                q10u16 = vmull_u8(d24u8, d3u8);
                q3s16 = vreinterpretq_s16_u16(q3u16);
                q4s16 = vreinterpretq_s16_u16(q4u16);
                q5s16 = vreinterpretq_s16_u16(q5u16);
                q6s16 = vreinterpretq_s16_u16(q6u16);
                q7s16 = vreinterpretq_s16_u16(q7u16);
                q8s16 = vreinterpretq_s16_u16(q8u16);
                q9s16 = vreinterpretq_s16_u16(q9u16);
                q10s16 = vreinterpretq_s16_u16(q10u16);
                q7s16 = vqaddq_s16(q7s16, q3s16);
                q8s16 = vqaddq_s16(q8s16, q4s16);
                q9s16 = vqaddq_s16(q9s16, q5s16);
                q10s16 = vqaddq_s16(q10s16, q6s16);
                // round, shift by 7 (filter sum is 128) and saturate to u8
                d6u8 = vqrshrun_n_s16(q7s16, 7);
                d7u8 = vqrshrun_n_s16(q8s16, 7);
                d8u8 = vqrshrun_n_s16(q9s16, 7);
                d9u8 = vqrshrun_n_s16(q10s16, 7);
                // slide the window down 4 rows
                d18u8 = d22u8;
                d19u8 = d23u8;
                d20u8 = d24u8;
                d21u8 = d25u8;
                d22u8 = d26u8;
                vst1_u8(dst, d6u8);
                dst += dst_pitch;
                vst1_u8(dst, d7u8);
                dst += dst_pitch;
                vst1_u8(dst, d8u8);
                dst += dst_pitch;
                vst1_u8(dst, d9u8);
                dst += dst_pitch;
            }
        }
        return;
    }

    // load first_pass (horizontal) filter; same abs/vmlsl scheme as above
    dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    // First pass: output_height lines x output_width columns (16x16 when
    // horizontal-only; 21x16 into tmp[] when a second pass follows)
    if (yoffset == 0) {  // horizontal-only filtering (first pass only)
        src = src_ptr - 2;  // 2 columns of left context for the 6-tap
        dst = dst_ptr;
        // 8 iterations x 2 rows = 16 rows; each row needs 21 source
        // bytes, loaded as three 8-byte fragments (e.g. d6/d7/d8)
        for (i = 0; i < 8; i++) {
            d6u8 = vld1_u8(src);
            d7u8 = vld1_u8(src + 8);
            d8u8 = vld1_u8(src + 16);
            src += src_pixels_per_line;
            d9u8 = vld1_u8(src);
            d10u8 = vld1_u8(src + 8);
            d11u8 = vld1_u8(src + 16);
            src += src_pixels_per_line;
            __builtin_prefetch(src);
            __builtin_prefetch(src + src_pixels_per_line);
            // six-tap horizontal MAC; vext supplies the shifted-by-k
            // pixel vectors for filter tap k
            q6u16 = vmull_u8(d6u8, d0u8);
            q7u16 = vmull_u8(d7u8, d0u8);
            q8u16 = vmull_u8(d9u8, d0u8);
            q9u16 = vmull_u8(d10u8, d0u8);
            d20u8 = vext_u8(d6u8, d7u8, 1);
            d21u8 = vext_u8(d9u8, d10u8, 1);
            d22u8 = vext_u8(d7u8, d8u8, 1);
            d23u8 = vext_u8(d10u8, d11u8, 1);
            d24u8 = vext_u8(d6u8, d7u8, 4);
            d25u8 = vext_u8(d9u8, d10u8, 4);
            d26u8 = vext_u8(d7u8, d8u8, 4);
            d27u8 = vext_u8(d10u8, d11u8, 4);
            d28u8 = vext_u8(d6u8, d7u8, 5);
            d29u8 = vext_u8(d9u8, d10u8, 5);
            q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
            q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
            q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
            q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
            q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
            q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
            q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
            q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
            q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
            // d20..d29 are recycled for the remaining shift offsets
            d20u8 = vext_u8(d7u8, d8u8, 5);
            d21u8 = vext_u8(d10u8, d11u8, 5);
            d22u8 = vext_u8(d6u8, d7u8, 2);
            d23u8 = vext_u8(d9u8, d10u8, 2);
            d24u8 = vext_u8(d7u8, d8u8, 2);
            d25u8 = vext_u8(d10u8, d11u8, 2);
            d26u8 = vext_u8(d6u8, d7u8, 3);
            d27u8 = vext_u8(d9u8, d10u8, 3);
            d28u8 = vext_u8(d7u8, d8u8, 3);
            d29u8 = vext_u8(d10u8, d11u8, 3);
            q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
            q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
            q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
            q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
            q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
            // center tap (index 3) kept separate, combined via vqaddq
            q10u16 = vmull_u8(d26u8, d3u8);
            q11u16 = vmull_u8(d27u8, d3u8);
            q12u16 = vmull_u8(d28u8, d3u8);
            q15u16 = vmull_u8(d29u8, d3u8);
            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);
            q11s16 = vreinterpretq_s16_u16(q11u16);
            q12s16 = vreinterpretq_s16_u16(q12u16);
            q15s16 = vreinterpretq_s16_u16(q15u16);
            q6s16 = vqaddq_s16(q6s16, q10s16);
            q8s16 = vqaddq_s16(q8s16, q11s16);
            q7s16 = vqaddq_s16(q7s16, q12s16);
            q9s16 = vqaddq_s16(q9s16, q15s16);
            d6u8 = vqrshrun_n_s16(q6s16, 7);
            d7u8 = vqrshrun_n_s16(q7s16, 7);
            d8u8 = vqrshrun_n_s16(q8s16, 7);
            d9u8 = vqrshrun_n_s16(q9s16, 7);
            // reassemble two full 16-byte rows and store
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);
            vst1q_u8(dst, q3u8);
            dst += dst_pitch;
            vst1q_u8(dst, q4u8);
            dst += dst_pitch;
        }
        return;
    }

    // Combined path. First pass: horizontal filter 21 rows x 16 cols into
    // tmp[] (2 rows above + 16 block rows + 3 below, for vertical context).
    src = src_ptr - 2 - src_pixels_per_line * 2;
    tmpp = tmp;
    // 7 iterations x 3 rows = 21 rows; each row is 21 source bytes in
    // three 8-byte fragments
    for (i = 0; i < 7; i++) {
        d6u8 = vld1_u8(src);
        d7u8 = vld1_u8(src + 8);
        d8u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;
        d9u8 = vld1_u8(src);
        d10u8 = vld1_u8(src + 8);
        d11u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;
        d12u8 = vld1_u8(src);
        d13u8 = vld1_u8(src + 8);
        d14u8 = vld1_u8(src + 16);
        src += src_pixels_per_line;
        __builtin_prefetch(src);
        __builtin_prefetch(src + src_pixels_per_line);
        __builtin_prefetch(src + src_pixels_per_line * 2);
        // accumulators: q8/q9 = row 0 (low/high half), q10/q11 = row 1,
        // q12/q13 = row 2
        q8u16 = vmull_u8(d6u8, d0u8);
        q9u16 = vmull_u8(d7u8, d0u8);
        q10u16 = vmull_u8(d9u8, d0u8);
        q11u16 = vmull_u8(d10u8, d0u8);
        q12u16 = vmull_u8(d12u8, d0u8);
        q13u16 = vmull_u8(d13u8, d0u8);
        d28u8 = vext_u8(d6u8, d7u8, 1);
        d29u8 = vext_u8(d9u8, d10u8, 1);
        d30u8 = vext_u8(d12u8, d13u8, 1);
        q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
        q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
        q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
        d28u8 = vext_u8(d7u8, d8u8, 1);
        d29u8 = vext_u8(d10u8, d11u8, 1);
        d30u8 = vext_u8(d13u8, d14u8, 1);
        q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
        q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
        q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
        d28u8 = vext_u8(d6u8, d7u8, 4);
        d29u8 = vext_u8(d9u8, d10u8, 4);
        d30u8 = vext_u8(d12u8, d13u8, 4);
        q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
        q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
        q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
        d28u8 = vext_u8(d7u8, d8u8, 4);
        d29u8 = vext_u8(d10u8, d11u8, 4);
        d30u8 = vext_u8(d13u8, d14u8, 4);
        q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
        q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
        q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
        d28u8 = vext_u8(d6u8, d7u8, 5);
        d29u8 = vext_u8(d9u8, d10u8, 5);
        d30u8 = vext_u8(d12u8, d13u8, 5);
        q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
        q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
        q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
        d28u8 = vext_u8(d7u8, d8u8, 5);
        d29u8 = vext_u8(d10u8, d11u8, 5);
        d30u8 = vext_u8(d13u8, d14u8, 5);
        q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
        q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
        q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
        d28u8 = vext_u8(d6u8, d7u8, 2);
        d29u8 = vext_u8(d9u8, d10u8, 2);
        d30u8 = vext_u8(d12u8, d13u8, 2);
        q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
        q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
        q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
        d28u8 = vext_u8(d7u8, d8u8, 2);
        d29u8 = vext_u8(d10u8, d11u8, 2);
        d30u8 = vext_u8(d13u8, d14u8, 2);
        q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
        q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
        q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
        // center-tap (index 3) products; note d6u8 is reused here as a
        // scratch vext result — the original row fragment is dead by now
        d28u8 = vext_u8(d6u8, d7u8, 3);
        d29u8 = vext_u8(d9u8, d10u8, 3);
        d30u8 = vext_u8(d12u8, d13u8, 3);
        d15u8 = vext_u8(d7u8, d8u8, 3);
        d31u8 = vext_u8(d10u8, d11u8, 3);
        d6u8 = vext_u8(d13u8, d14u8, 3);
        q4u16 = vmull_u8(d28u8, d3u8);
        q5u16 = vmull_u8(d29u8, d3u8);
        q6u16 = vmull_u8(d30u8, d3u8);
        q4s16 = vreinterpretq_s16_u16(q4u16);
        q5s16 = vreinterpretq_s16_u16(q5u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q8s16 = vreinterpretq_s16_u16(q8u16);
        q10s16 = vreinterpretq_s16_u16(q10u16);
        q12s16 = vreinterpretq_s16_u16(q12u16);
        q8s16 = vqaddq_s16(q8s16, q4s16);
        q10s16 = vqaddq_s16(q10s16, q5s16);
        q12s16 = vqaddq_s16(q12s16, q6s16);
        q6u16 = vmull_u8(d15u8, d3u8);
        q7u16 = vmull_u8(d31u8, d3u8);
        q3u16 = vmull_u8(d6u8, d3u8);
        q3s16 = vreinterpretq_s16_u16(q3u16);
        q6s16 = vreinterpretq_s16_u16(q6u16);
        q7s16 = vreinterpretq_s16_u16(q7u16);
        q9s16 = vreinterpretq_s16_u16(q9u16);
        q11s16 = vreinterpretq_s16_u16(q11u16);
        q13s16 = vreinterpretq_s16_u16(q13u16);
        q9s16 = vqaddq_s16(q9s16, q6s16);
        q11s16 = vqaddq_s16(q11s16, q7s16);
        q13s16 = vqaddq_s16(q13s16, q3s16);
        d6u8 = vqrshrun_n_s16(q8s16, 7);
        d7u8 = vqrshrun_n_s16(q9s16, 7);
        d8u8 = vqrshrun_n_s16(q10s16, 7);
        d9u8 = vqrshrun_n_s16(q11s16, 7);
        d10u8 = vqrshrun_n_s16(q12s16, 7);
        d11u8 = vqrshrun_n_s16(q13s16, 7);
        // store 3 filtered rows (16 bytes each) into tmp[]
        vst1_u8(tmpp, d6u8);
        tmpp += 8;
        vst1_u8(tmpp, d7u8);
        tmpp += 8;
        vst1_u8(tmpp, d8u8);
        tmpp += 8;
        vst1_u8(tmpp, d9u8);
        tmpp += 8;
        vst1_u8(tmpp, d10u8);
        tmpp += 8;
        vst1_u8(tmpp, d11u8);
        tmpp += 8;
    }

    // Second pass: 16x16 vertical filter from tmp[] (row stride 16),
    // processed as two 8-wide columns; same windowed scheme as the
    // xoffset == 0 path above.
    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
    d0s8 = vdup_lane_s8(dtmps8, 0);
    d1s8 = vdup_lane_s8(dtmps8, 1);
    d2s8 = vdup_lane_s8(dtmps8, 2);
    d3s8 = vdup_lane_s8(dtmps8, 3);
    d4s8 = vdup_lane_s8(dtmps8, 4);
    d5s8 = vdup_lane_s8(dtmps8, 5);
    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
    for (i = 0; i < 2; i++) {
        dst = dst_ptr + 8 * i;
        tmpp = tmp + 8 * i;
        // prime the 5-row window
        d18u8 = vld1_u8(tmpp);
        tmpp += 16;
        d19u8 = vld1_u8(tmpp);
        tmpp += 16;
        d20u8 = vld1_u8(tmpp);
        tmpp += 16;
        d21u8 = vld1_u8(tmpp);
        tmpp += 16;
        d22u8 = vld1_u8(tmpp);
        tmpp += 16;
        // 4 iterations x 4 output rows = 16 rows
        for (j = 0; j < 4; j++) {
            d23u8 = vld1_u8(tmpp);
            tmpp += 16;
            d24u8 = vld1_u8(tmpp);
            tmpp += 16;
            d25u8 = vld1_u8(tmpp);
            tmpp += 16;
            d26u8 = vld1_u8(tmpp);
            tmpp += 16;
            q3u16 = vmull_u8(d18u8, d0u8);
            q4u16 = vmull_u8(d19u8, d0u8);
            q5u16 = vmull_u8(d20u8, d0u8);
            q6u16 = vmull_u8(d21u8, d0u8);
            q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
            q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
            q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
            q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
            q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
            q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
            q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
            q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
            q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
            q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
            q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
            q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
            q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
            q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
            q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
            q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
            q7u16 = vmull_u8(d21u8, d3u8);
            q8u16 = vmull_u8(d22u8, d3u8);
            q9u16 = vmull_u8(d23u8, d3u8);
            q10u16 = vmull_u8(d24u8, d3u8);
            q3s16 = vreinterpretq_s16_u16(q3u16);
            q4s16 = vreinterpretq_s16_u16(q4u16);
            q5s16 = vreinterpretq_s16_u16(q5u16);
            q6s16 = vreinterpretq_s16_u16(q6u16);
            q7s16 = vreinterpretq_s16_u16(q7u16);
            q8s16 = vreinterpretq_s16_u16(q8u16);
            q9s16 = vreinterpretq_s16_u16(q9u16);
            q10s16 = vreinterpretq_s16_u16(q10u16);
            q7s16 = vqaddq_s16(q7s16, q3s16);
            q8s16 = vqaddq_s16(q8s16, q4s16);
            q9s16 = vqaddq_s16(q9s16, q5s16);
            q10s16 = vqaddq_s16(q10s16, q6s16);
            d6u8 = vqrshrun_n_s16(q7s16, 7);
            d7u8 = vqrshrun_n_s16(q8s16, 7);
            d8u8 = vqrshrun_n_s16(q9s16, 7);
            d9u8 = vqrshrun_n_s16(q10s16, 7);
            // slide the window down 4 rows
            d18u8 = d22u8;
            d19u8 = d23u8;
            d20u8 = d24u8;
            d21u8 = d25u8;
            d22u8 = d26u8;
            vst1_u8(dst, d6u8);
            dst += dst_pitch;
            vst1_u8(dst, d7u8);
            dst += dst_pitch;
            vst1_u8(dst, d8u8);
            dst += dst_pitch;
            vst1_u8(dst, d9u8);
            dst += dst_pitch;
        }
    }
    return;
}