iwalsh_neon.c 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. /*
  2. * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
  3. *
  4. * Use of this source code is governed by a BSD-style license
  5. * that can be found in the LICENSE file in the root of the source
  6. * tree. An additional intellectual property rights grant can be found
  7. * in the file PATENTS. All contributing project authors may
  8. * be found in the AUTHORS file in the root of the source tree.
  9. */
  10. #include <arm_neon.h>
  11. void vp8_short_inv_walsh4x4_neon(
  12. int16_t *input,
  13. int16_t *mb_dqcoeff) {
  14. int16x8_t q0s16, q1s16, q2s16, q3s16;
  15. int16x4_t d4s16, d5s16, d6s16, d7s16;
  16. int16x4x2_t v2tmp0, v2tmp1;
  17. int32x2x2_t v2tmp2, v2tmp3;
  18. int16x8_t qAdd3;
  19. q0s16 = vld1q_s16(input);
  20. q1s16 = vld1q_s16(input + 8);
  21. // 1st for loop
  22. d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
  23. d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
  24. d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
  25. d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
  26. q2s16 = vcombine_s16(d4s16, d5s16);
  27. q3s16 = vcombine_s16(d6s16, d7s16);
  28. q0s16 = vaddq_s16(q2s16, q3s16);
  29. q1s16 = vsubq_s16(q2s16, q3s16);
  30. v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
  31. vreinterpret_s32_s16(vget_low_s16(q1s16)));
  32. v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
  33. vreinterpret_s32_s16(vget_high_s16(q1s16)));
  34. v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
  35. vreinterpret_s16_s32(v2tmp3.val[0]));
  36. v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
  37. vreinterpret_s16_s32(v2tmp3.val[1]));
  38. // 2nd for loop
  39. d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
  40. d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
  41. d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
  42. d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
  43. q2s16 = vcombine_s16(d4s16, d5s16);
  44. q3s16 = vcombine_s16(d6s16, d7s16);
  45. qAdd3 = vdupq_n_s16(3);
  46. q0s16 = vaddq_s16(q2s16, q3s16);
  47. q1s16 = vsubq_s16(q2s16, q3s16);
  48. q0s16 = vaddq_s16(q0s16, qAdd3);
  49. q1s16 = vaddq_s16(q1s16, qAdd3);
  50. q0s16 = vshrq_n_s16(q0s16, 3);
  51. q1s16 = vshrq_n_s16(q1s16, 3);
  52. // store
  53. vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0);
  54. mb_dqcoeff += 16;
  55. vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
  56. mb_dqcoeff += 16;
  57. vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0);
  58. mb_dqcoeff += 16;
  59. vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
  60. mb_dqcoeff += 16;
  61. vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1);
  62. mb_dqcoeff += 16;
  63. vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
  64. mb_dqcoeff += 16;
  65. vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1);
  66. mb_dqcoeff += 16;
  67. vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
  68. mb_dqcoeff += 16;
  69. vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2);
  70. mb_dqcoeff += 16;
  71. vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
  72. mb_dqcoeff += 16;
  73. vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2);
  74. mb_dqcoeff += 16;
  75. vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
  76. mb_dqcoeff += 16;
  77. vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3);
  78. mb_dqcoeff += 16;
  79. vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
  80. mb_dqcoeff += 16;
  81. vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3);
  82. mb_dqcoeff += 16;
  83. vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
  84. mb_dqcoeff += 16;
  85. return;
  86. }