;********************************************************************
;*                                                                  *
;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
;*                                                                  *
;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
;*                                                                  *
;********************************************************************
;
; function: ARM NEON versions of the encoder quantization routines
; last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
;
;********************************************************************
  ; Section, build options and symbol linkage for the NEON quantizer code.
        AREA    |.text|, CODE, READONLY

        ; Build configuration; presumably this is what defines
        ;  OC_ARM_ASM_NEON, which is tested just below.
        GET     armopts.s

 [ OC_ARM_ASM_NEON
        ; Entry points defined in this file.
        EXPORT  oc_enc_enquant_table_init_neon
        EXPORT  oc_enc_enquant_table_fixup_neon
        EXPORT  oc_enc_quantize_neon
        ; C routine called from oc_enc_enquant_table_init_neon.
        ; It is not defined in this file, so the assembler has to be told
        ;  that it is an external symbol.
        IMPORT  oc_enc_enquant_table_init_c

  ;---------------------------------------------------------------------
  ; oc_enc_enquant_table_init_neon
  ;  In:  r0 = void               *_enquant
  ;       r1 = const ogg_uint16_t  _dequant[64]
  ;  Builds the table with oc_enc_enquant_table_init_c (called with the
  ;   same two arguments), then rewrites its 256 bytes in place.
  ;  Within every 32 bytes the eight even-indexed 16-bit words (the
  ;   multipliers) are moved to the front and the eight odd-indexed words
  ;   (the shift factors) follow them, negated so that VSHL can use them.
  ;  Clobbers: r0, r1, flags, Q8-Q15, plus whatever the C routine does
  ;   not preserve.
  ;---------------------------------------------------------------------
oc_enc_enquant_table_init_neon PROC
        ; r0 is needed again after the call; saving it together with the
        ;  return address pushes 8 bytes in total.
        PUSH    {r0,lr}
        BLX     oc_enc_enquant_table_init_c
        POP     {r0}                    ; r0 = _enquant again
        MOV     r1, #2                  ; Two passes of 128 bytes each.
oeeti_neon_half_lp
        ; None of the NEON instructions below write the flags, so the BNE
        ;  at the bottom still sees the result of this SUBS.
        SUBS    r1, r1, #1
        VLDMIA  r0, {D16-D31}
        ; Split each register pair: multipliers end up in the first
        ;  register, shift factors in the second.
        VUZP.16 Q8, Q9
        VUZP.16 Q10,Q11
        VUZP.16 Q12,Q13
        VUZP.16 Q14,Q15
        ; Negate the shift factors.
        VNEG.S16 Q9, Q9
        VNEG.S16 Q11,Q11
        VNEG.S16 Q13,Q13
        VNEG.S16 Q15,Q15
        ; Store back over the same 128 bytes and advance to the next 128.
        VSTMIA  r0!,{D16-D31}
        BNE     oeeti_neon_half_lp
        POP     {pc}
        ENDP

  ;---------------------------------------------------------------------
  ; oc_enc_enquant_table_fixup_neon
  ;  In:  r0 = void *_enquant[3][3][2]  (a 3x3x2 array of table pointers)
  ;       r1 = int   _nqis
  ;  For each value of the first index, and for each middle index from 1
  ;   up to _nqis-1, copies two 16-bit words (at byte offsets 0 and 16)
  ;   from the tables of the previous middle index into the tables of the
  ;   current one, for both values of the last index.
  ;  In the layout written by oc_enc_enquant_table_init_neon these two
  ;   words are the first multiplier and the first shift factor.
  ;  Does nothing when _nqis is 1 or less.
  ;  Clobbers: r0-r3, r12, flags.
  ;---------------------------------------------------------------------
oc_enc_enquant_table_fixup_neon PROC
        STR     r14,[r13,#-4]!          ; r14 is reused as the inner counter.
oeetf_neon_lp1
        SUBS    r1, r1, #1
        ; BLE rather than BEQ: _nqis is a signed int, so a value of 0 or
        ;  less must stop here instead of counting down through the whole
        ;  32-bit range while r0 walks off the end of the array.
        BLE     oeetf_neon_end1
        MOV     r14,#3                  ; Three values of the first index.
oeetf_neon_lp2
        ; Last index 0: copy from the table at [r0] to the one at [r0,#8],
        ;  i.e. to the next middle index (2 pointers * 4 bytes further on).
        LDR     r2, [r0]
        SUBS    r14,r14,#1              ; Flags are not touched again before
                                        ;  the BNE below.
        LDRH    r3, [r2]
        LDRH    r12,[r2,#16]
        LDR     r2, [r0,#8]
        STRH    r3, [r2]
        STRH    r12,[r2,#16]
        ; Last index 1: same copy, from [r0,#4] to [r0,#12].
        LDR     r2, [r0,#4]
        LDRH    r3, [r2]
        LDRH    r12,[r2,#16]
        LDR     r2, [r0,#12]
        ; Step to the next value of the first index (3*2 pointers = 24
        ;  bytes).
        ADD     r0, r0, #24
        STRH    r3, [r2]
        STRH    r12,[r2,#16]
        BNE     oeetf_neon_lp2
        ; r0 has advanced 3*24 = 72 bytes; going back 64 leaves it 8 bytes
        ;  past where this pass started, i.e. at the next middle index.
        SUB     r0, r0, #64
        B       oeetf_neon_lp1
oeetf_neon_end1
        LDR     PC, [r13],#4
        ENDP

  ;---------------------------------------------------------------------
  ; oc_enc_quantize_neon
  ;  In:  r0 = ogg_int16_t        _qdct[64]     (output)
  ;       r1 = const ogg_int16_t  _dct[64]
  ;       r2 = const ogg_int16_t  _dequant[64]
  ;       r3 = const void        *_enquant
  ;  Out: r0 = index of the last non-zero value written to _qdct, or 0 if
  ;             every value written is zero.
  ;  All four pointers are accessed with the @128 alignment qualifier.
  ;  _enquant is read 32 bytes at a time as eight multipliers followed by
  ;   eight shift counts for VSHL, twice per pass.
  ;  Clobbers: r1-r3, r12, flags, Q0, Q1, Q8-Q15.
  ;---------------------------------------------------------------------
oc_enc_quantize_neon PROC
        PUSH    {r4,r5,lr}
        ; The loop counter lives in the top half of lr and starts at -4
        ;  (lr = 0xFFFCFFFF), giving four passes of two rows each.
        MVN     lr, #0x00030000
oeq_neon_row_lp
        ; Load the next two rows of coefficients and of _dequant.
        VLD1.64 {D16,D17,D18,D19},[r1@128]!
        VLD1.64 {D20,D21,D22,D23},[r2@128]!
        ; Q0/Q1 = sign masks of the coefficients (0 or -1 per lane).
        ; The bias added below is half the _dequant value, carrying the
        ;  sign of the coefficient.
        ; The VHADD/VEOR pair only rounds correctly on the negated lanes
        ;  because the quantizers are all even (in fact multiples of four).
        VSHR.S16 Q0, Q8, #15
        VSHR.S16 Q1, Q9, #15
        VLD1.64 {D24,D25,D26,D27},[r3@128]!     ; Q12 = multipliers, Q13 = shifts
        VHADD.S16 Q10,Q0, Q10
        VHADD.S16 Q11,Q1, Q11
        VLD1.64 {D28,D29,D30,D31},[r3@128]!     ; Q14 = multipliers, Q15 = shifts
        ; Advance the counter.
        ; Nothing else writes the flags before the BLT at the bottom.
        ADDS    lr, lr, #1<<16
        VEOR.S16 Q10,Q0, Q10
        VEOR.S16 Q11,Q1, Q11
        VADD.S16 Q8, Q8, Q10
        VADD.S16 Q9, Q9, Q11
        ; Divide: add the high half of the doubled product to twice the
        ;  biased value, shift by the per-coefficient count, then subtract
        ;  the sign mask (which adds one on the negative lanes).
        VQDMULH.S16 Q12,Q8, Q12
        VQDMULH.S16 Q14,Q9, Q14
        VADD.S16 Q8, Q8, Q8
        VADD.S16 Q9, Q9, Q9
        VADD.S16 Q8, Q8, Q12
        VADD.S16 Q9, Q9, Q14
        VSHL.S16 Q8, Q8, Q13
        VSHL.S16 Q9, Q9, Q15
        VSUB.S16 Q8, Q8, Q0
        VSUB.S16 Q9, Q9, Q1
        VST1.64 {D16,D17,D18,D19},[r0@128]!
        ; Build a 16-bit field with one bit per coefficient of these two
        ;  rows.
        ; A bit is SET where the quantized value is ZERO; the field is
        ;  inverted by the MVNS instructions after the loop.
        ; The saturating narrow keeps non-zero values non-zero.
        VQMOVN.S16 D16,Q8
        VQMOVN.S16 D17,Q9
        VCEQ.S8 Q8, #0
        ; NEON has no PMOVMSKB, so the bytes are packed by hand.
        VNEG.S8 Q8, Q8          ; D16=.......3.......2.......1.......0
                                ;     .......7.......6.......5.......4
                                ; D17=.......B.......A.......9.......8
                                ;     .......F.......E.......D.......C
        VZIP.8  D16,D17         ; D16=.......9.......1.......8.......0
                                ;     .......B.......3.......A.......2
                                ; D17=.......D.......5.......C.......4
                                ;     .......F.......7.......E.......6
        VSLI.8  D16,D17,#4      ; D16=...D...9...5...1...C...8...4...0
                                ;     ...F...B...7...3...E...A...6...2
        ; Finish compacting the field transferred at the end of the
        ;  PREVIOUS pass (r4/r5) and shift it into the ip/lr history.
        ; On the first pass r4, r5 and ip hold stale values; those bits
        ;  have been shifted out again by the time the loop exits.
        ORR     r4, r4, r5, LSL #2      ; r4 =.F.D.B.9.7.5.3.1.E.C.A.8.6.4.2.0
        ORR     r4, r4, r4, LSR #15     ; r4 =.F.D.B.9.7.5.3.1FEDCBA9876543210
        PKHTB   lr, lr, ip, ASR #16     ; lr = i|A
        PKHBT   ip, r4, ip, LSL #16     ; ip = B|C
        ; Start the transfer of this pass's field; it is consumed next
        ;  time round, or after the loop for the last pass.
        VMOV    r4, r5, D16
        BLT     oeq_neon_row_lp
        ; Coefficients 0-31 first, while the last NEON transfer completes:
        ;  invert so set bits mean non-zero, then take the highest set bit.
        PKHBT   r0, lr, ip              ; r0 = B|A
        MVNS    r0, r0
        CLZNE   r0, r0
        RSBNE   r0, r0, #31
        ; Stall 8-10 more cycles waiting for the last transfer, then
        ;  compact the final field.
        ORR     r4, r4, r5, LSL #2      ; r4 =.F.D.B.9.7.5.3.1.E.C.A.8.6.4.2.0
        ORR     r4, r4, r4, LSR #15     ; r4 =.F.D.B.9.7.5.3.1FEDCBA9876543210
        ; Coefficients 32-63: a non-zero entry here replaces the result
        ;  from the low half.
        PKHBT   r1, ip, r4, LSL #16     ; r1 = D|C
        MVNS    r1, r1
        CLZNE   r1, r1
        RSBNE   r0, r1, #63
        POP     {r4,r5,pc}
        ENDP

  ; Closes the conditional opened by "[ OC_ARM_ASM_NEON".
 ]

        END