123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163 |
- ;********************************************************************
- ;* *
- ;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- ;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- ;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- ;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- ;* *
- ;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
- ;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- ;* *
- ;********************************************************************
- ;
- ; function: ARM NEON quantization table setup and coefficient quantization
- ; last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
- ;
- ;********************************************************************
- AREA |.text|, CODE, READONLY
- GET armopts.s
- [ OC_ARM_ASM_NEON
- EXPORT oc_enc_enquant_table_init_neon
- EXPORT oc_enc_enquant_table_fixup_neon
- EXPORT oc_enc_quantize_neon
- oc_enc_enquant_table_init_neon PROC
- ; Builds the quantization table with the C routine, then rewrites its
- ; 256 bytes in place into the row layout the NEON quantizer reads.
- ; r0 = void *_enquant
- ; r1 = const ogg_uint16_t _dequant[64]
- ; Clobbers r0, r1, D16-D31, the flags, and whatever the C routine clobbers.
- ; The C routine is defined outside this file, so declare it explicitly;
- ; armasm does not accept a branch to an undeclared external symbol.
- IMPORT oc_enc_enquant_table_init_c
- ; Keep _enquant across the call (r0 is not preserved by the callee) along
- ; with the return address; pushing two words moves r13 by a multiple of 8.
- STMFD r13!,{r0,r14}
- ; Initialize the table using the C routine
- BLX oc_enc_enquant_table_init_c
- ; Pop _enquant only; the return address stays on the stack for the exit.
- LDR r0, [r13],#4
- ; Two passes of 128 bytes each cover the whole table.
- MOV r1, #2
- ; Now partially de-interleave it, so that the first row is all
- ; multipliers, the second row is all shift factors, etc.
- ; Also, negate the shifts for use by VSHL.
- oeeti_neon_lp
- ; The NEON instructions below leave the flags alone, so the result of this
- ; SUBS is still valid at the BNE.
- SUBS r1, r1, #1
- ; Load without writeback: the same 128 bytes are stored back below.
- VLDMIA r0, {D16-D31}
- ; Each VUZP.16 splits a register pair into even-indexed halfwords (first
- ; register) and odd-indexed halfwords (second register); the second
- ; register is then negated.
- VUZP.16 Q8, Q9
- VNEG.S16 Q9, Q9
- VUZP.16 Q10,Q11
- VNEG.S16 Q11,Q11
- VUZP.16 Q12,Q13
- VNEG.S16 Q13,Q13
- VUZP.16 Q14,Q15
- VNEG.S16 Q15,Q15
- ; Store in place and advance r0 to the next 128 bytes.
- VSTMIA r0!,{D16-D31}
- BNE oeeti_neon_lp
- ; Return by popping the saved r14 straight into PC.
- LDR PC, [r13],#4
- ENDP
- oc_enc_enquant_table_fixup_neon PROC
- ; Copies the halfwords at byte offsets 0 and 16 of one quantization table
- ; (its first multiplier and first shift in the NEON row layout) into the
- ; table for the next quantizer index. This is done for both pointers of
- ; each pair, in each of the three 24-byte pointer groups, and repeated
- ; _nqis-1 times.
- ; r0 = void *_enquant[3][3][2]
- ; r1 = int _nqis
- ; Clobbers r0-r3, r12 and the flags. Does nothing when _nqis <= 1.
- STR r14,[r13,#-4]! ; r14 is reused as the inner loop counter
- oeetf_neon_lp1
- ; One outer pass per quantizer index after the first. BLE rather than BEQ
- ; so that _nqis <= 0 returns immediately instead of decrementing past
- ; zero and storing through pointers read from beyond the array.
- SUBS r1, r1, #1
- BLE oeetf_neon_end1
- MOV r14,#3 ; three pointer groups per pass
- oeetf_neon_lp2
- LDR r2, [r0] ; source table, first pointer of the current pair
- ; None of the loads, stores or the ADD below touch the flags, so this
- ; SUBS feeds the BNE at the bottom of the loop.
- SUBS r14,r14,#1
- LDRH r3, [r2]
- LDRH r12,[r2,#16]
- LDR r2, [r0,#8] ; destination: first pointer of the next pair
- STRH r3, [r2]
- STRH r12,[r2,#16]
- LDR r2, [r0,#4] ; source table, second pointer of the current pair
- LDRH r3, [r2]
- LDRH r12,[r2,#16]
- LDR r2, [r0,#12] ; destination: second pointer of the next pair
- ADD r0, r0, #24 ; step to the next 24-byte group
- STRH r3, [r2]
- STRH r12,[r2,#16]
- BNE oeetf_neon_lp2
- ; r0 has advanced 3*24 = 72 bytes; backing up 64 leaves it 8 bytes past
- ; where this pass started, i.e. on the pair that was just written.
- SUB r0, r0, #64
- B oeetf_neon_lp1
- oeetf_neon_end1
- LDR PC, [r13],#4 ; pop the saved return address into PC
- ENDP
- oc_enc_quantize_neon PROC ; Quantizes 64 coefficients and returns the highest index holding a non-zero result.
- ; r0 = ogg_int16_t _qdct[64] (output; on return r0 = that index, or 0 if every result is zero)
- ; r1 = const ogg_int16_t _dct[64] (input coefficients)
- ; r2 = const ogg_int16_t _dequant[64] (quantizers; used here only to form the rounding bias)
- ; r3 = const void *_enquant (read as 128-bit rows: multipliers, then shift counts for VSHL, alternating)
- STMFD r13!,{r4,r5,r14} ; restored by the LDMFD at the end, which also returns
- ; The loop counter goes in the high half of r14; it starts at -4 and counts up to 0, so the loop runs four times.
- MOV r14,#0xFFFCFFFF
- oeq_neon_lp
- ; Load the next two rows of the data and the quant matrices (the @128 qualifiers assert 16-byte alignment).
- VLD1.64 {D16,D17,D18,D19},[r1@128]!
- VLD1.64 {D20,D21,D22,D23},[r2@128]!
- ; Add in the signed rounding bias from the quantizers.
- ; Note that the VHADD relies on the fact that the quantizers are all
- ; even (they're in fact multiples of four) in order to round correctly
- ; on the entries being negated.
- VSHR.S16 Q0, Q8, #15 ; Q0 = sign mask of the first row: -1 where the input is negative, else 0
- VSHR.S16 Q1, Q9, #15 ; Q1 = sign mask of the second row
- VLD1.64 {D24,D25,D26,D27},[r3@128]! ; Q12 = multipliers, Q13 = shift counts for the first row
- VHADD.S16 Q10,Q0, Q10 ; (quantizer + mask) >> 1
- VHADD.S16 Q11,Q1, Q11
- VLD1.64 {D28,D29,D30,D31},[r3@128]! ; Q14 = multipliers, Q15 = shift counts for the second row
- ADDS r14,r14,#1<<16 ; bump the counter; nothing below alters the flags before the BLT
- VEOR.S16 Q10,Q0, Q10 ; XOR with the mask turns the halved value into -quantizer/2 for negative inputs
- VEOR.S16 Q11,Q1, Q11
- VADD.S16 Q8, Q8, Q10 ; input + signed bias
- VADD.S16 Q9, Q9, Q11
- ; Perform the actual division and save the result.
- VQDMULH.S16 Q12,Q8, Q12 ; high half of the doubled product with the multiplier
- VQDMULH.S16 Q14,Q9, Q14
- VADD.S16 Q8, Q8, Q8 ; double the biased input to match the doubled product
- VADD.S16 Q9, Q9, Q9
- VADD.S16 Q8, Q8, Q12
- VADD.S16 Q9, Q9, Q14
- VSHL.S16 Q8, Q13 ; per-lane shift by the table's counts (a negative count shifts right)
- VSHL.S16 Q9, Q15
- VSUB.S16 Q8, Q8, Q0 ; subtracting the mask adds 1 where the input was negative
- VSUB.S16 Q9, Q9, Q1
- VST1.64 {D16,D17,D18,D19},[r0@128]!
- ; Now pull out a bitfield marking the non-zero coefficients (built with a 1 per zero coefficient; inverted after the loop).
- VQMOVN.S16 D16,Q8 ; saturating narrow: a non-zero halfword stays a non-zero byte
- VQMOVN.S16 D17,Q9
- VCEQ.S8 Q8, #0 ; 0xFF in every byte whose coefficient is zero
- ; Sadly, NEON has no PMOVMSKB; emulating it requires 6 instructions.
- VNEG.S8 Q8, Q8 ; D16=.......3.......2.......1.......0
- ; .......7.......6.......5.......4
- ; D17=.......B.......A.......9.......8
- ; .......F.......E.......D.......C
- VZIP.8 D16,D17 ; D16=.......9.......1.......8.......0
- ; .......B.......3.......A.......2
- ; D17=.......D.......5.......C.......4
- ; .......F.......7.......E.......6
- VSLI.8 D16,D17,#4 ; D16=...D...9...5...1...C...8...4...0
- ; ...F...B...7...3...E...A...6...2
- ; Shift over the bitfields from previous iterations and
- ; finish compacting the bitfield from the last iteration. On the first pass r4, r5 and r12 are read before being written; that junk is shifted out before the loop ends.
- ORR r4, r4, r5, LSL #2 ; r4 =.F.D.B.9.7.5.3.1.E.C.A.8.6.4.2.0
- ORR r4, r4, r4, LSR #15 ; r4 =.F.D.B.9.7.5.3.1FEDCBA9876543210
- PKHTB r14,r14,r12,ASR #16 ; r14=i|A
- PKHBT r12,r4, r12,LSL #16 ; r12=B|C
- VMOV r4, r5, D16 ; fetch this iteration's raw bitfield; it is compacted on the next pass or after the loop
- BLT oeq_neon_lp ; loop while the counter from the ADDS above is still negative
- ; Start with the low half while the NEON register transfers.
- PKHBT r0, r14,r12 ; r0 =B|A
- MVNS r0, r0 ; invert so set bits mark non-zero coefficients; Z set if there are none in 0..31
- CLZNE r0, r0
- RSBNE r0, r0, #31 ; r0 = index of the highest set bit (left at 0 when none is set)
- ; Stall 8-10 more cycles waiting for the last transfer.
- ORR r4, r4, r5, LSL #2 ; r4 =.F.D.B.9.7.5.3.1.E.C.A.8.6.4.2.0
- ORR r4, r4, r4, LSR #15 ; r4 =.F.D.B.9.7.5.3.1FEDCBA9876543210
- PKHBT r1, r12,r4, LSL #16 ; r1 = D|C
- MVNS r1, r1 ; same inversion for coefficients 32..63
- CLZNE r1, r1
- RSBNE r0, r1, #63 ; a non-zero coefficient in the upper half overrides the lower-half result
- LDMFD r13!,{r4,r5,PC} ; restore r4, r5 and return with the result in r0
- ENDP
- ]
- END
|