mmxstate.c 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. /********************************************************************
  2. * *
  3. * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
  4. * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
  5. * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
  6. * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
  7. * *
  8. * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
  9. * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  10. * *
  11. ********************************************************************
  12. function:
  13. last mod: $Id$
  14. ********************************************************************/
  15. /*MMX acceleration of complete fragment reconstruction algorithm.
  16. Originally written by Rudolf Marek.*/
  17. #include <string.h>
  18. #include "x86int.h"
  19. #include "mmxloop.h"
  20. #if defined(OC_X86_ASM)
  21. void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
  22. int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
  23. unsigned char *dst;
  24. ptrdiff_t frag_buf_off;
  25. int ystride;
  26. int refi;
  27. /*Apply the inverse transform.*/
  28. /*Special case only having a DC component.*/
  29. if(_last_zzi<2){
  30. /*Note that this value must be unsigned, to keep the __asm__ block from
  31. sign-extending it when it puts it in a register.*/
  32. ogg_uint16_t p;
  33. /*We round this dequant product (and not any of the others) because there's
  34. no iDCT rounding.*/
  35. p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
  36. /*Fill _dct_coeffs with p.*/
  37. __asm{
  38. #define Y eax
  39. #define P ecx
  40. mov Y,_dct_coeffs
  41. movzx P,p
  42. lea Y,[Y+128]
  43. /*mm0=0000 0000 0000 AAAA*/
  44. movd mm0,P
  45. /*mm0=0000 0000 AAAA AAAA*/
  46. punpcklwd mm0,mm0
  47. /*mm0=AAAA AAAA AAAA AAAA*/
  48. punpckldq mm0,mm0
  49. movq [Y],mm0
  50. movq [8+Y],mm0
  51. movq [16+Y],mm0
  52. movq [24+Y],mm0
  53. movq [32+Y],mm0
  54. movq [40+Y],mm0
  55. movq [48+Y],mm0
  56. movq [56+Y],mm0
  57. movq [64+Y],mm0
  58. movq [72+Y],mm0
  59. movq [80+Y],mm0
  60. movq [88+Y],mm0
  61. movq [96+Y],mm0
  62. movq [104+Y],mm0
  63. movq [112+Y],mm0
  64. movq [120+Y],mm0
  65. #undef Y
  66. #undef P
  67. }
  68. }
  69. else{
  70. /*Dequantize the DC coefficient.*/
  71. _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
  72. oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi);
  73. }
  74. /*Fill in the target buffer.*/
  75. frag_buf_off=_state->frag_buf_offs[_fragi];
  76. refi=_state->frags[_fragi].refi;
  77. ystride=_state->ref_ystride[_pli];
  78. dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
  79. if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
  80. else{
  81. const unsigned char *ref;
  82. int mvoffsets[2];
  83. ref=_state->ref_frame_data[refi]+frag_buf_off;
  84. if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
  85. _state->frag_mvs[_fragi])>1){
  86. oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
  87. _dct_coeffs+64);
  88. }
  89. else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
  90. }
  91. }
  /*We copy this entire function to inline the actual MMX routines so that we
     use only a single indirect call.*/
  /*Initializes the bounding values array for the MMX loop filter.
    Only the first 8 bytes of _bv are written; each is set to the low byte of
     ~(_flimit<<1).
    _bv:     The bounding values array to initialize.
    _flimit: The filter limit the stored byte is derived from.*/
  void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
    enum{
      OC_BV_FILL_BYTES=8
    };
    int fill;
    fill=~(_flimit<<1);
    /*memset() converts the fill value to unsigned char before storing it.*/
    memset(_bv,fill,OC_BV_FILL_BYTES);
  }
  97. /*Apply the loop filter to a given set of fragment rows in the given plane.
  98. The filter may be run on the bottom edge, affecting pixels in the next row of
  99. fragments, so this row also needs to be available.
  100. _bv: The bounding values array.
  101. _refi: The index of the frame buffer to filter.
  102. _pli: The color plane to filter.
  103. _fragy0: The Y coordinate of the first fragment row to filter.
  104. _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
  105. void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
  106. signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
  107. const oc_fragment_plane *fplane;
  108. const oc_fragment *frags;
  109. const ptrdiff_t *frag_buf_offs;
  110. unsigned char *ref_frame_data;
  111. ptrdiff_t fragi_top;
  112. ptrdiff_t fragi_bot;
  113. ptrdiff_t fragi0;
  114. ptrdiff_t fragi0_end;
  115. int ystride;
  116. int nhfrags;
  117. fplane=_state->fplanes+_pli;
  118. nhfrags=fplane->nhfrags;
  119. fragi_top=fplane->froffset;
  120. fragi_bot=fragi_top+fplane->nfrags;
  121. fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
  122. fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
  123. ystride=_state->ref_ystride[_pli];
  124. frags=_state->frags;
  125. frag_buf_offs=_state->frag_buf_offs;
  126. ref_frame_data=_state->ref_frame_data[_refi];
  127. /*The following loops are constructed somewhat non-intuitively on purpose.
  128. The main idea is: if a block boundary has at least one coded fragment on
  129. it, the filter is applied to it.
  130. However, the order that the filters are applied in matters, and VP3 chose
  131. the somewhat strange ordering used below.*/
  132. while(fragi0<fragi0_end){
  133. ptrdiff_t fragi;
  134. ptrdiff_t fragi_end;
  135. fragi=fragi0;
  136. fragi_end=fragi+nhfrags;
  137. while(fragi<fragi_end){
  138. if(frags[fragi].coded){
  139. unsigned char *ref;
  140. ref=ref_frame_data+frag_buf_offs[fragi];
  141. #define PIX eax
  142. #define YSTRIDE3 edi
  143. #define YSTRIDE ecx
  144. #define LL edx
  145. #define D esi
  146. #define D_WORD si
  147. if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv);
  148. if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv);
  149. if(fragi+1<fragi_end&&!frags[fragi+1].coded){
  150. OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv);
  151. }
  152. if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
  153. OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,_bv);
  154. }
  155. #undef PIX
  156. #undef YSTRIDE3
  157. #undef YSTRIDE
  158. #undef LL
  159. #undef D
  160. #undef D_WORD
  161. }
  162. fragi++;
  163. }
  164. fragi0+=nhfrags;
  165. }
  166. }
  167. #endif