x86enquant.c 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. /********************************************************************
  2. * *
  3. * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
  4. * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
  5. * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
  6. * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
  7. * *
  8. * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
  9. * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  10. * *
  11. ********************************************************************
  12. function:
  13. last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
  14. ********************************************************************/
  15. #include "x86enc.h"
  16. #if defined(OC_X86_ASM)
  17. /*The default enquant table is not quite suitable for SIMD purposes.
  18. First, the m and l parameters need to be separated so that an entire row full
  19. of m's or l's can be loaded at a time.
  20. Second, x86 SIMD has no element-wise arithmetic right-shift, so we have to
  21. emulate one with a multiply.
  22. Therefore we translate the shift count into a scale factor.*/
  23. void oc_enc_enquant_table_init_x86(void *_enquant,
  24. const ogg_uint16_t _dequant[64]){
  25. ogg_int16_t *m;
  26. ogg_int16_t *l;
  27. int zzi;
  28. m=(ogg_int16_t *)_enquant;
  29. l=m+64;
  30. for(zzi=0;zzi<64;zzi++){
  31. oc_iquant q;
  32. oc_iquant_init(&q,_dequant[zzi]);
  33. m[zzi]=q.m;
  34. /*q.l must be at least 2 for this to work; fortunately, once all the scale
  35. factors are baked in, the minimum quantizer is much larger than that.*/
  36. l[zzi]=1<<16-q.l;
  37. }
  38. }
  39. void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
  40. int pli;
  41. int qii;
  42. int qti;
  43. for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
  44. ((ogg_int16_t *)_enquant[pli][qii][qti])[0]=
  45. ((ogg_int16_t *)_enquant[pli][0][qti])[0];
  46. ((ogg_int16_t *)_enquant[pli][qii][qti])[64]=
  47. ((ogg_int16_t *)_enquant[pli][0][qti])[64];
  48. }
  49. }
  50. int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
  51. const ogg_uint16_t _dequant[64],const void *_enquant){
  52. ptrdiff_t r;
  53. __asm__ __volatile__(
  54. "xor %[r],%[r]\n\t"
  55. /*Loop through two rows at a time.*/
  56. ".p2align 4\n\t"
  57. "0:\n\t"
  58. /*Load the first two rows of the data and the quant matrices.*/
  59. "movdqa 0x00(%[dct],%[r]),%%xmm0\n\t"
  60. "movdqa 0x10(%[dct],%[r]),%%xmm1\n\t"
  61. "movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
  62. "movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
  63. "movdqa 0x00(%[q],%[r]),%%xmm4\n\t"
  64. "movdqa 0x10(%[q],%[r]),%%xmm5\n\t"
  65. /*Double the input and propagate its sign to the rounding factor.
  66. Using SSSE3's psignw would help here, but we need the mask later anyway.*/
  67. "movdqa %%xmm0,%%xmm6\n\t"
  68. "psraw $15,%%xmm0\n\t"
  69. "movdqa %%xmm1,%%xmm7\n\t"
  70. "paddw %%xmm6,%%xmm6\n\t"
  71. "psraw $15,%%xmm1\n\t"
  72. "paddw %%xmm7,%%xmm7\n\t"
  73. "paddw %%xmm0,%%xmm2\n\t"
  74. "paddw %%xmm1,%%xmm3\n\t"
  75. "pxor %%xmm0,%%xmm2\n\t"
  76. "pxor %%xmm1,%%xmm3\n\t"
  77. /*Add the rounding factor and perform the first multiply.*/
  78. "paddw %%xmm2,%%xmm6\n\t"
  79. "paddw %%xmm3,%%xmm7\n\t"
  80. "pmulhw %%xmm6,%%xmm4\n\t"
  81. "pmulhw %%xmm7,%%xmm5\n\t"
  82. "movdqa 0x80(%[q],%[r]),%%xmm2\n\t"
  83. "movdqa 0x90(%[q],%[r]),%%xmm3\n\t"
  84. "paddw %%xmm4,%%xmm6\n\t"
  85. "paddw %%xmm5,%%xmm7\n\t"
  86. /*Emulate an element-wise right-shift via a second multiply.*/
  87. "pmulhw %%xmm2,%%xmm6\n\t"
  88. "pmulhw %%xmm3,%%xmm7\n\t"
  89. "add $32,%[r]\n\t"
  90. "cmp $96,%[r]\n\t"
  91. /*Correct for the sign.*/
  92. "psubw %%xmm0,%%xmm6\n\t"
  93. "psubw %%xmm1,%%xmm7\n\t"
  94. /*Save the result.*/
  95. "movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t"
  96. "movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t"
  97. "jle 0b\n\t"
  98. /*Now find the location of the last non-zero value.*/
  99. "movdqa 0x50(%[qdct]),%%xmm5\n\t"
  100. "movdqa 0x40(%[qdct]),%%xmm4\n\t"
  101. "packsswb %%xmm7,%%xmm6\n\t"
  102. "packsswb %%xmm5,%%xmm4\n\t"
  103. "pxor %%xmm0,%%xmm0\n\t"
  104. "mov $-1,%k[dq]\n\t"
  105. "pcmpeqb %%xmm0,%%xmm6\n\t"
  106. "pcmpeqb %%xmm0,%%xmm4\n\t"
  107. "pmovmskb %%xmm6,%k[q]\n\t"
  108. "pmovmskb %%xmm4,%k[r]\n\t"
  109. "shl $16,%k[q]\n\t"
  110. "or %k[r],%k[q]\n\t"
  111. "mov $32,%[r]\n\t"
  112. /*We have to use xor here instead of not in order to set the flags.*/
  113. "xor %k[dq],%k[q]\n\t"
  114. "jnz 1f\n\t"
  115. "movdqa 0x30(%[qdct]),%%xmm7\n\t"
  116. "movdqa 0x20(%[qdct]),%%xmm6\n\t"
  117. "movdqa 0x10(%[qdct]),%%xmm5\n\t"
  118. "movdqa 0x00(%[qdct]),%%xmm4\n\t"
  119. "packsswb %%xmm7,%%xmm6\n\t"
  120. "packsswb %%xmm5,%%xmm4\n\t"
  121. "pcmpeqb %%xmm0,%%xmm6\n\t"
  122. "pcmpeqb %%xmm0,%%xmm4\n\t"
  123. "pmovmskb %%xmm6,%k[q]\n\t"
  124. "pmovmskb %%xmm4,%k[r]\n\t"
  125. "shl $16,%k[q]\n\t"
  126. "or %k[r],%k[q]\n\t"
  127. "xor %[r],%[r]\n\t"
  128. "not %k[q]\n\t"
  129. "or $1,%k[q]\n\t"
  130. "1:\n\t"
  131. "bsr %k[q],%k[q]\n\t"
  132. "add %k[q],%k[r]\n\t"
  133. :[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
  134. :[dct]"r"(_dct),[qdct]"r"(_qdct)
  135. :"cc","memory"
  136. );
  137. return (int)r;
  138. }
  139. #endif