openjpeg-svn470-t1-flags-mmx.patch 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. diff -urN -x '*.orig' -x '*.rej' -x '*~' -x '.*' OpenJPEG.orig/libopenjpeg/t1.c OpenJPEG.patched/libopenjpeg/t1.c
  2. --- OpenJPEG.orig/libopenjpeg/t1.c 2007-11-13 13:52:05.000000000 -0600
  3. +++ OpenJPEG.patched/libopenjpeg/t1.c 2007-11-14 01:09:40.000000000 -0600
  4. @@ -33,6 +33,17 @@
  5. #include "opj_includes.h"
  6. #include "t1_luts.h"
  7. +/* Don't use MMX on amd64: merely including mmintrin.h there, even if none of */
  8. +/* its intrinsics are used, changes the code gcc generates, and the result is */
  9. +/* measurably slower. This may be a bug in gcc. */
  10. +#ifdef __amd64__
  11. +#undef __MMX__
  12. +#endif
  13. +
  14. +#ifdef __MMX__
  15. +#include <mmintrin.h>
  16. +#endif
  17. +
  18. /** @defgroup T1 T1 - Implementation of the tier-1 coding */
  19. /*@{*/
  20. @@ -45,7 +56,7 @@
  21. static char t1_getspb(int f);
  22. static short t1_getnmsedec_sig(int x, int bitpos);
  23. static short t1_getnmsedec_ref(int x, int bitpos);
  24. -#ifdef __amd64__
  25. +#if defined(__amd64__) || defined(__MMX__)
  26. static INLINE void t1_updateflags(flag_t *flagsp, int s, int stride);
  27. #else
  28. static void t1_updateflags(flag_t *flagsp, int s, int stride);
  29. @@ -293,6 +304,32 @@
  30. }
  31. #else
  32. +#ifdef __MMX__
  33. +
  34. +static void t1_updateflags(flag_t *flagsp, int s, int stride) {
  35. + static const __v4hi mod[] = {
  36. + {T1_SIG_SE, T1_SIG_E, T1_SIG_NE, 0},
  37. + {T1_SIG_SE, T1_SIG_E|T1_SGN_E, T1_SIG_NE, 0},
  38. + {T1_SIG_S, T1_SIG, T1_SIG_N, 0},
  39. + {T1_SIG_S|T1_SGN_S, T1_SIG, T1_SIG_N|T1_SGN_N, 0},
  40. + {T1_SIG_SW, T1_SIG_W, T1_SIG_NW, 0},
  41. + {T1_SIG_SW, T1_SIG_W|T1_SGN_W, T1_SIG_NW, 0}
  42. + };
  43. +
  44. + __m64 tmp1 = *(__m64*)((void*)&flagsp[-1 - stride]);
  45. + __m64 tmp2 = *(__m64*)((void*)&flagsp[-1 ]);
  46. + __m64 tmp3 = *(__m64*)((void*)&flagsp[-1 + stride]);
  47. +
  48. + tmp1 = _mm_or_si64(tmp1, mod[s]);
  49. + tmp2 = _mm_or_si64(tmp2, mod[s+2]);
  50. + tmp3 = _mm_or_si64(tmp3, mod[s+4]);
  51. +
  52. + *(__m64*)((void*)&flagsp[-1 - stride]) = tmp1;
  53. + *(__m64*)((void*)&flagsp[-1 ]) = tmp2;
  54. + *(__m64*)((void*)&flagsp[-1 + stride]) = tmp3;
  55. +}
  56. +
  57. +#else
  58. static void t1_updateflags(flag_t *flagsp, int s, int stride) {
  59. static const flag_t mod[] = {
  60. @@ -316,6 +353,7 @@
  61. }
  62. #endif
  63. +#endif
  64. static void t1_enc_sigpass_step(
  65. opj_t1_t *t1,
  66. @@ -720,18 +758,14 @@
  67. | ((int64)(T1_SIG | T1_VISIT | T1_SIG_OTH)<<48);
  68. agg = !tmp;
  69. #else
  70. + int* flagsp = (int*)&t1->flags[(k+1) + (i+1)*(t1->h+2)];
  71. + agg = flagsp[1];
  72. if (cblksty & J2K_CCP_CBLKSTY_VSC) {
  73. - agg = !(t1->flags[(k+1) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
  74. - || t1->flags[(k+2) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
  75. - || t1->flags[(k+3) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
  76. - || (t1->flags[(k+4) + (i+1)*(t1->h+2)]
  77. - & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) & (T1_SIG | T1_VISIT | T1_SIG_OTH));
  78. - } else {
  79. - agg = !(t1->flags[(k+1) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
  80. - || t1->flags[(k+2) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
  81. - || t1->flags[(k+3) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
  82. - || t1->flags[(k+4) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH));
  83. + agg &= ~((T1_SIG_S|T1_SIG_SE|T1_SIG_SW|T1_SGN_S)<<16);
  84. }
  85. + agg |= flagsp[0];
  86. + agg &= (T1_SIG|T1_VISIT|T1_SIG_OTH)|(T1_SIG|T1_VISIT|T1_SIG_OTH)<<16;
  87. + agg = !agg;
  88. #endif
  89. } else {
  90. agg = 0;
  91. @@ -820,7 +854,7 @@
  92. memset(t1->data,0,datasize * sizeof(int));
  93. flagssize=(h+2) * (w+2);
  94. -#ifdef __amd64__
  95. +#if defined(__amd64__) || defined(__MMX__)
  96. /* 64 bit SIMD/SWAR in t1_updateflags requires one short of headroom
  97. because three shorts = 48 bits. */
  98. ++flagssize;
  99. @@ -886,6 +920,9 @@
  100. int correction = 3;
  101. type = ((bpno < (cblk->numbps - 4)) && (passtype < 2) && (cblksty & J2K_CCP_CBLKSTY_LAZY)) ? T1_TYPE_RAW : T1_TYPE_MQ;
  102. +#if !defined(__amd64__) && defined(__MMX__)
  103. + _mm_empty();
  104. +#endif
  105. switch (passtype) {
  106. case 0:
  107. t1_enc_sigpass(t1, bpno, orient, &nmsedec, type, cblksty);
  108. @@ -900,6 +937,9 @@
  109. mqc_segmark_enc(mqc);
  110. break;
  111. }
  112. +#if !defined(__amd64__) && defined(__MMX__)
  113. + _mm_empty();
  114. +#endif
  115. /* fixed_quality */
  116. cumwmsedec += t1_getwmsedec(nmsedec, compno, level, orient, bpno, qmfbid, stepsize, numcomps);
  117. @@ -1004,6 +1044,9 @@
  118. mqc_setstate(mqc, T1_CTXNO_AGG, 0, 3);
  119. mqc_setstate(mqc, T1_CTXNO_ZC, 0, 4);
  120. +#if !defined(__amd64__) && defined(__MMX__)
  121. + _mm_empty();
  122. +#endif
  123. for (segno = 0; segno < cblk->numsegs; ++segno) {
  124. opj_tcd_seg_t *seg = &cblk->segs[segno];
  125. @@ -1044,6 +1087,9 @@
  126. }
  127. }
  128. }
  129. +#if !defined(__amd64__) && defined(__MMX__)
  130. + _mm_empty();
  131. +#endif
  132. }
  133. /* ----------------------------------------------------------------------- */