123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146 |
- diff -urN -x '*.orig' -x '*.rej' -x '*~' -x '.*' OpenJPEG.orig/libopenjpeg/t1.c OpenJPEG.patched/libopenjpeg/t1.c
- --- OpenJPEG.orig/libopenjpeg/t1.c 2007-11-13 13:52:05.000000000 -0600
- +++ OpenJPEG.patched/libopenjpeg/t1.c 2007-11-14 01:09:40.000000000 -0600
- @@ -33,6 +33,17 @@
- #include "opj_includes.h"
- #include "t1_luts.h"
-
- +/* Don't use MMX on amd64 */
- +/* Note that merely including mmintrin.h, even if we don't use it, changes the code gcc */
- +/* outputs on amd64, and it is measurably slower. A bug in gcc? */
- +#ifdef __amd64__
- +#undef __MMX__
- +#endif
- +
- +#ifdef __MMX__
- +#include <mmintrin.h>
- +#endif
- +
- /** @defgroup T1 T1 - Implementation of the tier-1 coding */
- /*@{*/
-
- @@ -45,7 +56,7 @@
- static char t1_getspb(int f);
- static short t1_getnmsedec_sig(int x, int bitpos);
- static short t1_getnmsedec_ref(int x, int bitpos);
- -#ifdef __amd64__
- +#if defined(__amd64__) || defined(__MMX__)
- static INLINE void t1_updateflags(flag_t *flagsp, int s, int stride);
- #else
- static void t1_updateflags(flag_t *flagsp, int s, int stride);
- @@ -293,6 +304,32 @@
- }
-
- #else
- +#ifdef __MMX__
- +
- +static void t1_updateflags(flag_t *flagsp, int s, int stride) {
- + static const __v4hi mod[] = {
- + {T1_SIG_SE, T1_SIG_E, T1_SIG_NE, 0},
- + {T1_SIG_SE, T1_SIG_E|T1_SGN_E, T1_SIG_NE, 0},
- + {T1_SIG_S, T1_SIG, T1_SIG_N, 0},
- + {T1_SIG_S|T1_SGN_S, T1_SIG, T1_SIG_N|T1_SGN_N, 0},
- + {T1_SIG_SW, T1_SIG_W, T1_SIG_NW, 0},
- + {T1_SIG_SW, T1_SIG_W|T1_SGN_W, T1_SIG_NW, 0}
- + };
- +
- + __m64 tmp1 = *(__m64*)((void*)&flagsp[-1 - stride]);
- + __m64 tmp2 = *(__m64*)((void*)&flagsp[-1 ]);
- + __m64 tmp3 = *(__m64*)((void*)&flagsp[-1 + stride]);
- +
- + tmp1 = _mm_or_si64(tmp1, mod[s]);
- + tmp2 = _mm_or_si64(tmp2, mod[s+2]);
- + tmp3 = _mm_or_si64(tmp3, mod[s+4]);
- +
- + *(__m64*)((void*)&flagsp[-1 - stride]) = tmp1;
- + *(__m64*)((void*)&flagsp[-1 ]) = tmp2;
- + *(__m64*)((void*)&flagsp[-1 + stride]) = tmp3;
- +}
- +
- +#else
-
- static void t1_updateflags(flag_t *flagsp, int s, int stride) {
- static const flag_t mod[] = {
- @@ -316,6 +353,7 @@
- }
-
- #endif
- +#endif
-
- static void t1_enc_sigpass_step(
- opj_t1_t *t1,
- @@ -720,18 +758,14 @@
- | ((int64)(T1_SIG | T1_VISIT | T1_SIG_OTH)<<48);
- agg = !tmp;
- #else
- + int* flagsp = (int*)&t1->flags[(k+1) + (i+1)*(t1->h+2)];
- + agg = flagsp[1];
- if (cblksty & J2K_CCP_CBLKSTY_VSC) {
- - agg = !(t1->flags[(k+1) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
- - || t1->flags[(k+2) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
- - || t1->flags[(k+3) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
- - || (t1->flags[(k+4) + (i+1)*(t1->h+2)]
- - & (~(T1_SIG_S | T1_SIG_SE | T1_SIG_SW | T1_SGN_S))) & (T1_SIG | T1_VISIT | T1_SIG_OTH));
- - } else {
- - agg = !(t1->flags[(k+1) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
- - || t1->flags[(k+2) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
- - || t1->flags[(k+3) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH)
- - || t1->flags[(k+4) + (i+1)*(t1->h+2)] & (T1_SIG | T1_VISIT | T1_SIG_OTH));
- + agg &= ~((T1_SIG_S|T1_SIG_SE|T1_SIG_SW|T1_SGN_S)<<16);
- }
- + agg |= flagsp[0];
- + agg &= (T1_SIG|T1_VISIT|T1_SIG_OTH)|(T1_SIG|T1_VISIT|T1_SIG_OTH)<<16;
- + agg = !agg;
- #endif
- } else {
- agg = 0;
- @@ -820,7 +854,7 @@
- memset(t1->data,0,datasize * sizeof(int));
-
- flagssize=(h+2) * (w+2);
- -#ifdef __amd64__
- +#if defined(__amd64__) || defined(__MMX__)
- /* 64 bit SIMD/SWAR in t1_updateflags requires one short of headroom
- because three shorts = 48 bits. */
- ++flagssize;
- @@ -886,6 +920,9 @@
- int correction = 3;
- type = ((bpno < (cblk->numbps - 4)) && (passtype < 2) && (cblksty & J2K_CCP_CBLKSTY_LAZY)) ? T1_TYPE_RAW : T1_TYPE_MQ;
-
- +#if !defined(__amd64__) && defined(__MMX__)
- + _mm_empty();
- +#endif
- switch (passtype) {
- case 0:
- t1_enc_sigpass(t1, bpno, orient, &nmsedec, type, cblksty);
- @@ -900,6 +937,9 @@
- mqc_segmark_enc(mqc);
- break;
- }
- +#if !defined(__amd64__) && defined(__MMX__)
- + _mm_empty();
- +#endif
-
- /* fixed_quality */
- cumwmsedec += t1_getwmsedec(nmsedec, compno, level, orient, bpno, qmfbid, stepsize, numcomps);
- @@ -1004,6 +1044,9 @@
- mqc_setstate(mqc, T1_CTXNO_AGG, 0, 3);
- mqc_setstate(mqc, T1_CTXNO_ZC, 0, 4);
-
- +#if !defined(__amd64__) && defined(__MMX__)
- + _mm_empty();
- +#endif
- for (segno = 0; segno < cblk->numsegs; ++segno) {
- opj_tcd_seg_t *seg = &cblk->segs[segno];
-
- @@ -1044,6 +1087,9 @@
- }
- }
- }
- +#if !defined(__amd64__) && defined(__MMX__)
- + _mm_empty();
- +#endif
- }
-
- /* ----------------------------------------------------------------------- */
|