videogamepreservation
/
allegiance
огледало от https://github.com/videogamepreservation/allegiance


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410
							/*
 * jdcolor.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains output colorspace conversion routines.
 */

#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"

#ifdef NIFTY

#include <math.h>

#define SCALE_PREC      5
#define SCALE_RND       (1 << (SCALE_PREC - 1))
#define SCALE           (1 << SCALE_PREC)
#define unscale(x)      (((x) + SCALE_RND) >> SCALE_PREC)
#define clip(x)         (((long)(x) & ~0xff) ? (((long)(x) < 0) ? 0 : 255) : (long)(x))

#endif


/* Private subobject */

typedef struct {
  struct jpeg_color_deconverter pub; /* public fields */

  /* Private state for YCC->RGB conversion */
  int * Cr_r_tab;		/* => table for Cr to R conversion */
  int * Cb_b_tab;		/* => table for Cb to B conversion */
  INT32 * Cr_g_tab;		/* => table for Cr to G conversion */
  INT32 * Cb_g_tab;		/* => table for Cb to G conversion */

#ifdef NIFTY
  /* Private state for the PhotoYCC->RGB conversion tables */
  coef_c1 *C1;
  coef_c2 *C2;
  short *xy;
#endif

} my_color_deconverter;


/* Added header info - CRK */
extern void MYCbCr2RGB(
  int columns,	  
  unsigned char *inY,
  unsigned char *inU,
  unsigned char *inV,
  unsigned char *outRGB);

extern void MYCbCrA2RGBA(
  int columns,	  
  unsigned char *inY,
  unsigned char *inU,
  unsigned char *inV,
  unsigned char *inA,
  unsigned char *outRGBA);

extern void MYCbCrA2RGBALegacy(
  int columns,	  
  unsigned char *inY,
  unsigned char *inU,
  unsigned char *inV,
  unsigned char *inA,
  unsigned char *outRGBA);
// These constants correspond to CCIR 601-1
// R = [256*Y + 359*(Cr-128)] / 256
// G = [256*Y - 88*(Cb-128) - 183*(Cr-128)] / 256
// B = [256*Y + 454*(Cb-128)] / 256
//Conventional floating point equations:
//	R = Y + 1.40200 * Cr
//	G = Y - 0.34414 * Cb - 0.71414 * Cr
//	B = Y + 1.77200 * Cb

//Ry=0100 Ru=0000 Rv=0167
//Gy=0100 Gu=FFA8 Gv=FF49
//By=0100 Bu=01C6 Bv=0000
// constants for YCbCr->RGB and YCbCrA->RGBA
static __int64 const_0		= 0x0000000000000000;
static __int64 const_sub128	= 0x0080008000800080;
static __int64 const_VUmul	= 0xFF49FFA8FF49FFA8;
static __int64 const_YVmul	= 0x0100016701000167;
static __int64 const_YUmul	= 0x010001C6010001C6;
static __int64 mask_highd	= 0xFFFFFFFF00000000;
static __int64 const_invert	= 0x00FFFFFF00FFFFFF;


//These constants correspond to the original FPX SDK
// R = [256*Y + 410*(Cr-128)] / 256
// G = [256*Y - 85*(Cb-128) - 205*(Cr-128)] / 256
// B = [256*Y + 512*(Cb-128)] / 256
//Conventional floating point equations:
// R = Y + 1.60000*(Cr)
// G = Y - 0.33333*(Cb) - 0.80000*(Cr)
// B = Y + 2.00000*(Cb)

//Ry=0100 Ru=0000 Rv=019A
//Gy=0100 Gu=FFAB Gv=FF33
//By=0100 Bu=0200 Bv=0000
// constants for YCbCr->RGB and YCbCrA->RGBA
//const __int64 const_0		= 0x0000000000000000;
//const __int64 const_sub128= 0x0080008000800080;
//const __int64 const_VUmul	= 0xFF33FFABFF33FFAB;
//const __int64 const_YVmul	= 0x0100019A0100019A;
//const __int64 const_YUmul	= 0x0001000200010002;
//const __int64 mask_highd	= 0xFFFFFFFF00000000;
//const __int64 const_invert= 0x00FFFFFF00FFFFFF;

/* End of added info - CRK */


typedef my_color_deconverter * my_cconvert_ptr;

#ifdef NIFTY

/*
 * Initialize tables for PhotoYCC->RGB colorspace conversion.
 */

LOCAL (void)
build_pycc_rgb_table (j_decompress_ptr cinfo)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
  INT32 i;

  cconvert->C1 = (coef_c1 *)
	(*cinfo->mem->alloc_small)((j_common_ptr) cinfo, JPOOL_IMAGE,
				   256 * SIZEOF(coef_c1));
  cconvert->C2 = (coef_c2 *)
	(*cinfo->mem->alloc_small)((j_common_ptr) cinfo, JPOOL_IMAGE,
				   256 * SIZEOF(coef_c2));
  cconvert->xy = (short *)
	(*cinfo->mem->alloc_small)((j_common_ptr) cinfo, JPOOL_IMAGE,
				   256 * SIZEOF(short));

  for (i = 0; i < 256; i++) {
    cconvert->xy[i] = (short)((double)i * 1.3584 * SCALE);
    cconvert->C2[i].r = (short)(i * 1.8215 * SCALE);
    cconvert->C1[i].g = (short)(i * -0.4303 * SCALE);
    cconvert->C2[i].g = (short)(i * -0.9271 * SCALE);
    cconvert->C1[i].b = (short)(i * 2.2179 * SCALE);
  }
}

/*
 * PhotoYCC->RGB colorspace conversion.
 */
METHODDEF (void)
pycc_rgb_convert (j_decompress_ptr cinfo,
                 JSAMPIMAGE input_buf, JDIMENSION input_row,
                 JSAMPARRAY output_buf, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
  register JSAMPROW inptr0, inptr1, inptr2;
  register JSAMPROW outptr;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->output_width;
  unsigned char y, c1, c2;
  short ri, gi, bi,
        offsetR, offsetG, offsetB;
  register short *xy = cconvert->xy;
  register coef_c1 *C1 = cconvert->C1;
  register coef_c2 *C2 = cconvert->C2;
 
/*
  for (i = 0; i < 256; i++) {
    xy[i] = (short)((double)i * 1.3584 * SCALE);
    C2[i].r = (short)(i * 1.8215 * SCALE);
    C1[i].g = (short)(i * -0.4303 * SCALE);
    C2[i].g = (short)(i * -0.9271 * SCALE);
    C1[i].b = (short)(i * 2.2179 * SCALE);
  }
*/
 
  offsetR = (short)(-249.55 * SCALE);
  offsetG = (short)( 194.14 * SCALE);
  offsetB = (short)(-345.99 * SCALE);
 
  while (--num_rows >= 0) {
    inptr0 = input_buf[0][input_row];
    inptr1 = input_buf[1][input_row];
    inptr2 = input_buf[2][input_row];
    input_row++;
    outptr = *output_buf++;
    for (col = 0; col < num_cols; col++) {
      y = GETJSAMPLE(inptr0[col]);
      c1 = GETJSAMPLE(inptr1[col]);
      c2 = GETJSAMPLE(inptr2[col]);
 
      ri = xy[y] + C2[c2].r + offsetR;
      gi = xy[y] + C1[c1].g + C2[c2].g + offsetG;
      bi = xy[y] + C1[c1].b + offsetB;
 
      ri = unscale(ri);
      gi = unscale(gi);
      bi = unscale(bi);
 
      outptr[RGB_RED] = (JSAMPLE)clip(ri);
      outptr[RGB_GREEN] = (JSAMPLE)clip(gi);
      outptr[RGB_BLUE] = (JSAMPLE)clip(bi);
      outptr+=3;
    }
  }
}


/*
 * PhotoYCC->RGBA colorspace conversion.
 */
METHODDEF (void)
pycc_rgba_convert (j_decompress_ptr cinfo,
                 JSAMPIMAGE input_buf, JDIMENSION input_row,
                 JSAMPARRAY output_buf, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
  register JSAMPROW inptr0, inptr1, inptr2;
  register JSAMPROW outptr;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->output_width;
  unsigned char y, c1, c2;
  short ri, gi, bi,
        offsetR, offsetG, offsetB;
  register short *xy = cconvert->xy;
  register coef_c1 *C1 = cconvert->C1;
  register coef_c2 *C2 = cconvert->C2;
 
  offsetR = (short)(-249.55 * SCALE);
  offsetG = (short)( 194.14 * SCALE);
  offsetB = (short)(-345.99 * SCALE);
 
  while (--num_rows >= 0) {
    inptr0 = input_buf[0][input_row];
    inptr1 = input_buf[1][input_row];
    inptr2 = input_buf[2][input_row];
    input_row++;
    outptr = *output_buf++;
    for (col = 0; col < num_cols; col++) {
      y = GETJSAMPLE(inptr0[col]);
      c1 = GETJSAMPLE(inptr1[col]);
      c2 = GETJSAMPLE(inptr2[col]);
 
      ri = xy[y] + C2[c2].r + offsetR;
      gi = xy[y] + C1[c1].g + C2[c2].g + offsetG;
      bi = xy[y] + C1[c1].b + offsetB;
 
      ri = unscale(ri);
      gi = unscale(gi);
      bi = unscale(bi);
 
      outptr[RGB_RED] = (JSAMPLE)clip(ri);
      outptr[RGB_GREEN] = (JSAMPLE)clip(gi);
      outptr[RGB_BLUE] = (JSAMPLE)clip(bi);
	  outptr[3] = 255;
      outptr+=4;
    }
  }
}

#endif

/**************** YCbCr -> RGB conversion: most common case **************/

/*
 * YCbCr is defined per CCIR 601-1, except that Cb and Cr are
 * normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5.
 * The conversion equations to be implemented are therefore
 *	R = Y                + 1.40200 * Cr
 *	G = Y - 0.34414 * Cb - 0.71414 * Cr
 *	B = Y + 1.77200 * Cb
 * where Cb and Cr represent the incoming values less CENTERJSAMPLE.
 * (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.)
 *
 * To avoid floating-point arithmetic, we represent the fractional constants
 * as integers scaled up by 2^16 (about 4 digits precision); we have to divide
 * the products by 2^16, with appropriate rounding, to get the correct answer.
 * Notice that Y, being an integral input, does not contribute any fraction
 * so it need not participate in the rounding.
 *
 * For even more speed, we avoid doing any multiplications in the inner loop
 * by precalculating the constants times Cb and Cr for all possible values.
 * For 8-bit JSAMPLEs this is very reasonable (only 256 entries per table);
 * for 12-bit samples it is still acceptable.  It's not very reasonable for
 * 16-bit samples, but if you want lossless storage you shouldn't be changing
 * colorspace anyway.
 * The Cr=>R and Cb=>B values can be rounded to integers in advance; the
 * values for the G calculation are left scaled up, since we must add them
 * together before rounding.
 */

#define SCALEBITS	16	/* speediest right-shift on some machines */
#define ONE_HALF	((INT32) 1 << (SCALEBITS-1))
#define FIX(x)		((INT32) ((x) * (1L<<SCALEBITS) + 0.5))


/*
 * Initialize tables for YCC->RGB colorspace conversion.
 */

LOCAL(void)
build_ycc_rgb_table (j_decompress_ptr cinfo)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  int i;
  INT32 x;
  SHIFT_TEMPS

  cconvert->Cr_r_tab = (int *)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
				(MAXJSAMPLE+1) * SIZEOF(int));
  cconvert->Cb_b_tab = (int *)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
				(MAXJSAMPLE+1) * SIZEOF(int));
  cconvert->Cr_g_tab = (INT32 *)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
				(MAXJSAMPLE+1) * SIZEOF(INT32));
  cconvert->Cb_g_tab = (INT32 *)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
				(MAXJSAMPLE+1) * SIZEOF(INT32));

  for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
    /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
    /* The Cb or Cr value we are thinking of is x = i - CENTERJSAMPLE */
    /* Cr=>R value is nearest int to 1.40200 * x */
    cconvert->Cr_r_tab[i] = (int)
		    RIGHT_SHIFT(FIX(1.40200) * x + ONE_HALF, SCALEBITS);
    /* Cb=>B value is nearest int to 1.77200 * x */
    cconvert->Cb_b_tab[i] = (int)
		    RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
    /* Cr=>G value is scaled-up -0.71414 * x */
    cconvert->Cr_g_tab[i] = (- FIX(0.71414)) * x;
    /* Cb=>G value is scaled-up -0.34414 * x */
    /* We also add in ONE_HALF so that need not do it in inner loop */
    cconvert->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF;
  }
}


/*
 * Convert some rows of samples to the output colorspace.
 *
 * Note that we change from noninterleaved, one-plane-per-component format
 * to interleaved-pixel format.  The output buffer is therefore three times
 * as wide as the input buffer.
 * A starting row offset is provided only for the input buffer.  The caller
 * can easily adjust the passed output_buf value to accommodate any row
 * offset required on that side.
 */


METHODDEF(void)
ycc_rgb_convert (j_decompress_ptr cinfo,
		 JSAMPIMAGE input_buf, JDIMENSION input_row,
		 JSAMPARRAY output_buf, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int y, cb, cr;
  register JSAMPROW outptr;
  register JSAMPROW inptr0, inptr1, inptr2;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->output_width;
// Alignment variables - CRK
  JDIMENSION tail_cols = num_cols&7;
  JDIMENSION mmx_cols=num_cols&~7;
  /* copy these pointers into registers if possible */
  register JSAMPLE * range_limit = cinfo->sample_range_limit;
  register int * Crrtab = cconvert->Cr_r_tab;
  register int * Cbbtab = cconvert->Cb_b_tab;
  register INT32 * Crgtab = cconvert->Cr_g_tab;
  register INT32 * Cbgtab = cconvert->Cb_g_tab;
  SHIFT_TEMPS
#ifdef _X86_  
  if(vfMMXMachine) { //MMX Code - CRK
	while (--num_rows >= 0) {
		inptr0 = input_buf[0][input_row];
		inptr1 = input_buf[1][input_row];
		inptr2 = input_buf[2][input_row];
		input_row++;
		outptr = *output_buf++;
		MYCbCr2RGB(mmx_cols, inptr0, inptr1, inptr2, outptr);
		
		outptr += 3*mmx_cols;
		for (col = mmx_cols; col < num_cols; col++) {
		  y  = GETJSAMPLE(inptr0[col]);
		  cb = GETJSAMPLE(inptr1[col]);
		  cr = GETJSAMPLE(inptr2[col]);
		  /* Range-limiting is essential due to noise introduced by DCT losses. */
		  outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
		  outptr[RGB_GREEN] = range_limit[y +
				      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
							 SCALEBITS))];
		  outptr[RGB_BLUE] =  range_limit[y + Cbbtab[cb]];
		  outptr += RGB_PIXELSIZE;
		}
	}
__asm emms	
  }
#else
    if (0) { }
#endif    
  else {
	while	(--num_rows >= 0) {
		inptr0 = input_buf[0][input_row];
		inptr1 = input_buf[1][input_row];
		inptr2 = input_buf[2][input_row];
		input_row++;
		outptr = *output_buf++;

		for (col = 0; col < num_cols; col++) {
		  y  = GETJSAMPLE(inptr0[col]);
		  cb = GETJSAMPLE(inptr1[col]);
		  cr = GETJSAMPLE(inptr2[col]);
		  /* Range-limiting is essential due to noise introduced by DCT losses. */
		  outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
		  outptr[RGB_GREEN] = range_limit[y +
				      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
							 SCALEBITS))];
		  outptr[RGB_BLUE] =  range_limit[y + Cbbtab[cb]];
		  outptr += RGB_PIXELSIZE;
		}	
	}
  }
}


/**************** Cases other than YCbCr -> RGB **************/


/*
 * Color conversion for grayscale->RGB:
 * Single input value for Y gets copied into RGB.
 * Need to do this here so that the color quantizing will work.
 */

METHODDEF(void)
grayscale_RGB_convert (j_decompress_ptr cinfo,
		   JSAMPIMAGE input_buf, JDIMENSION input_row,
		   JSAMPARRAY output_buf, int num_rows)
{
    register int y;
    register JSAMPROW outptr;
    register JSAMPROW inptr0;
    register JDIMENSION col;
    JDIMENSION num_cols = cinfo->output_width;
    // Alignment variables - CRK
    /* copy these pointers into registers if possible */
    while	(--num_rows >= 0)
    {
	inptr0 = input_buf[0][input_row];
	input_row++;
	outptr = *output_buf++;

	for (col = num_cols; col--;) 
	{
	    y  = *inptr0++;
	    outptr[RGB_RED]   = y;
	    outptr[RGB_GREEN] = y;
	    outptr[RGB_BLUE]  = y;
	    outptr += RGB_PIXELSIZE;
	}	
    }
}


/*
 * Color conversion for no colorspace change: just copy the data,
 * converting from separate-planes to interleaved representation.
 */

METHODDEF(void)
null_convert (j_decompress_ptr cinfo,
	      JSAMPIMAGE input_buf, JDIMENSION input_row,
	      JSAMPARRAY output_buf, int num_rows)
{
  register JSAMPROW inptr, outptr;
  register JDIMENSION count;
  register int num_components = cinfo->num_components;
  JDIMENSION num_cols = cinfo->output_width;
  int ci;

  while (--num_rows >= 0) {
    for (ci = 0; ci < num_components; ci++) {
      inptr = input_buf[ci][input_row];
      outptr = output_buf[0] + ci;
      for (count = num_cols; count > 0; count--) {
	*outptr = *inptr++;	/* needn't bother with GETJSAMPLE() here */
	outptr += num_components;
      }
    }
    input_row++;
    output_buf++;
  }
}


/*
 * Color conversion for grayscale: just copy the data.
 * This also works for YCbCr -> grayscale conversion, in which
 * we just copy the Y (luminance) component and ignore chrominance.
 */

METHODDEF(void)
grayscale_convert (j_decompress_ptr cinfo,
		   JSAMPIMAGE input_buf, JDIMENSION input_row,
		   JSAMPARRAY output_buf, int num_rows)
{
  jcopy_sample_rows(input_buf[0], (int) input_row, output_buf, 0,
		    num_rows, cinfo->output_width);
}

#ifdef NIFTY

//Not really a colour conversion but special one for Picture It!
//Copies 3 channel data and adds an alpha
METHODDEF(void)
rgb_rgba_convert (j_decompress_ptr cinfo,
	      JSAMPIMAGE input_buf, JDIMENSION input_row,
	      JSAMPARRAY output_buf, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register JSAMPROW outptr;
  register JSAMPROW inptr0, inptr1, inptr2;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->output_width;
  /* copy these pointers into registers if possible */
  SHIFT_TEMPS
 
  while (--num_rows >= 0) {
    inptr0 = input_buf[0][input_row];
    inptr1 = input_buf[1][input_row];
    inptr2 = input_buf[2][input_row];
    input_row++;
    outptr = *output_buf++;
    for (col = 0; col < num_cols; col++) {
      outptr[0] = GETJSAMPLE(inptr0[col]);
      outptr[1] = GETJSAMPLE(inptr1[col]);
      outptr[2] = GETJSAMPLE(inptr2[col]);
      /* Alpha is added as fully opaque */
      outptr[3] = 255;  /* don't need GETJSAMPLE here */
      outptr += 4;
    }
  }
}


METHODDEF (void)
ycbcra_rgba_convert (j_decompress_ptr cinfo,
                   JSAMPIMAGE input_buf, JDIMENSION input_row,
                   JSAMPARRAY output_buf, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int y, cb, cr;
  register JSAMPROW outptr;
  register JSAMPROW inptr0, inptr1, inptr2, inptr3;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->output_width;
  // Alignment variables - CRK
  JDIMENSION tail_cols = num_cols&7;
  JDIMENSION mmx_cols=num_cols&~7;
  /* copy these pointers into registers if possible */
  register JSAMPLE * range_limit = cinfo->sample_range_limit;
  register int * Crrtab = cconvert->Cr_r_tab;
  register int * Cbbtab = cconvert->Cb_b_tab;
  register INT32 * Crgtab = cconvert->Cr_g_tab;
  register INT32 * Cbgtab = cconvert->Cb_g_tab;
  SHIFT_TEMPS 
#ifdef _X86_  
  if(vfMMXMachine) { //MMX Code - CRK
	while (--num_rows >= 0) {
		inptr0 = input_buf[0][input_row];
		inptr1 = input_buf[1][input_row];
		inptr2 = input_buf[2][input_row];
		inptr3 = input_buf[3][input_row];
		input_row++;
		outptr = *output_buf++;
		MYCbCrA2RGBA(mmx_cols, inptr0, inptr1, inptr2, inptr3, outptr);
		
		outptr += 4*mmx_cols;
		for (col = mmx_cols; col < num_cols; col++) {
		  y  = GETJSAMPLE(inptr0[col]);
		  cb = GETJSAMPLE(inptr1[col]);
		  cr = GETJSAMPLE(inptr2[col]);
		  /* Range-limiting is essential due to noise introduced by DCT losses. */
		  outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
		  outptr[RGB_GREEN] = range_limit[y +
				      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
							 SCALEBITS))];
		  outptr[RGB_BLUE] =  range_limit[y + Cbbtab[cb]];
		  outptr[3] = inptr3[col];
		  outptr += 4;
		}
	}	
	__asm emms
  }
#else
    if (0) { }
#endif    
  else {
	while (--num_rows >= 0) {
		inptr0 = input_buf[0][input_row];
		inptr1 = input_buf[1][input_row];
		inptr2 = input_buf[2][input_row];
		inptr3 = input_buf[3][input_row];
		input_row++;
		outptr = *output_buf++;
		for (col = 0; col < num_cols; col++) {
		  y  = GETJSAMPLE(inptr0[col]);
		  cb = GETJSAMPLE(inptr1[col]);
		  cr = GETJSAMPLE(inptr2[col]);
		  /* Range-limiting is essential due to noise introduced by DCT losses. */
		  outptr[RGB_RED] = range_limit[(y + Crrtab[cr])];   /* red */
		  outptr[RGB_GREEN] = range_limit[(y +                 /* green */
		                          ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
		                                             SCALEBITS)))];
		  outptr[RGB_BLUE] = range_limit[(y + Cbbtab[cb])];   /* blue */
		  /* Alpha passes through unchanged */
		  outptr[3] = inptr3[col];  /* don't need GETJSAMPLE here */
		  outptr += 4;
		}
	 }
  }
}


METHODDEF (void)
ycbcralegacy_rgba_convert (j_decompress_ptr cinfo,
                   JSAMPIMAGE input_buf, JDIMENSION input_row,
                   JSAMPARRAY output_buf, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int y, cb, cr;
  register JSAMPROW outptr;
  register JSAMPROW inptr0, inptr1, inptr2, inptr3;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->output_width;
  // Alignment variables - CRK
  JDIMENSION tail_cols = num_cols&7;
  JDIMENSION mmx_cols=num_cols&~7;
  /* copy these pointers into registers if possible */
  register JSAMPLE * range_limit = cinfo->sample_range_limit;
  register int * Crrtab = cconvert->Cr_r_tab;
  register int * Cbbtab = cconvert->Cb_b_tab;
  register INT32 * Crgtab = cconvert->Cr_g_tab;
  register INT32 * Cbgtab = cconvert->Cb_g_tab;
  SHIFT_TEMPS
#ifdef _X86_  
  if(vfMMXMachine) { //MMX Code - CRK
	while (--num_rows >= 0) {
		inptr0 = input_buf[0][input_row];
		inptr1 = input_buf[1][input_row];
		inptr2 = input_buf[2][input_row];
		inptr3 = input_buf[3][input_row];
		input_row++;
		outptr = *output_buf++;
		MYCbCrA2RGBALegacy(mmx_cols, inptr0, inptr1, inptr2, inptr3, outptr);
		
		outptr += 4*mmx_cols;
		for (col = mmx_cols; col < num_cols; col++) {
		  y  = GETJSAMPLE(inptr0[col]);
		  cb = GETJSAMPLE(inptr1[col]);
		  cr = GETJSAMPLE(inptr2[col]);
		  /* Range-limiting is essential due to noise introduced by DCT losses. */
		  outptr[RGB_RED] =   range_limit[MAXJSAMPLE - (y + Crrtab[cr])];
		  outptr[RGB_GREEN] = range_limit[MAXJSAMPLE - (y +
				      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
							 SCALEBITS)))];
		  outptr[RGB_BLUE] =  range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];
		  outptr[3] = inptr3[col];
		  outptr += 4;
		}
	 }	
	__asm emms
  }
#else
    if (0) { }
#endif    
  else {
	while (--num_rows >= 0) {
		inptr0 = input_buf[0][input_row];
		inptr1 = input_buf[1][input_row];
		inptr2 = input_buf[2][input_row];
		inptr3 = input_buf[3][input_row];
		input_row++;
		outptr = *output_buf++;
		for (col = 0; col < num_cols; col++) {
		  y  = GETJSAMPLE(inptr0[col]);
		  cb = GETJSAMPLE(inptr1[col]);
		  cr = GETJSAMPLE(inptr2[col]);
		  /* Range-limiting is essential due to noise introduced by DCT losses. */
		  outptr[RGB_RED] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];   /* red */
		  outptr[RGB_GREEN] = range_limit[MAXJSAMPLE - (y +                 /* green */
		              ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
			                 SCALEBITS)))];
		  outptr[RGB_BLUE] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];   /* blue */
		  /* Alpha passes through unchanged */
		  outptr[3] = inptr3[col];  /* don't need GETJSAMPLE here */
		  outptr += 4;
		}
	}
  }
}


METHODDEF (void)
ycbcr_rgba_convert (j_decompress_ptr cinfo,
		 JSAMPIMAGE input_buf, JDIMENSION input_row,
		 JSAMPARRAY output_buf, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int y, cb, cr;
  register JSAMPROW outptr;
  register JSAMPROW inptr0, inptr1, inptr2;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->output_width;
  /* copy these pointers into registers if possible */
  register JSAMPLE * range_limit = cinfo->sample_range_limit;
  register int * Crrtab = cconvert->Cr_r_tab;
  register int * Cbbtab = cconvert->Cb_b_tab;
  register INT32 * Crgtab = cconvert->Cr_g_tab;
  register INT32 * Cbgtab = cconvert->Cb_g_tab;
  SHIFT_TEMPS

  while (--num_rows >= 0) {
    inptr0 = input_buf[0][input_row];
    inptr1 = input_buf[1][input_row];
    inptr2 = input_buf[2][input_row];
    input_row++;
    outptr = *output_buf++;
    for (col = 0; col < num_cols; col++) {
      y  = GETJSAMPLE(inptr0[col]);
      cb = GETJSAMPLE(inptr1[col]);
      cr = GETJSAMPLE(inptr2[col]);
      /* Range-limiting is essential due to noise introduced by DCT losses. */
      outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
      outptr[RGB_GREEN] = range_limit[y +
			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
						 SCALEBITS))];
      outptr[RGB_BLUE] =  range_limit[y + Cbbtab[cb]];
	  outptr[3] = 255;
      outptr += 4;
    }
  }
}

#endif

/*
 * Adobe-style YCCK->CMYK conversion.
 * We convert YCbCr to R=1-C, G=1-M, and B=1-Y using the same
 * conversion as above, while passing K (black) unchanged.
 * We assume build_ycc_rgb_table has been called.
 */

METHODDEF(void)
ycck_cmyk_convert (j_decompress_ptr cinfo,
		   JSAMPIMAGE input_buf, JDIMENSION input_row,
		   JSAMPARRAY output_buf, int num_rows)
{
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int y, cb, cr;
  register JSAMPROW outptr;
  register JSAMPROW inptr0, inptr1, inptr2, inptr3;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->output_width;
  /* copy these pointers into registers if possible */
  register JSAMPLE * range_limit = cinfo->sample_range_limit;
  register int * Crrtab = cconvert->Cr_r_tab;
  register int * Cbbtab = cconvert->Cb_b_tab;
  register INT32 * Crgtab = cconvert->Cr_g_tab;
  register INT32 * Cbgtab = cconvert->Cb_g_tab;
  SHIFT_TEMPS

  while (--num_rows >= 0) {
    inptr0 = input_buf[0][input_row];
    inptr1 = input_buf[1][input_row];
    inptr2 = input_buf[2][input_row];
    inptr3 = input_buf[3][input_row];
    input_row++;
    outptr = *output_buf++;
    for (col = 0; col < num_cols; col++) {
      y  = GETJSAMPLE(inptr0[col]);
      cb = GETJSAMPLE(inptr1[col]);
      cr = GETJSAMPLE(inptr2[col]);
      /* Range-limiting is essential due to noise introduced by DCT losses. */
      outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];	/* red */
      outptr[1] = range_limit[MAXJSAMPLE - (y +			/* green */
			      ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
						 SCALEBITS)))];
      outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];	/* blue */
      /* K passes through unchanged */
      outptr[3] = inptr3[col];	/* don't need GETJSAMPLE here */
      outptr += 4;
    }
  }
}


/*
 * Empty method for start_pass.
 */

METHODDEF(void)
start_pass_dcolor (j_decompress_ptr cinfo)
{
  /* no work needed */
}


/*
 * Module initialization routine for output colorspace conversion.
 */

GLOBAL(void)
jinit_color_deconverter (j_decompress_ptr cinfo)
{
  my_cconvert_ptr cconvert;
  int ci;

  cconvert = (my_cconvert_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
				SIZEOF(my_color_deconverter));
  cinfo->cconvert = (struct jpeg_color_deconverter *) cconvert;
  cconvert->pub.start_pass = start_pass_dcolor;

  /* Make sure num_components agrees with jpeg_color_space */
  switch (cinfo->jpeg_color_space) {
  case JCS_GRAYSCALE:
    if (cinfo->num_components != 1)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    break;
#ifdef NIFTY
  case JCS_YCC:
    if (cinfo->num_components != 3)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    break;
  case JCS_YCCA:
    if (cinfo->num_components != 4)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    break;
  case JCS_RGBA:
    if (cinfo->num_components != 4)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    break;
  case JCS_YCbCrA:
    if (cinfo->num_components != 4)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    break;
  case JCS_YCbCrALegacy:
    if (cinfo->num_components != 4)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    break;

#endif
  case JCS_RGB:
  case JCS_YCbCr:
    if (cinfo->num_components != 3)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    break;

  case JCS_CMYK:
  case JCS_YCCK:
    if (cinfo->num_components != 4)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    break;

  default:			/* JCS_UNKNOWN can be anything */
    if (cinfo->num_components < 1)
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    break;
  }

  /* Set out_color_components and conversion method based on requested space.
   * Also clear the component_needed flags for any unused components,
   * so that earlier pipeline stages can avoid useless computation.
   */

  switch (cinfo->out_color_space) {
  case JCS_GRAYSCALE:
    cinfo->out_color_components = 1;
    if (cinfo->jpeg_color_space == JCS_GRAYSCALE ||
	cinfo->jpeg_color_space == JCS_YCbCr) {
      cconvert->pub.color_convert = grayscale_convert;
      /* For color->grayscale conversion, only the Y (0) component is needed */
      for (ci = 1; ci < cinfo->num_components; ci++)
	cinfo->comp_info[ci].component_needed = FALSE;
    } else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

  case JCS_RGB:
    cinfo->out_color_components = RGB_PIXELSIZE;
    if (cinfo->jpeg_color_space == JCS_YCbCr) {
      cconvert->pub.color_convert = ycc_rgb_convert;
      build_ycc_rgb_table(cinfo);
    } else if (cinfo->jpeg_color_space == JCS_RGB && RGB_PIXELSIZE == 3) {
      cconvert->pub.color_convert = null_convert;
    } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
	cconvert->pub.color_convert = grayscale_RGB_convert;
#ifdef NIFTY
    } else if (cinfo->jpeg_color_space == JCS_YCC) {
      cconvert->pub.color_convert = pycc_rgb_convert;
      build_pycc_rgb_table(cinfo);
#endif
	} else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

#ifdef NIFTY
  case JCS_RGBA:
    cinfo->out_color_components = 4;
    if (cinfo->jpeg_color_space == JCS_YCbCrA) {
      cconvert->pub.color_convert = ycbcra_rgba_convert;
      build_ycc_rgb_table(cinfo);
	}else if (cinfo->jpeg_color_space == JCS_YCbCrALegacy) {
      cconvert->pub.color_convert = ycbcralegacy_rgba_convert;
      build_ycc_rgb_table(cinfo);
    }else if (cinfo->jpeg_color_space == JCS_YCbCr) {
      cconvert->pub.color_convert = ycbcr_rgba_convert;
      build_ycc_rgb_table(cinfo);
	}else if (cinfo->jpeg_color_space == JCS_RGBA) {
      cconvert->pub.color_convert = null_convert;
    }else if (cinfo->jpeg_color_space == JCS_RGB) {
      cconvert->pub.color_convert = rgb_rgba_convert;
	}else if (cinfo->jpeg_color_space == JCS_YCC) {
      cconvert->pub.color_convert = pycc_rgba_convert;
      build_pycc_rgb_table(cinfo);
	} else {
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    }
    break;
#endif


  case JCS_CMYK:
    cinfo->out_color_components = 4;
    if (cinfo->jpeg_color_space == JCS_YCCK) {
      cconvert->pub.color_convert = ycck_cmyk_convert;
      build_ycc_rgb_table(cinfo);
    } else if (cinfo->jpeg_color_space == JCS_CMYK) {
      cconvert->pub.color_convert = null_convert;
    } else
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;

  default:
    /* Permit null conversion to same output space */
    if (cinfo->out_color_space == cinfo->jpeg_color_space) {
      cinfo->out_color_components = cinfo->num_components;
      cconvert->pub.color_convert = null_convert;
    } else			/* unsupported non-null conversion */
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    break;
  }

  if (cinfo->quantize_colors)
    cinfo->output_components = 1; /* single colormapped output component */
  else
    cinfo->output_components = cinfo->out_color_components;
}

#ifdef _X86_

//  MMX assembly code editions begin here - CRK

// Turn off "No EMMS instruction" warning
#pragma warning(disable : 4799)

void MYCbCr2RGB(
  int columns,	  
  unsigned char *inY,
  unsigned char *inU,
  unsigned char *inV,
  unsigned char *outRGB)
{
  _asm {
	// Inits
	mov		eax, inY
	mov		ecx, inV

	mov		edi, columns
	mov		ebx, inU

	shr		edi, 2				; number of loops = cols/4 
	mov		edx, outRGB

YUVtoRGB:
	movd	mm0, [eax]			; 0/0/0/0/Y3/Y2/Y1/Y0
	pxor	mm7, mm7			; use mm7 as const_0 to achieve better pairing at start

	movd	mm2, [ebx]			; 0/0/0/0/U3/U2/U1/U0
	punpcklbw	mm0, mm7		; Y3/Y2/Y1/Y0

	movd	mm3, [ecx]			; 0/0/0/0/V3/V2/V1/V0
	punpcklbw	mm2, mm7		; U3/U2/U1/U0
	
	psubsw	mm2, const_sub128	; U3'/U2'/U1'/U0'
	punpcklbw	mm3, mm7		; V3/V2/V1/V0

	psubsw	mm3, const_sub128	; V3'/V2'/V1'/V0'
	movq	mm4, mm2
	
	punpcklwd	mm2, mm3		; V1'/U1'/V0'/U0'
	movq	mm1, mm0			

	pmaddwd	mm2, const_VUmul	; gvV1'+guU1'/gvV0'+guU0'
	psllw	mm1, 8				; Y3*256/Y2*256/Y1*256/Y0*256

	movq	mm6, mm1
	punpcklwd	mm1, mm7		; Y1*256/Y0*256
	
	punpckhwd	mm6, mm7		; Y3*256/Y2*256
	movq	mm5, mm4

	punpckhwd	mm5, mm3		; V3'/U3'/V2'/U2'
	paddd	mm2, mm1			; G1*256/G0*256		(mm1 free)

	pmaddwd	mm5, const_VUmul	; gvV3'+guU3'/gvV2'+guU2'
	movq	mm1, mm3			;		(using mm1)	
	
	punpcklwd	mm3, mm0		; Y1/V1'/Y0/V0'
	movq	mm7, mm4			; This wipes out the zero constant
	
	pmaddwd	mm3, const_YVmul	; ryY1+rvV1'/ryY0+rvV0'
	psrad	mm2, 8				; G1/G0

	paddd	mm5, mm6			; G3*256/G2*256		(mm6 free)
	punpcklwd	mm4, mm0		; Y1/U1'/Y0/U0'

	pmaddwd	mm4, const_YUmul	; byY1+buU1'/byY0'+buU0'
	psrad	mm5, 8				; G3/G2

	psrad	mm3, 8				; R1/R0

	punpckhwd	mm7 , mm0		; Y3/U3'/Y2/U2'
	
	psrad	mm4, 8				; B1/B0
	movq	mm6, mm3

	pmaddwd	mm7, const_YUmul	; byY3+buU3'/byY2'+buU2'
	punpckhwd	mm1, mm0		; Y3/V3'/Y2/V2'		
	
	pmaddwd	mm1, const_YVmul	; ryY3+rvV3'/ryY2+rvV2'
	punpckldq	mm3, mm2		; G0/R0

	punpckhdq	mm6, mm2		; G1/R1			(mm2 free)
	movq	mm0, mm4

	psrad	mm7, 8				; B3/B2
	
	punpckldq	mm4, const_0	; 0/B0

	punpckhdq	mm0, const_0	; 0/B1

	psrad	mm1, 8				; R3/R2

	packssdw	mm3, mm4		; 0/B0/G0/R0	(mm4 free)
	movq	mm2, mm1

	packssdw	mm6, mm0		; 0/B1/G1/R1	(mm0 free)

	packuswb mm3, mm6			; 0/B1/G1/R1/0/B0/G0/R0  (mm6 free)

	punpckldq	mm2, mm5		; G2/R2
	movq	mm4, mm7

	punpckhdq	mm1, mm5		; G3/R3 (mm5 done)

	punpckldq	mm7, const_0	; 0/B2		(change this line for alpha code)

	punpckhdq	mm4, const_0	; 0/B3		(change this line for alpha code)

	movq		mm0, mm3		
	packssdw	mm2, mm7		; 0/B2/G2/R2

	pand		mm3, mask_highd	; 0/B1/G1/R1/0/0/0/0
	packssdw	mm1, mm4		; 0/B3/G3/R3

	psrlq		mm3, 8			; 0/0/B1/G1/R1/0/0/0
	add			edx, 12

	por			mm0, mm3		; 0/0/?/?/R1/B0/G0/R0 
	packuswb    mm2, mm1		; 0/B3/G3/R3/0/B2/G2/R2

	psrlq		mm3, 32			; 0/0/0/0/0/0/B1/G1
	add			eax, 4

	movd		[edx][-12], mm0		; correct for add		
	punpcklwd	mm3, mm2		; 0/B2/0/0/G2/R2/B1/G1

	psrlq		mm2, 24			; 0/0/0/0/B3/G3/R3/0
	add			ecx, 4

	movd		[edx][-8], mm3	; correct for previous add
	psrlq		mm3, 48			; 0/0/0/0/0/0/0/B2
	
	por			mm2, mm3		; 0/0/0/0/B3/G3/R3/0
	add			ebx, 4

	movd		[edx][-4], mm2	; correct for previous add

	dec			edi
	jnz			YUVtoRGB		; Do 12 more bytes if not zero

	//emms       // commented out since it is done after the IDCT

  } // end of _asm
}

void MYCbCrA2RGBA(
  int columns,	  
  unsigned char *inY,
  unsigned char *inU,
  unsigned char *inV,
  unsigned char *inA,
  unsigned char *outRGBA)
{

	__int64		tempA;
  _asm {
	// Inits
	mov		eax, inY
	mov		ecx, inV

	mov		edi, columns
	mov		ebx, inU

	shr		edi, 2				; number of loops = cols/4 
	mov		edx, outRGBA

	mov		esi, inA

YUVAtoRGBA:
	movd	mm0, [eax]			; 0/0/0/0/Y3/Y2/Y1/Y0
	pxor	mm7, mm7			; added this in to achieve better pairing at start

	movd	mm2, [ebx]			; 0/0/0/0/U3/U2/U1/U0
	punpcklbw	mm0, mm7		; Y3/Y2/Y1/Y0

	movd	mm3, [ecx]			; 0/0/0/0/V3/V2/V1/V0
	punpcklbw	mm2, mm7		; U3/U2/U1/U0
	
	psubsw	mm2, const_sub128	; U3'/U2'/U1'/U0'
	punpcklbw	mm3, mm7		; V3/V2/V1/V0

	psubsw	mm3, const_sub128	; V3'/V2'/V1'/V0'
	movq	mm4, mm2
	
	punpcklwd	mm2, mm3		; V1'/U1'/V0'/U0'
	movq	mm1, mm0		

	pmaddwd	mm2, const_VUmul	; guU1'+gvV1'/guU0'+gvV0'
	psllw	mm1, 8				; Y3*256/Y2*256/Y1*256/Y0*256

	movq	mm6, mm1
	punpcklwd	mm1, mm7		; Y1*256/Y0*256

	punpckhwd	mm6, mm7		; Y3*256/Y2*256
	movq	mm5, mm4

	punpckhwd	mm5, mm3		; V3'/U3'/V2'/U2'
	paddd	mm2, mm1			; G1*256/G0*256		(mm1 free)

	pmaddwd	mm5, const_VUmul	; gvV3'+guU3'/gvV2'+guU2'
	movq	mm1, mm3			;		(using mm1)	
	
	punpcklwd	mm3, mm0		; Y1/V1'/Y0/V0'
	movq	mm7, mm4			; This wipes out the zero constant
	
	pmaddwd	mm3, const_YVmul	; ryY1+rvV1'/ryY0+rvV0'
	psrad	mm2, 8				; G1/G0 

	paddd	mm5, mm6			; G3*256/G2*256		(mm6 free)
	punpcklwd	mm4, mm0		; Y1/U1'/Y0/U0'

	pmaddwd	mm4, const_YUmul	; byY1+buU1'/byY0'+buU0'
	psrad	mm5, 8				; G3/G2

	psrad	mm3, 8				; R1/R0

	punpckhwd	mm7 , mm0		; Y3/U3'/Y2/U2'
	movq	mm6, mm3

	pmaddwd	mm7, const_YUmul	; byY3+buU3'/byY2'+buU2'
	punpckhwd	mm1, mm0		; Y3/V3'/Y2/V2'		

	pmaddwd	mm1, const_YVmul	; ryY3+rvV3'/ryY2+rvV2'
	punpckldq	mm3, mm2		; G0/R0

	punpckhdq	mm6, mm2		; G1/R1			(mm2 free)

	movd	mm2, [esi]			; 0/0/0/0/A3/A2/A1/A0
	psrad	mm4, 8				; B1/B0

	punpcklbw	mm2, const_0	; A3/A2/A1/A0

	psrad	mm1, 8				; R3/R2
	movq	mm0, mm4			; B1/B0

	movq	tempA, mm2
	psrad	mm7, 8				; B3/B2

	punpcklwd	mm2, const_0	; A1/A0

	punpckldq	mm4, mm2		; A0/B0

	punpckhdq	mm0, mm2		; A1/B1
	movq	mm2, mm1

	packssdw	mm3, mm4		; A0/B0/G0/R0	(mm4 free)

	packssdw	mm6, mm0		; A1/B1/G1/R1	(mm0 free)
	movq	mm4, mm7

	packuswb mm3, mm6			; A1/B1/G1/R1/A0/B0/G0/R0  (mm6 free)
	movq		mm6, tempA		; A3/A2/A1/A0

	punpckldq	mm2, mm5		; G2/R2

	movq	[edx], mm3
	punpckhdq	mm1, mm5		; G3/R3 (mm5 done)

	punpckhwd	mm6, const_0	; A3/A2

	punpckldq	mm7, mm6		; A2/B2	
	add			eax, 4

	punpckhdq	mm4, mm6		; A3/B3		
	add			ebx, 4
		
	packssdw	mm2, mm7		; A2/B2/G2/R2
	add			ecx, 4

	packssdw	mm1, mm4		; A3/B3/G3/R3
	add			edx, 16

	packuswb    mm2, mm1		; A3/B3/G3/R3/A2/B2/G2/R2
	add			esi, 4

	movq	[edx][-8], mm2		; Post-add correction on address

	dec			edi
	jnz			YUVAtoRGBA		; Do 12 more bytes if not zero

	//emms       // commented out since it is done after the IDCT

  } // end of _asm
}

void MYCbCrA2RGBALegacy(
  int columns,	  
  unsigned char *inY,
  unsigned char *inU,
  unsigned char *inV,
  unsigned char *inA,
  unsigned char *outRGBA)
{

	__int64		tempA;
  _asm {
	// Inits

	mov		eax, inY
	mov		ecx, inV

	mov		edi, columns
	mov		ebx, inU

	shr		edi, 2				; number of loops = cols/4 
	mov		edx, outRGBA

	mov		esi, inA

YUVAtoRGBA:
	movd	mm0, [eax]			; 0/0/0/0/Y3/Y2/Y1/Y0
	pxor	mm7, mm7			; added this in to achieve better pairing at start

	movd	mm2, [ebx]			; 0/0/0/0/U3/U2/U1/U0
	punpcklbw	mm0, mm7		; Y3/Y2/Y1/Y0

	movd	mm3, [ecx]			; 0/0/0/0/V3/V2/V1/V0
	punpcklbw	mm2, mm7		; U3/U2/U1/U0
	
	psubsw	mm2, const_sub128	; U3'/U2'/U1'/U0'
	punpcklbw	mm3, mm7		; V3/V2/V1/V0

	psubsw	mm3, const_sub128	; V3'/V2'/V1'/V0'
	movq	mm4, mm2
	
	punpcklwd	mm2, mm3		; V1'/U1'/V0'/U0'
	movq	mm1, mm0		

	pmaddwd	mm2, const_VUmul	; guU1'+gvV1'/guU0'+gvV0'
	psllw	mm1, 8				; Y3*256/Y2*256/Y1*256/Y0*256

	movq	mm6, mm1
	punpcklwd	mm1, mm7		; Y1*256/Y0*256
	
	punpckhwd	mm6, mm7		; Y3*256/Y2*256
	movq	mm5, mm4

	punpckhwd	mm5, mm3		; V3'/U3'/V2'/U2'
	paddd	mm2, mm1			; G1*256/G0*256		(mm1 free)

	pmaddwd	mm5, const_VUmul	; gvV3'+guU3'/gvV2'+guU2'
	movq	mm1, mm3			;		(using mm1)	
	
	punpcklwd	mm3, mm0		; Y1/V1'/Y0/V0'
	movq	mm7, mm4			; This wipes out the zero constant
	
	pmaddwd	mm3, const_YVmul	; ryY1+rvV1'/ryY0+rvV0'
	psrad	mm2, 8				; G1/G0 

	paddd	mm5, mm6			; G3*256/G2*256		(mm6 free)
	punpcklwd	mm4, mm0		; Y1/U1'/Y0/U0'

	pmaddwd	mm4, const_YUmul	; byY1+buU1'/byY0'+buU0'
	punpckhwd	mm1, mm0		; Y3/V3'/Y2/V2'		
	
	psrad	mm3, 8				; R1/R0

	punpckhwd	mm7, mm0		; Y3/U3'/Y2/U2'
	movq	mm6, mm3

	pmaddwd	mm7, const_YUmul	; byY3+buU3'/byY2'+buU2'
	psrad	mm4, 8				; B1/B0

	pmaddwd	mm1, const_YVmul	; ryY3+rvV3'/ryY2+rvV2'
	punpckldq	mm3, mm2		; G0/R0
		
	punpckhdq	mm6, mm2		; G1/R1			(mm2 free)

	movd	mm2, [esi]			; 0/0/0/0/A3/A2/A1/A0
	psrad	mm7, 8				; B3/B2	

	punpcklbw	mm2, const_0	; A3/A2/A1/A0
	
	psrad	mm1, 8				; R3/R2
	movq	mm0, mm4			; B1/B0

	movq	tempA, mm2
	psrad	mm5, 8				; G3/G2

	punpcklwd	mm2, const_0	; A1/A0

	punpckldq	mm4, mm2		; A0/B0

	punpckhdq	mm0, mm2		; A1/B1
	movq	mm2, mm1

	packssdw	mm3, mm4		; A0/B0/G0/R0	(mm4 free)

	packssdw	mm6, mm0		; A1/B1/G1/R1	(mm0 free)
	movq	mm4, mm7

	packuswb mm3, mm6			; A1/B1/G1/R1/A0/B0/G0/R0  (mm6 free)
	add			esi, 4

	movq		mm6, tempA		; A3/A2/A1/A0
	punpckldq	mm2, mm5		; G2/R2

	pxor	mm3, const_invert	; Invert all RGB values
	punpckhdq	mm1, mm5		; G3/R3 (mm5 done)

	punpckhwd	mm6, const_0	; A3/A2

	movq	[edx], mm3
	punpckldq	mm7, mm6		; A2/B2	

	punpckhdq	mm4, mm6		; A3/B3		
	add			eax, 4

	packssdw	mm2, mm7		; A2/B2/G2/R2
	add			ebx, 4
		
	packssdw	mm1, mm4		; A3/B3/G3/R3
	add			ecx, 4

	packuswb    mm2, mm1		; A3/B3/G3/R3/A2/B2/G2/R2
	add			edx, 16

	pxor		mm2, const_invert	; invert all RGB values

	movq	[edx][-8], mm2		; Post-add correction on address

	dec			edi
	jnz			YUVAtoRGBA		; Do 12 more bytes if not zero

	//emms       // commented out since it is done after the IDCT
  } // end of _asm
}


// enable "No EMMS instruction" warning

#pragma warning(default : 4799)

#endif