WindowsXP-SP1/shell/shell32/tngen/jccolor.cpp
2020-09-30 16:53:49 +02:00

1547 lines
46 KiB
C++

#include "stdafx.h"
#pragma hdrstop
/*
* jccolor.c
*
* Copyright (C) 1991-1996, Thomas G. Lane.
* This file is part of the Independent JPEG Group's software.
* For conditions of distribution and use, see the accompanying README file.
*
* This file contains input colorspace conversion routines.
*/
/* C4799 is MSVC's "function has no EMMS instruction" warning.  MRGB2YCbCr
 * further down deliberately leaves EMMS out (see the note at the end of its
 * __asm block), so the warning is silenced for this file.
 */
#pragma warning( disable : 4799 )
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#ifdef NIFTY
#include <math.h>
/* Small fixed-point helpers using SCALE_PREC fractional bits. */
#define SCALE_PREC 5
/* Half of one unit in the last fractional bit: added before shifting so that
 * unscale() rounds to nearest instead of truncating.
 */
#define SCALE_RND (1 << (SCALE_PREC - 1))
/* The fixed-point representation of 1.0. */
#define SCALE (1 << SCALE_PREC)
/* Convert a SCALE_PREC fixed-point value back to an integer, with rounding. */
#define unscale(x) (((long)(x) + SCALE_RND) >> SCALE_PREC)
/* Clamp to 0..255: values with any bit set outside the low eight become 0
 * when negative and 255 otherwise.  The argument is evaluated more than once,
 * so do not pass an expression with side effects.
 */
#define clip(x) (((long)(x) & ~0xff) ? (((long)(x) < 0) ? 0 : 255) : (long)(x))
#endif
/* Private subobject */
/* Color-converter object for this module.  cinfo->cconvert points at one of
 * these and is cast back and forth between `struct jpeg_color_converter *`
 * and `my_cconvert_ptr`, so `pub` has to remain the first member.
 */
typedef struct {
struct jpeg_color_converter pub; /* public fields */
/* Private state for RGB->YCC conversion */
/* Allocated and filled by rgb_ycc_start (TABLE_SIZE entries); the module
 * initialization routine does not set it, so it is only meaningful for the
 * conversions that install rgb_ycc_start as their start_pass method.
 */
INT32 * rgb_ycc_tab; /* => table for RGB to YCbCr conversion */
} my_color_converter;
typedef my_color_converter * my_cconvert_ptr;
/* Prototypes for the MMX conversion kernels.  MRGB2YCbCr and MRGBA2YCbCrA
 * are defined later in this file inside an `#if defined (_X86_)` section;
 * the prototypes themselves are visible on every platform.
 *
 * For all three, the converters in this file pass rows == 1 and a column
 * count that is a multiple of 8; the input is one interleaved pixel row and
 * each out* pointer receives one plane of the result.
 */

/* Interleaved 3-byte RGB -> separate Y, U (Cb) and V (Cr) planes. */
extern void MRGB2YCbCr(
int rows,
int cols,
unsigned char *inRGB,
unsigned char *outY,
unsigned char *outU,
unsigned char *outV);
/* Interleaved 4-byte RGBA -> separate Y, U, V and A planes. */
extern void MRGBA2YCbCrA(
int rows,
int cols,
unsigned char *inRGB,
unsigned char *outY,
unsigned char *outU,
unsigned char *outV,
unsigned char *outA);
/* "Legacy" flavour of the RGBA kernel.  NOTE(review): its definition is not
 * in the part of the file reviewed here; the scalar fallback in
 * rgba_ycbcralegacy_convert inverts R, G and B before converting, so this
 * presumably does the same -- confirm against the implementation.
 */
extern void MRGBA2YCbCrALegacy(
int rows,
int cols,
unsigned char *inRGB,
unsigned char *outY,
unsigned char *outU,
unsigned char *outV,
unsigned char *outA);
// ******************************************************************
// Macros and Constants
/* Number of fractional bits in the 16-bit MMX coefficients below; the MMX
 * code shifts its 32-bit sums right by this amount (psrad) to descale them.
 */
#define FCONVERSION_BITS 15
/* Not referenced in the part of this file reviewed here. */
#define ICONVERSION_BITS 8
/* Four packed 16-bit lanes each: all 0 (zero operand for byte->word
 * unpacking), all 1, and all 128 (offset added to the Cb/Cr results).
 */
const __int64 const_0 = 0x0000000000000000;
const __int64 const_1 = 0x0001000100010001;
const __int64 const_128 = 0x0080008000800080;
// These constants correspond to CCIR 601-1
// Y = [ (9798*R + 19235*G + 3736*B) / 32768]
// Cb = [(-5529*R - 10855*G + 16384*B) / 32768] + 128
// Cr = [(16384*R - 13720*G - 2664*B) / 32768] + 128
//Conventional floating point equations:
// Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
// Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 0.5
// Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 0.5
//Yr = 2646 Yg = 4b23 Yb = 0e98
//Ur = ea67 Ug = d599 Ub = 4000
//Vr = 4000 Vg = ca68 Vb = f598
// constants for RGB->YCrCb
// Each value packs four signed 16-bit coefficients.  The suffix of the name
// lists the lanes from the most- to the least-significant word, with '0'
// meaning a zero lane: const_YR0GR = { Yr, 0, Yg, Yr }.  MRGB2YCbCr feeds
// them to pmaddwd together with four samples unpacked to 16 bits, so each
// 32-bit half of the result is the sum of two coefficient*sample products.
const __int64 const_YR0GR = 0x264600004B232646;
const __int64 const_YBG0B = 0x0E984B2300000E98;
const __int64 const_UR0GR = 0xEA670000D599EA67;
const __int64 const_UBG0B = 0x4000D59900004000;
const __int64 const_VR0GR = 0x40000000CA684000;
const __int64 const_VBG0B = 0xF598CA680000F598;
// constants for RGBA->YCrCbA
// Same coefficients and naming scheme, arranged for 4-byte pixels.
// NOTE(review): const2_A and const2_Legacy are only used by the RGBA kernels,
// which are not fully visible in the part of the file reviewed here;
// const2_Legacy looks like a mask selecting the three colour bytes of each
// pixel -- confirm against MRGBA2YCbCrALegacy.
const __int64 const2_YGRGR = 0x4B2326464B232646;
const __int64 const2_Y0B0B = 0x00000E9800000E98;
const __int64 const2_UGRGR = 0xD599EA67D599EA67;
const __int64 const2_U0B0B = 0x0000400000004000;
const __int64 const2_VGRGR = 0xCA684000CA684000;
const __int64 const2_V0B0B = 0x0000F5980000F598;
const __int64 const2_A = 0x0001000000010000;
const __int64 const2_Legacy = 0x00FFFFFF00FFFFFF;
// These constants correspond to the original FPX SDK
// (kept for reference only; every definition in this group is commented out)
// ... using 2^15
//Y = [ (9869*R + 19738*G + 3290*B) / 32768]
//Cb = [(-4935*R - 9869*G + 14739*B) / 32768] + 128
//Cr = [(14312*R - 12336*G - 2056*B) / 32768] + 128
//Conventional floating point equations:
// Y = 0.30118*R + 0.60235*G + 0.10039*B
// Cb = -0.15059*R - 0.30118*G + 0.44981*B + 0.5
// Cr = 0.43676*R - 0.37647*G - 0.06274*B + 0.5
//Yr = 268d Yg = 4d1a Yb = 0cda
//Ur = ecb9 Ug = d973 Ub = 3993
//Vr = 37e8 Vg = cfd0 Vb = f7f8
// constants for RGB->YCrCb
//const __int64 const_YR0GR = 0x268D00004D1A268D;
//const __int64 const_YBG0B = 0x0CDA4D1A00000CDA;
//const __int64 const_UR0GR = 0xECB90000D973ECB9;
//const __int64 const_UBG0B = 0x3993D97300003993;
//const __int64 const_VR0GR = 0x37E80000CFD037E8;
//const __int64 const_VBG0B = 0xF7F8CFD00000F7F8;
// constants for RGBA->YCrCbA
//const __int64 const2_YGRGR = 0x4D1A268D4D1A268D;
//const __int64 const2_Y0B0B = 0x00000CDA00000CDA;
//const __int64 const2_UGRGR = 0xD973ECB9D973ECB9;
//const __int64 const2_U0B0B = 0x0000399300003993;
//const __int64 const2_VGRGR = 0xCFD037E8CFD037E8;
//const __int64 const2_V0B0B = 0x0000F7F80000F7F8;
//const __int64 const2_A = 0x0001000000010000;
//const __int64 const2_Legacy = 0x00FFFFFF00FFFFFF;
// ... using 2^8
//const __int64 const_X0YY0 = 0x0000010001000000;
//const __int64 const_RVUVU = 0x019A0000019A0000;
//const __int64 const_GVUVU = 0xFF33FFABFF33FFAB;
//const __int64 const_BVUVU = 0x0000020000000200;
// Scratch slots the MMX kernels spill intermediate registers into
// (MRGB2YCbCr stores and reloads temp0, tempY, tempU and tempV).  They are
// file-scope objects shared by every call.
// NOTE(review): two compressions running on different threads would
// overwrite each other's spills here -- confirm that callers serialize
// access, or move the slots into the kernels.
__int64 temp0, tempY, tempU, tempV, tempA;
/**************** RGB -> YCbCr conversion: most common case **************/
/*
* YCbCr is defined per CCIR 601-1, except that Cb and Cr are
* normalized to the range 0..MAXJSAMPLE rather than -0.5 .. 0.5.
* The conversion equations to be implemented are therefore
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
* (These numbers are derived from TIFF 6.0 section 21, dated 3-June-92.)
* Note: older versions of the IJG code used a zero offset of MAXJSAMPLE/2,
* rather than CENTERJSAMPLE, for Cb and Cr. This gave equal positive and
* negative swings for Cb/Cr, but meant that grayscale values (Cb=Cr=0)
* were not represented exactly. Now we sacrifice exact representation of
* maximum red and maximum blue in order to get exact grayscales.
*
* To avoid floating-point arithmetic, we represent the fractional constants
* as integers scaled up by 2^16 (about 4 digits precision); we have to divide
* the products by 2^16, with appropriate rounding, to get the correct answer.
*
* For even more speed, we avoid doing any multiplications in the inner loop
* by precalculating the constants times R,G,B for all possible values.
* For 8-bit JSAMPLEs this is very reasonable (only 256 entries per table);
* for 12-bit samples it is still acceptable. It's not very reasonable for
* 16-bit samples, but if you want lossless storage you shouldn't be changing
* colorspace anyway.
* The CENTERJSAMPLE offsets and the rounding fudge-factor of 0.5 are included
* in the tables to save adding them separately in the inner loop.
*/
/* Number of fractional bits carried by the table entries. */
#define SCALEBITS 16 /* speediest right-shift on some machines */
/* CENTERJSAMPLE expressed in the scaled domain; folded into the B=>Cb table. */
#define CBCR_OFFSET ((INT32) CENTERJSAMPLE << SCALEBITS)
/* 0.5 in the scaled domain; folded into the tables as the rounding term. */
#define ONE_HALF ((INT32) 1 << (SCALEBITS-1))
/* Convert a floating-point constant to scaled INT32, rounding to nearest. */
#define FIX(x) ((INT32) ((x) * (1L<<SCALEBITS) + 0.5))
/* We allocate one big table and divide it up into eight parts, instead of
 * doing eight alloc_small requests. This lets us use a single table base
 * address, which can be held in a register in the inner loops on many
 * machines (more than can hold all eight addresses, anyway).
 */
/* Each section holds MAXJSAMPLE+1 entries, one per possible sample value.
 * Nine conversions share the eight sections because R=>Cr reuses the B=>Cb
 * section (R_CR_OFF is defined as B_CB_OFF).
 */
#define R_Y_OFF 0 /* offset to R => Y section */
#define G_Y_OFF (1*(MAXJSAMPLE+1)) /* offset to G => Y section */
#define B_Y_OFF (2*(MAXJSAMPLE+1)) /* etc. */
#define R_CB_OFF (3*(MAXJSAMPLE+1))
#define G_CB_OFF (4*(MAXJSAMPLE+1))
#define B_CB_OFF (5*(MAXJSAMPLE+1))
#define R_CR_OFF B_CB_OFF /* B=>Cb, R=>Cr are the same */
#define G_CR_OFF (6*(MAXJSAMPLE+1))
#define B_CR_OFF (7*(MAXJSAMPLE+1))
/* Total number of INT32 entries allocated by rgb_ycc_start. */
#define TABLE_SIZE (8*(MAXJSAMPLE+1))
#ifdef NIFTY
/*
* Initialize for RGB->PhotoYCC colorspace conversion.
*/
METHODDEF (void)
rgb_pycc_start (j_compress_ptr cinfo)
{
  /* The RGB->PhotoYCC converter keeps no tables, so there is nothing to
   * set up at the start of a pass.
   */
  (void) cinfo;
}
/*
* RGB->PhotoYCC colorspace conversion.
*/
METHODDEF (void)
rgb_pycc_convert (j_compress_ptr cinfo,
                  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                  JDIMENSION output_row, int num_rows)
{
  /* Convert num_rows rows of interleaved RGB samples into three separate
   * PhotoYCC planes (Y, C1, C2), writing to output_buf[0..2] starting at
   * row output_row.
   *
   * The chroma coefficients of each equation sum to zero, so a neutral
   * input (R == G == B) yields exactly the offsets 156 (C1) and 137 (C2).
   * The blue term of C2 is therefore subtracted; adding it would tint
   * every grey.
   *
   * C1 can exceed the 8-bit range for saturated blues
   * (0.3995109594 * 255 + 156 = 257.875), so it is clamped before the
   * narrowing conversion.  Y and C2 stay inside 0..255 for 8-bit inputs
   * and need no clamp.
   */
  const float max_sample = 255.0f;  /* the 156/137 offsets are 8-bit too */
  JDIMENSION num_cols = cinfo->image_width;
  JDIMENSION col;

  while (--num_rows >= 0) {
    JSAMPROW inptr = *input_buf++;
    JSAMPROW outptr0 = output_buf[0][output_row];
    JSAMPROW outptr1 = output_buf[1][output_row];
    JSAMPROW outptr2 = output_buf[2][output_row];
    output_row++;

    for (col = 0; col < num_cols; col++) {
      unsigned char r = GETJSAMPLE(inptr[RGB_RED]);
      unsigned char g = GETJSAMPLE(inptr[RGB_GREEN]);
      unsigned char b = GETJSAMPLE(inptr[RGB_BLUE]);
      float c1;

      inptr += RGB_PIXELSIZE;

      /* Y */
      outptr0[col] = (JSAMPLE)((float)((float)r * 0.2200179046)
                             + (float)((float)g * 0.4322754970)
                             + (float)((float)b * 0.0838667868));

      /* C1 */
      c1 = (float)((float)r * -0.1347546425)
         - (float)((float)g * 0.2647563169)
         + (float)((float)b * 0.3995109594) + 156;
      if (c1 > max_sample)
        c1 = max_sample;
      outptr1[col] = (JSAMPLE)c1;

      /* C2 */
      outptr2[col] = (JSAMPLE)((float)((float)r * 0.3849177482)
                             - (float)((float)g * 0.3223733380)
                             - (float)((float)b * 0.0625444102) + 137);
    }
  }
}
#endif
/*
* Initialize for RGB->YCC colorspace conversion.
*/
METHODDEF(void)
rgb_ycc_start (j_compress_ptr cinfo)
{
  /* Build the RGB->YCbCr lookup table: one section of MAXJSAMPLE+1 entries
   * per coefficient, each entry being coefficient * sample in SCALEBITS
   * fixed point.  The table lives in the per-image pool.
   */
  my_cconvert_ptr conv = (my_cconvert_ptr) cinfo->cconvert;
  INT32 * tab;
  INT32 sample;

  tab = (INT32 *)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                (TABLE_SIZE * SIZEOF(INT32)));
  conv->rgb_ycc_tab = tab;

  for (sample = 0; sample <= MAXJSAMPLE; sample++) {
    /* Luma: the rounding term rides along in the blue section. */
    tab[R_Y_OFF + sample] = FIX(0.29900) * sample;
    tab[G_Y_OFF + sample] = FIX(0.58700) * sample;
    tab[B_Y_OFF + sample] = FIX(0.11400) * sample + ONE_HALF;

    /* Cb */
    tab[R_CB_OFF + sample] = (-FIX(0.16874)) * sample;
    tab[G_CB_OFF + sample] = (-FIX(0.33126)) * sample;
    /* The chroma rounding term is 0.5 - epsilon (ONE_HALF-1) so that the
     * largest result rounds to MAXJSAMPLE rather than MAXJSAMPLE+1, which
     * is what lets the converters skip range limiting.  This section also
     * serves as the R=>Cr section (R_CR_OFF == B_CB_OFF), so no separate
     * store is needed for it.
     */
    tab[B_CB_OFF + sample] = FIX(0.50000) * sample + CBCR_OFFSET + ONE_HALF-1;

    /* Cr */
    tab[G_CR_OFF + sample] = (-FIX(0.41869)) * sample;
    tab[B_CR_OFF + sample] = (-FIX(0.08131)) * sample;
  }
}
/*
* Convert some rows of samples to the JPEG colorspace.
*
* Note that we change from the application's interleaved-pixel format
* to our internal noninterleaved, one-plane-per-component format.
* The input buffer is therefore three times as wide as the output buffer.
*
* A starting row offset is provided only for the output buffer. The caller
* can easily adjust the passed input_buf value to accommodate any row
* offset required on that side.
*/
METHODDEF(void)
rgb_ycc_convert (j_compress_ptr cinfo,
                 JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                 JDIMENSION output_row, int num_rows)
{
  /* Convert num_rows rows of interleaved RGB into separate Y, Cb and Cr
   * planes (output_buf[0..2], starting at output_row).  Requires that
   * rgb_ycc_start has built the lookup table.
   *
   * On x86 machines with MMX the leading multiple-of-8 pixels of each row
   * go through MRGB2YCbCr; whatever is left (or the whole row otherwise)
   * is converted with the table.
   */
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int r, g, b;
  register INT32 * ctab = cconvert->rgb_ycc_tab;
  register JSAMPROW inptr;
  register JSAMPROW outptr0, outptr1, outptr2;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->image_width;
#if defined (_X86_)
  /* Number of pixels the MMX kernel can take: it works in groups of 8. */
  JDIMENSION mmx_cols = num_cols & ~7;
#endif

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    output_row++;
    col = 0;

    //
    // Need to add #ifdef for Alpha port
    //
#if defined (_X86_)
    /* The kernel's loop is do-while shaped (dec/jnz at the bottom), so it
     * must never be entered with a zero count: for rows narrower than 8
     * pixels the counter would wrap and the kernel would run far past the
     * end of every buffer.
     */
    if (vfMMXMachine && mmx_cols != 0) {
      MRGB2YCbCr( (int)(1), mmx_cols, inptr, outptr0, outptr1, outptr2);
      /* The kernel consumes 3 bytes per pixel in R,G,B order. */
      inptr += 3*mmx_cols;
      col = mmx_cols;
    }
#endif

    for (; col < num_cols; col++) {
      r = GETJSAMPLE(inptr[RGB_RED]);
      g = GETJSAMPLE(inptr[RGB_GREEN]);
      b = GETJSAMPLE(inptr[RGB_BLUE]);
      inptr += RGB_PIXELSIZE;
      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
       * must be too; we do not need an explicit range-limiting operation.
       * Hence the value being shifted is never negative, and we don't
       * need the general RIGHT_SHIFT macro.
       */
      /* Y */
      outptr0[col] = (JSAMPLE)
        ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
         >> SCALEBITS);
      /* Cb */
      outptr1[col] = (JSAMPLE)
        ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
         >> SCALEBITS);
      /* Cr */
      outptr2[col] = (JSAMPLE)
        ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
         >> SCALEBITS);
    }
  }
}
/**************** Cases other than RGB -> YCbCr **************/
/*
* Convert some rows of samples to the JPEG colorspace.
* This version handles RGB->grayscale conversion, which is the same
* as the RGB->Y portion of RGB->YCbCr.
* We assume rgb_ycc_start has been called (we only use the Y tables).
*/
METHODDEF(void)
rgb_gray_convert (j_compress_ptr cinfo,
                  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                  JDIMENSION output_row, int num_rows)
{
  /* RGB -> grayscale: only the three luma sections of the table built by
   * rgb_ycc_start are consulted, and a single output plane is written.
   */
  INT32 * lut = ((my_cconvert_ptr) cinfo->cconvert)->rgb_ycc_tab;
  JDIMENSION width = cinfo->image_width;
  int row;

  for (row = 0; row < num_rows; row++) {
    JSAMPROW src = input_buf[row];
    JSAMPROW dst = output_buf[0][output_row + row];
    JDIMENSION x;

    for (x = 0; x < width; x++, src += RGB_PIXELSIZE) {
      int red = GETJSAMPLE(src[RGB_RED]);
      int green = GETJSAMPLE(src[RGB_GREEN]);
      int blue = GETJSAMPLE(src[RGB_BLUE]);
      INT32 luma = lut[red+R_Y_OFF] + lut[green+G_Y_OFF] + lut[blue+B_Y_OFF];

      dst[x] = (JSAMPLE) (luma >> SCALEBITS);
    }
  }
}
#ifdef NIFTY
METHODDEF (void)
rgba_ycbcra_convert (j_compress_ptr cinfo,
                     JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                     JDIMENSION output_row, int num_rows)
{
  /* Convert num_rows rows of interleaved 4-byte RGBA into separate Y, Cb,
   * Cr and A planes (output_buf[0..3], starting at output_row).  Alpha is
   * copied unchanged.  Requires that rgb_ycc_start has built the table.
   *
   * On x86 machines with MMX the leading multiple-of-8 pixels of each row
   * go through MRGBA2YCbCrA; the remainder (or the whole row otherwise) is
   * converted with the table.
   */
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int r, g, b;
  register INT32 * ctab = cconvert->rgb_ycc_tab;
  register JSAMPROW inptr;
  register JSAMPROW outptr0, outptr1, outptr2, outptr3;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->image_width;
#if defined (_X86_)
  JDIMENSION mmx_cols = num_cols & ~7;
#endif

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    outptr3 = output_buf[3][output_row];
    output_row++;
    col = 0;

    //
    // Need to add #ifdef for Alpha port
    //
#if defined (_X86_)
    /* The kernel enters its loop body without testing the count first, so
     * it is only called when there is at least one full group to convert;
     * rows narrower than 8 pixels go entirely through the table loop.
     */
    if (vfMMXMachine && mmx_cols != 0) {
      MRGBA2YCbCrA( (int)(1), mmx_cols, inptr, outptr0, outptr1, outptr2, outptr3);
      inptr += 4*mmx_cols;
      col = mmx_cols;
    }
#endif // defined (_X86_)

    for (; col < num_cols; col++) {
      r = GETJSAMPLE(inptr[0]);
      g = GETJSAMPLE(inptr[1]);
      b = GETJSAMPLE(inptr[2]);
      /* Alpha passes through as-is */
      outptr3[col] = inptr[3];	/* don't need GETJSAMPLE here */
      inptr += 4;
      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
       * must be too; we do not need an explicit range-limiting operation.
       * Hence the value being shifted is never negative, and we don't
       * need the general RIGHT_SHIFT macro.
       */
      /* Y */
      outptr0[col] = (JSAMPLE)
        ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
         >> SCALEBITS);
      /* Cb */
      outptr1[col] = (JSAMPLE)
        ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
         >> SCALEBITS);
      /* Cr */
      outptr2[col] = (JSAMPLE)
        ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
         >> SCALEBITS);
    }
  }
}
METHODDEF (void)
rgba_ycbcralegacy_convert (j_compress_ptr cinfo,
                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                           JDIMENSION output_row, int num_rows)
{
  /* "Legacy" RGBA -> YCbCrA conversion: R, G and B are inverted
   * (MAXJSAMPLE - value) before going through the same equations as the
   * regular conversion; alpha is copied unchanged.  Output goes to
   * output_buf[0..3] starting at output_row.  Requires that rgb_ycc_start
   * has built the table.
   *
   * On x86 machines with MMX the leading multiple-of-8 pixels of each row
   * go through MRGBA2YCbCrALegacy; the remainder (or the whole row
   * otherwise) is converted with the table.
   */
  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
  register int r, g, b;
  register INT32 * ctab = cconvert->rgb_ycc_tab;
  register JSAMPROW inptr;
  register JSAMPROW outptr0, outptr1, outptr2, outptr3;
  register JDIMENSION col;
  JDIMENSION num_cols = cinfo->image_width;
#if defined (_X86_)
  JDIMENSION mmx_cols = num_cols & ~7;
#endif

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr0 = output_buf[0][output_row];
    outptr1 = output_buf[1][output_row];
    outptr2 = output_buf[2][output_row];
    outptr3 = output_buf[3][output_row];
    output_row++;
    col = 0;

    //
    // Need to add #ifdef for Alpha port
    //
#if defined (_X86_)
    /* Never hand the kernel an empty row segment: the MMX kernels in this
     * file start their loop body without checking the count.
     * NOTE(review): MRGBA2YCbCrALegacy itself was not inspected -- the
     * guard is harmless if it already copes with a zero count.
     */
    if (vfMMXMachine && mmx_cols != 0) {
      MRGBA2YCbCrALegacy( (int)(1), mmx_cols, inptr, outptr0, outptr1, outptr2, outptr3);
      inptr += 4*mmx_cols;
      col = mmx_cols;
    }
#endif // defined (_X86_)

    for (; col < num_cols; col++) {
      r = MAXJSAMPLE - GETJSAMPLE(inptr[0]);
      g = MAXJSAMPLE - GETJSAMPLE(inptr[1]);
      b = MAXJSAMPLE - GETJSAMPLE(inptr[2]);
      /* Alpha passes through as-is */
      outptr3[col] = inptr[3];	/* don't need GETJSAMPLE here */
      inptr += 4;
      /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
       * must be too; we do not need an explicit range-limiting operation.
       * Hence the value being shifted is never negative, and we don't
       * need the general RIGHT_SHIFT macro.
       */
      /* Y */
      outptr0[col] = (JSAMPLE)
        ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
         >> SCALEBITS);
      /* Cb */
      outptr1[col] = (JSAMPLE)
        ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
         >> SCALEBITS);
      /* Cr */
      outptr2[col] = (JSAMPLE)
        ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
         >> SCALEBITS);
    }
  }
}
#endif
/*
* Convert some rows of samples to the JPEG colorspace.
* This version handles Adobe-style CMYK->YCCK conversion,
* where we convert R=1-C, G=1-M, and B=1-Y to YCbCr using the same
* conversion as above, while passing K (black) unchanged.
* We assume rgb_ycc_start has been called.
*/
METHODDEF(void)
cmyk_ycck_convert (j_compress_ptr cinfo,
                   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                   JDIMENSION output_row, int num_rows)
{
  /* CMYK -> YCCK: C, M and Y are inverted to R, G and B and pushed through
   * the table built by rgb_ycc_start; the K channel is copied untouched
   * into the fourth output plane.
   */
  INT32 * lut = ((my_cconvert_ptr) cinfo->cconvert)->rgb_ycc_tab;
  JDIMENSION width = cinfo->image_width;
  int row;

  for (row = 0; row < num_rows; row++) {
    JSAMPROW src = input_buf[row];
    JSAMPROW y_out = output_buf[0][output_row + row];
    JSAMPROW cb_out = output_buf[1][output_row + row];
    JSAMPROW cr_out = output_buf[2][output_row + row];
    JSAMPROW k_out = output_buf[3][output_row + row];
    JDIMENSION x;

    for (x = 0; x < width; x++, src += 4) {
      int red = MAXJSAMPLE - GETJSAMPLE(src[0]);
      int green = MAXJSAMPLE - GETJSAMPLE(src[1]);
      int blue = MAXJSAMPLE - GETJSAMPLE(src[2]);

      /* K is stored raw, so no GETJSAMPLE is needed. */
      k_out[x] = src[3];

      /* With inputs in 0..MAXJSAMPLE the sums below are never negative
       * and never exceed the sample range once shifted, so a plain
       * right shift without range limiting is sufficient.
       */
      y_out[x] = (JSAMPLE)
        ((lut[red+R_Y_OFF] + lut[green+G_Y_OFF] + lut[blue+B_Y_OFF])
         >> SCALEBITS);
      cb_out[x] = (JSAMPLE)
        ((lut[red+R_CB_OFF] + lut[green+G_CB_OFF] + lut[blue+B_CB_OFF])
         >> SCALEBITS);
      cr_out[x] = (JSAMPLE)
        ((lut[red+R_CR_OFF] + lut[green+G_CR_OFF] + lut[blue+B_CR_OFF])
         >> SCALEBITS);
    }
  }
}
/*
* Convert some rows of samples to the JPEG colorspace.
* This version handles grayscale output with no conversion.
* The source can be either plain grayscale or YCbCr (since Y == gray).
*/
METHODDEF(void)
grayscale_convert (j_compress_ptr cinfo,
                   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
                   JDIMENSION output_row, int num_rows)
{
  /* Copy the first component of every input pixel into the single output
   * plane.  The input stride is cinfo->input_components, so this serves
   * both plain grayscale input and multi-component input whose first
   * component is the luma.
   */
  JDIMENSION width = cinfo->image_width;
  int stride = cinfo->input_components;
  int row;

  for (row = 0; row < num_rows; row++) {
    JSAMPROW src = input_buf[row];
    JSAMPROW dst = output_buf[0][output_row + row];
    JDIMENSION x;

    for (x = 0; x < width; x++) {
      dst[x] = *src;		/* raw copy, no GETJSAMPLE() needed */
      src += stride;
    }
  }
}
/*
* Convert some rows of samples to the JPEG colorspace.
* This version handles multi-component colorspaces without conversion.
* We assume input_components == num_components.
*/
METHODDEF(void)
null_convert (j_compress_ptr cinfo,
              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
              JDIMENSION output_row, int num_rows)
{
  /* De-interleave without changing any sample value: component ci of each
   * input pixel lands in output plane ci.  The input pixel stride is
   * cinfo->num_components.
   */
  int planes = cinfo->num_components;
  JDIMENSION width = cinfo->image_width;
  int row;
  int ci;

  for (row = 0; row < num_rows; row++) {
    /* One pass over the row per component. */
    for (ci = 0; ci < planes; ci++) {
      JSAMPROW src = input_buf[row] + ci;
      JSAMPROW dst = output_buf[ci][output_row + row];
      JDIMENSION x;

      for (x = 0; x < width; x++) {
        dst[x] = *src;		/* raw copy, no GETJSAMPLE() needed */
        src += planes;
      }
    }
  }
}
/*
* Empty method for start_pass.
*/
METHODDEF(void)
null_method (j_compress_ptr cinfo)
{
  /* Default start_pass: converters that need no per-pass setup use this. */
  (void) cinfo;
}
/*
* Module initialization routine for input colorspace conversion.
*/
GLOBAL(void)
jinit_color_converter (j_compress_ptr cinfo)
{
  /* Create the color-converter object for this compression, validate the
   * component counts against the declared input and JPEG colorspaces, and
   * install the start_pass / color_convert methods for the requested
   * conversion.  Unsupported combinations are reported through ERREXIT.
   */
  my_cconvert_ptr conv;

  conv = (my_cconvert_ptr)
    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
                                SIZEOF(my_color_converter));
  cinfo->cconvert = (struct jpeg_color_converter *) conv;

  /* Most conversions need no per-pass setup; the table-driven ones
   * replace this below.
   */
  conv->pub.start_pass = null_method;

  /* ---- Step 1: input_components must match in_color_space ---- */
  switch (cinfo->in_color_space) {
  case JCS_GRAYSCALE:
    if (cinfo->input_components != 1) {
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    }
    break;

#ifdef NIFTY
  case JCS_YCC:
    if (cinfo->input_components != 3) {
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    }
    break;

  /* All of the alpha-carrying spaces have four components. */
  case JCS_RGBA:
  case JCS_YCbCrA:
  case JCS_YCbCrALegacy:
  case JCS_YCCA:
    if (cinfo->input_components != 4) {
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    }
    break;
#endif

  case JCS_RGB:
#if RGB_PIXELSIZE != 3
    if (cinfo->input_components != RGB_PIXELSIZE) {
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    }
    break;
#endif /* else share code with YCbCr */
  case JCS_YCbCr:
    if (cinfo->input_components != 3) {
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    }
    break;

  case JCS_CMYK:
  case JCS_YCCK:
    if (cinfo->input_components != 4) {
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    }
    break;

  default:			/* JCS_UNKNOWN can be anything */
    if (cinfo->input_components < 1) {
      ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
    }
    break;
  }

  /* ---- Step 2: check num_components and pick the conversion ---- */
  switch (cinfo->jpeg_color_space) {
  case JCS_GRAYSCALE:
    if (cinfo->num_components != 1) {
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    }
    if (cinfo->in_color_space == JCS_GRAYSCALE ||
        cinfo->in_color_space == JCS_YCbCr) {
      /* Gray is the first (only) component, or the Y of YCbCr. */
      conv->pub.color_convert = grayscale_convert;
    } else if (cinfo->in_color_space == JCS_RGB) {
      conv->pub.start_pass = rgb_ycc_start;
      conv->pub.color_convert = rgb_gray_convert;
    } else {
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    }
    break;

#ifdef NIFTY
  /* PhotoYCC output is only accepted from PhotoYCC input.  An
   * RGB->PhotoYCC routine exists in this file but is intentionally not
   * hooked up here: RGB input is rejected like any other source.
   */
  case JCS_YCC:
    if (cinfo->num_components != 3) {
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    }
    if (cinfo->in_color_space == JCS_YCC) {
      conv->pub.color_convert = null_convert;
    } else {
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    }
    break;

  case JCS_YCCA:
    if (cinfo->num_components != 4) {
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    }
    if (cinfo->in_color_space == JCS_YCCA) {
      conv->pub.color_convert = null_convert;
    } else {
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    }
    break;

  case JCS_RGBA:
    if (cinfo->num_components != 4) {
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    }
    if (cinfo->in_color_space == JCS_RGBA) {
      conv->pub.color_convert = null_convert;
    } else {
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    }
    break;

  case JCS_YCbCrA:
    if (cinfo->num_components != 4) {
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    }
    if (cinfo->in_color_space == JCS_YCbCrA) {
      conv->pub.color_convert = null_convert;
    } else if (cinfo->in_color_space == JCS_RGBA) {
      conv->pub.start_pass = rgb_ycc_start;
      conv->pub.color_convert = rgba_ycbcra_convert;
    } else {
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    }
    break;

  case JCS_YCbCrALegacy:
    if (cinfo->num_components != 4) {
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    }
    if (cinfo->in_color_space == JCS_YCbCrALegacy) {
      conv->pub.color_convert = null_convert;
    } else if (cinfo->in_color_space == JCS_RGBA) {
      conv->pub.start_pass = rgb_ycc_start;
      conv->pub.color_convert = rgba_ycbcralegacy_convert;
    } else {
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    }
    break;
#endif

  case JCS_RGB:
    if (cinfo->num_components != 3) {
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    }
    /* null_convert assumes the input stride equals num_components, which
     * only holds for 3-byte RGB pixels.
     */
    if (cinfo->in_color_space == JCS_RGB && RGB_PIXELSIZE == 3) {
      conv->pub.color_convert = null_convert;
    } else {
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    }
    break;

  case JCS_YCbCr:
    if (cinfo->num_components != 3) {
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    }
    if (cinfo->in_color_space == JCS_YCbCr) {
      conv->pub.color_convert = null_convert;
    } else if (cinfo->in_color_space == JCS_RGB) {
      conv->pub.start_pass = rgb_ycc_start;
      conv->pub.color_convert = rgb_ycc_convert;
    } else {
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    }
    break;

  case JCS_CMYK:
    if (cinfo->num_components != 4) {
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    }
    if (cinfo->in_color_space == JCS_CMYK) {
      conv->pub.color_convert = null_convert;
    } else {
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    }
    break;

  case JCS_YCCK:
    if (cinfo->num_components != 4) {
      ERREXIT(cinfo, JERR_BAD_J_COLORSPACE);
    }
    if (cinfo->in_color_space == JCS_YCCK) {
      conv->pub.color_convert = null_convert;
    } else if (cinfo->in_color_space == JCS_CMYK) {
      conv->pub.start_pass = rgb_ycc_start;
      conv->pub.color_convert = cmyk_ycck_convert;
    } else {
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    }
    break;

  default:			/* allow null conversion of JCS_UNKNOWN */
    if (cinfo->jpeg_color_space != cinfo->in_color_space ||
        cinfo->num_components != cinfo->input_components) {
      ERREXIT(cinfo, JERR_CONVERSION_NOTIMPL);
    }
    conv->pub.color_convert = null_convert;
    break;
  }
}
//
// Need to add #ifdef for Alpha port
//
#if defined (_X86_)
/*
 * MMX kernel: convert interleaved 3-byte R,G,B pixels into separate
 * Y, U (Cb) and V (Cr) byte planes.
 *
 *   rows   ignored; callers in this file always pass 1
 *   cols   number of pixels; only complete groups of 8 are converted and
 *          any remainder is left for the caller
 *   inRGB  source, 3 bytes per pixel (24 bytes consumed per group)
 *   outY, outU, outV
 *          destinations, 8 bytes written to each per group
 *
 * Coefficients come from the const_* tables (2^15 fixed point, descaled
 * with an arithmetic shift by FCONVERSION_BITS).  Intermediate registers
 * are spilled to the file-scope temp0/tempY/tempU/tempV slots.  EMMS is
 * intentionally not executed here; see the note at the end of the block.
 */
void MRGB2YCbCr(
    int rows,
    int cols,
    unsigned char *inRGB,
    unsigned char *outY,
    unsigned char *outU,
    unsigned char *outV)
{
    // make global to ensure proper stack alignment
    // __int64 temp0, tempY, tempU, tempV;

    // The loop below tests its counter only at the bottom (dec/jnz), so a
    // group count of zero would wrap around and run past every buffer.
    // Nothing to do unless there is at least one full group of 8 pixels.
    if (cols < 8)
        return;

    __asm {
        // initializations
        //DS - IJG will always call with rows=1, so don't multiply
        //  mov eax, rows
        //  mov ebx, cols
        //  mul ebx                     ;number pixels
        // reorder to take advantage of v-pipe
        mov esi, cols
        mov eax, inRGB
        shr esi, 3                      ;number of loops = cols/8 (rows is ignored)
        mov edx, outV
        mov edi, esi                    ;loop counter in edi
        mov ecx, outU
        mov ebx, outY
        // top of loop
    RGBtoYUV:
        movq mm1, [eax]                 ;load #1 G2R2B1G1R1B0G0R0 -> mm1
        pxor mm6, mm6                   ;0 -> mm6
        movq mm0, mm1                   ;G2R2B1G1R1B0G0R0 -> mm0
        psrlq mm1, 16                   ;00G2R2B1G1R1B0 -> mm1
        punpcklbw mm0, const_0          ;R1B0G0R0 -> mm0
        movq mm7, mm1                   ;00G2R2B1G1R1B0 -> mm7
        punpcklbw mm1, const_0          ;B1G1R1B0 -> mm1
        movq mm2, mm0                   ;R1B0G0R0 -> mm2
        pmaddwd mm0, const_YR0GR        ;yrR1,ygG0+yrR0 -> mm0
        movq mm3, mm1                   ;B1G1R1B0 -> mm3
        pmaddwd mm1, const_YBG0B        ;ybB1+ygG1,ybB0 -> mm1
        movq mm4, mm2                   ;R1B0G0R0 -> mm4
        pmaddwd mm2, const_UR0GR        ;urR1,ugG0+urR0 -> mm2
        movq mm5, mm3                   ;B1G1R1B0 -> mm5
        pmaddwd mm3, const_UBG0B        ;ubB1+ugG1,ubB0 -> mm3
        punpckhbw mm7, mm6              ;00G2R2 -> mm7
        pmaddwd mm4, const_VR0GR        ;vrR1,vgG0+vrR0 -> mm4
        paddd mm0, mm1                  ;Y1Y0 -> mm0
        pmaddwd mm5, const_VBG0B        ;vbB1+vgG1,vbB0 -> mm5
        movq mm1, [eax][8]              ;load #2 R5B4G4R4B3G3R3B2 -> mm1
        paddd mm2, mm3                  ;U1U0 -> mm2
        movq mm6, mm1                   ;R5B4G4R4B3G3R3B2 -> mm6
        punpcklbw mm1, const_0          ;B3G3R3B2 -> mm1
        paddd mm4, mm5                  ;V1V0 -> mm4
        movq mm5, mm1                   ;B3G3R3B2 -> mm5
        psllq mm1, 32                   ;R3B200 -> mm1
        paddd mm1, mm7                  ;R3B200 + 00G2R2 = R3B2G2R2 -> mm1
        punpckhbw mm6, const_0          ;R5B4G4R4 -> mm6
        movq mm3, mm1                   ;R3B2G2R2 -> mm3
        pmaddwd mm1, const_YR0GR        ;yrR3,ygG2+yrR2 -> mm1
        movq mm7, mm5                   ;B3G3R3B2 -> mm7
        pmaddwd mm5, const_YBG0B        ;ybB3+ygG3,ybB2 -> mm5
        psrad mm0, FCONVERSION_BITS     ;32-bit scaled Y1Y0 -> mm0
        movq temp0, mm6                 ;R5B4G4R4 -> temp0
        movq mm6, mm3                   ;R3B2G2R2 -> mm6
        pmaddwd mm6, const_UR0GR        ;urR3,ugG2+urR2 -> mm6
        psrad mm2, FCONVERSION_BITS     ;32-bit scaled U1U0 -> mm2
        paddd mm1, mm5                  ;Y3Y2 -> mm1
        movq mm5, mm7                   ;B3G3R3B2 -> mm5
        pmaddwd mm7, const_UBG0B        ;ubB3+ugG3,ubB2 -> mm7
        psrad mm1, FCONVERSION_BITS     ;32-bit scaled Y3Y2 -> mm1
        pmaddwd mm3, const_VR0GR        ;vrR3,vgG2+vrR2 -> mm3
        packssdw mm0, mm1               ;Y3Y2Y1Y0 -> mm0
        pmaddwd mm5, const_VBG0B        ;vbB3+vgG3,vbB2 -> mm5
        psrad mm4, FCONVERSION_BITS     ;32-bit scaled V1V0 -> mm4
        movq mm1, [eax][16]             ;load #3 B7G7R7B6G6R6B5G5 -> mm1
        paddd mm6, mm7                  ;U3U2 -> mm6
        movq mm7, mm1                   ;B7G7R7B6G6R6B5G5 -> mm7
        psrad mm6, FCONVERSION_BITS     ;32-bit scaled U3U2 -> mm6
        paddd mm3, mm5                  ;V3V2 -> mm3
        psllq mm7, 16                   ;R7B6G6R6B5G500 -> mm7
        movq mm5, mm7                   ;R7B6G6R6B5G500 -> mm5
        psrad mm3, FCONVERSION_BITS     ;32-bit scaled V3V2 -> mm3
        movq tempY, mm0                 ;packed 16-bit Y3Y2Y1Y0 -> tempY
        packssdw mm2, mm6               ;packed 16-bit U3U2U1U0 -> mm2
        movq mm0, temp0                 ;R5B4G4R4 -> mm0
        punpcklbw mm7, const_0          ;B5G500 -> mm7
        movq mm6, mm0                   ;R5B4G4R4 -> mm6
        movq tempU, mm2                 ;packed 16-bit U3U2U1U0 -> tempU
        psrlq mm0, 32                   ;00R5B4 -> mm0
        paddw mm7, mm0                  ;B5G5R5B4 -> mm7
        movq mm2, mm6                   ;R5B4G4R4 -> mm2
        pmaddwd mm2, const_YR0GR        ;yrR5,ygG4+yrR4 -> mm2
        movq mm0, mm7                   ;B5G5R5B4 -> mm0
        pmaddwd mm7, const_YBG0B        ;ybB5+ygG5,ybB4 -> mm7
        packssdw mm4, mm3               ;packed 16-bit V3V2V1V0 -> mm4
        add eax, 24                     ;increment RGB count
        movq tempV, mm4                 ;packed 16-bit V3V2V1V0 -> tempV
        movq mm4, mm6                   ;R5B4G4R4 -> mm4
        pmaddwd mm6, const_UR0GR        ;urR5,ugG4+urR4 -> mm6
        movq mm3, mm0                   ;B5G5R5B4 -> mm3
        pmaddwd mm0, const_UBG0B        ;ubB5+ugG5,ubB4 -> mm0
        paddd mm2, mm7                  ;Y5Y4 -> mm2
        pmaddwd mm4, const_VR0GR        ;vrR5,vgG4+vrR4 -> mm4
        pxor mm7, mm7                   ;0 -> mm7
        pmaddwd mm3, const_VBG0B        ;vbB5+vgG5,vbB4 -> mm3
        punpckhbw mm1, mm7              ;B7G7R7B6 -> mm1
        paddd mm0, mm6                  ;U5U4 -> mm0
        movq mm6, mm1                   ;B7G7R7B6 -> mm6
        pmaddwd mm6, const_YBG0B        ;ybB7+ygG7,ybB6 -> mm6
        punpckhbw mm5, mm7              ;R7B6G6R6 -> mm5
        movq mm7, mm5                   ;R7B6G6R6 -> mm7
        paddd mm3, mm4                  ;V5V4 -> mm3
        pmaddwd mm5, const_YR0GR        ;yrR7,ygG6+yrR6 -> mm5
        movq mm4, mm1                   ;B7G7R7B6 -> mm4
        pmaddwd mm4,const_UBG0B         ;ubB7+ugG7,ubB6 -> mm4
        psrad mm0, FCONVERSION_BITS     ;32-bit scaled U5U4 -> mm0
        psrad mm2, FCONVERSION_BITS     ;32-bit scaled Y5Y4 -> mm2
        nop                             ;//JS
        paddd mm6, mm5                  ;Y7Y6 -> mm6
        movq mm5, mm7                   ;R7B6G6R6 -> mm5
        pmaddwd mm7, const_UR0GR        ;urR7,ugG6+urR6 -> mm7
        psrad mm3, FCONVERSION_BITS     ;32-bit scaled V5V4 -> mm3
        pmaddwd mm1, const_VBG0B        ;vbB7+vgG7,vbB6 -> mm1
        psrad mm6, FCONVERSION_BITS     ;32-bit scaled Y7Y6 -> mm6
        packssdw mm2, mm6               ;Y7Y6Y5Y4 -> mm2
        pmaddwd mm5, const_VR0GR        ;vrR7,vgG6+vrR6 -> mm5
        paddd mm7, mm4                  ;U7U6 -> mm7
        psrad mm7, FCONVERSION_BITS     ;32-bit scaled U7U6 -> mm7
        movq mm6, tempY                 ;packed 16-bit Y3Y2Y1Y0 -> mm6
        packssdw mm0, mm7               ;packed 16-bit U7U6U5U4 -> mm0
        movq mm4, tempU                 ;packed 16-bit U3U2U1U0 -> mm4
        packuswb mm6, mm2               ;all 8 Y values -> mm6
        movq mm7, const_128             ;128,128,128,128 -> mm7
        paddd mm1, mm5                  ;V7V6 -> mm1
        paddw mm0, mm7                  ;add offset to U7U6U5U4
        paddw mm4, mm7                  ;add offset to U3U2U1U0
        psrad mm1, FCONVERSION_BITS     ;32-bit scaled V7V6 -> mm1
        movq [ebx], mm6                 ;store Y
        packuswb mm4, mm0               ;all 8 U values -> mm4
        movq mm5, tempV                 ;packed 16-bit V3V2V1V0 -> mm5
        packssdw mm3, mm1               ;V7V6V5V4 -> mm3
        paddw mm5, mm7                  ;add offset to V3V2V1V0
        paddw mm3, mm7                  ;add offset to V7V6V5V4
        movq [ecx], mm4                 ;store U
        packuswb mm5, mm3               ;all 8 V values -> mm5
        add ebx, 8                      ;increment Y count
        add ecx, 8                      ;increment U count
        movq [edx], mm5                 ;store V
        add edx, 8                      ;increment V count
        dec edi                         ;decrement loop counter
        jnz RGBtoYUV                    ;do 24 more bytes if not 0
        //JS The following emms instruction is purposely commented out.
        //emms // commented out since it is done after the DCT
    } // end of __asm
} // end of MRGB2YCbCr
void MRGBA2YCbCrA(
int rows,
int cols,
unsigned char *inRGBA,
unsigned char *outY,
unsigned char *outU,
unsigned char *outV,
unsigned char *outA)
{
// make global to align on stack properly
// __int64 tempY, tempU, tempV, tempA;
// written by Dave Shade - Intel Corp.
// Feb '97
//
// This color space conversion routine converts
// true color pixels from RGBA to YCbCrA
// one pass through the loop processes 4 pixels
// there is no provision for cols not an even multiple of 4
__asm {
// initializations
//DS - IJG will always call with rows=1, so don't multiply
// mov eax, rows
// mov ebx, cols
// mul ebx ;number pixels
// reorder to take advantage of Pentium v-pipe
mov edi, cols
mov eax, inRGBA
shr edi, 2 ;number of loops = (rows*cols)/4
mov edx, outV
mov ecx, outU
mov esi, outA
mov ebx, outY
// top of loop
RGBAtoYUVA:
movq mm3, [eax+8] ;load #1 A1B1G1R1A0B0G0R0 -> mm3
pxor mm6, mm6 ;0 -> mm6
movq mm4, mm3 ;A1B1G1R1A0B0G0R0 -> mm4
psrlq mm3, 32 ;00000000A1B1G1R1 -> mm3
punpcklwd mm4, mm3 ;A1B1A0B0G1R1G0R0 -> mm4
add esi, 4
movq mm0, mm4 ;A1B1A0B0G1R1G0R0 -> mm0
punpckhbw mm4, mm6 ;A1B1A0B0 -> mm4
movq mm3, mm4 ;A1B1A0B0 -> mm3
punpcklbw mm0, mm6 ;G1R1G0R0 -> mm0
pmaddwd mm3, const2_Y0B0B ;ybB1,ybB0 -> mm3
movq mm1, mm0 ;G1R1G0R0 -> mm1
pmaddwd mm0, const2_YGRGR ;yrG1+ygR1,ygG0+yrR0 -> mm0
movq mm5, mm4 ;A1B1A0B0 -> mm5
pmaddwd mm4, const2_U0B0B ;ubB1,ubB0 -> mm4
movq mm2, mm1 ;G1R1G0R0 -> mm2
pmaddwd mm1, const2_UGRGR ;urG1+ugR1,ugG0+urR0 -> mm1
movq mm7, mm5 ;A1B1A0B0 -> mm7
pmaddwd mm5, const2_V0B0B ;vbB1,vbB0 -> mm5
paddd mm0, mm3 ;Y1Y0 -> mm0
pmaddwd mm2, const2_VGRGR ;vgG1+vrR1,vgG0+vrR0 -> mm2
psrad mm0, FCONVERSION_BITS ;32 bit scaled Y1Y0
movq mm3, [eax] ;*load #2 A3B3G3R3A2B2G2R2 -> mm3
paddd mm1, mm4 ;U1U0 -> mm2
pmaddwd mm7, const2_A ;1*A1,1*A0
psrad mm1, FCONVERSION_BITS ;32 bit scaled U1U0
movq tempY, mm0 ;write out Y1Y0 in 32 bit format
paddd mm2, mm5 ;V1V0 -> mm2
movq mm4, mm3 ;*A3B3G3R3A2B2G2R2 -> mm4
psrad mm2, FCONVERSION_BITS ;32bit scaled V1V0
movq tempU, mm1 ;write out U1U0 in 32 bit format
psrlq mm3, 32 ;*00000000A3B3G3R3 -> mm3
movq tempV, mm2 ;write out V1V0 in 32 bit format
punpcklwd mm4, mm3 ;*A3B3A2B2G3R3G2R2 -> mm4
movq tempA, mm7
movq mm0, mm4 ;*A3B3A2B2G3R3G2R2 -> mm0
punpckhbw mm4, mm6 ;*A3B3A2B2 -> mm4
add eax, 16
movq mm3, mm4 ;*A3B3A2B2 -> mm3
punpcklbw mm0, mm6 ;*G3R3G2R2 -> mm0
pmaddwd mm3, const2_Y0B0B ;*ybB3,ybB2 -> mm3
movq mm1, mm0 ;*G3R3G2R2 -> mm1
pmaddwd mm0, const2_YGRGR ;*yrG3+ygR3,ygG2+yrR2 -> mm0
movq mm5, mm4 ;*A3B3A2B2 -> mm5
pmaddwd mm4, const2_U0B0B ;*ubB3,ubB2 -> mm4
movq mm2, mm1 ;*G3R3G2R2 -> mm2
pmaddwd mm1, const2_UGRGR ;*urG3+ugR3,ugG2+urR2 -> mm1
movq mm7, mm5 ;*A3B3A2B2 -> mm7
pmaddwd mm5, const2_V0B0B ;*vbB3,vbB2 -> mm5
paddd mm0, mm3 ;*Y3Y2 -> mm0
pmaddwd mm2, const2_VGRGR ;*vgG3+vrR3,vgG2+vrR2 -> mm2
psrad mm0, FCONVERSION_BITS
pmaddwd mm7, const2_A ;* 1*A3,1*A2
paddd mm1, mm4 ;*U3U2 -> mm2
movq mm6, const_128
psrad mm1, FCONVERSION_BITS
packssdw mm0, tempY ;*pack Y3Y2,Y1Y0 -> mm0
paddd mm2, mm5 ;*V3V2 -> mm2
psrad mm2, FCONVERSION_BITS
add ebx, 4
packssdw mm1, tempU ;*pack U3U2,U1U0 -> mm1
packssdw mm2, tempV ;*pack V3V2,V1V0 -> mm2
paddw mm1, mm6 ;add 128
packssdw mm7, tempA ;*pack A3A2,A1A0 -> mm7
paddw mm2, mm6 ;add 128
packuswb mm0, mm0
add ecx, 4
packuswb mm1, mm1
add edx, 4
movd [ebx-4], mm0
packuswb mm2, mm2
movd [ecx-4], mm1
packuswb mm7, mm7
movd [edx-4], mm2
movd [esi-4], mm7
dec edi
jnz RGBAtoYUVA
//JS The following emms instruction is purposely commented out.
//emms // commented out since it is done after the DCT
} // end of __asm
} // end of MRGBA2YCbCrA
void MRGBA2YCbCrALegacy(
int rows,
int cols,
unsigned char *inRGBA,
unsigned char *outY,
unsigned char *outU,
unsigned char *outV,
unsigned char *outA)
{
// ensure proper stack alignment by making global
// __int64 tempY, tempU, tempV, tempA;
// written by Dave Shade - Intel Corp.
// Feb '97
//
// This color space conversion routine converts
// true color pixels from RGBA to YCbCrA
// This routine subtracts the RGB components from 255 before converting them
// one pass through the loop processes 4 pixels
// there is no provision for cols not an even multiple of 4
__asm {
// initializations
//DS - IJG will always call with rows=1, so don't multiply
// mov eax, rows
// mov ebx, cols
// mul ebx ;number pixels
// reorder to take advantage of Pentium v-pipe
mov edi, cols
mov eax, inRGBA
shr edi, 2 ;number of loops = (rows*cols)/4
mov edx, outV
mov ecx, outU
mov esi, outA
mov ebx, outY
// top of loop
RGBAtoYUVALegacy:
movq mm3, [eax+8] ;load #1 A1B1G1R1A0B0G0R0 -> mm3
pxor mm6, mm6 ;0 -> mm6
pxor mm3, const2_Legacy ; subtract MaxJSample FlashPix rev. 1 "thing"
movq mm4, mm3 ;A1B1G1R1A0B0G0R0 -> mm4
psrlq mm3, 32 ;00000000A1B1G1R1 -> mm3
punpcklwd mm4, mm3 ;A1B1A0B0G1R1G0R0 -> mm4
add esi, 4 ;opportunistically increment pointer
movq mm0, mm4 ;A1B1A0B0G1R1G0R0 -> mm0
punpckhbw mm4, mm6 ;A1B1A0B0 -> mm4
movq mm3, mm4 ;A1B1A0B0 -> mm3
punpcklbw mm0, mm6 ;G1R1G0R0 -> mm0
pmaddwd mm3, const2_Y0B0B ;ybB1,ybB0 -> mm3
movq mm1, mm0 ;G1R1G0R0 -> mm1
pmaddwd mm0, const2_YGRGR ;yrG1+ygR1,ygG0+yrR0 -> mm0
movq mm5, mm4 ;A1B1A0B0 -> mm5
pmaddwd mm4, const2_U0B0B ;ubB1,ubB0 -> mm4
movq mm2, mm1 ;G1R1G0R0 -> mm2
pmaddwd mm1, const2_UGRGR ;urG1+ugR1,ugG0+urR0 -> mm1
movq mm7, mm5 ;A1B1A0B0 -> mm7
pmaddwd mm5, const2_V0B0B ;vbB1,vbB0 -> mm5
paddd mm0, mm3 ;Y1Y0 -> mm0
pmaddwd mm2, const2_VGRGR ;vgG1+vrR1,vgG0+vrR0 -> mm2
psrad mm0, FCONVERSION_BITS ;32 bit scaled Y1Y0
psrld mm7, 16 ;shift A1A0 down
movq mm3, [eax] ;*load #2 A3B3G3R3A2B2G2R2 -> mm3
paddd mm1, mm4 ;U1U0 -> mm2
pxor mm3, const2_Legacy
psrad mm1, FCONVERSION_BITS ;32 bit scaled U1U0
movq tempY, mm0 ;write out Y1Y0 in 32 bit format
paddd mm2, mm5 ;V1V0 -> mm2
movq mm4, mm3 ;*A3B3G3R3A2B2G2R2 -> mm4
psrad mm2, FCONVERSION_BITS ;32bit scaled V1V0
movq tempU, mm1 ;write out U1U0 in 32 bit format
psrlq mm3, 32 ;*00000000A3B3G3R3 -> mm3
movq tempV, mm2 ;write out V1V0 in 32 bit format
punpcklwd mm4, mm3 ;*A3B3A2B2G3R3G2R2 -> mm4
movq tempA, mm7
movq mm0, mm4 ;*A3B3A2B2G3R3G2R2 -> mm0
punpckhbw mm4, mm6 ;*A3B3A2B2 -> mm4
add eax, 16 ;opportunistically increment pointer
movq mm3, mm4 ;*A3B3A2B2 -> mm3
punpcklbw mm0, mm6 ;*G3R3G2R2 -> mm0
pmaddwd mm3, const2_Y0B0B ;*ybB3,ybB2 -> mm3
movq mm1, mm0 ;*G3R3G2R2 -> mm1
pmaddwd mm0, const2_YGRGR ;*yrG3+ygR3,ygG2+yrR2 -> mm0
movq mm5, mm4 ;*A3B3A2B2 -> mm5
pmaddwd mm4, const2_U0B0B ;*ubB3,ubB2 -> mm4
movq mm2, mm1 ;*G3R3G2R2 -> mm2
pmaddwd mm1, const2_UGRGR ;*urG3+ugR3,ugG2+urR2 -> mm1
movq mm7, mm5 ;*A3B3A2B2 -> mm7
pmaddwd mm5, const2_V0B0B ;*vbB3,vbB2 -> mm5
paddd mm0, mm3 ;*Y3Y2 -> mm0
pmaddwd mm2, const2_VGRGR ;*vgG3+vrR3,vgG2+vrR2 -> mm2
psrad mm0, FCONVERSION_BITS ;shift Y3Y2 by 15 bits
psrld mm7, 16 ;shift the alpha values down
paddd mm1, mm4 ;*U3U2 -> mm2
movq mm6, const_128 ; load mm6 with 128
psrad mm1, FCONVERSION_BITS ;shift U3U2 by 15 bits
packssdw mm0, tempY ;*pack Y3Y2,Y1Y0 -> mm0
paddd mm2, mm5 ;*V3V2 -> mm2
packssdw mm1, tempU ;*pack U3U2,U1U0 -> mm1
psrad mm2, FCONVERSION_BITS ;shift V3V2 by 15 bits
add ebx, 4 ;opportunistically increment pointer
packssdw mm2, tempV ;pack V3V2,V1V0 -> mm2
paddw mm1, mm6 ;add 128
packssdw mm7, tempA ;pack A3A2,A1A0 -> mm7
paddw mm2, mm6 ;add 128
packuswb mm0, mm0 ;pack Y3Y2Y1Y0 from 16 bit to 8 bit
add ecx, 4 ;opportunistically increment pointer
packuswb mm1, mm1 ;pack U3U2U1U0 from 16 bit to 8 bit
add edx, 4 ;opportunistically increment pointer
movd [ebx-4], mm0 ;write out Y3Y2Y1Y0
packuswb mm2, mm2 ;pack V3V2V1V0 from 16 bit to 8 bit
movd [ecx-4], mm1 ;write out U3U2U1U0
packuswb mm7, mm7 ;pack A3A2A1A0 from 16 bit to 8 bits
movd [edx-4], mm2 ;write out V3V2V1V0
movd [esi-4], mm7 ;write out A3A2A1A0
dec edi ;subtract 4 from number of pixels
jnz RGBAtoYUVALegacy
//JS The following emms instruction is purposely commented out.
//emms // commented out since it is done after the DCT
} // end of __asm
} // end of MRGBA2YCbCrALegacy
#endif // defined (_X86_)