2020-09-30 16:53:55 +02:00

1269 lines
32 KiB
C++

/* *************************************************************************
** INTEL Corporation Proprietary Information
**
** This listing is supplied under the terms of a license
** agreement with INTEL Corporation and may not be copied
** nor disclosed except in accordance with the terms of
** that agreement.
**
** Copyright (c) 1995 Intel Corporation.
** All Rights Reserved.
**
** *************************************************************************
*/
#include "precomp.h"
#if defined(H263P) || defined(USE_BILINEAR_MSH26X) // {
/***************************************************
* H26X_YVU9toYUV12()
* Convert from YVU9 to YUV12
* and copy to destination memory with pitch
* defined by the constant PITCH.
*
* uv_plane_common()
* Helper function to convert V and U plane information.
* Since the process is similar for both planes, the
* conversion code was included in this subroutine.
*
***************************************************/
/*
 * Packed-byte helpers: each one treats a 32-bit word as four independent 8-bit
 * samples. In this file they are applied to U32 temporaries and byte pointers.
 */
/* val = dword read from src with every byte halved (bit 0 of each byte is dropped). */
#define READ_DWORD_AND_SHIFT(val,src) \
    ((val) = (((*((unsigned int *)(src))) & 0xFEFEFEFE) >> 1))
/* Store val as one dword at dest. */
#define WRITE_DWORD(dest,val) \
    (*((unsigned int *)(dest)) = (val))
/* out = ((in1 + in2) with bit 0 of each byte cleared) >> 1; a per-byte average
   as long as no byte sum carries into its neighbour (inputs already halved). */
#define AVERAGE_DWORDS(out,in1,in2) \
    ((out) = ((((in1) + (in2)) & 0xFEFEFEFE) >> 1))
/* Write bytes 0 and 1 of val to dest, each one twice: b0 b0 b1 b1. */
#define DUP_LOWER_TWO_BYTES(dest,val) \
    (*((unsigned int *)(dest)) = ((((val) & 0x000000FFu) * 0x00000101u) | \
                                  ((((val) >> 8) & 0x000000FFu) * 0x01010000u)))
/* Write bytes 2 and 3 of val to dest, each one twice: b2 b2 b3 b3. */
#define DUP_UPPER_TWO_BYTES(dest,val) \
    (*((unsigned int *)(dest)) = (((((val) >> 16) & 0x000000FFu) * 0x00000101u) | \
                                  ((((val) >> 24) & 0x000000FFu) * 0x01010000u)))
/*
 * C_uv_plane_common()
 * Expand one YVU9 chroma plane into one YUV12 chroma plane.
 *
 * Every source byte is halved (>> 1) and written twice horizontally. Every
 * source row produces two destination rows: a straight copy, and the average
 * of that row with the following source row.
 *
 * lpbiInput    - input bitmap header; biWidth and biHeight are read.
 * OutputWidth  - output frame width; OutputWidth >> 1 bytes are produced per row
 *                before any right-edge padding.
 * OutputHeight - output frame height.
 * psrc         - first source chroma byte to convert.
 * Plane        - first byte of the destination plane.
 * pitch        - distance in bytes between consecutive destination rows.
 */
static void C_uv_plane_common(
LPBITMAPINFOHEADER lpbiInput,
WORD OutputWidth,
WORD OutputHeight,
U8 *psrc,
U8 *Plane,
const int pitch) {
    U8 *pprev;
    U8 *pnext = psrc + (lpbiInput->biWidth >> 2);   // source row following psrc
    U8 *pdest_copy = Plane;                         // destination row that gets the copy
    U8 *pdest_avg = Plane + pitch;                  // destination row that gets the average
    U8 t, tb1, tb2;
    U32 t1, t2;
    int i, j, k;
    int fill_rows;
    int dest_pitch_adj;
    int widthx4 = ((OutputWidth >> 2) + 0x3) & ~0x3;
    int heightx4;
    int width_diff = 0;
    int height_diff = 0;
    int stretch = 0;
    int flag = 0;
    int NextSrcLine = 0;
    int ChromaIters = 1;
    int mark = (OutputHeight >> 2);

    if (lpbiInput->biHeight > OutputHeight) {
        // Input taller than output: rows are produced in groups, two groups per
        // 48 output lines, with one extra interpolated row after each group
        // when the input is also wider than the output.
        for (ChromaIters = 0, i = OutputHeight; i > 0; i -= 48) {
            ChromaIters += 2;
        }
        NextSrcLine = (lpbiInput->biWidth - OutputWidth) >> 2;
        stretch = (NextSrcLine ? 1 : 0);
        mark = 6 - stretch;
        flag = stretch;
    } else {
        // No cropping: work out how much right-edge and bottom padding is needed
        // to reach the rounded-up plane size.
        width_diff = widthx4 - (OutputWidth >> 2);
        heightx4 = ((lpbiInput->biHeight >> 2) + 0x3) & ~0x3;
        height_diff = (heightx4 - (lpbiInput->biHeight >> 2)) << 1;
    }
    dest_pitch_adj = pitch - (widthx4 << 1);

    for (j = ChromaIters; j > 0; j--) {
        // With stretch set, flag alternates the group length between mark and mark + 1.
        for (k = mark + (flag & 1); k > 0; k--) {
            if (!stretch && (1 == j) && (1 == k)) {
                // Last source row: there is no following row, so average it with itself.
                pnext = psrc;
            }
            // Four source bytes -> eight destination bytes per iteration.
            for (i = (OutputWidth >> 1); (i & ~0x7); i-=8, psrc+=4, pnext+=4,
                 pdest_copy+=8, pdest_avg+=8) {
                READ_DWORD_AND_SHIFT(t1,psrc);
                DUP_LOWER_TWO_BYTES(pdest_copy,t1);
                DUP_UPPER_TWO_BYTES((pdest_copy+4),t1);
                READ_DWORD_AND_SHIFT(t2,pnext);
                AVERAGE_DWORDS(t1,t1,t2);
                DUP_LOWER_TWO_BYTES(pdest_avg,t1);
                DUP_UPPER_TWO_BYTES((pdest_avg+4),t1);
            }
            // Remaining 4 and/or 2 destination bytes, one source byte at a time.
            if (i & 0x4) {
                t = *psrc++ >> 1;
                *(U16*)pdest_copy = t | (t<<8);
                t = (t + (*pnext++ >> 1)) >> 1;
                *(U16*)pdest_avg = t | (t<<8);
                t = *psrc++ >> 1;
                *(U16*)(pdest_copy+2) = t | (t<<8);
                t = (t + (*pnext++ >> 1)) >> 1;
                *(U16*)(pdest_avg+2) = t | (t<<8);
                pdest_copy += 4; pdest_avg += 4;
            }
            if (i & 0x2) {
                t = *psrc++ >> 1;
                *(U16*)pdest_copy = t | (t<<8);
                t = (t + (*pnext++ >> 1)) >> 1;
                *(U16*)pdest_avg = t | (t<<8);
                pdest_copy += 2; pdest_avg += 2;
            }
            // Right-edge padding: repeat the last byte of each row, two bytes per
            // unit of width_diff (width_diff is 1, 2 or 3 when non-zero).
            if (width_diff) {
                tb1 = *(pdest_copy-1);
                tb2 = *(pdest_avg-1);
                *pdest_copy++ = tb1; *pdest_copy++ = tb1;
                *pdest_avg++ = tb2; *pdest_avg++ = tb2;
                if ((width_diff-1) > 0) {
                    *pdest_copy++ = tb1; *pdest_copy++ = tb1;
                    *pdest_avg++ = tb2; *pdest_avg++ = tb2;
                }
                if ((width_diff-2) > 0) {
                    *pdest_copy++ = tb1; *pdest_copy++ = tb1;
                    *pdest_avg++ = tb2; *pdest_avg++ = tb2;
                }
            }
            psrc += NextSrcLine;
            pnext += NextSrcLine;
            pdest_copy = pdest_avg + dest_pitch_adj;
            pdest_avg = pdest_copy + pitch;
        }
        // Bottom padding: replicate the last written row height_diff times.
        // height_diff is only non-zero on the non-crop path, where ChromaIters is 1.
        // The loop uses its own counter so it cannot disturb the outer loop on j.
        if (height_diff) {
            pprev = pdest_copy - pitch;
            for (fill_rows = height_diff; fill_rows > 0; fill_rows--) {
                for (i = widthx4; i>0; i--) {
                    *pdest_copy++ = *pprev++;
                    *pdest_copy++ = *pprev++;
                }
                pprev += dest_pitch_adj;
                pdest_copy += dest_pitch_adj;
            }
        }
        // Extra interpolated row between groups when stretching.
        if (stretch) {
            psrc -= (lpbiInput->biWidth >> 2);
            pnext -= (lpbiInput->biWidth >> 2);
            pdest_avg = pdest_copy;
            for (i = OutputWidth >> 1; i > 0; i -= 8, psrc += 4, pnext += 4,
                 pdest_avg += 8) {
                READ_DWORD_AND_SHIFT(t1,psrc);
                READ_DWORD_AND_SHIFT(t2,pnext);
                // NOTE(review): averaging with t2 twice weights the result 1:3
                // towards the following row; presumably intentional -- confirm.
                AVERAGE_DWORDS(t1,t1,t2);
                AVERAGE_DWORDS(t1,t1,t2);
                DUP_LOWER_TWO_BYTES(pdest_avg,t1);
                DUP_UPPER_TWO_BYTES((pdest_avg+4),t1);
            }
            psrc += NextSrcLine;
            pnext += NextSrcLine;
            pdest_copy = pdest_avg + dest_pitch_adj;
            pdest_avg = pdest_copy + pitch;
            flag++;
        }
    }
}
/*
 * C_H26X_YVU9toYUV12()
 * Convert a YVU9 frame to YUV12.
 *
 * The luma plane is copied with every byte halved (>> 1). When the input is
 * taller than the output the image is cropped and one interpolated row is
 * inserted after every 11 copied rows; otherwise the plane is padded on the
 * right and at the bottom up to multiples of 16 by replicating edge samples.
 * The two chroma planes are then converted by C_uv_plane_common().
 *
 * lpbiInput    - input bitmap header; biWidth and biHeight are read.
 * OutputWidth  - output frame width.
 * OutputHeight - output frame height.
 * lpInput      - start of the input frame (Y plane, then V plane, then U plane).
 * YPlane       - destination luma plane.
 * UPlane       - destination U plane.
 * VPlane       - destination V plane.
 * pitch        - distance in bytes between consecutive destination rows.
 */
void C_H26X_YVU9toYUV12(
LPBITMAPINFOHEADER lpbiInput,
WORD OutputWidth,
WORD OutputHeight,
U8 *lpInput,
U8 *YPlane,
U8 *UPlane,
U8 *VPlane,
const int pitch) {
    U32 *pnext, *plast, *pbn;
    U32 *pyprev, *pyspace;
    U8 *pvsrc, *pusrc;
    int t;
    int i, j, k;
    int fill_rows;
    int NextLine;
    int widthx16 = 0;           // only meaningful on the non-crop path
    int heightx16;
    int width_diff = 0;
    int height_diff = 0;
    int width_adj = 0;
    int height_adj = 0;
    int stretch = 0;
    int aspect = 0;
    int word_ypitch_adj = 0;
    int LumaIters = 1;
    int mark = OutputHeight;
    int byte_ypitch_adj = pitch - OutputWidth;

    if (lpbiInput->biHeight > OutputHeight) {
        // Crop/stretch path: four groups of 11 rows per 48 output lines, each
        // group followed by one interpolated row.
        for (LumaIters = 0, i = OutputHeight; i > 0; i -= 48) {
            LumaIters += 4;
        }
        width_adj = (lpbiInput->biWidth - OutputWidth) >> 1;
        aspect = LumaIters;
        height_adj = (lpbiInput->biHeight - (OutputHeight - aspect)) >> 1;
        stretch = 1;
        mark = 11;
    } else {
        // Padding path: amounts needed to reach multiples of 16.
        widthx16 = (lpbiInput->biWidth + 0xF) & ~0xF;
        width_diff = widthx16 - OutputWidth;
        byte_ypitch_adj -= width_diff;
        word_ypitch_adj = byte_ypitch_adj >> 2;
        heightx16 = (lpbiInput->biHeight + 0xF) & ~0xF;
        height_diff = heightx16 - OutputHeight;
    }
    NextLine = width_adj >> 1;      // dwords to skip at the end of each input row
    pnext = (U32 *)(lpInput + (lpbiInput->biWidth * height_adj) + width_adj);

    for (j = LumaIters; j > 0; j--) {
        for (k = mark; k > 0; k--) {
            // Copy one row, halving every byte: 16, then 8, then 4 bytes at a time.
            for (i = OutputWidth; (i & ~0xF); i-=16, YPlane+=16) {
                *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F;
                *(U32 *)(YPlane+4) = (*pnext++ >> 1) & 0x7F7F7F7F;
                *(U32 *)(YPlane+8) = (*pnext++ >> 1) & 0x7F7F7F7F;
                *(U32 *)(YPlane+12) = (*pnext++ >> 1) & 0x7F7F7F7F;
            }
            if (i & 0x8) {
                *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F;
                *(U32 *)(YPlane+4) = (*pnext++ >> 1) & 0x7F7F7F7F;
                YPlane += 8;
            }
            if (i & 0x4) {
                *(U32 *)YPlane = (*pnext++ >> 1) & 0x7F7F7F7F;
                YPlane += 4;
            }
            // Right-edge padding: replicate the last pixel of the row.
            if (width_diff) {
                t = (*(YPlane-1)) << 24;
                t |= (t>>8) | (t>>16) | (t>>24);
                *(U32 *)YPlane = t;
                if ((width_diff-4) > 0) {
                    *(U32 *)(YPlane + 4) = t;
                }
                if ((width_diff-8) > 0) {
                    *(U32 *)(YPlane + 8) = t;
                }
                YPlane += width_diff;
            }
            pnext += NextLine;
            YPlane += byte_ypitch_adj;
        }
        // Bottom padding: replicate the last row height_diff times.
        // height_diff is only non-zero on the padding path, where LumaIters is 1.
        // The loop uses its own counter so it cannot disturb the outer loop on j.
        if (height_diff) {
            pyprev = (U32 *)(YPlane - pitch);
            pyspace = (U32 *)YPlane;
            for (fill_rows = height_diff; fill_rows > 0; fill_rows--) {
                for (i = widthx16; i>0; i -=4) {
                    *pyspace++ = *pyprev++;
                }
                pyspace += word_ypitch_adj;
                pyprev += word_ypitch_adj;
            }
        }
        // Interpolated row: sum of one quarter of the input row just consumed and
        // one quarter of the next input row (i.e. the average of the two halved rows).
        if (stretch) {
            plast = pnext - (lpbiInput->biWidth >> 2);
            pbn = pnext;
            for (i = OutputWidth; i > 0; i -= 4, YPlane += 4, plast++, pbn++) {
                *(U32 *)YPlane =
                    ( ((*plast & 0xFCFCFCFC) >> 2) +
                      ((*pbn & 0xFCFCFCFC) >> 2) );
            }
            YPlane += byte_ypitch_adj;
        }
    }

    // Chroma: the V plane follows the luma plane and the U plane follows V, each
    // (biWidth/4) x (biHeight/4) bytes. Apply the same crop offset at chroma scale.
    pvsrc = lpInput + (lpbiInput->biWidth * lpbiInput->biHeight);
    pusrc = pvsrc + ((lpbiInput->biWidth>>2) * (lpbiInput->biHeight>>2));
    t = ((lpbiInput->biWidth>>2) * (height_adj>>2)) + (width_adj>>2);
    pvsrc += t;
    pusrc += t;
    C_uv_plane_common(lpbiInput,OutputWidth,OutputHeight,pusrc,UPlane,pitch);
    C_uv_plane_common(lpbiInput,OutputWidth,OutputHeight,pvsrc,VPlane,pitch);
}
/***************************************************
* H26X_YUY2toYUV12()
* Convert from YUY2 to YUV12
* and copy to destination memory with pitch
* defined by the constant PITCH.
*
***************************************************/
#if 0 // { 0
// Disabled C version of the YUY2 conversion; it is excluded from the build by the
// #if 0 above. The C statements quoted in the comments of P5_H26X_YUY2toYUV12
// correspond to this code. It depends on the macros C_RGB_COLOR_CONVERT_INIT,
// C_WIDTH_FILL and C_HEIGHT_FILL, which are not defined in the visible part of this
// file; presumably they declare the locals used here (i, j, k, t, tm, pnext,
// pyprev, pynext, pyspace, LumaIters, mark, stretch, BackTwoLines, byte_ypitch_adj,
// byte_uvpitch_adj) -- TODO confirm before re-enabling.
void C_H26X_YUY2toYUV12(
LPBITMAPINFOHEADER lpbiInput,
WORD OutputWidth,
WORD OutputHeight,
U8 *lpInput,
U8 *YPlane,
U8 *UPlane,
U8 *VPlane,
const int pitch) {
U8 *pline;
C_RGB_COLOR_CONVERT_INIT
// Since YUY2 is so much like RGB (inverted image), the macro used to initialize
// RGB conversion is also used here. However, there are some local variables
// declared in C_RGB_COLOR_CONVERT_INIT that are not used here. The following
// assignment is here simply to avoid warnings.
t = t;
pline = (U8 *)pnext;
for ( j = 0; j < LumaIters; j++) {
for (k = 0; k < mark; k++) {
// Four pixels (eight input bytes) per iteration: bytes 0/2/4/6 are luma.
for (i = OutputWidth; i > 0; i-=4, pline+=8, YPlane+=4) {
*(U32 *)YPlane = (*pline>>1) | ((*(pline+ 2)<<7)&0x7F00) |
((*(pline+ 4)<<15)&0x7F0000) | ((*(pline+ 6)<<23)&0x7F000000);
// Chroma is kept on even rows only: bytes 1/5 go to U, bytes 3/7 go to V.
if (0 == (k & 1)) {
*(U16 *)UPlane = (*(pline+ 1)>>1) | ((*(pline+ 5)<<7)&0x7F00);
*(U16 *)VPlane = (*(pline+ 3)>>1) | ((*(pline+ 7)<<7)&0x7F00);
UPlane += 2; VPlane += 2;
}
}
pnext = (U32 *)pline;
C_WIDTH_FILL
// First row of every group after the first: fill the reserved row (pyspace)
// with the average of the rows at pyprev and pynext.
if (stretch && (0 == k) && j) {
for (i = OutputWidth; i > 0; i -= 8) {
tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
*pyspace++ = tm;
tm = ((*pyprev++ & 0xFEFEFEFE) >> 1);
tm += ((*pynext++ & 0xFEFEFEFE) >> 1);
*pyspace++ = tm;
}
}
pnext += BackTwoLines;
pline = (U8 *)pnext;
YPlane += byte_ypitch_adj;
if (0 == (k & 1)) {
UPlane += byte_uvpitch_adj;
VPlane += byte_uvpitch_adj;
}
}
// End of a group: reserve one output row to be interpolated later.
if (stretch) {
pyprev = (U32 *)(YPlane - pitch);
pyspace = (U32 *)YPlane;
pynext = (U32 *)(YPlane += pitch);
}
}
C_HEIGHT_FILL
// The last reserved row has no following row; copy the previous row into it.
if (stretch) {
for (i = OutputWidth; i > 0; i -= 4) {
*pyspace++ = *pyprev++;
}
}
}
#endif // } 0
// P5_H26X_YUY2toYUV12()
// Hand-scheduled x86 inline-assembly conversion of packed YUY2 input into separate
// Y, U and V planes. In the inner loop (label L6) bytes 0/2/4/6 of every 8-byte
// group are written to the Y plane, bytes 1/5 to UPlane and bytes 3/7 to VPlane,
// each shifted right by one bit first. Chroma is stored only when k is even, i.e.
// for every other row.
// The function is declared naked, so the register saves, the LOCALSIZE-byte local
// frame and the final ret are all coded explicitly below; the plain ret leaves the
// arguments on the stack for the caller.
// "P5" presumably refers to the Pentium, whose pairing rules the scheduling
// comments (U pipe, bank conflicts) allude to -- TODO confirm.
__declspec(naked)
void P5_H26X_YUY2toYUV12(
LPBITMAPINFOHEADER lpbiInput,
WORD OutputWidth,
WORD OutputHeight,
U8 *lpInput,
U8 *YPlane,
U8 *UPlane,
U8 *VPlane,
const int pitch)
{
// Permanent (callee-save) registers - ebx, esi, edi, ebp
// Temporary (caller-save) registers - eax, ecx, edx
//
// Stack frame layout
// | pitch | +136
// | VPlane | +132
// | UPlane | +128
// | YPlane | +124
// | lpInput | +120
// | OutputHeight | +116
// | OutputWidth | +112
// | lpbiInput | +108
// ----------------------------
// | return addr | +104
// | saved ebp | +100
// | saved ebx | + 96
// | saved esi | + 92
// | saved edi | + 88
// | output_width | + 84
// | pyprev | + 80
// | pyspace | + 76
// | pynext | + 72
// | puvprev | + 68
// | puvspace | + 64
// | i | + 60
// | j | + 56
// | k | + 52
// | BackTwoLines | + 48
// | widthx16 | + 44
// | heightx16 | + 40
// | width_diff | + 36
// | height_diff | + 32
// | width_adj | + 28
// | height_adj | + 24
// | stretch | + 20
// | aspect | + 16
// | LumaIters | + 12
// | mark | + 8
// | byte_ypitch_adj | + 4
// | byte_uvpitch_adj | + 0
// All offsets below are relative to esp after the four pushes and the
// "sub esp, LOCALSIZE" in the prologue. The UPLANE and VPLANE argument slots are
// updated in place and serve as the running chroma write pointers.
#define LOCALSIZE 88
#define PITCH_PARM 136
#define VPLANE 132
#define UPLANE 128
#define YPLANE 124
#define LP_INPUT 120
#define OUTPUT_HEIGHT_WORD 116
#define OUTPUT_WIDTH_WORD 112
#define LPBI_INPUT 108
#define OUTPUT_WIDTH 84
#define PYPREV 80
#define PYSPACE 76
#define PYNEXT 72
#define PUVPREV 68
#define PUVSPACE 64
#define LOOP_I 60
#define LOOP_J 56
#define LOOP_K 52
#define BACK_TWO_LINES 48
#define WIDTHX16 44
#define HEIGHTX16 40
#define WIDTH_DIFF 36
#define HEIGHT_DIFF 32
#define WIDTH_ADJ 28
#define HEIGHT_ADJ 24
#define STRETCH 20
#define ASPECT 16
#define LUMA_ITERS 12
#define MARK 8
#define BYTE_YPITCH_ADJ 4
#define BYTE_UVPITCH_ADJ 0
_asm {
push ebp
push ebx
push esi
push edi
sub esp, LOCALSIZE
// int width_diff = 0
// int height_diff = 0
// int width_adj = 0
// int height_adj = 0
// int stretch = 0
// int aspect = 0
xor eax, eax
mov [esp + WIDTH_DIFF], eax
mov [esp + HEIGHT_DIFF], eax
mov [esp + WIDTH_ADJ], eax
mov [esp + HEIGHT_ADJ], eax
mov [esp + STRETCH], eax
mov [esp + ASPECT], eax
// int LumaIters = 1
inc eax
mov [esp + LUMA_ITERS], eax
// int mark = OutputHeight
// int output_width = OutputWidth
// int byte_ypitch_adj = pitch - OutputWidth
// int byte_uvpitch_adj = pitch - (OutputWidth >> 1)
xor ebx, ebx
mov bx, [esp + OUTPUT_HEIGHT_WORD]
mov [esp + MARK], ebx
mov bx, [esp + OUTPUT_WIDTH_WORD]
mov [esp + OUTPUT_WIDTH], ebx
mov ecx, [esp + PITCH_PARM]
mov edx, ecx
sub ecx, ebx
mov [esp + BYTE_YPITCH_ADJ], ecx
sar ebx, 1
sub edx, ebx
mov [esp + BYTE_UVPITCH_ADJ], edx
// if (lpbiInput->biHeight > OutputHeight)
mov ebx, [esp + LPBI_INPUT]
mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
xor edx, edx
mov dx, [esp + OUTPUT_HEIGHT_WORD]
cmp ecx, edx
jle Lno_stretch
// for (LumaIters = 0, i = OutputHeight; i > 0; i -= 48) LumaIters += 4
// NOTE(review): the loop below exits only when edx reaches exactly zero (jnz),
// whereas the C form above tests i > 0. Presumably OutputHeight is always a
// multiple of 48 on this path -- confirm against callers.
xor ecx, ecx
Lrepeat48:
lea ecx, [ecx + 4]
sub edx, 48
jnz Lrepeat48
mov [esp + LUMA_ITERS], ecx
// aspect = LumaIters
mov [esp + ASPECT], ecx
// width_adj = (lpbiInput->biWidth - OutputWidth) >> 1
// width_adj *= lpbiInput->biBitCount
// width_adj >>= 3
mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
mov edx, [esp + OUTPUT_WIDTH]
sub ecx, edx
sar ecx, 1
xor edx, edx
mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
imul ecx, edx
sar ecx, 3
mov [esp + WIDTH_ADJ], ecx
// height_adj = (lpbiInput->biHeight - (OutputHeight - aspect)) >> 1
mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
xor edx, edx
mov dx, [esp + OUTPUT_HEIGHT_WORD]
sub ecx, edx
add ecx, [esp + ASPECT]
sar ecx, 1
mov [esp + HEIGHT_ADJ], ecx
// stretch = 1
// mark = 11
mov ecx, 1
mov edx, 11
mov [esp + STRETCH], ecx
mov [esp + MARK], edx
jmp Lif_done
Lno_stretch:
// widthx16 = (lpbiInput->biWidth + 0xF) & ~0xF
// width_diff = widthx16 - OutputWidth
mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
add ecx, 00FH
and ecx, 0FFFFFFF0H
mov [esp + WIDTHX16], ecx
mov edx, [esp + OUTPUT_WIDTH]
sub ecx, edx
mov [esp + WIDTH_DIFF], ecx
// byte_ypitch_adj -= width_diff
mov edx, [esp + BYTE_YPITCH_ADJ]
sub edx, ecx
mov [esp + BYTE_YPITCH_ADJ], edx
// byte_uvpitch_adj -= (width_diff >> 1)
mov edx, [esp + BYTE_UVPITCH_ADJ]
sar ecx, 1
sub edx, ecx
mov [esp + BYTE_UVPITCH_ADJ], edx
// heightx16 = (lpbiInput->biHeight + 0xF) & ~0xF
// height_diff = heightx16 - OutputHeight
mov ecx, (LPBITMAPINFOHEADER)[ebx].biHeight
add ecx, 00FH
and ecx, 0FFFFFFF0H
mov [esp + HEIGHTX16], ecx
xor edx, edx
mov dx, [esp + OUTPUT_HEIGHT_WORD]
sub ecx, edx
mov [esp + HEIGHT_DIFF], ecx
Lif_done:
// BackTwoLines = -(lpbiInput->biWidth + OutputWidth);
// BackTwoLines *= lpbiInput->biBitCount
// BackTwoLines >>= 3
// The value is negative: added to the input pointer after a row of OutputWidth
// pixels has been consumed, it moves the pointer to the start of the preceding
// input row, so the input is walked from its last used row upwards.
mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
mov edx, [esp + OUTPUT_WIDTH]
add ecx, edx
neg ecx
xor edx, edx
mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
imul ecx, edx
sar ecx, 3
mov [esp + BACK_TWO_LINES], ecx
// pnext = (U32 *)(lpInput +
// (((lpbiInput->biWidth * lpbiInput->biBitCount) >> 3)) *
// ((OutputHeight - aspect - 1) + height_adj)) +
// width_adj)
// assign (esi, pnext)
mov ecx, (LPBITMAPINFOHEADER)[ebx].biWidth
xor edx, edx
mov dx, (LPBITMAPINFOHEADER)[ebx].biBitCount
imul ecx, edx
sar ecx, 3
xor edx, edx
mov dx, [esp + OUTPUT_HEIGHT_WORD]
sub edx, [esp + ASPECT]
dec edx
add edx, [esp + HEIGHT_ADJ]
imul ecx, edx
add ecx, [esp + WIDTH_ADJ]
add ecx, [esp + LP_INPUT]
mov esi, ecx
// assign (edi, YPlane)
mov edi, [esp + YPLANE]
// for (j = 0; j < LumaIters; j++)
xor eax, eax
mov [esp + LOOP_J], eax
L4:
// for (k = 0; k < mark; k++)
xor eax, eax
mov [esp + LOOP_K], eax
L5:
// for (i = FrameWidth; i > 0; i -= 4, pnext += 8, YPlane += 4)
mov ebp, [esp + OUTPUT_WIDTH]
// The following jump is used to make sure the start of the loop begin in the U pipe.
jmp L6
// *(U32 *)YPlane = (*pline>>1) | ((*(pline+ 2)<<7)&0x7F00) |
// ((*(pline+ 4)<<15)&0x7F0000) | ((*(pline+ 6)<<23)&0x7F000000)
// Register usage:
// esi - ptr to interlaced (VYUY) input
// edi - ptr for writing Y values
// The loop relies on the upper 24 bits of eax being zero on entry: eax is cleared
// at L4 and again before every branch back to L5, and inside the loop only al is
// loaded before each "shr eax, 1". ebx, ecx and edx are masked after shifting, so
// their stale upper bits do not matter.
L6:
mov al, [esi]
mov cl, [esi+4]
shr eax, 1
mov bl, [esi+2]
shl ecx, 15
mov dl, [esi+6]
shl ebx, 7
and ecx, 0x7F0000
shl edx, 23
and ebx, 0x7F00
and edx, 0x7F000000
or ebx, eax
or ebx, ecx
lea edi, [edi+4]
or ebx, edx
lea esi, [esi+8]
mov [edi-4], ebx
mov ebx, [esp + LOOP_K]
// if (0 == (k & 1))
// *(U16 *)UPlane = (*(pline+ 1)>>1) | ((*(pline+ 5)<<7)&0x7F00)
// *(U16 *)VPlane = (*(pline+ 3)>>1) | ((*(pline+ 7)<<7)&0x7F00)
// esi has already been advanced by 8, hence the negative offsets below.
test ebx, 1
jnz L7
mov ecx, [esp + UPLANE]
mov edx, [esp + VPLANE]
mov al, [esi-7]
mov bl, [esi-3]
shr eax, 1
and ebx, 0xFE
shl ebx, 7
lea edx, [edx+2]
or ebx, eax
mov al, [esi-5]
shr eax, 1
mov [ecx], bx
mov bl, [esi-1]
lea ecx, [ecx+2]
and ebx, 0xFE
mov [esp + UPLANE], ecx
shl ebx, 7
mov [esp + VPLANE], edx
or ebx, eax
nop
mov [edx-2], bx
nop
L7:
// NOTE(review): jnz terminates the row only when ebp reaches exactly zero, so
// OutputWidth is assumed to be a non-zero multiple of 4 -- confirm.
sub ebp, 4
jnz L6
// Assembler version of C_WIDTH_DIFF
// if (width_diff)
mov eax, [esp + WIDTH_DIFF]
mov edx, eax
test eax, eax
jz Lno_width_diff
// tm = (*(YPlane-1)) << 24
// tm |= (tm>>8) | (tm>>16) | (tm>>24)
mov bl, [edi - 1]
shl ebx, 24
mov ecx, ebx
shr ebx, 8
or ecx, ebx
shr ebx, 8
or ecx, ebx
shr ebx, 8
or ecx, ebx
// *(U32 *)YPlane = tm
mov [edi], ecx
// if ((width_diff-4) > 0)
// NOTE(review): the jz tests below stand in for the "> 0" comparisons quoted from
// the C form; they match only when width_diff is 4, 8 or 12 -- confirm.
sub eax, 4
jz Lupdate_YPlane
// *(U32 *)(YPlane + 4) = tm
mov [edi + 4], ecx
sub eax, 4
// if ((width_diff-8) > 0)
jz Lupdate_YPlane
// *(U32 *)(YPlane + 8) = tm
mov [edi + 8], ecx
Lupdate_YPlane:
// YPlane += width_diff
lea edi, [edi + edx]
///if (0 == (k&1))
mov eax, [esp + LOOP_K]
test eax, 1
jnz Lno_width_diff
// t8u = *(UPlane-1)
// t8v = *(VPlane-1)
// *UPlane++ = t8u
// *UPlane++ = t8u
// *VPlane++ = t8v
// *VPlane++ = t8v
mov ebp, edx
mov eax, [esp + UPLANE]
mov ebx, [esp + VPLANE]
mov cl, [eax - 1]
mov ch, [ebx - 1]
mov [eax], cl
mov [eax + 1], cl
mov [ebx], ch
mov [ebx + 1], ch
// if ((width_diff-4) > 0)
sub ebp, 4
jz Lupdate_UVPlane
// *UPlane++ = t8u
// *UPlane++ = t8u
// *VPlane++ = t8v
// *VPlane++ = t8v
mov [eax + 2], cl
mov [eax + 3], cl
mov [ebx + 2], ch
mov [ebx + 3], ch
// if ((width_diff-8) > 0)
sub ebp, 4
jz Lupdate_UVPlane
// *UPlane++ = t8u
// *UPlane++ = t8u
// *VPlane++ = t8v
// *VPlane++ = t8v
mov [eax + 4], cl
mov [eax + 5], cl
mov [ebx + 4], ch
mov [ebx + 5], ch
Lupdate_UVPlane:
// UPlane += width_diff >> 1; VPlane += width_diff >> 1
sar edx, 1
lea eax, [eax + edx]
mov [esp + UPLANE], eax
lea ebx, [ebx + edx]
mov [esp + VPLANE], ebx
Lno_width_diff:
// if (stretch && (0 == k) && j)
mov eax, [esp + STRETCH]
test eax, eax
jz L14
mov eax, [esp + LOOP_K]
test eax, eax
jnz L14
mov eax, [esp + LOOP_J]
test eax, eax
jz L14
// spill YPlane ptr
mov [esp + YPLANE], edi
nop
// for (i = OutputWidth; i > 0; i -= 8)
// assign (ebx, pyprev)
// assign (ecx, t)
// assign (edx, pynext)
// assign (edi, pyspace)
// assign (ebp, i)
// make sure offsets are such that there are no bank conflicts here
mov ebx, [esp + PYPREV]
mov edi, [esp + PYSPACE]
mov edx, [esp + PYNEXT]
mov ebp, [esp + OUTPUT_WIDTH]
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
// t = (*pyprev++ & 0xFEFEFEFE) >> 1
// t += (*pynext++ & 0xFEFEFEFE) >> 1
// *pyspace++ = t
// The numbered comments below mark the instruction pairs of the schedule. Masking
// with 0x7F7F7F7F after the shift gives the same result as masking with
// 0xFEFEFEFE before it.
L15:
// 1
mov eax, [ebx]
lea ebx, [ebx + 4]
// 2
mov ecx, [edx]
lea edx, [edx + 4]
// 3
shr ecx, 1
and eax, 0xFEFEFEFE
// 4
shr eax, 1
and ecx, 0x7F7F7F7F
// 5
add eax, ecx
mov ecx, [ebx]
// 6
shr ecx, 1
mov [edi], eax
// 7
mov eax, [edx]
and ecx, 0x7F7F7F7F
// 8
shr eax, 1
lea edi, [edi + 4]
// 9
and eax, 0x7F7F7F7F
lea ebx, [ebx + 4]
// 10
lea edx, [edx + 4]
add eax, ecx
// 11
mov [edi], eax
lea edi, [edi + 4]
// 12
// NOTE(review): exits only on exactly zero, so OutputWidth is assumed to be a
// multiple of 8 whenever stretch is set -- confirm.
sub ebp, 8
jnz L15
// kill (ebx, pyprev)
// kill (ecx, t)
// kill (edx, pynext)
// kill (edi, pyspace)
// kill (ebp, i)
// restore YPlane
mov edi, [esp + YPLANE]
// pnext += BackTwoLines
L14:
add esi, [esp + BACK_TWO_LINES]
// YPlane += byte_ypitch_adj;
add edi, [esp + BYTE_YPITCH_ADJ]
// if(0 == (k&1))
mov eax, [esp + LOOP_K]
and eax, 1
jnz L16
// UPlane += byte_uvpitch_adj;
// VPlane += byte_uvpitch_adj;
mov eax, [esp + BYTE_UVPITCH_ADJ]
add [esp + UPLANE], eax
add [esp + VPLANE], eax
L16:
inc DWORD PTR [esp + LOOP_K]
// eax must be zero again before re-entering the pixel loop at L5/L6.
xor eax, eax
mov ebx, [esp + LOOP_K]
cmp ebx, [esp + MARK]
jl L5
// if (stretch)
cmp DWORD PTR [esp + STRETCH], 0
je L17
// pyprev = YPlane - pitch
mov eax, edi
sub eax, [esp + PITCH_PARM]
mov [esp + PYPREV], eax
// pyspace = YPlane
mov [esp + PYSPACE], edi
// pynext = (YPlane += pitch)
add edi, [esp + PITCH_PARM]
mov [esp + PYNEXT], edi
L17:
inc DWORD PTR [esp + LOOP_J]
mov eax, [esp + LOOP_J]
cmp eax, [esp + LUMA_ITERS]
jl L4
// kill (esi, pnext)
// kill (edi, YPlane)
// ASM version of C_HEIGHT_FILL
// Replicates the last written row height_diff times in the Y plane, then
// height_diff/2 times in the U plane and in the V plane. edi still holds the
// running Y write pointer when this section is entered.
// if (height_diff)
mov eax, [esp + HEIGHT_DIFF]
test eax, eax
jz Lno_height_diff
// pyspace = (U32 *)YPlane
mov esi, edi
// pyprev = (U32 *)(YPlane - pitch)
sub esi, [esp + PITCH_PARM]
// for (j = height_diff; j > 0; j--)
Lheight_yfill_loop:
mov ebx, [esp + WIDTHX16]
// for (i = widthx16; i>0; i -=4)
Lheight_yfill_row:
// *pyspace++ = *pyprev++
mov ecx, [esi]
lea esi, [esi + 4]
mov [edi], ecx
lea edi, [edi + 4]
sub ebx, 4
jnz Lheight_yfill_row
// pyspace += word_ypitch_adj
// pyprev += word_ypitch_adj
add esi, [esp + BYTE_YPITCH_ADJ]
add edi, [esp + BYTE_YPITCH_ADJ]
dec eax
jnz Lheight_yfill_loop
mov eax, [esp + HEIGHT_DIFF]
mov edi, [esp + UPLANE]
// puvspace = (U32 *)UPlane
mov esi, edi
// puvprev = (U32 *)(UPlane - pitch)
sub esi, [esp + PITCH_PARM]
// for (j = height_diff; j > 0; j -= 2)
// NOTE(review): "sub eax, 2 / jnz" terminates only for an even height_diff -- confirm.
Lheight_ufill_loop:
mov ebx, [esp + WIDTHX16]
// for (i = widthx16; i>0; i -= 8)
Lheight_ufill_row:
// *puvspace++ = *puvprev++
mov ecx, [esi]
mov [edi], ecx
lea esi, [esi + 4]
lea edi, [edi + 4]
sub ebx, 8
jnz Lheight_ufill_row
// puvspace += word_uvpitch_adj
// puvprev += word_uvpitch_adj
add esi, [esp + BYTE_UVPITCH_ADJ]
add edi, [esp + BYTE_UVPITCH_ADJ]
sub eax, 2
jnz Lheight_ufill_loop
mov eax, [esp + HEIGHT_DIFF]
mov edi, [esp + VPLANE]
// puvspace = (U32 *)VPlane
mov esi, edi
// puvprev = (U32 *)(VPlane - pitch)
sub esi, [esp + PITCH_PARM]
// for (j = height_diff; j > 0; j -= 2)
Lheight_vfill_loop:
mov ebx, [esp + WIDTHX16]
// for (i = widthx16; i>0; i -= 8)
Lheight_vfill_row:
// *puvspace++ = *puvprev++
mov ecx, [esi]
mov [edi], ecx
lea esi, [esi + 4]
lea edi, [edi + 4]
sub ebx, 8
jnz Lheight_vfill_row
// puvspace += word_uvpitch_adj
// puvprev += word_uvpitch_adj
add esi, [esp + BYTE_UVPITCH_ADJ]
add edi, [esp + BYTE_UVPITCH_ADJ]
sub eax, 2
jnz Lheight_vfill_loop
Lno_height_diff:
// if (stretch)
// PYPREV is loaded before the test; when stretch is zero the slot was never
// written, but the loaded value is then not used.
mov esi, [esp + PYPREV]
cmp DWORD PTR [esp + STRETCH], 0
je L19
// for (i = OutputWidth; i > 0; i -= 4)
// assign (esi, pyprev)
// assign (edi, pyspace)
// assign (ebp, i)
// Fills the last reserved row with a copy of the row before it.
mov ebp, [esp + OUTPUT_WIDTH]
mov edi, [esp + PYSPACE]
L18:
mov ecx, [esi]
lea esi, [esi + 4]
mov [edi], ecx
lea edi, [edi + 4]
sub ebp, 4
jnz L18
// kill (esi, pyprev)
// kill (edi, pyspace)
// kill (ebp, i)
L19:
// Epilogue: release the locals and restore the callee-save registers in the
// reverse order of the prologue pushes.
add esp, LOCALSIZE
pop edi
pop esi
pop ebx
pop ebp
ret
}
}
// Remove the frame-offset names so they do not leak into the rest of the file.
#undef LOCALSIZE
#undef PITCH_PARM
#undef VPLANE
#undef UPLANE
#undef YPLANE
#undef LP_INPUT
#undef OUTPUT_HEIGHT_WORD
#undef OUTPUT_WIDTH_WORD
#undef LPBI_INPUT
#undef OUTPUT_WIDTH
#undef PYPREV
#undef PYSPACE
#undef PYNEXT
#undef PUVPREV
#undef PUVSPACE
#undef LOOP_I
#undef LOOP_J
#undef LOOP_K
#undef BACK_TWO_LINES
#undef WIDTHX16
#undef HEIGHTX16
#undef WIDTH_DIFF
#undef HEIGHT_DIFF
#undef WIDTH_ADJ
#undef HEIGHT_ADJ
#undef STRETCH
#undef ASPECT
#undef LUMA_ITERS
#undef MARK
#undef BYTE_YPITCH_ADJ
#undef BYTE_UVPITCH_ADJ
/***************************************************
* H26X_YUV12toEncYUV12()
* Copy YUV12 data to encoder memory at the
* appropriate location. It is assumed that the input
* data is stored as rows of Y, followed by rows of U,
* then rows of V.
*
***************************************************/
void C_H26X_YUV12toEncYUV12(
LPBITMAPINFOHEADER lpbiInput,
WORD OutputWidth,
WORD OutputHeight,
U8 *lpInput,
U8 *YPlane,
U8 *UPlane,
U8 *VPlane,
const int pitch) {
    // Input geometry (luma and chroma).
    const int src_w = lpbiInput->biWidth;
    const int src_h = lpbiInput->biHeight;
    const int src_cw = src_w >> 1;
    const int src_ch = src_h >> 1;
    const int dst_cw = OutputWidth >> 1;
    // Output size rounded up to multiples of 16, and the amount of edge
    // replication needed to get there.
    const int padded_w = (OutputWidth + 0xF) & ~0xF;
    const int padded_h = (OutputHeight + 0xF) & ~0xF;
    const int rep_cols = padded_w - OutputWidth;
    const int rep_rows = padded_h - OutputHeight;
    // Bytes to skip from the end of a written row to the start of the next one.
    // When rep_cols is zero padded_w equals OutputWidth, so one expression covers
    // both the replicated and the non-replicated case.
    const int y_skip = pitch - padded_w;
    const int c_skip = pitch - (padded_w >> 1);
    // Constant-fill amounts, used when the input is smaller than the output.
    int blank_cols = 0;
    int blank_rows = 0;
    int blank_ccols = 0;
    int blank_crows = 0;
    int row, col, plane;
    U32 *src32 = (U32 *)lpInput;
    U32 *above;
    U32 fill;
    U16 fill16;
    U8 *src8;
    U8 *dst;
    U8 *chroma[2];

    // Two cases are handled:
    //  - arbitrary frame sizes (multiples of 4 up to CIF): edges are replicated
    //    up to the next multiple of 16;
    //  - H263 still mode (320x240 -> 352x288): the extra area is filled with a
    //    constant. Crop and stretch are not supported for YUV12 input.
    // Constant fill is only set up when no column replication is required.
    if (!rep_cols) {
        blank_cols = OutputWidth - src_w;
        blank_rows = OutputHeight - src_h;
        blank_ccols = blank_cols >> 1;
        blank_crows = blank_rows >> 1;
    }

    // ---- Y plane: copy with every sample halved ----
    for (row = src_h; row > 0; row--) {
        col = src_w;
        while (col & ~0xF) {
            *(U32 *)YPlane = (*src32++ >> 1) & 0x7F7F7F7F;
            *(U32 *)(YPlane + 4) = (*src32++ >> 1) & 0x7F7F7F7F;
            *(U32 *)(YPlane + 8) = (*src32++ >> 1) & 0x7F7F7F7F;
            *(U32 *)(YPlane + 12) = (*src32++ >> 1) & 0x7F7F7F7F;
            YPlane += 16;
            col -= 16;
        }
        if (col & 0x8) {
            *(U32 *)YPlane = (*src32++ >> 1) & 0x7F7F7F7F;
            *(U32 *)(YPlane + 4) = (*src32++ >> 1) & 0x7F7F7F7F;
            YPlane += 8;
        }
        if (col & 0x4) {
            *(U32 *)YPlane = (*src32++ >> 1) & 0x7F7F7F7F;
            YPlane += 4;
        }
        // Replicate the last pixel across the right-hand padding...
        if (rep_cols) {
            fill = (*(YPlane - 1)) << 24;
            fill |= (fill >> 8) | (fill >> 16) | (fill >> 24);
            *(U32 *)YPlane = fill;
            if (rep_cols > 4) {
                *(U32 *)(YPlane + 4) = fill;
            }
            if (rep_cols > 8) {
                *(U32 *)(YPlane + 8) = fill;
            }
            YPlane += rep_cols;
        }
        // ...or zero the remainder of the row (blank_cols is 0 whenever rep_cols is not).
        for (col = blank_cols; col > 0; col -= 4) {
            *(U32 *)YPlane = 0;
            YPlane += 4;
        }
        YPlane += y_skip;
    }
    // Bottom padding: copy the row above, rep_rows times (no-op when rep_rows is 0).
    for (row = rep_rows; row > 0; row--) {
        above = (U32 *)(YPlane - pitch);
        for (col = padded_w; col > 0; col -= 4) {
            *(U32 *)YPlane = *above++;
            YPlane += 4;
        }
        YPlane += y_skip;
    }
    // Bottom constant fill: zeroed rows.
    for (row = blank_rows; row > 0; row--) {
        for (col = padded_w; col > 0; col -= 4) {
            *(U32 *)YPlane = 0;
            YPlane += 4;
        }
        YPlane += y_skip;
    }

    // ---- U plane, then V plane ----
    // The input chroma data follows the luma data directly (U rows, then V rows),
    // so one source pointer runs through both passes.
    src8 = (U8 *)src32;
    chroma[0] = UPlane;
    chroma[1] = VPlane;
    for (plane = 0; plane < 2; plane++) {
        dst = chroma[plane];
        for (row = src_ch; row > 0; row--) {
            col = src_cw;
            while (col & ~0x7) {
                *(U32 *)dst = (*(U32 *)src8 >> 1) & 0x7F7F7F7F;
                *(U32 *)(dst + 4) = (*(U32 *)(src8 + 4) >> 1) & 0x7F7F7F7F;
                dst += 8;
                src8 += 8;
                col -= 8;
            }
            if (col & 0x4) {
                *(U32 *)dst = (*(U32 *)src8 >> 1) & 0x7F7F7F7F;
                dst += 4;
                src8 += 4;
            }
            if (col & 0x2) {
                *(U16 *)dst = (*(U16 *)src8 >> 1) & 0x7F7F;
                dst += 2;
                src8 += 2;
            }
            // Replicate the last sample: two bytes per four luma padding columns...
            if (rep_cols) {
                fill16 = (*(dst - 1)) << 8;
                fill16 |= (fill16 >> 8);
                *(U16 *)dst = fill16;
                dst += 2;
                if (rep_cols > 4) {
                    *(U16 *)dst = fill16;
                    dst += 2;
                }
                if (rep_cols > 8) {
                    *(U16 *)dst = fill16;
                    dst += 2;
                }
            }
            // ...or fill the remainder of the row with the constant 0x40.
            for (col = blank_ccols; col > 0; col -= 4) {
                *(U32 *)dst = 0x40404040;
                dst += 4;
            }
            dst += c_skip;
        }
        // Bottom padding: copy the row above, rep_rows / 2 times.
        for (row = (rep_rows >> 1); row > 0; row--) {
            above = (U32 *)(dst - pitch);
            for (col = (padded_w >> 1); col > 0; col -= 4) {
                *(U32 *)dst = *above++;
                dst += 4;
            }
            dst += c_skip;
        }
        // Bottom constant fill: rows of 0x40.
        for (row = blank_crows; row > 0; row--) {
            for (col = dst_cw; col > 0; col -= 4) {
                *(U32 *)dst = 0x40404040;
                dst += 4;
            }
            dst += c_skip;
        }
    }
}
#endif // } H263P