Windows2003-3790/enduser/netmeeting/av/codecs/intel/h261/d1blkcpy.cpp
2020-09-30 16:53:55 +02:00

209 lines
6.6 KiB
C++

/* *************************************************************************
** INTEL Corporation Proprietary Information
**
** This listing is supplied under the terms of a license
** agreement with INTEL Corporation and may not be copied
** nor disclosed except in accordance with the terms of
** that agreement.
**
** Copyright (c) 1995, 1996 Intel Corporation.
** All Rights Reserved.
**
** *************************************************************************
*/
// $Author: AKASAI $
// $Date: 15 Mar 1996 08:48:06 $
// $Archive: S:\h26x\src\dec\d1blkcpy.cpv $
// $Header: S:\h26x\src\dec\d1blkcpy.cpv 1.0 15 Mar 1996 08:48:06 AKASAI $
// $Log: S:\h26x\src\dec\d1blkcpy.cpv $
//
// Rev 1.0 15 Mar 1996 08:48:06 AKASAI
// Initial revision.
//
// Rev 1.3 31 Jan 1996 13:15:14 RMCKENZX
// Rewrote file to avoid bank conflicts. Fully unrolled the loop.
// Module now really will execute in 52 cycles if the cache is hot.
//
// Rev 1.2 22 Dec 1995 13:51:06 KMILLS
// added new copyright notice
//
// Rev 1.1 25 Sep 1995 09:03:22 CZHU
// Added comments on cycle counts
//
// Rev 1.0 11 Sep 1995 16:52:26 CZHU
// Initial revision.
//
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//
// BlockCopy reads the reference in BYTES and writes DWORDS. Reading BYTES
// avoids data alignment problems with the motion compensated reference address.
//
// Input uSrcBlock: address of the U8 reference block (motion compensated), passed as U32
// Output uDstBlock: address of the U8 output buffer, passed as U32
//
// Registers used:
// eax source address
// ebx temp
// ecx, edx accumulators
// edi destination address
// esi PITCH
//
// Assumption: reference and output use PITCH
//
// Cycle count: 52 cycles if the cache is hot (per the Rev 1.3 log entry above)
//
//------------------------------------------------------------------------------
#include "precomp.h"
#pragma code_seg("IACODE2")
__declspec(naked)
void BlockCopy (U32 uDstBlock, U32 uSrcBlock)
{
    //
    // Copies an 8-row by 8-byte block from uSrcBlock to uDstBlock.
    //
    //   uDstBlock  address of the destination block (stack arg at [esp+4] on entry)
    //   uSrcBlock  address of the source block      (stack arg at [esp+8] on entry)
    //
    // Both addresses advance by PITCH bytes from one row to the next.
    // Every source row is fetched one byte at a time and packed into two
    // DWORDs (ecx = bytes 0..3, edx = bytes 4..7), which are then stored
    // to the destination row.
    //
    // The function is naked, so it saves and restores edi/esi/ebx itself
    // and returns with a plain `ret` (the arguments are left on the stack).
    //
    // Register roles:
    //   eax  current source row          edi  current destination row
    //   ecx  packed bytes 0..3           edx  packed bytes 4..7
    //   esi  PITCH                       ebx  target of the dummy destination
    //                                         reads (value is never used)
    //
    // The instruction order is hand-interleaved (rows 1/2, 3/4 and 5/6
    // overlap); keep the sequence exactly as written.
    //
    __asm {
        // ---------------- entry + row 0 ----------------
        mov     eax, [esp+8]            // eax = uSrcBlock (nothing pushed yet)
        push    edi
        push    esi                     // original note: avoid Address Generation Interlocks
        push    ebx
        mov     cl, [eax+2]             // r0.b2
        mov     edi, [esp+16]           // edi = uDstBlock (+12 for the three pushes)
        mov     ch, [eax+3]             // r0.b3
        mov     dh, [eax+7]             // r0.b7
        shl     ecx, 16                 // b2,b3 -> upper half of ecx
        mov     dl, [eax+6]             // r0.b6
        shl     edx, 16                 // b6,b7 -> upper half of edx
        mov     ebx, [edi]              // dummy read of dst row 0 (heat output cache)
        mov     esi, PITCH              // esi = row stride
        mov     cl, [eax]               // r0.b0
        mov     dh, [eax+5]             // r0.b5
        mov     ch, [eax+1]             // r0.b1
        mov     dl, [eax+4]             // r0.b4
        add     eax, esi                // src -> row 1
        mov     [edi], ecx              // dst row 0 <- bytes 0..3
        mov     [edi+4], edx            // dst row 0 <- bytes 4..7

        // ---------------- row 1 (tail overlaps start of row 2) ----------------
        mov     cl, [eax+2]             // r1.b2
        mov     dh, [eax+7]             // r1.b7
        mov     ch, [eax+3]             // r1.b3
        add     edi, esi                // dst -> row 1
        shl     ecx, 16
        mov     dl, [eax+6]             // r1.b6
        shl     edx, 16
        mov     ebx, [edi]              // dummy read of dst row 1
        mov     cl, [eax]               // r1.b0
        mov     dh, [eax+5]             // r1.b5
        mov     ch, [eax+1]             // r1.b1
        mov     dl, [eax+4]             // r1.b4
        add     eax, esi                // src -> row 2
        mov     [edi], ecx              // dst row 1 <- bytes 0..3
        mov     cl, [eax+2]             // r2.b2
        mov     [edi+4], edx            // dst row 1 <- bytes 4..7

        // ---------------- row 2 ----------------
        mov     ch, [eax+3]             // r2.b3
        add     edi, esi                // dst -> row 2
        shl     ecx, 16
        mov     dh, [eax+7]             // r2.b7
        mov     dl, [eax+6]             // r2.b6
        mov     ebx, [edi]              // dummy read of dst row 2
        shl     edx, 16
        mov     cl, [eax]               // r2.b0
        mov     dh, [eax+5]             // r2.b5
        mov     ch, [eax+1]             // r2.b1
        mov     dl, [eax+4]             // r2.b4
        add     eax, esi                // src -> row 3
        mov     [edi], ecx              // dst row 2 <- bytes 0..3
        mov     [edi+4], edx            // dst row 2 <- bytes 4..7

        // ---------------- row 3 (tail overlaps start of row 4) ----------------
        mov     cl, [eax+2]             // r3.b2
        mov     dh, [eax+7]             // r3.b7
        mov     ch, [eax+3]             // r3.b3
        add     edi, esi                // dst -> row 3
        shl     ecx, 16
        mov     dl, [eax+6]             // r3.b6
        shl     edx, 16
        mov     ebx, [edi]              // dummy read of dst row 3
        mov     cl, [eax]               // r3.b0
        mov     dh, [eax+5]             // r3.b5
        mov     ch, [eax+1]             // r3.b1
        mov     dl, [eax+4]             // r3.b4
        add     eax, esi                // src -> row 4
        mov     [edi], ecx              // dst row 3 <- bytes 0..3
        mov     cl, [eax+2]             // r4.b2
        mov     [edi+4], edx            // dst row 3 <- bytes 4..7

        // ---------------- row 4 ----------------
        mov     ch, [eax+3]             // r4.b3
        add     edi, esi                // dst -> row 4
        shl     ecx, 16
        mov     dh, [eax+7]             // r4.b7
        mov     dl, [eax+6]             // r4.b6
        mov     ebx, [edi]              // dummy read of dst row 4
        shl     edx, 16
        mov     cl, [eax]               // r4.b0
        mov     dh, [eax+5]             // r4.b5
        mov     ch, [eax+1]             // r4.b1
        mov     dl, [eax+4]             // r4.b4
        add     eax, esi                // src -> row 5
        mov     [edi], ecx              // dst row 4 <- bytes 0..3
        mov     [edi+4], edx            // dst row 4 <- bytes 4..7

        // ---------------- row 5 (tail overlaps start of row 6) ----------------
        mov     cl, [eax+2]             // r5.b2
        mov     dh, [eax+7]             // r5.b7
        mov     ch, [eax+3]             // r5.b3
        add     edi, esi                // dst -> row 5
        shl     ecx, 16
        mov     dl, [eax+6]             // r5.b6
        shl     edx, 16
        mov     ebx, [edi]              // dummy read of dst row 5
        mov     cl, [eax]               // r5.b0
        mov     dh, [eax+5]             // r5.b5
        mov     ch, [eax+1]             // r5.b1
        mov     dl, [eax+4]             // r5.b4
        add     eax, esi                // src -> row 6
        mov     [edi], ecx              // dst row 5 <- bytes 0..3
        mov     cl, [eax+2]             // r6.b2
        mov     [edi+4], edx            // dst row 5 <- bytes 4..7

        // ---------------- row 6 ----------------
        mov     ch, [eax+3]             // r6.b3
        add     edi, esi                // dst -> row 6
        shl     ecx, 16
        mov     dh, [eax+7]             // r6.b7
        mov     dl, [eax+6]             // r6.b6
        mov     ebx, [edi]              // dummy read of dst row 6
        shl     edx, 16
        mov     cl, [eax]               // r6.b0
        mov     dh, [eax+5]             // r6.b5
        mov     ch, [eax+1]             // r6.b1
        mov     dl, [eax+4]             // r6.b4
        add     eax, esi                // src -> row 7
        mov     [edi], ecx              // dst row 6 <- bytes 0..3
        mov     [edi+4], edx            // dst row 6 <- bytes 4..7

        // ---------------- row 7 + exit ----------------
        mov     cl, [eax+2]             // r7.b2
        mov     dh, [eax+7]             // r7.b7
        mov     ch, [eax+3]             // r7.b3
        add     edi, esi                // dst -> row 7
        shl     ecx, 16
        mov     dl, [eax+6]             // r7.b6
        shl     edx, 16
        mov     ebx, [edi]              // dummy read of dst row 7
        mov     cl, [eax]               // r7.b0
        mov     dh, [eax+5]             // r7.b5
        mov     ch, [eax+1]             // r7.b1
        mov     dl, [eax+4]             // r7.b4
        mov     [edi], ecx              // dst row 7 <- bytes 0..3
        mov     [edi+4], edx            // dst row 7 <- bytes 4..7
        pop     ebx                     // restore in reverse push order
        pop     esi
        pop     edi
        ret
    } // end of asm BlockCopy
}
#pragma code_seg()