2020-09-30 16:53:55 +02:00

152 lines
4.3 KiB
C++

/* *************************************************************************
** INTEL Corporation Proprietary Information
**
** This listing is supplied under the terms of a license
** agreement with INTEL Corporation and may not be copied
** nor disclosed except in accordance with the terms of
** that agreement.
**
** Copyright (c) 1995, 1996 Intel Corporation.
** All Rights Reserved.
**
** *************************************************************************
*/
// $Author: AGUPTA2 $
// $Date: 08 Mar 1996 16:46:34 $
// $Archive: S:\h26x\src\dec\dxblkcpy.cpv $
// $Header: S:\h26x\src\dec\dxblkcpy.cpv 1.4 08 Mar 1996 16:46:34 AGUPTA2 $
// $Log: S:\h26x\src\dec\dxblkcpy.cpv $
//
// Rev 1.4 08 Mar 1996 16:46:34 AGUPTA2
// Rewritten to reduce code size by avoiding 32-bit displacements. Added
// pragma code_seg. May need to optimize for misaligned case.
//
//
// Rev 1.3 31 Jan 1996 13:15:14 RMCKENZX
// Rewrote file to avoid bank conflicts. Fully unrolled the loop.
// Module now really will execute in 52 cycles if the cache is hot.
//
// Rev 1.2 22 Dec 1995 13:51:06 KMILLS
// added new copyright notice
//
// Rev 1.1 25 Sep 1995 09:03:22 CZHU
// Added comments on cycle counts
//
// Rev 1.0 11 Sep 1995 16:52:26 CZHU
// Initial revision.
//
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//
// Note:
// - BlockCopy reads and writes in DWORDS.
// - The __fastcall convention is used.
// - Code re-written to minimize code size.
// - We assume the output frame to NOT be in cache.
// - The constants DXPITCH and U32 are #defined in this file, after the #include of precomp.h.
//
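// Registers used:
// eax accumulator (first DWORD of each row)
// ebx accumulator (second DWORD of each row)
// ecx destination address
// edx source address
// edi scratch target of the loads that heat the output cache
// ebp DXPITCH (byte step from one row to the next)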
//
// Pentium cycle count (input cache hot, output cache cold):
// 33 + 8*(cache miss time) input aligned
// 81 + 8*(cache miss time) input mis-aligned
//
//------------------------------------------------------------------------------
#include "precomp.h"
#define U32 unsigned long
// Already defined in precomp.h
#define DXPITCH 384
#pragma code_seg("IACODE2")
/*
* BlockCopy
*
* Copies a block of 8 rows x 8 bytes from uSrcBlock to uDstBlock. Each row
* is moved as two DWORDs, and both the source and the destination address
* advance by DXPITCH bytes from one row to the next.
*
* Parameters (__fastcall, so both arrive in registers):
*   uDstBlock - address of the first destination row, passed in ecx.
*   uSrcBlock - address of the first source row, passed in edx.
*
* Returns nothing.
*
* Notes:
* - The function is __declspec(naked): the compiler emits no prolog or
*   epilog, so the body saves and restores edi, ebx and ebp itself and ends
*   with an explicit ret. There are no stack arguments to clean up.
* - eax, ecx, edx and the flags are modified and not restored.
* - Before a row is stored, the first DWORD of that destination row is
*   loaded into edi. The loaded value is never used; the load is there to
*   heat the output cache.
* - NOTE(review): the instruction order looks hand-scheduled for the Pentium
*   (see the cycle counts and revision log in the file header) - measure
*   before reordering or re-rolling the loop.
*/
__declspec(naked)
void __fastcall BlockCopy (U32 uDstBlock, U32 uSrcBlock)
{
__asm {
// Save the registers this routine uses beyond eax/ecx/edx.
push edi
push ebx
push ebp
mov ebp, DXPITCH // ebp = byte step between consecutive rows
// row 0
mov eax, [edx] // eax = source row, bytes 0..3
mov ebx, [edx+4] // ebx = source row, bytes 4..7
add edx, ebp // source -> row 1
mov edi, [ecx] // heat output cache
mov [ecx], eax // store bytes 0..3
mov [ecx+4], ebx // store bytes 4..7
// row 1
add ecx, ebp // destination -> row 1
mov eax, [edx]
mov ebx, [edx+4]
add edx, ebp // source -> row 2
mov edi, [ecx] // heat output cache
mov [ecx], eax
mov [ecx+4], ebx
add ecx, ebp // destination -> row 2
// row 2
mov eax, [edx]
mov ebx, [edx+4]
add edx, ebp // source -> row 3
mov edi, [ecx] // heat output cache
mov [ecx], eax
mov [ecx+4], ebx
// row 3
add ecx, ebp // destination -> row 3
mov eax, [edx]
mov ebx, [edx+4]
add edx, ebp // source -> row 4
mov edi, [ecx] // heat output cache
mov [ecx], eax
mov [ecx+4], ebx
add ecx, ebp // destination -> row 4
// row 4
mov eax, [edx]
mov ebx, [edx+4]
add edx, ebp // source -> row 5
mov edi, [ecx] // heat output cache
mov [ecx], eax
mov [ecx+4], ebx
// row 5
add ecx, ebp // destination -> row 5
mov eax, [edx]
mov ebx, [edx+4]
add edx, ebp // source -> row 6
mov edi, [ecx] // heat output cache
mov [ecx], eax
mov [ecx+4], ebx
add ecx, ebp // destination -> row 6
// row 6
mov eax, [edx]
mov ebx, [edx+4]
add edx, ebp // source -> row 7
mov edi, [ecx] // heat output cache
mov [ecx], eax
mov [ecx+4], ebx
// row 7
add ecx, ebp // destination -> row 7 (last use of ebp)
pop ebp // pitch no longer needed: restore the caller's ebp early
mov eax, [edx]
mov ebx, [edx+4]
mov edi, [ecx] // heat output cache
mov [ecx], eax
mov [ecx+4], ebx
// Restore the remaining saved registers in reverse push order.
pop ebx
pop edi
ret // both arguments were in registers, so nothing to pop
} // end of asm
}
#pragma code_seg()