152 lines
4.3 KiB
C++
152 lines
4.3 KiB
C++
/* *************************************************************************
|
|
** INTEL Corporation Proprietary Information
|
|
**
|
|
** This listing is supplied under the terms of a license
|
|
** agreement with INTEL Corporation and may not be copied
|
|
** nor disclosed except in accordance with the terms of
|
|
** that agreement.
|
|
**
|
|
** Copyright (c) 1995, 1996 Intel Corporation.
|
|
** All Rights Reserved.
|
|
**
|
|
** *************************************************************************
|
|
*/
|
|
|
|
// $Author: AGUPTA2 $
|
|
// $Date: 08 Mar 1996 16:46:34 $
|
|
// $Archive: S:\h26x\src\dec\dxblkcpy.cpv $
|
|
// $Header: S:\h26x\src\dec\dxblkcpy.cpv 1.4 08 Mar 1996 16:46:34 AGUPTA2 $
|
|
// $Log: S:\h26x\src\dec\dxblkcpy.cpv $
|
|
//
|
|
// Rev 1.4 08 Mar 1996 16:46:34 AGUPTA2
|
|
// Rewritten to reduce code size by avoiding 32-bit displacements. Added
|
|
// pragma code_seg. May need to optimize for misaligned case.
|
|
//
|
|
//
|
|
// Rev 1.3 31 Jan 1996 13:15:14 RMCKENZX
|
|
// Rewrote file to avoid bank conflicts. Fully unrolled the loop.
|
|
// Module now really will execute in 52 cycles if the cache is hot.
|
|
//
|
|
// Rev 1.2 22 Dec 1995 13:51:06 KMILLS
|
|
// added new copyright notice
|
|
//
|
|
// Rev 1.1 25 Sep 1995 09:03:22 CZHU
|
|
// Added comments on cycle counts
|
|
//
|
|
// Rev 1.0 11 Sep 1995 16:52:26 CZHU
|
|
// Initial revision.
|
|
//
|
|
//------------------------------------------------------------------------------
|
|
|
|
//------------------------------------------------------------------------------
|
|
//
|
|
// Note:
|
|
// - BlockCopy reads and writes in DWORDS.
|
|
// - The __fastcall convention is used.
|
|
// - Code re-written to minimize code size.
|
|
// - We assume the output frame to NOT be in cache.
|
|
// - The constants DXPITCH and U32 are defined in this file; precomp.h is the only header included.
|
|
//
|
|
// Registers used:
|
|
// eax accumulator
|
|
// ebx accumulator
|
|
// ecx destination address
|
|
// edx source address
|
|
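// ebp DXPITCH (added to the source and destination addresses to step to the next row)
// edi scratch target of the destination read that heats the output cache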
|
|
//
|
|
// Pentium cycle count (input cache hot, output cache cold):
|
|
// 33 + 8*(cache miss time) input aligned
|
|
// 81 + 8*(cache miss time) input mis-aligned
|
|
//
|
|
//------------------------------------------------------------------------------
|
|
|
|
#include "precomp.h"
|
|
|
|
// Integer type of BlockCopy's two address parameters.
#define U32 unsigned long
|
|
// Already defined in precomp.h
// NOTE(review): the note above does not say which macro it refers to (U32 or
// DXPITCH) -- confirm against precomp.h before removing either define.
|
|
// Row pitch in bytes: BlockCopy loads this into ebp and adds it to the source
// and destination addresses after each row.
#define DXPITCH 384
|
|
|
|
// Code that follows is placed in the "IACODE2" code segment.
#pragma code_seg("IACODE2")
|
|
/*
 * BlockCopy
 *
 * Copies a block of 8 rows, 8 bytes (two DWORDs) per row, from uSrcBlock to
 * uDstBlock.  After each row both addresses are advanced by DXPITCH bytes.
 *
 * Parameters (__fastcall):
 *   uDstBlock - destination address; it is in ecx on entry.
 *   uSrcBlock - source address; it is in edx on entry.
 *
 * Returns: nothing.
 *
 * Register usage:
 *   - The function is declared naked, so it pushes and pops edi, ebx and ebp
 *     itself and issues its own ret.
 *   - eax, ecx and edx are modified and not restored.
 *   - edi only receives the "heat output cache" read of the destination; the
 *     value loaded is never used.
 *
 * NOTE(review): the instruction order is presumably hand-scheduled for the
 * Pentium cycle counts quoted in the file header -- re-measure before
 * reordering anything.
 */
|
|
__declspec(naked)
|
|
void __fastcall BlockCopy (U32 uDstBlock, U32 uSrcBlock)
|
|
{
|
|
__asm {
|
|
// Save the registers that are restored before ret; nothing else saves them
// in a naked function.
push edi // will receive the discarded destination reads
|
|
push ebx // second data register
|
|
push ebp // will hold the row pitch
|
|
mov ebp, DXPITCH // ebp = byte distance between consecutive rows
|
|
// row 0
|
|
mov eax, [edx] // eax = source bytes 0..3 of this row
|
|
mov ebx, [edx+4] // ebx = source bytes 4..7 of this row
|
|
add edx, ebp // source address -> row 1
|
|
mov edi, [ecx] // heat output cache
|
|
mov [ecx], eax // store bytes 0..3
|
|
mov [ecx+4], ebx // store bytes 4..7
|
|
// row 1
|
|
add ecx, ebp // destination address -> row 1
|
|
mov eax, [edx]
|
|
mov ebx, [edx+4]
|
|
add edx, ebp // source address -> row 2
|
|
mov edi, [ecx] // heat output cache
|
|
mov [ecx], eax
|
|
mov [ecx+4], ebx
|
|
add ecx, ebp // destination address -> row 2
|
|
// row 2
|
|
mov eax, [edx]
|
|
mov ebx, [edx+4]
|
|
add edx, ebp // source address -> row 3
|
|
mov edi, [ecx] // heat output cache
|
|
mov [ecx], eax
|
|
mov [ecx+4], ebx
|
|
// row 3
|
|
add ecx, ebp // destination address -> row 3
|
|
mov eax, [edx]
|
|
mov ebx, [edx+4]
|
|
add edx, ebp // source address -> row 4
|
|
mov edi, [ecx] // heat output cache
|
|
mov [ecx], eax
|
|
mov [ecx+4], ebx
|
|
add ecx, ebp // destination address -> row 4
|
|
// row 4
|
|
mov eax, [edx]
|
|
mov ebx, [edx+4]
|
|
add edx, ebp // source address -> row 5
|
|
mov edi, [ecx] // heat output cache
|
|
mov [ecx], eax
|
|
mov [ecx+4], ebx
|
|
// row 5
|
|
add ecx, ebp // destination address -> row 5
|
|
mov eax, [edx]
|
|
mov ebx, [edx+4]
|
|
add edx, ebp // source address -> row 6
|
|
mov edi, [ecx] // heat output cache
|
|
mov [ecx], eax
|
|
mov [ecx+4], ebx
|
|
add ecx, ebp // destination address -> row 6
|
|
// row 6
|
|
mov eax, [edx]
|
|
mov ebx, [edx+4]
|
|
add edx, ebp // source address -> row 7
|
|
mov edi, [ecx] // heat output cache
|
|
mov [ecx], eax
|
|
mov [ecx+4], ebx
|
|
// row 7
|
|
add ecx, ebp // destination address -> row 7 (last use of the pitch)
|
|
pop ebp // restore caller's ebp early: neither address is advanced again
|
|
mov eax, [edx]
|
|
mov ebx, [edx+4]
|
|
mov edi, [ecx] // heat output cache
|
|
mov [ecx], eax
|
|
mov [ecx+4], ebx
|
|
// Restore the remaining saved registers in reverse push order.
pop ebx
|
|
pop edi
|
|
ret // plain ret: both arguments were passed in registers
|
|
} // end of asm
|
|
}
|
|
// Closes the code_seg("IACODE2") placement opened before BlockCopy
// (MSVC: code_seg with no arguments resets to the default code segment).
#pragma code_seg()
|
|
|