285 lines
10 KiB
NASM
285 lines
10 KiB
NASM
;////////////////////////////////////////////////////////////////////////////
|
|
;//
|
|
;// INTEL CORPORATION PROPRIETARY INFORMATION
|
|
;//
|
|
;// This software is supplied under the terms of a license
|
|
;// agreement or nondisclosure agreement with Intel Corporation
|
|
;// and may not be copied or disclosed except in accordance
|
|
;// with the terms of that agreement.
|
|
;//
|
|
;////////////////////////////////////////////////////////////////////////////
|
|
;//
|
|
;// $Header: R:\h26x\h26x\src\enc\e3msig.asv 1.2 04 Oct 1996 08:47:58 BNICKERS $
|
|
;//
|
|
;// $Log: R:\h26x\h26x\src\enc\e3msig.asv $
|
|
;//
|
|
;// Rev 1.2 04 Oct 1996 08:47:58 BNICKERS
|
|
;// Add EMV.
|
|
;//
|
|
;// Rev 1.1 08 Jul 1996 16:55:42 BNICKERS
|
|
;// Fix register initialization
|
|
;//
|
|
;// Rev 1.0 25 Jun 1996 14:24:54 BNICKERS
|
|
;// Initial revision.
|
|
;//
|
|
;////////////////////////////////////////////////////////////////////////////
|
|
;
|
|
; MMxMESignaturePrep (MMX Motion Estimation Signature Prep) -- This function pre-computes the signature
|
|
; inputs for the reference frame. It is
|
|
; used only by MMX ME, and only in AP mode.
|
|
|
|
OPTION PROLOGUE:None
|
|
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
|
|
OPTION M510
|
|
OPTION CASEMAP:NONE
|
|
|
|
include iammx.inc
|
|
include e3inst.inc
|
|
|
|
.xlist
|
|
include memmodel.inc
|
|
.list
|
|
|
|
;=============================================================================
|
|
|
|
.CODE
|
|
|
|
ASSUME cs : FLAT
|
|
ASSUME ds : FLAT
|
|
ASSUME es : FLAT
|
|
ASSUME fs : FLAT
|
|
ASSUME gs : FLAT
|
|
ASSUME ss : FLAT
|
|
|
|
;-----------------------------------------------------------------------------
; MMxMESignaturePrep
;
; Purpose (per the file header): pre-compute the signature inputs for the
; reference frame; used only by MMX motion estimation, and only in AP mode.
;
; Declared parameters (C convention, all DWORD):
;   APrev  -- fetched below as PreviousFrameBaseAddress; becomes input cursor esi.
;   ASig   -- fetched below as SignatureFrameBaseAddress; becomes output cursor edi.
;   AFrmWd -- fetched below as FrameWidth.
;   AFrmHt -- fetched below as FrameHeight.
;
; The file sets OPTION PROLOGUE:None, and the code below never uses the
; parameter names: every argument is read esp-relative through the equates
; that follow.
;
; Structure:
;   1. Save esi/edi/ebp/ebx, load the arguments, derive loop constants.
;   2. Loop 1: a single pass along the row at the starting input cursor.
;   3. Loop 2 (Next4LinesRefQuickSig): (FrameHeight+16)/4 passes, each
;      consuming four input rows.
;   4. emms, restore registers, return.
;
; Registers: esi, edi, ebp and ebx are saved and restored.  eax, ecx and edx
; are modified and not restored.  mm0-mm7 are all used; emms is executed
; before returning.
;
; Constraints visible in the code:
;   - The outer loop counts FrameHeight+16 down in steps of 4 and exits only
;     on exactly zero, so FrameHeight must be a multiple of 4.
;   - The inner loop counter lives in al/cl, i.e. only the low 8 bits of
;     (FrameWidth+32)/16 are used.
;   - Both cursors start 16 bytes left of the supplied base addresses, and
;     stores reach from edi-PITCH*16 up to edi+PITCH*4+8.
;     NOTE(review): the caller must provide that much border around both
;     buffers -- confirm against the allocation code.
;
; PITCH, movdf and rturn are not defined in this file.
; NOTE(review): presumably they come from iammx.inc / e3inst.inc /
; memmodel.inc -- confirm.
;-----------------------------------------------------------------------------
MMxMESignaturePrep proc C APrev: DWORD,
|
|
ASig: DWORD,
|
|
AFrmWd: DWORD,
|
|
AFrmHt: DWORD
|
|
|
|
; Bytes taken on the stack by the four registers pushed below (4 x 4).
RegStoSize = 16
|
|
|
|
; Arguments:
; esp-relative offsets, valid once the four pushes below have been done.
; The extra +4 skips the return address.  EndOfArgList is not referenced
; anywhere in this procedure.
|
|
|
|
PreviousFrameBaseAddress = RegStoSize + 4
|
|
SignatureFrameBaseAddress = RegStoSize + 8
|
|
FrameWidth = RegStoSize + 12
|
|
FrameHeight = RegStoSize + 16
|
|
EndOfArgList = RegStoSize + 20
|
|
|
|
push esi
|
|
push edi
|
|
push ebp
|
|
push ebx
|
|
|
|
; Register roles for the rest of the procedure:
; ebp -- PITCH
|
|
; esi -- Cursor over reference frame.
|
|
; edi -- Cursor over frame of signature sums.
|
|
; edx -- Skip distance.
|
|
; ebx -- Outer loop counter.
|
|
; cl -- Initial value for inner loop counter.
|
|
; al -- Inner loop counter.
|
|
; ch -- Scratch.
|
|
; ah -- Scratch.
|
|
|
|
mov esi,[esp+PreviousFrameBaseAddress]
|
|
mov edi,[esp+SignatureFrameBaseAddress]
|
|
mov ebx,[esp+FrameHeight]
|
|
mov eax,[esp+FrameWidth]
|
|
|
|
; One inner-loop pass advances each cursor by FrameWidth+32 bytes (16 per
; iteration).  Adding edx = PITCH*4 - 32 - FrameWidth afterwards nets exactly
; PITCH*4, i.e. four rows further down.
mov edx,PITCH*4-32
|
|
mov ebp,PITCH
|
|
sub edx,eax ; Distance from end of one row to start of next.
|
|
add eax,32 ; Add the macroblocks off left and right edges.
|
|
shr eax,4 ; Number of macroblocks in row.
|
|
sub esi,16 ; Start at macroblock off left edge.
|
|
mov cl,al ; To re-init inner loop counter.
|
|
sub edi,16 ; Start at macroblock off left edge.
|
|
|
|
; Prime the software pipeline for loop 1: build the 0x00FF (mm0/mm1) and
; 0x0001 (mm4) word constants, clear the running sums (mm5, mm6, mm7), and
; pre-load the first 16 pels into mm2/mm3.  mm0 already holds the even pels
; of the first 8 bytes when the loop is entered.
pxor mm5,mm5
|
|
pcmpeqb mm0,mm0
|
|
pcmpeqb mm4,mm4
|
|
psrlw mm0,8 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
|
|
pxor mm6,mm6
|
|
psrlw mm4,15 ; W:<0x0001 0x0001 0x0001 0x0001>
|
|
movq mm2,[esi] ; B:<P07 P06 P05 P04 P03 P02 P01 P00>
|
|
movq mm1,mm0 ; W:< 00FF 00FF 00FF 00FF>
|
|
movq mm3,[esi+8]
|
|
pand mm0,mm2 ; W:<P06 P04 P02 P00>
|
|
pxor mm7,mm7
|
|
|
|
; ---- Loop 1: one pass along the single row at esi, 16 pels per iteration ----
; Even columns: pel*4 is stored on rows edi-PITCH*12, edi-PITCH*8, edi-PITCH*4
; and edi; pel*16 is stored on row edi-PITCH*16.
; Odd columns: running sums (see the inline comments) are stored as DWORDs on
; rows edi-PITCH*14, edi-PITCH*10, edi-PITCH*6 and edi-PITCH*2.
; mm5 / mm6 carry the previous iteration's odd-column sums for the first /
; second 8-pel half so they can be subtracted from the running total in mm7.
; NOTE(review): the *4 and *16 scaling looks like it stands in for sums over
; 4 and 16 copies of this row (replicated rows above the frame) -- confirm.
; NOTE(review): psrlw by 7 also shifts bit 7 of the even pel into bit 0 of
; each word, so the "P07*2" comments are exact only if pels are below 128 --
; confirm the pel range.
; The mov ah,[...] / mov ch,[...] instructions only touch memory; the loaded
; values are never used (ah and ch are listed as scratch above).  Loop 2
; labels the same pattern "Initiate cache line load".
@@:
|
|
|
|
pand mm1,mm3
|
|
psllw mm0,2 ; W:<P06*4 P04*4 P02*4 P00*4>
|
|
mov ah,[edi-PITCH*12]
|
|
psrlw mm2,7 ; W:<P07*2 P05*2 P03*2 P01*2>
|
|
movq [edi-PITCH*12],mm0 ; Save W:<P06*4 P04*4 P02*4 P00*4>
|
|
pmaddwd mm2,mm4 ; D:<(P07+P05)*2 (P03+P01)*2>
|
|
mov ch,[edi-PITCH*8+16]
|
|
mov ah,[edi-PITCH*4]
|
|
movq [edi-PITCH*8],mm0 ; Save W:<P06*4 P04*4 P02*4 P00*4>
|
|
psllw mm1,2
|
|
mov ch,[edi+16]
|
|
psrlw mm3,7 ; W:<P07*2 P05*2 P03*2 P01*2>
|
|
movq [edi-PITCH*4],mm0 ; Save W:<P06*4 P04*4 P02*4 P00*4>
|
|
pmaddwd mm3,mm4
|
|
movq [edi],mm0 ; Save W:<P06*4 P04*4 P02*4 P00*4>
|
|
psllq mm0,2 ; W:<P06*16 P04*16 P02*16 P00*16>
|
|
mov ah,[edi-PITCH*10-16]
|
|
mov ch,[edi-PITCH*16]
|
|
movq [edi-PITCH*16],mm0 ; Save W:<P06*16 P04*16 P02*16 P00*16>
|
|
packssdw mm2,mm2 ; [0:31] W:<(P07+P05)*2 (P03+P01)*2>
|
|
movq [edi-PITCH*12+8],mm1
|
|
punpcklwd mm2,mm2 ; W:<(P07+P05)*2 (P07+P05)*2 (P03+P01)*2 ...>
|
|
movq [edi-PITCH*8+8],mm1
|
|
psubw mm2,mm5 ; Subtract sum of pels 15, 13, 11, and 9 to left.
|
|
movq [edi-PITCH*4+8],mm1
|
|
paddw mm7,mm2 ; Low DWORD: W:<sum(P0*)*2 sum(P0*)*2>, where
|
|
; ; "*" is odd columns from -11 thru +3.
|
|
movq [edi+8],mm1
|
|
paddw mm5,mm2 ; Save W:<(P27+P37+P25+P35) (P07+P17+P05+P15)...>
|
|
mov ah,[edi-PITCH*14-32]
|
|
mov ah,[edi-PITCH*6-32]
|
|
mov ch,[edi-PITCH*2-16]
|
|
movdf [edi-PITCH*14-12],mm7; Save DWORD: W:<sum(P0*)*2 sum (P0*)*2>
|
|
movdf [edi-PITCH*10-12],mm7; Save DWORD: W:<sum(P0*)*2 sum (P0*)*2>
|
|
psrlq mm2,32 ; Position 7, 5, and negative of 9, 11 to left.
|
|
movdf [edi-PITCH*6-12],mm7 ; Save DWORD: W:<sum(P0*)*2 sum (P0*)*2>
|
|
paddw mm2,mm7 ; Low DWORD: W:<sum(P0*)*2 sum(P0*)*2>, where
|
|
; ; "*" is odd columns from -7 thru +7.
|
|
movdf [edi-PITCH*2-12],mm7 ; Save DWORD: W:<sum(P0*)*2 sum (P0*)*2>
|
|
; Second 8-pel half: the same pack / replicate / subtract-left / accumulate
; sequence on mm3, with mm6 as the carried "sum to the left".
packssdw mm3,mm3
|
|
movdf [edi-PITCH*10-8],mm2
|
|
punpcklwd mm3,mm3
|
|
movdf [edi-PITCH*6-8],mm2
|
|
psubw mm3,mm6
|
|
movdf [edi-PITCH*2-8],mm2
|
|
paddw mm2,mm3
|
|
add esi,16 ; Advance input cursor.
|
|
; dec al sets the zero flag that jne @b tests at the bottom of the loop.
; edi is advanced with lea, which leaves the flags untouched.
dec al
|
|
movdf [edi-PITCH*14-4],mm2
|
|
movdf [edi-PITCH*10-4],mm2
|
|
paddw mm6,mm3
|
|
movdf [edi-PITCH*6-4],mm2
|
|
psrlq mm3,32
|
|
movdf [edi-PITCH*2-4],mm2
|
|
paddw mm3,mm2
|
|
; Pre-load the next iteration's 16 pels (esi has already been advanced).
; NOTE(review): on the final iteration this reads 16 bytes past the span
; just processed.  The values are discarded, but the memory must be
; readable -- confirm the border size.
movq mm2,[esi] ; B:<P07 P06 P05 P04 P03 P02 P01 P00>
|
|
movq mm7,mm3
|
|
movq mm3,[esi+8]
|
|
psllq mm1,2
|
|
movdf [edi-PITCH*10],mm7
|
|
pcmpeqb mm0,mm0
|
|
movq [edi-PITCH*16+8],mm1
|
|
psrlw mm0,8
|
|
movdf [edi-PITCH*6],mm7
|
|
movq mm1,mm0
|
|
movdf [edi-PITCH*2],mm7
|
|
pand mm0,mm2
|
|
lea edi,[edi+16] ; Advance output cursor.
|
|
jne @b
|
|
|
|
|
|
|
|
; Loop 1 moved both cursors forward by FrameWidth+32 bytes.
; edx-PITCH*4 equals -(FrameWidth+32), so these two lea's rewind them.
lea esi,[esi+edx-PITCH*4] ; Get back to start of line 0.
|
|
lea edi,[edi+edx-PITCH*4] ; Get back to start of line 0.
|
|
pxor mm7,mm7
|
|
add ebx,16 ; Do 4 extra sets of 4 lines at bottom.
|
|
; Reload the inner loop counter for the first pass of loop 2.
mov al,cl
|
|
|
|
; ---- Loop 2 (outer): one pass per group of four rows ----
; Runs (FrameHeight+16)/4 times.  Each pass clears the "sum to the left"
; carries (mm5, mm6), rebuilds the 0x00FF and 0x0001 word constants, and
; pre-loads the byte-wise sum of rows 2 and 3 for the first 8 pels.
; NOTE(review): paddb wraps modulo 256, so the two-row byte sums are exact
; only if pel values are small enough to fit -- confirm the pel range.
Next4LinesRefQuickSig:
|
|
|
|
pxor mm5,mm5
|
|
pcmpeqb mm0,mm0
|
|
movq mm3,[esi+ebp*2] ; B:<P27 P26 P25 P24 P23 P22 P21 P20>
|
|
psrlw mm0,8 ; W:<0x00FF 0x00FF 0x00FF 0x00FF>
|
|
paddb mm3,[esi+PITCH*3] ; B:<P27+P37 P26+P36 P25+P35 P24+P34 ...>
|
|
pcmpeqb mm4,mm4
|
|
pxor mm6,mm6
|
|
psrlw mm4,15 ; W:<0x0001 0x0001 0x0001 0x0001>
|
|
|
|
; ---- Loop 2 (inner): 16 pels per iteration, handled as two 8-pel halves ----
; Even columns: the four-row sum is saved on row edi+PITCH*4 (ebp == PITCH).
; Then sum + [edi-PITCH*16] - [edi-PITCH*12] is written back to
; [edi-PITCH*12]; per the inline comments that yields a 16-row sum.
; Odd columns: the four-row sums are combined, the carried sum from the
; left (mm5 / mm6) is subtracted, and the running total in mm7 is stored as
; DWORDs on row edi+PITCH*2.
@@:
|
|
|
|
movq mm2,[esi] ; B:<P07 P06 P05 P04 P03 P02 P01 P00>
|
|
movq mm1,mm3 ; B:<P27+P37 P26+P36 P25+P35 P24+P34 ...>
|
|
paddb mm2,[esi+ebp*1] ; B:<P07+P17 P06+P16 P05+P15 P04+P14 ...>
|
|
psrlw mm3,8 ; W:<P27+P37 P25+P35 P23+P33 P21+P31>
|
|
pmaddwd mm3,mm4 ; D:<P27+P37+P25+P35 P23+P33+P21+P31>
|
|
pand mm1,mm0 ; W:<P26+P36 P24+P34 P22+P32 P20+P30>
|
|
pand mm0,mm2 ; W:<P06+P16 P04+P14 P02+P12 P00+P10>
|
|
psrlw mm2,8 ; W:<P07+P17 P05+P15 P03+P13 P01+P11>
|
|
pmaddwd mm2,mm4 ; D:<P07+P17+P05+P15 P03+P13+P01+P11>
|
|
paddw mm1,mm0 ; W:<(P06+P16+P26+P36) (P04+P14+P24+P34) ...>
|
|
mov ah,[edi+ebp*2-16] ; Initiate cache line load.
|
|
pslld mm3,16 ; D:<(P27+P37+P25+P35)<<16 (P23+P33+P21+P31)<<16>
|
|
movq [edi+ebp*4],mm1 ; Save W:<(P06+P16+P26+P36) ...>
|
|
pcmpeqb mm0,mm0
|
|
paddw mm1,[edi-PITCH*16]; W:<Sum(P*6) Sum(P*4) Sum(P*2) Sum(P*0)>, where
|
|
; ; "*" is the 20 lines P-16 thru P3
|
|
por mm2,mm3 ; W:<(P27+P37+P25+P35) (P07+P17+P05+P15)
|
|
; ; (P23+P33+P21+P31) (P03+P13+P01+P11)>
|
|
psubw mm1,[edi-PITCH*12]; W:<Sum(P*6) Sum(P*4) Sum(P*2) Sum(P*0)>, where
|
|
; ; "*" is the 16 lines P-12 thru P3
|
|
psubw mm2,mm5 ; Subtract sum of pels 15, 13, 11, and 9 to left.
|
|
movq mm3,[esi+ebp*2+8]
|
|
paddw mm7,mm2 ; Low DWORD: W:<sum(P2*+P3*) sum (P0*+P1*)> where
|
|
; ; "*" is odd columns from -11 thru +3.
|
|
movq [edi-PITCH*12],mm1; Save W:<P*6 P*4 P*2 P*0> where * is 16 rows.
|
|
paddw mm5,mm2 ; Save W:<(P27+P37+P25+P35) (P07+P17+P05+P15)...>
|
|
movdf [edi+ebp*2-12],mm7; Save DWORD: W:<sum(P2*+P3*) sum (P0*+P1*)>
|
|
psrlq mm2,32 ; Position 7, 5, and negative of 9, 11 to left.
|
|
paddb mm3,[esi+PITCH*3+8]
|
|
paddw mm7,mm2 ; Low DWORD: W:<sum(P2*+P3*) sum (P0*+P1*)> where
|
|
; ; "*" is odd columns from -7 thru +7.
|
|
movq mm2,[esi+8]
|
|
psrlw mm0,8
|
|
movdf [edi+ebp*2-8],mm7 ; Save DWORD: W:<sum(P2*+P3*) sum (P0*+P1*)>
|
|
; Second 8-pel half: the same sequence as above on the data at +8 (its row
; loads were issued earlier, interleaved with the first half), with mm6 as
; the carried "sum to the left" instead of mm5.
movq mm1,mm3
|
|
paddb mm2,[esi+ebp*1+8]
|
|
psrlw mm3,8
|
|
pmaddwd mm3,mm4
|
|
pand mm1,mm0
|
|
pand mm0,mm2
|
|
psrlw mm2,8
|
|
pmaddwd mm2,mm4
|
|
paddw mm1,mm0
|
|
mov ch,[edi+ebp*4+16] ; Initiate cache line load.
|
|
pslld mm3,16
|
|
movq [edi+ebp*4+8],mm1
|
|
pcmpeqb mm0,mm0
|
|
paddw mm1,[edi-PITCH*16+8]
|
|
por mm2,mm3
|
|
psubw mm1,[edi-PITCH*12+8]
|
|
psubw mm2,mm6
|
|
; Pre-load rows 2+3 for the next iteration's first 8 pels.  esi has not been
; advanced yet, hence the +16.
; NOTE(review): on the last iteration of a row group this reads past the
; span just processed.  The value is discarded (mm3 is reloaded at
; Next4LinesRefQuickSig), but the memory must be readable -- confirm the
; border size.
movq mm3,[esi+ebp*2+16]
|
|
paddw mm7,mm2
|
|
movq [edi-PITCH*12+8],mm1
|
|
paddw mm6,mm2
|
|
movdf [edi+ebp*2-4],mm7
|
|
psrlq mm2,32
|
|
paddb mm3,[esi+PITCH*3+16]
|
|
paddw mm7,mm2
|
|
add esi,16 ; Advance input cursor.
|
|
; dec al sets the zero flag that jne @b tests below.  edi is advanced with
; lea, which leaves the flags untouched.
dec al
|
|
movdf [edi+ebp*2],mm7
|
|
psrlw mm0,8
|
|
lea edi,[edi+16] ; Advance output cursor.
|
|
jne @b
|
|
|
|
; The inner loop moved both cursors forward by FrameWidth+32 bytes; adding
; edx nets PITCH*4, i.e. the start of the next group of four rows.
add esi,edx
|
|
add edi,edx
|
|
; Reload the inner loop counter.
mov al,cl
|
|
; sub sets the zero flag for the jne below; the pxor in between does not
; alter the flags.  pxor restarts the running odd-column total for the next
; row group.
sub ebx,4
|
|
pxor mm7,mm7
|
|
jne Next4LinesRefQuickSig
|
|
|
|
; Leave MMX state before returning, then restore the saved registers in
; reverse push order.
emms
|
|
pop ebx
|
|
pop ebp
|
|
pop edi
|
|
pop esi
|
|
; NOTE(review): rturn is not defined in this file -- presumably a return
; macro from one of the includes, related to OPTION EPILOGUE above; confirm.
rturn
|
|
|
|
MMxMESignaturePrep endp
|
|
|
|
END
|