2020-09-30 16:53:55 +02:00

1306 lines
52 KiB
NASM

;////////////////////////////////////////////////////////////////////////////
;//
;// INTEL CORPORATION PROPRIETARY INFORMATION
;//
;// This software is supplied under the terms of a license
;// agreement or nondisclosure agreement with Intel Corporation
;// and may not be copied or disclosed except in accordance
;// with the terms of that agreement.
;//
;////////////////////////////////////////////////////////////////////////////
;//
;// $Header: R:\h26x\h26x\src\enc\e3mbme.asv 1.5 18 Oct 1996 16:57:08 BNICKERS $
;//
;// $Log: R:\h26x\h26x\src\enc\e3mbme.asv $
;//
;// Rev 1.5 18 Oct 1996 16:57:08 BNICKERS
;// Fixes for EMV
;//
;// Rev 1.4 12 Sep 1996 10:56:16 BNICKERS
;// Add arguments for thresholds and differentials.
;//
;// Rev 1.3 22 Jul 1996 15:22:48 BNICKERS
;// Reduce code size. Implement H261 spatial filter.
;//
;// Rev 1.2 14 May 1996 12:18:48 BNICKERS
;// Initial debugging of MMx B-Frame ME.
;//
;// Rev 1.1 03 May 1996 14:03:30 BNICKERS
;//
;// Minor bug fixes and integration refinements.
;//
;// Rev 1.0 02 May 1996 12:00:56 BNICKERS
;// Initial revision.
;//
;////////////////////////////////////////////////////////////////////////////
;
; MMxBFrameMotionEstimation -- This function performs motion estimation for the
; B frame macroblocks identified in the input list.
; This is the MMx version.
;
; Assembler options: M510 selects MASM 5.10 compatibility behaviour and
; CASEMAP:NONE makes identifiers case sensitive.
OPTION M510
OPTION CASEMAP:NONE
; Tuning constants used by MMxDoBFrameLumaBlocks (both are compared against
; SWD values):
;   BFRMNONZEROMVDIFFERENTIAL -- subtracted from the 0-MV-delta SWD before it
;     is compared with the best non-zero-MV-delta SWD, i.e. the gain a
;     non-zero MV delta must exceed in order to be selected.
;   BFRMEMPTYTHRESHOLD -- a luma block whose SWD is below this value skips
;     BFrameDTQ and contributes no bit to the coded block pattern.
BFRMNONZEROMVDIFFERENTIAL = 400
BFRMEMPTYTHRESHOLD = 256
; Keep the contents of the include files out of the assembly listing.
.xlist
include e3inst.inc
include memmodel.inc
include iammx.inc
include exEDTQ.inc
include e3mbad.inc
.list
.CODE EDTQ
; Forward DCT entry point; the BFrameDTQ paths finish by jumping to it.
EXTERN MMxDoForwardDCT:NEAR
PUBLIC MMxDoBFrameLumaBlocks
PUBLIC MMxDoBFrameChromaBlocks
; NOTE(review): StackOffset is redefined at many points in this file whenever
; the stack depth changes; presumably the stack-relative symbols declared in
; the include files are expressed in terms of it -- confirm there.
StackOffset TEXTEQU <4>
; Text macro: CONST_384 expands to ebp here (it is redefined to the literal 384
; later in the file).
CONST_384 TEXTEQU <ebp>
;-------------------------------------------------------------------------------
; MMxDoBFrameLumaBlocks -- B-frame motion estimation for the four luma blocks
; of one macroblock, followed by coding of those blocks.
;
; This first part of the routine:
;   1. Exchanges QPDiv2, CodeStreamCursor and Recip2QPToUse with their B-frame
;      counterparts (BQPDiv2, BCodeStreamCursor, BRecip2QPToUse) and sets
;      StashBlockType to INTER1MV. The same exchange is performed again just
;      before the routine returns, which undoes it.
;   2. For each of the four luma blocks, looks up the 0-delta forward (past
;      reference) and backward (future reference) motion vectors for the
;      block's P-frame MV in the WeightForwardMotion table, records them, and
;      calls ComputeBFrameSWDForCandRef on the forward vector.
;   3. Decides whether any non-zero MV delta should be searched at all.
;
; Registers as used by the code below:
;   edx -- Cursor to the first block descriptor (T_Blk) of the macroblock.
;   ebp -- Pitch (consumed by ComputeBFrameSWDForCandRef).
;   mm7 -- SWD accumulator. It is NOT cleared between the four calls in the
;          0-MV loop, so the stored BlkLvlSWD0Delta values are running totals
;          (the per-block values are recovered later by differencing).
; NOTE(review): mm7 is not initialised in this routine before the 0-MV loop,
; and ComputeBFrameSWDForCandRef reads mm6 on its half-pel path; presumably the
; caller sets both up -- confirm.
;-------------------------------------------------------------------------------
MMxDoBFrameLumaBlocks:
mov eax,QPDiv2 ; Swap these so Quantizer uses right level.
mov ebx,BQPDiv2
mov QPDiv2,ebx
mov BQPDiv2,eax
mov eax,CodeStreamCursor
mov ebx,BCodeStreamCursor
mov CodeStreamCursor,ebx
mov BCodeStreamCursor,eax
mov eax,Recip2QPToUse
mov ebx,BRecip2QPToUse
mov Recip2QPToUse,ebx
mov cl,INTER1MV
mov BRecip2QPToUse,eax
mov StashBlockType,cl
; One pass per luma block. The table at WeightForwardMotion is indexed by a
; P-frame MV component; the entry at +0 is the component for the past
; reference and the entry at +64 is the component for the future reference.
BFrameSWDLoop_0MV:
mov ecx,[edx].BlkY1.MVs
xor ebx,ebx
mov bl,[edx].BlkY1.PVMV ; P-frame Vertical MV
lea edi,WeightForwardMotion
xor eax,eax
and ecx,0FFH ; P-frame Horizontal MV
mov al,[edi+ebx] ; VMV for past ref.
mov bl,[edi+ebx+64] ; VMV for future ref.
mov [edx].BlkY1.VMVb0Delta,bl
mov bl,[edi+ecx+64] ; HMV for future ref.
mov [edx].BlkY1.HMVb0Delta,bl
mov bl,[edi+ecx] ; HMV for past ref.
mov [edx].BlkY1.VMVf0Delta,al ; Record candidate VMVf.
xor ecx,ecx ; Keep pairing happy.
mov [edx].BlkY1.HMVf0Delta,bl ; Record candidate HMVf.
mov edi,[edx].BlkY1.BlkOffset ; Address of 0-MV blk within frame.
; eax = VMVf, ebx = HMVf, edi = block offset, edx = descriptor cursor.
call ComputeBFrameSWDForCandRef
movdf [edx].BlkY1.BlkLvlSWD0Delta,mm7 ; Stash SWD.
add edx,SIZEOF T_Blk
; edi was consumed as the block offset above; reload the table address so that
; it is valid both for the next pass and for BFrmSWDLoop after the loop exits.
lea edi,WeightForwardMotion
; Loop control relies on the address bit of value 4*SIZEOF T_Blk being clear
; for the four luma descriptors and set once the cursor has passed them.
test dl,4*SIZEOF T_Blk ; Quit when fourth block done.
je BFrameSWDLoop_0MV
; edx now points four descriptors past the first luma block.
; Skip the search when the first block's two MV bytes are both zero and its
; BlockType equals INTER1MV (al | ah | (BlockType xor INTER1MV) == 0).
mov eax,[edx-4*SIZEOF T_Blk].BlkY1.MVs
mov cl,[edx-4*SIZEOF T_Blk].BlockType
xor cl,INTER1MV
or al,ah
lea esi,[edx-4*SIZEOF T_Blk] ; Reset MacroBlockActionDescr cursor.
or al,cl
; The mov below does not alter the flags set by "or al,cl".
mov ecx,[edx-SIZEOF T_Blk].BlkY1.BlkLvlSWD0Delta
je BelowBFrmZeroThreshold ; Jump if P frm macroblock uses 0 motion vector.
; Also skip the search when the running-total SWD stored for the fourth block
; does not exceed BFrmZeroVectorThreshold (signed compare).
xor eax,eax
cmp ecx,BFrmZeroVectorThreshold
mov CurrSWDState,eax ; Record ME engine state.
jle BelowBFrmZeroThreshold
; Seed the "best so far" SWDs with the 0-delta results.
mov edx,[esi].BlkY1.BlkLvlSWD0Delta ; Remember 0-MV SWDs.
mov ecx,[esi].BlkY2.BlkLvlSWD0Delta
mov [esi].BlkY1.BestBlkLvlSWD,edx
mov [esi].BlkY2.BestBlkLvlSWD,ecx
mov edx,[esi].BlkY3.BlkLvlSWD0Delta
mov ecx,[esi].BlkY4.BlkLvlSWD0Delta
mov [esi].BlkY3.BestBlkLvlSWD,edx
mov [esi].BlkY4.BestBlkLvlSWD,ecx
; The candidate loop compares block N's running total with the NEXT
; descriptor's BestBlkLvlSWD; for the fourth block that is the BlkU slot.
mov [esi].BlkU.BestBlkLvlSWD,ecx ; Avoid unintended early out, below.
xor edx,edx ; Set best MV to zero.
; Falls through into BFrmSWDLoop with eax == 0 (initial state), edx == 0,
; esi == first luma descriptor, edi == address of WeightForwardMotion.
;-------------------------------------------------------------------------------
; BFrmSWDLoop -- Build the next candidate MV delta and the resulting forward
; and backward vectors for all four luma blocks.
;
; On entry:
;   eax -- Current search state, used as a byte offset into BFrmSWDState.
;   edx -- Best MV delta so far (dl = horizontal, dh = vertical).
;   esi -- First luma block descriptor of the macroblock.
;   edi -- Address of WeightForwardMotion.
;
; As read by this file, a BFrmSWDState entry is four bytes:
;   +0, +1 -- horizontal / vertical offset to add to the best delta;
;   +2     -- next state when the candidate is rejected;
;   +3     -- next state when the candidate becomes the new best.
;
; Three loops follow, chosen by whether the candidate's horizontal and/or
; vertical delta is zero. When a delta component is zero the backward
; component is taken directly from the table (+64 entry); otherwise it is
; derived as (forward component - P-frame MV component).
; Each loop exits to CandidateMVsGotten with esi reset to the first luma
; descriptor and mm7 cleared, or to MVDeltaOutOfRange.
;-------------------------------------------------------------------------------
BFrmSWDLoop:
mov ecx,PD BFrmSWDState[eax] ; cl == HMV; ch == VMV offsets to try.
mov BestMV,edx ; Record what the best MV so far is.
add cl,dl ; Try this horizontal MV delta.
je HMVdIsZero
mov PB CandidateMV,cl ; Record the candidate HMV delta.
add ch,dh ; Try this vertical MV delta.
; The mov below leaves the flags from "add ch,dh" intact for the je.
mov PB CandidateMV+1,ch ; Record the candidate VMV delta.
je VMVdIsZero
; Both delta components are non-zero. edx is reused for the block's P-frame MV
; (dl = HMV, dh = VMV); the best delta was saved in BestMV above.
VMVdAndHMVdAreNonZero_Loop:
mov edx,[esi].BlkY1.MVs
xor ebx,ebx
mov bl,dl
xor eax,eax
mov al,dh
add esi,SIZEOF T_Blk
mov bl,[edi+ebx] ; TRb * HMV / TRd
pxor mm7,mm7 ; Initialize SWD accumulator
add bl,cl ; HMVf = TRb * HMV / TRd + HMVd
mov al,[edi+eax] ; TRb * VMV / TRd
; NOTE(review): the comment says "left or right", but only one unsigned bound
; (value <= 40H) is tested here and in the similar checks below; presumably the
; other bound cannot be reached -- confirm.
cmp bl,040H ; If too far left or right, quick out.
jbe MVDeltaOutOfRange
mov [esi-SIZEOF T_Blk].BlkY1.CandHMVf,bl
add al,ch ; VMVf = TRb * VMV / TRd + VMVd
cmp al,040H ; If too far up or down, quick out.
jbe MVDeltaOutOfRange
mov [esi-SIZEOF T_Blk].BlkY1.CandVMVf,al
sub bl,dl ; -HMVb = -(HMVf - HMV)
mov [esi-SIZEOF T_Blk].BlkY1.CandHMVb,bl
sub al,dh ; -VMVb = -(VMVf - VMV)
; Same address-bit loop control as the 0-MV loop: done after four descriptors.
test esi,4*SIZEOF T_Blk
mov [esi-SIZEOF T_Blk].BlkY1.CandVMVb,al
je VMVdAndHMVdAreNonZero_Loop
sub esi,4*SIZEOF T_Blk
jmp CandidateMVsGotten
; Vertical delta is zero, horizontal delta is not.
VMVdIsZero:
VMVdIsZero_Loop:
mov edx,[esi].BlkY1.MVs
xor eax,eax
mov al,dh
xor ebx,ebx
mov bl,dl
add esi,SIZEOF T_Blk
mov dh,[edi+eax+64] ; -VMVb = -((TRb - TRd) * VMV) / TRd
mov al,[edi+eax] ; TRb * VMV / TRd
mov bl,[edi+ebx] ; TRb * HMV / TRd
mov [esi-SIZEOF T_Blk].BlkY1.CandVMVf,al
add bl,cl ; HMVf = TRb * HMV / TRd + HMVd
mov [esi-SIZEOF T_Blk].BlkY1.CandVMVb,dh
cmp bl,040H ; If too far left or right, quick out.
jbe MVDeltaOutOfRange
mov [esi-SIZEOF T_Blk].BlkY1.CandHMVf,bl
sub bl,dl ; -HMVb = -(HMVf - HMV)
test esi,4*SIZEOF T_Blk
mov [esi-SIZEOF T_Blk].BlkY1.CandHMVb,bl
je VMVdIsZero_Loop
sub esi,4*SIZEOF T_Blk
pxor mm7,mm7 ; Initialize SWD accumulator
jmp CandidateMVsGotten
; Candidate rejected (either out of range, or its SWD was no better than the
; best so far). esi may point at any of the macroblock's descriptors here, so
; the block-number bits are masked off rather than subtracted.
BFrameEarlyOutForCandidateMV:
MVDeltaOutOfRange:
and esi,-1-7*SIZEOF T_Blk ; Reset block action descr cursor.
mov ebx,CurrSWDState ; Reload ME engine state.
xor eax,eax
mov edx,BestMV ; Previous best MV is still best.
mov al,BFrmSWDState[ebx+2] ; Get next State number.
jmp ProceedWithNextCand
; Horizontal delta is zero; the vertical delta may or may not be zero.
HMVdIsZero:
mov PB CandidateMV,cl ; Record the candidate HMV delta.
add ch,dh ; Try this vertical MV delta.
mov PB CandidateMV+1,ch ; Record the candidate VMV delta.
HMVdIsZeroLoop:
mov edx,[esi].BlkY1.MVs
xor ebx,ebx
mov bl,dl
xor eax,eax
mov al,dh
add esi,SIZEOF T_Blk
mov dl,[edi+ebx+64] ; -HMVb = -((TRb - TRd) * HMV) / TRd
mov bl,[edi+ebx] ; TRb * HMV / TRd
mov al,[edi+eax] ; TRb * VMV / TRd
mov [esi-SIZEOF T_Blk].BlkY1.CandHMVf,bl
add al,ch ; VMVf = TRb * VMV / TRd + VMVd
mov [esi-SIZEOF T_Blk].BlkY1.CandHMVb,dl
cmp al,040H ; If too far up or down, quick out.
jbe MVDeltaOutOfRange
mov [esi-SIZEOF T_Blk].BlkY1.CandVMVf,al
sub al,dh ; -VMVb = -(VMVf - VMV)
test esi,4*SIZEOF T_Blk
mov [esi-SIZEOF T_Blk].BlkY1.CandVMVb,al
je HMVdIsZeroLoop
sub esi,4*SIZEOF T_Blk
pxor mm7,mm7 ; Initialize SWD accumulator
; Falls through into CandidateMVsGotten.
;-------------------------------------------------------------------------------
; CandidateMVsGotten -- Measure the candidate whose forward/backward vectors
; were just recorded, and adopt it if it beats the best so far.
;
; On entry:
;   esi -- First luma block descriptor of the macroblock.
;   mm7 -- Zero; it accumulates a running SWD total over the four blocks.
;
; The running total after block N is compared with the BestBlkLvlSWD stored in
; the NEXT descriptor (block N+1's running total for the previous best); if the
; candidate is already no better, it is abandoned early.
; Leaves via BFrmSWDLoop (next state non-zero) or by falling out of the bottom
; with edx = best MV delta and esi = first luma descriptor.
;-------------------------------------------------------------------------------
CandidateMVsGotten:
BFrameSWDLoop_Non0MVCandidate:
xor eax,eax
xor ebx,ebx
mov al,[esi].BlkY1.CandVMVf
mov edi,[esi].BlkY1.BlkOffset ; Address of 0-MV blk within frame.
mov bl,[esi].BlkY1.CandHMVf
mov edx,esi
call ComputeBFrameSWDForCandRef
; NOTE(review): movdf is not defined in this file; presumably a macro from the
; MMX include file that copies the low dword of mm7 to ecx -- confirm.
movdf ecx,mm7
mov eax,[edx].BlkY2.BestBlkLvlSWD
lea esi,[edx+SIZEOF T_Blk] ; Early out if the first N blocks for
cmp ecx,eax ; this cand are worse than the first
jge BFrameEarlyOutForCandidateMV ; N+1 blocks for previous best.
test esi,4*SIZEOF T_Blk ; Quit when fourth block done.
mov [esi-SIZEOF T_Blk].BlkY1.CandBlkLvlSWD,ecx ; Stash SWD.
je BFrameSWDLoop_Non0MVCandidate
; This candidate is best so far.
; esi is four descriptors past the first luma block; ecx holds the four-block
; total, which becomes the new best for Y4 and for the BlkU sentinel slot.
mov [esi-4*SIZEOF T_Blk].BlkY4.BestBlkLvlSWD,ecx
mov ebx,CurrSWDState ; Reload ME engine state.
mov [esi-4*SIZEOF T_Blk].BlkU.BestBlkLvlSWD,ecx
sub esi,4*SIZEOF T_Blk
xor eax,eax
mov edx,CandidateMV ; Candidate was best MV.
; Promote the candidate's running totals and its four MV bytes per block
; (CandBiDiMVs) to "best".
mov ecx,[esi].BlkY3.CandBlkLvlSWD
mov [esi].BlkY3.BestBlkLvlSWD,ecx
mov ecx,[esi].BlkY2.CandBlkLvlSWD
mov [esi].BlkY2.BestBlkLvlSWD,ecx
mov ecx,[esi].BlkY1.CandBlkLvlSWD
mov [esi].BlkY1.BestBlkLvlSWD,ecx
mov ecx,[esi].BlkY4.CandBiDiMVs
mov [esi].BlkY4.BestBiDiMVs,ecx
mov ecx,[esi].BlkY3.CandBiDiMVs
mov [esi].BlkY3.BestBiDiMVs,ecx
mov ecx,[esi].BlkY2.CandBiDiMVs
mov [esi].BlkY2.BestBiDiMVs,ecx
mov ecx,[esi].BlkY1.CandBiDiMVs
mov [esi].BlkY1.BestBiDiMVs,ecx
mov al,BFrmSWDState[ebx+3] ; Get next State number.
; eax = next state (upper bits zero); a next state of zero ends the search.
ProceedWithNextCand:
mov CurrSWDState,eax ; Record ME engine state.
test eax,eax
; lea does not alter the flags from the test; edi must again address the table
; because it was used for the block offset above.
lea edi,WeightForwardMotion
jne BFrmSWDLoop
;-------------------------------------------------------------------------------
; Search finished: choose between the best non-zero MV delta and the zero
; delta, then convert the chosen running-total SWDs into per-block SWDs.
;
; On entry (fall-through from the search):
;   esi -- First luma block descriptor of the macroblock.
;   edx -- Best MV delta (dl = horizontal, dh = vertical).
; BelowBFrmZeroThreshold / NonZeroBFrmVectorNotGoodEnoughGain is also the
; target used when the search is skipped or the gain is insufficient.
; Both paths end at BFrmMVSettled with esi unchanged.
;-------------------------------------------------------------------------------
mov ecx,[esi].BlkY4.BlkLvlSWD0Delta ; 0MV SWD
sub ecx,BFRMNONZEROMVDIFFERENTIAL
mov ebx,[esi].BlkY4.BestBlkLvlSWD ; Best non-0 MV SWD.
; Keep the non-zero delta only if best < (0-delta SWD - differential), signed.
cmp ebx,ecx
jge NonZeroBFrmVectorNotGoodEnoughGain
; Record the winning delta in all four luma descriptors.
mov [esi].BlkY1.BHMV,dl
mov [esi].BlkY2.BHMV,dl
mov [esi].BlkY3.BHMV,dl
mov [esi].BlkY4.BHMV,dl
mov [esi].BlkY1.BVMV,dh
mov [esi].BlkY2.BVMV,dh
mov [esi].BlkY3.BVMV,dh
mov [esi].BlkY4.BVMV,dh
; BestBlkLvlSWD holds running totals; difference adjacent entries so that each
; descriptor ends up with the SWD of its own block only.
mov eax,[esi].BlkY4.BestBlkLvlSWD
mov ebx,[esi].BlkY3.BestBlkLvlSWD
sub eax,ebx
mov ecx,[esi].BlkY2.BestBlkLvlSWD
sub ebx,ecx
mov edx,[esi].BlkY1.BestBlkLvlSWD
sub ecx,edx
mov [esi].BlkY4.BestBlkLvlSWD,eax
mov [esi].BlkY3.BestBlkLvlSWD,ebx
mov [esi].BlkY2.BestBlkLvlSWD,ecx
mov [esi].BlkY1.BestBlkLvlSWD,edx
jmp BFrmMVSettled
; Zero MV delta chosen: take SWDs and vectors from the 0-delta results.
BelowBFrmZeroThreshold:
NonZeroBFrmVectorNotGoodEnoughGain:
; Same running-total to per-block conversion, from the 0-delta SWDs.
mov ebx,[esi].BlkY4.BlkLvlSWD0Delta
mov ecx,[esi].BlkY3.BlkLvlSWD0Delta
sub ebx,ecx
mov edx,[esi].BlkY2.BlkLvlSWD0Delta
sub ecx,edx
mov edi,[esi].BlkY1.BlkLvlSWD0Delta
sub edx,edi
mov [esi].BlkY4.BestBlkLvlSWD,ebx
mov [esi].BlkY3.BestBlkLvlSWD,ecx
mov [esi].BlkY2.BestBlkLvlSWD,edx
mov [esi].BlkY1.BestBlkLvlSWD,edi
; The 0-delta forward/backward vectors become the best vectors.
mov eax,[esi].BlkY1.BiDiMVs0Delta
mov [esi].BlkY1.BestBiDiMVs,eax
mov eax,[esi].BlkY2.BiDiMVs0Delta
mov [esi].BlkY2.BestBiDiMVs,eax
mov eax,[esi].BlkY3.BiDiMVs0Delta
mov [esi].BlkY3.BestBiDiMVs,eax
mov eax,[esi].BlkY4.BiDiMVs0Delta
mov [esi].BlkY4.BestBiDiMVs,eax
; Record a zero MV delta in all four luma descriptors.
xor eax,eax
mov [esi].BlkY1.BHMV,al
mov [esi].BlkY2.BHMV,al
mov [esi].BlkY3.BHMV,al
mov [esi].BlkY4.BHMV,al
mov [esi].BlkY1.BVMV,al
mov [esi].BlkY2.BVMV,al
mov [esi].BlkY3.BVMV,al
mov [esi].BlkY4.BVMV,al
;-------------------------------------------------------------------------------
; BFrmMVSettled -- Code the four luma blocks with the chosen vectors and build
; the luma part of the coded block pattern.
;
; On entry: esi -- first luma block descriptor of the macroblock.
;
; bl carries the pattern being built. It starts as the sentinel 8; on every
; pass the block's flag (16 if not empty, else 0) is ORed in and bl is shifted
; right by one. After four passes the sentinel drops into CF, which ends the
; loop, and bits 0..3 of bl hold the flags of blocks 1..4 respectively.
; A block whose SWD is below BFRMEMPTYTHRESHOLD is not passed to BFrameDTQ, is
; not added to BSWDTotal, and contributes a 0 flag.
; On return edx points at the first luma descriptor again and the
; P-frame/B-frame variable exchange done at entry to MMxDoBFrameLumaBlocks has
; been undone.
;-------------------------------------------------------------------------------
BFrmMVSettled:
mov edx,esi
mov bl,8 ; Init coded block pattern
BFrmLumaBlkLoop:
mov esi,[edx].BlkY1.BestBlkLvlSWD ; Get SWD for block.
xor eax,eax
; bl is saved because ebx is reused for the BFrameDTQ argument below.
mov BFrmCBP,bl
cmp esi,BFRMEMPTYTHRESHOLD ; Below threshold for forcing empty?
mov ecx,BSWDTotal
; Taken with al == 0, so the "or bl,al" below adds no bit for this block.
jl BFrmLumaBlkEmpty
; Second byte of BestBiDiMVs goes to ebx, low byte to eax, matching the
; BFrameDTQ argument registers.
mov eax,[edx].BlkY1.BestBiDiMVs
xor ebx,ebx
add ecx,esi
mov bl,ah
mov BSWDTotal,ecx
and eax,0FFH
call BFrameDTQ
; NOTE(review): ebx is used here as an index into BlkEmptyFlag; presumably it
; is set by the DCT/quantize code that BFrameDTQ jumps to -- confirm.
mov bl,BlkEmptyFlag[ebx] ; Fetch 16 if block not empty; else 0.
mov al,BFrmCBP
BFrmLumaBlkEmpty:
or bl,al ; Factor in CBP bit for this block.
add edx,SIZEOF T_Blk
shr bl,1 ; CF == 1 when sentinel shifted off
jnc BFrmLumaBlkLoop
; edx is four descriptors past the first luma block here.
mov [edx-4*SIZEOF T_Blk].CodedBlocksB,bl
sub edx,4*SIZEOF T_Blk
mov eax,QPDiv2 ; Restore these for P frame blocks.
mov ebx,BQPDiv2
mov QPDiv2,ebx
mov BQPDiv2,eax
mov eax,CodeStreamCursor
mov ebx,BCodeStreamCursor
mov CodeStreamCursor,ebx
mov BCodeStreamCursor,eax
mov eax,Recip2QPToUse
mov ebx,BRecip2QPToUse
mov Recip2QPToUse,ebx
mov BRecip2QPToUse,eax
ret
;-------------------------------------------------------------------------------
; MMxDoBFrameChromaBlocks -- Code the U and V blocks of a B-frame macroblock
; and add their flags to the macroblock's CodedBlocksB.
;
; On entry (as used by the code below):
;   edx -- First block descriptor of the macroblock.
;   eax -- QPDiv2  } already loaded by the caller; the two loads are
;   ebx -- BQPDiv2 } intentionally commented out below.
; On return edx is unchanged and the P-frame/B-frame variable exchange done at
; entry has been undone.
;
; The cursor in edx is advanced to the U descriptor and then to the V
; descriptor before each BFrameDTQ call; CodedBlocksB of the macroblock is
; therefore addressed at [edx-4*SIZEOF T_Blk] and [edx-5*SIZEOF T_Blk].
;-------------------------------------------------------------------------------
MMxDoBFrameChromaBlocks:
; mov eax,QPDiv2 ; Swap these so Quantizer uses right level.
; mov ebx,BQPDiv2 ; (Loaded in caller.)
mov QPDiv2,ebx
mov BQPDiv2,eax
mov eax,CodeStreamCursor
mov ebx,BCodeStreamCursor
mov CodeStreamCursor,ebx
mov BCodeStreamCursor,eax
mov eax,Recip2QPToUse
mov ebx,BRecip2QPToUse
mov Recip2QPToUse,ebx
mov cl,INTER1MV
mov BRecip2QPToUse,eax
mov StashBlockType,cl
; U block: second byte of BestBiDiMVs to ebx, low byte to eax.
mov eax,[edx].BlkU.BestBiDiMVs
xor ebx,ebx
mov bl,ah
and eax,0FFH
add edx,4*SIZEOF T_Blk ; To know we're working on chroma.
call BFrameDTQ
; NOTE(review): ebx is used as a BlkEmptyFlag index after the call; presumably
; set by the code BFrameDTQ jumps to -- confirm.
mov bl,BlkEmptyFlag[ebx] ; Fetch 16 if block not empty; else 0.
mov al,[edx-4*SIZEOF T_Blk].CodedBlocksB
or bl,al ; Factor in CBP bit for this block.
; V block.
mov eax,[edx-4*SIZEOF T_Blk].BlkV.BestBiDiMVs
mov [edx-4*SIZEOF T_Blk].CodedBlocksB,bl
xor ebx,ebx
mov bl,ah
and eax,0FFH
add edx,SIZEOF T_Blk
call BFrameDTQ
mov bl,BlkEmptyFlag[ebx+2] ; Fetch 32 if block not empty; else 0.
mov al,[edx-5*SIZEOF T_Blk].CodedBlocksB
or bl,al ; Factor in CBP bit for this block.
mov eax,QPDiv2 ; Restore these for P frame blocks.
mov [edx-5*SIZEOF T_Blk].CodedBlocksB,bl
mov ebx,BQPDiv2
mov QPDiv2,ebx
mov BQPDiv2,eax
mov eax,CodeStreamCursor
mov ebx,BCodeStreamCursor
mov CodeStreamCursor,ebx
mov BCodeStreamCursor,eax
mov eax,Recip2QPToUse
mov ebx,BRecip2QPToUse
mov Recip2QPToUse,ebx
mov BRecip2QPToUse,eax
; Return the cursor to the first descriptor of the macroblock.
sub edx,5*SIZEOF T_Blk
ret
;===============================================================================
; ComputeBFrameSWDForCandRef -- Add to mm7 a difference measure (SWD) between
; one 8x8 target block of the B frame and the past-reference block selected by
; the forward motion vector.
;
; ebp -- Pitch
; edi -- Address of (0-MV) block within frame.
; edx -- Block Action Descriptor cursor
; ebx -- HMVf (HMV to apply to past reference) biased by 96.
; eax -- VMVf (VMV to apply to past reference) biased by 96.
;
; Result: mm7 += SWD for this block (mm7 is never cleared here).
; The code shown modifies eax, ebx, ecx, esi, edi and mm0..mm5; edx is not
; modified. The half-pel paths also use mm6.
;
; The vectors are in half-pel units: bit 0 of each selects the half-pel path,
; and the bias of 96 half pels is removed by the "-48*PITCH-48" address term.
; Only half of the pels are measured: in this full-pel path the even pels of
; lines 1,3,5,7 (isolated with psllw 8) and the word differences of lines
; 0,2,4,6 taken without the shift.
StackOffset TEXTEQU <8>
ComputeBFrameSWDForCandRef:
test al,1
mov ecx,PreviousFrameBaseAddress
; eax*3 here and the shl by 6 below give VMVf*192, i.e. (VMVf/2)*384 lines.
; Neither mov nor lea disturbs the flags from the test for the jne.
lea eax,[eax+eax*2] ; Start of VMVf*384
jne ME_VMVfAtHalfPelPosition
ME_VMVfAtFullPelPosition:
; Assembly-time guard: the deliberately invalid line below stops the build if
; PITCH is not 384.
IF PITCH-384
**** The magic leaks out if PITCH != 384
ENDIF
shl eax,6
add ecx,edi
shr ebx,1 ; CF == 1 iff HMVf is at half pel.
jc ME_VMVfAtFull_HMVfAtHalfPelPosition
ME_VMVfAtFull_HMVfAtFullPelPosition:
lea esi,[ecx+eax-48*PITCH-48]
; ecx = 3*pitch
lea ecx,[ebp+ebp*2]
add esi,ebx ; Address of past reference block.
mov eax,BFrameBaseAddress
add edi,eax ; Address of target block.
; ebx = 5*pitch
lea ebx,[ebp+ebp*4]
movq mm0,[esi+ebp*1]
psubw mm0,[edi+ebp*1] ; Get diff for line 1.
movq mm1,[esi+ecx] ; Ref MB, upper left block, Line 3.
psllw mm0,8 ; Extract diffs for line 1 even pels.
psubw mm1,[edi+ecx] ; Diff for line 3.
pmaddwd mm0,mm0 ; Square of diffs for even pels of line 1.
; Line 5.
movq mm2,[esi+ebx]
psllw mm1,8
psubw mm2,[edi+ebx]
pmaddwd mm1,mm1
; Line 7.
movq mm3,[esi+PITCH*7]
psllw mm2,8
psubw mm3,[edi+PITCH*7]
pmaddwd mm2,mm2
movq mm4,[esi] ; Ref MB, upper left blk, Line 0.
psllw mm3,8
psubw mm4,[edi] ; Diff for line 0.
paddusw mm0,mm1 ; Accumulate SWD (lines 1 and 3).
; Line 2.
movq mm1,[esi+ebp*2]
pmaddwd mm3,mm3
psubw mm1,[edi+ebp*2]
paddusw mm0,mm2
; Line 4.
movq mm2,[esi+ebp*4]
pmaddwd mm4,mm4 ; Square of diffs for odd pels of line 0.
psubw mm2,[edi+ebp*4]
paddusw mm0,mm3
; Line 6 (ecx*2 = 6*pitch).
movq mm3,[esi+ecx*2]
pmaddwd mm1,mm1
psubw mm3,[edi+ecx*2]
pmaddwd mm2,mm2
paddusw mm0,mm4
pmaddwd mm3,mm3
paddusw mm0,mm1
;
paddusw mm0,mm2
;
paddusw mm0,mm3
;
; Fold the two dword lanes together; only the top word of the sum is kept.
punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1.
;
paddusw mm0,mm1 ; mm0[48:63] is SWD for block.
;
psrlq mm0,48 ; SWD for block.
;
paddd mm7,mm0 ; mm7 is SWD for all four blocks.
;
ret
;-------------------------------------------------------------------------------
; ME_VMVfAtFull_HMVfAtHalfPelPosition -- SWD path of
; ComputeBFrameSWDForCandRef for a full-pel vertical and half-pel horizontal
; forward vector.
;
; On entry:
;   ecx -- PreviousFrameBaseAddress + block offset.
;   eax -- VMVf*192 (line offset, still biased).
;   ebx -- HMVf shifted right by one (still biased).
;   edi -- Block offset within frame.
;   ebp -- Pitch.
;   mm6 -- Expected to hold 8 bytes of 1 (per the comment on its first use
;          below; this routine does not establish that itself). It is used as
;          a multiplier, clobbered, and rebuilt as 8 bytes of 1 before return.
;
; Multiplying a word <P1 P0> by 0101H leaves P1+P0 (plus a carry from the low
; byte) in the high byte, so a shift right by one yields the horizontal average
; scaled by 256 with junk in the low byte. Even lines are read at the block
; address, odd lines one byte further on.
; Result: mm7 += SWD for this block.
;-------------------------------------------------------------------------------
ME_VMVfAtFull_HMVfAtHalfPelPosition:
lea esi,[ecx+eax-48*PITCH-48]
mov eax,BFrameBaseAddress
add esi,ebx ; Address of past reference block.
add edi,eax ; Address of target block.
; ecx = 3*pitch
lea ecx,[ebp+ebp*2]
movq mm0,mm6 ; 8 bytes of 1
pmullw mm0,[esi] ; <(P07+P06)*256+junk ...>
movq mm1,mm6
; Lines 2, 4 and 6 follow the same pattern as line 0.
pmullw mm1,[esi+ebp*2]
movq mm2,mm6
pmullw mm2,[esi+ebp*4]
movq mm3,mm6
movq mm4,[edi] ; <C07 C06 C05 C04 C03 C02 C01 C00>
psrlw mm0,1 ; <(P07+P06)*256/2+junk ...>
pmullw mm3,[esi+ecx*2]
psllw mm4,8 ; <C06*256 C04*256 C02*256 C00*256>
movq mm5,[edi+ebp*2]
psrlw mm1,1
psubw mm0,mm4 ; <(P07+P06)*256/2-C06*256+junk ...>
psllw mm5,8
movq mm4,[edi+ebp*4]
psrlw mm2,1
psubw mm1,mm5
psllw mm4,8
movq mm5,[edi+ecx*2]
psrlw mm3,1
psubw mm2,mm4
pmaddwd mm0,mm0 ; SSD for even pels of line 0.
pmaddwd mm1,mm1
psllw mm5,8
psubw mm3,mm5
pmaddwd mm2,mm2
pmaddwd mm3,mm3
; Odd lines (1, 3, 5, 7): reference read at +1, target used unshifted.
; mm6 itself is consumed as the last multiplier copy here.
movq mm5,mm6
pmullw mm6,[esi+ebp*1+1] ; <(P18+P17)*256+junk ...>
movq mm4,mm5
pmullw mm5,[esi+ecx+1]
paddusw mm0,mm1 ; Accum SSD for lines 0 and 2.
paddusw mm2,mm3
movq mm1,mm4
pmullw mm4,[esi+PITCH*5+1]
paddusw mm0,mm2
pmullw mm1,[esi+PITCH*7+1]
psrlw mm6,1 ; <(P18+P17)*256/2+junk ...>
psubw mm6,[edi+ebp*1] ; <(P18+P17)*256/2-C17*256+junk ...>
psrlw mm5,1
psubw mm5,[edi+ecx]
psrlw mm4,1
psubw mm4,[edi+PITCH*5]
pmaddwd mm6,mm6 ; SSD for odd pels of line 1.
pmaddwd mm5,mm5
psrlw mm1,1
psubw mm1,[edi+PITCH*7]
pmaddwd mm4,mm4
pmaddwd mm1,mm1
paddusw mm0,mm6
; Rebuild mm6: 0 - (8 bytes of -1) = 8 bytes of 1.
pxor mm6,mm6
paddusw mm0,mm5
pcmpeqb mm5,mm5
paddusw mm0,mm4
psubb mm6,mm5 ; Restore 8 bytes of 1.
paddusw mm0,mm1
; Fold the two dword lanes together; only the top word of the sum is kept.
punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1.
;
paddusw mm0,mm1 ; mm0[48:63] is SWD for block.
;
psrlq mm0,48 ; SWD for block.
;
paddd mm7,mm0 ; mm7 is SWD for all four blocks.
;
ret
;-------------------------------------------------------------------------------
; ME_VMVfAtHalfPelPosition -- SWD path of ComputeBFrameSWDForCandRef for a
; half-pel vertical forward vector (horizontal may be full or half pel).
;
; On entry:
;   eax -- VMVf*3 (VMVf odd).     ebx -- HMVf (biased).
;   ecx -- PreviousFrameBaseAddress.
;   edi -- Block offset within frame.     ebp -- Pitch.
;
; Two reference pointers are kept: esi for even lines and ecx for odd lines.
; They are equal for a full-pel HMVf; for a half-pel HMVf ecx is one byte
; further on (via adc), so each vertical pair that is averaged is then also
; offset horizontally by one pel.
; VMVf*192 for an odd VMVf is half a line too far, which the extra "-PITCH/2"
; in the address computation removes.
; mm6 is used as a 00FFH word mask and rebuilt as 8 bytes of 1 before return.
; Result: mm7 += SWD for this block.
;-------------------------------------------------------------------------------
ME_VMVfAtHalfPelPosition:
; Assembly-time guard: the deliberately invalid line below stops the build if
; PITCH is not 384.
IF PITCH-384
**** The magic leaks out if PITCH != 384
ENDIF
shl eax,6
lea ecx,[ecx+edi-48*PITCH-48-PITCH/2]
add ecx,eax
mov eax,BFrameBaseAddress
shr ebx,1 ; CF == 1 iff HMVf is at half pel.
; mov does not alter CF, so the adc still sees the bit shifted out of ebx.
mov esi,ecx ; esi and ecx same if HMVf at full pel,
adc ecx,ebx ; but inc ecx if HMVf is at half pel.
add esi,ebx
add edi,eax ; Address of target block.
; ebx = 3*pitch
lea ebx,[ebp+ebp*2]
movq mm0,[esi] ; <P07 P06 ...>
pcmpeqb mm6,mm6
movq mm1,[ecx+ebp*1] ; <P17 P16 ...> or <P18 P17 ...>
; mm6 = 00FFH in every word.
psrlw mm6,8
movq mm2,[esi+ebp*2] ; <P27 P26 ...>
paddb mm0,mm1 ; <P07+P17 junk ...> or <P07+P18 junk ...>
movq mm3,[ecx+ebx] ; <P37 P36 ...> or <P38 P37 ...>
paddb mm1,mm2 ; <junk P16+P26 ...> or <junk P17+P26 ...>
movq mm4,[esi+ebp*4] ; <P47 P46 ...>
paddb mm2,mm3 ; <P27+P37 junk ...> or <P27+P38 junk ...>
paddb mm3,mm4 ; <junk P36+P46 ...> or <junk P37+P46 ...>
psrlw mm0,1 ; <(P07+P17)/2 junk ...> or (P07+P18)/2 junk ...>
pand mm1,mm6 ; <P16+P26 ...> or <P17+P26 ...>
psrlw mm2,1 ; <(P27+P37)/2 junk ...> or (P27+P38)/2 junk ...>
movq mm5,[edi+ebp*1] ; <C17 C16 C15 C14 C13 C12 C11 C10>
pand mm3,mm6 ; <P36+P46 ...> or <P37+P46 ...>
; mm6 is reused as a data register from here until it is rebuilt as a mask.
movq mm6,[edi+ebx] ; <C37 C36 C35 C34 C33 C32 C31 C30>
psllw mm5,8 ; <C16 0 C14 0 C12 0 C10 0>
psubw mm0,[edi] ; <(P07+P17)/2-C07 junk ...> or ...
psllw mm1,7 ; <(P16+P26)/2 ...> or <(P17+P26)/2 ...>
psubw mm2,[edi+ebp*2] ; <(P27+P37)/2-C27 junk ...> or ...
psllw mm6,8 ; <C36 0 C34 0 C32 0 C30 0>
pmaddwd mm0,mm0 ; SSD of even pels of line 0.
psubw mm1,mm5 ; <(P16+P26)/2-C16 junk ...> or ...
pmaddwd mm1,mm1 ; SSD of odd pels of line 1.
psllw mm3,7 ; <(P36+P46)/2 ...> or <(P37+P46)/2 ...>
pmaddwd mm2,mm2 ; SSD of even pels of line 2.
psubw mm3,mm6 ; <(P36+P46)/2-C36 junk ...> or ...
pmaddwd mm3,mm3 ; SSD of odd pels of line 3.
; Second half: lines 4..7, using reference lines 4..8. mm4 still holds
; reference line 4 from above.
pcmpeqb mm6,mm6
paddusw mm0,mm1
movq mm1,[ecx+PITCH*5]
paddusw mm0,mm2
; ebx*2 = 6*pitch
movq mm2,[esi+ebx*2]
paddusw mm0,mm3
movq mm3,[ecx+PITCH*7]
paddb mm4,mm1
paddb mm1,mm2
paddb mm2,mm3
; Reference line 8.
paddb mm3,[esi+ebp*8]
; mm6 = 00FFH in every word again.
psrlw mm6,8
pand mm1,mm6
psrlw mm4,1
movq mm5,[edi+PITCH*5]
psrlw mm2,1
pand mm3,mm6
psllw mm5,8
movq mm6,[edi+PITCH*7]
psllw mm1,7
psubw mm4,[edi+ebp*4]
psllw mm3,7
psubw mm2,[edi+ebx*2]
psllw mm6,8
pmaddwd mm4,mm4
psubw mm1,mm5
pmaddwd mm2,mm2
psubw mm3,mm6
pmaddwd mm1,mm1
; Rebuild mm6: 0 - (8 bytes of -1) = 8 bytes of 1.
pxor mm6,mm6
pmaddwd mm3,mm3
paddusw mm0,mm4
pcmpeqb mm5,mm5
paddusw mm0,mm1
psubb mm6,mm5 ; Restore 8 bytes of 1.
paddusw mm0,mm2
paddusw mm0,mm3
;
; Fold the two dword lanes together; only the top word of the sum is kept.
punpckldq mm1,mm0 ; Get low order SWD accum to high order of mm1.
;
paddusw mm0,mm1 ; mm0[48:63] is SWD for block.
;
psrlq mm0,48 ; SWD for block.
;
paddd mm7,mm0 ; mm7 is SWD for all four blocks.
;
ret
;===============================================================================
; BFrameDTQ -- Form the prediction difference for one B-frame block and hand it
; to MMxDoForwardDCT.
;
; ebp -- Pitch
; edx -- Block Action Descriptor cursor
; ebx -- VMVf (VMV to apply to past reference) biased by 96.
; eax -- HMVf (HMV to apply to past reference) biased by 96.
;
; The entry code selects one of four past-reference paths by the low (half-pel)
; bits of VMVf and HMVf. Every path reaches Diff_GetFutureContribToPred with
;   esi              -- address of the past reference block (in the previous
;                       frame for full/full, else an interpolated copy in
;                       PelDiffs),
;   edi              -- address of the target block in the B frame,
;   PastRefPitchDiv4 -- line pitch of the data at esi, divided by 4.
; NOTE(review): this routine is entered by call but the visible paths end with
; "jmp MMxDoForwardDCT"; presumably the return to the caller happens in the
; code reached from there -- confirm.
StackOffset TEXTEQU <8>
BFrameDTQ:
test bl,1
; ebx*3 here and the shl by 6 below give VMVf*192, i.e. (VMVf/2)*384 lines.
; Neither lea nor mov disturbs the flags from the test for the jne.
lea ebx,[ebx+ebx*2] ; Start of VMVf*384
mov ecx,PreviousFrameBaseAddress
jne Diff_VMVfAtHalfPelPosition
Diff_VMVfAtFullPelPosition:
; Assembly-time guard: the deliberately invalid line below stops the build if
; PITCH is not 384.
IF PITCH-384
**** The magic leaks out if PITCH != 384
ENDIF
shl ebx,6
mov edi,[edx].BlkY1.BlkOffset ; Address of 0-MV blk within frame.
shr eax,1 ; CF == 1 iff HMVf is at half pel.
jc Diff_VMVfAtFull_HMVfAtHalfPelPosition
Diff_VMVfAtFull_HMVfAtFullPelPosition:
; No interpolation needed: reference the previous frame directly.
; "-48*PITCH-48" removes the bias of 96 half pels from both vector components.
lea esi,[ecx+ebx-48*PITCH-48]
add eax,edi
add esi,eax ; Address of past reference block.
mov ecx,PITCH/4 ; Pitch for past reference blk, div 4.
mov eax,BFrameBaseAddress ; Base address of the B frame.
mov PastRefPitchDiv4,ecx
add edi,eax ; Address of target block.
jmp Diff_GetFutureContribToPred
;-------------------------------------------------------------------------------
; Diff_VMVfAtHalfPelPosition -- BFrameDTQ past-reference paths for a half-pel
; vertical forward vector. Dispatches on the horizontal half-pel bit, then (for
; a full-pel HMVf) builds the vertically interpolated 8x8 reference in PelDiffs.
;
; On entry: ebx = VMVf*3, eax = HMVf (biased), ecx = PreviousFrameBaseAddress,
;           edx = block descriptor cursor, ebp = pitch.
;
; Each pass of the loop produces two output lines, (LineN + LineN+1 + 1)/2,
; stored 16 bytes apart; four passes fill the block. Hence PastRefPitchDiv4 is
; set to 4 (a 16-byte line pitch).
; The averaging is done on all 8 bytes at once: byte-wise adds, then the low
; bit of every byte is cleared (mask 0FEH) so that the 64-bit shift right by
; one cannot move a bit into the neighbouring byte.
; NOTE(review): the byte adds wrap modulo 256, so the average is exact only if
; the sums fit in a byte; presumably the pel range guarantees this -- confirm.
; NOTE(review): the loop ends on address bits 5 and 6 of the output pointer,
; so it relies on the alignment of PelDiffs -- confirm where it is declared.
;-------------------------------------------------------------------------------
Diff_VMVfAtHalfPelPosition:
; Assembly-time guard: the deliberately invalid line below stops the build if
; PITCH is not 384.
IF PITCH-384
**** The magic leaks out if PITCH != 384
ENDIF
shl ebx,6
mov edi,[edx].BlkY1.BlkOffset ; Address of 0-MV blk within frame.
shr eax,1 ; CF == 1 iff HMVf is at half pel.
jc Diff_VMVfAtHalf_HMVfAtHalfPelPosition
Diff_VMVfAtHalf_HMVfAtFullPelPosition:
; The extra "-PITCH/2" removes the half line that VMVf*192 contains when VMVf
; is odd.
lea esi,[ecx+ebx-48*PITCH-48-PITCH/2]; Begin get pastrefaddr. Del bias.
add eax,edi
add esi,eax ; Address of past reference block.
; Start one output step (32 bytes) before PelDiffs; the loop pre-increments.
lea eax,PelDiffs-32
pcmpeqb mm6,mm6
pcmpeqb mm7,mm7 ; 8 bytes -1
movq mm2,[esi] ; Line0
paddb mm6,mm6 ; 8 bytes of 0xFE.
@@:
movq mm1,[esi+ebp*1] ; Line1
; Line2 of the previous pass is Line0 of this one.
movq mm0,mm2 ; Line0
movq mm2,[esi+ebp*2] ; Line2
; Subtracting -1 adds the rounding term.
psubb mm1,mm7 ; Line1+1
paddb mm0,mm1 ; Line0+Line1+1
paddb mm1,mm2 ; Line1+Line2+1
pand mm0,mm6 ; pre-clean
pand mm1,mm6 ; pre-clean
add eax,32 ; Advance pointer for PelDiffs output.
psrlq mm0,1 ; (Line0+Line1+1)/2
lea esi,[esi+ebp*2] ; Advance input ptr 2 lines.
psrlq mm1,1 ; (Line1+Line2+1)/2
movq [eax],mm0 ; Store Past Ref for Line0
movq [eax+16],mm1 ; Store Past Ref for Line1
test al,32 ; Iterate twice
jne @b
test al,64 ; Iterate twice.
; The two movs below do not alter the flags from the test.
mov ecx,4 ; Pitch for past reference blk, div 4.
mov PastRefPitchDiv4,ecx
jne @b
mov eax,BFrameBaseAddress
lea esi,PelDiffs ; Address of interpolated past ref blk.
add edi,eax ; Address of target block.
jmp Diff_GetFutureContribToPred
;-------------------------------------------------------------------------------
; Diff_VMVfAtFull_HMVfAtHalfPelPosition -- BFrameDTQ past-reference path for a
; full-pel vertical and half-pel horizontal forward vector. Builds the
; horizontally interpolated 8x8 reference in PelDiffs, two lines per pass.
;
; On entry: ebx = VMVf*192, eax = HMVf shifted right by one (biased),
;           ecx = PreviousFrameBaseAddress, edi = block offset, ebp = pitch.
;
; For each line the 8 pels starting at column 1 are added to the same pels
; shifted up by one byte (i.e. columns 0..6 in byte positions 1..7). Column 0,
; which the shift cannot supply, is fetched as a byte and used to index
; Pel_Rnd; per the comments below, that table entry supplies the +1 rounding
; term for every byte plus the column-0 pel for the lowest byte.
; NOTE(review): Pel_Rnd is defined outside the visible source; its contents
; are taken from the existing comments -- confirm.
; NOTE(review): the loop ends on address bits 5 and 6 of the output pointer,
; so it relies on the alignment of PelDiffs -- confirm where it is declared.
;-------------------------------------------------------------------------------
Diff_VMVfAtFull_HMVfAtHalfPelPosition:
lea esi,[ecx+ebx-48*PITCH-48] ; Begin get pastrefaddr. Del bias.
add eax,edi
add esi,eax ; Address of past reference block.
; Start one output step (32 bytes) before PelDiffs; the loop pre-increments.
lea eax,PelDiffs-32
lea ebx,Pel_Rnd
; Only cl is written inside the loop, so ecx stays a zero-extended byte.
xor ecx,ecx
@@:
movq mm0,[esi+1] ; <P08 P07 P06 P05 P04 P03 P02 P01>
pcmpeqb mm7,mm7
mov cl,[esi] ; P00
movq mm2,mm0 ; <P08 P07 P06 P05 P04 P03 P02 P01>
movq mm1,[esi+ebp*1+1]
psllq mm2,8 ; <P07 P06 P05 P04 P03 P02 P01 0>
paddb mm0,[ebx+ecx*8] ; <P08+1 P07+1 ... P01+P00+1>
movq mm3,mm1
; Column 0 of the second line.
mov cl,[esi+ebp*1]
psllq mm3,8
paddb mm1,mm3
paddb mm0,mm2 ; <P08+P07+1 P07+P06+1 ... P01+P00+1>
paddb mm1,[ebx+ecx*8]
paddb mm7,mm7 ; 8 bytes of 0xFE.
; Clear bit 0 of every byte so the 64-bit shift cannot cross byte boundaries.
pand mm0,mm7 ; pre-clean
pand mm1,mm7 ; pre-clean
add eax,32 ; Advance pointer for PelDiffs output.
psrlq mm0,1 ; <(P08+P07+1)/2 ...>
lea esi,[esi+ebp*2] ; Advance input ptr 2 lines.
psrlq mm1,1
movq [eax],mm0 ; Store Past Ref for Line0
movq [eax+16],mm1 ; Store Past Ref for Line1
test al,32 ; Iterate twice
jne @b
test al,64 ; Iterate twice.
; The two movs below do not alter the flags from the test; cl is reloaded at
; the top of the loop if another pass is made.
mov cl,4 ; Pitch for past reference blk, div 4.
mov PastRefPitchDiv4,ecx
jne @b
mov eax,BFrameBaseAddress
lea esi,PelDiffs ; Address of interpolated past ref blk.
add edi,eax ; Address of target block.
jmp Diff_GetFutureContribToPred
;-------------------------------------------------------------------------------
; Diff_VMVfAtHalf_HMVfAtHalfPelPosition -- BFrameDTQ past-reference path for a
; forward vector that is half-pel both vertically and horizontally. Builds the
; interpolated 8x8 reference in PelDiffs, two lines per pass, and then falls
; through into Diff_GetFutureContribToPred.
;
; On entry: ebx = VMVf*192, eax = HMVf shifted right by one (biased),
;           ecx = PreviousFrameBaseAddress, edi = block offset, ebp = pitch.
;
; Each input line is first turned into horizontal pair sums with rounding
; (as in the horizontal-only path, using Pel_Rnd for the rounding term and the
; column-0 pel). Two such line sums are then halved separately and added, and
; the low bits lost by the separate halving are restored by adding the AND of
; the two sums' low bits before the final halving (steps G, H and L below).
; The third line's sum is carried in mm3 to serve as the first line of the
; next pass.
; The step labels (0A, 1B, ...) in the comments give the line number and the
; position of the step in that line's computation.
; NOTE(review): Pel_Rnd is defined outside the visible source -- confirm its
; layout. The loop end test relies on the alignment of PelDiffs -- confirm.
;-------------------------------------------------------------------------------
Diff_VMVfAtHalf_HMVfAtHalfPelPosition:
; The extra "-PITCH/2" removes the half line that VMVf*192 contains when VMVf
; is odd.
lea esi,[ecx+ebx-48*PITCH-48-PITCH/2]; Begin get pastrefaddr. Del bias.
add eax,edi
add esi,eax ; Address of past reference block.
; Start one output step (32 bytes) before PelDiffs; the loop pre-increments.
lea eax,PelDiffs-32
lea ebx,Pel_Rnd
; Only cl is written below, so ecx stays a zero-extended byte.
xor ecx,ecx
movq mm3,[esi+1] ; 0A: <P08 P07 P06 P05 P04 P03 P02 P01>
pcmpeqb mm7,mm7
mov cl,[esi] ; 0B: P00
movq mm0,mm3 ; 0C: <P08 P07 P06 P05 P04 P03 P02 P01>
paddb mm7,mm7 ; 8 bytes of 0xFE.
psllq mm0,8 ; 0D: <P07 P06 P05 P04 P03 P02 P01 0>
paddb mm3,[ebx+ecx*8] ; 0E: <P08+1 P07+1 ... P01+P00+1>
movq mm6,mm7 ; 8 bytes of 0xFE.
; Loop invariants at the top: mm0 + mm3 is the line-0 pair sum, mm6 = mm7 =
; 8 bytes of 0FEH.
@@:
movq mm1,[esi+ebp*1+1] ; 1A: <P18 P17 P16 P15 P14 P13 P12 P11>
paddb mm0,mm3 ; 0F: <P08+P07+1 ... P01+P00+1>
mov cl,[esi+ebp*1] ; 1B: P10
movq mm3,mm1 ; 1C: <P18 P17 P16 P15 P14 P13 P12 P11>
movq mm2,[esi+ebp*2+1] ; 2A: <P28 P27 P26 P25 P24 P23 P22 P21>
psllq mm3,8 ; 1D: <P17 P16 P15 P14 P13 P12 P11 0>
paddb mm1,[ebx+ecx*8] ; 1E: <P18+1 P17+1 ... P11+P10+1>
movq mm4,mm2 ; 2C: <P28 P27 P26 P25 P24 P23 P22 P21>
mov cl,[esi+ebp*2] ; 2B: P20
paddb mm1,mm3 ; 1F: <P18+P17+1 ... P11+P10+1>
; pandn with 0FEH keeps only bit 0 of every byte of the line-1 sum.
pandn mm6,mm1 ; 0G: <(P18+P17+1)&1 ...>
psllq mm4,8 ; 2D: <P27 P26 P25 P24 P23 P22 P21 0>
paddb mm2,[ebx+ecx*8] ; 2E: <P28+1 P27+1 ... P21+P20+1>
movq mm5,mm6 ; 1G: <(P18+P17+1)&1 ...>
paddb mm2,mm4 ; 2F: <P28+P27+1 ... P21+P20+1>
pand mm6,mm0 ; 0H: <(P18+P17+1)&(P08+P07+1)&1 ...>
pand mm5,mm2 ; 1H: <(P18+P17+1)&(P28+P27+1)&1 ...>
pand mm0,mm7 ; 0I: pre-clean for divide
pand mm1,mm7 ; 1I: pre-clean for divide
psrlq mm0,1 ; 0J: <(P08+P07+1)/2 ...>
movq mm3,mm2 ; Save line 2 for next iter's line 0.
psrlq mm1,1 ; 1J: <(P18+P17+1)/2 ...>
pand mm2,mm7 ; 2I: pre-clean for divide
paddb mm0,mm1 ; 0K: <(P08+P07+1)/2+(P18+P17+1)/2 ...>
paddb mm6,mm0 ; 0L: <(P08+P07+P18+P17+2)/2 ...>
psrlq mm2,1 ; 2J: <(P28+P27+1)/2 ...>
paddb mm1,mm2 ; 1K: <(P18+P17+1)/2+(P28+P27+1)/2 ...>
pand mm6,mm7 ; 0M: pre-clean for divide
paddb mm5,mm1 ; 1L: <(P18+P17+P28+P27+2)/2 ...>
psrlq mm6,1 ; 0M: <(P08+P07+P18+P17+2)/4 ...>
add eax,32 ; Advance pointer for PelDiffs output.
pand mm5,mm7 ; 1M: pre-clean for divide
lea esi,[esi+ebp*2] ; Advance input ptr 2 lines.
psrlq mm5,1 ; 1N: <(P18+P17+P28+P27+2)/4 ...>
movq [eax],mm6 ; 0O: Store Past Ref for Line0
; mm3 already holds the complete next line-0 sum, so mm0 must contribute 0 to
; the "paddb mm0,mm3" at the top of the loop.
pxor mm0,mm0 ; So that add of mm3 is just like movq.
movq [eax+16],mm5 ; 1O: Store Past Ref for Line1
movq mm6,mm7 ; 8 bytes of 0xFE.
test al,32 ; Iterate twice
jne @b
test al,64 ; Iterate twice.
; mov does not alter the flags from the test; cl is reloaded inside the loop
; if another pass is made.
mov cl,4 ; Pitch for past reference blk, div 4.
jne @b
mov eax,BFrameBaseAddress
lea esi,PelDiffs ; Address of interpolated past ref blk.
add edi,eax ; Address of target block.
mov PastRefPitchDiv4,ecx
; Falls through into Diff_GetFutureContribToPred.
Diff_GetFutureContribToPred:
;===============================================================================
; Common continuation of all BFrameDTQ past-reference paths: fetch the backward
; vector of the block, select the per-line list of past/future weight indices,
; start the future-reference address, and dispatch on the vertical half-pel bit
; of the backward vector.
;
; Registers at entry:
; edi -- Pointer to target block.
; esi -- Pointer to past reference.
; edx -- Block Descriptor within MacroBlockActionDescriptorStream
;
; Subsequent assignments:
;
; ebp -- Pitch for past reference block, div 4. Loop counter in high 2 bits.
; ecx -- Pointer to future reference block
; ebx -- Pointer to list of indices of multipliers to wt past and future refs.
; eax,edx -- Index of multiplier to weight past and future ref.
;
; On exit from this section (to either Diff_VMVbAt... label):
; ebp -- HMVb (the loads of ebp listed above happen in the sections that follow).
; eax -- VMVb*192 + BFrameToFuture.
; ebx -- Address of the weight-index list within Diff_IdxRefWts.
; The descriptor cursor is saved in BlockActionDescrCursor.
xor ecx,ecx
mov eax,edx
; Assembly-time guard: the deliberately invalid line below stops the build if
; a block descriptor is not 16 bytes, which the "and edx,112" relies on.
IF SIZEOF T_Blk-16
**** The magic leaks out if size of block descriptor is not 16.
ENDIF
mov cl,[edx].BlkY1.BestHMVb ; HMV for future reference block.
and edx,112 ; Extract block number (times 16).
xor ebx,ebx
mov BlockActionDescrCursor,eax
mov bl,[eax].BlkY1.BestVMVb ; VMV for future reference block.
; NOTE(review): LeftRightBlkPosition and UpDownBlkPosition are defined outside
; the visible source; they are indexed here by block number times 16 and each
; yields the address of a table indexed by 2*MV component -- confirm layout.
mov eax,LeftRightBlkPosition[edx]
mov ebp,ecx
CONST_384 TEXTEQU <384>
mov edx,UpDownBlkPosition[edx]
mov cl,[eax+ecx*2] ; Get horz part of past/future wt sel.
; Assembly-time guard against PITCH != 384.
IF PITCH-384
**** The magic leaks out if PITCH != 384
ENDIF
lea eax,[ebx+ebx*2] ; Start of VMVb*384
mov bl,[edx+ebx*2] ; Get vert part of past/future wt sel.
; eax = VMVb*192, i.e. (VMVb/2)*384; bit 6 of the result equals bit 0 of VMVb.
shl eax,6
mov edx,BFrameToFuture
lea ebx,Diff_IdxRefWts[ecx+ebx] ; Addr of list of wts for refs.
test al,64 ; Is VMVb odd?
; lea does not alter the flags from the test.
lea eax,[eax+edx] ; Begin to get addr futr ref.
jne Diff_VMVbAtHalfPelPosition
;-------------------------------------------------------------------------------
; Diff_VMVbAtFullPelPosition -- Backward vector is full-pel vertically.
; Dispatches on its horizontal half-pel bit; the full/full case follows and
; computes the 8x8 pel differences (target - prediction), two lines per pass,
; then jumps to MMxDoForwardDCT.
;
; On entry:
;   ebp -- HMVb.                  eax -- VMVb*192 + BFrameToFuture.
;   ebx -- Address of the list of weight indices (two bytes used per pass).
;   esi -- Past reference block.  edi -- Target block.
;
; Prediction per pel, selected by the mask from FutureWt_FF_or_00:
;   mask FF: (F + P)/2      mask 00: (P + P)/2 = P
; computed on 8 bytes at once with bit 0 of every byte cleared before the
; 64-bit shift so that no bit crosses a byte boundary.
;
; esp is used as the output cursor: it is lowered by 128 before the loop and
; raised by 32 on each of the four passes, so it is back at its original value
; when the loop ends. StackOffset is redefined around the PelDiffs stores to
; track this.
; Loop counter: kept in the top two bits of ebp. They do not disturb the
; addressing because ebp is only used scaled by 4 or 8.
;-------------------------------------------------------------------------------
Diff_VMVbAtFullPelPosition:
CONST_384 TEXTEQU <384>
shr ebp,1 ; CF == 1 iff HMVb is at half pel.
; The two lea instructions do not alter CF.
lea esp,[esp-128]
StackOffset TEXTEQU <136>
lea ecx,[eax+edi-48*PITCH-48]
jc Diff_VMVbAtFull_HMVbAtHalfPelPosition
Diff_VMVbAtFull_HMVbAtFullPelPosition:
CONST_384 TEXTEQU <384>
add ecx,ebp ; Address of future reference block.
mov ebp,PastRefPitchDiv4
; Only al and dl are written in the loop, so eax and edx stay zero-extended.
xor eax,eax
xor edx,edx
@@:
StackOffset TEXTEQU <undefined>
mov al,[ebx] ; 0A: Index of weights for line 0.
add esp,32 ; Advance Pel Difference cursor
mov dl,[ebx+1] ; 1A: Index of weights for line 1.
add ebx,2 ; Advance list ptr for ref weights.
movq mm0,[ecx] ; 0B: <F07 F06 F05 F04 F03 F02 F01 F00>
pcmpeqb mm7,mm7
movq mm2,FutureWt_FF_or_00[eax]; 0C: <In?FF:00 ...>
paddb mm7,mm7 ; 8 bytes of 0xFE
movq mm3,[esi] ; 0D: <P07 P06 P05 P04 P03 P02 P01 P00>
pand mm0,mm2 ; 0E: <In?F07:00 In?F06:00 ...>
pandn mm2,mm3 ; 0F: <In?00:P07 In?00:P06 ...>
paddb mm0,mm3 ; 0G: <In?F07+P07:P07 ...>
movq mm1,[ecx+PITCH] ; 1B: <F17 F16 F15 F14 F13 F12 F11 F10>
paddb mm0,mm2 ; 0H: <In?F07+P07:2P07 ...>
movq mm2,FutureWt_FF_or_00[edx]; 1C: <In?FF:00 ...>
pand mm0,mm7 ; 0I: pre-clean
; ebp*4 is the past-reference line pitch (top counter bits shift out).
movq mm3,[esi+ebp*4] ; 1D: <P17 P16 P15 P14 P13 P12 P11 P10>
pand mm1,mm2 ; 1E: <In?F17:00 In?F16:00 ...>
pandn mm2,mm3 ; 1F: <In?00:P17 In?00:P16 ...>
paddb mm1,mm3 ; 1G: <In?F17+P17:P17 ...>
movq mm3,[edi] ; 0J: <C07 C06 C05 C04 C03 C02 C01 C00>
psrlq mm0,1 ; 0K: <In?(F07+P07)/2:P07 ...>
psubb mm3,mm0 ; 0L: <In?C07-(F07+P07)/2:C07-P07 ...>
paddb mm1,mm2 ; 1H: <In?F17+P17:2P17 ...>
movq mm4,[edi+PITCH] ; 1J: <C17 C16 C15 C14 C13 C12 C11 C10>
pand mm1,mm7 ; 1I: pre-clean
add edi,PITCH*2 ; Advance Target Blk cursor
psrlq mm1,1 ; 1K: <In?(F17+P17)/2:P17 ...>
StackOffset TEXTEQU <8+96>
movq PelDiffs,mm3 ; 0M: Save pel differences for line 0.
StackOffset TEXTEQU <undefined>
psubb mm4,mm1 ; 1L: <In?C17-(F17+P17)/2:C17-P17 ...>
add ecx,PITCH*2 ; Advance Future Ref Blk cursor
lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor
StackOffset TEXTEQU <8+96>
movq PelDiffs+16,mm4 ; 1M: Save pel differences for line 1.
StackOffset TEXTEQU <undefined>
; Bit 31 of ebp toggles each pass: no carry after passes 1 and 3 (loop),
; carry after passes 2 and 4 (fall through to the bit-30 test).
add ebp,080000000H ; Iterate twice
jnc @b
; Bit 30 is clear after pass 2 (loop again, with bit 30 now set by the lea,
; which does not alter the flags) and set after pass 4 (exit).
test ebp,040000000H ; Iterate twice
lea ebp,[ebp+040000000H]
je @b
StackOffset TEXTEQU <8>
; Arguments for MMxDoForwardDCT as set up here: ebp = 16, esi = PelDiffs,
; edx = the saved block descriptor cursor.
mov ebp,16
lea esi,PelDiffs
mov edx,BlockActionDescrCursor
jmp MMxDoForwardDCT
;-------------------------------------------------------------------------------
; Diff_VMVbAtHalfPelPosition -- Backward vector is half-pel vertically.
; Dispatches on its horizontal half-pel bit; the half/full case follows. It
; first averages vertically adjacent future-reference lines, (fN + fN+1 + 1)/2,
; then combines that with the past reference and subtracts the result from the
; target block, two lines per pass, and finally jumps to MMxDoForwardDCT.
;
; On entry:
;   ebp -- HMVb.                  eax -- VMVb*192 + BFrameToFuture.
;   ebx -- Address of the list of weight indices (two bytes used per pass).
;   esi -- Past reference block.  edi -- Target block.
;
; The extra "-PITCH/2" in the address computation removes the half line that
; VMVb*192 contains when VMVb is odd.
; mm7 alternates within a pass: 8 bytes of -1 at the top (its subtraction adds
; the rounding term), doubled to 8 bytes of 0FEH for the pre-clean masks, and
; reset to -1 before the next pass. mm6 carries the last future line read into
; the next pass.
; esp is used as the output cursor exactly as in the full-pel path: lowered by
; 128 up front and raised by 32 on each of the four passes.
;-------------------------------------------------------------------------------
Diff_VMVbAtHalfPelPosition:
CONST_384 TEXTEQU <384>
shr ebp,1 ; CF == 1 iff HMVb is at half pel.
; The two lea instructions do not alter CF.
lea esp,[esp-128]
StackOffset TEXTEQU <136>
lea ecx,[eax+edi-48*PITCH-48-PITCH/2]
jc Diff_VMVbAtHalf_HMVbAtHalfPelPosition
Diff_VMVbAtHalf_HMVbAtFullPelPosition:
CONST_384 TEXTEQU <384>
add ecx,ebp ; Address of future reference block.
mov ebp,PastRefPitchDiv4
; Only al and dl are written in the loop, so eax and edx stay zero-extended.
xor eax,eax
xor edx,edx
movq mm6,[ecx] ; 0B: <F07 F06 F05 F04 F03 F02 F01 F00>
pcmpeqb mm7,mm7 ; 8 bytes -1
@@:
StackOffset TEXTEQU <undefined>
movq mm1,[ecx+PITCH] ; 1a: <f17 f16 f15 f14 f13 f12 f11 f10>
movq mm0,mm6 ; 0a: <f07 f06 f05 f04 f03 f02 f01 f00>
mov al,[ebx] ; 0A: Index of weights for line 0.
psubb mm1,mm7 ; b: <f17+1 ...>
movq mm6,[ecx+PITCH*2] ; 2a: <f27 f26 f25 f24 f23 f22 f21 f20>
paddb mm0,mm1 ; 0c: <f07+f17+1..>
mov dl,[ebx+1] ; 1A: Index of weights for line 1.
paddb mm7,mm7 ; 8 bytes of 0xFE
paddb mm1,mm6 ; 1c: <f17+f27+1..>
pand mm0,mm7 ; 0d: pre-clean
pand mm1,mm7 ; 1d: pre-clean
psrlq mm0,1 ; 0B: <(F07 = f07+f17+1)/2>
movq mm2,FutureWt_FF_or_00[eax]; 0C: <In?FF:00 ...>
psrlq mm1,1 ; 1B: <(F17 = f17+f27+1)/2>
movq mm3,[esi] ; 0D: <P07 P06 P05 P04 P03 P02 P01 P00>
pand mm0,mm2 ; 0E: <In?F07:00 In?F06:00 ...>
pandn mm2,mm3 ; 0F: <In?00:P07 In?00:P06 ...>
paddb mm0,mm3 ; 0G: <In?F07+P07:P07 ...>
add ebx,2 ; Advance list ptr for ref weights.
paddb mm0,mm2 ; 0H: <In?F07+P07:2P07 ...>
movq mm2,FutureWt_FF_or_00[edx]; 1C: <In?FF:00 ...>
pand mm0,mm7 ; 0I: pre-clean
; ebp*4 is the past-reference line pitch (top counter bits shift out).
movq mm3,[esi+ebp*4] ; 1D: <P17 P16 P15 P14 P13 P12 P11 P10>
pand mm1,mm2 ; 1E: <In?F17:00 In?F16:00 ...>
pandn mm2,mm3 ; 1F: <In?00:P17 In?00:P16 ...>
paddb mm1,mm3 ; 1G: <In?F17+P17:P17 ...>
movq mm3,[edi] ; 0J: <C07 C06 C05 C04 C03 C02 C01 C00>
psrlq mm0,1 ; 0K: <In?(F07+P07)/2:P07 ...>
psubb mm3,mm0 ; 0L: <In?C07-(F07+P07)/2:C07-P07 ...>
add esp,32 ; Advance Pel Difference cursor
movq mm4,[edi+PITCH] ; 1J: <C17 C16 C15 C14 C13 C12 C11 C10>
paddb mm1,mm2 ; 1H: <In?F17+P17:2P17 ...>
add ecx,PITCH*2 ; Advance Future Ref Blk cursor
pand mm1,mm7 ; 1I: pre-clean
add edi,PITCH*2 ; Advance Target Blk cursor
psrlq mm1,1 ; 1K: <In?(F17+P17)/2:P17 ...>
StackOffset TEXTEQU <8+96>
movq PelDiffs,mm3 ; 0M: Save pel differences for line 0.
StackOffset TEXTEQU <undefined>
psubb mm4,mm1 ; 1L: <In?C17-(F17+P17)/2:C17-P17 ...>
pcmpeqb mm7,mm7 ; 8 bytes -1
lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor
StackOffset TEXTEQU <8+96>
movq PelDiffs+16,mm4 ; 1M: Save pel differences for line 1.
StackOffset TEXTEQU <undefined>
; NOTE(review): this repeats the pcmpeqb issued a few instructions earlier;
; mm7 is already all ones here, so it has no effect on the result. Presumably
; kept for instruction pairing -- confirm before removing.
pcmpeqb mm7,mm7 ; 8 bytes -1
; Bit 31 of ebp toggles each pass: no carry after passes 1 and 3 (loop),
; carry after passes 2 and 4.
add ebp,080000000H ; Iterate twice
jnc @b
; After pass 2 the top bits become 01 (sign clear: loop); after pass 4 they
; become 10 (sign set: exit).
add ebp,040000000H ; Iterate twice
test ebp,ebp
jns @b
StackOffset TEXTEQU <8>
; Arguments for MMxDoForwardDCT as set up here: ebp = 16, esi = PelDiffs,
; edx = the saved block descriptor cursor.
mov ebp,16
lea esi,PelDiffs
mov edx,BlockActionDescrCursor
jmp MMxDoForwardDCT
;-----------------------------------------------------------------------------
; Diff_VMVbAtFull_HMVbAtHalfPelPosition
;
; Builds the pel differences for one 8-pel-wide block when the future
; reference needs horizontal interpolation only: each future pel is formed
; from a pel and its left neighbour on the same line, (f[x+1]+f[x]+1)/2.
;
; Register use as seen in this block:
;   ecx  on entry ecx+ebp is the future reference block; advanced 2 lines/pass.
;   ebp  reloaded with PastRefPitchDiv4; its top bits double as the pass
;        counter (see the loop tail).
;   esi  past reference cursor (lines at [esi] and [esi+ebp*4]).
;   edi  target block cursor (lines at [edi] and [edi+PITCH]).
;   ebx  cursor into a byte list of per-line weight indices.
;   edx  address of Pel_Rnd; eax is zeroed once so that byte loads into al
;        act as zero-extended table indices.
;   esp  advanced by 32 per pass; it doubles as the output cursor.
;
; Exit: ebp = 16, esi = address of PelDiffs, edx = BlockActionDescrCursor,
; then a tail jump to MMxDoForwardDCT (no return to this code).
;
; NOTE(review): the instruction order looks hand-interleaved for pairing of
; integer and MMX instructions -- confirm before reordering anything.
;-----------------------------------------------------------------------------
Diff_VMVbAtFull_HMVbAtHalfPelPosition:
; StackOffset / CONST_384 are assembly-time text macros only.  ebp is about to
; be reused as a pitch register, so CONST_384 is switched to the literal 384.
StackOffset TEXTEQU <136>
CONST_384 TEXTEQU <384>
add ecx,ebp ; Address of future reference block.
mov ebp,PastRefPitchDiv4
xor eax,eax
lea edx,Pel_Rnd
@@:
; StackOffset is set to <undefined> while esp is in motion, presumably so that
; any accidental use of a stack-relative name inside the loop fails to
; assemble -- TODO confirm against the macro definitions.
StackOffset TEXTEQU <undefined>
movq mm0,[ecx+1] ; 0a: <f08 f07 f06 f05 f04 f03 f02 f01>
pcmpeqb mm7,mm7
mov al,[ecx] ; 0b: f00
movq mm2,mm0 ; 0c: <f08 f07 f06 f05 f04 f03 f02 f01>
movq mm1,[ecx+PITCH+1] ; 1a: <f18 f17 f16 f15 f14 f13 f12 f11>
psllq mm2,8 ; 0d: <f07 f06 f05 f04 f03 f02 f01 0>
; Pel_Rnd is indexed by the leftmost pel (al), 8 bytes per entry.  Per the
; lane comments it supplies the +1 rounding term for every byte plus that
; leftmost pel in the low byte (table itself is defined elsewhere).
paddb mm0,[edx+eax*8] ; 0e: <f08+1 f07+1 ... f01+f00+1>
movq mm3,mm1 ; 1c: <f18 f17 f16 f15 f14 f13 f12 f11>
mov al,[ecx+PITCH] ; 1b: f10
psllq mm3,8 ; 1d: <f17 f16 f15 f14 f13 f12 f11 0>
paddb mm0,mm2 ; 0f: <f08+f07+1 f07+f06+1 ... f01+f00+1>
paddb mm1,mm3 ; 1f: <f18+f17 f17+f16 ... f11 >
paddb mm1,[edx+eax*8] ; 1e: <f18+f17+1 f17+f16+1 ... f11+f10+1>
; mm7 was all ones; doubling each byte gives 0xFE.  The mask clears bit 0 of
; every byte so the 64-bit shift right by 1 cannot carry a bit into the byte
; below it ("pre-clean").
paddb mm7,mm7 ; 8 bytes of 0xFE.
mov al,[ebx] ; 0A: Index of weights for line 0.
pand mm0,mm7 ; 0g: pre-clean
movq mm3,[esi] ; 0D: <P07 P06 P05 P04 P03 P02 P01 P00>
psrlq mm0,1 ; 0B: <F07 = (f08+f07+1)/2 ...>
; FutureWt_FF_or_00 yields a per-pel byte mask: FF where the future pel is
; used ("In"), 00 where only the past pel is used.  With that mask the sum
; becomes F+P where In, and P+P otherwise, so a single halving produces
; (F+P)/2 or P respectively.
movq mm2,FutureWt_FF_or_00[eax]; 0C: <In?FF:00 ...>
pand mm1,mm7 ; 1g: pre-clean
mov al,[ebx+1] ; 1A: Index of weights for line 1.
psrlq mm1,1 ; 1B: <F17 = (f18+f17+1)/2 ...>
pand mm0,mm2 ; 0E: <In?F07:00 In?F06:00 ...>
pandn mm2,mm3 ; 0F: <In?00:P07 In?00:P06 ...>
movq mm4,FutureWt_FF_or_00[eax]; 1C: <In?FF:00 ...>
paddb mm0,mm3 ; 0G: <In?F07+P07:P07 ...>
movq mm3,[esi+ebp*4] ; 1D: <P17 P16 P15 P14 P13 P12 P11 P10>
paddb mm0,mm2 ; 0H: <In?F07+P07:2P07 ...>
pand mm0,mm7 ; 0I: pre-clean
pand mm1,mm4 ; 1E: <In?F17:00 In?F16:00 ...>
pandn mm4,mm3 ; 1F: <In?00:P17 In?00:P16 ...>
paddb mm1,mm3 ; 1G: <In?F17+P17:P17 ...>
movq mm3,[edi] ; 0J: <C07 C06 C05 C04 C03 C02 C01 C00>
psrlq mm0,1 ; 0K: <In?(F07+P07)/2:P07 ...>
psubb mm3,mm0 ; 0L: <In?C07-(F07+P07)/2:C07-P07 ...>
add esp,32 ; Advance Pel Difference cursor
add ecx,PITCH*2 ; Advance Future Ref Blk cursor
paddb mm1,mm4 ; 1H: <In?F17+P17:2P17 ...>
movq mm4,[edi+PITCH] ; 1J: <C17 C16 C15 C14 C13 C12 C11 C10>
pand mm1,mm7 ; 1I: pre-clean
add edi,PITCH*2 ; Advance Target Blk cursor
psrlq mm1,1 ; 1K: <In?(F17+P17)/2:P17 ...>
; PelDiffs is always referenced with the same StackOffset while esp itself
; moves 32 bytes per pass, so (assuming PelDiffs is esp-relative -- TODO
; confirm) successive passes store to successive 32-byte slots, line 0 at +0
; and line 1 at +16.
StackOffset TEXTEQU <8+96>
movq PelDiffs,mm3 ; 0M: Save pel differences for line 0.
StackOffset TEXTEQU <undefined>
psubb mm4,mm1 ; 1L: <In?C17-(F17+P17)/2:C17-P17 ...>
add ebx,2 ; Advance list ptr for ref weights.
lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor
StackOffset TEXTEQU <8+96>
movq PelDiffs+16,mm4 ; 1M: Save pel differences for line 1.
StackOffset TEXTEQU <undefined>
; Pass counter kept in the top two bits of ebp.  Those bits are shifted out by
; the *4 and *8 address scaling above, so they do not disturb addressing.
; Assuming both bits are clear in PastRefPitchDiv4, the add/jnc pair loops
; once per carry-free add and the add/jns pair loops once more, giving four
; passes (eight lines) in total.
add ebp,080000000H ; Toggle bit 31; loop again while no carry out.
jnc @b
add ebp,040000000H ; Step bit 30; loop again until bit 31 ends up set.
test ebp,ebp
jns @b
; Four passes advanced esp by 128, hence StackOffset 136 -> 8.
StackOffset TEXTEQU <8>
mov ebp,16
lea esi,PelDiffs
mov edx,BlockActionDescrCursor
jmp MMxDoForwardDCT
;-----------------------------------------------------------------------------
; Diff_VMVbAtHalf_HMVbAtHalfPelPosition
;
; Builds the pel differences for one 8-pel-wide block when the future
; reference needs interpolation both horizontally and vertically: each future
; pel is formed from a 2x2 neighbourhood, (f[y][x]+f[y][x+1]+f[y+1][x]+
; f[y+1][x+1]+2)/4 per the lane comments.
;
; Each pass produces two output lines from three future-reference lines
; (0, 1, 2).  The horizontal sum of line 2 is kept in mm4 and reused as line 0
; of the next pass, so only two new lines are loaded per pass.
;
; Register use as seen in this block:
;   ecx  on entry ecx+ebp is the future reference block; advanced 2 lines/pass.
;   ebp  reloaded with PastRefPitchDiv4; its top bits double as the pass
;        counter (see the loop tail).
;   esi  past reference cursor (lines at [esi] and [esi+ebp*4]).
;   edi  target block cursor (lines at [edi] and [edi+PITCH]).
;   ebx  cursor into a byte list of per-line weight indices.
;   edx  address of Pel_Rnd; eax is zeroed once so that byte loads into al
;        act as zero-extended table indices.
;   esp  advanced by 32 per pass; it doubles as the output cursor.
;   mm7  8 bytes of 0xFE for the whole loop; mm6 is reloaded from it each pass.
;
; Exit: ebp = 16, esi = address of PelDiffs, edx = BlockActionDescrCursor,
; then a tail jump to MMxDoForwardDCT (no return to this code).
;
; NOTE(review): the instruction order looks hand-interleaved for pairing of
; integer and MMX instructions -- confirm before reordering anything.
;-----------------------------------------------------------------------------
Diff_VMVbAtHalf_HMVbAtHalfPelPosition:
; StackOffset / CONST_384 are assembly-time text macros only.  ebp is about to
; be reused as a pitch register, so CONST_384 is switched to the literal 384.
StackOffset TEXTEQU <136>
CONST_384 TEXTEQU <384>
add ecx,ebp ; Address of future reference block.
mov ebp,PastRefPitchDiv4
xor eax,eax
lea edx,Pel_Rnd
; Prime line 0 before entering the loop: mm4 holds the pels plus the Pel_Rnd
; term, mm0 the copy shifted by one pel.  Their sum is taken at the loop top.
movq mm4,[ecx+1] ; 0a: <f08 f07 f06 f05 f04 f03 f02 f01>
pcmpeqb mm7,mm7
mov al,[ecx] ; 0b: f00
movq mm0,mm4 ; 0c: <f08 f07 f06 f05 f04 f03 f02 f01>
paddb mm7,mm7 ; 8 bytes of 0xFE.
psllq mm0,8 ; 0d: <f07 f06 f05 f04 f03 f02 f01 0>
paddb mm4,[edx+eax*8] ; 0e: <f08+1 f07+1 ... f01+f00+1>
movq mm6,mm7 ; 8 bytes of 0xFE.
@@:
; StackOffset is set to <undefined> while esp is in motion, presumably so that
; any accidental use of a stack-relative name inside the loop fails to
; assemble -- TODO confirm against the macro definitions.
StackOffset TEXTEQU <undefined>
movq mm1,[ecx+PITCH+1] ; 1a: <f18 f17 f16 f15 f14 f13 f12 f11>
; First pass: mm0 (shifted line 0) + mm4.  Later passes: mm0 is zero and mm4
; is the previous pass's line-2 sum, so this add behaves like a move.
paddb mm0,mm4 ; 0f: <f08+f07+1 ... f01+f00+1>
mov al,[ecx+PITCH] ; 1b: f10
movq mm3,mm1 ; 1c: <f18 f17 f16 f15 f14 f13 f12 f11>
movq mm2,[ecx+PITCH*2+1] ; 2a: <f28 f27 f26 f25 f24 f23 f22 f21>
psllq mm3,8 ; 1d: <f17 f16 f15 f14 f13 f12 f11 0>
paddb mm1,[edx+eax*8] ; 1e: <f18+1 f17+1 ... f11+f10+1>
movq mm4,mm2 ; 2c: <f28 f27 f26 f25 f24 f23 f22 f21>
mov al,[ecx+PITCH*2] ; 2b: f20
paddb mm1,mm3 ; 1f: <f18+f17+1 ... f11+f10+1>
; mm6 enters as 0xFE bytes, so pandn leaves only bit 0 of each line-1 sum.
; ANDing that with the line-0 (mm6) and line-2 (mm5) sums gives a per-byte 1
; exactly where both sums being averaged are odd.  That bit is added back
; after the two sums are halved separately, restoring the unit lost by
; truncating each half.
pandn mm6,mm1 ; 0g: <(f18+f17+1)&1 ...>
psllq mm4,8 ; 2d: <f27 f26 f25 f24 f23 f22 f21 0>
paddb mm2,[edx+eax*8] ; 2e: <f28+1 f27+1 ... f21+f20+1>
movq mm5,mm6 ; 1g: <(f18+f17+1)&1 ...>
paddb mm2,mm4 ; 2f: <f28+f27+1 ... f21+f20+1>
pand mm6,mm0 ; 0h: <(f18+f17+1)&(f08+f07+1)&1 ...>
pand mm5,mm2 ; 1h: <(f18+f17+1)&(f28+f27+1)&1 ...>
; "Pre-clean": clear bit 0 of every byte so the 64-bit shift right by 1 cannot
; carry a bit into the byte below it.
pand mm0,mm7 ; 0i: pre-clean for divide
pand mm1,mm7 ; 1i: pre-clean for divide
psrlq mm0,1 ; 0j: <(f08+f07+1)/2 ...>
movq mm4,mm2 ; Save line 2 for next iter's line 0.
psrlq mm1,1 ; 1j: <(f18+f17+1)/2 ...>
pand mm2,mm7 ; 2i: pre-clean for divide
paddb mm0,mm1 ; 0k: <(f08+f07+1)/2+(f18+f17+1)/2 ...>
paddb mm0,mm6 ; 0l: <(f08+f07+f18+f17+2)/2 ...>
psrlq mm2,1 ; 2j: <(f28+f27+1)/2 ...>
paddb mm1,mm2 ; 1k: <(f18+f17+1)/2+(f28+f27+1)/2 ...>
pand mm0,mm7 ; 0m: pre-clean for divide
mov al,[ebx] ; 0A: Index of weights for line 0.
paddb mm1,mm5 ; 1l: <(f18+f17+f28+f27+2)/2 ...>
movq mm3,[esi] ; 0D: <P07 P06 P05 P04 P03 P02 P01 P00>
pand mm1,mm7 ; 1m: pre-clean for divide
; FutureWt_FF_or_00 yields a per-pel byte mask: FF where the future pel is
; used ("In"), 00 where only the past pel is used.  With that mask the sum
; becomes F+P where In, and P+P otherwise, so a single halving produces
; (F+P)/2 or P respectively.
movq mm2,FutureWt_FF_or_00[eax]; 0C: <In?FF:00 ...>
psrlq mm0,1 ; 0B: <F07 = (f08+f07+f18+f17+2)/4 ...>
mov al,[ebx+1] ; 1A: Index of weights for line 1.
psrlq mm1,1 ; 1B: <F17 = (f18+f17+f28+f27+2)/4 ...>
pand mm0,mm2 ; 0E: <In?F07:00 In?F06:00 ...>
pandn mm2,mm3 ; 0F: <In?00:P07 In?00:P06 ...>
movq mm5,FutureWt_FF_or_00[eax]; 1C: <In?FF:00 ...>
paddb mm0,mm3 ; 0G: <In?F07+P07:P07 ...>
movq mm3,[esi+ebp*4] ; 1D: <P17 P16 P15 P14 P13 P12 P11 P10>
paddb mm0,mm2 ; 0H: <In?F07+P07:2P07 ...>
pand mm0,mm7 ; 0I: pre-clean
pand mm1,mm5 ; 1E: <In?F17:00 In?F16:00 ...>
pandn mm5,mm3 ; 1F: <In?00:P17 In?00:P16 ...>
paddb mm1,mm3 ; 1G: <In?F17+P17:P17 ...>
movq mm3,[edi] ; 0J: <C07 C06 C05 C04 C03 C02 C01 C00>
psrlq mm0,1 ; 0K: <In?(F07+P07)/2:P07 ...>
psubb mm3,mm0 ; 0L: <In?C07-(F07+P07)/2:C07-P07 ...>
add esp,32 ; Advance Pel Difference cursor
paddb mm1,mm5 ; 1H: <In?F17+P17:2P17 ...>
add ecx,PITCH*2 ; Advance Future Ref Blk cursor
movq mm5,[edi+PITCH] ; 1J: <C17 C16 C15 C14 C13 C12 C11 C10>
pand mm1,mm7 ; 1I: pre-clean
add edi,PITCH*2 ; Advance Target Blk cursor
psrlq mm1,1 ; 1K: <In?(F17+P17)/2:P17 ...>
; PelDiffs is always referenced with the same StackOffset while esp itself
; moves 32 bytes per pass, so (assuming PelDiffs is esp-relative -- TODO
; confirm) successive passes store to successive 32-byte slots, line 0 at +0
; and line 1 at +16.
StackOffset TEXTEQU <8+96>
movq PelDiffs,mm3 ; 0M: Save pel differences for line 0.
StackOffset TEXTEQU <undefined>
psubb mm5,mm1 ; 1L: <In?C17-(F17+P17)/2:C17-P17 ...>
add ebx,2 ; Advance list ptr for ref weights.
lea esi,[esi+ebp*8] ; Advance Past Ref Blk cursor
StackOffset TEXTEQU <8+96>
movq PelDiffs+16,mm5 ; 1M: Save pel differences for line 1.
StackOffset TEXTEQU <undefined>
pxor mm0,mm0 ; So that add of mm4 is just like movq.
; Pass counter kept in the top two bits of ebp.  Those bits are shifted out by
; the *4 and *8 address scaling above, so they do not disturb addressing.
; Assuming both bits are clear in PastRefPitchDiv4, the add/jnc pair loops
; once per carry-free add and the add/jns pair loops once more, giving four
; passes (eight lines) in total.  The movq between add and jnc is an MMX move
; and leaves the integer flags untouched.
add ebp,080000000H ; Toggle bit 31; loop again while no carry out.
movq mm6,mm7 ; 8 bytes of 0xFE.
jnc @b
add ebp,040000000H ; Step bit 30; loop again until bit 31 ends up set.
test ebp,ebp
jns @b
; Four passes advanced esp by 128, hence StackOffset 136 -> 8.
StackOffset TEXTEQU <8>
mov ebp,16
lea esi,PelDiffs
mov edx,BlockActionDescrCursor
jmp MMxDoForwardDCT
; Restore the assembly-time alias to the value it has at the top of the file.
CONST_384 TEXTEQU <ebp>
END