2020-09-30 16:53:55 +02:00

690 lines
21 KiB
NASM

;*************************************************************************
;** INTEL Corporation Proprietary Information
;**
;** This listing is supplied under the terms of a license
;** agreement with INTEL Corporation and may not be copied
;** nor disclosed except in accordance with the terms of
;** that agreement.
;**
;** Copyright (c) 1995 Intel Corporation.
;** All Rights Reserved.
;**
;*************************************************************************
;//
;// $Header: S:\h26x\src\dec\cx51209.asv
;//
;// $Log: S:\h26x\src\dec\cx51209.asv
;//
;////////////////////////////////////////////////////////////////////////////
; cx1209 -- This function performs YUV12 to IF09 color conversion for H26x.
; IF09 consists of Y, V, U in 8-bit, planar format, plus a plane of
; 4-bit flags, each in 8 bits of storage, with each bit indicative
; of which dwords of Y are unchanged from the previous frame.
; IF09 is only applicable using DCI.
;
; This version is tuned for maximum performance on both the Pentium
; (r) microcprocessor and the Pentium Pro (tm) microprocessor.
;
; Indentation of instructions indicates expected U/V pipe execution
; on Pentium (r) microprocessor; indented instructions are
; expected to execute in V-pipe, outdented instructions in U-pipe.
; Inside loops, blank lines delineate groups of 1, 2, or 3
; instructions that are expected to be decoded simultaneously
; on the Pentium Pro (tm) microprocessor.
;
; cx1209
; ^^^^^^
; ||||++----- Convert to IF09.
; ||++------- Convert from YUV12.
; |+--------- For both H261 and H263.
; +---------- Color convertor.
;-------------------------------------------------------------------------------
OPTION PROLOGUE:None
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
include locals.inc
include ccinst.inc
include decconst.inc
IFNDEF DSEGNAME
IFNDEF WIN32
DSEGNAME TEXTEQU <Data_cx1209>
ENDIF
ENDIF
IFDEF WIN32
.xlist
include memmodel.inc
.list
.DATA
ELSE
DSEGNAME SEGMENT WORD PUBLIC 'DATA'
ENDIF
; any data would go here
IFNDEF WIN32
DSEGNAME ENDS
.xlist
include memmodel.inc
.list
ENDIF
IFNDEF SEGNAME
IFNDEF WIN32
SEGNAME TEXTEQU <_CODE32>
ENDIF
ENDIF
ifdef WIN32
.CODE
ASSUME cs : FLAT
ASSUME ds : FLAT
ASSUME es : FLAT
ASSUME fs : FLAT
ASSUME gs : FLAT
ASSUME ss : FLAT
else
SEGNAME SEGMENT PARA PUBLIC USE32 'CODE'
ASSUME CS : SEGNAME
ASSUME DS : Nothing
ASSUME ES : Nothing
ASSUME FS : Nothing
ASSUME GS : Nothing
endif
PUBLIC YUV12ToIF09
YUV12ToIF09 proc DIST LANG AYPlane: DWORD,
AVPlane: DWORD,
AUPlane: DWORD,
AFrameWidth: DWORD,
AFrameHeight: DWORD,
AYPitch: DWORD,
AUVPitch: DWORD,
AAspectAdjustmentCnt: DWORD,
AColorConvertedFrame: DWORD,
ADCIOffset: DWORD,
ACCOffsetToLine0: DWORD,
ACCOPitch: DWORD,
ACCType: DWORD
; void * YUV12ToIF09 (
; U8 * YPlane,
; U8 * VPlane,
; U8 * UPlane,
; UN FrameWidth,
; UN FrameHeight,
; UN YPitch,
; UN UVPitch,
; UN AspectAdjustmentCount,
; U8 * ColorConvertedFrame,
; U32 DCIOffset,
; U32 CCOffsetToLine0,
; IN CCOPitch,
; IN CCType)
;
; YPlane and VPlane are offsets relative to InstanceBase. In 16-bit Microsoft
; Windows (tm), space in this segment is used for local variables and tables.
; In 32-bit variants of Microsoft Windows (tm), the local variables are on
; the stack, while the tables are in the one and only data segment.
;
; CCOffsetToLine0 is relative to ColorConvertedFrame.
;
IFDEF WIN32
LocalFrameSize = 32
RegisterStorageSize = 16
; Arguments:
YPlane = LocalFrameSize + RegisterStorageSize + 4
VPlane = LocalFrameSize + RegisterStorageSize + 8
FrameWidth = LocalFrameSize + RegisterStorageSize + 12
FrameHeight = LocalFrameSize + RegisterStorageSize + 16
YPitch = LocalFrameSize + RegisterStorageSize + 20
ColorConvertedFrame = LocalFrameSize + RegisterStorageSize + 24
DCIOffset = LocalFrameSize + RegisterStorageSize + 28
CCOffsetToLine0 = LocalFrameSize + RegisterStorageSize + 32
CCOPitch = LocalFrameSize + RegisterStorageSize + 36
CCType = LocalFrameSize + RegisterStorageSize + 40
EndOfArgList = LocalFrameSize + RegisterStorageSize + 44
; Locals (on local stack frame)
CCOCursor = 0
YLimit = 4
CCOVCursor = 8
CCOUCursor = 12
CCOSkipCursor = 16
VLimit = 20
YLine1Limit = 24
CCOUVPitch = 28
LCL EQU <esp+>
ELSE
; Arguments:
RegisterStorageSize = 20 ; Put local variables on stack.
InstanceBase_zero = RegisterStorageSize + 4
InstanceBase_SegNum = RegisterStorageSize + 6
YPlane_arg = RegisterStorageSize + 8
VPlane_arg = RegisterStorageSize + 12
FrameWidth_arg = RegisterStorageSize + 16
FrameHeight_arg = RegisterStorageSize + 18
YPitch_arg = RegisterStorageSize + 20
ColorConvertedFrame = RegisterStorageSize + 22
ColorConvertedFrame_SegNum = RegisterStorageSize + 24
DCIOffset = RegisterStorageSize + 26
CCOffsetToLine0 = RegisterStorageSize + 30
CCOPitch_arg = RegisterStorageSize + 34
EndOfArgList = RegisterStorageSize + 36
; Locals (in per-instance data segment)
CCOCursor = LocalStorageCC + 0
YLimit = LocalStorageCC + 4
CCOVCursor = LocalStorageCC + 8
CCOUCursor = LocalStorageCC + 12
CCOSkipCursor = LocalStorageCC + 16
VLimit = LocalStorageCC + 20
YLine1Limit = LocalStorageCC + 24
CCOUVPitch = LocalStorageCC + 28
YPlane = LocalStorageCC + 32
VPlane = LocalStorageCC + 36
FrameWidth = LocalStorageCC + 40
FrameHeight = LocalStorageCC + 44
YPitch = LocalStorageCC + 48
CCOPitch = LocalStorageCC + 52
LCL EQU <>
ENDIF
push esi
push edi
push ebp
push ebx
IFDEF WIN32
sub esp,LocalFrameSize
mov eax,PD [esp+ColorConvertedFrame]
add eax,PD [esp+DCIOffset]
add eax,PD [esp+CCOffsetToLine0]
mov PD [esp+CCOCursor],eax
ELSE
xor eax,eax
mov eax,ds
push eax
mov ebp,esp
and ebp,00000FFFFH
mov ds, PW [ebp+InstanceBase_SegNum]
mov es, PW [ebp+ColorConvertedFrame_SegNum]
mov ebx,PD [ebp+YPlane_arg] ; Make YPlane accessible
mov ds:PD YPlane,ebx
mov ebx,PD [ebp+VPlane_arg] ; Make VPlane accessible.
mov ds:PD VPlane,ebx
mov ax,PW [ebp+FrameWidth_arg] ; Make FrameWidth accessible
mov ds:PD FrameWidth,eax
mov ax,PW [ebp+FrameHeight_arg] ; Make FrameHeight accessible
mov ds:PD FrameHeight,eax
mov ax,PW [ebp+YPitch_arg] ; Make YPitch accessible
mov ds:PD YPitch,eax
mov ax,PW [ebp+ColorConvertedFrame] ; Init CCOCursor
add eax,PD [ebp+DCIOffset]
mov ds:PD CCOCursor,eax
movsx ebx,PW [ebp+CCOPitch_arg] ; Make CCOPitch accessible
mov ds:PD CCOPitch,ebx
ENDIF
Ledx FrameHeight
Lebx CCOPitch
shr ebx,2 ; UV pitch for the output
Lecx YPitch
add ebx,3 ; Pitch is always a multiple of 4.
Lebp CCOPitch
and ebx,0FFFFFFFCH
Lesi YPlane ; Fetch cursor over luma plane.
Sebx CCOUVPitch
Leax CCOCursor
imul ecx,edx ; ecx: size of Y input.
imul ebp,edx ; ebp: was CCOPitch, now size of Y output.
imul ebx,edx ; ebp: size of U/V output (times 4).
add ecx,esi ; ecx: Ylimit
add eax,ebp ; eax was CCOCursor, now CCOVCursor
Secx YLimit
Seax CCOVCursor
sar ebx,2 ; ebx: UVsize of output
Lecx FrameWidth ; ecx: Y frame width
add esi,ecx ; esi: end of first input Y
add eax,ebx ; eax: now CCOUCursor
shr ecx,2
Seax CCOUCursor
Lebp VPlane ; ebp Vplane input
Ledx YPitch
lea esi,[edx+esi] ; End of Y line 1
add ebp,ecx ; end of Vline
Sesi YLine1Limit
add eax,ebx ; CCO Skip Blocks
Sebp VLimit ; UV width for input
Seax CCOSkipCursor
; Prepare the UV contribution to decide the skip blocks, and copy chroma
; planes at the same time.
;
; Register usage:
;
; esi: V plane input pointer
; edi; V output pointer
; ebp: U output pointer
; edx: Y plane input pointer
; ecx: V limit
; ebx: Work area for U
; eax: Work area for V
ChromaPrep:
Ledi CCOVCursor
Lebp CCOUCursor
Ledx YPlane
Leax YPitch
Lesi VPlane
Lecx VLimit
sub edi,esi ; make edi offset to esi.
sub ebp,esi ; make ebp offset to esi to save inc in the loop.
lea edx,[eax+edx-1296] ; make edx point at place for chroma prep.
mov eax,PD [esi] ; fetch four V
add eax,eax ; Change to 8-bit. (Low bit undef, usually 0).
ChromaLoop:
mov Ze PD[esi+edi*1],eax ; Store four V.
mov ebx,PD [esi+UOFFSET] ; fetch four U
add esi,4
mov PD [edx],eax ; Store four V to chroma-prep line in Y frame.
add edx,16 ; Advance chroma-prep cursor.
add ebx,ebx ; Change to 8-bit. (Low bit undef, usually 0).
mov Ze PD[esi+ebp*1-4],ebx ; Store four U.
mov eax,PD [esi] ; fetch next four V.
add eax,eax ; Change to 8-bit. (Low bit undef, usually 0).
mov PD [edx-12],ebx ; Store four U to chroma-prep line in Y frame.
mov bl,Ze PB [esi+edi*1] ; Pre-load output cache line
cmp esi,ecx
mov bl,Ze PB [esi+ebp*1] ; Pre-load output cache line
jb ChromaLoop
; update chroma pointers.
add ecx,VPITCH
Lebx CCOUVPitch
Ledi CCOVCursor
Lebp CCOUCursor
Secx VLimit
add edi,ebx ; update V output ptr to the next line
Leax VPlane
add ebp,ebx ; update U output ptr to the next line
Sedi CCOVCursor
add eax,VPITCH
Sebp CCOUCursor
Seax VPlane
; now do Luma a row of 4x4 blocks
;
; register usage:
;
; esi: Y cursor
; edi: CCOCursor
; ebp: counts down 4 lines of luma.
; ecx: counts down frame width.
; ebx: Y Pitch.
; eax: Work area.
; copy a row of 4x4 luma
Lesi YPlane
Lecx FrameWidth
Ledi CCOCursor
add esi,ecx
neg ecx
Lebx YPitch
sub edi,ecx
mov eax,PD[esi+ecx] ; Fetch 4 Y pels.
add eax,eax ; Make them 8-bit. Low bit undef, but usually 0.
mov ebp,4
YLoop:
mov Ze PD[edi+ecx],eax ; Store them to IF09 output, Y plane.
mov eax,PD[esi+ecx+4] ; Fetch 4 Y pels.
add eax,eax ; Make them 8-bit. Low bit undef, but usually 0.
add ecx,4 ; Advance induction variable.
jl YLoop
YLoopDone:
Lecx FrameWidth
add esi,ebx
add edi,ecx
neg ecx
mov eax,PD[esi+ecx] ; Fetch 4 Y pels.
add eax,eax ; Make them 8-bit. Low bit undef, but usually 0.
dec ebp
jne YLoop
add edi,ecx
Sedi CCOCursor ; save the output ptr for next four lines
; Build the skip block mask
;
; Register usage:
;
; esi: Y ptr
; edi: Mask Ptr
; ebp: Y Pitch
; edx: mask
; ecx: Archive value
; ebx: UV contribution
; eax: Dword of Y pels
;
; Y starts with Line 1 of 4x4 blocks, since UV pattern has been saved
; relative to line 1.
Lesi YPlane
Lebp YPitch
Ledi CCOSkipCursor
add esi,ebp ; esi point at line 1 of luma
BuildSkipDescrLoop:
mov ebx,PD [esi-1296] ; Fetch 4 U's; byte0 corresponds to this Y.
mov eax,PD [esi-1292] ; Fetch 4 V's; byte0 corresponds to this Y.
shl ebx,11 ; Position U.
and eax,0000000FCH ; Extract 6 bits of V.
and ebx,00007E000H ; Extract 6 bits of U.
mov edx,PD [esi+ebp*2] ; Line 3 of luma first.
and edx,07E7E7E7EH ; Use 6 bits of Y to save more xfer cycles.
mov ecx,PD[esi+ebp*2+YARCHIVEOFFSET] ; Fetch archive for previous frame
lea ebx,[ebx+eax*8] ; Build UV.
mov eax,PD[esi+ebp*1] ; line 2 of luma.
add edx,ebx ; combine Y with UV pattern
and eax,07E7E7E7EH
mov PD[esi+ebp*2+YARCHIVEOFFSET],edx ; save the current in the archive
sub ecx,edx ; compare with the previous archive
add ecx,-1 ; CF == 1 iff curr differs from prev
lea eax,[eax+ebx]
sbb edx,edx ; edx == -1 iff different, else 0.
mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET]
mov PD[esi+ebp*1+YARCHIVEOFFSET],eax
sub ecx,eax
mov eax,PD[esi]
sub esi,ebp ; Gain acces to line 0.
and eax,07E7E7E7EH
add ecx,-1
adc edx,edx ; edx[0] == 1 if different, else 0.
mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET]
add eax,ebx
lea edi,[edi+1]
sub ecx,eax
mov PD[esi+ebp*1+YARCHIVEOFFSET],eax
mov eax,PD[esi]
add ecx,-1
adc edx,edx
and eax,07E7E7E7EH
mov ecx,PD[esi+YARCHIVEOFFSET]
add eax,ebx
sub ecx,eax
Lebx YLine1Limit
mov PD[esi+YARCHIVEOFFSET],eax
add ecx,-1
lea esi,[esi+ebp+4] ; jump to line 1 of next 4x4 block
adc edx,edx
xor edx,0FFFFFFFFH ; edx[4:31] = 0. edx[0,1,2,3] == 1 if skip dword.
cmp esi,ebx ; check the end of line 1 of Y
mov Ze PB[edi-1],dl ; write to the skip block buffer
je BuildSkipDescrLoopDone
mov ebx,PD [esi-1300] ; Fetch 4 U's; byte1 corresponds to this Y.
mov eax,PD [esi-1296] ; Fetch 4 V's; byte1 corresponds to this Y.
shl ebx,11 ; Position U.
and eax,00000FC00H ; Extract 6 bits of V.
and ebx,007E00000H ; Extract 6 bits of U.
mov edx,PD [esi+ebp*2] ; Line 3 of luma first.
and edx,07E7E7E7EH ; Use 6 bits of Y to save more xfer cycles.
mov ecx,PD[esi+ebp*2+YARCHIVEOFFSET] ; Fetch archive for previous frame
lea ebx,[ebx+eax*8] ; Build UV.
mov eax,PD[esi+ebp*1] ; line 2 of luma.
add edx,ebx ; combine Y with UV pattern
and eax,07E7E7E7EH
mov PD[esi+ebp*2+YARCHIVEOFFSET],edx ; save the current in the archive
sub ecx,edx ; compare with the previous archive
add ecx,-1 ; CF == 1 iff curr differs from prev
lea eax,[eax+ebx]
sbb edx,edx ; edx == -1 iff different, else 0.
mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET]
mov PD[esi+ebp*1+YARCHIVEOFFSET],eax
sub ecx,eax
mov eax,PD[esi]
sub esi,ebp ; Gain acces to line 0.
and eax,07E7E7E7EH
add ecx,-1
adc edx,edx ; edx[0] == 1 if different, else 0.
mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET]
add eax,ebx
lea edi,[edi+1]
sub ecx,eax
mov PD[esi+ebp*1+YARCHIVEOFFSET],eax
mov eax,PD[esi]
add ecx,-1
adc edx,edx
and eax,07E7E7E7EH
mov ecx,PD[esi+YARCHIVEOFFSET]
add eax,ebx
sub ecx,eax
Lebx YLine1Limit
mov PD[esi+YARCHIVEOFFSET],eax
add ecx,-1
lea esi,[esi+ebp+4] ; jump to line 1 of next 4x4 block
adc edx,edx
xor edx,0FFFFFFFFH ; edx[4:31] = 0. edx[0,1,2,3] == 1 if skip dword.
cmp esi,ebx ; check the end of line 1 of Y
mov Ze PB[edi-1],dl ; write to the skip block buffer
je BuildSkipDescrLoopDone
mov ebx,PD [esi-1304] ; Fetch 4 U's; byte2 corresponds to this Y.
mov eax,PD [esi-1300] ; Fetch 4 V's; byte2 corresponds to this Y.
shr ebx,5 ; Position U.
and eax,000FC0000H ; Extract 6 bits of V.
and ebx,00007E000H ; Extract 6 bits of U.
mov edx,PD [esi+ebp*2] ; Line 3 of luma first.
and edx,07E7E7E7EH ; Use 6 bits of Y to save more xfer cycles.
mov ecx,PD[esi+ebp*2+YARCHIVEOFFSET] ; Fetch archive for previous frame
lea ebx,[ebx+eax*8] ; Build UV.
mov eax,PD[esi+ebp*1] ; line 2 of luma.
add edx,ebx ; combine Y with UV pattern
and eax,07E7E7E7EH
mov PD[esi+ebp*2+YARCHIVEOFFSET],edx ; save the current in the archive
sub ecx,edx ; compare with the previous archive
add ecx,-1 ; CF == 1 iff curr differs from prev
lea eax,[eax+ebx]
sbb edx,edx ; edx == -1 iff different, else 0.
mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET]
mov PD[esi+ebp*1+YARCHIVEOFFSET],eax
sub ecx,eax
mov eax,PD[esi]
sub esi,ebp ; Gain acces to line 0.
and eax,07E7E7E7EH
add ecx,-1
adc edx,edx ; edx[0] == 1 if different, else 0.
mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET]
add eax,ebx
lea edi,[edi+1]
sub ecx,eax
mov PD[esi+ebp*1+YARCHIVEOFFSET],eax
mov eax,PD[esi]
add ecx,-1
adc edx,edx
and eax,07E7E7E7EH
mov ecx,PD[esi+YARCHIVEOFFSET]
add eax,ebx
sub ecx,eax
Lebx YLine1Limit
mov PD[esi+YARCHIVEOFFSET],eax
add ecx,-1
lea esi,[esi+ebp+4] ; jump to line 1 of next 4x4 block
adc edx,edx
xor edx,0FFFFFFFFH ; edx[4:31] = 0. edx[0,1,2,3] == 1 if skip dword.
cmp esi,ebx ; check the end of line 1 of Y
mov Ze PB[edi-1],dl ; write to the skip block buffer
je BuildSkipDescrLoopDone
mov ebx,PD [esi-1308] ; Fetch 4 U's; byte3 corresponds to this Y.
mov eax,PD [esi-1304] ; Fetch 4 V's; byte3 corresponds to this Y.
shr ebx,5 ; Position U.
mov edx,PD [esi+ebp*2] ; Line 3 of luma first.
shr eax,26 ; Extract 6 bits of V.
and ebx,007E00000H ; Extract 6 bits of U.
and edx,07E7E7E7EH ; Use 6 bits of Y to save more xfer cycles.
mov ecx,PD[esi+ebp*2+YARCHIVEOFFSET] ; Fetch archive for previous frame
lea ebx,[ebx+eax*8] ; Build UV.
mov eax,PD[esi+ebp*1] ; line 2 of luma.
add edx,ebx ; combine Y with UV pattern
and eax,07E7E7E7EH
mov PD[esi+ebp*2+YARCHIVEOFFSET],edx ; save the current in the archive
sub ecx,edx ; compare with the previous archive
add ecx,-1 ; CF == 1 iff curr differs from prev
lea eax,[eax+ebx]
sbb edx,edx ; edx == -1 iff different, else 0.
mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET]
mov PD[esi+ebp*1+YARCHIVEOFFSET],eax
sub ecx,eax
mov eax,PD[esi]
sub esi,ebp ; Gain acces to line 0.
and eax,07E7E7E7EH
add ecx,-1
adc edx,edx ; edx[0] == 1 if different, else 0.
mov ecx,PD[esi+ebp*1+YARCHIVEOFFSET]
add eax,ebx
lea edi,[edi+1]
sub ecx,eax
mov PD[esi+ebp*1+YARCHIVEOFFSET],eax
mov eax,PD[esi]
add ecx,-1
adc edx,edx
and eax,07E7E7E7EH
mov ecx,PD[esi+YARCHIVEOFFSET]
add eax,ebx
sub ecx,eax
Lebx YLine1Limit
mov PD[esi+YARCHIVEOFFSET],eax
add ecx,-1
lea esi,[esi+ebp+4] ; jump to line 1 of next 4x4 block
adc edx,edx
xor edx,0FFFFFFFFH ; edx[4:31] = 0. edx[0,1,2,3] == 1 if skip dword.
cmp esi,ebx ; check the end of line 1 of Y
mov Ze PB[edi-1],dl ; write to the skip block buffer
jne BuildSkipDescrLoop
BuildSkipDescrLoopDone:
add edi,3 ; Round to next dword.
lea ebx,[ebx+ebp*4] ; update YLine1Limit for next row of blocks
and edi,0FFFFFFFCH
Lesi YPlane
Sedi CCOSkipCursor
Sebx YLine1Limit
lea esi,[esi+ebp*4]
Leax YLimit
Sesi YPlane
cmp esi,eax
jl ChromaPrep
IFDEF WIN32
add esp,LocalFrameSize
ELSE
pop ebx
mov ds,ebx
ENDIF
pop ebx
pop ebp
pop edi
pop esi
rturn
YUV12ToIF09 endp
IFNDEF WIN32
SEGNAME ENDS
ENDIF
END