2020-09-30 16:53:55 +02:00

688 lines
24 KiB
NASM

;-------------------------------------------------------------------------
; INTEL Corporation Proprietary Information
;
; This listing is supplied under the terms of a license
; agreement with INTEL Corporation and may not be copied
; nor disclosed except in accordance with the terms of
; that agreement.
;
; Copyright (c) 1996 Intel Corporation.
; All Rights Reserved.
;
;-------------------------------------------------------------------------
;-------------------------------------------------------------------------
;//
;// $Header: S:\h26x\src\dec\cxm12162.asv
;//
;// $Log: S:\h26x\src\dec\cxm12162.asv $
;//
;// Rev 1.10 01 Apr 1997 12:51:50 BNICKERS
;// Fix bugs # 153 and 156 -- wrong color when U is small; right edge flickering
;//
;// Rev 1.9 09 Dec 1996 15:20:40 BECHOLS
;// Brian fixed ARC bug #94.
;//
;// Rev 1.8 06 Sep 1996 16:07:58 BNICKERS
;// Re-written to filter new points.
;//
;-------------------------------------------------------------------------
;
; +---------- Color convertor.
; |+--------- For both H261 and H263.
; ||+-------- Version for Intel Microprocessors with MMX Technology
; |||++------ Convert from YUV12.
; |||||++---- Convert to RGB16.
; |||||||+--- Zoom by two.
; ||||||||
; cxm12162 -- This function performs zoom-by-2 YUV12-to-RGB16 color conversion
; for H26x. It is tuned for best performance on Intel
; Microprocessors with MMX Technology. It handles any format in
; which there are three fields, the low order field being B and
; starting in bit 0, the second field being G, and the high order
; field being R. Present support for 555, 565, 655, and 664
; formats only. This version adds new rows and columns by
; averaging them with the originals to either side.
;
; The YUV12 input is planar, 8 bits per pel. The Y plane may have
; a pitch of up to 768. It may have a width less than or equal
; to the pitch. It must be QWORD aligned. Pitch and Width must
; be a multiple of eight. Height may be any amount, but must be
; a multiple of two. The U and V planes may have a different
; pitch than the Y plane, subject to the same limitations.
;
; The color convertor is non-destructive; the input Y, U, and V
; planes will not be clobbered.
OPTION PROLOGUE:None
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
include ccinst.inc
.xlist
include iammx.inc
include memmodel.inc
.list
;-------------------------------------------------------------------------
; Lookup tables used by MMX_YUV12ToRGB16ZoomBy2.
;
; All three tables have 256 entries and are indexed directly by the raw
; 8-bit sample (Y, U or V).  The chroma tables are built from CNT = sample-128.
; The coefficient ratios are 04A7FH/04000H ~= 1.164 (luma),
; 00C83H/02000H ~= 0.391 (U->G), 0408BH/02000H ~= 2.017 (U->B),
; 01A04H/02000H ~= 0.813 (V->G) and 03312H/02000H ~= 1.596 (V->R).
;-------------------------------------------------------------------------
MMXCCDATA SEGMENT PAGE
ALIGN 16

; Luma table: one QWORD per Y value, fetched as Luma0020004000200000[Y*8].
; Each entry holds four words, lowest first: 0, w/2, w, w/2, where
; w = (Y-16)*04A7FH/0100H.  Viewed as a QWORD that is < w/2  w  w/2  0 >,
; i.e. the "32, 64, 32, 0" weights the label name spells out in hex.
Luma0020004000200000 LABEL DWORD
; Y = 0..15: clamp to zero.
REPEAT 16
DD 0, 0
ENDM
; Y = 16..234: linear ramp (219 entries).
CNT = 0
REPEAT 219
DW 0
DW (CNT*04A7FH)/00200H
DW (CNT*04A7FH)/00100H
DW (CNT*04A7FH)/00200H
CNT = CNT + 1
ENDM
; Y = 235..255: clamp to the maximum (21 entries).
REPEAT 21
DW 00000H
DW 01FFFH
DW 03FFFH
DW 01FFFH
ENDM

; U table: one DWORD per U value, fetched as UContribToBandG[U*4].
; Low word  = U contribution to G, -(CNT*00C83H)/040H.
; High word = U contribution to B,  (CNT*0408BH)/040H.
; For CNT = -128, -127 and +127 the B term would fall outside the signed
; 16-bit range, so those three entries pin B to 08000H / 07FFFH instead.
UContribToBandG LABEL DWORD
DW -(-128*0C83H)/00040H
DW 08000H
DW -(-127*0C83H)/00040H
DW 08000H
CNT = -126
REPEAT 253
DW -(CNT*00C83H)/00040H
DW (CNT*0408BH)/00040H
CNT = CNT + 1
ENDM
; CNT = +127.  The G term must carry the same leading minus sign as every
; other entry in this table; it was previously emitted as a positive value,
; which made U = 255 add to G instead of subtracting from it.
DW -(127*0C83H)/00040H
DW 07FFFH

; V table: one DWORD per V value, fetched as VContribToRandG[V*4].
; Low word  = V contribution to G, -(CNT*01A04H)/040H.
; High word = V contribution to R,  (CNT*03312H)/040H.
VContribToRandG LABEL DWORD
CNT = -128
REPEAT 256
DW -(CNT*01A04H)/00040H
DW (CNT*03312H)/00040H
CNT = CNT + 1
ENDM
MMXCCDATA ENDS
; Start of the code segment.  The ASSUME directives tell the assembler that
; each of ds, es, fs, gs and ss may be used to address the FLAT group.
.CODE
ASSUME ds : FLAT
ASSUME es : FLAT
ASSUME fs : FLAT
ASSUME gs : FLAT
ASSUME ss : FLAT
; void FAR ASM_CALLTYPE YUV12ToRGB16ZoomBy2 (U8 * YPlane,
; U8 * VPlane,
; U8 * UPlane,
; UN FrameWidth,
; UN FrameHeight,
; UN YPitch,
; UN VPitch,
; UN AspectAdjustmentCount,
; U8 * ColorConvertedFrame,
; U32 DCIOffset,
; U32 CCOffsetToLine0,
; IN CCOPitch,
; IN CCType)
;
; CCOffsetToLine0 is relative to ColorConvertedFrame.
;
; due to the need for the ebp reg, these parameter declarations aren't used,
; they are here so the assembler knows how many bytes to relieve from the stack
PUBLIC MMX_YUV12ToRGB16ZoomBy2
;-------------------------------------------------------------------------
; MMX_YUV12ToRGB16ZoomBy2
;
; Converts a planar YUV12 frame to 16-bit RGB while doubling it in both
; directions.  New columns and rows are the average of their neighbours.
;
; Arguments (all DWORDs on the stack, read through edi after the pushes):
;   YPlane, VPlane, UPlane   -- input plane addresses
;   FrameWidth, FrameHeight  -- size of the Y plane in pels
;   YPitch, VPitch           -- byte pitch of the Y plane and chroma planes
;   AspectAdjustmentCount    -- every time the per-line countdown that is
;                               loaded from this value reaches zero, one
;                               output line is dropped
;   ColorConvertedFrame, DCIOffset, CCOffsetToLine0
;                            -- summed to give the first output address
;   CCOPitch                 -- byte pitch of the output
;   CCType                   -- selects the 555 / 565 / 655 / 664 layout
;
; Preserves esi, edi, ebp and ebx (pushed on entry, popped at Done).
; Uses eax, ecx, edx and mm0-mm7 as scratch.  No EMMS is issued in the code
; visible here; "rturn" is a macro defined outside this file.
;
; Stack: esp is rounded down to a 4K boundary and LocalFrameSize bytes are
; reserved below it.  Each page is touched in turn while esp is lowered.
; The caller's esp is kept in StashESP and restored at Done.
;
; Work area layout: starting at esp+64 there is one 40-byte record for each
; pair of input pels (four output pels).  A record holds five QWORDs:
; FiltLine0..FiltLine3 (the four vertically filtered output lines) and
; HFiltLinePrev (the horizontally filtered second input line, kept for the
; next pass).  Records are right-justified in a MAXWIDTH-wide area, so all
; loops run their index from StartIndexOfYLine up to MAXWIDTH*20.
;
; Filtered lines produced per pair of input lines L0, L1 (LP = previous L1):
;   FiltLine0 = 2*LP,  FiltLine1 = LP+L0,  FiltLine2 = 2*L0,  FiltLine3 = L0+L1
; The first pass has no LP and emits only FiltLine2/3; after the last input
; pair a final pass emits 2*LP twice.
;-------------------------------------------------------------------------
MMX_YUV12ToRGB16ZoomBy2 proc DIST LANG AYPlane: DWORD,
AVPlane: DWORD,
AUPlane: DWORD,
AFrameWidth: DWORD,
AFrameHeight: DWORD,
AYPitch: DWORD,
AVPitch: DWORD,
AAspectAdjustmentCnt: DWORD,
AColorConvertedFrame: DWORD,
ADCIOffset: DWORD,
ACCOffsetToLine0: DWORD,
ACCOPitch: DWORD,
ACCType: DWORD

MAXWIDTH = 768
LocalFrameSize = MAXWIDTH*20+128+64
RegisterStorageSize = 16

; Arguments: offsets from esp as it stands after the four register pushes
; (16 bytes of saved registers, then the 4-byte return address).
YPlane_arg = RegisterStorageSize + 4
VPlane_arg = RegisterStorageSize + 8
UPlane_arg = RegisterStorageSize + 12
FrameWidth_arg = RegisterStorageSize + 16
FrameHeight = RegisterStorageSize + 20
YPitch_arg = RegisterStorageSize + 24
ChromaPitch_arg = RegisterStorageSize + 28
AspectAdjustmentCount_arg = RegisterStorageSize + 32
ColorConvertedFrame = RegisterStorageSize + 36
DCIOffset = RegisterStorageSize + 40
CCOffsetToLine0 = RegisterStorageSize + 44
CCOPitch_arg = RegisterStorageSize + 48
CCType = RegisterStorageSize + 52
EndOfArgList = RegisterStorageSize + 56

; Locals (on local stack frame)
; Per-format constants live in the first 64 bytes of the frame.
DitherB EQU [esp+ 0]
DitherG EQU [esp+ 8]
DitherR EQU [esp+ 16]
SelectBBits EQU [esp+ 24]
SelectGBits EQU [esp+ 32]
SelectRBits EQU [esp+ 40]
ShiftCountForB EQU [esp+ 48]
ShiftCountForG EQU [esp+ 52]
ShiftCountForR EQU [esp+ 56]
CCOCursor EQU [esp+ 60]
; Scalar locals sit above the filtered-luma work area.
CCOPitch EQU [esp+MAXWIDTH*20+128+ 0]
YCursor EQU [esp+MAXWIDTH*20+128+ 4]
YLimit EQU [esp+MAXWIDTH*20+128+ 8]
YPitch EQU [esp+MAXWIDTH*20+128+12]
UCursor EQU [esp+MAXWIDTH*20+128+16]
DistanceFromUToV EQU [esp+MAXWIDTH*20+128+20]
ChromaPitch EQU [esp+MAXWIDTH*20+128+24]
AspectCount EQU [esp+MAXWIDTH*20+128+28]
AspectAdjustmentCount EQU [esp+MAXWIDTH*20+128+32]
StartIndexOfYLine EQU [esp+MAXWIDTH*20+128+36]
StashESP EQU [esp+MAXWIDTH*20+128+40]
; First 40-byte record of the work area.  PrepareForNextOutputLine tells the
; four filtered lines apart by testing bits 3 and 4 of their address, which
; is why FiltLine0 has to sit on a 32-byte boundary.
FiltLine0 EQU [esp+ 64] ; Must be 32 byte aligned.
FiltLine1 EQU [esp+ 72]
FiltLine2 EQU [esp+ 80]
FiltLine3 EQU [esp+ 88]
HFiltLinePrev EQU [esp+ 96]

; ---- Prologue: save registers, build the aligned local frame. ----
        push    esi
        push    edi
        push    ebp
        push    ebx
        mov     edi,esp                 ; edi = caller frame; args are [edi+..._arg].
        and     esp,0FFFFF000H          ; Round down to a 4K page boundary.
        sub     esp,4096                ; Lower esp one page at a time, reading
        mov     eax,[esp]               ; from each page as it is reached.
        sub     esp,4096
        mov     eax,[esp]
        sub     esp,4096
        mov     eax,[esp]
        sub     esp,LocalFrameSize-12288
        mov     eax,[esp]

; ---- Copy arguments into locals. ----
        mov     eax,MAXWIDTH            ; Records are right-justified in the
        sub     eax,[edi+FrameWidth_arg] ; work area: start index is
        imul    eax,20                  ; (MAXWIDTH - FrameWidth) * 20 bytes.
        mov     StartIndexOfYLine,eax
        mov     eax,[edi+YPlane_arg]
        mov     YCursor,eax
        mov     ebx,[edi+YPitch_arg]
        mov     YPitch,ebx
        mov     ecx,[edi+FrameHeight]
        imul    ebx,ecx
        add     eax,ebx
        mov     YLimit,eax              ; YLimit = YPlane + YPitch*FrameHeight.
        mov     eax,[edi+UPlane_arg]
        mov     ebx,[edi+VPlane_arg]
        mov     UCursor,eax
        sub     ebx,eax
        mov     DistanceFromUToV,ebx    ; V is addressed as [U cursor + distance].
        mov     eax,[edi+ColorConvertedFrame]
        add     eax,[edi+DCIOffset]
        add     eax,[edi+CCOffsetToLine0]
        mov     CCOCursor,eax
        mov     eax,[edi+ChromaPitch_arg]
        mov     ChromaPitch,eax
        mov     eax,[edi+CCOPitch_arg]
        mov     CCOPitch,eax
        mov     eax,[edi+AspectAdjustmentCount_arg]
        mov     AspectAdjustmentCount,eax
        mov     AspectCount,eax
        mov     StashESP,edi

; ---- Select the per-format dither, mask and shift constants. ----
        mov     eax,[edi+CCType]
        cmp     eax,CCTYPE_RGB16555ZoomBy2
        je      CCTypeIs555
        cmp     eax,CCTYPE_RGB16555ZoomBy2DCI
        je      CCTypeIs555
        cmp     eax,CCTYPE_RGB16565ZoomBy2
        je      CCTypeIs565
        cmp     eax,CCTYPE_RGB16565ZoomBy2DCI
        je      CCTypeIs565
        cmp     eax,CCTYPE_RGB16655ZoomBy2
        je      CCTypeIs655
        cmp     eax,CCTYPE_RGB16655ZoomBy2DCI
        je      CCTypeIs655
        cmp     eax,CCTYPE_RGB16664ZoomBy2DCI
        je      CCTypeIs664
        cmp     eax,CCTYPE_RGB16664ZoomBy2
        je      CCTypeIs664
; Unrecognised CCType: YCursor is overwritten with 0DEADBEEFH and control
; falls into the 555 setup, so the first luma fetch below uses that address.
; NOTE(review): this looks like a deliberate trap -- confirm before changing.
        mov     eax,0DEADBEEFH
        mov     YCursor,eax

; NOTE(review): across the four formats the dither word is 0200H for 5-bit
; fields, 0400H for 6-bit fields and 0100H for the 4-bit field -- confirm
; that this relationship between field width and dither size is intended.

CCTypeIs555:
        mov     eax,000000200H          ; Dither pattern.
        mov     ebx,002000000H          ; Same magnitude, other word of the pair.
        mov     DitherB,eax
        mov     DitherB+4,eax
        mov     DitherG,ebx
        mov     DitherG+4,ebx
        mov     DitherR,eax
        mov     DitherR+4,eax
        mov     eax,003E003E0H          ; Bits to extract for fields
        mov     ebx,07C007C00H
        mov     SelectGBits,eax
        mov     SelectGBits+4,eax
        mov     SelectRBits,ebx
        mov     SelectRBits+4,ebx
        mov     eax,0001F001FH
        xor     ecx,ecx                 ; Left shift count for R
        mov     SelectBBits,eax
        mov     SelectBBits+4,eax
        mov     eax,10                  ; Right shift count for B
        mov     ebx,5                   ; Right shift count for G
        mov     ShiftCountForB,eax
        mov     ShiftCountForG,ebx
        mov     ShiftCountForR,ecx
        jmp     CCTypeInitialized

CCTypeIs565:
        mov     eax,000000200H          ; Dither pattern.
        mov     ebx,004000000H
        mov     DitherB,eax
        mov     DitherB+4,eax
        mov     DitherG,ebx
        mov     DitherG+4,ebx
        mov     DitherR,eax
        mov     DitherR+4,eax
        mov     eax,007E007E0H          ; Bits to extract for fields
        mov     ebx,0F800F800H
        mov     SelectGBits,eax
        mov     SelectGBits+4,eax
        mov     SelectRBits,ebx
        mov     SelectRBits+4,ebx
        mov     eax,0001F001FH
        mov     ecx,1                   ; Left shift count for R
        mov     SelectBBits,eax
        mov     SelectBBits+4,eax
        mov     eax,10                  ; Right shift count for B
        mov     ebx,4                   ; Right shift count for G
        mov     ShiftCountForB,eax
        mov     ShiftCountForG,ebx
        mov     ShiftCountForR,ecx
        jmp     CCTypeInitialized

CCTypeIs655:
        mov     eax,000000200H          ; Dither pattern.
        mov     ebx,004000000H
        mov     DitherB,eax
        mov     DitherB+4,eax
        mov     DitherG,eax
        mov     DitherG+4,eax
        mov     DitherR,ebx
        mov     DitherR+4,ebx
        mov     eax,003E003E0H          ; Bits to extract for fields
        mov     ebx,0FC00FC00H
        mov     SelectGBits,eax
        mov     SelectGBits+4,eax
        mov     SelectRBits,ebx
        mov     SelectRBits+4,ebx
        mov     eax,0001F001FH
        mov     ecx,1                   ; Left shift count for R
        mov     SelectBBits,eax
        mov     SelectBBits+4,eax
        mov     eax,10                  ; Right shift count for B
        mov     ebx,5                   ; Right shift count for G
        mov     ShiftCountForB,eax
        mov     ShiftCountForG,ebx
        mov     ShiftCountForR,ecx
        jmp     CCTypeInitialized

CCTypeIs664:
        mov     eax,000000400H          ; Dither pattern.
        mov     ebx,001000000H
        mov     DitherB,ebx
        mov     DitherB+4,ebx
        mov     DitherG,eax
        mov     DitherG+4,eax
        mov     DitherR,eax
        mov     DitherR+4,eax
        mov     eax,003F003F0H          ; Bits to extract for fields
        mov     ebx,0FC00FC00H
        mov     SelectGBits,eax
        mov     SelectGBits+4,eax
        mov     SelectRBits,ebx
        mov     SelectRBits+4,ebx
        mov     eax,0000F000FH
        mov     ecx,1                   ; Left shift count for R
        mov     SelectBBits,eax
        mov     SelectBBits+4,eax
        mov     eax,11                  ; Right shift count for B
        mov     ebx,5                   ; Right shift count for G
        mov     ShiftCountForB,eax
        mov     ShiftCountForG,ebx
        mov     ShiftCountForR,ecx

; ---- First pass: filter input lines 0 and 1; only FiltLine2/3 are made. ----
CCTypeInitialized:
        mov     esi,YCursor
        mov     ebp,YPitch
        mov     edi,StartIndexOfYLine
        xor     eax,eax
        lea     edx,[esi+ebp*2]
        xor     ebx,ebx
        mov     YCursor,edx             ; Next pass starts two input lines down.
        mov     bl,[esi+ebp*1]          ; Get Y10 (a of line L3; for left edge).
        mov     al,[esi]                ; Get Y00 (A of line L2; for left edge).
        movq    mm1,Luma0020004000200000[ebx*8] ; L1:< 32a 64a 32a 0 >
        mov     bl,[esi+ebp*1+2]        ; Get c.
        movq    mm0,Luma0020004000200000[eax*8] ; L0:< 32A 64A 32A 0 >
        mov     al,[esi+2]              ; Get C.

; esi -- Cursor over input line of Y.
; edi -- Index to lines of filtered Y. Quit when MAXWIDTH*20.
; ebp -- Pitch from one line of Y to the next.
; al, bl -- Y pels
; mm0 -- For line 0, contribution of pel to left of two pels under cursor now.
; mm1 -- For line 1, contribution of pel to left of two pels under cursor now.
; mm2-mm6 -- Scratch.
Next2PelsOfFirst2LumaLines:
        movq    mm3,Luma0020004000200000[ebx*8] ; L1:< 32c 64c 32c 0 >
        psrlq   mm1,32                  ; L1:< 0 0 32a 64a >
        movq    mm2,Luma0020004000200000[eax*8] ; L0:< 32C 64C 32C 0 >
        punpckldq mm1,mm3               ; L1:< 32c 0 32a 64a >
        xor     ebx,ebx
        xor     eax,eax
        mov     bl,[esi+ebp*1+1]        ; Get b.
        psrlq   mm0,32                  ; L0:< 0 0 32A 64A >
        mov     al,[esi+1]              ; Get B.
        add     edi,40                  ; Inc filtered luma temp stg idx.
        paddw   mm1,Luma0020004000200000[ebx*8] ; L1:< 32b+32c 64b 32a+32b 64a >
        punpckldq mm0,mm2               ; L0:< 32C 0 32A 64A >
        paddw   mm0,Luma0020004000200000[eax*8] ; L0:< 32B+32C 64B 32A+32B 64A >
        movq    HFiltLinePrev[edi-40],mm1 ; Save L1 as next iters LPrev.
        paddw   mm1,mm0                 ; L0+L1
        paddw   mm0,mm0                 ; 2L0
        add     esi,2                   ; Increment input index.
        movq    FiltLine3[edi-40],mm1   ; Save filtered line L0+L1.
        movq    mm1,mm3                 ; Next iters a.
        movq    FiltLine2[edi-40],mm0   ; Save filtered line 2L0.
        movq    mm0,mm2                 ; Next iters A.
        mov     bl,[esi+ebp*1+2]        ; Get c.
        cmp     edi,MAXWIDTH*20-40      ; Done yet.
        mov     al,[esi+2]              ; Get C.
        jl      Next2PelsOfFirst2LumaLines
; The next record, if any, is the last one on the line.  It has no pel to its
; right, so its own right-hand pel (offset +1) is fetched again to serve as c.
; (The upper bits of ebx are already zero here, and ecx is not read again
; before it is reloaded, so the two xors do not affect the result.)
        xor     ebx,ebx
        xor     ecx,ecx
        mov     bl,[esi+ebp*1+1]        ; Get c (right-edge substitute).
        cmp     edi,MAXWIDTH*20         ; Done yet.
        mov     al,[esi+1]              ; Get C (right-edge substitute).
        jl      Next2PelsOfFirst2LumaLines
        mov     ebp,DistanceFromUToV
        lea     eax,FiltLine2           ; First pass outputs lines 2 and 3 only.
        mov     esi,UCursor
        mov     edx,StartIndexOfYLine
        jmp     DoOutputLine

; ---- Final pass: no more input; output the last line (2LP) twice. ----
; Entered from Next4OutputLines with the flags of "cmp esi,ecx" (Y cursor
; against YLimit) still intact: mov and lea do not change flags.
Last2OutputLines:
        mov     ebp,DistanceFromUToV
        lea     esi,[edi+40]            ; Value is not used; esi is reloaded below.
        ja      Done                    ; Cursor beyond YLimit: all lines are out.
; edi -- Index to lines of filtered Y. Quit when MAXWIDTH*20.
; mm0-mm6 -- Scratch.
        movq    mm0,HFiltLinePrev[edi]  ; Fetch horizontally filtered line LP.
        paddw   mm0,mm0                 ; 2LP
Next2PelsOfLast2LumaLines:
        movq    FiltLine3[edi],mm0      ; Save horz and vert filt line 2LP.
        movq    FiltLine2[edi],mm0      ; Save horz and vert filt line 2LP.
        movq    mm0,HFiltLinePrev[edi+40] ; Fetch horizontally filtered line LP.
        add     edi,40
        paddw   mm0,mm0                 ; 2LP
        cmp     edi,MAXWIDTH*20         ; Done yet.
        jne     Next2PelsOfLast2LumaLines
        lea     eax,FiltLine2
        mov     edx,StartIndexOfYLine
        mov     esi,UCursor
        jmp     DoOutputLine

; ---- Steady state: filter two more input lines, making FiltLine0..3. ----
Next4OutputLines:
        mov     esi,YCursor
        mov     ebp,YPitch
        mov     edi,StartIndexOfYLine
        mov     ecx,YLimit
        lea     edx,[esi+ebp*2]
        xor     eax,eax
        mov     YCursor,edx
        xor     ebx,ebx
; Test for the end of the Y plane BEFORE touching it, so that the final two
; passes (cursor == YLimit and cursor > YLimit) never read past the plane.
; Last2OutputLines does not use al or bl.
        cmp     esi,ecx
        jae     Last2OutputLines
        mov     al,[esi]                ; Get Y00 (A of line L2; for left edge).
        mov     bl,[esi+ebp*1]          ; Get Y10 (a of line L3; for left edge).
        movq    mm1,Luma0020004000200000[ebx*8] ; L1:< 32a 64a 32a 0 >
        mov     bl,[esi+ebp*1+2]        ; Get c.
        movq    mm0,Luma0020004000200000[eax*8] ; L0:< 32A 64A 32A 0 >
        mov     al,[esi+2]              ; Get C.

; esi -- Cursor over input line of Y.
; edi -- Index to lines of filtered Y. Quit when MAXWIDTH*20.
; ebp -- Pitch from one line of Y to the next.
; al, bl -- Y pels
; mm0 -- For line 0, contribution of pel to left of two pels under cursor now.
; mm1 -- For line 1, contribution of pel to left of two pels under cursor now.
; mm2-mm6 -- Scratch.
Next2PelsOf2LumaLines:
        movq    mm3,Luma0020004000200000[ebx*8] ; L1:< 32c 64c 32c 0 >
        psrlq   mm1,32                  ; L1:< 0 0 32a 64a >
        movq    mm2,Luma0020004000200000[eax*8] ; L0:< 32C 64C 32C 0 >
        punpckldq mm1,mm3               ; L1:< 32c 0 32a 64a >
        movq    mm4,HFiltLinePrev[edi]  ; LP
        psrlq   mm0,32                  ; L0:< 0 0 32A 64A >
        xor     ebx,ebx
        xor     eax,eax
        mov     bl,[esi+ebp*1+1]        ; Get b.
        movq    mm5,mm4                 ; LP
        mov     al,[esi+1]              ; Get B.
        add     esi,2                   ; Increment input index.
        paddw   mm1,Luma0020004000200000[ebx*8] ; L1:< 32b+32c 64b 32a+32b 64a >
        punpckldq mm0,mm2               ; L0:< 32C 0 32A 64A >
        paddw   mm0,Luma0020004000200000[eax*8] ; L0:< 32B+32C 64B 32A+32B 64A >
        paddw   mm5,mm5                 ; 2LP
        movq    HFiltLinePrev[edi],mm1  ; Save L1 as next iters LPrev.
        paddw   mm4,mm0                 ; LP+L0
        movq    FiltLine0[edi],mm5      ; Save 2LP
        paddw   mm1,mm0                 ; L0+L1
        movq    FiltLine1[edi],mm4      ; Save LP+L0
        paddw   mm0,mm0                 ; 2L0
        movq    FiltLine3[edi],mm1      ; Save L0+L1
        movq    mm1,mm3                 ; Next iters a.
        movq    FiltLine2[edi],mm0      ; Save 2L0
        movq    mm0,mm2                 ; Next iters A.
        add     edi,40                  ; Inc filtered luma temp stg idx.
        mov     bl,[esi+ebp*1+2]        ; Get c.
        cmp     edi,MAXWIDTH*20-40      ; Done yet.
        mov     al,[esi+2]              ; Get C.
        jl      Next2PelsOf2LumaLines
; Right edge: as in the first pass, the last record reuses its own right-hand
; pel (offset +1) as c.  ecx (YLimit) is reloaded on the next pass.
        xor     ebx,ebx
        xor     ecx,ecx
        mov     bl,[esi+ebp*1+1]        ; Get c (right-edge substitute).
        cmp     edi,MAXWIDTH*20         ; Done yet.
        mov     al,[esi+1]              ; Get C (right-edge substitute).
        jl      Next2PelsOf2LumaLines
        mov     ebp,DistanceFromUToV
        mov     esi,UCursor
        lea     eax,FiltLine0
        mov     edx,StartIndexOfYLine

; ---- Emit one output line from the filtered line whose record base is eax. ----
; On entry: eax = address of FiltLineN, edx = StartIndexOfYLine,
;           esi = U cursor, ebp = distance from U to V.
DoOutputLine:
        mov     edi,CCOCursor
        mov     ecx,AspectCount
        dec     ecx                     ; If count is non-zero, we keep the line.
        mov     ebx,CCOPitch
        mov     AspectCount,ecx         ; (mov leaves the flags from dec intact.)
        je      SkipOutputLine
        add     ebx,edi
        xor     ecx,ecx
        mov     cl,[esi]                ; Fetch first U.
        add     eax,MAXWIDTH*20         ; Bias the base up and the index down so
        movdt   mm3,ShiftCountForB      ; the index reaches zero at end of line.
        pcmpeqw mm6,mm6
        movdt   mm0,UContribToBandG[ecx*4] ; < 0 0 Bu Gu >
        mov     cl,[esi+ebp*1]          ; Fetch first V.
        sub     edx,MAXWIDTH*20
        movdt   mm4,ShiftCountForG
        psllw   mm6,15                  ; Four words of -32768
        movdt   mm5,ShiftCountForR
        punpcklwd mm0,mm0               ; < Bu Bu Gu Gu >
        movq    mm7,SelectBBits
        mov     CCOCursor,ebx           ; Output cursor for the following line.
        jmp     StartDoOutputLine

; ebp -- Distance from U to V
; esi -- Cursor over U
; edi -- Cursor over output
; edx -- Index over Y storage area (negative, counts up to zero)
; eax -- Base address of Y line
; mm6 -- Four words of -32768, to clamp at floor.
; mm3, mm4, mm5 -- Shift counts to apply to B, G, and R.
; mm7 -- Mask selecting the B field.
DoNext4OutputPels:
        movq    [edi-8],mm2             ; Save 4 output pels.
        punpcklwd mm0,mm0               ; < Bu Bu Gu Gu >
StartDoOutputLine:
        movdt   mm2,VContribToRandG[ecx*4] ; < 0 0 Rv Gv >
        punpcklwd mm2,mm2               ; < Rv Rv Gv Gv >
        movq    mm1,mm0                 ; < junk junk Gu Gu >
        punpckhdq mm0,mm0               ; < Bu Bu Bu Bu >
        paddsw  mm0,[eax+edx]           ; < B B B B > with ceiling clamped.
        paddw   mm1,mm2                 ; < junk junk Guv Guv >
        paddsw  mm0,DitherB             ; B with dither added.
        punpckldq mm1,mm1               ; < Guv Guv Guv Guv >
        paddsw  mm1,[eax+edx]           ; < G G G G > with ceiling clamped.
        punpckhdq mm2,mm2               ; < Rv Rv Rv Rv >
        paddsw  mm1,DitherG             ; G with dither added.
        paddsw  mm0,mm6                 ; B with floor clamped.
        paddsw  mm2,[eax+edx]           ; < R R R R > with ceiling clamped.
        paddsw  mm1,mm6                 ; G with floor clamped.
        paddsw  mm2,DitherR             ; R with dither added.
        psrlw   mm0,mm3                 ; Position B bits.
        paddsw  mm2,mm6                 ; R with floor clamped.
        psrlw   mm1,mm4                 ; Position G bits.
        pand    mm1,SelectGBits         ; Eliminate fractional bits.
        psllw   mm2,mm5                 ; Position R bits.
        inc     esi                     ; Advance input cursor
        xor     ecx,ecx
        pand    mm2,SelectRBits         ; Eliminate fractional bits.
        pand    mm0,mm7
        mov     cl,[esi]                ; Fetch next U.
        add     edi,8                   ; Advance output cursor.
        por     mm2,mm0                 ; R and B combined.
        add     edx,40                  ; Increment Y index.
        movdt   mm0,UContribToBandG[ecx*4] ; < 0 0 Bu Gu > next iter.
        por     mm2,mm1                 ; Completed RGB16 for 4 output pels.
        mov     cl,[esi+ebp*1]          ; Fetch next V.
        jne     DoNext4OutputPels       ; Flags are from "add edx,40" above.
        movq    [edi-8],mm2             ; Save 4 output pels.
; Swap the two words of each dither pair so the pattern alternates by line.
        movq    mm0,DitherB             ; Reverse dither patterns.
        movq    mm1,DitherG
        psrlq   mm0,16
        movq    mm2,DitherR
        psrlq   mm1,16
        psrlq   mm2,16
        punpckldq mm0,mm0
        punpckldq mm1,mm1
        movq    DitherB,mm0
        punpckldq mm2,mm2
        movq    DitherG,mm1
        movq    DitherR,mm2

; ---- Step to the next filtered line; advance chroma between lines 1 and 2. ----
; FiltLine0 is 32-byte aligned and the lines are 8 bytes apart, so bits 3 and
; 4 of the new base identify which line comes next.
PrepareForNextOutputLine:
        mov     edx,StartIndexOfYLine
        add     eax,8-MAXWIDTH*20       ; Advance to next filtered line of Y.
        mov     esi,UCursor
        test    al,8                    ; Jump if just did line 0 or 2.
        mov     ebx,ChromaPitch
        jne     DoOutputLine
        add     esi,ebx                 ; Advance to next chroma line.
        test    al,16                   ; Jump if about to do line 2.
        mov     UCursor,esi
        jne     DoOutputLine
        sub     esi,ebx                 ; Done with 4 lines. Restore UCursor.
        mov     UCursor,esi
        jmp     Next4OutputLines

; Line dropped for aspect-ratio adjustment: reload the countdown and undo the
; part of the base adjustment that DoOutputLine did not get to apply.  The
; output cursor and the dither patterns are left as they were.
SkipOutputLine:
        mov     ecx,AspectAdjustmentCount
        add     eax,MAXWIDTH*20
        mov     AspectCount,ecx
        jmp     PrepareForNextOutputLine

; ---- Epilogue: restore the caller's stack and registers. ----
Done:
        mov     esp,StashESP
        pop     ebx
        pop     ebp
        pop     edi
        pop     esi
        rturn
MMX_YUV12ToRGB16ZoomBy2 endp
END