688 lines
24 KiB
NASM
688 lines
24 KiB
NASM
;-------------------------------------------------------------------------
|
|
; INTEL Corporation Proprietary Information
|
|
;
|
|
; This listing is supplied under the terms of a license
|
|
; agreement with INTEL Corporation and may not be copied
|
|
; nor disclosed except in accordance with the terms of
|
|
; that agreement.
|
|
;
|
|
; Copyright (c) 1996 Intel Corporation.
|
|
; All Rights Reserved.
|
|
;
|
|
;-------------------------------------------------------------------------
|
|
|
|
;-------------------------------------------------------------------------
|
|
;//
|
|
;// $Header: S:\h26x\src\dec\cxm12162.asv
|
|
;//
|
|
;// $Log: S:\h26x\src\dec\cxm12162.asv $
|
|
;//
|
|
;// Rev 1.10 01 Apr 1997 12:51:50 BNICKERS
|
|
;// Fix bugs # 153 and 156 -- wrong color when U is small; right edge flickering.
|
|
;//
|
|
;// Rev 1.9 09 Dec 1996 15:20:40 BECHOLS
|
|
;// Brian fixed ARC bug #94.
|
|
;//
|
|
;// Rev 1.8 06 Sep 1996 16:07:58 BNICKERS
|
|
;// Re-written to filter new points.
|
|
;//
|
|
;-------------------------------------------------------------------------
|
|
;
|
|
; +---------- Color convertor.
|
|
; |+--------- For both H261 and H263.
|
|
; ||+-------- Version for Intel Microprocessors with MMX Technology
|
|
; |||++------ Convert from YUV12.
|
|
; |||||++---- Convert to RGB16.
|
|
; |||||||+--- Zoom by two.
|
|
; ||||||||
|
|
; cxm12162 -- This function performs zoom-by-2 YUV12-to-RGB16 color conversion
|
|
; for H26x. It is tuned for best performance on Intel
|
|
; Microprocessors with MMX Technology. It handles any format in
|
|
; which there are three fields, the low order field being B and
|
|
; starting in bit 0, the second field being G, and the high order
|
|
; field being R. Present support for 555, 565, 655, and 664
|
|
; formats only. This version adds new rows and columns by
|
|
; averaging them with the originals to either side.
|
|
;
|
|
; The YUV12 input is planar, 8 bits per pel. The Y plane may have
|
|
; a pitch of up to 768. It may have a width less than or equal
|
|
; to the pitch. It must be QWORD aligned. Pitch and Width must
|
|
; be a multiple of eight. Height may be any amount, but must be
|
|
; a multiple of two. The U and V planes may have a different
|
|
; pitch than the Y plane, subject to the same limitations.
|
|
;
|
|
; The color convertor is non-destructive; the input Y, U, and V
|
|
; planes will not be clobbered.
|
|
|
|
OPTION PROLOGUE:None
|
|
OPTION EPILOGUE:ReturnAndRelieveEpilogueMacro
|
|
|
|
include ccinst.inc
|
|
|
|
.xlist
|
|
include iammx.inc
|
|
include memmodel.inc
|
|
.list
|
|
|
|
;-------------------------------------------------------------------------
; Lookup tables used by MMX_YUV12ToRGB16ZoomBy2.
;
; All three tables are indexed by an unsigned 8-bit sample value.
;-------------------------------------------------------------------------
MMXCCDATA SEGMENT PAGE
ALIGN 16

; Luma0020004000200000 -- 256 QWORD entries, addressed as [Y*8].
; Each entry holds four words, listed here from low word to high word:
;     0, (CNT*04A7FH)/0200H, (CNT*04A7FH)/0100H, (CNT*04A7FH)/0200H
; with CNT = Y-16.  The code combines neighbouring entries with shifts
; and adds so that four zoomed pels come out as a, (a+b)/2, b, (b+c)/2.
;   Y =   0.. 15 : all zero
;   Y =  16..234 : linear ramp (CNT = 0..218)
;   Y = 235..255 : held at the ceiling values 01FFFH / 03FFFH
Luma0020004000200000 LABEL DWORD
REPEAT 16
        DD      0, 0
ENDM
CNT = 0
REPEAT 219
        DW      0
        DW      (CNT*04A7FH)/00200H
        DW      (CNT*04A7FH)/00100H
        DW      (CNT*04A7FH)/00200H
CNT = CNT + 1
ENDM
REPEAT 21
        DW      00000H
        DW      01FFFH
        DW      03FFFH
        DW      01FFFH
ENDM

; UContribToBandG -- 256 DWORD entries, addressed as [U*4], CNT = U-128.
;   low  word : contribution of U to G = -(CNT*00C83H)/00040H
;   high word : contribution of U to B =  (CNT*0408BH)/00040H
; The B term does not fit in a signed word for CNT = -128, -127 and +127,
; so those three entries pin B to 08000H / 07FFFH.  Their G term follows
; the same formula as every other entry.
UContribToBandG LABEL DWORD
        DW      -(-128*0C83H)/00040H
        DW      08000H
        DW      -(-127*0C83H)/00040H
        DW      08000H
CNT = -126
REPEAT 253
        DW      -(CNT*00C83H)/00040H
        DW      (CNT*0408BH)/00040H
CNT = CNT + 1
ENDM
        ; Fixed: this entry used to be +(127*0C83H)/00040H, i.e. the G
        ; contribution for U=255 had the opposite sign to all other rows.
        DW      -(127*0C83H)/00040H
        DW      07FFFH

; VContribToRandG -- 256 DWORD entries, addressed as [V*4], CNT = V-128.
;   low  word : contribution of V to G = -(CNT*01A04H)/00040H
;   high word : contribution of V to R =  (CNT*03312H)/00040H
VContribToRandG LABEL DWORD
CNT = -128
REPEAT 256
        DW      -(CNT*01A04H)/00040H
        DW      (CNT*03312H)/00040H
CNT = CNT + 1
ENDM

MMXCCDATA ENDS
|
|
|
|
.CODE
|
|
|
|
ASSUME ds : FLAT
|
|
ASSUME es : FLAT
|
|
ASSUME fs : FLAT
|
|
ASSUME gs : FLAT
|
|
ASSUME ss : FLAT
|
|
|
|
; void FAR ASM_CALLTYPE YUV12ToRGB16ZoomBy2 (U8 * YPlane,
|
|
; U8 * VPlane,
|
|
; U8 * UPlane,
|
|
; UN FrameWidth,
|
|
; UN FrameHeight,
|
|
; UN YPitch,
|
|
; UN VPitch,
|
|
; UN AspectAdjustmentCount,
|
|
; U8 * ColorConvertedFrame,
|
|
; U32 DCIOffset,
|
|
; U32 CCOffsetToLine0,
|
|
; IN CCOPitch,
|
|
; IN CCType)
|
|
;
|
|
; CCOffsetToLine0 is relative to ColorConvertedFrame.
|
|
;
|
|
|
|
; due to the need for the ebp reg, these parameter declarations aren't used,
|
|
; they are here so the assembler knows how many bytes to relieve from the stack
|
|
|
|
PUBLIC MMX_YUV12ToRGB16ZoomBy2
|
|
|
|
;-------------------------------------------------------------------------
; MMX_YUV12ToRGB16ZoomBy2 -- procedure declaration and stack-frame layout.
;
; The formal parameters mirror the C prototype documented above
; (YPlane, VPlane, UPlane, FrameWidth, FrameHeight, YPitch, VPitch,
; AspectAdjustmentCount, ColorConvertedFrame, DCIOffset, CCOffsetToLine0,
; CCOPitch, CCType).  The body never refers to them by these names; it
; reads the arguments through the *_arg offsets below, relative to the
; stack pointer captured after the four register pushes.
;-------------------------------------------------------------------------
MMX_YUV12ToRGB16ZoomBy2 proc DIST LANG AYPlane: DWORD,
        AVPlane: DWORD,
        AUPlane: DWORD,
        AFrameWidth: DWORD,
        AFrameHeight: DWORD,
        AYPitch: DWORD,
        AVPitch: DWORD,
        AAspectAdjustmentCnt: DWORD,
        AColorConvertedFrame: DWORD,
        ADCIOffset: DWORD,
        ACCOffsetToLine0: DWORD,
        ACCOPitch: DWORD,
        ACCType: DWORD

; ---- Frame sizing -------------------------------------------------------
; 20 bytes of filtered-luma scratch per input pel (40 bytes per pel pair),
; 128 bytes ahead of the tail locals, 64 bytes for the tail locals.
MAXWIDTH            = 768
LocalFrameSize      = MAXWIDTH*20+128+64
RegisterStorageSize = 16                ; esi, edi, ebp, ebx pushed on entry

; ---- Argument offsets (from the post-push stack pointer) -----------------
YPlane_arg                = RegisterStorageSize +  4
VPlane_arg                = RegisterStorageSize +  8
UPlane_arg                = RegisterStorageSize + 12
FrameWidth_arg            = RegisterStorageSize + 16
FrameHeight               = RegisterStorageSize + 20
YPitch_arg                = RegisterStorageSize + 24
ChromaPitch_arg           = RegisterStorageSize + 28
AspectAdjustmentCount_arg = RegisterStorageSize + 32
ColorConvertedFrame       = RegisterStorageSize + 36
DCIOffset                 = RegisterStorageSize + 40
CCOffsetToLine0           = RegisterStorageSize + 44
CCOPitch_arg              = RegisterStorageSize + 48
CCType                    = RegisterStorageSize + 52
EndOfArgList              = RegisterStorageSize + 56

; ---- Locals at the bottom of the frame ----------------------------------
; Per-format constants: three QWORDs of dither, three QWORDs of field
; masks, three DWORD shift counts, then the output cursor.
DitherB                 EQU  [esp+  0]
DitherG                 EQU  [esp+  8]
DitherR                 EQU  [esp+ 16]
SelectBBits             EQU  [esp+ 24]
SelectGBits             EQU  [esp+ 32]
SelectRBits             EQU  [esp+ 40]
ShiftCountForB          EQU  [esp+ 48]
ShiftCountForG          EQU  [esp+ 52]
ShiftCountForR          EQU  [esp+ 56]
CCOCursor               EQU  [esp+ 60]

; ---- Filtered-luma scratch ----------------------------------------------
; Five interleaved QWORD streams with a 40-byte stride, indexed by edi/edx.
FiltLine0               EQU  [esp+ 64]  ; Must be 32 byte aligned.
FiltLine1               EQU  [esp+ 72]
FiltLine2               EQU  [esp+ 80]
FiltLine3               EQU  [esp+ 88]
HFiltLinePrev           EQU  [esp+ 96]

; ---- Locals above the scratch area --------------------------------------
CCOPitch                EQU  [esp+MAXWIDTH*20+128+ 0]
YCursor                 EQU  [esp+MAXWIDTH*20+128+ 4]
YLimit                  EQU  [esp+MAXWIDTH*20+128+ 8]
YPitch                  EQU  [esp+MAXWIDTH*20+128+12]
UCursor                 EQU  [esp+MAXWIDTH*20+128+16]
DistanceFromUToV        EQU  [esp+MAXWIDTH*20+128+20]
ChromaPitch             EQU  [esp+MAXWIDTH*20+128+24]
AspectCount             EQU  [esp+MAXWIDTH*20+128+28]
AspectAdjustmentCount   EQU  [esp+MAXWIDTH*20+128+32]
StartIndexOfYLine       EQU  [esp+MAXWIDTH*20+128+36]
StashESP                EQU  [esp+MAXWIDTH*20+128+40]
|
|
|
|
; ---- Prologue -------------------------------------------------------------
; Saves the four registers the body uses, builds a page-aligned local frame,
; copies the arguments into locals, and dispatches on CCType.
; edi holds the caller-side stack pointer throughout this section.

        push    esi
        push    edi
        push    ebp
        push    ebx

        mov     edi,esp                 ; edi -> saved regs + arguments
        and     esp,0FFFFF000H          ; Round down to a 4 KB boundary.

        ; Grow the frame one page at a time, reading each new page as we go
        ; (presumably so stack pages are touched in order -- confirm).
        sub     esp,4096
        mov     eax,[esp]
        sub     esp,4096
        mov     eax,[esp]
        sub     esp,4096
        mov     eax,[esp]
        sub     esp,LocalFrameSize-12288
        mov     eax,[esp]

        ; Narrow frames start part-way into the scratch area so that every
        ; luma loop can terminate on the same constant, MAXWIDTH*20.
        ; (Was a literal 768; now tied to MAXWIDTH like the loop limits.)
        mov     eax,MAXWIDTH
        sub     eax,[edi+FrameWidth_arg]
        imul    eax,20
        mov     StartIndexOfYLine,eax

        ; YCursor = YPlane;  YLimit = YPlane + YPitch*FrameHeight.
        mov     eax,[edi+YPlane_arg]
        mov     YCursor,eax
        mov     ebx,[edi+YPitch_arg]
        mov     YPitch,ebx
        mov     ecx,[edi+FrameHeight]
        imul    ebx,ecx
        add     eax,ebx
        mov     YLimit,eax

        ; Chroma is walked with a single cursor over U; V is reached by
        ; adding the constant distance VPlane-UPlane.
        mov     eax,[edi+UPlane_arg]
        mov     ebx,[edi+VPlane_arg]
        mov     UCursor,eax
        sub     ebx,eax
        mov     DistanceFromUToV,ebx

        ; Output cursor = ColorConvertedFrame + DCIOffset + CCOffsetToLine0.
        mov     eax,[edi+ColorConvertedFrame]
        add     eax,[edi+DCIOffset]
        add     eax,[edi+CCOffsetToLine0]
        mov     CCOCursor,eax

        mov     eax,[edi+ChromaPitch_arg]
        mov     ChromaPitch,eax

        mov     eax,[edi+CCOPitch_arg]
        mov     CCOPitch,eax

        ; The aspect counter starts at its reload value.
        mov     eax,[edi+AspectAdjustmentCount_arg]
        mov     AspectAdjustmentCount,eax
        mov     AspectCount,eax

        mov     StashESP,edi            ; Restored at Done.

        ; ---- Select the pixel format ------------------------------------
        ; The DCI and non-DCI variants of each format share one setup.
        mov     eax,[edi+CCType]
        cmp     eax,CCTYPE_RGB16555ZoomBy2
        je      CCTypeIs555
        cmp     eax,CCTYPE_RGB16555ZoomBy2DCI
        je      CCTypeIs555
        cmp     eax,CCTYPE_RGB16565ZoomBy2
        je      CCTypeIs565
        cmp     eax,CCTYPE_RGB16565ZoomBy2DCI
        je      CCTypeIs565
        cmp     eax,CCTYPE_RGB16655ZoomBy2
        je      CCTypeIs655
        cmp     eax,CCTYPE_RGB16655ZoomBy2DCI
        je      CCTypeIs655
        cmp     eax,CCTYPE_RGB16664ZoomBy2DCI
        je      CCTypeIs664
        cmp     eax,CCTYPE_RGB16664ZoomBy2
        je      CCTypeIs664

        ; Unrecognised CCType: overwrite the Y cursor with a marker value
        ; and fall through into the 555 setup.  The first luma load then
        ; dereferences 0DEADBEEFH (presumably a deliberate, recognisable
        ; fault -- confirm).
        mov     eax,0DEADBEEFH
        mov     YCursor,eax
|
|
|
|
CCTypeIs555:

; Per-format constants for xRRRRRGG GGGBBBBB (5-5-5).
; Every QWORD local is written as two identical DWORDs; each DWORD covers
; two adjacent 16-bit output pels.

        ; Field masks applied after the shifts.
        mov     eax,0001F001FH
        mov     ebx,003E003E0H
        mov     ecx,07C007C00H
        mov     SelectBBits,eax
        mov     SelectBBits+4,eax
        mov     SelectGBits,ebx
        mov     SelectGBits+4,ebx
        mov     SelectRBits,ecx
        mov     SelectRBits+4,ecx

        ; Dither: B and R share one pattern, G uses the other pel phase.
        mov     eax,000000200H
        mov     ebx,002000000H
        mov     DitherB,eax
        mov     DitherB+4,eax
        mov     DitherR,eax
        mov     DitherR+4,eax
        mov     DitherG,ebx
        mov     DitherG+4,ebx

        ; Shift counts: B and G move right, R moves left.
        mov     eax,10
        mov     ebx,5
        xor     ecx,ecx
        mov     ShiftCountForB,eax
        mov     ShiftCountForG,ebx
        mov     ShiftCountForR,ecx
        jmp     CCTypeInitialized
|
|
|
|
CCTypeIs565:

; Per-format constants for RRRRRGGG GGGBBBBB (5-6-5).
; Every QWORD local is written as two identical DWORDs; each DWORD covers
; two adjacent 16-bit output pels.

        ; Field masks applied after the shifts.
        mov     eax,0001F001FH
        mov     ebx,007E007E0H
        mov     ecx,0F800F800H
        mov     SelectBBits,eax
        mov     SelectBBits+4,eax
        mov     SelectGBits,ebx
        mov     SelectGBits+4,ebx
        mov     SelectRBits,ecx
        mov     SelectRBits+4,ecx

        ; Dither: B and R share one pattern, G uses the other pel phase.
        mov     eax,000000200H
        mov     ebx,004000000H
        mov     DitherB,eax
        mov     DitherB+4,eax
        mov     DitherR,eax
        mov     DitherR+4,eax
        mov     DitherG,ebx
        mov     DitherG+4,ebx

        ; Shift counts: B and G move right, R moves left.
        mov     eax,10
        mov     ebx,4
        mov     ecx,1
        mov     ShiftCountForB,eax
        mov     ShiftCountForG,ebx
        mov     ShiftCountForR,ecx
        jmp     CCTypeInitialized
|
|
|
|
CCTypeIs655:

; Per-format constants for RRRRRRGG GGGBBBBB (6-5-5).
; Every QWORD local is written as two identical DWORDs; each DWORD covers
; two adjacent 16-bit output pels.

        ; Field masks applied after the shifts.
        mov     eax,0001F001FH
        mov     ebx,003E003E0H
        mov     ecx,0FC00FC00H
        mov     SelectBBits,eax
        mov     SelectBBits+4,eax
        mov     SelectGBits,ebx
        mov     SelectGBits+4,ebx
        mov     SelectRBits,ecx
        mov     SelectRBits+4,ecx

        ; Dither: B and G share one pattern, R uses the other pel phase.
        mov     eax,000000200H
        mov     ebx,004000000H
        mov     DitherB,eax
        mov     DitherB+4,eax
        mov     DitherG,eax
        mov     DitherG+4,eax
        mov     DitherR,ebx
        mov     DitherR+4,ebx

        ; Shift counts: B and G move right, R moves left.
        mov     eax,10
        mov     ebx,5
        mov     ecx,1
        mov     ShiftCountForB,eax
        mov     ShiftCountForG,ebx
        mov     ShiftCountForR,ecx
        jmp     CCTypeInitialized
|
|
|
|
CCTypeIs664:

; Per-format constants for RRRRRRGG GGGGBBBB (6-6-4).
; Every QWORD local is written as two identical DWORDs; each DWORD covers
; two adjacent 16-bit output pels.
; This is the last format block: it falls straight through to
; CCTypeInitialized instead of jumping.

        ; Field masks applied after the shifts.
        mov     eax,0000F000FH
        mov     ebx,003F003F0H
        mov     ecx,0FC00FC00H
        mov     SelectBBits,eax
        mov     SelectBBits+4,eax
        mov     SelectGBits,ebx
        mov     SelectGBits+4,ebx
        mov     SelectRBits,ecx
        mov     SelectRBits+4,ecx

        ; Dither: G and R share one pattern, B uses the other pel phase.
        mov     eax,000000400H
        mov     ebx,001000000H
        mov     DitherG,eax
        mov     DitherG+4,eax
        mov     DitherR,eax
        mov     DitherR+4,eax
        mov     DitherB,ebx
        mov     DitherB+4,ebx

        ; Shift counts: B and G move right, R moves left.
        mov     eax,11
        mov     ebx,5
        mov     ecx,1
        mov     ShiftCountForB,eax
        mov     ShiftCountForG,ebx
        mov     ShiftCountForR,ecx
|
|
|
|
CCTypeInitialized:

; ---- First pass: Y rows 0 and 1 -----------------------------------------
; There is no previous row yet, so only FiltLine2 (2*L0), FiltLine3 (L0+L1)
; and HFiltLinePrev (L1) are produced; output then starts at FiltLine2.

        mov     esi,YCursor
        mov     ebp,YPitch
        mov     edi,StartIndexOfYLine
        lea     edx,[esi+ebp*2]
        mov     YCursor,edx             ; Next pass starts two rows down.
        xor     eax,eax
        xor     ebx,ebx
        mov     al,[esi]                ; A: first pel of row 0 (left edge).
        mov     bl,[esi+ebp*1]          ; a: first pel of row 1 (left edge).
        movq    mm0,Luma0020004000200000[eax*8] ; Row 0: < 32A 64A 32A 0 >
        movq    mm1,Luma0020004000200000[ebx*8] ; Row 1: < 32a 64a 32a 0 >
        mov     al,[esi+2]              ; C: pel two to the right, row 0.
        mov     bl,[esi+ebp*1+2]        ; c: pel two to the right, row 1.

; Register roles inside the loop:
;   esi      -- cursor over row 0 of the input pair
;   ebp      -- Y pitch (row 1 is read at [esi+ebp])
;   edi      -- index into the filtered-luma scratch; ends at MAXWIDTH*20
;   al, bl   -- pel fetched ahead for row 0 / row 1 (upper bits are zero)
;   mm0, mm1 -- table entry of the pel left of the current pair, row 0 / 1
;   mm2, mm3 -- table entry of the pel right of the current pair
; Vectors are written < high word ... low word >.

Next2PelsOfFirst2LumaLines:

        movq    mm3,Luma0020004000200000[ebx*8] ; Row 1: < 32c 64c 32c 0 >
        psrlq   mm1,32                          ; Row 1: < 0 0 32a 64a >
        movq    mm2,Luma0020004000200000[eax*8] ; Row 0: < 32C 64C 32C 0 >
        punpckldq mm1,mm3                       ; Row 1: < 32c 0 32a 64a >
        xor     ebx,ebx
        xor     eax,eax
        mov     bl,[esi+ebp*1+1]                ; b: middle pel, row 1.
        psrlq   mm0,32                          ; Row 0: < 0 0 32A 64A >
        mov     al,[esi+1]                      ; B: middle pel, row 0.
        add     edi,40                          ; Next scratch slot.
        paddw   mm1,Luma0020004000200000[ebx*8] ; Row 1: < 32b+32c 64b 32a+32b 64a >
        punpckldq mm0,mm2                       ; Row 0: < 32C 0 32A 64A >
        paddw   mm0,Luma0020004000200000[eax*8] ; Row 0: < 32B+32C 64B 32A+32B 64A >

        movq    HFiltLinePrev[edi-40],mm1       ; Row 1 becomes next pass's LP.
        paddw   mm1,mm0                         ; L0+L1
        paddw   mm0,mm0                         ; 2*L0
        add     esi,2                           ; Two input pels consumed.
        movq    FiltLine3[edi-40],mm1
        movq    mm1,mm3                         ; c becomes the next "a".
        movq    FiltLine2[edi-40],mm0
        movq    mm0,mm2                         ; C becomes the next "A".
        mov     bl,[esi+ebp*1+2]                ; Fetch the next c.
        cmp     edi,MAXWIDTH*20-40              ; More than one pair left?
        mov     al,[esi+2]                      ; Fetch the next C.
        jl      Next2PelsOfFirst2LumaLines

        ; Exactly one pair left: it has no right-hand neighbour, so the
        ; last pel of each row is substituted for c / C.  When no pair is
        ; left the second branch also falls through.
        xor     ebx,ebx
        xor     ecx,ecx
        mov     bl,[esi+ebp*1+1]
        cmp     edi,MAXWIDTH*20
        mov     al,[esi+1]
        jl      Next2PelsOfFirst2LumaLines

        ; Set up DoOutputLine: start at FiltLine2, chroma at UCursor.
        lea     eax,FiltLine2
        mov     edx,StartIndexOfYLine
        mov     esi,UCursor
        mov     ebp,DistanceFromUToV
        jmp     DoOutputLine
|
|
|
|
|
|
Last2OutputLines:

; Reached from Next4OutputLines with the flags of "cmp Y cursor, YLimit"
; still live and edi = StartIndexOfYLine.
;   cursor >  YLimit : everything has been emitted -> Done.
;   cursor == YLimit : no input rows remain; the final two output lines are
;                      both built from the last horizontally filtered row.
; Neither the lea nor the mov below modifies the flags.

        lea     esi,[edi+40]            ; Replaced by UCursor before use.
        mov     ebp,DistanceFromUToV
        ja      Done

;   edi -- index into the filtered-luma scratch; ends at MAXWIDTH*20
;   mm0 -- 2*LP for the slot about to be stored

        movq    mm0,HFiltLinePrev[edi]
        paddw   mm0,mm0                 ; 2*LP

Next2PelsOfLast2LumaLines:

        movq    FiltLine2[edi],mm0      ; Both remaining lines get 2*LP.
        movq    FiltLine3[edi],mm0
        movq    mm0,HFiltLinePrev[edi+40] ; Fetch LP for the next slot.
        add     edi,40
        paddw   mm0,mm0                 ; 2*LP
        cmp     edi,MAXWIDTH*20
        jne     Next2PelsOfLast2LumaLines

        ; Set up DoOutputLine: start at FiltLine2, chroma at UCursor.
        mov     esi,UCursor
        mov     edx,StartIndexOfYLine
        lea     eax,FiltLine2
        jmp     DoOutputLine
|
|
|
|
|
|
Next4OutputLines:

; ---- Steady state: one pair of Y rows in, four output lines out ---------
; For each pel pair the loop writes
;   FiltLine0 = 2*LP    FiltLine1 = LP+L0
;   FiltLine2 = 2*L0    FiltLine3 = L0+L1
; where LP is the horizontally filtered odd row of the previous pair, and
; replaces HFiltLinePrev with L1.

        mov     esi,YCursor
        mov     ebp,YPitch
        mov     edi,StartIndexOfYLine
        mov     ecx,YLimit
        lea     edx,[esi+ebp*2]
        xor     eax,eax
        mov     YCursor,edx             ; Next pass starts two rows down.
        xor     ebx,ebx

        ; Check for the end of the Y plane BEFORE touching it.  The left
        ; edge pels used to be loaded ahead of this branch, which read at
        ; and beyond YLimit on the two terminal passes.  Last2OutputLines
        ; relies on the flags of this compare and does not read al or bl.
        cmp     esi,ecx
        jae     Last2OutputLines

        mov     al,[esi]                ; A: first pel of row 0 (left edge).
        mov     bl,[esi+ebp*1]          ; a: first pel of row 1 (left edge).

        movq    mm1,Luma0020004000200000[ebx*8] ; Row 1: < 32a 64a 32a 0 >
        mov     bl,[esi+ebp*1+2]                ; c
        movq    mm0,Luma0020004000200000[eax*8] ; Row 0: < 32A 64A 32A 0 >
        mov     al,[esi+2]                      ; C

; Register roles inside the loop:
;   esi      -- cursor over row 0 of the input pair
;   ebp      -- Y pitch (row 1 is read at [esi+ebp])
;   edi      -- index into the filtered-luma scratch; ends at MAXWIDTH*20
;   al, bl   -- pel fetched ahead for row 0 / row 1 (upper bits are zero)
;   mm0, mm1 -- table entry of the pel left of the current pair, row 0 / 1
;   mm2, mm3 -- table entry of the pel right of the current pair
;   mm4, mm5 -- LP and 2*LP
; Vectors are written < high word ... low word >.
; NOTE(review): the loop tail still fetches [esi+2] / [esi+ebp+2] after the
; final pair of a row; those bytes are loaded but then replaced or unused.

Next2PelsOf2LumaLines:

        movq    mm3,Luma0020004000200000[ebx*8] ; Row 1: < 32c 64c 32c 0 >
        psrlq   mm1,32                          ; Row 1: < 0 0 32a 64a >
        movq    mm2,Luma0020004000200000[eax*8] ; Row 0: < 32C 64C 32C 0 >
        punpckldq mm1,mm3                       ; Row 1: < 32c 0 32a 64a >
        movq    mm4,HFiltLinePrev[edi]          ; LP
        psrlq   mm0,32                          ; Row 0: < 0 0 32A 64A >
        xor     ebx,ebx
        xor     eax,eax
        mov     bl,[esi+ebp*1+1]                ; b: middle pel, row 1.
        movq    mm5,mm4                         ; LP
        mov     al,[esi+1]                      ; B: middle pel, row 0.
        add     esi,2                           ; Two input pels consumed.
        paddw   mm1,Luma0020004000200000[ebx*8] ; Row 1: < 32b+32c 64b 32a+32b 64a >
        punpckldq mm0,mm2                       ; Row 0: < 32C 0 32A 64A >
        paddw   mm0,Luma0020004000200000[eax*8] ; Row 0: < 32B+32C 64B 32A+32B 64A >
        paddw   mm5,mm5                         ; 2*LP
        movq    HFiltLinePrev[edi],mm1          ; Row 1 becomes next pass's LP.
        paddw   mm4,mm0                         ; LP+L0
        movq    FiltLine0[edi],mm5
        paddw   mm1,mm0                         ; L0+L1
        movq    FiltLine1[edi],mm4
        paddw   mm0,mm0                         ; 2*L0
        movq    FiltLine3[edi],mm1
        movq    mm1,mm3                         ; c becomes the next "a".
        movq    FiltLine2[edi],mm0
        movq    mm0,mm2                         ; C becomes the next "A".
        add     edi,40                          ; Next scratch slot.
        mov     bl,[esi+ebp*1+2]                ; Fetch the next c.
        cmp     edi,MAXWIDTH*20-40              ; More than one pair left?
        mov     al,[esi+2]                      ; Fetch the next C.
        jl      Next2PelsOf2LumaLines

        ; Exactly one pair left: it has no right-hand neighbour, so the
        ; last pel of each row is substituted for c / C.  When no pair is
        ; left the second branch also falls through.
        xor     ebx,ebx
        xor     ecx,ecx
        mov     bl,[esi+ebp*1+1]
        cmp     edi,MAXWIDTH*20
        mov     al,[esi+1]
        jl      Next2PelsOf2LumaLines

        ; Set up DoOutputLine (which follows directly): start at FiltLine0.
        mov     ebp,DistanceFromUToV
        mov     esi,UCursor
        lea     eax,FiltLine0
        mov     edx,StartIndexOfYLine
|
|
|
|
DoOutputLine:

; Emit one output line from one filtered-luma stream.
; On entry:
;   eax -- address of the stream to use (FiltLine0..FiltLine3)
;   edx -- StartIndexOfYLine
;   esi -- cursor over U;  ebp -- distance from U to V

        ; Aspect adjustment: the line is dropped when the counter hits zero.
        mov     ecx,AspectCount
        mov     edi,CCOCursor
        mov     ebx,CCOPitch
        dec     ecx
        mov     AspectCount,ecx
        je      SkipOutputLine

        ; Advance the stored output cursor by one pitch; edi still points at
        ; the line being written now.
        add     ebx,edi
        mov     CCOCursor,ebx

        ; Fetch the first chroma pair.
        xor     ecx,ecx
        mov     cl,[esi]
        movdt   mm0,UContribToBandG[ecx*4]  ; < 0 0 Bu Gu >
        mov     cl,[esi+ebp*1]              ; V, consumed at StartDoOutputLine.
        punpcklwd mm0,mm0                   ; < Bu Bu Gu Gu >

        ; Per-line constants for the pel loop.
        movdt   mm3,ShiftCountForB
        movdt   mm4,ShiftCountForG
        movdt   mm5,ShiftCountForR
        movq    mm7,SelectBBits
        pcmpeqw mm6,mm6
        psllw   mm6,15                      ; Four words of -32768.

        ; Bias base up and index down by the same amount so the loop can
        ; stop when edx reaches zero.
        add     eax,MAXWIDTH*20
        sub     edx,MAXWIDTH*20
        jmp     StartDoOutputLine
|
|
|
|
; ---- Pel loop: four RGB16 pels per iteration ----------------------------
;   esi           -- cursor over U (one U/V pair per iteration)
;   ebp           -- distance from U to V
;   edi           -- cursor over the output line
;   eax + edx     -- address of the filtered luma QWORD; edx counts up to 0
;   ecx           -- zero-extended chroma byte used as a table index
;   mm3, mm4, mm5 -- shift counts for B, G and R
;   mm6           -- four words of -32768, used for the floor clamp
;   mm7           -- B field mask
; Vectors are written < high word ... low word >.

DoNext4OutputPels:

        movq    [edi-8],mm2                 ; Store the previous four pels.
        punpcklwd mm0,mm0                   ; < Bu Bu Gu Gu >

StartDoOutputLine:

        movdt   mm2,VContribToRandG[ecx*4]  ; < 0 0 Rv Gv >
        punpcklwd mm2,mm2                   ; < Rv Rv Gv Gv >
        movq    mm1,mm0                     ; Low half holds Gu Gu.
        punpckhdq mm0,mm0                   ; < Bu Bu Bu Bu >
        paddsw  mm0,[eax+edx]               ; B = Y + Bu, saturating at the top.
        paddw   mm1,mm2                     ; Low half: Gu+Gv.
        paddsw  mm0,DitherB
        punpckldq mm1,mm1                   ; < Guv Guv Guv Guv >
        paddsw  mm1,[eax+edx]               ; G = Y + Guv, saturating at the top.
        punpckhdq mm2,mm2                   ; < Rv Rv Rv Rv >
        paddsw  mm1,DitherG
        paddsw  mm0,mm6                     ; B: saturate at the bottom.
        paddsw  mm2,[eax+edx]               ; R = Y + Rv, saturating at the top.
        paddsw  mm1,mm6                     ; G: saturate at the bottom.
        paddsw  mm2,DitherR
        psrlw   mm0,mm3                     ; Move B into its field.
        paddsw  mm2,mm6                     ; R: saturate at the bottom.
        psrlw   mm1,mm4                     ; Move G into its field.
        pand    mm1,SelectGBits             ; Drop bits outside the G field.
        psllw   mm2,mm5                     ; Move R into its field.
        inc     esi                         ; Next chroma pair.
        xor     ecx,ecx
        pand    mm2,SelectRBits             ; Drop bits outside the R field.
        pand    mm0,mm7                     ; Drop bits outside the B field.
        mov     cl,[esi]                    ; Next U.
        add     edi,8                       ; Four pels = eight bytes.
        por     mm2,mm0                     ; R | B
        add     edx,40                      ; Next luma slot; sets ZF at the end.
        movdt   mm0,UContribToBandG[ecx*4]  ; < 0 0 Bu Gu > for the next pass.
        por     mm2,mm1                     ; R | G | B for four pels.
        mov     cl,[esi+ebp*1]              ; Next V.
        jne     DoNext4OutputPels

        movq    [edi-8],mm2                 ; Store the final four pels.

        ; Swap the two pel phases of each dither pattern so the next line
        ; that is actually written uses the opposite phase.
        movq    mm0,DitherB
        psrlq   mm0,16
        punpckldq mm0,mm0
        movq    DitherB,mm0

        movq    mm1,DitherG
        psrlq   mm1,16
        punpckldq mm1,mm1
        movq    DitherG,mm1

        movq    mm2,DitherR
        psrlq   mm2,16
        punpckldq mm2,mm2
        movq    DitherR,mm2
|
|
|
|
PrepareForNextOutputLine:

; Step eax to the next filtered-luma stream and decide what comes next.
; The stream just finished is identified from address bits 3 and 4 of eax,
; which works because FiltLine0 is 32-byte aligned and the streams are
; eight bytes apart.

        add     eax,8-MAXWIDTH*20       ; Undo the loop bias, next stream.
        mov     esi,UCursor
        mov     ebx,ChromaPitch
        mov     edx,StartIndexOfYLine
        test    al,8                    ; Now at stream 1 or 3?
        jne     DoOutputLine            ; Yes: same chroma row again.

        ; Now at stream 2, or past stream 3.
        add     esi,ebx                 ; Tentatively move down one chroma row.
        test    al,16                   ; Now at stream 2?
        mov     UCursor,esi
        jne     DoOutputLine            ; Yes: keep the new chroma row.

        ; All four streams are done; take the chroma step back and go
        ; filter the next pair of Y rows.
        sub     esi,ebx
        mov     UCursor,esi
        jmp     Next4OutputLines
|
|
|
|
SkipOutputLine:

; Aspect adjustment dropped this line: nothing is written and CCOCursor is
; left alone.  Reload the counter and apply the same bias to eax that the
; pel loop would have, so PrepareForNextOutputLine can remove it again.

        add     eax,MAXWIDTH*20
        mov     ecx,AspectAdjustmentCount
        mov     AspectCount,ecx
        jmp     PrepareForNextOutputLine
|
|
|
|
Done:

; Epilogue: discard the local frame by reloading the stack pointer saved in
; the prologue, then restore the registers in reverse push order.
; NOTE(review): no EMMS is issued on this path -- confirm the caller clears
; the MMX state before using floating point.

        mov     esp,StashESP
        pop     ebx
        pop     ebp
        pop     edi
        pop     esi
        rturn                           ; Return macro (defined outside this file).

MMX_YUV12ToRGB16ZoomBy2 endp
|
|
|
|
END
|