WindowsXP-SP1/shell/shell32/tngen/piint.cpp

638 lines
11 KiB
C++

#include "stdafx.h"
#pragma hdrstop
/***************************************************************************
*
* INTEL Corporation Proprietary Information
*
*
* Copyright (c) 1996 Intel Corporation.
* All rights reserved.
*
***************************************************************************
AUTHOR: Kumar Balasubramanian
***************************************************************************
** Pentium version of the "integer LLM mode" within IJG decompressor code.
** The following is a non-MMX Pentium implementation of the integer slow mode
** IDCT within the IJG code.
*/
#define JPEG_INTERNALS
#include "jinclude.h"
#include "jpeglib.h"
#include "jdct.h" /* Private declarations for DCT subsystem */
#ifdef DCT_ISLOW_SUPPORTED
/*
* This module is specialized to the case DCTSIZE = 8.
*/
#if DCTSIZE != 8
Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
#endif
/*
* Fixed-point scaling parameters.
* NOTE(review): the inline assembly in pidct8x8llm never references
* CONST_BITS or PASS1_BITS. Its shift counts (13, 11 and 18) and the
* rounding constants below are literals that match CONST_BITS = 13 and
* PASS1_BITS = 2, so taking the #else branch would not change what the
* assembly does. Confirm this file is only built with BITS_IN_JSAMPLE == 8.
*/
#if BITS_IN_JSAMPLE == 8
#define CONST_BITS 13
#define PASS1_BITS 2
#else
#define CONST_BITS 13
#define PASS1_BITS 1 /* lose a little precision to avoid overflow */
#endif
/* Define the constants for the case BITS_IN_JSAMPLE = 8 */
/*
* Multipliers used as memory operands of imul in pidct8x8llm.
* Each value is the number in its name times 2^13 (8192), rounded to an
* integer; for example 0x0000098E = 2446, about 0.2986 * 8192.
* The names give the magnitude only: the five values written with a
* leading ffff (const_0_3901, const_0_899, const_1_8477, const_1_961,
* const_2_562) are negative when read as 32-bit two's-complement numbers,
* e.g. 0xfffff384 = -3196, about -0.3901 * 8192. This relies on INT32
* being exactly 32 bits wide (its definition is not in this file).
*/
static const INT32 const_0_2986 = 0x0000098E ;
static const INT32 const_0_3901 = 0x0fffff384;
static const INT32 const_0_54119 = 0x00001151;
static const INT32 const_0_7653 = 0x0000187E;
static const INT32 const_0_899 = 0x0ffffe333;
static const INT32 const_1_175 = 0x000025a1;
static const INT32 const_1_501 = 0x0000300b;
static const INT32 const_1_8477 = 0x0ffffc4df;
static const INT32 const_1_961 = 0x0ffffc13b;
static const INT32 const_2_053 = 0x000041b3;
static const INT32 const_2_562 = 0x0ffffadfd;
static const INT32 const_3_072 = 0x00006254;
/* 1 << 10: added before the "sar 11" that ends the column pass (round to nearest). */
static const INT32 const_round = 0x00000400;
/* 1 << 17: added before the "sar 18" that ends the row pass (round to nearest). */
static const INT32 const_round_row = 0x00020000;
/* Low 10 bits: applied to each row-pass result before it indexes range_limit. */
static const INT32 const_mask = 0x000003ff;
/*
* Perform dequantization and inverse DCT on one block of coefficients.
*
* The whole body is one inline-assembly block for 32-bit x86, made of two
* loops of eight iterations each:
*
* 1. Column pass (label idct_column). For one column it reads eight 16-bit
*    values from inptr and eight 16-bit values from quantptr (16 bytes
*    apart, i.e. one per row), multiplies them pairwise, runs a 1-D IDCT
*    and stores eight 32-bit results into wsptr (32 bytes apart). Each
*    result has const_round added and is shifted right by 11.
*
* 2. Row pass (label idct_row). For one row it reads eight consecutive
*    32-bit values from wsptr, runs the same 1-D IDCT, adds
*    const_round_row, shifts right by 18, masks with const_mask, looks the
*    result up in range_limit and writes eight bytes to the output row.
*
* Parameters:
*   inptr       - 8x8 block of 16-bit coefficients (2-byte column step,
*                 16-byte row step).
*   quantptr    - 16-bit multipliers with the same layout as inptr.
*   wsptr       - scratch area. Declared short* but used as 64 entries of
*                 32 bits each (256 bytes), so it must be at least that big.
*   output_buf  - array of row pointers; entries 0..7 are read.
*   output_col  - offset added to each row pointer to find the destination.
*   range_limit - byte table indexed with values 0..0x3ff.
*                 NOTE(review): presumably the IJG range-limit table already
*                 offset for centring - confirm at the call site.
*
* Pointers are held in INT32 locals and 32-bit registers and the row
* pointer array is stepped by 4 bytes, so this assumes 4-byte pointers.
* Output samples are written as single bytes, four per 32-bit store.
*/
GLOBAL(void)
pidct8x8llm (JCOEFPTR inptr, short *quantptr, short *wsptr,
JSAMPARRAY output_buf, JDIMENSION output_col, JSAMPLE *range_limit )
{
// Working copies of the three block pointers (advanced inside the loops)
// and spill slots for the IDCT intermediates.
//   locdwtmp0..3   : odd-part inputs; in the row pass they are later reused
//                    for the final odd-part sums tmp0..tmp3.
//   locdwtmp00..03 : even-part results tmp10, tmp11, tmp12, tmp13.
//   locdwtmp10..13 : odd-part inputs already multiplied by their constant.
//   locdwtmp2/3 are also used briefly for the even-part tmp2/tmp3.
INT32 locdwinptr, locdwqptr, locdwwsptr, locdwtmp0, locdwtmp1 ;
INT32 locdwtmp2, locdwtmp3, locdwtmp00, locdwtmp01, locdwtmp02 ;
INT32 locdwtmp03, locdwtmp10, locdwtmp11, locdwtmp12 ;
// locdwcounter counts loop iterations down from 8; locdwrowctr is the byte
// offset into output_buf for the current row.
INT32 locdwtmp13, locdwcounter, locdwrowctr ;
// Inline assembly to do the IDCT and store the result
__asm {
// Copy the block pointers into locals so each column iteration can
// reload and advance them.
mov esi, inptr ; point to start of source
mov edi, quantptr ;
mov eax, wsptr
mov locdwinptr, esi ; point to start of source
mov locdwqptr, edi ;
mov locdwwsptr, eax
mov locdwcounter, 8
// The eight loads below are never used as data: every register is
// rewritten before it is read. They only touch both input blocks at
// 32-byte steps.
mov eax, [esi] ; warm up the cache
mov ebx, [esi+32]
mov ecx, [esi+64]
mov edx, [esi+96]
mov eax, [edi]
mov ebx, [edi+32]
mov ecx, [edi+64]
mov edx, [edi+96]
;; 1D-IDCT of all the eight columns
idct_column:
mov esi, locdwinptr ; point to start of source
mov edi, locdwqptr ;
;; do the even part
// Offsets of the form 16*k select row k of the current column (8 entries
// of 2 bytes per row). Values are loaded into the 16-bit register halves
// and then sign-extended with a shl 16 / sar 16 pair.
// Computes: z1   = (z2 + z3) * const_0_54119
//           tmp2 = z1 + z3 * const_1_8477
//           tmp3 = z1 + z2 * const_0_7653
mov ax, [esi+16*2]
mov bx, [edi+16*2]
shl eax, 16 ; sign extend the i/p
mov cx, [esi+16*6]
sar eax, 16
mov dx, [edi+16*6]
shl ebx, 16 ; sign extend the quant factor
sar ebx, 16
imul eax, ebx ; dequantized C2 = z2
shl ecx, 16
sar ecx, 16
shl edx, 16
sar edx, 16
imul ecx, edx ; dequantized C6 = z3
mov ebx, eax ; copy of z2
imul eax, const_0_7653
add ebx, ecx ; z2 + z3
imul ecx, const_1_8477
imul ebx, const_0_54119 ; z1
mov dx, [edi+16*4] ; quant factor for C4
add ecx, ebx ; tmp2
add eax, ebx ; tmp3
mov locdwtmp2, ecx
mov locdwtmp3, eax
// Rows 0 and 4: here movsx does the sign extension instead of shl/sar.
// tmp0 = (C0 + C4) << 13 and tmp1 = (C0 - C4) << 13, after dequantizing.
mov cx, [esi+16*4] ; C4
mov ax, [esi+16*0] ; C0
mov bx, [edi+16*0] ; quant factor for C0
movsx edx, dx
movsx ecx, cx
movsx eax, ax
movsx ebx, bx
imul ecx, edx ; dequantize C4 = z3
imul eax, ebx ; dequantize C0 = z2
mov edx, ecx ; copy of z3
add ecx, eax ; z2 + z3
shl ecx, 13 ; tmp0
sub eax, edx ; z2 - z3
shl eax, 13 ; tmp1
// Combine into tmp10..tmp13, kept in locdwtmp00, 01, 02 and 03.
// The first loads for the odd part are interleaved with these stores.
mov ebx, ecx ; copy of tmp0
add ecx, locdwtmp3 ; tmp10
mov edx, eax ; copy of tmp1
add eax, locdwtmp2 ; tmp11
mov locdwtmp00, ecx
sub ebx, locdwtmp3 ; tmp13
mov locdwtmp01, eax
sub edx, locdwtmp2 ; tmp12
mov locdwtmp03, ebx
mov ax, [esi+16*7] ; C7 for the odd part
mov locdwtmp02, edx
mov bx, [edi+16*7] ; quant factor for C7
;; now do the odd part
// Dequantize rows 7, 5, 3 and 1 into tmp0, tmp1, tmp2 and tmp3. The raw
// values go to locdwtmp0..3 and their scaled copies to locdwtmp10..13.
shl eax, 16
mov cx, [esi+16*3]
sar eax, 16
mov dx, [edi+16*3]
shl ebx, 16
sar ebx, 16
imul eax, ebx ; dequantized C7 = tmp0
shl ecx, 16
sar ecx, 16
shl edx, 16
sar edx, 16
mov bx, [esi+16*1]
imul ecx, edx ; dequantized C3 = tmp2
shl ebx, 16
mov dx, [edi+16*1]
sar ebx, 16
shl edx, 16
sar edx, 16
imul ebx, edx ; dequantized C1 = tmp3
mov locdwtmp0, eax
mov locdwtmp2, ecx
mov ax, [esi+16*5]
mov dx, [edi+16*5]
shl eax, 16
sar eax, 16
shl edx, 16
sar edx, 16
imul eax, edx ; dequantized C5 = tmp1
imul ecx, const_3_072 ; tmp2
mov locdwtmp3, ebx
mov edx, locdwtmp0
imul ebx, const_1_501 ; tmp3
imul edx, const_0_2986 ; tmp0
mov locdwtmp1, eax ; store tmp1
mov locdwtmp10, edx
imul eax, const_2_053 ; tmp1
mov locdwtmp11, eax
mov locdwtmp12, ecx
mov locdwtmp13, ebx
// Form the four cross sums from the raw inputs:
//   z1 = tmp0 + tmp3, z2 = tmp1 + tmp2, z3 = tmp0 + tmp2, z4 = tmp1 + tmp3
// and z5 = (z3 + z4) * const_1_175. esi is free here because the source
// pointer is reloaded from locdwinptr at the top of the loop.
mov eax, locdwtmp0
mov ebx, locdwtmp1
mov ecx, eax
mov edx, ebx
add eax, locdwtmp3 ; z1
add ebx, locdwtmp3 ; z4
add ecx, locdwtmp2 ; z3
add edx, locdwtmp2 ; z2
mov esi, ecx ; copy of z3
imul eax, const_0_899 ; z1
imul edx, const_2_562 ; z2
add esi, ebx ; z3 + z4
imul esi, const_1_175 ; z5
imul ecx, const_1_961 ; z3
imul ebx, const_0_3901 ; z4
add ecx, esi ; z3
add ebx, esi ; z4
mov esi, eax ; copy of z1
add eax, ecx ; z1 + z3
add esi, ebx ; z1 + z4
add ecx, edx ; z3 + z2
add edx, ebx ; z2 + z4
add eax, locdwtmp10 ; tmp0
add edx, locdwtmp11 ; tmp1
add ecx, locdwtmp12 ; tmp2
add esi, locdwtmp13 ; tmp3
// Output butterflies. Workspace row k of this column is at [edi+32*k].
//   w3 / w4 = tmp13 +/- tmp0      w2 / w5 = tmp12 +/- tmp1
//   w1 / w6 = tmp11 +/- tmp2      w0 / w7 = tmp10 +/- tmp3
// Each result is rounded with const_round and shifted right by 11.
mov ebx, locdwtmp03
sub ebx, eax ; w4
add eax, locdwtmp03 ; w3
add ebx, const_round
mov edi, locdwwsptr ; keep in mind that wsptr stores 32 bit values
sar ebx, 11 ; So store/update the pointer accordingly
add eax, const_round
sar eax, 11
mov [edi+32*4], ebx
mov [edi+32*3], eax
mov ebx, locdwtmp02
mov eax, locdwtmp01
sub ebx, edx ; w5
add edx, locdwtmp02 ; w2
sub eax, ecx ; w6
add ecx, locdwtmp01 ; w1
add ebx, const_round
sar ebx, 11
add eax, const_round
sar eax, 11
add edx, const_round
add ecx, const_round
mov [edi+32*5], ebx
sar edx, 11
mov [edi+32*6], eax
sar ecx, 11
mov [edi+32*2], edx
mov eax, locdwtmp00
mov [edi+32*1], ecx
mov ebx, eax
sub eax, esi ; w7
add ebx, esi ; w0
add eax, const_round
sar eax, 11
add ebx, const_round
sar ebx, 11
mov [edi+32*7], eax
mov [edi+32*0], ebx
// Step to the next column: 2 bytes in the inputs, 4 bytes in the
// workspace. The add instructions change the flags, but dec eax comes
// after them and the following mov leaves flags alone, so jnz tests
// the decremented counter.
mov eax, locdwcounter
add locdwinptr, 2
add locdwwsptr, 4 ; wsptr stores 32 bit quantities
add locdwqptr, 2
dec eax
mov locdwcounter, eax
jnz idct_column
;; End of 1D-idct of all the columns
;; get ready for the 1D-idct of the rows
mov esi, wsptr
mov locdwcounter, 8
mov locdwrowctr, 0
mov locdwwsptr, esi
;; 1D-IDCT of all the eight rows
idct_row:
// edi = output_buf[row] + output_col, where locdwrowctr is the byte
// offset of the current row pointer and advances by 4 per row.
mov esi, locdwwsptr ; point to start of source
mov edi, output_buf
add edi, locdwrowctr
mov edi, [edi]
add locdwrowctr, 4
add edi, output_col ; this is the dest start addr for this row
;; do the even part
// Same arithmetic as the column pass, but the inputs are the 32-bit
// workspace values at [esi+4*k], so there is no dequantization or sign
// extension. The value loaded into edx from [edi] is discarded.
mov eax, [esi+4*2]
mov ecx, [esi+4*6]
mov ebx, eax ; copy of z2
mov edx, [edi] ; warm up the cache for writing this output row
imul eax, const_0_7653
add ebx, ecx ; z2 + z3
imul ecx, const_1_8477
imul ebx, const_0_54119 ; z1
add ecx, ebx ; tmp2
add eax, ebx ; tmp3
mov locdwtmp2, ecx
mov locdwtmp3, eax
mov ecx, [esi+4*4] ; C4
mov eax, [esi+4*0] ; C0
mov edx, ecx ; copy of z3
add ecx, eax ; z2 + z3
sub eax, edx ; z2 - z3
shl ecx, 13 ; tmp0
shl eax, 13 ; tmp1
mov ebx, ecx ; copy of tmp0
add ecx, locdwtmp3 ; tmp10
mov edx, eax ; copy of tmp1
add eax, locdwtmp2 ; tmp11
mov locdwtmp00, ecx
sub ebx, locdwtmp3 ; tmp13
mov locdwtmp01, eax
sub edx, locdwtmp2 ; tmp12
mov locdwtmp03, ebx
mov eax, [esi+4*7] ; C7 for the odd part
mov locdwtmp02, edx
;; now do the odd part
// Entries 7, 5, 3 and 1 become tmp0, tmp1, tmp2 and tmp3. Raw values go
// to locdwtmp0..3 and their scaled copies to locdwtmp10..13.
mov ecx, [esi+4*3]
mov ebx, [esi+4*1]
mov locdwtmp0, eax
mov locdwtmp2, ecx
mov eax, [esi+4*5]
mov locdwtmp3, ebx
imul ecx, const_3_072 ; tmp2
mov edx, locdwtmp0
imul ebx, const_1_501 ; tmp3
imul edx, const_0_2986 ; tmp0
mov locdwtmp1, eax ; store tmp1
imul eax, const_2_053 ; tmp1
mov locdwtmp10, edx
mov locdwtmp11, eax
mov locdwtmp12, ecx
mov locdwtmp13, ebx
// Cross sums z1..z4 and z5, as in the column pass. esi is overwritten
// here and reloaded with range_limit further down.
mov eax, locdwtmp0
mov ebx, locdwtmp1
mov ecx, eax
mov edx, ebx
add eax, locdwtmp3 ; z1
add edx, locdwtmp2 ; z2
add ebx, locdwtmp3 ; z4
add ecx, locdwtmp2 ; z3
mov esi, ecx ; copy of z3
imul eax, const_0_899 ; z1
imul edx, const_2_562 ; z2
add esi, ebx ; z3 + z4
imul esi, const_1_175 ; z5
imul ecx, const_1_961 ; z3
imul ebx, const_0_3901 ; z4
add ecx, esi ; z3
add ebx, esi ; z4
mov esi, eax ; copy of z1
add eax, ecx ; z1 + z3
add esi, ebx ; z1 + z4
add ecx, edx ; z3 + z2
add edx, ebx ; z2 + z4
add eax, locdwtmp10 ; tmp0
add edx, locdwtmp11 ; tmp1
add ecx, locdwtmp12 ; tmp2
add esi, locdwtmp13 ; tmp3
// From here locdwtmp0..3 hold the final odd-part sums, not the raw inputs.
mov locdwtmp0, eax
mov locdwtmp1, edx
mov locdwtmp2, ecx
mov locdwtmp3, esi
// Produce the outputs in pairs (3,7), (2,6), (1,5), (0,4). Each value is
// rounded, shifted right by 18, masked to 10 bits and used as a byte
// index into range_limit. The looked-up byte is placed in al (outputs
// 3..0) or dl (outputs 7..4), and the register is shifted left by 8
// before the next byte arrives. After the three shifts eax holds
// out3:out2:out1:out0 and edx holds out7:out6:out5:out4, lowest byte
// last, and whatever was in the upper bits beforehand has been shifted out.
mov ebx, locdwtmp03
add ebx, locdwtmp0 ; out3
mov ecx, locdwtmp00
sub ecx, locdwtmp3 ; out7
add ebx, const_round_row
sar ebx, 18
add ecx, const_round_row
sar ecx, 18
mov esi, range_limit
and ebx, const_mask
and ecx, const_mask
mov al, [esi][ebx]
mov dl, [esi][ecx]
mov ebx, locdwtmp02
mov ecx, locdwtmp01
add ebx, locdwtmp1 ; out2
sub ecx, locdwtmp2 ; out6
shl eax, 8 ; get ready to receive next output byte
add ebx, const_round_row
shl edx, 8 ; get ready to receive next output byte
add ecx, const_round_row
sar ebx, 18
sar ecx, 18
and ebx, const_mask
and ecx, const_mask
mov al, [esi][ebx]
mov dl, [esi][ecx]
mov ebx, locdwtmp01
mov ecx, locdwtmp02
add ebx, locdwtmp2 ; out1
shl eax, 8 ; get ready to receive next output byte
sub ecx, locdwtmp1 ; out5
shl edx, 8 ; get ready to receive next output byte
add ebx, const_round_row
sar ebx, 18
add ecx, const_round_row
sar ecx, 18
and ebx, const_mask
and ecx, const_mask
mov al, [esi][ebx] ; out1
mov dl, [esi][ecx] ; out5
mov ebx, locdwtmp00
mov ecx, locdwtmp03
add ebx, locdwtmp3 ; out0
shl eax, 8 ; get ready to receive next output byte
sub ecx, locdwtmp0 ; out4
shl edx, 8 ; get ready to receive next output byte
add ebx, const_round_row
sar ebx, 18
add ecx, const_round_row
sar ecx, 18
and ebx, const_mask
and ecx, const_mask
mov al, [esi][ebx] ; out0
mov dl, [esi][ecx] ; out4
// Two 32-bit stores write the eight output bytes; out0 lands at the
// lowest address because it sits in the low byte of eax.
mov [edi], eax ; store the first four bytes
mov [edi+4], edx ; store the next four bytes of this row
// Advance one workspace row (8 entries of 4 bytes). As in the column
// loop, dec eax is the last flag-setting instruction before jnz.
mov eax, locdwcounter
add locdwwsptr, 32 ; wsptr stores 32 bit quantities
dec eax
mov locdwcounter, eax
jnz idct_row
} //end of __asm
}
#endif /* DCT_ISLOW_SUPPORTED */