1119 lines
21 KiB
C
Raw Normal View History

2001-01-01 00:00:00 +01:00
// SAC MMx utilities
#include <memory.h>
#include "mmxutil.h"
#include "opt.h"
#define I2FTEST 0
#if I2FTEST
#include "stdio.h"
#endif
//------------------------------------------------------
int IsMMX() // does the processor I'm running have MMX(tm) technology?
{
    /*
     * Returns 1 if the CPU reports MMX support and 0 otherwise.
     *
     * Only the _X86_ build actually probes the processor.  Every other
     * build (including _ALPHA_, and targets that define neither macro)
     * returns 0 so the function never falls off its end without a value.
     */
#if defined(_X86_) && !defined(_ALPHA_)
    int retu;
    __asm
    {
        push ebx            // CPUID overwrites ebx, keep the caller's copy
        pushfd
        pop edx             // edx = current EFLAGS
        mov eax,edx         // eax = reference copy of EFLAGS
        xor edx,200000h     // flip bit 21 (the ID flag)
        push edx
        popfd               // try to write the modified flags back
        pushfd
        pop edx             // edx = EFLAGS as the CPU really kept them
        //
        // DON'T do this. This clears EAX, but the code is relying
        // on edx being 0 in the bail out case!!!
        //
        // -mikeg
        //
        // xor eax,edx
        //
        //
        xor edx,eax //This is the right way
        je no_cpuid         // bit did not toggle: no CPUID, edx is already 0
        mov eax,1           // CPUID function 1: feature flags returned in edx
        _emit 0x0f //CPUID magic incantation
        _emit 0xa2
        and edx,000800000h  // isolate feature bit 23
        shr edx,23          // reduce it to 0 or 1
    no_cpuid:
        mov retu,edx
        pop ebx
    }
    return(retu);
#else
    return 0;
#endif
}
//------------------------------------------------------
/* The following 4 routines make an 8-byte-aligned 'output' array
from an 'input' array with various alignments. MakeAlignedN assumes
that 'input' starts on an address equal to N mod 8. For now we
only handle even N.
*/
//------------------------------------------------------
/*
 * MakeAligned0: copy 'numbytes' bytes from 'input' into 'output'.
 *
 * This is the variant for an 'input' address equal to 0 mod 8.  The
 * implementation is a plain memcpy, so 'input' and 'output' must not
 * overlap.  A zero or negative 'numbytes' copies nothing; without the
 * guard a negative count would be converted to a huge size_t.
 */
void MakeAligned0(void *input, void *output, int numbytes)
{
    if (numbytes <= 0)
        return;
    memcpy(output, input, (size_t)numbytes);
}
//------------------------------------------------------
/*
 * MakeAligned2: copy 'numbytes' bytes from 'input' into 'output'.
 *
 * This is the variant for an 'input' address equal to 2 mod 8.  The
 * implementation is a plain memcpy, so 'input' and 'output' must not
 * overlap.  A zero or negative 'numbytes' copies nothing; without the
 * guard a negative count would be converted to a huge size_t.
 */
void MakeAligned2(void *input, void *output, int numbytes)
{
    if (numbytes <= 0)
        return;
    memcpy(output, input, (size_t)numbytes);
}
//------------------------------------------------------
/*
 * MakeAligned4: copy 'numbytes' bytes from 'input' into 'output'.
 *
 * This is the variant for an 'input' address equal to 4 mod 8.  The
 * implementation is a plain memcpy, so 'input' and 'output' must not
 * overlap.  A zero or negative 'numbytes' copies nothing; without the
 * guard a negative count would be converted to a huge size_t.
 */
void MakeAligned4(void *input, void *output, int numbytes)
{
    if (numbytes <= 0)
        return;
    memcpy(output, input, (size_t)numbytes);
}
//------------------------------------------------------
/*
 * MakeAligned6: copy 'numbytes' bytes from 'input' into 'output'.
 *
 * This is the variant for an 'input' address equal to 6 mod 8.  The
 * implementation is a plain memcpy, so 'input' and 'output' must not
 * overlap.  A zero or negative 'numbytes' copies nothing; without the
 * guard a negative count would be converted to a huge size_t.
 */
void MakeAligned6(void *input, void *output, int numbytes)
{
    if (numbytes <= 0)
        return;
    memcpy(output, input, (size_t)numbytes);
}
//------------------------------------------------------
int FloatToShortScaled(float *input, short *output, int len, int guard)
{
    /*
     * Dynamically scaled float -> short conversion.
     *
     * The largest exponent found in 'input' decides the scale: with
     * guard == 0 the largest power of two present in the input maps to
     * 16384, so every converted value fits in 16 bits.  A positive
     * 'guard' shifts the results that many additional bits to the right.
     *
     * Returns the value reported by FloatMaxExp() for 'input', i.e. the
     * exponent before 'guard' is added.
     */
    const int peakExp = FloatMaxExp(input, len);

    ScaleFloatToShort(input, output, len, peakExp + guard);
    return peakExp;
}
int FloatToIntScaled(float *input, int *output, int len, int guard)
{
    /*
     * Dynamically scaled float -> int conversion.
     *
     * The largest exponent found in 'input' decides the scale: with
     * guard == 0 the largest power of two present in the input maps to
     * 2^30, so every converted value fits in 32 bits.  A positive
     * 'guard' shifts the results that many additional bits to the right.
     *
     * Returns the value reported by FloatMaxExp() for 'input', i.e. the
     * exponent before 'guard' is added.
     */
    const int peakExp = FloatMaxExp(input, len);

    ScaleFloatToInt(input, output, len, peakExp + guard);
    return peakExp;
}
/*
 * FloatMaxExp: return the largest exponent field found in input[0..len-1].
 *
 * Each float is treated as a 32-bit pattern and masked with 0x7f800000,
 * which keeps the 8 exponent bits and drops sign and mantissa.  The
 * largest masked value is shifted down by 23, so the result is the raw
 * (still biased) exponent, e.g. 127 for 1.0f.
 *
 * The C path returns 0 when len <= 0.  The ASM_FTOSS path examines two
 * elements per pass and tests the count only after the pass, so it always
 * reads at least input[0] and input[1] and reads one element past an odd
 * 'len'.
 */
int FloatMaxExp(float *input, int len)
{
    int max;
#if ASM_FTOSS
    ASM
    {
        mov esi,input;
        xor eax,eax;
        mov ebx,len;
        xor edi,edi; // max
    loop2:
        mov ecx,DP[esi+4*eax];
        mov edx,DP[esi+4*eax+4];
        and ecx,07f800000h;
        and edx,07f800000h;
        cmp edi,ecx;
        jge skip1;
        mov edi,ecx;
    skip1:
        cmp edi,edx;
        jge skip2;
        mov edi,edx;
    skip2:
        add eax,2;
        cmp eax,ebx;
        jl loop2;
        mov max,edi;
    }
#else
    int i;
    max = 0;
    for (i=0; i<len; i++)
    {
        int bits;
        /* Copy the bytes instead of reading the float through an int
           pointer, which would break the strict-aliasing rule. */
        memcpy(&bits, &input[i], sizeof bits);
        bits &= 0x7f800000;
        if (bits > max)
            max = bits;
    }
#endif
    return max >> 23;
}
/*
 * ScaleFloatToShort: output[i] = (short)(input[i] * 2^k) for i < len,
 * where the power of two is derived from 'newmax'.
 *
 * 'newmax' is a biased exponent as returned by FloatMaxExp(), plus any
 * guard bits the caller wants.
 *
 * The C path truncates through a (short) cast.  The ASM_FTOSS path stores
 * with fistp, converts two elements per pass and tests the count only
 * after the pass, so it always processes at least two elements.
 */
void ScaleFloatToShort(float *input, short *output, int len, int newmax)
{
    unsigned int bits;
    float scale;
    /*
    If max exponent is 14, we want a scale factor of 1, since
    then values will be at most +/- 32767. So scale factor multiplier
    should be 2^(14 - max - guard). But 'max' has the exponent bias
    built in, so we must add BIAS once to the exponent to get a "real"
    exponent. But then we want a FP exponent that has bias, so we
    need to add BIAS again! So we get 2^(2*BIAS+14 - max - guard).
    2*BIAS+14 is 254 + 14 = 268 = 256+12 = 0x10C; placed in the exponent
    position (shifted left by 23) that is 0x86000000.
    */
    /* Build the pattern in unsigned arithmetic (0x86000000 does not fit in
       a signed int) and copy it into the float rather than reading the
       integer through a float pointer. */
    bits = 0x86000000u - ((unsigned int)newmax << 23);
    memcpy(&scale, &bits, sizeof scale);
#if ASM_FTOSS
    ASM
    {
        mov esi,input;
        mov edi,output;
        xor eax,eax;
        mov ebx,len;
    loop1:
        fld DP[esi+4*eax];
        fmul scale;
        fld DP[esi+4*eax+4];
        fmul scale;
        fxch(1);
        fistp WP[edi+2*eax];
        fistp WP[edi+2*eax+2];
        add eax,2;
        cmp eax,ebx;
        jl loop1;
    }
#else
    {
        int i;
        for (i=0; i<len; i++)
            output[i] = (short)(input[i]*scale);
    }
#endif
    return;
}
/*
 * ConstFloatToShort: output[k] = (short)(input[k] * scale) for k < len,
 * using a caller-supplied scale factor.
 *
 * The C path truncates through a (short) cast.  The ASM_FTOSS path stores
 * with fistp, converts two elements per pass and tests the count only
 * after the pass.
 */
void ConstFloatToShort(float *input, short *output, int len, float scale)
{
#if ASM_FTOSS
    ASM
    {
        mov esi,input;
        mov edi,output;
        xor eax,eax;
        mov ebx,len;
    loop1:
        fld DP[esi+4*eax];
        fmul scale;
        fld DP[esi+4*eax+4];
        fmul scale;
        fxch(1);
        fistp WP[edi+2*eax];
        fistp WP[edi+2*eax+2];
        add eax,2;
        cmp eax,ebx;
        jl loop1;
    }
#else
    int k = 0;
    while (k < len)
    {
        output[k] = (short)(input[k] * scale);
        ++k;
    }
#endif
    return;
}
//------------------------------------------------------
/*
 * ScaleFloatToInt: output[i] = (int)(input[i] * 2^k) for i < len, where
 * the power of two is derived from 'newmax'.
 *
 * 'newmax' is a biased exponent as returned by FloatMaxExp(), plus any
 * guard bits.  The scale is the float whose exponent field is
 * (2*BIAS + 30 - newmax): 254 + 30 = 284 = 0x11C, which shifted left by
 * 23 is 0x8E000000.  An input whose biased exponent equals 'newmax'
 * therefore lands at magnitude 2^30.
 *
 * The C path truncates through an (int) cast.  The ASM_FTOSS path stores
 * with fistp, converts two elements per pass and tests the count only
 * after the pass.
 */
void ScaleFloatToInt(float *input, int *output, int len, int newmax)
{
    unsigned int bits;
    float scale;
    /* Build the pattern in unsigned arithmetic (0x8E000000 does not fit in
       a signed int) and copy it into the float rather than reading the
       integer through a float pointer. */
    bits = 0x8E000000u - ((unsigned int)newmax << 23);
    memcpy(&scale, &bits, sizeof scale);
#if ASM_FTOSS
    ASM
    {
        mov esi,input;
        mov edi,output;
        xor eax,eax;
        mov ebx,len;
    loop1:
        fld DP[esi+4*eax];
        fmul scale;
        fld DP[esi+4*eax+4];
        fmul scale;
        fxch(1);
        fistp DP[edi+4*eax];
        fistp DP[edi+4*eax+4];
        add eax,2;
        cmp eax,ebx;
        jl loop1;
    }
#else
    {
        int i;
        for (i=0; i<len; i++)
            output[i] = (int)(input[i]*scale);
    }
#endif
    return;
}
/*
 * ConstFloatToInt: output[k] = (int)(input[k] * scale) for k < len, using
 * a caller-supplied scale factor.
 *
 * The C path truncates through an (int) cast.  The ASM_FTOSS path stores
 * with fistp, converts two elements per pass and tests the count only
 * after the pass.
 */
void ConstFloatToInt(float *input, int *output, int len, float scale)
{
#if ASM_FTOSS
    ASM
    {
        mov esi,input;
        mov edi,output;
        xor eax,eax;
        mov ebx,len;
    loop1:
        fld DP[esi+4*eax];
        fmul scale;
        fld DP[esi+4*eax+4];
        fmul scale;
        fxch(1);
        fistp DP[edi+4*eax];
        fistp DP[edi+4*eax+4];
        add eax,2;
        cmp eax,ebx;
        jl loop1;
    }
#else
    int k = 0;
    while (k < len)
    {
        output[k] = (int)(input[k] * scale);
        ++k;
    }
#endif
    return;
}
//------------------------------------------------------
/*
 * CorrelateInt: plain C sliding dot product.
 *
 * For every lag in 0..num-1:
 *     corr[lag] = sum over t in 0..len-1 of taps[t] * array[lag + t]
 * The products are formed and accumulated as int.  'array' is read up to
 * index num + len - 2.
 */
void CorrelateInt(short *taps, short *array, int *corr, int len, int num)
{
    int lag;

    for (lag = 0; lag < num; ++lag)
    {
        int *slot = &corr[lag];   /* result cell for this lag */
        int t = 0;

        *slot = 0;
        while (t < len)
        {
            *slot += (int)taps[t] * (int)array[lag + t];
            ++t;
        }
    }
}
#if ASM_CORR
//------------------------------------------------------
// MMX implementation of CorrelateInt4 (this copy is built when ASM_CORR is
// nonzero).  Each outer pass produces two results: one stored at byte
// offset 0 from the current corr pointer and one at byte offset 16.  The
// corr pointer then advances 32 bytes, 'array' advances 16 bytes and
// 'ncor' drops by 2.
// The parameters ntaps, taps, array and ncor are modified in place by the
// assembly below.
// NOTE(review): ASM and QP are not defined in this file; presumably they
// come from mmxutil.h or opt.h and expand to __asm / "qword ptr" - confirm.
void CorrelateInt4(short *taps, short *array, int *corr, int ntaps, int ncor)
{
// Register roles: three (a,b) working pairs for the unrolled pipeline,
// two accumulators, and the array / tap / result / counter registers.
#define rega0 mm0
#define regb0 mm1
#define rega1 mm2
#define regb1 mm3
#define rega2 mm4
#define regb2 mm5
#define acc0 mm6
#define acc1 mm7
#define arr esi
#define tap edi
#define cor eax
#define icnt ebx
// In the following macros, 'n' is the column number and 'i' is the
// iteration number.
// la: load array quadword i.      lb: load the quadword at tap+8*i+8.
// m0: regb = pmaddwd(regb, rega). m1: rega = pmaddwd(rega, quadword at tap+8*i).
// a0: acc0 += regb.               a1: acc1 += rega.
// Because 'taps' is backed up by 8 bytes below, lb reads quadword i of the
// caller's taps and m1 reads quadword i-1.
#define la(n,i) ASM movq rega##n,QP[arr+8*i]
#define lb(n,i) ASM movq regb##n,QP[tap+8*i+8]
#define m0(n,i) ASM pmaddwd regb##n,rega##n
#define m1(n,i) ASM pmaddwd rega##n,QP[tap+8*i]
#define a0(n,i) ASM paddd acc0,regb##n
#define a1(n,i) ASM paddd acc1,rega##n
ASM
{
// ntaps becomes a quadword count (4 shorts per quadword)
shr ntaps,2;
sub taps,8; // point to 1 before start of taps array
mov cor,corr;
ForEachCorrPair:
// per-pair setup: reload the counter, clear both accumulators, rewind pointers
mov icnt,ntaps;
pxor acc0,acc0;
pxor acc1,acc1;
mov tap,taps;
mov arr,array;
}
// prime the pump
la(0,0);
lb(0,0);
m0(0,0);
ASM pxor rega0,rega0; // to make first a1(0,0) a nop
la(1,1);
lb(1,1);
// Steady state: three quadwords are consumed per pass (arr and tap advance
// 24 bytes, icnt drops by 3), with loads, multiplies and adds of
// neighbouring columns interleaved.
inner:
la(2,2);
m0(1,1);
m1(1,1);
a0(0,0);
lb(2,2);
a1(0,0);
la(0,3);
m0(2,2);
m1(2,2);
a0(1,1);
lb(0,3);
a1(1,1);
la(1,4);
m0(0,3);
m1(0,3);
a0(2,2);
lb(1,4);
a1(2,2);
ASM add arr,24;
ASM add tap,24;
ASM sub icnt,3;
ASM jg inner;
// drain the last pending acc1 contribution
a1(0,0);
// Done with one correlation pair. First need to add halves of
// acc0 and acc1 together and then store 2 results in corr array
ASM
{
movq mm0,acc0;
psrlq acc0,32;
paddd acc0,mm0;
movq mm1,acc1;
psrlq acc1,32;
movd DP[cor],acc0;
paddd acc1,mm1;
movd DP[cor+16],acc1;
add cor,32;
add array,16;
sub ncor,2;
jg ForEachCorrPair;
// leave MMX state so x87 code can run afterwards
emms;
}
}
// Drop the function-local register aliases and pipeline macros.
#undef rega0
#undef regb0
#undef rega1
#undef regb1
#undef rega2
#undef regb2
#undef acc0
#undef acc1
#undef arr
#undef tap
#undef cor
#undef icnt
#undef la
#undef lb
#undef m0
#undef m1
#undef a0
#undef a1
#else
//------------------------------------------------------
/*
 * CorrelateInt4 (portable C version).
 *
 * Produces 'ncor' correlations with a stride of four: for n in 0..ncor-1,
 *     corr[4*n] = sum over t in 0..ntaps-1 of taps[t] * array[4*n + t]
 * Only every fourth element of 'corr' is written; the elements in between
 * are left untouched.
 */
void CorrelateInt4(short *taps, short *array, int *corr, int ntaps, int ncor)
{
    int n;

    for (n = 0; n < ncor; ++n)
    {
        const int base = 4 * n;       /* start offset into array and corr */
        int *slot = &corr[base];
        int t = 0;

        *slot = 0;
        while (t < ntaps)
        {
            *slot += (int)taps[t] * (int)array[base + t];
            ++t;
        }
    }
}
#endif
#if COMPILE_MMX
#undef icnt
// ab2abbcw: MMX routine that expands 16-bit words from 'input' into
// 'output'; 'n' is loaded into the loop counter.  One pass of the main
// loop advances 'in' by 16 bytes, 'out' by 32 bytes and the counter by 8,
// so the output is twice as large as the input.
// Each input quadword is loaded twice (see the i/2 in L): once to be
// expanded from its low two words (PL) and once from its high two words
// (PH).  Every expanded quadword is shifted up one word (SL), its top word
// is carried over (SR) and OR-ed into the bottom of the next one (O).
// NOTE(review): ASM and QP are not defined in this file; presumably they
// come from mmxutil.h or opt.h - confirm.
void ab2abbcw(const short *input, short *output, int n)
{
#define in edi
#define out esi
#define icnt ecx
// L  : load the input quadword at index i/2 into mm<m>
// PL : duplicate each of the two low words of mm<m>
// PH : duplicate each of the two high words of mm<m>
// SL : shift mm<m> left by one 16-bit word
// SR : shift mm<m> right by three words, leaving only its former top word
// O  : mm<m> |= mm<n>
// S  : store mm<m> to output quadword i
#define L(m,i) ASM movq mm##m,QP[in+8*(i/2)]
#define PL(m) ASM punpcklwd mm##m,mm##m
#define PH(m) ASM punpckhwd mm##m,mm##m
#define SL(m) ASM psllq mm##m,16
#define SR(m) ASM psrlq mm##m,48
#define O(m,n) ASM por mm##m,mm##n
#define S(m,i) ASM movq QP[out+8*i],mm##m
ASM {
mov in, input;
mov out, output;
mov icnt, n;
// mm3 starts as zero, so nothing is carried into the first output quadword
ASM pxor mm3,mm3;
// fewer than 8 words in total: go straight to the tail code
sub icnt, 8;
jl odd_ends;
}
//prime pump
L(0,0);
PL(0);
L(1,1);
SL(0);
PH(1);
SL(1);
O(3,0);
L(2,2);
SR(0);
S(3,0);
PL(2);
ASM sub icnt, 8;
ASM jl cleanup;
// Steady state: one pass stores output quadwords 1..4 relative to 'out'.
inner:
SL(2);
O(0,1);
// NOTE(review): the next line has no trailing ';', unlike every other
// macro use in this function.
L(3,3)
SR(1);
S(0,1);
PH(3);
SL(3);
O(1,2);
L(0,4);
SR(2);
S(1,2);
PL(0);
SL(0);
O(2,3);
L(1,5);
SR(3);
S(2,3);
PH(1);
SL(1);
O(3,0);
L(2,6);
SR(0);
S(3,4);
PL(2);
ASM add in, 16;
ASM add out, 32;
ASM sub icnt, 8;
ASM jg inner;
// Drain the quadwords still in flight when the main loop stops.
cleanup:
SL(2);
O(0,1);
L(3,2);
SR(1);
S(0,1);
PH(3);
SL(3);
O(1,2);
SR(2);
S(1,2);
O(2,3);
S(2,3);
// Tail: the counter is re-biased by +4; the branch skips to 'end' when the
// result is negative, otherwise one more input quadword is expanded.
odd_ends:
ASM add icnt, 8-4;
ASM jl end; // jump if no sign change
L(0,4);
SR(3);
PL(0);
L(1,5);
SL(0);
PH(1);
O(3,0);
SL(1);
SR(0);
S(3,4);
O(0,1);
S(0,5);
end:
// leave MMX state so x87 code can run afterwards
ASM emms;
#undef in
#undef out
#undef icnt
#undef L
#undef PL
#undef PH
#undef SL
#undef SR
#undef O
#undef S
return;
}
// ab2ababw: MMX routine that writes every 32-bit unit of 'input' (one pair
// of 16-bit words) to 'output' twice in a row.  'n' is loaded into the
// loop counter; one pass of the main loop advances 'in' by 16 bytes, 'out'
// by 32 bytes and the counter by 8.
// NOTE(review): ASM, QP and DP are not defined in this file; presumably
// they come from mmxutil.h or opt.h - confirm.
void ab2ababw(const short *input, short *output, int n)
{
#define in edi
#define out esi
#define icnt ecx
// L  : load 8 bytes starting at input dword i into mm<m>
// C  : mm<m> = mm<n>
// PL : replicate the low dword of mm<m> into both halves
// PH : replicate the high dword of mm<m> into both halves
// S  : store mm<m> to output quadword i
#define L(m,i) ASM movq mm##m,QP[in+4*i]
#define C(m,n) ASM movq mm##m,mm##n
#define PL(m) ASM punpckldq mm##m,mm##m
#define PH(m) ASM punpckhdq mm##m,mm##m
#define S(m,i) ASM movq [out+8*i],mm##m
ASM {
mov in, input;
mov out, output;
mov icnt, n;
// fewer than 8 words in total: go straight to the two-at-a-time tail loop
sub icnt, 8;
jl odd_ends;
}
//prime pump
L(0,0);
C(1,0);
PL(0);
L(2,2);
PH(1);
S(0,0);
C(3,2);
S(1,1);
PL(2);
ASM add in, 16;
ASM add out, 32;
ASM sub icnt, 8;
ASM jl cleanup;
// Steady state: the stores at indices -2 and -1 finish the previous pass
// (the pointers have already been advanced), then the next 16 input bytes
// are started.
inner:
L(0,0);
PH(3);
S(2,-2);
C(1,0);
S(3,-1);
PL(0);
L(2,2);
PH(1);
S(0,0);
C(3,2);
S(1,1);
PL(2);
ASM add in, 16;
ASM add out, 32;
ASM sub icnt, 8;
ASM jg inner;
// Flush the two quadwords still pending from the last pass.
cleanup:
PH(3);
S(2,-2);
S(3,-1);
// Tail: handle the remaining input one dword (two words) at a time.
odd_ends:
ASM add icnt, 8-2;
ASM jl end; // jump if no sign change
inner_by2:
ASM movd mm0, DP[in];
PL(0);
S(0,0);
ASM add in, 4;
ASM add out, 8;
ASM sub icnt, 2;
ASM jge inner_by2;
end:
// leave MMX state so x87 code can run afterwards
ASM emms;
return;
}
// Drop the function-local register aliases and helper macros.
#undef in
#undef out
#undef icnt
#undef L
#undef C
#undef PL
#undef PH
#undef S
// ConvMMX: MMX multiply-accumulate kernel over two arrays of 16-bit values.
// Each outer pass (ForEachCorrPair) accumulates pmaddwd products of
// 'input2' quadwords with 'input1' quadwords into acc0 and acc1 and stores
// both accumulators as two quadwords.  The output pointer starts at
// output + 4*ncor + 16 bytes and moves down 16 bytes per pass, so results
// are written from high addresses toward low ones.  After each pass
// 'input2' is advanced by 16 bytes and 'ncor' (already halved on entry)
// is reduced by 2.
// The parameters ncor and input2 are modified in place by the assembly.
// NOTE(review): ASM and QP are not defined in this file; presumably they
// come from mmxutil.h or opt.h - confirm.
void ConvMMX(short *input1, short *input2, int *output, int ncor)
{
// Register roles: three (a,b) working pairs, two accumulators, the two
// input pointers, the output pointer, the inner counter and a scratch.
#define rega0 mm0
#define regb0 mm1
#define rega1 mm2
#define regb1 mm3
#define rega2 mm4
#define regb2 mm5
#define acc0 mm6
#define acc1 mm7
#define in2 esi
#define in1 edi
#define out eax
#define icnt ecx
#define tmp ebx
// In the following macros, 'n' is the column number and 'i' is the
// iteration number.
// we use "the convolution trick" or using la twice so that one
// of the pmadd's is reg,reg and thus can be in the V-slot.
// NOTE: we have read ahead up to 2 quadwords
// so we need QP[taps+8*ncor] = QP[taps+8*ncor+8] = [0 0 0 0]
// and reading QP[array+8*ncor] or QP[array+8*ncor+8] must be legal
// NOTE(review): 'taps' and 'array' in the note above are not names used in
// this function; presumably they refer to input1 and input2 - confirm.
// la: load in2 quadword i.         lb: load in1 quadword i-1.
// m0: regb = pmaddwd(regb, rega).  m1: rega = pmaddwd(rega, in1 quadword i).
// a0: acc0 += regb.                a1: acc1 += rega.
#define la(n,i) ASM movq rega##n,QP[in2+8*i]
#define lb(n,i) ASM movq regb##n,QP[in1+8*i-8]
#define m0(n,i) ASM pmaddwd regb##n,rega##n
#define m1(n,i) ASM pmaddwd rega##n,QP[in1+8*i]
#define a0(n,i) ASM paddd acc0,regb##n
#define a1(n,i) ASM paddd acc1,rega##n
ASM
{
// tmp = 4*ncor (byte offset); ncor itself is halved before being used as
// the inner-loop count
mov tmp,ncor;
shl tmp,2;
shr ncor,1;
mov out,output;
add out,tmp;
add out,16;
mov in1,input1;
mov in2,input2;
mov icnt,ncor;
}
ForEachCorrPair:
// prime the pump
la(0,0);
ASM pxor regb0,regb0; // to avoid lb(0,0) reading taps[-1]
la(1,1);
ASM pxor acc0,acc0; // clear accumulator
m1(0,0);
ASM pxor acc1,acc1; // clear accumulator
lb(1,1);
ASM sub icnt, 1; // account for pump priming
ASM jle cleanup; // bypass if only one to do
// Steady state: three quadwords per pass (in1 and in2 advance 24 bytes,
// icnt drops by 3).
inner:
la(2,2);
m0(1,1);
m1(1,1);
a0(0,0);
lb(2,2);
a1(0,0);
la(0,3);
m0(2,2);
m1(2,2);
a0(1,1);
lb(0,3);
a1(1,1);
la(1,4);
m0(0,3);
m1(0,3);
a0(2,2);
lb(1,4);
a1(2,2);
ASM add in2,24;
ASM add in1,24;
ASM sub icnt,3;
ASM jg inner;
cleanup: // last two adds
a0(0,0);
a1(0,0);
// Done with one correlation pair. Pack and store 2 results in corr array
ASM
{
// move the output pointer down, rewind in1 to the start of input1 and
// restart in2 16 bytes further on (saved back into input2)
sub out,16;
mov in2, input2;
mov in1,input1;
add in2,16;
mov icnt, ncor;
mov input2, in2;
sub icnt,2; //set flags for jump
// movq and mov do not alter flags, so the jg below still tests the sub above
movq QP[out-16],acc0;
movq QP[out-8],acc1;
mov ncor, icnt;
jg ForEachCorrPair;
// leave MMX state so x87 code can run afterwards
emms;
}
}
// Drop the function-local register aliases and pipeline macros.
#undef rega0
#undef regb0
#undef rega1
#undef regb1
#undef rega2
#undef regb2
#undef acc0
#undef acc1
#undef in2
#undef in1
#undef out
#undef icnt
#undef tmp
#undef la
#undef lb
#undef m0
#undef m1
#undef a0
#undef a1
// 16 bit output
// psrad acc0,16;//this could be less in some cases
// psrad acc1,16;
// packssdw acc1,acc0;
// movq QP[cor-8],acc0;
//#else
//------------------------------------------------------
/*
void ConvMMX(short *in1, short *in2, int *out, int ncor)
{
int i,j;
for (i=0; i < 2*ncor; i+=4) {
int acc0 = 0, acc1 = 0;
for (j=0; j < 2*ncor - i; j+=4) {
acc0 += (int)taps[j]*array[i+j] + (int)taps[j+1]*array[i+j+1];
acc1 += (int)taps[j+2]*array[i+j+2] + (int)taps[j+3]*array[i+j+3];
}
corr[i/2] = acc0 ;
corr[i/2+1] = acc1 ;
}
return;
}*/
/*
 * ab2abzaw: expand each 32-bit unit of 'input' into two 32-bit units of
 * 'output'.
 *
 * The arrays are handled as raw 'unsigned' words (two shorts each); n/2
 * input words are processed.  For input word w:
 *     out[2*w]   = in[w]
 *     out[2*w+1] = (in[w] << 16) | (in[w-1] >> 16)
 * where in[-1] is taken as 0.  Words are visited from the highest index
 * down to 0.  Nothing is written when n/2 is 0 or negative.
 */
void ab2abzaw(const short *input, short *output, int n)
{
    const unsigned *src = (const unsigned *)input;
    unsigned *dst = (unsigned *)output;
    int w;

    for (w = n/2 - 1; w >= 0; --w) {
        /* previous word first, then the current one */
        const unsigned prev = (w > 0) ? src[w - 1] : 0;
        const unsigned cur = src[w];

        dst[2*w] = cur;
        dst[2*w + 1] = (cur << 16 | prev >> 16);
    }
}
// ShortToFloatScale: y[i] = x[i] * scale for the N 16-bit integers in x,
// written as floats to y (the commented-out loop below is the plain
// reference).
// The x87 code is software-pipelined: 'scale' stays at the bottom of the
// FPU stack while loads (fild), multiplies and stores of neighbouring
// elements overlap.  Elements are handled two at a time, starting at
// indices N-2 and N-1 and working down to 0 and 1.
// NOTE(review): the first loop pass reads x[N-6] and x[N-5] unconditionally
// and the epilogue addresses assume eax == -2, so this appears to require
// an even N of at least 6 - confirm against callers.
// NOTE(review): ASM and DP are not defined in this file; presumably they
// come from mmxutil.h or opt.h - confirm.
void ShortToFloatScale(short *x, float scale, int N, float *y)
{
/*
short i;
float yy[100];
for (i=0; i<N; i++)
{ yy[i]=x[i]*scale; }
ASM
{
mov esi,x;
mov edi,y;
lea ecx,scale;
mov eax, N
sub eax, 2
loop1:
fild WORD PTR [esi+eax*2]
fmul DWORD PTR [ecx]
fstp DWORD PTR [edi+eax*4]
fild WORD PTR [esi+eax*2+2]
fmul DWORD PTR [ecx]
fstp DWORD PTR [edi+eax*4+4]
sub eax, 2
jge loop1;
}
*/
// The trailing comments on each instruction show the FPU stack, top first:
// c = scale, L0/L1 = loaded values, M0/M1 = scaled values.
ASM
{
mov esi,x;
mov edi,y;
lea ecx,scale;
mov eax, N
sub eax, 6
fld DP [ecx] ; c
fild WORD PTR [esi+eax*2+8] ; L0 c
fild WORD PTR [esi+eax*2+10] ; L1 L0 c
fxch ST(1) ; L0 L1 c
fmul ST(0), ST(2) ; M0 L1 c
fxch ST(1) ; L1 M0 c
fmul ST(0),ST(2) ; M1 M0 c
fild WORD PTR [esi+eax*2+4] ; L0 M1 M0 c
fild WORD PTR [esi+eax*2+6]; L1 L0 M1 M0 c
fxch ST(3) ; M0 L0 M1 L1 c
fstp DWORD PTR [edi+eax*4+16]; L0 M1 L1 c
loop1: ; L0 M1 L1 c
fmul ST(0),ST(3) ; M0 M1 L1 c
fxch ST(1) ; M1 M0 L1 c
fstp DWORD PTR [edi+eax*4+20]; M0 L1 c
fxch ST(1) ; L1 M0 c
fmul ST(0),ST(2) ; M1 M0 c
fild WORD PTR [esi+eax*2] ; L0 M1 M0 c
fild WORD PTR [esi+eax*2+2] ; L1 L0 M1 M0 c
fxch ST(3) ; M0 L0 M1 L1 c
fstp DWORD PTR [edi+eax*4+8]; L0 M1 L1 c
sub eax, 2
jge loop1;
fmul ST(0),ST(3) ;eax==-2 M0 M1 L1 c
fxch ST(1) ; M1 M0 L1 c
fstp DWORD PTR [edi+eax*4+20] ; M0 L1 c
fxch ST(1) ; L1 M0 c
fmulp ST(2), st(0) ; M0 M1
fstp DWORD PTR [edi+eax*4+8] ; M1
fstp DWORD PTR [edi+eax*4+12] ;
}
/*
for (i=0; i<N; i++)
{
if (y[i]!=yy[i])
{
fprintf(stdout,"\nfloat problem\n");
break;
}
}
*/
}
//assumes N is even
// IntToFloatScale: y[i] = (float)x[i] * scale for the N 32-bit integers in
// x (the I2FTEST code computes the same thing in plain C for comparison).
// The x87 code is software-pipelined: 'scale' stays at the bottom of the
// FPU stack while loads (fild), multiplies and stores of neighbouring
// elements overlap.  Elements are handled two at a time, starting at
// indices N-2 and N-1 and working down to 0 and 1.
// NOTE(review): the first loop pass reads x[N-6] and x[N-5] unconditionally
// and the epilogue addresses assume eax == -2, so this appears to require
// an even N of at least 6 - confirm against callers.
// NOTE(review): ASM and DP are not defined in this file; presumably they
// come from mmxutil.h or opt.h - confirm.
void IntToFloatScale(int *x, float scale, int N, float *y)
{
#if I2FTEST //test code
// reference result, compared against y at the end of the function
int i;
float yy[1000];
for (i=0; i<N; i++)
{ yy[i]=(float)x[i]*scale; }
#endif //test code
#if 0 //simple code
//simple assembly version
ASM
{
mov esi,x;
mov edi,y;
lea ecx,scale;
mov eax, N
sub eax, 2
loop1:
fild DWORD PTR [esi+eax*4]
fmul DWORD PTR [ecx]
fstp DWORD PTR [edi+eax*4]
fild DWORD PTR [esi+eax*4+4]
fmul DWORD PTR [ecx]
fstp DWORD PTR [edi+eax*4+4]
sub eax, 2
jge loop1;
}
#endif //test code
// The trailing comments on each instruction show the FPU stack, top first:
// c = scale, L0/L1 = loaded values, M0/M1 = scaled values.
ASM
{
mov esi,x;
mov edi,y;
lea ecx,scale;
mov eax, N
sub eax, 6
fld DP [ecx] ; c
fild DWORD PTR [esi+eax*4+16] ; L0 c
fild DWORD PTR [esi+eax*4+20] ; L1 L0 c
fxch ST(1) ; L0 L1 c
fmul ST(0), ST(2) ; M0 L1 c
fxch ST(1) ; L1 M0 c
fmul ST(0),ST(2) ; M1 M0 c
fild DWORD PTR [esi+eax*4+8] ; L0 M1 M0 c
fild DWORD PTR [esi+eax*4+12];L1 L0 M1 M0 c
fxch ST(3) ; M0 L0 M1 L1 c
fstp DWORD PTR [edi+eax*4+16]; L0 M1 L1 c
loop1: ; L0 M1 L1 c
fmul ST(0),ST(3) ; M0 M1 L1 c
fxch ST(1) ; M1 M0 L1 c
fstp DWORD PTR [edi+eax*4+20]; M0 L1 c
fxch ST(1) ; L1 M0 c
fmul ST(0),ST(2) ; M1 M0 c
fild DWORD PTR [esi+eax*4] ; L0 M1 M0 c
fild DWORD PTR [esi+eax*4+4] ;L1 L0 M1 M0 c
fxch ST(3) ; M0 L0 M1 L1 c
fstp DWORD PTR [edi+eax*4+8]; L0 M1 L1 c
sub eax, 2
jge loop1;
fmul ST(0),ST(3) ;eax==-2 M0 M1 L1 c
fxch ST(1) ; M1 M0 L1 c
fstp DWORD PTR [edi+eax*4+20] ; M0 L1 c
fxch ST(1) ; L1 M0 c
fmulp ST(2), st(0) ; M0 M1
fstp DWORD PTR [edi+eax*4+8] ; M1
fstp DWORD PTR [edi+eax*4+12] ;
}
#if I2FTEST
// report every element where the assembly result differs from the reference
for (i=0; i<N; i++)
{
if (y[i]!=yy[i])
{
printf("F2I %3d %8f %8f\n", i, y[i], yy[i]);
}
}
#endif //test code
}
//assumes N is even
// IntToFloat: y[i] = (float)x[i] for the N 32-bit integers in x, with no
// scaling.  Two elements are converted per loop pass, starting at indices
// N-2 and N-1 and working down to 0 and 1.
// NOTE(review): the loop body runs before the counter is tested, so x[N-2]
// and x[N-1] are always read; this appears to require an even N of at
// least 2 - confirm against callers.
// NOTE(review): ASM is not defined in this file; presumably it comes from
// mmxutil.h or opt.h - confirm.
void IntToFloat(int *x, int N, float *y)
{
#if I2FTEST //test code
// reference result, compared against y at the end of the function
int i;
float yy[1000];
for (i=0; i<N; i++)
{ yy[i]=(float)x[i]; }
#endif //test code
//simple assembly version
ASM
{
mov esi,x;
mov edi,y;
mov eax, N
sub eax, 2
loop1:
// load both integers, then swap so the first one loaded is stored first
fild DWORD PTR [esi+eax*4]
fild DWORD PTR [esi+eax*4+4]
fxch ST(1) ;
fstp DWORD PTR [edi+eax*4]
fstp DWORD PTR [edi+eax*4+4]
sub eax, 2
jge loop1;
}
#if I2FTEST
// report every element where the assembly result differs from the reference
for (i=0; i<N; i++)
{
if (y[i]!=yy[i])
{
printf("F2I %3d %8f %8f\n", i, y[i], yy[i]);
}
}
#endif //test code
}
#endif