2020-09-30 16:53:55 +02:00

1021 lines
21 KiB
C

// cb53mmx.c
#include "cst_lbc.h"
#include "mmxutil.h"
#include "opt.h"
#include "exc_lbc.h"
#include "timer.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include "util_lbc.h"
#define ASM_CORHPL 1
#define ASM_CORHDL 1
#define TESTME 0
#define CHTEST 0
#if COMPILE_MMX
void CorrelateIntTri(short *taps, short *array, int *corr, int ncor);
void CorrelateInt22(short *taps, short *array, int *corr, int ncor);
void Cor_h_Xint(short h[],short X[],int D[]);
void Cor_hint0(short *H, int *rr);
void Cor_hint1(short *H, int *rr);
void cor_h_prodloop(int n, int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0);
void cor_h_diag(int n, int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0);
//------------------------------------------------------------
// ACELP fixed-codebook search for one subframe, using the integer (short/int)
// correlation kernels Cor_hint1() and Cor_h_Xint() defined in this file.
//   X[]        : target vector; converted to shorts here and also passed to G_code().
//   h[]        : impulse response. MODIFIED in place when T0 < SubFrLen-2.
//   T0         : lag of the fixed-gain pitch contribution.
//   code[]     : output; tmp_code[] scaled by gain_q, plus the pitch contribution.
//   ind_gain   : output; receives the value returned by G_code().
//   shift, sign: forwarded unchanged to D4i64_LBC().
//   gain_T0    : gain applied to the T0-delayed samples.
//   flags      : forwarded unchanged to D4i64_LBC().
// Returns the value returned by D4i64_LBC().
int ACELP_LBC_code_int(float X[], float h[], int T0, float code[],
int *ind_gain, int *shift, int *sign, float gain_T0, int flags)
{
int i, index;
float gain_q;
float Dn[SubFrLen2], tmp_code[SubFrLen2];
float rr[DIM_RR];
// Integer work buffers. DECLARE_* / ALIGN_ARRAY come from a project header;
// presumably they reserve storage and then align it for MMX loads - confirm.
DECLARE_INT(rrint, DIM_RR);
DECLARE_SHORT(hint, SubFrLen2);
DECLARE_INT(Dnint, SubFrLen2);
DECLARE_SHORT(Xint, SubFrLen2);
int XScale; // assigned below but not otherwise used in this function
float hScale;
int m;
#if 0//TESTME
float htest[SubFrLen], Xtest[SubFrLen];
for (i = 0; i<SubFrLen; i++)
{
htest[i] = i; //(float)(i<30?i:60-i);
Xtest[i] = (float)(i<30?i:60-i);
}
h = htest;
X = Xtest;
#endif //TESTME
// Include fixed-gain pitch contribution into impulse resp. h[]
// (this writes through the caller's h[] pointer)
if (T0 < SubFrLen-2)
for (i = T0; i < SubFrLen; i++)
h[i] += gain_T0*h[i-T0];
ALIGN_ARRAY(rrint);
ALIGN_ARRAY(hint);
ALIGN_ARRAY(Dnint);
ALIGN_ARRAY(Xint);
//hScale = FloatToShortScaled(h, hint, SubFrLen, 3);
// Scale h[] from its overall level rather than from its peak:
// hScale is sqrt(DotProd(h,h)/SubFrLen) (assuming DotProd returns the dot product).
hScale = (float)sqrt(DotProd(h,h,SubFrLen)/(double)SubFrLen);
// Bits 23..30 of hScale's bit pattern (assuming asint() reinterprets the float
// as an int); for an IEEE-754 single that field is the biased exponent.
m = (asint(hScale) & 0x7f800000) >> 23;
ScaleFloatToShort(h, hint, SubFrLen, m+3);
XScale = FloatToShortScaled(X, Xint, SubFrLen, 3); //would be better to normalize based on energy, not max
#if 0
for (i = 0; i<SubFrLen; i++)
{
hint[i] = i;
}
#endif
// Compute correlations of h[] needed for the codebook search
//TIMER_STAMP(a);
Cor_hint1(hint, rrint);
IntToFloat(rrint, DIM_RR, rr);
//TIMER_STAMP(b);
// Cor_h(h, rr);
////TIMER_STAMP(c);
#if CHTEST
// Debug cross-check: recompute with the plain-C Cor_hint0() and print every
// entry where the two results differ.
{
DECLARE_INT(rrint2, DIM_RR);
ALIGN_ARRAY(rrint2);//debug
Cor_hint0(hint, rrint2);
for(i = 0; i<DIM_RR; i++) //debug
if(rrint[i] != rrint2[i])
printf("%3d: %8d %8d %8d\n",i, rrint[i], rrint2[i], rrint[i] - rrint2[i]);
}
#endif //CHTEST
// Compute correlation of target vector with impulse response.
//TIMER_STAMP(c);
Cor_h_Xint(hint, Xint, Dnint);
//TIMER_STAMP(d);
IntToFloat(Dnint, SubFrLen, Dn);
//TIMER_STAMP(a);
#if TESTME //test
// Debug cross-check against the float routine Cor_h_X(); note that this
// overwrites Dn[] with the float result before the search below.
{
int fpDnint[SubFrLen2];
// float scale;
// scale =
Cor_h_X(h,X,Dn);
FloatToIntScaled(Dn, fpDnint, SubFrLen, 7);
for (i = 0; i<SubFrLen; i++)
if(fpDnint[i] != Dnint[i])
printf("%3d: %8x %8x %8x\n", i, Dnint[i] - fpDnint[i],Dnint[i], fpDnint[i]);
}
#endif //test
// Find codebook index
//TIMER_STAMP(c);
// rr is passed both as the 2nd and the 5th argument; presumably D4i64_LBC
// reuses it as an output buffer that G_code() then consumes - confirm.
index = D4i64_LBC(Dn, rr, h, tmp_code, rr, shift, sign, flags);
//TIMER_STAMP(f);
// Compute innovation vector gain.
// Include fixed-gain pitch contribution into code[].
*ind_gain = G_code(X, rr, &gain_q);
for (i=0; i < SubFrLen; i++)
code[i] = tmp_code[i]*gain_q;
if(T0 < SubFrLen-2)
for (i=T0; i < SubFrLen; i++)
code[i] += code[i-T0]*gain_T0;
return index;
}
//---------------------------------------------------------------
//---------------------------------------------------------------
void Cor_hint0(short *H, int *rr)
{
// Compute correlations of h[] needed for the codebook search.
// h[] :Impulse response.
// rr[] :Correlations.
int *rri0i0, *rri1i1, *rri2i2, *rri3i3;
int *rri0i1, *rri0i2, *rri0i3;
int *rri1i2, *rri1i3, *rri2i3;
int *p0, *p1, *p2, *p3;
int cor;
int i, k, m, t;
DECLARE_SHORT(h,SubFrLen2);
DECLARE_SHORT(h2,SubFrLen2);
ALIGN_ARRAY(h);
ALIGN_ARRAY(h2);
for(i=0; i<4; i++)
h[i] = (short)0;
for(i=0; i<SubFrLen; i++)
h2[i+2] = h[i+4] = H[i];
// Init pointers
rri0i0 = rr;
rri1i1 = rri0i0 + NB_POS;
rri2i2 = rri1i1 + NB_POS;
rri3i3 = rri2i2 + NB_POS;
rri0i1 = rri3i3 + NB_POS;
rri0i2 = rri0i1 + MSIZE;
rri0i3 = rri0i2 + MSIZE;
rri1i2 = rri0i3 + MSIZE;
rri1i3 = rri1i2 + MSIZE;
rri2i3 = rri1i3 + MSIZE;
// Compute rri0i0[], rri1i1[], rri2i2[] and rri3i3[]
cor = 0;
m = 0;
for(i=NB_POS-1; i>=0; i--)
{
cor += h[m+0]*h[m+0] + h[m+1]*h[m+1]; rri3i3[i] = cor;
cor += h[m+2]*h[m+2] + h[m+3]*h[m+3]; rri2i2[i] = cor;
cor += h[m+4]*h[m+4] + h[m+5]*h[m+5]; rri1i1[i] = cor;
cor += h[m+6]*h[m+6] + h[m+7]*h[m+7]; rri0i0[i] = cor;
m += 8;
}
// Compute elements of: rri0i1[], rri0i3[], rri1i2[] and rri2i3[]
h2 = h+2;
p3 = rri2i3 + MSIZE-1;
p2 = rri1i2 + MSIZE-1;
p1 = rri0i1 + MSIZE-1;
p0 = rri0i3 + MSIZE-2;
for (k=0; k<NB_POS; k++)
{
cor = 0;
m = 0;
t = 0;
for(i=k+1; i<NB_POS; i++)
{
cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor;
cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7]; p0[t] = cor;
t -= (NB_POS+1);
m += 8;
}
cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor;
h2 += STEP;
p3 -= NB_POS;
p2 -= NB_POS;
p1 -= NB_POS;
p0 -= 1;
}
// Compute elements of: rri0i2[], rri1i3[]
h2 = h+4;
p3 = rri1i3 + MSIZE-1;
p2 = rri0i2 + MSIZE-1;
p1 = rri1i3 + MSIZE-2;
p0 = rri0i2 + MSIZE-2;
for (k=0; k<NB_POS; k++)
{
cor = 0;
m = 0;
t = 0;
for(i=k+1; i<NB_POS; i++)
{
cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor;
cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7]; p0[t] = cor;
t -= (NB_POS+1);
m += 8;
}
cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
h2 += STEP;
p3 -= NB_POS;
p2 -= NB_POS;
p1 -= 1;
p0 -= 1;
}
// Compute elements of: rri0i1[], rri0i3[], rri1i2[] and rri2i3[]
h2 = h+6;
p3 = rri0i3 + MSIZE-1;
p2 = rri2i3 + MSIZE-2;
p1 = rri1i2 + MSIZE-2;
p0 = rri0i1 + MSIZE-2;
for (k=0; k<NB_POS; k++)
{
cor = 0;
m = 0;
t = 0;
for(i=k+1; i<NB_POS; i++)
{
cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor;
cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7]; p0[t] = cor;
t -= (NB_POS+1);
m += 8;
}
cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
h2 += STEP;
p3 -= NB_POS;
p2 -= 1;
p1 -= 1;
p0 -= 1;
}
return;
}
//---------------------------------------------------------------
// Run cor_h_prodloop() once per row of a triangular correlation table.
// The row started with NB_POS-1 complete groups and each later row has one
// fewer; every row also gets 'oddn' trailing outputs.  Between rows h2 moves
// forward by STEP shorts and each output pointer moves back by its own stride
// (dp3..dp0, in ints).
void cor_h_prods(int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0,int dp3,int dp2,int dp1,int dp0){
    int groups;

    for (groups = NB_POS - 1; groups >= 0; groups--)
    {
        cor_h_prodloop(groups, oddn, h, h2, p3, p2, p1, p0);

        h2 = h2 + STEP;
        p3 = p3 - dp3;
        p2 = p2 - dp2;
        p1 = p1 - dp1;
        p0 = p0 - dp0;
    }
}
#if _MSC_FULL_VER >= 13008827 && defined(_M_IX86)
#pragma warning(disable:4731) // EBP modified with inline asm
#endif
// Running cross-correlation of h[] with h2[], stored through four output streams.
// A running sum of pair products h[2j]*h2[2j] + h[2j+1]*h2[2j+1] is built up;
// after each pair the current sum is stored, cycling through p3, p2, p1, p0.
// After every four stores all four outputs step back by NB_POS+1 ints.
//   n     : number of complete groups of four outputs.
//   oddn  : extra outputs (0..3) after the last group; they go to p3, p2, p1.
//   h, h2 : the two short sequences.
//   p3..p0: address of the first output of each stream.
// The asm path (ASM_CORHPL != 0) relies on the ASM/QP/DP macros defined
// elsewhere (presumably __asm, qword ptr and dword ptr - confirm).
void cor_h_prodloop(int n, int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0)
{
#if ASM_CORHPL
// From here on n is the total number of outputs to store.
n = n * 4 + oddn;
// Register roles: in -> h, inoff = byte distance from h to h2, out -> p0 stream;
// eax/ebx/ebp hold the byte distances from p0 to p3/p2/p1, so out3/out2/out1
// address the other three streams relative to 'out'.
#define in edi
#define inoff edx
#define out esi
#define out3 out+eax
#define out2 out+ebx
#define out1 out+ebp
#define out0 out
// L: load 4 shorts of h.  M: multiply-add them with the matching 4 shorts of h2
// (two 32-bit pair sums).  S: move the high pair sum down to the low dword.
// AH/AL: add a pair sum into the running sum.  WH/WL: store the low dword.
#define L(m,n) ASM movq mm##m, QP[in+8*n]
#define M(m,n) ASM pmaddwd mm##m, QP[in+inoff+8*n]
#define S(m) ASM psrlq mm##m, 32
#define AH(m,n) ASM paddd mm##m, mm##n
#define WH(m,o) ASM movd DP[out##o], mm##m
#define AL(m,n) ASM paddd mm##m, mm##n
#define WL(m,o) ASM movd DP[out##o], mm##m
// EBP is used as a general register (saved/restored by push/pop).  p1 is loaded
// into it last; presumably no parameter can be addressed once EBP is
// overwritten - confirm against the compiler's frame layout.
ASM {
push ebp;
mov ecx, n;
mov in, h;
mov inoff, h2;
sub inoff, in;
mov out, p0;
mov eax, p3;
mov ebx, p2;
mov ebp, p1;
sub eax, out;
sub ebx, out;
sub ebp, out;
}
// Prologue: start the first two loads/multiplies before entering the loop.
L(0,0);
ASM pxor mm3,mm3;
M(0,0);
L(1,1);
AL(3,0); //really a copy
M(1,1);
S(0);
ASM sub ecx,8;
ASM jl oddends;
// Main loop: 8 outputs (two groups of four) per iteration, with the loads and
// multiplies for the following outputs interleaved between the stores.
inner:
L(2,2);
AH(0,3);
WL(3,3);
WH(0,2);
AL(0,1);
M(2,2);
S(1);
L(3,3);
AH(1,0);
WL(0,1);
WH(1,0);
AL(1,2);
M(3,3);
S(2);
// first group of four stored: step the outputs back by NB_POS+1 ints
ASM sub out, 4*(NB_POS+1);
L(0,4);
AH(2,1);
WL(1,3);
WH(2,2);
AL(2,3);
M(0,4);
S(3);
L(1,5);
AH(3,2);
WL(2,1);
WH(3,0);
AL(3,0);
M(1,5);
S(0);
// second group stored: step back again and advance the input by 16 shorts
ASM sub out, 4*(NB_POS+1);
ASM add in, 16*2;
ASM sub ecx, 8;
ASM jge inner;
// Fewer than 8 outputs remain.  If at least 4 remain, store one more full group
// and then up to three extras; otherwise go to 'cleanup'.
oddends:
ASM add ecx, 4;
ASM jl cleanup;
//four more
L(2,2);
AH(0,3);
WL(3,3);
WH(0,2);
AL(0,1);
M(2,2);
S(1);
L(3,3);
AH(1,0);
WL(0,1);
WH(1,0);
AL(1,2);
M(3,3);
S(2);
ASM sub out, 4*(NB_POS+1);
AH(2,1);
// ecx now counts the extras (0..3); each 'dec/jl' gates one more store
ASM dec ecx;
ASM jl innerdone;
WL(1,3);
ASM dec ecx;
ASM jl innerdone;
WH(2,2);
AL(2,3);
ASM dec ecx;
ASM jl innerdone;
WL(2,1);
ASM jmp innerdone;
// Fewer than 4 outputs remain: restore the count to 0..3 and store that many.
cleanup:
ASM add ecx, 4;
ASM dec ecx;
ASM jl innerdone;
AH(0,3);
WL(3,3);
ASM dec ecx;
ASM jl innerdone;
WH(0,2);
AL(0,1);
ASM dec ecx;
ASM jl innerdone;
WL(0,1);
innerdone:
// leave MMX state and restore the frame pointer
ASM emms;
ASM pop ebp;
#undef in
#undef inoff
#undef out
#undef out3
#undef out2
#undef out1
#undef out0
#undef L
#undef M
#undef S
#undef AH
#undef WH
#undef AL
#undef WL
#else //ASM_CORHPL
// Plain-C equivalent: n complete groups of four stores, then oddn extras.
int cor;
int i,m,t;
cor = 0;
m = 0;
t = 0;
for(i=n; i; i--)
{
cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor;
cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7]; p0[t] = cor;
t -= (NB_POS+1);
m += 8;
}
if(oddn >= 1) {
cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor;
if(oddn >= 2) {
cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor;
if(oddn >= 3) {
cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor;
}
}
}
#endif //ASM_CORHPL
return;
}
// Running correlation of h[] with h2[], stored HALVED through four output streams.
// A running sum of pair products h[2j]*h2[2j] + h[2j+1]*h2[2j+1] is built up;
// after each pair a halved value is stored, cycling through p3, p2, p1, p0.
// After every four stores all four outputs step back by one int.
//   n     : number of complete groups of four outputs.
//   oddn  : extra outputs (0..3) after the last group; they go to p3, p2, p1.
//   h, h2 : the two short sequences (Cor_hint1 passes the same buffer twice).
//   p3..p0: address of the first output of each stream.
// The asm path halves each pair sum before accumulating, the C path halves the
// running sum when storing, so the two may differ in rounding.
// The asm path (ASM_CORHDL != 0) relies on the ASM/QP/DP macros defined
// elsewhere (presumably __asm, qword ptr and dword ptr - confirm).
void cor_h_diag(int n, int oddn,short *h,short *h2,int *p3,int *p2,int *p1,int *p0)
{
#if ASM_CORHDL
// From here on n is the total number of outputs to store.
n = n * 4 + oddn;
// Register roles: in -> h, inoff = byte distance from h to h2, out -> p0 stream;
// eax/ebx/ebp hold the byte distances from p0 to p3/p2/p1.
#define in edi
#define inoff edx
#define out esi
#define out3 out+eax
#define out2 out+ebx
#define out1 out+ebp
#define out0 out
// L: load 4 shorts of h.  M: multiply-add with 4 shorts of h2 (two pair sums).
// R: arithmetic halving of both pair sums.  S: move the high pair sum down.
// AH/AL: add a pair sum into the running sum.  WH/WL: store the low dword.
#define L(m,n) ASM movq mm##m, QP[in+8*n]
#define M(m,n) ASM pmaddwd mm##m, QP[in+inoff+8*n]
#define R(m) ASM psrad mm##m, 1
#define S(m) ASM psrlq mm##m, 32
#define AH(m,n) ASM paddd mm##m, mm##n
#define WH(m,o) ASM movd DP[out##o], mm##m
#define AL(m,n) ASM paddd mm##m, mm##n
#define WL(m,o) ASM movd DP[out##o], mm##m
// EBP is used as a general register (saved/restored by push/pop); p1 is loaded
// into it last.
ASM {
push ebp;
mov ecx, n;
mov in, h;
mov inoff, h2;
sub inoff, in;
mov out, p0;
mov eax, p3;
mov ebx, p2;
mov ebp, p1;
sub eax, out;
sub ebx, out;
sub ebp, out;
}
// Prologue.
// NOTE(review): mm3 is copied from mm0 before R(0) halves it, so the very first
// pair sum enters the running sum unhalved.  The only caller in this file
// passes a buffer whose first four shorts are zero, which makes that pair sum
// zero - confirm before reusing this routine with other data.
L(0,0);
ASM pxor mm3,mm3;
M(0,0);
L(1,1);
AL(3,0); //really a copy
M(1,1);
R(0);
S(0);
ASM sub ecx,8;
ASM jl oddends;
// Main loop: 8 outputs (two groups of four) per iteration.
inner:
L(2,2);
AH(0,3);
WL(3,3);
R(1);
WH(0,2);
AL(0,1);
M(2,2);
S(1);
L(3,3);
AH(1,0);
WL(0,1);
R(2);
WH(1,0);
AL(1,2);
M(3,3);
S(2);
// first group of four stored: step the outputs back by one int
ASM sub out, 4*1;
L(0,4);
AH(2,1);
WL(1,3);
R(3);
WH(2,2);
AL(2,3);
M(0,4);
S(3);
L(1,5);
AH(3,2);
WL(2,1);
R(0);
WH(3,0);
AL(3,0);
M(1,5);
S(0);
// second group stored: step back again and advance the input by 16 shorts
ASM sub out, 4*1;
ASM add in, 16*2;
ASM sub ecx, 8;
ASM jge inner;
// Fewer than 8 outputs remain.  If at least 4 remain, store one more full group
// and then up to three extras; otherwise go to 'cleanup'.
oddends:
ASM add ecx, 4;
ASM jl cleanup;
//four more
L(2,2);
AH(0,3);
WL(3,3);
R(1);
WH(0,2);
AL(0,1);
M(2,2);
S(1);
L(3,3);
AH(1,0);
WL(0,1);
R(2);
WH(1,0);
AL(1,2);
M(3,3);
S(2);
ASM sub out, 4*1;
AH(2,1);
// ecx now counts the extras (0..3); each 'dec/jl' gates one more store
ASM dec ecx;
ASM jl innerdone;
WL(1,3);
ASM dec ecx;
ASM jl innerdone;
WH(2,2);
AL(2,3);
ASM dec ecx;
ASM jl innerdone;
WL(2,1);
ASM jmp innerdone;
// Fewer than 4 outputs remain: restore the count to 0..3 and store that many.
cleanup:
ASM add ecx, 4;
ASM dec ecx;
ASM jl innerdone;
AH(0,3);
WL(3,3);
ASM dec ecx;
ASM jl innerdone;
WH(0,2);
AL(0,1);
ASM dec ecx;
ASM jl innerdone;
WL(0,1);
innerdone:
// leave MMX state and restore the frame pointer
ASM emms;
ASM pop ebp;
#undef in
#undef inoff
#undef out
#undef out3
#undef out2
#undef out1
#undef out0
#undef L
#undef M
#undef R
#undef S
#undef AH
#undef WH
#undef AL
#undef WL
#else //ASM_CORHDL
// Plain-C equivalent: n complete groups of four stores, then oddn extras.
int cor;
int i,m,t;
cor = 0;
m = 0;
t = 0;
for(i=n; i; i--)
{
cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor>>1;
cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor>>1;
cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor>>1;
cor += h[m+6]*h2[m+6] + h[m+7]*h2[m+7]; p0[t] = cor>>1;
t -= 1;
m += 8;
}
// The trailing outputs are halved exactly like the ones stored in the loop.
if(oddn >= 1) {
cor += h[m+0]*h2[m+0] + h[m+1]*h2[m+1]; p3[t] = cor>>1;
if(oddn >= 2) {
cor += h[m+2]*h2[m+2] + h[m+3]*h2[m+3]; p2[t] = cor>>1;
if(oddn >= 3) {
cor += h[m+4]*h2[m+4] + h[m+5]*h2[m+5]; p1[t] = cor>>1;
}
}
}
#endif //ASM_CORHDL
return;
}
// Compute the correlations of the impulse response needed for the codebook
// search, using cor_h_diag() and cor_h_prods().
//   H[]  : impulse response (SubFrLen shorts).
//   rr[] : output; four NB_POS-long diagonal tables followed by six
//          MSIZE-long cross tables, in the order laid out below.
// The diagonal tables are produced by cor_h_diag(), which stores halved sums.
void Cor_hint1(short *H, int *rr)
{
int *rri0i0, *rri1i1, *rri2i2, *rri3i3;
int *rri0i1, *rri0i2, *rri0i3;
int *rri1i2, *rri1i3, *rri2i3;
int *p0, *p1, *p2, *p3;
short *h2;
int i;
DECLARE_SHORT(h,SubFrLen2);
DECLARE_SHORT(hp2,SubFrLen2);
ALIGN_ARRAY(h);
ALIGN_ARRAY(hp2);
// h[] is H[] behind four leading zeros.  hp2[] is the same data behind only
// two leading zeros, i.e. hp2[j] == h[j+2]; hp2 and hp2+4 are used below where
// Cor_hint0() uses h+2 and h+6 (presumably to keep the operands on the
// buffers' alignment - confirm).
for(i=0; i<4; i++)
h[i] = (short)0;
// The first sweep reads hp2[0..1]; give them the defined value h[2..3] have
// instead of leaving them uninitialized.
hp2[0] = hp2[1] = (short)0;
for(i=0; i<SubFrLen; i++)
hp2[i+2] = h[i+4] = H[i];
// Init pointers
rri0i0 = rr;
rri1i1 = rri0i0 + NB_POS;
rri2i2 = rri1i1 + NB_POS;
rri3i3 = rri2i2 + NB_POS;
rri0i1 = rri3i3 + NB_POS;
rri0i2 = rri0i1 + MSIZE;
rri0i3 = rri0i2 + MSIZE;
rri1i2 = rri0i3 + MSIZE;
rri1i3 = rri1i2 + MSIZE;
rri2i3 = rri1i3 + MSIZE;
//TIMER_STAMP(a);
// Compute rri0i0[], rri1i1[], rri2i2[] and rri3i3[]
// (each table is filled from its last entry downwards)
cor_h_diag(NB_POS,0,h,h,&rri3i3[NB_POS-1],&rri2i2[NB_POS-1],&rri1i1[NB_POS-1],&rri0i0[NB_POS-1]);
//TIMER_STAMP(b);
// Compute elements of: rri0i1[], rri0i3[], rri1i2[] and rri2i3[]
// (h against h delayed by 2; three trailing outputs per row)
h2 = hp2;
p3 = rri2i3 + MSIZE-1;
p2 = rri1i2 + MSIZE-1;
p1 = rri0i1 + MSIZE-1;
p0 = rri0i3 + MSIZE-2;
cor_h_prods(4-1,h,h2,p3,p2,p1,p0,NB_POS,NB_POS,NB_POS,1);
// Compute elements of: rri0i2[], rri1i3[]
// (h against h delayed by 4; two trailing outputs per row)
h2 = h+4;
p3 = rri1i3 + MSIZE-1;
p2 = rri0i2 + MSIZE-1;
p1 = rri1i3 + MSIZE-2;
p0 = rri0i2 + MSIZE-2;
cor_h_prods(4-2,h,h2,p3,p2,p1,p0,NB_POS,NB_POS,1,1);
// Compute elements of: rri0i1[], rri0i3[], rri1i2[] and rri2i3[]
// (h against h delayed by 6, addressed as hp2+4; one trailing output per row)
h2 = hp2+4;
p3 = rri0i3 + MSIZE-1;
p2 = rri2i3 + MSIZE-2;
p1 = rri1i2 + MSIZE-2;
p0 = rri0i1 + MSIZE-2;
cor_h_prods(4-3,h,h2,p3,p2,p1,p0,NB_POS,1,1,1);
//TIMER_STAMP(c);
return;
}
//---------------------------------------------------------------------------
// Correlate the target vector X[] with the impulse response h[] in integers.
//   h[] : impulse response (SubFrLen shorts).
//   X[] : target vector (SubFrLen shorts).
//   D[] : output correlations (SubFrLen ints).
// Both inputs are first expanded into interleaved work buffers hh[] / XX[]
// (layouts shown next to their declarations); the last 16 shorts of each
// buffer are zeroed because CorrelateIntTri() reads past the data.
void Cor_h_Xint(short h[],short X[],int D[])
{
int i;
DECLARE_SHORT(hh, 2*SubFrLen+16); //h[-1,0,0,1,1,2,2,3,3,4,4,5,...57,58,58,59]
DECLARE_SHORT(XX, 2*SubFrLen+16); //X[ 0,1,0,1,2,3,2,3,4,5,4,5,...58,59,58,59]
#if TESTME
// Debug: replace both inputs with all-ones test vectors.
short htest[SubFrLen], Xtest[SubFrLen];
for (i = 0; i<SubFrLen; i++)
{
htest[i] = 1;//(short)(i<30?i:60-i);
Xtest[i] = 1;//(short)(i<30?i:60-i);
}
h = htest;
X = Xtest;
#endif //TESTME
ALIGN_ARRAY(hh);
ALIGN_ARRAY(XX);
// zero padding behind the interleaved data
for (i=2*SubFrLen; i < 2*SubFrLen+16; i++) {
XX[i] = hh[i] = (short)0;
}
// hh += 8; XX += 8;
#define ASM_Cor_h_Xint 1
#if ASM_Cor_h_Xint
// ab2ababw / ab2abzaw are defined elsewhere; presumably they build the
// interleaved layouts documented above - confirm.
ab2ababw(X, XX, SubFrLen);
ab2abzaw(h, hh, SubFrLen);
//TIMER_STAMP(e);
CorrelateIntTri (hh, XX, D, SubFrLen);
#if TESTME
// Debug: print the asm result next to the plain-C CorrelateInt22() result.
{
int D2[SubFrLen];
CorrelateInt22 (hh, XX, D2, SubFrLen);
for (i = 0; i<SubFrLen; i++) {
// if(D[i] != D2[i])
printf("%3d: %6d %6d %6d ", i,D[i], D2[i], D[i] - D2[i]);
if(i&1) printf("\n");
}
}
#endif //TESTME
#else //ASM_Cor_h_Xint
// Plain-C fallback: build the interleaved buffers by hand, then correlate.
// NOTE(review): this path stores the sums shifted right by 16, whereas
// CorrelateInt22() stores them unshifted - confirm the intended scaling before
// enabling it.
for (i=0; i < SubFrLen; i+=2) {
hh[2*i] = (i-1 >= 0) ? h[i-1] : (short)0;
hh[2*i+1] = h[i];
hh[2*i+2] = h[i];
hh[2*i+3] = h[i+1];
XX[2*i] = X[i];
XX[2*i+1] = X[i+1];
XX[2*i+2] = X[i];
XX[2*i+3] = X[i+1];
}
for (i=0; i < 2*SubFrLen; i+=4) {
int acc0 = 0, acc1 = 0;
int j;
for (j=0; j < 2*SubFrLen - i; j+=4) {
acc0 += (int)hh[j]*XX[i+j] + (int)hh[j+1]*XX[i+j+1];
acc1 += (int)hh[j+2]*XX[i+j+2] + (int)hh[j+3]*XX[i+j+3];
}
D[i/2] = acc0 >> 16;
D[i/2+1] = acc1 >> 16;
}
#endif //ASM_Cor_h_Xint
return;
}
//---------------------------------------------------------------------------
#define ASM_CORR_TRI 1
//#if ASM_CORR_TRI
//------------------------------------------------------
// triangular correlations
// ASSUMES that array has 8 zero values beyond the end
// and can be read 8 more beyond that (without page fault etc)
// data format is
// taps: 0 t0 t0 t1 t1 t2 t2 t3 t3 t4 t4 t5 ... t57 t58 t58 t59
// arr: a0 a1 a0 a1 a2 a3 a2 a3 a4 a5 a4 a5 ... a58 a59 a58 a59
//
// MMX triangular correlation of 'taps' against 'array'.
//   taps, array : interleaved short buffers in the layouts shown above.
//   corr        : output ints; each outer pass stores two quadwords (16 bytes).
//   ncor        : number of correlations; halved in place on entry and then
//                 used as the outer-loop counter (decremented by 2 per pass).
// Relies on the ASM/QP macros defined elsewhere (presumably __asm and
// qword ptr - confirm).
void CorrelateIntTri(short *taps, short *array, int *corr, int ncor)
{
// Three rotating register pairs carry the unrolled multiplies; acc0/acc1 are
// the two accumulators stored at the end of each outer pass.
#define rega0 mm0
#define regb0 mm1
#define rega1 mm2
#define regb1 mm3
#define rega2 mm4
#define regb2 mm5
#define acc0 mm6
#define acc1 mm7
#define arr esi
#define tap edi
#define cor eax
#define icnt ecx
// In the following macros, 'n' is the column number and 'i' is the
// iteration number.
// we use "the convolution trick" of using la twice so that one
// of the pmadd's is reg,reg and thus can be in the V-slot.
// NOTE: we have read ahead up to 2 quadwords
// so we need QP[taps+8*ncor] = QP[taps+8*ncor+8] = [0 0 0 0]
// and reading QP[array+8*ncor] or QP[array+8*ncor+8] must be legal
// la: load a quadword of array.  lb: load the PREVIOUS quadword of taps.
// m0: regb *= rega (reg,reg).  m1: rega *= current quadword of taps.
// a0/a1: add the two products into acc0/acc1.
#define la(n,i) ASM movq rega##n,QP[arr+8*i]
#define lb(n,i) ASM movq regb##n,QP[tap+8*i-8]
#define m0(n,i) ASM pmaddwd regb##n,rega##n
#define m1(n,i) ASM pmaddwd rega##n,QP[tap+8*i]
#define a0(n,i) ASM paddd acc0,regb##n
#define a1(n,i) ASM paddd acc1,rega##n
ASM
{
shr ncor,1;
mov cor,corr;
mov tap,taps;
mov arr,array;
mov icnt,ncor;
}
// Outer loop: one pass per stored pair of quadwords.
ForEachCorrPair:
// prime the pump
la(0,0);
ASM pxor regb0,regb0; // to avoid lb(0,0) reading taps[-1]
la(1,1);
ASM pxor acc0,acc0; // clear accumulator
m1(0,0);
ASM pxor acc1,acc1; // clear accumulator
lb(1,1);
ASM sub icnt, 1; // account for pump priming
ASM jle cleanup; // bypass if only one to do
// Inner loop, unrolled three times: consumes 24 bytes of arr and of tap and
// counts icnt down by 3 per iteration.
inner:
la(2,2);
m0(1,1);
m1(1,1);
a0(0,0);
lb(2,2);
a1(0,0);
la(0,3);
m0(2,2);
m1(2,2);
a0(1,1);
lb(0,3);
a1(1,1);
la(1,4);
m0(0,3);
m1(0,3);
a0(2,2);
lb(1,4);
a1(2,2);
ASM add arr,24;
ASM add tap,24;
ASM sub icnt,3;
ASM jg inner;
cleanup: // last two adds
a0(0,0);
a1(0,0);
// Done with one correlation pair. Pack and store 2 results in corr array
// (acc1 goes to the lower address, acc0 to the higher one; each movq writes
// 8 bytes).  For the next pass 'array' is advanced by 16 bytes and written
// back, tap restarts at 'taps', and ncor is reduced by 2; loop while ncor > 0.
ASM
{
add cor,16;
mov arr, array
mov tap,taps;
add arr,16;
mov icnt, ncor;
mov array, arr;
sub icnt,2; //set flags for jump
movq QP[cor-16],acc1;
movq QP[cor-8],acc0;
mov ncor, icnt;
jg ForEachCorrPair;
emms;
}
}
#undef rega0
#undef regb0
#undef rega1
#undef regb1
#undef rega2
#undef regb2
#undef acc0
#undef acc1
#undef arr
#undef tap
#undef cor
#undef icnt
#undef la
#undef lb
#undef m0
#undef m1
#undef a0
#undef a1
// 16 bit output
// psrad acc0,16;//this could be less in some cases
// psrad acc1,16;
// packssdw acc1,acc0;
// movq QP[cor-8],acc0;
//#else
//------------------------------------------------------
// Plain-C triangular correlation over interleaved short buffers.
// The buffers are walked in groups of four shorts.  For every starting lag
// (0, 4, 8, ... below 2*ncor) two sums are formed over the remaining groups:
// one over the first pair of each group and one over the second pair.  The two
// sums for lag L are written to corr[L/2] and corr[L/2 + 1], unshifted.
void CorrelateInt22(short *taps, short *array, int *corr, int ncor)
{
    const int span = 2 * ncor;
    int lag = 0;

    while (lag < span)
    {
        const short *shifted = array + lag;
        int first_pair = 0;
        int second_pair = 0;
        int pos = 0;

        while (pos < span - lag)
        {
            first_pair  += (int)taps[pos]     * shifted[pos]
                         + (int)taps[pos + 1] * shifted[pos + 1];
            second_pair += (int)taps[pos + 2] * shifted[pos + 2]
                         + (int)taps[pos + 3] * shifted[pos + 3];
            pos += 4;
        }

        corr[lag / 2]     = first_pair;
        corr[lag / 2 + 1] = second_pair;
        lag += 4;
    }
}
//#endif
#endif //COMPILE_MMX