xserver-multidpi/fb/fbmmx.c
Søren Sandmann Pedersen cc3e0173d9 Thu Jul 22 20:03:11 2004 Soeren Sandmann <sandmann@daimi.au.dk>
Call MMX solid fill routine when available.
Call MMX operations when available.
New HasGcc34 macro
New file with many operations implemented with MMX intrinsics, conditional
    on having GCC 3.4 on i386.
2004-07-22 19:24:50 +00:00

1515 lines
31 KiB
C

/*
* Copyright © 2004 Red Hat, Inc.
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
* documentation, and that the name of Red Hat not be used in advertising or
* publicity pertaining to distribution of the software without specific,
* written prior permission. Red Hat makes no representations about the
* suitability of this software for any purpose. It is provided "as is"
* without express or implied warranty.
*
* RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL RED HAT
* BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* Author: Søren Sandmann (sandmann@redhat.com)
*
* Based on work by Owen Taylor
*/
#include "fb.h"
#ifdef USE_GCC34_MMX
#ifdef RENDER
#include "picturestr.h"
#include "mipict.h"
#include "fbpict.h"
/* Views of one 64-bit MMX value, declared through GCC machine modes. */
/* DI: a single 64-bit integer. */
typedef int Vector1x64 __attribute__ ((mode(DI)));
/* V2SI: two 32-bit lanes. */
typedef int Vector2x32 __attribute__ ((mode(V2SI)));
/* V4HI: four 16-bit lanes; used below for one pixel with a channel per lane. */
typedef int Vector4x16 __attribute__ ((mode(V4HI)));
/* V8QI: eight 8-bit lanes; used below for packed pixel data. */
typedef int Vector8x8 __attribute__ ((mode(V8QI)));
/* Scalar type used to build and read back 64-bit values. */
typedef unsigned long long ullong;
/*
 * Tracing switch.  Only "noVERBOSE" is defined here, so unless VERBOSE is
 * defined somewhere else CHECKPOINT() expands to nothing.  With VERBOSE
 * defined, CHECKPOINT() reports the current function and line via ErrorF.
 */
#define noVERBOSE
#ifdef VERBOSE
#define CHECKPOINT() ErrorF ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif
/*
 * Table of 64-bit constants used by the MMX helpers in this file.  The
 * values are assigned in the initializer of the static instance "c".
 */
typedef struct
{
/* all bits clear; used as the zero operand for unpack/pack */
ullong mmx_zero;
/* 0x00ff in every 16-bit lane; XORed in negate() */
ullong mmx_4x00ff;
/* 0x0080 in every 16-bit lane; added in pix_multiply() before the shift */
ullong mmx_4x0080;
/* mask applied in expand565() after the 565 fields are spread over lanes */
ullong mmx_565_rgb;
/* per-lane multiplier applied in expand565() */
ullong mmx_565_unpack_multiplier;
/* masks selecting the bits of each channel that pack565() keeps */
ullong mmx_565_r;
ullong mmx_565_g;
ullong mmx_565_b;
/* mmx_mask_N clears 16-bit lane N and keeps the others (pack565()) */
ullong mmx_mask_0;
ullong mmx_mask_1;
ullong mmx_mask_2;
ullong mmx_mask_3;
/* 0x00ff in the top lane; ORed into the alpha vector in over_rev_non_pre() */
ullong mmx_full_alpha;
/* lane-selection masks used by the non-pshufw invert_colors() */
ullong mmx_ffff0000ffff0000;
ullong mmx_0000ffff00000000;
ullong mmx_000000000000ffff;
} MMXData;
/*
 * The single constant table instance.  Designated initializers are used, so
 * the order here does not have to follow the field order of MMXData.
 */
static const MMXData c =
{
.mmx_zero = 0x0000000000000000ULL,
.mmx_4x00ff = 0x00ff00ff00ff00ffULL,
.mmx_4x0080 = 0x0080008000800080ULL,
/* field masks for the 565 value after expand565() has spread it out */
.mmx_565_rgb = 0x000001f0003f001fULL,
/* channel bits taken from an expanded pixel by pack565() */
.mmx_565_r = 0x000000f800000000ULL,
.mmx_565_g = 0x0000000000fc0000ULL,
.mmx_565_b = 0x00000000000000f8ULL,
/* each mask has exactly one 16-bit lane cleared */
.mmx_mask_0 = 0xffffffffffff0000ULL,
.mmx_mask_1 = 0xffffffff0000ffffULL,
.mmx_mask_2 = 0xffff0000ffffffffULL,
.mmx_mask_3 = 0x0000ffffffffffffULL,
.mmx_full_alpha = 0x00ff000000000000ULL,
.mmx_565_unpack_multiplier = 0x0000008404100840ULL,
.mmx_ffff0000ffff0000 = 0xffff0000ffff0000ULL,
.mmx_0000ffff00000000 = 0x0000ffff00000000ULL,
.mmx_000000000000ffff = 0x000000000000ffffULL,
};
/*
 * Shift a 64-bit value by a signed bit count: positive counts shift left,
 * negative counts shift right (logical), zero returns the value unchanged.
 */
static __inline__ Vector1x64
shift (Vector1x64 v, int s)
{
    if (s == 0)
        return v;

    if (s < 0)
        return __builtin_ia32_psrlq (v, -s);

    return __builtin_ia32_psllq (v, s);
}
/*
 * Flip the low byte of every 16-bit lane by XORing with 0x00ff, i.e. turn a
 * lane value v in 0..255 into 255 - v.
 */
static __inline__ Vector4x16
negate (Vector4x16 mask)
{
    Vector1x64 bits = (Vector1x64)mask;
    Vector1x64 low_bytes = (Vector1x64)c.mmx_4x00ff;

    return (Vector4x16)__builtin_ia32_pxor (bits, low_bytes);
}
/*
 * Per-lane multiply of two expanded pixels whose lanes hold values in
 * 0..255, returning round (a * b / 255) in every lane.
 *
 * The division by 255 uses the usual integer identity
 *
 *     t = a * b + 0x80;
 *     result = (t + (t >> 8)) >> 8;
 *
 * Shifting t right by 8 alone divides by 256 instead, which for example
 * maps 255 * 255 to 254, so a fully opaque source multiplied by a fully
 * set mask would not stay fully opaque.  For in-range inputs no
 * intermediate exceeds 16 bits (0xfe01 + 0x80 + 0xfe = 0xff7f), so the
 * wrapping 16-bit adds are safe.
 */
static __inline__ Vector4x16
pix_multiply (Vector4x16 a, Vector4x16 b)
{
    Vector4x16 t;

    t = __builtin_ia32_pmullw (a, b);
    t = __builtin_ia32_paddw (t, (Vector4x16)c.mmx_4x0080);
    /* correction term that turns the /256 into an exact, rounded /255 */
    t = __builtin_ia32_paddw (t, __builtin_ia32_psrlw (t, 8));
    t = __builtin_ia32_psrlw (t, 8);

    return t;
}
/*
 * HAVE_PSHUFW selects the pshufw-based versions of expand_alpha(),
 * expand_alpha_rev() and invert_colors() below.  The define sits inside
 * "#if 0", so it is disabled here and the shift/mask versions are built
 * unless HAVE_PSHUFW is defined elsewhere.
 */
#if 0
#define HAVE_PSHUFW
#endif
#ifdef HAVE_PSHUFW
/*
 * pshufw version: shuffle selector 0xFF picks source word 3 for every
 * destination word, so all four lanes of the result hold the top lane of
 * the input pixel.
 */
static __inline__ Vector4x16
expand_alpha (Vector4x16 pixel)
{
Vector4x16 result;
__asm__ ("pshufw $0xFF, %1, %0\n\t" : "=y" (result) : "y" (pixel));
return result;
}
/*
 * pshufw version: shuffle selector 0x00 picks source word 0 for every
 * destination word, so all four lanes of the result hold the lowest lane
 * of the input pixel.
 */
static __inline__ Vector4x16
expand_alpha_rev (Vector4x16 pixel)
{
Vector4x16 result;
__asm__ ("pshufw $0x00, %1, %0\n\t" : "=y" (result) : "y" (pixel));
return result;
}
/*
 * pshufw version: swap lanes 0 and 2 of the pixel and leave lanes 1 and 3
 * in place.
 */
static __inline__ Vector4x16
invert_colors (Vector4x16 pixel)
{
Vector4x16 result;
/* 0xC6 = 11000110 */
/* 3 0 1 2 */
/* each 2-bit field, read from the top, is the source word for dest word 3, 2, 1, 0 */
__asm__ ("pshufw $0xC6, %1, %0\n\t" : "=y" (result) : "y" (pixel));
return result;
}
#else
/*
 * Shift/or version: replicate the top 16-bit lane of the pixel into all
 * four lanes.
 */
static __inline__ Vector4x16
expand_alpha (Vector4x16 pixel)
{
    /* top lane moved down to lane 0, everything else zero */
    Vector1x64 alpha = shift ((Vector1x64)pixel, -48);
    /* lanes 0 and 1 */
    Vector1x64 pair = __builtin_ia32_por (alpha, shift (alpha, 16));

    /* copy lanes 0/1 into lanes 2/3 */
    return (Vector4x16)__builtin_ia32_por (pair, shift (pair, 32));
}
/*
 * Shift/or version: replicate the lowest 16-bit lane of the pixel into all
 * four lanes.
 */
static __inline__ Vector4x16
expand_alpha_rev (Vector4x16 pixel)
{
    /* shifting up and back down clears everything except lane 0 */
    Vector1x64 low = shift (shift ((Vector1x64)pixel, 48), -48);
    /* lanes 0 and 1 */
    Vector1x64 pair = __builtin_ia32_por (low, shift (low, 16));

    /* copy lanes 0/1 into lanes 2/3 */
    return (Vector4x16)__builtin_ia32_por (pair, shift (pair, 32));
}
/*
 * Mask/shift version: swap lanes 0 and 2 of the pixel and keep lanes 1 and
 * 3 where they are.
 */
static __inline__ Vector4x16
invert_colors (Vector4x16 pixel)
{
    Vector1x64 p = (Vector1x64)pixel;

    /* lanes 1 and 3 stay in place */
    Vector1x64 kept = __builtin_ia32_pand (p, (Vector1x64)c.mmx_ffff0000ffff0000);
    /* lane 0 moves up to lane 2 */
    Vector1x64 lane0_up = shift (
        __builtin_ia32_pand (p, (Vector1x64)c.mmx_000000000000ffff), 32);
    /* lane 2 moves down to lane 0 */
    Vector1x64 lane2_down = shift (
        __builtin_ia32_pand (p, (Vector1x64)c.mmx_0000ffff00000000), -32);

    kept = __builtin_ia32_por (kept, lane0_up);
    kept = __builtin_ia32_por (kept, lane2_down);

    return (Vector4x16)kept;
}
#endif
/* Notes about writing MMX code
*
* Give memory operands as the second operand. If a memory operand is given
* as the first operand, gcc will first load it into a register and then use
* that register.
*
* I.e. use
*
* __builtin_ia32_pmullw (x, mmx_constant[8]);
*
* not
*
* __builtin_ia32_pmullw (mmx_constant[8], x);
*
* Also try to minimize dependencies, i.e. when you need a value, try to
* calculate it from a value that was calculated as early as possible.
*/
/*
 * OVER operator on expanded pixels: scale dest by the complement of srca
 * with pix_multiply(), then add src with unsigned byte saturation.
 */
static __inline__ Vector4x16
over (Vector4x16 src, Vector4x16 srca, Vector4x16 dest)
{
    Vector4x16 dest_scaled = pix_multiply (dest, negate (srca));

    return (Vector4x16)__builtin_ia32_paddusb ((Vector8x8)src,
                                               (Vector8x8)dest_scaled);
}
/*
 * OVER for a source whose colour lanes are in reversed order and not yet
 * multiplied by alpha: swap lanes 0 and 2, multiply the colours by the
 * source alpha (the alpha lane itself is multiplied by 0xff), then
 * composite the result over dest.
 */
static __inline__ Vector4x16
over_rev_non_pre (Vector4x16 src, Vector4x16 dest)
{
    Vector4x16 alpha = expand_alpha (src);
    /* same as alpha, but with the top lane forced to 0x00ff */
    Vector1x64 alpha_full = __builtin_ia32_por ((Vector1x64)alpha,
                                                (Vector1x64)c.mmx_full_alpha);
    Vector4x16 premultiplied = pix_multiply (invert_colors (src),
                                             (Vector4x16)alpha_full);

    return over (premultiplied, alpha, dest);
}
/* IN operator: per-lane product of src and mask via pix_multiply(). */
static __inline__ Vector4x16
in (Vector4x16 src, Vector4x16 mask)
{
    Vector4x16 masked = pix_multiply (src, mask);

    return masked;
}
/*
 * Combined (src IN mask) OVER dest: both the source and its alpha vector
 * are multiplied by the mask before compositing over dest.
 */
static __inline__ Vector4x16
in_over (Vector4x16 src, Vector4x16 srca, Vector4x16 mask, Vector4x16 dest)
{
    Vector4x16 masked_src = in (src, mask);
    Vector4x16 masked_alpha = pix_multiply (srca, mask);

    return over (masked_src, masked_alpha, dest);
}
/* Zero-extend a 32-bit value to 64 bits and view it as eight bytes. */
static __inline__ Vector8x8
cvt32to64 (CARD32 v)
{
    ullong widened = v;

    return (Vector8x8)widened;
}
/*
 * Expand a packed 32-bit pixel into four 16-bit lanes, one per byte, by
 * interleaving its bytes with zero bytes.
 */
static __inline__ Vector4x16
load8888 (CARD32 v)
{
    Vector8x8 packed = cvt32to64 (v);
    Vector8x8 zero = (Vector8x8)c.mmx_zero;

    return (Vector4x16)__builtin_ia32_punpcklbw (packed, zero);
}
/*
 * Pack two expanded pixels back into bytes with unsigned saturation; lo
 * ends up in the low four bytes of the result and hi in the high four.
 */
static __inline__ Vector8x8
pack8888 (Vector4x16 lo, Vector4x16 hi)
{
    return __builtin_ia32_packuswb (lo, hi);
}
/* Expand 16 bits positioned at @pos (0-3) of an MMX register into 00RR00GG00BB
--- Expanding 565 in the low word ---
m = (m << (36 - 11)) | (m << (16 - 5)) | m;
m = m & (01f0003f001f);
m = m * (008404100840);
m = m >> 8;
Note the trick here - the top word is shifted by another nibble to avoid
it bumping into the middle word
*/
/*
 * Take the 16-bit 565 value in lane pos (0-3) of pixel and return it as an
 * expanded pixel with one 8-bit channel in each of the three low lanes.
 */
static __inline__ Vector4x16
expand565 (Vector4x16 pixel, int pos)
{
    /* bring the selected lane to the top, then down to lane 0; rest is zero */
    Vector1x64 low = shift (shift ((Vector1x64)pixel, (3 - pos) * 16), -48);
    /* copy shifted so the top field lands in lane 2 (4 bits above its base) */
    Vector1x64 high_copy = shift (low, 36 - 11);
    /* copy shifted so the middle field lands at the base of lane 1 */
    Vector1x64 mid_copy = shift (low, 16 - 5);
    Vector1x64 spread;
    Vector4x16 scaled;

    spread = __builtin_ia32_por (high_copy, low);
    spread = __builtin_ia32_por (mid_copy, spread);
    /* keep only the wanted field in each lane */
    spread = __builtin_ia32_pand (spread, (Vector1x64)c.mmx_565_rgb);

    /* widen each field to 8 bits: multiply, then drop the low byte */
    scaled = __builtin_ia32_pmullw ((Vector4x16)spread,
                                    (Vector4x16)c.mmx_565_unpack_multiplier);

    return __builtin_ia32_psrlw (scaled, 8);
}
/*
 * Expand one of the two packed 32-bit pixels held in a 64-bit value:
 * pos == 0 selects the low four bytes, any other pos the high four bytes.
 */
static __inline__ Vector4x16
expand8888 (Vector4x16 in, int pos)
{
    Vector8x8 bytes = (Vector8x8)in;
    Vector8x8 zero = (Vector8x8)c.mmx_zero;

    if (pos != 0)
        return (Vector4x16)__builtin_ia32_punpckhbw (bytes, zero);

    return (Vector4x16)__builtin_ia32_punpcklbw (bytes, zero);
}
/*
 * Convert an expanded pixel to 565 and insert it into 16-bit lane pos (0-3)
 * of target.  For pos in 0..3 the chosen lane of target is cleared first
 * and the other lanes are passed through; for any other pos nothing is
 * cleared.
 */
static __inline__ Vector4x16
pack565 (Vector4x16 pixel, Vector4x16 target, int pos)
{
    const int   lane_shift = pos * 16;
    Vector1x64  bits = (Vector1x64)pixel;
    Vector1x64  out = (Vector1x64)target;
    Vector1x64  red, green, blue;

    /* keep the top 5/6/5 bits of each channel and move them into place */
    red = shift (__builtin_ia32_pand (bits, (Vector1x64)c.mmx_565_r),
                 - (32 - 8) + lane_shift);
    green = shift (__builtin_ia32_pand (bits, (Vector1x64)c.mmx_565_g),
                   - (16 - 3) + lane_shift);
    blue = shift (__builtin_ia32_pand (bits, (Vector1x64)c.mmx_565_b),
                  - (0 + 3) + lane_shift);

    /* clear the destination lane */
    switch (pos)
    {
    case 0:
        out = __builtin_ia32_pand (out, (Vector1x64)c.mmx_mask_0);
        break;
    case 1:
        out = __builtin_ia32_pand (out, (Vector1x64)c.mmx_mask_1);
        break;
    case 2:
        out = __builtin_ia32_pand (out, (Vector1x64)c.mmx_mask_2);
        break;
    case 3:
        out = __builtin_ia32_pand (out, (Vector1x64)c.mmx_mask_3);
        break;
    }

    out = __builtin_ia32_por (red, out);
    out = __builtin_ia32_por (green, out);

    return (Vector4x16)__builtin_ia32_por (blue, out);
}
/*
 * Issue the "emms" instruction.  Every routine in this file that uses MMX
 * calls this once before returning.
 */
static __inline__ void
emms (void)
{
__asm__ __volatile__ ("emms");
}
/*
 * Composite a solid colour OVER a 32-bit destination rectangle.
 *
 * The solid value is read from pSrc.  If its top byte (alpha) is zero the
 * function returns without touching the destination.  Each row is handled
 * as: single pixels until the destination pointer is 8-byte aligned, then
 * two pixels per 64-bit access, then a possible final single pixel.
 * op, pMask and the source/mask coordinates are not used.
 */
void
fbCompositeSolid_nx8888mmx (CARD8       op,
                            PicturePtr  pSrc,
                            PicturePtr  pMask,
                            PicturePtr  pDst,
                            INT16       xSrc,
                            INT16       ySrc,
                            INT16       xMask,
                            INT16       yMask,
                            INT16       xDst,
                            INT16       yDst,
                            CARD16      width,
                            CARD16      height)
{
    CARD32      solid;
    CARD32      *row, *px;
    CARD16      left;
    FbStride    rowStride;
    Vector4x16  vsolid, valpha;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, solid);
    if ((solid >> 24) == 0)
        return;

    fbComposeGetStart (pDst, xDst, yDst, CARD32, rowStride, row, 1);

    vsolid = load8888 (solid);
    valpha = expand_alpha (vsolid);

    for (; height != 0; height--)
    {
        px = row;
        row += rowStride;
        left = width;

        CHECKPOINT();

        /* leading pixel, until px is 8-byte aligned */
        for (; left != 0 && ((unsigned long)px & 7) != 0; left--, px++)
        {
            *px = (ullong) pack8888 (over (vsolid, valpha, load8888 (*px)),
                                     (Vector4x16)c.mmx_zero);
        }

        /* two pixels per iteration */
        for (; left >= 2; left -= 2, px += 2)
        {
            Vector4x16 pair = *(Vector4x16 *)px;
            Vector4x16 lo = over (vsolid, valpha, expand8888 (pair, 0));
            Vector4x16 hi = over (vsolid, valpha, expand8888 (pair, 1));

            *(Vector8x8 *)px = (Vector8x8)pack8888 (lo, hi);
        }

        CHECKPOINT();

        /* trailing pixel */
        for (; left != 0; left--, px++)
        {
            *px = (ullong) pack8888 (over (vsolid, valpha, load8888 (*px)),
                                     (Vector4x16)c.mmx_zero);
        }
    }

    emms();
}
/*
 * Composite a solid colour OVER a 16-bit (565) destination rectangle.
 *
 * Returns immediately when the solid's top byte (alpha) is zero.  Each row
 * is handled as: single pixels until the destination pointer is 8-byte
 * aligned, then four pixels per 64-bit access, then the remaining pixels
 * one at a time.  op, pMask and the source/mask coordinates are not used.
 */
void
fbCompositeSolid_nx0565mmx (CARD8       op,
                            PicturePtr  pSrc,
                            PicturePtr  pMask,
                            PicturePtr  pDst,
                            INT16       xSrc,
                            INT16       ySrc,
                            INT16       xMask,
                            INT16       yMask,
                            INT16       xDst,
                            INT16       yDst,
                            CARD16      width,
                            CARD16      height)
{
    CARD32      solid;
    CARD16      *row, *px;
    CARD16      left;
    FbStride    rowStride;
    Vector4x16  vsolid, valpha;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, solid);
    if ((solid >> 24) == 0)
        return;

    fbComposeGetStart (pDst, xDst, yDst, CARD16, rowStride, row, 1);

    vsolid = load8888 (solid);
    valpha = expand_alpha (vsolid);

    for (; height != 0; height--)
    {
        px = row;
        row += rowStride;
        left = width;

        CHECKPOINT();

        /* leading pixels, until px is 8-byte aligned */
        for (; left != 0 && ((unsigned long)px & 7) != 0; left--, px++)
        {
            ullong      raw = *px;
            Vector4x16  expanded = expand565 ((Vector4x16)raw, 0);
            Vector4x16  packed = pack565 (over (vsolid, valpha, expanded),
                                          expanded, 0);

            /* only the low 16 bits survive the store */
            *px = (ullong)packed;
        }

        /* four pixels per iteration, updated lane by lane */
        for (; left >= 4; left -= 4, px += 4)
        {
            Vector4x16 quad = *(Vector4x16 *)px;

            quad = pack565 (over (vsolid, valpha, expand565 (quad, 0)), quad, 0);
            quad = pack565 (over (vsolid, valpha, expand565 (quad, 1)), quad, 1);
            quad = pack565 (over (vsolid, valpha, expand565 (quad, 2)), quad, 2);
            quad = pack565 (over (vsolid, valpha, expand565 (quad, 3)), quad, 3);

            *(Vector8x8 *)px = (Vector8x8)quad;
        }

        CHECKPOINT();

        /* trailing pixels */
        for (; left != 0; left--, px++)
        {
            ullong      raw = *px;
            Vector4x16  expanded = expand565 ((Vector4x16)raw, 0);
            Vector4x16  packed = pack565 (over (vsolid, valpha, expanded),
                                          expanded, 0);

            *px = (ullong)packed;
        }
    }

    emms();
}
/*
 * Composite a solid colour through a 32-bit per-channel mask onto a 32-bit
 * destination: dest = (solid IN mask) OVER dest, with each mask channel
 * applied to the matching colour channel.
 *
 * Returns immediately when the solid's top byte (alpha) is zero.  Pixels
 * whose mask word is zero are skipped.  Rows are processed as an unaligned
 * head, an 8-byte aligned body of pixel pairs, and a tail.
 */
void
fbCompositeSolidMask_nx8888x8888Cmmx (CARD8         op,
                                      PicturePtr    pSrc,
                                      PicturePtr    pMask,
                                      PicturePtr    pDst,
                                      INT16         xSrc,
                                      INT16         ySrc,
                                      INT16         xMask,
                                      INT16         yMask,
                                      INT16         xDst,
                                      INT16         yDst,
                                      CARD16        width,
                                      CARD16        height)
{
    CARD32      solid, solidAlpha;
    CARD32      *dstRow;
    CARD32      *maskRow;
    FbStride    dstRowStride, maskRowStride;
    Vector4x16  vsolid, valpha;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, solid);

    solidAlpha = solid >> 24;
    if (solidAlpha == 0)
        return;

    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstRowStride, dstRow, 1);
    fbComposeGetStart (pMask, xMask, yMask, CARD32, maskRowStride, maskRow, 1);

    vsolid = load8888(solid);
    valpha = expand_alpha(vsolid);

    for (; height != 0;
         height--, dstRow += dstRowStride, maskRow += maskRowStride)
    {
        int     left = width;
        CARD32  *mp = (CARD32 *)maskRow;
        CARD32  *dp = (CARD32 *)dstRow;

        /* head: single pixels until dp is 8-byte aligned */
        for (; left != 0 && ((unsigned long)dp & 7) != 0; left--, mp++, dp++)
        {
            CARD32 m = *(CARD32 *)mp;

            if (m == 0)
                continue;

            *dp = (ullong)pack8888 (in_over (vsolid, valpha, load8888 (m),
                                             load8888 (*dp)),
                                    (Vector4x16)c.mmx_zero);
        }

        /* body: two pixels per iteration */
        for (; left >= 2; left -= 2, mp += 2, dp += 2)
        {
            CARD32      m0 = mp[0];
            CARD32      m1 = mp[1];
            Vector4x16  pair, lo, hi;

            if ((m0 | m1) == 0)
                continue;

            pair = *(Vector4x16 *)dp;
            lo = in_over (vsolid, valpha, load8888 (m0), expand8888 (pair, 0));
            hi = in_over (vsolid, valpha, load8888 (m1), expand8888 (pair, 1));

            *(Vector8x8 *)dp = (Vector8x8)pack8888 (lo, hi);
        }

        /* tail */
        for (; left != 0; left--, mp++, dp++)
        {
            CARD32 m = *(CARD32 *)mp;

            if (m == 0)
                continue;

            *dp = (ullong)pack8888 (in_over (vsolid, valpha, load8888 (m),
                                             load8888 (*dp)),
                                    (Vector4x16)c.mmx_zero);
        }
    }

    emms();
}
/*
 * Composite a solid colour through an 8-bit mask onto a 32-bit destination:
 * dest = (solid IN mask) OVER dest.
 *
 * Returns immediately when the solid's top byte (alpha) is zero.  Pixels
 * with a zero mask byte are skipped.  In the aligned two-pixel loop, a
 * fully opaque solid combined with two 0xff mask bytes is stored directly
 * without reading the destination.
 */
void
fbCompositeSolidMask_nx8x8888mmx (CARD8         op,
                                  PicturePtr    pSrc,
                                  PicturePtr    pMask,
                                  PicturePtr    pDst,
                                  INT16         xSrc,
                                  INT16         ySrc,
                                  INT16         xMask,
                                  INT16         yMask,
                                  INT16         xDst,
                                  INT16         yDst,
                                  CARD16        width,
                                  CARD16        height)
{
    CARD32      solid, solidAlpha;
    CARD32      *dstRow, *dp;
    CARD8       *maskRow, *mp;
    FbStride    dstRowStride, maskRowStride;
    CARD16      left;
    Vector4x16  vsolid, valpha;
    ullong      solidPair;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, solid);

    solidAlpha = solid >> 24;
    if (solidAlpha == 0)
        return;

    /* the solid pixel twice, for the opaque fast path */
    solidPair = (unsigned long long)solid << 32 | solid;

    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstRowStride, dstRow, 1);
    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskRowStride, maskRow, 1);

    vsolid = load8888 (solid);
    valpha = expand_alpha (vsolid);

    for (; height != 0; height--)
    {
        dp = dstRow;
        dstRow += dstRowStride;
        mp = maskRow;
        maskRow += maskRowStride;
        left = width;

        CHECKPOINT();

        /* head: single pixels until dp is 8-byte aligned */
        for (; left != 0 && ((unsigned long)dp & 7) != 0; left--, mp++, dp++)
        {
            ullong m = *mp;

            if (m == 0)
                continue;

            *dp = (ullong)pack8888 (in_over (vsolid, valpha,
                                             expand_alpha_rev ((Vector4x16)m),
                                             load8888 (*dp)),
                                    (Vector4x16)c.mmx_zero);
        }

        CHECKPOINT();

        /* body: two pixels per iteration */
        for (; left >= 2; left -= 2, mp += 2, dp += 2)
        {
            ullong m0 = mp[0];
            ullong m1 = mp[1];

            if (solidAlpha == 0xff && (m0 & m1) == 0xff)
            {
                /* opaque solid, full coverage: plain store */
                *(unsigned long long *)dp = solidPair;
            }
            else if (m0 | m1)
            {
                Vector4x16 pair = *(Vector4x16 *)dp;
                Vector4x16 lo = in_over (vsolid, valpha,
                                         expand_alpha_rev ((Vector4x16)m0),
                                         expand8888 (pair, 0));
                Vector4x16 hi = in_over (vsolid, valpha,
                                         expand_alpha_rev ((Vector4x16)m1),
                                         expand8888 (pair, 1));

                *(Vector8x8 *)dp = (Vector8x8)pack8888 (lo, hi);
            }
        }

        CHECKPOINT();

        /* tail */
        for (; left != 0; left--, mp++, dp++)
        {
            ullong m = *mp;

            if (m == 0)
                continue;

            *dp = (ullong)pack8888 (in_over (vsolid, valpha,
                                             expand_alpha_rev ((Vector4x16)m),
                                             load8888 (*dp)),
                                    (Vector4x16)c.mmx_zero);
        }
    }

    emms();
}
/*
 * Composite a solid colour through an 8-bit mask onto a 16-bit (565)
 * destination: dest = (solid IN mask) OVER dest.
 *
 * Returns immediately when the solid's top byte (alpha) is zero.  Pixels
 * with a zero mask byte are skipped.  In the aligned four-pixel loop, a
 * fully opaque solid combined with four 0xff mask bytes is stored directly
 * as four copies of the solid converted to 565.
 */
void
fbCompositeSolidMask_nx8x0565mmx (CARD8         op,
                                  PicturePtr    pSrc,
                                  PicturePtr    pMask,
                                  PicturePtr    pDst,
                                  INT16         xSrc,
                                  INT16         ySrc,
                                  INT16         xMask,
                                  INT16         yMask,
                                  INT16         xDst,
                                  INT16         yDst,
                                  CARD16        width,
                                  CARD16        height)
{
    CARD32              solid, solidAlpha;
    CARD16              *dstRow, *dp;
    CARD8               *maskRow, *mp;
    FbStride            dstRowStride, maskRowStride;
    CARD16              left;
    Vector4x16          vsolid, valpha;
    unsigned long long  solidQuad, solid565;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, solid);

    solidAlpha = solid >> 24;
    if (solidAlpha == 0)
        return;

    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstRowStride, dstRow, 1);
    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskRowStride, maskRow, 1);

    vsolid = load8888 (solid);
    valpha = expand_alpha (vsolid);

    /* the solid as 565 in the low 16 bits, then replicated into all lanes */
    solid565 = (ullong)pack565(vsolid, (Vector4x16)c.mmx_zero, 0);
    solidQuad = (ullong)solid565 | (ullong)solid565 << 16;
    solidQuad = solidQuad | solidQuad << 32;

    for (; height != 0; height--)
    {
        dp = dstRow;
        dstRow += dstRowStride;
        mp = maskRow;
        maskRow += maskRowStride;
        left = width;

        CHECKPOINT();

        /* head: single pixels until dp is 8-byte aligned */
        for (; left != 0 && ((unsigned long)dp & 7) != 0; left--, mp++, dp++)
        {
            ullong m = *mp;

            if (m != 0)
            {
                ullong      raw = *dp;
                Vector4x16  result = in_over (vsolid, valpha,
                                              expand_alpha_rev ((Vector4x16)m),
                                              expand565 ((Vector4x16)raw, 0));

                *dp = (ullong)pack565 (result, (Vector4x16)c.mmx_zero, 0);
            }
        }

        CHECKPOINT();

        /* body: four pixels per iteration */
        for (; left >= 4; left -= 4, mp += 4, dp += 4)
        {
            ullong m0 = mp[0];
            ullong m1 = mp[1];
            ullong m2 = mp[2];
            ullong m3 = mp[3];

            if (solidAlpha == 0xff && (m0 & m1 & m2 & m3) == 0xff)
            {
                /* opaque solid, full coverage: plain store */
                *(unsigned long long *)dp = solidQuad;
            }
            else if (m0 | m1 | m2 | m3)
            {
                Vector4x16 quad = *(Vector4x16 *)dp;

                quad = pack565 (in_over (vsolid, valpha,
                                         expand_alpha_rev ((Vector4x16)m0),
                                         expand565 (quad, 0)), quad, 0);
                quad = pack565 (in_over (vsolid, valpha,
                                         expand_alpha_rev ((Vector4x16)m1),
                                         expand565 (quad, 1)), quad, 1);
                quad = pack565 (in_over (vsolid, valpha,
                                         expand_alpha_rev ((Vector4x16)m2),
                                         expand565 (quad, 2)), quad, 2);
                quad = pack565 (in_over (vsolid, valpha,
                                         expand_alpha_rev ((Vector4x16)m3),
                                         expand565 (quad, 3)), quad, 3);

                *(Vector4x16 *)dp = quad;
            }
        }

        CHECKPOINT();

        /* tail */
        for (; left != 0; left--, mp++, dp++)
        {
            ullong m = *mp;

            if (m != 0)
            {
                ullong      raw = *dp;
                Vector4x16  result = in_over (vsolid, valpha,
                                              expand_alpha_rev ((Vector4x16)m),
                                              expand565 ((Vector4x16)raw, 0));

                *dp = (ullong)pack565 (result, (Vector4x16)c.mmx_zero, 0);
            }
        }
    }

    emms();
}
/*
 * Composite a 32-bit source with reversed colour order and
 * non-premultiplied alpha OVER a 16-bit (565) destination.
 *
 * The function asserts that pSrc and pMask share the same drawable.  In
 * the aligned four-pixel loop, four fully opaque source pixels are
 * converted and stored without reading the destination, and four fully
 * transparent ones are skipped.
 */
void
fbCompositeSrc_8888RevNPx0565mmx (CARD8         op,
                                  PicturePtr    pSrc,
                                  PicturePtr    pMask,
                                  PicturePtr    pDst,
                                  INT16         xSrc,
                                  INT16         ySrc,
                                  INT16         xMask,
                                  INT16         yMask,
                                  INT16         xDst,
                                  INT16         yDst,
                                  CARD16        width,
                                  CARD16        height)
{
    CARD16      *dstRow, *dp;
    CARD32      *srcRow, *sp;
    FbStride    dstRowStride, srcRowStride;
    CARD16      left;

    CHECKPOINT();

    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstRowStride, dstRow, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcRowStride, srcRow, 1);

    assert (pSrc->pDrawable == pMask->pDrawable);

    for (; height != 0; height--)
    {
        dp = dstRow;
        dstRow += dstRowStride;
        sp = srcRow;
        srcRow += srcRowStride;
        left = width;

        CHECKPOINT();

        /* head: single pixels until dp is 8-byte aligned */
        for (; left != 0 && ((unsigned long)dp & 7) != 0; left--, dp++, sp++)
        {
            Vector4x16  vs = load8888 (*sp);
            ullong      raw = *dp;
            Vector4x16  expanded = expand565 ((Vector4x16)raw, 0);
            Vector4x16  packed = pack565 (over_rev_non_pre (vs, expanded),
                                          expanded, 0);

            *dp = (ullong)packed;
        }

        CHECKPOINT();

        /* body: four pixels per iteration */
        for (; left >= 4; left -= 4, dp += 4, sp += 4)
        {
            CARD32          s0 = sp[0];
            CARD32          s1 = sp[1];
            CARD32          s2 = sp[2];
            CARD32          s3 = sp[3];
            unsigned char   a0 = (s0 >> 24);
            unsigned char   a1 = (s1 >> 24);
            unsigned char   a2 = (s2 >> 24);
            unsigned char   a3 = (s3 >> 24);

            if ((a0 & a1 & a2 & a3) == 0xFF)
            {
                /* all four opaque: destination is simply replaced */
                Vector4x16 quad;

                quad = pack565 (invert_colors (load8888 (s0)),
                                (Vector4x16)c.mmx_zero, 0);
                quad = pack565 (invert_colors (load8888 (s1)), quad, 1);
                quad = pack565 (invert_colors (load8888 (s2)), quad, 2);
                quad = pack565 (invert_colors (load8888 (s3)), quad, 3);

                *(Vector4x16 *)dp = quad;
            }
            else if (a0 | a1 | a2 | a3)
            {
                Vector4x16 quad = *(Vector4x16 *)dp;

                quad = pack565 (over_rev_non_pre (load8888 (s0),
                                                  expand565 (quad, 0)), quad, 0);
                quad = pack565 (over_rev_non_pre (load8888 (s1),
                                                  expand565 (quad, 1)), quad, 1);
                quad = pack565 (over_rev_non_pre (load8888 (s2),
                                                  expand565 (quad, 2)), quad, 2);
                quad = pack565 (over_rev_non_pre (load8888 (s3),
                                                  expand565 (quad, 3)), quad, 3);

                *(Vector4x16 *)dp = quad;
            }
        }

        CHECKPOINT();

        /* tail */
        for (; left != 0; left--, dp++, sp++)
        {
            Vector4x16  vs = load8888 (*sp);
            ullong      raw = *dp;
            Vector4x16  expanded = expand565 ((Vector4x16)raw, 0);
            Vector4x16  packed = pack565 (over_rev_non_pre (vs, expanded),
                                          expanded, 0);

            *dp = (ullong)packed;
        }
    }

    emms();
}
/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
/*
 * Composite such a source OVER a 32-bit destination.
 *
 * The function asserts that pSrc and pMask share the same drawable.  In
 * the aligned two-pixel loop, two fully opaque source pixels are converted
 * and stored without reading the destination, and two fully transparent
 * ones are skipped.
 */
void
fbCompositeSrc_8888RevNPx8888mmx (CARD8         op,
                                  PicturePtr    pSrc,
                                  PicturePtr    pMask,
                                  PicturePtr    pDst,
                                  INT16         xSrc,
                                  INT16         ySrc,
                                  INT16         xMask,
                                  INT16         yMask,
                                  INT16         xDst,
                                  INT16         yDst,
                                  CARD16        width,
                                  CARD16        height)
{
    CARD32      *dstRow, *dp;
    CARD32      *srcRow, *sp;
    FbStride    dstRowStride, srcRowStride;
    CARD16      left;

    CHECKPOINT();

    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstRowStride, dstRow, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcRowStride, srcRow, 1);

    assert (pSrc->pDrawable == pMask->pDrawable);

    for (; height != 0; height--)
    {
        dp = dstRow;
        dstRow += dstRowStride;
        sp = srcRow;
        srcRow += srcRowStride;
        left = width;

        /* head: single pixels until dp is 8-byte aligned */
        for (; left != 0 && ((unsigned long)dp & 7) != 0; left--, dp++, sp++)
        {
            Vector4x16 vs = load8888 (*sp);
            Vector4x16 vd = load8888 (*dp);

            *dp = (ullong)pack8888 (over_rev_non_pre (vs, vd),
                                    (Vector4x16)c.mmx_zero);
        }

        /* body: two pixels per iteration */
        for (; left >= 2; left -= 2, dp += 2, sp += 2)
        {
            ullong          s0 = sp[0];
            ullong          s1 = sp[1];
            unsigned char   a0 = (s0 >> 24);
            unsigned char   a1 = (s1 >> 24);
            Vector4x16      lo, hi;

            if ((a0 & a1) == 0xFF)
            {
                /* both opaque: destination is simply replaced */
                lo = invert_colors (load8888 (s0));
                hi = invert_colors (load8888 (s1));

                *(Vector8x8 *)dp = pack8888 (lo, hi);
            }
            else if (a0 | a1)
            {
                Vector4x16 pair = *(Vector4x16 *)dp;

                lo = over_rev_non_pre (load8888 (s0), expand8888 (pair, 0));
                hi = over_rev_non_pre (load8888 (s1), expand8888 (pair, 1));

                *(Vector8x8 *)dp = pack8888 (lo, hi);
            }
        }

        /* tail */
        for (; left != 0; left--, dp++, sp++)
        {
            Vector4x16 vs = load8888 (*sp);
            Vector4x16 vd = load8888 (*dp);

            *dp = (ullong)pack8888 (over_rev_non_pre (vs, vd),
                                    (Vector4x16)c.mmx_zero);
        }
    }

    emms();
}
/*
 * Composite a solid colour through a 32-bit per-channel mask onto a 16-bit
 * (565) destination: dest = (solid IN mask) OVER dest.
 *
 * Returns immediately when the solid's top byte (alpha) is zero.  Pixels
 * whose mask word is zero are skipped.  Rows are processed as an unaligned
 * head, an 8-byte aligned body of four pixels at a time, and a tail.
 */
void
fbCompositeSolidMask_nx8888x0565Cmmx (CARD8         op,
                                      PicturePtr    pSrc,
                                      PicturePtr    pMask,
                                      PicturePtr    pDst,
                                      INT16         xSrc,
                                      INT16         ySrc,
                                      INT16         xMask,
                                      INT16         yMask,
                                      INT16         xDst,
                                      INT16         yDst,
                                      CARD16        width,
                                      CARD16        height)
{
    CARD32      solid, solidAlpha;
    CARD16      *dstRow;
    CARD32      *maskRow;
    FbStride    dstRowStride, maskRowStride;
    Vector4x16  vsolid, valpha;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, solid);

    solidAlpha = solid >> 24;
    if (solidAlpha == 0)
        return;

    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstRowStride, dstRow, 1);
    fbComposeGetStart (pMask, xMask, yMask, CARD32, maskRowStride, maskRow, 1);

    vsolid = load8888 (solid);
    valpha = expand_alpha (vsolid);

    for (; height != 0;
         height--, maskRow += maskRowStride, dstRow += dstRowStride)
    {
        int     left = width;
        CARD32  *mp = (CARD32 *)maskRow;
        CARD16  *dp = (CARD16 *)dstRow;

        /* head: single pixels until dp is 8-byte aligned */
        for (; left != 0 && ((unsigned long)dp & 7) != 0; left--, mp++, dp++)
        {
            CARD32 m = *(CARD32 *)mp;

            if (m != 0)
            {
                ullong      raw = *dp;
                Vector4x16  expanded = expand565 ((Vector4x16)raw, 0);
                Vector4x16  packed = pack565 (in_over (vsolid, valpha,
                                                       load8888 (m), expanded),
                                              expanded, 0);

                *dp = (ullong)packed;
            }
        }

        /* body: four pixels per iteration */
        for (; left >= 4; left -= 4, mp += 4, dp += 4)
        {
            CARD32 m0 = mp[0];
            CARD32 m1 = mp[1];
            CARD32 m2 = mp[2];
            CARD32 m3 = mp[3];

            if ((m0 | m1 | m2 | m3))
            {
                Vector4x16 quad = *(Vector4x16 *)dp;

                quad = pack565 (in_over (vsolid, valpha, load8888 (m0),
                                         expand565 (quad, 0)), quad, 0);
                quad = pack565 (in_over (vsolid, valpha, load8888 (m1),
                                         expand565 (quad, 1)), quad, 1);
                quad = pack565 (in_over (vsolid, valpha, load8888 (m2),
                                         expand565 (quad, 2)), quad, 2);
                quad = pack565 (in_over (vsolid, valpha, load8888 (m3),
                                         expand565 (quad, 3)), quad, 3);

                *(Vector4x16 *)dp = quad;
            }
        }

        /* tail */
        for (; left != 0; left--, mp++, dp++)
        {
            CARD32 m = *(CARD32 *)mp;

            if (m != 0)
            {
                ullong      raw = *dp;
                Vector4x16  expanded = expand565 ((Vector4x16)raw, 0);
                Vector4x16  packed = pack565 (in_over (vsolid, valpha,
                                                       load8888 (m), expanded),
                                              expanded, 0);

                *dp = (ullong)packed;
            }
        }
    }

    emms ();
}
/*
 * Saturating ADD of an 8-bit source rectangle onto an 8-bit destination
 * rectangle: every destination byte becomes min (dst + src, 255).
 *
 * Each row is handled as: single bytes until the destination pointer is
 * 8-byte aligned, then eight bytes per MMX paddusb, then the remaining
 * bytes one at a time.  op, pMask and the mask coordinates are not used.
 */
void
fbCompositeSrcAdd_8000x8000mmx (CARD8       op,
                                PicturePtr  pSrc,
                                PicturePtr  pMask,
                                PicturePtr  pDst,
                                INT16       xSrc,
                                INT16       ySrc,
                                INT16       xMask,
                                INT16       yMask,
                                INT16       xDst,
                                INT16       yDst,
                                CARD16      width,
                                CARD16      height)
{
    CARD8       *dstLine, *dst;
    CARD8       *srcLine, *src;
    FbStride    dstStride, srcStride;
    CARD16      w;
    CARD8       s, d;
    CARD16      t;

    CHECKPOINT();

    fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1);
    fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        /* head: single bytes until dst is 8-byte aligned */
        while (w && (unsigned long)dst & 7)
        {
            s = *src;
            d = *dst;
            t = d + s;
            /* on overflow t >> 8 is 1, so the OR with -1 saturates to 0xff */
            s = t | (0 - (t >> 8));
            *dst = s;

            dst++;
            src++;
            w--;
        }

        /* body: eight bytes per iteration */
        while (w >= 8)
        {
            /*
             * The asm stores through dst and uses mm2/mm3 as scratch, so
             * both facts are declared as clobbers; without them the
             * compiler is not told that memory or those registers change.
             */
            __asm__ __volatile__ (
                "movq (%0), %%mm2\n\t"
                "movq (%1), %%mm3\n\t"
                "paddusb %%mm2, %%mm3\n\t"
                "movq %%mm3, (%1)\n\t"
                : /* no output */
                : "r" (src), "r" (dst)
                : "mm2", "mm3", "memory");

            dst += 8;
            src += 8;
            w -= 8;
        }

        /* tail */
        while (w)
        {
            s = *src;
            d = *dst;
            t = d + s;
            s = t | (0 - (t >> 8));
            *dst = s;

            dst++;
            src++;
            w--;
        }
    }

    emms();
}
/*
 * Saturating per-byte ADD of a 32-bit source rectangle onto a 32-bit
 * destination rectangle.
 *
 * Each row is handled as: single pixels until the destination pointer is
 * 8-byte aligned, then two pixels per MMX paddusb, then at most one
 * trailing pixel.  op, pMask and the mask coordinates are not used.
 */
void
fbCompositeSrcAdd_8888x8888mmx (CARD8       op,
                                PicturePtr  pSrc,
                                PicturePtr  pMask,
                                PicturePtr  pDst,
                                INT16       xSrc,
                                INT16       ySrc,
                                INT16       xMask,
                                INT16       yMask,
                                INT16       xDst,
                                INT16       yDst,
                                CARD16      width,
                                CARD16      height)
{
    CARD32      *dstLine, *dst;
    CARD32      *srcLine, *src;
    FbStride    dstStride, srcStride;
    CARD16      w;

    CHECKPOINT();

    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        /* head: single pixels until dst is 8-byte aligned */
        while (w && (unsigned long)dst & 7)
        {
            /*
             * *dst is both read and written, so it is listed as an output
             * as well as an input; mm2/mm3 are scratch registers.
             */
            __asm__ __volatile__ (
                "movd %1, %%mm2\n\t"
                "movd %2, %%mm3\n\t"
                "paddusb %%mm2, %%mm3\n\t"
                "movd %%mm3, %0\n\t"
                : "=m" (*dst)
                : "m" (*src), "m" (*dst)
                : "mm2", "mm3");

            dst++;
            src++;
            w--;
        }

        /* body: two pixels per iteration */
        while (w >= 2)
        {
            /* stores through dst, hence the "memory" clobber */
            __asm__ __volatile__ (
                "movq (%0), %%mm2\n\t"
                "movq (%1), %%mm3\n\t"
                "paddusb %%mm2, %%mm3\n\t"
                "movq %%mm3, (%1)\n\t"
                : /* no output */
                : "r" (src), "r" (dst)
                : "mm2", "mm3", "memory");

            dst += 2;
            src += 2;
            w -= 2;
        }

        /* tail: at most one pixel can be left */
        if (w)
        {
            __asm__ __volatile__ (
                "movd %1, %%mm2\n\t"
                "movd %2, %%mm3\n\t"
                "paddusb %%mm2, %%mm3\n\t"
                "movd %%mm3, %0\n\t"
                : "=m" (*dst)
                : "m" (*src), "m" (*dst)
                : "mm2", "mm3");
        }
    }

    emms();
}
/*
 * GetStart: fetch a drawable's bits via fbGetDrawable and compute
 *   stride - the row stride expressed in units of "type"
 *   line   - a "type" pointer to the element at (x, y), after subtracting
 *            the x/y offsets reported by fbGetDrawable
 * bpp receives the drawable's bits per pixel.
 * NOTE(review): this macro is not referenced anywhere in the visible part
 * of this file - confirm whether it is still needed.
 */
#define GetStart(drw,x,y,type,stride,line,bpp) {\
FbBits *__bits__; \
FbStride __stride__; \
int __xoff__,__yoff__; \
\
fbGetDrawable((drw),__bits__,__stride__,bpp,__xoff__,__yoff__); \
(stride) = __stride__ * sizeof (FbBits) / sizeof (type); \
(line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + ((x) - __xoff__); \
}
/*
 * Fill a width x height rectangle at (x, y) of pDraw with the value xor.
 *
 * Returns FALSE, without writing anything, when the drawable is neither
 * 16 nor 32 bits per pixel, or when it is 16 bpp and the two halves of xor
 * differ (the fill writes xor as 16-, 32- and 64-bit units, so at 16 bpp
 * both halves must hold the same pixel).  Returns TRUE after filling.
 *
 * Each row is written as: 16-bit stores until 4-byte aligned, 32-bit
 * stores until 8-byte aligned, 64-byte bursts of eight movq stores, then
 * 32-bit and 16-bit stores for the remainder.
 */
Bool
fbSolidFillmmx (DrawablePtr pDraw,
                int         x,
                int         y,
                int         width,
                int         height,
                FbBits      xor)
{
    FbStride    stride;
    int         bpp;
    ullong      fill;
    Vector8x8   vfill;
    CARD32      byte_width;
    CARD8       *byte_line;
    FbBits      *bits;
    int         xoff, yoff;

    CHECKPOINT();

    fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);

    if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
        return FALSE;

    if (bpp != 16 && bpp != 32)
        return FALSE;

    if (bpp == 16)
    {
        /* stride in 16-bit pixels to locate the start, then in bytes */
        stride = stride * sizeof (FbBits) / 2;
        byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff));
        byte_width = 2 * width;
        stride *= 2;
    }
    else
    {
        /* stride in 32-bit pixels to locate the start, then in bytes */
        stride = stride * sizeof (FbBits) / 4;
        byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff));
        byte_width = 4 * width;
        stride *= 4;
    }

    /* xor replicated into both halves of a 64-bit value for movq */
    fill = ((ullong)xor << 32) | xor;
    vfill = (Vector8x8)fill;

    while (height--)
    {
        int     w;
        CARD8   *d = byte_line;

        byte_line += stride;
        w = byte_width;

        /* 16-bit stores until d is 4-byte aligned */
        while (w >= 2 && ((unsigned long)d & 3))
        {
            *(CARD16 *)d = xor;
            w -= 2;
            d += 2;
        }

        /*
         * 32-bit stores until d is 8-byte aligned.  The pointer is cast to
         * unsigned long, like every other alignment test in this file,
         * rather than to unsigned int.
         */
        while (w >= 4 && ((unsigned long)d & 7))
        {
            *(CARD32 *)d = xor;
            w -= 4;
            d += 4;
        }

        /* 64 bytes per iteration */
        while (w >= 64)
        {
            __asm__ __volatile__ (
                "movq %0, (%1)\n\t"
                "movq %0, 8(%1)\n\t"
                "movq %0, 16(%1)\n\t"
                "movq %0, 24(%1)\n\t"
                "movq %0, 32(%1)\n\t"
                "movq %0, 40(%1)\n\t"
                "movq %0, 48(%1)\n\t"
                "movq %0, 56(%1)\n\t"
                : /* no output */
                : "y" (vfill), "r" (d)
                : "memory");

            w -= 64;
            d += 64;
        }

        /* remainder */
        while (w >= 4)
        {
            *(CARD32 *)d = xor;
            w -= 4;
            d += 4;
        }

        if (w >= 2)
        {
            *(CARD16 *)d = xor;
            w -= 2;
            d += 2;
        }
    }

    emms();

    return TRUE;
}
/*
 * Report whether the CPU supports MMX.  The probe runs on the first call
 * and its result is cached in static variables for later calls.
 *
 * The probe is 32-bit x86 assembly: it first checks that bit 21 of the
 * flags word can be toggled (which means CPUID is available) and then
 * tests bit 23 of EDX returned by CPUID function 1.
 *
 * Notes on the inline asm:
 *  - The result is returned in a register and all scratch registers are
 *    declared as clobbers, instead of wrapping the code in pusha/popa and
 *    writing to a memory operand.  pusha moves %esp, so a memory operand
 *    addressed relative to %esp would refer to the wrong location between
 *    pusha and popa.
 *  - %ebx is written by cpuid but may be reserved by the compiler (it is
 *    the register static data is accessed through), so it is saved in
 *    %esi around the cpuid instead of being listed as a clobber.
 *  - The jump target is a local numeric label, so the asm can be emitted
 *    more than once (e.g. when this function is inlined) without
 *    producing duplicate symbols.
 *  - The condition codes are modified, hence the "cc" clobber.
 */
Bool
fbHaveMMX (void)
{
    static Bool initialized = FALSE;
    static Bool mmx_present;

    if (!initialized)
    {
        int result;

        __asm__ __volatile__ (
            /* Check if bit 21 in flags word is writeable */
            "pushfl                         \n\t"
            "popl       %%eax               \n\t"
            "movl       %%eax, %%ecx        \n\t"
            "xorl       $0x00200000, %%eax  \n\t"
            "pushl      %%eax               \n\t"
            "popfl                          \n\t"
            "pushfl                         \n\t"
            "popl       %%eax               \n\t"
            /* default answer: not found */
            "xorl       %%edx, %%edx        \n\t"
            "cmpl       %%eax, %%ecx        \n\t"
            "je         1f                  \n\t"
            /* OK, we have CPUID */
            "movl       %%ebx, %%esi        \n\t"
            "movl       $1, %%eax           \n\t"
            "cpuid                          \n\t"
            "movl       %%esi, %%ebx        \n\t"
            /* keep only the MMX feature bit */
            "andl       $0x00800000, %%edx  \n\t"
            "1:                             \n\t"
            : "=d" (result)
            : /* no input */
            : "eax", "ecx", "esi", "cc");

        initialized = TRUE;
        mmx_present = (result != 0);
    }

    return mmx_present;
}
#endif /* RENDER */
#endif /* USE_GCC34_MMX */