/*
 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission. Red Hat makes no representations about the
 * suitability of this software for any purpose. It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */

#ifdef HAVE_DIX_CONFIG_H
#include <dix-config.h>
#endif

#ifdef USE_MMX

#if defined(__amd64__) || defined(__x86_64__)
#define USE_SSE
#endif

#include <mmintrin.h>
#ifdef USE_SSE
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#endif

#ifdef RENDER

#include "fb.h"
#include "fbmmx.h"

#include "picturestr.h"
#include "mipict.h"
#include "fbpict.h"

/* Rename to VERBOSE to have CHECKPOINT() log the function name and line. */
#define noVERBOSE

#ifdef VERBOSE
#define CHECKPOINT() ErrorF ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif

typedef unsigned long long ullong;

#ifdef __GNUC__
typedef ullong mmxdatafield;
#endif
#ifdef _MSC_VER
typedef unsigned __int64 ullong;
typedef __m64 mmxdatafield;
#endif

/*
 * fbSolidFillmmx --
 *
 *	Store the value `xor' into every pixel of the width x height rectangle
 *	whose top-left corner is (x, y) in pDraw, using 64-bit MMX stores for
 *	the bulk of each scanline.  The value is written as-is; nothing is
 *	combined with the existing destination contents here.
 *
 *	Only 16 and 32 bits per pixel are handled.  For 16 bpp the upper and
 *	lower 16-bit halves of `xor' must be equal, because the wide stores
 *	below write `xor' over two (or four) pixels at a time.
 *
 * Returns:
 *	TRUE if the rectangle was filled, FALSE if the depth or fill value is
 *	not supported (in which case nothing has been written).
 */
Bool
fbSolidFillmmx (DrawablePtr	pDraw,
		int		x,
		int		y,
		int		width,
		int		height,
		FbBits		xor)
{
    FbStride	stride;
    int		bpp;
    ullong	fill;
    __m64	vfill;
    CARD32	byte_width;
    CARD8	*byte_line;
    FbBits	*bits;
    int		xoff, yoff;
#ifdef __GNUC__
    __m64	v1, v2, v3, v4, v5, v6, v7;
#endif

    CHECKPOINT();

    /* fbGetDrawable is defined elsewhere; it is relied on to fill in
     * bits, stride, bpp, xoff and yoff for the drawable. */
    fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);

    /* 16 bpp: both halves of the fill word must hold the same pixel. */
    if (bpp == 16 && ((xor >> 16) != (xor & 0xffff)))
	return FALSE;

    if (bpp != 16 && bpp != 32)
	return FALSE;

    if (bpp == 16)
    {
	/* Convert the stride from FbBits units to CARD16 units to locate
	 * the first pixel, then to bytes for stepping between scanlines. */
	stride = stride * sizeof (FbBits) / 2;
	byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y + yoff) + (x + xoff));
	byte_width = 2 * width;
	stride *= 2;
    }
    else
    {
	/* Same as above, with CARD32-sized pixels. */
	stride = stride * sizeof (FbBits) / 4;
	byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y + yoff) + (x + xoff));
	byte_width = 4 * width;
	stride *= 4;
    }

    /* Replicate the fill value into both halves of a 64-bit word. */
    fill = ((ullong)xor << 32) | xor;
    vfill = (__m64)fill;

#ifdef __GNUC__
    /* Copy the fill pattern into seven more MMX-register operands so the
     * inner loop below can issue eight stores from eight operands. */
    __asm__ (
	"movq		%7,	%0\n"
	"movq		%7,	%1\n"
	"movq		%7,	%2\n"
	"movq		%7,	%3\n"
	"movq		%7,	%4\n"
	"movq		%7,	%5\n"
	"movq		%7,	%6\n"
	: "=y" (v1), "=y" (v2), "=y" (v3),
	  "=y" (v4), "=y" (v5), "=y" (v6), "=y" (v7)
	: "y" (vfill));
#endif

    while (height--)
    {
	int w;
	CARD8 *d = byte_line;

	byte_line += stride;
	w = byte_width;

	/* Lead-in: 16-bit stores until the destination is 4-byte aligned. */
	while (w >= 2 && ((unsigned long)d & 3))
	{
	    *(CARD16 *)d = xor;
	    w -= 2;
	    d += 2;
	}

	/* Lead-in: 32-bit stores until the destination is 8-byte aligned. */
	while (w >= 4 && ((unsigned long)d & 7))
	{
	    *(CARD32 *)d = xor;

	    w -= 4;
	    d += 4;
	}

	/* Bulk: 64 bytes per iteration as eight 8-byte stores. */
	while (w >= 64)
	{
#ifdef __GNUC__
	    __asm__ (
		"movq	%1,	  (%0)\n"
		"movq	%2,	 8(%0)\n"
		"movq	%3,	16(%0)\n"
		"movq	%4,	24(%0)\n"
		"movq	%5,	32(%0)\n"
		"movq	%6,	40(%0)\n"
		"movq	%7,	48(%0)\n"
		"movq	%8,	56(%0)\n"
		:
		: "r" (d),
		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
		: "memory");
#else
	    *(__m64*) (d +  0) = vfill;
	    *(__m64*) (d +  8) = vfill;
	    *(__m64*) (d + 16) = vfill;
	    *(__m64*) (d + 24) = vfill;
	    *(__m64*) (d + 32) = vfill;
	    *(__m64*) (d + 40) = vfill;
	    *(__m64*) (d + 48) = vfill;
	    *(__m64*) (d + 56) = vfill;
#endif
	    w -= 64;
	    d += 64;
	}

	/* Tail: remaining 32-bit words, then at most one 16-bit pixel. */
	while (w >= 4)
	{
	    *(CARD32 *)d = xor;

	    w -= 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(CARD16 *)d = xor;
	    w -= 2;
	    d += 2;
	}
    }

    /* Leave MMX state so later floating-point code sees a clean FPU. */
    _mm_empty();
    return TRUE;
}

/*
 * fbCopyAreammx --
 *
 *	Copy a width x height rectangle from (src_x, src_y) in pSrc to
 *	(dst_x, dst_y) in pDst, moving the bulk of each scanline through the
 *	MMX registers 64 bytes at a time.
 *
 *	Source and destination must have the same bits per pixel, and that
 *	value must be 16 or 32.  Scanlines are processed top to bottom and
 *	each scanline left to right; no special handling of overlapping
 *	source and destination areas is done here.
 *
 * Returns:
 *	TRUE if the copy was performed, FALSE if the depths differ or are
 *	unsupported (in which case nothing has been written).
 */
Bool
fbCopyAreammx (DrawablePtr	pSrc,
	       DrawablePtr	pDst,
	       int		src_x,
	       int		src_y,
	       int		dst_x,
	       int		dst_y,
	       int		width,
	       int		height)
{
    FbBits *	src_bits;
    FbStride	src_stride;
    int		src_bpp;
    int		src_xoff;
    int		src_yoff;

    FbBits *	dst_bits;
    FbStride	dst_stride;
    int		dst_bpp;
    int		dst_xoff;
    int		dst_yoff;

    CARD8 *	src_bytes;
    CARD8 *	dst_bytes;
    int		byte_width;

    /* fbGetDrawable is defined elsewhere; it is relied on to fill in the
     * bits/stride/bpp/offset variables for each drawable. */
    fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff);
    fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff);

    if (src_bpp != dst_bpp)
	return FALSE;

    if (src_bpp == 16)
    {
	/* Strides: FbBits units -> CARD16 units to find the first pixel,
	 * then -> bytes for stepping between scanlines. */
	src_stride = src_stride * sizeof (FbBits) / 2;
	dst_stride = dst_stride * sizeof (FbBits) / 2;
	src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y + src_yoff) + (src_x + src_xoff));
	dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y + dst_yoff) + (dst_x + dst_xoff));
	byte_width = 2 * width;
	src_stride *= 2;
	dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
	/* Same as above, with CARD32-sized pixels. */
	src_stride = src_stride * sizeof (FbBits) / 4;
	dst_stride = dst_stride * sizeof (FbBits) / 4;
	src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y + src_yoff) + (src_x + src_xoff));
	dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y + dst_yoff) + (dst_x + dst_xoff));
	byte_width = 4 * width;
	src_stride *= 4;
	dst_stride *= 4;
    }
    else
    {
	return FALSE;
    }

    while (height--)
    {
	int w;
	CARD8 *s = src_bytes;
	CARD8 *d = dst_bytes;

	src_bytes += src_stride;
	dst_bytes += dst_stride;
	w = byte_width;

	/* Lead-in: alignment is decided by the destination pointer only;
	 * the source pointer simply advances in lockstep. */
	while (w >= 2 && ((unsigned long)d & 3))
	{
	    *(CARD16 *)d = *(CARD16 *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}

	while (w >= 4 && ((unsigned long)d & 7))
	{
	    *(CARD32 *)d = *(CARD32 *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	/* Bulk: load 64 bytes into mm0-mm7, then store them. */
	while (w >= 64)
	{
#ifdef __GNUC__
	    __asm__ (
		"movq	  (%1),	  %%mm0\n"
		"movq	 8(%1),	  %%mm1\n"
		"movq	16(%1),	  %%mm2\n"
		"movq	24(%1),	  %%mm3\n"
		"movq	32(%1),	  %%mm4\n"
		"movq	40(%1),	  %%mm5\n"
		"movq	48(%1),	  %%mm6\n"
		"movq	56(%1),	  %%mm7\n"

		"movq	%%mm0,	  (%0)\n"
		"movq	%%mm1,	 8(%0)\n"
		"movq	%%mm2,	16(%0)\n"
		"movq	%%mm3,	24(%0)\n"
		"movq	%%mm4,	32(%0)\n"
		"movq	%%mm5,	40(%0)\n"
		"movq	%%mm6,	48(%0)\n"
		"movq	%%mm7,	56(%0)\n"
		:
		: "r" (d), "r" (s)
		: "memory",
		  "%mm0", "%mm1", "%mm2", "%mm3",
		  "%mm4", "%mm5", "%mm6", "%mm7");
#else
	    __m64 v0 = *(__m64 *)(s + 0);
	    __m64 v1 = *(__m64 *)(s + 8);
	    __m64 v2 = *(__m64 *)(s + 16);
	    __m64 v3 = *(__m64 *)(s + 24);
	    __m64 v4 = *(__m64 *)(s + 32);
	    __m64 v5 = *(__m64 *)(s + 40);
	    __m64 v6 = *(__m64 *)(s + 48);
	    __m64 v7 = *(__m64 *)(s + 56);
	    *(__m64 *)(d + 0)  = v0;
	    *(__m64 *)(d + 8)  = v1;
	    *(__m64 *)(d + 16) = v2;
	    *(__m64 *)(d + 24) = v3;
	    *(__m64 *)(d + 32) = v4;
	    *(__m64 *)(d + 40) = v5;
	    *(__m64 *)(d + 48) = v6;
	    *(__m64 *)(d + 56) = v7;
#endif

	    w -= 64;
	    s += 64;
	    d += 64;
	}

	/* Tail: remaining 32-bit words, then at most one 16-bit pixel. */
	while (w >= 4)
	{
	    *(CARD32 *)d = *(CARD32 *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(CARD16 *)d = *(CARD16 *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}
    }

    /* Leave MMX state so later floating-point code sees a clean FPU. */
    _mm_empty();
    return TRUE;
}

#endif /* RENDER */
#endif /* USE_MMX */