/*-- Copyright (c) 2000 Microsoft Corporation - Xbox SDK Module Name: fastmath.cpp --*/ #include "precomp.h" //#include "Debug.h" //---------------------------------------------------------------------------- // Katmai and MMX(TM) constants implementation #define _MM_ALIGN16 __declspec(align(16)) #define SC_OPT #if !defined(M128) #define M128(x) (*(__m128*)&(x)) #endif typedef unsigned long DWORD; #define _M128_CONST(Name, Val) \ static const _MM_ALIGN16 float _m128const_##Name[4] = { Val, Val, Val, Val } #define _M128_CONST4(Name, Val0, Val1, Val2, Val3) \ static const _MM_ALIGN16 float _m128const_##Name[4] = { Val0, Val1, Val2, Val3 } #define M128_EXTERN_CONST(Name, Val) \ static const _MM_ALIGN16 float _m128const_##Name[4] = { Val, Val, Val, Val }; \ const __m128* p##Name = (__m128*)_m128const_##Name #define M128_EXTERN_CONST_TYPE(Name, Val, Type) \ static const _MM_ALIGN16 Type _m128const_##Name[4] = { Val, Val, Val, Val }; \ const __m128* p##Name = (__m128*)_m128const_##Name #define M128_CONST(Name, Val) \ static const _MM_ALIGN16 float _m128const_##Name[4] = { Val, Val, Val, Val }; \ const __m128 Name = M128(_m128const_##Name) M128_EXTERN_CONST(am_0, 0.0f); M128_EXTERN_CONST(am_1, 1.0f); M128_EXTERN_CONST(am_minus_1, -1.0f); M128_EXTERN_CONST(am_0p5, 0.5f); M128_EXTERN_CONST(am_1p5, 1.5f); M128_EXTERN_CONST(am_3_over_2, 3.0f / 2.0f); M128_EXTERN_CONST(am_pi, PI); M128_EXTERN_CONST(am_pi_over_2, (PI / 2.0f)); M128_EXTERN_CONST(am_2_over_pi, (2.0f / PI)); M128_EXTERN_CONST_TYPE(am_sign_mask, 0x80000000, DWORD); M128_EXTERN_CONST_TYPE(am_inv_sign_mask, ~0x80000000, DWORD); M128_EXTERN_CONST_TYPE(am_min_pos_norm, 0x00800000, DWORD); M128_EXTERN_CONST_TYPE(am_mant_mask, 0x7f800000, DWORD); M128_EXTERN_CONST_TYPE(am_inv_mant_mask, ~0x7f800000, DWORD); //---------------------------------------------------------------------------- // Katmai and MMX(TM) constants implementation static const float p0 = 0.15707963267948963959e1f; static const float p1 = -0.64596409750621907082e0f; static const float p2 = 0.7969262624561800806e-1f; static const float p3 = -0.468175413106023168e-2f; static const float t0 = -0.91646118527267623468e-1f; static const float t1 = -0.13956945682312098640e1f; static const float t2 = -0.94393926122725531747e2f; static const float t3 = 0.12888383034157279340e2f; static const float s0 = 0.12797564625607904396e1f; static const float s1 = 0.21972168858277355914e1f; static const float s2 = 0.68193064729268275701e1f; static const float s3 = 0.28205206687035841409e2f; static const float p0exp = 1.26177193074810590878e-4f; static const float p1exp = 3.02994407707441961300e-2f; static const float q0 = 3.00198505138664455042e-6f; static const float q1 = 2.52448340349684104192e-3f; static const float q2 = 2.27265548208155028766e-1f; static const float q3 = 2.00000000000000000009e0f; static const float rln2 = 1.4426950408889634073599f; static const float c1 = 6.93145751953125e-1f; static const float c2 = 1.42860682030941723212e-6f; const float at3613 = 2.7692309f; const float at2511 = 2.2727273f; const float at36 = 36.0f; const float at25 = 25.0f; const float at16 = 16.0f; const float at11 = 11.0f; const float at9 = 9.0f; const float at7 = 7.0f; const float at5 = 5.0f; const float at4 = 4.0f; const float at3 = 3.0f; const float at1 = 1.0f; const float at_p2 = PI_DIV_2; const float mp2 = -PI_DIV_2; const float as2 = FLOAT_SMALL; const float SQ2 = 1.4142136f; const float SQ3 = 0.3333333f; const float SQ5 = 1.4000000f; const float SQ7 = 0.1428571f; const float LOG2 = 0.3465736f; static const float log_p0 = -7.89580278884799154124e-1f; static const float log_p1 = 1.63866645699558079767e1f; static const float log_p2 = -6.41409952958715622951e1f; static const float log_q0 = -3.56722798256324312549e1f; static const float log_q1 = 3.12093766372244180303e2f; static const float log_q2 = -7.69691943550460008604e2f; static const float log_rsqrt2 = 7.07106781186547524401e-1f; static const float log_c0 = 0.693147180559945f; static const float fmax = 88.0f; static const float fmin = -88.0f; static const float pow_p0 = -7.89580278884799154124e-1f; static const float pow_p1 = 1.63866645699558079767e1f; static const float pow_p2 = -6.41409952958715622951e1f; static const float pow_q0 = -3.56722798256324312549e1f; static const float pow_q1 = 3.12093766372244180303e2f; static const float pow_q2 = -7.69691943550460008604e2f; static const float pow_rsqrt2 = 7.07106781186547524401e-1f; static const float pow_c0 = 1.44269504088896340735992f; static const float pow_r0 = 2.30933477057345225087e-2f; static const float pow_r1 = 2.02020656693165307700e1f; static const float pow_r2 = 1.51390680115615096133e3f; static const float pow_s0 = 2.33184211722314911771e2f; static const float pow_s1 = 4.36821166879210612817e3f; static const float pow_fmax = 128.0f; static const float pow_fmin = -127.0f; const float th1 = 1.0f; const float th2p = 2.0f; const float th2m = -2.0f; const float th3 = 0.3333333f; const float sh1 = 1.0f; const float sh5 = 0.5f; const float sh6 = 0.1666667f; const float t_as[49] = { 0.9698000f, 0.9691796f, 0.9685330f, 0.9678589f, 0.9671551f, 0.9664199f, 0.9656509f, 0.9648460f, 0.9640023f, 0.9631173f, 0.9621876f, 0.9612098f, 0.9601802f, 0.9590943f, 0.9579477f, 0.9567349f, 0.9554501f, 0.9540865f, 0.9526370f, 0.9510929f, 0.9494448f, 0.9476817f, 0.9457912f, 0.9437591f, 0.9415686f, 0.9392007f, 0.9366328f, 0.9338384f, 0.9307863f, 0.9274390f, 0.9237517f, 0.9196697f, 0.9151261f, 0.9100379f, 0.9043011f, 0.8977833f, 0.8903134f, 0.8816667f, 0.8715416f, 0.8595238f, 0.8450292f, 0.8272059f, 0.8047620f, 0.7756411f, 0.7363636f, 0.6805556f, 0.5952381f, 0.4500000f, 0.1666667f }; //---------------------------------------------------------------------------- // atan float fast_atan ( float x ) //-------------------------------------- { #if defined(USE_C) // 154 cycles return atanf(x); #else #if defined(SC_OPT) _asm { mov eax, x movss xmm0, x cmp eax, 0bf800000h jnc minus1 cmp eax, 3f800000h jnc plus1 movss xmm1, xmm0 mulss xmm1, xmm1 movss xmm2, xmm1 movss xmm3, xmm1 movss xmm4, xmm1 movss xmm5, xmm1 mulss xmm2, at2511 mulss xmm3, at16 mulss xmm4, at9 mulss xmm5, at4 addss xmm2, at9 rcpss xmm6, xmm2 mulss xmm6, xmm3 addss xmm6, at7 rcpss xmm2, xmm6 mulss xmm2, xmm4 addss xmm2, at5 rcpss xmm6, xmm2 mulss xmm6, xmm5 addss xmm6, at3 rcpss xmm2, xmm6 mulss xmm1, xmm2 addss xmm1, at1 rcpss xmm2, xmm1 movss xmm7, xmm2 addss xmm2, xmm2 mulss xmm7, xmm7 mulss xmm7, xmm1 subss xmm2, xmm7 mulss xmm0, xmm2 movss x, xmm0 } return x; _asm { ALIGN 16 minus1: rcpss xmm0, xmm0 movss xmm1, xmm0 mulss xmm1, xmm1 movss xmm2, xmm1 movss xmm3, xmm1 movss xmm4, xmm1 movss xmm5, xmm1 movss xmm6, xmm1 mulss xmm2, at3613 mulss xmm3, at25 mulss xmm4, at16 mulss xmm5, at9 mulss xmm6, at4 addss xmm2, at11 rcpss xmm2, xmm2 mulss xmm2, xmm3 addss xmm2, at9 rcpss xmm2, xmm2 mulss xmm2, xmm4 addss xmm2, at7 rcpss xmm2, xmm2 movss xmm3, mp2 mulss xmm2, xmm5 addss xmm2, at5 rcpss xmm2, xmm2 mulss xmm2, xmm6 addss xmm2, at3 rcpss xmm2, xmm2 mulss xmm1, xmm2 addss xmm1, at1 rcpss xmm2, xmm1 movss xmm7, xmm2 addss xmm2, xmm2 mulss xmm7, xmm7 mulss xmm7, xmm1 subss xmm2, xmm7 mulss xmm0, xmm2 subss xmm3, xmm0 movss x, xmm3 } return x; _asm { ALIGN 16 plus1: rcpss xmm0, xmm0 movss xmm1, xmm0 mulss xmm1, xmm1 movss xmm2, xmm1 movss xmm3, xmm1 movss xmm4, xmm1 movss xmm5, xmm1 movss xmm6, xmm1 mulss xmm2, at3613 mulss xmm3, at25 mulss xmm4, at16 mulss xmm5, at9 mulss xmm6, at4 addss xmm2, at11 rcpss xmm2, xmm2 mulss xmm2, xmm3 addss xmm2, at9 rcpss xmm2, xmm2 mulss xmm2, xmm4 addss xmm2, at7 rcpss xmm2, xmm2 movss xmm3, at_p2 mulss xmm2, xmm5 addss xmm2, at5 rcpss xmm2, xmm2 mulss xmm2, xmm6 addss xmm2, at3 rcpss xmm2, xmm2 mulss xmm1, xmm2 addss xmm1, at1 rcpss xmm2, xmm1 movss xmm7, xmm2 addss xmm2, xmm2 mulss xmm7, xmm7 mulss xmm7, xmm1 subss xmm2, xmm7 mulss xmm0, xmm2 subss xmm3, xmm0 movss x, xmm3 } return x; #else // SC_OPT // 60 cycles __asm { movss xmm0, x movss xmm1, xmm0 rcpss xmm4, xmm0 orps xmm1, _m128const_am_sign_mask movss xmm6, xmm4 comiss xmm1, _m128const_am_minus_1 jc l_big // 'c' is 'lt' for comiss movss xmm3, t0 movss xmm2, xmm0 mulss xmm2, xmm2 movss xmm1, s0 addss xmm1, xmm2 movss xmm7, s1 rcpss xmm1, xmm1 mulss xmm1, xmm3 movss xmm3, t1 addss xmm7, xmm2 addss xmm1, xmm7 movss xmm7, s2 rcpss xmm1, xmm1 mulss xmm1, xmm3 movss xmm3, t2 addss xmm7, xmm2 addss xmm1, xmm7 movss xmm7, s3 rcpss xmm1, xmm1 mulss xmm1, xmm3 movss xmm3, t3 addss xmm7, xmm2 mulss xmm0, xmm3 addss xmm1, xmm7 rcpss xmm1, xmm1 mulss xmm0, xmm1 jmp l_done l_big: movss xmm3, t0 mulss xmm6, xmm6 movss xmm5, s0 addss xmm5, xmm6 movss xmm7, s1 rcpss xmm5, xmm5 mulss xmm5, xmm3 movss xmm3, t1 addss xmm7, xmm6 addss xmm5, xmm7 movss xmm7, s2 rcpss xmm5, xmm5 mulss xmm5, xmm3 movss xmm3, t2 addss xmm7, xmm6 addss xmm5, xmm7 movss xmm7, s3 rcpss xmm5, xmm5 mulss xmm5, xmm3 movss xmm3, t3 addss xmm7, xmm6 mulss xmm4, xmm3 addss xmm5, xmm7 movss xmm2, _m128const_am_sign_mask rcpss xmm5, xmm5 mulss xmm5, xmm4 movss xmm7, _m128const_am_pi_over_2 andps xmm0, xmm2 orps xmm0, xmm7 subss xmm0, xmm5 l_done: movss x, xmm0 } return x; #endif // !SC_OPT #endif // !USE_C } //---------------------------------------------------------------------------- // atan2 float fast_atan2 ( float x, float y ) //-------------------------------------- { #if defined(USE_C) // 154 cycles return atan2f(x, y); #else #if defined(SC_OPT) _asm { movss xmm0, x rcpss xmm1, y movss xmm2, xmm1 addss xmm1, xmm1 mulss xmm2, xmm2 mulss xmm2, y subss xmm1, xmm2 mulss xmm0, xmm1 movss x, xmm0 } return fast_atan( x ); #else // SC_OPT // 77 cycles fast_atan(x * y); __asm { // We assume fast_atan leaves the return value in xmm0 xorps xmm7, xmm7 movss xmm1, y //[esp - 20 - 8] comiss xmm1, xmm7 movss xmm4, x //[esp - 20 - 4] jnc l_pos // 'nc' is 'ge' for comiss andps xmm4, _m128const_am_sign_mask orps xmm4, _m128const_am_pi addss xmm0, xmm4 l_pos: movss x, xmm0 } return x; #endif // !SC_OPT #endif // !USE_C } //---------------------------------------------------------------------------- // acos float fast_acos ( float x ) //-------------------------------------- { #if defined(USE_C) // 273 cycles return acosf(x); #else _asm { mov eax, x; test eax, 080000000h jnz acminus or eax, eax ; == 0.0 jz acretz cmp eax, 3f800000h ; >= 1.0 jnc acretp jmp acculc acminus: and eax, 7fffffffh ;Just in case. it may be not need. jz acretz ; == -0.0 cmp eax, 0bf800000h ; <= -1.0 jnc acretm acculc: } return PI_DIV_2 - fast_asin( x ); acretz: return PI_DIV_2; acretp: return 0.0f; acretm: return PI; #endif } //---------------------------------------------------------------------------- // asin float fast_asin ( float x ) //-------------------------------------- { #if defined(USE_C) // 279 cycles return asinf(x); #else const unsigned long as1 = FP_ONE_BITS; const unsigned long* pt_as = (unsigned long*) &t_as[0]; _asm { mov eax, x; test eax, 080000000h jnz asminus or eax, eax ; == 0.0 jz asretz cmp eax, 3f800000h ; >= 1.0 jnc asretp cmp eax, 3f3504f3h ; >= SQRT2 / 2 jnc asrett jmp asculc asminus: and eax, 7fffffffh ;Just in case. it may be not need. jz asretz ; == -0.0 cmp eax, 0bf800000h ; <= -1.0 jnc asretm cmp eax, 0bf3504f3h ; <= -SQRT2 / 2 jnc asrett asculc: movss xmm0, as1 ;xmm0 = factor movss xmm1, xmm0 ;xmm1 = sum movss xmm2, xmm0 ;xmm2 = power movss xmm3, x ;xmm3 = x movss xmm4, xmm3 mulss xmm4, xmm4 ;xmm4 = y movss xmm5, as2 ;xmm5 = FLOAT_SMALL mov edx, pt_as mov ecx, 48 asloop: movss xmm6, dword ptr[edx + ecx * 4] mulss xmm0, xmm6 mulss xmm2, xmm4 movss xmm6, xmm0 mulss xmm6, xmm2 addss xmm1, xmm6 comiss xmm6, xmm5 dec ecx ja asloop mulss xmm1, xmm3 movss x, xmm1 } return x; asretz: return 0.0f; asretp: return PI_DIV_2; asretm: return -PI_DIV_2; asrett: _asm { movss xmm1, x mulss xmm1, xmm1 movss xmm0, as1 subss xmm0, xmm1 rsqrtss xmm1, xmm0 movss xmm2, xmm0 mulss xmm0, xmm1 mulss xmm0, xmm1 mulss xmm0, xmm1 mulss xmm0, _m128const_am_0p5 mulss xmm1, _m128const_am_3_over_2 subss xmm1, xmm0 mulss xmm1, x movss x, xmm1 } // quality of fast_atan is too bad. return fast_atan( x ); #endif } //---------------------------------------------------------------------------- // log float fast_log ( float x ) //-------------------------------------- { #if defined(USE_C) // 106 cycles return logf(x); #else #if defined(SC_OPT) // 58 cycles _asm { mov eax, x mov edx, eax and eax, 7fffffh movss xmm2, SQ2 and edx, 7f800000h or eax, 3f800000h shr edx, 22 mov x, eax movss xmm1, x movss xmm0, xmm1 movss xmm3, SQ5 addss xmm1, xmm2 subss xmm0, xmm2 sub edx, 253 rcpss xmm2, xmm1 movss xmm3, xmm2 ;Newton-Raphson addss xmm3, xmm3 mulss xmm2, xmm2 mulss xmm2, xmm1 subss xmm3, xmm2 ;complete Newton-Raphson mulss xmm0, xmm3 movss xmm2, SQ7 movss xmm1, xmm0 mulss xmm0, xmm0 mulss xmm2, xmm0 addss xmm3, xmm0 mulss xmm2, xmm3 addss xmm2, SQ3 mulss xmm0, xmm1 mulss xmm0, xmm2 cvtsi2ss xmm3, edx addss xmm0, xmm1 mulss xmm3, LOG2 addss xmm0, xmm0 addss xmm0, xmm3 movss x, xmm0 } #else // !SC_OPT // 66 cycles __asm { movss xmm0, x maxss xmm0, _m128const_am_min_pos_norm // Cut off denormalized stuff movss xmm7, _m128const_am_inv_mant_mask movss xmm1, _m128const_am_1 movss [esp - 4], xmm0 andps xmm0, xmm7 orps xmm0, xmm1 // xmm1 == 1.0 comiss xmm0, log_rsqrt2 movss xmm7, xmm0 jc l_lt // 'c' is 'lt' for comiss //l_ge: xor ecx, ecx movss xmm2, xmm1 // xmm1 == 1.0 jmp l_continue l_lt: mov ecx, 1 movss xmm2, _m128const_am_0p5 l_continue: addss xmm7, xmm2 subss xmm0, xmm2 mov edx, x rcpss xmm7, xmm7 mulss xmm0, xmm7 addss xmm0, xmm0 shr edx, 23 movss xmm2, xmm0 sub edx, 0x7f mulss xmm2, xmm2 movss xmm4, log_p0 movss xmm6, log_q0 mulss xmm4, xmm2 movss xmm5, log_p1 sub edx, ecx mulss xmm6, xmm2 movss xmm7, log_q1 addss xmm4, xmm5 addss xmm6, xmm7 movss xmm5, log_p2 mulss xmm4, xmm2 cvtsi2ss xmm1, edx movss xmm7, log_q2 mulss xmm6, xmm2 addss xmm4, xmm5 addss xmm6, xmm7 movss xmm5, log_c0 mulss xmm4, xmm2 rcpss xmm6, xmm6 mulss xmm4, xmm0 mulss xmm1, xmm5 mulss xmm4, xmm6 addss xmm0, xmm1 addss xmm0, xmm4 movss x, xmm0 } #endif // SC_OPT return x; #endif // !USE_C } //---------------------------------------------------------------------------- // log10 float fast_log10 ( float x ) //-------------------------------------- { #if defined(USE_C) // 106 cycles return log10f(x); #else // 74 cycles // fixed coefficient 7/3/2000 Shinji Chiba return fast_log( x ) * 0.4342945f; #endif // !USE_C } //---------------------------------------------------------------------------- // exp float fast_exp ( float x ) //-------------------------------------- { #if defined(USE_C) // 151 cycles return expf(x); #else // 90 cycles __asm { movss xmm0, x maxss xmm0, fmin minss xmm0, fmax movss xmm1, rln2 mulss xmm1, xmm0 movss xmm7, _m128const_am_0 addss xmm1, _m128const_am_0p5 xor ecx, ecx mov edx, 1 comiss xmm1, xmm7 cvttss2si eax, xmm1 cmovc ecx, edx // 'c' is 'lt' for comiss sub eax, ecx cvtsi2ss xmm1, eax add eax, 0x7f movss xmm2, xmm1 mulss xmm1, c1 and eax, 0xff // Optional, just for sanity mulss xmm2, c2 subss xmm0, xmm1 shl eax, 23 subss xmm0, xmm2 movss xmm2, xmm0 mov x, eax mulss xmm2, xmm2 movss xmm6, q0 movss xmm4, p0exp mulss xmm6, xmm2 movss xmm7, q1 mulss xmm4, xmm2 movss xmm5, p1exp addss xmm6, xmm7 addss xmm4, xmm5 movss xmm7, q2 mulss xmm6, xmm2 mulss xmm4, xmm2 addss xmm6, xmm7 mulss xmm4, xmm0 movss xmm7, q3 mulss xmm6, xmm2 addss xmm4, xmm0 addss xmm6, xmm7 movss xmm0, x subss xmm6, xmm4 rcpss xmm6, xmm6 movss xmm7, _m128const_am_1 mulss xmm4, xmm6 addss xmm4, xmm4 addss xmm4, xmm7 mulss xmm0, xmm4 movss x, xmm0 } return x; #endif // !USE_C } float fast_sqrt( float x ) { _asm { mov eax, x or eax, eax jz SQRTZERO movss xmm0, x // approximate sqrt reciprocal -- |Max Error| <= 1.5x2^-12 rsqrtss xmm1, xmm0 // 1/(x^.5) // this does the Newton-Raphson iteration to get up // to 22 bits of precision movss xmm2, xmm0 // x mulss xmm0, xmm1 // 9 * 1/sqr(9) mulss xmm0, xmm1 // 9 * 1/sqr(9) * 1/sqr(9) mulss xmm0, xmm1 // 9 * 1/sqr(9) * 1/sqr(9) * 1/sqr(9) mulss xmm0, _m128const_am_0p5 // 1/2 * 9 * 1/sqr(9) * 1/sqr(9) * 1/sqr(9) mulss xmm1, _m128const_am_1p5 // 3/2 * 1/sqr(9) subss xmm1, xmm0 // 3/2 * 1/sqr(9) - 1/2 * 9 * 1/sqr(9) * 1/sqr(9) * 1/sqr(9) mulss xmm1, xmm2 // x * 1/(x^.5) movss x, xmm1 SQRTZERO: } return x; } //---------------------------------------------------------------------------- // sqrt float fast_inversesqrt ( float x ) //-------------------------------------- { #if defined(USE_C) // 61 cycles return 1.0f / sqrtf(x); #else // 35 cycles _asm { movss xmm0, x rsqrtss xmm1, xmm0 mulss xmm0, xmm1 mulss xmm0, xmm1 mulss xmm0, xmm1 mulss xmm0, _m128const_am_0p5 mulss xmm1, _m128const_am_3_over_2 subss xmm1, xmm0 movss x, xmm1 } return x; #endif // !USE_C } //---------------------------------------------------------------------------- // fabs float fast_fabs ( float x ) //-------------------------------------- { #if defined(USE_C) // 7 cycles return fabsf(x); #else // 6 cyles __asm { mov eax, x // starting with and eax, 0x7fffffff // And out the sign bit mov x, eax // result in x } return x; #endif // !USE_C } //---------------------------------------------------------------------------- // sincos computes both sin and cos simultaneously. void fast_sincos ( float x, SinCosPair* v ) //-------------------------------------- { #if defined(USE_C) // 133 cycles v->fCos = fast_cos(x); v->fSin = fast_sin(x); #else // 68 cycles __asm { movss xmm0, x andps xmm0, _m128const_am_inv_sign_mask mulss xmm0, _m128const_am_2_over_pi mov eax, x // sin and eax, 0x80000000 // sin movaps xmm4, xmm0 // sin addss xmm0, _m128const_am_1 cvttss2si ecx, xmm0 cvttss2si esi, xmm4 // sin mov edx, ecx shl edx, (31 - 1) mov edi, esi // sin shl edi, (31 - 1) // sin cvtsi2ss xmm1, ecx and edx, 0x80000000 cvtsi2ss xmm5, esi // sin and edi, 0x80000000 // sin and ecx, 0x1 subss xmm0, xmm1 jz l_contcos movss xmm1, _m128const_am_1 subss xmm1, xmm0 movss xmm0, xmm1 l_contcos: and esi, 0x1 // sin subss xmm4, xmm5 // sin jz l_contsin // sin movss xmm5, _m128const_am_1 // sin subss xmm5, xmm4 // sin movss xmm4, xmm5 // sin l_contsin: // sin mov ecx, v movss xmm1, xmm0 mulss xmm0, xmm0 movss xmm5, xmm4 // sin mulss xmm4, xmm4 // sin mov [ecx]v.fCos, edx movss xmm2, xmm0 mulss xmm0, p3 xor eax, edi // sin movss xmm6, xmm4 // sin mulss xmm4, p3 // sin addss xmm0, p2 mov [ecx]v.fSin, eax // sin addss xmm4, p2 // sin mulss xmm0, xmm2 movss xmm3, [ecx]v.fCos mulss xmm4, xmm6 // sin movss xmm7, [ecx]v.fSin // sin addss xmm0, p1 addss xmm4, p1 // sin mulss xmm0, xmm2 orps xmm1, xmm3 mulss xmm4, xmm6 // sin orps xmm5, xmm7 // sin addss xmm0, p0 addss xmm4, p0 // sin mulss xmm0, xmm1 mulss xmm4, xmm5 // sin movss dword ptr [ecx]v.fCos, xmm0 movss dword ptr [ecx]v.fSin, xmm4 } #endif // !USE_C } //---------------------------------------------------------------------------- // sin float fast_sin ( float x ) //-------------------------------------- { #if defined(USE_C) // 111 cycles return sinf(x); #else // 62 cycles __asm { movss xmm0, x andps xmm0, _m128const_am_inv_sign_mask // xmm0 = abs(x) mov eax, x // eax = x mulss xmm0, _m128const_am_2_over_pi // xmm0 = abs(x) * 2/PI and eax, 0x80000000 // eax = sign(x) cvttss2si ecx, xmm0 // ecx = int(xmm0) mov edx, ecx // edx = ecx shl edx, (31 - 1) // edx = edx << 30 cvtsi2ss xmm1, ecx // xmm1 = ecx and edx, 0x80000000 // edx = sign(edx) and ecx, 0x1 // ecx = ecx & 0x1 (set ZF according to result) subss xmm0, xmm1 // xmm0 = xmm0 - xmm1 jz l_cont // jump if 0 / ZF = 1 movss xmm1, _m128const_am_1 // xmm1 = 1 subss xmm1, xmm0 // xmm1 = xmm1 - xmm0 movss xmm0, xmm1 // xmm0 = xmm1 l_cont: movss xmm1, xmm0 // xmm1 = xmm0 mulss xmm0, xmm0 // xmm0 = xmm0 * xmm0 xor eax, edx // eax = edx | eax movss xmm2, xmm0 // xmm2 = xmm0 mulss xmm0, p3 // xmm0 = xmm0 * p3 mov x, eax // x = eax addss xmm0, p2 // xmm0 = xmm0 + p2 mulss xmm0, xmm2 // xmm0 = xmm0 * xmm2 movss xmm3, x // xmm3 = x addss xmm0, p1 // xmm0 = xmm0 + p1 mulss xmm0, xmm2 // xmm0 = xmm0 * xmm2 orps xmm1, xmm3 // xmm1 = xmm1 | xmm3 addss xmm0, p0 // xmm0 = xmm0 + p0 mulss xmm0, xmm1 // xmm0 = xmm0 * xmm1 movss x, xmm0 } return x; #endif // !USE_C } //---------------------------------------------------------------------------- // cos float fast_cos ( float x ) //-------------------------------------- { #if defined(USE_C) // 97 cycles return cosf(x); #else // 68 cycles __asm { movss xmm0, x andps xmm0, _m128const_am_inv_sign_mask // abs(x) mulss xmm0, _m128const_am_2_over_pi // x * (2 / pi) addss xmm0, _m128const_am_1 // x * (2 / pi) + 1 cvttss2si ecx, xmm0 // Trancate into ecx. mov edx, ecx // Store ecx. shl edx, (31 - 1) // Shift left 30 bits. cvtsi2ss xmm1, ecx // Store ecx into xmm1. and edx, 0x80000000 // Get sign bit. and ecx, 0x1 subss xmm0, xmm1 jz l_cont movss xmm1, _m128const_am_1 subss xmm1, xmm0 movss xmm0, xmm1 l_cont: movss xmm1, xmm0 mulss xmm0, xmm0 mov x, edx movss xmm2, xmm0 mulss xmm0, p3 addss xmm0, p2 mulss xmm0, xmm2 movss xmm3, x addss xmm0, p1 mulss xmm0, xmm2 orps xmm1, xmm3 addss xmm0, p0 mulss xmm0, xmm1 movss x, xmm0 } return x; #endif // !USE_C } //---------------------------------------------------------------------------- // fast_atan float fast_tan ( float x ) //-------------------------------------- { #if defined(USE_C) // 148 cycles return tanf(x); #else // 75 cycles __asm { movss xmm0, x andps xmm0, _m128const_am_inv_sign_mask mulss xmm0, _m128const_am_2_over_pi movss xmm7, xmm0 addss xmm0, _m128const_am_1 cvttss2si ecx, xmm0 mov edx, ecx shl edx, (31 - 1) cvtsi2ss xmm1, ecx and edx, 0x80000000 and ecx, 0x1 subss xmm0, xmm1 jz l_cont movss xmm1, _m128const_am_1 subss xmm1, xmm0 movss xmm0, xmm1 l_cont: mov eax, x and eax, 0x80000000 cvttss2si ecx, xmm7 xor eax, edx mov edx, ecx shl edx, (31 - 1) cvtsi2ss xmm3, ecx and edx, 0x80000000 and ecx, 0x1 subss xmm7, xmm3 jz l_cont2 movss xmm3, _m128const_am_1 subss xmm3, xmm7 movss xmm7, xmm3 l_cont2: movss xmm1, xmm0 movss xmm3, xmm7 movss xmm6, p3 mulss xmm0, xmm0 mulss xmm7, xmm7 xor eax, edx movss xmm2, xmm0 movss xmm4, xmm7 mulss xmm0, xmm6 mulss xmm7, xmm6 movss xmm6, p2 mov x, eax addss xmm0, xmm6 addss xmm7, xmm6 mulss xmm0, xmm2 movss xmm6, p1 mulss xmm7, xmm4 movss xmm5, x addss xmm0, xmm6 addss xmm7, xmm6 mulss xmm0, xmm2 mulss xmm7, xmm4 movss xmm6, p0 orps xmm3, xmm5 addss xmm0, xmm6 addss xmm7, xmm6 mulss xmm0, xmm1 mulss xmm7, xmm3 rcpss xmm0, xmm0 mulss xmm0, xmm7 movss x, xmm0 } return x; #endif // !USE_C } //---------------------------------------------------------------------------- // pow float fast_pow ( float x, float y ) //-------------------------------------- { #if defined(USE_C) // 303 cycles return powf(x, y); #else // 133 cycles __asm { movss xmm0, x movss xmm1, y xorps xmm7, xmm7 comiss xmm7, xmm0 movss xmm7, _m128const_am_inv_mant_mask maxss xmm0, _m128const_am_min_pos_norm // Cut off denormalized stuff. jnc l_zerobase movss xmm3, _m128const_am_1 movss x, xmm0 andps xmm0, xmm7 orps xmm0, xmm3 // xmm3 == 1.0 comiss xmm0, pow_rsqrt2 movss xmm7, xmm0 jc l_lt // 'c' is 'lt' for comiss //l_ge: xor ecx, ecx movss xmm2, xmm3 // xmm3 == 1.0 jmp l_continue l_lt: mov ecx, 1 movss xmm2, _m128const_am_0p5 l_continue: addss xmm7, xmm2 subss xmm0, xmm2 mov edx, x rcpss xmm7, xmm7 mulss xmm0, xmm7 addss xmm0, xmm0 shr edx, 23 movss xmm4, pow_p0 movss xmm6, pow_q0 sub edx, 0x7f movss xmm2, xmm0 mulss xmm2, xmm2 mulss xmm4, xmm2 sub edx, ecx movss xmm5, pow_p1 mulss xmm6, xmm2 cvtsi2ss xmm3, edx movss xmm7, pow_q1 addss xmm4, xmm5 mulss xmm3, xmm1 addss xmm6, xmm7 movss xmm5, pow_p2 mulss xmm4, xmm2 movss xmm7, pow_q2 mulss xmm6, xmm2 addss xmm4, xmm5 mulss xmm1, pow_c0 addss xmm6, xmm7 mulss xmm4, xmm2 rcpss xmm6, xmm6 mulss xmm6, xmm0 movss xmm5, _m128const_am_0p5 mulss xmm4, xmm6 addss xmm0, xmm4 xorps xmm7, xmm7 mulss xmm0, xmm1 addss xmm0, xmm3 maxss xmm0, pow_fmin minss xmm0, pow_fmax xor ecx, ecx addss xmm5, xmm0 mov edx, 1 comiss xmm5, xmm7 cvttss2si eax, xmm5 cmovc ecx, edx // 'c' is 'lt' for comiss sub eax, ecx cvtsi2ss xmm5, eax add eax, 0x7f subss xmm0, xmm5 movss xmm2, xmm0 mulss xmm2, xmm2 movss xmm6, pow_s0 movss xmm4, pow_r0 mulss xmm6, xmm2 movss xmm7, pow_s1 and eax, 0xff // Optional, just for sanity mulss xmm4, xmm2 movss xmm5, pow_r1 shl eax, 23 addss xmm6, xmm7 addss xmm4, xmm5 movss xmm5, pow_r2 mulss xmm4, xmm2 addss xmm4, xmm5 mulss xmm4, xmm0 mov x, eax subss xmm6, xmm4 movss xmm7, _m128const_am_1 rcpss xmm6, xmm6 mulss xmm4, xmm6 movss xmm0, x addss xmm4, xmm4 addss xmm4, xmm7 mulss xmm0, xmm4 jmp l_done l_zerobase: xorps xmm0, xmm0 l_done: movss x, xmm0 } return x; #endif // !USE_C } float fast_hypot( float x, float y ) { #if defined(USE_C) return (float) _hypot( x, y ); #else // 15.38x faster _asm { movss xmm0, x movss xmm1, y mulss xmm0, xmm0 mulss xmm1, xmm1 addss xmm0, xmm1 rsqrtss xmm1, xmm0 movss xmm2, xmm0 mulss xmm0, xmm1 mulss xmm0, xmm1 mulss xmm0, xmm1 mulss xmm0, _m128const_am_0p5 mulss xmm1, _m128const_am_3_over_2 subss xmm1, xmm0 mulss xmm1, xmm2 movss x, xmm1 } return x; #endif // !USE_C } float fast_ceil( float x ) { #if defined(USE_C) return ceilf( x ); #else // 2.01x faster unsigned long m32; _asm { movss xmm0, x stmxcsr m32 mov edx, m32 mov eax, 0ffff9fffh and eax, edx or eax, 4000h mov m32, eax ldmxcsr m32 cvtss2si eax, xmm0 cvtsi2ss xmm0, eax mov m32, edx ldmxcsr m32 movss x, xmm0 } return x; #endif // !USE_C } float fast_floor( float x ) { #if defined(USE_C) return floorf( x ); #else // 1.99x faster unsigned long m32; _asm { movss xmm0, x stmxcsr m32 mov edx, m32 mov eax, 0ffff9fffh and eax, edx or eax, 2000h mov m32, eax ldmxcsr m32 cvtss2si eax, xmm0 cvtsi2ss xmm0, eax mov m32, edx ldmxcsr m32 movss x, xmm0 } return x; #endif // !USE_C } float fast_tanh( float x ) { #if defined(USE_C) return tanhf( x ); #else // 3.26x faster _asm { mov eax, x cmp eax, 0ba83126fh ja rettanhm cmp eax, 3a83126fh ja rettanhp movss xmm1, x movss xmm0, th1 mulss xmm1, xmm1 mulss xmm1, th3 subss xmm0, xmm1 mulss xmm0, x movss x, xmm0 } return x; _asm { ALIGN 16 rettanhm: movss xmm0, x mulss xmm0, th2p maxss xmm0, fmin minss xmm0, fmax movss xmm1, rln2 mulss xmm1, xmm0 movss xmm6, _m128const_am_0 addss xmm1, _m128const_am_0p5 xor ecx, ecx mov edx, 1 comiss xmm1, xmm6 cvttss2si eax, xmm1 cmovc ecx, edx sub eax, ecx cvtsi2ss xmm1, eax add eax, 7fh movss xmm2, xmm1 mulss xmm1, c1 and eax, 0ffh mulss xmm2, c2 subss xmm0, xmm1 shl eax, 23 subss xmm0, xmm2 movss xmm2, xmm0 mov x, eax mulss xmm2, xmm2 movss xmm5, q0 movss xmm3, p0exp mulss xmm5, xmm2 movss xmm6, q1 mulss xmm3, xmm2 movss xmm4, p1exp addss xmm5, xmm6 addss xmm3, xmm4 movss xmm6, q2 mulss xmm5, xmm2 mulss xmm3, xmm2 addss xmm5, xmm6 mulss xmm3, xmm0 movss xmm6, q3 mulss xmm5, xmm2 addss xmm3, xmm0 addss xmm5, xmm6 movss xmm0, x subss xmm5, xmm3 rcpss xmm5, xmm5 movss xmm6, _m128const_am_1 mulss xmm3, xmm5 addss xmm3, xmm3 addss xmm3, xmm6 mulss xmm0, xmm3 addss xmm0, th1 rcpss xmm1, xmm0 movss xmm2, xmm1 addss xmm2, xmm2 mulss xmm1, xmm1 mulss xmm1, xmm0 subss xmm2, xmm1 mulss xmm2, th2p movss xmm1, th1 subss xmm1, xmm2 movss x, xmm1 } return x; _asm { ALIGN 16 rettanhp: movss xmm0, x mulss xmm0, th2m maxss xmm0, fmin minss xmm0, fmax movss xmm1, rln2 mulss xmm1, xmm0 movss xmm6, _m128const_am_0 addss xmm1, _m128const_am_0p5 xor ecx, ecx mov edx, 1 comiss xmm1, xmm6 cvttss2si eax, xmm1 cmovc ecx, edx sub eax, ecx cvtsi2ss xmm1, eax add eax, 7fh movss xmm2, xmm1 mulss xmm1, c1 and eax, 0ffh mulss xmm2, c2 subss xmm0, xmm1 shl eax, 23 subss xmm0, xmm2 movss xmm2, xmm0 mov x, eax mulss xmm2, xmm2 movss xmm5, q0 movss xmm3, p0exp mulss xmm5, xmm2 movss xmm6, q1 mulss xmm3, xmm2 movss xmm4, p1exp addss xmm5, xmm6 addss xmm3, xmm4 movss xmm6, q2 mulss xmm5, xmm2 mulss xmm3, xmm2 addss xmm5, xmm6 mulss xmm3, xmm0 movss xmm6, q3 mulss xmm5, xmm2 addss xmm3, xmm0 addss xmm5, xmm6 movss xmm0, x subss xmm5, xmm3 rcpss xmm5, xmm5 movss xmm6, _m128const_am_1 mulss xmm3, xmm5 addss xmm3, xmm3 addss xmm3, xmm6 mulss xmm0, xmm3 addss xmm0, th1 rcpss xmm1, xmm0 movss xmm2, xmm1 addss xmm2, xmm2 mulss xmm1, xmm1 mulss xmm1, xmm0 subss xmm2, xmm1 mulss xmm2, th2p subss xmm2, th1 movss x, xmm2 } return x; #endif // !USE_C } float fast_cosh( float x ) { #if defined(USE_C) return coshf( x ); #else // 3.96x faster _asm { movss xmm0, x maxss xmm0, fmin minss xmm0, fmax movss xmm1, rln2 mulss xmm1, xmm0 movss xmm6, _m128const_am_0 addss xmm1, _m128const_am_0p5 xor ecx, ecx mov edx, 1 comiss xmm1, xmm6 cvttss2si eax, xmm1 cmovc ecx, edx sub eax, ecx cvtsi2ss xmm1, eax add eax, 7fh movss xmm2, xmm1 mulss xmm1, c1 and eax, 0ffh mulss xmm2, c2 subss xmm0, xmm1 shl eax, 23 subss xmm0, xmm2 movss xmm2, xmm0 mov x, eax mulss xmm2, xmm2 movss xmm5, q0 movss xmm3, p0exp mulss xmm5, xmm2 movss xmm6, q1 mulss xmm3, xmm2 movss xmm4, p1exp addss xmm5, xmm6 addss xmm3, xmm4 movss xmm6, q2 mulss xmm5, xmm2 mulss xmm3, xmm2 addss xmm5, xmm6 mulss xmm3, xmm0 movss xmm6, q3 mulss xmm5, xmm2 addss xmm3, xmm0 addss xmm5, xmm6 movss xmm0, x subss xmm5, xmm3 rcpss xmm5, xmm5 movss xmm6, _m128const_am_1 mulss xmm3, xmm5 addss xmm3, xmm3 addss xmm3, xmm6 mulss xmm0, xmm3 rcpss xmm1, xmm0 movss xmm2, xmm1 addss xmm2, xmm2 mulss xmm1, xmm1 mulss xmm1, xmm0 subss xmm2, xmm1 addss xmm0, xmm2 mulss xmm0, sh5 movss x, xmm0 } return x; #endif // !USE_C } float fast_sinh( float x ) { #if defined(USE_C) return sinhf( x ); #else // 3.30x faster _asm { mov eax, x and eax, 7fffffffh cmp eax, 3a83126fh jg culcsinh movss xmm0, x mulss xmm0, xmm0 mulss xmm0, sh6 addss xmm0, sh1 mulss xmm0, x movss x, xmm0 } return x; _asm { ALIGN 16 culcsinh: movss xmm0, x maxss xmm0, fmin minss xmm0, fmax movss xmm1, rln2 mulss xmm1, xmm0 movss xmm6, _m128const_am_0 addss xmm1, _m128const_am_0p5 xor ecx, ecx mov edx, 1 comiss xmm1, xmm6 cvttss2si eax, xmm1 cmovc ecx, edx sub eax, ecx cvtsi2ss xmm1, eax add eax, 7fh movss xmm2, xmm1 mulss xmm1, c1 and eax, 0ffh mulss xmm2, c2 subss xmm0, xmm1 shl eax, 23 subss xmm0, xmm2 movss xmm2, xmm0 mov x, eax mulss xmm2, xmm2 movss xmm5, q0 movss xmm3, p0exp mulss xmm5, xmm2 movss xmm6, q1 mulss xmm3, xmm2 movss xmm4, p1exp addss xmm5, xmm6 addss xmm3, xmm4 movss xmm6, q2 mulss xmm5, xmm2 mulss xmm3, xmm2 addss xmm5, xmm6 mulss xmm3, xmm0 movss xmm6, q3 mulss xmm5, xmm2 addss xmm3, xmm0 addss xmm5, xmm6 movss xmm0, x subss xmm5, xmm3 rcpss xmm5, xmm5 movss xmm6, _m128const_am_1 mulss xmm3, xmm5 addss xmm3, xmm3 addss xmm3, xmm6 mulss xmm0, xmm3 rcpss xmm1, xmm0 movss xmm2, xmm1 addss xmm2, xmm2 mulss xmm1, xmm1 mulss xmm1, xmm0 subss xmm2, xmm1 subss xmm0, xmm2 mulss xmm0, sh5 movss x, xmm0 } return x; #endif // !USE_C }