//----------------------------------------------------------------------------------- void MathlibNEON::SinCos4( ArrayReal x, ArrayReal &outSin, ArrayReal &outCos ) { // TODO: Improve accuracy by mapping to the range [-pi/4, pi/4] and swap // between cos & sin depending on which quadrant it fell: // Quadrant | sin | cos // n = 0 -> sin( x ), cos( x ) // n = 1 -> cos( x ), -sin( x ) // n = 2 -> -sin( x ), -cos( x ) // n = 3 -> -sin( x ), sin( x ) // See ARGUMENT REDUCTION FOR HUGE ARGUMENTS: // Good to the Last Bit // K. C. Ng and themembers of the FP group of SunPro // http://www.derekroconnor.net/Software/Ng--ArgReduction.pdf // -- Perhaps we can leave this to GSoC students? -- // Map arbitrary angle x to the range [-pi; +pi] without using division. // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON) // can replace the add, the sub, & the two muls for two mad ArrayReal integralPart; x = vaddq_f32( vmulq_f32( x, ONE_DIV_2PI ), HALF ); x = Modf4( x, integralPart ); x = vsubq_f32( vmulq_f32( x, TWO_PI ), PI ); sincos_ps( x, &outSin, &outCos ); }
/*
 * Builds a quaternion from three angles using SSE.
 *
 * angles     - input; elements [0..2] are read.
 * quaternion - output; four floats are written (unaligned store).
 *
 * With s_i / c_i being the sine / cosine of angles[i] scaled by the _ps_0p5
 * constant (presumably 0.5, i.e. half angles - confirm against its definition),
 * the stored result is:
 *   quaternion[0] = s0*c1*c2 - c0*s1*s2
 *   quaternion[1] = c0*s1*c2 + s0*c1*s2
 *   quaternion[2] = c0*c1*s2 - s0*s1*c2
 *   quaternion[3] = c0*c1*c2 + s0*s1*s2
 */
void AngleQuaternion(vec_t *angles, vec_t *quaternion)
{
	// Sign-bit mask for lanes 0 and 2. Stored as unsigned: 0x80000000 does not fit
	// in an int, so brace-initialising an int array with it is a narrowing conversion.
	static const ALIGN16_BEG unsigned int ps_signmask[4] ALIGN16_END = { 0x80000000u, 0u, 0x80000000u, 0u };

	// Only lanes 0..2 contribute to the result (see the shuffles below), so copy
	// exactly three floats instead of loading a fourth one past the angle triple.
	const float padded[4] = { angles[0], angles[1], angles[2], 0.0f };
	__m128 a = _mm_loadu_ps(padded);

	a = _mm_mul_ps(a, _mm_load_ps(_ps_0p5)); //a *= 0.5
	__m128 s, c;
	sincos_ps(a, &s, &c);

	__m128 im1 = _mm_shuffle_ps(s, c, _MM_SHUFFLE(1, 0, 1, 0)); //im1 =  {sin[0], sin[1], cos[0], cos[1] }
	__m128 im2 = _mm_shuffle_ps(c, s, _MM_SHUFFLE(2, 2, 2, 2)); //im2 =  {cos[2], cos[2], sin[2], sin[2] }

	// part1 = { s0*c1*c2, c0*s1*c2, c0*c1*s2, s0*s1*s2 }
	__m128 part1 = _mm_mul_ps(
		_mm_shuffle_ps(im1, im1, _MM_SHUFFLE(1, 2, 2, 0)),
		_mm_shuffle_ps(im1, im1, _MM_SHUFFLE(0, 3, 1, 3))
		);
	part1 = _mm_mul_ps(part1, im2);

	// part2 = { c0*s1*s2, s0*c1*s2, s0*s1*c2, c0*c1*c2 } before the sign flip
	__m128 part2 = _mm_mul_ps(
		_mm_shuffle_ps(im1, im1, _MM_SHUFFLE(2, 1, 0, 2)),
		_mm_shuffle_ps(im1, im1, _MM_SHUFFLE(3, 0, 3, 1))
		);
	part2 = _mm_mul_ps(part2, _mm_shuffle_ps(im2, im2, _MM_SHUFFLE(0, 0, 2, 2)));

	// Negate lanes 0 and 2 of part2 by toggling their sign bits.
	__m128 signmask = _mm_load_ps((const float *)ps_signmask);
	part2 = _mm_xor_ps(part2, signmask);

	__m128 res = _mm_add_ps(part1, part2);
	_mm_storeu_ps(quaternion, res);
}
void SinCos2(const float4 &angleRadians, float4 &outSin, float4 &outCos) { #ifdef MATH_SSE2 __m128 angle = modf_ps(angleRadians.v, pi2); sincos_ps(angle, &outSin.v, &outCos.v); #else SinCos(angleRadians.x, outSin.x, outCos.x); SinCos(angleRadians.y, outSin.y, outCos.y); #endif }
void SinCos(float angleRadians, float &outSin, float &outCos) { #ifdef MATH_USE_SINCOS_LOOKUPTABLE return sincos_lookuptable(angleRadians, outSin, outCos); #elif defined(MATH_SSE2) __m128 angle = modf_ps(setx_ps(angleRadians), pi2); __m128 sin, cos; sincos_ps(angle, &sin, &cos); outSin = s4f_x(sin); outCos = s4f_x(cos); #else outSin = Sin(angleRadians); outCos = Cos(angleRadians); #endif }
/*
 * Computes the sine and cosine of four angles with one sincos_ps call.
 *
 * r1..r4 - input angles, packed into lanes 0..3 in that order.
 * s0, c0 - receive the sine / cosine of r1; written unconditionally, so they
 *          must not be null.
 * s1..s3 - optional sine outputs for r2..r4; a null pointer is skipped.
 * c1..c3 - optional cosine outputs for r2..r4; a null pointer is skipped.
 */
void SinCosFastVector(float r1, float r2, float r3, float r4,
                      float *s0, float *s1, float *s2, float *s3,
                      float *c0, float *c1, float *c2, float *c3)
{
    v4sf rad_vector = {r1, r2, r3, r4};
    v4sf sin_vector, cos_vector;

    sincos_ps(rad_vector, &sin_vector, &cos_vector);

    *s0 = sin_vector[0];
    if(s1) *s1 = sin_vector[1];
    if(s2) *s2 = sin_vector[2];
    if(s3) *s3 = sin_vector[3];

    // Each cosine output is guarded by its own pointer. Testing the sine
    // pointers here would dereference a null c-pointer, or skip a valid one,
    // whenever the two sets of optional outputs do not match.
    *c0 = cos_vector[0];
    if(c1) *c1 = cos_vector[1];
    if(c2) *c2 = cos_vector[2];
    if(c3) *c3 = cos_vector[3];
}
void Quat::SetFromAxisAngle(const float4 &axis, float angle) { assume1(EqualAbs(axis.w, 0.f), axis); assume2(axis.IsNormalized(1e-4f), axis, axis.Length4()); assume1(MATH_NS::IsFinite(angle), angle); #if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE2) // Best: 26.499 nsecs / 71.024 ticks, Avg: 26.856 nsecs, Worst: 27.651 nsecs simd4f halfAngle = set1_ps(0.5f*angle); simd4f sinAngle, cosAngle; sincos_ps(halfAngle, &sinAngle, &cosAngle); simd4f quat = mul_ps(axis, sinAngle); // Set the w component to cosAngle. simd4f highPart = _mm_unpackhi_ps(quat, cosAngle); // [_ _ 1 z] q = _mm_movelh_ps(quat, highPart); // [1 z y x] #else // Best: 36.868 nsecs / 98.312 ticks, Avg: 36.980 nsecs, Worst: 41.477 nsecs SetFromAxisAngle(axis.xyz(), angle); #endif }
/* Description: This routine performs an inverse FFT to real data.
*              This code is for floating point data.
*              The buffer is processed as an array of __m128, so every
*              operation acts on four float lanes at once.
*
*  Note: Output is BIT-REVERSED! so you must use the BitReversed to
*        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]
*                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
*        Input is in normal order, interleaved (real,imaginary) complex data
*        You must call InitializeFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
*  Note: The scaling on this is done according to the standard FFT definition,
*        so a unit amplitude DC signal will output an amplitude of (N)
*        (Older revisions would progressively scale the input, so the output
*        values would be similar in amplitude to the input values, which is
*        good when using fixed point arithmetic)
*
*  Parameters:
*     buffer - data transformed in place (reinterpreted as __m128 elements).
*     h      - FFT handle; Points, pow2Bits, SinTable and BitReversed are read.
*
*  useSinCosTable / useBitReverseTable (defined outside this function) select
*  between the precomputed tables in h and on-the-fly computation.
*/
void InverseRealFFTf4x(fft_type *buffer,HFFT h)
{
   __m128 *localBuffer=(__m128 *)buffer;

   __m128 *A,*B;
   fft_type *sptr;
   __m128 *endptr1,*endptr2;
   int br1Index, br1Value;
   __m128 HRplus,HRminus,HIplus,HIminus;
   __m128 v1,v2,sin,cos;
   fft_type iToRad=2*M_PI/(2*h->Points);

   /* Batch of four sin/cos values produced by one sincos_ps call. It is filled
    * on one loop iteration and consumed over the following three, so it has to
    * live outside the loop bodies: a per-iteration declaration would make the
    * later reads use an object that was never initialized. */
   v4sfu sin4_2, cos4_2;

   int ButterfliesPerGroup=h->Points/2;

   /* Massage input to get the input for a real output sequence. */
   A=localBuffer+2;
   B=localBuffer+h->Points*2-2;
   br1Index=1; //h->BitReversed+1;
   int iSinCosCalIndex=0;   /* next lane of sin4_2/cos4_2 to use; 0 means "recompute the batch" */
   while(A<B)
   {
      if(useBitReverseTable) {
         br1Value=h->BitReversed[br1Index];
      } else {
         br1Value=SmallReverseBits(br1Index,h->pow2Bits);
      }
      if(useSinCosTable) {
         sin=_mm_set1_ps(h->SinTable[br1Value]);
         cos=_mm_set1_ps(h->SinTable[br1Value+1]);
      } else {
         if(!iSinCosCalIndex)
         {
            /* Compute sin/cos for this index and the next three in one call. */
            v4sfu vx;
            for(int i=0;i<4;i++)
               vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
            sincos_ps(&vx, &sin4_2, &cos4_2);
            sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
            cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
            iSinCosCalIndex++;
         } else {
            sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
            cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
            if(iSinCosCalIndex==3)
               iSinCosCalIndex=0;
            else
               iSinCosCalIndex++;
         }
      }
      HRminus = _mm_sub_ps(*A,  *B);
      HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B,  _mm_set1_ps(2.0)));
      HIminus = _mm_sub_ps( *(A+1), *(B+1));
      HIplus = _mm_add_ps(HIminus,  _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
      v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
      v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
      *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
      *B = _mm_sub_ps(*A, v1);
      *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
      *(B+1) = _mm_sub_ps(*(A+1), HIminus);

      A=&A[2];
      B=&B[-2];
      br1Index++;
   }
   /* Handle center bin (just need conjugate) */
   // negate sse style: flip the sign bit of every lane
   *(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));

   /* Handle DC bin separately - this ignores any Fs/2 component
   buffer[1]=buffer[0]=buffer[0]/2;*/
   /* Handle DC and Fs/2 bins specially */
   /* The DC bin is passed in as the real part of the DC complex value */
   /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
   /* (v1+v2) = buffer[0] == the DC component */
   /* (v1-v2) = buffer[1] == the Fs/2 component */
   v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
   v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
   localBuffer[0]=v1;
   localBuffer[1]=v2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1=localBuffer+h->Points*2;

   while(ButterfliesPerGroup>0)
   {
      A=localBuffer;
      B=localBuffer+ButterfliesPerGroup*2;
      sptr=h->SinTable;
      int iSinCosIndex=0;
      int iSinCosCalIndex=0;   /* restarts the sin/cos batch for every pass */
      while(A<endptr1)
      {
         if(useSinCosTable) {
            sin=_mm_set1_ps(*(sptr++));
            cos=_mm_set1_ps(*(sptr++));
         } else {
            if(!iSinCosCalIndex)
            {
               /* Compute sin/cos for this group and the next three in one call. */
               v4sfu vx;
               for(int i=0;i<4;i++)
                  vx.m128_f32[i]=((fft_type )SmallReverseBits(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
               sincos_ps(&vx, &sin4_2, &cos4_2);
               sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
               cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
               iSinCosCalIndex++;
            } else {
               sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
               cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
               if(iSinCosCalIndex==3)
                  iSinCosCalIndex=0;
               else
                  iSinCosCalIndex++;
            }
            iSinCosIndex++;
         }
         endptr2=B;
         while(A<endptr2)
         {
            v1=_mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
            v2=_mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
            *B=_mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
            *(A++)=_mm_sub_ps(*(B++), v1);
            *B=_mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
            *(A++)=_mm_sub_ps(*(B++),v2);
         }
         A=B;
         B=&B[ButterfliesPerGroup*2];
      }
      ButterfliesPerGroup >>= 1;
   }
}
/* Description: Forward counterpart of InverseRealFFTf4x: runs the butterfly
*              passes first and then massages the result into the output for a
*              real input sequence. The buffer is processed in place as an
*              array of __m128, so every operation acts on four float lanes.
*
*  Parameters:
*     buffer - data transformed in place (reinterpreted as __m128 elements).
*     h      - FFT handle; Points, pow2Bits, SinTable and BitReversed are read.
*
*  On return localBuffer[0] holds (old [0] + old [1]) and localBuffer[1] holds
*  (old [0] - old [1]), i.e. the Fs/2 value is stored in the imaginary slot of
*  the DC bin.
*
*  useSinCosTable / useBitReverseTable (defined outside this function) select
*  between the precomputed tables in h and on-the-fly computation.
*/
void RealFFTf4x(fft_type *buffer,HFFT h)
{
   __m128 *localBuffer=(__m128 *)buffer;

   __m128 *A,*B;
   fft_type *sptr;
   __m128 *endptr1,*endptr2;
   int br1Index, br2Index;
   int br1Value, br2Value;
   __m128 HRplus,HRminus,HIplus,HIminus;
   __m128 v1,v2,sin,cos;
   fft_type iToRad=2*M_PI/(2*h->Points);

   /* Batch of four sin/cos values produced by one sincos_ps call. It is filled
    * on one loop iteration and consumed over the following three, so it has to
    * live outside the loop bodies: a per-iteration declaration would make the
    * later reads use an object that was never initialized. */
   v4sfu sin4_2, cos4_2;

   int ButterfliesPerGroup=h->Points/2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1=&localBuffer[h->Points*2];

   while(ButterfliesPerGroup>0)
   {
      A=localBuffer;
      B=&localBuffer[ButterfliesPerGroup*2];
      sptr=h->SinTable;
      int iSinCosIndex=0;
      int iSinCosCalIndex=0;   /* next lane of sin4_2/cos4_2 to use; 0 means "recompute the batch" */
      while(A<endptr1)
      {
         if(useSinCosTable) {
            sin=_mm_set1_ps(*(sptr++));
            cos=_mm_set1_ps(*(sptr++));
         } else {
            if(!iSinCosCalIndex)
            {
               /* Compute sin/cos for this group and the next three in one call. */
               v4sfu vx;
               for(int i=0;i<4;i++)
                  vx.m128_f32[i]=((fft_type )SmallReverseBits(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
               sincos_ps(&vx, &sin4_2, &cos4_2);
               sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
               cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
               iSinCosCalIndex++;
            } else {
               sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
               cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
               if(iSinCosCalIndex==3)
                  iSinCosCalIndex=0;
               else
                  iSinCosCalIndex++;
            }
            iSinCosIndex++;
         }
         endptr2=B;
         while(A<endptr2)
         {
            v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
            v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
            *B=_mm_add_ps( *A, v1);
            __m128 temp128 = _mm_set1_ps( 2.0);
            *(A++)=_mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
            *B=_mm_sub_ps(*A,v2);
            *(A++)=_mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
         }
         A=B;
         B=&B[ButterfliesPerGroup*2];
      }
      ButterfliesPerGroup >>= 1;
   }

   /* Massage output to get the output for a real input sequence. */
   br1Index=1; // h->BitReversed+1;
   br2Index=h->Points-1;   //h->BitReversed+h->Points-1;

   int iSinCosCalIndex=0;
   while(br1Index<br2Index)
   {
      if(useBitReverseTable) {
         br1Value=h->BitReversed[br1Index];
         br2Value=h->BitReversed[br2Index];
      } else {
         br1Value=SmallReverseBits(br1Index,h->pow2Bits);
         br2Value=SmallReverseBits(br2Index,h->pow2Bits);
      }
      if(useSinCosTable) {
         sin=_mm_set1_ps(h->SinTable[br1Value]);
         cos=_mm_set1_ps(h->SinTable[br1Value+1]);
      } else {
         if(!iSinCosCalIndex)
         {
            /* Compute sin/cos for this index and the next three in one call. */
            v4sfu vx;
            for(int i=0;i<4;i++)
               vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
            sincos_ps(&vx, &sin4_2, &cos4_2);
            sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
            cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
            iSinCosCalIndex++;
         } else {
            sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
            cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
            if(iSinCosCalIndex==3)
               iSinCosCalIndex=0;
            else
               iSinCosCalIndex++;
         }
      }

      A=&localBuffer[br1Value];
      B=&localBuffer[br2Value];
      __m128 temp128 = _mm_set1_ps( 2.0);
      HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
      HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
      v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
      v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
      temp128 = _mm_set1_ps( 0.5);
      *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
      *B = _mm_sub_ps(*A, v1);
      *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
      *(B+1) = _mm_sub_ps(*(A+1), HIminus);

      br1Index++;
      br2Index--;
   }
   /* Handle the center bin (just need a conjugate) */
   if(useBitReverseTable)
      A=&localBuffer[h->BitReversed[br1Index]+1];
   else
      A=&localBuffer[SmallReverseBits(br1Index,h->pow2Bits)+1];
   // negate sse style: flip the sign bit of every lane
   *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
   /* Handle DC and Fs/2 bins separately */
   /* Put the Fs/2 value into the imaginary part of the DC bin */
   v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
   localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
   localBuffer[1]=v1;
}
int check_sincos_precision(float xmin, float xmax) { unsigned nb_trials = 100000; printf("checking sines on [%g*Pi, %g*Pi]\n", xmin, xmax); float max_err_sin_ref = 0, max_err_sin_cep = 0, max_err_sin_x = 0; float max_err_cos_ref = 0, max_err_cos_cep = 0, max_err_cos_x = 0; float max_err_sum_sqr_test = 0; float max_err_sum_sqr_ref = 0; xmin *= M_PI; xmax *= M_PI; unsigned i; for (i=0; i < nb_trials; ++i) { V4SF vx, sin4, cos4, sin4_2, cos4_2; vx.f[0] = i*(xmax-xmin)/(nb_trials-1) + xmin; vx.f[1] = (i+.5)*(xmax-xmin)/(nb_trials-1) + xmin; vx.f[2] = frand()*(xmax-xmin); vx.f[3] = (i / 32)*M_PI/((i%32)+1); if (vx.f[3] < xmin || vx.f[3] > xmax) vx.f[3] = frand()*(xmax-xmin); /* vx.f[0] = M_PI/2; vx.f[1] = M_PI; vx.f[2] = M_PI/3; vx.f[3] = M_PI/4; */ sin4.v = sin_ps(vx.v); cos4.v = cos_ps(vx.v); sincos_ps(vx.v, &sin4_2.v, &cos4_2.v); unsigned j; for (j=0; j < 4; ++j) { float x = vx.f[j]; float sin_test = sin4.f[j]; float cos_test = cos4.f[j]; if (sin_test != sin4_2.f[j]) { printf("sin / sincos mismatch at x=%g\n", x); exit(1); return 1; } if (cos_test != cos4_2.f[j]) { printf("cos / sincos mismatch at x=%g\n", x); return 1; } float sin_ref = sinf(x); float sin_cep = cephes_sinf(x); float err_sin_ref = fabs(sin_ref - sin_test); float err_sin_cep = fabs(sin_cep - sin_test); if (err_sin_ref > max_err_sin_ref) { max_err_sin_ref = err_sin_ref; max_err_sin_x = x; } max_err_sin_cep = MAX(max_err_sin_cep, err_sin_cep); float cos_ref = cosf(x); float cos_cep = cephes_cosf(x); float err_cos_ref = fabs(cos_ref - cos_test); float err_cos_cep = fabs(cos_cep - cos_test); if (err_cos_ref > max_err_cos_ref) { max_err_cos_ref = err_cos_ref; max_err_cos_x = x; } max_err_cos_cep = MAX(max_err_cos_cep, err_cos_cep); float err_sum_sqr_test = fabs(1 - cos_test*cos_test - sin_test*sin_test); float err_sum_sqr_ref = fabs(1 - cos_ref*cos_ref - sin_ref*sin_ref); max_err_sum_sqr_ref = MAX(max_err_sum_sqr_ref, err_sum_sqr_ref); max_err_sum_sqr_test = MAX(max_err_sum_sqr_test, err_sum_sqr_test); 
//printf("sin(%g) = %g %g err=%g\n", x, sin_ref, sin_test, err_sin_ref); } } printf("max deviation from sinf(x): %g at %14.12g*Pi, max deviation from cephes_sin(x): %g\n", max_err_sin_ref, max_err_sin_x/M_PI, max_err_sin_cep); printf("max deviation from cosf(x): %g at %14.12g*Pi, max deviation from cephes_cos(x): %g\n", max_err_cos_ref, max_err_cos_x/M_PI, max_err_cos_cep); printf("deviation of sin(x)^2+cos(x)^2-1: %g (ref deviation is %g)\n", max_err_sum_sqr_test, max_err_sum_sqr_ref); if (max_err_sum_sqr_ref < 2e-7 && max_err_sin_ref < 2e-7 && max_err_cos_ref < 2e-7) { printf(" ->> precision OK for the sin_ps / cos_ps / sincos_ps <<-\n\n"); return 0; } else { printf("\n WRONG PRECISION !! there is a problem\n\n"); return 1; } }
/* Adapter giving sincos_ps a one-in / one-out signature: evaluates both
   results for x and returns only the sine vector; the cosine is discarded. */
v4sf stupid_sincos_ps(v4sf x) {
  v4sf sine_part, cosine_part;
  sincos_ps(x, &sine_part, &cosine_part);
  return sine_part;
}