Exemplo n.º 1
0
    //-----------------------------------------------------------------------------------
    void MathlibNEON::SinCos4( ArrayReal x, ArrayReal &outSin, ArrayReal &outCos )
    {
        // TODO: Accuracy could be improved by reducing the argument to
        // [-pi/4, pi/4] and choosing between sin & cos according to the
        // quadrant n the angle fell in:
        // Quadrant | sin     |  cos
        // n = 0 ->  sin( x ),  cos( x )
        // n = 1 ->  cos( x ), -sin( x )
        // n = 2 -> -sin( x ), -cos( x )
        // n = 3 -> -cos( x ),  sin( x )
        // Reference: "ARGUMENT REDUCTION FOR HUGE ARGUMENTS: Good to the Last Bit",
        // K. C. Ng and the members of the FP group of SunPro
        // http://www.derekroconnor.net/Software/Ng--ArgReduction.pdf

        // Division-free range reduction towards [-pi; +pi] (trick taken from
        // MSDN's HLSL samples). On architectures with a multiply-add instruction
        // each mul/add and mul/sub pair below could become a single mad.

        // turns = x / (2 * pi) + 0.5
        ArrayReal turns = vaddq_f32( vmulq_f32( x, ONE_DIV_2PI ), HALF );

        // Keep what Modf4 returns; the value written to wholeTurns is discarded.
        ArrayReal wholeTurns;
        ArrayReal fraction = Modf4( turns, wholeTurns );

        // reduced = fraction * 2 * pi - pi
        ArrayReal reduced = vsubq_f32( vmulq_f32( fraction, TWO_PI ), PI );

        sincos_ps( reduced, &outSin, &outCos );
    }
Exemplo n.º 2
0
/// Builds a quaternion from three angles using SSE.
///
/// With s[i] = sin(angles[i] / 2) and c[i] = cos(angles[i] / 2) the result is
///   quaternion[0] = s0*c1*c2 - c0*s1*s2
///   quaternion[1] = c0*s1*c2 + s0*c1*s2
///   quaternion[2] = c0*c1*s2 - s0*s1*c2
///   quaternion[3] = c0*c1*c2 + s0*s1*s2
///
/// @param angles      Input angles; no alignment is required (unaligned load).
///                    NOTE(review): four floats are read although only lanes
///                    0..2 contribute to the result - confirm that callers
///                    always provide four readable floats.
/// @param quaternion  Receives the four components; no alignment is required.
void AngleQuaternion(vec_t *angles, vec_t *quaternion)
{
	__m128 a = _mm_loadu_ps(angles);
	a = _mm_mul_ps(a, _mm_load_ps(_ps_0p5)); //a *= 0.5
	__m128 s, c;
	sincos_ps(a, &s, &c);

	__m128 im1 = _mm_shuffle_ps(s, c, _MM_SHUFFLE(1, 0, 1, 0)); //im1 =  {sin[0], sin[1], cos[0], cos[1] }
	__m128 im2 = _mm_shuffle_ps(c, s, _MM_SHUFFLE(2, 2, 2, 2)); //im2 =  {cos[2], cos[2], sin[2], sin[2] }

	// part1 = {s0*c1, c0*s1, c0*c1, s0*s1} * {c2, c2, s2, s2}
	__m128 part1 = _mm_mul_ps(
		_mm_shuffle_ps(im1, im1, _MM_SHUFFLE(1, 2, 2, 0)),
		_mm_shuffle_ps(im1, im1, _MM_SHUFFLE(0, 3, 1, 3))
		);
	part1 = _mm_mul_ps(part1, im2);

	// part2 = {c0*s1, s0*c1, s0*s1, c0*c1} * {s2, s2, c2, c2}
	__m128 part2 = _mm_mul_ps(
		_mm_shuffle_ps(im1, im1, _MM_SHUFFLE(2, 1, 0, 2)),
		_mm_shuffle_ps(im1, im1, _MM_SHUFFLE(3, 0, 3, 1))
		);

	part2 = _mm_mul_ps(part2, _mm_shuffle_ps(im2, im2, _MM_SHUFFLE(0, 0, 2, 2)));

	// Flip the sign of lanes 0 and 2 of part2. The mask is built from -0.0f
	// (only the sign bit set) rather than from an int array holding 0x80000000:
	// that initializer is a narrowing conversion, and reading it back through a
	// float* relied on type punning. _mm_set_ps lists lanes from 3 down to 0.
	const __m128 signmask = _mm_set_ps(0.0f, -0.0f, 0.0f, -0.0f);
	part2 = _mm_xor_ps(part2, signmask);

	__m128 res = _mm_add_ps(part1, part2);
	_mm_storeu_ps(quaternion, res);
}
Exemplo n.º 3
0
/// Computes sine and cosine of the x and y components of angleRadians.
/// The scalar path writes only the x and y members of outSin / outCos; the SSE2
/// path writes the whole vector (.v) of both outputs.
void SinCos2(const float4 &angleRadians, float4 &outSin, float4 &outCos)
{
#ifndef MATH_SSE2
	// Scalar fallback: one SinCos call per component.
	SinCos(angleRadians.x, outSin.x, outCos.x);
	SinCos(angleRadians.y, outSin.y, outCos.y);
#else
	// Pre-process the angle with modf_ps against pi2, then evaluate all lanes at once.
	__m128 wrapped = modf_ps(angleRadians.v, pi2);
	sincos_ps(wrapped, &outSin.v, &outCos.v);
#endif
}
Exemplo n.º 4
0
/// Computes the sine and cosine of a single angle.
/// The implementation is chosen at compile time: a lookup table when
/// MATH_USE_SINCOS_LOOKUPTABLE is defined, otherwise sincos_ps when MATH_SSE2 is
/// defined, otherwise separate Sin() and Cos() calls.
/// @param angleRadians  The input angle.
/// @param outSin        Receives the sine.
/// @param outCos        Receives the cosine.
void SinCos(float angleRadians, float &outSin, float &outCos)
{
#if defined(MATH_USE_SINCOS_LOOKUPTABLE)
	sincos_lookuptable(angleRadians, outSin, outCos);
#elif defined(MATH_SSE2)
	// Place the scalar in a vector, evaluate, then read the x lane back out.
	__m128 wrapped = modf_ps(setx_ps(angleRadians), pi2);
	__m128 sinLanes, cosLanes;
	sincos_ps(wrapped, &sinLanes, &cosLanes);
	outSin = s4f_x(sinLanes);
	outCos = s4f_x(cosLanes);
#else
	outSin = Sin(angleRadians);
	outCos = Cos(angleRadians);
#endif
}
Exemplo n.º 5
0
/// Evaluates sine and cosine of four angles with a single sincos_ps call.
///
/// @param r1,r2,r3,r4  The four input angles (lanes 0..3).
/// @param s0           Receives the sine of r1. Must not be null.
/// @param s1,s2,s3     Receive the sines of r2..r4; each may be null to skip it.
/// @param c0           Receives the cosine of r1. Must not be null.
/// @param c1,c2,c3     Receive the cosines of r2..r4; each may be null to skip it.
void SinCosFastVector(float r1, float r2, float r3, float r4,
					  float *s0, float *s1, float *s2, float *s3,
					  float *c0, float *c1, float *c2, float *c3)
{
	v4sf rad_vector = {r1, r2, r3, r4};
	v4sf sin_vector, cos_vector;

	sincos_ps(rad_vector, &sin_vector, &cos_vector);

	*s0 = sin_vector[0];
	if(s1) *s1 = sin_vector[1];
	if(s2) *s2 = sin_vector[2];
	if(s3) *s3 = sin_vector[3];

	// Each cosine output is guarded by its own pointer. Guarding it with the
	// matching sine pointer would dereference a null cN whenever sN is set, and
	// would skip a valid cN whenever sN is null.
	*c0 = cos_vector[0];
	if(c1) *c1 = cos_vector[1];
	if(c2) *c2 = cos_vector[2];
	if(c3) *c3 = cos_vector[3];
}
Exemplo n.º 6
0
/// Sets this quaternion from a rotation axis and an angle.
/// @param axis   Rotation axis; the checks below expect axis.w to be (nearly)
///               zero and the vector to be normalized.
/// @param angle  Rotation angle; expected to be finite.
void Quat::SetFromAxisAngle(const float4 &axis, float angle)
{
	assume1(EqualAbs(axis.w, 0.f), axis);
	assume2(axis.IsNormalized(1e-4f), axis, axis.Length4());
	assume1(MATH_NS::IsFinite(angle), angle);

#if !(defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE2))
	// Scalar path. Best: 36.868 nsecs / 98.312 ticks, Avg: 36.980 nsecs, Worst: 41.477 nsecs
	SetFromAxisAngle(axis.xyz(), angle);
#else
	// SIMD path. Best: 26.499 nsecs / 71.024 ticks, Avg: 26.856 nsecs, Worst: 27.651 nsecs
	simd4f sinHalf, cosHalf;
	simd4f halfAngle = set1_ps(0.5f*angle);
	sincos_ps(halfAngle, &sinHalf, &cosHalf);

	// xyz = axis * sin(angle/2), one multiply for all lanes.
	simd4f scaledAxis = mul_ps(axis, sinHalf);

	// unpackhi interleaves the upper halves, giving { z, cos, w, cos } from lane 0
	// upwards; movelh then keeps { x, y } from scaledAxis and { z, cos } from
	// that result, so the cosine lands in the w lane.
	q = _mm_movelh_ps(scaledAxis, _mm_unpackhi_ps(scaledAxis, cosHalf));
#endif
}
Exemplo n.º 7
0
/* Description: This routine performs an inverse FFT to real data.
*              This code is for floating point data.
*              The buffer is processed as an array of __m128, so every
*              arithmetic step acts on four float lanes at once
*              (NOTE(review): presumably four independent transforms
*              interleaved lane-wise - confirm against the caller).
*
*  Note: Output is BIT-REVERSED! so you must use the BitReversed to
*        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]
*                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
*        Input is in normal order, interleaved (real,imaginary) complex data
*        You must call InitializeFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
*  Note: The scaling on this is done according to the standard FFT definition,
*        so a unit amplitude DC signal will output an amplitude of (N)
*        (Older revisions would progressively scale the input, so the output
*        values would be similar in amplitude to the input values, which is
*        good when using fixed point arithmetic)
*
*  Twiddle factors are read from h->SinTable when useSinCosTable is set;
*  otherwise they are computed four at a time with sincos_ps and consumed one
*  lane per loop iteration.
*
*  buffer: data transformed in place; it is accessed through __m128 pointers.
*  h:      FFT description supplying Points, pow2Bits, BitReversed and SinTable.
*/
void InverseRealFFTf4x(fft_type *buffer,HFFT h)
{

   __m128 *localBuffer=(__m128 *)buffer;

   __m128 *A,*B;
   fft_type *sptr;
   __m128 *endptr1,*endptr2;
   int br1Index, br1Value;
   __m128 HRplus,HRminus,HIplus,HIminus;
   __m128 v1,v2,sin,cos;
   fft_type iToRad=2*M_PI/(2*h->Points);

   /* Cache of the last four computed sine/cosine values. It is filled on one
      loop iteration and read on the following three, so it has to live outside
      the loop bodies: a variable declared inside a loop body is a new object
      with an indeterminate value on every iteration. */
   v4sfu sin4_2, cos4_2;

   int ButterfliesPerGroup=h->Points/2;

   /* Massage input to get the input for a real output sequence. */
   A=localBuffer+2;
   B=localBuffer+h->Points*2-2;
   br1Index=1; //h->BitReversed+1;
   int iSinCosCalIndex=0;
   while(A<B)
   {
      if(useBitReverseTable) {
         br1Value=h->BitReversed[br1Index];
      } else {
         br1Value=SmallReverseBits(br1Index,h->pow2Bits);
      }
      if(useSinCosTable) {
         sin=_mm_set1_ps(h->SinTable[br1Value]);
         cos=_mm_set1_ps(h->SinTable[br1Value+1]);
      } else {
         /* Refill the cache for indices br1Index..br1Index+3 every fourth pass. */
         if(!iSinCosCalIndex)
         {
            v4sfu vx;
            for(int i=0;i<4;i++)
               vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
            sincos_ps(&vx, &sin4_2, &cos4_2);
         }
         sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
         cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
         /* Cycle 0,1,2,3,0,... */
         iSinCosCalIndex=(iSinCosCalIndex+1)&3;
      }
      HRminus = _mm_sub_ps(*A,  *B);
      HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B,  _mm_set1_ps(2.0)));
      HIminus = _mm_sub_ps( *(A+1), *(B+1));
      HIplus = _mm_add_ps(HIminus,  _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
      v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
      v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
      *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
      *B = _mm_sub_ps(*A, v1);
      *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
      *(B+1) = _mm_sub_ps(*(A+1), HIminus);

      A=&A[2];
      B=&B[-2];
      br1Index++;
   }
   /* Handle center bin (just need conjugate) */
   // negate sse style: XOR with -0.f flips only the sign bit of every lane
   *(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));

   /* Handle DC bin separately - this ignores any Fs/2 component
   buffer[1]=buffer[0]=buffer[0]/2;*/
   /* Handle DC and Fs/2 bins specially */
   /* The DC bin is passed in as the real part of the DC complex value */
   /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
   /* (v1+v2) = buffer[0] == the DC component */
   /* (v1-v2) = buffer[1] == the Fs/2 component */
   v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
   v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
   localBuffer[0]=v1;
   localBuffer[1]=v2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1=localBuffer+h->Points*2;

   while(ButterfliesPerGroup>0)
   {
      A=localBuffer;
      B=localBuffer+ButterfliesPerGroup*2;
      sptr=h->SinTable;
      int iSinCosIndex=0;
      int iSinCosCalIndex=0;
      while(A<endptr1)
      {
         if(useSinCosTable) {
            sin=_mm_set1_ps(*(sptr++));
            cos=_mm_set1_ps(*(sptr++));
         } else {
            /* Refill the cache every fourth group; iSinCosIndex is then a
               multiple of four and lanes 0..3 cover the next four groups. */
            if(!iSinCosCalIndex)
            {
               v4sfu vx;
               for(int i=0;i<4;i++)
                  vx.m128_f32[i]=((fft_type )SmallReverseBits(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
               sincos_ps(&vx, &sin4_2, &cos4_2);
            }
            sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
            cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
            iSinCosCalIndex=(iSinCosCalIndex+1)&3;
            iSinCosIndex++;
         }
         endptr2=B;
         while(A<endptr2)
         {
            v1=_mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
            v2=_mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
            *B=_mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
            *(A++)=_mm_sub_ps(*(B++), v1);
            *B=_mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
            *(A++)=_mm_sub_ps(*(B++),v2);
         }
         A=B;
         B=&B[ButterfliesPerGroup*2];
      }
      ButterfliesPerGroup >>= 1;
   }
}
Exemplo n.º 8
0
/* Description: In-place FFT butterflies followed by the post-processing step
*              that yields the output for a real input sequence.
*              The buffer is processed as an array of __m128, so every
*              arithmetic step acts on four float lanes at once
*              (NOTE(review): presumably four independent transforms
*              interleaved lane-wise - confirm against the caller).
*
*  On return the Fs/2 value is stored in the imaginary part of the DC bin
*  (localBuffer[1]).
*
*  Twiddle factors are read from h->SinTable when useSinCosTable is set;
*  otherwise they are computed four at a time with sincos_ps and consumed one
*  lane per loop iteration.
*
*  buffer: data transformed in place; it is accessed through __m128 pointers.
*  h:      FFT description supplying Points, pow2Bits, BitReversed and SinTable.
*/
void RealFFTf4x(fft_type *buffer,HFFT h)
{

   __m128 *localBuffer=(__m128 *)buffer;

   __m128 *A,*B;
   fft_type *sptr;
   __m128 *endptr1,*endptr2;
   int br1Index, br2Index;
   int br1Value, br2Value;
   __m128 HRplus,HRminus,HIplus,HIminus;
   __m128 v1,v2,sin,cos;
   fft_type iToRad=2*M_PI/(2*h->Points);

   /* Cache of the last four computed sine/cosine values. It is filled on one
      loop iteration and read on the following three, so it has to live outside
      the loop bodies: a variable declared inside a loop body is a new object
      with an indeterminate value on every iteration. */
   v4sfu sin4_2, cos4_2;

   int ButterfliesPerGroup=h->Points/2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1=&localBuffer[h->Points*2];

   while(ButterfliesPerGroup>0)
   {
      A=localBuffer;
      B=&localBuffer[ButterfliesPerGroup*2];
      sptr=h->SinTable;
      int iSinCosIndex=0;
      int iSinCosCalIndex=0;
      while(A<endptr1)
      {
         if(useSinCosTable) {
            sin=_mm_set1_ps(*(sptr++));
            cos=_mm_set1_ps(*(sptr++));
         } else {
            /* Refill the cache every fourth group; iSinCosIndex is then a
               multiple of four and lanes 0..3 cover the next four groups. */
            if(!iSinCosCalIndex)
            {
               v4sfu vx;
               for(int i=0;i<4;i++)
                  vx.m128_f32[i]=((fft_type )SmallReverseBits(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
               sincos_ps(&vx, &sin4_2, &cos4_2);
            }
            sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
            cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
            /* Cycle 0,1,2,3,0,... */
            iSinCosCalIndex=(iSinCosCalIndex+1)&3;
            iSinCosIndex++;
         }
         endptr2=B;
         while(A<endptr2)
         {
            v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
            v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
            *B=_mm_add_ps( *A, v1);
            __m128 temp128 = _mm_set1_ps( 2.0);
            *(A++)=_mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
            *B=_mm_sub_ps(*A,v2);
            *(A++)=_mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
         }
         A=B;
         B=&B[ButterfliesPerGroup*2];
      }
      ButterfliesPerGroup >>= 1;
   }
   /* Massage output to get the output for a real input sequence. */

   br1Index=1; // h->BitReversed+1;
   br2Index=h->Points-1;   //h->BitReversed+h->Points-1;

   int iSinCosCalIndex=0;
   while(br1Index<br2Index)
   {
      if(useBitReverseTable) {
         br1Value=h->BitReversed[br1Index];
         br2Value=h->BitReversed[br2Index];
      } else {
         br1Value=SmallReverseBits(br1Index,h->pow2Bits);
         br2Value=SmallReverseBits(br2Index,h->pow2Bits);
      }
      if(useSinCosTable) {
         sin=_mm_set1_ps(h->SinTable[br1Value]);
         cos=_mm_set1_ps(h->SinTable[br1Value+1]);
      } else {
         /* Refill the cache for indices br1Index..br1Index+3 every fourth pass. */
         if(!iSinCosCalIndex)
         {
            v4sfu vx;
            for(int i=0;i<4;i++)
               vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
            sincos_ps(&vx, &sin4_2, &cos4_2);
         }
         sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
         cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
         iSinCosCalIndex=(iSinCosCalIndex+1)&3;
      }

      A=&localBuffer[br1Value];
      B=&localBuffer[br2Value];
      __m128 temp128 = _mm_set1_ps( 2.0);
      HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
      HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
      v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
      v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
      temp128 = _mm_set1_ps( 0.5);
      *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
      *B = _mm_sub_ps(*A, v1);
      *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
      *(B+1) = _mm_sub_ps(*(A+1), HIminus);

      br1Index++;
      br2Index--;
   }
   /* Handle the center bin (just need a conjugate) */
   if(useBitReverseTable)
      A=&localBuffer[h->BitReversed[br1Index]+1];
   else
      A=&localBuffer[SmallReverseBits(br1Index,h->pow2Bits)+1];
   // negate sse style: XOR with -0.f flips only the sign bit of every lane
   *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
   /* Handle DC and Fs/2 bins separately */
   /* Put the Fs/2 value into the imaginary part of the DC bin */
   v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
   localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
   localBuffer[1]=v1;
}
/*
 * Compares sin_ps / cos_ps / sincos_ps against sinf / cosf and against
 * cephes_sinf / cephes_cosf on samples taken from [xmin*Pi, xmax*Pi].
 *
 * Each of the 100000 trials evaluates four lanes: two regular sweeps of the
 * interval, one random sample, and one multiple of Pi of the form
 * (i / 32) * Pi / ((i % 32) + 1), replaced by a random sample when it falls
 * outside the interval.
 *
 * xmin, xmax: interval bounds expressed in multiples of Pi.
 *
 * Returns 0 when the recorded maximum deviations are all below 2e-7, and 1
 * otherwise or when cos_ps and sincos_ps disagree. A disagreement between
 * sin_ps and sincos_ps terminates the process with exit(1).
 */
int check_sincos_precision(float xmin, float xmax) {
  unsigned nb_trials = 100000;
  printf("checking sines on [%g*Pi, %g*Pi]\n", xmin, xmax);


  float max_err_sin_ref = 0, max_err_sin_cep = 0, max_err_sin_x = 0;
  float max_err_cos_ref = 0, max_err_cos_cep = 0, max_err_cos_x = 0;
  float max_err_sum_sqr_test = 0;
  float max_err_sum_sqr_ref = 0;
  xmin *= M_PI; xmax *= M_PI;
  unsigned i;
  for (i=0; i < nb_trials; ++i) {
    V4SF vx, sin4, cos4, sin4_2, cos4_2;
    vx.f[0] = i*(xmax-xmin)/(nb_trials-1) + xmin;
    vx.f[1] = (i+.5)*(xmax-xmin)/(nb_trials-1) + xmin;
    /* Random samples are offset by xmin so that they fall inside the tested
       interval (assumes frand() returns values in [0, 1] - TODO confirm). */
    vx.f[2] = frand()*(xmax-xmin) + xmin;
    vx.f[3] = (i / 32)*M_PI/((i%32)+1);
    if (vx.f[3] < xmin || vx.f[3] > xmax) vx.f[3] = frand()*(xmax-xmin) + xmin;

    /*
    vx.f[0] = M_PI/2;
    vx.f[1] = M_PI;
    vx.f[2] = M_PI/3;
    vx.f[3] = M_PI/4;
    */
    sin4.v = sin_ps(vx.v);
    cos4.v = cos_ps(vx.v);
    sincos_ps(vx.v, &sin4_2.v, &cos4_2.v);
    unsigned j;
    for (j=0; j < 4; ++j) {
      float x = vx.f[j];
      float sin_test = sin4.f[j];
      float cos_test = cos4.f[j];
      /* The separate and the combined routines must agree bit for bit. */
      if (sin_test != sin4_2.f[j]) {
        printf("sin / sincos mismatch at x=%g\n", x);
        exit(1); /* does not return */
      }
      if (cos_test != cos4_2.f[j]) {
        printf("cos / sincos mismatch at x=%g\n", x);
        return 1;
      }
      float sin_ref = sinf(x);
      float sin_cep = cephes_sinf(x);
      float err_sin_ref = fabs(sin_ref - sin_test);
      float err_sin_cep = fabs(sin_cep - sin_test);
      if (err_sin_ref > max_err_sin_ref) {
        max_err_sin_ref = err_sin_ref;
        max_err_sin_x = x;
      }
      max_err_sin_cep = MAX(max_err_sin_cep, err_sin_cep);
      float cos_ref = cosf(x);
      float cos_cep = cephes_cosf(x);
      float err_cos_ref = fabs(cos_ref - cos_test);
      float err_cos_cep = fabs(cos_cep - cos_test);
      if (err_cos_ref > max_err_cos_ref) {
        max_err_cos_ref = err_cos_ref;
        max_err_cos_x = x;
      }
      max_err_cos_cep = MAX(max_err_cos_cep, err_cos_cep);
      float err_sum_sqr_test = fabs(1 - cos_test*cos_test - sin_test*sin_test);
      float err_sum_sqr_ref = fabs(1 - cos_ref*cos_ref - sin_ref*sin_ref);
      max_err_sum_sqr_ref = MAX(max_err_sum_sqr_ref, err_sum_sqr_ref);
      max_err_sum_sqr_test = MAX(max_err_sum_sqr_test, err_sum_sqr_test);
      //printf("sin(%g) = %g %g err=%g\n", x, sin_ref, sin_test, err_sin_ref);
    }
  }
  printf("max deviation from sinf(x): %g at %14.12g*Pi, max deviation from cephes_sin(x): %g\n",
         max_err_sin_ref, max_err_sin_x/M_PI, max_err_sin_cep);
  printf("max deviation from cosf(x): %g at %14.12g*Pi, max deviation from cephes_cos(x): %g\n",
         max_err_cos_ref, max_err_cos_x/M_PI, max_err_cos_cep);

  printf("deviation of sin(x)^2+cos(x)^2-1: %g (ref deviation is %g)\n",
         max_err_sum_sqr_test, max_err_sum_sqr_ref);

  /* NOTE(review): the sum-of-squares criterion uses the deviation of the
     reference sinf/cosf (max_err_sum_sqr_ref), not that of the routines under
     test (max_err_sum_sqr_test) - confirm whether this is intended. */
  if (max_err_sum_sqr_ref < 2e-7 && max_err_sin_ref < 2e-7 && max_err_cos_ref < 2e-7) {
    printf("   ->> precision OK for the sin_ps / cos_ps / sincos_ps <<-\n\n");
    return 0;
  } else {
    printf("\n   WRONG PRECISION !! there is a problem\n\n");
    return 1;
  }
}
/* Single-result wrapper around sincos_ps: both outputs are computed, only the
   sine is returned and the cosine is discarded. */
v4sf stupid_sincos_ps(v4sf x) {
  v4sf sine_out, cosine_out;
  sincos_ps(x, &sine_out, &cosine_out);
  return sine_out;
}