Exemplo n.º 1
0
__SIMDd _SIMD_sqrt_pd(__SIMDd a)
{
#ifdef  USE_SSE
  return _mm_sqrt_pd(a);
#elif defined USE_AVX
  return _mm256_sqrt_pd(a);
#elif defined USE_IBM
  return vec_sqrt(a);
#endif
}
Exemplo n.º 2
0
void static
avx_test (void)
{
  union256d u, s1;
  double e [4] = {0x1.d3881b2c32ed7p+7, 0x1.54abaed51711cp+4, 0x1.19195c08a8d23p+5, 0x1.719741d6c0b0bp+5};

  s1.x = _mm256_set_pd (2134.3343,1234.635654,453.345635,54646.464356);
  u.x = _mm256_sqrt_pd (s1.x);

  if (check_union256d (u, e))
    abort ();
}
Exemplo n.º 3
0
void gvrotg_fma(double *c, double *s, double *r, double a, double b)
{
#if defined(__FMA__)
    register __m256d x0, x1, t0, t2, u0, u1, one, b0, b1;
    if (b == 0.0) {
        *c = 1.0;
        *s = 0.0;
        *r = a;
        return;
    }
    if (a == 0.0) {
        *c = 0.0;
        *s = 1.0;
        *r = b;
        return;
    }

    // set_pd() order: [3, 2, 1, 0]
    // x[0], x[1]: |a| > |b|,  x[2],x[3]: |b| > |a|

    one = _mm256_set1_pd(1.0);
    x0  = _mm256_set_pd(1.0, a, b, 1.0);   // x0 = {1, a,   b,   1}
    x1  = _mm256_set_pd(1.0, b, a, 1.0);   // x0 = {1, b,   a,   1}
    t0  = _mm256_div_pd(x0, x1);           // t0 = {1, a/b, b/a, 1}
    t2  = _mm256_fmadd_pd(t0, t0, one);    // x3 = {1, 1+(a/b)^2, (b/a)^2+1, 1}
    u0  = _mm256_sqrt_pd(t2);              // u0 = {1, sqrt(1+(a/b)^2), sqrt((b/2)^2+1), 1}
    u1  = _mm256_div_pd(one, u0);
    b0  = _mm256_blend_pd(u0, u1, 0x9);    // b0 = {1/u(a),   u(a),   u(b), 1/u(b)} 
    b0  = _mm256_mul_pd(b0, x1);           // b0 = {1/u(a), b*u(a), a*u(b), 1/u(b)} 
    b1  = _mm256_mul_pd(t0, u1);           // b1 = {1/u(a), t*u(a), t*u(b), 1/u(b)} 

    if (fabs(b) > fabs(a)) {
      *s = b0[3];
      *r = b0[2];
      *c = b1[2];
      if (signbit(b)) {
          *s = -(*s);
          *c = -(*c);
          *r = -(*r);
      }
    } else {
      *c = b0[0];
      *r = b0[1];
      *s = b1[1];
    }
#endif
}
Exemplo n.º 4
0
void gvrotg_avx(double *c, double *s, double *r, double a, double b)
{
    register __m256d x0, x1, t0, t2, u0, u1, one, b0, b1;
    if (b == 0.0) {
        *c = 1.0;
        *s = 0.0;
        *r = a;
        return;
    }
    if (a == 0.0) {
        *c = 0.0;
        *s = 1.0;
        *r = b;
        return;
    }

    // set_pd() order: [3, 2, 1, 0]
    // x[0], x[1]: |a| > |b|,  x[2],x[3]: |b| > |a|

    x0  = _mm256_set_pd(1.0, a, b, 1.0);   // x0 = {1, a,   b,   1}
    x1  = _mm256_set_pd(1.0, b, a, 1.0);   // x0 = {1, b,   a,   1}
    t0  = _mm256_div_pd(x0, x1);           // t0 = {1, a/b, b/a, 1}
    x0  = _mm256_mul_pd(t0, t0);           // x3 = {1, (a/b)^2, (b/a)^2, 1}
    t2  = _mm256_hadd_pd(x0, x0);          // x3 = {1+(a/b)^2, ., (b/a)^2+1, ..}
    u0  = _mm256_sqrt_pd(t2);              // u0 = {sqrt(1+(a/b)^2), .., sqrt((b/a)^2+1)}
    one = _mm256_set1_pd(1.0);
    u1  = _mm256_div_pd(one, u0);
    b0  = _mm256_blend_pd(u0, u1, 0x9);    // b0 = {1/u(b),   u(b),   u(a), 1/u(a)} 
    b0  = _mm256_mul_pd(b0, x1);           // b0 = {1/u(b), b*u(b), a*u(a), 1/u(a)} 
    b1  = _mm256_mul_pd(t0, u1);           // b1 = {1/u(b), t*u(b), t*u(a), 1/u(a)} 

    if (fabs(b) > fabs(a)) {
      *s = b0[3];  // = 1/u(b)
      *r = b0[2];  // = b*u(b)
      *c = b1[2];  // = t*u(b)
      if (signbit(b)) {
          *s = -(*s);
          *c = -(*c);
          *r = -(*r);
      }
    } else {
      *c = b0[0];
      *r = b0[1];
      *s = b1[1];
    }
}
Exemplo n.º 5
0
 /*!
  * \brief Compute the square root of each element in the given vector
  * \return a vector containing the square root of each input element
  */
 ETL_STATIC_INLINE(avx_simd_double) sqrt(avx_simd_double x) {
     return _mm256_sqrt_pd(x.value);
 }
Exemplo n.º 6
0
 inline vector4d sqrt(const vector4d& rhs)
 {
     return _mm256_sqrt_pd(rhs);
 }
Exemplo n.º 7
0
 inline F64vec4 sqrt(const F64vec4 &v)
 {
     return _mm256_sqrt_pd(v);
 }
Exemplo n.º 8
0
BI_FORCE_INLINE inline avx_double sqrt(const avx_double x) {
  avx_double res;
  res.packed = _mm256_sqrt_pd(x.packed);
  return res;
}
Exemplo n.º 9
0
void core::Vector3::normalize(void)
{
#if defined(VTX_USE_AVX)
	ALIGNED_32 platform::F64_t vector[] = {this->x, this->y, this->z, 0};
	ALIGNED_32 platform::F64_t reciprocalVector[] = {1.0, 1.0, 1.0, 1.0};
	__m256d simdvector;
	__m256d result;
	__m256d recp;

	simdvector = _mm256_load_pd(vector);
	recp = _mm256_load_pd(reciprocalVector);

	result = _mm256_mul_pd(simdvector, simdvector);

	result = _mm256_hadd_pd(result, result);
	result = _mm256_hadd_pd(result, result);

	result = _mm256_sqrt_pd(result);

	result = _mm256_div_pd(recp, result);

	simdvector = _mm256_mul_pd(simdvector, result);

	_mm256_store_pd(vector, simdvector);

	this->x = vector[0];
	this->y = vector[1];
	this->z = vector[2];

#elif defined(VTX_USE_SSE)
	// Must pad with a trailing 0, to store in 128-bit register
	ALIGNED_16 core::F32_t vector[] = {this->x, this->y, this->z, 0};
	__m128 simdvector;
	__m128 result;
	simdvector = _mm_load_ps(vector);
	
	// (X^2, Y^2, Z^2, 0^2)
	result = _mm_mul_ps(simdvector, simdvector);
	
	// Add all elements together, giving us (X^2 + Y^2 + Z^2 + 0^2)
	result = _mm_hadd_ps(result, result);
	result = _mm_hadd_ps(result, result);
	
	// Calculate square root, giving us sqrt(X^2 + Y^2 + Z^2 + 0^2)
	result = _mm_sqrt_ps(result);

	// Calculate reciprocal, giving us 1 / sqrt(X^2 + Y^2 + Z^2 + 0^2)
	result = _mm_rcp_ps(result);

	// Finally, multiply the result with our original vector.
	simdvector = _mm_mul_ps(simdvector, result);

	_mm_store_ps(vector, simdvector);
	
	this->x = vector[0];
	this->y = vector[1];
	this->z = vector[2];
#else
	core::F64_t num = 1.0 / std::sqrt(std::pow(this->x, 2) + std::pow(this->y, 2) + std::pow(this->z, 2));
	this->x *= num;
	this->y *= num;
	this->z *= num;
#endif
}
Exemplo n.º 10
0
		inline float64x4_t length(const float64x4_t ymm)
		{
			float64x4_t dot0 = simd_geometric::dot(ymm, ymm);
			float64x4_t sqrt0 = _mm256_sqrt_pd(dot0);
			return sqrt0;
		}