/*
 * Element-wise square root of a packed-double SIMD vector.
 *
 * The intrinsic used is chosen at compile time from the first backend macro
 * that is defined: USE_SSE, then USE_AVX, then USE_IBM.
 *
 * NOTE(review): when none of the three macros is defined the function body is
 * empty and nothing is returned -- confirm the build always selects a backend.
 */
__SIMDd _SIMD_sqrt_pd(__SIMDd a)
{
#if defined(USE_SSE)
    return _mm_sqrt_pd(a);
#elif defined(USE_AVX)
    return _mm256_sqrt_pd(a);
#elif defined(USE_IBM)
    return vec_sqrt(a);
#endif
}
void static avx_test (void) { union256d u, s1; double e [4] = {0x1.d3881b2c32ed7p+7, 0x1.54abaed51711cp+4, 0x1.19195c08a8d23p+5, 0x1.719741d6c0b0bp+5}; s1.x = _mm256_set_pd (2134.3343,1234.635654,453.345635,54646.464356); u.x = _mm256_sqrt_pd (s1.x); if (check_union256d (u, e)) abort (); }
/*
 * Generate a Givens rotation (c, s) and the resulting value r for the pair
 * (a, b), using AVX + FMA intrinsics.
 *
 * Results written through the output pointers:
 *   b == 0           : c = 1, s = 0, r = a
 *   a == 0           : c = 0, s = 1, r = b
 *   |b| >  |a|       : with t = a/b and u = sqrt(1 + t^2):
 *                        s = 1/u, c = t/u, r = b*u,
 *                        and all three are negated when b has its sign bit set
 *   otherwise        : with t = b/a and u = sqrt(1 + t^2):
 *                        c = 1/u, s = t/u, r = a*u
 * In every case c*a + s*b equals r up to rounding.
 *
 * The body is compiled only when __FMA__ is defined; without it the function
 * is an empty stub and *c, *s, *r are left untouched.
 *
 * Both magnitude cases are evaluated at once in one 256-bit register.
 * Lanes are written below as [3, 2, 1, 0] (the _mm256_set_pd argument order):
 * lanes 3/2 serve the |b| > |a| case, lanes 1/0 the other one.
 */
void gvrotg_fma(double *c, double *s, double *r, double a, double b)
{
#if defined(__FMA__)
    if (b == 0.0) {
        *c = 1.0;
        *s = 0.0;
        *r = a;
        return;
    }
    if (a == 0.0) {
        *c = 0.0;
        *s = 1.0;
        *r = b;
        return;
    }

    const __m256d one = _mm256_set1_pd(1.0);
    const __m256d x0 = _mm256_set_pd(1.0, a, b, 1.0);  /* x0 = [1, a,   b,   1] */
    const __m256d x1 = _mm256_set_pd(1.0, b, a, 1.0);  /* x1 = [1, b,   a,   1] */
    const __m256d t0 = _mm256_div_pd(x0, x1);          /* t0 = [1, a/b, b/a, 1] */

    /* Copy each ratio into both lanes of its 128-bit half before squaring.
       Feeding t0 straight into the FMA would leave lanes 3 and 0 at
       1*1 + 1 = 2, which made the 1/u results below a constant 1/sqrt(2). */
    const __m256d td = _mm256_permute_pd(t0, 0x3);     /* td = [a/b, a/b, b/a, b/a] */
    const __m256d t2 = _mm256_fmadd_pd(td, td, one);   /* t2 = td^2 + 1 in every lane */
    const __m256d u0 = _mm256_sqrt_pd(t2);             /* u0 = [uB, uB, uA, uA] */
    const __m256d u1 = _mm256_div_pd(one, u0);         /* u1 = [1/uB, 1/uB, 1/uA, 1/uA] */

    /* Mask 0x9 takes lanes 3 and 0 from u1, lanes 2 and 1 from u0. */
    __m256d b0 = _mm256_blend_pd(u0, u1, 0x9);         /* b0 = [1/uB, uB,   uA,   1/uA] */
    b0 = _mm256_mul_pd(b0, x1);                        /* b0 = [1/uB, b*uB, a*uA, 1/uA] */
    const __m256d b1 = _mm256_mul_pd(t0, u1);          /* b1 = [., (a/b)/uB, (b/a)/uA, .] */

    if (fabs(b) > fabs(a)) {
        *s = b0[3];
        *r = b0[2];
        *c = b1[2];
        if (signbit(b)) {
            *s = -(*s);
            *c = -(*c);
            *r = -(*r);
        }
    } else {
        *c = b0[0];
        *r = b0[1];
        *s = b1[1];
    }
#endif
}
void gvrotg_avx(double *c, double *s, double *r, double a, double b) { register __m256d x0, x1, t0, t2, u0, u1, one, b0, b1; if (b == 0.0) { *c = 1.0; *s = 0.0; *r = a; return; } if (a == 0.0) { *c = 0.0; *s = 1.0; *r = b; return; } // set_pd() order: [3, 2, 1, 0] // x[0], x[1]: |a| > |b|, x[2],x[3]: |b| > |a| x0 = _mm256_set_pd(1.0, a, b, 1.0); // x0 = {1, a, b, 1} x1 = _mm256_set_pd(1.0, b, a, 1.0); // x0 = {1, b, a, 1} t0 = _mm256_div_pd(x0, x1); // t0 = {1, a/b, b/a, 1} x0 = _mm256_mul_pd(t0, t0); // x3 = {1, (a/b)^2, (b/a)^2, 1} t2 = _mm256_hadd_pd(x0, x0); // x3 = {1+(a/b)^2, ., (b/a)^2+1, ..} u0 = _mm256_sqrt_pd(t2); // u0 = {sqrt(1+(a/b)^2), .., sqrt((b/a)^2+1)} one = _mm256_set1_pd(1.0); u1 = _mm256_div_pd(one, u0); b0 = _mm256_blend_pd(u0, u1, 0x9); // b0 = {1/u(b), u(b), u(a), 1/u(a)} b0 = _mm256_mul_pd(b0, x1); // b0 = {1/u(b), b*u(b), a*u(a), 1/u(a)} b1 = _mm256_mul_pd(t0, u1); // b1 = {1/u(b), t*u(b), t*u(a), 1/u(a)} if (fabs(b) > fabs(a)) { *s = b0[3]; // = 1/u(b) *r = b0[2]; // = b*u(b) *c = b1[2]; // = t*u(b) if (signbit(b)) { *s = -(*s); *c = -(*c); *r = -(*r); } } else { *c = b0[0]; *r = b0[1]; *s = b1[1]; } }
/*!
 * \brief Element-wise square root of an AVX double-precision vector
 * \param x The input vector; its `value` member is passed to _mm256_sqrt_pd
 * \return a vector holding the square root of every element of x
 */
ETL_STATIC_INLINE(avx_simd_double) sqrt(avx_simd_double x) {
    const auto roots = _mm256_sqrt_pd(x.value);
    return roots;
}
/// @brief Element-wise square root of a four-double vector.
/// @param rhs Input vector, handed to _mm256_sqrt_pd.
/// @return Vector whose elements are the square roots of the elements of rhs.
inline vector4d sqrt(const vector4d& rhs)
{
    const __m256d roots = _mm256_sqrt_pd(rhs);
    return roots;
}
/// @brief Element-wise square root of an F64vec4.
/// @param v Input vector, handed to _mm256_sqrt_pd.
/// @return Vector whose elements are the square roots of the elements of v.
inline F64vec4 sqrt(const F64vec4 &v)
{
    const __m256d roots = _mm256_sqrt_pd(v);
    return roots;
}
/// @brief Element-wise square root of an avx_double.
/// @param x Input value; its `packed` member is passed to _mm256_sqrt_pd.
/// @return An avx_double whose `packed` member holds the square roots.
BI_FORCE_INLINE inline avx_double sqrt(const avx_double x)
{
  avx_double roots;
  roots.packed = _mm256_sqrt_pd(x.packed);
  return roots;
}
void core::Vector3::normalize(void) { #if defined(VTX_USE_AVX) ALIGNED_32 platform::F64_t vector[] = {this->x, this->y, this->z, 0}; ALIGNED_32 platform::F64_t reciprocalVector[] = {1.0, 1.0, 1.0, 1.0}; __m256d simdvector; __m256d result; __m256d recp; simdvector = _mm256_load_pd(vector); recp = _mm256_load_pd(reciprocalVector); result = _mm256_mul_pd(simdvector, simdvector); result = _mm256_hadd_pd(result, result); result = _mm256_hadd_pd(result, result); result = _mm256_sqrt_pd(result); result = _mm256_div_pd(recp, result); simdvector = _mm256_mul_pd(simdvector, result); _mm256_store_pd(vector, simdvector); this->x = vector[0]; this->y = vector[1]; this->z = vector[2]; #elif defined(VTX_USE_SSE) // Must pad with a trailing 0, to store in 128-bit register ALIGNED_16 core::F32_t vector[] = {this->x, this->y, this->z, 0}; __m128 simdvector; __m128 result; simdvector = _mm_load_ps(vector); // (X^2, Y^2, Z^2, 0^2) result = _mm_mul_ps(simdvector, simdvector); // Add all elements together, giving us (X^2 + Y^2 + Z^2 + 0^2) result = _mm_hadd_ps(result, result); result = _mm_hadd_ps(result, result); // Calculate square root, giving us sqrt(X^2 + Y^2 + Z^2 + 0^2) result = _mm_sqrt_ps(result); // Calculate reciprocal, giving us 1 / sqrt(X^2 + Y^2 + Z^2 + 0^2) result = _mm_rcp_ps(result); // Finally, multiply the result with our original vector. simdvector = _mm_mul_ps(simdvector, result); _mm_store_ps(vector, simdvector); this->x = vector[0]; this->y = vector[1]; this->z = vector[2]; #else core::F64_t num = 1.0 / std::sqrt(std::pow(this->x, 2) + std::pow(this->y, 2) + std::pow(this->z, 2)); this->x *= num; this->y *= num; this->z *= num; #endif }
/// @brief Square root of simd_geometric::dot(ymm, ymm), taken element-wise.
/// @param ymm Input vector.
/// @return _mm256_sqrt_pd applied to the result of dot(ymm, ymm).
/// NOTE(review): presumably dot() places the same dot product in every
/// element, making each element of the result the vector length -- confirm
/// against simd_geometric::dot.
inline float64x4_t length(const float64x4_t ymm)
{
    const float64x4_t squared = simd_geometric::dot(ymm, ymm);
    return _mm256_sqrt_pd(squared);
}