// Clang/LLVM regression test: checks that _mm_sqrt_pd lowers to the
// llvm.x86.sse2.sqrt.pd intrinsic at the IR level and to a sqrtpd
// instruction in the final assembly. The DAG:/ASM: lines below are
// FileCheck directives, not ordinary comments — do not edit them.
__m128d test_mm_sqrt_pd(__m128d A) {
  // DAG-LABEL: test_mm_sqrt_pd
  // DAG: call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %{{.*}})
  //
  // ASM-LABEL: test_mm_sqrt_pd
  // ASM: sqrtpd
  return _mm_sqrt_pd(A);
}
// Scalar square root computed through the SSE2 packed-sqrt intrinsic.
//
// NOTE(review): the input is doubled (`x += x`) before the root is taken,
// so this returns sqrt(2*x), not sqrt(x). That looks deliberate
// (test-case style) and is preserved as-is — confirm against callers.
//
// Fix: the function had no return type (implicit int is invalid in
// C99/C++ and wrong for a double result); it returns double.
double
xsqrt( double x )
{
  __v2df f, g;
  double _d;
  x += x;                              /* x -> 2*x */
  g = __extension__ (__v2df){ x, 0 };  /* pack (2x, 0) into a vector   */
  f = _mm_sqrt_pd( g );                /* packed sqrt of both lanes    */
  _d = _mm_cvtsd_f64 (f);             /* extract the low-lane result  */
  return (_d);
}
__SIMDd _SIMD_sqrt_pd(__SIMDd a) { #ifdef USE_SSE return _mm_sqrt_pd(a); #elif defined USE_AVX return _mm256_sqrt_pd(a); #elif defined USE_IBM return vec_sqrt(a); #endif }
// Euclidean length of the 2-D point: sqrt(x^2 + y^2).
// The SSE3 path assumes the member `v` packs (x, y) in one __m128d
// (consistent with the scalar fallback) — confirm against the class.
double Point::Length() const
{
#ifdef __SSE3__
    __m128d b = v * v;      // (x*x, y*y)
    b = _mm_hadd_pd(b, b);  // x*x + y*y in both lanes
    b = _mm_sqrt_pd(b);     // sqrt in both lanes
    // Fix: extract the low lane with the dedicated intrinsic instead of
    // reinterpret_cast<double&>(b), which violates strict aliasing.
    return _mm_cvtsd_f64(b);
#else
    return sqrt(x * x + y * y);
#endif
}
// Returns this point scaled to unit length (x,y divided by the norm).
Point Point::Unit() const
{
#ifdef __SSE3__
    // Square the components, sum them horizontally, then take the root;
    // _mm_hadd_pd leaves the sum — and therefore the norm — in both lanes.
    __m128d len = _mm_mul_pd(v, v);
    len = _mm_hadd_pd(len, len);
    len = _mm_sqrt_pd(len);
    return Point(v / len);
#else
    double inv = 1. / sqrt(x * x + y * y);
    return Point(x * inv, y * inv);
#endif
}
/*
 * Packed double-precision square root for the sse_double wrapper:
 * applies _mm_sqrt_pd to both lanes of the underlying register.
 */
BI_FORCE_INLINE inline sse_double sqrt(const sse_double x) {
  const __m128d root = _mm_sqrt_pd(x.packed);
  sse_double result;
  result.packed = root;
  return result;
}
/* Returns the packed square roots of both double lanes of s1.
 * Fix: the return type was missing (implicit int is invalid in C99/C++
 * and wrong for a vector result); it is __m128d. */
__m128d
test (__m128d s1)
{
  return _mm_sqrt_pd (s1);
}
// Projects a camera-frame 3-D point to pixel coordinates using the
// ATAN/FOV distortion model (members d, d_inv, tan2w) and the pinhole
// intrinsics fx, fy, cx, cy. Points with z <= 0 map to (-1,-1).
Point2d CameraATAN::Project(const Point3d& p3d)
{
    if(p3d.z<=0) return Point2d(-1,-1);

#ifdef __SSE3__
    if(useDistortion)
    {
        __m128d xy=(__m128d){p3d.x,p3d.y};
        if(p3d.z!=1.)
        {
            // NOTE(review): this SUBTRACTS z from (x,y), whereas the scalar
            // path below DIVIDES by z (X*=z_inv). The two paths disagree for
            // z != 1 — looks like a bug in the SSE branch; verify upstream.
            xy=_mm_sub_pd(xy,(__m128d){p3d.z,p3d.z});
        }
        // r = |(x,y)| computed via horizontal add + packed sqrt.
        __m128d xy2=_mm_mul_pd(xy,xy);
        xy2=_mm_hadd_pd(xy2,xy2);
        xy2=_mm_sqrt_pd(xy2);
        double r=((Point2d*)&xy2)->x;
        // Distortion factor: near the axis (or with no distortion) use 1,
        // otherwise the ATAN model's d_inv*atan(r*tan2w)/r.
        if(r < 0.001 || d == 0.0) r=1.0;
        else r=(d_inv* atan(r * tan2w) / r);
        // pixel = (fx,fy) * r * (x,y) + (cx,cy)
        xy=_mm_mul_pd((__m128d){fx,fy},xy);
        xy=_mm_mul_pd(xy,(__m128d){r,r});
        xy=_mm_add_pd(xy,(__m128d){cx,cy});
        // NOTE(review): type-punning __m128d -> Point2d via pointer cast
        // assumes Point2d is two packed doubles; also an aliasing concern.
        return *(Point2d*)&xy;
    }
    else
    {
        if(p3d.z==1.)
        {
            // Undistorted, already-normalized point: plain pinhole projection.
            __m128d xy = _mm_setr_pd(p3d.x,p3d.y);
            xy=_mm_add_pd(_mm_setr_pd(cx,cy),_mm_mul_pd(xy,(__m128d){fx,fy}));
            return *(Point2d*)&xy;
        }
        else if(p3d.z>0)
        {
            // General depth: normalize by z, then project.
            double z_inv=1./p3d.z;
            return Point2d(fx*z_inv*p3d.x+cx,fy*z_inv*p3d.y+cy);
        }
    }
#else
    if(useDistortion)
    {
        double X=p3d.x,Y=p3d.y;
        if(p3d.z!=1.)
        {
            double z_inv=1./p3d.z;
            X*=z_inv;Y*=z_inv;
        }
        double r= sqrt(X*X+Y*Y);
        if(r < 0.001 || d == 0.0) r= 1.0;
        else r=(d_inv* atan(r * tan2w) / r);
        return Point2d(cx + fx * r * X,cy + fy * r * Y);
    }
    else
    {
        if(p3d.z==1.)
        {
            return Point2d(fx*p3d.x+cx,fy*p3d.y+cy);
        }
        else
        {
            double z_inv=1./p3d.z;
            return Point2d(fx*z_inv*p3d.x+cx,fy*z_inv*p3d.y+cy);
        }
    }
#endif
    return Point2d(-1,-1);// let compiler happy
}
// only compute the necessary indices of su2_i = subgroup( U*staple^\dagger )
//
// Computes the two complex entries (s0, s1) of the SU(2) subgroup of the
// product U * staple^dagger selected by su2_index, normalises them, and
// writes the normalisation into *scale. SSE2_MUL_CONJ / SSE2_MULCONJ are
// project macros for the two conjugated complex products (their exact
// convention is defined in the project's SSE headers — this function only
// combines their results). SSE2_FMA is presumably a*b + c; confirm there.
void
only_subgroup( GLU_complex *s0 ,
	       GLU_complex *s1 ,
	       double *scale ,
	       const GLU_complex U[ NCNC ] ,
	       const GLU_complex staple[ NCNC ] ,
	       const size_t su2_index )
{
  // view each complex matrix as packed (re, im) __m128d lanes
  const __m128d *u = (const __m128d*)U ;
  const __m128d *s = (const __m128d*)staple ;
  register __m128d sm0 ;
  register __m128d sm1 ;
#if NC == 3
  // SU(3): three embedded SU(2) subgroups, chosen by su2_index mod 3.
  switch( su2_index%3 ) { // I don't like this
    // rotation 1
    // | s0  s1  0 |
    // | -s1* s0* 0 |
    // | 0   0   1 |
  case 0 :
    sm0 = _mm_add_pd(
		     // temp0
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 0 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 1 ) , *( s + 1 ) ) ,
					     SSE2_MUL_CONJ( *( u + 2 ) , *( s + 2 ) ) ) ) ,
		     // temp3^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 3 ) , *( s + 3 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 4 ) , *( s + 4 ) ) ,
					     SSE2_MULCONJ( *( u + 5 ) , *( s + 5 ) ) ) ) ) ;
    sm1 = _mm_sub_pd(
		     // temp1
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 3 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 1 ) , *( s + 4 ) ) ,
					     SSE2_MUL_CONJ( *( u + 2 ) , *( s + 5 ) ) ) ) ,
		     // temp2^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 3 ) , *( s + 0 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 4 ) , *( s + 1 ) ) ,
					     SSE2_MULCONJ( *( u + 5 ) , *( s + 2 ) ) ) ) ) ;
    break ;
  case 1 :
    // rotation 2
    // | 1  0    0  |
    // | 0  s0   s1 |
    // | 0  -s1* s0* |
    sm0 = _mm_add_pd(
		     // temp0
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 3 ) , *( s + 3 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 4 ) , *( s + 4 ) ) ,
					     SSE2_MUL_CONJ( *( u + 5 ) , *( s + 5 ) ) ) ) ,
		     // temp3^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 6 ) , *( s + 6 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 7 ) , *( s + 7 ) ) ,
					     SSE2_MULCONJ( *( u + 8 ) , *( s + 8 ) ) ) ) ) ;
    sm1 = _mm_sub_pd(
		     // temp1
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 3 ) , *( s + 6 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 4 ) , *( s + 7 ) ) ,
					     SSE2_MUL_CONJ( *( u + 5 ) , *( s + 8 ) ) ) ) ,
		     // temp2^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 6 ) , *( s + 3 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 7 ) , *( s + 4 ) ) ,
					     SSE2_MULCONJ( *( u + 8 ) , *( s + 5 ) ) ) ) ) ;
    break ;
  case 2 :
    // rotation 3
    // | s0* 0 -s1 |
    // | 0   1  0  |
    // | s1  0  s0 |
    sm0 = _mm_add_pd(
		     // temp3^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 0 ) , *( s + 0 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 1 ) , *( s + 1 ) ) ,
					     SSE2_MULCONJ( *( u + 2 ) , *( s + 2 ) ) ) ) ,
		     // temp0
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 6 ) , *( s + 6 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 7 ) , *( s + 7 ) ) ,
					     SSE2_MUL_CONJ( *( u + 8 ) , *( s + 8 ) ) ) ) ) ;
    sm1 = _mm_sub_pd(
		     // temp1
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 6 ) , *( s + 0 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 7 ) , *( s + 1 ) ) ,
					     SSE2_MUL_CONJ( *( u + 8 ) , *( s + 2 ) ) ) ) ,
		     // temp2^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 0 ) , *( s + 6 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 1 ) , *( s + 7 ) ) ,
					     SSE2_MULCONJ( *( u + 2 ) , *( s + 8 ) ) ) ) ) ;
    break ;
  }
#elif NC == 2
  // SU(2): single subgroup, no dispatch needed.
  sm0 = _mm_add_pd(
		   // temp0
		   _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 0 ) ) ,
			       SSE2_MUL_CONJ( *( u + 1 ) , *( s + 1 ) ) ) ,
		   // temp3^*
		   _mm_add_pd( SSE2_MULCONJ( *( u + 2 ) , *( s + 2 ) ) ,
			       SSE2_MULCONJ( *( u + 3 ) , *( s + 3 ) ) ) ) ;
  sm1 = _mm_sub_pd(
		   // temp1
		   _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 2 ) ) ,
			       SSE2_MUL_CONJ( *( u + 1 ) , *( s + 3 ) ) ) ,
		   // temp2^*
		   _mm_add_pd( SSE2_MULCONJ( *( u + 2 ) , *( s + 0 ) ) ,
			       SSE2_MULCONJ( *( u + 3 ) , *( s + 1 ) ) ) ) ;
#else
  // su(N) version
  // Row/column of the embedded SU(2) block come from the precomputed
  // subgroup index table in the global lattice object Latt.
  const size_t row_a = Latt.su2_data[ su2_index ].idx_a / NC ;
  const size_t col_b = Latt.su2_data[ su2_index ].idx_b % NC ;
  // prefetch the staple & link indices
  const __m128d *S1 = ( s + NC * row_a ) , *S2 = ( s + NC * col_b ) ;
  const __m128d *U1 = ( u + NC * row_a ) , *U2 = ( u + NC * col_b ) ;
  // initialise to zero & perform multiplication
  sm0 = _mm_setzero_pd() ;
  sm1 = _mm_setzero_pd() ;
  size_t i ;
  for( i = 0 ; i < NC ; i++ ) {
    sm0 = _mm_add_pd( sm0 ,
		      _mm_add_pd( SSE2_MUL_CONJ( *U1 , *S1 ) ,
				  SSE2_MULCONJ( *U2 , *S2 ) ) ) ;
    sm1 = _mm_add_pd( sm1 ,
		      _mm_sub_pd( SSE2_MUL_CONJ( *U1 , *S2 ) ,
				  SSE2_MULCONJ( *U2 , *S1 ) ) ) ;
    // increment our pointers
    S1++ , S2++ , U1++ , U2++ ;
  }
#endif
  // Normalise: accumulate sm0*sm0 + sm1*sm1 (SSE2_FMA — presumably a*b+c),
  // sum the two lanes, take the root, and form the reciprocal.
  // puts the norm in both parts
  register __m128d z = SSE2_FMA( sm0 , sm0 , _mm_mul_pd( sm1 , sm1 ) ) ;
  z = _mm_add_pd( z , _mm_shuffle_pd( z , z , 1 ) ) ;
  z = _mm_sqrt_pd( z ) ;
  z = _mm_div_pd( _mm_set1_pd( 1.0 ) , z ) ;
  sm0 = _mm_mul_pd( sm0 , z ) ;
  sm1 = _mm_mul_pd( sm1 , z ) ;
  // poke back into *s0 and *s1 and *scale
  // (*scale receives the low lane of z, i.e. the reciprocal computed above)
  _mm_store_pd( (void*)s0 , sm0 ) ;
  _mm_store_pd( (void*)s1 , sm1 ) ;
  _mm_store_sd( (void*)scale , z ) ;
  return ;
}
// Element-wise square root of both double lanes of an F64vec2
// (the wrapper converts to/from the raw __m128d implicitly).
inline F64vec2 sqrt(const F64vec2 &v)
{
    const __m128d result = _mm_sqrt_pd(v);
    return F64vec2(result);
}
// One full force-directed layout run (repulsive + attractive forces with
// temperature-limited displacement), vectorised with SSE3 intrinsics and
// parallelised with OpenMP. Falls back to the scalar mainStep() when the
// SSE3 extensions are not compiled in.
void SpringEmbedderFRExact::mainStep_sse3(ArrayGraph &C)
{
//#if (defined(OGDF_ARCH_X86) || defined(OGDF_ARCH_X64)) && !(defined(__GNUC__) && !defined(__SSE3__))
#ifdef OGDF_SSE3_EXTENSIONS
	const int n = C.numberOfNodes();

#ifdef _OPENMP
	// Thread counts scale with the work: n^2 node pairs for repulsion.
	const int work = 256;
	const int nThreadsRep  = min(omp_get_max_threads(), 1 + n*n/work);
	const int nThreadsPrev = min(omp_get_max_threads(), 1 + n  /work);
#endif
	const double k       = m_idealEdgeLength;
	const double kSquare = k*k;
	const double c_rep   = 0.052 * kSquare; // 0.2 = factor for repulsive forces as suggested by Warshal

	const double minDist       = 10e-6;//100*DBL_EPSILON;
	const double minDistSquare = minDist*minDist;

	// Per-node displacement accumulators, 16-byte aligned for _mm_load/_mm_store.
	double *disp_x = (double*) System::alignedMemoryAlloc16(n*sizeof(double));
	double *disp_y = (double*) System::alignedMemoryAlloc16(n*sizeof(double));

	// NOTE(review): mm_kSquare (and nThreadsPrev) are only referenced from
	// commented-out code below.
	__m128d mm_kSquare       = _mm_set1_pd(kSquare);
	__m128d mm_minDist       = _mm_set1_pd(minDist);
	__m128d mm_minDistSquare = _mm_set1_pd(minDistSquare);
	__m128d mm_c_rep         = _mm_set1_pd(c_rep);

	#pragma omp parallel num_threads(nThreadsRep)
	{
		double tx = m_txNull;
		double ty = m_tyNull;
		int cF = 1;

		for(int i = 1; i <= m_iterations; i++)
		{
			// repulsive forces
			#pragma omp for
			for(int v = 0; v < n; ++v)
			{
				__m128d mm_disp_xv = _mm_setzero_pd();
				__m128d mm_disp_yv = _mm_setzero_pd();

				__m128d mm_xv = _mm_set1_pd(C.m_x[v]);
				__m128d mm_yv = _mm_set1_pd(C.m_y[v]);

				int u;
				// Nodes below v, two per iteration in one __m128d.
				for(u = 0; u+1 < v; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));
					// Clamp the squared distance to avoid division blow-up.
					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);
					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
				}
				int uStart = u+2;
				// Skip v itself; handle one leftover node in the low lane only.
				if(u == v) ++u;
				if(u < n)
				{
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));
					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);
					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
				}
				// Nodes above v, two per iteration.
				// NOTE(review): the guard is `u < n` but the body does a
				// two-element _mm_load_pd — when n - uStart is odd the last
				// iteration reads one past the end and the trailing
				// `if(u < n)` below is unreachable. Likely should be
				// `u+1 < n` (compare the first pair-loop); verify upstream.
				for(u = uStart; u < n; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));
					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);
					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
				}
				if(u < n)
				{
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));
					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);
					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
				}
				// Collapse the two lanes into one sum and apply the repulsion constant.
				mm_disp_xv = _mm_hadd_pd(mm_disp_xv,mm_disp_xv);
				mm_disp_yv = _mm_hadd_pd(mm_disp_yv,mm_disp_yv);

				_mm_store_sd(&disp_x[v], _mm_mul_sd(mm_disp_xv, mm_c_rep));
				_mm_store_sd(&disp_y[v], _mm_mul_sd(mm_disp_yv, mm_c_rep));
			}

			// attractive forces
			#pragma omp single
			for(int e = 0; e < C.numberOfEdges(); ++e)
			{
				int v = C.m_src[e];
				int u = C.m_tgt[e];

				double delta_x = C.m_x[v] - C.m_x[u];
				double delta_y = C.m_y[v] - C.m_y[u];

				double dist = max(minDist, sqrt(delta_x*delta_x + delta_y*delta_y));

				disp_x[v] -= delta_x * dist / k;
				disp_y[v] -= delta_y * dist / k;

				disp_x[u] += delta_x * dist / k;
				disp_y[u] += delta_y * dist / k;
			}

			// limit the maximum displacement to the temperature (m_tx,m_ty)
			__m128d mm_tx = _mm_set1_pd(tx);
			__m128d mm_ty = _mm_set1_pd(ty);

			#pragma omp for nowait
			for(int v = 0; v < n-1; v += 2)
			{
				__m128d mm_disp_xv = _mm_load_pd(&disp_x[v]);
				__m128d mm_disp_yv = _mm_load_pd(&disp_y[v]);

				__m128d mm_dist = _mm_max_pd(mm_minDist, _mm_sqrt_pd(
					_mm_add_pd(_mm_mul_pd(mm_disp_xv,mm_disp_xv),_mm_mul_pd(mm_disp_yv,mm_disp_yv))
				));

				// x += disp/|disp| * min(|disp|, temperature)
				_mm_store_pd(&C.m_x[v],
					_mm_add_pd(_mm_load_pd(&C.m_x[v]), _mm_mul_pd(_mm_div_pd(mm_disp_xv, mm_dist), _mm_min_pd(mm_dist,mm_tx))
				));
				_mm_store_pd(&C.m_y[v],
					_mm_add_pd(_mm_load_pd(&C.m_y[v]), _mm_mul_pd(_mm_div_pd(mm_disp_yv, mm_dist), _mm_min_pd(mm_dist,mm_ty))
				));
			}
			// Scalar tail: last node when n is odd.
			#pragma omp single nowait
			{
				if(n % 2) {
					int v = n-1;
					double dist = max(minDist, sqrt(disp_x[v]*disp_x[v] + disp_y[v]*disp_y[v]));
					C.m_x[v] += disp_x[v] / dist * min(dist,tx);
					C.m_y[v] += disp_y[v] / dist * min(dist,ty);
				}
			}

			// Lower the temperature for the next iteration.
			cool(tx,ty,cF);

			#pragma omp barrier
		}
	}

	System::alignedMemoryFree(disp_x);
	System::alignedMemoryFree(disp_y);

#else
	mainStep(C);
#endif
}
// Packed double-precision square root for the KFR f64sse vector type.
// `*x` unwraps the KFR vector to its native register — presumably the
// wrapper's operator* yields the underlying __m128d, and the result
// converts back implicitly; confirm against the f64sse definition.
KFR_SINTRIN f64sse sqrt(f64sse x) { return _mm_sqrt_pd(*x); }