Beispiel #1
0
__m128d test_mm_sqrt_pd(__m128d A) {
  // DAG-LABEL: test_mm_sqrt_pd
  // DAG: call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %{{.*}})
  //
  // ASM-LABEL: test_mm_sqrt_pd
  // ASM: sqrtpd
  return _mm_sqrt_pd(A);
}
Beispiel #2
0
xsqrt( double x )
{
  __v2df f, g;
  double _d;
  x += x;
  g = __extension__ (__v2df){ x, 0 };
  f = _mm_sqrt_pd( g );
  _d = _mm_cvtsd_f64 (f);
  return (_d);
}
Beispiel #3
0
__SIMDd _SIMD_sqrt_pd(__SIMDd a)
{
#ifdef  USE_SSE
  return _mm_sqrt_pd(a);
#elif defined USE_AVX
  return _mm256_sqrt_pd(a);
#elif defined USE_IBM
  return vec_sqrt(a);
#endif
}
Beispiel #4
0
double Point::Length() const
{
#ifdef __SSE3__
	__m128d b = v * v;
	b = _mm_hadd_pd(b, b);
	b = _mm_sqrt_pd(b);
	return reinterpret_cast<double &>(b);
#else
	return sqrt(x * x + y * y);
#endif
}
Beispiel #5
0
Point Point::Unit() const
{
#ifdef __SSE3__
	__m128d b = v * v;
	b = _mm_hadd_pd(b, b);
	b = _mm_sqrt_pd(b);
	return Point(v / b);
#else
	double b = 1. / sqrt(x * x + y * y);
	return Point(x * b, y * b);
#endif
}
Beispiel #6
0
BI_FORCE_INLINE inline sse_double sqrt(const sse_double x) {
  sse_double res;
  res.packed = _mm_sqrt_pd(x.packed);
  return res;
}
Beispiel #7
0
test (__m128d s1)
{
  return _mm_sqrt_pd (s1); 
}
Beispiel #8
0
Point2d CameraATAN::Project(const Point3d& p3d)
{
    if(p3d.z<=0) return Point2d(-1,-1);

#ifdef __SSE3__
    if(useDistortion)
    {
        __m128d xy=(__m128d){p3d.x,p3d.y};
        if(p3d.z!=1.)
        {
            xy=_mm_sub_pd(xy,(__m128d){p3d.z,p3d.z});
        }
        __m128d xy2=_mm_mul_pd(xy,xy);

         xy2=_mm_hadd_pd(xy2,xy2);
         xy2=_mm_sqrt_pd(xy2);
        double r=((Point2d*)&xy2)->x;
        if(r < 0.001 || d == 0.0)
            r=1.0;
        else
            r=(d_inv* atan(r * tan2w) / r);
        xy=_mm_mul_pd((__m128d){fx,fy},xy);
        xy=_mm_mul_pd(xy,(__m128d){r,r});
        xy=_mm_add_pd(xy,(__m128d){cx,cy});
        return *(Point2d*)&xy;
    }
    else
    {
        if(p3d.z==1.)
        {
            __m128d xy = _mm_setr_pd(p3d.x,p3d.y);
            xy=_mm_add_pd(_mm_setr_pd(cx,cy),_mm_mul_pd(xy,(__m128d){fx,fy}));
            return *(Point2d*)&xy;
        }
        else if(p3d.z>0)
        {
            double z_inv=1./p3d.z;
            return Point2d(fx*z_inv*p3d.x+cx,fy*z_inv*p3d.y+cy);
        }
    }
#else
    if(useDistortion)
    {
        double X=p3d.x,Y=p3d.y;
        if(p3d.z!=1.)
        {
            double z_inv=1./p3d.z;
            X*=z_inv;Y*=z_inv;
        }
        double r= sqrt(X*X+Y*Y);
        if(r < 0.001 || d == 0.0)
            r= 1.0;
        else
            r=(d_inv* atan(r * tan2w) / r);

        return Point2d(cx + fx * r * X,cy + fy * r * Y);
    }
    else
    {
        if(p3d.z==1.)
        {
            return Point2d(fx*p3d.x+cx,fy*p3d.y+cy);
        }
        else
        {
            double z_inv=1./p3d.z;
            return Point2d(fx*z_inv*p3d.x+cx,fy*z_inv*p3d.y+cy);
        }
    }
#endif
    return Point2d(-1,-1);// let compiler happy
}
Beispiel #9
0
// only compute the necessary indices of su2_i = subgroup( U*staple^\dagger )
void
only_subgroup( GLU_complex *s0 ,
	       GLU_complex *s1 ,
	       double *scale ,
	       const GLU_complex U[ NCNC ] ,
	       const GLU_complex staple[ NCNC ] ,
	       const size_t su2_index )
{
  const __m128d *u = (const __m128d*)U ;
  const __m128d *s = (const __m128d*)staple ;

  register __m128d sm0 ; 
  register __m128d sm1 ;
#if NC == 3
  switch( su2_index%3 ) { // I don't like this
    // rotation 1
    //  |  s0   s1  0 |
    //  | -s1*  s0* 0 |
    //  |  0     0  1 |
  case 0 :
    sm0 = _mm_add_pd(
		     // temp0
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 0 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 1 ) , *( s + 1 ) ) ,
					     SSE2_MUL_CONJ( *( u + 2 ) , *( s + 2 ) ) ) ) ,
		      // temp3^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 3 ) , *( s + 3 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 4 ) , *( s + 4 ) ) ,
					     SSE2_MULCONJ( *( u + 5 ) , *( s + 5 ) ) ) ) ) ;
    sm1 = _mm_sub_pd( 
		     // temp1
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 3 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 1 ) , *( s + 4 ) ) ,
					     SSE2_MUL_CONJ( *( u + 2 ) , *( s + 5 ) ) ) ) ,
		     // temp2^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 3 ) , *( s + 0 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 4 ) , *( s + 1 ) ) ,
					     SSE2_MULCONJ( *( u + 5 ) , *( s + 2 ) ) ) ) ) ;
    break ;
  case 1 :
    // rotation 2
    //  |  1    0   0  |
    //  |  0   s0  s1  |
    //  |  0  -s1* s0* |
    sm0 = _mm_add_pd( 
		     // temp0
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 3 ) , *( s + 3 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 4 ) , *( s + 4 ) ) ,
					     SSE2_MUL_CONJ( *( u + 5 ) , *( s + 5 ) ) ) ) ,
		     // temp3^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 6 ) , *( s + 6 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 7 ) , *( s + 7 ) ) ,
					     SSE2_MULCONJ( *( u + 8 ) , *( s + 8 ) ) ) ) ) ;
    sm1 = _mm_sub_pd(
		     // temp1
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 3 ) , *( s + 6 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 4 ) , *( s + 7 ) ) ,
					     SSE2_MUL_CONJ( *( u + 5 ) , *( s + 8 ) ) ) ) ,
		     // temp2^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 6 ) , *( s + 3 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 7 ) , *( s + 4 ) ) ,
					     SSE2_MULCONJ( *( u + 8 ) , *( s + 5 ) ) ) ) ) ;
    break ;
  case 2 :
    // rotation 3
    //  | s0*  0  -s1 |
    //  |  0   1   0  |
    //  | s1   0  s0  |
    sm0 = _mm_add_pd( 
		     // temp3^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 0 ) , *( s + 0 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 1 ) , *( s + 1 ) ) ,
					     SSE2_MULCONJ( *( u + 2 ) , *( s + 2 ) ) ) ) ,
		     // temp0
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 6 ) , *( s + 6 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 7 ) , *( s + 7 ) ) ,
					     SSE2_MUL_CONJ( *( u + 8 ) , *( s + 8 ) ) ) ) ) ;
    sm1 = _mm_sub_pd(
		     // temp1
		     _mm_add_pd( SSE2_MUL_CONJ( *( u + 6 ) , *( s + 0 ) ) ,
				 _mm_add_pd( SSE2_MUL_CONJ( *( u + 7 ) , *( s + 1 ) ) ,
					     SSE2_MUL_CONJ( *( u + 8 ) , *( s + 2 ) ) ) ) ,
		     // temp2^*
		     _mm_add_pd( SSE2_MULCONJ( *( u + 0 ) , *( s + 6 ) ) ,
				 _mm_add_pd( SSE2_MULCONJ( *( u + 1 ) , *( s + 7 ) ) ,
					     SSE2_MULCONJ( *( u + 2 ) , *( s + 8 ) ) ) ) ) ;
    break ;
  }
#elif NC == 2
  sm0 = _mm_add_pd( 
		   // temp0
		   _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 0 ) ) , 
			       SSE2_MUL_CONJ( *( u + 1 ) , *( s + 1 ) ) ) ,
		   // temp3^*
		   _mm_add_pd( SSE2_MULCONJ( *( u + 2 ) , *( s + 2 ) ) , 
			       SSE2_MULCONJ( *( u + 3 ) , *( s + 3 ) ) ) ) ;
  
  sm1 = _mm_sub_pd( 
		   // temp1
		   _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 2 ) ) , 
			       SSE2_MUL_CONJ( *( u + 1 ) , *( s + 3 ) ) ) ,
		   // temp2^*
		   _mm_add_pd( SSE2_MULCONJ( *( u + 2 ) , *( s + 0 ) ) ,
			       SSE2_MULCONJ( *( u + 3 ) , *( s + 1 ) ) ) ) ;
#else
  // su(N) version
  const size_t row_a = Latt.su2_data[ su2_index ].idx_a / NC ;
  const size_t col_b = Latt.su2_data[ su2_index ].idx_b % NC ;

  // prefetch the staple & link indices
  const __m128d *S1 = ( s + NC * row_a ) , *S2 = ( s + NC * col_b ) ; 
  const __m128d *U1 = ( u + NC * row_a ) , *U2 = ( u + NC * col_b ) ; 

  // initialise to zero & perform multiplication
  sm0 = _mm_setzero_pd() ; sm1 = _mm_setzero_pd() ;

  size_t i ;
  for( i = 0 ; i < NC ; i++ ) {
    sm0 = _mm_add_pd( sm0 ,
		      _mm_add_pd( SSE2_MUL_CONJ( *U1 , *S1 ) ,
				  SSE2_MULCONJ( *U2 , *S2 ) ) ) ;
    sm1 = _mm_add_pd( sm1 ,
		      _mm_sub_pd( SSE2_MUL_CONJ( *U1 , *S2 ) ,
				  SSE2_MULCONJ( *U2 , *S1 ) ) ) ;
    // increment our pointers
    S1++ , S2++ , U1++ , U2++ ;
  }
#endif

  // puts the norm in both parts
  register __m128d z = SSE2_FMA( sm0 , sm0 , _mm_mul_pd( sm1 , sm1 ) ) ; 
  z = _mm_add_pd( z , _mm_shuffle_pd( z , z , 1 ) ) ;
  z = _mm_sqrt_pd( z ) ;
  z = _mm_div_pd( _mm_set1_pd( 1.0 ) , z ) ;
  sm0 = _mm_mul_pd( sm0 , z ) ;
  sm1 = _mm_mul_pd( sm1 , z ) ;

  // poke back into *s0 and *s1 and *scale
  _mm_store_pd( (void*)s0 , sm0 ) ; 
  _mm_store_pd( (void*)s1 , sm1 ) ; 
  _mm_store_sd( (void*)scale , z ) ;

  return ;
}
Beispiel #10
0
 inline F64vec2 sqrt(const F64vec2 &v)
 {
     return _mm_sqrt_pd(v);
 }
Beispiel #11
0
void SpringEmbedderFRExact::mainStep_sse3(ArrayGraph &C)
{
//#if (defined(OGDF_ARCH_X86) || defined(OGDF_ARCH_X64)) && !(defined(__GNUC__) && !defined(__SSE3__))
#ifdef OGDF_SSE3_EXTENSIONS

	const int n            = C.numberOfNodes();

#ifdef _OPENMP
	const int work = 256;
	const int nThreadsRep  = min(omp_get_max_threads(), 1 + n*n/work);
	const int nThreadsPrev = min(omp_get_max_threads(), 1 + n  /work);
#endif

	const double k       = m_idealEdgeLength;
	const double kSquare = k*k;
	const double c_rep   = 0.052 * kSquare; // 0.2 = factor for repulsive forces as suggested by Warshal

	const double minDist       = 10e-6;//100*DBL_EPSILON;
	const double minDistSquare = minDist*minDist;

	double *disp_x = (double*) System::alignedMemoryAlloc16(n*sizeof(double));
	double *disp_y = (double*) System::alignedMemoryAlloc16(n*sizeof(double));

	__m128d mm_kSquare       = _mm_set1_pd(kSquare);
	__m128d mm_minDist       = _mm_set1_pd(minDist);
	__m128d mm_minDistSquare = _mm_set1_pd(minDistSquare);
	__m128d mm_c_rep         = _mm_set1_pd(c_rep);

	#pragma omp parallel num_threads(nThreadsRep)
	{
		double tx = m_txNull;
		double ty = m_tyNull;
		int cF = 1;

		for(int i = 1; i <= m_iterations; i++)
		{
			// repulsive forces

			#pragma omp for
			for(int v = 0; v < n; ++v)
			{
				__m128d mm_disp_xv = _mm_setzero_pd();
				__m128d mm_disp_yv = _mm_setzero_pd();

				__m128d mm_xv = _mm_set1_pd(C.m_x[v]);
				__m128d mm_yv = _mm_set1_pd(C.m_y[v]);

				int u;
				for(u = 0; u+1 < v; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
				}
				int uStart = u+2;
				if(u == v) ++u;
				if(u < n) {
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
				}

				for(u = uStart; u < n; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
				}
				if(u < n) {
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
				}

				mm_disp_xv = _mm_hadd_pd(mm_disp_xv,mm_disp_xv);
				mm_disp_yv = _mm_hadd_pd(mm_disp_yv,mm_disp_yv);

				_mm_store_sd(&disp_x[v], _mm_mul_sd(mm_disp_xv, mm_c_rep));
				_mm_store_sd(&disp_y[v], _mm_mul_sd(mm_disp_yv, mm_c_rep));
			}

			// attractive forces

			#pragma omp single
			for(int e = 0; e < C.numberOfEdges(); ++e)
			{
				int v = C.m_src[e];
				int u = C.m_tgt[e];

				double delta_x = C.m_x[v] - C.m_x[u];
				double delta_y = C.m_y[v] - C.m_y[u];

				double dist = max(minDist, sqrt(delta_x*delta_x + delta_y*delta_y));

				disp_x[v] -= delta_x * dist / k;
				disp_y[v] -= delta_y * dist / k;

				disp_x[u] += delta_x * dist / k;
				disp_y[u] += delta_y * dist / k;
			}

			// limit the maximum displacement to the temperature (m_tx,m_ty)

			__m128d mm_tx = _mm_set1_pd(tx);
			__m128d mm_ty = _mm_set1_pd(ty);

			#pragma omp for nowait
			for(int v = 0; v < n-1; v += 2)
			{
				__m128d mm_disp_xv = _mm_load_pd(&disp_x[v]);
				__m128d mm_disp_yv = _mm_load_pd(&disp_y[v]);

				__m128d mm_dist = _mm_max_pd(mm_minDist, _mm_sqrt_pd(
					_mm_add_pd(_mm_mul_pd(mm_disp_xv,mm_disp_xv),_mm_mul_pd(mm_disp_yv,mm_disp_yv))
				));

				_mm_store_pd(&C.m_x[v], _mm_add_pd(_mm_load_pd(&C.m_x[v]),
					_mm_mul_pd(_mm_div_pd(mm_disp_xv, mm_dist), _mm_min_pd(mm_dist,mm_tx))
				));
				_mm_store_pd(&C.m_y[v], _mm_add_pd(_mm_load_pd(&C.m_y[v]),
					_mm_mul_pd(_mm_div_pd(mm_disp_yv, mm_dist), _mm_min_pd(mm_dist,mm_ty))
				));
			}
			#pragma omp single nowait
			{
				if(n % 2) {
					int v = n-1;
					double dist = max(minDist, sqrt(disp_x[v]*disp_x[v] + disp_y[v]*disp_y[v]));

					C.m_x[v] += disp_x[v] / dist * min(dist,tx);
					C.m_y[v] += disp_y[v] / dist * min(dist,ty);
				}
			}

			cool(tx,ty,cF);

			#pragma omp barrier
		}
	}

	System::alignedMemoryFree(disp_x);
	System::alignedMemoryFree(disp_y);

#else
	mainStep(C);
#endif
}
Beispiel #12
0
 KFR_SINTRIN f64sse sqrt(f64sse x) { return _mm_sqrt_pd(*x); }