template <> vector3d<float> vector3d<float>::transform(const matrix3d<float> &M) const
{
  // Returns a new vector whose x, y and z are the dot products of this
  // vector's coefficients with the four floats of M.elements that start at
  // offsets 0, 4 and 8 respectively.
  //
  // Mask 0xF1: _mm_dp_ps multiplies all four lanes and writes the sum to
  // lane 0 only.
  // NOTE(review): the fourth lane of both operands takes part in each sum;
  // presumably that padding lane is zero -- confirm against the class layout.
  const __m128 self = _mm_load_ps(&coefficients[0]);

  const __m128 dot_x = _mm_dp_ps(self, _mm_load_ps(&M.elements[0]), 0xF1);
  const __m128 dot_y = _mm_dp_ps(self, _mm_load_ps(&M.elements[4]), 0xF1);
  const __m128 dot_z = _mm_dp_ps(self, _mm_load_ps(&M.elements[8]), 0xF1);

  vector3d<float> transformed;
  transformed.set(_mm_cvtss_f32(dot_x), _mm_cvtss_f32(dot_y), _mm_cvtss_f32(dot_z));
  return transformed;
}
Esempio n. 2
0
// Dot product using the SSE4.1 DPPS instruction, unrolled to process
// 16 samples per loop iteration.
//  _q  : dot product object; _q->h holds the coefficients (read with
//        aligned loads) and _q->n the number of samples
//  _x  : input samples (read with unaligned loads), at least _q->n values
//  _y  : output, receives the scalar dot product
void dotprod_rrrf_execute_sse4u(dotprod_rrrf _q,
                                float *      _x,
                                float *      _y)
{
    __m128 v0, v1, v2, v3;
    __m128 h0, h1, h2, h3;
    __m128 s0, s1, s2, s3;
    __m128 sum = _mm_setzero_ps(); // load zeros into sum register

    // r = 4*(floor(_n/16)); the loop index advances by 4 and is scaled by 4
    // when indexing, so each iteration covers 16 consecutive samples
    unsigned int r = (_q->n >> 4) << 2;

    unsigned int i;
    for (i=0; i<r; i+=4) {
        // load inputs into register (unaligned)
        v0 = _mm_loadu_ps(&_x[4*i+ 0]);
        v1 = _mm_loadu_ps(&_x[4*i+ 4]);
        v2 = _mm_loadu_ps(&_x[4*i+ 8]);
        v3 = _mm_loadu_ps(&_x[4*i+12]);

        // load coefficients into register (aligned)
        h0 = _mm_load_ps(&_q->h[4*i+ 0]);
        h1 = _mm_load_ps(&_q->h[4*i+ 4]);
        h2 = _mm_load_ps(&_q->h[4*i+ 8]);
        h3 = _mm_load_ps(&_q->h[4*i+12]);

        // compute dot products; the mask must be an 8-bit immediate:
        // 0xff = use all four lanes, broadcast the sum to all four lanes
        s0 = _mm_dp_ps(v0, h0, 0xff);
        s1 = _mm_dp_ps(v1, h1, 0xff);
        s2 = _mm_dp_ps(v2, h2, 0xff);
        s3 = _mm_dp_ps(v3, h3, 0xff);

        // parallel addition
        // FIXME: these additions are by far the limiting factor
        sum = _mm_add_ps( sum, s0 );
        sum = _mm_add_ps( sum, s1 );
        sum = _mm_add_ps( sum, s2 );
        sum = _mm_add_ps( sum, s3 );
    }

    // every lane of 'sum' holds the same running total; take lane 0
    float total = _mm_cvtss_f32(sum);

    // cleanup: samples not covered by the 16-wide loop
    for (i=4*r; i<_q->n; i++)
        total += _x[i] * _q->h[i];

    // set return value
    *_y = total;
}
Esempio n. 3
0
/**
 * Transform a 4-component vector by a rigid transform.
 *
 * Element i of the result is the 4-component dot product of mat[i] with vec.
 * All three code paths (SSE4, SSE3, scalar) compute that same product.
 */
inline Matrix<float, 4, 1> operator * (const RigidTransform<float>& mat, const Matrix<float, 4, 1>& vec)
{
#ifdef SIMPLE_GL_USE_SSE4
    // Mask 0xF<k>: multiply all four lanes, write the sum into exactly one
    // output lane (1, 2, 4, 8 -> lane 0, 1, 2, 3) and zero the others.
    // OR-ing the four single-lane results assembles the output vector.
    __m128 res = _mm_dp_ps(mat[0].m128, vec.m128, 0xF1);
    res        = _mm_or_ps(res, _mm_dp_ps(mat[1].m128, vec.m128, 0xF2));
    res        = _mm_or_ps(res, _mm_dp_ps(mat[2].m128, vec.m128, 0xF4));
    res        = _mm_or_ps(res, _mm_dp_ps(mat[3].m128, vec.m128, 0xF8));

    return Matrix<float, 4, 1>(res);
#elif defined(SIMPLE_GL_USE_SSE3)
    __m128 res;

    // Two horizontal adds leave the full 4-lane sum in every lane.
    __m128 dotProd0 = _mm_mul_ps(mat[0].m128, vec.m128);
    dotProd0        = _mm_hadd_ps(dotProd0, dotProd0);
    dotProd0        = _mm_hadd_ps(dotProd0, dotProd0);

    __m128 dotProd1 = _mm_mul_ps(mat[1].m128, vec.m128);
    dotProd1        = _mm_hadd_ps(dotProd1, dotProd1);
    dotProd1        = _mm_hadd_ps(dotProd1, dotProd1);

    __m128 dotProd2 = _mm_mul_ps(mat[2].m128, vec.m128);
    dotProd2        = _mm_hadd_ps(dotProd2, dotProd2);
    dotProd2        = _mm_hadd_ps(dotProd2, dotProd2);

    __m128 dotProd3 = _mm_mul_ps(mat[3].m128, vec.m128);
    dotProd3        = _mm_hadd_ps(dotProd3, dotProd3);
    dotProd3        = _mm_hadd_ps(dotProd3, dotProd3);

    // Interleave into (d0, d1, .., ..) and (d2, d3, .., ..), then join the low halves.
    __m128 vec01    = _mm_unpacklo_ps(dotProd0, dotProd1);
    __m128 vec23    = _mm_unpackhi_ps(dotProd2, dotProd3);
    res             = _mm_movelh_ps(vec01, vec23);

    return Matrix<float, 4, 1>(res);
#else // SSE2
    // TODO: Think about good sse optimization
    // Scalar fallback: the right-hand operand is 'vec' (the result object
    // must not be read before it has been written).
    Matrix<float, 4, 1> res;
    res[0] = mat[0][0] * vec[0] + mat[0][1] * vec[1] + mat[0][2] * vec[2] + mat[0][3] * vec[3];
    res[1] = mat[1][0] * vec[0] + mat[1][1] * vec[1] + mat[1][2] * vec[2] + mat[1][3] * vec[3];
    res[2] = mat[2][0] * vec[0] + mat[2][1] * vec[1] + mat[2][2] * vec[2] + mat[2][3] * vec[3];
    res[3] = mat[3][0] * vec[0] + mat[3][1] * vec[1] + mat[3][2] * vec[2] + mat[3][3] * vec[3];
    return res;
#endif
}
Esempio n. 4
0
void CraytDlg::OnBnClickedTestSse2()
{
	vec3 A = vec3(1,1,1);
	vec3 B = vec3(2,-1,-2);

	__m128 C = _mm_setr_ps(1,1,1,0);
	__m128 D = _mm_setr_ps(2,-1,-2,0);
	

	LARGE_INTEGER F,T0,T1;   // address of current frequency
	QueryPerformanceFrequency(&F);
	QueryPerformanceCounter(&T0);

	for(int j=0;j<100;++j)
	for(int i=0;i<1000000;++i)
		_mm_dp_ps(C,D,0x7F);

	QueryPerformanceCounter(&T1);
	float elapsed_timeA = (float)(T1.QuadPart - T0.QuadPart) / (float)F.QuadPart;
	T0 = T1;

	for(int j=0;j<100;++j)
	for(int i=0;i<1000000;++i)
		dot(A,B);

	QueryPerformanceCounter(&T1);
	float elapsed_timeB = (float)(T1.QuadPart - T0.QuadPart) / (float)F.QuadPart;


	char buffer[255];
	sprintf(buffer,"Fast= %.2f s       Normal=%.2f s" , elapsed_timeA,elapsed_timeB);
	AfxMessageBox(buffer);
}
Esempio n. 5
0
/* Demo: initialise two vectors, print them, then compute their dot product
 * four floats at a time with _mm_dp_ps and print the result. */
int main (int argc, char **argv)
{
	int i;
	float prod_scalaire_res = 0;

	/* command-line arguments are not used */
	(void) argc;
	(void) argv;

	vectf a ; initVf(a);
	vectf b ; initVf(b);

	print_vector_float(a);

	printf("================================================\n");

	print_vector_float(b);

	printf("================================================\n");

	/* Aligned 4-float loads.
	 * NOTE(review): assumes N is a multiple of 4 and that a and b are
	 * 16-byte aligned -- confirm against the vectf definition. */
	for(i=0; i<N; i+=4) {
		__m128 v1 = _mm_load_ps (a+i) ;
		__m128 v2 = _mm_load_ps (b+i) ;

		/* 0xF1: multiply all four lanes, sum goes to lane 0.  Lane 0 is
		 * read with _mm_cvtss_f32 rather than the GCC-only v3[0] subscript. */
		prod_scalaire_res += _mm_cvtss_f32(_mm_dp_ps (v1, v2, 0xF1));
	}

	printf("Produit scalaire des deux vecteurs : %f\n", prod_scalaire_res);

	return 0;
}
Esempio n. 6
0
    /**
     * Dot product of two float vectors using the SSE4.1 DPPS instruction.
     *
     * Processes four elements per step with unaligned loads straight from the
     * vectors' storage (no temporary copies), then adds the remaining
     * 0-3 elements with scalar code.
     *
     * @param av first operand
     * @param bv second operand; if the sizes differ only the common prefix
     *           (the shorter length) is used
     * @return   sum of av[i]*bv[i]; 0 for empty input
     */
    float sse_dot_product(std::vector<float> &av, std::vector<float> &bv)
    {
      typedef std::vector<float>::size_type size_type;

      /* Never read past the end of the shorter vector */
      const size_type count = av.size() < bv.size() ? av.size() : bv.size();
      const size_type simd_end = count - (count % 4);

      const float *a = av.data();
      const float *b = bv.data();

      /* Do SIMD dot product: 0xF1 = multiply all four lanes, sum into lane 0 */
      __m128 acc = _mm_setzero_ps();
      for (size_type i = 0; i < simd_end; i += 4)
        acc = _mm_add_ss(acc, _mm_dp_ps(_mm_loadu_ps(a + i), _mm_loadu_ps(b + i), 0xF1));

      /* Get result back from the SIMD vector */
      float total = _mm_cvtss_f32(acc);

      /* Scalar tail for the elements that do not fill a whole SIMD block */
      for (size_type i = simd_end; i < count; i++)
        total += a[i] * b[i];

      return total;
    }
Esempio n. 7
0
static int ks_assign_hydrogens(const float* xyz, const int* nco_indices, const int n_residues, float *hcoords)
{
  /* Fill hcoords with one 3-float position per residue.
   *
   * nco_indices is read three entries per residue (offsets 0, 1, 2).
   * Residue 0 simply receives a copy of the atom at its offset-0 index.
   * Every later residue receives
   *     r(offset 0 of this residue) + 0.1 * unit(r_c - r_o)
   * where r_c and r_o are the atoms at offsets 1 and 2 of the PREVIOUS
   * residue.  Always returns 1.
   */
  const __m128 bond_scale = _mm_set1_ps(0.1f);
  int res;

  /* first residue: no previous residue to take a direction from */
  store_float3(hcoords, load_float3(xyz + 3*nco_indices[0]));
  hcoords += 3;

  for (res = 1; res < n_residues; res++, hcoords += 3) {
    const int* prev = nco_indices + 3*(res-1);
    __m128 prev_c   = load_float3(xyz + 3*prev[1]);
    __m128 prev_o   = load_float3(xyz + 3*prev[2]);
    __m128 co       = _mm_sub_ps(prev_c, prev_o);
    __m128 base     = load_float3(xyz + 3*nco_indices[3*res + 0]);
    /* approximate 1/|co| in every lane, then scale co to unit length */
    __m128 inv_len  = _mm_rsqrt_ps(_mm_dp_ps(co, co, 0xFF));
    __m128 unit_co  = _mm_mul_ps(co, inv_len);

    store_float3(hcoords, _mm_add_ps(base, _mm_mul_ps(bond_scale, unit_co)));
  }

  return 1;
}
Esempio n. 8
0
int dihedral(const float* xyz, const int* quartets, float* out,
          const int n_frames, const int n_atoms, const int n_quartets) {
  /* Compute the dihedral (torsion) angle for sets of four atoms in every
     frame of xyz.

     Parameters
     ----------
     xyz : array, shape=(n_frames, n_atoms, 3)
         Cartesian coordinates of the atoms in every frame, in contiguous C order.
     quartets : array, shape=(n_quartets, 4)
         Atom indices, four per angle, in C order. The angle is the torsion
         around the bond between the middle two atoms.
     out : array, shape=(n_frames, n_quartets)
         Receives the angles (as returned by atan2), in contiguous C order.

     All of the arrays are assumed to be contiguous; no bounds checking is
     done. Always returns 1.
  */
  int frame, q;

  for (frame = 0; frame < n_frames; frame++, xyz += n_atoms*3) {
    for (q = 0; q < n_quartets; q++) {
      const int* idx = quartets + 4*q;

      const __m128 pos0 = load_float3(xyz + 3*idx[0]);
      const __m128 pos1 = load_float3(xyz + 3*idx[1]);
      const __m128 pos2 = load_float3(xyz + 3*idx[2]);
      const __m128 pos3 = load_float3(xyz + 3*idx[3]);

      /* the three consecutive bond vectors */
      const __m128 bond01 = _mm_sub_ps(pos1, pos0);
      const __m128 bond12 = _mm_sub_ps(pos2, pos1);
      const __m128 bond23 = _mm_sub_ps(pos3, pos2);

      const __m128 cross_12_23 = cross(bond12, bond23);
      const __m128 cross_01_12 = cross(bond01, bond12);

      /* 0x71: dot product over lanes 0-2, result in lane 0 */
      const __m128 bond12_len = _mm_sqrt_ps(_mm_dp_ps(bond12, bond12, 0x71));
      const __m128 y_part = _mm_mul_ps(_mm_dp_ps(bond01, cross_12_23, 0x71), bond12_len);
      const __m128 x_part = _mm_dp_ps(cross_12_23, cross_01_12, 0x71);

      *out++ = atan2(_mm_cvtss_f32(y_part), _mm_cvtss_f32(x_part));
    }
  }
  return 1;
}
Esempio n. 9
0
float Float4::Dot(float x1, float y1, float z1, float w1, float x2, float y2, float z2, float w2) {
	// Four-component dot product x1*x2 + y1*y2 + z1*z2 + w1*w2, done with SSE4.1.
	// _mm_set_ps takes its arguments highest lane first, so x lands in lane 0.
	const __m128 lhs = _mm_set_ps(w1, z1, y1, x1);
	const __m128 rhs = _mm_set_ps(w2, z2, y2, x2);

	// 0xF1: multiply all four lanes, write the sum to lane 0.
	return _mm_cvtss_f32(_mm_dp_ps(lhs, rhs, 0xf1));
}
Esempio n. 10
0
int angle(const float* xyz, const int* triplets, float* out,
          const int n_frames, const int n_atoms, const int n_angles) {
  /* Compute the angle between triples of atoms in every frame
     of xyz.

     Parameters
     ----------
     xyz : array, shape=(n_frames, n_atoms, 3)
         Cartesian coordinates of the atoms in every frame, in contiguous C order.
     triplets : array, shape=(n_angles, 3)
         The specific triple of atoms whose angle you want to compute. The
         angle computed will be centered around the middle element (i.e aABC).
         A 2d array of indices, in C order.
     out : array, shape=(n_frames, n_angles)
         Array where the angles will be stored, in contiguous C order.

     All of the arrays are assumed to be contiguous. This code will
     segfault if they're not. Always returns 1.
  */

  int i, j;
  float cos_theta;
  __m128 r_m, r_n, r_o, u_prime, u, v_prime, v;

  for (i = 0; i < n_frames; i++) {
    for (j = 0; j < n_angles; j++) {
      r_m = load_float3(xyz + 3*triplets[3*j + 0]);
      r_o = load_float3(xyz + 3*triplets[3*j + 1]);
      r_n = load_float3(xyz + 3*triplets[3*j + 2]);

      u_prime = _mm_sub_ps(r_m, r_o);
      v_prime = _mm_sub_ps(r_n, r_o);

      // normalize the vectors u_prime and v_prime
      // (0x7F: dot over lanes 0-2, broadcast to every lane)
      u = _mm_mul_ps(u_prime, _mm_rsqrt_ps(_mm_dp_ps(u_prime, u_prime, 0x7F)));
      v = _mm_mul_ps(v_prime, _mm_rsqrt_ps(_mm_dp_ps(v_prime, v_prime, 0x7F)));

      cos_theta = _mm_cvtss_f32(_mm_dp_ps(u, v, 0x71));

      // _mm_rsqrt_ps is only an approximation, so for (anti)parallel vectors
      // the cosine can land slightly outside [-1, 1], where acos returns NaN.
      if (cos_theta > 1.0f)
        cos_theta = 1.0f;
      else if (cos_theta < -1.0f)
        cos_theta = -1.0f;

      // compute the arccos of the dot product, and store the result.
      *(out++) = acos(cos_theta);
    }
    // advance to the next frame
    xyz += n_atoms*3;
  }

  return 1;
}
Esempio n. 11
0
float Vertex::length_sqr() const
{
	// Squared length of the vertex: x*x + y*y + z*z.
#ifndef SSE4
	return x*x + y*y + z*z;
#else
	// 0x71: multiply lanes 0-2 of dat only, deliver the sum in lane 0.
	return _mm_cvtss_f32(_mm_dp_ps(dat, dat, 0x71));
#endif
}
Esempio n. 12
0
// Tests four corners against a plane in one go.
// Each corner's 3-component dot product with the plane normal is written to
// its own lane (a -> lane 0, b -> 1, c -> 2, d -> 3).  sse_dot_plane is
// subtracted lane-wise and the returned 4-bit mask has bit k set when lane k
// of the difference is negative.
__forceinline int _overlap(__m128 sse_plane_normal, __m128 sse_dot_plane, __m128* corner_a, __m128* corner_b, __m128* corner_c, __m128* corner_d)
{
   // 0x7<k>: multiply lanes 0-2, store the sum in one lane, zero the rest
   const __m128 in_lane0 = _mm_dp_ps(*corner_a, sse_plane_normal, 0x71);
   const __m128 in_lane1 = _mm_dp_ps(*corner_b, sse_plane_normal, 0x72);
   const __m128 in_lane2 = _mm_dp_ps(*corner_c, sse_plane_normal, 0x74);
   const __m128 in_lane3 = _mm_dp_ps(*corner_d, sse_plane_normal, 0x78);

   // the other lanes of each operand are zero, so adding merges them
   const __m128 packed_dots = _mm_add_ps(_mm_add_ps(_mm_add_ps(in_lane0, in_lane1), in_lane2), in_lane3);

   const __m128 signed_distance = _mm_sub_ps(packed_dots, sse_dot_plane);
   const __m128 is_negative = _mm_cmplt_ps(signed_distance, _mm_setzero_ps());

   return _mm_movemask_ps(is_negative);
}
Esempio n. 13
0
float Vertex::length() const
{
	// Euclidean length of the vertex: sqrt(x*x + y*y + z*z).
#ifndef SSE4
	return sqrt(x*x + y*y + z*z);
#else
	// 0x71: dot product over lanes 0-2 of dat, sum in lane 0; then a
	// single-lane square root.
	const __m128 squared = _mm_dp_ps(dat, dat, 0x71);
	return _mm_cvtss_f32(_mm_sqrt_ss(squared));
#endif
}
Esempio n. 14
0
float Vertex::operator&(const Vertex &v) const
{
	// Dot product of this vertex with v over x, y and z.
#ifndef SSE4
	return x*v.x + y*v.y + z*v.z;
#else
	// 0x71: multiply lanes 0-2 only, deliver the sum in lane 0.
	return _mm_cvtss_f32(_mm_dp_ps(dat, v.dat, 0x71));
#endif
}
Esempio n. 15
0
  float SSEVector3::operator *( const SSEVector3& v ) const
  {
    // Dot product of the two vectors.
    // Mask 0x71 = 0111 0001: multiply the three low lanes (the fourth is
    // skipped) and place the sum in the lowest float.
    const __m128 dot = _mm_dp_ps( vec, v.vec, 0x71 );
    return _mm_cvtss_f32( dot );
  }
Esempio n. 16
0
  float SSEVector3::Length() const
  {
    // Length of the vector: square root of the three-lane dot product of
    // vec with itself.
    // Mask 0x7F = 0111 1111: multiply the three low lanes (the fourth is
    // skipped) and broadcast the sum; only the lowest float is used.
    // (An earlier, unused call to LengthSquared() was removed: its result
    // was never read.)
    return _mm_cvtss_f32( _mm_sqrt_ss( _mm_dp_ps( vec, vec, 0x7F ) ) );
  }
Esempio n. 17
0
File: SSE.hpp Progetto: Eynx/R3D
 static Float4 VFunction Dot(const Float4& vectorA, const Float4& vectorB)
 {
     // Four-component dot product via the SSE4.1 DPPS instruction.
     // Mask 0xFF: all four lanes are multiplied and the sum is written to
     // every lane of the result.
     // (A multiply / shuffle / add sequence that produced the same splatted
     // sum used to be kept here as commented-out code.)
     Vector splatDot = _mm_dp_ps(vectorA, vectorB, 0xFF);
     return splatDot;
 }
Esempio n. 18
0
Normal::Normal(const Vertex &v) // normalization
{
	// Builds a normal by dividing v by its own length (computed over x, y, z).
#if defined(AVX2)
	// 0x71: dot product over lanes 0-2, sum in lane 0
	const __m128 len_sq = _mm_dp_ps(v.dat, v.dat, 0x71);
	// broadcast the length to all lanes with the AVX2 intrinsic
	const __m128 len_all = _mm_broadcastss_ps(_mm_sqrt_ss(len_sq));
	dat = _mm_div_ps(v.dat, len_all);
#elif defined(SSE4)
	const __m128 len_sq = _mm_dp_ps(v.dat, v.dat, 0x71);
	// broadcast the length to all lanes through a scalar round trip
	const __m128 len_all = _mm_set1_ps(_mm_cvtss_f32(_mm_sqrt_ss(len_sq)));
	dat = _mm_div_ps(v.dat, len_all);
#else
	// scalar path: multiply by the reciprocal length
	const float len_sq = v.x*v.x + v.y*v.y + v.z*v.z;
	const float inv_len = 1 / sqrt(len_sq);
	x = v.x * inv_len;
	y = v.y * inv_len;
	z = v.z * inv_len;
#endif
}
// Four-component dot product of two SSE registers, returned as a scalar.
// The implementation is selected by the SSE4 / SSE3 build macros; without
// either, a plain multiply followed by a scalar sum is used.
// Lanes are read with intrinsics instead of the MSVC-only `m128_f32` union
// member, so every path compiles on GCC and Clang as well.
inline float dot_product(__m128 a, __m128 b)
{
#if defined(SSE4)
	// 0xff: multiply all four lanes, sum written to every lane
	return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xff));
#elif defined(SSE3)
	// two horizontal adds leave the full sum in every lane
	__m128 m = _mm_mul_ps(a, b);
	m = _mm_hadd_ps(m, m);
	m = _mm_hadd_ps(m, m);
	return _mm_cvtss_f32(m);
#else
	// spill the products and add them left to right
	float lanes[4];
	_mm_storeu_ps(lanes, _mm_mul_ps(a, b));
	return lanes[0] + lanes[1] + lanes[2] + lanes[3];
#endif
}
Esempio n. 20
0
/* Vector dot product w/ SSE
 *
 * Returns the sum of pArray1[i]*pArray2[i] for i in [0, nSize).
 * Whole groups of four elements go through the SSE4.1 DPPS instruction; the
 * remaining 0-3 elements are added with scalar code.
 *
 * NOTE(review): the __m128 casts require both arrays to be 16-byte aligned
 * and data_t to be float -- confirm against the data_t typedef.
 */
float vec_dotProd_sse(data_t* pArray1,       // [in] 1st source array
                data_t* pArray2,       // [in] 2nd source array
                long int nSize)            // [in] size of all arrays
{
  long int i;
  const long int nLoop = nSize/4;          // number of complete 4-element groups
  float dotProductResult = 0.0f;
  const __m128*  pSrc1 = (const __m128*) pArray1;
  const __m128*  pSrc2 = (const __m128*) pArray2;

  for (i = 0; i < nLoop; i++) {
    // 0xF1: multiply all four lanes, sum into lane 0
    dotProductResult += _mm_cvtss_f32(_mm_dp_ps(pSrc1[i], pSrc2[i], 0xF1));
  }

  // leftover elements when nSize is not a multiple of 4
  for (i = 4*nLoop; i < nSize; i++) {
    dotProductResult += (float)(pArray1[i] * pArray2[i]);
  }

  return dotProductResult;
}
Esempio n. 21
0
// Dot product using the SSE4.1 DPPS instruction, four samples per iteration.
//  _q  : dot product object; _q->h holds the coefficients (read with
//        aligned loads) and _q->n the number of samples
//  _x  : input samples (read with unaligned loads), at least _q->n values
//  _y  : output, receives the scalar dot product
void dotprod_rrrf_execute_sse4(dotprod_rrrf _q,
                               float *      _x,
                               float *      _y)
{
    __m128 v;   // input vector
    __m128 h;   // coefficients vector
    __m128 s;   // dot product
    __m128 sum = _mm_setzero_ps(); // load zeros into sum register

    // t = 4*(floor(_n/4))
    unsigned int t = (_q->n >> 2) << 2;

    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        v = _mm_loadu_ps(&_x[i]);

        // load coefficients into register (aligned)
        h = _mm_load_ps(&_q->h[i]);

        // compute dot product; the mask must be an 8-bit immediate:
        // 0xff = use all four lanes, broadcast the sum to all four lanes
        s = _mm_dp_ps(v, h, 0xff);

        // parallel addition
        sum = _mm_add_ps( sum, s );
    }

    // every lane of 'sum' holds the same running total; take lane 0
    float total = _mm_cvtss_f32(sum);

    // cleanup: i == t here, so this covers the last (_q->n % 4) samples
    for (; i<_q->n; i++)
        total += _x[i] * _q->h[i];

    // set return value
    *_y = total;
}
Esempio n. 22
0
// Vectorised resolution for upper (triangular) matrices.
//
// For every row i, accumulates the dot product of M[i][j..] with B[j..],
// starting at the 4-aligned column at or below i, and stores that sum
// divided by B[i] in Res[i].
//
// The accumulator is a float: an int accumulator would truncate every
// partial sum.
//
// NOTE(review): this is not the usual back-substitution
// (Res[i] = (B[i] - sum_{j>i} M[i][j]*Res[j]) / M[i][i], with i descending);
// confirm the intended formula. Assumes N is a multiple of 4 and that the
// rows of M and B are 16-byte aligned (aligned loads are used).
void resolutionSup (mat M, vectf B, vectf Res) {
	int i, j;

	for(i=0; i<N; i++) {
		float somme = 0.0f;

		for(j=(i-i%4); j<N; j+=4) {
			__m128 v1 = _mm_load_ps (M[i]+j) ;
			__m128 v2 = _mm_load_ps (B+j) ;

			// 0xF1: multiply all four lanes, sum into lane 0
			somme += _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
		}

		Res[i] = somme / B[i];
	}
}
Esempio n. 23
0
// Vectorised resolution of a lower-triangular system M * Res = B by forward
// substitution:
//
//     Res[i] = (B[i] - sum_{j<i} M[i][j] * Res[j]) / M[i][i]
//
// The sum over already-computed unknowns uses the SSE4.1 DPPS instruction on
// complete groups of four columns that lie entirely below i; the remaining
// 0-3 columns are added with scalar code, so Res entries that have not been
// computed yet are never read.  The accumulator is a float (an int would
// truncate every partial sum).
//
// NOTE(review): aligned loads are used, so rows of M and Res are assumed to
// be 16-byte aligned -- confirm against the mat / vectf definitions.
void resolutionInf (mat M, vectf B, vectf Res) {
	int i, j;

	for(i=0; i<N; i++) {
		float somme = 0.0f;
		const int fin_blocs = i - i%4;   // first column not covered by a full block

		for(j=0; j<fin_blocs; j+=4) {
			__m128 ligne    = _mm_load_ps (M[i]+j) ;
			__m128 solution = _mm_load_ps (Res+j) ;

			// 0xF1: multiply all four lanes, sum into lane 0
			somme += _mm_cvtss_f32(_mm_dp_ps(ligne, solution, 0xF1));
		}

		// columns fin_blocs .. i-1
		for(; j<i; j++)
			somme += M[i][j] * Res[j];

		Res[i] = (B[i]-somme) / M[i][i];
	}
}
Esempio n. 24
0
double CMercerKernel<float>::Evaluate(float* x, float* y) {

    // Inner product of x and y over m_n elements, accumulated in float and
    // returned as double.

#ifdef __SSE4_1__

    // The first m_offset elements are consumed four at a time with aligned
    // loads; elements m_offset .. m_n-1 are handled by the scalar loop below.
    __m128 acc = _mm_setzero_ps();

    const size_t n_blocks = m_offset/4;

    for(size_t blk=0; blk<n_blocks; blk++) {

        // 0xF1 (241): multiply all four lanes, write the sum to lane 0
        const __m128 partial = _mm_dp_ps(_mm_load_ps(x + 4*blk),_mm_load_ps(y + 4*blk),0xF1);
        acc = _mm_add_ss(acc,partial);                      // running total lives in lane 0

    }

    float total = _mm_cvtss_f32(acc);

    // add offset
    for(size_t i=m_offset; i<m_n; i++)
        total += x[i]*y[i];

    return static_cast<double>(total);

#else

    float total = 0;

    for(size_t i=0; i<m_n; i++)
        total += x[i]*y[i];

    return static_cast<double>(total);

#endif

}
Esempio n. 25
0
/* Compiler-diagnostic test: every statement below passes k4 as the immediate
   operand of an intrinsic and carries a DejaGnu dg-error directive naming the
   diagnostic expected on that line ("... must be an 8-bit immediate").
   The operands (i1, k1, a1, b1, d1, e1, l1, f1, ...) are declared outside
   this function.  The directives are matched per line, so each statement
   must stay on one line together with its comment.  */
void
test8bit (void)
{
  i1 = _mm_cmpistrm (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistri (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistra (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrc (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistro (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrs (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrz (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  i1 = _mm_cmpestrm (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestri (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestra (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrc (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestro (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrs (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrz (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  b1 = _mm256_blend_ps (b2, b3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  k1 = _cvtss_sh (f1, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm256_cvtps_ph (b2, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_dp_ps (b2, b3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  e1 = _mm256_permute2f128_pd (e2, e3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute2f128_ps (b2, b3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute2f128_si256 (l2, l3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute_ps (b2, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_aeskeygenassist_si128 (i2, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_blend_epi16 (i2, i3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_clmulepi64_si128 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_cvtps_ph (a1, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  d1 = _mm_dp_pd (d2, d3, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_dp_ps (a2, a3, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_insert_ps (a2, a3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_mpsadbw_epu8 (i2, i3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_permute_ps (a2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_slli_si128 (i2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_srli_si128 (i2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
}
Esempio n. 26
0
__forceinline int ClusteredLightCuller::_sphereOverlapsFroxel(int x, int y, int z, float sphere_radius, const vec3& sphere_center, const FroxelInfo* froxel_infos)
{
   // Builds a plane from the sphere: its normal is the normalized vector from
   // the sphere centre to the froxel's centre coordinate, and it passes
   // through the point one radius along that normal.  The eight corner
   // coordinates around (x, y, z) are then handed to _overlap four at a time.
   //
   // Return value: 1 if the z+0 group reports an overlap; otherwise whatever
   // _overlap returns for the z+1 group (a lane mask, not necessarily 0 or 1).
   __m128 sphere_pos = _mm_set_ps(1.0, sphere_center.z, sphere_center.y, sphere_center.x);

   __m128* froxel_center = (__m128*)&froxel_infos[_toFlatFroxelIndex(x, y, z)].center_coord;
   __m128 normal = _normalize(_mm_sub_ps(*froxel_center, sphere_pos));

   // point on the sphere surface in the direction of the froxel centre
   __m128 surface_point = _mm_add_ps(sphere_pos, _mm_mul_ps(_mm_set1_ps(sphere_radius), normal));
   // 0x7F: dot product over lanes 0-2, broadcast to all four lanes
   __m128 plane_offset = _mm_dp_ps(surface_point, normal, 0x7F);

   // corners at z + 0
   __m128* c00 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 0, y + 0, z + 0)].corner_coord;
   __m128* c10 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 1, y + 0, z + 0)].corner_coord;
   __m128* c11 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 1, y + 1, z + 0)].corner_coord;
   __m128* c01 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 0, y + 1, z + 0)].corner_coord;

   if (_overlap(normal, plane_offset, c00, c10, c11, c01))
   {
      return 1;
   }

   // corners at z + 1
   c00 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 0, y + 0, z + 1)].corner_coord;
   c10 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 1, y + 0, z + 1)].corner_coord;
   c11 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 1, y + 1, z + 1)].corner_coord;
   c01 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 0, y + 1, z + 1)].corner_coord;

   return _overlap(normal, plane_offset, c00, c10, c11, c01);
}
Esempio n. 27
0
int kabsch_sander(const float* xyz, const int* nco_indices, const int* ca_indices,
                  const int n_frames, const int n_atoms, const int n_residues,
                  int* hbonds, float* henergies) {
  /* Find the backbone hydrogen bonds between residues in each frame of a
     trajectory.

    Parameters
    ----------
    xyz : array, shape=(n_frames, n_atoms, 3)
        The cartesian coordinates of all of the atoms in each frame.
    nco_indices : array, shape=(n_residues, 3)
        The indices of the backbone N, C, and O atoms for each residue.
    ca_indices : array, shape=(n_residues,)
        The index of the CA atom of each residue. A value of -1 makes the
        residue be skipped (e.g. because it has no CA atom).

    Outputs
    -------
    hbonds : array, shape=(n_frames, n_residues, 2)
        For each acceptor residue, the residues it accepts hydrogen bonds
        from: `hbonds[i, j, 0] == k` reads "in frame i, residue j accepts its
        first hydrogen bond from residue k", and `hbonds[i, j, 1]` is the
        second one. A negative value means no such bond.
    henergies : array, shape=(n_frames, n_residues, 2)
        Parallel to hbonds, holding the energy of each bond instead of the
        partner index. Only bonds with energy below -0.5 are recorded.

    Exits the process with status 1 if the scratch buffer cannot be
    allocated; otherwise returns 1.
  */

  static float HBOND_ENERGY_CUTOFF = -0.5;
  const __m128 ca_cutoff2 = _mm_set1_ps(0.81);  /* compared against the SQUARED CA-CA distance */
  int frame, ri, rj;
  __m128 ri_ca, rj_ca;

  /* scratch space: one 3-float hydrogen position per residue */
  float* hcoords = (float*) malloc(n_residues*3 * sizeof(float));
  if (hcoords == NULL) {
    fprintf(stderr, "Memory Error\n");
    exit(1);
  }

  for (frame = 0; frame < n_frames; frame++) {
    ks_assign_hydrogens(xyz, nco_indices, n_residues, hcoords);

    for (ri = 0; ri < n_residues; ri++) {
      /* -1 marks a residue to skip */
      if (ca_indices[ri] == -1) continue;
      ri_ca = load_float3(xyz + 3*ca_indices[ri]);

      for (rj = ri + 1; rj < n_residues; rj++) {
        __m128 separation, is_close;
        float e;

        if (ca_indices[rj] == -1) continue;
        rj_ca = load_float3(xyz + 3*ca_indices[rj]);

        /* cheap rejection on the CA-CA distance before the energy evaluation */
        separation = _mm_sub_ps(ri_ca, rj_ca);
        is_close = _mm_cmplt_ps(_mm_dp_ps(separation, separation, 0x7F), ca_cutoff2);
        if (!_mm_extract_epi16(CAST__M128I(is_close), 0)) continue;

        /* hbond from donor=ri to acceptor=rj */
        e = ks_donor_acceptor(xyz, hcoords, nco_indices, ri, rj);
        if (e < HBOND_ENERGY_CUTOFF)
          store_energies(hbonds, henergies, ri, rj, e);

        /* the reverse direction is not evaluated for sequence neighbours */
        if (rj == ri + 1) continue;

        /* hbond from donor=rj to acceptor=ri */
        e = ks_donor_acceptor(xyz, hcoords, nco_indices, rj, ri);
        if (e < HBOND_ENERGY_CUTOFF)
          store_energies(hbonds, henergies, rj, ri, e);
      }
    }

    /* advance inputs and outputs to the next frame */
    xyz += n_atoms*3;
    hbonds += n_residues*2;
    henergies += n_residues*2;
  }

  free(hcoords);
  return 1;
}
Esempio n. 28
0
// Returns, in every lane, the approximate reciprocal square root of the
// four-component dot product of v with itself, i.e. the factor that scales v
// to (approximately) unit length.
inline __m128 SSENormalizeMultiplierSSE4(__m128 v)
{
	// 0xFF: multiply all four lanes and broadcast the sum to all four lanes
	const __m128 length_squared = _mm_dp_ps(v, v, 0xFF);
	return _mm_rsqrt_ps(length_squared);
}
Esempio n. 29
0
static float ks_donor_acceptor(const float* xyz, const float* hcoords,
			       const int* nco_indices, int donor, int acceptor)
{
  /* Compute the Kabsch-Sander hydrogen bond energy between two residues
     in a single conformation.

     Parameters
     ----------
     xyz : array, shape=(n_atoms, 3)
         All of the atoms in this frame.
     hcoords : array, 3 floats per residue
         Hydrogen position for each residue; entry `donor` is read.
     nco_indices : array, 3 ints per residue
         Atom indices per residue. Offset 0 of the donor (N) and offsets 1
         and 2 of the acceptor (C and O) are read.
     donor, acceptor : int
         Residue indices of the proton donor and of the acceptor.

     Returns
     -------
     energy : float
         coupling * (1/r_hc + 1/r_no - 1/r_ho - 1/r_nc), using approximate
         reciprocal square roots, limited from below to -9.9.
  */

  /* 332 (kcal*A/mol) * 0.42 * 0.2 * (1nm / 10 A); the signs follow the lane
     order (ho, nc, hc, no) assembled below */
  const __m128 weights = _mm_setr_ps(-2.7888, -2.7888, 2.7888, 2.7888);

  const __m128 pos_n = load_float3(xyz + 3*nco_indices[3*donor]);
  const __m128 pos_h = load_float3(hcoords + 3*donor);
  const __m128 pos_c = load_float3(xyz + 3*nco_indices[3*acceptor + 1]);
  const __m128 pos_o = load_float3(xyz + 3*nco_indices[3*acceptor + 2]);

  const __m128 d_ho = _mm_sub_ps(pos_h, pos_o);
  const __m128 d_hc = _mm_sub_ps(pos_h, pos_c);
  const __m128 d_nc = _mm_sub_ps(pos_n, pos_c);
  const __m128 d_no = _mm_sub_ps(pos_n, pos_o);

  /* squared distances; 0xF3 writes each sum into lanes 0 and 1 */
  const __m128 sq_ho = _mm_dp_ps(d_ho, d_ho, 0xF3);
  const __m128 sq_nc = _mm_dp_ps(d_nc, d_nc, 0xF3);
  const __m128 sq_hc = _mm_dp_ps(d_hc, d_hc, 0xF3);
  const __m128 sq_no = _mm_dp_ps(d_no, d_no, 0xF3);

  /* pack the four values into one register with three shuffles:
     (ho, ho, nc, nc) and (hc, hc, no, no) -> (ho, nc, hc, no) */
  const __m128 pair_ho_nc = _mm_shuffle_ps(sq_ho, sq_nc, _MM_SHUFFLE(0,1,0,1));
  const __m128 pair_hc_no = _mm_shuffle_ps(sq_hc, sq_no, _MM_SHUFFLE(0,1,0,1));
  const __m128 sq_all = _mm_shuffle_ps(pair_ho_nc, pair_hc_no, _MM_SHUFFLE(2,0,2,0));

  /* weighted sum of the four inverse distances */
  const float energy = _mm_cvtss_f32(_mm_dp_ps(weights, _mm_rsqrt_ps(sq_all), 0xFF));

  if (energy < -9.9f)
    return -9.9f;
  return energy;
}
Esempio n. 30
0
File: SSE.hpp Progetto: Eynx/R3D
 // Dot product ------------------------------------------------------------------------
 static Float3 VFunction Dot(const Float3& vectorA, const Float3& vectorB)
 {
     // Three-component dot product via the SSE4.1 DPPS instruction.
     // Mask 0x7F: only lanes 0-2 are multiplied; the sum is written to
     // every lane of the result.
     Vector splatDot = _mm_dp_ps(vectorA, vectorB, 0x7F);
     return splatDot;
 }