/// Transform this vector by the matrix M (float specialisation, SSE4.1).
///
/// Output component k is the dot product of the coefficient register with the
/// four consecutive matrix elements starting at M.elements[4*k], k = 0..2.
///
/// Both `coefficients` and `M.elements` are read with aligned 128-bit loads.
/// NOTE(review): mask 0xF1 multiplies all four lanes, so the fourth lane of
/// both operands contributes to every component -- presumably it is zero
/// padding; confirm against the class definitions.
template <>
vector3d<float> vector3d<float>::transform(const matrix3d<float> &M) const
{
    // The vector operand is the same for every row, so it is loaded once.
    const __m128 vec = _mm_load_ps(&coefficients[0]);

    // 0xF1: multiply all four lanes, write the sum to lane 0.
    const __m128 dot_x = _mm_dp_ps(vec, _mm_load_ps(&M.elements[0]), 0xF1);
    const __m128 dot_y = _mm_dp_ps(vec, _mm_load_ps(&M.elements[4]), 0xF1);
    const __m128 dot_z = _mm_dp_ps(vec, _mm_load_ps(&M.elements[8]), 0xF1);

    float out_x, out_y, out_z;
    _mm_store_ss(&out_x, dot_x);
    _mm_store_ss(&out_y, dot_y);
    _mm_store_ss(&out_z, dot_z);

    vector3d<float> transformed;
    transformed.set(out_x, out_y, out_z);
    return transformed;
}
// use MMX/SSE extensions (unrolled) void dotprod_rrrf_execute_sse4u(dotprod_rrrf _q, float * _x, float * _y) { __m128 v0, v1, v2, v3; __m128 h0, h1, h2, h3; __m128 s0, s1, s2, s3; __m128 sum = _mm_setzero_ps(); // load zeros into sum register // t = 4*(floor(_n/16)) unsigned int r = (_q->n >> 4) << 2; // unsigned int i; for (i=0; i<r; i+=4) { // load inputs into register (unaligned) v0 = _mm_loadu_ps(&_x[4*i+ 0]); v1 = _mm_loadu_ps(&_x[4*i+ 4]); v2 = _mm_loadu_ps(&_x[4*i+ 8]); v3 = _mm_loadu_ps(&_x[4*i+12]); // load coefficients into register (aligned) h0 = _mm_load_ps(&_q->h[4*i+ 0]); h1 = _mm_load_ps(&_q->h[4*i+ 4]); h2 = _mm_load_ps(&_q->h[4*i+ 8]); h3 = _mm_load_ps(&_q->h[4*i+12]); // compute dot products s0 = _mm_dp_ps(v0, h0, 0xffffffff); s1 = _mm_dp_ps(v1, h1, 0xffffffff); s2 = _mm_dp_ps(v2, h2, 0xffffffff); s3 = _mm_dp_ps(v3, h3, 0xffffffff); // parallel addition // FIXME: these additions are by far the limiting factor sum = _mm_add_ps( sum, s0 ); sum = _mm_add_ps( sum, s1 ); sum = _mm_add_ps( sum, s2 ); sum = _mm_add_ps( sum, s3 ); } // aligned output array float w[4] __attribute__((aligned(16))); // unload packed array _mm_store_ps(w, sum); float total = w[0]; // cleanup for (i=4*r; i<_q->n; i++) total += _x[i] * _q->h[i]; // set return value *_y = total; }
/** transform vector by rigid transform */ inline Matrix<float, 4, 1> operator * (const RigidTransform<float>& mat, const Matrix<float, 4, 1>& vec) { #ifdef SIMPLE_GL_USE_SSE4 __m128 res; __m128 dotProd; res = _mm_dp_ps(mat[0].m128, vec.m128, 0xEE);\ dotProd = _mm_dp_ps(mat[1].m128, vec.m128, 0xEE);\ res = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 1, 1, 1) );\ dotProd = _mm_dp_ps(mat[2].m128, vec.m128, 0xEE);\ res = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 1, 1) );\ dotProd = _mm_dp_ps(mat[3].m128, vec.m128, 0xEE);\ res = _mm_blend_ps( res, dotProd, _MM_SHUFFLE(0, 0, 0, 1) ); return Matrix<float, 4, 1>(res); #elif defined(SIMPLE_GL_USE_SSE3) __m128 res; __m128 dotProd0 = _mm_mul_ps(mat[0].m128, vec.m128); dotProd0 = _mm_hadd_ps(dotProd0, dotProd0); dotProd0 = _mm_hadd_ps(dotProd0, dotProd0); __m128 dotProd1 = _mm_mul_ps(mat[1].m128, vec.m128); dotProd1 = _mm_hadd_ps(dotProd1, dotProd1); dotProd1 = _mm_hadd_ps(dotProd1, dotProd1); __m128 dotProd2 = _mm_mul_ps(mat[2].m128, vec.m128); dotProd2 = _mm_hadd_ps(dotProd2, dotProd2); dotProd2 = _mm_hadd_ps(dotProd2, dotProd2); __m128 dotProd3 = _mm_mul_ps(mat[3].m128, vec.m128); dotProd3 = _mm_hadd_ps(dotProd3, dotProd3); dotProd3 = _mm_hadd_ps(dotProd3, dotProd3); __m128 vec01 = _mm_unpacklo_ps(dotProd0, dotProd1); __m128 vec23 = _mm_unpackhi_ps(dotProd2, dotProd3); res = _mm_movelh_ps(vec01, vec23); return Matrix<float, 4, 1>(res); #else // SSE2 // TODO: Think about good sse optimization Matrix<float, 4, 1> res; res[0] = mat[0][0] * res[0] + mat[0][1] * res[1] + mat[0][2] * res[2] + mat[0][3] * res[3]; res[1] = mat[1][0] * res[0] + mat[1][1] * res[1] + mat[1][2] * res[2] + mat[1][3] * res[3]; res[2] = mat[2][0] * res[0] + mat[2][1] * res[1] + mat[2][2] * res[2] + mat[2][3] * res[3]; res[3] = mat[3][0] * res[0] + mat[3][1] * res[1] + mat[3][2] * res[2] + mat[3][3] * res[3]; return res; #endif }
void CraytDlg::OnBnClickedTestSse2() { vec3 A = vec3(1,1,1); vec3 B = vec3(2,-1,-2); __m128 C = _mm_setr_ps(1,1,1,0); __m128 D = _mm_setr_ps(2,-1,-2,0); LARGE_INTEGER F,T0,T1; // address of current frequency QueryPerformanceFrequency(&F); QueryPerformanceCounter(&T0); for(int j=0;j<100;++j) for(int i=0;i<1000000;++i) _mm_dp_ps(C,D,0x7F); QueryPerformanceCounter(&T1); float elapsed_timeA = (float)(T1.QuadPart - T0.QuadPart) / (float)F.QuadPart; T0 = T1; for(int j=0;j<100;++j) for(int i=0;i<1000000;++i) dot(A,B); QueryPerformanceCounter(&T1); float elapsed_timeB = (float)(T1.QuadPart - T0.QuadPart) / (float)F.QuadPart; char buffer[255]; sprintf(buffer,"Fast= %.2f s Normal=%.2f s" , elapsed_timeA,elapsed_timeB); AfxMessageBox(buffer); }
/* Demo program: fills two vectors with initVf, prints them, then prints their
 * dot product computed four floats at a time with _mm_dp_ps.
 * The vectors are read with aligned loads and N is stepped in units of 4. */
int main (int argc, char **argv)
{
    int i;
    float prod_scalaire_res = 0;

    vectf a;
    initVf(a);
    vectf b;
    initVf(b);
    vectf c;   /* only referenced by the disabled store below */

    print_vector_float(a);
    printf("================================================\n");
    print_vector_float(b);
    printf("================================================\n");

    for (i = 0; i < N; i += 4)
    {
        const __m128 chunk_a = _mm_load_ps(a + i);
        const __m128 chunk_b = _mm_load_ps(b + i);

        /* 0xFF: multiply all four lanes, sum broadcast to every lane */
        const __m128 chunk_dot = _mm_dp_ps(chunk_a, chunk_b, 0xFF);

        /* _mm_store_ps (c+i, chunk_dot) ; */
        prod_scalaire_res += _mm_cvtss_f32(chunk_dot);
    }

    printf("Produit scalaire des deux vecteurs : %f\n", prod_scalaire_res);

    exit (0) ;
}
/**
 * Dot product of two float vectors using SSE4.1.
 *
 * The vectors are read in place with unaligned loads, so no temporary buffers
 * are allocated and no alignment is required of the callers' storage.
 *
 * @param av  first operand
 * @param bv  second operand
 * @return    sum of av[k] * bv[k] over the common length of the two vectors
 *            (0 when either is empty). If the sizes differ, only the first
 *            min(av.size(), bv.size()) elements take part.
 */
float sse_dot_product(std::vector<float> &av, std::vector<float> &bv)
{
    typedef std::vector<float>::size_type size_type;

    const size_type count   = av.size() < bv.size() ? av.size() : bv.size();
    const size_type vec_end = count - (count % 4);   // elements handled 4 at a time
    const float *a = av.data();
    const float *b = bv.data();

    /* Do SIMD dot product: each step adds one 4-element partial sum */
    __m128 acc = _mm_setzero_ps();
    for (size_type i = 0; i < vec_end; i += 4) {
        const __m128 va = _mm_loadu_ps(a + i);
        const __m128 vb = _mm_loadu_ps(b + i);
        acc = _mm_add_ps(_mm_dp_ps(va, vb, 0xFF), acc);
    }

    /* Every lane holds the same total; take lane 0 */
    float total = _mm_cvtss_f32(acc);

    /* Remaining 0..3 elements */
    for (size_type i = vec_end; i < count; ++i)
        total += a[i] * b[i];

    return total;
}
static int ks_assign_hydrogens(const float* xyz, const int* nco_indices,
    const int n_residues, float *hcoords)
/* Assign hydrogen atom coordinates.
 *
 * xyz         : atom coordinates, 3 floats per atom
 * nco_indices : 3 ints per residue; entry 0 is read as the N atom index,
 *               entries 1 and 2 as the C and O atom indices
 * n_residues  : number of residues
 * hcoords     : output, 3 floats per residue
 *
 * Residue 0 receives a copy of its N position. For every later residue the
 * hydrogen is placed at N + 0.1 * unit(C_prev - O_prev), where C_prev/O_prev
 * belong to the preceding residue and the unit vector is formed with the
 * approximate reciprocal square root. Always returns 1.
 */
{
  int res;
  const int* prev;
  const int* curr;
  __m128 carbon, oxygen, nitrogen, co, co_unit, hydrogen;
  const __m128 offset_len = _mm_set1_ps(0.1f);

  /* first residue: the hydrogen slot simply holds the N coordinates */
  nitrogen = load_float3(xyz + 3*nco_indices[0]);
  store_float3(hcoords, nitrogen);

  for (res = 1; res < n_residues; res++) {
    prev = nco_indices + 3*(res-1);
    curr = nco_indices + 3*res;

    carbon = load_float3(xyz + 3*prev[1]);
    oxygen = load_float3(xyz + 3*prev[2]);
    co = _mm_sub_ps(carbon, oxygen);

    nitrogen = load_float3(xyz + 3*curr[0]);

    /* 0xFF: all four lanes enter the squared length, result in every lane */
    co_unit = _mm_mul_ps(co, _mm_rsqrt_ps(_mm_dp_ps(co, co, 0xFF)));
    hydrogen = _mm_add_ps(nitrogen, _mm_mul_ps(offset_len, co_unit));

    store_float3(hcoords + 3*res, hydrogen);
  }

  return 1;
}
int dihedral(const float* xyz, const int* quartets, float* out,
             const int n_frames, const int n_atoms, const int n_quartets)
{
  /* Compute the torsion angle defined by sets of four atoms in every frame
     of xyz.

     Parameters
     ----------
     xyz : array, shape=(n_frames, n_atoms, 3)
         Cartesian coordinates of the atoms in every frame, in contiguous
         C order.
     quartets : array, shape=(n_quartets, 4)
         Atom indices of each quartet, in C order. The angle is the torsion
         around the bond between the middle two atoms.
     out : array, shape=(n_frames, n_quartets)
         Receives the angles (from atan2), in contiguous C order.

     All of the arrays are assumed to be contiguous. This code will
     segfault if they're not. Always returns 1.
  */
  int frame, q;
  const int* idx;
  __m128 a0, a1, a2, a3;
  __m128 bond1, bond2, bond3;
  __m128 normal_23, normal_12;
  __m128 sin_term, cos_term;

  for (frame = 0; frame < n_frames; frame++) {
    for (q = 0; q < n_quartets; q++) {
      idx = quartets + 4*q;
      a0 = load_float3(xyz + 3*idx[0]);
      a1 = load_float3(xyz + 3*idx[1]);
      a2 = load_float3(xyz + 3*idx[2]);
      a3 = load_float3(xyz + 3*idx[3]);

      bond1 = _mm_sub_ps(a1, a0);
      bond2 = _mm_sub_ps(a2, a1);
      bond3 = _mm_sub_ps(a3, a2);

      normal_23 = cross(bond2, bond3);
      normal_12 = cross(bond1, bond2);

      /* 0x71: three-component dot product, result in lane 0 */
      sin_term = _mm_mul_ps(_mm_dp_ps(bond1, normal_23, 0x71),
                            _mm_sqrt_ps(_mm_dp_ps(bond2, bond2, 0x71)));
      cos_term = _mm_dp_ps(normal_23, normal_12, 0x71);

      *(out++) = atan2(_mm_cvtss_f32(sin_term), _mm_cvtss_f32(cos_term));
    }
    /* advance to the next frame */
    xyz += n_atoms*3;
  }
  return 1;
}
// Four-component dot product of (x1, y1, z1, w1) and (x2, y2, z2, w2).
float Float4::Dot(float x1, float y1, float z1, float w1, float x2, float y2, float z2, float w2)
{
    // Task 8: replace with SSE
    // Scalar equivalent: x1*x2 + y1*y2 + z1*z2 + w1*w2
    const __m128 lhs = _mm_setr_ps(x1, y1, z1, w1);
    const __m128 rhs = _mm_setr_ps(x2, y2, z2, w2);

    // 0xf1: multiply all four lanes, place the sum in lane 0.
    return _mm_cvtss_f32(_mm_dp_ps(lhs, rhs, 0xf1));
}
int angle(const float* xyz, const int* triplets, float* out,
          const int n_frames, const int n_atoms, const int n_angles)
{
  /* Compute the angle between triples of atoms in every frame of xyz.

     Parameters
     ----------
     xyz : array, shape=(n_frames, n_atoms, 3)
         Cartesian coordinates of the atoms in every frame, in contiguous
         C order.
     triplets : array, shape=(n_angles, 3)
         Atom indices of each triple, in C order. The angle is centered on
         the middle atom of the triple.
     out : array, shape=(n_frames, n_angles)
         Receives the angles (from acos), in contiguous C order.

     All of the arrays are assumed to be contiguous. This code will
     segfault if they're not. Always returns 1.
  */
  int i, j;
  float cos_theta;
  __m128 r_m, r_n, r_o, u_prime, u, v_prime, v;

  for (i = 0; i < n_frames; i++) {
    for (j = 0; j < n_angles; j++) {
      r_m = load_float3(xyz + 3*triplets[3*j + 0]);
      r_o = load_float3(xyz + 3*triplets[3*j + 1]);
      r_n = load_float3(xyz + 3*triplets[3*j + 2]);

      u_prime = _mm_sub_ps(r_m, r_o);
      v_prime = _mm_sub_ps(r_n, r_o);

      // normalize the vectors u_prime and v_prime
      u = _mm_mul_ps(u_prime, _mm_rsqrt_ps(_mm_dp_ps(u_prime, u_prime, 0x7F)));
      v = _mm_mul_ps(v_prime, _mm_rsqrt_ps(_mm_dp_ps(v_prime, v_prime, 0x7F)));

      // rsqrt is only an approximation, so for (anti)parallel vectors the
      // dot product can land slightly outside [-1, 1] and acos would return
      // NaN. Clamp it; a NaN input still propagates unchanged.
      cos_theta = _mm_cvtss_f32(_mm_dp_ps(u, v, 0x71));
      if (cos_theta > 1.0f)
        cos_theta = 1.0f;
      else if (cos_theta < -1.0f)
        cos_theta = -1.0f;

      // compute the arccos of the dot product, and store the result.
      *(out++) = acos(cos_theta);
    }
    // advance to the next frame
    xyz += n_atoms*3;
  }

  return 1;
}
// Squared length of the vertex, using its first three components.
float Vertex::length_sqr() const
{
#ifdef SSE4
    // 0x71: multiply lanes 0-2 only and write the sum to lane 0,
    // matching the three-term scalar fallback below.
    return _mm_cvtss_f32(_mm_dp_ps(dat, dat, 0x71));
#else
    return x*x + y*y + z*z;
#endif
}
// Classify four points against a plane.
//
//  sse_plane_normal : plane normal; only lanes 0-2 enter the dot products
//  sse_dot_plane    : per-lane threshold; lane k is subtracted from the dot
//                     product of the k-th corner
//  corner_a..d      : the four points (corner_a -> lane 0 ... corner_d -> lane 3)
//
// Returns a 4-bit mask; bit k is set when
// dot3(corner_k, normal) - sse_dot_plane[k] < 0.
__forceinline int _overlap(__m128 sse_plane_normal, __m128 sse_dot_plane, __m128* corner_a, __m128* corner_b, __m128* corner_c, __m128* corner_d)
{
    // Immediate: high nibble 0x7 multiplies lanes 0-2; the low nibble selects
    // the single output lane (all other lanes become zero).
    const __m128 in_lane0 = _mm_dp_ps(*corner_a, sse_plane_normal, 0x71);
    const __m128 in_lane1 = _mm_dp_ps(*corner_b, sse_plane_normal, 0x72);
    const __m128 in_lane2 = _mm_dp_ps(*corner_c, sse_plane_normal, 0x74);
    const __m128 in_lane3 = _mm_dp_ps(*corner_d, sse_plane_normal, 0x78);

    // Summing the one-hot registers packs the four results into one register.
    const __m128 packed =
        _mm_add_ps(_mm_add_ps(_mm_add_ps(in_lane0, in_lane1), in_lane2), in_lane3);

    const __m128 signed_dist = _mm_sub_ps(packed, sse_dot_plane);
    const __m128 is_negative = _mm_cmplt_ps(signed_dist, _mm_setzero_ps());
    return _mm_movemask_ps(is_negative);
}
// Length of the vertex, using its first three components.
float Vertex::length() const
{
#ifdef SSE4
    // 0x71: multiply lanes 0-2 only and write the sum to lane 0.
    const __m128 len_sq = _mm_dp_ps(dat, dat, 0x71);
    const __m128 len = _mm_sqrt_ss(len_sq);   // square root of lane 0
    return _mm_cvtss_f32(len);
#else
    return sqrt(x*x + y*y + z*z);
#endif
}
// Dot product of this vertex with v, using the first three components.
float Vertex::operator&(const Vertex &v) const
{
#ifdef SSE4
    // 0x71: multiply lanes 0-2 only and write the sum to lane 0.
    return _mm_cvtss_f32(_mm_dp_ps(dat, v.dat, 0x71));
#else
    return x*v.x + y*v.y + z*v.z;
#endif
}
// Dot product of this vector with v over the first three lanes.
float SSEVector3::operator *( const SSEVector3& v ) const
{
	// Do not multiply the fourth value, store in the lowest float: 0111 0001
	const int mask = 0x71;

	const __m128 dot = _mm_dp_ps( vec, v.vec, mask );
	return _mm_cvtss_f32( dot );
}
float SSEVector3::Length() const { float result[ 4 ]; float lengthSquared = LengthSquared(); // Store in all floats, do not multiply fourth value: 0111 1111 const int mask = 0x7F; _mm_store_ss( result, _mm_sqrt_ss( _mm_dp_ps( vec, vec, mask ) ) ); return result[ 0 ]; }
// Four-component dot product; the sum is broadcast to every lane of the result.
static Float4 VFunction Dot(const Float4& vectorA, const Float4& vectorB)
{
	// 0xFF: multiply all four lanes and write the sum to all four lanes.
	// This single SSE4.1 instruction stands in for the older
	// multiply / shuffle / add / splat sequence.
	Vector dotAllLanes = _mm_dp_ps(vectorA, vectorB, 0xFF);
	return dotAllLanes;
}
// Construct a normal from a vertex by normalising it: every component is
// divided by the length of the first three components. No guard is made
// against a zero-length input.
Normal::Normal(const Vertex &v)
{
#if defined(AVX2)
    // 0x71: three-lane dot product into lane 0.
    const __m128 len_sq = _mm_dp_ps(v.dat, v.dat, 0x71);
    // Square root of lane 0, replicated to all lanes.
    const __m128 len_all = _mm_broadcastss_ps(_mm_sqrt_ss(len_sq));
    dat = _mm_div_ps(v.dat, len_all);
#elif defined(SSE4)
    const __m128 len_sq = _mm_dp_ps(v.dat, v.dat, 0x71);
    const __m128 len = _mm_sqrt_ss(len_sq);
    // Without AVX2 the scalar is replicated through a float round trip.
    const __m128 len_all = _mm_set1_ps(_mm_cvtss_f32(len));
    dat = _mm_div_ps(v.dat, len_all);
#else
    const float len_sq = v.x*v.x + v.y*v.y + v.z*v.z;
    const float inv_len = 1 / sqrt(len_sq);
    x = v.x * inv_len;
    y = v.y * inv_len;
    z = v.z * inv_len;
#endif
}
/// Four-lane dot product of two SSE registers.
///
/// Picks the best instruction sequence allowed by the SSE4 / SSE3 build
/// macros and falls back to a multiply plus scalar summation otherwise.
/// Lane access goes through intrinsics only (no compiler-specific members
/// of __m128), so the function builds with any compiler that provides the
/// SSE headers.
///
/// @param a  first operand
/// @param b  second operand
/// @return   a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3]
inline float dot_product(__m128 a, __m128 b)
{
#if defined(SSE4)
    // 0xff: multiply all lanes, sum available in every lane.
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xff));
#elif defined(SSE3)
    __m128 m = _mm_mul_ps(a, b);
    // Two horizontal adds leave the complete sum in every lane.
    m = _mm_hadd_ps(m, m);
    m = _mm_hadd_ps(m, m);
    return _mm_cvtss_f32(m);
#else
    float lanes[4];
    _mm_storeu_ps(lanes, _mm_mul_ps(a, b));
    return lanes[0] + lanes[1] + lanes[2] + lanes[3];
#endif
}
/* Vector dot product w/ SSE
 *
 * Processes the arrays four elements at a time with _mm_dp_ps, then adds the
 * remaining nSize % 4 elements with scalar arithmetic, and returns the sum.
 *
 * Both arrays are dereferenced through __m128 pointers, so they must be
 * 16-byte aligned.
 * NOTE(review): the __m128 reinterpretation assumes data_t is float --
 * confirm against the typedef.
 */
float vec_dotProd_sse(data_t* pArray1,       // [in] 1st source array
                      data_t* pArray2,       // [in] 2nd source array
                      long int nSize)        // [in] size of all arrays
{
    long int i;
    const long int nLoop = nSize/4;          /* number of full 4-element groups */
    float dotProductResult = 0.0f;

    const __m128* pSrc1 = (const __m128*) pArray1;
    const __m128* pSrc2 = (const __m128*) pArray2;

    for (i = 0; i < nLoop; i++)
    {
        /* 0xFF: multiply all four lanes; lane 0 carries the group's sum */
        dotProductResult += _mm_cvtss_f32(_mm_dp_ps(pSrc1[i], pSrc2[i], 0xFF));
    }

    /* tail: elements that do not fill a complete group of four */
    for (i = nLoop*4; i < nSize; i++)
    {
        dotProductResult += pArray1[i] * pArray2[i];
    }

    return dotProductResult;
}
// use MMX/SSE extensions void dotprod_rrrf_execute_sse4(dotprod_rrrf _q, float * _x, float * _y) { __m128 v; // input vector __m128 h; // coefficients vector __m128 s; // dot product __m128 sum = _mm_setzero_ps(); // load zeros into sum register // t = 4*(floor(_n/4)) unsigned int t = (_q->n >> 2) << 2; // unsigned int i; for (i=0; i<t; i+=4) { // load inputs into register (unaligned) v = _mm_loadu_ps(&_x[i]); // load coefficients into register (aligned) h = _mm_load_ps(&_q->h[i]); // compute dot product s = _mm_dp_ps(v, h, 0xffffffff); // parallel addition sum = _mm_add_ps( sum, s ); } // aligned output array float w[4] __attribute__((aligned(16))); // unload packed array _mm_store_ps(w, sum); float total = w[0]; // cleanup for (; i<_q->n; i++) total += _x[i] * _q->h[i]; // set return value *_y = total; }
// Résolution de matrices vectorisées supérieures void resolutionSup (mat M, vectf B, vectf Res) { int i, j; int somme; __m128 v1, v2, v3; for(i=0; i<N; i++) { somme = 0; for(j=(i-i%4); j<N; j+=4) { v1 = _mm_load_ps (M[i]+j) ; v2 = _mm_load_ps (B+j) ; v3 = _mm_dp_ps(v1,v2, 0xFF); somme += v3[0]; } Res[i] = somme / B[i]; } }
// Résolution de matrices vectorisées inférieures void resolutionInf (mat M, vectf B, vectf Res) { int i, j; int somme; __m128 v1, v2, v3, v4, v5; for(i=0; i<N; i++) { somme = 0; for(j=0; j<i; j+=4) { v1 = _mm_load_ps (M[i]+j) ; v2 = _mm_load_ps (B+j) ; v3 = _mm_load_ps(Res+j); v4 = _mm_mul_ps(v1,v3); v5 = _mm_dp_ps(v4,v2, 0xFF); somme += v5[0]; } Res[i] = (B[i]-somme) / M[i][i]; } }
// Inner product of x and y over m_n elements, returned as double.
//
// Without SSE4.1 this is a plain float loop. With SSE4.1 the first m_offset
// elements are consumed four at a time through __m128 pointers (so x and y
// must be 16-byte aligned on that path) and elements m_offset..m_n-1 are
// added with scalar arithmetic.
double CMercerKernel<float>::Evaluate(float* x, float* y)
{
#ifndef __SSE4_1__
  float acc = 0;
  for(size_t k=0; k<m_n; k++)
  {
    acc += x[k]*y[k];
  }
  return static_cast<double>(acc);
#else
  const __m128* x_blocks = reinterpret_cast<const __m128*>(x);
  const __m128* y_blocks = reinterpret_cast<const __m128*>(y);

  // 0xF1 (241): 4 MSB mask input, 4 LSB mask output -> sum lands in lane 0
  const int mask = 0xF1;

  __m128 running = _mm_setzero_ps();
  for(size_t k=0; k<m_offset/4; k++)
  {
    // accumulate in the lowest lane only
    running = _mm_add_ss(running, _mm_dp_ps(x_blocks[k], y_blocks[k], mask));
  }

  float total = _mm_cvtss_f32(running);

  // add offset: elements not covered by the vector loop
  for(size_t k=m_offset; k<m_n; k++)
  {
    total += x[k]*y[k];
  }
  return static_cast<double>(total);
#endif
}
/* Compiler diagnostic test: every statement passes k4 as the immediate
   operand of an intrinsic, and the trailing dg-error comment gives the
   diagnostic the compiler is expected to emit for that statement.
   The operands (i1, i2, k1..k4, a1, b1, d1, e1, f1, l1, ...) are declared
   outside this function; k4 is presumably a non-constant variable -- confirm
   against the declarations.
   NOTE(review): the dg-error comments are harness directives, not ordinary
   comments; do not reword them. */
void test8bit (void) { i1 = _mm_cmpistrm (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ k1 = _mm_cmpistri (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ k1 = _mm_cmpistra (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ k1 = _mm_cmpistrc (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ k1 = _mm_cmpistro (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ k1 = _mm_cmpistrs (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ k1 = _mm_cmpistrz (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ i1 = _mm_cmpestrm (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ k1 = _mm_cmpestri (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ k1 = _mm_cmpestra (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ k1 = _mm_cmpestrc (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ k1 = _mm_cmpestro (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ k1 = _mm_cmpestrs (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ k1 = _mm_cmpestrz (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ b1 = _mm256_blend_ps (b2, b3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ k1 = _cvtss_sh (f1, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm256_cvtps_ph (b2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ b1 = _mm256_dp_ps (b2, b3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ e1 = _mm256_permute2f128_pd (e2, e3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */ b1 = _mm256_permute2f128_ps (b2, b3, 
k4);/* { dg-error "the last argument must be an 8-bit immediate" } */ l1 = _mm256_permute2f128_si256 (l2, l3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */ b1 = _mm256_permute_ps (b2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_aeskeygenassist_si128 (i2, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_blend_epi16 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_clmulepi64_si128 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_cvtps_ph (a1, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ d1 = _mm_dp_pd (d2, d3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ a1 = _mm_dp_ps (a2, a3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ a1 = _mm_insert_ps (a2, a3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_mpsadbw_epu8 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ a1 = _mm_permute_ps (a2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_slli_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_srli_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ }
// Test a sphere against the froxel at grid cell (x, y, z).
//
// A plane is built at sphere_center + sphere_radius * n, where n is the
// result of _normalize(froxel centre - sphere centre). The corner_coord
// entries of the eight cells (x..x+1, y..y+1, z..z+1) are then handed to
// _overlap in two groups of four (the z slice first, then z + 1).
//
// Returns 1 as soon as the first group yields a non-zero _overlap result;
// otherwise returns the raw _overlap result of the second group.
__forceinline int ClusteredLightCuller::_sphereOverlapsFroxel(int x, int y, int z, float sphere_radius, const vec3& sphere_center, const FroxelInfo* froxel_infos)
{
    __m128* froxel_center = (__m128*)&froxel_infos[_toFlatFroxelIndex(x, y, z)].center_coord;
    const __m128 center = _mm_set_ps(1.0, sphere_center.z, sphere_center.y, sphere_center.x);

    // Direction from the sphere towards the froxel centre.
    const __m128 normal = _normalize(_mm_sub_ps(*froxel_center, center));

    // Point on the sphere along that direction, and its offset along the
    // normal (0x7F: three-lane dot product broadcast to all lanes).
    const __m128 surface_point = _mm_add_ps(center, _mm_mul_ps(_mm_set1_ps(sphere_radius), normal));
    const __m128 plane_offset = _mm_dp_ps(surface_point, normal, 0x7F);

    // Slice z.
    __m128* near00 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 0, y + 0, z + 0)].corner_coord;
    __m128* near10 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 1, y + 0, z + 0)].corner_coord;
    __m128* near11 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 1, y + 1, z + 0)].corner_coord;
    __m128* near01 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 0, y + 1, z + 0)].corner_coord;

    if (_overlap(normal, plane_offset, near00, near10, near11, near01))
    {
        return 1;
    }

    // Slice z + 1.
    __m128* far00 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 0, y + 0, z + 1)].corner_coord;
    __m128* far10 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 1, y + 0, z + 1)].corner_coord;
    __m128* far11 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 1, y + 1, z + 1)].corner_coord;
    __m128* far01 = (__m128*)&froxel_infos[_toFlatFroxelIndex(x + 0, y + 1, z + 1)].corner_coord;

    return (_overlap(normal, plane_offset, far00, far10, far11, far01));
}
int kabsch_sander(const float* xyz, const int* nco_indices, const int* ca_indices,
                  const int n_frames, const int n_atoms, const int n_residues,
                  int* hbonds, float* henergies)
{
  /* Find all of the backbone hydrogen bonds between residues in each frame
     of a trajectory.

     Parameters
     ----------
     xyz : array, shape=(n_frames, n_atoms, 3)
         The cartesian coordinates of all of the atoms in each frame.
     nco_indices : array, shape=(n_residues, 3)
         The indices of the backbone N, C, and O atoms for each residue.
     ca_indices : array, shape=(n_residues,)
         The index of the CA atom of each residue. A value of -1 marks a
         residue that has no CA atom (or should be skipped for another
         reason); such residues are ignored.

     Returns
     -------
     hbonds : array, shape=(n_frames, n_residues, 2)
         The residues that each backbone hbond *acceptor* is bonded to.
         `hbonds[i, j, 0] == k` reads "in frame i, residue j accepts its
         first hydrogen bond from residue k"; `hbonds[i, j, 1] == k` is the
         same for its second hydrogen bond. A negative value means that no
         such hbond exists.
     henergies : array, shape=(n_frames, n_residues, 2)
         Laid out like hbonds, but holds the energy of each hbond instead of
         the partner's identity. Only hbonds with energy below -0.5 kcal/mol
         are recorded.

     The process exits with status 1 if the scratch buffer cannot be
     allocated; otherwise the function returns 1.
  */
  int frame, ri, rj;
  float e_forward, e_reverse;
  static float HBOND_ENERGY_CUTOFF = -0.5;
  __m128 ca_i, ca_j, delta, is_close;
  __m128 MINIMAL_CA_DISTANCE2 = _mm_set1_ps(0.81);

  /* scratch space for one hydrogen position per residue, reused every frame */
  float* hcoords = (float*) malloc(n_residues*3 * sizeof(float));
  if (hcoords == NULL) {
    fprintf(stderr, "Memory Error\n");
    exit(1);
  }

  for (frame = 0; frame < n_frames; frame++) {
    ks_assign_hydrogens(xyz, nco_indices, n_residues, hcoords);

    for (ri = 0; ri < n_residues; ri++) {
      /* -1 marks a residue without this atom type, so just skip it */
      if (ca_indices[ri] == -1)
        continue;
      ca_i = load_float3(xyz + 3*ca_indices[ri]);

      for (rj = ri + 1; rj < n_residues; rj++) {
        if (ca_indices[rj] == -1)
          continue;
        ca_j = load_float3(xyz + 3*ca_indices[rj]);

        /* check the squared CA-CA distance before proceeding */
        delta = _mm_sub_ps(ca_i, ca_j);
        is_close = _mm_cmplt_ps(_mm_dp_ps(delta, delta, 0x7F), MINIMAL_CA_DISTANCE2);
        if (!_mm_extract_epi16(CAST__M128I(is_close), 0))
          continue;

        /* hbond from donor=ri to acceptor=rj */
        e_forward = ks_donor_acceptor(xyz, hcoords, nco_indices, ri, rj);
        if (e_forward < HBOND_ENERGY_CUTOFF)
          store_energies(hbonds, henergies, ri, rj, e_forward);

        /* the reverse direction is only evaluated for non-adjacent residues */
        if (rj != ri + 1) {
          /* hbond from donor=rj to acceptor=ri */
          e_reverse = ks_donor_acceptor(xyz, hcoords, nco_indices, rj, ri);
          if (e_reverse < HBOND_ENERGY_CUTOFF)
            store_energies(hbonds, henergies, rj, ri, e_reverse);
        }
      }
    }

    /* advance all per-frame pointers to the next frame */
    xyz += n_atoms*3;
    hbonds += n_residues*2;
    henergies += n_residues*2;
  }

  free(hcoords);
  return 1;
}
// Returns the approximate reciprocal length of v, replicated in all four
// lanes; multiplying v by the result normalises it. All four lanes of v
// enter the squared length (mask 0xFF).
inline __m128 SSENormalizeMultiplierSSE4(__m128 v)
{
	const __m128 length_squared = _mm_dp_ps(v, v, 0xFF);
	return _mm_rsqrt_ps(length_squared);
}
static float ks_donor_acceptor(const float* xyz, const float* hcoords, const int* nco_indices, int donor, int acceptor) { /* Conpute the Kabsch-Sander hydrogen bond energy between two residues in a single conformation. Parameters ---------- xyz : array, shape=(n_atoms, 3) All of the atoms in this frame nhco0 : array, shape=(4,) The indices of the backbone N, H, C, and O atoms in one residue. nhco1 : array, shape=(4,) The indices of the backbone N, H, C, and O atoms in the other residue. donor : int Boolean flag. If 0, then nhco0 is the hydrogen bond proton donor (i.e. we look at its N and H). If 1, then nhco1 is the hydrogen bond proton donor. Returns ------- energy : float The KS backbone hydrogen bond energy, in kcal/mol. A number under -0.5 is considered significant. */ float energy; __m128 r_n, r_h, r_c, r_o, r_ho, r_nc, r_hc, r_no, d2_honchcno; __m128 coupling; // 332 (kcal*A/mol) * 0.42 * 0.2 * (1nm / 10 A) coupling = _mm_setr_ps(-2.7888, -2.7888, 2.7888, 2.7888); r_n = load_float3(xyz + 3*nco_indices[3*donor]); r_h = load_float3(hcoords + 3*donor); r_c = load_float3(xyz + 3*nco_indices[3*acceptor + 1]); r_o = load_float3(xyz + 3*nco_indices[3*acceptor + 2]); //printf("Donor Index %d\n", donor); //printf("Acceptor Index %d\n", acceptor); /*printf("N index %d\n", 3*nco_indices[3*donor + 0]); printf("C index %d\n", 3*nco_indices[3*acceptor + 1]); printf("O index %d\n", 3*nco_indices[3*acceptor + 2]); printf("\nrN "); printf_m128(r_n); printf("rH "); printf_m128(r_h); printf("rC "); printf_m128(r_c); printf("rO "); printf_m128(r_o);*/ r_ho = _mm_sub_ps(r_h, r_o); r_hc = _mm_sub_ps(r_h, r_c); r_nc = _mm_sub_ps(r_n, r_c); r_no = _mm_sub_ps(r_n, r_o); // compute all four dot products (each of the squared distances), and then // pack them into a single float4 using three shuffles. 
d2_honchcno = _mm_shuffle_ps(_mm_shuffle_ps(_mm_dp_ps(r_ho, r_ho, 0xF3), _mm_dp_ps(r_nc, r_nc, 0xF3), _MM_SHUFFLE(0,1,0,1)), _mm_shuffle_ps(_mm_dp_ps(r_hc, r_hc, 0xF3), _mm_dp_ps(r_no, r_no, 0xF3), _MM_SHUFFLE(0,1,0,1)), _MM_SHUFFLE(2,0,2,0)); energy = _mm_cvtss_f32(_mm_dp_ps(coupling, _mm_rsqrt_ps(d2_honchcno), 0xFF)); //printf("Energy: %f\n\n", energy); return (energy < -9.9f ? -9.9f : energy); }
// Dot product ------------------------------------------------------------------------ static Float3 VFunction Dot(const Float3& vectorA, const Float3& vectorB) { Vector vResult = _mm_dp_ps(vectorA, vectorB, 0x7F); return vResult; }