// 3-component dot product of this vector with b (lane 3 / w is never read:
// only lanes 0, 1 and 2 of the product are summed below, so the historical
// "make sure w component is 0" precondition is belt-and-braces).
float vec3::dot(const vec3 &b) const {
    // make sure w component is 0
    // lane-wise products: (x*bx, y*by, z*bz, w*bw)
    __m128 temp = _mm_mul_ps(v, b.v);
    // shuffle mask 0xFE places element 2 (z*bz) in lane 0
    __m128 temp2 = _mm_shuffle_ps(temp, temp, 0xFE);
    // lane 0 now holds x*bx + z*bz
    temp2 = _mm_add_ps(temp, temp2);
    // mask 0xFD places element 1 (y*by) in lane 0; final lane 0 = x+y+z terms
    return _mm_cvtss_f32(_mm_add_ps(temp2, _mm_shuffle_ps(temp, temp, 0xFD)));
}
// Euclidean length of the (x, y, z) part of the vector.
// Only lanes 0-2 of the squared products are summed, so lane 3 (w) is ignored.
float vec3::length() const {
    // lane-wise squares: (x*x, y*y, z*z, w*w)
    __m128 temp = _mm_mul_ps(v, v);
    // mask 0xFD places element 1 (y*y) in lane 0
    __m128 temp2 = _mm_shuffle_ps(temp, temp, 0xFD);
    // lane 0 = x*x + y*y
    temp2 = _mm_add_ps(temp, temp2);
    // mask 0xFE places element 2 (z*z) in lane 0; lane 0 = x*x + y*y + z*z
    temp2 = _mm_add_ps(temp2, _mm_shuffle_ps(temp, temp, 0xFE));
    return _mm_cvtss_f32(_mm_sqrt_ps(temp2));
}
// 4-component dot product computed with the SSE4.1 DPPS instruction
// (Task 8: SSE replacement for the scalar x1*x2 + y1*y2 + z1*z2 + w1*w2).
// Mask 0xF1: high nibble multiplies all four lane pairs, low nibble writes
// the sum to lane 0 only, which is then extracted as the scalar result.
float Float4::Dot(float x1, float y1, float z1, float w1,
                  float x2, float y2, float z2, float w2)
{
    const __m128 lhs = _mm_setr_ps(x1, y1, z1, w1);
    const __m128 rhs = _mm_setr_ps(x2, y2, z2, w2);
    return _mm_cvtss_f32(_mm_dp_ps(lhs, rhs, 0xF1));
}
int dihedral(const float* xyz, const int* quartets, float* out,
             const int n_frames, const int n_atoms, const int n_quartets)
{
    /* Compute the torsion (dihedral) angle between sets of four atoms in
       every frame of xyz.

       Parameters
       ----------
       xyz : array, shape=(n_frames, n_atoms, 3)
           Cartesian coordinates of the atoms in every frame, in contiguous
           C order.
       quartets : array, shape=(n_quartets, 4)
           The specific quartets of atoms whose angles you want to compute.
           (Note: the code reads four indices per quartet -- the previous
           comment said shape=(n_quartets, 3).) The angle computed will be
           the torsion around the bond between the middle two elements
           (i.e. the A-B-C-D dihedral). A 2d array of indices, in C order.
       out : array, shape=(n_frames, n_quartets)
           Array where the angles will be stored, in contiguous C order.

       All of the arrays are assumed to be contiguous. This code will
       segfault if they're not.
    */
    int i, j;
    __m128 x0, x1, x2, x3, b1, b2, b3, c1, c2, p1, p2;

    for (i = 0; i < n_frames; i++) {
        for (j = 0; j < n_quartets; j++) {
            /* positions of the four atoms of this quartet */
            x0 = load_float3(xyz + 3*quartets[4*j + 0]);
            x1 = load_float3(xyz + 3*quartets[4*j + 1]);
            x2 = load_float3(xyz + 3*quartets[4*j + 2]);
            x3 = load_float3(xyz + 3*quartets[4*j + 3]);

            /* bond vectors along the A-B-C-D chain */
            b1 = _mm_sub_ps(x1, x0);
            b2 = _mm_sub_ps(x2, x1);
            b3 = _mm_sub_ps(x3, x2);

            c1 = cross(b2, b3);   /* normal of the B-C-D plane */
            c2 = cross(b1, b2);   /* normal of the A-B-C plane */

            /* dihedral = atan2(|b2| * b1.(b2 x b3), (b1 x b2).(b2 x b3));
               dp mask 0x71: multiply lanes 0-2, write the sum to lane 0 only */
            p1 = _mm_mul_ps(_mm_dp_ps(b1, c1, 0x71),
                            _mm_sqrt_ps(_mm_dp_ps(b2, b2, 0x71)));
            p2 = _mm_dp_ps(c1, c2, 0x71);

            *(out++) = atan2(_mm_cvtss_f32(p1), _mm_cvtss_f32(p2));
        };
        /* advance to the next frame */
        xyz += n_atoms*3;
    }
    return 1;
}
// Scalar square root via the SSE sqrtss instruction (IEEE-correct rounding).
// Requires <xmmintrin.h>.
inline float Sqrt(const float &sqr)
{
    return _mm_cvtss_f32(_mm_sqrt_ss(_mm_set_ss(sqr)));
}
/* In-place row-wise softmax of an m x n matrix y:
 * y[i] = exp(y[i] - shift_row) for each row, then the row is scaled by
 * 1/sum via BLAS sscal. `shift` supplies one value per row (presumably the
 * row maximum, for numerical stability -- TODO confirm against caller) and
 * is advanced after each row. Uses the file's fastexp/vfastexp
 * approximations; the vectorized loop requires 16-byte-aligned addresses. */
static void mexsoftmax(float* y, float* shift, mwSize m, mwSize n) {
  __m128 i1, i2;
  __m128 o1, o2;
  while (m>0) {
    mwSize curn = n;
    float sum = 0.0f;
    declconst128(zero, 0.0f);
    /* scalar peel from the top of the row until y+curn is 16-byte aligned,
     * so the vector loop below can use aligned loads/stores */
    while (curn>0 && ((unsigned long)(y+curn) & 15) != 0) {
      --curn;
      y[curn]=fastexp(y[curn]-*shift);
      sum += y[curn];
    }
    __m128 s1 = _mm_load1_ps (shift);
    __m128 sum1 = zero;
    /* main loop: exponentiate 8 elements per iteration, walking downward
     * through the row; partial sums accumulate in sum1 */
    while (curn>7) {
      i1 = _mm_load_ps (y+curn-4);
      i2 = _mm_load_ps (y+curn-8);
      i1 = _mm_sub_ps (i1, s1);
      i2 = _mm_sub_ps (i2, s1);
      o1 = vfastexp(i1);
      o2 = vfastexp(i2);
      _mm_store_ps (y+curn-4, o1);
      sum1 = _mm_add_ps (sum1, o1);
      _mm_store_ps (y+curn-8, o2);
      sum1 = _mm_add_ps (sum1, o2);
      curn-=8;
    }
    /* two hadds reduce the 4 partial sums into lane 0 */
    sum1 = _mm_hadd_ps (sum1, sum1);
    sum1 = _mm_hadd_ps (sum1, sum1);
    sum += _mm_cvtss_f32 (sum1);
    /* scalar tail: the < 8 elements left at the bottom of the row */
    while(curn>0) {
      --curn;
      y[curn]=fastexp(y[curn]-*shift);
      sum += y[curn];
    }
    /* normalize the row: y *= 1/sum */
    sum = 1.0f / sum;
    ptrdiff_t n_pdt = n;
    ptrdiff_t one_pdt = 1;
    sscal (&n_pdt, &sum, y, &one_pdt);
    ++shift;
    y+=n;
    --m;
  }
}
// Euclidean length of the (x, y, z) part of the vertex; the w lane is ignored.
float Vertex::length() const
{
#ifdef SSE4
    // dp mask 0b01110001: square lanes 0-2 and place their sum in lane 0
    const __m128 sq_len = _mm_dp_ps(dat, dat, 0b01110001);
    return _mm_cvtss_f32(_mm_sqrt_ss(sq_len));
#else
    return sqrt(x*x + y*y + z*z);
#endif
}
// Squared length of the (x, y, z) part of the vertex; the w lane is ignored.
float Vertex::length_sqr() const
{
#ifdef SSE4
    // dp mask 0b01110001: square lanes 0-2 and place their sum in lane 0
    return _mm_cvtss_f32(_mm_dp_ps(dat, dat, 0b01110001));
#else
    return x*x + y*y + z*z;
#endif
}
inline float DatabaseBuilder::Distance(PackedSample* x, PackedSample* y) { #ifdef AVX //Black magic //But it does produce the same results as the not AVX code __m256 accumulator; __m256 x_s = _mm256_load_ps(x->Features); __m256 y_s = _mm256_load_ps(y->Features); __m256 result = _mm256_sub_ps(x_s, y_s); accumulator = _mm256_mul_ps(result, result); x_s = _mm256_load_ps(&x->Features[8]); y_s = _mm256_load_ps(&y->Features[8]); result = _mm256_sub_ps(x_s, y_s); result = _mm256_mul_ps(result, result); accumulator = _mm256_add_ps(accumulator, result); x_s = _mm256_load_ps(&x->Features[16]); y_s = _mm256_load_ps(&y->Features[16]); result = _mm256_sub_ps(x_s, y_s); result = _mm256_mul_ps(result, result); accumulator = _mm256_add_ps(accumulator, result); x_s = _mm256_load_ps(&x->Features[24]); y_s = _mm256_load_ps(&y->Features[24]); result = _mm256_sub_ps(x_s, y_s); result = _mm256_mul_ps(result, result); accumulator = _mm256_add_ps(accumulator, result); //We now have a vector of 8 floats __m256 t1 = _mm256_hadd_ps(accumulator, accumulator); __m256 t2 = _mm256_hadd_ps(t1, t1); __m128 t3 = _mm256_extractf128_ps(t2, 1); __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2), t3); //And now we don't return std::sqrtf(_mm_cvtss_f32(t4)); #endif #ifndef AVX //Can be autovectorized float accumulator[32]; float distance = 0; for (int i = 0; i < 30; i++) { accumulator[i] = x->Features[i] - y->Features[i]; } //If done properly this should be 4(8) instructions for (int i = 0; i < 30; i++) { distance += accumulator[i] * accumulator[i]; } return std::sqrtf(distance); #endif }
// Dot product of the (x, y, z) parts of the two vertices; w lanes are ignored.
float Vertex::operator&(const Vertex &v) const
{
#ifdef SSE4
    // dp mask 0b01110001: multiply lanes 0-2 and place the sum in lane 0
    return _mm_cvtss_f32(_mm_dp_ps(dat, v.dat, 0b01110001));
#else
    return x*v.x + y*v.y + z*v.z;
#endif
}
/* Approximate atan2(y, x) evaluated with SSE scalar-style ops.
 * Requires <emmintrin.h> and <xmmintrin.h>. `pi` and `pi0p5` (pi and pi/2)
 * are constants defined elsewhere in this file. */
static float Atan(float y, float x) // #include <emmintrin.h>, #include <xmmintrin.h>
{
    /* polynomial coefficients for atan(t) on t in [0,1] */
    const __m128 _ps_atan_p0 = _mm_set1_ps(-0.0464964749f);
    const __m128 _ps_atan_p1 = _mm_set1_ps(0.15931422f);
    const __m128 _ps_atan_p2 = _mm_set1_ps(0.327622764f);

    const __m128 _ps_pi    = _mm_set1_ps(pi);
    const __m128 _ps_pi0p5 = _mm_set1_ps(pi0p5);

    /* masks to isolate / clear the IEEE-754 sign bit */
    const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000));
    const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));

    __m128 mm1, mm2, mm3;
    __m128 axm, aym;

    __m128 xm = _mm_set1_ps(x);
    __m128 ym = _mm_set1_ps(y);

    axm = _mm_and_ps(xm, _mask_sign_inv);   /* |x| */
    aym = _mm_and_ps(ym, _mask_sign_inv);   /* |y| */

    /* t = min(|x|,|y|) / max(|x|,|y|), guaranteed to lie in [0,1] */
    mm1 = _mm_min_ps(axm, aym);
    mm2 = _mm_max_ps(axm, aym);
    mm1 = _mm_div_ps(mm1, mm2);
    mm2 = _mm_mul_ps(mm1, mm1);             /* t^2 */

    /* Horner evaluation of the odd polynomial in t: result ~ atan(t) */
    mm3 = _mm_mul_ps(mm2, _ps_atan_p0);
    mm3 = _mm_add_ps(mm3, _ps_atan_p1);
    mm3 = _mm_mul_ps(mm3, mm2);
    mm3 = _mm_sub_ps(mm3, _ps_atan_p2);
    mm3 = _mm_mul_ps(mm3, mm2);
    mm3 = _mm_mul_ps(mm3, mm1);
    mm3 = _mm_add_ps(mm3, mm1);

    __m128 mask;

    /* |y| > |x|: the ratio was inverted, so use pi/2 - atan(t)
       (flip the sign via xor, then add pi/2) */
    mask = _mm_cmpgt_ss(aym, axm);
    mm2 = _mm_and_ps(_ps_pi0p5, mask);
    mm1 = _mm_and_ps(_mask_sign_raw, mask);
    mm3 = _mm_xor_ps(mm3, mm1);
    mm3 = _mm_add_ps(mm3, mm2);

    /* x < 0: fold into the correct quadrant. The arithmetic-shift trick
       turns the (possibly negated) result's sign into an all-ones mask that
       selects pi for the correction term. */
    mask = _mm_and_ps(xm, _mask_sign_raw);
    mm3 = _mm_xor_ps(mm3, mask);
    mm1 = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mm3), 30));
    mm1 = _mm_and_ps(_ps_pi, mm1);
    mm3 = _mm_add_ps(mm3, mm1);

    /* y < 0: the final result carries the sign of y */
    mm1 = _mm_and_ps(ym, _mask_sign_raw);
    mm3 = _mm_xor_ps(mm3, mm1);

    return _mm_cvtss_f32(mm3);
}
/* Signed-area accumulation for an irregular polygon (shoelace formula),
 * vectorized with AVX: each dot-product term computes x_i*y_{i+1} - y_i*x_{i+1}
 * for one vertex pair and deposits it in a distinct lane selected by `index`.
 *
 * NOTE(review): the main loop condition `index < (cords_len - 16)` wraps if
 * cords_len is unsigned and < 16 (only cords_len == 0 is guarded above), and
 * the initial _mm256_load_ps reads 4 coordinate pairs unconditionally --
 * confirm callers guarantee a minimum length and 32-byte alignment. */
irreg_poly_area_func_sign(float, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256
        values_0_3,
        values_4_7,
        values_8_11,
        values_12_15,
        values_16_19 = _mm256_load_ps((const float *)&cords[0][0]),
        accum_sum = _mm256_setzero_ps();
    float accum_sum_aux;

    /* For adjacent vertex blocks `curr` and `next`:
     * - shuffle/permute pairs each (x, y) with its successor's (y, x),
     * - the xor with (0, -0.0f, ...) negates every second product, giving
     *   x_i*y_{i+1} - y_i*x_{i+1},
     * - the dp mask writes the 4-term sum into lane `index` only, so the
     *   four macro calls below accumulate into distinct lanes of accum_sum. */
#define _float_cords_dot_prod(curr, next, index) \
    _mm256_dp_ps( \
        curr, \
        _mm256_xor_ps( \
            _mm256_shuffle_ps(curr, _mm256_permute2f128_ps(curr, next, 0b00100001), 0b00011011),\
            _mm256_setr_ps(0, -0.0f, 0, -0.0f, 0, -0.0f, 0, -0.0f) \
        ), \
        0b11110000 | (1 << (index)) \
    )

    unsigned long index;
    /* software-pipelined: values_16_19 from the previous iteration becomes
     * this iteration's values_0_3 */
    for (index = 0; index < (cords_len - 16); index += 16) {
        values_0_3   = values_16_19;
        values_4_7   = _mm256_load_ps((const float *)&cords[index + 4]);
        values_8_11  = _mm256_load_ps((const float *)&cords[index + 8]);
        values_12_15 = _mm256_load_ps((const float *)&cords[index + 12]);
        values_16_19 = _mm256_load_ps((const float *)&cords[index + 16]);

        accum_sum = _mm256_add_ps(
            accum_sum,
            _mm256_add_ps(
                _mm256_add_ps(
                    _float_cords_dot_prod(values_0_3, values_4_7, 0),
                    _float_cords_dot_prod(values_4_7, values_8_11, 1)
                ),
                _mm256_add_ps(
                    _float_cords_dot_prod(values_8_11, values_12_15, 2),
                    _float_cords_dot_prod(values_12_15, values_16_19, 3)
                )
            )
        );
    }

    /* horizontal reduction of the 8 lanes into lane 0 */
    accum_sum = _mm256_hadd_ps(accum_sum, _mm256_permute2f128_ps(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a4+a5, a6+a7, a4+a5, a6+a7, a0+a1, a2+a3
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3, a4+a5+a6+a7, ...
    accum_sum = _mm256_hadd_ps(accum_sum, accum_sum); // a0+a1+a2+a3+a4+a5+a6+a7, ...

    /* scalar tail: remaining vertex pairs not covered by the blocked loop */
    for (accum_sum_aux = _mm_cvtss_f32(_mm256_castps256_ps128(accum_sum));
         index < (cords_len - 1);
         index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
//    return scalar_half(scalar_abs(accum_sum_aux));
}
/* Approximate double-precision reciprocal: seed with the hardware RCPSS
 * estimate (computed in single precision, ~12 significant bits) and refine
 * with two Newton-Raphson steps carried out in double precision. */
static double rcp_d(double x)
{
    const __m128d wide = _mm_load_sd(&x);
    /* narrow to float in lane 0 and take the fast reciprocal estimate */
    double est = _mm_cvtss_f32(_mm_rcp_ss(_mm_cvtsd_ss(_mm_setzero_ps(), wide)));
    /* Newton step: e' = e + e*(1 - x*e), doubling correct bits each pass */
    est = est + est * (1.0 - x * est);
    est = est + est * (1.0 - x * est);
    return est;
}
inline float hadd(const vector4f& rhs) { #if SSE_INSTR_SET >= 3 // SSE3 __m128 tmp0 = _mm_hadd_ps(rhs, rhs); __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0); #else __m128 tmp0 = _mm_add_ps(rhs, _mm_movehl_ps(rhs, rhs)); __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1)); #endif return _mm_cvtss_f32(tmp1); }
// Sum of the squared components of q (x^2 + y^2 + z^2 + w^2).
_XOINL float QuaternionSquareSum(const Quaternion& q)
{
#if defined(XO_SSE)
    // square all four lanes, then two horizontal adds collapse them to lane 0
    __m128 squared = _mm_mul_ps(q.xmm, q.xmm);
    squared = _mm_hadd_ps(squared, squared);
    return _mm_cvtss_f32(_mm_hadd_ps(squared, squared));
#else
    return q.x * q.x + q.y * q.y + q.z * q.z + q.w * q.w;
#endif
}
//Thanks stack overflow.
// Horizontal sum of all 8 lanes of an AVX vector.
static inline float _mm256_reduce_add_ps(__m256 x)
{
    /* fold the upper 128-bit lane onto the lower one:
       ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
    const __m128 quad = _mm_add_ps(_mm256_extractf128_ps(x, 1),
                                   _mm256_castps256_ps128(x));
    /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
    const __m128 dual = _mm_add_ps(quad, _mm_movehl_ps(quad, quad));
    /* 0x55 replicates lane 1; lane 0 of the sum is the grand total.
       Extracting it is a no-op on x86-64. */
    const __m128 total = _mm_add_ss(dual, _mm_shuffle_ps(dual, dual, 0x55));
    return _mm_cvtss_f32(total);
}
// Scalar read of lane i (0-31) from the pair of 512-bit registers
// (val1 holds lanes 0-15, val2 holds lanes 16-31). The extract and shuffle
// immediates must be compile-time constants, so lane selection is done
// with explicit branches rather than computed indices.
inline float operator[](int i) const
{
    const __m512 half = (i < 16) ? val1 : val2;
    i &= 15;
    __m128 quad;
    if (i < 4) {
        quad = _mm512_extractf32x4_ps(half, 0);
    } else if (i < 8) {
        quad = _mm512_extractf32x4_ps(half, 1);
    } else if (i < 12) {
        quad = _mm512_extractf32x4_ps(half, 2);
    } else {
        quad = _mm512_extractf32x4_ps(half, 3);
    }
    switch (i & 3) {
    case 3:
        return _mm_cvtss_f32(_mm_shuffle_ps(quad, quad, 3));
    case 2:
        return _mm_cvtss_f32(_mm_shuffle_ps(quad, quad, 2));
    case 1:
        return _mm_cvtss_f32(_mm_shuffle_ps(quad, quad, 1));
    default:
        return _mm_cvtss_f32(quad);
    }
}
/* http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally */
/* Horizontal sum of all 8 lanes of x. */
static inline float horizontal_sum_avx2(__m256 x)
{
    /* add the upper four lanes onto the lower four */
    const __m128 quad = _mm_add_ps(_mm256_castps256_ps128(x),
                                   _mm256_extractf128_ps(x, 1));
    /* fold the two high lanes onto the two low lanes */
    const __m128 dual = _mm_add_ps(quad, _mm_movehl_ps(quad, quad));
    /* add lane 1 into lane 0: lane 0 now holds the total */
    return _mm_cvtss_f32(_mm_add_ss(dual, _mm_shuffle_ps(dual, dual, 0x1)));
}
int angle(const float* xyz, const int* triplets, float* out,
          const int n_frames, const int n_atoms, const int n_angles)
{
    /* Compute the angle between triples of atoms in every frame of xyz.

       Parameters
       ----------
       xyz : array, shape=(n_frames, n_atoms, 3)
           Cartesian coordinates of the atoms in every frame, in contiguous
           C order.
       triplets : array, shape=(n_angles, 3)
           The specific triples of atoms whose angles you want to compute.
           The angle computed will be centered around the middle element
           (i.e. the A-B-C angle). A 2d array of indices, in C order.
       out : array, shape=(n_frames, n_angles)
           Array where the angles will be stored, in contiguous C order.

       All of the arrays are assumed to be contiguous. This code will
       segfault if they're not.
    */
    int i, j;
    __m128 r_m, r_n, r_o, u_prime, u, v_prime, v;

    for (i = 0; i < n_frames; i++) {
        for (j = 0; j < n_angles; j++) {
            r_m = load_float3(xyz + 3*triplets[3*j + 0]);
            r_o = load_float3(xyz + 3*triplets[3*j + 1]);   /* vertex atom */
            r_n = load_float3(xyz + 3*triplets[3*j + 2]);

            /* displacement vectors from the vertex to the outer atoms */
            u_prime = _mm_sub_ps(r_m, r_o);
            v_prime = _mm_sub_ps(r_n, r_o);

            /* normalize the vectors u_prime and v_prime
               (dp mask 0x7F broadcasts the 3-component squared length to
               all lanes; rsqrt is a fast approximate inverse sqrt) */
            u = _mm_mul_ps(u_prime, _mm_rsqrt_ps(_mm_dp_ps(u_prime, u_prime, 0x7F)));
            v = _mm_mul_ps(v_prime, _mm_rsqrt_ps(_mm_dp_ps(v_prime, v_prime, 0x7F)));

            /* compute the arccos of the dot product, and store the result. */
            *(out++) = acos(_mm_cvtss_f32(_mm_dp_ps(u, v, 0x71)));
        }
        /* advance to the next frame */
        xyz += n_atoms*3;
    }
    return 1;
}
// Dot product of a[0..n) and b[0..n): SSE main loop over groups of four
// lanes, scalar loop for the tail.
//
// Fixes a bug in the previous version, which loaded the second operand from
// `a + i` instead of `b + i` and therefore computed dot(a, a) over the
// vectorized prefix. The reduction is also done with SSE2-only shuffles
// instead of SSE3 hadd, so the function needs no extra ISA support.
float DotProductSIMD(const float* a, const float* b, std::size_t n)
{
    std::size_t i = 0;
    __m128 sum = _mm_setzero_ps();
    const std::size_t vec_end = n & ~static_cast<std::size_t>(3); // ROUND_DOWN(n, 4)
    for (; i < vec_end; i += 4)
    {
        const __m128 x = _mm_loadu_ps(a + i);
        const __m128 y = _mm_loadu_ps(b + i);   // was: a + i (bug)
        sum = _mm_add_ps(_mm_mul_ps(x, y), sum);
    }
    // horizontal sum of the four partial lanes
    __m128 folded = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
    folded = _mm_add_ss(folded, _mm_shuffle_ps(folded, folded, 1));
    float product = _mm_cvtss_f32(folded);
    // scalar tail for the last n % 4 elements
    for (; i < n; i++)
    {
        product += a[i] * b[i];
    }
    return product;
}
// Construct a normal by normalizing v to unit length.
// (Original comment was GBK-mojibake for "normalize".)
Normal::Normal(const Vertex &v)
{
#ifdef AVX2
    // squared length of (x,y,z) lands in lane 0; sqrt it and broadcast to
    // all lanes so a single division normalizes every component
    __m128 ans = _mm_dp_ps(v.dat, v.dat, 0b01110001);
    __m128 tmp = _mm_broadcastss_ps(_mm_sqrt_ss(ans));
    dat = _mm_div_ps(v.dat, tmp);
#else
#	ifdef SSE4
    // no broadcastss before AVX2: extract the scalar length and splat it
    __m128 ans = _mm_dp_ps(v.dat, v.dat, 0b01110001);
    ans = _mm_sqrt_ss(ans);
    __m128 tmp = _mm_set1_ps(_mm_cvtss_f32(ans));
    dat = _mm_div_ps(v.dat, tmp);
#	else
    // scalar fallback: multiply by the reciprocal length
    float s = v.x*v.x + v.y*v.y + v.z*v.z;
    s = 1 / sqrt(s);
    x = v.x * s;
    y = v.y * s;
    z = v.z * s;
#	endif
#endif
}
/* http://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally */
/* Horizontal sum of all 8 lanes of x. */
inline float sum8(__m256 x)
{
    /* add the upper four lanes onto the lower four:
       ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
    const __m128 quad = _mm_add_ps(_mm256_castps256_ps128(x),
                                   _mm256_extractf128_ps(x, 1));
    /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
    const __m128 dual = _mm_add_ps(quad, _mm_movehl_ps(quad, quad));
    /* lane 0 = grand total after adding lane 1 */
    return _mm_cvtss_f32(_mm_add_ss(dual, _mm_shuffle_ps(dual, dual, 0x1)));
}
// Exercise glm::sqrt and, on SSE2 targets, compare the hardware sqrtps
// result against std::sqrt over a range of inputs.
int test_sqrt()
{
	int Error(0);

#	if GLM_ARCH & GLM_ARCH_SSE2_BIT
	// NOTE(review): the assert fires on the first mismatch, so the
	// accumulated Error count only matters in builds with NDEBUG.
	for(float f = 0.1f; f < 30.0f; f += 0.1f)
	{
		float r = _mm_cvtss_f32(_mm_sqrt_ps(_mm_set1_ps(f)));
		float s = std::sqrt(f);
		Error += glm::abs(r - s) < 0.01f ? 0 : 1;
		assert(!Error);
	}
#	endif//GLM_ARCH & GLM_ARCH_SSE2_BIT

	// smoke-test the scalar and vec1..vec4 overloads (results unused;
	// this only checks that each overload compiles and runs)
	float A = glm::sqrt(10.f);
	glm::vec1 B = glm::sqrt(glm::vec1(10.f));
	glm::vec2 C = glm::sqrt(glm::vec2(10.f));
	glm::vec3 D = glm::sqrt(glm::vec3(10.f));
	glm::vec4 E = glm::sqrt(glm::vec4(10.f));

	return Error;
}
static void matvec_sse() { /* Assume that the data size is an even multiple of the 128 bit * SSE vectors (i.e. 4 floats) */ assert(!(SIZE & 0x3)); /* TASK: Implement your SSE version of the matrix-vector * multiplication here. */ /* HINT: You might find at least the following instructions * useful: * - _mm_setzero_ps * - _mm_load_ps * - _mm_hadd_ps * - _mm_cvtss_f32 * * HINT: You can create the sum of all elements in a vector * using two hadd instructions. */ __m128 dummy=_mm_setzero_ps(); for(int i=0;i<SIZE;++i){ __m128 temp=_mm_setzero_ps(); for(int j=0;j<SIZE;j+=4){ __m128 mm_vec_b=_mm_load_ps((__m128*)(vec_b+j)); __m128 mm_matr=_mm_load_ps((__m128*)(mat_a+MINDEX(i,j))); __m128 out=_mm_mul_ps(mm_vec_b,mm_matr); temp=_mm_add_ps(temp,out); // vec_c[i]+=_mm_cvtss_f32(_mm_dp_ps(mm_matr,mm_vec_b,0xf1)); } __m128 res=_mm_hadd_ps(_mm_hadd_ps(temp,dummy),dummy); vec_c[i]=_mm_cvtss_f32(res); } }
/**
 * Identify bends in the chain, where the kappa angle (virtual bond angle from
 * c-alpha i-2, to i, to i+2) is greater than 70 degrees
 * dssp-2.2.0/structure.cpp:1729
 *
 * Returns a 0/1 vector of length n_residues. The first and last two entries
 * are never marked (the +/-2 window would run off the chain), and windows
 * that cross a chain boundary or touch a residue flagged in `skip` are left
 * unmarked.
 */
static std::vector<int> calculate_bends(const float* xyz, const int* ca_indices,
    const int* chain_ids, const int n_residues, std::vector<int>& skip)
{
    __m128 prev_ca, this_ca, next_ca, u_prime, v_prime, u, v;
    float kappa;
    std::vector<int> is_bend(n_residues, 0);
    for (int i = 2; i < n_residues-2; i++) {
        /* only windows fully inside one chain with no skipped residues */
        if (chain_ids[i-2] == chain_ids[i+2] && !skip[i-2] && !skip[i] && !skip[i+2]) {
            prev_ca = load_float3(xyz + 3*ca_indices[i-2]);
            this_ca = load_float3(xyz + 3*ca_indices[i]);
            next_ca = load_float3(xyz + 3*ca_indices[i+2]);

            /* virtual bond vectors into and out of residue i */
            u_prime = _mm_sub_ps(prev_ca, this_ca);
            v_prime = _mm_sub_ps(this_ca, next_ca);

            /* normalize the vectors u_prime and v_prime */
            u = _mm_div_ps(u_prime, _mm_sqrt_ps(_mm_dp_ps2(u_prime, u_prime, 0x7F)));
            v = _mm_div_ps(v_prime, _mm_sqrt_ps(_mm_dp_ps2(v_prime, v_prime, 0x7F)));

            /* compute the arccos of the dot product. this gives the angle;
               CLIP guards acos against values just outside [-1, 1] */
            kappa = (float) acos(CLIP(_mm_cvtss_f32(_mm_dp_ps2(u, v, 0x71)), -1, 1));
            is_bend[i] = kappa > (70 * (M_PI / 180.0));
        }
    }
    return is_bend;
}
static float ks_donor_acceptor(const float* xyz, const float* hcoords,
                               const int* nco_indices, int donor, int acceptor)
{
  /* Compute the Kabsch-Sander hydrogen bond energy between two residues
     in a single conformation.

     Parameters
     ----------
     xyz : array, shape=(n_atoms, 3)
         All of the atoms in this frame
     hcoords : array
         Backbone amide-H coordinates, 3 floats per residue, indexed by
         residue number.
     nco_indices : array
         Backbone N, C, O atom indices, 3 ints per residue.
     donor : int
         Residue index of the proton donor (its N and H are used).
     acceptor : int
         Residue index of the acceptor (its C and O are used).

     NOTE(review): the original docstring described `donor` as a 0/1 flag
     and documented `nhco0`/`nhco1` parameters that do not exist; the code
     plainly uses `donor`/`acceptor` as residue indices into nco_indices
     and hcoords. Docs updated to match the code -- confirm with callers.

     Returns
     -------
     energy : float
         The KS backbone hydrogen bond energy, in kcal/mol. A number under
         -0.5 is considered significant.
  */
  float energy;
  __m128 r_n, r_h, r_c, r_o, r_ho, r_nc, r_hc, r_no, d2_honchcno;
  __m128 coupling;

  // 332 (kcal*A/mol) * 0.42 * 0.2 * (1nm / 10 A)
  // signs pair with the packed distances below: (-ho, -nc, +hc, +no)
  coupling = _mm_setr_ps(-2.7888, -2.7888, 2.7888, 2.7888);
  r_n = load_float3(xyz + 3*nco_indices[3*donor]);          // donor N
  r_h = load_float3(hcoords + 3*donor);                     // donor H
  r_c = load_float3(xyz + 3*nco_indices[3*acceptor + 1]);   // acceptor C
  r_o = load_float3(xyz + 3*nco_indices[3*acceptor + 2]);   // acceptor O

  // pairwise displacement vectors between the four atoms
  r_ho = _mm_sub_ps(r_h, r_o);
  r_hc = _mm_sub_ps(r_h, r_c);
  r_nc = _mm_sub_ps(r_n, r_c);
  r_no = _mm_sub_ps(r_n, r_o);

  // compute all four dot products (each of the squared distances), and then
  // pack them into a single float4 using three shuffles.
  // (dp mask 0xF3 writes each squared distance to lanes 0 and 1 of its
  // result, which the shuffles below gather into one vector)
  d2_honchcno = _mm_shuffle_ps(
      _mm_shuffle_ps(_mm_dp_ps(r_ho, r_ho, 0xF3), _mm_dp_ps(r_nc, r_nc, 0xF3), _MM_SHUFFLE(0,1,0,1)),
      _mm_shuffle_ps(_mm_dp_ps(r_hc, r_hc, 0xF3), _mm_dp_ps(r_no, r_no, 0xF3), _MM_SHUFFLE(0,1,0,1)),
      _MM_SHUFFLE(2,0,2,0));

  // E = coupling . (1/r_ho, 1/r_nc, 1/r_hc, 1/r_no) via the fast
  // reciprocal-sqrt approximation; clamp very short contacts to -9.9
  energy = _mm_cvtss_f32(_mm_dp_ps(coupling, _mm_rsqrt_ps(d2_honchcno), 0xFF));
  return (energy < -9.9f ? -9.9f : energy);
}
/* Softmax over a fixed-point int32 input view, producing float output.
 * Pipeline: (1) dequantize the int32 input into a 64-byte-aligned float
 * scratch buffer using the configured fixed-point fraction, (2) run the
 * templated exp/accumulate kernels (full 8-wide blocks, then a partial
 * block, then a sub-SIMD scalar tail), (3) reduce the vector and scalar
 * accumulators into one total and take its reciprocal, (4) scale every
 * output by 1/sum with the matching finalize kernels. */
void run_softmax_int32_float_work_item_latency(nn_workload_item *const work_item)
{
    nn_workload_data_t *input_view = work_item->input[0]->output;
    const auto &arguments = work_item->arguments.forward_softmax_fixedpoint;

    const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];
    const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1;

    /* decomposition of the output width into full blocks, one partial
     * block of whole SIMD words, and a scalar remainder */
    const auto num_full_blocks = output_width / C_data_stride;
    const auto partial_block_size = (output_width / C_simd_width) % C_max_acc;
    const auto subsimd_block_size = output_width % C_simd_width;

    const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x];
    const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];

    const auto out_fraction = arguments.input_fraction;

    /* 64-byte-aligned scratch for the dequantized input */
    float * input_f = (float*)_mm_malloc(input_width * sizeof(float), 64);

    auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start];

    /* dequantize: value = int / 2^fraction (or * 2^-fraction) */
    auto shift = out_fraction;
    if (shift > 0)
    {
        for (uint32_t i = 0; i < input_width; i++)
            input_f[i] = (float)(input_buffer[i]) / (1 << shift);
    }
    else if (shift < 0)
    {
        for (uint32_t i = 0; i < input_width; i++)
            input_f[i] = (float)(input_buffer[i]) * (1 << -shift);
    }
    else
    {
        for (uint32_t i = 0; i < input_width; i++)
            input_f[i] = (float)(input_buffer[i]);
    }

    __m256 acc_sum = _mm256_setzero_ps();   /* vector sum of exp() terms */
    float subsimd_sum = 0.0f;               /* scalar-tail sum of exp() terms */

    {
        auto input_buffer = input_f;
        auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

        /* full blocks, then the template-dispatched partial/tail kernels;
         * the kernels advance input_buffer/output_buffer as they go */
        for (auto block = 0u; block < num_full_blocks; ++block)
        {
            // Run computation.
            softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum);
        }

        switch (partial_block_size)
        {
        case  0: break;
        case  1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break;
        case  2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break;
        case  3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break;
        case  4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break;
        case  5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break;
        case  6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break;
        case  7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break;
        case  8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break;
        case  9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break;
        case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break;
        case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break;
        case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break;
        case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break;
        case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break;
        default: NN_UNREACHABLE_CODE;
        }

        switch (subsimd_block_size)
        {
        case 0: break;
        case 1: softmax_compute_subsimd<1>(input_buffer, output_buffer, subsimd_sum); break;
        case 2: softmax_compute_subsimd<2>(input_buffer, output_buffer, subsimd_sum); break;
        case 3: softmax_compute_subsimd<3>(input_buffer, output_buffer, subsimd_sum); break;
        case 4: softmax_compute_subsimd<4>(input_buffer, output_buffer, subsimd_sum); break;
        case 5: softmax_compute_subsimd<5>(input_buffer, output_buffer, subsimd_sum); break;
        case 6: softmax_compute_subsimd<6>(input_buffer, output_buffer, subsimd_sum); break;
        case 7: softmax_compute_subsimd<7>(input_buffer, output_buffer, subsimd_sum); break;
        default: NN_UNREACHABLE_CODE;
        }
    }

    {
        /* hadd + cross-lane permute + two hadds spread the total of all 8
         * lanes into every lane; add the scalar-tail sum, then form the
         * reciprocal in both vector and scalar shape for the finalizers */
        __m256 intermediate_sum = _mm256_hadd_ps(acc_sum, acc_sum);
        intermediate_sum = _mm256_permutevar8x32_ps(intermediate_sum, _mm256_set_epi32(0, 1, 4, 5, 2, 3, 6, 7));
        intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);
        intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);

        acc_sum = _mm256_add_ps(intermediate_sum, _mm256_set1_ps(subsimd_sum));
        subsimd_sum = _mm_cvtss_f32(_mm256_extractf128_ps(acc_sum, 0));

        acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum);
        subsimd_sum = 1.0f / subsimd_sum;
    }

    {
        auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

        /* second pass: scale each stored exp() value by 1/sum, mirroring
         * the block decomposition of the first pass */
        for (auto block = 0u; block < num_full_blocks; ++block)
        {
            // Run computation.
            softmax_finalize_block<C_max_acc>(output_buffer, acc_sum);
        }

        switch (partial_block_size)
        {
        case  0: break;
        case  1: softmax_finalize_block< 1>(output_buffer, acc_sum); break;
        case  2: softmax_finalize_block< 2>(output_buffer, acc_sum); break;
        case  3: softmax_finalize_block< 3>(output_buffer, acc_sum); break;
        case  4: softmax_finalize_block< 4>(output_buffer, acc_sum); break;
        case  5: softmax_finalize_block< 5>(output_buffer, acc_sum); break;
        case  6: softmax_finalize_block< 6>(output_buffer, acc_sum); break;
        case  7: softmax_finalize_block< 7>(output_buffer, acc_sum); break;
        case  8: softmax_finalize_block< 8>(output_buffer, acc_sum); break;
        case  9: softmax_finalize_block< 9>(output_buffer, acc_sum); break;
        case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break;
        case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break;
        case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break;
        case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break;
        case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break;
        default: NN_UNREACHABLE_CODE;
        }

        switch (subsimd_block_size)
        {
        case 0: break;
        case 1: softmax_finalize_subsimd<1>(output_buffer, subsimd_sum); break;
        case 2: softmax_finalize_subsimd<2>(output_buffer, subsimd_sum); break;
        case 3: softmax_finalize_subsimd<3>(output_buffer, subsimd_sum); break;
        case 4: softmax_finalize_subsimd<4>(output_buffer, subsimd_sum); break;
        case 5: softmax_finalize_subsimd<5>(output_buffer, subsimd_sum); break;
        case 6: softmax_finalize_subsimd<6>(output_buffer, subsimd_sum); break;
        case 7: softmax_finalize_subsimd<7>(output_buffer, subsimd_sum); break;
        default: NN_UNREACHABLE_CODE;
        }
    }

    _mm_free(input_f);
}
/* Blocked, OpenMP-parallel SGEMM. The inputs are repacked/padded to
 * STEPM/STEPN multiples by transposeA/transposeB into fresh buffers, the
 * product is computed into a padded scratch matrix in 4x4 output tiles
 * (each tile element is a 4-wide dot-product accumulator reduced with two
 * hadds), and `move` copies the unpadded result back into the caller's C.
 *
 * NOTE(review): malloc results are not checked, and the exact operand
 * layout (row-major vs transposed) depends on transposeA/transposeB/move
 * defined elsewhere -- confirm against those helpers. */
void sgemm( int m_a, int n_a, float *A, float *B, float *C )
{
    /* round dimensions up to the blocking factors */
    int mpad = m_a%STEPM ? (m_a/STEPM+1)*STEPM:m_a;
    int npad = n_a%STEPN ? (n_a/STEPN+1)*STEPN:n_a;
    int mbackup = m_a;

    /* repack A and B into padded scratch buffers */
    float* Apad=malloc(mpad*npad*sizeof(float));
    transposeA(n_a, m_a, npad, mpad, Apad, A);
    A=Apad;
    float* Bpad=malloc(npad*mpad*sizeof(float));
    transposeB(n_a, m_a, npad, mpad, Bpad, B);
    B=Bpad;

    /* compute into a padded C, copy back at the end */
    float* Cpad = malloc(mpad*mpad*sizeof(float));
    float* backup = C;
    C = Cpad;
    m_a = mpad;
    n_a = npad;

#pragma omp parallel
    {
        __m128 a1, a2, a3, a4, c0;
        __m128 c11, c12, c13, c14;
        __m128 c21, c22, c23, c24;
        __m128 c31, c32, c33, c34;
        __m128 c41, c42, c43, c44;
        __m128 b1, b2, b3, b4;
        /* temp0..temp8 are unused leftovers from an earlier version */
        float temp0,temp1,temp2,temp3,temp4, temp5, temp6, temp7, temp8;
        int ii=0;
        int jj=0;
        int kk=0;
        int kkma=0;
        int jjna=0;
        int jjma=0;
        int iina=0;

#pragma omp for
        for( int j = 0; j < m_a; j+=4 )
        {
            jj=j;
            jjma=jj*m_a;
            jjna=jj*n_a;
            for( int i = 0; i < m_a; i+=4 )
            {
                ii=i;
                iina=ii*n_a;
                /* cXY accumulates the (row i+X-1, col j+Y-1) output element */
                c31=c32=c33=c34=c41=c42=c43=c44=c11=c12=c13=c14=c21=c22=c23=c24 = _mm_setzero_ps();
                for( int k = 0; k < n_a; k+=4 )
                {
                    float* tempA=A+k+iina;
                    float* tempB=B+k+jjna;
                    /* four consecutive rows of each packed operand */
                    b1 = _mm_loadu_ps(tempB);
                    b2 = _mm_loadu_ps(tempB+n_a);
                    b3 = _mm_loadu_ps(tempB+2*n_a);
                    b4 = _mm_loadu_ps(tempB+3*n_a);
                    a1 = _mm_loadu_ps(tempA);
                    a2 = _mm_loadu_ps(tempA+n_a);
                    a3 = _mm_loadu_ps(tempA+n_a*2);
                    a4 = _mm_loadu_ps(tempA+n_a*3);
                    /* 16 multiply-accumulates: every aX against every bY */
                    c11=_mm_add_ps(c11, _mm_mul_ps(a1, b1));
                    c21 = _mm_add_ps(c21, _mm_mul_ps(a2, b1));
                    c12 = _mm_add_ps(c12, _mm_mul_ps(a1, b2));
                    c22 = _mm_add_ps(c22, _mm_mul_ps(a2, b2));
                    c13= _mm_add_ps(c13, _mm_mul_ps(a1, b3));
                    c23 = _mm_add_ps(c23, _mm_mul_ps(a2, b3));
                    c14 = _mm_add_ps(c14, _mm_mul_ps(a1, b4));
                    c24 = _mm_add_ps(c24, _mm_mul_ps(a2, b4));
                    c31=_mm_add_ps(c31, _mm_mul_ps(a3, b1));
                    c41 = _mm_add_ps(c41, _mm_mul_ps(a4, b1));
                    c32 = _mm_add_ps(c32, _mm_mul_ps(a3, b2));
                    c42 = _mm_add_ps(c42, _mm_mul_ps(a4, b2));
                    c33= _mm_add_ps(c33, _mm_mul_ps(a3, b3));
                    c43 = _mm_add_ps(c43, _mm_mul_ps(a4, b3));
                    c34 = _mm_add_ps(c34, _mm_mul_ps(a3, b4));
                    c44 = _mm_add_ps(c44, _mm_mul_ps(a4, b4));
                }
                /* reduce each accumulator's four lanes (two hadds leave the
                 * total in lane 0) and scatter the 4x4 tile into C */
                c0= _mm_hadd_ps(c11,c11); c0= _mm_hadd_ps(c0,c0); C[ii+jjma] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c12,c12); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+m_a] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c13,c13); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+m_a*2] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c14,c14); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+m_a*3] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c21,c21); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c22,c22); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1+m_a] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c23,c23); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1+m_a*2] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c24,c24); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+1+m_a*3] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c31,c31); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c32,c32); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2+m_a] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c33,c33); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2+m_a*2] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c34,c34); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+2+m_a*3] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c41,c41); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c42,c42); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3+m_a] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c43,c43); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3+m_a*2] = _mm_cvtss_f32(c0);
                c0= _mm_hadd_ps(c44,c44); c0= _mm_hadd_ps(c0,c0); C[ii+jjma+3+m_a*3] = _mm_cvtss_f32(c0);
            }
        }
    }

    /* copy the unpadded region back to the caller's C and free scratch */
    move(mbackup, mpad, C, backup);
    free(A);
    free(B);
    free(C);
}
/*!
 * \brief Perform an horizontal sum of the given vector.
 * \param in The input vector type
 * \return the horizontal sum of the vector
 */
ETL_STATIC_INLINE(float) hadd(avx_simd_float in) {
    // fold the high 128-bit lane onto the low lane, then reduce the four
    // remaining lanes with a movehl add followed by a shuffle add
    const __m128 quad = _mm_add_ps(_mm256_extractf128_ps(in.value, 1),
                                   _mm256_castps256_ps128(in.value));
    const __m128 dual = _mm_add_ps(quad, _mm_movehl_ps(quad, quad));
    return _mm_cvtss_f32(_mm_add_ss(dual, _mm_shuffle_ps(dual, dual, 0x55)));
}
/* Fast approximate 1/sqrt(x) via the hardware RSQRTPS estimate
 * (relative error at most 1.5 * 2^-12 per Intel's documentation). */
static inline float rsqrt_fast(const float x)
{
    return _mm_cvtss_f32(_mm_rsqrt_ps(_mm_set_ss(x)));
}