__m128 PowSSE_FixedPoint_Exponent(__m128 x, int exponent)
{
    __m128 rslt = Four_Ones;                    // x^0 = 1.0
    int xp = abs(exponent);
    if (xp & 3)                                 // fraction present?
    {
        __m128 sq_rt = _mm_sqrt_ps(x);
        if (xp & 1)                             // .25?
            rslt = _mm_sqrt_ps(sq_rt);          // x^.25
        if (xp & 2)
            rslt = _mm_mul_ps(rslt, sq_rt);
    }
    xp >>= 2;                                   // strip fraction
    __m128 curpower = x;                        // curpower iterates through x, x^2, x^4, x^8, x^16...
    while (1)
    {
        if (xp & 1)
            rslt = _mm_mul_ps(rslt, curpower);
        xp >>= 1;
        if (xp)
            curpower = _mm_mul_ps(curpower, curpower);
        else
            break;
    }
    if (exponent < 0)
        return _mm_rcp_ps(rslt);                // pow(x, -b) = 1 / pow(x, b)
    else
        return rslt;
}
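/* A minimal harness sketch for the function above (hypothetical, not part of
   the original source): the exponent is fixed point with two fractional bits,
   so passing 7 computes x^(7/4) per lane. Assumes Four_Ones is a global
   __m128 of four 1.0f lanes and <stdlib.h> is available for abs(). */
#include <stdio.h>
#include <xmmintrin.h>

int main(void)
{
    __m128 x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    __m128 r = PowSSE_FixedPoint_Exponent(x, 7);   /* x^1.75 per lane */
    float out[4];
    _mm_storeu_ps(out, r);
    printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
    return 0;
}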
__m128 t4(__m128 a, __m128 b)
{
    a = _mm_sqrt_ps(a);
    b = _mm_sqrt_ps(b);
    return _mm_xor_ps(a, b);
}
__m128 t2(__m128 a, __m128 b)
{
    a = _mm_sqrt_ps(a);
    b = _mm_sqrt_ps(b);
    return _mm_andnot_ps(a, b);
}
void init_xrpow_core_sse(gr_info * const cod_info, FLOAT xrpow[576], int upper, FLOAT * sum)
{
    int i;
    float tmp_max = 0;
    float tmp_sum = 0;
    int upper4 = (upper / 4) * 4;
    int rest = upper - upper4;

    const vecfloat_union fabs_mask = {{ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF }};
    const __m128 vec_fabs_mask = _mm_loadu_ps(&fabs_mask._float[0]);
    vecfloat_union vec_xrpow_max;
    vecfloat_union vec_sum;
    vecfloat_union vec_tmp;

    _mm_prefetch((char *) cod_info->xr, _MM_HINT_T0);
    _mm_prefetch((char *) xrpow, _MM_HINT_T0);

    vec_xrpow_max._m128 = _mm_set_ps1(0);
    vec_sum._m128 = _mm_set_ps1(0);

    for (i = 0; i < upper4; i += 4) {
        vec_tmp._m128 = _mm_loadu_ps(&(cod_info->xr[i]));         /* load */
        vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
        vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
        vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
        vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
        _mm_storeu_ps(&(xrpow[i]), vec_tmp._m128);                /* store into xrpow[] */
    }

    vec_tmp._m128 = _mm_set_ps1(0);
    switch (rest) {
    case 3: vec_tmp._float[2] = cod_info->xr[upper4 + 2];
    case 2: vec_tmp._float[1] = cod_info->xr[upper4 + 1];
    case 1:
        vec_tmp._float[0] = cod_info->xr[upper4 + 0];
        vec_tmp._m128 = _mm_and_ps(vec_tmp._m128, vec_fabs_mask); /* fabs */
        vec_sum._m128 = _mm_add_ps(vec_sum._m128, vec_tmp._m128);
        vec_tmp._m128 = _mm_sqrt_ps(_mm_mul_ps(vec_tmp._m128, _mm_sqrt_ps(vec_tmp._m128)));
        vec_xrpow_max._m128 = _mm_max_ps(vec_xrpow_max._m128, vec_tmp._m128); /* retrieve max */
        switch (rest) {
        case 3: xrpow[upper4 + 2] = vec_tmp._float[2];
        case 2: xrpow[upper4 + 1] = vec_tmp._float[1];
        case 1: xrpow[upper4 + 0] = vec_tmp._float[0];
        default: break;
        }
    default:
        break;
    }

    tmp_sum = vec_sum._float[0] + vec_sum._float[1] + vec_sum._float[2] + vec_sum._float[3];
    {
        float ma = vec_xrpow_max._float[0] > vec_xrpow_max._float[1]
                 ? vec_xrpow_max._float[0] : vec_xrpow_max._float[1];
        float mb = vec_xrpow_max._float[2] > vec_xrpow_max._float[3]
                 ? vec_xrpow_max._float[2] : vec_xrpow_max._float[3];
        tmp_max = ma > mb ? ma : mb;
    }
    cod_info->xrpow_max = tmp_max;
    *sum = tmp_sum;
}
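/* Scalar equivalence sketch for the kernel above (xrpow_ref is a
   hypothetical name, not from the original source): each output is
   |xr|^(3/4), computed as sqrt(|xr| * sqrt(|xr|)) because
   (x * x^(1/2))^(1/2) = x^(3/4). */
#include <math.h>
static inline float xrpow_ref(float xr)
{
    float a = fabsf(xr);
    return sqrtf(a * sqrtf(a));   /* == powf(a, 0.75f) */
}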
void calculateSSE(int start, int end)
{
    int size = end - start + 1;
    // we use aligned memory, because SSE instructions are really slow
    // on unaligned memory; aligned_alloc also requires the byte count to
    // be a multiple of the alignment, so round it up to 4 floats
    float* result = (float*)aligned_alloc(16, ((size + 3) / 4) * 4 * sizeof(float));

    __m128 x;
    __m128 delta_x = _mm_set_ps1(4.0f);
    __m128 y = _mm_set_ps1(1.0f);
    __m128* sse_result = (__m128*)result;

    const int sse_length = size / 4;
    x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    for (int loop = 0; loop < 100000; ++loop) {
        for (int i = 0; i < sse_length; ++i) {
            __m128 sqrt_result = _mm_sqrt_ps(x);
            sse_result[i] = _mm_div_ps(sqrt_result, x);
            //sse_result[i] = _mm_add_ps(x, y);

            // move x value to next 4 numbers
            x = _mm_add_ps(x, delta_x);
        }
    }
    free(result);   // memory from aligned_alloc is released with free()
}
inline void Sphere::intersect_sse(__m128 &rox, __m128 &roy, __m128 &roz,
                                  __m128 &rdx, __m128 &rdy, __m128 &rdz,
                                  __m128 &t, __m128 &hit_res)
{
    __m128 dist_x = _mm_sub_ps(_mm_set1_ps(c.x), rox);
    __m128 dist_y = _mm_sub_ps(_mm_set1_ps(c.y), roy);
    __m128 dist_z = _mm_sub_ps(_mm_set1_ps(c.z), roz);
    __m128 B = dot_sse(rdx, rdy, rdz, dist_x, dist_y, dist_z);
    __m128 D = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(B, B),
                                     dot_sse(dist_x, dist_y, dist_z,
                                             dist_x, dist_y, dist_z)),
                          _mm_set_ps1(r * r));
    __m128 sq = _mm_sqrt_ps(D);
    __m128 t0 = _mm_sub_ps(B, sq);
    __m128 t1 = _mm_add_ps(B, sq);
    /* _mm_set1_epi32 yields an __m128i, so cast it before storing the
       integer MISS pattern in the float-typed result register */
    hit_res = _mm_castsi128_ps(_mm_set1_epi32(MISS));
    /* per-lane scalar fix-up (relies on GCC/Clang vector subscripting) */
    for (int i = 0; i < 4; ++i) {
        if ((t0[i] > 0.1f) && (t0[i] < t[i])) {
            t[i] = t0[i];
            hit_res[i] = HIT;
        }
    }
    for (int i = 0; i < 4; ++i) {
        if ((t1[i] > 0.1f) && (t1[i] < t[i])) {
            t[i] = t1[i];
            hit_res[i] = HIT;
        }
    }
}
void NBodyAlgorithm::calculateAcceleration(const float3(&posI)[4], const float massJ, const float3 posJ,
                                           __m128 accIx, __m128 accIy, __m128 accIz, float *accI)
{
    __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x);
    __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y);
    __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z);

    __m128 pjx = _mm_set_ps1(posJ.x);
    __m128 pjy = _mm_set_ps1(posJ.y);
    __m128 pjz = _mm_set_ps1(posJ.z);

    __m128 rx = _mm_sub_ps(pjx, pix);
    __m128 ry = _mm_sub_ps(pjy, piy);
    __m128 rz = _mm_sub_ps(pjz, piz);

    __m128 eps2 = _mm_set_ps1(mp_properties->eps2);

    __m128 rx2 = _mm_mul_ps(rx, rx);
    __m128 ry2 = _mm_mul_ps(ry, ry);
    __m128 rz2 = _mm_mul_ps(rz, rz);
    __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2)));

    __m128 m = _mm_set_ps1(massJ);
    __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs));

    __m128 aix = _mm_mul_ps(rx, rabsInv);
    __m128 aiy = _mm_mul_ps(ry, rabsInv);
    __m128 aiz = _mm_mul_ps(rz, rabsInv);

    accIx = _mm_add_ps(accIx, aix);
    accIy = _mm_add_ps(accIy, aiy);
    accIz = _mm_add_ps(accIz, aiz);

    _mm_storer_ps(accI, accIx);
    _mm_storer_ps(accI + 4, accIy);
    _mm_storer_ps(accI + 8, accIz);
}
float vec3::length() const
{
    __m128 temp = _mm_mul_ps(v, v);
    __m128 temp2 = _mm_shuffle_ps(temp, temp, 0xFD);
    temp2 = _mm_add_ps(temp, temp2);
    temp2 = _mm_add_ps(temp2, _mm_shuffle_ps(temp, temp, 0xFE));
    return _mm_cvtss_f32(_mm_sqrt_ps(temp2));
}
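/* Equivalent sketch with the magic masks spelled out via _MM_SHUFFLE (a
   hypothetical standalone helper, not the original method): lane 0 of a
   shuffle result selects the lane named by the macro's last argument, so
   _MM_SHUFFLE(3,3,3,1) == 0xFD pulls y*y into lane 0 and
   _MM_SHUFFLE(3,3,3,2) == 0xFE pulls z*z, leaving x*x + y*y + z*z in lane 0
   before the sqrt. Assumes the w lane of v is unused. */
#include <xmmintrin.h>
static inline float vec3_length_sse(__m128 v)
{
    __m128 sq = _mm_mul_ps(v, v);
    __m128 s  = _mm_add_ps(sq, _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(3, 3, 3, 1)));
    s = _mm_add_ps(s, _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(3, 3, 3, 2)));
    return _mm_cvtss_f32(_mm_sqrt_ps(s));
}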
void NBodyAlgorithm::calculateAcceleration(const float3(&posI)[4], const float massJ, const float3 posJ,
                                           float3(&accI)[4])
{
    __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x);
    __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y);
    __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z);

    __m128 pjx = _mm_set_ps1(posJ.x);
    __m128 pjy = _mm_set_ps1(posJ.y);
    __m128 pjz = _mm_set_ps1(posJ.z);

    __m128 rx = _mm_sub_ps(pjx, pix);
    __m128 ry = _mm_sub_ps(pjy, piy);
    __m128 rz = _mm_sub_ps(pjz, piz);

    __m128 eps2 = _mm_set_ps1(mp_properties->eps2);

    __m128 rx2 = _mm_mul_ps(rx, rx);
    __m128 ry2 = _mm_mul_ps(ry, ry);
    __m128 rz2 = _mm_mul_ps(rz, rz);
    __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2)));

    __m128 m = _mm_set_ps1(massJ);
    __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs));

    __m128 aix = _mm_mul_ps(rx, rabsInv);
    __m128 aiy = _mm_mul_ps(ry, rabsInv);
    __m128 aiz = _mm_mul_ps(rz, rabsInv);

    for (int i = 0; i < 4; i++) {
        accI[3 - i].x = aix.m128_f32[i];
        accI[3 - i].y = aiy.m128_f32[i];
        accI[3 - i].z = aiz.m128_f32[i];
    }
}
void mandel_sse2(unsigned char *image, const struct spec *s)
{
    __m128 xmin = _mm_set_ps1(s->xlim[0]);
    __m128 ymin = _mm_set_ps1(s->ylim[0]);
    __m128 xscale = _mm_set_ps1((s->xlim[1] - s->xlim[0]) / s->width);
    __m128 yscale = _mm_set_ps1((s->ylim[1] - s->ylim[0]) / s->height);
    __m128 threshold = _mm_set_ps1(4);
    __m128 one = _mm_set_ps1(1);
    __m128i zero = _mm_setzero_si128();
    __m128 iter_scale = _mm_set_ps1(1.0f / s->iterations);
    __m128 depth_scale = _mm_set_ps1(s->depth - 1);
    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 4) {
            __m128 mx = _mm_set_ps(x + 3, x + 2, x + 1, x + 0);
            __m128 my = _mm_set_ps1(y);
            __m128 cr = _mm_add_ps(_mm_mul_ps(mx, xscale), xmin);
            __m128 ci = _mm_add_ps(_mm_mul_ps(my, yscale), ymin);
            __m128 zr = cr;
            __m128 zi = ci;
            int k = 1;
            __m128 mk = _mm_set_ps1(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m128 zr2 = _mm_mul_ps(zr, zr);
                __m128 zi2 = _mm_mul_ps(zi, zi);
                __m128 zrzi = _mm_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm_add_ps(_mm_sub_ps(zr2, zi2), cr);
                zi = _mm_add_ps(_mm_add_ps(zrzi, zrzi), ci);
                /* Increment k */
                zr2 = _mm_mul_ps(zr, zr);
                zi2 = _mm_mul_ps(zi, zi);
                __m128 mag2 = _mm_add_ps(zr2, zi2);
                __m128 mask = _mm_cmplt_ps(mag2, threshold);
                mk = _mm_add_ps(_mm_and_ps(mask, one), mk);
                /* Early bailout? */
                __m128i maski = _mm_castps_si128(mask);
                if (0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(maski, zero)))
                    break;
            }
            mk = _mm_mul_ps(mk, iter_scale);
            mk = _mm_sqrt_ps(mk);
            mk = _mm_mul_ps(mk, depth_scale);
            __m128i pixels = _mm_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 4; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
inline void operator()(const IrradianceSample &sample)
{
    /* Distance to the positive point source of the dipole */
    const __m128
        lengthSquared = _mm_set1_ps((p - sample.p).lengthSquared()),
        drSqr = _mm_add_ps(zrSqr, lengthSquared),
        dvSqr = _mm_add_ps(zvSqr, lengthSquared),
        dr = _mm_sqrt_ps(drSqr),
        dv = _mm_sqrt_ps(dvSqr),
        one = _mm_set1_ps(1.0f),
        factor = _mm_mul_ps(_mm_set1_ps(0.25f * INV_PI * sample.area * Fdt),
                            _mm_set_ps(sample.E[0], sample.E[1], sample.E[2], 0)),
        C1fac = _mm_div_ps(_mm_mul_ps(zr, _mm_add_ps(sigmaTr, _mm_div_ps(one, dr))), drSqr),
        C2fac = _mm_div_ps(_mm_mul_ps(zv, _mm_add_ps(sigmaTr, _mm_div_ps(one, dv))), dvSqr);
    SSEVector temp1(_mm_mul_ps(dr, sigmaTr)), temp2(_mm_mul_ps(dv, sigmaTr));
    const __m128
        exp1 = _mm_set_ps(expf(-temp1.f[3]), expf(-temp1.f[2]), expf(-temp1.f[1]), 0),
        exp2 = _mm_set_ps(expf(-temp2.f[3]), expf(-temp2.f[2]), expf(-temp2.f[1]), 0);
    result.ps = _mm_add_ps(result.ps, _mm_mul_ps(factor,
        _mm_add_ps(_mm_mul_ps(C1fac, exp1), _mm_mul_ps(C2fac, exp2))));
}
/* Sqrt */
__SIMD _SIMD_sqrt_ps(__SIMD a)
{
#ifdef USE_SSE
    return _mm_sqrt_ps(a);
#elif defined USE_AVX
    return _mm256_sqrt_ps(a);
#elif defined USE_IBM
    return vec_sqrt(a);
#else
#error "no SIMD backend selected"  /* otherwise the function falls off the end */
#endif
}
SIMDValue SIMDFloat32x4Operation::OpSqrt(const SIMDValue& value)
{
    X86SIMDValue x86Result;
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);
    x86Result.m128_value = _mm_sqrt_ps(v.m128_value);   // result = sqrt(value)
    return X86SIMDValue::ToSIMDValue(x86Result);
}
static void ScaleErrorSignalSSE2(aec_t *aec, float ef[2][PART_LEN1])
{
    const __m128 k1e_10f = _mm_set1_ps(1e-10f);
    const __m128 kThresh = _mm_set1_ps(aec->errThresh);
    const __m128 kMu = _mm_set1_ps(aec->mu);
    int i;
    // vectorized code (four at once)
    for (i = 0; i + 3 < PART_LEN1; i += 4) {
        const __m128 xPow = _mm_loadu_ps(&aec->xPow[i]);
        const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]);
        const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]);
        const __m128 xPowPlus = _mm_add_ps(xPow, k1e_10f);
        __m128 ef_re = _mm_div_ps(ef_re_base, xPowPlus);
        __m128 ef_im = _mm_div_ps(ef_im_base, xPowPlus);
        const __m128 ef_re2 = _mm_mul_ps(ef_re, ef_re);
        const __m128 ef_im2 = _mm_mul_ps(ef_im, ef_im);
        const __m128 ef_sum2 = _mm_add_ps(ef_re2, ef_im2);
        const __m128 absEf = _mm_sqrt_ps(ef_sum2);
        const __m128 bigger = _mm_cmpgt_ps(absEf, kThresh);
        __m128 absEfPlus = _mm_add_ps(absEf, k1e_10f);
        const __m128 absEfInv = _mm_div_ps(kThresh, absEfPlus);
        __m128 ef_re_if = _mm_mul_ps(ef_re, absEfInv);
        __m128 ef_im_if = _mm_mul_ps(ef_im, absEfInv);
        ef_re_if = _mm_and_ps(bigger, ef_re_if);
        ef_im_if = _mm_and_ps(bigger, ef_im_if);
        ef_re = _mm_andnot_ps(bigger, ef_re);
        ef_im = _mm_andnot_ps(bigger, ef_im);
        ef_re = _mm_or_ps(ef_re, ef_re_if);
        ef_im = _mm_or_ps(ef_im, ef_im_if);
        ef_re = _mm_mul_ps(ef_re, kMu);
        ef_im = _mm_mul_ps(ef_im, kMu);
        _mm_storeu_ps(&ef[0][i], ef_re);
        _mm_storeu_ps(&ef[1][i], ef_im);
    }
    // scalar code for the remaining items.
    for (; i < (PART_LEN1); i++) {
        float absEf;
        ef[0][i] /= (aec->xPow[i] + 1e-10f);
        ef[1][i] /= (aec->xPow[i] + 1e-10f);
        absEf = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
        if (absEf > aec->errThresh) {
            absEf = aec->errThresh / (absEf + 1e-10f);
            ef[0][i] *= absEf;
            ef[1][i] *= absEf;
        }
        // Stepsize factor
        ef[0][i] *= aec->mu;
        ef[1][i] *= aec->mu;
    }
}
SIMDValue SIMDFloat32x4Operation::OpReciprocalSqrt(const SIMDValue& value)
{
    X86SIMDValue x86Result;
    X86SIMDValue temp;
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);
    temp.m128_value = _mm_div_ps(X86_ALL_ONES_F4.m128_value, v.m128_value); // temp = 1.0/value
    x86Result.m128_value = _mm_sqrt_ps(temp.m128_value);                    // result = sqrt(1.0/value)
    return X86SIMDValue::ToSIMDValue(x86Result);
}
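/* Alternative sketch (not the implementation above): for positive inputs,
   sqrt(1/v) equals 1/sqrt(v), so when full precision is not required the
   exact div+sqrt pair can be traded for the approximate _mm_rsqrt_ps plus
   one Newton-Raphson step (roughly 22-bit accuracy). */
static inline __m128 rsqrt_nr_ps(__m128 v)
{
    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);
    __m128 r = _mm_rsqrt_ps(v);                 /* ~12-bit estimate */
    /* refine: r' = 0.5 * r * (3 - v * r * r) */
    return _mm_mul_ps(_mm_mul_ps(half, r),
                      _mm_sub_ps(three, _mm_mul_ps(v, _mm_mul_ps(r, r))));
}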
static inline __v4sf
sse_pow_1_24 (__v4sf x)
{
    __v4sf y, z;
    y = sse_init_newton (x, -1./12, 0.9976800269, 0.9885126933, 0.5908575383);
    x = _mm_sqrt_ps (x);
    /* newton's method for x^(-1/6) */
    z = splat4f (1.f/6.f) * x;
    y = splat4f (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y));
    y = splat4f (7.f/6.f) * y - z * ((y*y)*(y*y)*(y*y*y));
    return x*y;
}
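/* Why this yields x^(1/2.4) (a sketch of the math, inferred from the code):
   y starts as an estimate of the original x^(-1/12); after x is replaced by
   sqrt(x), each Newton step y' = (7/6)y - (x/6)y^7 refines y toward the new
   x^(-1/6), which is the same quantity. The product is then
   sqrt(x) * x^(-1/12) = x^(1/2 - 1/12) = x^(5/12) = x^(1/2.4),
   the exponent used for sRGB-style gamma. */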
float length() const
{
    Vec3 a = *this;
    a.w = 0.0f;
    __m128 &D = a.m128;
    D = _mm_mul_ps(D, D);
    D = _mm_hadd_ps(D, D);
    D = _mm_hadd_ps(D, D);
    D = _mm_sqrt_ps(D);
    return a.x;
}
/**
 * Identify bends in the chain, where the kappa angle (virtual bond angle from
 * c-alpha i-2, to i, to i+2) is greater than 70 degrees
 * dssp-2.2.0/structure.cpp:1729
 */
static std::vector<int> calculate_bends(const float* xyz, const int* ca_indices,
    const int* chain_ids, const int n_residues, std::vector<int>& skip)
{
    __m128 prev_ca, this_ca, next_ca, u_prime, v_prime, u, v;
    float kappa;
    std::vector<int> is_bend(n_residues, 0);
    for (int i = 2; i < n_residues - 2; i++) {
        if (chain_ids[i-2] == chain_ids[i+2] && !skip[i-2] && !skip[i] && !skip[i+2]) {
            prev_ca = load_float3(xyz + 3*ca_indices[i-2]);
            this_ca = load_float3(xyz + 3*ca_indices[i]);
            next_ca = load_float3(xyz + 3*ca_indices[i+2]);
            u_prime = _mm_sub_ps(prev_ca, this_ca);
            v_prime = _mm_sub_ps(this_ca, next_ca);

            /* normalize the vectors u_prime and v_prime */
            u = _mm_div_ps(u_prime, _mm_sqrt_ps(_mm_dp_ps2(u_prime, u_prime, 0x7F)));
            v = _mm_div_ps(v_prime, _mm_sqrt_ps(_mm_dp_ps2(v_prime, v_prime, 0x7F)));

            /* compute the arccos of the dot product. this gives the angle */
            kappa = (float) acos(CLIP(_mm_cvtss_f32(_mm_dp_ps2(u, v, 0x71)), -1, 1));
            is_bend[i] = kappa > (70 * (M_PI / 180.0));
        }
    }
    return is_bend;
}
/* the fast arctan function adopted from OpenCV */
static void _ccv_atan2(float* x, float* y, float* angle, float* mag, int len)
{
    int i = 0;
    float scale = (float)(180.0 / CCV_PI);
#ifdef HAVE_SSE2
#ifndef _WIN32
    union { int i; float fl; } iabsmask;
    iabsmask.i = 0x7fffffff;
    __m128 eps = _mm_set1_ps((float)1e-6), absmask = _mm_set1_ps(iabsmask.fl);
    __m128 _90 = _mm_set1_ps((float)(3.141592654 * 0.5)),
           _180 = _mm_set1_ps((float)3.141592654),
           _360 = _mm_set1_ps((float)(3.141592654 * 2));
    __m128 zero = _mm_setzero_ps(), _0_28 = _mm_set1_ps(0.28f), scale4 = _mm_set1_ps(scale);
    for (; i <= len - 4; i += 4) {
        __m128 x4 = _mm_loadu_ps(x + i), y4 = _mm_loadu_ps(y + i);
        __m128 xq4 = _mm_mul_ps(x4, x4), yq4 = _mm_mul_ps(y4, y4);
        __m128 xly = _mm_cmplt_ps(xq4, yq4);
        __m128 z4 = _mm_div_ps(_mm_mul_ps(x4, y4),
                               _mm_add_ps(_mm_add_ps(_mm_max_ps(xq4, yq4),
                                                     _mm_mul_ps(_mm_min_ps(xq4, yq4), _0_28)), eps));
        // a4 <- x < y ? 90 : 0;
        __m128 a4 = _mm_and_ps(xly, _90);
        // a4 <- (y < 0 ? 360 - a4 : a4) == ((x < y ? y < 0 ? 270 : 90) : (y < 0 ? 360 : 0))
        __m128 mask = _mm_cmplt_ps(y4, zero);
        a4 = _mm_or_ps(_mm_and_ps(_mm_sub_ps(_360, a4), mask), _mm_andnot_ps(mask, a4));
        // a4 <- (x < 0 && !(x < y) ? 180 : a4)
        mask = _mm_andnot_ps(xly, _mm_cmplt_ps(x4, zero));
        a4 = _mm_or_ps(_mm_and_ps(_180, mask), _mm_andnot_ps(mask, a4));
        // a4 <- (x < y ? a4 - z4 : a4 + z4)
        a4 = _mm_mul_ps(_mm_add_ps(_mm_xor_ps(z4, _mm_andnot_ps(absmask, xly)), a4), scale4);
        __m128 m4 = _mm_sqrt_ps(_mm_add_ps(xq4, yq4));
        _mm_storeu_ps(angle + i, a4);
        _mm_storeu_ps(mag + i, m4);
    }
#endif
#endif
    for (; i < len; i++) {
        float xf = x[i], yf = y[i];
        float a, x2 = xf * xf, y2 = yf * yf;
        if (y2 <= x2)
            a = xf * yf / (x2 + 0.28f * y2 + (float)1e-6)
              + (float)(xf < 0 ? CCV_PI : yf >= 0 ? 0 : CCV_PI * 2);
        else
            a = (float)(yf >= 0 ? CCV_PI * 0.5 : CCV_PI * 1.5)
              - xf * yf / (y2 + 0.28f * x2 + (float)1e-6);
        angle[i] = a * scale;
        mag[i] = sqrtf(x2 + y2);
    }
}
int dihedral(const float* xyz, const int* quartets, float* out,
             const int n_frames, const int n_atoms, const int n_quartets)
{
    /* Compute the angle between sets of four atoms in every frame of xyz.

       Parameters
       ----------
       xyz : array, shape=(n_frames, n_atoms, 3)
           Cartesian coordinates of the atoms in every frame, in contiguous C order.
       quartets : array, shape=(n_quartets, 4)
           The specific quartets of atoms whose angle you want to compute. The
           angle computed will be the torsion around the bond between the
           middle two elements (i.e. ABCD). A 2d array of indices, in C order.
       out : array, shape=(n_frames, n_quartets)
           Array where the angles will be stored, in contiguous C order.

       All of the arrays are assumed to be contiguous. This code will segfault
       if they're not.
    */
    int i, j;
    __m128 x0, x1, x2, x3, b1, b2, b3, c1, c2, p1, p2;

    for (i = 0; i < n_frames; i++) {
        for (j = 0; j < n_quartets; j++) {
            x0 = load_float3(xyz + 3*quartets[4*j + 0]);
            x1 = load_float3(xyz + 3*quartets[4*j + 1]);
            x2 = load_float3(xyz + 3*quartets[4*j + 2]);
            x3 = load_float3(xyz + 3*quartets[4*j + 3]);

            b1 = _mm_sub_ps(x1, x0);
            b2 = _mm_sub_ps(x2, x1);
            b3 = _mm_sub_ps(x3, x2);

            c1 = cross(b2, b3);
            c2 = cross(b1, b2);

            p1 = _mm_mul_ps(_mm_dp_ps(b1, c1, 0x71), _mm_sqrt_ps(_mm_dp_ps(b2, b2, 0x71)));
            p2 = _mm_dp_ps(c1, c2, 0x71);

            *(out++) = atan2(_mm_cvtss_f32(p1), _mm_cvtss_f32(p2));
        }
        xyz += n_atoms*3;
    }
    return 1;
}
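/* The atan2 arguments above follow the standard torsion formula
   theta = atan2(|b2| * (b1 . (b2 x b3)), (b1 x b2) . (b2 x b3)),
   with c1 = b2 x b3 and c2 = b1 x b2. Unlike an acos of the normalized
   plane-normal dot product, this form needs no clamping near +/-1 and
   keeps the sign of the torsion. */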
void sqrt_(register float *dst, register float *src, int w)
{
    register int j = 0;
#if CV_SSE
    if (CPU_SUPPORT_SSE1) {
        __m128 a;
        for (; j < w - 3; j += 4) {
            a = _mm_sqrt_ps(_mm_loadu_ps(src + j));
            _mm_storeu_ps(dst + j, a);
        }
    }
#endif
    for (; j < w; j++)
        dst[j] = sqrt(src[j]);
}
void NBodyAlgorithm::calculateAccelerationWithColor(const float3(&posI)[4], const float massJ, const float3 posJ,
                                                    float3(&accI)[4], unsigned int(&isClose)[4])
{
    __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x);
    __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y);
    __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z);

    __m128 pjx = _mm_set_ps1(posJ.x);
    __m128 pjy = _mm_set_ps1(posJ.y);
    __m128 pjz = _mm_set_ps1(posJ.z);

    __m128 rx = _mm_sub_ps(pjx, pix);
    __m128 ry = _mm_sub_ps(pjy, piy);
    __m128 rz = _mm_sub_ps(pjz, piz);

    __m128 eps2 = _mm_set_ps1(mp_properties->eps2);

    __m128 rx2 = _mm_mul_ps(rx, rx);
    __m128 ry2 = _mm_mul_ps(ry, ry);
    __m128 rz2 = _mm_mul_ps(rz, rz);
    __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2)));

    __m128 cmpDistance = _mm_set_ps1(float(mp_properties->positionScale));
    __m128 close = _mm_cmple_ps(rabs, cmpDistance);

    for (int i = 0; i < 4; i++) {
        if (close.m128_f32[i] == 0) {
            isClose[3 - i] = 0;
        }
    }

    __m128 m = _mm_set_ps1(massJ);
    __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs));

    __m128 aix = _mm_mul_ps(rx, rabsInv);
    __m128 aiy = _mm_mul_ps(ry, rabsInv);
    __m128 aiz = _mm_mul_ps(rz, rabsInv);

    for (int i = 0; i < 4; i++) {
        accI[3 - i].x = aix.m128_f32[i];
        accI[3 - i].y = aiy.m128_f32[i];
        accI[3 - i].z = aiz.m128_f32[i];
    }
}
void fun_sse(float *a, float *b, int n)
{
    int i, k;
    __m128 x, z;
    __m128 *aa = (__m128 *)a;
    __m128 *bb = (__m128 *)b;
    k = n / 4;
    z = _mm_set_ps1(0.5f);
    for (i = 0; i < k; i++) {
        x = _mm_mul_ps(*aa, *aa);
        x = _mm_sqrt_ps(x);      /* sqrt(a*a) == |a| */
        *bb = _mm_add_ps(x, z);  /* b = |a| + 0.5 */
        aa++;
        bb++;
    }
}
template <bool align> SIMD_INLINE void HogDirectionHistograms(const __m128 & dx, const __m128 & dy,
    Buffer & buffer, size_t col)
{
    __m128 bestDot = _mm_setzero_ps();
    __m128i bestIndex = _mm_setzero_si128();
    for (int i = 0; i < buffer.size; ++i) {
        __m128 dot = _mm_add_ps(_mm_mul_ps(dx, buffer.cos[i]), _mm_mul_ps(dy, buffer.sin[i]));
        __m128 mask = _mm_cmpgt_ps(dot, bestDot);
        bestDot = _mm_max_ps(dot, bestDot);
        bestIndex = Combine(_mm_castps_si128(mask), buffer.pos[i], bestIndex);

        dot = _mm_sub_ps(_mm_setzero_ps(), dot);
        mask = _mm_cmpgt_ps(dot, bestDot);
        bestDot = _mm_max_ps(dot, bestDot);
        bestIndex = Combine(_mm_castps_si128(mask), buffer.neg[i], bestIndex);
    }
    Store<align>((__m128i*)(buffer.index + col), bestIndex);
    Sse::Store<align>(buffer.value + col,
        _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(dx, dx), _mm_mul_ps(dy, dy))));
}
double CHellingerKernel<float>::Evaluate(float* x, float* y)
{
#ifndef __SSE4_1__
    float result = 0;
    for (size_t i = 0; i < m_n; i++)
        result += sqrt(x[i]*y[i]);
    return static_cast<double>(result);
#else
    __m128* px = (__m128*)x;
    __m128* py = (__m128*)y;
    float zero = 0;
    __m128 sum = _mm_load1_ps(&zero);
    for (int i = 0; i < m_offset/4; i++) {
        __m128 temp = _mm_mul_ps(px[i], py[i]);
        temp = _mm_sqrt_ps(temp);
        sum = _mm_add_ps(sum, temp);
    }
    float result[4] = {0, 0, 0, 0};
    _mm_storeu_ps(result, sum);
    float fresult = result[0] + result[1] + result[2] + result[3];
    // add offset
    for (size_t i = m_offset; i < m_n; i++)
        fresult += sqrt(x[i]*y[i]);
    return static_cast<double>(fresult);
#endif
}
// --------------------------------------------------------------
vuint32 mandelbrot_simd(vfloat32 a, vfloat32 b, size_t max_iter)
// --------------------------------------------------------------
{
    vuint32 num_iter = _mm_set1_epi32(0);
    vfloat32 zero = _mm_set1_ps(0);
    vfloat32 one = _mm_set1_ps(1);
    vfloat32 two = _mm_set1_ps(2);
    vfloat32 x = _mm_setzero_ps();
    vfloat32 y = _mm_setzero_ps();
    vfloat32 tmp;
    vfloat32 z;

    for (size_t i = 0; i < max_iter; i++) {
        tmp = x;
        x = _mm_add_ps(a, _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)));
        y = _mm_add_ps(b, _mm_mul_ps(_mm_mul_ps(y, tmp), two));
        z = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)));
        num_iter = _mm_add_epi32(num_iter,
                                 _mm_cvtps_epi32(_mm_and_ps(_mm_cmplt_ps(z, two), one)));
    }
    return num_iter;
}
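/* Sketch of a cheaper escape test (a hypothetical helper, not the original
   code): since both sides are nonnegative, |z| < 2 is equivalent to
   |z|^2 < 4, so the _mm_sqrt_ps can be dropped entirely. Assumes the
   vfloat32/vuint32 typedefs used above. */
static inline vuint32 bounded_mask_to_one(vfloat32 x, vfloat32 y)
{
    vfloat32 mag2 = _mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
    vfloat32 in_set = _mm_and_ps(_mm_cmplt_ps(mag2, _mm_set1_ps(4.0f)),
                                 _mm_set1_ps(1.0f));
    return _mm_cvtps_epi32(in_set);
}
/* usage inside the loop, replacing the z/sqrt lines:
   num_iter = _mm_add_epi32(num_iter, bounded_mask_to_one(x, y)); */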
float sumar(float *a)
{
    float sumaF[4] __attribute__((aligned(16)));
    __m128 sumas = _mm_set1_ps(0);   /* aligned accumulator lanes, initialized to 0 */
    __m128 calculo;
    int i;
    __m128 aux;
    for (i = 0; i < 100000; i += 4) {
        aux = _mm_load_ps(&a[i]);
        calculo = _mm_sqrt_ps(aux);              /* square root of the 4 floats in parallel */
        /* squaring was chosen for calculation precision ("pow2");
           _mm_pow2_ps is not a real intrinsic, so square explicitly */
        calculo = _mm_mul_ps(calculo, calculo);
        /* break the loop when the array has no more valid (non-zero) values;
           _mm_compare_ps is not a real intrinsic, so test all four lanes with
           a movemask over a compare-equal against zero */
        if (_mm_movemask_ps(_mm_cmpeq_ps(aux, _mm_setzero_ps())) == 0xF)
            break;
        sumas = _mm_add_ps(sumas, calculo);
    }
    _mm_store_ps(sumaF, sumas);
    return sumaF[0] + sumaF[1] + sumaF[2] + sumaF[3];
}
int test_sqrt()
{
    int Error(0);

#   if GLM_ARCH & GLM_ARCH_SSE2_BIT
    for (float f = 0.1f; f < 30.0f; f += 0.1f)
    {
        float r = _mm_cvtss_f32(_mm_sqrt_ps(_mm_set1_ps(f)));
        float s = std::sqrt(f);
        Error += glm::abs(r - s) < 0.01f ? 0 : 1;
        assert(!Error);
    }
#   endif//GLM_ARCH & GLM_ARCH_SSE2_BIT

    float A = glm::sqrt(10.f);
    glm::vec1 B = glm::sqrt(glm::vec1(10.f));
    glm::vec2 C = glm::sqrt(glm::vec2(10.f));
    glm::vec3 D = glm::sqrt(glm::vec3(10.f));
    glm::vec4 E = glm::sqrt(glm::vec4(10.f));

    return Error;
}
int dist(const float* xyz, const int* pairs, float* distance_out, float* displacement_out,
         const int n_frames, const int n_atoms, const int n_pairs)
{
    /* Compute the distance/displacement between pairs of atoms in every frame
       of xyz.

       Parameters
       ----------
       xyz : array, shape=(n_frames, n_atoms, 3)
           Cartesian coordinates of the atoms in every frame, in contiguous C order.
       pairs : array, shape=(n_pairs, 2)
           The specific pairs of atoms whose distance you want to compute. A 2d
           array of pairs, in C order.
       distance_out : array, shape=(n_frames, n_pairs), optional
           Array where the distances between pairs will be stored, in contiguous
           C order. If NULL is passed in, this return value will not be saved.
       displacement_out : array, shape=(n_frames, n_pairs, 3), optional
           An optional return value: if you'd also like to save the displacement
           vectors between the pairs, you can pass a pointer here. If
           displacement_out is NULL, then this variable will not be saved back
           to memory.

       All of the arrays are assumed to be contiguous. This code will segfault
       if they're not.
    */
    int i, j;
    int store_displacement = displacement_out == NULL ? 0 : 1;
    int store_distance = distance_out == NULL ? 0 : 1;
    __m128 x1, x2, r12, r12_2, s;

    for (i = 0; i < n_frames; i++) {
        for (j = 0; j < n_pairs; j++) {
            // Load the two vectors whose distance we want to compute
            // x1 = xyz[i, pairs[j,0], 0:3]
            // x2 = xyz[i, pairs[j,1], 0:3]
            x1 = load_float3(xyz + 3*pairs[2*j + 0]);
            x2 = load_float3(xyz + 3*pairs[2*j + 1]);

            // r12 = x2 - x1
            r12 = _mm_sub_ps(x2, x1);
            // r12_2 = r12*r12
            r12_2 = _mm_mul_ps(r12, r12);

            if (store_displacement) {
                // store the two lower entries (x,y) in memory
                _mm_storel_pi((__m64*)(displacement_out), r12);
                displacement_out += 2;
                // swap high-low and then store the z entry in the memory
                _mm_store_ss(displacement_out++, _mm_movehl_ps(r12, r12));
            }
            if (store_distance) {
                // horizontal add the components of d2 with
                // two instructions. note: it's critical
                // here that the last entry of x1 and x2 was 0
                // so that d2.w = 0
                s = _mm_hadd_ps(r12_2, r12_2);
                s = _mm_hadd_ps(s, s);
                // sqrt our final answer
                s = _mm_sqrt_ps(s);
                // s now contains our answer in all four elements, because
                // of the way the hadd works. we only want to store one
                // element.
                _mm_store_ss(distance_out++, s);
            }
        }
        // advance to the next frame
        xyz += n_atoms*3;
    }
    return 1;
}
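/* Hypothetical caller sketch, not from the original source: one frame, three
   atoms, two pairs, distances only. Assumes dist is linked together with its
   load_float3 helper (which must zero the fourth lane, as the hadd comment
   above requires); the coordinate array is padded in case the helper reads a
   fourth float past the last atom. */
#include <stdio.h>
int main(void)
{
    float xyz[12]  = { 0, 0, 0,   1, 0, 0,   0, 3, 4,   0, 0, 0 };  /* last triple is padding */
    int   pairs[4] = { 0, 1,   0, 2 };
    float d[2];
    dist(xyz, pairs, d, NULL, 1, 3, 2);
    printf("%f %f\n", d[0], d[1]);   /* expect 1.0 and 5.0 */
    return 0;
}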
int dist_mic(const float* xyz, const int* pairs, const float* box_matrix,
             float* distance_out, float* displacement_out,
             const int n_frames, const int n_atoms, const int n_pairs)
{
    /* Compute the distance/displacement between pairs of atoms in every frame
       of xyz following the minimum image convention in periodic boundary
       conditions.

       The computation follows scheme B.9 in Tuckerman, M. "Statistical
       Mechanics: Theory and Molecular Simulation", 2010.

       Parameters
       ----------
       xyz : array, shape=(n_frames, n_atoms, 3)
           Cartesian coordinates of the atoms in every frame, in contiguous C order.
       pairs : array, shape=(n_pairs, 2)
           The specific pairs of atoms whose distance you want to compute. A 2d
           array of pairs, in C order.
       box_matrix : array, shape=(3,3)
           The box matrix for a single frame. All of the frames are assumed to
           use this box vector.
       distance_out : array, shape=(n_frames, n_pairs)
           Array where the distances between pairs will be stored, in contiguous
           C order.
       displacement_out : array, shape=(n_frames, n_pairs, 3), optional
           An optional return value: if you'd also like to save the displacement
           vectors between the pairs, you can pass a pointer here. If
           displacement_out is NULL, then this variable will not be saved back
           to memory.

       All of the arrays are assumed to be contiguous. This code will segfault
       if they're not.
    */
#ifndef __SSE4_1__
    /* save the caller's rounding mode before forcing round-to-nearest for the
       _mm_cvtps_epi32-based NEAREST_INTEGER fallback; restored on exit */
    int rounding_mode = _MM_GET_ROUNDING_MODE();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
#endif

    int i, j;
    int store_displacement = displacement_out == NULL ? 0 : 1;
    int store_distance = distance_out == NULL ? 0 : 1;
    __m128 r1, r2, s12, r12, s, r12_2;
    __m128 hinv[3];
    __m128 h[3];

    for (i = 0; i < n_frames; i++) {
        // Store the columns of the box matrix in three float4s. This format
        // is fast for matrix * vector product. See, for example, this S.O. question:
        // http://stackoverflow.com/questions/14967969/efficient-4x4-matrix-vector-multiplication-with-sse-horizontal-add-and-dot-prod
        h[0] = _mm_setr_ps(box_matrix[0], box_matrix[3], box_matrix[6], 0.0f);
        h[1] = _mm_setr_ps(box_matrix[1], box_matrix[4], box_matrix[7], 0.0f);
        h[2] = _mm_setr_ps(box_matrix[2], box_matrix[5], box_matrix[8], 0.0f);
        // Calculate the inverse of the box matrix, and also store it in the
        // same format.
        inverse33(box_matrix, hinv+0, hinv+1, hinv+2);

        for (j = 0; j < n_pairs; j++) {
            // Load the two vectors whose distance we want to compute
            r1 = load_float3(xyz + 3*pairs[2*j + 0]);
            r2 = load_float3(xyz + 3*pairs[2*j + 1]);
            r12 = _mm_sub_ps(r2, r1);

            // s12 = INVERSE(H) * r12
            s12 = _mm_add_ps(_mm_add_ps(
                _mm_mul_ps(hinv[0], _mm_shuffle_ps(r12, r12, _MM_SHUFFLE(0,0,0,0))),
                _mm_mul_ps(hinv[1], _mm_shuffle_ps(r12, r12, _MM_SHUFFLE(1,1,1,1)))),
                _mm_mul_ps(hinv[2], _mm_shuffle_ps(r12, r12, _MM_SHUFFLE(2,2,2,2))));

            // s12 = s12 - NEAREST_INTEGER(s12)
#ifdef __SSE4_1__
            s12 = _mm_sub_ps(s12, _mm_round_ps(s12, _MM_FROUND_TO_NEAREST_INT));
#else
            s12 = _mm_sub_ps(s12, _mm_cvtepi32_ps(_mm_cvtps_epi32(s12)));
#endif

            r12 = _mm_add_ps(_mm_add_ps(
                _mm_mul_ps(h[0], _mm_shuffle_ps(s12, s12, _MM_SHUFFLE(0,0,0,0))),
                _mm_mul_ps(h[1], _mm_shuffle_ps(s12, s12, _MM_SHUFFLE(1,1,1,1)))),
                _mm_mul_ps(h[2], _mm_shuffle_ps(s12, s12, _MM_SHUFFLE(2,2,2,2))));

            if (store_displacement) {
                // store the two lower entries (x,y) in memory
                _mm_storel_pi((__m64*)(displacement_out), r12);
                displacement_out += 2;
                // swap high-low and then store the z entry in the memory
                _mm_store_ss(displacement_out++, _mm_movehl_ps(r12, r12));
            }
            if (store_distance) {
                // out = sqrt(sum(r12**2))
                r12_2 = _mm_mul_ps(r12, r12);
                s = _mm_hadd_ps(r12_2, r12_2);
                s = _mm_hadd_ps(s, s);
                s = _mm_sqrt_ps(s);
                _mm_store_ss(distance_out++, s);
            }
        }
        // advance to the next frame
        xyz += n_atoms*3;
        box_matrix += 9;
    }

#ifndef __SSE4_1__
    _MM_SET_ROUNDING_MODE(rounding_mode);
#endif
    return 1;
}