void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ, float3(&accI)[8]) {
    // _mm256_set_ps places its first argument in the highest lane, so posI[0]
    // lands in lane 7; the store loop below reverses the order again.
    __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x, posI[4].x, posI[5].x, posI[6].x, posI[7].x);
    __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y, posI[4].y, posI[5].y, posI[6].y, posI[7].y);
    __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z, posI[4].z, posI[5].z, posI[6].z, posI[7].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    for (int i = 0; i < 8; i++) {
        accI[7 - i].x = aix.m256_f32[i];
        accI[7 - i].y = aiy.m256_f32[i];
        accI[7 - i].z = aiz.m256_f32[i];
    }
}
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ, float *accI) {
    // accI must point to 24 floats with 32-byte alignment: the aligned stores
    // below write x components to accI[0..7], y to accI[8..15], z to accI[16..23].
    __m256 pix = _mm256_set_ps(posI[7].x, posI[6].x, posI[5].x, posI[4].x, posI[3].x, posI[2].x, posI[1].x, posI[0].x);
    __m256 piy = _mm256_set_ps(posI[7].y, posI[6].y, posI[5].y, posI[4].y, posI[3].y, posI[2].y, posI[1].y, posI[0].y);
    __m256 piz = _mm256_set_ps(posI[7].z, posI[6].z, posI[5].z, posI[4].z, posI[3].z, posI[2].z, posI[1].z, posI[0].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    _mm256_store_ps(accI, aix);
    _mm256_store_ps(accI + 8, aiy);
    _mm256_store_ps(accI + 16, aiz);
}
extern "C" void product32x32_avxf(float *a, float *b, float *c, int n) { for(int i=0; i<n; i++) { __m256 t1 = _mm256_loadu_ps(&c[i*n + 0]); __m256 t2 = _mm256_loadu_ps(&c[i*n + 8]); __m256 t3 = _mm256_loadu_ps(&c[i*n + 16]); __m256 t4 = _mm256_loadu_ps(&c[i*n + 24]); for(int k=0; k<n; k++) { __m256 a1 = _mm256_set1_ps(a[k*n+i]); __m256 b1 = _mm256_loadu_ps(&b[k*n+0]); t1 = _mm256_sub_ps(t1,_mm256_mul_ps(a1,b1)); __m256 b2 = _mm256_loadu_ps(&b[k*n+8]); t2 = _mm256_sub_ps(t2,_mm256_mul_ps(a1,b2)); __m256 b3 = _mm256_loadu_ps(&b[k*n+16]); t3 = _mm256_sub_ps(t3,_mm256_mul_ps(a1,b3)); __m256 b4 = _mm256_loadu_ps(&b[k*n+24]); t4 = _mm256_sub_ps(t4,_mm256_mul_ps(a1,b4)); } _mm256_storeu_ps(&c[i*n + 0], t1); _mm256_storeu_ps(&c[i*n + 8], t2); _mm256_storeu_ps(&c[i*n + 16], t3); _mm256_storeu_ps(&c[i*n + 24], t4); } }
inline float DatabaseBuilder::Distance(PackedSample* x, PackedSample* y) {
#ifdef AVX
    // Black magic
    // But it does produce the same results as the non-AVX code
    // Note: the aligned loads below read 32 packed features (four __m256 blocks),
    // so Features must be 32-byte aligned and padded to 32 floats.
    __m256 accumulator;
    __m256 x_s = _mm256_load_ps(x->Features);
    __m256 y_s = _mm256_load_ps(y->Features);
    __m256 result = _mm256_sub_ps(x_s, y_s);
    accumulator = _mm256_mul_ps(result, result);

    x_s = _mm256_load_ps(&x->Features[8]);
    y_s = _mm256_load_ps(&y->Features[8]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[16]);
    y_s = _mm256_load_ps(&y->Features[16]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    x_s = _mm256_load_ps(&x->Features[24]);
    y_s = _mm256_load_ps(&y->Features[24]);
    result = _mm256_sub_ps(x_s, y_s);
    result = _mm256_mul_ps(result, result);
    accumulator = _mm256_add_ps(accumulator, result);

    // We now have a vector of 8 floats
    __m256 t1 = _mm256_hadd_ps(accumulator, accumulator);
    __m256 t2 = _mm256_hadd_ps(t1, t1);
    __m128 t3 = _mm256_extractf128_ps(t2, 1);
    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2), t3);
    // And now we don't
    return std::sqrtf(_mm_cvtss_f32(t4));
#endif
#ifndef AVX
    // Can be autovectorized
    float accumulator[32];
    float distance = 0;
    for (int i = 0; i < 30; i++) {
        accumulator[i] = x->Features[i] - y->Features[i];
    }
    // If done properly this should be 4(8) instructions
    for (int i = 0; i < 30; i++) {
        distance += accumulator[i] * accumulator[i];
    }
    return std::sqrtf(distance);
#endif
}
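/*
 * For reference, the hadd/extractf128 reduction used at the end of Distance()
 * can be factored into a small standalone helper. This is a sketch; the name
 * hsum256_ps is ours and is not part of DatabaseBuilder.
 */
static inline float hsum256_ps(__m256 v) {
    __m256 t1 = _mm256_hadd_ps(v, v);    // pairwise sums within each 128-bit lane
    __m256 t2 = _mm256_hadd_ps(t1, t1);  // each lane now holds its own total in every slot
    __m128 hi = _mm256_extractf128_ps(t2, 1);
    __m128 lo = _mm256_castps256_ps128(t2);
    return _mm_cvtss_f32(_mm_add_ss(lo, hi));  // combine the two lane totals
}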
__m256 mm256_exp_ps(__m256 x) {
    __m256 tmp = _mm256_setzero_ps(), fx;
    __m256i emm0;
    __m256 one = *(__m256*)m256_ps_1;

    x = _mm256_min_ps(x, *(__m256*)m256_ps_exp_hi);
    x = _mm256_max_ps(x, *(__m256*)m256_ps_exp_lo);

    /* express exp(x) as exp(g + n*log(2)) */
    fx = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_LOG2EF);
    fx = _mm256_add_ps(fx, *(__m256*)m256_ps_0p5);

    /* how to perform a floorf with SSE: just below */
    /* step 1 : cast to int */
    emm0 = _mm256_cvttps_epi32(fx);
    /* step 2 : cast back to float */
    tmp = _mm256_cvtepi32_ps(emm0);
    /* if greater, subtract 1 */
    __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
    mask = _mm256_and_ps(mask, one);
    fx = _mm256_sub_ps(tmp, mask);

    tmp = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C1);
    __m256 z = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C2);
    x = _mm256_sub_ps(x, tmp);
    x = _mm256_sub_ps(x, z);

    z = _mm256_mul_ps(x, x);

    __m256 y = *(__m256*)m256_ps_cephes_exp_p0;
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p1);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p2);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p3);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p4);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p5);
    y = _mm256_mul_ps(y, z);
    y = _mm256_add_ps(y, x);
    y = _mm256_add_ps(y, one);

    /* build 2^n */
    emm0 = _mm256_cvttps_epi32(fx);
    emm0 = _mm256_add_epi32(emm0, *(__m256i*)m256_pi32_0x7f);
    emm0 = _mm256_slli_epi32(emm0, 23);
    __m256 pow2n = _mm256_castsi256_ps(emm0);

    y = _mm256_mul_ps(y, pow2n);
    _mm256_zeroupper();
    return y;
}
v8sf exp256_ps(v8sf x) {
    v8sf tmp = _mm256_setzero_ps(), fx;
    v8si imm0;
    v8sf one = *(v8sf*)_ps256_1;

    x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
    x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);

    /* express exp(x) as exp(g + n*log(2)) */
    fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
    fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);

    /* how to perform a floorf with SSE: just below */
    //imm0 = _mm256_cvttps_epi32(fx);
    //tmp = _mm256_cvtepi32_ps(imm0);
    tmp = _mm256_floor_ps(fx);

    /* if greater, subtract 1 */
    //v8sf mask = _mm256_cmpgt_ps(tmp, fx);
    v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
    mask = _mm256_and_ps(mask, one);
    fx = _mm256_sub_ps(tmp, mask);

    tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
    v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
    x = _mm256_sub_ps(x, tmp);
    x = _mm256_sub_ps(x, z);

    z = _mm256_mul_ps(x, x);

    v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
    y = _mm256_mul_ps(y, x);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
    y = _mm256_mul_ps(y, z);
    y = _mm256_add_ps(y, x);
    y = _mm256_add_ps(y, one);

    /* build 2^n */
    imm0 = _mm256_cvttps_epi32(fx);
    // another two AVX2 instructions
    imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
    imm0 = _mm256_slli_epi32(imm0, 23);
    v8sf pow2n = _mm256_castsi256_ps(imm0);
    y = _mm256_mul_ps(y, pow2n);
    return y;
}
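/*
 * Hypothetical spot-check for exp256_ps(): compare a few lanes against scalar
 * expf(). Assumes the v8sf typedef and the _ps256_* constant tables from the
 * same source file are in scope, plus <immintrin.h>, <math.h> and <stdio.h>.
 */
int test_exp256_ps(void) {
    float in[8] = { -10.f, -1.f, -0.1f, 0.f, 0.1f, 1.f, 5.f, 10.f };
    float out[8];
    _mm256_storeu_ps(out, exp256_ps(_mm256_loadu_ps(in)));
    for (int i = 0; i < 8; i++)
        printf("exp(%g) = %g (scalar: %g)\n", in[i], out[i], expf(in[i]));
    return 0;
}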
inline avx_m256_t newsin_ps(avx_m256_t x) {
    avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
    x = _mm256_and_ps(x, _ps_inv_sign_mask);

    avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

    avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
    emm2 = _mm256_add_epi32(emm2, _pi32_1);
    emm2 = _mm256_and_si256(emm2, _pi32_inv1);
    y = _mm256_cvtepi32_ps(emm2);

    avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
    emm0 = _mm256_slli_epi32(emm0, 29);

    emm2 = _mm256_and_si256(emm2, _pi32_2);
    emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());

    avx_m256_t swap_sign_bit = _mm256_castsi256_ps(emm0);
    avx_m256_t poly_mask = _mm256_castsi256_ps(emm2);
    sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

    avx_m256_t temp = _ps_minus_cephes_DP123;
    temp = _mm256_mul_ps(y, temp);
    x = _mm256_add_ps(x, temp);

    avx_m256_t x2 = _mm256_mul_ps(x, x);
    avx_m256_t x3 = _mm256_mul_ps(x2, x);
    avx_m256_t x4 = _mm256_mul_ps(x2, x2);

    y = _ps_coscof_p0;
    avx_m256_t y2 = _ps_sincof_p0;
    y = _mm256_mul_ps(y, x2);
    y2 = _mm256_mul_ps(y2, x2);
    y = _mm256_add_ps(y, _ps_coscof_p1);
    y2 = _mm256_add_ps(y2, _ps_sincof_p1);
    y = _mm256_mul_ps(y, x2);
    y2 = _mm256_mul_ps(y2, x2);
    y = _mm256_add_ps(y, _ps_coscof_p2);
    y2 = _mm256_add_ps(y2, _ps_sincof_p2);
    y = _mm256_mul_ps(y, x4);
    y2 = _mm256_mul_ps(y2, x3);

    temp = _mm256_mul_ps(x2, _ps_0p5);
    temp = _mm256_sub_ps(temp, _ps_1);
    y = _mm256_sub_ps(y, temp);
    y2 = _mm256_add_ps(y2, x);

    y = _mm256_andnot_ps(poly_mask, y);
    y2 = _mm256_and_ps(poly_mask, y2);
    y = _mm256_add_ps(y, y2);

    y = _mm256_xor_ps(y, sign_bit);

    return y;
} // newsin_ps()
__m256 distance(const __m256& x1, const __m256& y1, const __m256& x2, const __m256& y2) {
    const __m256 x_diff = _mm256_sub_ps(x1, x2);
    const __m256 y_diff = _mm256_sub_ps(y1, y2);
    const __m256 x_diff2 = _mm256_mul_ps(x_diff, x_diff);
    const __m256 y_diff2 = _mm256_mul_ps(y_diff, y_diff);
    const __m256 sum = _mm256_add_ps(x_diff2, y_diff2);
    const __m256 dist = _mm256_sqrt_ps(sum);
    return dist;
}
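/*
 * Hypothetical caller for distance(): computes the distances between 8 point
 * pairs stored in structure-of-arrays layout (separate x/y arrays). The name
 * distances8 and the unaligned loads/stores are our assumptions.
 */
void distances8(const float* x1, const float* y1, const float* x2, const float* y2, float* out) {
    const __m256 d = distance(_mm256_loadu_ps(x1), _mm256_loadu_ps(y1),
                              _mm256_loadu_ps(x2), _mm256_loadu_ps(y2));
    _mm256_storeu_ps(out, d);
}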
int SymmColumnVec_32f_Unsymm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2) {
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);

    for (; i <= width - 16; i += 16) {
        __m256 f, s0 = d8, s1 = d8;
        __m256 x0;
        S = src[0] + i;

        for (k = 1; k <= ksize2; k++) {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_sub_ps(_mm256_loadu_ps(S + 8), _mm256_loadu_ps(S2 + 8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }

        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }

    for (; i <= width - 4; i += 4) {
        __m128 f, x0, s0 = d4;

        for (k = 1; k <= ksize2; k++) {
            f = _mm_set1_ps(ky[k]);
            x0 = _mm_sub_ps(_mm_load_ps(src[k] + i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }

        _mm_storeu_ps(dst + i, s0);
    }

    _mm256_zeroupper();
    return i;
}
/* AVX implementation for Zero Forcing (ZF) solver */
inline void srslte_mat_2x2_zf_avx(__m256 y0, __m256 y1, __m256 h00, __m256 h01, __m256 h10, __m256 h11,
                                  __m256 *x0, __m256 *x1, float norm) {
    __m256 det = srslte_mat_2x2_det_avx(h00, h01, h10, h11);
    __m256 detrec = _mm256_mul_ps(srslte_mat_cf_recip_avx(det), _mm256_set1_ps(norm));

#ifdef LV_HAVE_FMA
    *x0 = _MM256_PROD_PS(_MM256_PROD_SUB_PS(h11, y0, _MM256_PROD_PS(h01, y1)), detrec);
    *x1 = _MM256_PROD_PS(_MM256_PROD_SUB_PS(h00, y1, _MM256_PROD_PS(h10, y0)), detrec);
#else
    *x0 = _MM256_PROD_PS(_mm256_sub_ps(_MM256_PROD_PS(h11, y0), _MM256_PROD_PS(h01, y1)), detrec);
    *x1 = _MM256_PROD_PS(_mm256_sub_ps(_MM256_PROD_PS(h00, y1), _MM256_PROD_PS(h10, y0)), detrec);
#endif /* LV_HAVE_FMA */
}
vector unit_vector(const __m256& x1, const __m256& y1, const __m256& x2, const __m256& y2) {
    const __m256 x_diff = _mm256_sub_ps(x2, x1);
    const __m256 y_diff = _mm256_sub_ps(y2, y1);
    const __m256 dist = distance(x1, y1, x2, y2);
    const __m256 ux = _mm256_div_ps(x_diff, dist);
    const __m256 uy = _mm256_div_ps(y_diff, dist);
    vector result = {ux, uy};
    return result;
}
/*
 * Compute: s = sqrt( t**2 - x**2 - y**2 - z**2 ), with s, t, x, y, z
 * member variables of the st_coords structure arr.
 *
 * Traverse elements randomly
 */
void comp_s(st_coords arr, int L) {
    for (int j = 0; j < L; j += 8) {
        int i = (rand() % (L/8)) * 8;
        __m256 x = _mm256_load_ps(&arr.x[i]);
        __m256 y = _mm256_load_ps(&arr.y[i]);
        __m256 z = _mm256_load_ps(&arr.z[i]);
        __m256 t = _mm256_load_ps(&arr.t[i]);
#ifdef FMA
        register __m256 s0;
        s0 = _mm256_mul_ps(x, x);
        s0 = _mm256_fmadd_ps(y, y, s0);
        s0 = _mm256_fmadd_ps(z, z, s0);
        s0 = _mm256_fmsub_ps(t, t, s0);
        s0 = _mm256_sqrt_ps(s0);
#else
        register __m256 s0, s1;
        s1 = _mm256_mul_ps(x, x);
        s0 = _mm256_mul_ps(y, y);
        s1 = _mm256_add_ps(s0, s1);
        s0 = _mm256_mul_ps(z, z);
        s1 = _mm256_add_ps(s0, s1);
        s0 = _mm256_mul_ps(t, t);
        s1 = _mm256_sub_ps(s0, s1);
        s0 = _mm256_sqrt_ps(s1);
#endif
        _mm256_store_ps(&arr.s[i], s0);
    }
    return;
}
static inline void blend_unorm8_argb(struct reg *src, __m256i dst_argb) {
    if (gt.blend.enable) {
        const __m256i mask = _mm256_set1_epi32(0xff);
        const __m256 scale = _mm256_set1_ps(1.0f / 255.0f);
        struct reg dst[4];

        /* Convert to float */
        dst[2].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[1].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[0].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[3].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);

        /* Blend, assuming src BLENDFACTOR_SRC_ALPHA, dst
         * BLENDFACTOR_INV_SRC_ALPHA, and BLENDFUNCTION_ADD. */
        const __m256 inv_alpha = _mm256_sub_ps(_mm256_set1_ps(1.0f), src[3].reg);
        src[0].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[0].reg), _mm256_mul_ps(inv_alpha, dst[0].reg));
        src[1].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[1].reg), _mm256_mul_ps(inv_alpha, dst[1].reg));
        src[2].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[2].reg), _mm256_mul_ps(inv_alpha, dst[2].reg));
        src[3].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[3].reg), _mm256_mul_ps(inv_alpha, dst[3].reg));
    }
}
/* AVX implementation for 2x2 determinant */
inline __m256 srslte_mat_2x2_det_avx(__m256 a00, __m256 a01, __m256 a10, __m256 a11) {
#ifdef LV_HAVE_FMA
    return _MM256_PROD_SUB_PS(a00, a11, _MM256_PROD_PS(a01, a10));
#else
    return _mm256_sub_ps(_MM256_PROD_PS(a00, a11), _MM256_PROD_PS(a01, a10));
#endif /* LV_HAVE_FMA */
}
void neuralNet::activationPrime_avx(const float* neuronOutput, float* result) {
    static const __m256 ones = _mm256_set1_ps(1.0f);
    static const __m256 sigCoefficients = _mm256_set1_ps(SIGMOIDCOEFFICIENT);

    __m256 temp;
    const __m256* vOutput = (__m256*)neuronOutput;

    // 1 - ans
    temp = _mm256_sub_ps(ones, *vOutput);
    // (1 - ans) * ans
    temp = _mm256_mul_ps(temp, *vOutput);
    // ans * coefficient
    temp = _mm256_mul_ps(temp, sigCoefficients);

#ifndef NDEBUG
    const float* _temp = (float*)&temp;
    assert(fastabs(_temp[0] - activationPrime(neuronOutput[0])) < 0.05f);
    assert(fastabs(_temp[1] - activationPrime(neuronOutput[1])) < 0.05f);
    assert(fastabs(_temp[2] - activationPrime(neuronOutput[2])) < 0.05f);
    assert(fastabs(_temp[3] - activationPrime(neuronOutput[3])) < 0.05f);
    assert(fastabs(_temp[4] - activationPrime(neuronOutput[4])) < 0.05f);
    assert(fastabs(_temp[5] - activationPrime(neuronOutput[5])) < 0.05f);
    assert(fastabs(_temp[6] - activationPrime(neuronOutput[6])) < 0.05f);
    assert(fastabs(_temp[7] - activationPrime(neuronOutput[7])) < 0.05f);
#endif

    // return ans
    _mm256_store_ps(result, temp);
}
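/*
 * For comparison, a scalar form of what the vectorized code above computes
 * per element: SIGMOIDCOEFFICIENT * out * (1 - out). This is only a sketch;
 * the class's own activationPrime() used in the assertions may be defined
 * differently.
 */
inline float activationPrime_scalar(float neuronOutput) {
    return SIGMOIDCOEFFICIENT * neuronOutput * (1.0f - neuronOutput);
}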
inline void avx2_xy_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v) {
    // Convert X,Y first into U,V space then round to nearest
    // integer. That gets us close to correct answer, mapping XY to a
    // lozenge-shaped space rather than hexagonal. We then correct the
    // four regions that lie outside the hexagonal cell assigning them
    // to their correct neighboring cell.
    // Writer's note: see ~/Google Drive/Work/calin

    // double dv = y*c_vy_inv;
    // double du = x-dv*c_vx;
    // u = std::lround(du);
    // v = std::lround(dv);
    // du -= u;
    // dv -= v;

    y = _mm256_mul_ps(y, calin::math::simd::c_m256(_c_m256_vy_inv));
    x = _mm256_fnmadd_ps(y, calin::math::simd::c_m256(_c_m256_vx), x);
    u = _mm256_cvtps_epi32(x);
    v = _mm256_cvtps_epi32(y);
    x = _mm256_sub_ps(x, _mm256_cvtepi32_ps(u));
    y = _mm256_sub_ps(y, _mm256_cvtepi32_ps(v));

    // double c3 = dv-du;
    const __m256i c3 = _mm256_castps_si256(_mm256_sub_ps(y, x));

    __m256i uvshift;
    __m256i mask;

    // double c1 = du+0.5*dv;
    // double c2 = dv+0.5*du;
    // if(c3<0) {
    //   if(c1>=1) u++;
    //   else if(c2<-1) v--;
    // } else {
    //   if(c2>=1) v++;
    //   else if(c1<-1) u--;
    // }

    uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(y, calin::math::simd::c_m256(_c_m256_one_half), x));
    mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
    u = _mm256_blendv_epi8(u, _mm256_add_epi32(u, uvshift), mask);

    uvshift = _mm256_cvtps_epi32(_mm256_fmadd_ps(x, calin::math::simd::c_m256(_c_m256_one_half), y));
    mask = _mm256_srai_epi32(_mm256_xor_si256(uvshift, c3), 31);
    v = _mm256_blendv_epi8(_mm256_add_epi32(v, uvshift), v, mask);
}
void mandel_avx(unsigned char *image, const struct spec *s) {
    __m256 xmin = _mm256_set1_ps(s->xlim[0]);
    __m256 ymin = _mm256_set1_ps(s->ylim[0]);
    __m256 xscale = _mm256_set1_ps((s->xlim[1] - s->xlim[0]) / s->width);
    __m256 yscale = _mm256_set1_ps((s->ylim[1] - s->ylim[0]) / s->height);
    __m256 threshold = _mm256_set1_ps(4);
    __m256 one = _mm256_set1_ps(1);
    __m256 iter_scale = _mm256_set1_ps(1.0f / s->iterations);
    __m256 depth_scale = _mm256_set1_ps(s->depth - 1);

    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 8) {
            __m256 mx = _mm256_set_ps(x + 7, x + 6, x + 5, x + 4, x + 3, x + 2, x + 1, x + 0);
            __m256 my = _mm256_set1_ps(y);
            __m256 cr = _mm256_add_ps(_mm256_mul_ps(mx, xscale), xmin);
            __m256 ci = _mm256_add_ps(_mm256_mul_ps(my, yscale), ymin);
            __m256 zr = cr;
            __m256 zi = ci;
            int k = 1;
            __m256 mk = _mm256_set1_ps(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m256 zr2 = _mm256_mul_ps(zr, zr);
                __m256 zi2 = _mm256_mul_ps(zi, zi);
                __m256 zrzi = _mm256_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm256_add_ps(_mm256_sub_ps(zr2, zi2), cr);
                zi = _mm256_add_ps(_mm256_add_ps(zrzi, zrzi), ci);

                /* Increment k */
                zr2 = _mm256_mul_ps(zr, zr);
                zi2 = _mm256_mul_ps(zi, zi);
                __m256 mag2 = _mm256_add_ps(zr2, zi2);
                __m256 mask = _mm256_cmp_ps(mag2, threshold, _CMP_LT_OS);
                mk = _mm256_add_ps(_mm256_and_ps(mask, one), mk);

                /* Early bailout? */
                if (_mm256_testz_ps(mask, _mm256_set1_ps(-1)))
                    break;
            }
            mk = _mm256_mul_ps(mk, iter_scale);
            mk = _mm256_sqrt_ps(mk);
            mk = _mm256_mul_ps(mk, depth_scale);
            __m256i pixels = _mm256_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 8; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
inline void avx2_xy_to_uv_with_remainder_f(__m256& x_in_dx_out, __m256& y_in_dy_out, __m256i& u, __m256i& v) {
    avx2_xy_to_uv_f(x_in_dx_out, y_in_dy_out, u, v);
    x_in_dx_out = _mm256_sub_ps(x_in_dx_out, _mm256_cvtepi32_ps(u));
    __m256 vf = _mm256_cvtepi32_ps(v);
    x_in_dx_out = _mm256_fnmadd_ps(vf, calin::math::simd::c_m256(_c_m256_vx), x_in_dx_out);
    y_in_dy_out = _mm256_fnmadd_ps(vf, calin::math::simd::c_m256(_c_m256_vy), y_in_dy_out);
}
void NBodyAlgorithmCPU::calculateAccelerationWithColor(const float3(&posI)[8], const float massJ, const float3 posJ, float3(&accI)[8], unsigned int(&numNeighbours)[8]) {
    __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x, posI[4].x, posI[5].x, posI[6].x, posI[7].x);
    __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y, posI[4].y, posI[5].y, posI[6].y, posI[7].y);
    __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z, posI[4].z, posI[5].z, posI[6].z, posI[7].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 cmpDistance = _mm256_set1_ps(float(mp_properties->positionScale));
    __m256 close = _mm256_cmp_ps(rabs, cmpDistance, _CMP_LE_OS);

    for (int i = 0; i < 8; i++) {
        if (close.m256_f32[i] == 0) {
            numNeighbours[7 - i] = 0;
        }
    }

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    for (int i = 0; i < 8; i++) {
        accI[7 - i].x = aix.m256_f32[i];
        accI[7 - i].y = aiy.m256_f32[i];
        accI[7 - i].z = aiz.m256_f32[i];
    }
}
inline void avx2_xy_trans_to_uv_f(__m256 x, __m256 y, __m256i& u, __m256i& v,
    const float crot, const float srot, const float scale, const float dx = 0, const float dy = 0) {
    // x = (x - dx)/scale;
    // y = (y - dy)/scale;
    // double xx = x*crot + y*srot;
    // y = y*crot - x*srot;
    // xy_to_uv(xx,y,u,v);

    const __m256 vsrot = _mm256_set1_ps(srot);
    const __m256 vcrot = _mm256_set1_ps(crot);
    const __m256 vscale = _mm256_set1_ps(1.0f/scale);
    x = _mm256_mul_ps(_mm256_sub_ps(x, _mm256_set1_ps(dx)), vscale);
    y = _mm256_mul_ps(_mm256_sub_ps(y, _mm256_set1_ps(dy)), vscale);
    __m256 yy = _mm256_mul_ps(x, vsrot);
    yy = _mm256_fmsub_ps(y, vcrot, yy);
    x = _mm256_mul_ps(x, vcrot);
    x = _mm256_fmadd_ps(y, vsrot, x);
    avx2_xy_to_uv_f(x, yy, u, v);
}
void somap::train(const imgdata & obj) {
    if (this->weight <= 0.0) return;
    __m256 tmp;
    __m256 *v1 = (__m256*)(this->fvex);
    const __m256 *v2 = (__m256*)(obj.fvex);
    const __m256 ws = _mm256_set1_ps(this->weight);
    for (int i = 0; i < f; i++) {
        tmp = _mm256_sub_ps(v2[i], v1[i]);
        tmp = _mm256_mul_ps(tmp, ws);
        v1[i] = _mm256_add_ps(v1[i], tmp);
    }
}
extern "C" void product64x64_avx(float *a, float *b, float *c, int n) { for(int i=0; i<n; i++) { __m256 t1 = _mm256_loadu_ps(&c[i*n + 0]); __m256 t2 = _mm256_loadu_ps(&c[i*n + 8]); __m256 t3 = _mm256_loadu_ps(&c[i*n + 16]); __m256 t4 = _mm256_loadu_ps(&c[i*n + 24]); __m256 t5 = _mm256_loadu_ps(&c[i*n + 32]); __m256 t6 = _mm256_loadu_ps(&c[i*n + 40]); __m256 t7 = _mm256_loadu_ps(&c[i*n + 48]); __m256 t8 = _mm256_loadu_ps(&c[i*n + 56]); for(int k=0; k<n; k++) { __m256 a1 = _mm256_set1_ps(a[k*n+i]); __m256 b1 = _mm256_loadu_ps(&b[k*n+0]); t1 = _mm256_sub_ps(t1,_mm256_mul_ps(a1,b1)); __m256 b2 = _mm256_loadu_ps(&b[k*n+8]); t2 = _mm256_sub_ps(t2,_mm256_mul_ps(a1,b2)); __m256 b3 = _mm256_loadu_ps(&b[k*n+16]); t3 = _mm256_sub_ps(t3,_mm256_mul_ps(a1,b3)); __m256 b4 = _mm256_loadu_ps(&b[k*n+24]); t4 = _mm256_sub_ps(t4,_mm256_mul_ps(a1,b4)); __m256 b5 = _mm256_loadu_ps(&b[k*n+32]); t5 = _mm256_sub_ps(t5,_mm256_mul_ps(a1,b5)); __m256 b6 = _mm256_loadu_ps(&b[k*n+40]); t6 = _mm256_sub_ps(t6,_mm256_mul_ps(a1,b6)); __m256 b7 = _mm256_loadu_ps(&b[k*n+48]); t7 = _mm256_sub_ps(t7,_mm256_mul_ps(a1,b7)); __m256 b8 = _mm256_loadu_ps(&b[k*n+56]); t8 = _mm256_sub_ps(t8,_mm256_mul_ps(a1,b8)); } _mm256_storeu_ps(&c[i*n + 0], t1); _mm256_storeu_ps(&c[i*n + 8], t2); _mm256_storeu_ps(&c[i*n + 16], t3); _mm256_storeu_ps(&c[i*n + 24], t4); _mm256_storeu_ps(&c[i*n + 32], t5); _mm256_storeu_ps(&c[i*n + 40], t6); _mm256_storeu_ps(&c[i*n + 48], t7); _mm256_storeu_ps(&c[i*n + 56], t8); } }
void convertCAVX(int num, uint8_t *in, float *out) {
    int i;
    __m256 sub = _mm256_set1_ps(128.0);
    __m256 mul = _mm256_set1_ps(1/128.0);
    for (i = 0; i < num; i += 8) {
        __m128i val = _mm_loadu_si128((__m128i *)(in + i));
        __m256i ints = _mm256_cvtepu8_epi32(val);
        __m256 cvtd = _mm256_cvtepi32_ps(ints);
        __m256 res = _mm256_mul_ps(_mm256_sub_ps(cvtd, sub), mul);
        _mm256_storeu_ps(out + i, res);
    }
}
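/*
 * Hypothetical variant of convertCAVX() for counts that are not a multiple of
 * 8: vectorize the bulk, finish the remainder in scalar code. _mm_loadl_epi64
 * is used instead of _mm_loadu_si128 so only the 8 bytes actually consumed by
 * _mm256_cvtepu8_epi32 are read.
 */
void convertCAVX_any(int num, uint8_t *in, float *out) {
    int i = 0;
    __m256 sub = _mm256_set1_ps(128.0f);
    __m256 mul = _mm256_set1_ps(1.0f / 128.0f);
    for (; i + 8 <= num; i += 8) {
        __m128i val = _mm_loadl_epi64((__m128i *)(in + i));   // load exactly 8 bytes
        __m256 cvtd = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(val));
        _mm256_storeu_ps(out + i, _mm256_mul_ps(_mm256_sub_ps(cvtd, sub), mul));
    }
    for (; i < num; i++)
        out[i] = ((float)in[i] - 128.0f) * (1.0f / 128.0f);
}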
void nv_vector_sub(nv_matrix_t *vec0, int m0, const nv_matrix_t *vec1, int m1, const nv_matrix_t *vec2, int m2) {
    NV_ASSERT(vec1->n == vec2->n);
    NV_ASSERT(vec2->n == vec0->n);

#if NV_ENABLE_AVX
    {
        __m256 x;
        int n;
        int pk_lp = (vec1->n & 0xfffffff8);

        for (n = 0; n < pk_lp; n += 8) {
            x = _mm256_load_ps(&NV_MAT_V(vec1, m1, n));
            x = _mm256_sub_ps(x, *(const __m256 *)&NV_MAT_V(vec2, m2, n));
            _mm256_store_ps(&NV_MAT_V(vec0, m0, n), x);
        }
        for (n = pk_lp; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) - NV_MAT_V(vec2, m2, n);
        }
    }
#elif NV_ENABLE_SSE2
    {
        int n;
        int pk_lp = (vec1->n & 0xfffffffc);

#ifdef _OPENMP
//#pragma omp parallel for
#endif
        for (n = 0; n < pk_lp; n += 4) {
            __m128 x = _mm_load_ps(&NV_MAT_V(vec1, m1, n));
            x = _mm_sub_ps(x, *(const __m128 *)&NV_MAT_V(vec2, m2, n));
            _mm_store_ps(&NV_MAT_V(vec0, m0, n), x);
        }
        for (n = pk_lp; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) - NV_MAT_V(vec2, m2, n);
        }
    }
#else
    {
        int n;
        for (n = 0; n < vec1->n; ++n) {
            NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) - NV_MAT_V(vec2, m2, n);
        }
    }
#endif
}
inline void avx2_xy_trans_to_uv_with_remainder_f(
    __m256& x_in_dx_out, __m256& y_in_dy_out, __m256i& u, __m256i& v,
    const float crot, const float srot, const float scale, const float dx = 0, const float dy = 0) {
    const __m256 vsrot = _mm256_set1_ps(srot);
    const __m256 vcrot = _mm256_set1_ps(crot);
    __m256 vscale = _mm256_set1_ps(1.0f/scale);

    x_in_dx_out = _mm256_mul_ps(_mm256_sub_ps(x_in_dx_out, _mm256_set1_ps(dx)), vscale);
    y_in_dy_out = _mm256_mul_ps(_mm256_sub_ps(y_in_dy_out, _mm256_set1_ps(dy)), vscale);
    __m256 yy = _mm256_mul_ps(x_in_dx_out, vsrot);
    yy = _mm256_fmsub_ps(y_in_dy_out, vcrot, yy);
    x_in_dx_out = _mm256_mul_ps(x_in_dx_out, vcrot);
    x_in_dx_out = _mm256_fmadd_ps(y_in_dy_out, vsrot, x_in_dx_out);
    avx2_xy_to_uv_with_remainder_f(x_in_dx_out, yy, u, v);

    vscale = _mm256_set1_ps(scale);
    y_in_dy_out = _mm256_mul_ps(yy, vcrot);
    y_in_dy_out = _mm256_fmadd_ps(x_in_dx_out, vsrot, y_in_dy_out);
    y_in_dy_out = _mm256_mul_ps(y_in_dy_out, vscale);
    x_in_dx_out = _mm256_mul_ps(x_in_dx_out, vcrot);
    x_in_dx_out = _mm256_fnmadd_ps(yy, vsrot, x_in_dx_out);
    x_in_dx_out = _mm256_mul_ps(x_in_dx_out, vscale);
}
double Atomtype::CalcPE(int frame_i, const Trajectory &trj, const coordinates &rand_xyz, const cubicbox_m256 &box, double vol) const {
    float pe = 0.0;
    int atom_i = 0;

    /* BEGIN SIMD SECTION */
    // This performs the exact same calculation as the code after the SIMD
    // section, but does it on 8 atoms at a time using SIMD instructions.

    coordinates8 rand_xyz8(rand_xyz), atom_xyz;
    __m256 r2_8, mask, r6, ri6, pe_tmp;
    __m256 pe_sum = _mm256_setzero_ps();
    // 8 partial sums; must be 32-byte aligned for the aligned store below.
    float result[8] __attribute__((aligned (32)));

    for (; atom_i < this->n-8; atom_i += 8) {
        atom_xyz = trj.GetXYZ8(frame_i, this->name, atom_i);
        r2_8 = distance2(atom_xyz, rand_xyz8, box);
        mask = _mm256_cmp_ps(r2_8, rcut2_8, _CMP_LT_OS);
        r6 = _mm256_and_ps(mask, _mm256_mul_ps(_mm256_mul_ps(r2_8, r2_8), r2_8));
        ri6 = _mm256_and_ps(mask, _mm256_rcp_ps(r6));
        pe_tmp = _mm256_and_ps(mask, _mm256_mul_ps(ri6, _mm256_sub_ps(_mm256_mul_ps(c12_8, ri6), c6_8)));
        pe_sum = _mm256_add_ps(pe_tmp, pe_sum);
    }
    _mm256_store_ps(result, pe_sum);
    for (int i = 0; i < 8; i++) {
        pe += result[i];
    }

    /* END SIMD SECTION */

    for (; atom_i < this->n; atom_i++) {
        coordinates atom_xyz = trj.GetXYZ(frame_i, this->name, atom_i);
        float r2 = distance2(atom_xyz, rand_xyz, cubicbox(box));
        if (r2 < this->rcut2) {
            float ri6 = 1.0/(pow(r2,3));
            pe += ri6*(this->c12*ri6 - this->c6);
        }
    }

    pe += this->n/vol * this->tail_factor;

    return pe;
}
#include <stdio.h>
#include <immintrin.h>

int main(void) {
    float out[8];
    __m256 a = _mm256_setr_ps(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
    __m256 b = _mm256_setr_ps(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
    __m256 dst = _mm256_sub_ps(a, b);

    _mm256_storeu_ps(out, dst);

    for (int i = 0; i < (int)(sizeof(out)/sizeof(out[0])); i++)
        printf("out[%d]=%5.1f\n", i, out[i]);

    return 0;
}
void TransLut_FindIndexAvx2 <TransLut::MapperLin>::find_index(const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac) {
    assert (val_arr != 0);

    const __m256 scale = _mm256_set1_ps (1 << LINLUT_RES_L2);
    const __m256i offset = _mm256_set1_epi32 (-LINLUT_MIN_F * (1 << LINLUT_RES_L2));
    const __m256i val_min = _mm256_setzero_si256 ();
    const __m256i val_max = _mm256_set1_epi32 (LINLUT_SIZE_F - 2);

    const __m256 v = _mm256_load_ps (reinterpret_cast <const float *> (val_arr));
    const __m256 val_scl = _mm256_mul_ps (v, scale);
    const __m256i index_raw = _mm256_cvtps_epi32 (val_scl);
    __m256i index_tmp = _mm256_add_epi32 (index_raw, offset);
    index_tmp = _mm256_min_epi32 (index_tmp, val_max);
    index = _mm256_max_epi32 (index_tmp, val_min);
    frac = _mm256_sub_ps (val_scl, _mm256_cvtepi32_ps (index_raw));
}
void neuralNet::activation_approx_avx(const float* _neuronOutput, float* result) {
    BOOST_STATIC_ASSERT(SIGMOIDCOEFFICIENT == 4.0f);
    // code adapted from http://ybeernet.blogspot.com/2011/03/speeding-up-sigmoid-function-by.html
    // approximates sigmoid function with coefficient 4.0f
    static const __m256 ones = _mm256_set1_ps(1.0f);
    static const __m256 oneFourths = _mm256_set1_ps(0.25f);
    static const __m256 fours = _mm256_set1_ps(4.0f);

    __m256 temp;
    const __m256* vOutput = (__m256*)_neuronOutput;

    // min(output, 4.0)
    temp = _mm256_min_ps(*vOutput, fours);
    // multiply by 0.25
    temp = _mm256_mul_ps(temp, oneFourths);
    // 1 - ans
    temp = _mm256_sub_ps(ones, temp);
    // ans^16
    temp = _mm256_mul_ps(temp, temp);
    temp = _mm256_mul_ps(temp, temp);
    temp = _mm256_mul_ps(temp, temp);
    temp = _mm256_mul_ps(temp, temp);
    // 1 + ans
    temp = _mm256_add_ps(ones, temp);
    // 1 / ans
    temp = _mm256_rcp_ps(temp);

#ifndef NDEBUG
    const float* _temp = (float*)&temp;
    assert(fastabs(_temp[0] - activation(_neuronOutput[0])) < 0.05f);
    assert(fastabs(_temp[1] - activation(_neuronOutput[1])) < 0.05f);
    assert(fastabs(_temp[2] - activation(_neuronOutput[2])) < 0.05f);
    assert(fastabs(_temp[3] - activation(_neuronOutput[3])) < 0.05f);
    assert(fastabs(_temp[4] - activation(_neuronOutput[4])) < 0.05f);
    assert(fastabs(_temp[5] - activation(_neuronOutput[5])) < 0.05f);
    assert(fastabs(_temp[6] - activation(_neuronOutput[6])) < 0.05f);
    assert(fastabs(_temp[7] - activation(_neuronOutput[7])) < 0.05f);
#endif

    // return ans
    _mm256_store_ps(result, temp);
}
div(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
    //lhs = [x1.real, x1.img, x2.real, x2.img ...]
    //rhs = [y1.real, y1.img, y2.real, y2.img ...]

    //ymm0 = [y1.real, y1.real, y2.real, y2.real, ...]
    __m256 ymm0 = _mm256_moveldup_ps(rhs.value);

    //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
    __m256 ymm1 = _mm256_movehdup_ps(rhs.value);

    //ymm2 = [x1.img, x1.real, x2.img, x2.real]
    __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

    //ymm4 = [x.img * y.img, x.real * y.img]
    __m256 ymm4 = _mm256_mul_ps(ymm2, ymm1);

    //ymm5 = subadd((lhs * ymm0), ymm4)
#ifdef __FMA__
    __m256 ymm5 = _mm256_fmsubadd_ps(lhs.value, ymm0, ymm4);
#else
    __m256 t1 = _mm256_mul_ps(lhs.value, ymm0);
    __m256 t2 = _mm256_sub_ps(_mm256_set1_ps(0.0), ymm4);
    __m256 ymm5 = _mm256_addsub_ps(t1, t2);
#endif

    //ymm3 = [y.imag^2, y.imag^2]
    __m256 ymm3 = _mm256_mul_ps(ymm1, ymm1);

    //ymm0 = (ymm0 * ymm0 + ymm3)
#ifdef __FMA__
    ymm0 = _mm256_fmadd_ps(ymm0, ymm0, ymm3);
#else
    __m256 t3 = _mm256_mul_ps(ymm0, ymm0);
    ymm0 = _mm256_add_ps(t3, ymm3);
#endif

    //result = ymm5 / ymm0
    return _mm256_div_ps(ymm5, ymm0);
}