vector unit_vector(const __m256& x1, const __m256& y1, const __m256& x2, const __m256& y2)
{
    const __m256 x_diff = _mm256_sub_ps(x2, x1);
    const __m256 y_diff = _mm256_sub_ps(y2, y1);
    const __m256 dist = distance(x1, y1, x2, y2);

    const __m256 ux = _mm256_div_ps(x_diff, dist);
    const __m256 uy = _mm256_div_ps(y_diff, dist);

    vector result = {ux, uy};

    return result;
}
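// The distance() helper used by unit_vector() is not part of this snippet.
// A minimal sketch consistent with its call signature (Euclidean distance of
// eight 2-D point pairs at a time) follows; the body is an assumption, not
// the original implementation.
#include <immintrin.h>

__m256 distance(const __m256& x1, const __m256& y1, const __m256& x2, const __m256& y2)
{
    const __m256 dx = _mm256_sub_ps(x2, x1);
    const __m256 dy = _mm256_sub_ps(y2, y1);
    // sqrt(dx^2 + dy^2), one result per lane
    return _mm256_sqrt_ps(_mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_mul_ps(dy, dy)));
}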
double compute_pi_leibniz_avx_opt_single(size_t n)
{
    double pi = 0.0;
    register __m256 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
    register __m256 ymm9, ymm10, ymm11, ymm12, ymm13;

    /* Alternating signs and the first 32 odd denominators of the Leibniz
     * series, spread over four independent accumulators. */
    ymm0 = _mm256_set_ps(1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f);
    ymm1 = _mm256_set_ps(1.0f, 3.0f, 5.0f, 7.0f, 9.0f, 11.0f, 13.0f, 15.0f);
    ymm2 = _mm256_set_ps(17.0f, 19.0f, 21.0f, 23.0f, 25.0f, 27.0f, 29.0f, 31.0f);
    ymm3 = _mm256_set_ps(33.0f, 35.0f, 37.0f, 39.0f, 41.0f, 43.0f, 45.0f, 47.0f);
    ymm4 = _mm256_set_ps(49.0f, 51.0f, 53.0f, 55.0f, 57.0f, 59.0f, 61.0f, 63.0f);
    ymm13 = _mm256_set1_ps(64.0f);

    ymm5 = _mm256_setzero_ps();
    ymm6 = _mm256_setzero_ps();
    ymm7 = _mm256_setzero_ps();
    ymm8 = _mm256_setzero_ps();

    /* Each iteration consumes 32 terms; the condition i + 32 <= n avoids the
     * unsigned wrap-around that `i <= n - 32` causes when n < 32. */
    for (size_t i = 0; i + 32 <= n; i += 32) {
        ymm9 = _mm256_div_ps(ymm0, ymm1);
        ymm1 = _mm256_add_ps(ymm1, ymm13);
        ymm10 = _mm256_div_ps(ymm0, ymm2);
        ymm2 = _mm256_add_ps(ymm2, ymm13);
        ymm11 = _mm256_div_ps(ymm0, ymm3);
        ymm3 = _mm256_add_ps(ymm3, ymm13);
        ymm12 = _mm256_div_ps(ymm0, ymm4);
        ymm4 = _mm256_add_ps(ymm4, ymm13);

        ymm5 = _mm256_add_ps(ymm5, ymm9);
        ymm6 = _mm256_add_ps(ymm6, ymm10);
        ymm7 = _mm256_add_ps(ymm7, ymm11);
        ymm8 = _mm256_add_ps(ymm8, ymm12);
    }

    float tmp[8] __attribute__((aligned(32)));

    _mm256_store_ps(tmp, ymm5);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
    _mm256_store_ps(tmp, ymm6);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
    _mm256_store_ps(tmp, ymm7);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];
    _mm256_store_ps(tmp, ymm8);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];

    return pi * 4.0;
}
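// A plain scalar Leibniz sum is handy for validating the vectorized
// accumulation above; this is an illustrative sketch, not part of the
// original source.
#include <stddef.h>

double compute_pi_leibniz_scalar(size_t n)
{
    double pi = 0.0;
    for (size_t i = 0; i < n; i++) {
        const double sign = (i % 2 == 0) ? 1.0 : -1.0;
        pi += sign / (2.0 * i + 1.0);   /* 1 - 1/3 + 1/5 - ... = pi/4 */
    }
    return pi * 4.0;
}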
static void quantize_block(const float *in_data, float *out_data, float *quant_tbl)
{
    int zigzag;
    __m256 result, dct_values, quant_values;
    __m256 factor = _mm256_set1_ps(0.25f);

    for (zigzag = 0; zigzag < 64; zigzag += 8) {
        // Set the dct_values for the current iteration
        dct_values = _mm256_set_ps(in_data[UV_indexes[zigzag + 7]], in_data[UV_indexes[zigzag + 6]],
                                   in_data[UV_indexes[zigzag + 5]], in_data[UV_indexes[zigzag + 4]],
                                   in_data[UV_indexes[zigzag + 3]], in_data[UV_indexes[zigzag + 2]],
                                   in_data[UV_indexes[zigzag + 1]], in_data[UV_indexes[zigzag]]);

        // Multiply with 0.25 to divide by 4.0
        result = _mm256_mul_ps(dct_values, factor);

        // Load quant-values and divide the previous product by them
        quant_values = _mm256_load_ps(quant_tbl + zigzag);
        result = _mm256_div_ps(result, quant_values);

        // Round off values and store in out_data buffer
        result = c63_mm256_roundhalfawayfromzero_ps(result);
        _mm256_store_ps(out_data + zigzag, result);
    }
}
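// c63_mm256_roundhalfawayfromzero_ps() is defined elsewhere in the codec.
// One plausible AVX implementation of round-half-away-from-zero, offered
// only as a sketch, ORs the sign of x onto 0.5, adds, then truncates:
#include <immintrin.h>

static inline __m256 c63_mm256_roundhalfawayfromzero_ps_sketch(__m256 x)
{
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);   /* 0x80000000 per lane */
    const __m256 half = _mm256_set1_ps(0.5f);
    const __m256 signed_half = _mm256_or_ps(_mm256_and_ps(x, sign_mask), half);
    /* e.g. 2.5 -> 3.0, -2.5 -> -3.0, 2.4 -> 2.0 */
    return _mm256_round_ps(_mm256_add_ps(x, signed_half),
                           _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}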
void THFloatVector_divs_AVX(float *y, const float *x, const float c, const ptrdiff_t n)
{
    ptrdiff_t i;
    __m256 YMM15 = _mm256_set_ps(c, c, c, c, c, c, c, c);
    __m256 YMM0, YMM1;
    for (i = 0; i <= ((n) - 16); i += 16) {
        YMM0 = _mm256_loadu_ps(x + i);
        YMM1 = _mm256_loadu_ps(x + i + 8);
        YMM0 = _mm256_div_ps(YMM0, YMM15);
        YMM1 = _mm256_div_ps(YMM1, YMM15);
        _mm256_storeu_ps(y + i, YMM0);
        _mm256_storeu_ps(y + i + 8, YMM1);
    }
    for (; i < (n); i++) {
        y[i] = x[i] / c;
    }
}
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ,
                                              float *accI)
{
    __m256 pix = _mm256_set_ps(posI[7].x, posI[6].x, posI[5].x, posI[4].x,
                               posI[3].x, posI[2].x, posI[1].x, posI[0].x);
    __m256 piy = _mm256_set_ps(posI[7].y, posI[6].y, posI[5].y, posI[4].y,
                               posI[3].y, posI[2].y, posI[1].y, posI[0].y);
    __m256 piz = _mm256_set_ps(posI[7].z, posI[6].z, posI[5].z, posI[4].z,
                               posI[3].z, posI[2].z, posI[1].z, posI[0].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    _mm256_store_ps(accI, aix);
    _mm256_store_ps(accI + 8, aiy);
    _mm256_store_ps(accI + 16, aiz);
}
void NBodyAlgorithmCPU::calculateAcceleration(const float3(&posI)[8], const float massJ, const float3 posJ,
                                              float3(&accI)[8])
{
    __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x,
                               posI[4].x, posI[5].x, posI[6].x, posI[7].x);
    __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y,
                               posI[4].y, posI[5].y, posI[6].y, posI[7].y);
    __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z,
                               posI[4].z, posI[5].z, posI[6].z, posI[7].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    // posI was loaded in reverse order (posI[0] in the high lane), so lane i
    // maps back to accI[7 - i].
    for (int i = 0; i < 8; i++) {
        accI[7 - i].x = aix.m256_f32[i];
        accI[7 - i].y = aiy.m256_f32[i];
        accI[7 - i].z = aiz.m256_f32[i];
    }
}
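// Both overloads above pay for a full-precision _mm256_sqrt_ps followed by a
// _mm256_div_ps. A common alternative, sketched here under the assumption
// that ~12-bit precision is acceptable (or is refined with one Newton-Raphson
// step), derives m/|r|^3 from the fast reciprocal square root instead:
#include <immintrin.h>

static inline __m256 inv_r_cubed(__m256 r2, __m256 m)  /* r2 already includes EPS2 */
{
    const __m256 rinv  = _mm256_rsqrt_ps(r2);              /* ~ 1/|r|   */
    const __m256 rinv2 = _mm256_mul_ps(rinv, rinv);        /* ~ 1/|r|^2 */
    return _mm256_mul_ps(m, _mm256_mul_ps(rinv2, rinv));   /* ~ m/|r|^3 */
}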
void THFloatVector_cdiv_AVX(float *z, const float *x, const float *y, const ptrdiff_t n)
{
    ptrdiff_t i;
    __m256 YMM0, YMM1, YMM2, YMM3;
    for (i = 0; i <= ((n) - 16); i += 16) {
        YMM0 = _mm256_loadu_ps(x + i);
        YMM1 = _mm256_loadu_ps(x + i + 8);
        YMM2 = _mm256_loadu_ps(y + i);
        YMM3 = _mm256_loadu_ps(y + i + 8);
        YMM2 = _mm256_div_ps(YMM0, YMM2);
        YMM3 = _mm256_div_ps(YMM1, YMM3);
        _mm256_storeu_ps(z + i, YMM2);
        _mm256_storeu_ps(z + i + 8, YMM3);
    }
    for (; i < (n); i++) {
        z[i] = x[i] / y[i];
    }
}
void nv_vector_inv(nv_matrix_t *a, int am, const nv_matrix_t *x, int xm)
{
    NV_ASSERT(a->n >= x->n);

#if NV_ENABLE_AVX
    {
        __m256 xx, vv;
        int n;
        int pk_lp = (x->n & 0xfffffff8);

        vv = _mm256_set1_ps(1.0f);
        for (n = 0; n < pk_lp; n += 8) {
            xx = _mm256_load_ps(&NV_MAT_V(x, xm, n));
            xx = _mm256_div_ps(vv, xx);
            _mm256_store_ps(&NV_MAT_V(a, am, n), xx);
        }
        for (n = pk_lp; n < x->n; ++n) {
            NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
        }
    }
#elif NV_ENABLE_SSE2
    {
        __m128 xx, vv;
        int n;
        int pk_lp = (x->n & 0xfffffffc);

        vv = _mm_set1_ps(1.0f);
        for (n = 0; n < pk_lp; n += 4) {
            xx = _mm_load_ps(&NV_MAT_V(x, xm, n));
            xx = _mm_div_ps(vv, xx);
            _mm_store_ps(&NV_MAT_V(a, am, n), xx);
        }
        for (n = pk_lp; n < x->n; ++n) {
            NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
        }
    }
#else
    {
        int n;
        for (n = 0; n < x->n; ++n) {
            NV_MAT_V(a, am, n) = 1.0f / NV_MAT_V(x, xm, n);
        }
    }
#endif
}
void static avx_test (void)
{
  int i;
  union256 u, s1, s2;
  float e[8];

  s1.x = _mm256_set_ps (24.43, 68.346, 43.35, 546.46, 46.79, 82.78, 82.7, 9.4);
  s2.x = _mm256_set_ps (1.17, 2.16, 3.15, 4.14, 5.13, 6.12, 7.11, 8.9);
  u.x = _mm256_div_ps (s1.x, s2.x);

  for (i = 0; i < 8; i++)
    e[i] = s1.a[i] / s2.a[i];

  if (check_union256 (u, e))
    abort ();
}
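// This follows the GCC AVX testsuite pattern, where union256 and
// check_union256 come from shared test headers. A minimal stand-in that
// matches the usage above might be (illustrative; not the real headers):
#include <immintrin.h>
#include <stdlib.h>

typedef union {
  __m256 x;
  float a[8];
} union256;

static int check_union256 (union256 u, const float *e)
{
  int i;
  for (i = 0; i < 8; i++)
    if (u.a[i] != e[i])
      return 1;  /* nonzero signals a mismatch, which aborts the test */
  return 0;
}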
void NBodyAlgorithmCPU::calculateAccelerationWithColor(const float3(&posI)[8], const float massJ, const float3 posJ,
                                                       float3(&accI)[8], unsigned int(&numNeighbours)[8])
{
    __m256 pix = _mm256_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x,
                               posI[4].x, posI[5].x, posI[6].x, posI[7].x);
    __m256 piy = _mm256_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y,
                               posI[4].y, posI[5].y, posI[6].y, posI[7].y);
    __m256 piz = _mm256_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z,
                               posI[4].z, posI[5].z, posI[6].z, posI[7].z);

    __m256 pjx = _mm256_set1_ps(posJ.x);
    __m256 pjy = _mm256_set1_ps(posJ.y);
    __m256 pjz = _mm256_set1_ps(posJ.z);

    __m256 rx = _mm256_sub_ps(pjx, pix);
    __m256 ry = _mm256_sub_ps(pjy, piy);
    __m256 rz = _mm256_sub_ps(pjz, piz);

    __m256 eps2 = _mm256_set1_ps(mp_properties->EPS2);

    __m256 rx2 = _mm256_mul_ps(rx, rx);
    __m256 ry2 = _mm256_mul_ps(ry, ry);
    __m256 rz2 = _mm256_mul_ps(rz, rz);
    __m256 rabs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_add_ps(rx2, ry2), _mm256_add_ps(rz2, eps2)));

    __m256 cmpDistance = _mm256_set1_ps(float(mp_properties->positionScale));
    __m256 close = _mm256_cmp_ps(rabs, cmpDistance, 2);   // predicate 2 == _CMP_LE_OS: rabs <= cmpDistance

    for (int i = 0; i < 8; i++) {
        if (close.m256_f32[i] == 0) {   // lane is 0.0f when the pair is not close
            numNeighbours[7 - i] = 0;
        }
    }

    __m256 m = _mm256_set1_ps(massJ);
    __m256 rabsInv = _mm256_div_ps(m, _mm256_mul_ps(_mm256_mul_ps(rabs, rabs), rabs));

    __m256 aix = _mm256_mul_ps(rx, rabsInv);
    __m256 aiy = _mm256_mul_ps(ry, rabsInv);
    __m256 aiz = _mm256_mul_ps(rz, rabsInv);

    // posI was loaded in reverse order, so lane i maps back to accI[7 - i].
    for (int i = 0; i < 8; i++) {
        accI[7 - i].x = aix.m256_f32[i];
        accI[7 - i].y = aiy.m256_f32[i];
        accI[7 - i].z = aiz.m256_f32[i];
    }
}
div(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
    //lhs = [x1.real, x1.imag, x2.real, x2.imag, ...]
    //rhs = [y1.real, y1.imag, y2.real, y2.imag, ...]

    //ymm0 = [y1.real, y1.real, y2.real, y2.real, ...]
    __m256 ymm0 = _mm256_moveldup_ps(rhs.value);

    //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag, ...]
    __m256 ymm1 = _mm256_movehdup_ps(rhs.value);

    //ymm2 = [x1.imag, x1.real, x2.imag, x2.real, ...]
    __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);

    //ymm4 = [x.imag * y.imag, x.real * y.imag, ...]
    __m256 ymm4 = _mm256_mul_ps(ymm2, ymm1);

    //ymm5 = subadd((lhs * ymm0), ymm4)
#ifdef __FMA__
    __m256 ymm5 = _mm256_fmsubadd_ps(lhs.value, ymm0, ymm4);
#else
    __m256 t1 = _mm256_mul_ps(lhs.value, ymm0);
    __m256 t2 = _mm256_sub_ps(_mm256_set1_ps(0.0), ymm4);
    __m256 ymm5 = _mm256_addsub_ps(t1, t2);
#endif

    //ymm3 = [y.imag^2, y.imag^2, ...]
    __m256 ymm3 = _mm256_mul_ps(ymm1, ymm1);

    //ymm0 = (ymm0 * ymm0 + ymm3)
#ifdef __FMA__
    ymm0 = _mm256_fmadd_ps(ymm0, ymm0, ymm3);
#else
    __m256 t3 = _mm256_mul_ps(ymm0, ymm0);
    ymm0 = _mm256_add_ps(t3, ymm3);
#endif

    //result = ymm5 / ymm0
    return _mm256_div_ps(ymm5, ymm0);
}
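// The intrinsic sequence above evaluates the textbook quotient
// (a+bi)/(c+di) = ((ac+bd) + (bc-ad)i) / (c^2+d^2) for four interleaved
// complex floats at once. A scalar reference for one pair, useful for
// checking the lane layout (illustrative sketch only):
static inline void complex_div_scalar(float a, float b, float c, float d,
                                      float* re, float* im)
{
    const float denom = c * c + d * d;   /* |rhs|^2, the _mm256_div_ps divisor */
    *re = (a * c + b * d) / denom;       /* matches the fmsubadd even lanes */
    *im = (b * c - a * d) / denom;       /* matches the fmsubadd odd lanes  */
}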
void AVXAccelerator::forcesFor(std::size_t i, std::vector<XY>& forces) const
{
    const std::size_t objs = m_objects->size();

    // AVX operates on aligned packs of 8 floats.
    const std::size_t first_simd_idx = (i + 8) & (-8);
    const std::size_t last_simd_idx = objs & (-8);

    // pre AVX calculations (for elements before first_simd_idx)
    std::size_t j = i + 1;
    for (; j < std::min(first_simd_idx, objs); j++) {
        const XY force_vector = force(i, j);
        forces[i] += force_vector;
        forces[j] += -force_vector;
    }

    // AVX calculations (for elements between first_simd_idx and last_simd_idx)
    for (; j < last_simd_idx; j += 8) {
        const float G = 6.6732e-11;

        const float xi = m_objects->getX()[i];
        const __m256 x0 = {xi, xi, xi, xi, xi, xi, xi, xi};
        const __m256 x1234 = _mm256_load_ps(&m_objects->getX()[j]);

        const float yi = m_objects->getY()[i];
        const __m256 y0 = {yi, yi, yi, yi, yi, yi, yi, yi};
        const __m256 y1234 = _mm256_load_ps(&m_objects->getY()[j]);

        const float mi = m_objects->getMass()[i];
        const __m256 m0 = {mi, mi, mi, mi, mi, mi, mi, mi};
        const __m256 m1234 = _mm256_load_ps(&m_objects->getMass()[j]);

        const __m256 dist = utils::distance(x0, y0, x1234, y1234);
        const __m256 dist2 = _mm256_mul_ps(dist, dist);

        // Fg = G * m0 * m1234 / dist^2
        const __m256 vG = {G, G, G, G, G, G, G, G};
        const __m256 vG_m0 = _mm256_mul_ps(vG, m0);
        const __m256 m1234_dist2 = _mm256_div_ps(m1234, dist2);
        const __m256 Fg = _mm256_mul_ps(vG_m0, m1234_dist2);

        utils::vector force_vector = utils::unit_vector(x0, y0, x1234, y1234);
        force_vector.x = _mm256_mul_ps(force_vector.x, Fg);
        force_vector.y = _mm256_mul_ps(force_vector.y, Fg);

        for (int k = 0; k < 8; k++) {
            forces[i] += XY(force_vector.x[k], force_vector.y[k]);
            forces[j + k] += XY(-force_vector.x[k], -force_vector.y[k]);
        }
    }

    // post AVX calculations (for elements after last_simd_idx)
    for (; j < objs; j++) {
        const XY force_vector = force(i, j);
        forces[i] += force_vector;
        forces[j] += -force_vector;
    }
}
void molec_quadrant_neighbor_interaction_fma(molec_Quadrant_t q, molec_Quadrant_t q_n, float* Epot_)
{
#ifdef __AVX2__
    const __m256 sigLJ = _mm256_set1_ps(molec_parameter->sigLJ);
    const __m256 epsLJ = _mm256_set1_ps(molec_parameter->epsLJ);
    const __m256 Rcut2 = _mm256_set1_ps(molec_parameter->Rcut2);

    const int N = q.N;
    const int N_n = q_n.N_pad;

    __m256 Epot8 = _mm256_setzero_ps();
    __m256 _1 = _mm256_set1_ps(1.f);
    __m256 _2 = _mm256_set1_ps(2.f);
    __m256 _24epsLJ = _mm256_mul_ps(_mm256_set1_ps(24.f), epsLJ);

    for(int i = 0; i < N; ++i)
    {
        const __m256 xi = _mm256_set1_ps(q.x[i]);
        const __m256 yi = _mm256_set1_ps(q.y[i]);
        const __m256 zi = _mm256_set1_ps(q.z[i]);

        __m256 f_xi = _mm256_setzero_ps();
        __m256 f_yi = _mm256_setzero_ps();
        __m256 f_zi = _mm256_setzero_ps();

        for(int j = 0; j < N_n; j += 8)
        {
            // count number of interactions
            if(MOLEC_CELLLIST_COUNT_INTERACTION)
                ++num_potential_interactions;

            // load coordinates and forces into AVX vectors
            const __m256 xj = _mm256_load_ps(&q_n.x[j]);
            const __m256 yj = _mm256_load_ps(&q_n.y[j]);
            const __m256 zj = _mm256_load_ps(&q_n.z[j]);

            __m256 f_xj = _mm256_load_ps(&q_n.f_x[j]);
            __m256 f_yj = _mm256_load_ps(&q_n.f_y[j]);
            __m256 f_zj = _mm256_load_ps(&q_n.f_z[j]);

            // distance computation
            const __m256 xij = _mm256_sub_ps(xi, xj);
            const __m256 yij = _mm256_sub_ps(yi, yj);
            const __m256 zij = _mm256_sub_ps(zi, zj);

            const __m256 zij2 = _mm256_mul_ps(zij, zij);
            const __m256 r2 = _mm256_fmadd_ps(xij, xij, _mm256_fmadd_ps(yij, yij, zij2));

            // r2 < Rcut2
            const __m256 mask = _mm256_cmp_ps(r2, Rcut2, _CMP_LT_OQ);

            // if( any(r2 < Rcut2) )
            if(_mm256_movemask_ps(mask))
            {
                const __m256 r2inv = _mm256_div_ps(_1, r2);

                const __m256 s2 = _mm256_mul_ps(_mm256_mul_ps(sigLJ, sigLJ), r2inv);
                const __m256 s6 = _mm256_mul_ps(_mm256_mul_ps(s2, s2), s2);
                const __m256 s12 = _mm256_mul_ps(s6, s6);

                const __m256 s12_minus_s6 = _mm256_sub_ps(s12, s6);
                const __m256 two_s12_minus_s6 = _mm256_sub_ps(_mm256_mul_ps(_2, s12), s6);

                Epot8 = _mm256_add_ps(Epot8, _mm256_and_ps(s12_minus_s6, mask));

                const __m256 fr = _mm256_mul_ps(_mm256_mul_ps(_24epsLJ, r2inv), two_s12_minus_s6);
                const __m256 fr_mask = _mm256_and_ps(fr, mask);

                // update forces
                f_xi = _mm256_fmadd_ps(fr_mask, xij, f_xi);
                f_yi = _mm256_fmadd_ps(fr_mask, yij, f_yi);
                f_zi = _mm256_fmadd_ps(fr_mask, zij, f_zi);

                f_xj = _mm256_fnmadd_ps(fr_mask, xij, f_xj);
                f_yj = _mm256_fnmadd_ps(fr_mask, yij, f_yj);
                f_zj = _mm256_fnmadd_ps(fr_mask, zij, f_zj);

                // store back j-forces
                _mm256_store_ps(&q_n.f_x[j], f_xj);
                _mm256_store_ps(&q_n.f_y[j], f_yj);
                _mm256_store_ps(&q_n.f_z[j], f_zj);
            }
        }

        // update i-forces
        float MOLEC_ALIGNAS(32) f_array[8];
        _mm256_store_ps(f_array, f_xi);
        q.f_x[i] += f_array[0] + f_array[1] + f_array[2] + f_array[3]
                  + f_array[4] + f_array[5] + f_array[6] + f_array[7];
        _mm256_store_ps(f_array, f_yi);
        q.f_y[i] += f_array[0] + f_array[1] + f_array[2] + f_array[3]
                  + f_array[4] + f_array[5] + f_array[6] + f_array[7];
        _mm256_store_ps(f_array, f_zi);
        q.f_z[i] += f_array[0] + f_array[1] + f_array[2] + f_array[3]
                  + f_array[4] + f_array[5] + f_array[6] + f_array[7];
    }

    float MOLEC_ALIGNAS(32) E_pot_array[8];
    _mm256_store_ps(E_pot_array, Epot8);

    // perform reduction of potential energy
    *Epot_ += 4 * molec_parameter->epsLJ * (E_pot_array[0] + E_pot_array[1] + E_pot_array[2] + E_pot_array[3]
                                          + E_pot_array[4] + E_pot_array[5] + E_pot_array[6] + E_pot_array[7]);
#endif
}
void animate()
{
    float mx;
    float my;

    if(ManualControl) {
        POINT pos;
        GetCursorPos(&pos);
        RECT rc;
        GetClientRect(hMainWnd, &rc);
        ScreenToClient(hMainWnd, &pos);
        mx = pos.x;
        my = pos.y;
    } else {
        UpdatePosition(mx, my);
    }

    const auto size = partCount;
    VertexData *pVertexBuffer;
    pVertexObject->Lock(0, 0, (void**)&pVertexBuffer, D3DLOCK_DISCARD);
    _mm256_zeroall();

#pragma omp parallel \
    shared(pVertexBuffer, particlesCoord, particlesVel, mx, my, size)
    {
#pragma omp for nowait
        for(int i = 0; i < size; i += 4) {
            float mouseCoordVec[8] = { mx, my, mx, my, mx, my, mx, my };
            float *particleCoordsVec = (float*)particlesCoord + i;
            float *velocityVec = (float*)particlesVel + i;

            auto xyCoord = _mm256_loadu_ps(particleCoordsVec);
            auto hwTempData = _mm256_sub_ps(xyCoord, _mm256_loadu_ps(mouseCoordVec));
            auto squares = _mm256_mul_ps(hwTempData, hwTempData);
            auto distSquare = _mm256_hadd_ps(squares, squares);
            distSquare = _mm256_shuffle_ps(distSquare, distSquare, 0x50);
            auto theForce = _mm256_div_ps(_mm256_set1_ps(G), distSquare);

            if(distSquare.m256_f32[0] < 400) {
                theForce.m256_f32[0] = 0;
                theForce.m256_f32[1] = 0;
            }
            if(distSquare.m256_f32[2] < 400) {
                theForce.m256_f32[2] = 0;
                theForce.m256_f32[3] = 0;
            }
            if(distSquare.m256_f32[4] < 400) {
                theForce.m256_f32[4] = 0;
                theForce.m256_f32[5] = 0;
            }
            if(distSquare.m256_f32[6] < 400) {
                theForce.m256_f32[6] = 0;
                theForce.m256_f32[7] = 0;
            }

            auto xyForces = _mm256_mul_ps(_mm256_xor_ps(hwTempData, _mm256_set1_ps(-0.f)), theForce);
            auto xyVelocities = _mm256_loadu_ps(velocityVec);
            xyVelocities = _mm256_mul_ps(xyVelocities, _mm256_set1_ps(Resistance));
            xyVelocities = _mm256_add_ps(xyVelocities, xyForces);
            xyCoord = _mm256_add_ps(xyCoord, xyVelocities);

            _mm256_storeu_ps(velocityVec, xyVelocities);
            _mm256_storeu_ps(particleCoordsVec, xyCoord);

            processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[0], ((ParticleVel*)velocityVec)[0]);
            processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[1], ((ParticleVel*)velocityVec)[1]);
            processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[2], ((ParticleVel*)velocityVec)[2]);
            processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[3], ((ParticleVel*)velocityVec)[3]);

            pVertexBuffer[i].x = ((ParticleCoord*)particleCoordsVec)[0].x;
            pVertexBuffer[i].y = ((ParticleCoord*)particleCoordsVec)[0].y;
            pVertexBuffer[i + 1].x = ((ParticleCoord*)particleCoordsVec)[1].x;
            pVertexBuffer[i + 1].y = ((ParticleCoord*)particleCoordsVec)[1].y;
            pVertexBuffer[i + 2].x = ((ParticleCoord*)particleCoordsVec)[2].x;
            pVertexBuffer[i + 2].y = ((ParticleCoord*)particleCoordsVec)[2].y;
            pVertexBuffer[i + 3].x = ((ParticleCoord*)particleCoordsVec)[3].x;
            pVertexBuffer[i + 3].y = ((ParticleCoord*)particleCoordsVec)[3].y;
        }
    }

    pVertexObject->Unlock();
    _mm256_zeroall();
}
void Decoder::ADMMDecoder_deg_6_7_2_3_6()
{
    int maxIter = maxIteration;
    float mu = 5.5f;
    float tableau[12] = { 0.0f };

    if ((mBlocklength == 576) && (mNChecks == 288)) {
        mu = 3.37309f; // penalty
        tableau[2] = 0.00001f;
        tableau[3] = 2.00928f;
        tableau[6] = 4.69438f;
    } else if ((mBlocklength == 2304) && (mNChecks == 1152)) {
        mu = 3.81398683f; // penalty
        tableau[2] = 0.29669288f;
        tableau[3] = 0.46964023f;
        tableau[6] = 3.19548154f;
    } else {
        mu = 5.5f; // penalty
        tableau[2] = 0.8f;
        tableau[3] = 0.8f;
        tableau[6] = 0.8f;
    }

    const float rho = 1.9f; // over-relaxation parameter
    const float un_m_rho = 1.0 - rho;
    const auto _rho = _mm256_set1_ps( rho );
    const auto _un_m_rho = _mm256_set1_ps( un_m_rho );
    float tableaX[12];

    //
    // PRECOMPUTE THE CONSTANTS
    //
    #pragma unroll
    for (int i = 0; i < 7; i++) {
        tableaX[i] = tableau[i] / mu;
    }

    const auto t_mu    = _mm256_set1_ps( mu );
    const auto t2_amu  = _mm256_set1_ps( tableau[2] / mu );
    const auto t3_amu  = _mm256_set1_ps( tableau[3] / mu );
    const auto t6_amu  = _mm256_set1_ps( tableau[6] / mu );
    const auto t2_2amu = _mm256_set1_ps( 2.0f * tableau[2] / mu );
    const auto t3_2amu = _mm256_set1_ps( 2.0f * tableau[3] / mu );
    const auto t6_2amu = _mm256_set1_ps( 2.0f * tableau[6] / mu );
    const auto t2_deg  = _mm256_set1_ps( 2.0f );
    const auto t3_deg  = _mm256_set1_ps( 3.0f );
    const auto t6_deg  = _mm256_set1_ps( 6.0f );
    const auto zero    = _mm256_set1_ps( 0.0f );
    const auto un      = _mm256_set1_ps( 1.0f );
    const __m256 a     = _mm256_set1_ps( 0.0f );
    const __m256 b     = _mm256_set1_ps( 0.5f );

    //////////////////////////////////////////////////////////////////////////////////////
    #pragma unroll
    for( int j = 0; j < _mPCheckMapSize; j += 8 ) {
        _mm256_store_ps(&Lambda[j], a);
        _mm256_store_ps(&zReplica[j], b);
        _mm256_store_ps(&latestProjVector[j], b);
    }
    //////////////////////////////////////////////////////////////////////////////////////

    for(int i = 0; i < maxIter; i++)
    {
        int ptr = 0;
        mIteration = i + 1;

        //
        // MEASURE OF THE VN EXECUTION TIME
        //
        #ifdef PROFILE_ON
        const auto start = timer();
        #endif

        //
        // VN processing kernel
        //
        #pragma unroll
        for (int j = 0; j < _mBlocklength; j++)
        {
            const int degVn = VariableDegree[j];
            float M[8] __attribute__((aligned(64)));

            if( degVn == 2 ){
#if 1
                const int dVN = 2;
                for(int qq = 0; qq < 8; qq++)
                {
                    M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                    ptr += 1;
                    #pragma unroll
                    for(int k = 1; k < dVN; k++)
                    {
                        M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                        ptr += 1;
                    }
                }
                const auto m    = _mm256_loadu_ps( M );
                const auto llr  = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
                const auto t1   = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
                const auto xx   = _mm256_div_ps(_mm256_sub_ps(t1, t2_amu), _mm256_sub_ps(t2_deg, t2_2amu));
                const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un), zero);
                _mm256_storeu_ps(&OutputFromDecoder[j], vMin);
                j += 7;
#else
                const int degVN = 2;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                #pragma unroll
                for(int k = 1; k < degVN; k++)
                    temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
                ptr += degVN;
                const float _amu_   = tableaX[ degVN ];
                const float _2_amu_ = _amu_ + _amu_;
                const float llr  = _LogLikelihoodRatio[j];
                const float t    = temp - llr / mu;
                const float xx   = (t - _amu_) / (degVn - _2_amu_);
                const float vMax = std::min(xx, 1.0f);
                const float vMin = std::max(vMax, 0.0f);
                OutputFromDecoder[j] = vMin;
#endif
            }else if( degVn == 3 ){
#if 1
                const int dVN = 3;
                for(int qq = 0; qq < 8; qq++)
                {
                    M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                    ptr += 1;
                    #pragma unroll
                    for(int k = 1; k < dVN; k++)
                    {
                        M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                        ptr += 1;
                    }
                }
                const auto m    = _mm256_loadu_ps( M );
                const auto llr  = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
                const auto t1   = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
                const auto xx   = _mm256_div_ps(_mm256_sub_ps(t1, t3_amu), _mm256_sub_ps(t3_deg, t3_2amu));
                const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un), zero);
                _mm256_storeu_ps(&OutputFromDecoder[j], vMin);
                j += 7;
#else
                const int degVN = 3;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                #pragma unroll
                for(int k = 1; k < degVN; k++)
                    temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
                ptr += degVN;
                const float _amu_   = tableaX[ degVN ];
                const float _2_amu_ = _amu_ + _amu_;
                const float llr  = _LogLikelihoodRatio[j];
                const float t    = temp - llr / mu;
                const float xx   = (t - _amu_) / (degVn - _2_amu_);
                const float vMax = std::min(xx, 1.0f);
                const float vMin = std::max(vMax, 0.0f);
                OutputFromDecoder[j] = vMin;
#endif
            }else if( degVn == 6 ){
#if 1
                const int dVN = 6;
                for(int qq = 0; qq < 8; qq++)
                {
                    M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                    ptr += 1;
                    #pragma unroll
                    for(int k = 1; k < dVN; k++)
                    {
                        M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                        ptr += 1;
                    }
                }
                const auto m    = _mm256_loadu_ps( M );
                const auto llr  = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
                const auto t1   = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
                const auto xx   = _mm256_div_ps(_mm256_sub_ps(t1, t6_amu), _mm256_sub_ps(t6_deg, t6_2amu));
                const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un), zero);
                _mm256_storeu_ps(&OutputFromDecoder[j], vMin);
                j += 7;
#else
                const int degVN = 6;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                #pragma unroll
                for(int k = 1; k < degVN; k++)
                    temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
                ptr += degVN;
                const float _amu_   = tableaX[ degVN ];
                const float _2_amu_ = _amu_ + _amu_;
                const float llr  = _LogLikelihoodRatio[j];
                const float t    = temp - llr / mu;
                const float xx   = (t - _amu_) / (degVn - _2_amu_);
                const float vMax = std::min(xx, 1.0f);
                const float vMin = std::max(vMax, 0.0f);
                OutputFromDecoder[j] = vMin;
#endif
            }
        }

        //
        // MEASURE OF THE VN EXECUTION TIME
        //
        #ifdef PROFILE_ON
        t_vn += (timer() - start);
        #endif

        //
        // CN processing kernel
        //
        int CumSumCheckDegree = 0; // cumulative position of current edge in factor graph
        int allVerified = 0;
        float vector_before_proj[8] __attribute__((aligned(64)));

        const auto zero   = _mm256_set1_ps( 0.0f );
        const auto mask_6 = _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF,
                                             0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto mask_7 = _mm256_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
                                             0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto dot5   = _mm256_set1_ps( 0.5f );

        //
        // MEASURE OF THE CN EXECUTION TIME
        //
        #ifdef PROFILE_ON
        const auto starT = timer();
        #endif

        const auto seuilProj = _mm256_set1_ps( 1e-5f );
        for(int j = 0; j < _mNChecks; j++)
        {
            if( CheckDegree[j] == 6 ){
                const int cDeg6 = 0x3F;
                const auto offsets = _mm256_loadu_si256((const __m256i*)&t_col1[CumSumCheckDegree]);
                const auto xpred = _mm256_mask_i32gather_ps(zero, OutputFromDecoder, offsets,
                                                            _mm256_castsi256_ps(mask_6), 4);
                const auto synd = _mm256_cmp_ps( xpred, dot5, _CMP_GT_OS );
                int test = (_mm256_movemask_ps( synd ) & cDeg6); // deg 6
                const auto syndrom = _mm_popcnt_u32( test );
                const auto _Replica = _mm256_loadu_ps( &zReplica[CumSumCheckDegree] );
                const auto _ambda   = _mm256_loadu_ps( &Lambda[CumSumCheckDegree] );
                const auto v1 = _mm256_mul_ps( xpred, _rho );
                const auto v2 = _mm256_mul_ps( _Replica, _un_m_rho );
                const auto v3 = _mm256_add_ps( v1, v2 );
                const auto vect_proj = _mm256_sub_ps( v3, _ambda );

                //
                // PERFORM THE PROJECTION !!!
                //
                allVerified += ( syndrom & 0x01 );

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
                #ifdef PROFILE_ON
                const auto START = timer();
                #endif

                const auto latest    = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
                const auto different = _mm256_sub_ps( vect_proj, latest );
                const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
                const auto absolute  = _mm256_and_ps( different, maskAbsol );
                const auto despass   = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
                int skip = (_mm256_movemask_ps( despass ) & cDeg6) == 0x00; // degree 6

                if( skip == false )
                {
                    const auto _ztemp  = mp.projection_deg6( vect_proj );
                    const auto _ztemp1 = _mm256_sub_ps(_ztemp, xpred );
                    const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
                    const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
                    const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
                    const auto nLambda = _mm256_add_ps( _ambda, _ztemp3 );
                    const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
                    _mm256_maskstore_ps(&Lambda[CumSumCheckDegree], mask_6, mLambda);
                    _mm256_maskstore_ps(&zReplica[CumSumCheckDegree], mask_6, _ztemp);
                }
                _mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_6, vect_proj);

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
                #ifdef PROFILE_ON
                t_pj += (timer() - START);
                #endif

                CumSumCheckDegree += 6;

            }else if( CheckDegree[j] == 7 ){
                const int cDeg7 = 0x7F;
                const auto offsets = _mm256_loadu_si256((const __m256i*)&t_col1[CumSumCheckDegree]);
                const auto xpred = _mm256_mask_i32gather_ps(zero, OutputFromDecoder, offsets,
                                                            _mm256_castsi256_ps(mask_7), 4);
                const auto synd = _mm256_cmp_ps( xpred, dot5, _CMP_GT_OS );
                const int test = (_mm256_movemask_ps( synd ) & cDeg7); // deg 7
                const auto syndrom = _mm_popcnt_u32( test );
                const auto _Replica = _mm256_loadu_ps( &zReplica[CumSumCheckDegree] );
                const auto _ambda   = _mm256_loadu_ps( &Lambda[CumSumCheckDegree] );
                const auto v1 = _mm256_mul_ps( xpred, _rho );
                const auto v2 = _mm256_mul_ps( _Replica, _un_m_rho );
                const auto v3 = _mm256_add_ps( v1, v2 );
                const auto vect_proj = _mm256_sub_ps( v3, _ambda );

                //
                // PERFORM THE PROJECTION !!!
                //
                allVerified += ( syndrom & 0x01 );

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
                #ifdef PROFILE_ON
                const auto START = timer();
                #endif

                const auto latest    = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
                const auto different = _mm256_sub_ps( vect_proj, latest );
                const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
                const auto absolute  = _mm256_and_ps( different, maskAbsol );
                const auto despass   = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
                int skip = (_mm256_movemask_ps( despass ) & cDeg7) == 0x00; // degree 7

                if( skip == false )
                {
                    const auto _ztemp  = mp.projection_deg7( vect_proj );
                    const auto _ztemp1 = _mm256_sub_ps(_ztemp, xpred );
                    const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
                    const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
                    const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
                    const auto nLambda = _mm256_add_ps( _ambda, _ztemp3 );
                    const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
                    _mm256_maskstore_ps(&Lambda[CumSumCheckDegree], mask_7, mLambda);
                    _mm256_maskstore_ps(&zReplica[CumSumCheckDegree], mask_7, _ztemp);
                }
                _mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_7, vect_proj);

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
                #ifdef PROFILE_ON
                t_pj += (timer() - START);
                #endif

                CumSumCheckDegree += 7;
            }else{
                exit( 0 );
            }
        }

        //
        // MEASURE OF THE CN LOOP EXECUTION TIME
        //
        #ifdef PROFILE_ON
        t_cn += (timer() - starT);
        #endif

        #ifdef PROFILE_ON
        t_ex += 1;
        //FILE *ft=fopen("time.txt","a");
        //fprintf(ft,"%d \n", t_cn/t_ex);
        //fprintf(ft,"%d %d %d \n", t_cn, t_vn, t_pj);
        //fclose(ft);
        #endif

        if(allVerified == 0)
        {
            mAlgorithmConverge = true;
            mValidCodeword = true;
            break;
        }
    }

    //
    // MEASURE OF THE NUMBER OF EXECUTIONS
    //
    // #ifdef PROFILE_ON
    //     t_ex += 1;
    // #endif
}
inline vec8 operator/(vec8 a, vec8 b) { return _mm256_div_ps(a, b); }
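// The vec8 wrapper above is assumed to convert implicitly to and from __m256
// for the operator to compile; a hypothetical minimal definition:
#include <immintrin.h>

struct vec8 {
    __m256 v;
    vec8(__m256 x) : v(x) {}                /* implicit from __m256 */
    operator __m256() const { return v; }   /* implicit to __m256   */
};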
/*!
 * \brief Divide the two given vectors
 */
ETL_STATIC_INLINE(avx_simd_float) div(avx_simd_float lhs, avx_simd_float rhs) {
    return _mm256_div_ps(lhs.value, rhs.value);
}
void run_softmax_int32_float_work_item_batch8x(nn_workload_item *const work_item, uint16_t NoBatch8)
{
    nn_workload_data_t *input_view = work_item->input[0]->output;
    const auto &arguments = work_item->arguments.forward_softmax_fixedpoint;

    const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];
    const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1;
    const auto batch_size_global = work_item->output->parent->lengths.t[NN_DATA_COORD_n];
    const auto batch_size = 8;

    const auto num_full_blocks = output_width / C_max_acc;
    const auto partial_block_size = output_width % C_max_acc;

    const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x] * batch_size * NoBatch8;
    const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p] * batch_size * NoBatch8;

    const auto out_fraction = arguments.input_fraction;

    float * input_f = (float*)_mm_malloc(input_width * batch_size * sizeof(float), 64);
    float * output_f = (float*)_mm_malloc(output_width * batch_size * sizeof(float), 64);

    auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start + NoBatch8 * 8 * input_width];
    //auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start];

    auto shift = out_fraction;
    if (shift > 0) {
        for (uint32_t i = 0; i < input_width * batch_size; i++)
            input_f[i] = (float)(input_buffer[i]) / (1 << shift);
    } else if (shift < 0) {
        for (uint32_t i = 0; i < input_width * batch_size; i++)
            input_f[i] = (float)(input_buffer[i]) * (1 << -shift);
    } else {
        for (uint32_t i = 0; i < input_width * batch_size; i++)
            input_f[i] = (float)(input_buffer[i]);
    }

    __m256 acc_sum = _mm256_setzero_ps();

    {
        auto input_buffer = input_f;
        //auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start + NoBatch8 * 8 * output_width];
        auto output_buffer = output_f;

        for (auto block = 0u; block < num_full_blocks; ++block) {
            // Run computation.
            softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum);
        }

        switch (partial_block_size) {
        case  0: break;
        case  1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break;
        case  2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break;
        case  3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break;
        case  4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break;
        case  5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break;
        case  6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break;
        case  7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break;
        case  8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break;
        case  9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break;
        case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break;
        case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break;
        case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break;
        case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break;
        case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break;
        default: NN_UNREACHABLE_CODE;
        }
    }

    acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum);

    {
        //auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start + NoBatch8 * 8 * output_width];
        auto output_buffer = output_f;

        for (auto block = 0u; block < num_full_blocks; ++block) {
            // Run computation.
            softmax_finalize_block<C_max_acc>(output_buffer, acc_sum);
        }

        switch (partial_block_size) {
        case  0: break;
        case  1: softmax_finalize_block< 1>(output_buffer, acc_sum); break;
        case  2: softmax_finalize_block< 2>(output_buffer, acc_sum); break;
        case  3: softmax_finalize_block< 3>(output_buffer, acc_sum); break;
        case  4: softmax_finalize_block< 4>(output_buffer, acc_sum); break;
        case  5: softmax_finalize_block< 5>(output_buffer, acc_sum); break;
        case  6: softmax_finalize_block< 6>(output_buffer, acc_sum); break;
        case  7: softmax_finalize_block< 7>(output_buffer, acc_sum); break;
        case  8: softmax_finalize_block< 8>(output_buffer, acc_sum); break;
        case  9: softmax_finalize_block< 9>(output_buffer, acc_sum); break;
        case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break;
        case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break;
        case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break;
        case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break;
        case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break;
        default: NN_UNREACHABLE_CODE;
        }
    }

    auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];
    for (auto itrW = 0; itrW < output_width; itrW++)
        for (auto itr8 = 0; itr8 < C_batch_size; itr8++)
            output_buffer[itr8 + itrW * batch_size_global + NoBatch8 * C_batch_size] = output_f[itr8 + itrW * C_batch_size];

    _mm_free(input_f);
    _mm_free(output_f);
}
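// softmax_compute_block and softmax_finalize_block are template helpers
// defined elsewhere. The two-pass structure they drive is the standard one,
// sketched here in scalar form (hypothetical helper, illustrative only):
#include <cmath>
#include <cstddef>

void softmax_scalar(const float* in, float* out, std::size_t n)
{
    float sum = 0.0f;
    for (std::size_t i = 0; i < n; i++) {   // compute pass: exponentiate, accumulate
        out[i] = std::exp(in[i]);
        sum += out[i];
    }
    const float inv_sum = 1.0f / sum;       // the _mm256_div_ps reciprocal above
    for (std::size_t i = 0; i < n; i++)     // finalize pass: scale by 1/sum
        out[i] *= inv_sum;
}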
void run_dct(int width, int height, float *quant, float *input, int32_t *output)
{
    float acosvals[8][8];

    /* Calculating cosines is expensive, and there
     * are only 64 cosines that need to be calculated
     * so precompute them and cache. */
    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++) {
            if (j == 0) {
                acosvals[i][j] = sqrt(1.0 / 8.0) * cos(PI / 8.0 * (i + 0.5) * j);
            } else {
                acosvals[i][j] = 0.5 * cos(PI / 8.0 * (i + 0.5) * j);
            }
        }
    }

    /* Separate the parallel from the for, so each processor gets its
     * own copy of the buffers and variables. */
#pragma omp parallel
    {
        float avload[8] = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
        avload[0] = sqrt(1.0 / 8.0);
        __m256 row0, row1, row2, row3, row4, row5, row6, row7;
        __m256 loaderlow, loaderhigh;
        __m256 temp;
        __m256 minus128 = _mm256_set1_ps(-128.0);
        __m256 avxcosloader, avxcos;
        float avxcosmover;
        __m256i integer;

        /* The DCT breaks the image into 8 by 8 blocks and then
         * transforms them into color frequencies. */
#pragma omp for
        for (int brow = 0; brow < height / 8; brow++) {
            for (int bcol = 0; bcol < width / 8; bcol++) {
                int head_pointer = bcol * 8 + brow * 8 * width;
                row0 = _mm256_setzero_ps();
                row1 = _mm256_setzero_ps();
                row2 = _mm256_setzero_ps();
                row3 = _mm256_setzero_ps();
                row4 = _mm256_setzero_ps();
                row5 = _mm256_setzero_ps();
                row6 = _mm256_setzero_ps();
                row7 = _mm256_setzero_ps();

                /* This pair of loops uses AVX instructions to add the frequency
                 * component from each pixel to all of the buckets at once. Allows
                 * us to do the DCT on a block in 64 iterations of a loop rather
                 * than 64 iterations of 64 iterations of a loop (all 64 pixels affect
                 * all 64 frequencies) */
                for (int x = 0; x < 8; x++) {
                    for (int y = 0; y < 4; y++) {
                        loaderlow = _mm256_broadcast_ss(&input[head_pointer + x + (y * width)]);
                        loaderlow = _mm256_add_ps(loaderlow, minus128);
                        loaderhigh = _mm256_broadcast_ss(&input[head_pointer + x + ((7 - y) * width)]);
                        loaderhigh = _mm256_add_ps(loaderhigh, minus128);

                        avxcos = _mm256_loadu_ps(&acosvals[x][0]);
                        loaderlow = _mm256_mul_ps(loaderlow, avxcos);
                        loaderhigh = _mm256_mul_ps(loaderhigh, avxcos);

                        avxcosloader = _mm256_loadu_ps(&acosvals[y][0]);

                        avxcosmover = avxcosloader[0];
                        avxcos = _mm256_set1_ps(avxcosmover);
                        temp = _mm256_mul_ps(loaderlow, avxcos);
                        row0 = _mm256_add_ps(row0, temp);
                        temp = _mm256_mul_ps(loaderhigh, avxcos);
                        row0 = _mm256_add_ps(row0, temp);

                        avxcosmover = avxcosloader[1];
                        avxcos = _mm256_set1_ps(avxcosmover);
                        temp = _mm256_mul_ps(loaderlow, avxcos);
                        row1 = _mm256_add_ps(row1, temp);
                        temp = _mm256_mul_ps(loaderhigh, avxcos);
                        row1 = _mm256_sub_ps(row1, temp);

                        avxcosmover = avxcosloader[2];
                        avxcos = _mm256_set1_ps(avxcosmover);
                        temp = _mm256_mul_ps(loaderlow, avxcos);
                        row2 = _mm256_add_ps(row2, temp);
                        temp = _mm256_mul_ps(loaderhigh, avxcos);
                        row2 = _mm256_add_ps(row2, temp);

                        avxcosmover = avxcosloader[3];
                        avxcos = _mm256_set1_ps(avxcosmover);
                        temp = _mm256_mul_ps(loaderlow, avxcos);
                        row3 = _mm256_add_ps(row3, temp);
                        temp = _mm256_mul_ps(loaderhigh, avxcos);
                        row3 = _mm256_sub_ps(row3, temp);

                        avxcosmover = avxcosloader[4];
                        avxcos = _mm256_set1_ps(avxcosmover);
                        temp = _mm256_mul_ps(loaderlow, avxcos);
                        row4 = _mm256_add_ps(row4, temp);
                        temp = _mm256_mul_ps(loaderhigh, avxcos);
                        row4 = _mm256_add_ps(row4, temp);

                        avxcosmover = avxcosloader[5];
                        avxcos = _mm256_set1_ps(avxcosmover);
                        temp = _mm256_mul_ps(loaderlow, avxcos);
                        row5 = _mm256_add_ps(row5, temp);
                        temp = _mm256_mul_ps(loaderhigh, avxcos);
                        row5 = _mm256_sub_ps(row5, temp);

                        avxcosmover = avxcosloader[6];
                        avxcos = _mm256_set1_ps(avxcosmover);
                        temp = _mm256_mul_ps(loaderlow, avxcos);
                        row6 = _mm256_add_ps(row6, temp);
                        temp = _mm256_mul_ps(loaderhigh, avxcos);
                        row6 = _mm256_add_ps(row6, temp);

                        avxcosmover = avxcosloader[7];
                        avxcos = _mm256_set1_ps(avxcosmover);
                        temp = _mm256_mul_ps(loaderlow, avxcos);
                        row7 = _mm256_add_ps(row7, temp);
                        temp = _mm256_mul_ps(loaderhigh, avxcos);
                        row7 = _mm256_sub_ps(row7, temp);
                    }
                }

                /* Each frequency stored as a float needs to be divided by
                 * the quantization value, then rounded to the nearest integer.
                 * Also changes the order of the values from pixel order to
                 * each 8 by 8 block stored one after another. */
                temp = _mm256_loadu_ps(&quant[0]);
                row0 = _mm256_div_ps(row0, temp);
                row0 = _mm256_round_ps(row0, _MM_FROUND_TO_NEAREST_INT);
                integer = _mm256_cvttps_epi32(row0);
                _mm256_storeu_si256((__m256i *)(output + (bcol + brow * (width / 8)) * 64), integer);

                temp = _mm256_loadu_ps(&quant[8]);
                row1 = _mm256_div_ps(row1, temp);
                row1 = _mm256_round_ps(row1, _MM_FROUND_TO_NEAREST_INT);
                integer = _mm256_cvttps_epi32(row1);
                _mm256_storeu_si256((__m256i *)(output + 8 + (bcol + brow * (width / 8)) * 64), integer);

                temp = _mm256_loadu_ps(&quant[16]);
                row2 = _mm256_div_ps(row2, temp);
                row2 = _mm256_round_ps(row2, _MM_FROUND_TO_NEAREST_INT);
                integer = _mm256_cvttps_epi32(row2);
                _mm256_storeu_si256((__m256i *)(output + 16 + (bcol + brow * (width / 8)) * 64), integer);

                temp = _mm256_loadu_ps(&quant[24]);
                row3 = _mm256_div_ps(row3, temp);
                row3 = _mm256_round_ps(row3, _MM_FROUND_TO_NEAREST_INT);
                integer = _mm256_cvttps_epi32(row3);
                _mm256_storeu_si256((__m256i *)(output + 24 + (bcol + brow * (width / 8)) * 64), integer);

                temp = _mm256_loadu_ps(&quant[32]);
                row4 = _mm256_div_ps(row4, temp);
                row4 = _mm256_round_ps(row4, _MM_FROUND_TO_NEAREST_INT);
                integer = _mm256_cvttps_epi32(row4);
                _mm256_storeu_si256((__m256i *)(output + 32 + (bcol + brow * (width / 8)) * 64), integer);

                temp = _mm256_loadu_ps(&quant[40]);
                row5 = _mm256_div_ps(row5, temp);
                row5 = _mm256_round_ps(row5, _MM_FROUND_TO_NEAREST_INT);
                integer = _mm256_cvttps_epi32(row5);
                _mm256_storeu_si256((__m256i *)(output + 40 + (bcol + brow * (width / 8)) * 64), integer);

                temp = _mm256_loadu_ps(&quant[48]);
                row6 = _mm256_div_ps(row6, temp);
                row6 = _mm256_round_ps(row6, _MM_FROUND_TO_NEAREST_INT);
                integer = _mm256_cvttps_epi32(row6);
                _mm256_storeu_si256((__m256i *)(output + 48 + (bcol + brow * (width / 8)) * 64), integer);

                temp = _mm256_loadu_ps(&quant[56]);
                row7 = _mm256_div_ps(row7, temp);
                row7 = _mm256_round_ps(row7, _MM_FROUND_TO_NEAREST_INT);
                integer = _mm256_cvttps_epi32(row7);
                _mm256_storeu_si256((__m256i *)(output + 56 + (bcol + brow * (width / 8)) * 64), integer);
            }
        }
    }
}
void run_softmax_int32_float_work_item_latency(nn_workload_item *const work_item)
{
    nn_workload_data_t *input_view = work_item->input[0]->output;
    const auto &arguments = work_item->arguments.forward_softmax_fixedpoint;

    const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];
    const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1;

    const auto num_full_blocks = output_width / C_data_stride;
    const auto partial_block_size = (output_width / C_simd_width) % C_max_acc;
    const auto subsimd_block_size = output_width % C_simd_width;

    const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x];
    const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];

    const auto out_fraction = arguments.input_fraction;

    float * input_f = (float*)_mm_malloc(input_width * sizeof(float), 64);

    auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start];

    auto shift = out_fraction;
    if (shift > 0) {
        for (uint32_t i = 0; i < input_width; i++)
            input_f[i] = (float)(input_buffer[i]) / (1 << shift);
    } else if (shift < 0) {
        for (uint32_t i = 0; i < input_width; i++)
            input_f[i] = (float)(input_buffer[i]) * (1 << -shift);
    } else {
        for (uint32_t i = 0; i < input_width; i++)
            input_f[i] = (float)(input_buffer[i]);
    }

    __m256 acc_sum = _mm256_setzero_ps();
    float subsimd_sum = 0.0f;

    {
        auto input_buffer = input_f;
        auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

        for (auto block = 0u; block < num_full_blocks; ++block) {
            // Run computation.
            softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum);
        }

        switch (partial_block_size) {
        case  0: break;
        case  1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break;
        case  2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break;
        case  3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break;
        case  4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break;
        case  5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break;
        case  6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break;
        case  7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break;
        case  8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break;
        case  9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break;
        case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break;
        case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break;
        case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break;
        case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break;
        case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break;
        default: NN_UNREACHABLE_CODE;
        }

        switch (subsimd_block_size) {
        case 0: break;
        case 1: softmax_compute_subsimd<1>(input_buffer, output_buffer, subsimd_sum); break;
        case 2: softmax_compute_subsimd<2>(input_buffer, output_buffer, subsimd_sum); break;
        case 3: softmax_compute_subsimd<3>(input_buffer, output_buffer, subsimd_sum); break;
        case 4: softmax_compute_subsimd<4>(input_buffer, output_buffer, subsimd_sum); break;
        case 5: softmax_compute_subsimd<5>(input_buffer, output_buffer, subsimd_sum); break;
        case 6: softmax_compute_subsimd<6>(input_buffer, output_buffer, subsimd_sum); break;
        case 7: softmax_compute_subsimd<7>(input_buffer, output_buffer, subsimd_sum); break;
        default: NN_UNREACHABLE_CODE;
        }
    }

    {
        // Horizontal reduction of the 8-lane accumulator, then fold in the
        // scalar tail sum before taking reciprocals.
        __m256 intermediate_sum = _mm256_hadd_ps(acc_sum, acc_sum);
        intermediate_sum = _mm256_permutevar8x32_ps(intermediate_sum, _mm256_set_epi32(0, 1, 4, 5, 2, 3, 6, 7));
        intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);
        intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);

        acc_sum = _mm256_add_ps(intermediate_sum, _mm256_set1_ps(subsimd_sum));
        subsimd_sum = _mm_cvtss_f32(_mm256_extractf128_ps(acc_sum, 0));

        acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum);
        subsimd_sum = 1.0f / subsimd_sum;
    }

    {
        auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

        for (auto block = 0u; block < num_full_blocks; ++block) {
            // Run computation.
            softmax_finalize_block<C_max_acc>(output_buffer, acc_sum);
        }

        switch (partial_block_size) {
        case  0: break;
        case  1: softmax_finalize_block< 1>(output_buffer, acc_sum); break;
        case  2: softmax_finalize_block< 2>(output_buffer, acc_sum); break;
        case  3: softmax_finalize_block< 3>(output_buffer, acc_sum); break;
        case  4: softmax_finalize_block< 4>(output_buffer, acc_sum); break;
        case  5: softmax_finalize_block< 5>(output_buffer, acc_sum); break;
        case  6: softmax_finalize_block< 6>(output_buffer, acc_sum); break;
        case  7: softmax_finalize_block< 7>(output_buffer, acc_sum); break;
        case  8: softmax_finalize_block< 8>(output_buffer, acc_sum); break;
        case  9: softmax_finalize_block< 9>(output_buffer, acc_sum); break;
        case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break;
        case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break;
        case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break;
        case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break;
        case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break;
        default: NN_UNREACHABLE_CODE;
        }

        switch (subsimd_block_size) {
        case 0: break;
        case 1: softmax_finalize_subsimd<1>(output_buffer, subsimd_sum); break;
        case 2: softmax_finalize_subsimd<2>(output_buffer, subsimd_sum); break;
        case 3: softmax_finalize_subsimd<3>(output_buffer, subsimd_sum); break;
        case 4: softmax_finalize_subsimd<4>(output_buffer, subsimd_sum); break;
        case 5: softmax_finalize_subsimd<5>(output_buffer, subsimd_sum); break;
        case 6: softmax_finalize_subsimd<6>(output_buffer, subsimd_sum); break;
        case 7: softmax_finalize_subsimd<7>(output_buffer, subsimd_sum); break;
        default: NN_UNREACHABLE_CODE;
        }
    }

    _mm_free(input_f);
}
Triangle* OctreeLeaf::Query(const Ray& ray, float& t) const
{
    float tBox = std::numeric_limits<float>::min();
    if (!Intersects(ray, bb, tBox) || tBox > t)
        return nullptr;

    const __m256 rayDirX = _mm256_set1_ps(ray.Direction.X);
    const __m256 rayDirY = _mm256_set1_ps(ray.Direction.Y);
    const __m256 rayDirZ = _mm256_set1_ps(ray.Direction.Z);

    const __m256 rayPosX = _mm256_set1_ps(ray.Origin.X);
    const __m256 rayPosY = _mm256_set1_ps(ray.Origin.Y);
    const __m256 rayPosZ = _mm256_set1_ps(ray.Origin.Z);

    union {
        float dists[MAXSIZE];
        __m256 distances[MAXSIZE / NROFLANES];
    };

    for (int i = 0; i < count; i++) {
        // Vector3F e1 = triangle.Vertices[1].Position - triangle.Vertices[0].Position;
        const __m256 e1X = edge1X8[i];
        const __m256 e1Y = edge1Y8[i];
        const __m256 e1Z = edge1Z8[i];

        // Vector3F e2 = triangle.Vertices[2].Position - triangle.Vertices[0].Position;
        const __m256 e2X = edge2X8[i];
        const __m256 e2Y = edge2Y8[i];
        const __m256 e2Z = edge2Z8[i];

        // Vector3F p = ray.Direction.Cross(e2);
        const __m256 pX = _mm256_sub_ps(_mm256_mul_ps(rayDirY, e2Z), _mm256_mul_ps(rayDirZ, e2Y));
        const __m256 pY = _mm256_sub_ps(_mm256_mul_ps(rayDirZ, e2X), _mm256_mul_ps(rayDirX, e2Z));
        const __m256 pZ = _mm256_sub_ps(_mm256_mul_ps(rayDirX, e2Y), _mm256_mul_ps(rayDirY, e2X));

        // float det = e1.Dot(p);
        const __m256 det = _mm256_add_ps(_mm256_mul_ps(e1X, pX),
                                         _mm256_add_ps(_mm256_mul_ps(e1Y, pY), _mm256_mul_ps(e1Z, pZ)));

        // if (det > -EPSILON && det < EPSILON)
        //     return false;
        __m256 mask = _mm256_or_ps(_mm256_cmp_ps(det, _mm256_set1_ps(-EPSILON), _CMP_LE_OS),
                                   _mm256_cmp_ps(det, _mm256_set1_ps(EPSILON), _CMP_GE_OS));

        // float invDet = 1 / det;
        const __m256 invDet = _mm256_div_ps(_mm256_set1_ps(1.0f), det);

        // Vector3F r = ray.Origin - triangle.Vertices[0].Position;
        const __m256 rX = _mm256_sub_ps(rayPosX, vert0X8[i]);
        const __m256 rY = _mm256_sub_ps(rayPosY, vert0Y8[i]);
        const __m256 rZ = _mm256_sub_ps(rayPosZ, vert0Z8[i]);

        // float u = r.Dot(p) * invDet;
        const __m256 u = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(rX, pX),
                                       _mm256_add_ps(_mm256_mul_ps(rY, pY), _mm256_mul_ps(rZ, pZ))));

        // if (u < 0 || u > 1)
        //     return false;
        mask = _mm256_and_ps(mask, _mm256_cmp_ps(u, _mm256_setzero_ps(), _CMP_GE_OS));

        // Vector3F q = r.Cross(e1);
        const __m256 qX = _mm256_sub_ps(_mm256_mul_ps(rY, e1Z), _mm256_mul_ps(rZ, e1Y));
        const __m256 qY = _mm256_sub_ps(_mm256_mul_ps(rZ, e1X), _mm256_mul_ps(rX, e1Z));
        const __m256 qZ = _mm256_sub_ps(_mm256_mul_ps(rX, e1Y), _mm256_mul_ps(rY, e1X));

        // float v = ray.Direction.Dot(q) * invDet;
        const __m256 v = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(rayDirX, qX),
                                       _mm256_add_ps(_mm256_mul_ps(rayDirY, qY), _mm256_mul_ps(rayDirZ, qZ))));

        // if (v < 0 || u + v > 1)
        //     return false;
        mask = _mm256_and_ps(mask, _mm256_and_ps(_mm256_cmp_ps(v, _mm256_setzero_ps(), _CMP_GE_OS),
                                                 _mm256_cmp_ps(_mm256_add_ps(u, v), _mm256_set1_ps(1.0f), _CMP_LE_OS)));

        // float tt = e2.Dot(q) * invDet;
        const __m256 tt = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(e2X, qX),
                                        _mm256_add_ps(_mm256_mul_ps(e2Y, qY), _mm256_mul_ps(e2Z, qZ))));

        // if (tt > EPSILON)
        // {
        //     t = tt;
        //     return true;
        // }
        //
        // return false;
        distances[i] = _mm256_and_ps(tt, mask);
    }

    Triangle* triangle = nullptr;
    for (int i = 0; i < count * NROFLANES; i++)
        if (dists[i] < t && dists[i] > EPSILON) {
            t = dists[i];
            triangle = triangles[i];
        }

    return triangle;
}
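// _mm256_and_ps(tt, mask) above zeroes the lanes that failed any of the
// Möller-Trumbore tests, and the scalar pass then rejects them via
// dists[i] > EPSILON. The same select-or-zero step can be written with an
// explicit blend; an equivalent sketch:
#include <immintrin.h>

static inline __m256 select_or_zero(__m256 tt, __m256 mask)
{
    /* keep tt where mask is all-ones, 0.0f elsewhere */
    return _mm256_blendv_ps(_mm256_setzero_ps(), tt, mask);
}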