/*
 * Checks _mm256_maskstore_ps against a scalar reference.
 *
 * Builds an 8-lane integer mask from mask_v(0..7), stores the source vector
 * {1..8} into d[] through that mask, and compares d[] with e[], where
 * e[i] is s[i] when m[i] is non-zero and 0 otherwise.  Calls abort() when
 * checkVf() reports a mismatch.
 */
static void
avx_test (void)
{
  int i;
  int m[8] = {mask_v(0), mask_v(1), mask_v(2), mask_v(3),
              mask_v(4), mask_v(5), mask_v(6), mask_v(7)};
  float s[8] = {1,2,3,4,5,6,7,8};
  union256 src;
  __m256i mask;
  float e [8] = {0.0};
  float d [8] = {0.0};

  src.x = _mm256_loadu_ps (s);

  /* The mask operand of _mm256_maskstore_ps is an integer vector, so load
     the mask words as __m256i instead of reinterpreting them as floats.  */
  mask = _mm256_loadu_si256 ((const __m256i *) m);
  _mm256_maskstore_ps (d, mask, src.x);

  /* Scalar reference: lanes with a non-zero mask word receive the source
     value, all other lanes keep their initial 0.  */
  for (i = 0 ; i < 8; i++)
    e[i] = m[i] ? s[i] : 0;

  if (checkVf (d, e, 8))
    abort ();
}
/*! Masked 8-lane store: forwards to _mm256_maskstore_ps, writing the lanes
 *  of `b` selected by `mask` to the memory at `ptr` (viewed as float*).
 *  The mask is converted to __m256i with the same C-style cast as before;
 *  `b` relies on avxb's implicit conversion to the data operand type. */
INLINE void store8b(const avxb& mask, void *ptr, const avxb& b)
{
  float* const dst = (float*)ptr;
  _mm256_maskstore_ps(dst, (__m256i)mask, b);
}
double bst_compute_129_m256_maskstore_root_aligned( void*_bst_obj, double* p, double* q, size_t nn ) { segments_t* mem = (segments_t*) _bst_obj; int n, i, r, l_end, j, l_end_pre; double t, e_tmp; double* e = mem->e, *w = mem->w; int* root = mem->r; __m256d v_tmp; __m256d v00, v01, v02, v03; __m256d v10, v11, v12, v13; __m256d v20, v21, v22, v23; __m256d v30, v31, v32, v33; __m256i v_cur_roots; __m256 v_rootmask0, v_rootmask1; // initialization // mem->n = nn; n = nn; // subtractions with n potentially negative. say hello to all the bugs int idx1, idx1_root; int idx2; int idx3, idx3_root; int pad_root, pad, pad_r; idx1 = ((int) mem->e_sz) - 1; idx1_root = ((int) mem->r_sz); // the conventio is that iteration i, idx1 points to the first element of line i+1 e[idx1++] = q[n]; // pad contains the padding for row i+1 // for row n it's always 3 pad = 3; pad_root = 7; for (i = n-1; i >= 0; --i) { idx1 -= 2*(n-i)+1 + pad; idx1_root -= 2*(n-i)+1 + pad_root; idx2 = idx1 + 1; e[idx1] = q[i]; w[idx1] = q[i]; for (j = i+1; j < n+1; ++j,++idx2) { e[idx2] = INFINITY; w[idx2] = w[idx2-1] + p[j-1] + q[j]; } idx2 += pad; // padding of line i+1 // idx2 now points to the first element of the next line idx3 = idx1; idx3_root = idx1_root; pad_r = pad; for (r = i; r < n; ++r) { pad_r = (pad_r+1)&3; // padding of line r+1 idx1 = idx3; idx1_root = idx3_root; l_end = idx2 + (n-r); // l_end points to the first entry after the current row e_tmp = e[idx1++]; idx1_root++; // calculate until a multiple of 8 doubles is left // 8 = 4 * 2 128-bit vectors l_end_pre = idx2 + ((n-r)&15); for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) { t = e_tmp + e[idx2] + w[idx1]; if (t < e[idx1]) { e[idx1] = t; root[idx1_root] = r; } idx1++; idx1_root++; } v_tmp = _mm256_set_pd( e_tmp, e_tmp, e_tmp, e_tmp ); // execute the shit for 4 vectors of size 2 v_cur_roots = _mm256_set_epi32(r, r, r, r, r, r, r, r); for( ; idx2 < l_end; idx2 += 16 ) { v01 = _mm256_load_pd( &w[idx1 ] ); v11 = _mm256_load_pd( &w[idx1+ 4] 
); v21 = _mm256_load_pd( &w[idx1+ 8] ); v31 = _mm256_load_pd( &w[idx1+12] ); v00 = _mm256_load_pd( &e[idx2 ] ); v01 = _mm256_add_pd( v01, v_tmp ); v10 = _mm256_load_pd( &e[idx2+ 4] ); v11 = _mm256_add_pd( v11, v_tmp ); v20 = _mm256_load_pd( &e[idx2+ 8] ); v21 = _mm256_add_pd( v21, v_tmp ); v30 = _mm256_load_pd( &e[idx2+12] ); v31 = _mm256_add_pd( v31, v_tmp ); v01 = _mm256_add_pd( v01, v00 ); v03 = _mm256_load_pd( &e[idx1 ] ); v11 = _mm256_add_pd( v11, v10 ); v13 = _mm256_load_pd( &e[idx1+ 4] ); v21 = _mm256_add_pd( v21, v20 ); v23 = _mm256_load_pd( &e[idx1+ 8] ); v31 = _mm256_add_pd( v31, v30 ); v33 = _mm256_load_pd( &e[idx1+12] ); v02 = _mm256_cmp_pd( v01, v03, _CMP_LT_OQ ); v12 = _mm256_cmp_pd( v11, v13, _CMP_LT_OQ ); v22 = _mm256_cmp_pd( v21, v23, _CMP_LT_OQ ); v32 = _mm256_cmp_pd( v31, v33, _CMP_LT_OQ ); _mm256_maskstore_pd( &e[idx1 ], _mm256_castpd_si256( v02 ), v01 ); _mm256_maskstore_pd( &e[idx1+ 4], _mm256_castpd_si256( v12 ), v11 ); v_rootmask0 = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm256_cvtpd_ps(v02)), _mm256_cvtpd_ps(v12) , 1 ); _mm256_maskstore_pd( &e[idx1+ 8], _mm256_castpd_si256( v22 ), v21 ); _mm256_maskstore_pd( &e[idx1+12], _mm256_castpd_si256( v32 ), v31 ); v_rootmask1 = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm256_cvtpd_ps(v22)), _mm256_cvtpd_ps(v32) , 1 ); _mm256_maskstore_ps( &root[idx1_root ], _mm256_castps_si256( v_rootmask0 ), _mm256_castsi256_ps( v_cur_roots ) ); _mm256_maskstore_ps( &root[idx1_root + 8], _mm256_castps_si256( v_rootmask1 ), _mm256_castsi256_ps( v_cur_roots ) ); idx1 += 16; idx1_root += 16; } idx2 += pad_r; idx3++; idx3_root++; } pad = (pad -1)&3; pad_root = (pad_root-1)&7; } // the index of the last item of the first row is ((n/4)+1)*4-1, due to the padding // if n is even, the total number of entries in the first // row of the table is odd, so we need padding return e[ ((n/4)+1)*4 - 1 ]; }
void dump_surface(const char *filename, uint32_t binding_table_offset, int i) { struct surface s; char *linear; __m256i alpha; get_surface(binding_table_offset, i, &s); int png_format; switch (s.format) { case SF_R8G8B8X8_UNORM: case SF_R8G8B8A8_UNORM: case SF_R8G8B8X8_UNORM_SRGB: case SF_R8G8B8A8_UNORM_SRGB: png_format = PNG_FORMAT_RGBA; break; case SF_B8G8R8A8_UNORM: case SF_B8G8R8X8_UNORM: case SF_B8G8R8A8_UNORM_SRGB: case SF_B8G8R8X8_UNORM_SRGB: png_format = PNG_FORMAT_BGRA; break; default: stub("image format"); return; } switch (s.format) { case SF_R8G8B8X8_UNORM: case SF_B8G8R8X8_UNORM: case SF_R8G8B8X8_UNORM_SRGB: case SF_B8G8R8X8_UNORM_SRGB: alpha = _mm256_set1_epi32(0xff000000); break; default: alpha = _mm256_set1_epi32(0); break; } switch (s.tile_mode) { case LINEAR: linear = s.pixels; break; case XMAJOR: linear = detile_xmajor(&s, alpha); break; case YMAJOR: linear = detile_ymajor(&s, alpha); break; default: linear = s.pixels; stub("detile wmajor"); break; } FILE *f = fopen(filename, "wb"); ksim_assert(f != NULL); png_image pi = { .version = PNG_IMAGE_VERSION, .width = s.width, .height = s.height, .format = png_format }; ksim_assert(png_image_write_to_stdio(&pi, f, 0, linear, s.stride, NULL)); fclose(f); if (linear != s.pixels) free(linear); } static void depth_test(struct primitive *p, struct dispatch *d) { uint32_t cpp = depth_format_size(gt.depth.format); struct reg w_unorm; struct reg d24x8, cmp, d_f; void *base = ymajor_offset(p->depth.buffer, d->x, d->y, gt.depth.stride, cpp); if (gt.depth.test_enable) { const __m256 inv_scale = _mm256_set1_ps(1.0f / 16777215.0f); switch (gt.depth.format) { case D32_FLOAT: d_f.reg = _mm256_load_ps(base); break; case D24_UNORM_X8_UINT: d24x8.ireg = _mm256_load_si256(base); d_f.reg = _mm256_mul_ps(_mm256_cvtepi32_ps(d24x8.ireg), inv_scale); break; case D16_UNORM: stub("D16_UNORM"); default: ksim_unreachable("invalid depth format"); } /* Swizzle two middle pixel pairs so that dword 0-3 and 4-7 * match the shader 
dispatch subspan orderingg. */ d_f.ireg = _mm256_permute4x64_epi64(d_f.ireg, SWIZZLE(0, 2, 1, 3)); switch (gt.depth.test_function) { case COMPAREFUNCTION_ALWAYS: cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_TRUE_US); break; case COMPAREFUNCTION_NEVER: cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_FALSE_OS); break; case COMPAREFUNCTION_LESS: cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_LT_OS); break; case COMPAREFUNCTION_EQUAL: cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_EQ_OS); break; case COMPAREFUNCTION_LEQUAL: cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_LE_OS); break; case COMPAREFUNCTION_GREATER: cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_GT_OS); break; case COMPAREFUNCTION_NOTEQUAL: cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_NEQ_OS); break; case COMPAREFUNCTION_GEQUAL: cmp.reg = _mm256_cmp_ps(d_f.reg, d->w.reg, _CMP_GE_OS); break; } d->mask.ireg = _mm256_and_si256(cmp.ireg, d->mask.ireg); } if (gt.depth.write_enable) { const __m256 scale = _mm256_set1_ps(16777215.0f); const __m256 half = _mm256_set1_ps(0.5f); struct reg w; w.ireg = _mm256_permute4x64_epi64(d->w.ireg, SWIZZLE(0, 2, 1, 3)); __m256i m = _mm256_permute4x64_epi64(d->mask.ireg, SWIZZLE(0, 2, 1, 3)); switch (gt.depth.format) { case D32_FLOAT: _mm256_maskstore_ps(base, m, w.reg); break; case D24_UNORM_X8_UINT: w_unorm.ireg = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(w.reg, scale), half)); _mm256_maskstore_epi32(base, m, w_unorm.ireg); break; case D16_UNORM: stub("D16_UNORM"); default: ksim_unreachable("invalid depth format"); } } }
// AVX2 decoding loop specialised (per the function name) for check-node
// degrees 6 and 7 and variable-node degrees 2, 3 and 6.
//
// Runs at most maxIteration iterations.  Each iteration:
//   1. VN kernel: for 8 variable nodes at a time, sums zReplica + Lambda over
//      the node's edges (indexed through t_row), combines the sum with the
//      LLR and the per-degree penalty constants, clamps to [0, 1] and stores
//      the result in OutputFromDecoder.
//   2. CN kernel: for every check node, gathers its inputs through t_col1,
//      counts the inputs above 0.5 (odd count => check not satisfied),
//      and, unless the vector to project moved by at most 1e-5 in every
//      lane since the previous iteration, calls mp.projection_deg6/7 and
//      updates Lambda and zReplica.
// The loop stops early, setting mAlgorithmConverge and mValidCodeword, as soon
// as no check reports an odd count.  Nothing is returned; results are left in
// the member arrays.  A check degree other than 6 or 7 calls exit(0).
//
// NOTE(review): the VN kernel reads the degree of node j only and then
// processes nodes j..j+7 with it -- assumes nodes come in same-degree groups
// of 8 and that _mBlocklength is a multiple of 8; confirm.
// NOTE(review): the CN kernel uses unmasked 8-lane loads on t_col1, zReplica,
// Lambda and latestProjVector although only 6 or 7 lanes belong to the check;
// assumes these arrays are padded at the end -- confirm.
void Decoder::ADMMDecoder_deg_6_7_2_3_6()
{
    int maxIter = maxIteration;
    float mu = 5.5f;
    float tableau[12] = { 0.0f };

    // Penalty parameters: mu plus one tableau entry per variable-node degree
    // (2, 3, 6), selected by code size; generic defaults otherwise.
    if ((mBlocklength == 576) && (mNChecks == 288)) {
        mu = 3.37309f; // penalty
        tableau[2] = 0.00001f;
        tableau[3] = 2.00928f;
        tableau[6] = 4.69438f;
    } else if((mBlocklength == 2304) && (mNChecks == 1152) ) {
        mu = 3.81398683f; // penalty
        tableau[2] = 0.29669288f;
        tableau[3] = 0.46964023f;
        tableau[6] = 3.19548154f;
    } else {
        mu = 5.5; // penalty
        tableau[2] = 0.8f;
        tableau[3] = 0.8f;
        tableau[6] = 0.8f;
    }

    const float rho = 1.9f; // over-relaxation parameter
    const float un_m_rho = 1.0 - rho; // "un moins rho": 1 - rho
    const auto _rho = _mm256_set1_ps( rho );
    const auto _un_m_rho = _mm256_set1_ps( un_m_rho );
    float tableaX[12];

    //
    // Precompute the constants
    // (tableaX is only read by the scalar code in the disabled #else
    // branches below)
    //
    #pragma unroll
    for (int i = 0; i < 7; i++) {
        tableaX[i] = tableau[ i ] / mu;
    }

    const auto t_mu = _mm256_set1_ps ( mu );
    const auto t2_amu = _mm256_set1_ps ( tableau[ 2 ] / mu );
    const auto t3_amu = _mm256_set1_ps ( tableau[ 3 ] / mu );
    const auto t6_amu = _mm256_set1_ps ( tableau[ 6 ] / mu );
    const auto t2_2amu = _mm256_set1_ps ( 2.0f * tableau[ 2 ] / mu );
    const auto t3_2amu = _mm256_set1_ps ( 2.0f * tableau[ 3 ] / mu );
    const auto t6_2amu = _mm256_set1_ps ( 2.0f * tableau[ 6 ] / mu );
    const auto t2_deg = _mm256_set1_ps ( 2.0f );
    const auto t3_deg = _mm256_set1_ps ( 3.0f );
    const auto t6_deg = _mm256_set1_ps ( 6.0f );
    const auto zero = _mm256_set1_ps ( 0.0f );
    const auto un = _mm256_set1_ps ( 1.0f ); // "un" = one
    const __m256 a = _mm256_set1_ps ( 0.0f );
    const __m256 b = _mm256_set1_ps ( 0.5f );

    //////////////////////////////////////////////////////////////////////////////////////
    // Initial state: Lambda = 0, zReplica = latestProjVector = 0.5.
    // Aligned stores: the three arrays must be 32-byte aligned.
    #pragma unroll
    for( int j = 0; j < _mPCheckMapSize; j+=8 ) {
        _mm256_store_ps(&Lambda [j], a);
        _mm256_store_ps(&zReplica[j], b);
        _mm256_store_ps(&latestProjVector[j], b);
    }
    //////////////////////////////////////////////////////////////////////////////////////

    for(int i = 0; i < maxIter; i++) {
        int ptr = 0; // running index into t_row across all variable nodes
        mIteration = i + 1;

        //
        // MEASURE OF THE VN EXECUTION TIME
        //
#ifdef PROFILE_ON
        const auto start = timer();
#endif

        //
        // VN processing kernel
        //
        #pragma unroll
        for (int j = 0; j < _mBlocklength; j++) {
            const int degVn = VariableDegree[j];
            float M[8] __attribute__((aligned(64)));

            if( degVn == 2 ){
#if 1
                // Vector path: accumulate the edge sums of 8 consecutive
                // nodes into M[], then do the update for all 8 at once.
                const int dVN = 2;
                for(int qq = 0; qq < 8; qq++) {
                    M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                    ptr += 1;
                    #pragma unroll
                    for(int k = 1; k < dVN; k++) {
                        M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                        ptr += 1;
                    }
                }
                const auto m = _mm256_loadu_ps( M );
                const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
                const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
                const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t2_amu), _mm256_sub_ps(t2_deg, t2_2amu));
                // clamp to [0, 1]
                const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
                _mm256_storeu_ps(&OutputFromDecoder[j], vMin);
                j += 7; // 8 nodes handled; the loop's j++ adds the 8th step
#else
                // Scalar reference path (disabled).
                const int degVN = 2;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                #pragma unroll
                for(int k = 1; k < degVN; k++)
                    temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
                ptr += degVN;
                const float _amu_ = tableaX[ degVN ];
                const float _2_amu_ = _amu_+ _amu_;
                const float llr = _LogLikelihoodRatio[j];
                const float t = temp - llr / mu;
                const float xx = (t - _amu_)/(degVn - _2_amu_);
                const float vMax = std::min(xx, 1.0f);
                const float vMin = std::max(vMax, 0.0f);
                OutputFromDecoder[j] = vMin;
#endif
            }else if( degVn == 3 ){
#if 1
                // Same as the degree-2 vector path, with the degree-3 constants.
                const int dVN = 3;
                for(int qq = 0; qq < 8; qq++) {
                    M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                    ptr += 1;
                    #pragma unroll
                    for(int k = 1; k < dVN; k++) {
                        M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                        ptr += 1;
                    }
                }
                const auto m = _mm256_loadu_ps( M );
                const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
                const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
                const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t3_amu), _mm256_sub_ps(t3_deg, t3_2amu));
                const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
                _mm256_storeu_ps(&OutputFromDecoder[j], vMin);
                j += 7;
#else
                // Scalar reference path (disabled).
                const int degVN = 3;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                #pragma unroll
                for(int k = 1; k < degVN; k++)
                    temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
                ptr += degVN;
                const float _amu_ = tableaX[ degVN ];
                const float _2_amu_ = _amu_+ _amu_;
                const float llr = _LogLikelihoodRatio[j];
                const float t = temp - llr / mu;
                const float xx = (t - _amu_)/(degVn - _2_amu_);
                const float vMax = std::min(xx, 1.0f);
                const float vMin = std::max(vMax, 0.0f);
                OutputFromDecoder[j] = vMin;
#endif
            }else if( degVn == 6 ){
#if 1
                // Same as the degree-2 vector path, with the degree-6 constants.
                const int dVN = 6;
                for(int qq = 0; qq < 8; qq++) {
                    M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                    ptr += 1;
                    #pragma unroll
                    for(int k = 1; k < dVN; k++) {
                        M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                        ptr += 1;
                    }
                }
                const auto m = _mm256_loadu_ps( M );
                const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
                const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
                const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t6_amu), _mm256_sub_ps(t6_deg, t6_2amu));
                const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
                _mm256_storeu_ps(&OutputFromDecoder[j], vMin);
                j += 7;
#else
                // Scalar reference path (disabled).
                const int degVN = 6;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                #pragma unroll
                for(int k = 1; k < degVN; k++)
                    temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
                ptr += degVN;
                const float _amu_ = tableaX[ degVN ];
                const float _2_amu_ = _amu_+ _amu_;
                const float llr = _LogLikelihoodRatio[j];
                const float t = temp - llr / mu;
                const float xx = (t - _amu_)/(degVn - _2_amu_);
                const float vMax = std::min(xx, 1.0f);
                const float vMin = std::max(vMax, 0.0f);
                OutputFromDecoder[j] = vMin;
#endif
            }
        }

        //
        // MEASURE OF THE VN EXECUTION TIME
        //
#ifdef PROFILE_ON
        t_vn += (timer() - start);
#endif

        //
        // CN processing kernel
        //
        int CumSumCheckDegree = 0; // cumulative position of current edge in factor graph
        int allVerified = 0; // number of checks with an odd count this iteration
        float vector_before_proj[8] __attribute__((aligned(64)));
        const auto zero = _mm256_set1_ps ( 0.0f ); // shadows the outer `zero` (same value)
        // Lane masks enabling the low 6 / low 7 lanes
        // (_mm256_set_epi32 lists the highest lane first).
        const auto mask_6 = _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto mask_7 = _mm256_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto dot5 = _mm256_set1_ps( 0.5f );

        //
        // MEASURE OF THE CN EXECUTION TIME
        //
#ifdef PROFILE_ON
        const auto starT = timer();
#endif

        // "seuil" = threshold: minimum per-lane change that triggers a new projection
        const auto seuilProj = _mm256_set1_ps( 1e-5f );

        for(int j = 0; j < _mNChecks; j++) {
            if( CheckDegree[j] == 6 ){
                const int cDeg6 = 0x3F; // movemask bits of the 6 valid lanes
                const auto offsets = _mm256_loadu_si256 ((const __m256i*)&t_col1 [CumSumCheckDegree]);
                // Masked gather: disabled lanes take their value from `zero`.
                const auto xpred = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_6), 4);
                const auto synd = _mm256_cmp_ps( xpred, dot5, _CMP_GT_OS );
                int test = (_mm256_movemask_ps( synd ) & cDeg6); // deg 6
                const auto syndrom = _mm_popcnt_u32( test ); // number of inputs > 0.5
                const auto _Replica = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
                const auto _ambda = _mm256_loadu_ps( &Lambda [CumSumCheckDegree]);
                // vect_proj = rho * xpred + (1 - rho) * zReplica - Lambda
                const auto v1 = _mm256_mul_ps (xpred, _rho );
                const auto v2 = _mm256_mul_ps ( _Replica, _un_m_rho );
                const auto v3 = _mm256_add_ps ( v1, v2 );
                const auto vect_proj = _mm256_sub_ps ( v3, _ambda );

                //
                // Perform the projection
                //
                allVerified += ( syndrom & 0x01 ); // odd count => check not satisfied

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
#ifdef PROFILE_ON
                const auto START = timer();
#endif
                // Skip the projection when no valid lane of vect_proj moved
                // by more than seuilProj since the previously stored vector.
                const auto latest = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
                const auto different = _mm256_sub_ps ( vect_proj, latest );
                const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); // clears the sign bit
                const auto absolute = _mm256_and_ps ( different, maskAbsol );
                const auto despass = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
                int skip = (_mm256_movemask_ps( despass ) & cDeg6) == 0x00; // degree 6

                if( skip == false ) {
                    const auto _ztemp = mp.projection_deg6( vect_proj );
                    // Lambda += rho * (z - xpred) + (1 - rho) * (z - zReplica)
                    const auto _ztemp1 = _mm256_sub_ps(_ztemp, xpred );
                    const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
                    const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
                    const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
                    const auto nLambda = _mm256_add_ps( _ambda, _ztemp3 );
                    const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
                    // Masked stores: only this check's 6 entries are written.
                    _mm256_maskstore_ps(& Lambda[CumSumCheckDegree], mask_6, mLambda);
                    _mm256_maskstore_ps(&zReplica[CumSumCheckDegree], mask_6, _ztemp);
                }
                _mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_6, vect_proj);

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
#ifdef PROFILE_ON
                t_pj += (timer() - START);
#endif
                CumSumCheckDegree += 6;

            }else if( CheckDegree[j] == 7 ) {
                // Same processing as the degree-6 branch, on 7 lanes.
                const int cDeg7 = 0x7F; // movemask bits of the 7 valid lanes
                const auto offsets = _mm256_loadu_si256 ((const __m256i*)&t_col1 [CumSumCheckDegree]);
                const auto xpred = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_7), 4);
                const auto synd = _mm256_cmp_ps( xpred, dot5, _CMP_GT_OS );
                const int test = (_mm256_movemask_ps( synd ) & cDeg7); // deg 7
                const auto syndrom = _mm_popcnt_u32( test );
                const auto _Replica = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
                const auto _ambda = _mm256_loadu_ps( &Lambda [CumSumCheckDegree]);
                const auto v1 = _mm256_mul_ps ( xpred, _rho );
                const auto v2 = _mm256_mul_ps ( _Replica, _un_m_rho );
                const auto v3 = _mm256_add_ps ( v1, v2 );
                const auto vect_proj = _mm256_sub_ps ( v3, _ambda );

                //
                // Perform the projection
                //
                allVerified += ( syndrom & 0x01 );

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
#ifdef PROFILE_ON
                const auto START = timer();
#endif
                const auto latest = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
                const auto different = _mm256_sub_ps ( vect_proj, latest );
                const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
                const auto absolute = _mm256_and_ps ( different, maskAbsol );
                const auto despass = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
                int skip = (_mm256_movemask_ps( despass ) & cDeg7) == 0x00; // degree 7

                if( skip == false ) {
                    const auto _ztemp = mp.projection_deg7( vect_proj );
                    const auto _ztemp1 = _mm256_sub_ps(_ztemp, xpred );
                    const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
                    const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
                    const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
                    const auto nLambda = _mm256_add_ps( _ambda, _ztemp3 );
                    const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
                    _mm256_maskstore_ps(& Lambda [CumSumCheckDegree], mask_7, mLambda);
                    _mm256_maskstore_ps(&zReplica [CumSumCheckDegree], mask_7, _ztemp);
                }
                _mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_7, vect_proj);

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
#ifdef PROFILE_ON
                t_pj += (timer() - START);
#endif
                CumSumCheckDegree += 7;

            }else{
                // Unsupported check degree.
                // NOTE(review): terminates the process with status 0 (success) -- confirm intended.
                exit( 0 );
            }
        }

        //
        // MEASURE OF THE CN LOOP EXECUTION TIME
        //
#ifdef PROFILE_ON
        t_cn += (timer() - starT);
#endif
#ifdef PROFILE_ON
        t_ex += 1;
        //FILE *ft=fopen("time.txt","a");
        //fprintf(ft,"%d \n", t_cn/t_ex);
        //fprintf(ft,"%d %d %d \n", t_cn, t_vn, t_pj);
        //fclose(ft);
#endif

        // Every check saw an even count: stop and flag success.
        if(allVerified == 0) {
            mAlgorithmConverge = true;
            mValidCodeword = true;
            break;
        }
    }

    //
    // MEASURE OF THE NUMBER OF EXECUTION
    //
    // #ifdef PROFILE_ON
    // t_ex += 1;
    // #endif
}
/*! Adapter overload taking the mask as an integer vector: the mask bits are
 *  reinterpreted as __m256 and the call is forwarded to the
 *  _mm256_maskstore_ps overload that takes a floating-point mask.
 *  NOTE(review): presumably only meant for toolchains whose intrinsic
 *  declares the mask operand as __m256 -- confirm the surrounding guard. */
INLINE void _mm256_maskstore_ps (float *ptr, __m256i mask, __m256 data)
{
  const __m256 mask_as_ps = _mm256_castsi256_ps(mask);
  _mm256_maskstore_ps(ptr, mask_as_ps, data);
}