// Compare rank with all values currently in the queue. Returns -1 if the value already exists // or is larger than all values. // Otherwise, returns the index of the register in which the value should be inserted. // Mask is replicated to both lanes, so it can be used for both value and rank lane. int PriorityQueue_AVX2::compare(__m256i mrank, int &field, __m256i >mask) { static const __m256i eq4mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); __m256i eq, eq4; int reg, mask; // Because items are sorted in ascending order within each (double) register, the mask after GT // comparison must be of the form 000...1111, which is one less than a power of two. { __m256i r0_7 = _mm256_permute2x128_si256(_rv[1], _rv[0], 0x20); // [0 .. 7] gtmask = _mm256_cmpgt_epi32(r0_7, mrank); mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask)); eq = _mm256_cmpeq_epi32(r0_7, mrank); _ASSERTE(((mask + 1) & mask) == 0); reg = 1; } if (!mask) { __m256i r8_15 = _mm256_permute2x128_si256(_rv[3], _rv[2], 0x20); // [8 .. 15] gtmask = _mm256_cmpgt_epi32(r8_15, mrank); mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask)); eq = _mm256_or_si256(eq, _mm256_cmpeq_epi32(r8_15, mrank)); _ASSERTE(((mask + 1) & mask) == 0); reg = 3; } if (!mask) { gtmask = _mm256_cmpgt_epi32(_rv[4], mrank); // [16 .. 19]; don't care about value eq4 = _mm256_and_si256(eq4mask, _mm256_cmpeq_epi32(mrank, _rv[4])); // .. ditto mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask)) & 0xF; // ignore comparison with values eq = _mm256_or_si256(eq, eq4); _ASSERTE(((mask + 1) & mask) == 0); reg = 4; } if (_mm256_movemask_ps(_mm256_castsi256_ps(eq)) != 0) mask = 0; if (!mask) return -1; // Adjust register according to mask (higher 128-bits i double register: one register lower) // There is no "previous" register to test against for equality if we need to insert in the // very first register. Also duplicate the same mask to both lanes. 
if (mask > 0xF) { mask >>= 4; --reg; gtmask = _mm256_permute2x128_si256(gtmask, gtmask, 0x11); // replicate high lane to both }
// Packs the sign bit of each 32-bit lane of `a` into the low bits of an
// unsigned integer (lane 0 -> bit 0), as produced by _mm256_movemask_ps.
INLINE uint movemask(const avxb& a)
{
  const uint laneBits = _mm256_movemask_ps(a);
  return laneBits;
}
// Returns true when the movemask of `a` equals 0xff, i.e. every one of the
// eight lanes has its sign bit set.
INLINE bool all(const avxb& a)
{
  const uint everyLane = (uint)0xff;
  return _mm256_movemask_ps(a) == everyLane;
}
// Logical AND across the eight lanes of `a`: true only if the movemask of `a`
// is exactly 0xff (no lane has a clear sign bit).
INLINE bool reduce_and(const avxb& a)
{
  const bool someLaneClear = _mm256_movemask_ps(a) != (uint)0xff;
  return !someLaneClear;
}
// reduction operations INLINE size_t popcnt(const avxb& a) {return __popcnt(_mm256_movemask_ps(a));}
CPLErr GDALGridInverseDistanceToAPower2NoSmoothingNoSearchAVX( const void *poOptions, GUInt32 nPoints, CPL_UNUSED const double *unused_padfX, CPL_UNUSED const double *unused_padfY, CPL_UNUSED const double *unused_padfZ, double dfXPoint, double dfYPoint, double *pdfValue, void* hExtraParamsIn ) { size_t i = 0; GDALGridExtraParameters* psExtraParams = (GDALGridExtraParameters*) hExtraParamsIn; const float* pafX = psExtraParams->pafX; const float* pafY = psExtraParams->pafY; const float* pafZ = psExtraParams->pafZ; const float fEpsilon = 0.0000000000001f; const float fXPoint = (float)dfXPoint; const float fYPoint = (float)dfYPoint; const __m256 ymm_small = GDAL_mm256_load1_ps(fEpsilon); const __m256 ymm_x = GDAL_mm256_load1_ps(fXPoint); const __m256 ymm_y = GDAL_mm256_load1_ps(fYPoint); __m256 ymm_nominator = _mm256_setzero_ps(); __m256 ymm_denominator = _mm256_setzero_ps(); int mask = 0; #undef LOOP_SIZE #if defined(__x86_64) || defined(_M_X64) /* This would also work in 32bit mode, but there are only 8 XMM registers */ /* whereas we have 16 for 64bit */ #define LOOP_SIZE 16 size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; for ( i = 0; i < nPointsRound; i += LOOP_SIZE ) { __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps(pafX + i), ymm_x); /* rx = pafX[i] - fXPoint */ __m256 ymm_rx_8 = _mm256_sub_ps(_mm256_load_ps(pafX + i + 8), ymm_x); __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps(pafY + i), ymm_y); /* ry = pafY[i] - fYPoint */ __m256 ymm_ry_8 = _mm256_sub_ps(_mm256_load_ps(pafY + i + 8), ymm_y); __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */ _mm256_mul_ps(ymm_ry, ymm_ry)); __m256 ymm_r2_8 = _mm256_add_ps(_mm256_mul_ps(ymm_rx_8, ymm_rx_8), _mm256_mul_ps(ymm_ry_8, ymm_ry_8)); __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */ __m256 ymm_invr2_8 = _mm256_rcp_ps(ymm_r2_8); ymm_nominator = _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */ _mm256_mul_ps(ymm_invr2, _mm256_load_ps(pafZ + i))); 
ymm_nominator = _mm256_add_ps(ymm_nominator, _mm256_mul_ps(ymm_invr2_8, _mm256_load_ps(pafZ + i + 8))); ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2); /* denominator += invr2 */ ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2_8); mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS)) | /* if( r2 < fEpsilon) */ (_mm256_movemask_ps(_mm256_cmp_ps(ymm_r2_8, ymm_small, _CMP_LT_OS)) << 8); if( mask ) break; } #else #define LOOP_SIZE 8 size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE; for ( i = 0; i < nPointsRound; i += LOOP_SIZE ) { __m256 ymm_rx = _mm256_sub_ps(_mm256_load_ps((float*)pafX + i), ymm_x); /* rx = pafX[i] - fXPoint */ __m256 ymm_ry = _mm256_sub_ps(_mm256_load_ps((float*)pafY + i), ymm_y); /* ry = pafY[i] - fYPoint */ __m256 ymm_r2 = _mm256_add_ps(_mm256_mul_ps(ymm_rx, ymm_rx), /* r2 = rx * rx + ry * ry */ _mm256_mul_ps(ymm_ry, ymm_ry)); __m256 ymm_invr2 = _mm256_rcp_ps(ymm_r2); /* invr2 = 1.0f / r2 */ ymm_nominator = _mm256_add_ps(ymm_nominator, /* nominator += invr2 * pafZ[i] */ _mm256_mul_ps(ymm_invr2, _mm256_load_ps((float*)pafZ + i))); ymm_denominator = _mm256_add_ps(ymm_denominator, ymm_invr2); /* denominator += invr2 */ mask = _mm256_movemask_ps(_mm256_cmp_ps(ymm_r2, ymm_small, _CMP_LT_OS)); /* if( r2 < fEpsilon) */ if( mask ) break; } #endif /* Find which i triggered r2 < fEpsilon */ if( mask ) { for(int j = 0; j < LOOP_SIZE; j++ ) { if( mask & (1 << j) ) { (*pdfValue) = (pafZ)[i + j]; // GCC and MSVC need explicit zeroing #if !defined(__clang__) _mm256_zeroupper(); #endif return CE_None; } } } #undef LOOP_SIZE /* Get back nominator and denominator values for YMM registers */ float afNominator[8], afDenominator[8]; _mm256_storeu_ps(afNominator, ymm_nominator); _mm256_storeu_ps(afDenominator, ymm_denominator); // MSVC doesn't emit AVX afterwards but may use SSE, so clear upper bits // Other compilers will continue using AVX for the below floating points operations #if defined(_MSC_FULL_VER) 
_mm256_zeroupper(); #endif float fNominator = afNominator[0] + afNominator[1] + afNominator[2] + afNominator[3] + afNominator[4] + afNominator[5] + afNominator[6] + afNominator[7]; float fDenominator = afDenominator[0] + afDenominator[1] + afDenominator[2] + afDenominator[3] + afDenominator[4] + afDenominator[5] + afDenominator[6] + afDenominator[7]; /* Do the few remaining loop iterations */ for ( ; i < nPoints; i++ ) { const float fRX = pafX[i] - fXPoint; const float fRY = pafY[i] - fYPoint; const float fR2 = fRX * fRX + fRY * fRY; // If the test point is close to the grid node, use the point // value directly as a node value to avoid singularity. if ( fR2 < 0.0000000000001 ) { break; } else { const float fInvR2 = 1.0f / fR2; fNominator += fInvR2 * pafZ[i]; fDenominator += fInvR2; } } if( i != nPoints ) { (*pdfValue) = pafZ[i]; } else if ( fDenominator == 0.0 ) { (*pdfValue) = ((GDALGridInverseDistanceToAPowerOptions*)poOptions)->dfNoDataValue; } else (*pdfValue) = fNominator / fDenominator; // GCC needs explicit zeroing #if defined(__GNUC__) && !defined(__clang__) _mm256_zeroupper(); #endif return CE_None; }
template<class Extension,class Info> struct call<nbtrue_,tag::simd_(tag::arithmetic_,Extension),Info> { typedef int32_t result_type; NT2_FUNCTOR_CALL_DISPATCH( 1, typename nt2::meta::scalar_of<A0>::type, (3, (float,double,arithmetic_)) ) NT2_FUNCTOR_CALL_EVAL_IF(1, float) { typedef typename meta::as_real<A0>::type type; int32_t r = _mm256_movemask_ps(isnez(a0)); return (r&1)+((r>>1)&1)+((r>>2)&1)+(r>>3&1)+((r>>4)&1)+((r>>5)&1)+(r>>6&1)+(r>>7); // return __builtin_popcount(_mm_movemask_ps(isnez(cast<type>(a0)))); } NT2_FUNCTOR_CALL_EVAL_IF(1, double) { int32_t r = _mm256_movemask_pd(isnez(a0)); return (r&1)+(r>>1&1)+((r>>2)&1)+(r>>3); } NT2_FUNCTOR_CALL_EVAL_IF(1, arithmetic_) { typedef typename meta::scalar_of<A0>::type sctype; typedef typename simd::native<sctype, tag::sse_ > svtype; svtype a00 = { _mm256_extractf128_si256(a0, 0)}; svtype a01 = { _mm256_extractf128_si256(a0, 1)}; return nbtrue(a00)+nbtrue(a01);
// Renders an escape-time plot of the iteration z <- z^2 + c as coloured ASCII
// characters through curses, computing eight horizontally adjacent cells per
// AVX iteration, then prints the elapsed time in milliseconds.
//
//   w, h      size of the plot in character cells; w must be a multiple of 8
//   x1, y1    coordinates of column 0 / row 0
//   x2, y2    not read by this function
//   dx, dy    coordinate increment per column / per row
//   max_iter  iteration cap; cells whose count reaches the cap are left blank
void plot(u32 w, u32 h, float x1, float y1, float x2, float y2, float dx, float dy, u32 max_iter = 4096)
{
    assert(w % 8 == 0);

    // Broadcast constants
    __m256 const vx1 = _mm256_set1_ps(x1);
    __m256 const vy1 = _mm256_set1_ps(y1);
    __m256 const vdx = _mm256_set1_ps(dx);
    __m256 const vdy = _mm256_set1_ps(dy);
    __m256 const v1 = _mm256_set1_ps(1.0f);
    __m256 const v4 = _mm256_set1_ps(4.0f);

    // Start timing
    auto const t1 = std::chrono::high_resolution_clock::now();

    // Row index, replicated in every lane
    __m256 vj = _mm256_setzero_ps();

    for (u32 j = 0; j < h; j++)
    {
        for (u32 i = 0; i < w; i += 8)
        {
            // Column indices i .. i+7. _mm256_load_ps requires a 32-byte
            // aligned address, which a plain float[8] on the stack does not
            // guarantee, hence the explicit alignment.
            alignas(32) float const vi_[8] { i+0.f, i+1.f, i+2.f, i+3.f, i+4.f, i+5.f, i+6.f, i+7.f };
            __m256 const vi = _mm256_load_ps(vi_);

            // Compute start point
            __m256 vx0 = _mm256_mul_ps(vi, vdx);
            vx0 = _mm256_add_ps(vx0, vx1);
            __m256 vy0 = _mm256_mul_ps(vj, vdy);
            vy0 = _mm256_add_ps(vy0, vy1);

            __m256 vx = vx0;
            __m256 vy = vy0;

            // Per-lane iteration counter, kept as floats
            __m256 vcount = _mm256_setzero_ps();

            u32 iter = 0;
            int still_bounded = 0; // one bit per lane with x*x + y*y < 4
            do
            {
                // Compute products
                __m256 const vxx = _mm256_mul_ps(vx, vx);
                __m256 const vyy = _mm256_mul_ps(vy, vy);

                // Check termination condition
                __m256 vtmp = _mm256_add_ps(vxx, vyy);
                vtmp = _mm256_cmp_ps(vtmp, v4, _CMP_LT_OQ);
                still_bounded = _mm256_movemask_ps(vtmp) & 0xff;

                // Accumulate iteration counter: +1.0 in lanes still bounded
                vtmp = _mm256_and_ps(vtmp, v1);
                vcount = _mm256_add_ps(vcount, vtmp);

                // Step: y = 2*x*y + y0 ; x = x*x - y*y + x0
                vtmp = _mm256_mul_ps(vx, vy);
                vtmp = _mm256_add_ps(vtmp, vtmp);
                vy = _mm256_add_ps(vtmp, vy0);
                vtmp = _mm256_sub_ps(vxx, vyy);
                vx = _mm256_add_ps(vtmp, vx0);

                ++iter;
            } while (still_bounded && (iter < max_iter));

            // Spill the counters to memory instead of type-punning the vector.
            alignas(32) float counts[8];
            _mm256_store_ps(counts, vcount);

            for (u32 k = 0; k < 8; k++)
            {
                u32 n = counts[k] + 0.5f; // round to nearest count
                if (n == max_iter)
                    n = 0;

                char c = ' ';
                if (n > 0)
                {
                    static char const charset[] = ".,c8M@jawrpogOQEPGJ";
                    c = charset[n % (sizeof(charset) - 1)];
                }

                attron(COLOR_PAIR((n % 7) + 1));
                addch(c);
                attroff(COLOR_PAIR((n % 7) + 1));

                // End of row
                if (i + k + 1 == w)
                    addch('\n');
            }
        }

        // Increment line counter
        vj = _mm256_add_ps(vj, v1);
    }

    // End timing
    auto const t2 = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> const dt = t2 - t1;

    std::string const info = std::to_string(dt.count() * 1000.0) + "ms";
    attron(COLOR_PAIR(1));
    // Never pass runtime text as the format string itself.
    printw("%s", info.c_str());
    attroff(COLOR_PAIR(1));
}
// Iterative decoder kernel specialised (per its name and the branches below)
// for variable-node degrees 2, 3 and 6 and check-node degrees 6 and 7, using
// 8-lane AVX2 float arithmetic.
//
// Each outer iteration:
//   1. VN kernel: for every group of 8 variable nodes, sums zReplica + Lambda
//      over the node's edges (indexed through t_row), combines the sum with
//      the LLR and the degree-dependent constants, clamps the result to
//      [0, 1] and writes it to OutputFromDecoder.
//   2. CN kernel: for every check node, gathers its inputs from
//      OutputFromDecoder (indexed through t_col1), accumulates the parity of
//      the hard decisions (> 0.5) into allVerified, and, unless the
//      vector to project is unchanged (within 1e-5) since the previous
//      iteration, projects it with mp.projection_deg6 / mp.projection_deg7
//      and updates Lambda and zReplica.
// The loop stops early, setting mAlgorithmConverge and mValidCodeword, when
// every check has even parity; otherwise it runs maxIteration times.
// mIteration holds the 1-based index of the last iteration started.
void Decoder::ADMMDecoder_deg_6_7_2_3_6()
{
    int maxIter = maxIteration;
    float mu = 5.5f;
    // Per-degree coefficients; only indices 2, 3 and 6 are ever set (non-zero).
    float tableau[12] = { 0.0f };

    // Parameter sets selected by code size; the last branch is the default.
    if ((mBlocklength == 576) && (mNChecks == 288))
    {
        mu = 3.37309f;//penalty
        tableau[2] = 0.00001f;
        tableau[3] = 2.00928f;
        tableau[6] = 4.69438f;
    }
    else if((mBlocklength == 2304) && (mNChecks == 1152) )
    {
        mu = 3.81398683f;//penalty
        tableau[2] = 0.29669288f;
        tableau[3] = 0.46964023f;
        tableau[6] = 3.19548154f;
    }
    else
    {
        mu = 5.5;//penalty
        tableau[2] = 0.8f;
        tableau[3] = 0.8f;
        tableau[6] = 0.8f;
    }

    const float rho = 1.9f; //over relaxation parameter;
    const float un_m_rho = 1.0 - rho; // "one minus rho"
    const auto _rho = _mm256_set1_ps( rho );
    const auto _un_m_rho = _mm256_set1_ps( un_m_rho );
    // tableau[] / mu; only read by the scalar "#else" fallback paths below.
    float tableaX[12];

    //
    // PRECOMPUTE THE CONSTANTS
    //
    #pragma unroll
    for (int i = 0; i < 7; i++)
    {
        tableaX[i] = tableau[ i ] / mu;
    }

    // Broadcast constants for degrees 2, 3 and 6:
    //   tN_amu  = tableau[N] / mu
    //   tN_2amu = 2 * tableau[N] / mu
    //   tN_deg  = N
    const auto t_mu = _mm256_set1_ps ( mu );
    const auto t2_amu = _mm256_set1_ps ( tableau[ 2 ] / mu );
    const auto t3_amu = _mm256_set1_ps ( tableau[ 3 ] / mu );
    const auto t6_amu = _mm256_set1_ps ( tableau[ 6 ] / mu );
    const auto t2_2amu = _mm256_set1_ps ( 2.0f * tableau[ 2 ] / mu );
    const auto t3_2amu = _mm256_set1_ps ( 2.0f * tableau[ 3 ] / mu );
    const auto t6_2amu = _mm256_set1_ps ( 2.0f * tableau[ 6 ] / mu );
    const auto t2_deg = _mm256_set1_ps ( 2.0f );
    const auto t3_deg = _mm256_set1_ps ( 3.0f );
    const auto t6_deg = _mm256_set1_ps ( 6.0f );
    const auto zero = _mm256_set1_ps ( 0.0f );
    const auto un = _mm256_set1_ps ( 1.0f ); // "one"
    const __m256 a = _mm256_set1_ps ( 0.0f );
    const __m256 b = _mm256_set1_ps ( 0.5f );

    //////////////////////////////////////////////////////////////////////////////////////
    // Initial state: Lambda = 0, zReplica = latestProjVector = 0.5.
    // Aligned 8-float stores: the three arrays must be 32-byte aligned and hold
    // at least _mPCheckMapSize rounded up to a multiple of 8 floats.
    #pragma unroll
    for( int j = 0; j < _mPCheckMapSize; j+=8 )
    {
        _mm256_store_ps(&Lambda [j], a);
        _mm256_store_ps(&zReplica[j], b);
        _mm256_store_ps(&latestProjVector[j], b);
    }
    //////////////////////////////////////////////////////////////////////////////////////

    for(int i = 0; i < maxIter; i++)
    {
        int ptr = 0; // running index into t_row, advanced edge by edge
        mIteration = i + 1;

        //
        // MEASURE OF THE VN EXECUTION TIME
        //
#ifdef PROFILE_ON
        const auto start = timer();
#endif

        //
        // VN processing kernel
        //
        // In the active "#if 1" paths, 8 variable nodes are handled per pass:
        // the degree is read from VariableDegree[j] only, and "j += 7" plus the
        // loop increment skips the 7 following nodes.
        // NOTE(review): this presumably relies on nodes being grouped in runs
        // of 8 with identical degree and on _mBlocklength being a multiple of
        // 8 -- confirm against the code that builds VariableDegree / t_row.
        // A degree other than 2, 3 or 6 matches no branch: nothing is written
        // and ptr is not advanced.
        #pragma unroll
        for (int j = 0; j < _mBlocklength; j++)
        {
            const int degVn = VariableDegree[j];
            float M[8] __attribute__((aligned(64)));

            if( degVn == 2 ){
#if 1
                const int dVN = 2;
                // M[qq] = sum over the dVN edges of node j+qq of (zReplica + Lambda)
                for(int qq = 0; qq < 8; qq++)
                {
                    M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                    ptr += 1;
                    #pragma unroll
                    for(int k = 1; k < dVN; k++)
                    {
                        M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                        ptr += 1;
                    }
                }
                const auto m = _mm256_loadu_ps( M );
                const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
                // xx = (m - llr/mu - t2_amu) / (2 - t2_2amu), then clamp to [0, 1]
                const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
                const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t2_amu), _mm256_sub_ps(t2_deg, t2_2amu));
                const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
                _mm256_storeu_ps(&OutputFromDecoder[j], vMin);
                j += 7;
#else
                // Scalar reference path (one node per pass), currently disabled.
                const int degVN = 2;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                #pragma unroll
                for(int k = 1; k < degVN; k++)
                    temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
                ptr += degVN;
                const float _amu_ = tableaX[ degVN ];
                const float _2_amu_ = _amu_+ _amu_;
                const float llr = _LogLikelihoodRatio[j];
                const float t = temp - llr / mu;
                const float xx = (t - _amu_)/(degVn - _2_amu_);
                const float vMax = std::min(xx, 1.0f);
                const float vMin = std::max(vMax, 0.0f);
                OutputFromDecoder[j] = vMin;
#endif
            }else if( degVn == 3 ){
#if 1
                const int dVN = 3;
                // M[qq] = sum over the dVN edges of node j+qq of (zReplica + Lambda)
                for(int qq = 0; qq < 8; qq++)
                {
                    M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                    ptr += 1;
                    #pragma unroll
                    for(int k = 1; k < dVN; k++)
                    {
                        M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                        ptr += 1;
                    }
                }
                const auto m = _mm256_loadu_ps( M );
                const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
                // xx = (m - llr/mu - t3_amu) / (3 - t3_2amu), then clamp to [0, 1]
                const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
                const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t3_amu), _mm256_sub_ps(t3_deg, t3_2amu));
                const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
                _mm256_storeu_ps(&OutputFromDecoder[j], vMin);
                j += 7;
#else
                // Scalar reference path (one node per pass), currently disabled.
                const int degVN = 3;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                #pragma unroll
                for(int k = 1; k < degVN; k++)
                    temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
                ptr += degVN;
                const float _amu_ = tableaX[ degVN ];
                const float _2_amu_ = _amu_+ _amu_;
                const float llr = _LogLikelihoodRatio[j];
                const float t = temp - llr / mu;
                const float xx = (t - _amu_)/(degVn - _2_amu_);
                const float vMax = std::min(xx, 1.0f);
                const float vMin = std::max(vMax, 0.0f);
                OutputFromDecoder[j] = vMin;
#endif
            }else if( degVn == 6 ){
#if 1
                const int dVN = 6;
                // M[qq] = sum over the dVN edges of node j+qq of (zReplica + Lambda)
                for(int qq = 0; qq < 8; qq++)
                {
                    M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                    ptr += 1;
                    #pragma unroll
                    for(int k = 1; k < dVN; k++)
                    {
                        M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                        ptr += 1;
                    }
                }
                const auto m = _mm256_loadu_ps( M );
                const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
                // xx = (m - llr/mu - t6_amu) / (6 - t6_2amu), then clamp to [0, 1]
                const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
                const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t6_amu), _mm256_sub_ps(t6_deg, t6_2amu));
                const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
                _mm256_storeu_ps(&OutputFromDecoder[j], vMin);
                j += 7;
#else
                // Scalar reference path (one node per pass), currently disabled.
                const int degVN = 6;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
                #pragma unroll
                for(int k = 1; k < degVN; k++)
                    temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
                ptr += degVN;
                const float _amu_ = tableaX[ degVN ];
                const float _2_amu_ = _amu_+ _amu_;
                const float llr = _LogLikelihoodRatio[j];
                const float t = temp - llr / mu;
                const float xx = (t - _amu_)/(degVn - _2_amu_);
                const float vMax = std::min(xx, 1.0f);
                const float vMin = std::max(vMax, 0.0f);
                OutputFromDecoder[j] = vMin;
#endif
            }
        }

        //
        // MEASURE OF THE VN EXECUTION TIME
        //
#ifdef PROFILE_ON
        t_vn += (timer() - start);
#endif

        //
        // CN processing kernel
        //
        int CumSumCheckDegree = 0; // cumulative position of current edge in factor graph
        int allVerified = 0;       // number of checks with odd parity in this iteration
        float vector_before_proj[8] __attribute__((aligned(64))); // not referenced below
        const auto zero = _mm256_set1_ps ( 0.0f ); // shadows the outer "zero" (same value)
        // Lane masks: _mm256_set_epi32 lists lanes from 7 down to 0, so mask_6
        // enables lanes 0..5 and mask_7 enables lanes 0..6.
        const auto mask_6 = _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto mask_7 = _mm256_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto dot5 = _mm256_set1_ps( 0.5f ); // hard-decision threshold

        //
        // MEASURE OF THE CN EXECUTION TIME
        //
#ifdef PROFILE_ON
        const auto starT = timer();
#endif
        const auto seuilProj = _mm256_set1_ps( 1e-5f ); // change threshold below which the projection is skipped

        for(int j = 0; j < _mNChecks; j++)
        {
            if( CheckDegree[j] == 6 ){
                const int cDeg6 = 0x3F; // movemask bits of the 6 valid lanes
                // Full 8-lane unaligned loads are used even though only 6
                // entries belong to this check; the extra lanes are masked out
                // of the gather, the tests and the stores.
                // NOTE(review): these loads read up to 2 elements past the
                // check's last edge -- presumably the arrays are padded; confirm.
                const auto offsets = _mm256_loadu_si256 ((const __m256i*)&t_col1 [CumSumCheckDegree]);
                const auto xpred = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_6), 4);
                // Parity of the hard decisions (xpred > 0.5) over the 6 inputs
                const auto synd = _mm256_cmp_ps( xpred, dot5, _CMP_GT_OS );
                int test = (_mm256_movemask_ps( synd ) & cDeg6); // deg 6
                const auto syndrom = _mm_popcnt_u32( test );
                const auto _Replica = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
                const auto _ambda = _mm256_loadu_ps( &Lambda [CumSumCheckDegree]);
                // vect_proj = rho * xpred + (1 - rho) * zReplica - Lambda
                const auto v1 = _mm256_mul_ps (xpred, _rho );
                const auto v2 = _mm256_mul_ps ( _Replica, _un_m_rho );
                const auto v3 = _mm256_add_ps ( v1, v2 );
                const auto vect_proj = _mm256_sub_ps ( v3, _ambda );

                //
                // PERFORM THE PROJECTION
                //
                allVerified += ( syndrom & 0x01 );

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
#ifdef PROFILE_ON
                const auto START = timer();
#endif
                // skip is true when no valid lane of vect_proj differs from the
                // previous iteration's vector by more than seuilProj.
                const auto latest = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
                const auto different = _mm256_sub_ps ( vect_proj, latest );
                const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); // clears the sign bit
                const auto absolute = _mm256_and_ps ( different, maskAbsol );
                const auto despass = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
                int skip = (_mm256_movemask_ps( despass ) & cDeg6) == 0x00; // degree 6

                if( skip == false )
                {
                    // Lambda += rho * (z - xpred) + (1 - rho) * (z - zReplica); zReplica = z
                    const auto _ztemp = mp.projection_deg6( vect_proj );
                    const auto _ztemp1 = _mm256_sub_ps(_ztemp, xpred );
                    const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
                    const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
                    const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
                    const auto nLambda = _mm256_add_ps( _ambda, _ztemp3 );
                    const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
                    _mm256_maskstore_ps(& Lambda[CumSumCheckDegree], mask_6, mLambda);
                    _mm256_maskstore_ps(&zReplica[CumSumCheckDegree], mask_6, _ztemp);
                }
                // Remembered whether or not the projection was skipped
                _mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_6, vect_proj);

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
#ifdef PROFILE_ON
                t_pj += (timer() - START);
#endif
                CumSumCheckDegree += 6;

            }else if( CheckDegree[j] == 7 )
            {
                const int cDeg7 = 0x7F; // movemask bits of the 7 valid lanes
                // Same scheme as the degree-6 branch with 7 valid lanes.
                // NOTE(review): the 8-lane loads read 1 element past the
                // check's last edge -- presumably the arrays are padded; confirm.
                const auto offsets = _mm256_loadu_si256 ((const __m256i*)&t_col1 [CumSumCheckDegree]);
                const auto xpred = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_7), 4);
                // Parity of the hard decisions (xpred > 0.5) over the 7 inputs
                const auto synd = _mm256_cmp_ps( xpred, dot5, _CMP_GT_OS );
                const int test = (_mm256_movemask_ps( synd ) & cDeg7); // deg 7
                const auto syndrom = _mm_popcnt_u32( test );
                const auto _Replica = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
                const auto _ambda = _mm256_loadu_ps( &Lambda [CumSumCheckDegree]);
                // vect_proj = rho * xpred + (1 - rho) * zReplica - Lambda
                const auto v1 = _mm256_mul_ps ( xpred, _rho );
                const auto v2 = _mm256_mul_ps ( _Replica, _un_m_rho );
                const auto v3 = _mm256_add_ps ( v1, v2 );
                const auto vect_proj = _mm256_sub_ps ( v3, _ambda );

                //
                // PERFORM THE PROJECTION
                //
                allVerified += ( syndrom & 0x01 );

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
#ifdef PROFILE_ON
                const auto START = timer();
#endif
                // skip is true when no valid lane of vect_proj differs from the
                // previous iteration's vector by more than seuilProj.
                const auto latest = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
                const auto different = _mm256_sub_ps ( vect_proj, latest );
                const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); // clears the sign bit
                const auto absolute = _mm256_and_ps ( different, maskAbsol );
                const auto despass = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
                int skip = (_mm256_movemask_ps( despass ) & cDeg7) == 0x00; // degree 7

                if( skip == false )
                {
                    // Lambda += rho * (z - xpred) + (1 - rho) * (z - zReplica); zReplica = z
                    const auto _ztemp = mp.projection_deg7( vect_proj );
                    const auto _ztemp1 = _mm256_sub_ps(_ztemp, xpred );
                    const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
                    const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
                    const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
                    const auto nLambda = _mm256_add_ps( _ambda, _ztemp3 );
                    const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
                    _mm256_maskstore_ps(& Lambda [CumSumCheckDegree], mask_7, mLambda);
                    _mm256_maskstore_ps(&zReplica [CumSumCheckDegree], mask_7, _ztemp);
                }
                // Remembered whether or not the projection was skipped
                _mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_7, vect_proj);

                //
                // MEASURE OF THE PROJECTION EXECUTION TIME
                //
#ifdef PROFILE_ON
                t_pj += (timer() - START);
#endif
                CumSumCheckDegree += 7;

            }else{
                // NOTE(review): an unsupported check degree terminates the
                // whole process, and with exit status 0 -- confirm this is
                // intended rather than an error status.
                exit( 0 );
            }
        }

        //
        // MEASURE OF THE CN LOOP EXECUTION TIME
        //
#ifdef PROFILE_ON
        t_cn += (timer() - starT);
#endif
#ifdef PROFILE_ON
        t_ex += 1;
        //FILE *ft=fopen("time.txt","a");
        //fprintf(ft,"%d \n", t_cn/t_ex);
        //fprintf(ft,"%d %d %d \n", t_cn, t_vn, t_pj);
        //fclose(ft);
#endif

        // Every check has even parity: stop iterating.
        if(allVerified == 0)
        {
            mAlgorithmConverge = true;
            mValidCodeword = true;
            break;
        }
    }

    //
    // MEASURE OF THE NUMBER OF EXECUTION
    //
//    #ifdef PROFILE_ON
//        t_ex += 1;
//    #endif
}
/*
 * Pairwise interaction between the N atoms of quadrant q and the atoms of a
 * neighbouring quadrant q_n (N_pad entries, processed 8 at a time with aligned
 * AVX loads/stores and FMA instructions).
 *
 * For every pair with squared distance below Rcut2 the Lennard-Jones style
 * force term 24*eps/r2 * (2*s12 - s6) is added to the force arrays of q and
 * subtracted from those of q_n, and 4*eps*(s12 - s6) is accumulated into
 * *Epot_.  Lanes outside the cut-off are masked to zero.
 *
 * The whole body is compiled only when __AVX2__ is defined; otherwise the
 * function does nothing.
 * NOTE(review): q and q_n are passed by value, so the force updates reach the
 * caller only if x/y/z/f_* are pointer members -- presumably they are; confirm.
 */
void molec_quadrant_neighbor_interaction_fma(molec_Quadrant_t q, molec_Quadrant_t q_n, float* Epot_)
{
#ifdef __AVX2__
    const int n_atoms = q.N;
    const int n_neighbors_padded = q_n.N_pad;

    /* loop-invariant constants */
    const __m256 one = _mm256_set1_ps(1.f);
    const __m256 two = _mm256_set1_ps(2.f);
    const __m256 sigma = _mm256_set1_ps(molec_parameter->sigLJ);
    const __m256 sigma2 = _mm256_mul_ps(sigma, sigma);
    const __m256 eps = _mm256_set1_ps(molec_parameter->epsLJ);
    const __m256 eps24 = _mm256_mul_ps(_mm256_set1_ps(24.f), eps);
    const __m256 cutoff2 = _mm256_set1_ps(molec_parameter->Rcut2);

    /* per-lane partial sums of (s12 - s6) */
    __m256 epot_lanes = _mm256_setzero_ps();

    for(int i = 0; i < n_atoms; ++i)
    {
        const __m256 xi = _mm256_set1_ps(q.x[i]);
        const __m256 yi = _mm256_set1_ps(q.y[i]);
        const __m256 zi = _mm256_set1_ps(q.z[i]);

        /* per-lane partial forces on atom i */
        __m256 fx_acc = _mm256_setzero_ps();
        __m256 fy_acc = _mm256_setzero_ps();
        __m256 fz_acc = _mm256_setzero_ps();

        for(int j = 0; j < n_neighbors_padded; j += 8)
        {
            // count number of interactions
            if(MOLEC_CELLLIST_COUNT_INTERACTION)
                ++num_potential_interactions;

            // distance vector from the 8 neighbours to atom i
            const __m256 dx = _mm256_sub_ps(xi, _mm256_load_ps(&q_n.x[j]));
            const __m256 dy = _mm256_sub_ps(yi, _mm256_load_ps(&q_n.y[j]));
            const __m256 dz = _mm256_sub_ps(zi, _mm256_load_ps(&q_n.z[j]));

            // r2 = dx*dx + (dy*dy + dz*dz)
            const __m256 dz2 = _mm256_mul_ps(dz, dz);
            const __m256 r2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, dz2));

            // lanes with r2 < Rcut2
            const __m256 in_range = _mm256_cmp_ps(r2, cutoff2, _CMP_LT_OQ);

            // nothing to do unless at least one lane is inside the cut-off
            if(!_mm256_movemask_ps(in_range))
                continue;

            const __m256 inv_r2 = _mm256_div_ps(one, r2);
            const __m256 s2 = _mm256_mul_ps(sigma2, inv_r2);
            const __m256 s6 = _mm256_mul_ps(_mm256_mul_ps(s2, s2), s2);
            const __m256 s12 = _mm256_mul_ps(s6, s6);

            // potential energy contribution (scaled by 4*eps at the end)
            epot_lanes = _mm256_add_ps(epot_lanes,
                                       _mm256_and_ps(_mm256_sub_ps(s12, s6), in_range));

            // force magnitude over distance, zeroed outside the cut-off
            const __m256 two_s12_minus_s6 = _mm256_sub_ps(_mm256_mul_ps(two, s12), s6);
            const __m256 f_over_r =
                _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(eps24, inv_r2), two_s12_minus_s6),
                              in_range);

            // update i-forces
            fx_acc = _mm256_fmadd_ps(f_over_r, dx, fx_acc);
            fy_acc = _mm256_fmadd_ps(f_over_r, dy, fy_acc);
            fz_acc = _mm256_fmadd_ps(f_over_r, dz, fz_acc);

            // update j-forces with the opposite sign and store them back
            _mm256_store_ps(&q_n.f_x[j],
                            _mm256_fnmadd_ps(f_over_r, dx, _mm256_load_ps(&q_n.f_x[j])));
            _mm256_store_ps(&q_n.f_y[j],
                            _mm256_fnmadd_ps(f_over_r, dy, _mm256_load_ps(&q_n.f_y[j])));
            _mm256_store_ps(&q_n.f_z[j],
                            _mm256_fnmadd_ps(f_over_r, dz, _mm256_load_ps(&q_n.f_z[j])));
        }

        // reduce the per-lane partial forces and add them to atom i
        float MOLEC_ALIGNAS(32) lanes[8];

        _mm256_store_ps(lanes, fx_acc);
        q.f_x[i] += lanes[0] + lanes[1] + lanes[2] + lanes[3]
                  + lanes[4] + lanes[5] + lanes[6] + lanes[7];

        _mm256_store_ps(lanes, fy_acc);
        q.f_y[i] += lanes[0] + lanes[1] + lanes[2] + lanes[3]
                  + lanes[4] + lanes[5] + lanes[6] + lanes[7];

        _mm256_store_ps(lanes, fz_acc);
        q.f_z[i] += lanes[0] + lanes[1] + lanes[2] + lanes[3]
                  + lanes[4] + lanes[5] + lanes[6] + lanes[7];
    }

    // perform reduction of potential energy
    float MOLEC_ALIGNAS(32) epot[8];
    _mm256_store_ps(epot, epot_lanes);

    *Epot_ += 4 * molec_parameter->epsLJ*(epot[0] + epot[1] + epot[2] + epot[3]
                                        + epot[4] + epot[5] + epot[6] + epot[7]);
#endif
}