#ifdef __AVX2__
/*
 * Spills the eight lanes of `v` to memory and adds them strictly left to
 * right, starting with lane 0. The fixed order keeps the floating-point
 * rounding of the reduction reproducible.
 */
static inline float molec_fma_sum_lanes_in_order(__m256 v)
{
    float MOLEC_ALIGNAS(32) lane[8];
    _mm256_store_ps(lane, v);

    float total = lane[0];
    for(int k = 1; k < 8; ++k)
        total += lane[k];
    return total;
}
#endif

/*
 * Lennard-Jones interaction between the atoms of quadrant `q` and the atoms of
 * the neighbouring quadrant `q_n`, eight neighbour atoms at a time.
 *
 * For each atom i in [0, q.N) the neighbour arrays are walked in groups of
 * eight up to q_n.N_pad. Pairs with r^2 < Rcut2 contribute to:
 *   - the force on atom i      (accumulated into q.f_x/f_y/f_z[i]),
 *   - the force on atom j      (subtracted in place in q_n.f_x/f_y/f_z[j]),
 *   - the potential energy     (added to *Epot_, scaled by 4 * epsLJ).
 *
 * The neighbour arrays are accessed with aligned loads/stores, so every group
 * of eight must start on a 32-byte boundary.
 * NOTE(review): padded entries in q_n (indices >= its real atom count) are
 * presumably placed outside the cut-off so they never pass the mask - confirm
 * against the code that fills N_pad.
 *
 * The function body is empty when __AVX2__ is not defined.
 */
void molec_quadrant_neighbor_interaction_fma(molec_Quadrant_t q, molec_Quadrant_t q_n, float* Epot_)
{
#ifdef __AVX2__
    const __m256 sigma = _mm256_set1_ps(molec_parameter->sigLJ);
    const __m256 epsilon = _mm256_set1_ps(molec_parameter->epsLJ);
    const __m256 cutoff2 = _mm256_set1_ps(molec_parameter->Rcut2);

    const __m256 one = _mm256_set1_ps(1.f);
    const __m256 two = _mm256_set1_ps(2.f);
    const __m256 eps24 = _mm256_mul_ps(_mm256_set1_ps(24.f), epsilon);
    const __m256 sigma2 = _mm256_mul_ps(sigma, sigma);

    const int n_own = q.N;
    const int n_neighbor = q_n.N_pad;

    /* per-lane running sum of (s12 - s6) over all accepted pairs */
    __m256 epot_lanes = _mm256_setzero_ps();

    for(int i = 0; i < n_own; ++i)
    {
        const __m256 xi = _mm256_set1_ps(q.x[i]);
        const __m256 yi = _mm256_set1_ps(q.y[i]);
        const __m256 zi = _mm256_set1_ps(q.z[i]);

        __m256 fxi_lanes = _mm256_setzero_ps();
        __m256 fyi_lanes = _mm256_setzero_ps();
        __m256 fzi_lanes = _mm256_setzero_ps();

        for(int j = 0; j < n_neighbor; j += 8)
        {
            /* one count per group of eight candidate pairs */
            if(MOLEC_CELLLIST_COUNT_INTERACTION)
                ++num_potential_interactions;

            /* neighbour coordinates and forces */
            const __m256 xj = _mm256_load_ps(&q_n.x[j]);
            const __m256 yj = _mm256_load_ps(&q_n.y[j]);
            const __m256 zj = _mm256_load_ps(&q_n.z[j]);

            __m256 fxj = _mm256_load_ps(&q_n.f_x[j]);
            __m256 fyj = _mm256_load_ps(&q_n.f_y[j]);
            __m256 fzj = _mm256_load_ps(&q_n.f_z[j]);

            /* squared distance: dx*dx + (dy*dy + dz*dz) */
            const __m256 dx = _mm256_sub_ps(xi, xj);
            const __m256 dy = _mm256_sub_ps(yi, yj);
            const __m256 dz = _mm256_sub_ps(zi, zj);

            const __m256 dz2 = _mm256_mul_ps(dz, dz);
            const __m256 r2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, dz2));

            /* lanes with r2 < Rcut2 are all-ones, the rest all-zeros */
            const __m256 in_range = _mm256_cmp_ps(r2, cutoff2, _CMP_LT_OQ);

            /* nothing to do when no lane is inside the cut-off */
            if(!_mm256_movemask_ps(in_range))
                continue;

            const __m256 r2inv = _mm256_div_ps(one, r2);

            const __m256 s2 = _mm256_mul_ps(sigma2, r2inv);
            const __m256 s6 = _mm256_mul_ps(_mm256_mul_ps(s2, s2), s2);
            const __m256 s12 = _mm256_mul_ps(s6, s6);

            const __m256 s12_minus_s6 = _mm256_sub_ps(s12, s6);
            const __m256 two_s12_minus_s6 = _mm256_sub_ps(_mm256_mul_ps(two, s12), s6);

            /* the AND with the mask zeroes the lanes outside the cut-off */
            epot_lanes = _mm256_add_ps(epot_lanes, _mm256_and_ps(s12_minus_s6, in_range));

            const __m256 fr = _mm256_mul_ps(_mm256_mul_ps(eps24, r2inv), two_s12_minus_s6);
            const __m256 fr_masked = _mm256_and_ps(fr, in_range);

            /* atom i gains +fr * d, atom j gains -fr * d */
            fxi_lanes = _mm256_fmadd_ps(fr_masked, dx, fxi_lanes);
            fyi_lanes = _mm256_fmadd_ps(fr_masked, dy, fyi_lanes);
            fzi_lanes = _mm256_fmadd_ps(fr_masked, dz, fzi_lanes);

            fxj = _mm256_fnmadd_ps(fr_masked, dx, fxj);
            fyj = _mm256_fnmadd_ps(fr_masked, dy, fyj);
            fzj = _mm256_fnmadd_ps(fr_masked, dz, fzj);

            /* write the updated neighbour forces back */
            _mm256_store_ps(&q_n.f_x[j], fxj);
            _mm256_store_ps(&q_n.f_y[j], fyj);
            _mm256_store_ps(&q_n.f_z[j], fzj);
        }

        /* fold the eight partial forces into atom i */
        q.f_x[i] += molec_fma_sum_lanes_in_order(fxi_lanes);
        q.f_y[i] += molec_fma_sum_lanes_in_order(fyi_lanes);
        q.f_z[i] += molec_fma_sum_lanes_in_order(fzi_lanes);
    }

    /* reduce the potential energy and apply the 4 * epsilon prefactor */
    *Epot_ += 4 * molec_parameter->epsLJ * molec_fma_sum_lanes_in_order(epot_lanes);
#endif
}
real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH; real scratch[4*DIM]; __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256 velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m256i gbitab; __m128i gbitab_lo,gbitab_hi; __m256 vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,gbeps,dvdatmp; __m256 minushalf = _mm256_set1_ps(-0.5); real *invsqrta,*dvda,*gbtab; __m256i vfitab; __m128i vfitab_lo,vfitab_hi; __m128i ifour = _mm_set1_epi32(4); __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; real *vftab; __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr;
int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH; real scratch[4*DIM]; __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; int nvdwtype; __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256 one_sixth = _mm256_set1_ps(1.0/6.0); __m256 one_twelfth = _mm256_set1_ps(1.0/12.0); __m256i vfitab; __m128i vfitab_lo,vfitab_hi; __m128i ifour = _mm_set1_epi32(4); __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; real *vftab; __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr;
real * vdwioffsetptr1; __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; real * vdwioffsetptr2; __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; real * vdwioffsetptr3; __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; __m256 velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_ps(fr->epsfac); charge = mdatoms->chargeA; krf = _mm256_set1_ps(fr->ic->k_rf);
static void sfid_render_cache_rt_write_simd8_rgba_uint32_linear(struct thread *t, const struct sfid_render_cache_args *args) { const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; const struct reg *src = &t->grf[args->src]; __m128i *base0 = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride; __m128i *base1 = (void *) base0 + args->rt.stride; __m256i rg0145 = _mm256_unpacklo_epi32(src[0].ireg, src[1].ireg); __m256i rg2367 = _mm256_unpackhi_epi32(src[0].ireg, src[1].ireg); __m256i ba0145 = _mm256_unpacklo_epi32(src[2].ireg, src[3].ireg); __m256i ba2367 = _mm256_unpackhi_epi32(src[2].ireg, src[3].ireg); __m256i rgba04 = _mm256_unpacklo_epi64(rg0145, ba0145); __m256i rgba15 = _mm256_unpackhi_epi64(rg0145, ba0145); __m256i rgba26 = _mm256_unpacklo_epi64(rg2367, ba2367); __m256i rgba37 = _mm256_unpackhi_epi64(rg2367, ba2367); struct reg mask = { .ireg = t->mask_q1 }; if (mask.d[0] < 0) base0[0] = _mm256_extractf128_si256(rgba04, 0); if (mask.d[1] < 0) base0[1] = _mm256_extractf128_si256(rgba15, 0); if (mask.d[2] < 0) base1[0] = _mm256_extractf128_si256(rgba26, 0); if (mask.d[3] < 0) base1[1] = _mm256_extractf128_si256(rgba37, 0); if (mask.d[4] < 0) base0[2] = _mm256_extractf128_si256(rgba04, 1); if (mask.d[5] < 0) base0[3] = _mm256_extractf128_si256(rgba15, 1); if (mask.d[6] < 0) base1[2] = _mm256_extractf128_si256(rgba26, 1); if (mask.d[7] < 0) base1[3] = _mm256_extractf128_si256(rgba37, 1); } static void write_uint16_linear(struct thread *t, const struct sfid_render_cache_args *args, __m256i r, __m256i g, __m256i b, __m256i a) { const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; __m256i rg, ba; rg = _mm256_slli_epi32(g, 16); rg = _mm256_or_si256(rg, r); ba = _mm256_slli_epi32(a, 16); ba = _mm256_or_si256(ba, b); __m256i p0 = _mm256_unpacklo_epi32(rg, ba); __m256i m0 = 
_mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 0)); __m256i p1 = _mm256_unpackhi_epi32(rg, ba); __m256i m1 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 1)); void *base = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride; _mm_maskstore_epi64(base, _mm256_extractf128_si256(m0, 0), _mm256_extractf128_si256(p0, 0)); _mm_maskstore_epi64((base + 16), _mm256_extractf128_si256(m1, 0), _mm256_extractf128_si256(p0, 1)); _mm_maskstore_epi64((base + args->rt.stride), _mm256_extractf128_si256(m0, 1), _mm256_extractf128_si256(p1, 0)); _mm_maskstore_epi64((base + args->rt.stride + 16), _mm256_extractf128_si256(m1, 1), _mm256_extractf128_si256(p1, 1)); } static void sfid_render_cache_rt_write_simd8_rgba_unorm16_linear(struct thread *t, const struct sfid_render_cache_args *args) { __m256i r, g, b, a; const __m256 scale = _mm256_set1_ps(65535.0f); const __m256 half = _mm256_set1_ps(0.5f); struct reg *src = &t->grf[args->src]; r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half)); g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half)); b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half)); a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half)); write_uint16_linear(t, args, r, g, b, a); }
real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH; real scratch[4*DIM]; __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256 velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256 one_sixth = _mm256_set1_ps(1.0/6.0); __m256 one_twelfth = _mm256_set1_ps(1.0/12.0); __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0];
void Decoder::ADMMDecoder_deg_6_7_2_3_6() { int maxIter = maxIteration; float mu = 5.5f; float tableau[12] = { 0.0f }; if ((mBlocklength == 576) && (mNChecks == 288)) { mu = 3.37309f;//penalty tableau[2] = 0.00001f; tableau[3] = 2.00928f; tableau[6] = 4.69438f; } else if((mBlocklength == 2304) && (mNChecks == 1152) ) { mu = 3.81398683f;//penalty tableau[2] = 0.29669288f; tableau[3] = 0.46964023f; tableau[6] = 3.19548154f; } else { mu = 5.5;//penalty tableau[2] = 0.8f; tableau[3] = 0.8f; tableau[6] = 0.8f; } const float rho = 1.9f; //over relaxation parameter; const float un_m_rho = 1.0 - rho; const auto _rho = _mm256_set1_ps( rho ); const auto _un_m_rho = _mm256_set1_ps( un_m_rho ); float tableaX[12]; // // ON PRECALCULE LES CONSTANTES // #pragma unroll for (int i = 0; i < 7; i++) { tableaX[i] = tableau[ i ] / mu; } const auto t_mu = _mm256_set1_ps ( mu ); const auto t2_amu = _mm256_set1_ps ( tableau[ 2 ] / mu ); const auto t3_amu = _mm256_set1_ps ( tableau[ 3 ] / mu ); const auto t6_amu = _mm256_set1_ps ( tableau[ 6 ] / mu ); const auto t2_2amu = _mm256_set1_ps ( 2.0f * tableau[ 2 ] / mu ); const auto t3_2amu = _mm256_set1_ps ( 2.0f * tableau[ 3 ] / mu ); const auto t6_2amu = _mm256_set1_ps ( 2.0f * tableau[ 6 ] / mu ); const auto t2_deg = _mm256_set1_ps ( 2.0f ); const auto t3_deg = _mm256_set1_ps ( 3.0f ); const auto t6_deg = _mm256_set1_ps ( 6.0f ); const auto zero = _mm256_set1_ps ( 0.0f ); const auto un = _mm256_set1_ps ( 1.0f ); const __m256 a = _mm256_set1_ps ( 0.0f ); const __m256 b = _mm256_set1_ps ( 0.5f ); ////////////////////////////////////////////////////////////////////////////////////// #pragma unroll for( int j = 0; j < _mPCheckMapSize; j+=8 ) { _mm256_store_ps(&Lambda [j], a); _mm256_store_ps(&zReplica[j], b); _mm256_store_ps(&latestProjVector[j], b); } ////////////////////////////////////////////////////////////////////////////////////// for(int i = 0; i < maxIter; i++) { int ptr = 0; mIteration = i + 1; // // MEASURE OF THE VN EXECUTION TIME // 
#ifdef PROFILE_ON const auto start = timer(); #endif // // VN processing kernel // #pragma unroll for (int j = 0; j < _mBlocklength; j++) { const int degVn = VariableDegree[j]; float M[8] __attribute__((aligned(64))); if( degVn == 2 ){ #if 1 const int dVN = 2; for(int qq = 0; qq < 8; qq++) { M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr += 1; #pragma unroll for(int k = 1; k < dVN; k++) { M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr += 1; } } const auto m = _mm256_loadu_ps( M ); const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] ); const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu)); const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t2_amu), _mm256_sub_ps(t2_deg, t2_2amu)); const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero); _mm256_storeu_ps(&OutputFromDecoder[j], vMin); j += 7; #else const int degVN = 2; float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); #pragma unroll for(int k = 1; k < degVN; k++) temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]); ptr += degVN; const float _amu_ = tableaX[ degVN ]; const float _2_amu_ = _amu_+ _amu_; const float llr = _LogLikelihoodRatio[j]; const float t = temp - llr / mu; const float xx = (t - _amu_)/(degVn - _2_amu_); const float vMax = std::min(xx, 1.0f); const float vMin = std::max(vMax, 0.0f); OutputFromDecoder[j] = vMin; #endif }else if( degVn == 3 ){ #if 1 const int dVN = 3; for(int qq = 0; qq < 8; qq++) { M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr += 1; #pragma unroll for(int k = 1; k < dVN; k++) { M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr += 1; } } const auto m = _mm256_loadu_ps( M ); const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] ); const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu)); const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t3_amu), _mm256_sub_ps(t3_deg, t3_2amu)); const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero); _mm256_storeu_ps(&OutputFromDecoder[j], vMin); j += 
7; #else const int degVN = 3; float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); #pragma unroll for(int k = 1; k < degVN; k++) temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]); ptr += degVN; const float _amu_ = tableaX[ degVN ]; const float _2_amu_ = _amu_+ _amu_; const float llr = _LogLikelihoodRatio[j]; const float t = temp - llr / mu; const float xx = (t - _amu_)/(degVn - _2_amu_); const float vMax = std::min(xx, 1.0f); const float vMin = std::max(vMax, 0.0f); OutputFromDecoder[j] = vMin; #endif }else if( degVn == 6 ){ #if 1 const int dVN = 6; for(int qq = 0; qq < 8; qq++) { M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr += 1; #pragma unroll for(int k = 1; k < dVN; k++) { M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr += 1; } } const auto m = _mm256_loadu_ps( M ); const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] ); const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu)); const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t6_amu), _mm256_sub_ps(t6_deg, t6_2amu)); const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero); _mm256_storeu_ps(&OutputFromDecoder[j], vMin); j += 7; #else const int degVN = 6; float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); #pragma unroll for(int k = 1; k < degVN; k++) temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]); ptr += degVN; const float _amu_ = tableaX[ degVN ]; const float _2_amu_ = _amu_+ _amu_; const float llr = _LogLikelihoodRatio[j]; const float t = temp - llr / mu; const float xx = (t - _amu_)/(degVn - _2_amu_); const float vMax = std::min(xx, 1.0f); const float vMin = std::max(vMax, 0.0f); OutputFromDecoder[j] = vMin; #endif } } // // MEASURE OF THE VN EXECUTION TIME // #ifdef PROFILE_ON t_vn += (timer() - start); #endif // // CN processing kernel // int CumSumCheckDegree = 0; // cumulative position of currect edge in factor graph int allVerified = 0; float vector_before_proj[8] __attribute__((aligned(64))); const auto zero = 
_mm256_set1_ps ( 0.0f ); const auto mask_6 = _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF); const auto mask_7 = _mm256_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF); const auto dot5 = _mm256_set1_ps( 0.5f ); // // MEASURE OF THE CN EXECUTION TIME // #ifdef PROFILE_ON const auto starT = timer(); #endif const auto seuilProj = _mm256_set1_ps( 1e-5f ); for(int j = 0; j < _mNChecks; j++) { if( CheckDegree[j] == 6 ){ const int cDeg6 = 0x3F; const auto offsets = _mm256_loadu_si256 ((const __m256i*)&t_col1 [CumSumCheckDegree]); const auto xpred = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_6), 4); const auto synd = _mm256_cmp_ps( xpred, dot5, _CMP_GT_OS ); int test = (_mm256_movemask_ps( synd ) & cDeg6); // deg 6 const auto syndrom = _mm_popcnt_u32( test ); const auto _Replica = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]); const auto _ambda = _mm256_loadu_ps( &Lambda [CumSumCheckDegree]); const auto v1 = _mm256_mul_ps (xpred, _rho ); const auto v2 = _mm256_mul_ps ( _Replica, _un_m_rho ); const auto v3 = _mm256_add_ps ( v1, v2 ); const auto vect_proj = _mm256_sub_ps ( v3, _ambda ); // // ON REALISE LA PROJECTION !!! 
// allVerified += ( syndrom & 0x01 ); // // MEASURE OF THE PROJECTION EXECUTION TIME // #ifdef PROFILE_ON const auto START = timer(); #endif const auto latest = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]); const auto different = _mm256_sub_ps ( vect_proj, latest ); const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const auto absolute = _mm256_and_ps ( different, maskAbsol ); const auto despass = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS ); int skip = (_mm256_movemask_ps( despass ) & cDeg6) == 0x00; // degree 6 if( skip == false ) { const auto _ztemp = mp.projection_deg6( vect_proj ); const auto _ztemp1 = _mm256_sub_ps(_ztemp, xpred ); const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica ); const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho); const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho); const auto nLambda = _mm256_add_ps( _ambda, _ztemp3 ); const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 ); _mm256_maskstore_ps(& Lambda[CumSumCheckDegree], mask_6, mLambda); _mm256_maskstore_ps(&zReplica[CumSumCheckDegree], mask_6, _ztemp); } _mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_6, vect_proj); // // MEASURE OF THE PROJECTION EXECUTION TIME // #ifdef PROFILE_ON t_pj += (timer() - START); #endif CumSumCheckDegree += 6; }else if( CheckDegree[j] == 7 ) { const int cDeg7 = 0x7F; const auto offsets = _mm256_loadu_si256 ((const __m256i*)&t_col1 [CumSumCheckDegree]); const auto xpred = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_7), 4); const auto synd = _mm256_cmp_ps( xpred, dot5, _CMP_GT_OS ); const int test = (_mm256_movemask_ps( synd ) & cDeg7); // deg 7 const auto syndrom = _mm_popcnt_u32( test ); const auto _Replica = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]); const auto _ambda = _mm256_loadu_ps( &Lambda [CumSumCheckDegree]); const auto v1 = _mm256_mul_ps ( xpred, _rho ); const auto v2 = _mm256_mul_ps ( _Replica, _un_m_rho ); const auto v3 = _mm256_add_ps ( v1, v2 
); const auto vect_proj = _mm256_sub_ps ( v3, _ambda ); // // ON REALISE LA PROJECTION !!! // allVerified += ( syndrom & 0x01 ); // // MEASURE OF THE PROJECTION EXECUTION TIME // #ifdef PROFILE_ON const auto START = timer(); #endif const auto latest = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]); const auto different = _mm256_sub_ps ( vect_proj, latest ); const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const auto absolute = _mm256_and_ps ( different, maskAbsol ); const auto despass = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS ); int skip = (_mm256_movemask_ps( despass ) & cDeg7) == 0x00; // degree 7 if( skip == false ) { const auto _ztemp = mp.projection_deg7( vect_proj ); const auto _ztemp1 = _mm256_sub_ps(_ztemp, xpred ); const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica ); const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho); const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho); const auto nLambda = _mm256_add_ps( _ambda, _ztemp3 ); const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 ); _mm256_maskstore_ps(& Lambda [CumSumCheckDegree], mask_7, mLambda); _mm256_maskstore_ps(&zReplica [CumSumCheckDegree], mask_7, _ztemp); } _mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_7, vect_proj); // // MEASURE OF THE PROJECTION EXECUTION TIME // #ifdef PROFILE_ON t_pj += (timer() - START); #endif CumSumCheckDegree += 7; }else{ exit( 0 ); } } // // MEASURE OF THE CN LOOP EXECUTION TIME // #ifdef PROFILE_ON t_cn += (timer() - starT); #endif #ifdef PROFILE_ON t_ex += 1; //FILE *ft=fopen("time.txt","a"); //fprintf(ft,"%d \n", t_cn/t_ex); //fprintf(ft,"%d %d %d \n", t_cn, t_vn, t_pj); //fclose(ft); #endif if(allVerified == 0) { mAlgorithmConverge = true; mValidCodeword = true; break; } } // // MEASURE OF THE NUMBER OF EXECUTION // // #ifdef PROFILE_ON // t_ex += 1; // #endif }
real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH; real scratch[4*DIM]; __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; real * vdwgridioffsetptr0; __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; int nvdwtype; __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256 one_sixth = _mm256_set1_ps(1.0/6.0); __m256 one_twelfth = _mm256_set1_ps(1.0/12.0); __m256 c6grid_00; real *vdwgridparam; __m256 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald; __m256 one_half = _mm256_set1_ps(0.5); __m256 minus_one = _mm256_set1_ps(-1.0); __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr;
int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH; real scratch[4*DIM]; __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; int nvdwtype; __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256 one_sixth = _mm256_set1_ps(1.0/6.0); __m256 one_twelfth = _mm256_set1_ps(1.0/12.0); __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0];
int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH; real scratch[4*DIM]; __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; int nvdwtype; __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256 one_sixth = _mm256_set1_ps(1.0/6.0); __m256 one_twelfth = _mm256_set1_ps(1.0/12.0); __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; real rswitch_scalar,d_scalar; __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift;
/* Returns an AVX vector whose eight single-precision lanes all hold 0.5. */
static inline __m256 gen_05(void)
{
    const float half = 0.5f;

    return _mm256_set1_ps(half);
}
/* Returns an AVX vector whose eight single-precision lanes all hold 1.0. */
static inline __m256 gen_one(void)
{
    const float unity = 1.f;

    return _mm256_set1_ps(unity);
}
//----------------------------------------------------------------------------------------- // Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer // If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee // as visible. If all rasterized AABB pixels are occluded then the occludee is culled //----------------------------------------------------------------------------------------- bool TransformedAABBoxAVX::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels, const __m128 pXformedPos[], UINT idx) { // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster) // Denormal are zero (DAZ) is bit 6 and Flush to zero (FZ) is bit 15. // so to enable the two to have to set bits 6 and 15 which 1000 0000 0100 0000 = 0x8040 _mm_setcsr( _mm_getcsr() | 0x8040 ); __m256i colOffset = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3); __m256i rowOffset = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1); float* pDepthBuffer = (float*)pRenderTargetPixels; // Rasterize the AABB triangles 4 at a time for(UINT i = 0; i < AABB_TRIANGLES; i += SSE) { vFloat4 xformedPos[3]; Gather(xformedPos, i, pXformedPos, idx); // use fixed-point only for X and Y. Avoid work for Z and W. 
__m128i fxPtX[3], fxPtY[3]; for(int m = 0; m < 3; m++) { fxPtX[m] = _mm_cvtps_epi32(xformedPos[m].X); fxPtY[m] = _mm_cvtps_epi32(xformedPos[m].Y); } // Fab(x, y) = Ax + By + C = 0 // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 // Compute A = (ya - yb) for the 3 line segments that make up each triangle __m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]); __m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]); __m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]); // Compute B = (xb - xa) for the 3 line segments that make up each triangle __m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]); __m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]); __m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]); // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1])); __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2])); __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0])); // Compute triangle area __m128i triArea = _mm_mullo_epi32(B2, A1); triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2)); __m128 oneOverTriArea = _mm_rcp_ps(_mm_cvtepi32_ps(triArea)); __m128 Z[3]; Z[0] = xformedPos[0].Z; Z[1] = _mm_mul_ps(_mm_sub_ps(xformedPos[1].Z, Z[0]), oneOverTriArea); Z[2] = _mm_mul_ps(_mm_sub_ps(xformedPos[2].Z, Z[0]), oneOverTriArea); // Use bounding box traversal strategy to determine which pixels to rasterize //__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1)); __m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~3)); __m128i endX = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(SCREENW - 1)); __m128i startY = 
_mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1)); __m128i endY = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(SCREENH - 1)); // Now we have 4 triangles set up. Rasterize them each individually. for(int lane=0; lane < SSE; lane++) { // Skip triangle if area is zero if(triArea.m128i_i32[lane] <= 0) { continue; } // Extract this triangle's properties from the SIMD versions __m256 zz[3]; for (int vv = 0; vv < 3; vv++) { zz[vv] = _mm256_set1_ps(Z[vv].m128_f32[lane]); } int startXx = startX.m128i_i32[lane]; int endXx = endX.m128i_i32[lane]; int startYy = startY.m128i_i32[lane]; int endYy = endY.m128i_i32[lane]; __m256i aa0 = _mm256_set1_epi32(A0.m128i_i32[lane]); __m256i aa1 = _mm256_set1_epi32(A1.m128i_i32[lane]); __m256i aa2 = _mm256_set1_epi32(A2.m128i_i32[lane]); __m256i bb0 = _mm256_set1_epi32(B0.m128i_i32[lane]); __m256i bb1 = _mm256_set1_epi32(B1.m128i_i32[lane]); __m256i bb2 = _mm256_set1_epi32(B2.m128i_i32[lane]); __m256i aa0Inc = _mm256_slli_epi32(aa0, 2); __m256i aa1Inc = _mm256_slli_epi32(aa1, 2); __m256i aa2Inc = _mm256_slli_epi32(aa2, 2); __m256i bb0Inc = _mm256_slli_epi32(bb0, 1); __m256i bb1Inc = _mm256_slli_epi32(bb1, 1); __m256i bb2Inc = _mm256_slli_epi32(bb2, 1); __m256i row, col; // Traverse pixels in 2x4 blocks and store 2x4 pixel quad depths contiguously in memory ==> 2*X // This method provides better performance int rowIdx = (startYy * SCREENW + 2 * startXx); col = _mm256_add_epi32(colOffset, _mm256_set1_epi32(startXx)); __m256i aa0Col = _mm256_mullo_epi32(aa0, col); __m256i aa1Col = _mm256_mullo_epi32(aa1, col); __m256i aa2Col = _mm256_mullo_epi32(aa2, col); row = _mm256_add_epi32(rowOffset, _mm256_set1_epi32(startYy)); __m256i bb0Row = _mm256_add_epi32(_mm256_mullo_epi32(bb0, row), _mm256_set1_epi32(C0.m128i_i32[lane])); __m256i bb1Row = _mm256_add_epi32(_mm256_mullo_epi32(bb1, row), 
_mm256_set1_epi32(C1.m128i_i32[lane])); __m256i bb2Row = _mm256_add_epi32(_mm256_mullo_epi32(bb2, row), _mm256_set1_epi32(C2.m128i_i32[lane])); __m256i sum0Row = _mm256_add_epi32(aa0Col, bb0Row); __m256i sum1Row = _mm256_add_epi32(aa1Col, bb1Row); __m256i sum2Row = _mm256_add_epi32(aa2Col, bb2Row); __m256 zx = _mm256_mul_ps(_mm256_cvtepi32_ps(aa1Inc), zz[1]); zx = _mm256_add_ps(zx, _mm256_mul_ps(_mm256_cvtepi32_ps(aa2Inc), zz[2])); // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for (int r = startYy; r < endYy; r += 2, rowIdx += 2 * SCREENW, sum0Row = _mm256_add_epi32(sum0Row, bb0Inc), sum1Row = _mm256_add_epi32(sum1Row, bb1Inc), sum2Row = _mm256_add_epi32(sum2Row, bb2Inc)) { // Compute barycentric coordinates int index = rowIdx; __m256i alpha = sum0Row; __m256i beta = sum1Row; __m256i gama = sum2Row; //Compute barycentric-interpolated depth __m256 depth = zz[0]; depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(beta), zz[1])); depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(gama), zz[2])); __m256i anyOut = _mm256_setzero_si256(); for (int c = startXx; c < endXx; c += 4, index += 8, alpha = _mm256_add_epi32(alpha, aa0Inc), beta = _mm256_add_epi32(beta, aa1Inc), gama = _mm256_add_epi32(gama, aa2Inc), depth = _mm256_add_ps(depth, zx)) { //Test Pixel inside triangle __m256i mask = _mm256_or_si256(_mm256_or_si256(alpha, beta), gama); __m256 previousDepthValue = _mm256_loadu_ps(&pDepthBuffer[index]); __m256 depthMask = _mm256_cmp_ps(depth, previousDepthValue, 0x1D); __m256i finalMask = _mm256_andnot_si256(mask, _mm256_castps_si256(depthMask)); anyOut = _mm256_or_si256(anyOut, finalMask); }//for each column if (!_mm256_testz_si256(anyOut, _mm256_set1_epi32(0x80000000))) { return true; //early exit } }// for each row }// for each triangle }// for each set of SIMD# triangles return false; }
void animate() { float mx; float my; if(ManualControl) { POINT pos; GetCursorPos(&pos); RECT rc; GetClientRect(hMainWnd, &rc); ScreenToClient(hMainWnd, &pos); mx = pos.x; my = pos.y; } else { UpdatePosition(mx, my); } const auto size = partCount; VertexData *pVertexBuffer; pVertexObject->Lock(0, 0, (void**)&pVertexBuffer, D3DLOCK_DISCARD); _mm256_zeroall(); #pragma omp parallel \ shared(pVertexBuffer, particlesCoord, particlesVel, mx, my, size) { #pragma omp for nowait for(int i = 0; i < size; i += 4) { float mouseCoordVec[8] = { mx, my, mx, my, mx, my, mx, my }; float *particleCoordsVec = (float*)particlesCoord + i; float *velocityVec = (float*)particlesVel + i; auto xyCoord = _mm256_loadu_ps(particleCoordsVec); auto hwTempData = _mm256_sub_ps(xyCoord, _mm256_loadu_ps(mouseCoordVec)); auto squares = _mm256_mul_ps(hwTempData, hwTempData); auto distSquare = _mm256_hadd_ps(squares, squares); distSquare = _mm256_shuffle_ps(distSquare, distSquare, 0x50); auto theForce = _mm256_div_ps(_mm256_set1_ps(G), distSquare); if(distSquare.m256_f32[0] < 400) { theForce.m256_f32[0] = 0; theForce.m256_f32[1] = 0; } if(distSquare.m256_f32[2] < 400) { theForce.m256_f32[2] = 0; theForce.m256_f32[3] = 0; } if(distSquare.m256_f32[4] < 400) { theForce.m256_f32[4] = 0; theForce.m256_f32[5] = 0; } if(distSquare.m256_f32[6] < 400) { theForce.m256_f32[6] = 0; theForce.m256_f32[7] = 0; } auto xyForces = _mm256_mul_ps(_mm256_xor_ps(hwTempData, _mm256_set1_ps(-0.f)), theForce); auto xyVelocities = _mm256_loadu_ps(velocityVec); xyVelocities = _mm256_mul_ps(xyVelocities, _mm256_set1_ps(Resistance)); xyVelocities = _mm256_add_ps(xyVelocities, xyForces); xyCoord = _mm256_add_ps(xyCoord, xyVelocities); _mm256_storeu_ps(velocityVec, xyVelocities); _mm256_storeu_ps(particleCoordsVec, xyCoord); processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[0], ((ParticleVel*)velocityVec)[0]); processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[1], ((ParticleVel*)velocityVec)[1]); 
processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[2], ((ParticleVel*)velocityVec)[2]); processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[3], ((ParticleVel*)velocityVec)[3]); pVertexBuffer[i].x = ((ParticleCoord*)particleCoordsVec)[0].x; pVertexBuffer[i].y = ((ParticleCoord*)particleCoordsVec)[0].y; pVertexBuffer[i + 1].x = ((ParticleCoord*)particleCoordsVec)[1].x; pVertexBuffer[i + 1].y = ((ParticleCoord*)particleCoordsVec)[1].y; pVertexBuffer[i + 2].x = ((ParticleCoord*)particleCoordsVec)[2].x; pVertexBuffer[i + 2].y = ((ParticleCoord*)particleCoordsVec)[2].y; pVertexBuffer[i + 3].x = ((ParticleCoord*)particleCoordsVec)[3].x; pVertexBuffer[i + 3].y = ((ParticleCoord*)particleCoordsVec)[3].y; } } pVertexObject->Unlock(); _mm256_zeroall(); }
real * vdwioffsetptr1; __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; real * vdwioffsetptr2; __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; real * vdwioffsetptr3; __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; __m256 velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_ps(fr->epsfac); charge = mdatoms->chargeA;
/*
 * Renders the Mandelbrot set on a 16384 x 16384 grid and writes it to
 * "_simd_avx_iY.ppm" as a binary PPM (P6) image.
 *
 * Rows are handled in groups of eight: the eight Cy values of a group are
 * produced by one AVX multiply and add, then every row of the group is
 * iterated pixel by pixel in scalar double precision.
 *
 * Returns 0 on success, 1 if the output file cannot be opened or written.
 */
int main()
{
    /* screen (integer) coordinate */
    int iX, iY;
    const int iXmax = 16384;
    const int iYmax = 16384;

    /* world (floating point) coordinate = parameter plane */
    /* Cy is widened to an 8-element vector so one AVX register covers 8 rows */
    float Cx, Cy8[8];
    const float CxMin = -2.5;
    const float CxMax = 1.5;
    const float CyMin = -2.0;
    const float CyMax = 2.0;
    float PixelWidth = (CxMax - CxMin) / iXmax;
    float PixelHeight = (CyMax - CyMin) / iYmax;

    /* color component (R or G or B) is coded from 0 to 255 */
    /* it is a 24 bit color RGB file */
    const int MaxColorComponentValue = 255;
    FILE *fp;
    const char *filename = "_simd_avx_iY.ppm";
    static unsigned char color[3];

    /* Z = Zx + Zy*i ; Z0 = 0 */
    double Zx, Zy;
    double Zx2, Zy2; /* Zx2 = Zx*Zx; Zy2 = Zy*Zy */
    int Iteration;
    const int IterationMax = 256;

    /* bail-out value, radius of circle */
    const double EscapeRadius = 2;
    double ER2 = EscapeRadius*EscapeRadius;

    /* create new file, give it a name and open it in binary mode */
    fp = fopen(filename, "wb");
    if (fp == NULL) {
        /* every later call writes through fp, so stop here */
        perror(filename);
        return 1;
    }

    /* write ASCII header to the file */
    fprintf(fp, "P6\n %d\n %d\n %d\n", iXmax, iYmax, MaxColorComponentValue);

    /* broadcast the row step and the row origin to all eight lanes */
    __m256 PixelHeight256 = _mm256_set1_ps(PixelHeight);
    __m256 CyMin256 = _mm256_set1_ps(CyMin);

    /* compute and write image data bytes to the file */
    /* each outer iteration covers eight consecutive rows */
    for (iY = 0; iY < iYmax / 8; iY++) {
        /* row indices of this group */
        float avxIy[8];
        for (int i = 0; i < 8; i++) {
            avxIy[i] = (float)(iY * 8 + i);
        }

        /* Cy = CyMin + iY*PixelHeight for all eight rows at once.
         * Intrinsics replace the former MSVC inline assembly, which only
         * builds for 32-bit x86 and used ymm5 behind the compiler's back. */
        _mm256_storeu_ps(Cy8,
                         _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(avxIy), PixelHeight256),
                                       CyMin256));

        for (int i = 0; i < 8; i++) {
            if (fabs(Cy8[i]) < PixelHeight / 2) {
                Cy8[i] = 0.0; /* Main antenna */
            }
            for (iX = 0; iX < iXmax; iX++) {
                Cx = CxMin + iX*PixelWidth;

                /* initial value of orbit = critical point Z = 0 */
                Zx = 0.0;
                Zy = 0.0;
                Zx2 = Zx*Zx;
                Zy2 = Zy*Zy;

                for (Iteration = 0; Iteration < IterationMax && ((Zx2 + Zy2) < ER2); Iteration++) {
                    Zy = 2 * Zx*Zy + Cy8[i];
                    Zx = Zx2 - Zy2 + Cx;
                    Zx2 = Zx*Zx;
                    Zy2 = Zy*Zy;
                }

                /* compute pixel color (24 bit = 3 bytes) */
                if (Iteration == IterationMax) {
                    /* interior of Mandelbrot set = black */
                    color[0] = 0;
                    color[1] = 0;
                    color[2] = 0;
                } else {
                    /* exterior: color derived from the remaining iteration count */
                    color[0] = ((IterationMax - Iteration) % 8) * 63;  /* Red */
                    color[1] = ((IterationMax - Iteration) % 4) * 127; /* Green */
                    color[2] = ((IterationMax - Iteration) % 2) * 255; /* Blue */
                }

                /* write color to the file */
                fwrite(color, 1, 3, fp);
            }
        }
    }

    /* stdio latches write failures on the stream; report them once here */
    int failed = ferror(fp);
    if (fclose(fp) != 0) {
        failed = 1;
    }
    if (failed) {
        fprintf(stderr, "error writing %s\n", filename);
        return 1;
    }
    return 0;
}
// SIMD Viterbi pass for pairwise HMM-HMM alignment of query `q` (rows i = 1..q->L) against
// template `t` (columns j = 1..t->L). One score / end position is produced per SIMD lane and the
// first `maxres` lanes are copied to `result->score`, `result->i` and `result->j` at the end.
// (The signature is followed by two `#endif`s, so it is presumably one of several
// preprocessor-selected variants whose `#if` lines are not visible here.)
//
// Score storage: only one row of scores is kept. `sMM_DG_MI_GD_IM_vec` holds five vectors per
// column, at index j*5 + {0: MM, 1: DG, 2: MI, 3: GD, 4: IM}. Row 0 is initialised to
// MM = -j*penalty_gap_template and -FLT_MAX for the other states; at the start of every row,
// column 0 is reset to MM = -i*penalty_gap_query and -FLT_MAX for the other states.
//
// Transitions: `q->tr` and `t->tr` are read with a stride of 7 entries per position. Offsets
// +2..+6 of position i-1 (resp. j-1) are used as M2M, M2D, D2M, D2D, I2M, and offsets +0 and +1
// of position i (resp. j) as I2I and M2I.
//
// Per cell (i, j):
//   * sMM is the maximum of `smin` and the five predecessor states of the diagonal cell, plus the
//     profile-profile score log2f4(ScalarProd20Vec(q->p[i], t->p[j])) and `shift` (and
//     ss_score_vec[j] when VITERBI_SS_SCORE is defined). `smin` is 0 when `this->local` is set
//     and -FLT_MAX otherwise.
//   * sGD / sIM are computed from column j-1 of the current row, sDG / sMI from column j of the
//     previous row, each as the maximum of an "open from MM" and an "extend" candidate.
//   * `byte_result_vec` collects the backtrace information: the constants 2..6 label the MM
//     predecessor (MM, GD, IM, DG, MI) and 8/16/32/64 are the flags handed to MAX2_SET_MASK for
//     GD/IM/DG/MI. MAX2 and MAX2_SET_MASK are macros defined elsewhere -- verify their exact
//     update rule there. The low byte of every lane is packed and stored to
//     viterbiMatrix->getRow(i)[j] (8 bytes per column with AVX2, 4 bytes in the SSE branch).
//   * With VITERBI_CELLOFF, the value already stored at getRow(i)[j] is read, shifted right by
//     one and tested against the 0x40 bit of each byte; lanes with that bit set get -FLT_MAX
//     added to all five states.
//
// Best-score tracking: `score_vec`, `i2_vec` and `j2_vec` are updated inside the column loop when
// `local || i == q->L`, and once more after the column loop of every row when `!local`.
// NOTE(review): in that second update `j` has already been incremented past the loop bound, so
// the recorded column is t->L + 1 -- confirm this is what the traceback expects.
// NOTE(review): under VITERBI_SS_SCORE, `t_index` is assigned only for PRED_PRED, DSSP_PRED and
// PRED_DSSP; it is read whenever ss_hmm_mode != NO_SS_INFORMATION -- confirm no other mode exists.
void Viterbi::AlignWithOutCellOff(HMMSimd* q, HMMSimd* t,ViterbiMatrix * viterbiMatrix, int maxres, ViterbiResult* result) #endif #endif { // Linear topology of query (and template) HMM: // 1. The HMM HMM has L+2 columns. Columns 1 to L contain // a match state, a delete state and an insert state each. // 2. The Start state is M0, the virtual match state in column i=0 (j=0). (Therefore X[k][0]=ANY) // This column has only a match state and it has only a transitions to the next match state. // 3. The End state is M(L+1), the virtual match state in column i=L+1.(j=L+1) (Therefore X[k][L+1]=ANY) // Column L has no transitions to the delete state: tr[L][M2D]=tr[L][D2D]=0. // 4. Transitions I->D and D->I are ignored, since they do not appear in PsiBlast alignments // (as long as the gap opening penalty d is higher than the best match score S(a,b)). // Pairwise alignment of two HMMs: // 1. Pair-states for the alignment of two HMMs are // MM (Q:Match T:Match) , GD (Q:Gap T:Delete), IM (Q:Insert T:Match), DG (Q:Delelte, T:Match) , MI (Q:Match T:Insert) // 2. Transitions are allowed only between the MM-state and each of the four other states. // Saving space: // The best score ending in pair state XY sXY[i][j] is calculated from left to right (j=1->t->L) // and top to bottom (i=1->q->L). To save space, only the last row of scores calculated is kept in memory. // (The backtracing matrices are kept entirely in memory [O(t->L*q->L)]). // When the calculation has proceeded up to the point where the scores for cell (i,j) are caculated, // sXY[i-1][j'] = sXY[j'] for j'>=j (A below) // sXY[i][j'] = sXY[j'] for j'<j (B below) // sXY[i-1][j-1]= sXY_i_1_j_1 (C below) // sXY[i][j] = sXY_i_j (D below) // j-1 // j // i-1: CAAAAAAAAAAAAAAAAAA // i : BBBBBBBBBBBBBD // Variable declarations const float smin = (this->local ? 
0 : -FLT_MAX); //used to distinguish between SW and NW algorithms in maximization const simd_float smin_vec = simdf32_set(smin); const simd_float shift_vec = simdf32_set(shift); // const simd_float one_vec = simdf32_set(1); // 00000001 const simd_int mm_vec = simdi32_set(2); //MM 00000010 const simd_int gd_vec = simdi32_set(3); //GD 00000011 const simd_int im_vec = simdi32_set(4); //IM 00000100 const simd_int dg_vec = simdi32_set(5); //DG 00000101 const simd_int mi_vec = simdi32_set(6); //MI 00000110 const simd_int gd_mm_vec = simdi32_set(8); // 00001000 const simd_int im_mm_vec = simdi32_set(16);// 00010000 const simd_int dg_mm_vec = simdi32_set(32);// 00100000 const simd_int mi_mm_vec = simdi32_set(64);// 01000000 #ifdef VITERBI_SS_SCORE HMM * q_s = q->GetHMM(0); const unsigned char * t_index; if(ss_hmm_mode == HMM::PRED_PRED || ss_hmm_mode == HMM::DSSP_PRED ){ t_index = t->pred_index; }else if(ss_hmm_mode == HMM::PRED_DSSP){ t_index = t->dssp_index; } simd_float * ss_score_vec = (simd_float *) ss_score; #endif #ifdef AVX2 const simd_int shuffle_mask_extract = _mm256_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1); #endif #ifdef VITERBI_CELLOFF #ifdef AVX2 const __m128i tmp_vec = _mm_set_epi32(0x40000000,0x00400000,0x00004000,0x00000040);//01000000010000000100000001000000 const simd_int co_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_vec), tmp_vec, 1); const simd_int float_min_vec = (simd_int) _mm256_set1_ps(-FLT_MAX); const simd_int shuffle_mask_celloff = _mm256_set_epi8( 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); #else // SSE case const simd_int tmp_vec = simdi32_set4(0x40000000,0x00400000,0x00004000,0x00000040); const simd_int co_vec = tmp_vec; const simd_int float_min_vec = (simd_int) simdf32_set(-FLT_MAX); #endif #endif // AVX2 end int i,j; //query and template match state indices simd_int 
i2_vec = simdi32_set(0); simd_int j2_vec = simdi32_set(0); simd_float sMM_i_j = simdf32_set(0); simd_float sMI_i_j,sIM_i_j,sGD_i_j,sDG_i_j; simd_float Si_vec; simd_float sMM_i_1_j_1; simd_float sMI_i_1_j_1; simd_float sIM_i_1_j_1; simd_float sGD_i_1_j_1; simd_float sDG_i_1_j_1; simd_float score_vec = simdf32_set(-FLT_MAX); simd_int byte_result_vec = simdi32_set(0); // Initialization of top row, i.e. cells (0,j) for (j=0; j <= t->L; ++j) { const unsigned int index_pos_j = j * 5; sMM_DG_MI_GD_IM_vec[index_pos_j + 0] = simdf32_set(-j*penalty_gap_template); sMM_DG_MI_GD_IM_vec[index_pos_j + 1] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 2] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 3] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 4] = simdf32_set(-FLT_MAX); } // Viterbi algorithm const int queryLength = q->L; for (i=1; i <= queryLength; ++i) // Loop through query positions i { // If q is compared to t, exclude regions where overlap of q with t < min_overlap residues // Initialize cells sMM_i_1_j_1 = simdf32_set(-(i - 1) * penalty_gap_query); // initialize at (i-1,0) sIM_i_1_j_1 = simdf32_set(-FLT_MAX); // initialize at (i-1,jmin-1) sMI_i_1_j_1 = simdf32_set(-FLT_MAX); sDG_i_1_j_1 = simdf32_set(-FLT_MAX); sGD_i_1_j_1 = simdf32_set(-FLT_MAX); // initialize at (i,jmin-1) const unsigned int index_pos_i = 0 * 5; sMM_DG_MI_GD_IM_vec[index_pos_i + 0] = simdf32_set(-i * penalty_gap_query); // initialize at (i,0) sMM_DG_MI_GD_IM_vec[index_pos_i + 1] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 2] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 3] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 4] = simdf32_set(-FLT_MAX); #ifdef AVX2 unsigned long long * sCO_MI_DG_IM_GD_MM_vec = (unsigned long long *) viterbiMatrix->getRow(i); #else unsigned int *sCO_MI_DG_IM_GD_MM_vec = (unsigned int *) viterbiMatrix->getRow(i); #endif const unsigned int start_pos_tr_i_1 = (i - 1) * 7; const unsigned int 
start_pos_tr_i = (i) * 7; const simd_float q_m2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 2)); // M2M const simd_float q_m2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 3)); // M2D const simd_float q_d2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 4)); // D2M const simd_float q_d2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 5)); // D2D const simd_float q_i2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 6)); // I2m const simd_float q_i2i = simdf32_load((float *) (q->tr + start_pos_tr_i)); // I2I const simd_float q_m2i = simdf32_load((float *) (q->tr + start_pos_tr_i + 1)); // M2I // Find maximum score; global alignment: maxize only over last row and last column const bool findMaxInnerLoop = (local || i == queryLength); const int targetLength = t->L; #ifdef VITERBI_SS_SCORE if(ss_hmm_mode == HMM::NO_SS_INFORMATION){ // set all to log(1.0) = 0.0 memset(ss_score, 0, (targetLength+1)*VECSIZE_FLOAT*sizeof(float)); }else { const float * score; if(ss_hmm_mode == HMM::PRED_PRED){ score = &S33[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0][0]; }else if (ss_hmm_mode == HMM::DSSP_PRED){ score = &S73[ (int)q_s->ss_dssp[i]][0][0]; }else{ score = &S37[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0]; } // access SS scores and write them to the ss_score array for (j = 0; j <= (targetLength*VECSIZE_FLOAT); j++) // Loop through template positions j { ss_score[j] = ssw * score[t_index[j]]; } } #endif for (j=1; j <= targetLength; ++j) // Loop through template positions j { simd_int index_vec; simd_int res_gt_vec; // cache line optimized reading const unsigned int start_pos_tr_j_1 = (j-1) * 7; const unsigned int start_pos_tr_j = (j) * 7; const simd_float t_m2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+2)); // M2M const simd_float t_m2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+3)); // M2D const simd_float t_d2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+4)); // D2M const simd_float t_d2d = simdf32_load((float *) 
(t->tr+start_pos_tr_j_1+5)); // D2D const simd_float t_i2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+6)); // I2m const simd_float t_i2i = simdf32_load((float *) (t->tr+start_pos_tr_j)); // I2i const simd_float t_m2i = simdf32_load((float *) (t->tr+start_pos_tr_j+1)); // M2I // Find max value // CALCULATE_MAX6( sMM_i_j, // smin, // sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M], // sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M], // sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M], // sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M], // sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M], // bMM[i][j] // ); // same as sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M] simd_float mm_m2m_m2m_vec = simdf32_add( simdf32_add(sMM_i_1_j_1, q_m2m), t_m2m); // if mm > min { 2 } res_gt_vec = (simd_int)simdf32_gt(mm_m2m_m2m_vec, smin_vec); byte_result_vec = simdi_and(res_gt_vec, mm_vec); sMM_i_j = simdf32_max(smin_vec, mm_m2m_m2m_vec); // same as sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M] simd_float gd_m2m_d2m_vec = simdf32_add( simdf32_add(sGD_i_1_j_1, q_m2m), t_d2m); // if gd > max { 3 } res_gt_vec = (simd_int)simdf32_gt(gd_m2m_d2m_vec, sMM_i_j); index_vec = simdi_and( res_gt_vec, gd_vec); byte_result_vec = simdi_or( index_vec, byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, gd_m2m_d2m_vec); // same as sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M] simd_float im_m2m_d2m_vec = simdf32_add( simdf32_add(sIM_i_1_j_1, q_i2m), t_m2m); // if im > max { 4 } MAX2(im_m2m_d2m_vec, sMM_i_j, im_vec,byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, im_m2m_d2m_vec); // same as sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M] simd_float dg_m2m_d2m_vec = simdf32_add( simdf32_add(sDG_i_1_j_1, q_d2m), t_m2m); // if dg > max { 5 } MAX2(dg_m2m_d2m_vec, sMM_i_j, dg_vec,byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, dg_m2m_d2m_vec); // same as sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M], simd_float mi_m2m_d2m_vec = simdf32_add( simdf32_add(sMI_i_1_j_1, q_m2m), t_i2m); // if mi > max { 6 
} MAX2(mi_m2m_d2m_vec, sMM_i_j, mi_vec, byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, mi_m2m_d2m_vec); // TODO add secondary structure score // calculate amino acid profile-profile scores Si_vec = log2f4(ScalarProd20Vec((simd_float *) q->p[i],(simd_float *) t->p[j])); #ifdef VITERBI_SS_SCORE Si_vec = simdf32_add(ss_score_vec[j], Si_vec); #endif Si_vec = simdf32_add(Si_vec, shift_vec); sMM_i_j = simdf32_add(sMM_i_j, Si_vec); //+ ScoreSS(q,t,i,j) + shift + (Sstruc==NULL? 0: Sstruc[i][j]); const unsigned int index_pos_j = (j * 5); const unsigned int index_pos_j_1 = (j - 1) * 5; const simd_float sMM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 0)); const simd_float sGD_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 3)); const simd_float sIM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 4)); const simd_float sMM_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 0)); const simd_float sDG_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 1)); const simd_float sMI_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 2)); sMM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 0)); sDG_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 1)); sMI_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 2)); sGD_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 3)); sIM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 4)); // sGD_i_j = max2 // ( // sMM[j-1] + t->tr[j-1][M2D], // MM->GD gap opening in query // sGD[j-1] + t->tr[j-1][D2D], // GD->GD gap extension in query // bGD[i][j] // ); //sMM_DG_GD_MI_IM_vec simd_float mm_gd_vec = simdf32_add(sMM_j_1, t_m2d); // MM->GD gap opening in query simd_float gd_gd_vec = simdf32_add(sGD_j_1, t_d2d); // GD->GD gap extension in query // if mm_gd > gd_dg { 8 } MAX2_SET_MASK(mm_gd_vec, gd_gd_vec,gd_mm_vec, byte_result_vec); sGD_i_j = simdf32_max( 
mm_gd_vec, gd_gd_vec ); // sIM_i_j = max2 // ( // sMM[j-1] + q->tr[i][M2I] + t->tr[j-1][M2M] , // sIM[j-1] + q->tr[i][I2I] + t->tr[j-1][M2M], // IM->IM gap extension in query // bIM[i][j] // ); simd_float mm_mm_vec = simdf32_add(simdf32_add(sMM_j_1, q_m2i), t_m2m); simd_float im_im_vec = simdf32_add(simdf32_add(sIM_j_1, q_i2i), t_m2m); // IM->IM gap extension in query // if mm_mm > im_im { 16 } MAX2_SET_MASK(mm_mm_vec,im_im_vec, im_mm_vec, byte_result_vec); sIM_i_j = simdf32_max( mm_mm_vec, im_im_vec ); // sDG_i_j = max2 // ( // sMM[j] + q->tr[i-1][M2D], // sDG[j] + q->tr[i-1][D2D], //gap extension (DD) in query // bDG[i][j] // ); simd_float mm_dg_vec = simdf32_add(sMM_j, q_m2d); simd_float dg_dg_vec = simdf32_add(sDG_j, q_d2d); //gap extension (DD) in query // if mm_dg > dg_dg { 32 } MAX2_SET_MASK(mm_dg_vec,dg_dg_vec, dg_mm_vec, byte_result_vec); sDG_i_j = simdf32_max( mm_dg_vec , dg_dg_vec ); // sMI_i_j = max2 // ( // sMM[j] + q->tr[i-1][M2M] + t->tr[j][M2I], // MM->MI gap opening M2I in template // sMI[j] + q->tr[i-1][M2M] + t->tr[j][I2I], // MI->MI gap extension I2I in template // bMI[i][j] // ); simd_float mm_mi_vec = simdf32_add( simdf32_add(sMM_j, q_m2m), t_m2i); // MM->MI gap opening M2I in template simd_float mi_mi_vec = simdf32_add( simdf32_add(sMI_j, q_m2m), t_i2i); // MI->MI gap extension I2I in template // if mm_mi > mi_mi { 64 } MAX2_SET_MASK(mm_mi_vec, mi_mi_vec,mi_mm_vec, byte_result_vec); sMI_i_j = simdf32_max( mm_mi_vec, mi_mi_vec ); // Cell of logic // if (cell_off[i][j]) //shift 10000000100000001000000010000000 -> 01000000010000000100000001000000 //because 10000000000000000000000000000000 = -2147483648 kills cmplt #ifdef VITERBI_CELLOFF #ifdef AVX2 simd_int matrix_vec = _mm256_set1_epi64x(sCO_MI_DG_IM_GD_MM_vec[j]>>1); matrix_vec = _mm256_shuffle_epi8(matrix_vec,shuffle_mask_celloff); #else // if(((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x40404040) > 0){ // std::cout << ((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x40404040 ) << std::endl; // } simd_int 
matrix_vec = simdi32_set(sCO_MI_DG_IM_GD_MM_vec[j]>>1); #endif simd_int cell_off_vec = simdi_and(matrix_vec, co_vec); simd_int res_eq_co_vec = simdi32_gt(co_vec, cell_off_vec ); // shift is because signed can't be checked here simd_float cell_off_float_min_vec = (simd_float) simdi_andnot(res_eq_co_vec, float_min_vec); // inverse sMM_i_j = simdf32_add(sMM_i_j,cell_off_float_min_vec); // add the cell off vec to sMM_i_j. Set -FLT_MAX to cell off sGD_i_j = simdf32_add(sGD_i_j,cell_off_float_min_vec); sIM_i_j = simdf32_add(sIM_i_j,cell_off_float_min_vec); sDG_i_j = simdf32_add(sDG_i_j,cell_off_float_min_vec); sMI_i_j = simdf32_add(sMI_i_j,cell_off_float_min_vec); #endif simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 0), sMM_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 1), sDG_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 2), sMI_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 3), sGD_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 4), sIM_i_j); // write values back to ViterbiMatrix #ifdef AVX2 /* byte_result_vec 000H 000G 000F 000E 000D 000C 000B 000A */ /* abcdefgh 0000 0000 HGFE 0000 0000 0000 0000 DCBA */ const __m256i abcdefgh = _mm256_shuffle_epi8(byte_result_vec, shuffle_mask_extract); /* abcd 0000 0000 0000 DCBA */ const __m128i abcd = _mm256_castsi256_si128(abcdefgh); /* efgh 0000 0000 HGFE 0000 */ const __m128i efgh = _mm256_extracti128_si256(abcdefgh, 1); _mm_storel_epi64((__m128i*)&sCO_MI_DG_IM_GD_MM_vec[j], _mm_or_si128(abcd, efgh)); #elif defined(SSE) byte_result_vec = _mm_packs_epi32(byte_result_vec, byte_result_vec); byte_result_vec = _mm_packus_epi16(byte_result_vec, byte_result_vec); int int_result = _mm_cvtsi128_si32(byte_result_vec); sCO_MI_DG_IM_GD_MM_vec[j] = int_result; #endif // Find maximum score; global alignment: maxize only over last row and last column // if(sMM_i_j>score && (par.loc || i==q->L)) { i2=i; j2=j; score=sMM_i_j; } if (findMaxInnerLoop){ // new score 
is higer // output // 0 0 0 MAX simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec); simd_int lookup_mask_lo = simdi_andnot(lookup_mask_hi,simdi32_set(-1)); //simd_int lookup_mask_lo = (simd_int) simdf32_gt(score_vec,sMM_i_j); // old score is higher // output // MAX MAX MAX 0 //simd_int lookup_mask_lo = (simd_int) simdf32_lt(sMM_i_j,score_vec); simd_int curr_pos_j = simdi32_set(j); simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j); simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec); j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo); simd_int curr_pos_i = simdi32_set(i); simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i); simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec); i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo); score_vec=simdf32_max(sMM_i_j,score_vec); // printf("%d %d ",i, j); // for(int seq_index=0; seq_index < maxres; seq_index++){ // printf("(%d %d %d %.3f %.3f %d %d)\t", seq_index, ((int*)&lookup_mask_hi)[seq_index], ((int*)&lookup_mask_lo)[seq_index], ((float*)&sMM_i_j)[seq_index], ((float*)&score_vec)[seq_index], // ((int*)&i2_vec)[seq_index], ((int*)&j2_vec)[seq_index]); // } // printf("\n"); } } //end for j // if global alignment: look for best cell in last column if (!local){ // new score is higer // output // 0 0 0 MAX simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec); // simd_int lookup_mask_lo; simd_int lookup_mask_lo = simdi_andnot(lookup_mask_hi,simdi32_set(-1)); // old score is higher // output // MAX MAX MAX 0 simd_int curr_pos_j = simdi32_set(j); simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j); simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec); j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo); simd_int curr_pos_i = simdi32_set(i); simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i); simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec); i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo); score_vec = simdf32_max(sMM_i_j,score_vec); } // end for j } 
// end for i for(int seq_index=0; seq_index < maxres; seq_index++){ result->score[seq_index]=((float*)&score_vec)[seq_index]; result->i[seq_index] = ((int*)&i2_vec)[seq_index]; result->j[seq_index] = ((int*)&j2_vec)[seq_index]; // std::cout << seq_index << "\t" << result->score[seq_index] << "\t" << result->i[seq_index] <<"\t" << result->j[seq_index] << std::endl; } // printf("Template=%-12.12s i=%-4i j=%-4i score=%6.3f\n",t->name,i2,j2,score); }
real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH; real scratch[4*DIM]; __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256 velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256 one_sixth = _mm256_set1_ps(1.0/6.0); __m256 one_twelfth = _mm256_set1_ps(1.0/12.0); __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; real rswitch_scalar,d_scalar; __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift;
real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH; real scratch[4*DIM]; __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256 velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256 one_sixth = _mm256_set1_ps(1.0/6.0); __m256 one_twelfth = _mm256_set1_ps(1.0/12.0); __m256i ewitab; __m128i ewitab_lo,ewitab_hi; __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3; real *ewtab; __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr;
void fastGEMM( const float* aptr, size_t astep, const float* bptr, size_t bstep, float* cptr, size_t cstep, int ma, int na, int nb ) { int n = 0; for( ; n <= nb - 16; n += 16 ) { for( int m = 0; m < ma; m += 4 ) { const float* aptr0 = aptr + astep*m; const float* aptr1 = aptr + astep*std::min(m+1, ma-1); const float* aptr2 = aptr + astep*std::min(m+2, ma-1); const float* aptr3 = aptr + astep*std::min(m+3, ma-1); float* cptr0 = cptr + cstep*m; float* cptr1 = cptr + cstep*std::min(m+1, ma-1); float* cptr2 = cptr + cstep*std::min(m+2, ma-1); float* cptr3 = cptr + cstep*std::min(m+3, ma-1); __m256 d00 = _mm256_setzero_ps(), d01 = _mm256_setzero_ps(); __m256 d10 = _mm256_setzero_ps(), d11 = _mm256_setzero_ps(); __m256 d20 = _mm256_setzero_ps(), d21 = _mm256_setzero_ps(); __m256 d30 = _mm256_setzero_ps(), d31 = _mm256_setzero_ps(); for( int k = 0; k < na; k++ ) { __m256 a0 = _mm256_set1_ps(aptr0[k]); __m256 a1 = _mm256_set1_ps(aptr1[k]); __m256 a2 = _mm256_set1_ps(aptr2[k]); __m256 a3 = _mm256_set1_ps(aptr3[k]); __m256 b0 = _mm256_loadu_ps(bptr + k*bstep + n); __m256 b1 = _mm256_loadu_ps(bptr + k*bstep + n + 8); d00 = _mm256_fmadd_ps(a0, b0, d00); d01 = _mm256_fmadd_ps(a0, b1, d01); d10 = _mm256_fmadd_ps(a1, b0, d10); d11 = _mm256_fmadd_ps(a1, b1, d11); d20 = _mm256_fmadd_ps(a2, b0, d20); d21 = _mm256_fmadd_ps(a2, b1, d21); d30 = _mm256_fmadd_ps(a3, b0, d30); d31 = _mm256_fmadd_ps(a3, b1, d31); } _mm256_storeu_ps(cptr0 + n, d00); _mm256_storeu_ps(cptr0 + n + 8, d01); _mm256_storeu_ps(cptr1 + n, d10); _mm256_storeu_ps(cptr1 + n + 8, d11); _mm256_storeu_ps(cptr2 + n, d20); _mm256_storeu_ps(cptr2 + n + 8, d21); _mm256_storeu_ps(cptr3 + n, d30); _mm256_storeu_ps(cptr3 + n + 8, d31); } } for( ; n < nb; n++ ) { for( int m = 0; m < ma; m++ ) { const float* aptr0 = aptr + astep*m; float* cptr0 = cptr + cstep*m; float d0 = 0.f; for( int k = 0; k < na; k++ ) d0 += aptr0[k]*bptr[k*bstep + n]; cptr0[n] = d0; } } _mm256_zeroupper(); }
real * vdwioffsetptr0; __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; real * vdwioffsetptr1; __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; real * vdwioffsetptr2; __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m256 velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_ps(fr->ic->epsfac); charge = mdatoms->chargeA; krf = _mm256_set1_ps(fr->ic->k_rf);
real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH; real scratch[4*DIM]; __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256 velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m256 one_sixth = _mm256_set1_ps(1.0/6.0); __m256 one_twelfth = _mm256_set1_ps(1.0/12.0); __m256i ewitab; __m128i ewitab_lo,ewitab_hi; __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3; real *ewtab; __m256 rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; real rswitch_scalar,d_scalar; __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0];
/*-------------------------------------------------------------------------*/ /** @file FieldDataCPU.cpp */ /*--------------------------------------------------------------------------*/ #include "FieldDataCPU.h" #include "ShapeFunctions.h" #include "vec_funcs.h" #include "PlasmaData.h" #include "ParallelInfo.h" #include "mpi.h" #include <immintrin.h> #include <omp.h> #if !(defined NO_HAND_VEC) const __m256 float_1 = _mm256_set1_ps(1.0); const __m256 float_15 = _mm256_set1_ps(1.5); const __m256 float_075 = _mm256_set1_ps(0.75); const __m256 float_05 = _mm256_set1_ps(0.5); const __m256 sign_mask = (__m256)_mm256_set1_epi32(0x7fffffff); #endif __inline__ int zorder(int ix,int iy,int iz) { // Spread the bits of each index
int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH; real scratch[4*DIM]; __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256 velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_ps(fr->epsfac); charge = mdatoms->chargeA; krf = _mm256_set1_ps(fr->ic->k_rf);
int main() { //Variaveis para medir o tempo clock_t begin, end; double time_spent; begin = clock(); /* screen ( integer) coordinate */ int iX, iY; const int iXmax = 16384; const int iYmax = 16384; /* world ( double) coordinate = parameter plane*/ float _Cx[8], Cy; const float CxMin = -2.5; const float CxMax = 1.5; const float CyMin = -2.0; const float CyMax = 2.0; /* */ float PixelWidth = (CxMax - CxMin) / iXmax; float PixelHeight = (CyMax - CyMin) / iYmax; /* color component ( R or G or B) is coded from 0 to 255 */ /* it is 24 bit color RGB file */ const int MaxColorComponentValue = 255; FILE * fp; char *filename = "mandelbrot.ppm"; static unsigned char color[3]; //int vetorCoresAux[8]; /* Z=Zx+Zy*i ; Z0 = 0 */ float Zx, Zy; float Zx2, Zy2; /* Zx2=Zx*Zx; Zy2=Zy*Zy */ /* */ int Iteration[8]; //float IterationF[8]; const int IterationMax = 256; //const float IterationMaxF = 256.0; /* bail-out value , radius of circle ; */ const float EscapeRadius = 2; float ER2 = EscapeRadius*EscapeRadius; /*create new file,give it a name and open it in binary mode */ fp = fopen(filename, "wb"); /* b - binary mode */ /*write ASCII header to the file*/ fprintf(fp, "P6\n %d\n %d\n %d\n", iXmax, iYmax, MaxColorComponentValue); // Funcao intrinseca: carrega float em variável para AVX (256 bits) __m256 PixelWidth256 = _mm256_set1_ps(PixelWidth); __m256 CxMin256 = _mm256_set1_ps(CxMin); __m256 IterationMax256 = _mm256_set1_ps(IterationMax); for (iY = 0; iY<iYmax; iY++){ Cy = CyMin + iY*PixelHeight; if (fabs(Cy)< PixelHeight / 2) Cy = 0.0; /* Main antenna */ for (iX = 0; iX<iXmax / 8; iX++){ float avxIx[8]; for (int i = 0; i < 8; i++) avxIx[i] = iX * 8.0 + i; _asm{ vmovups ymm5, avxIx vmulps ymm5, ymm5, PixelWidth256 vaddps ymm5, ymm5, CxMin256 // ymm5 = CxMin + iX*PixelWidth vmovups _Cx, ymm5 } for (int i = 0; i < 8; i++){ Zx = 0; Zy = 0; Zx2 = 0; Zy2 = 0; for (Iteration[i] = 0; Iteration[i] < IterationMax && ((Zx2 + Zy2) < ER2); Iteration[i]++){ Zy = 2 * Zx * Zy + Cy; Zx = Zx2 - Zy2 + 
_Cx[i]; Zx2 = Zx * Zx; Zy2 = Zy * Zy; } if (Iteration[i] == IterationMax){ /* interior of Mandelbrot set = black */ color[0] = 0; color[1] = 0; color[2] = 0; } else{ /* exterior of Mandelbrot set = white */ color[0] = ((IterationMax - Iteration[i]) % 8) * 63; /* Red */ color[1] = ((IterationMax - Iteration[i]) % 4) * 127; /* Green */ color[2] = ((IterationMax - Iteration[i]) % 2) * 255; /* Blue */ }; fwrite(color, 1, 3, fp); } } } fclose(fp); end = clock(); time_spent = (double)(end - begin) / CLOCKS_PER_SEC;; printf("%f", time_spent); return 0; }
__m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256 velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m256i vfitab; __m128i vfitab_lo,vfitab_hi; __m128i ifour = _mm_set1_epi32(4); __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; real *vftab; __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_ps(fr->epsfac); charge = mdatoms->chargeA;
__m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; real * vdwioffsetptr0; __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H; __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m256 velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m256i ewitab; __m128i ewitab_lo,ewitab_hi; __m256 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; __m256 beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3; real *ewtab; __m256 dummy_mask,cutoff_mask; __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) ); __m256 one = _mm256_set1_ps(1.0); __m256 two = _mm256_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm256_set1_ps(fr->epsfac); charge = mdatoms->chargeA;