void UpResidual_GMRF_5() { exchsolution_gmrfData_5(0); for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { /* Statements in this Scop: S141 */ for (int i0 = iterationOffsetBegin[0][1]; (i0<=(iterationOffsetEnd[0][1]+32)); i0 += 1) { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][(i0*35)]); double* fieldData_Residual_GMRF_5_p1 = (&fieldData_Residual_GMRF[5][(i0*35)]); double* fieldData_Solution_GMRF_5_p1 = (&fieldData_Solution_GMRF[5][(i0*35)]); double* fieldData_RHS_GMRF_5_p1 = (&fieldData_RHS_GMRF[5][(i0*33)]); int i1 = (iterationOffsetBegin[0][0]+i0); for (; (i1<(((iterationOffsetBegin[0][0]+i0)+1)&(~1))); i1 += 1) { fieldData_Residual_GMRF_5_p1[(i1+38)] = (fieldData_RHS_GMRF_5_p1[i1]-(((((((((fieldData_LaplaceCoeff_GMRF_5_p1[(i1+38)]*fieldData_Solution_GMRF_5_p1[(i1+38)])+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+1298)]*fieldData_Solution_GMRF_5_p1[(i1+39)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+2558)]*fieldData_Solution_GMRF_5_p1[(i1+37)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+3818)]*fieldData_Solution_GMRF_5_p1[(i1+74)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+5078)]*fieldData_Solution_GMRF_5_p1[(i1+2)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+6338)]*fieldData_Solution_GMRF_5_p1[(i1+1)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+7598)]*fieldData_Solution_GMRF_5_p1[(i1+73)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+8858)]*fieldData_Solution_GMRF_5_p1[(i1+3)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+10118)]*fieldData_Solution_GMRF_5_p1[(i1+75)]))); } for (; (i1<((iterationOffsetEnd[0][0]+i0)+30)); i1 += 4) { /* fieldData_Residual_GMRF_5_p1[(i1+38)] = 
(fieldData_RHS_GMRF_5_p1[i1]-(((((((((fieldData_LaplaceCoeff_GMRF_5_p1[(i1+38)]*fieldData_Solution_GMRF_5_p1[(i1+38)])+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+1298)]*fieldData_Solution_GMRF_5_p1[(i1+39)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+2558)]*fieldData_Solution_GMRF_5_p1[(i1+37)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+3818)]*fieldData_Solution_GMRF_5_p1[(i1+74)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+5078)]*fieldData_Solution_GMRF_5_p1[(i1+2)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+6338)]*fieldData_Solution_GMRF_5_p1[(i1+1)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+7598)]*fieldData_Solution_GMRF_5_p1[(i1+73)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+8858)]*fieldData_Solution_GMRF_5_p1[(i1+3)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+10118)]*fieldData_Solution_GMRF_5_p1[(i1+75)]))); */ __m128d vec0 = _mm_loadu_pd((&fieldData_RHS_GMRF_5_p1[i1])); __m128d vec0_2 = _mm_loadu_pd((&fieldData_RHS_GMRF_5_p1[(i1+2)])); __m128d vec1 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+10118)])); __m128d vec1_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+10120)])); __m128d vec2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+75)])); __m128d vec2_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+77)])); __m128d vec3 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+8858)])); __m128d vec3_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+8860)])); __m128d vec4 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+3)])); __m128d vec4_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+5)])); __m128d vec5 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+7598)])); __m128d vec5_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+7600)])); __m128d vec6 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+73)])); __m128d vec6_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+75)])); __m128d vec7 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+6338)])); __m128d vec7_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+6340)])); __m128d vec8 
= _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+1)])); __m128d vec8_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+3)])); __m128d vec9 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+5078)])); __m128d vec9_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+5080)])); __m128d vec10 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+2)])); __m128d vec10_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+4)])); __m128d vec11 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+3818)])); __m128d vec11_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+3820)])); __m128d vec12 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+74)])); __m128d vec12_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+76)])); __m128d vec13 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+2558)])); __m128d vec13_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+2560)])); __m128d vec14 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+37)])); __m128d vec14_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+39)])); __m128d vec15 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+38)])); __m128d vec15_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+40)])); __m128d vec16 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+38)])); __m128d vec16_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+40)])); __m128d vec17 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+1298)])); __m128d vec17_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+1300)])); __m128d vec18 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+39)])); __m128d vec18_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+41)])); __m128d vec19; __m128d vec19_2; vec19 = _mm_sub_pd(vec0, _mm_add_pd(_mm_mul_pd(vec1, vec2), _mm_add_pd(_mm_mul_pd(vec3, vec4), _mm_add_pd(_mm_mul_pd(vec5, vec6), _mm_add_pd(_mm_mul_pd(vec7, vec8), _mm_add_pd(_mm_mul_pd(vec9, vec10), _mm_add_pd(_mm_mul_pd(vec11, vec12), _mm_add_pd(_mm_mul_pd(vec13, vec14), _mm_add_pd(_mm_mul_pd(vec15, vec16), 
_mm_mul_pd(vec17, vec18)))))))))); vec19_2 = _mm_sub_pd(vec0_2, _mm_add_pd(_mm_mul_pd(vec1_2, vec2_2), _mm_add_pd(_mm_mul_pd(vec3_2, vec4_2), _mm_add_pd(_mm_mul_pd(vec5_2, vec6_2), _mm_add_pd(_mm_mul_pd(vec7_2, vec8_2), _mm_add_pd(_mm_mul_pd(vec9_2, vec10_2), _mm_add_pd(_mm_mul_pd(vec11_2, vec12_2), _mm_add_pd(_mm_mul_pd(vec13_2, vec14_2), _mm_add_pd(_mm_mul_pd(vec15_2, vec16_2), _mm_mul_pd(vec17_2, vec18_2)))))))))); _mm_storeu_pd((&fieldData_Residual_GMRF_5_p1[(i1+38)]), vec19); _mm_storeu_pd((&fieldData_Residual_GMRF_5_p1[(i1+40)]), vec19_2); } for (; (i1<((iterationOffsetEnd[0][0]+i0)+33)); i1 += 1) { fieldData_Residual_GMRF_5_p1[(i1+38)] = (fieldData_RHS_GMRF_5_p1[i1]-(((((((((fieldData_LaplaceCoeff_GMRF_5_p1[(i1+38)]*fieldData_Solution_GMRF_5_p1[(i1+38)])+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+1298)]*fieldData_Solution_GMRF_5_p1[(i1+39)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+2558)]*fieldData_Solution_GMRF_5_p1[(i1+37)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+3818)]*fieldData_Solution_GMRF_5_p1[(i1+74)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+5078)]*fieldData_Solution_GMRF_5_p1[(i1+2)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+6338)]*fieldData_Solution_GMRF_5_p1[(i1+1)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+7598)]*fieldData_Solution_GMRF_5_p1[(i1+73)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+8858)]*fieldData_Solution_GMRF_5_p1[(i1+3)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+10118)]*fieldData_Solution_GMRF_5_p1[(i1+75)]))); } } } } }
/*
 * m2l_along_z
 *
 * Builds 16-double output blocks in scr1 from fr-weighted sums of 16-double
 * input blocks in scr2, scaled by entries of sg. (Presumably a
 * multipole-to-local translation along z, going by the name -- confirm
 * against the caller.)
 *
 * Parameters
 *   nmultipoles  highest order handled; loops run over 0..nmultipoles.
 *   scr1         output buffer, written in 16-double blocks.
 *   scr2         input buffer, read in 16-double blocks.
 *   d2           not referenced by this routine.
 *   fr           scalar weights; indices up to 2*nmultipoles are read.
 *   sg           scalar scale factors; indices 1..2*nmultipoles are read.
 *
 * scr1 and scr2 are accessed with _mm_load_pd/_mm_store_pd at even element
 * offsets, so both must be 16-byte aligned.
 *
 * Within a block, the four doubles at input offset 8 are written to output
 * offset 0 and the four at input offset 0 to output offset 8.
 */
void m2l_along_z(long long nmultipoles, double *scr1, double *scr2, double *d2, double *fr, double *sg)
{
    int mmmm, mmm, mm, m;
    int i, j, k, l, n, nn;
    __m128d reg00, reg01, reg02, reg03;
    __m128d reg04, reg05, reg06, reg07;
    __m128d reg08, reg09, reg10, reg11;
    __m128d reg12, reg13, reg14, reg15;
    __m128d reg18; /* broadcast scalar (fr[..] or sg[..]) */
    const __m128d regzero = _mm_setzero_pd();

    (void)d2; /* part of the interface, unused here */

    /* ---- first output block: unscaled sum over j = 0..nmultipoles ---- */
    /* All four accumulators used below must start at zero. The previous
     * code zeroed reg10/reg11 instead of reg12/reg13, so scr1[0..3] was
     * computed from uninitialized registers. */
    reg08 = regzero;
    reg09 = regzero;
    reg12 = regzero;
    reg13 = regzero;

    i = -15;
    for (j = 0; j <= nmultipoles; ++j) {
        i += 16;
        reg00 = _mm_load_pd(&scr2[i - 1]);
        reg01 = _mm_load_pd(&scr2[i + 1]);
        reg04 = _mm_load_pd(&scr2[i + 7]);
        reg05 = _mm_load_pd(&scr2[i + 9]);
        reg18 = _mm_load1_pd(&fr[j]);
        reg08 = _mm_add_pd(reg08, _mm_mul_pd(reg00, reg18));
        reg09 = _mm_add_pd(reg09, _mm_mul_pd(reg01, reg18));
        reg12 = _mm_add_pd(reg12, _mm_mul_pd(reg04, reg18));
        reg13 = _mm_add_pd(reg13, _mm_mul_pd(reg05, reg18));
    }

    _mm_store_pd(&scr1[ 0], reg12);
    _mm_store_pd(&scr1[ 2], reg13);
    _mm_store_pd(&scr1[ 4], regzero);
    _mm_store_pd(&scr1[ 6], regzero);
    _mm_store_pd(&scr1[ 8], reg08);
    _mm_store_pd(&scr1[10], reg09);
    _mm_store_pd(&scr1[12], regzero);
    _mm_store_pd(&scr1[14], regzero);

    /* ---- blocks for l = 1..nmultipoles: sum over fr[l..nmultipoles+l], scaled by sg[l] ---- */
    i = 1;
    for (l = 1; l <= nmultipoles; ++l) {
        i += 16 * l; /* output block offset grows by 16*l per step */
        j = -15;
        k = nmultipoles + l;

        reg08 = regzero;
        reg09 = regzero;
        reg12 = regzero;
        reg13 = regzero;

        for (m = l; m <= k; ++m) {
            j += 16;
            reg00 = _mm_load_pd(&scr2[j - 1]);
            reg01 = _mm_load_pd(&scr2[j + 1]);
            reg04 = _mm_load_pd(&scr2[j + 7]);
            reg05 = _mm_load_pd(&scr2[j + 9]);
            reg18 = _mm_load1_pd(&fr[m]);
            reg08 = _mm_add_pd(reg08, _mm_mul_pd(reg00, reg18));
            reg09 = _mm_add_pd(reg09, _mm_mul_pd(reg01, reg18));
            reg12 = _mm_add_pd(reg12, _mm_mul_pd(reg04, reg18));
            reg13 = _mm_add_pd(reg13, _mm_mul_pd(reg05, reg18));
        }

        reg18 = _mm_load1_pd(&sg[l]);
        reg12 = _mm_mul_pd(reg12, reg18);
        _mm_store_pd(&scr1[i -  1], reg12);
        reg13 = _mm_mul_pd(reg13, reg18);
        _mm_store_pd(&scr1[i +  1], reg13);
        _mm_store_pd(&scr1[i +  3], regzero);
        _mm_store_pd(&scr1[i +  5], regzero);
        reg08 = _mm_mul_pd(reg08, reg18);
        _mm_store_pd(&scr1[i +  7], reg08);
        reg09 = _mm_mul_pd(reg09, reg18);
        _mm_store_pd(&scr1[i +  9], reg09);
        _mm_store_pd(&scr1[i + 11], regzero);
        _mm_store_pd(&scr1[i + 13], regzero);
    }

    /* ---- blocks for m = 1..nmultipoles, l = m..nmultipoles ---- */
    mm = 16 * nmultipoles;
    i = 1;
    n = mm + 1;
    for (m = 1; m <= nmultipoles; ++m) {
        i += 16 * m;
        j = i;
        for (l = m; l <= nmultipoles; ++l) {
            j += 16 * l;
            nn = n;
            k = m + l;
            mmm = nmultipoles + l;

            reg08 = regzero;
            reg09 = regzero;
            reg10 = regzero;
            reg11 = regzero;
            reg12 = regzero;
            reg13 = regzero;
            reg14 = regzero;
            reg15 = regzero;

            for (mmmm = k; mmmm <= mmm; ++mmmm) {
                nn += 16;
                reg00 = _mm_load_pd(&scr2[nn -  1]);
                reg01 = _mm_load_pd(&scr2[nn +  1]);
                reg02 = _mm_load_pd(&scr2[nn +  3]);
                reg03 = _mm_load_pd(&scr2[nn +  5]);
                reg04 = _mm_load_pd(&scr2[nn +  7]);
                reg05 = _mm_load_pd(&scr2[nn +  9]);
                reg06 = _mm_load_pd(&scr2[nn + 11]);
                reg07 = _mm_load_pd(&scr2[nn + 13]);
                reg18 = _mm_load1_pd(&fr[mmmm]);
                /* offsets 4..7 and 12..15 of the block are accumulated with a minus sign */
                reg08 = _mm_add_pd(reg08, _mm_mul_pd(reg00, reg18));
                reg09 = _mm_add_pd(reg09, _mm_mul_pd(reg01, reg18));
                reg10 = _mm_sub_pd(reg10, _mm_mul_pd(reg02, reg18));
                reg11 = _mm_sub_pd(reg11, _mm_mul_pd(reg03, reg18));
                reg12 = _mm_add_pd(reg12, _mm_mul_pd(reg04, reg18));
                reg13 = _mm_add_pd(reg13, _mm_mul_pd(reg05, reg18));
                reg14 = _mm_sub_pd(reg14, _mm_mul_pd(reg06, reg18));
                reg15 = _mm_sub_pd(reg15, _mm_mul_pd(reg07, reg18));
            }

            reg18 = _mm_load1_pd(&sg[k]);
            reg12 = _mm_mul_pd(reg12, reg18);
            _mm_store_pd(&scr1[j -  1], reg12);
            reg13 = _mm_mul_pd(reg13, reg18);
            _mm_store_pd(&scr1[j +  1], reg13);
            reg14 = _mm_mul_pd(reg14, reg18);
            _mm_store_pd(&scr1[j +  3], reg14);
            reg15 = _mm_mul_pd(reg15, reg18);
            _mm_store_pd(&scr1[j +  5], reg15);
            reg08 = _mm_mul_pd(reg08, reg18);
            _mm_store_pd(&scr1[j +  7], reg08);
            reg09 = _mm_mul_pd(reg09, reg18);
            _mm_store_pd(&scr1[j +  9], reg09);
            reg10 = _mm_mul_pd(reg10, reg18);
            _mm_store_pd(&scr1[j + 11], reg10);
            reg11 = _mm_mul_pd(reg11, reg18);
            _mm_store_pd(&scr1[j + 13], reg11);
        }
        n += mm;
        mm -= 16;
    }
}
static inline __m128d my_invrsq_pd(__m128d x) { const __m128d three = (const __m128d) {3.0f, 3.0f}; const __m128d half = (const __m128d) {0.5f, 0.5f}; __m128 t = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */ __m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */ /* First Newton-Rapson step, accuracy is now 24 bits */ __m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1))))); /* Return second Newton-Rapson step, accuracy 48 bits */ return (__m128d) _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2))))); } /* to extract single integers from a __m128i datatype */ #define _mm_extract_epi64(x, imm) \ _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm))) void nb_kernel400_x86_64_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr, int * shift, double * shiftvec, double * fshift, int * gid, double * pos, double * faction, double * charge, double * p_facel, double * p_krf, double * p_crf, double * Vc, int * type, int * p_ntype, double * vdwparam, double * Vvdw, double * p_tabscale, double * VFtab, double * invsqrta, double * dvda, double * p_gbtabscale, double * GBtab, int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, double * work) { int nri,ntype,nthreads,offset; int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid; double facel,krf,crf,tabscl,gbtabscl,vct,vgbt; double shX,shY,shZ,isai_d,dva; gmx_gbdata_t *gbdata; float * gpol; __m128d ix,iy,iz,jx,jy,jz; __m128d dx,dy,dz,t1,t2,t3; __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2; __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj; __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d; __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8; __m128d fac,tabscale,gbtabscale; __m128i n0,nnn; const __m128d neg = {-1.0f,-1.0f}; const __m128d zero = {0.0f,0.0f}; const __m128d half = {0.5f,0.5f}; const __m128d two = {2.0f,2.0f}; const __m128d three = {3.0f,3.0f}; 
gbdata = (gmx_gbdata_t *)work; gpol = gbdata->gpol; nri = *p_nri; ntype = *p_ntype; nthreads = *p_nthreads; facel = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent)); krf = *p_krf; crf = *p_crf; tabscl = *p_tabscale; gbtabscl = *p_gbtabscale; nj1 = 0; /* Splat variables */ fac = _mm_load1_pd(&facel); tabscale = _mm_load1_pd(&tabscl); gbtabscale = _mm_load1_pd(&gbtabscl); /* Keep compiler happy */ dvdatmp = _mm_setzero_pd(); vgb = _mm_setzero_pd(); dvdaj = _mm_setzero_pd(); isaj = _mm_setzero_pd(); vcoul = _mm_setzero_pd(); t1 = _mm_setzero_pd(); t2 = _mm_setzero_pd(); t3 = _mm_setzero_pd(); jnr1=jnr2=0; j13=j23=0; for(n=0;n<nri;n++) { is3 = 3*shift[n]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = jindex[n]; nj1 = jindex[n+1]; offset = (nj1-nj0)%2; ii = iinr[n]; ii3 = ii*3; ix = _mm_set1_pd(shX+pos[ii3+0]); iy = _mm_set1_pd(shX+pos[ii3+1]); iz = _mm_set1_pd(shX+pos[ii3+2]); q = _mm_set1_pd(charge[ii]); iq = _mm_mul_pd(fac,q); isai_d = invsqrta[ii]; isai = _mm_load1_pd(&isai_d); fix = _mm_setzero_pd(); fiy = _mm_setzero_pd(); fiz = _mm_setzero_pd(); dvdasum = _mm_setzero_pd(); vctot = _mm_setzero_pd(); vgbtot = _mm_setzero_pd(); for(k=nj0;k<nj1-offset; k+=2) { jnr1 = jjnr[k]; jnr2 = jjnr[k+1]; j13 = jnr1 * 3; j23 = jnr2 * 3; /* Load coordinates */ xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */ xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */ xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */ xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */ /* transpose */ jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* distances */ dx = _mm_sub_pd(ix,jx); dy = _mm_sub_pd(iy,jy); dz = _mm_sub_pd(iz,jz); rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) ); rinv = my_invrsq_pd(rsq11); /* Load invsqrta */ isaj = _mm_loadl_pd(isaj,invsqrta+jnr1); isaj = _mm_loadh_pd(isaj,invsqrta+jnr2); isaprod = _mm_mul_pd(isai,isaj); /* Load charges */ q 
= _mm_loadl_pd(q,charge+jnr1); q = _mm_loadh_pd(q,charge+jnr2); qq = _mm_mul_pd(iq,q); vcoul = _mm_mul_pd(qq,rinv); fscal = _mm_mul_pd(vcoul,rinv); qq = _mm_mul_pd(isaprod,qq); qq = _mm_mul_pd(qq,neg); gbscale = _mm_mul_pd(isaprod,gbtabscale); /* Load dvdaj */ dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1); dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2); r = _mm_mul_pd(rsq11,rinv); rt = _mm_mul_pd(r,gbscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_pd(rt,n0d); eps2 = _mm_mul_pd(eps,eps); nnn = _mm_slli_epi64(n0,2); xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); H = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,H); vgb = _mm_mul_pd(qq,VV); fijC = _mm_mul_pd(qq,FF); fijC = _mm_mul_pd(fijC,gbscale); dvdatmp = _mm_mul_pd(fijC,r); dvdatmp = _mm_add_pd(vgb,dvdatmp); dvdatmp = _mm_mul_pd(dvdatmp,neg); dvdatmp = _mm_mul_pd(dvdatmp,half); dvdasum = _mm_add_pd(dvdasum,dvdatmp); xmm1 = _mm_mul_pd(dvdatmp,isaj); xmm1 = _mm_mul_pd(xmm1,isaj); dvdaj = _mm_add_pd(dvdaj,xmm1); /* store dvda */ _mm_storel_pd(dvda+jnr1,dvdaj); _mm_storeh_pd(dvda+jnr2,dvdaj); vctot = _mm_add_pd(vctot,vcoul); vgbtot = _mm_add_pd(vgbtot,vgb); fscal = _mm_sub_pd(fijC,fscal); fscal = _mm_mul_pd(fscal,neg); fscal = _mm_mul_pd(fscal,rinv); /* calculate partial force terms */ t1 = _mm_mul_pd(fscal,dx); t2 = _mm_mul_pd(fscal,dy); t3 = _mm_mul_pd(fscal,dz); /* update the i force */ fix = 
_mm_add_pd(fix,t1); fiy = _mm_add_pd(fiy,t2); fiz = _mm_add_pd(fiz,t3); /* accumulate forces from memory */ xmm1 = _mm_loadu_pd(faction+j13); /* fx1 fy1 */ xmm2 = _mm_loadu_pd(faction+j23); /* fx2 fy2 */ xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */ xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */ /* transpose */ xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */ xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */ xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */ /* subtract partial forces */ xmm5 = _mm_sub_pd(xmm5,t1); xmm6 = _mm_sub_pd(xmm6,t2); xmm7 = _mm_sub_pd(xmm7,t3); xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */ xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */ /* store fx and fy */ _mm_storeu_pd(faction+j13,xmm1); _mm_storeu_pd(faction+j23,xmm2); /* .. then fz */ _mm_storel_pd(faction+j13+2,xmm7); _mm_storel_pd(faction+j23+2,xmm7); } /* In double precision, offset can only be either 0 or 1 */ if(offset!=0) { jnr1 = jjnr[k]; j13 = jnr1*3; jx = _mm_load_sd(pos+j13); jy = _mm_load_sd(pos+j13+1); jz = _mm_load_sd(pos+j13+2); isaj = _mm_load_sd(invsqrta+jnr1); isaprod = _mm_mul_sd(isai,isaj); dvdaj = _mm_load_sd(dvda+jnr1); q = _mm_load_sd(charge+jnr1); qq = _mm_mul_sd(iq,q); dx = _mm_sub_sd(ix,jx); dy = _mm_sub_sd(iy,jy); dz = _mm_sub_sd(iz,jz); rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) ); rinv = my_invrsq_pd(rsq11); vcoul = _mm_mul_sd(qq,rinv); fscal = _mm_mul_sd(vcoul,rinv); qq = _mm_mul_sd(isaprod,qq); qq = _mm_mul_sd(qq,neg); gbscale = _mm_mul_sd(isaprod,gbtabscale); r = _mm_mul_sd(rsq11,rinv); rt = _mm_mul_sd(r,gbscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_sd(rt,n0d); eps2 = _mm_mul_sd(eps,eps); nnn = _mm_slli_epi64(n0,2); xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); xmm4 = 
_mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); H = _mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,H); vgb = _mm_mul_sd(qq,VV); fijC = _mm_mul_sd(qq,FF); fijC = _mm_mul_sd(fijC,gbscale); dvdatmp = _mm_mul_sd(fijC,r); dvdatmp = _mm_add_sd(vgb,dvdatmp); dvdatmp = _mm_mul_sd(dvdatmp,neg); dvdatmp = _mm_mul_sd(dvdatmp,half); dvdasum = _mm_add_sd(dvdasum,dvdatmp); xmm1 = _mm_mul_sd(dvdatmp,isaj); xmm1 = _mm_mul_sd(xmm1,isaj); dvdaj = _mm_add_sd(dvdaj,xmm1); /* store dvda */ _mm_storel_pd(dvda+jnr1,dvdaj); vctot = _mm_add_sd(vctot,vcoul); vgbtot = _mm_add_sd(vgbtot,vgb); fscal = _mm_sub_sd(fijC,fscal); fscal = _mm_mul_sd(fscal,neg); fscal = _mm_mul_sd(fscal,rinv); /* calculate partial force terms */ t1 = _mm_mul_sd(fscal,dx); t2 = _mm_mul_sd(fscal,dy); t3 = _mm_mul_sd(fscal,dz); /* update the i force */ fix = _mm_add_sd(fix,t1); fiy = _mm_add_sd(fiy,t2); fiz = _mm_add_sd(fiz,t3); /* accumulate forces from memory */ xmm5 = _mm_load_sd(faction+j13); /* fx */ xmm6 = _mm_load_sd(faction+j13+1); /* fy */ xmm7 = _mm_load_sd(faction+j13+2); /* fz */ /* subtract partial forces */ xmm5 = _mm_sub_sd(xmm5,t1); xmm6 = _mm_sub_sd(xmm6,t2); xmm7 = _mm_sub_sd(xmm7,t3); /* store forces */ _mm_store_sd(faction+j13,xmm5); _mm_store_sd(faction+j13+1,xmm6); _mm_store_sd(faction+j13+2,xmm7); } /* fix/fiy/fiz now contain four partial terms, that all should be * added to the i particle forces */ t1 = _mm_unpacklo_pd(t1,fix); t2 = _mm_unpacklo_pd(t2,fiy); t3 = _mm_unpacklo_pd(t3,fiz); fix = _mm_add_pd(fix,t1); fiy = _mm_add_pd(fiy,t2); fiz = _mm_add_pd(fiz,t3); fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1)); fiy = 
_mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1)); fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1)); /* Load i forces from memory */ xmm1 = _mm_load_sd(faction+ii3); xmm2 = _mm_load_sd(faction+ii3+1); xmm3 = _mm_load_sd(faction+ii3+2); /* Add to i force */ fix = _mm_add_sd(fix,xmm1); fiy = _mm_add_sd(fiy,xmm2); fiz = _mm_add_sd(fiz,xmm3); /* store i forces to memory */ _mm_store_sd(faction+ii3,fix); _mm_store_sd(faction+ii3+1,fiy); _mm_store_sd(faction+ii3+2,fiz); /* now do dvda */ dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum); dvdasum = _mm_add_pd(dvdasum,dvdatmp); _mm_storeh_pd(&dva,dvdasum); dvda[ii] = dvda[ii] + dva*isai_d*isai_d; ggid = gid[n]; /* Coulomb potential */ vcoul = _mm_unpacklo_pd(vcoul,vctot); vctot = _mm_add_pd(vctot,vcoul); _mm_storeh_pd(&vct,vctot); Vc[ggid] = Vc[ggid] + vct; /* GB potential */ vgb = _mm_unpacklo_pd(vgb,vgbtot); vgbtot = _mm_add_pd(vgbtot,vgb); _mm_storeh_pd(&vgbt,vgbtot); gpol[ggid] = gpol[ggid] + vgbt; } *outeriter = nri; *inneriter = nj1; }
void exchlaplacecoeff_gmrfData_0(unsigned int slot) { for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((!neighbor_isValid[0][0])) { { double xPos; double yPos; /* Statements in this Scop: S1053, S1056, S1059, S1050, S1058, S1052, S1055, S1060, S1054, S1057, S1051 */ { { { { { { { { { { { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+26)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+32)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+26)] = 0.000000e+00; } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+146)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+152)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+146)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+98)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+104)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+98)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+74)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+80)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+74)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posBegin[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<0); i1 += 4) { /* xPos = posBegin[0]; */ __m128d vec0 = _mm_load1_pd((&posBegin[0])); __m128d vec0_2 = _mm_load1_pd((&posBegin[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<3); 
i1 += 1) { xPos = posBegin[0]; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+122)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+128)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+122)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+170)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+176)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+170)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+194)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+200)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+194)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/1.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(1.000000e+00); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<0); i1 += 4) { /* yPos = ((((i1-1)/1.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<3); i1 += 1) { yPos = ((((i1-1)/1.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } } { double* 
fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+2)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+8)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+2)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+50)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+56)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+50)] = 0.000000e+00; } } } } } if ((!neighbor_isValid[0][1])) { { double xPos; double yPos; /* Statements in this Scop: S1071, S1065, S1068, S1062, S1070, S1064, S1067, S1061, S1069, S1063, S1066 */ { { { { { { { { { { { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+195)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+201)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+195)] = 0.000000e+00; } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+51)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+57)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+51)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+75)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+81)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+75)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; 
(i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+3)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+9)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+3)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+171)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+177)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+171)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/1.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(1.000000e+00); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<0); i1 += 4) { /* yPos = ((((i1-1)/1.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<3); i1 += 1) { yPos = ((((i1-1)/1.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+99)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+105)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+99)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { 
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+123)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+129)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+123)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+147)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+153)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+147)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posEnd[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<0); i1 += 4) { /* xPos = posEnd[0]; */ __m128d vec0 = _mm_load1_pd((&posEnd[0])); __m128d vec0_2 = _mm_load1_pd((&posEnd[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<3); i1 += 1) { xPos = posEnd[0]; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i1 = 1; for (; (i1<=1); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+27)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+33)] = 0.000000e+00; } for (; (i1<=2); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+27)] = 0.000000e+00; } } } } } if ((!neighbor_isValid[0][2])) { { double xPos; double yPos; /* Statements in this Scop: S1080, S1074, S1077, S1082, S1076, S1079, S1073, S1072, S1081, S1075, S1078 */ { { { { { { { { { { { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+126)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+127)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+126)] = 0.000000e+00; } } { int i2 = 2; for (; (i2<=2); i2 += 2) { xPos = ((((i2-2)/1.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = ((((i2-1)/1.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=3); i2 += 1) { xPos = 
((((i2-2)/1.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+198)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+199)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+198)] = 0.000000e+00; } } } { int i2 = 2; for (; (i2<=2); i2 += 2) { yPos = posBegin[1]; yPos = posBegin[1]; } for (; (i2<=3); i2 += 1) { yPos = posBegin[1]; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+30)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+31)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+30)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+174)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+175)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+174)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+78)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+79)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+78)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+54)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+55)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+54)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { 
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+150)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+151)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+150)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+6)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+7)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+6)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+102)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+103)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+102)] = 0.000000e+00; } } } } } if ((!neighbor_isValid[0][3])) { { double xPos; double yPos; /* Statements in this Scop: S1083, S1092, S1086, S1089, S1088, S1091, S1085, S1090, S1093, S1087, S1084 */ { { { { { { { { { { { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+12)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+13)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+12)] = 0.000000e+00; } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+60)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+61)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+60)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+204)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+205)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { 
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+204)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+132)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+133)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+132)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+84)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+85)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+84)] = 0.000000e+00; } } } { int i2 = 2; for (; (i2<=2); i2 += 2) { yPos = posEnd[1]; yPos = posEnd[1]; } for (; (i2<=3); i2 += 1) { yPos = posEnd[1]; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+36)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+37)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+36)] = 0.000000e+00; } } } { int i2 = 2; for (; (i2<=2); i2 += 2) { xPos = ((((i2-2)/1.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = ((((i2-1)/1.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=3); i2 += 1) { xPos = ((((i2-2)/1.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+180)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+181)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+180)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+156)] = 0.000000e+00; 
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+157)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+156)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]); int i2 = 2; for (; (i2<=2); i2 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+108)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_0_p1[(i2+109)] = 0.000000e+00; } for (; (i2<=3); i2 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[(i2+108)] = 0.000000e+00; } } } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { /* Statements in this Scop: S1094 */ for (int i0 = 0; (i0<=8); i0 += 1) { double* buffer_Send_1_p1 = (&buffer_Send[1][(i0*2)]); double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][(i0*24)]); int i1 = 1; for (; (i1<=1); i1 += 2) { buffer_Send_1_p1[(i1-1)] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+3)]; buffer_Send_1_p1[i1] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+9)]; } for (; (i1<=2); i1 += 1) { buffer_Send_1_p1[(i1-1)] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+3)]; } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { MPI_Isend(buffer_Send[1], 18, MPI_DOUBLE, neighbor_remoteRank[0][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { MPI_Irecv(buffer_Recv[0], 18, MPI_DOUBLE, neighbor_remoteRank[0][0], ((unsigned int)(neighbor_fragCommId[0][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); reqOutstanding_Recv[0] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if 
(isValidForSubdomain[0]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { /* Statements in this Scop: S1095 */ for (int i0 = 0; (i0<=8); i0 += 1) { double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i0*2)]); double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][(i0*24)]); int i1 = 3; for (; (i1<=3); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-10)] = buffer_Recv_0_p1[(i1-3)]; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-4)] = buffer_Recv_0_p1[(i1-2)]; } for (; (i1<=4); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-10)] = buffer_Recv_0_p1[(i1-3)]; } } } } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) { MPI_Isend(&fieldData_LaplaceCoeff_GMRF[0][14], 1, mpiDatatype_9_2_24, neighbor_remoteRank[0][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) { MPI_Irecv(&fieldData_LaplaceCoeff_GMRF[0][8], 1, mpiDatatype_9_2_24, neighbor_remoteRank[0][2], ((unsigned int)(neighbor_fragCommId[0][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if 
(reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { /* Statements in this Scop: S1096 */ for (int i0 = 0; (i0<=8); i0 += 1) { double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][(i0*24)]); double* buffer_Send_0_p1 = (&buffer_Send[0][(i0*4)]); int i1 = 0; for (; (i1<=2); i1 += 2) { buffer_Send_0_p1[i1] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+3)]; buffer_Send_0_p1[(i1+1)] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+9)]; } for (; (i1<=3); i1 += 1) { buffer_Send_0_p1[i1] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+3)]; } } } if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { /* Statements in this Scop: S1097 */ for (int i0 = 0; (i0<=8); i0 += 1) { double* buffer_Send_1_p1 = (&buffer_Send[1][(i0*4)]); double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][(i0*24)]); int i1 = 0; for (; (i1<=2); i1 += 2) { buffer_Send_1_p1[i1] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+2)]; buffer_Send_1_p1[(i1+1)] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+8)]; } for (; (i1<=3); i1 += 1) { buffer_Send_1_p1[i1] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+2)]; } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { MPI_Isend(buffer_Send[0], 36, MPI_DOUBLE, neighbor_remoteRank[0][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]); reqOutstanding_Send[0] = true; } if 
((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { MPI_Isend(buffer_Send[1], 36, MPI_DOUBLE, neighbor_remoteRank[0][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { MPI_Irecv(buffer_Recv[0], 36, MPI_DOUBLE, neighbor_remoteRank[0][0], ((unsigned int)(neighbor_fragCommId[0][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); reqOutstanding_Recv[0] = true; } if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { MPI_Irecv(buffer_Recv[1], 36, MPI_DOUBLE, neighbor_remoteRank[0][1], ((unsigned int)(neighbor_fragCommId[0][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]); reqOutstanding_Recv[1] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } if (reqOutstanding_Recv[1]) { waitForMPIReq(&mpiRequest_Recv[1]); reqOutstanding_Recv[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { /* Statements in this Scop: S1098 */ for (int i0 = 0; (i0<=8); i0 += 1) { double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i0*4)]); double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][(i0*24)]); int i1 = 1; for (; (i1<=3); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-5)] = buffer_Recv_0_p1[(i1-1)]; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+1)] = buffer_Recv_0_p1[i1]; } for (; (i1<=4); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-5)] = buffer_Recv_0_p1[(i1-1)]; } } } if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { /* Statements in this Scop: S1099 */ 
for (int i0 = 0; (i0<=8); i0 += 1) { double* buffer_Recv_1_p1 = (&buffer_Recv[1][(i0*4)]); double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][(i0*24)]); int i1 = 4; for (; (i1<=6); i1 += 2) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-20)] = buffer_Recv_1_p1[(i1-4)]; fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-14)] = buffer_Recv_1_p1[(i1-3)]; } for (; (i1<=7); i1 += 1) { fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-20)] = buffer_Recv_1_p1[(i1-4)]; } } } } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Send[0]) { waitForMPIReq(&mpiRequest_Send[0]); reqOutstanding_Send[0] = false; } if (reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) { MPI_Isend(&fieldData_LaplaceCoeff_GMRF[0][13], 1, mpiDatatype_9_4_24, neighbor_remoteRank[0][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]); reqOutstanding_Send[2] = true; } if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) { MPI_Isend(&fieldData_LaplaceCoeff_GMRF[0][7], 1, mpiDatatype_9_4_24, neighbor_remoteRank[0][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) { MPI_Irecv(&fieldData_LaplaceCoeff_GMRF[0][1], 1, mpiDatatype_9_4_24, neighbor_remoteRank[0][2], ((unsigned int)(neighbor_fragCommId[0][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = 
true; } if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) { MPI_Irecv(&fieldData_LaplaceCoeff_GMRF[0][19], 1, mpiDatatype_9_4_24, neighbor_remoteRank[0][3], ((unsigned int)(neighbor_fragCommId[0][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]); reqOutstanding_Recv[3] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } if (reqOutstanding_Recv[3]) { waitForMPIReq(&mpiRequest_Recv[3]); reqOutstanding_Recv[3] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Send[2]) { waitForMPIReq(&mpiRequest_Send[2]); reqOutstanding_Send[2] = false; } if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } }
/*
 * Writes vqn * (vx1 - vx0), computed lane by lane on two packed doubles,
 * into the first element of vwl. No other element of vwl is touched.
 */
inline void vwl_s1( __m128d const vqn, __m128d const vx0, __m128d const vx1, __m128d vwl[] )
{
    /* Per-lane difference x1 - x0. */
    __m128d const delta = _mm_sub_pd( vx1, vx0 );

    /* Scale by vqn and publish the single output vector. */
    *vwl = _mm_mul_pd( vqn, delta );
}
namespace nt2 { namespace ext { template<class Dummy> struct call< tag::minus_ ( tag::simd_<tag::double_,tag::sse_> , tag::simd_<tag::double_,tag::sse_> ) , tag::cpu_, Dummy > : callable { template<class Sig> struct result; template<class This,class A> struct result<This(A,A)> : meta::strip<A> {}; NT2_FUNCTOR_CALL(2) { A0 that = { _mm_sub_pd(a0,a1) }; return that; } }; template<class Dummy> struct call< tag::minus_ ( tag::simd_<tag::float_,tag::sse_> , tag::simd_<tag::float_,tag::sse_> ) , tag::cpu_, Dummy > : callable { template<class Sig> struct result; template<class This,class A> struct result<This(A,A)> : meta::strip<A> {};
// only compute the necessary indices of su2_i = subgroup( U*staple^\dagger )
//
// Builds two packed-double accumulators sm0 and sm1 from products of elements
// of U and staple, rescales both by 1/sqrt( sum of squares of their four
// lanes ) and writes them to *s0 and *s1; the scale factor itself goes to *scale.
//
//   s0 , s1    outputs, written with _mm_store_pd (aligned store), so both
//              must be 16-byte aligned
//   scale      output, receives the low lane of the reciprocal norm
//   U , staple inputs, accessed as arrays of __m128d
//              ( assumes GLU_complex is a 16-byte re/im double pair and that
//                both arrays are 16-byte aligned -- TODO confirm )
//   su2_index  selects the subgroup; for NC == 3 only su2_index%3 is used,
//              for NC == 2 it is ignored, otherwise it indexes Latt.su2_data
//
// NOTE(review): SSE2_MUL_CONJ and SSE2_MULCONJ are defined elsewhere;
// presumably complex products that conjugate different operands -- confirm
// against their definitions before touching operand order.
// NOTE(review): there is no guard against a zero norm; the division below
// then produces non-finite values.
void only_subgroup( GLU_complex *s0 , GLU_complex *s1 , double *scale , const GLU_complex U[ NCNC ] , const GLU_complex staple[ NCNC ] , const size_t su2_index ) {
  // view both matrices as arrays of packed doubles, one vector per element
  const __m128d *u = (const __m128d*)U ;
  const __m128d *s = (const __m128d*)staple ;
  register __m128d sm0 ;
  register __m128d sm1 ;
#if NC == 3
  // every value of su2_index%3 ( 0 , 1 , 2 ) has a case, so sm0 and sm1 are
  // always assigned even though there is no default label
  switch( su2_index%3 ) { // I don't like this
    // rotation 1
    // | s0 s1 0 |
    // | -s1* s0* 0 |
    // | 0 0 1 |
  case 0 :
    // uses elements 0..2 and 3..5 of u and s
    sm0 = _mm_add_pd( // temp0
      _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 0 ) ) , _mm_add_pd( SSE2_MUL_CONJ( *( u + 1 ) , *( s + 1 ) ) , SSE2_MUL_CONJ( *( u + 2 ) , *( s + 2 ) ) ) ) ,
      // temp3^*
      _mm_add_pd( SSE2_MULCONJ( *( u + 3 ) , *( s + 3 ) ) , _mm_add_pd( SSE2_MULCONJ( *( u + 4 ) , *( s + 4 ) ) , SSE2_MULCONJ( *( u + 5 ) , *( s + 5 ) ) ) ) ) ;
    sm1 = _mm_sub_pd( // temp1
      _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 3 ) ) , _mm_add_pd( SSE2_MUL_CONJ( *( u + 1 ) , *( s + 4 ) ) , SSE2_MUL_CONJ( *( u + 2 ) , *( s + 5 ) ) ) ) ,
      // temp2^*
      _mm_add_pd( SSE2_MULCONJ( *( u + 3 ) , *( s + 0 ) ) , _mm_add_pd( SSE2_MULCONJ( *( u + 4 ) , *( s + 1 ) ) , SSE2_MULCONJ( *( u + 5 ) , *( s + 2 ) ) ) ) ) ;
    break ;
  case 1 :
    // rotation 2
    // | 1 0 0 |
    // | 0 s0 s1 |
    // | 0 -s1* s0* |
    // same pattern as case 0 , shifted to elements 3..5 and 6..8
    sm0 = _mm_add_pd( // temp0
      _mm_add_pd( SSE2_MUL_CONJ( *( u + 3 ) , *( s + 3 ) ) , _mm_add_pd( SSE2_MUL_CONJ( *( u + 4 ) , *( s + 4 ) ) , SSE2_MUL_CONJ( *( u + 5 ) , *( s + 5 ) ) ) ) ,
      // temp3^*
      _mm_add_pd( SSE2_MULCONJ( *( u + 6 ) , *( s + 6 ) ) , _mm_add_pd( SSE2_MULCONJ( *( u + 7 ) , *( s + 7 ) ) , SSE2_MULCONJ( *( u + 8 ) , *( s + 8 ) ) ) ) ) ;
    sm1 = _mm_sub_pd( // temp1
      _mm_add_pd( SSE2_MUL_CONJ( *( u + 3 ) , *( s + 6 ) ) , _mm_add_pd( SSE2_MUL_CONJ( *( u + 4 ) , *( s + 7 ) ) , SSE2_MUL_CONJ( *( u + 5 ) , *( s + 8 ) ) ) ) ,
      // temp2^*
      _mm_add_pd( SSE2_MULCONJ( *( u + 6 ) , *( s + 3 ) ) , _mm_add_pd( SSE2_MULCONJ( *( u + 7 ) , *( s + 4 ) ) , SSE2_MULCONJ( *( u + 8 ) , *( s + 5 ) ) ) ) ) ;
    break ;
  case 2 :
    // rotation 3
    // | s0* 0 -s1 |
    // | 0 1 0 |
    // | s1 0 s0 |
    // uses elements 0..2 and 6..8; note that in sm0 the SSE2_MULCONJ term
    // comes first here, unlike cases 0 and 1
    sm0 = _mm_add_pd( // temp3^*
      _mm_add_pd( SSE2_MULCONJ( *( u + 0 ) , *( s + 0 ) ) , _mm_add_pd( SSE2_MULCONJ( *( u + 1 ) , *( s + 1 ) ) , SSE2_MULCONJ( *( u + 2 ) , *( s + 2 ) ) ) ) ,
      // temp0
      _mm_add_pd( SSE2_MUL_CONJ( *( u + 6 ) , *( s + 6 ) ) , _mm_add_pd( SSE2_MUL_CONJ( *( u + 7 ) , *( s + 7 ) ) , SSE2_MUL_CONJ( *( u + 8 ) , *( s + 8 ) ) ) ) ) ;
    sm1 = _mm_sub_pd( // temp1
      _mm_add_pd( SSE2_MUL_CONJ( *( u + 6 ) , *( s + 0 ) ) , _mm_add_pd( SSE2_MUL_CONJ( *( u + 7 ) , *( s + 1 ) ) , SSE2_MUL_CONJ( *( u + 8 ) , *( s + 2 ) ) ) ) ,
      // temp2^*
      _mm_add_pd( SSE2_MULCONJ( *( u + 0 ) , *( s + 6 ) ) , _mm_add_pd( SSE2_MULCONJ( *( u + 1 ) , *( s + 7 ) ) , SSE2_MULCONJ( *( u + 2 ) , *( s + 8 ) ) ) ) ) ;
    break ;
  }
#elif NC == 2
  // single subgroup: elements 0..1 and 2..3 , su2_index is not consulted
  sm0 = _mm_add_pd( // temp0
    _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 0 ) ) , SSE2_MUL_CONJ( *( u + 1 ) , *( s + 1 ) ) ) ,
    // temp3^*
    _mm_add_pd( SSE2_MULCONJ( *( u + 2 ) , *( s + 2 ) ) , SSE2_MULCONJ( *( u + 3 ) , *( s + 3 ) ) ) ) ;
  sm1 = _mm_sub_pd( // temp1
    _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 2 ) ) , SSE2_MUL_CONJ( *( u + 1 ) , *( s + 3 ) ) ) ,
    // temp2^*
    _mm_add_pd( SSE2_MULCONJ( *( u + 2 ) , *( s + 0 ) ) , SSE2_MULCONJ( *( u + 3 ) , *( s + 1 ) ) ) ) ;
#else
  // su(N) version
  // the two starting offsets come from the su2_data table entry for this
  // subgroup: idx_a / NC and idx_b % NC , each multiplied by NC below
  const size_t row_a = Latt.su2_data[ su2_index ].idx_a / NC ;
  const size_t col_b = Latt.su2_data[ su2_index ].idx_b % NC ;
  // prefetch the staple & link indices
  const __m128d *S1 = ( s + NC * row_a ) , *S2 = ( s + NC * col_b ) ;
  const __m128d *U1 = ( u + NC * row_a ) , *U2 = ( u + NC * col_b ) ;
  // initialise to zero & perform multiplication
  sm0 = _mm_setzero_pd() ;
  sm1 = _mm_setzero_pd() ;
  size_t i ;
  // accumulate NC consecutive elements starting at each of the two offsets
  for( i = 0 ; i < NC ; i++ ) {
    sm0 = _mm_add_pd( sm0 , _mm_add_pd( SSE2_MUL_CONJ( *U1 , *S1 ) , SSE2_MULCONJ( *U2 , *S2 ) ) ) ;
    sm1 = _mm_add_pd( sm1 , _mm_sub_pd( SSE2_MUL_CONJ( *U1 , *S2 ) , SSE2_MULCONJ( *U2 , *S1 ) ) ) ;
    // increment our pointers
    S1++ , S2++ , U1++ , U2++ ;
  }
#endif
  // puts the norm in both parts
  // z = sm0*sm0 + sm1*sm1 per lane ( assuming SSE2_FMA( a , b , c ) is
  // a*b + c -- TODO confirm ); adding the lane-swapped copy leaves the full
  // four-term sum of squares in both lanes
  register __m128d z = SSE2_FMA( sm0 , sm0 , _mm_mul_pd( sm1 , sm1 ) ) ;
  z = _mm_add_pd( z , _mm_shuffle_pd( z , z , 1 ) ) ;
  // turn the sum into a reciprocal square root and rescale both accumulators
  z = _mm_sqrt_pd( z ) ;
  z = _mm_div_pd( _mm_set1_pd( 1.0 ) , z ) ;
  sm0 = _mm_mul_pd( sm0 , z ) ;
  sm1 = _mm_mul_pd( sm1 , z ) ;
  // poke back into *s0 and *s1 and *scale
  _mm_store_pd( (void*)s0 , sm0 ) ;
  _mm_store_pd( (void*)s1 , sm1 ) ;
  // only the low lane is stored; both lanes of z hold the same value here
  _mm_store_sd( (void*)scale , z ) ;
  return ;
}
int fft5b_(double *a, double *b, double *w, int *m, int *l) { /* static double c51 = .95105651629515357; static double c52 = .61803398874989485; static double c53 = .55901699437494742; static double c54 = .25; */ static __m128d c51, c52, c53, c54; int i, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, j, j0; /* double x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, x6, y6, x7, y7, x8, y8, x9, y9, x10, y10, wi1, wi2, wi3, wi4, wr1, wr2, wr3, wr4; */ __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, w1, w2, w3, w4; c51 = _mm_set1_pd(0.95105651629515357); c52 = _mm_set1_pd(0.61803398874989485); c53 = _mm_set1_pd(0.55901699437494742); c54 = _mm_set1_pd(0.25); for (i = 0; i < *m; i++) { i0 = i << 1; i1 = i0 + (*m * *l << 1); i2 = i1 + (*m * *l << 1); i3 = i2 + (*m * *l << 1); i4 = i3 + (*m * *l << 1); i5 = i << 1; i6 = i5 + (*m << 1); i7 = i6 + (*m << 1); i8 = i7 + (*m << 1); i9 = i8 + (*m << 1); /* x0 = a[i1] + a[i4]; y0 = a[i1 + 1] + a[i4 + 1]; x1 = a[i2] + a[i3]; y1 = a[i2 + 1] + a[i3 + 1]; x2 = c51 * (a[i1] - a[i4]); y2 = c51 * (a[i1 + 1] - a[i4 + 1]); x3 = c51 * (a[i2] - a[i3]); y3 = c51 * (a[i2 + 1] - a[i3 + 1]); x4 = x0 + x1; y4 = y0 + y1; x5 = c53 * (x0 - x1); y5 = c53 * (y0 - y1); x6 = a[i0] - c54 * x4; y6 = a[i0 + 1] - c54 * y4; x7 = x6 + x5; y7 = y6 + y5; x8 = x6 - x5; y8 = y6 - y5; x9 = y2 + c52 * y3; y9 = -x2 - c52 * x3; x10 = c52 * y2 - y3; y10 = x3 - c52 * x2; */ t1 = _mm_load_pd(&a[i1]); t4 = _mm_load_pd(&a[i4]); t0 = _mm_add_pd(t1, t4); t2 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4)); t1 = _mm_load_pd(&a[i2]); t4 = _mm_load_pd(&a[i3]); t3 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4)); t1 = _mm_add_pd(t1, t4); t4 = _mm_add_pd(t0, t1); t5 = _mm_mul_pd(c53, _mm_sub_pd(t0, t1)); t0 = _mm_load_pd(&a[i0]); t6 = _mm_sub_pd(t0, _mm_mul_pd(c54, t4)); t7 = _mm_add_pd(t6, t5); t8 = _mm_sub_pd(t6, t5); t9 = _mm_xor_pd(_mm_add_pd(t2, _mm_mul_pd(c52, t3)), _mm_set_sd(-0.0)); t9 = _mm_shuffle_pd(t9, t9, 1); t10 = _mm_sub_pd(t3, _mm_mul_pd(c52, t2)); t10 = _mm_xor_pd(_mm_shuffle_pd(t10, 
t10, 1), _mm_set_sd(-0.0)); /* b[i5] = a[i0] + x4; b[i5 + 1] = a[i0 + 1] + y4; b[i6] = x7 + x9; b[i6 + 1] = y7 + y9; b[i7] = x8 + x10; b[i7 + 1] = y8 + y10; b[i8] = x8 - x10; b[i8 + 1] = y8 - y10; b[i9] = x7 - x9; b[i9 + 1] = y7 - y9; */ _mm_store_pd(&b[i5], _mm_add_pd(t0, t4)); _mm_store_pd(&b[i6], _mm_add_pd(t7, t9)); _mm_store_pd(&b[i7], _mm_add_pd(t8, t10)); _mm_store_pd(&b[i8], _mm_sub_pd(t8, t10)); _mm_store_pd(&b[i9], _mm_sub_pd(t7, t9)); } for (j = 1; j < *l; j++) { j0 = j << 1; /* wr1 = w[j0]; wi1 = w[j0 + 1]; wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2; wr4 = wr2 * wr2 - wi2 * wi2; wi4 = wr2 * wi2 + wr2 * wi2; */ w1 = _mm_load_pd(&w[j0]); w2 = ZMUL(w1, w1); w3 = ZMUL(w1, w2); w4 = ZMUL(w2, w2); for (i = 0; i < *m; i++) { i0 = (i << 1) + (j * *m << 1); i1 = i0 + (*m * *l << 1); i2 = i1 + (*m * *l << 1); i3 = i2 + (*m * *l << 1); i4 = i3 + (*m * *l << 1); i5 = (i << 1) + (j * *m * 10); i6 = i5 + (*m << 1); i7 = i6 + (*m << 1); i8 = i7 + (*m << 1); i9 = i8 + (*m << 1); /* x0 = a[i1] + a[i4]; y0 = a[i1 + 1] + a[i4 + 1]; x1 = a[i2] + a[i3]; y1 = a[i2 + 1] + a[i3 + 1]; x2 = c51 * (a[i1] - a[i4]); y2 = c51 * (a[i1 + 1] - a[i4 + 1]); x3 = c51 * (a[i2] - a[i3]); y3 = c51 * (a[i2 + 1] - a[i3 + 1]); x4 = x0 + x1; y4 = y0 + y1; x5 = c53 * (x0 - x1); y5 = c53 * (y0 - y1); x6 = a[i0] - c54 * x4; y6 = a[i0 + 1] - c54 * y4; x7 = x6 + x5; y7 = y6 + y5; x8 = x6 - x5; y8 = y6 - y5; x9 = y2 + c52 * y3; y9 = -x2 - c52 * x3; x10 = c52 * y2 - y3; y10 = x3 - c52 * x2; */ t1 = _mm_load_pd(&a[i1]); t4 = _mm_load_pd(&a[i4]); t0 = _mm_add_pd(t1, t4); t2 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4)); t1 = _mm_load_pd(&a[i2]); t4 = _mm_load_pd(&a[i3]); t3 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4)); t1 = _mm_add_pd(t1, t4); t4 = _mm_add_pd(t0, t1); t5 = _mm_mul_pd(c53, _mm_sub_pd(t0, t1)); t0 = _mm_load_pd(&a[i0]); t6 = _mm_sub_pd(t0, _mm_mul_pd(c54, t4)); t7 = _mm_add_pd(t6, t5); t8 = _mm_sub_pd(t6, t5); t9 = 
_mm_xor_pd(_mm_add_pd(t2, _mm_mul_pd(c52, t3)), _mm_set_sd(-0.0)); t9 = _mm_shuffle_pd(t9, t9, 1); t10 = _mm_sub_pd(t3, _mm_mul_pd(c52, t2)); t10 = _mm_xor_pd(_mm_shuffle_pd(t10, t10, 1), _mm_set_sd(-0.0)); /* b[i5] = a[i0] + x4; b[i5 + 1] = a[i0 + 1] + y4; b[i6] = wr1 * (x7 + x9) - wi1 * (y7 + y9); b[i6 + 1] = wr1 * (y7 + y9) + wi1 * (x7 + x9); b[i7] = wr2 * (x8 + x10) - wi2 * (y8 + y10); b[i7 + 1] = wr2 * (y8 + y10) + wi2 * (x8 + x10); b[i8] = wr3 * (x8 - x10) - wi3 * (y8 - y10); b[i8 + 1] = wr3 * (y8 - y10) + wi3 * (x8 - x10); b[i9] = wr4 * (x7 - x9) - wi4 * (y7 - y9); b[i9 + 1] = wr4 * (y7 - y9) + wi4 * (x7 - x9); */ _mm_store_pd(&b[i5], _mm_add_pd(t0, t4)); _mm_store_pd(&b[i6], ZMUL(w1, _mm_add_pd(t7, t9))); _mm_store_pd(&b[i7], ZMUL(w2, _mm_add_pd(t8, t10))); _mm_store_pd(&b[i8], ZMUL(w3, _mm_sub_pd(t8, t10))); _mm_store_pd(&b[i9], ZMUL(w4, _mm_sub_pd(t7, t9))); } } return 0; }
int fft8a_(double *a, double *b, double *w, int *l) { /* static double c81 = .70710678118654752; */ static __m128d c81; int j, j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; /* double u0, v0, u1, x0, y0, x1, y1, x2, y2, x3, y3, v1, x4, y4, x5, y5, x6, y6, x7, y7, u2, v2, u3, v3, wi1, wi2, wi3, wi4, wi5, wi6, wi7, wr1, wr2, wr3, wr4, wr5, wr6, wr7; */ __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, u0, u1, u2, u3, w1, w2, w3, w4, w5, w6, w7; c81 = _mm_set1_pd(0.70710678118654752); for (j = 0; j < *l; j++) { j0 = j << 1; j1 = j0 + (*l << 1); j2 = j1 + (*l << 1); j3 = j2 + (*l << 1); j4 = j3 + (*l << 1); j5 = j4 + (*l << 1); j6 = j5 + (*l << 1); j7 = j6 + (*l << 1); j8 = j << 4; j9 = j8 + 2; j10 = j9 + 2; j11 = j10 + 2; j12 = j11 + 2; j13 = j12 + 2; j14 = j13 + 2; j15 = j14 + 2; /* wr1 = w[j0]; wi1 = w[j0 + 1]; wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2; wr4 = wr2 * wr2 - wi2 * wi2; wi4 = wr2 * wi2 + wr2 * wi2; wr5 = wr2 * wr3 - wi2 * wi3; wi5 = wr2 * wi3 + wi2 * wr3; wr6 = wr3 * wr3 - wi3 * wi3; wi6 = wr3 * wi3 + wr3 * wi3; wr7 = wr3 * wr4 - wi3 * wi4; wi7 = wr3 * wi4 + wi3 * wr4; */ w1 = _mm_load_pd(&w[j0]); w2 = ZMUL(w1, w1); w3 = ZMUL(w1, w2); w4 = ZMUL(w2, w2); w5 = ZMUL(w2, w3); w6 = ZMUL(w3, w3); w7 = ZMUL(w3, w4); /* x0 = a[j0] + a[j4]; y0 = a[j0 + 1] + a[j4 + 1]; x1 = a[j0] - a[j4]; y1 = a[j0 + 1] - a[j4 + 1]; x2 = a[j2] + a[j6]; y2 = a[j2 + 1] + a[j6 + 1]; x3 = a[j2 + 1] - a[j6 + 1]; y3 = a[j6] - a[j2]; */ t0 = _mm_load_pd(&a[j0]); t2 = _mm_load_pd(&a[j4]); t1 = _mm_sub_pd(t0, t2); t0 = _mm_add_pd(t0, t2); t3 = _mm_load_pd(&a[j2]); t4 = _mm_load_pd(&a[j6]); t2 = _mm_add_pd(t3, t4); t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0)); t3 = _mm_shuffle_pd(t3, t3, 1); /* u0 = x0 + x2; v0 = y0 + y2; u1 = x0 - x2; v1 = y0 - y2; */ u0 = _mm_add_pd(t0, t2); u1 = _mm_sub_pd(t0, t2); /* x4 = a[j1] + a[j5]; y4 = a[j1 + 1] + a[j5 + 1]; x5 = a[j1] - a[j5]; y5 = a[j1 + 1] - a[j5 
+ 1]; x6 = a[j3] + a[j7]; y6 = a[j3 + 1] + a[j7 + 1]; x7 = a[j3] - a[j7]; y7 = a[j3 + 1] - a[j7 + 1]; */ t4 = _mm_load_pd(&a[j1]); t6 = _mm_load_pd(&a[j5]); t5 = _mm_sub_pd(t4, t6); t4 = _mm_add_pd(t4, t6); t7 = _mm_load_pd(&a[j3]); t8 = _mm_load_pd(&a[j7]); t6 = _mm_add_pd(t7, t8); t7 = _mm_sub_pd(t7, t8); /* u2 = x4 + x6; v2 = y4 + y6; u3 = y4 - y6; v3 = x6 - x4; */ u2 = _mm_add_pd(t4, t6); u3 = _mm_xor_pd(_mm_sub_pd(t4, t6), _mm_set_sd(-0.0)); u3 = _mm_shuffle_pd(u3, u3, 1); /* b[j8] = u0 + u2; b[j8 + 1] = v0 + v2; b[j12] = wr4 * (u0 - u2) - wi4 * (v0 - v2); b[j12 + 1] = wr4 * (v0 - v2) + wi4 * (u0 - u2); b[j10] = wr2 * (u1 + u3) - wi2 * (v1 + v3); b[j10 + 1] = wr2 * (v1 + v3) + wi2 * (u1 + u3); b[j14] = wr6 * (u1 - u3) - wi6 * (v1 - v3); b[j14 + 1] = wr6 * (v1 - v3) + wi6 * (u1 - u3); */ _mm_store_pd(&b[j8], _mm_add_pd(u0, u2)); _mm_store_pd(&b[j12], ZMUL(w4, _mm_sub_pd(u0, u2))); _mm_store_pd(&b[j10], ZMUL(w2, _mm_add_pd(u1, u3))); _mm_store_pd(&b[j14], ZMUL(w6, _mm_sub_pd(u1, u3))); /* u0 = x1 + c81 * (x5 - x7); v0 = y1 + c81 * (y5 - y7); u1 = x1 - c81 * (x5 - x7); v1 = y1 - c81 * (y5 - y7); u2 = x3 + c81 * (y5 + y7); v2 = y3 - c81 * (x5 + x7); u3 = x3 - c81 * (y5 + y7); v3 = y3 + c81 * (x5 + x7); */ u1 = _mm_mul_pd(c81, _mm_sub_pd(t5, t7)); u0 = _mm_add_pd(t1, u1); u1 = _mm_sub_pd(t1, u1); u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(t5, t7)), _mm_set_sd(-0.0)); u3 = _mm_shuffle_pd(u3, u3, 1); u2 = _mm_add_pd(t3, u3); u3 = _mm_sub_pd(t3, u3); /* b[j9] = wr1 * (u0 + u2) - wi1 * (v0 + v2); b[j9 + 1] = wr1 * (v0 + v2) + wi1 * (u0 + u2); b[j13] = wr5 * (u1 + u3) - wi5 * (v1 + v3); b[j13 + 1] = wr5 * (v1 + v3) + wi5 * (u1 + u3); b[j11] = wr3 * (u1 - u3) - wi3 * (v1 - v3); b[j11 + 1] = wr3 * (v1 - v3) + wi3 * (u1 - u3); b[j15] = wr7 * (u0 - u2) - wi7 * (v0 - v2); b[j15 + 1] = wr7 * (v0 - v2) + wi7 * (u0 - u2); */ _mm_store_pd(&b[j9], ZMUL(w1, _mm_add_pd(u0, u2))); _mm_store_pd(&b[j13], ZMUL(w5, _mm_add_pd(u1, u3))); _mm_store_pd(&b[j11], ZMUL(w3, _mm_sub_pd(u1, 
u3))); _mm_store_pd(&b[j15], ZMUL(w7, _mm_sub_pd(u0, u2))); } return 0; }
int fft4b_(double *a, double *b, double *w, int *m, int *l) { int i, i0, i1, i2, i3, i4, i5, i6, i7, j, j0; /* double x0, y0, x1, y1, x2, y2, x3, y3, wi1, wi2, wi3, wr1, wr2, wr3; */ __m128d t0, t1, t2, t3, t4, w1, w2, w3; for (i = 0; i < *m; i++) { i0 = i << 1; i1 = i0 + (*m * *l << 1); i2 = i1 + (*m * *l << 1); i3 = i2 + (*m * *l << 1); i4 = i << 1; i5 = i4 + (*m << 1); i6 = i5 + (*m << 1); i7 = i6 + (*m << 1); /* x0 = a[i0] + a[i2]; y0 = a[i0 + 1] + a[i2 + 1]; x1 = a[i0] - a[i2]; y1 = a[i0 + 1] - a[i2 + 1]; x2 = a[i1] + a[i3]; y2 = a[i1 + 1] + a[i3 + 1]; x3 = a[i1 + 1] - a[i3 + 1]; y3 = a[i3] - a[i1]; */ t0 = _mm_load_pd(&a[i0]); t2 = _mm_load_pd(&a[i2]); t1 = _mm_sub_pd(t0, t2); t0 = _mm_add_pd(t0, t2); t3 = _mm_load_pd(&a[i1]); t4 = _mm_load_pd(&a[i3]); t2 = _mm_add_pd(t3, t4); t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0)); t3 = _mm_shuffle_pd(t3, t3, 1); /* b[i4] = x0 + x2; b[i4 + 1] = y0 + y2; b[i6] = x0 - x2; b[i6 + 1] = y0 - y2; b[i5] = x1 + x3; b[i5 + 1] = y1 + y3; b[i7] = x1 - x3; b[i7 + 1] = y1 - y3; */ _mm_store_pd(&b[i4], _mm_add_pd(t0, t2)); _mm_store_pd(&b[i6], _mm_sub_pd(t0, t2)); _mm_store_pd(&b[i5], _mm_add_pd(t1, t3)); _mm_store_pd(&b[i7], _mm_sub_pd(t1, t3)); } for (j = 1; j < *l; j++) { j0 = j << 1; /* wr1 = w[j0]; wi1 = w[j0 + 1]; wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2; */ w1 = _mm_load_pd(&w[j0]); w2 = ZMUL(w1, w1); w3 = ZMUL(w1, w2); for (i = 0; i < *m; i++) { i0 = (i << 1) + (j * *m << 1); i1 = i0 + (*m * *l << 1); i2 = i1 + (*m * *l << 1); i3 = i2 + (*m * *l << 1); i4 = (i << 1) + (j * *m << 3); i5 = i4 + (*m << 1); i6 = i5 + (*m << 1); i7 = i6 + (*m << 1); /* x0 = a[i0] + a[i2]; y0 = a[i0 + 1] + a[i2 + 1]; x1 = a[i0] - a[i2]; y1 = a[i0 + 1] - a[i2 + 1]; x2 = a[i1] + a[i3]; y2 = a[i1 + 1] + a[i3 + 1]; x3 = a[i1 + 1] - a[i3 + 1]; y3 = a[i3] - a[i1]; */ t0 = _mm_load_pd(&a[i0]); t2 = _mm_load_pd(&a[i2]); t1 = _mm_sub_pd(t0, t2); t0 = _mm_add_pd(t0, t2); t3 = 
_mm_load_pd(&a[i1]); t4 = _mm_load_pd(&a[i3]); t2 = _mm_add_pd(t3, t4); t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0)); t3 = _mm_shuffle_pd(t3, t3, 1); /* b[i4] = x0 + x2; b[i4 + 1] = y0 + y2; b[i6] = wr2 * (x0 - x2) - wi2 * (y0 - y2); b[i6 + 1] = wr2 * (y0 - y2) + wi2 * (x0 - x2); b[i5] = wr1 * (x1 + x3) - wi1 * (y1 + y3); b[i5 + 1] = wr1 * (y1 + y3) + wi1 * (x1 + x3); b[i7] = wr3 * (x1 - x3) - wi3 * (y1 - y3); b[i7 + 1] = wr3 * (y1 - y3) + wi3 * (x1 - x3); */ _mm_store_pd(&b[i4], _mm_add_pd(t0, t2)); _mm_store_pd(&b[i6], ZMUL(w2, _mm_sub_pd(t0, t2))); _mm_store_pd(&b[i5], ZMUL(w1, _mm_add_pd(t1, t3))); _mm_store_pd(&b[i7], ZMUL(w3, _mm_sub_pd(t1, t3))); } } return 0; }
/*
 * fft5a_ -- 5-point butterfly pass over interleaved (re, im) doubles, SSE2.
 *
 *   a  input; five groups of *l complex values, group k starting at a[2 * *l * k]
 *   b  output; five consecutive complex values per j, starting at b[10*j]
 *   w  twiddle table; the complex value at w[2*j] is used for index j
 *   l  pointer to the number of butterflies
 *
 * All loads and stores are aligned 16-byte accesses (_mm_load_pd/_mm_store_pd).
 *
 * Scalar reference for one butterfly (x = real part, y = imaginary part):
 *   (x0,y0)  = a1 + a4                 (x1,y1) = a2 + a3
 *   (x2,y2)  = c51 * (a1 - a4)         (x3,y3) = c51 * (a2 - a3)
 *   (x4,y4)  = (x0,y0) + (x1,y1)       (x5,y5) = c53 * ((x0,y0) - (x1,y1))
 *   (x6,y6)  = a0 - c54 * (x4,y4)
 *   (x7,y7)  = (x6,y6) + (x5,y5)       (x8,y8) = (x6,y6) - (x5,y5)
 *   (x9,y9)  = ( y2 + c52*y3, -x2 - c52*x3)
 *   (x10,y10)= ( c52*y2 - y3,  x3 - c52*x2)
 *   out0 = a0 + (x4,y4)
 *   out1 = w1 * ((x7,y7) + (x9,y9))    out4 = w4 * ((x7,y7) - (x9,y9))
 *   out2 = w2 * ((x8,y8) + (x10,y10))  out3 = w3 * ((x8,y8) - (x10,y10))
 * with w2 = w1*w1, w3 = w1*w2, w4 = w2*w2.
 *
 * The four constants used to be function-local statics that were rewritten on
 * every call; they are now plain const locals, so the function no longer
 * touches shared mutable state.
 *
 * Always returns 0.
 */
int fft5a_(double *a, double *b, double *w, int *l)
{
    const __m128d c51 = _mm_set1_pd(0.95105651629515357);
    const __m128d c52 = _mm_set1_pd(0.61803398874989485);
    const __m128d c53 = _mm_set1_pd(0.55901699437494742);
    const __m128d c54 = _mm_set1_pd(0.25);
    const __m128d flip_lo = _mm_set_sd(-0.0);   /* sign mask for the low lane only */

    const int count = *l;
    const int group = count << 1;               /* doubles between input groups */

    for (int j = 0; j < count; j++) {
        const int j0 = j << 1;
        const int j1 = j0 + group;
        const int j2 = j1 + group;
        const int j3 = j2 + group;
        const int j4 = j3 + group;
        const int j5 = j * 10;
        const int j6 = j5 + 2;
        const int j7 = j6 + 2;
        const int j8 = j7 + 2;
        const int j9 = j8 + 2;

        const __m128d w1 = _mm_load_pd(&w[j0]);
        const __m128d w2 = ZMUL(w1, w1);
        const __m128d w3 = ZMUL(w1, w2);
        const __m128d w4 = ZMUL(w2, w2);

        const __m128d in1 = _mm_load_pd(&a[j1]);
        const __m128d in4 = _mm_load_pd(&a[j4]);
        const __m128d t0 = _mm_add_pd(in1, in4);                    /* (x0,y0) */
        const __m128d t2 = _mm_mul_pd(c51, _mm_sub_pd(in1, in4));   /* (x2,y2) */

        const __m128d in2 = _mm_load_pd(&a[j2]);
        const __m128d in3 = _mm_load_pd(&a[j3]);
        const __m128d t3 = _mm_mul_pd(c51, _mm_sub_pd(in2, in3));   /* (x3,y3) */
        const __m128d t1 = _mm_add_pd(in2, in3);                    /* (x1,y1) */

        const __m128d t4 = _mm_add_pd(t0, t1);                      /* (x4,y4) */
        const __m128d t5 = _mm_mul_pd(c53, _mm_sub_pd(t0, t1));     /* (x5,y5) */

        const __m128d in0 = _mm_load_pd(&a[j0]);
        const __m128d t6 = _mm_sub_pd(in0, _mm_mul_pd(c54, t4));    /* (x6,y6) */
        const __m128d t7 = _mm_add_pd(t6, t5);                      /* (x7,y7) */
        const __m128d t8 = _mm_sub_pd(t6, t5);                      /* (x8,y8) */

        /* (x9,y9): negate the real lane of t2 + c52*t3, then swap lanes. */
        __m128d t9 = _mm_xor_pd(_mm_add_pd(t2, _mm_mul_pd(c52, t3)), flip_lo);
        t9 = _mm_shuffle_pd(t9, t9, 1);

        /* (x10,y10): swap lanes of t3 - c52*t2, then negate the real lane. */
        __m128d t10 = _mm_sub_pd(t3, _mm_mul_pd(c52, t2));
        t10 = _mm_xor_pd(_mm_shuffle_pd(t10, t10, 1), flip_lo);

        _mm_store_pd(&b[j5], _mm_add_pd(in0, t4));
        _mm_store_pd(&b[j6], ZMUL(w1, _mm_add_pd(t7, t9)));
        _mm_store_pd(&b[j7], ZMUL(w2, _mm_add_pd(t8, t10)));
        _mm_store_pd(&b[j8], ZMUL(w3, _mm_sub_pd(t8, t10)));
        _mm_store_pd(&b[j9], ZMUL(w4, _mm_sub_pd(t7, t9)));
    }
    return 0;
}
/*
 * fft3b_ -- 3-point butterfly pass over interleaved (re, im) doubles, SSE2.
 *
 *   a     input; each complex value is fetched with one aligned 16-byte load
 *   b     output; each complex value is written with one aligned 16-byte store
 *   w     twiddle table; the complex value at w[2*j] is used for column j >= 1
 *   m, l  pointers to the inner (*m) and outer (*l) loop extents
 *
 * Scalar reference for one butterfly (x = real part, y = imaginary part):
 *   (x0,y0) = a1 + a2
 *   (x1,y1) = a0 - c32 * (x0,y0)
 *   (x2,y2) = c31 * (a1.im - a2.im, a2.re - a1.re)
 *   out0 =       a0 + (x0,y0)
 *   out1 = w1 * ((x1,y1) + (x2,y2))
 *   out2 = w2 * ((x1,y1) - (x2,y2))
 * with w2 = w1*w1.  Column j == 0 skips the twiddle products.
 *
 * The two constants used to be function-local statics that were rewritten on
 * every call; they are now plain const locals, so the function no longer
 * touches shared mutable state.
 *
 * Always returns 0.
 */
int fft3b_(double *a, double *b, double *w, int *m, int *l)
{
    const __m128d c31 = _mm_set1_pd(0.86602540378443865);
    const __m128d c32 = _mm_set1_pd(0.5);
    const __m128d flip_lo = _mm_set_sd(-0.0);   /* sign mask for the low lane only */

    const int inner = *m;
    const int outer = *l;
    const int src_step = inner * outer << 1;    /* doubles between the three inputs  */
    const int dst_step = inner << 1;            /* doubles between the three outputs */

    /* Column 0: plain butterflies, no twiddle multiplication. */
    for (int i = 0; i < inner; i++) {
        const int i0 = i << 1;
        const int i1 = i0 + src_step;
        const int i2 = i1 + src_step;
        const int i3 = i << 1;
        const int i4 = i3 + dst_step;
        const int i5 = i4 + dst_step;

        const __m128d in1 = _mm_load_pd(&a[i1]);
        const __m128d in2 = _mm_load_pd(&a[i2]);
        const __m128d sum12 = _mm_add_pd(in1, in2);                 /* (x0,y0) */
        /* Negate the real lane of (in1 - in2), swap lanes, scale -> (x2,y2). */
        __m128d rot12 = _mm_xor_pd(_mm_sub_pd(in1, in2), flip_lo);
        rot12 = _mm_mul_pd(c31, _mm_shuffle_pd(rot12, rot12, 1));

        const __m128d in0 = _mm_load_pd(&a[i0]);
        const __m128d mid = _mm_sub_pd(in0, _mm_mul_pd(c32, sum12)); /* (x1,y1) */

        _mm_store_pd(&b[i3], _mm_add_pd(in0, sum12));
        _mm_store_pd(&b[i4], _mm_add_pd(mid, rot12));
        _mm_store_pd(&b[i5], _mm_sub_pd(mid, rot12));
    }

    /* Columns 1 .. outer-1: butterflies followed by twiddle products. */
    for (int j = 1; j < outer; j++) {
        const __m128d w1 = _mm_load_pd(&w[j << 1]);
        const __m128d w2 = ZMUL(w1, w1);

        for (int i = 0; i < inner; i++) {
            const int i0 = (i << 1) + (j * inner << 1);
            const int i1 = i0 + src_step;
            const int i2 = i1 + src_step;
            const int i3 = (i << 1) + (j * inner * 6);
            const int i4 = i3 + dst_step;
            const int i5 = i4 + dst_step;

            const __m128d in1 = _mm_load_pd(&a[i1]);
            const __m128d in2 = _mm_load_pd(&a[i2]);
            const __m128d sum12 = _mm_add_pd(in1, in2);
            __m128d rot12 = _mm_xor_pd(_mm_sub_pd(in1, in2), flip_lo);
            rot12 = _mm_mul_pd(c31, _mm_shuffle_pd(rot12, rot12, 1));

            const __m128d in0 = _mm_load_pd(&a[i0]);
            const __m128d mid = _mm_sub_pd(in0, _mm_mul_pd(c32, sum12));

            _mm_store_pd(&b[i3], _mm_add_pd(in0, sum12));
            _mm_store_pd(&b[i4], ZMUL(w1, _mm_add_pd(mid, rot12)));
            _mm_store_pd(&b[i5], ZMUL(w2, _mm_sub_pd(mid, rot12)));
        }
    }
    return 0;
}
// ============================================================================= // // sse2_vChirpData // version by: Alex Kan - SSE2 mods (haddsum removal) BH // http://tbp.berkeley.edu/~alexkan/seti/ // int sse2_ChirpData_ak( sah_complex * cx_DataArray, sah_complex * cx_ChirpDataArray, int chirp_rate_ind, double chirp_rate, int ul_NumDataPoints, double sample_rate ) { int i; if (chirp_rate_ind == 0) { memcpy(cx_ChirpDataArray, cx_DataArray, (int)ul_NumDataPoints * sizeof(sah_complex) ); return 0; } int vEnd; double srate = chirp_rate * 0.5 / (sample_rate * sample_rate); __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate)); __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52); // main vectorised loop vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3); for (i = 0; i < vEnd; i += 4) { const float *data = (const float *) (cx_DataArray + i); float *chirped = (float *) (cx_ChirpDataArray + i); __m128d di = _mm_set1_pd(i); __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di); __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di); __m128d x1, y1; __m128 d1, d2; __m128 cd1, cd2; __m128 td1, td2; __m128 x; __m128 y; __m128 s; __m128 c; __m128 m; // load the signal to be chirped prefetchnta((const void *)( data+32 )); d1 = _mm_load_ps(data); d2 = _mm_load_ps(data+4); // calculate the input angle a1 = _mm_mul_pd(a1, a1); a2 = _mm_mul_pd(a2, a2); a1 = _mm_mul_pd(a1, rate); a2 = _mm_mul_pd(a2, rate); // reduce the angle to the range (-0.5, 0.5) x1 = _mm_add_pd(a1, roundVal); y1 = _mm_add_pd(a2, roundVal); x1 = _mm_sub_pd(x1, roundVal); y1 = _mm_sub_pd(y1, roundVal); a1 = _mm_sub_pd(a1, x1); a2 = _mm_sub_pd(a2, y1); // convert pair of packed double into packed single x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2)); // square to the range [0, 0.25) y = _mm_mul_ps(x, x); // perform the initial polynomial approximations s = _mm_mul_ps(y, SS4); c = _mm_mul_ps(y, CC3); s = _mm_add_ps(s, SS3); c = _mm_add_ps(c, CC2); s = _mm_mul_ps(s, y); c = 
_mm_mul_ps(c, y); s = _mm_add_ps(s, SS2); c = _mm_add_ps(c, CC1); s = _mm_mul_ps(s, y); c = _mm_mul_ps(c, y); s = _mm_add_ps(s, SS1); s = _mm_mul_ps(s, x); c = _mm_add_ps(c, ONE); // perform first angle doubling x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s)); y = _mm_mul_ps(_mm_mul_ps(s, c), TWO); // calculate scaling factor to correct the magnitude // m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO)); // m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO)); m = vec_recip2(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y))); // perform second angle doubling c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)); s = _mm_mul_ps(_mm_mul_ps(y, x), TWO); // correct the magnitude (final sine / cosine approximations) c = _mm_mul_ps(c, m); s = _mm_mul_ps(s, m); /* c1 c2 c3 c4 s1 s2 s3 s4 R1 i1 R2 I2 R3 i3 R4 i4 R1 * c1 + (i1 * s1 * -1) i1 * c1 + R1 * s1 R2 * c2 + (i2 * s2 * -1) i2 * c2 + R2 * s2 */ x = d1; y = d2; x = _mm_shuffle_ps(x, x, 0xB1); y = _mm_shuffle_ps(y, y, 0xB1); x = _mm_mul_ps(x, R_NEG); y = _mm_mul_ps(y, R_NEG); cd1 = _mm_shuffle_ps(c, c, 0x50); // 01 01 00 00 AaBb => BBbb => c3c3c4c4 cd2 = _mm_shuffle_ps(c, c, 0xfa); // 11 11 10 10 AaBb => AAaa => c1c1c2c2 td1 = _mm_shuffle_ps(s, s, 0x50); td2 = _mm_shuffle_ps(s, s, 0xfa); cd1 = _mm_mul_ps(cd1, d1); cd2 = _mm_mul_ps(cd2, d2); td1 = _mm_mul_ps(td1, x); td2 = _mm_mul_ps(td2, y); cd1 = _mm_add_ps(cd1, td1); cd2 = _mm_add_ps(cd2, td2); // store chirped values _mm_stream_ps(chirped+0, cd1); _mm_stream_ps(chirped+4, cd2); } _mm_sfence(); if( i < ul_NumDataPoints) { // use original routine to finish up any tailings (max stride-1 elements) v_ChirpData(cx_DataArray+i, cx_ChirpDataArray+i , chirp_rate_ind, chirp_rate, ul_NumDataPoints-i, sample_rate); } analysis_state.FLOP_counter+=12.0*ul_NumDataPoints; return 0; }
void CalcGravity(int sp, PSpot* allSpot,int* length) { __m128d force1 = _mm_set1_pd(0); __m128d force2 = _mm_set1_pd(0); PSpot* spotSp = &allSpot[sp]; for(int i=0;i<sp;i++) { __m128d diff1 = _mm_sub_pd(allSpot[i].pos1, spotSp->pos1); __m128d diff2 = _mm_sub_sd(allSpot[i].pos2, spotSp->pos2); __m128d r = Length(diff1, diff2); if (r.m128d_f64[0]*2 < (spotSp->qmass + allSpot[i].qmass)) { if (allSpot[i].mass > spotSp->mass) { allSpot[i].heading1 = _mm_add_pd( allSpot[i].heading1, _mm_mul_pd( _mm_sub_pd(spotSp->heading1, allSpot[i].heading1), _mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass)) ) ); allSpot[i].heading2 = _mm_add_sd( allSpot[i].heading2, _mm_mul_sd( _mm_sub_sd(spotSp->heading2, allSpot[i].heading2), _mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass)) ) ); allSpot[i].mass += spotSp->mass; allSpot[i].qmass = pow(allSpot[i].mass, 0.33333); spotSp->mass = 0; (*length)--; PSpot temp; temp = allSpot[sp]; allSpot[sp] = allSpot[*length]; allSpot[*length] = temp; return; } else { spotSp->heading1 = _mm_add_pd( spotSp->heading1, _mm_mul_pd( _mm_sub_pd(allSpot[i].heading1, spotSp->heading1), _mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass)) ) ); spotSp->heading2 = _mm_add_sd( spotSp->heading2, _mm_mul_sd( _mm_sub_sd(allSpot[i].heading2, spotSp->heading2), _mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass)) ) ); spotSp->mass += allSpot[i].mass; spotSp->qmass = pow(spotSp->mass, 0.33333); allSpot[i].mass = 0; (*length)--; PSpot temp; temp = allSpot[i]; allSpot[i] = allSpot[*length]; allSpot[*length] = temp; return; } } //float f = (G * spotSp->mass * allSpot[i].mass) / (r.m128d_f64[0] * r.m128d_f64[0] * r.m128d_f64[0]); __m128d r1 = r; r1.m128d_f64[1] = G; __m128d r2 = r; r2.m128d_f64[1] = spotSp->mass; __m128d r3 = r; r3.m128d_f64[1] = allSpot[i].mass; __m128d r4 = _mm_mul_pd(_mm_mul_pd(r1, r2), r3); __m128d r5 = _mm_shuffle_pd(r4, r4, 3); r4 = _mm_shuffle_pd(r4, r4, 0); __m128d r6 = _mm_div_pd(r5, r4); force1 = 
_mm_add_pd(force1,_mm_mul_pd(diff1, r6)); force2 = _mm_add_sd(force2,_mm_mul_sd(diff2, r6)); } for(int i=sp+1;i<*length;i++) { __m128d diff1 = _mm_sub_pd(allSpot[i].pos1, spotSp->pos1); __m128d diff2 = _mm_sub_sd(allSpot[i].pos2, spotSp->pos2); __m128d r = Length(diff1, diff2); if (r.m128d_f64[0]*2 < (spotSp->qmass + allSpot[i].qmass)) { if (allSpot[i].mass > spotSp->mass) { allSpot[i].heading1 = _mm_add_pd( allSpot[i].heading1, _mm_mul_pd( _mm_sub_pd(spotSp->heading1, allSpot[i].heading1), _mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass)) ) ); allSpot[i].heading2 = _mm_add_sd( allSpot[i].heading2, _mm_mul_sd( _mm_sub_sd(spotSp->heading2, allSpot[i].heading2), _mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass)) ) ); allSpot[i].mass += spotSp->mass; allSpot[i].qmass = pow(allSpot[i].mass, 0.33333); spotSp->mass = 0; (*length)--; PSpot temp; temp = allSpot[sp]; allSpot[sp] = allSpot[*length]; allSpot[*length] = temp; return; } else { spotSp->heading1 = _mm_add_pd( spotSp->heading1, _mm_mul_pd( _mm_sub_pd(allSpot[i].heading1, spotSp->heading1), _mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass)) ) ); spotSp->heading2 = _mm_add_sd( spotSp->heading2, _mm_mul_sd( _mm_sub_sd(allSpot[i].heading2, spotSp->heading2), _mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass)) ) ); spotSp->mass += allSpot[i].mass; spotSp->qmass = pow(spotSp->mass, 0.33333); allSpot[i].mass = 0; (*length)--; PSpot temp; temp = allSpot[i]; allSpot[i] = allSpot[*length]; allSpot[*length] = temp; return; } } //float f = (G * spotSp->mass * allSpot[i].mass) / (r.m128d_f64[0] * r.m128d_f64[0] * r.m128d_f64[0]); __m128d r1 = r; r1.m128d_f64[1] = G; __m128d r2 = r; r2.m128d_f64[1] = spotSp->mass; __m128d r3 = r; r3.m128d_f64[1] = allSpot[i].mass; __m128d r4 = _mm_mul_pd(_mm_mul_pd(r1, r2), r3); __m128d r5 = _mm_shuffle_pd(r4, r4, 3); r4 = _mm_shuffle_pd(r4, r4, 0); __m128d r6 = _mm_div_pd(r5, r4); force1 = _mm_add_pd(force1,_mm_mul_pd(diff1, r6)); force2 = 
_mm_add_sd(force2,_mm_mul_sd(diff2, r6)); } force1 = _mm_div_pd(force1, _mm_set1_pd(spotSp->mass)); force2 = _mm_div_sd(force2, _mm_set1_pd(spotSp->mass)); __m128d forcef = Length(force1, force2); if (forcef.m128d_f64[0] > 0) { double gate = 0.001f; double step = gate / forcef.m128d_f64[0]; if (spotSp->process + step < 1) { spotSp->process += step; } else { step = 1 - spotSp->process; spotSp->process = 1; } __m128d stepd = _mm_set1_pd(step); spotSp->heading1 = _mm_add_pd(spotSp->heading1,_mm_mul_pd(force1,stepd)); spotSp->heading2 = _mm_add_sd(spotSp->heading2,_mm_mul_sd(force2,stepd)); spotSp->pos1 = _mm_add_pd(spotSp->pos1, _mm_mul_pd(spotSp->heading1,stepd)); spotSp->pos2 = _mm_add_sd(spotSp->pos2, _mm_mul_sd(spotSp->heading2,stepd)); } else { spotSp->pos1 = _mm_add_pd(spotSp->pos1, spotSp->heading1); spotSp->pos2 = _mm_add_sd(spotSp->pos2, spotSp->heading2); spotSp->process = 1; } }
void exchlaplacecoeffData_6(unsigned int slot) { for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((!neighbor_isValid[1][0])) { { double xPos; double yPos; /* Statements in this Scop: S902, S905, S908, S907, S901, S910, S904, S903, S906, S909, S900 */ { { { { { { { { { { { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+4558)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+4626)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+4558)] = 0.000000e+00; } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/6.400000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(6.400000e+01); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<63); i1 += 4) { /* yPos = ((((i1-1)/6.400000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<66); i1 += 1) { yPos = ((((i1-1)/6.400000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+9114)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+9182)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+9114)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { 
fieldData_LaplaceCoeff_6_p1[((i1*68)+2)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+70)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+2)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+31894)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+31962)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+31894)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+36450)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+36518)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+36450)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posBegin[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<63); i1 += 4) { /* xPos = posBegin[0]; */ __m128d vec0 = _mm_load1_pd((&posBegin[0])); __m128d vec0_2 = _mm_load1_pd((&posBegin[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<66); i1 += 1) { xPos = posBegin[0]; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+18226)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+18294)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+18226)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+13670)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+13738)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+13670)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { 
fieldData_LaplaceCoeff_6_p1[((i1*68)+27338)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+27406)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+27338)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+22782)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+22850)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+22782)] = 0.000000e+00; } } } } } if ((!neighbor_isValid[1][1])) { { double xPos; double yPos; /* Statements in this Scop: S920, S914, S917, S911, S913, S916, S919, S921, S918, S912, S915 */ { { { { { { { { { { { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+4622)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+4690)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+4622)] = 0.000000e+00; } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+31958)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+32026)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+31958)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+13734)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+13802)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+13734)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+66)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+134)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { 
fieldData_LaplaceCoeff_6_p1[((i1*68)+66)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+22846)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+22914)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+22846)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/6.400000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(6.400000e+01); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<63); i1 += 4) { /* yPos = ((((i1-1)/6.400000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<66); i1 += 1) { yPos = ((((i1-1)/6.400000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+18290)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+18358)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+18290)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+27402)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+27470)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+27402)] = 0.000000e+00; } } } { double* 
fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+36514)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+36582)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+36514)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i1 = 1; for (; (i1<=64); i1 += 2) { fieldData_LaplaceCoeff_6_p1[((i1*68)+9178)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[((i1*68)+9246)] = 0.000000e+00; } for (; (i1<=65); i1 += 1) { fieldData_LaplaceCoeff_6_p1[((i1*68)+9178)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posEnd[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<63); i1 += 4) { /* xPos = posEnd[0]; */ __m128d vec0 = _mm_load1_pd((&posEnd[0])); __m128d vec0_2 = _mm_load1_pd((&posEnd[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<66); i1 += 1) { xPos = posEnd[0]; } } } } } if ((!neighbor_isValid[1][2])) { { double xPos; double yPos; /* Statements in this Scop: S929, S923, S926, S931, S925, S928, S922, S930, S924, S927, S932 */ { { { { { { { { { { { int i2 = 2; for (; (i2<=65); i2 += 2) { xPos = ((((i2-2)/6.400000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = ((((i2-1)/6.400000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=66); i2 += 1) { xPos = ((((i2-2)/6.400000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+18292)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+18293)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+18292)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+36516)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+36517)] = 0.000000e+00; } for (; (i2<=66); 
i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+36516)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+31960)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+31961)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+31960)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+68)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+69)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+68)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+9180)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+9181)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+9180)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+22848)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+22849)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+22848)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+27404)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+27405)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+27404)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+4624)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+4625)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+4624)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = 
(&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+13736)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+13737)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+13736)] = 0.000000e+00; } } } { int i2 = 2; for (; (i2<=65); i2 += 2) { yPos = posBegin[1]; yPos = posBegin[1]; } for (; (i2<=66); i2 += 1) { yPos = posBegin[1]; } } } } } if ((!neighbor_isValid[1][3])) { { double xPos; double yPos; /* Statements in this Scop: S941, S935, S938, S943, S940, S934, S937, S942, S936, S939, S933 */ { { { { { { { { { { { int i2 = 2; for (; (i2<=65); i2 += 2) { xPos = ((((i2-2)/6.400000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = ((((i2-1)/6.400000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=66); i2 += 1) { xPos = ((((i2-2)/6.400000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+36312)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+36313)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+36312)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+22644)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+22645)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+22644)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+13532)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+13533)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+13532)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+40868)] = 
0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+40869)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+40868)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+18088)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+18089)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+18088)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+4420)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+4421)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+4420)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+31756)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+31757)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+31756)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+27200)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+27201)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+27200)] = 0.000000e+00; } } } { int i2 = 2; for (; (i2<=65); i2 += 2) { yPos = posEnd[1]; yPos = posEnd[1]; } for (; (i2<=66); i2 += 1) { yPos = posEnd[1]; } } } { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]); int i2 = 2; for (; (i2<=65); i2 += 2) { fieldData_LaplaceCoeff_6_p1[(i2+8976)] = 0.000000e+00; fieldData_LaplaceCoeff_6_p1[(i2+8977)] = 0.000000e+00; } for (; (i2<=66); i2 += 1) { fieldData_LaplaceCoeff_6_p1[(i2+8976)] = 0.000000e+00; } } } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if 
((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { /* Statements in this Scop: S944 */ for (int i3 = 0; (i3<=8); i3 += 1) { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][(i3*4556)]); double* buffer_Send_1_p1 = (&buffer_Send[1][(i3*65)]); int i4 = 1; for (; (i4<=64); i4 += 2) { buffer_Send_1_p1[(i4-1)] = fieldData_LaplaceCoeff_6_p1[((i4*68)+66)]; buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_6_p1[((i4*68)+134)]; } for (; (i4<=65); i4 += 1) { buffer_Send_1_p1[(i4-1)] = fieldData_LaplaceCoeff_6_p1[((i4*68)+66)]; } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { MPI_Isend(buffer_Send[1], 585, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { MPI_Irecv(buffer_Recv[0], 585, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); reqOutstanding_Recv[0] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { /* Statements in this Scop: S945 */ for (int i3 = 0; (i3<=8); i3 += 1) { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][(i3*4556)]); double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i3*65)]); int i4 = 3; for (; (i4<=66); i4 += 2) { fieldData_LaplaceCoeff_6_p1[((i4*68)-134)] = buffer_Recv_0_p1[(i4-3)]; 
fieldData_LaplaceCoeff_6_p1[((i4*68)-66)] = buffer_Recv_0_p1[(i4-2)]; } for (; (i4<=67); i4 += 1) { fieldData_LaplaceCoeff_6_p1[((i4*68)-134)] = buffer_Recv_0_p1[(i4-3)]; } } } } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) { MPI_Isend(&fieldData_LaplaceCoeff[6][4422], 1, mpiDatatype_9_65_4556, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) { MPI_Irecv(&fieldData_LaplaceCoeff[6][70], 1, mpiDatatype_9_65_4556, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { /* Statements in this Scop: S946 */ for (int i3 = 0; (i3<=8); i3 += 1) 
{ double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][(i3*4556)]); double* buffer_Send_0_p1 = (&buffer_Send[0][(i3*67)]); int i4 = 0; for (; (i4<=65); i4 += 2) { buffer_Send_0_p1[i4] = fieldData_LaplaceCoeff_6_p1[((i4*68)+3)]; buffer_Send_0_p1[(i4+1)] = fieldData_LaplaceCoeff_6_p1[((i4*68)+71)]; } for (; (i4<=66); i4 += 1) { buffer_Send_0_p1[i4] = fieldData_LaplaceCoeff_6_p1[((i4*68)+3)]; } } } if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { /* Statements in this Scop: S947 */ for (int i3 = 0; (i3<=8); i3 += 1) { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][(i3*4556)]); double* buffer_Send_1_p1 = (&buffer_Send[1][(i3*67)]); int i4 = 0; for (; (i4<=65); i4 += 2) { buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_6_p1[((i4*68)+65)]; buffer_Send_1_p1[(i4+1)] = fieldData_LaplaceCoeff_6_p1[((i4*68)+133)]; } for (; (i4<=66); i4 += 1) { buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_6_p1[((i4*68)+65)]; } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { MPI_Isend(buffer_Send[0], 603, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]); reqOutstanding_Send[0] = true; } if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { MPI_Isend(buffer_Send[1], 603, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { MPI_Irecv(buffer_Recv[0], 603, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); reqOutstanding_Recv[0] 
= true; } if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { MPI_Irecv(buffer_Recv[1], 603, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)(neighbor_fragCommId[1][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]); reqOutstanding_Recv[1] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } if (reqOutstanding_Recv[1]) { waitForMPIReq(&mpiRequest_Recv[1]); reqOutstanding_Recv[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { /* Statements in this Scop: S948 */ for (int i3 = 0; (i3<=8); i3 += 1) { double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][(i3*4556)]); double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i3*67)]); int i4 = 1; for (; (i4<=66); i4 += 2) { fieldData_LaplaceCoeff_6_p1[((i4*68)-67)] = buffer_Recv_0_p1[(i4-1)]; fieldData_LaplaceCoeff_6_p1[((i4*68)+1)] = buffer_Recv_0_p1[i4]; } for (; (i4<=67); i4 += 1) { fieldData_LaplaceCoeff_6_p1[((i4*68)-67)] = buffer_Recv_0_p1[(i4-1)]; } } } if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { /* Statements in this Scop: S949 */ for (int i3 = 0; (i3<=8); i3 += 1) { double* buffer_Recv_1_p1 = (&buffer_Recv[1][(i3*67)]); double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][(i3*4556)]); int i4 = 67; for (; (i4<=132); i4 += 2) { fieldData_LaplaceCoeff_6_p1[((i4*68)-4489)] = buffer_Recv_1_p1[(i4-67)]; fieldData_LaplaceCoeff_6_p1[((i4*68)-4421)] = buffer_Recv_1_p1[(i4-66)]; } for (; (i4<=133); i4 += 1) { fieldData_LaplaceCoeff_6_p1[((i4*68)-4489)] = buffer_Recv_1_p1[(i4-67)]; } } } } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[0]) { waitForMPIReq(&mpiRequest_Send[0]); reqOutstanding_Send[0] = false; } if 
(reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) { MPI_Isend(&fieldData_LaplaceCoeff[6][137], 1, mpiDatatype_9_67_4556, neighbor_remoteRank[1][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]); reqOutstanding_Send[2] = true; } if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) { MPI_Isend(&fieldData_LaplaceCoeff[6][4353], 1, mpiDatatype_9_67_4556, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) { MPI_Irecv(&fieldData_LaplaceCoeff[6][1], 1, mpiDatatype_9_67_4556, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = true; } if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) { MPI_Irecv(&fieldData_LaplaceCoeff[6][4489], 1, mpiDatatype_9_67_4556, neighbor_remoteRank[1][3], ((unsigned int)(neighbor_fragCommId[1][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]); reqOutstanding_Recv[3] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } if (reqOutstanding_Recv[3]) { waitForMPIReq(&mpiRequest_Recv[3]); reqOutstanding_Recv[3] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { 
if (isValidForSubdomain[1]) { ; ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[2]) { waitForMPIReq(&mpiRequest_Send[2]); reqOutstanding_Send[2] = false; } if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } }
/*
 * Helper for fft8b_: multiply a packed complex value by -i.
 * With z = (re, im) held in the (low, high) lanes the result is (im, -re).
 */
static inline __m128d fft8b_mul_neg_i_(__m128d z)
{
    /* _mm_set_sd(-0.0) has only the sign bit of the low lane set, so the XOR
     * negates re and leaves im untouched; the shuffle then swaps the lanes. */
    const __m128d re_negated = _mm_xor_pd(z, _mm_set_sd(-0.0));
    return _mm_shuffle_pd(re_negated, re_negated, 1);
}

/*
 * Helper for fft8b_: un-twiddled 8-point butterfly.
 *
 * Reads the eight complex inputs in[0], in[stride], ..., in[7*stride]
 * (each one an aligned pair of doubles) and writes output number k to out[k].
 * c81 must hold 1/sqrt(2) in both lanes.
 *
 * Scalar reference (x = real part, y = imaginary part of input n):
 *   even half: (in0 +/- in4), (in2 + in6), -i*(in2 - in6)
 *   odd  half: (in1 +/- in5), (in3 +/- in7)
 *   out[0], out[4] = (even sum) +/- (odd sum)
 *   out[2], out[6] = (even dif) +/- -i*(odd dif of sums)
 *   out[1], out[7] = p +/- g      out[5], out[3] = q +/- h
 * with p, q = (in0 - in4) +/- c81*((in1 - in5) - (in3 - in7))
 *      g, h = -i*(in2 - in6) +/- -i*c81*((in1 - in5) + (in3 - in7))
 */
static inline void fft8b_butterfly_(const double *in, int stride, __m128d c81, __m128d out[8])
{
    const __m128d in0 = _mm_load_pd(in);
    const __m128d in1 = _mm_load_pd(in + stride);
    const __m128d in2 = _mm_load_pd(in + 2 * stride);
    const __m128d in3 = _mm_load_pd(in + 3 * stride);
    const __m128d in4 = _mm_load_pd(in + 4 * stride);
    const __m128d in5 = _mm_load_pd(in + 5 * stride);
    const __m128d in6 = _mm_load_pd(in + 6 * stride);
    const __m128d in7 = _mm_load_pd(in + 7 * stride);

    /* Inputs 0, 4, 2, 6. */
    const __m128d sum04 = _mm_add_pd(in0, in4);
    const __m128d dif04 = _mm_sub_pd(in0, in4);
    const __m128d sum26 = _mm_add_pd(in2, in6);
    const __m128d rot26 = fft8b_mul_neg_i_(_mm_sub_pd(in2, in6));
    const __m128d even_sum = _mm_add_pd(sum04, sum26);
    const __m128d even_dif = _mm_sub_pd(sum04, sum26);

    /* Inputs 1, 5, 3, 7. */
    const __m128d sum15 = _mm_add_pd(in1, in5);
    const __m128d dif15 = _mm_sub_pd(in1, in5);
    const __m128d sum37 = _mm_add_pd(in3, in7);
    const __m128d dif37 = _mm_sub_pd(in3, in7);
    const __m128d odd_sum = _mm_add_pd(sum15, sum37);
    const __m128d odd_rot = fft8b_mul_neg_i_(_mm_sub_pd(sum15, sum37));

    /* Terms that carry the 1/sqrt(2) factor (odd-numbered outputs). */
    const __m128d diag = _mm_mul_pd(c81, _mm_sub_pd(dif15, dif37));
    const __m128d p = _mm_add_pd(dif04, diag);
    const __m128d q = _mm_sub_pd(dif04, diag);
    const __m128d anti = fft8b_mul_neg_i_(_mm_mul_pd(c81, _mm_add_pd(dif15, dif37)));
    const __m128d g = _mm_add_pd(rot26, anti);
    const __m128d h = _mm_sub_pd(rot26, anti);

    out[0] = _mm_add_pd(even_sum, odd_sum);
    out[1] = _mm_add_pd(p, g);
    out[2] = _mm_add_pd(even_dif, odd_rot);
    out[3] = _mm_sub_pd(q, h);
    out[4] = _mm_sub_pd(even_sum, odd_sum);
    out[5] = _mm_add_pd(q, h);
    out[6] = _mm_sub_pd(even_dif, odd_rot);
    out[7] = _mm_sub_pd(p, g);
}

/*
 * fft8b_ - one radix-8 butterfly pass over interleaved complex data (SSE2).
 *
 * For every j in [0, *l) and i in [0, *m), and k = 0..7:
 *   input  k is the complex number at a[2*(i + *m * (j + *l * k))]
 *   output k is stored at             b[2*(i + *m * (8*j + k))]
 * For j >= 1, output k is multiplied by the k-th power of the twiddle factor
 * stored at (w[2*j], w[2*j + 1]); the powers are built with ZMUL.  For j == 0
 * no twiddle is read and the outputs are stored unscaled.
 *
 * Parameters:
 *   a  input array, interleaved (re, im) doubles; only read.
 *   b  output array, same layout; only written.
 *   w  twiddle factors, interleaved (re, im); entry 0 is never read.
 *   m  pointer to the inner count (passed by pointer; presumably for a
 *      Fortran-callable interface - confirm against callers).
 *   l  pointer to the outer count, passed the same way.
 *
 * Requirements: a, b and w must be 16-byte aligned, because every access goes
 * through _mm_load_pd / _mm_store_pd at an even double index.
 *
 * Returns 0 unconditionally.
 *
 * The constant 1/sqrt(2) is a plain local: the earlier function-scope static
 * was rewritten on every call, which is a shared write when the kernel is
 * invoked from several threads.
 */
int fft8b_(double *a, double *b, double *w, int *m, int *l)
{
    const __m128d c81 = _mm_set1_pd(0.70710678118654752);
    const int nm = *m;
    const int nl = *l;
    /* Distance, in doubles, between consecutive inputs / outputs of one butterfly. */
    const int in_stride = (nm * nl) << 1;
    const int out_stride = nm << 1;
    __m128d y[8];
    int i, j;

    /* j == 0: the twiddle factor is 1, so the results are stored directly. */
    for (i = 0; i < nm; i++) {
        double *dst = b + (i << 1);

        fft8b_butterfly_(a + (i << 1), in_stride, c81, y);

        _mm_store_pd(dst, y[0]);
        _mm_store_pd(dst + out_stride, y[1]);
        _mm_store_pd(dst + 2 * out_stride, y[2]);
        _mm_store_pd(dst + 3 * out_stride, y[3]);
        _mm_store_pd(dst + 4 * out_stride, y[4]);
        _mm_store_pd(dst + 5 * out_stride, y[5]);
        _mm_store_pd(dst + 6 * out_stride, y[6]);
        _mm_store_pd(dst + 7 * out_stride, y[7]);
    }

    for (j = 1; j < nl; j++) {
        /* Powers 1..7 of the twiddle factor for this j. */
        const __m128d w1 = _mm_load_pd(w + (j << 1));
        const __m128d w2 = ZMUL(w1, w1);
        const __m128d w3 = ZMUL(w1, w2);
        const __m128d w4 = ZMUL(w2, w2);
        const __m128d w5 = ZMUL(w2, w3);
        const __m128d w6 = ZMUL(w3, w3);
        const __m128d w7 = ZMUL(w3, w4);
        const int in_base = (j * nm) << 1;
        const int out_base = (j * nm) << 4;

        for (i = 0; i < nm; i++) {
            double *dst = b + ((i << 1) + out_base);

            fft8b_butterfly_(a + ((i << 1) + in_base), in_stride, c81, y);

            _mm_store_pd(dst, y[0]);
            _mm_store_pd(dst + out_stride, ZMUL(w1, y[1]));
            _mm_store_pd(dst + 2 * out_stride, ZMUL(w2, y[2]));
            _mm_store_pd(dst + 3 * out_stride, ZMUL(w3, y[3]));
            _mm_store_pd(dst + 4 * out_stride, ZMUL(w4, y[4]));
            _mm_store_pd(dst + 5 * out_stride, ZMUL(w5, y[5]));
            _mm_store_pd(dst + 6 * out_stride, ZMUL(w6, y[6]));
            _mm_store_pd(dst + 7 * out_stride, ZMUL(w7, y[7]));
        }
    }
    return 0;
}
/*
 * nb_kernel430_ia32_sse2 - SSE2 double-precision pair-interaction kernel.
 *
 * For each of the *p_nri outer entries n the kernel takes one "i" atom
 * (iinr[n]), shifts its position by shiftvec[3*shift[n] ..], and loops over
 * the "j" atoms jjnr[jindex[n] .. jindex[n+1]-1].  j atoms are processed two
 * at a time (one per SIMD lane); an odd leftover atom is handled by a scalar
 * (_sd) tail.  Per pair it evaluates, as labelled by the inline comments:
 *   - plain Coulomb:            vcoul = qq * rinv
 *   - a polarization (GB) term: cubic-spline lookup in GBtab
 *   - Lennard-Jones:            cubic-spline lookup in VFtab, dispersion
 *                               scaled by c6 and repulsion scaled by c12
 *
 * Outputs (all accumulated in place):
 *   faction   forces on i and j atoms          fshift   shift forces
 *   vc        Coulomb energy per gid[n]        vvdw     LJ energy per gid[n]
 *   gbdata->gpol  GB energy per gid[n]         dvda     per-atom dvda sums
 *   *outeriter = nri, *inneriter = last jindex[n+1] seen (0 if nri == 0)
 *
 * `work` is cast to gmx_gbdata_t* and supplies gpol, epsilon_r and
 * gb_epsilon_solvent.
 *
 * Not read by this function: p_krf, p_crf, p_nthreads, count, mtx.
 * Declared but never used: nthreads, offset, rinvsix, dvdaj, zero.
 * NOTE(review): rinvsq is computed in both paths but never read afterwards.
 *
 * GBtab and VFtab are accessed with _mm_load_pd, so they must be 16-byte
 * aligned (see the original comment below).
 */
void nb_kernel430_ia32_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr, int * shift,
                            double * shiftvec, double * fshift, int * gid, double * pos,
                            double * faction, double * charge, double * p_facel, double * p_krf,
                            double * p_crf, double * vc, int * type, int * p_ntype,
                            double * vdwparam, double * vvdw, double * p_tabscale, double * VFtab,
                            double * invsqrta, double * dvda, double * p_gbtabscale, double * GBtab,
                            int * p_nthreads, int * count, void * mtx, int * outeriter,
                            int * inneriter, double * work)
{
    int nri,ntype,nthreads;
    int n,ii,is3,ii3,k,nj0,nj1,ggid;
    double shX,shY,shZ;
    int offset,nti;
    int jnrA,jnrB;
    int j3A,j3B;
    int tjA,tjB;
    gmx_gbdata_t *gbdata;
    double * gpol;

    __m128d iq,qq,jq,isai;
    __m128d ix,iy,iz;
    __m128d jx,jy,jz;
    __m128d dx,dy,dz;
    __m128d vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
    __m128d fix,fiy,fiz,tx,ty,tz,rsq;
    __m128d rinv,isaj,isaprod;
    __m128d vcoul,fscal,gbscale,c6,c12;
    __m128d rinvsq,r,rtab;
    __m128d eps,Y,F,G,H;
    __m128d VV,FF,Fp;
    __m128d vgb,fijGB,dvdatmp;
    __m128d rinvsix,vvdw6,vvdw12,vvdwtmp;
    __m128d facel,gbtabscale,dvdaj;
    __m128d fijD,fijR;
    __m128d xmm1,tabscale,eps2;
    __m128i n0, nnn;

    const __m128d neg = _mm_set1_pd(-1.0);
    const __m128d zero = _mm_set1_pd(0.0);
    const __m128d minushalf = _mm_set1_pd(-0.5);
    const __m128d two = _mm_set1_pd(2.0);

    gbdata = (gmx_gbdata_t *)work;
    gpol = gbdata->gpol;

    nri = *p_nri;
    ntype = *p_ntype;

    /* Scalar parameters broadcast to both lanes. */
    gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));
    gbtabscale = _mm_load1_pd(p_gbtabscale);
    facel = _mm_load1_pd(p_facel);
    tabscale = _mm_load1_pd(p_tabscale);

    /* nj1 must be defined even when nri == 0 because it is reported via *inneriter. */
    nj1 = 0;
    jnrA = jnrB = 0;
    j3A = j3B = 0;
    jx = _mm_setzero_pd();
    jy = _mm_setzero_pd();
    jz = _mm_setzero_pd();
    c6 = _mm_setzero_pd();
    c12 = _mm_setzero_pd();

    for(n=0;n<nri;n++)
    {
        /* Shift vector and j-list bounds for this outer entry. */
        is3 = 3*shift[n];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = jindex[n];
        nj1 = jindex[n+1];
        ii = iinr[n];
        ii3 = 3*ii;

        /* Shifted i-atom position, broadcast to both lanes. */
        ix = _mm_set1_pd(shX+pos[ii3+0]);
        iy = _mm_set1_pd(shY+pos[ii3+1]);
        iz = _mm_set1_pd(shZ+pos[ii3+2]);

        /* i-atom charge pre-multiplied by facel. */
        iq = _mm_load1_pd(charge+ii);
        iq = _mm_mul_pd(iq,facel);

        isai = _mm_load1_pd(invsqrta+ii);

        /* Row offset into vdwparam: two values (c6, c12) per type pair. */
        nti = 2*ntype*type[ii];

        /* Per-i-atom accumulators. */
        vctot = _mm_setzero_pd();
        vvdwtot = _mm_setzero_pd();
        vgbtot = _mm_setzero_pd();
        dvdasum = _mm_setzero_pd();
        fix = _mm_setzero_pd();
        fiy = _mm_setzero_pd();
        fiz = _mm_setzero_pd();

        /* Main loop: two j atoms (A in one lane, B in the other) per iteration. */
        for(k=nj0;k<nj1-1; k+=2)
        {
            jnrA = jjnr[k];
            jnrB = jjnr[k+1];

            j3A = jnrA * 3;
            j3B = jnrB * 3;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);

            /* Distance vector i - j and its inverse length. */
            dx = _mm_sub_pd(ix,jx);
            dy = _mm_sub_pd(iy,jy);
            dz = _mm_sub_pd(iz,jz);

            rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);

            rinv = gmx_mm_invsqrt_pd(rsq);
            rinvsq = _mm_mul_pd(rinv,rinv); /* NOTE(review): not read afterwards */

            /***********************************/
            /* INTERACTION SECTION STARTS HERE */
            /***********************************/
            GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
            GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);

            /* Lennard-Jones */
            tjA = nti+2*type[jnrA];
            tjB = nti+2*type[jnrB];

            GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);

            isaprod = _mm_mul_pd(isai,isaj);
            /* Coulomb: vcoul = qq/r; fscal temporarily holds qq/r^2. */
            qq = _mm_mul_pd(iq,jq);
            vcoul = _mm_mul_pd(qq,rinv);
            fscal = _mm_mul_pd(vcoul,rinv);
            vctot = _mm_add_pd(vctot,vcoul);

            /* Polarization interaction */
            qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
            gbscale = _mm_mul_pd(isaprod,gbtabscale);

            /* Calculate GB table index */
            r = _mm_mul_pd(rsq,rinv);
            rtab = _mm_mul_pd(r,gbscale);

            /* n0 = truncated table position, eps = fractional part;
             * nnn = 4*n0 because each GB table point holds four doubles (Y,F,G,H). */
            n0 = _mm_cvttpd_epi32(rtab);
            eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
            nnn = _mm_slli_epi32(n0,2);

            /* the tables are 16-byte aligned, so we can use _mm_load_pd */
            Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
            F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
            H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
            GMX_MM_TRANSPOSE2_PD(G,H);

            /* Cubic spline: value  Y + eps*(F + G*eps + H*eps^2),
             *               slope  F + 2*G*eps + 3*H*eps^2 (left in F). */
            G = _mm_mul_pd(G,eps);
            H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
            F = _mm_add_pd(F, _mm_add_pd( G , H ) );
            Y = _mm_add_pd(Y, _mm_mul_pd(F, eps));
            F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
            vgb = _mm_mul_pd(Y, qq);
            fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));

            /* dvdatmp = -0.5*(vgb + fijGB*r); summed for the i atom and,
             * scaled by isaj^2, added to dvda of both j atoms. */
            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);

            vgbtot = _mm_add_pd(vgbtot, vgb);

            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));

            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);

            /* Calculate VDW table index */
            rtab = _mm_mul_pd(r,tabscale);
            n0 = _mm_cvttpd_epi32(rtab);
            eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
            eps2 = _mm_mul_pd(eps,eps);
            /* nnn = 8*n0: eight doubles per point (offsets 0..3 and 4..7 are used below). */
            nnn = _mm_slli_epi32(n0,3);

            /* Dispersion */
            Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
            F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
            H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
            GMX_MM_TRANSPOSE2_PD(G,H);

            G = _mm_mul_pd(G,eps);
            H = _mm_mul_pd(H,eps2);
            Fp = _mm_add_pd(F,G);
            Fp = _mm_add_pd(Fp,H);
            VV = _mm_mul_pd(Fp,eps);
            VV = _mm_add_pd(Y,VV);
            xmm1 = _mm_mul_pd(two,H);
            FF = _mm_add_pd(Fp,G);
            FF = _mm_add_pd(FF,xmm1);

            vvdw6 = _mm_mul_pd(c6,VV);
            fijD = _mm_mul_pd(c6,FF);

            /* Repulsion (table offsets +4..+7, scaled by c12) */
            Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
            F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
            H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
            GMX_MM_TRANSPOSE2_PD(G,H);

            G = _mm_mul_pd(G,eps);
            H = _mm_mul_pd(H,eps2);
            Fp = _mm_add_pd(F,G);
            Fp = _mm_add_pd(Fp,H);
            VV = _mm_mul_pd(Fp,eps);
            VV = _mm_add_pd(Y,VV);
            xmm1 = _mm_mul_pd(two,H);
            FF = _mm_add_pd(Fp,G);
            FF = _mm_add_pd(FF,xmm1);

            vvdw12 = _mm_mul_pd(c12,VV);
            fijR = _mm_mul_pd(c12,FF);

            vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
            vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);

            /* fscal = -((fijD + fijR)*tabscale + fijGB - qq/r^2) / r */
            xmm1 = _mm_add_pd(fijD,fijR);
            xmm1 = _mm_mul_pd(xmm1,tabscale);
            xmm1 = _mm_add_pd(xmm1,fijGB);
            xmm1 = _mm_sub_pd(xmm1,fscal);
            fscal = _mm_mul_pd(xmm1,neg);
            fscal = _mm_mul_pd(fscal,rinv);

            /***********************************/
            /* INTERACTION SECTION ENDS HERE */
            /***********************************/

            /* Calculate temporary vectorial force */
            tx = _mm_mul_pd(fscal,dx);
            ty = _mm_mul_pd(fscal,dy);
            tz = _mm_mul_pd(fscal,dz);

            /* Increment i atom force */
            fix = _mm_add_pd(fix,tx);
            fiy = _mm_add_pd(fiy,ty);
            fiz = _mm_add_pd(fiz,tz);

            /* Store j forces back */
            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
        }

        /* In double precision, offset can only be either 0 or 1 */
        /* i.e. at most one j atom is left over; it is handled with scalar (_sd) ops. */
        if(k<nj1)
        {
            jnrA = jjnr[k];
            j3A = jnrA * 3;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);

            dx = _mm_sub_sd(ix,jx);
            dy = _mm_sub_sd(iy,jy);
            dz = _mm_sub_sd(iz,jz);

            rsq = gmx_mm_calc_rsq_pd(dx,dy,dz);

            rinv = gmx_mm_invsqrt_pd(rsq);
            rinvsq = _mm_mul_sd(rinv,rinv); /* NOTE(review): not read afterwards */

            /* The reason for zeroing these variables here is for fixing bug 585.
             * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
             * and r1=0, but it should be r1=a[1].
             * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
             * To work around it, we zero these variables and use _mm_add_pd (**) instead.
             * Note that the only variables that get affected are the energies since
             * the total sum needs to be correct.
             *
             * NOTE(review): each of these is overwritten by an _sd result below, whose
             * upper lane is copied from its first operand; the (**) full-width adds
             * therefore presumably rely on the 1-value load macros and the transposes
             * leaving zero in the upper lane - confirm against their definitions. */
            vgb = _mm_setzero_pd();
            vcoul = _mm_setzero_pd();
            dvdatmp = _mm_setzero_pd();
            vvdw6 = _mm_setzero_pd();
            vvdw12 = _mm_setzero_pd();

            /***********************************/
            /* INTERACTION SECTION STARTS HERE */
            /***********************************/
            GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
            GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);

            /* Lennard-Jones */
            tjA = nti+2*type[jnrA];

            GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);

            isaprod = _mm_mul_sd(isai,isaj);
            qq = _mm_mul_sd(jq,iq);
            vcoul = _mm_mul_sd(qq,rinv);
            fscal = _mm_mul_sd(vcoul,rinv);
            vctot = _mm_add_pd(vctot,vcoul); /* (**) */

            /* Polarization interaction */
            qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
            gbscale = _mm_mul_sd(isaprod,gbtabscale);

            /* Calculate GB table index */
            r = _mm_mul_sd(rsq,rinv);
            rtab = _mm_mul_sd(r,gbscale);

            n0 = _mm_cvttpd_epi32(rtab);
            eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
            nnn = _mm_slli_epi32(n0,2);

            /* the tables are 16-byte aligned, so we can use _mm_load_pd */
            /* Only table index 0 is loaded; the partner register is zero. */
            Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
            F = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
            H = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(G,H);

            /* Same cubic-spline evaluation as in the two-atom loop. */
            G = _mm_mul_sd(G,eps);
            H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
            F = _mm_add_sd(F, _mm_add_sd( G , H ) );
            Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
            F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
            vgb = _mm_mul_sd(Y, qq);
            fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));

            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);

            vgbtot = _mm_add_pd(vgbtot, vgb); /* (**) */

            dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));

            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);

            /* Calculate VDW table index */
            rtab = _mm_mul_sd(r,tabscale);
            n0 = _mm_cvttpd_epi32(rtab);
            eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
            eps2 = _mm_mul_sd(eps,eps);
            nnn = _mm_slli_epi32(n0,3);

            /* Dispersion */
            Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));
            F = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2);
            H = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(G,H);

            G = _mm_mul_sd(G,eps);
            H = _mm_mul_sd(H,eps2);
            Fp = _mm_add_sd(F,G);
            Fp = _mm_add_sd(Fp,H);
            VV = _mm_mul_sd(Fp,eps);
            VV = _mm_add_sd(Y,VV);
            xmm1 = _mm_mul_sd(two,H);
            FF = _mm_add_sd(Fp,G);
            FF = _mm_add_sd(FF,xmm1);

            vvdw6 = _mm_mul_sd(c6,VV);
            fijD = _mm_mul_sd(c6,FF);

            /* Repulsion (table offsets +4..+7, scaled by c12) */
            Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4);
            F = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6);
            H = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(G,H);

            G = _mm_mul_sd(G,eps);
            H = _mm_mul_sd(H,eps2);
            Fp = _mm_add_sd(F,G);
            Fp = _mm_add_sd(Fp,H);
            VV = _mm_mul_sd(Fp,eps);
            VV = _mm_add_sd(Y,VV);
            xmm1 = _mm_mul_sd(two,H);
            FF = _mm_add_sd(Fp,G);
            FF = _mm_add_sd(FF,xmm1);

            vvdw12 = _mm_mul_sd(c12,VV);
            fijR = _mm_mul_sd(c12,FF);

            vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
            vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp); /* (**) */

            /* fscal = -((fijD + fijR)*tabscale + fijGB - qq/r^2) / r */
            xmm1 = _mm_add_sd(fijD,fijR);
            xmm1 = _mm_mul_sd(xmm1,tabscale);
            xmm1 = _mm_add_sd(xmm1,fijGB);
            xmm1 = _mm_sub_sd(xmm1,fscal);
            fscal = _mm_mul_sd(xmm1,neg);
            fscal = _mm_mul_sd(fscal,rinv);

            /***********************************/
            /* INTERACTION SECTION ENDS HERE */
            /***********************************/

            /* Calculate temporary vectorial force */
            tx = _mm_mul_sd(fscal,dx);
            ty = _mm_mul_sd(fscal,dy);
            tz = _mm_mul_sd(fscal,dz);

            /* Increment i atom force */
            fix = _mm_add_sd(fix,tx);
            fiy = _mm_add_sd(fiy,ty);
            fiz = _mm_add_sd(fiz,tz);

            /* Store j forces back */
            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
        }

        /* Scale the i-atom dvda sum by isai^2 before it is written out. */
        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);

        /* Write the accumulated energies into the slots selected by gid[n]. */
        ggid = gid[n];

        gmx_mm_update_1pot_pd(vctot,vc+ggid);
        gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
        gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
        gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
    }

    *outeriter = nri;
    *inneriter = nj1;
}
void exchlaplacecoeff_gmrfData_5(unsigned int slot) { for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((!neighbor_isValid[0][0])) { { double xPos; double yPos; /* Statements in this Scop: S1306, S1309, S1300, S1308, S1302, S1305, S1310, S1304, S1307, S1301, S1303 */ { { { { { { { { { { { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+6302)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+6338)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+6302)] = 0.000000e+00; } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+3782)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+3818)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+3782)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+7562)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+7598)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+7562)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+38)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2522)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2558)] = 0.000000e+00; } for (; 
(i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2522)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posBegin[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<31); i1 += 4) { /* xPos = posBegin[0]; */ __m128d vec0 = _mm_load1_pd((&posBegin[0])); __m128d vec0_2 = _mm_load1_pd((&posBegin[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<34); i1 += 1) { xPos = posBegin[0]; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+8822)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+8858)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+8822)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+1262)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+1298)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+1262)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/3.200000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(3.200000e+01); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<31); i1 += 4) { /* yPos = ((((i1-1)/3.200000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<34); i1 += 1) { yPos = 
((((i1-1)/3.200000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+5042)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+5078)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+5042)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+10082)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+10118)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+10082)] = 0.000000e+00; } } } } } if ((!neighbor_isValid[0][1])) { { double xPos; double yPos; /* Statements in this Scop: S1312, S1320, S1314, S1317, S1311, S1319, S1313, S1316, S1321, S1315, S1318 */ { { { { { { { { { { { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+1294)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+1330)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+1294)] = 0.000000e+00; } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+5074)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+5110)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+5074)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+8854)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+8890)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { 
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+8854)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2554)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2590)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2554)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/3.200000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(3.200000e+01); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<31); i1 += 4) { /* yPos = ((((i1-1)/3.200000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<34); i1 += 1) { yPos = ((((i1-1)/3.200000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+34)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+70)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+34)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+3814)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+3850)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { 
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+3814)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+7594)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+7630)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+7594)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+6334)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+6370)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+6334)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posEnd[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<31); i1 += 4) { /* xPos = posEnd[0]; */ __m128d vec0 = _mm_load1_pd((&posEnd[0])); __m128d vec0_2 = _mm_load1_pd((&posEnd[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<34); i1 += 1) { xPos = posEnd[0]; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i1 = 1; for (; (i1<=32); i1 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+10114)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+10150)] = 0.000000e+00; } for (; (i1<=33); i1 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+10114)] = 0.000000e+00; } } } } } if ((!neighbor_isValid[0][2])) { { double xPos; double yPos; /* Statements in this Scop: S1327, S1332, S1326, S1329, S1323, S1322, S1331, S1325, S1328, S1330, S1324 */ { { { { { { { { { { { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+5076)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+5077)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+5076)] = 0.000000e+00; } } { double* 
fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+8856)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+8857)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+8856)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+6336)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+6337)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+6336)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+7596)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+7597)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+7596)] = 0.000000e+00; } } } { int i2 = 2; for (; (i2<=33); i2 += 2) { xPos = ((((i2-2)/3.200000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = ((((i2-1)/3.200000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=34); i2 += 1) { xPos = ((((i2-2)/3.200000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+36)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+37)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+36)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+2556)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+2557)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+2556)] = 0.000000e+00; } } } { double* 
fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+10116)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+10117)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+10116)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+3816)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+3817)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+3816)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+1296)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+1297)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+1296)] = 0.000000e+00; } } } { int i2 = 2; for (; (i2<=33); i2 += 2) { yPos = posBegin[1]; yPos = posBegin[1]; } for (; (i2<=34); i2 += 1) { yPos = posBegin[1]; } } } } } if ((!neighbor_isValid[0][3])) { { double xPos; double yPos; /* Statements in this Scop: S1338, S1341, S1335, S1340, S1343, S1337, S1334, S1333, S1342, S1336, S1339 */ { { { { { { { { { { { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+7488)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+7489)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+7488)] = 0.000000e+00; } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+2448)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+2449)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+2448)] = 
0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+11268)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+11269)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+11268)] = 0.000000e+00; } } } { int i2 = 2; for (; (i2<=33); i2 += 2) { yPos = posEnd[1]; yPos = posEnd[1]; } for (; (i2<=34); i2 += 1) { yPos = posEnd[1]; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+10008)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+10009)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+10008)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+6228)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+6229)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+6228)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+8748)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+8749)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+8748)] = 0.000000e+00; } } } { int i2 = 2; for (; (i2<=33); i2 += 2) { xPos = ((((i2-2)/3.200000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = ((((i2-1)/3.200000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=34); i2 += 1) { xPos = ((((i2-2)/3.200000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+3708)] = 0.000000e+00; 
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+3709)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+3708)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+1188)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+1189)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+1188)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]); int i2 = 2; for (; (i2<=33); i2 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+4968)] = 0.000000e+00; fieldData_LaplaceCoeff_GMRF_5_p1[(i2+4969)] = 0.000000e+00; } for (; (i2<=34); i2 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[(i2+4968)] = 0.000000e+00; } } } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { /* Statements in this Scop: S1344 */ for (int i3 = 0; (i3<=8); i3 += 1) { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][(i3*1260)]); double* buffer_Send_1_p1 = (&buffer_Send[1][(i3*33)]); int i4 = 1; for (; (i4<=32); i4 += 2) { buffer_Send_1_p1[(i4-1)] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+34)]; buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+70)]; } for (; (i4<=33); i4 += 1) { buffer_Send_1_p1[(i4-1)] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+34)]; } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { MPI_Isend(buffer_Send[1], 297, MPI_DOUBLE, neighbor_remoteRank[0][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if 
((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { MPI_Irecv(buffer_Recv[0], 297, MPI_DOUBLE, neighbor_remoteRank[0][0], ((unsigned int)(neighbor_fragCommId[0][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); reqOutstanding_Recv[0] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { /* Statements in this Scop: S1345 */ for (int i3 = 0; (i3<=8); i3 += 1) { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][(i3*1260)]); double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i3*33)]); int i4 = 3; for (; (i4<=34); i4 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-70)] = buffer_Recv_0_p1[(i4-3)]; fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-34)] = buffer_Recv_0_p1[(i4-2)]; } for (; (i4<=35); i4 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-70)] = buffer_Recv_0_p1[(i4-3)]; } } } } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) { MPI_Isend(&fieldData_LaplaceCoeff_GMRF[5][1190], 1, mpiDatatype_9_33_1260, neighbor_remoteRank[0][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if 
((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) { MPI_Irecv(&fieldData_LaplaceCoeff_GMRF[5][38], 1, mpiDatatype_9_33_1260, neighbor_remoteRank[0][2], ((unsigned int)(neighbor_fragCommId[0][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { /* Statements in this Scop: S1346 */ for (int i3 = 0; (i3<=8); i3 += 1) { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][(i3*1260)]); double* buffer_Send_0_p1 = (&buffer_Send[0][(i3*35)]); int i4 = 0; for (; (i4<=33); i4 += 2) { buffer_Send_0_p1[i4] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+3)]; buffer_Send_0_p1[(i4+1)] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+39)]; } for (; (i4<=34); i4 += 1) { buffer_Send_0_p1[i4] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+3)]; } } } if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { /* Statements in this Scop: S1347 */ for (int i3 = 0; (i3<=8); i3 += 1) { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][(i3*1260)]); double* buffer_Send_1_p1 = (&buffer_Send[1][(i3*35)]); int i4 = 0; for (; (i4<=33); i4 += 2) { buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+33)]; buffer_Send_1_p1[(i4+1)] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+69)]; } for (; (i4<=34); i4 += 1) { buffer_Send_1_p1[i4] = 
fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+33)]; } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { MPI_Isend(buffer_Send[0], 315, MPI_DOUBLE, neighbor_remoteRank[0][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]); reqOutstanding_Send[0] = true; } if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { MPI_Isend(buffer_Send[1], 315, MPI_DOUBLE, neighbor_remoteRank[0][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { MPI_Irecv(buffer_Recv[0], 315, MPI_DOUBLE, neighbor_remoteRank[0][0], ((unsigned int)(neighbor_fragCommId[0][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); reqOutstanding_Recv[0] = true; } if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { MPI_Irecv(buffer_Recv[1], 315, MPI_DOUBLE, neighbor_remoteRank[0][1], ((unsigned int)(neighbor_fragCommId[0][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]); reqOutstanding_Recv[1] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } if (reqOutstanding_Recv[1]) { waitForMPIReq(&mpiRequest_Recv[1]); reqOutstanding_Recv[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { /* Statements in this Scop: S1348 */ for (int i3 = 0; (i3<=8); i3 += 1) { double* fieldData_LaplaceCoeff_GMRF_5_p1 = 
(&fieldData_LaplaceCoeff_GMRF[5][(i3*1260)]); double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i3*35)]); int i4 = 1; for (; (i4<=34); i4 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-35)] = buffer_Recv_0_p1[(i4-1)]; fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+1)] = buffer_Recv_0_p1[i4]; } for (; (i4<=35); i4 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-35)] = buffer_Recv_0_p1[(i4-1)]; } } } if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { /* Statements in this Scop: S1349 */ for (int i3 = 0; (i3<=8); i3 += 1) { double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][(i3*1260)]); double* buffer_Recv_1_p1 = (&buffer_Recv[1][(i3*35)]); int i4 = 35; for (; (i4<=68); i4 += 2) { fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-1225)] = buffer_Recv_1_p1[(i4-35)]; fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-1189)] = buffer_Recv_1_p1[(i4-34)]; } for (; (i4<=69); i4 += 1) { fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-1225)] = buffer_Recv_1_p1[(i4-35)]; } } } } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Send[0]) { waitForMPIReq(&mpiRequest_Send[0]); reqOutstanding_Send[0] = false; } if (reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) { MPI_Isend(&fieldData_LaplaceCoeff_GMRF[5][73], 1, mpiDatatype_9_35_1260, neighbor_remoteRank[0][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]); reqOutstanding_Send[2] = true; } if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) { MPI_Isend(&fieldData_LaplaceCoeff_GMRF[5][1153], 1, mpiDatatype_9_35_1260, neighbor_remoteRank[0][3], ((unsigned int)commId << 16) + ((unsigned 
int)(neighbor_fragCommId[0][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) { MPI_Irecv(&fieldData_LaplaceCoeff_GMRF[5][1], 1, mpiDatatype_9_35_1260, neighbor_remoteRank[0][2], ((unsigned int)(neighbor_fragCommId[0][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = true; } if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) { MPI_Irecv(&fieldData_LaplaceCoeff_GMRF[5][1225], 1, mpiDatatype_9_35_1260, neighbor_remoteRank[0][3], ((unsigned int)(neighbor_fragCommId[0][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]); reqOutstanding_Recv[3] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } if (reqOutstanding_Recv[3]) { waitForMPIReq(&mpiRequest_Recv[3]); reqOutstanding_Recv[3] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Send[2]) { waitForMPIReq(&mpiRequest_Send[2]); reqOutstanding_Send[2] = false; } if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } }
// rotate a matrix U = su2_i*U where su2_i is an su2 matrix embedded in suN void su2_rotate( GLU_complex U[ NCNC ] , const GLU_complex s0 , const GLU_complex s1 , const size_t su2_index ) { #if NC == 3 __m128d *u = (__m128d*)U ; register const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ; register const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ; register __m128d tmp0 , tmp1 , a , b ; switch( su2_index%3 ) { // again I don't like this case 0 : // first one a = *( u + 0 ) ; b = *( u + 3 ) ; tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ; tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ; *( u + 0 ) = tmp0 ; *( u + 3 ) = tmp1 ; // second one a = *( u + 1 ) ; b = *( u + 4 ) ; tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ; tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ; *( u + 1 ) = tmp0 ; *( u + 4 ) = tmp1 ; // third a = *( u + 2 ) ; b = *( u + 5 ) ; tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ; tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ; *( u + 2 ) = tmp0 ; *( u + 5 ) = tmp1 ; break ; case 1 : // first one a = *( u + 3 ) ; b = *( u + 6 ) ; tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ; tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ; *( u + 3 ) = tmp0 ; *( u + 6 ) = tmp1 ; // second one a = *( u + 4 ) ; b = *( u + 7 ) ; tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ; tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ; *( u + 4 ) = tmp0 ; *( u + 7 ) = tmp1 ; // third a = *( u + 5 ) ; b = *( u + 8 ) ; tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ; tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ; *( u + 5 ) = tmp0 ; *( u + 8 ) = tmp1 ; break ; case 2 : // first one a = *( u + 0 ) ; b = *( u + 6 ) ; tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) , SSE2_MULCONJ( sm1 , b ) ) ; tmp1 = _mm_add_pd( SSE2_MUL( 
sm0 , b ) , SSE2_MUL( sm1 , a ) ) ; *( u + 0 ) = tmp0 ; *( u + 6 ) = tmp1 ; // second a = *( u + 1 ) ; b = *( u + 7 ) ; tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) , SSE2_MULCONJ( sm1 , b ) ) ; tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) , SSE2_MUL( sm1 , a ) ) ; *( u + 1 ) = tmp0 ; *( u + 7 ) = tmp1 ; // third a = *( u + 2 ) ; b = *( u + 8 ) ; tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) , SSE2_MULCONJ( sm1 , b ) ) ; tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) , SSE2_MUL( sm1 , a ) ) ; *( u + 2 ) = tmp0 ; *( u + 8 ) = tmp1 ; break ; } #elif NC == 2 __m128d *u = (__m128d*)U ; register const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ; register const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ; *( u + 0 ) = _mm_add_pd( SSE2_MUL( sm0 , *( u + 0 ) ) , SSE2_MUL( sm1 , *( u + 2 ) ) ) ; *( u + 1 ) = _mm_add_pd( SSE2_MUL( sm0 , *( u + 1 ) ) , SSE2_MUL( sm1 , *( u + 3 ) ) ) ; *( u + 2 ) = SSE_FLIP( SSE2_CONJ( *( u + 1 ) ) ) ; *( u + 3 ) = SSE2_CONJ( *( u + 0 ) ) ; #else // just a call to su2 multiply shortened_su2_multiply( U , s0 , s1 , su2_index ) ; #endif return ; }
/// SSE/FMA4 implementation of the scalar noise function.
///
/// @param EPoint           3D point at which the noise is evaluated.
/// @param noise_generator  Selects the variant: kNoiseGen_Perlin delegates to
///                         SolidNoise(); kNoiseGen_RangeCorrected rescales the
///                         lattice sum; any other value offsets it by 0.5.
/// @return Noise value clamped to the range [0, 1].
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator)
{
    DBL x, y, z;
    DBL *mp;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
    DBL sum;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator==kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables. Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0. Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        return sum;
    }

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (long)z; */
    /* JB fix for the range problem */

    // Pack (x, y) into one register and z into the low lane of another.
    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    // For negative coordinates subtract (1 - EPSILON) before truncating, so
    // the truncation toward zero below rounds them downwards instead.
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    // _mm_blendv_pd picks the shifted value in lanes whose sign bit is set.
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    // Offsets of the point inside its lattice cell (coordinate minus lattice index).
    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    // Lattice indices relative to NOISE_MIN*, reduced to 12 bits.
    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    // Hashes of the four (x, y) corners of the cell; "j" means index + 1.
    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    // RTable entries for the eight corners, grouped in (iy, jy) pairs that
    // are handed together to INCRSUMP2 below.
    mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];
    DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    // Despite the "i" prefix these hold the in-cell offsets broadcast to
    // both lanes; the "j" variants are the same offsets minus one.
    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    // Cubic blend weights s = f*f*(3 - 2*f) and t = 1 - s per axis
    // (_mm_nmacc_pd(a, b, c) evaluates -(a*b) + c).
    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    // (ty, sy), then (tx*ty, tx*sy) and (sx*ty, sx*sy).
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    // y offsets for the (iy, jy) corner pair processed per INCRSUMP2 call.
    __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm);

    // NOTE(review): mp_t1 .. sum_p are not referenced directly in this
    // function; presumably they are scratch registers used by name inside the
    // INCRSUMP2 macro (defined elsewhere) - do not rename without checking it.
    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm;
    __m128d int_sum1 = _mm_setzero_pd();

    // Each call handles two corners (lower lane: iy, upper lane: jy) weighted
    // by s_mm; presumably the result is accumulated into int_sum1 - confirm
    // against the macro definition.
    s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz);
    INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz);
    INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz);
    INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1);

    // Add the two lanes together; the total ends up in both lanes.
    int_sum1 = _mm_hadd_pd(int_sum1, int_sum1);

    if(noise_generator==kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        // _mm_macc_sd(a, b, c) = a*b + c, i.e. (sum + 1.05242) * 0.48985582.
        const __m128d r2 = _mm_set_sd(0.48985582);
        const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582);
        int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2);
    }
    else
    {
        int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5));
    }

    // Clamp the low lane to [0, 1] and hand it back as a scalar.
    int_sum1 = _mm_min_sd(one, int_sum1);
    int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1);
    _mm_store_sd(&sum, int_sum1);

    return (sum);
}
/*
 * Convolve the transmit signals with the channel impulse responses held in
 * desc and write the path-loss-scaled result to the receive buffers.
 *
 * For every receive antenna ii and every sample n in [0, length-dd):
 *
 *   rx[ii][n+dd] = path_loss * sum_j sum_l tx[j][n-l] * ch[ii+j*nb_rx][l]
 *
 * (complex product, tx[j][k] taken as 0 for k < 0), where
 * dd = |desc->channel_offset| and path_loss = 10^(path_loss_dB/20).
 *
 * desc          channel descriptor (taps, antenna counts, offset, path loss)
 * tx_sig_re/im  per-tx-antenna real/imaginary input, at least length-dd samples
 * rx_sig_re/im  per-rx-antenna real/imaginary output, at least length samples;
 *               indices [0, dd) are left untouched
 * length        number of samples in the block
 * keep_channel  non-zero: reuse the current taps; zero: draw a new channel
 *               realisation with random_channel() first
 *
 * Samples are processed two at a time with SSE2; an odd trailing sample is
 * handled by a scalar loop so that every output index in [dd, length) is
 * written.
 */
void multipath_channel(channel_desc_t *desc,
                       double **tx_sig_re,
                       double **tx_sig_im,
                       double **rx_sig_re,
                       double **rx_sig_im,
                       uint32_t length,
                       uint8_t keep_channel)
{
  int i,ii,j,l;
  int nb_samples, length1, length2, tail;
  __m128d rx_tmp128_re_f,rx_tmp128_im_f,rx_tmp128_re,rx_tmp128_im,
          rx_tmp128_1,rx_tmp128_2,rx_tmp128_3,rx_tmp128_4,
          tx128_re,tx128_im,ch128_x,ch128_y,pathloss128;

  double path_loss = pow(10,desc->path_loss_dB/20);
  int dd = abs(desc->channel_offset);

  pathloss128 = _mm_set1_pd(path_loss);

#ifdef DEBUG_CH
  printf("[CHANNEL] keep = %d : path_loss = %g (%f), nb_rx %d, nb_tx %d, dd %d, len %d \n",keep_channel,path_loss,desc->path_loss_dB,desc->nb_rx,desc->nb_tx,dd,desc->channel_length);
#endif

  if (!keep_channel) {
    random_channel(desc,0);
  }

  start_meas(&desc->convolution);

#ifdef DEBUG_CH

  for (l = 0; l<(int)desc->channel_length; l++) {
    printf("%p (%f,%f) ",desc->ch[0],desc->ch[0][l].x,desc->ch[0][l].y);
  }

  printf("\n");
#endif

  /* number of output samples, split into SIMD pairs plus an optional odd one */
  nb_samples = (int)length-dd;
  tail = (nb_samples > 0) ? (nb_samples%2) : 0;
  length1 = nb_samples-tail;
  length2 = length1/2;

  /* vector part: iteration i produces output samples 2*i and 2*i+1 */
  for (i=0; i<length2; i++) {
    for (ii=0; ii<desc->nb_rx; ii++) {
      rx_tmp128_re_f = _mm_setzero_pd();
      rx_tmp128_im_f = _mm_setzero_pd();

      for (j=0; j<desc->nb_tx; j++) {
        for (l = 0; l<(int)desc->channel_length; l++) {
          /* index of the tx sample that feeds the low lane for this tap */
          const int first = 2*i-l;

          if (first >= 0) {
            /* both lanes lie inside the tx buffer: tx[first], tx[first+1] */
            tx128_re = _mm_loadu_pd(&tx_sig_re[j][first]);
            tx128_im = _mm_loadu_pd(&tx_sig_im[j][first]);
          } else if (first == -1) {
            /* low lane is before the start of the signal (zero),
               high lane is tx[0] */
            tx128_re = _mm_set_pd(tx_sig_re[j][0],0.0);
            tx128_im = _mm_set_pd(tx_sig_im[j][0],0.0);
          } else {
            /* both lanes are before the start of the signal */
            tx128_re = _mm_setzero_pd();
            tx128_im = _mm_setzero_pd();
          }

          ch128_x = _mm_set1_pd(desc->ch[ii+(j*desc->nb_rx)][l].x);
          ch128_y = _mm_set1_pd(desc->ch[ii+(j*desc->nb_rx)][l].y);

          /* complex multiply-accumulate:
             re += tx.re*ch.x - tx.im*ch.y
             im += tx.re*ch.y + tx.im*ch.x */
          rx_tmp128_1 = _mm_mul_pd(tx128_re,ch128_x);
          rx_tmp128_2 = _mm_mul_pd(tx128_re,ch128_y);
          rx_tmp128_3 = _mm_mul_pd(tx128_im,ch128_x);
          rx_tmp128_4 = _mm_mul_pd(tx128_im,ch128_y);
          rx_tmp128_re = _mm_sub_pd(rx_tmp128_1,rx_tmp128_4);
          rx_tmp128_im = _mm_add_pd(rx_tmp128_2,rx_tmp128_3);
          rx_tmp128_re_f = _mm_add_pd(rx_tmp128_re_f,rx_tmp128_re);
          rx_tmp128_im_f = _mm_add_pd(rx_tmp128_im_f,rx_tmp128_im);
        } //l
      } // j

      rx_tmp128_re_f = _mm_mul_pd(rx_tmp128_re_f,pathloss128);
      rx_tmp128_im_f = _mm_mul_pd(rx_tmp128_im_f,pathloss128);
      /* max index written here: (length1-1)+dd <= length-1 */
      _mm_storeu_pd(&rx_sig_re[ii][2*i+dd],rx_tmp128_re_f);
      _mm_storeu_pd(&rx_sig_im[ii][2*i+dd],rx_tmp128_im_f);
    } // ii
  } // i

  /* scalar part: the last sample when length-dd is odd */
  if (tail) {
    const int n = length1;

    for (ii=0; ii<desc->nb_rx; ii++) {
      double rx_re = 0.0;
      double rx_im = 0.0;

      for (j=0; j<desc->nb_tx; j++) {
        /* taps with l > n would read before the start of the signal */
        for (l = 0; (l<(int)desc->channel_length) && (l<=n); l++) {
          const double tx_re = tx_sig_re[j][n-l];
          const double tx_im = tx_sig_im[j][n-l];
          const double ch_x = desc->ch[ii+(j*desc->nb_rx)][l].x;
          const double ch_y = desc->ch[ii+(j*desc->nb_rx)][l].y;

          rx_re += (tx_re*ch_x) - (tx_im*ch_y);
          rx_im += (tx_im*ch_x) + (tx_re*ch_y);
        } //l
      } // j

      rx_sig_re[ii][n+dd] = rx_re*path_loss;
      rx_sig_im[ii][n+dd] = rx_im*path_loss;
    } // ii
  }

  stop_meas(&desc->convolution);
}
/// SSE/FMA4 implementation of the vector-valued noise function.
///
/// @param result  Receives the three noise components: the first two are
///                written with one unaligned 2-double store, the Z component
///                separately.
/// @param EPoint  3D point at which the noise is evaluated.
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint)
{
    DBL x, y, z;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /*ix = (int)x; iy = (int)y; iz = (int)z;
    x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/
    /* JB fix for the range problem */

    // Pack (x, y) into one register and z into the low lane of another.
    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    // For negative coordinates subtract (1 - EPSILON) before truncating, so
    // the truncation toward zero below rounds them downwards instead.
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    // _mm_blendv_pd picks the shifted value in lanes whose sign bit is set.
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    // Offsets of the point inside its lattice cell (coordinate minus lattice index).
    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    // Lattice indices relative to NOISE_MIN*, reduced to 12 bits.
    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    // Hashes of the four (x, y) corners of the cell; "j" means index + 1.
    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    // RTable entries for the eight corners: mp1..mp4 at iz, mp5..mp8 at iz + 1.
    DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];
    DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    // Despite the "i" prefix these hold the in-cell offsets broadcast to
    // both lanes; the "j" variants are the same offsets minus one.
    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    // Cubic blend weights s = f*f*(3 - 2*f) and t = 1 - s per axis
    // (_mm_nmacc_pd(a, b, c) evaluates -(a*b) + c).
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));

    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    // (ty, sy), then (tx*ty, tx*sy) and (sx*ty, sx*sy).
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    // Full three-axis weights; lower lane uses ty, upper lane uses sy.
    __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz);
    __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz);
    __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz);

    // NOTE(review): mp_t1 .. sum_p are not referenced directly in this
    // function; presumably they are scratch registers used by name inside the
    // INCRSUMP2 macro (defined elsewhere) - do not rename without checking it.
    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p;
    __m128d sum_X_Y = _mm_setzero_pd();
    __m128d sum__Z = _mm_setzero_pd();

    // X and Y components: one INCRSUMP2 call per cube corner, with that
    // corner's weight broadcast to both lanes. The two table pointers passed
    // are the corner entry and the entry 8 doubles further on.
    // NOTE(review): presumably offsets 0 / 8 / 16 select the coefficient sets
    // for the X / Y / Z outputs - confirm against the RTable layout.
    __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y);

    // Z component: two corners per call (lower lane: iy corner, upper lane:
    // jy corner), reading the table entries 16 doubles in.
    __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm);
    INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z);

    INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z);

    INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z);

    INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z);

    // Add the two lanes of the Z accumulator together.
    sum__Z = _mm_hadd_pd(sum__Z, sum__Z);

    // NOTE(review): *result presumably yields a pointer to the vector's
    // component array, so this writes components 0 and 1 - confirm.
    _mm_storeu_pd(*result, sum_X_Y);
    _mm_store_sd(&result[Z], sum__Z);
}
void exchsolution_gmrfData_1(unsigned int slot) { for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((!neighbor_isValid[0][0])) { { double xPos; double yPos; /* Statements in this Scop: S493, S492, S494 */ { { { double* fieldData_Solution_GMRF_1_p1 = (&fieldData_Solution_GMRF[1][0]); int i1 = 1; for (; (i1<=2); i1 += 2) { fieldData_Solution_GMRF_1_p1[((i1*6)+2)] = 0.000000e+00; fieldData_Solution_GMRF_1_p1[((i1*6)+8)] = 0.000000e+00; } for (; (i1<=3); i1 += 1) { fieldData_Solution_GMRF_1_p1[((i1*6)+2)] = 0.000000e+00; } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posBegin[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<1); i1 += 4) { /* xPos = posBegin[0]; */ __m128d vec0 = _mm_load1_pd((&posBegin[0])); __m128d vec0_2 = _mm_load1_pd((&posBegin[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<4); i1 += 1) { xPos = posBegin[0]; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/2.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(2.000000e+00); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<1); i1 += 4) { /* yPos = ((((i1-1)/2.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<4); i1 += 1) { yPos = ((((i1-1)/2.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } } } } if ((!neighbor_isValid[0][1])) { { double xPos; double yPos; /* Statements in this Scop: S496, S495, S497 */ { { { double* fieldData_Solution_GMRF_1_p1 = 
(&fieldData_Solution_GMRF[1][0]); int i1 = 1; for (; (i1<=2); i1 += 2) { fieldData_Solution_GMRF_1_p1[((i1*6)+4)] = 0.000000e+00; fieldData_Solution_GMRF_1_p1[((i1*6)+10)] = 0.000000e+00; } for (; (i1<=3); i1 += 1) { fieldData_Solution_GMRF_1_p1[((i1*6)+4)] = 0.000000e+00; } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posEnd[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<1); i1 += 4) { /* xPos = posEnd[0]; */ __m128d vec0 = _mm_load1_pd((&posEnd[0])); __m128d vec0_2 = _mm_load1_pd((&posEnd[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<4); i1 += 1) { xPos = posEnd[0]; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/2.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(2.000000e+00); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<1); i1 += 4) { /* yPos = ((((i1-1)/2.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<4); i1 += 1) { yPos = ((((i1-1)/2.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } } } } if ((!neighbor_isValid[0][2])) { { double xPos; double yPos; /* Statements in this Scop: S500, S499, S498 */ { { { int i2 = 2; for (; (i2<=3); i2 += 2) { xPos = ((((i2-2)/2.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = ((((i2-1)/2.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=4); i2 += 1) { xPos = ((((i2-2)/2.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } { double* fieldData_Solution_GMRF_1_p1 = (&fieldData_Solution_GMRF[1][0]); int i2 = 2; for 
(; (i2<=3); i2 += 2) { fieldData_Solution_GMRF_1_p1[(i2+6)] = 0.000000e+00; fieldData_Solution_GMRF_1_p1[(i2+7)] = 0.000000e+00; } for (; (i2<=4); i2 += 1) { fieldData_Solution_GMRF_1_p1[(i2+6)] = 0.000000e+00; } } } { int i2 = 2; for (; (i2<=3); i2 += 2) { yPos = posBegin[1]; yPos = posBegin[1]; } for (; (i2<=4); i2 += 1) { yPos = posBegin[1]; } } } } } if ((!neighbor_isValid[0][3])) { { double xPos; double yPos; /* Statements in this Scop: S503, S502, S501 */ { { { int i2 = 2; for (; (i2<=3); i2 += 2) { xPos = ((((i2-2)/2.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = ((((i2-1)/2.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=4); i2 += 1) { xPos = ((((i2-2)/2.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } { int i2 = 2; for (; (i2<=3); i2 += 2) { yPos = posEnd[1]; yPos = posEnd[1]; } for (; (i2<=4); i2 += 1) { yPos = posEnd[1]; } } } { double* fieldData_Solution_GMRF_1_p1 = (&fieldData_Solution_GMRF[1][0]); int i2 = 2; for (; (i2<=3); i2 += 2) { fieldData_Solution_GMRF_1_p1[(i2+18)] = 0.000000e+00; fieldData_Solution_GMRF_1_p1[(i2+19)] = 0.000000e+00; } for (; (i2<=4); i2 += 1) { fieldData_Solution_GMRF_1_p1[(i2+18)] = 0.000000e+00; } } } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { MPI_Isend(&fieldData_Solution_GMRF[1][10], 1, mpiDatatype_3_1_6, neighbor_remoteRank[0][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { MPI_Irecv(&fieldData_Solution_GMRF[1][8], 1, mpiDatatype_3_1_6, neighbor_remoteRank[0][0], ((unsigned int)(neighbor_fragCommId[0][0]) << 16) + 
((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); reqOutstanding_Recv[0] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) { MPI_Isend(&fieldData_Solution_GMRF[1][20], 1, mpiDatatype_1_3_6, neighbor_remoteRank[0][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) { MPI_Irecv(&fieldData_Solution_GMRF[1][8], 1, mpiDatatype_1_3_6, neighbor_remoteRank[0][2], ((unsigned int)(neighbor_fragCommId[0][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } for 
(int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { MPI_Isend(&fieldData_Solution_GMRF[1][3], 1, mpiDatatype_5_1_6, neighbor_remoteRank[0][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]); reqOutstanding_Send[0] = true; } if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { MPI_Isend(&fieldData_Solution_GMRF[1][3], 1, mpiDatatype_5_1_6, neighbor_remoteRank[0][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) { MPI_Irecv(&fieldData_Solution_GMRF[1][1], 1, mpiDatatype_5_1_6, neighbor_remoteRank[0][0], ((unsigned int)(neighbor_fragCommId[0][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); reqOutstanding_Recv[0] = true; } if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) { MPI_Irecv(&fieldData_Solution_GMRF[1][5], 1, mpiDatatype_5_1_6, neighbor_remoteRank[0][1], ((unsigned int)(neighbor_fragCommId[0][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]); reqOutstanding_Recv[1] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } if (reqOutstanding_Recv[1]) { waitForMPIReq(&mpiRequest_Recv[1]); reqOutstanding_Recv[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if 
(isValidForSubdomain[0]) { if (reqOutstanding_Send[0]) { waitForMPIReq(&mpiRequest_Send[0]); reqOutstanding_Send[0] = false; } if (reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) { MPI_Isend(&fieldData_Solution_GMRF[1][13], 1, mpiDatatype_1_5_6, neighbor_remoteRank[0][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]); reqOutstanding_Send[2] = true; } if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) { MPI_Isend(&fieldData_Solution_GMRF[1][13], 1, mpiDatatype_1_5_6, neighbor_remoteRank[0][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) { MPI_Irecv(&fieldData_Solution_GMRF[1][1], 1, mpiDatatype_1_5_6, neighbor_remoteRank[0][2], ((unsigned int)(neighbor_fragCommId[0][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = true; } if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) { MPI_Irecv(&fieldData_Solution_GMRF[1][25], 1, mpiDatatype_1_5_6, neighbor_remoteRank[0][3], ((unsigned int)(neighbor_fragCommId[0][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]); reqOutstanding_Recv[3] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } if (reqOutstanding_Recv[3]) { 
waitForMPIReq(&mpiRequest_Recv[3]); reqOutstanding_Recv[3] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { ; ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[0]) { if (reqOutstanding_Send[2]) { waitForMPIReq(&mpiRequest_Send[2]); reqOutstanding_Send[2] = false; } if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } }
/// Lane-wise difference of two packed-double vectors: the result is what
/// _mm_sub_pd yields for (lhs, rhs), i.e. lhs - rhs in each lane.
inline const vector2d operator-(const vector2d& lhs, const vector2d& rhs)
{
    const vector2d difference = _mm_sub_pd(lhs, rhs);
    return difference;
}
/** Subtracts b from a lane by lane via _mm_sub_pd and returns the result. */
static forcedinline ParallelType sub (ParallelType a, ParallelType b) noexcept
{
    const ParallelType difference = _mm_sub_pd (a, b);
    return difference;
}
// it moves horizontally inside a block void kernel_dtrmv_u_n_2_lib4(int kmax, double *A, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; int k; __m128d ax_temp, a_00_10, a_01_11, a_02_12, a_03_13, x_0, x_1, x_2, x_3, y_0_1, y_0_1_b, y_0_1_c, y_0_1_d, z_0_1; /* y_0_1 = _mm_setzero_pd(); */ // second col (avoid zero y_0_1) x_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, x_0 ); // first col x_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A[0+lda*0] ); ax_temp = _mm_mul_sd( a_00_10, x_0 ); y_0_1 = _mm_add_sd( y_0_1, ax_temp ); A += 2*lda; x += 2; k=2; for(; k<kmax-1; k+=2) { x_0 = _mm_loaddup_pd( &x[0] ); x_1 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); a_01_11 = _mm_load_pd( &A[0+lda*1] ); ax_temp = _mm_mul_pd( a_00_10, x_0 ); y_0_1 = _mm_add_pd( y_0_1, ax_temp ); ax_temp = _mm_mul_pd( a_01_11, x_1 ); y_0_1 = _mm_add_pd( y_0_1, ax_temp ); A += 2*lda; x += 2; } if(kmax%2==1) { x_0 = _mm_loaddup_pd( &x[0] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); ax_temp = _mm_mul_pd( a_00_10, x_0 ); y_0_1 = _mm_add_pd( y_0_1, ax_temp ); } if(alg==0) { _mm_storeu_pd(&y[0], y_0_1); } else if(alg==1) { z_0_1 = _mm_loadu_pd( &y[0] ); z_0_1 = _mm_add_pd( z_0_1, y_0_1 ); _mm_storeu_pd(&y[0], z_0_1); } else // alg==-1 { z_0_1 = _mm_loadu_pd( &y[0] ); z_0_1 = _mm_sub_pd( z_0_1, y_0_1 ); _mm_storeu_pd(&y[0], z_0_1); } }
/*
 * Runs a 10th-order recursive filter over nSamples samples.
 *
 * For every sample the new output is formed from the current input, the ten
 * preceding inputs (input[-1] .. input[-10]) and the ten preceding outputs
 * (output[-1] .. output[-10]); both pointers therefore need ten valid
 * elements in front of them.  1e-10 is added to each result.
 *
 * Scalar build: kernel[0..20] is read interleaved - even indices multiply
 * inputs, odd indices multiply (and subtract) previous outputs.
 *
 * HAVE_SSE2 build: coefficients are fetched pairwise with _mm_loadr_pd from
 * kernel[0], [2], .. [8] (inputs) and kernel[12], [14], .. [20] (outputs),
 * plus the scalar kernel[11] for input[-10].  _mm_loadr_pd needs 16-byte
 * aligned addresses, and the loads require Float_t to be double.
 * NOTE(review): the two builds index `kernel` differently, so they expect
 * different coefficient layouts - confirm the caller supplies the matching
 * table for each build.
 */
static void filterYule(const Float_t* input, Float_t* output, size_t nSamples, const Float_t* kernel)
{
#ifdef HAVE_SSE2
    __m128d coeffs, samples, acc;
    __declspec(align(16)) Float_t lanes[2];
    int tap;

    for (; nSamples != 0; --nSamples) {
        /* First input pair starts the accumulator ... */
        coeffs  = _mm_loadr_pd(&kernel[0]);
        samples = _mm_loadu_pd(&input[-1]);
        acc     = _mm_mul_pd(samples, coeffs);

        /* ... and the first output pair is subtracted from it. */
        coeffs  = _mm_loadr_pd(&kernel[12]);
        samples = _mm_loadu_pd(&output[-2]);
        acc     = _mm_sub_pd(acc, _mm_mul_pd(coeffs, samples));

        /* Remaining four input/output pairs, alternating add and subtract. */
        for (tap = 2; tap <= 8; tap += 2) {
            coeffs  = _mm_loadr_pd(&kernel[tap]);
            samples = _mm_loadu_pd(&input[-tap - 1]);
            acc     = _mm_add_pd(acc, _mm_mul_pd(coeffs, samples));

            coeffs  = _mm_loadr_pd(&kernel[tap + 12]);
            samples = _mm_loadu_pd(&output[-tap - 2]);
            acc     = _mm_sub_pd(acc, _mm_mul_pd(coeffs, samples));
        }

        /* Fold both lanes and add the one input tap that has no partner. */
        _mm_store_pd(lanes, acc);
        *output = 1e-10 /* 1e-10 is a hack to avoid slowdown because of denormals */
                + lanes[0]
                + lanes[1]
                + input [-10] * kernel[11];

        ++output;
        ++input;
    }
#else
    for (; nSamples != 0; --nSamples) {
        /* Kept as one expression so the evaluation order stays as written. */
        *output = 1e-10 /* 1e-10 is a hack to avoid slowdown because of denormals */
                + input [0]  * kernel[0]
                - output[-1] * kernel[1]
                + input [-1] * kernel[2]
                - output[-2] * kernel[3]
                + input [-2] * kernel[4]
                - output[-3] * kernel[5]
                + input [-3] * kernel[6]
                - output[-4] * kernel[7]
                + input [-4] * kernel[8]
                - output[-5] * kernel[9]
                + input [-5] * kernel[10]
                - output[-6] * kernel[11]
                + input [-6] * kernel[12]
                - output[-7] * kernel[13]
                + input [-7] * kernel[14]
                - output[-8] * kernel[15]
                + input [-8] * kernel[16]
                - output[-9] * kernel[17]
                + input [-9] * kernel[18]
                - output[-10]* kernel[19]
                + input [-10]* kernel[20];

        ++output;
        ++input;
    }
#endif
}
// it moves vertically across blocks void kernel_dtrmv_u_t_2_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 4; double *tA, *tx; int k; __m256d tmp0, tmp1, a_00_10_20_30, a_01_11_21_31, x_0_1_2_3, y_00, y_11; y_00 = _mm256_setzero_pd(); y_11 = _mm256_setzero_pd(); k=0; for(; k<kmax-7; k+=8) { x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); A += 4 + (sda-1)*lda; x += 4; x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); A += 4 + (sda-1)*lda; x += 4; } for(; k<kmax-3; k+=4) { x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); A += 4 + (sda-1)*lda; x += 4; } __m128d tm0, tm1, a_00_10, a_01_11, x_0_1, y_0, y_1, y_0_1; tm0 = _mm256_extractf128_pd( y_00, 0x1 ); tm1 = _mm256_extractf128_pd( y_11, 0x1 ); y_0 = _mm256_castpd256_pd128( y_00 ); y_1 = _mm256_castpd256_pd128( y_11 ); y_0 = _mm_add_pd( y_0, tm0 ); y_1 = _mm_add_pd( y_1, tm1 ); x_0_1 = _mm_loadu_pd( &x[0] ); a_00_10 = _mm_load_sd( &A[0+lda*0] ); a_01_11 = _mm_load_pd( &A[0+lda*1] ); tm0 = _mm_mul_sd( a_00_10, x_0_1 ); tm1 = _mm_mul_pd( a_01_11, x_0_1 ); y_0 = _mm_add_sd( y_0, tm0 ); y_1 = _mm_add_pd( y_1, tm1 ); y_0 = _mm_hadd_pd( y_0, y_1 ); if(alg==0) { _mm_storeu_pd(&y[0], y_0); } else if(alg==1) 
{ y_0_1 = _mm_loadu_pd( &y[0] ); y_0_1 = _mm_add_pd( y_0_1, y_0 ); _mm_storeu_pd(&y[0], y_0_1); } else // alg==-1 { y_0_1 = _mm_loadu_pd( &y[0] ); y_0_1 = _mm_sub_pd( y_0_1, y_0 ); _mm_storeu_pd(&y[0], y_0_1); } }
void exchsolutionData_2(unsigned int slot) { for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((!neighbor_isValid[1][0])) { { double xPos; double yPos; /* Statements in this Scop: S397, S396, S398 */ { { { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(4.000000e+00); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<3); i1 += 4) { /* yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<6); i1 += 1) { yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } { double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_Solution_2_p1[((i1*8)+2)] = 0.000000e+00; fieldData_Solution_2_p1[((i1*8)+10)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_Solution_2_p1[((i1*8)+2)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posBegin[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<3); i1 += 4) { /* xPos = posBegin[0]; */ __m128d vec0 = _mm_load1_pd((&posBegin[0])); __m128d vec0_2 = _mm_load1_pd((&posBegin[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<6); i1 += 1) { xPos = posBegin[0]; } } } } } if ((!neighbor_isValid[1][1])) { { double xPos; double yPos; /* Statements in this Scop: S401, S400, S399 */ { { { double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]); int i1 = 1; for (; (i1<=4); i1 += 
2) { fieldData_Solution_2_p1[((i1*8)+6)] = 0.000000e+00; fieldData_Solution_2_p1[((i1*8)+14)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_Solution_2_p1[((i1*8)+6)] = 0.000000e+00; } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posEnd[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<3); i1 += 4) { /* xPos = posEnd[0]; */ __m128d vec0 = _mm_load1_pd((&posEnd[0])); __m128d vec0_2 = _mm_load1_pd((&posEnd[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<6); i1 += 1) { xPos = posEnd[0]; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(4.000000e+00); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<3); i1 += 4) { /* yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<6); i1 += 1) { yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } } } } if ((!neighbor_isValid[1][2])) { { double xPos; double yPos; /* Statements in this Scop: S404, S403, S402 */ { { { double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_Solution_2_p1[(i2+8)] = 0.000000e+00; fieldData_Solution_2_p1[(i2+9)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_Solution_2_p1[(i2+8)] = 0.000000e+00; } } { int i2 = 2; for (; (i2<=5); i2 += 2) { xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = 
((((i2-1)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=6); i2 += 1) { xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } } { int i2 = 2; for (; (i2<=5); i2 += 2) { yPos = posBegin[1]; yPos = posBegin[1]; } for (; (i2<=6); i2 += 1) { yPos = posBegin[1]; } } } } } if ((!neighbor_isValid[1][3])) { { double xPos; double yPos; /* Statements in this Scop: S407, S406, S405 */ { { { double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_Solution_2_p1[(i2+40)] = 0.000000e+00; fieldData_Solution_2_p1[(i2+41)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_Solution_2_p1[(i2+40)] = 0.000000e+00; } } { int i2 = 2; for (; (i2<=5); i2 += 2) { xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = ((((i2-1)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=6); i2 += 1) { xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } } { int i2 = 2; for (; (i2<=5); i2 += 2) { yPos = posEnd[1]; yPos = posEnd[1]; } for (; (i2<=6); i2 += 1) { yPos = posEnd[1]; } } } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { MPI_Isend(&fieldData_Solution[2][14], 1, mpiDatatype_5_1_8, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { MPI_Irecv(&fieldData_Solution[2][10], 1, mpiDatatype_5_1_8, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); 
reqOutstanding_Recv[0] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) { MPI_Isend(&fieldData_Solution[2][42], 1, mpiDatatype_1_5_8, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) { MPI_Irecv(&fieldData_Solution[2][10], 1, mpiDatatype_1_5_8, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { 
; ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { MPI_Isend(&fieldData_Solution[2][3], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]); reqOutstanding_Send[0] = true; } if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { MPI_Isend(&fieldData_Solution[2][5], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { MPI_Irecv(&fieldData_Solution[2][1], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); reqOutstanding_Recv[0] = true; } if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { MPI_Irecv(&fieldData_Solution[2][7], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][1], ((unsigned int)(neighbor_fragCommId[1][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]); reqOutstanding_Recv[1] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } if (reqOutstanding_Recv[1]) { waitForMPIReq(&mpiRequest_Recv[1]); reqOutstanding_Recv[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[0]) { waitForMPIReq(&mpiRequest_Send[0]); reqOutstanding_Send[0] = false; } 
if (reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) { MPI_Isend(&fieldData_Solution[2][17], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]); reqOutstanding_Send[2] = true; } if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) { MPI_Isend(&fieldData_Solution[2][33], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) { MPI_Irecv(&fieldData_Solution[2][1], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = true; } if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) { MPI_Irecv(&fieldData_Solution[2][49], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][3], ((unsigned int)(neighbor_fragCommId[1][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]); reqOutstanding_Recv[3] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } if (reqOutstanding_Recv[3]) { waitForMPIReq(&mpiRequest_Recv[3]); reqOutstanding_Recv[3] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; ; 
} } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[2]) { waitForMPIReq(&mpiRequest_Send[2]); reqOutstanding_Send[2] = false; } if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } }
/// Subtracts rhs from lhs register by register: reg[0] and reg[1] of the
/// result hold _mm_sub_pd of the corresponding registers of lhs and rhs.
static inline Simd sub(const Simd& lhs, const Simd& rhs)
{
    Simd difference;
    for (int half = 0; half < 2; ++half) {
        difference.reg[half] = _mm_sub_pd(lhs.reg[half], rhs.reg[half]);
    }
    return difference;
}