/*
 * 4-tap FIR filter over mlib_d64 samples, two outputs per SSE2 iteration.
 *
 * For 0 <= i < n:
 *
 *   pdst[i] = pflt[3] * psrc[i]     + pflt[2] * psrc[i + 1]
 *           + pflt[1] * psrc[i + 2] + pflt[0] * psrc[i + 3]
 *
 * pdst  destination; n elements are written (unaligned stores are used).
 * psrc  source; elements psrc[0] .. psrc[n + 2] are read.
 * pflt  the four filter coefficients pflt[0] .. pflt[3].
 * n     number of output samples; n <= 0 writes nothing.
 *
 * The vector loop uses aligned loads at psrc and psrc + 2, so when psrc is
 * not on a 16-byte boundary one output is produced with scalar code first.
 * That step assumes psrc is at least 8-byte aligned, so that advancing by one
 * mlib_d64 reaches a 16-byte boundary.
 *
 * NOTE(review): the scalar path sums as ((a + b) + c) + d while the vector
 * path sums as (a + b) + (c + d), so the last bit of an output can depend on
 * whether it was produced by the scalar or the vector code. Left unchanged
 * to keep existing results bit-for-bit.
 */
void
mlib_FIR_tap4f_d64(
    mlib_d64 *pdst,
    const mlib_d64 *psrc,
    mlib_d64 *pflt,
    mlib_s32 n)
{
	mlib_s32 j = 0;
	mlib_d64 dflt1 = pflt[0], dflt2 = pflt[1], dflt3 = pflt[2], dflt4 = pflt[3];
	__m128d sdflt1, sdflt2, sdflt3, sdflt4;
	__m128d ssrc1, ssrc2, ssrc3, ssrc4;
	__m128d smul1, smul2, smul3, smul4;

	/*
	 * Nothing to produce. Without this guard the alignment prologue below
	 * would still store pdst[0] (and read psrc[3]) for a misaligned source.
	 */
	if (n <= 0)
		return;

	/* Scalar prologue: one output, bringing psrc to a 16-byte boundary. */
	if ((mlib_addr)psrc & 15) {
		pdst[0] = dflt4 * psrc[0] + dflt3 * psrc[1] +
		    dflt2 * psrc[2] + dflt1 * psrc[3];
		psrc++;
		pdst++;
		j++;
	}

	sdflt4 = _mm_set1_pd(dflt4);
	sdflt3 = _mm_set1_pd(dflt3);
	sdflt2 = _mm_set1_pd(dflt2);
	sdflt1 = _mm_set1_pd(dflt1);

	/* Main loop: two outputs per iteration. */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (; j < n - 1; j += 2) {
		ssrc1 = _mm_load_pd(psrc);		/* aligned */
		ssrc2 = _mm_loadu_pd(psrc + 1);		/* straddles a boundary */
		ssrc3 = _mm_load_pd(psrc + 2);		/* aligned */
		ssrc4 = _mm_loadu_pd(psrc + 3);		/* straddles a boundary */

		smul1 = _mm_mul_pd(sdflt4, ssrc1);
		smul2 = _mm_mul_pd(sdflt3, ssrc2);
		smul3 = _mm_mul_pd(sdflt2, ssrc3);
		smul4 = _mm_mul_pd(sdflt1, ssrc4);

		smul1 = _mm_add_pd(smul1, smul2);
		smul3 = _mm_add_pd(smul3, smul4);
		smul1 = _mm_add_pd(smul1, smul3);

		_mm_storeu_pd(pdst, smul1);

		psrc += 2;
		pdst += 2;
	}

	/* Scalar epilogue: at most one output is left at this point. */
	for (; j < n; j++) {
		pdst[0] = dflt4 * psrc[0] + dflt3 * psrc[1] +
		    dflt2 * psrc[2] + dflt1 * psrc[3];
		psrc++;
		pdst++;
	}
}
int jnrA,jnrB; int j_coord_offsetA,j_coord_offsetB; int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; int nvdwtype; __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128d one_sixth = _mm_set1_pd(1.0/6.0); __m128d one_twelfth = _mm_set1_pd(1.0/12.0); __m128i vfitab; __m128i ifour = _mm_set1_epi32(4); __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; real *vftab; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex;
__m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128i ewitab; __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; real *ewtab; __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; real rswitch_scalar,d_scalar; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm_set1_pd(fr->epsfac); charge = mdatoms->chargeA;
int jnrA,jnrB; int j_coord_offsetA,j_coord_offsetB; int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; int nvdwtype; __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128d one_sixth = _mm_set1_pd(1.0/6.0); __m128d one_twelfth = _mm_set1_pd(1.0/12.0); __m128d c6grid_00; __m128d ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald; real *vdwgridparam; __m128d one_half = _mm_set1_pd(0.5); __m128d minus_one = _mm_set1_pd(-1.0); __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr;
__m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; int vdwioffset3; __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128i ewitab; __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; real *ewtab; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm_set1_pd(fr->epsfac); charge = mdatoms->chargeA;
int jnrA,jnrB; int j_coord_offsetA,j_coord_offsetB; int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128i gbitab; __m128d vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp; __m128d minushalf = _mm_set1_pd(-0.5); real *invsqrta,*dvda,*gbtab; int nvdwtype; __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128d one_sixth = _mm_set1_pd(1.0/6.0); __m128d one_twelfth = _mm_set1_pd(1.0/12.0); __m128i vfitab; __m128i ifour = _mm_set1_epi32(4); __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; real *vftab; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0);
int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128d one_sixth = _mm_set1_pd(1.0/6.0); __m128d one_twelfth = _mm_set1_pd(1.0/12.0); __m128i ewitab; __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV; real *ewtab; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr;
int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128d one_sixth = _mm_set1_pd(1.0/6.0); __m128d one_twelfth = _mm_set1_pd(1.0/12.0); __m128i vfitab; __m128i ifour = _mm_set1_epi32(4); __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; real *vftab; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex;
int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128d one_sixth = _mm_set1_pd(1.0/6.0); __m128d one_twelfth = _mm_set1_pd(1.0/12.0); __m128d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw; real rswitch_scalar,d_scalar; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift;
/*
 * exchsolutionData_2 -- boundary fill and neighbor exchange for
 * fieldData_Solution[2].
 *
 * The code appears to be machine-generated (unrolled loops, "Scop" marker
 * comments, single-trip fragmentIdx loops whose index is never used).
 * Everything below is gated on isValidForSubdomain[1]. `slot` is not
 * referenced anywhere in the body.
 *
 * Step 1 -- for every neighbor k = 0..3 with !neighbor_isValid[1][k], 0.0 is
 * stored into a strip of fieldData_Solution[2]:
 *     k = 0: indices i1*8 + 2,  i1 = 1..5   (10, 18, 26, 34, 42)
 *     k = 1: indices i1*8 + 6,  i1 = 1..5   (14, 22, 30, 38, 46)
 *     k = 2: indices i2 + 8,    i2 = 2..6   (10 .. 14)
 *     k = 3: indices i2 + 40,   i2 = 2..6   (42 .. 46)
 * The same branches also evaluate xPos / yPos (scalar and SSE2 forms); those
 * values are not read again anywhere in this function.
 * NOTE(review): the factor 8 looks like a row stride of the field -- confirm
 * against the field layout.
 *
 * Step 2 -- four exchange rounds, each one: post MPI_Isend, post MPI_Irecv,
 * wait for the receives, wait for the sends. A request is only posted when
 * neighbor_isValid[1][k] && neighbor_isRemote[1][k]; the reqOutstanding_*
 * flags record what has to be waited for.
 *     round 1: send [2][14] to neighbor 1, receive [2][10] from neighbor 0
 *              (mpiDatatype_5_1_8)
 *     round 2: send [2][42] to neighbor 3, receive [2][10] from neighbor 2
 *              (mpiDatatype_1_5_8)
 *     round 3: send [2][3] to neighbor 0 and [2][5] to neighbor 1,
 *              receive [2][1] from neighbor 0 and [2][7] from neighbor 1
 *              (mpiDatatype_7_1_8)
 *     round 4: send [2][17] to neighbor 2 and [2][33] to neighbor 3,
 *              receive [2][1] from neighbor 2 and [2][49] from neighbor 3
 *              (mpiDatatype_1_7_8)
 *
 * Message tags: a send uses (commId << 16) + (neighbor_fragCommId & 0xffff),
 * a receive uses (neighbor_fragCommId << 16) + (commId & 0xffff).
 */
void exchsolutionData_2(unsigned int slot) { for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) {
/* Step 1, neighbor 0 missing: zero indices i1*8 + 2 for i1 = 1..5 (the loop is unrolled by two, with a one-element remainder loop). */
if ((!neighbor_isValid[1][0])) { { double xPos; double yPos; /* Statements in this Scop: S397, S396, S398 */ { { { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(4.000000e+00); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<3); i1 += 4) { /* yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<6); i1 += 1) { yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } { double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_Solution_2_p1[((i1*8)+2)] = 0.000000e+00; fieldData_Solution_2_p1[((i1*8)+10)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_Solution_2_p1[((i1*8)+2)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posBegin[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<3); i1 += 4) { /* xPos = posBegin[0]; */ __m128d vec0 = _mm_load1_pd((&posBegin[0])); __m128d vec0_2 = _mm_load1_pd((&posBegin[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<6); i1 += 1) { xPos = posBegin[0]; } } } } }
/* Step 1, neighbor 1 missing: zero indices i1*8 + 6 for i1 = 1..5. */
if ((!neighbor_isValid[1][1])) { { double xPos; double yPos; /* Statements in this Scop: S401, S400, S399 */ { { { double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_Solution_2_p1[((i1*8)+6)] = 0.000000e+00; fieldData_Solution_2_p1[((i1*8)+14)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_Solution_2_p1[((i1*8)+6)] = 0.000000e+00; } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posEnd[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<3); i1 += 4) { /* xPos = posEnd[0]; */ __m128d vec0 = _mm_load1_pd((&posEnd[0])); __m128d vec0_2 = _mm_load1_pd((&posEnd[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<6); i1 += 1) { xPos = posEnd[0]; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(4.000000e+00); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<3); i1 += 4) { /* yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<6); i1 += 1) { yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } } } }
/* Step 1, neighbor 2 missing: zero indices i2 + 8 for i2 = 2..6. */
if ((!neighbor_isValid[1][2])) { { double xPos; double yPos; /* Statements in this Scop: S404, S403, S402 */ { { { double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_Solution_2_p1[(i2+8)] = 0.000000e+00; fieldData_Solution_2_p1[(i2+9)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_Solution_2_p1[(i2+8)] = 0.000000e+00; } } { int i2 = 2; for (; (i2<=5); i2 += 2) { xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = ((((i2-1)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=6); i2 += 1) { xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } } { int i2 = 2; for (; (i2<=5); i2 += 2) { yPos = posBegin[1]; yPos = posBegin[1]; } for (; (i2<=6); i2 += 1) { yPos = posBegin[1]; } } } } }
/* Step 1, neighbor 3 missing: zero indices i2 + 40 for i2 = 2..6. */
if ((!neighbor_isValid[1][3])) { { double xPos; double yPos; /* Statements in this Scop: S407, S406, S405 */ { { { double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_Solution_2_p1[(i2+40)] = 0.000000e+00; fieldData_Solution_2_p1[(i2+41)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_Solution_2_p1[(i2+40)] = 0.000000e+00; } } { int i2 = 2; for (; (i2<=5); i2 += 2) { xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = ((((i2-1)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=6); i2 += 1) { xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } } { int i2 = 2; for (; (i2<=5); i2 += 2) { yPos = posEnd[1]; yPos = posEnd[1]; } for (; (i2<=6); i2 += 1) { yPos = posEnd[1]; } } } } }
/* End of step 1 (closes the isValidForSubdomain check and the fragment loop). */
} }
/* Empty generated stage. */
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; } }
/* Round 1: send from index 14 to neighbor 1, then receive into index 10 from neighbor 0. */
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { MPI_Isend(&fieldData_Solution[2][14], 1, mpiDatatype_5_1_8, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { MPI_Irecv(&fieldData_Solution[2][10], 1, mpiDatatype_5_1_8, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); reqOutstanding_Recv[0] = true; } } }
/* Round 1: wait for the receive first, then for the send. */
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; } }
/* Round 2: send from index 42 to neighbor 3, then receive into index 10 from neighbor 2. */
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) { MPI_Isend(&fieldData_Solution[2][42], 1, mpiDatatype_1_5_8, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) { MPI_Irecv(&fieldData_Solution[2][10], 1, mpiDatatype_1_5_8, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = true; } } }
/* Round 2: wait for the receive first, then for the send. */
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; ; } }
/* Round 3: sends from indices 3 and 5 to neighbors 0 and 1. */
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { MPI_Isend(&fieldData_Solution[2][3], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]); reqOutstanding_Send[0] = true; } if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { MPI_Isend(&fieldData_Solution[2][5], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ;
/* Round 3: receives into indices 1 and 7 from neighbors 0 and 1. */
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { MPI_Irecv(&fieldData_Solution[2][1], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); reqOutstanding_Recv[0] = true; } if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { MPI_Irecv(&fieldData_Solution[2][7], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][1], ((unsigned int)(neighbor_fragCommId[1][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]); reqOutstanding_Recv[1] = true; } } }
/* Round 3: wait for both receives, then for both sends. */
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } if (reqOutstanding_Recv[1]) { waitForMPIReq(&mpiRequest_Recv[1]); reqOutstanding_Recv[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[0]) { waitForMPIReq(&mpiRequest_Send[0]); reqOutstanding_Send[0] = false; } if (reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; ; } }
/* Round 4: sends from indices 17 and 33 to neighbors 2 and 3. */
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) { MPI_Isend(&fieldData_Solution[2][17], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]); reqOutstanding_Send[2] = true; } if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) { MPI_Isend(&fieldData_Solution[2][33], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ;
/* Round 4: receives into indices 1 and 49 from neighbors 2 and 3. */
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) { MPI_Irecv(&fieldData_Solution[2][1], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = true; } if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) { MPI_Irecv(&fieldData_Solution[2][49], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][3], ((unsigned int)(neighbor_fragCommId[1][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]); reqOutstanding_Recv[3] = true; } } }
/* Round 4: wait for both receives, then for both sends. */
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } if (reqOutstanding_Recv[3]) { waitForMPIReq(&mpiRequest_Recv[3]); reqOutstanding_Recv[3] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[2]) { waitForMPIReq(&mpiRequest_Send[2]); reqOutstanding_Send[2] = false; } if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } }
real *shiftvec,*fshift,*x,*f; __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128i vfitab; __m128i ifour = _mm_set1_epi32(4); __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps; real *vftab; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm_set1_pd(fr->epsfac); charge = mdatoms->chargeA;
/*
 * Returns a two-lane vector whose low lane is the low lane of tmp9 and whose
 * high lane is 1.0. The high lane of tmp9 is ignored.
 *
 * Despite the name, no product is computed here.
 */
__m128d cross(__m128d tmp9)
{
    /* _mm_move_sd takes the high lane from its first operand. */
    const __m128d all_ones = _mm_set1_pd(1.0);

    return _mm_move_sd(all_ones, tmp9);
}
/*
 * 3-tap FIR over mlib_d64 data handled two doubles at a time (SSE2).
 *
 * Each of the n iterations reads the pairs at psrc, psrc + 2 and psrc + 4,
 * scales them by pflt[2], pflt[1] and pflt[0] respectively, sums them and
 * stores the resulting pair at pdst; both pointers then advance by two
 * doubles. In total 2 * n doubles are written and, for n > 0, doubles
 * psrc[0] .. psrc[2 * n + 3] are read.
 *
 * NOTE(review): the pair-wise stride matches interleaved two-channel data
 * (presumably stereo, going by the "s" suffix) -- confirm with the caller.
 *
 * Two copies of the loop exist so that a 16-byte aligned source can use
 * aligned loads; the destination is always written with unaligned stores.
 */
void
mlib_FIR_tap3f_d64s(
    mlib_d64 *pdst,
    const mlib_d64 *psrc,
    mlib_d64 *pflt,
    mlib_s32 n)
{
	const __m128d coef_head = _mm_set1_pd(pflt[2]);	/* weights psrc + 0 */
	const __m128d coef_mid = _mm_set1_pd(pflt[1]);	/* weights psrc + 2 */
	const __m128d coef_tail = _mm_set1_pd(pflt[0]);	/* weights psrc + 4 */
	const mlib_d64 *in = psrc;
	mlib_d64 *out = pdst;
	__m128d acc;
	mlib_s32 left;

	if (((mlib_addr)psrc & 15) == 0) {
		/* Source is 16-byte aligned: aligned loads. */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (left = n; left > 0; left--) {
			acc = _mm_add_pd(
			    _mm_mul_pd(coef_head, _mm_load_pd(in)),
			    _mm_mul_pd(coef_mid, _mm_load_pd(in + 2)));
			acc = _mm_add_pd(acc,
			    _mm_mul_pd(coef_tail, _mm_load_pd(in + 4)));
			_mm_storeu_pd(out, acc);
			in += 2;
			out += 2;
		}
	} else {
		/* Misaligned source: same arithmetic with unaligned loads. */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (left = n; left > 0; left--) {
			acc = _mm_add_pd(
			    _mm_mul_pd(coef_head, _mm_loadu_pd(in)),
			    _mm_mul_pd(coef_mid, _mm_loadu_pd(in + 2)));
			acc = _mm_add_pd(acc,
			    _mm_mul_pd(coef_tail, _mm_loadu_pd(in + 4)));
			_mm_storeu_pd(out, acc);
			in += 2;
			out += 2;
		}
	}
}
int jnrA,jnrB; int j_coord_offsetA,j_coord_offsetB; int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; int nvdwtype; __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128d one_sixth = _mm_set1_pd(1.0/6.0); __m128d one_twelfth = _mm_set1_pd(1.0/12.0); __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0];
int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128d one_sixth = _mm_set1_pd(1.0/6.0); __m128d one_twelfth = _mm_set1_pd(1.0/12.0); __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0];
int vdwioffset1; __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; int vdwioffset2; __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; int vdwioffset3; __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm_set1_pd(fr->epsfac); charge = mdatoms->chargeA; krf = _mm_set1_pd(fr->ic->k_rf);
int vdwioffset1; __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; int vdwioffset2; __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128d one_sixth = _mm_set1_pd(1.0/6.0); __m128d one_twelfth = _mm_set1_pd(1.0/12.0); __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0];
void exchlaplacecoeffData_2(unsigned int slot) { for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((!neighbor_isValid[1][0])) { { double xPos; double yPos; /* Statements in this Scop: S710, S704, S707, S701, S709, S700, S703, S706, S708, S702, S705 */ { { { { { { { { { { { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+394)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+402)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+394)] = 0.000000e+00; } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+226)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+234)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+226)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+170)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+178)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+170)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+58)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+66)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+58)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+450)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+458)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+450)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 
1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+114)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+122)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+114)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(4.000000e+00); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<3); i1 += 4) { /* yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<6); i1 += 1) { yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+2)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+10)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+2)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+338)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+346)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+338)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+282)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+290)] 
= 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+282)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posBegin[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<3); i1 += 4) { /* xPos = posBegin[0]; */ __m128d vec0 = _mm_load1_pd((&posBegin[0])); __m128d vec0_2 = _mm_load1_pd((&posBegin[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<6); i1 += 1) { xPos = posBegin[0]; } } } } } if ((!neighbor_isValid[1][1])) { { double xPos; double yPos; /* Statements in this Scop: S716, S719, S713, S721, S715, S718, S712, S711, S720, S714, S717 */ { { { { { { { { { { { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } __m128d vec1 = _mm_set1_pd(1.000000e+00); __m128d vec2 = _mm_set1_pd(4.000000e+00); __m128d vec5 = _mm_set1_pd(yPos); for (; (i1<3); i1 += 4) { /* yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */ __m128d vec0 = _mm_set_pd(i1+1,i1); __m128d vec0_2 = _mm_set_pd(i1+1,i1); __m128d vec3 = _mm_load1_pd((&posEnd[1])); __m128d vec3_2 = _mm_load1_pd((&posEnd[1])); __m128d vec4 = _mm_load1_pd((&posBegin[1])); __m128d vec4_2 = _mm_load1_pd((&posBegin[1])); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4); vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2); } for (; (i1<6); i1 += 1) { yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+454)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+462)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+454)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { 
fieldData_LaplaceCoeff_2_p1[((i1*8)+230)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+238)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+230)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+118)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+126)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+118)] = 0.000000e+00; } } } { int i1 = 1; for (; (i1<(2&(~1))); i1 += 1) { xPos = posEnd[0]; } __m128d vec1 = _mm_set1_pd(xPos); for (; (i1<3); i1 += 4) { /* xPos = posEnd[0]; */ __m128d vec0 = _mm_load1_pd((&posEnd[0])); __m128d vec0_2 = _mm_load1_pd((&posEnd[0])); vec1 = vec0; vec1 = vec0_2; } for (; (i1<6); i1 += 1) { xPos = posEnd[0]; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+286)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+294)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+286)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+342)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+350)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+342)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+398)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+406)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+398)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+174)] = 0.000000e+00; 
fieldData_LaplaceCoeff_2_p1[((i1*8)+182)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+174)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+62)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+70)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+62)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i1 = 1; for (; (i1<=4); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)+6)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[((i1*8)+14)] = 0.000000e+00; } for (; (i1<=5); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)+6)] = 0.000000e+00; } } } } } if ((!neighbor_isValid[1][2])) { { double xPos; double yPos; /* Statements in this Scop: S722, S731, S725, S728, S727, S730, S724, S732, S726, S729, S723 */ { { { { { { { { { { { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+344)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+345)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+344)] = 0.000000e+00; } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+400)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+401)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+400)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+120)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+121)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+120)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 
2) { fieldData_LaplaceCoeff_2_p1[(i2+8)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+9)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+8)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+64)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+65)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+64)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+456)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+457)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+456)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+232)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+233)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+232)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+288)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+289)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+288)] = 0.000000e+00; } } } { int i2 = 2; for (; (i2<=5); i2 += 2) { xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = ((((i2-1)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=6); i2 += 1) { xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+176)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+177)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { 
fieldData_LaplaceCoeff_2_p1[(i2+176)] = 0.000000e+00; } } } { int i2 = 2; for (; (i2<=5); i2 += 2) { yPos = posBegin[1]; yPos = posBegin[1]; } for (; (i2<=6); i2 += 1) { yPos = posBegin[1]; } } } } } if ((!neighbor_isValid[1][3])) { { double xPos; double yPos; /* Statements in this Scop: S743, S737, S733, S742, S736, S739, S738, S741, S735, S740, S734 */ { { { { { { { { { { { int i2 = 2; for (; (i2<=5); i2 += 2) { yPos = posEnd[1]; yPos = posEnd[1]; } for (; (i2<=6); i2 += 1) { yPos = posEnd[1]; } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+376)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+377)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+376)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+488)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+489)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+488)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+40)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+41)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+40)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+208)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+209)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+208)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+152)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+153)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { 
fieldData_LaplaceCoeff_2_p1[(i2+152)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+320)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+321)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+320)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+432)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+433)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+432)] = 0.000000e+00; } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+96)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+97)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+96)] = 0.000000e+00; } } } { int i2 = 2; for (; (i2<=5); i2 += 2) { xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); xPos = ((((i2-1)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } for (; (i2<=6); i2 += 1) { xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]); } } } { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]); int i2 = 2; for (; (i2<=5); i2 += 2) { fieldData_LaplaceCoeff_2_p1[(i2+264)] = 0.000000e+00; fieldData_LaplaceCoeff_2_p1[(i2+265)] = 0.000000e+00; } for (; (i2<=6); i2 += 1) { fieldData_LaplaceCoeff_2_p1[(i2+264)] = 0.000000e+00; } } } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { /* Statements in this Scop: S744 */ for (int i0 = 0; (i0<=8); i0 += 1) { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][(i0*56)]); double* buffer_Send_1_p1 = (&buffer_Send[1][(i0*5)]); int i1 = 1; for (; (i1<=4); i1 += 2) { 
buffer_Send_1_p1[(i1-1)] = fieldData_LaplaceCoeff_2_p1[((i1*8)+6)]; buffer_Send_1_p1[i1] = fieldData_LaplaceCoeff_2_p1[((i1*8)+14)]; } for (; (i1<=5); i1 += 1) { buffer_Send_1_p1[(i1-1)] = fieldData_LaplaceCoeff_2_p1[((i1*8)+6)]; } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { MPI_Isend(buffer_Send[1], 45, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { MPI_Irecv(buffer_Recv[0], 45, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); reqOutstanding_Recv[0] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { /* Statements in this Scop: S745 */ for (int i0 = 0; (i0<=8); i0 += 1) { double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i0*5)]); double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][(i0*56)]); int i1 = 3; for (; (i1<=6); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)-14)] = buffer_Recv_0_p1[(i1-3)]; fieldData_LaplaceCoeff_2_p1[((i1*8)-6)] = buffer_Recv_0_p1[(i1-2)]; } for (; (i1<=7); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)-14)] = buffer_Recv_0_p1[(i1-3)]; } } } } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); 
reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) { MPI_Isend(&fieldData_LaplaceCoeff[2][42], 1, mpiDatatype_9_5_56, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) { MPI_Irecv(&fieldData_LaplaceCoeff[2][10], 1, mpiDatatype_9_5_56, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { /* Statements in this Scop: S746 */ for (int i0 = 0; (i0<=8); i0 += 1) { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][(i0*56)]); double* buffer_Send_0_p1 = (&buffer_Send[0][(i0*7)]); int i1 = 0; for (; (i1<=5); i1 += 2) { buffer_Send_0_p1[i1] = fieldData_LaplaceCoeff_2_p1[((i1*8)+3)]; buffer_Send_0_p1[(i1+1)] = fieldData_LaplaceCoeff_2_p1[((i1*8)+11)]; } for (; (i1<=6); i1 += 1) { 
buffer_Send_0_p1[i1] = fieldData_LaplaceCoeff_2_p1[((i1*8)+3)]; } } } if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { /* Statements in this Scop: S747 */ for (int i0 = 0; (i0<=8); i0 += 1) { double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][(i0*56)]); double* buffer_Send_1_p1 = (&buffer_Send[1][(i0*7)]); int i1 = 0; for (; (i1<=5); i1 += 2) { buffer_Send_1_p1[i1] = fieldData_LaplaceCoeff_2_p1[((i1*8)+5)]; buffer_Send_1_p1[(i1+1)] = fieldData_LaplaceCoeff_2_p1[((i1*8)+13)]; } for (; (i1<=6); i1 += 1) { buffer_Send_1_p1[i1] = fieldData_LaplaceCoeff_2_p1[((i1*8)+5)]; } } } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { MPI_Isend(buffer_Send[0], 63, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]); reqOutstanding_Send[0] = true; } if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { MPI_Isend(buffer_Send[1], 63, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]); reqOutstanding_Send[1] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { MPI_Irecv(buffer_Recv[0], 63, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]); reqOutstanding_Recv[0] = true; } if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { MPI_Irecv(buffer_Recv[1], 63, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)(neighbor_fragCommId[1][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]); reqOutstanding_Recv[1] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; 
++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[0]) { waitForMPIReq(&mpiRequest_Recv[0]); reqOutstanding_Recv[0] = false; } if (reqOutstanding_Recv[1]) { waitForMPIReq(&mpiRequest_Recv[1]); reqOutstanding_Recv[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) { /* Statements in this Scop: S748 */ for (int i0 = 0; (i0<=8); i0 += 1) { double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i0*7)]); double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][(i0*56)]); int i1 = 1; for (; (i1<=6); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)-7)] = buffer_Recv_0_p1[(i1-1)]; fieldData_LaplaceCoeff_2_p1[((i1*8)+1)] = buffer_Recv_0_p1[i1]; } for (; (i1<=7); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)-7)] = buffer_Recv_0_p1[(i1-1)]; } } } if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) { /* Statements in this Scop: S749 */ for (int i0 = 0; (i0<=8); i0 += 1) { double* buffer_Recv_1_p1 = (&buffer_Recv[1][(i0*7)]); double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][(i0*56)]); int i1 = 7; for (; (i1<=12); i1 += 2) { fieldData_LaplaceCoeff_2_p1[((i1*8)-49)] = buffer_Recv_1_p1[(i1-7)]; fieldData_LaplaceCoeff_2_p1[((i1*8)-41)] = buffer_Recv_1_p1[(i1-6)]; } for (; (i1<=13); i1 += 1) { fieldData_LaplaceCoeff_2_p1[((i1*8)-49)] = buffer_Recv_1_p1[(i1-7)]; } } } } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[0]) { waitForMPIReq(&mpiRequest_Send[0]); reqOutstanding_Send[0] = false; } if (reqOutstanding_Send[1]) { waitForMPIReq(&mpiRequest_Send[1]); reqOutstanding_Send[1] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; ; } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) { 
MPI_Isend(&fieldData_LaplaceCoeff[2][17], 1, mpiDatatype_9_7_56, neighbor_remoteRank[1][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]); reqOutstanding_Send[2] = true; } if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) { MPI_Isend(&fieldData_LaplaceCoeff[2][33], 1, mpiDatatype_9_7_56, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]); reqOutstanding_Send[3] = true; } } } ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) { MPI_Irecv(&fieldData_LaplaceCoeff[2][1], 1, mpiDatatype_9_7_56, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]); reqOutstanding_Recv[2] = true; } if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) { MPI_Irecv(&fieldData_LaplaceCoeff[2][49], 1, mpiDatatype_9_7_56, neighbor_remoteRank[1][3], ((unsigned int)(neighbor_fragCommId[1][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]); reqOutstanding_Recv[3] = true; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Recv[2]) { waitForMPIReq(&mpiRequest_Recv[2]); reqOutstanding_Recv[2] = false; } if (reqOutstanding_Recv[3]) { waitForMPIReq(&mpiRequest_Recv[3]); reqOutstanding_Recv[3] = false; } } } for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { ; ; } } ; ; for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) { if (isValidForSubdomain[1]) { if (reqOutstanding_Send[2]) { waitForMPIReq(&mpiRequest_Send[2]); reqOutstanding_Send[2] = false; } if (reqOutstanding_Send[3]) { waitForMPIReq(&mpiRequest_Send[3]); reqOutstanding_Send[3] = false; } } } }
int vdwioffset3; __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128i vfitab; __m128i ifour = _mm_set1_epi32(4); __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; real *vftab; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm_set1_pd(fr->epsfac); charge = mdatoms->chargeA;
void mlib_s_ImageScalarBlend_s32( mlib_s32 *dst, mlib_s32 dlb, const mlib_s32 *src1, mlib_s32 slb1, const mlib_s32 *src2, mlib_s32 slb2, const mlib_s32 *alpha, mlib_s32 xsize, mlib_s32 ysize, mlib_s32 nchan) { mlib_s32 i, j, nsize; __m128i *srcPtr1, *srcPtr2, *dstPtr; mlib_s32 *dl = dst; mlib_s32 *sl1 = (mlib_s32 *)src1, *sl2 = (mlib_s32 *)src2; __m128d alphas0, alphas1, alphau0, alphau1, alphav0, alphav1; __m128d betas0, betas1, betau0, betau1, betav0, betav1; __m128d ones = _mm_set1_pd(1.0f); mlib_s32 res, sdata1, sdata2; mlib_d64 a0, a1, a2, a3; nsize = xsize * nchan; switch (nchan) { case 1: a0 = -(alpha[0] & MASK) / (mlib_d64)MLIB_S32_MIN; alphas0 = _mm_set1_pd(a0); alphas1 = _mm_set1_pd(a0); betas0 = _mm_sub_pd(ones, alphas0); betas1 = _mm_sub_pd(ones, alphas1); if ((((mlib_addr)dst | dlb | (mlib_addr)src1 | slb1 | (mlib_addr)src2 | slb2) & 0xf) == 0) { for (j = 0; j < ysize; j ++) { srcPtr1 = (__m128i *)sl1; srcPtr2 = (__m128i *)sl2; dstPtr = (__m128i *)dl; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 4); i += 4) { MLIB_S_IMAGESCALARBLEND_S32( _mm_store_si128, _mm_load_si128, _mm_load_si128); } for (; i < nsize; i++) { MLIB_C_IMAGESCALARBLEND_S32_1( sl1, sl2, dl); } sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1); sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2); dl = (mlib_s32 *)((mlib_u8 *)dl + dlb); } } else if ((((mlib_addr)src1 | slb1 | (mlib_addr)src2 | slb2) & 0xf) == 0) { for (j = 0; j < ysize; j ++) { srcPtr1 = (__m128i *)sl1; srcPtr2 = (__m128i *)sl2; dstPtr = (__m128i *)dl; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 4); i += 4) { MLIB_S_IMAGESCALARBLEND_S32( _mm_storeu_si128, _mm_load_si128, _mm_load_si128); } for (; i < nsize; i++) { MLIB_C_IMAGESCALARBLEND_S32_1( sl1, sl2, dl); } sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1); sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2); dl = (mlib_s32 *)((mlib_u8 *)dl + dlb); } } else { for (j = 0; j < ysize; j ++) { srcPtr1 = (__m128i *)sl1; 
srcPtr2 = (__m128i *)sl2; dstPtr = (__m128i *)dl; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 4); i += 4) { MLIB_S_IMAGESCALARBLEND_S32( _mm_storeu_si128, _mm_loadu_si128, _mm_loadu_si128); } for (; i < nsize; i++) { MLIB_C_IMAGESCALARBLEND_S32_1( sl1, sl2, dl); } sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1); sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2); dl = (mlib_s32 *)((mlib_u8 *)dl + dlb); } } break; case 2: a0 = -(alpha[0] & MASK) / (mlib_d64)MLIB_S32_MIN; a1 = -(alpha[1] & MASK) / (mlib_d64)MLIB_S32_MIN; alphas0 = _mm_set_pd(a1, a0); alphas1 = _mm_set_pd(a1, a0); betas0 = _mm_sub_pd(ones, alphas0); betas1 = _mm_sub_pd(ones, alphas1); if ((((mlib_addr)dst | dlb | (mlib_addr)src1 | slb1 | (mlib_addr)src2 | slb2) & 0xf) == 0) { for (j = 0; j < ysize; j ++) { srcPtr1 = (__m128i *)sl1; srcPtr2 = (__m128i *)sl2; dstPtr = (__m128i *)dl; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 4); i += 4) { MLIB_S_IMAGESCALARBLEND_S32( _mm_store_si128, _mm_load_si128, _mm_load_si128); } for (; i < nsize; i += 2) { MLIB_C_IMAGESCALARBLEND_S32_2( sl1, sl2, dl); } sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1); sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2); dl = (mlib_s32 *)((mlib_u8 *)dl + dlb); } } else if ((((mlib_addr)src1 | slb1 | (mlib_addr)src2 | slb2) & 0xf) == 0) { for (j = 0; j < ysize; j ++) { srcPtr1 = (__m128i *)sl1; srcPtr2 = (__m128i *)sl2; dstPtr = (__m128i *)dl; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 4); i += 4) { MLIB_S_IMAGESCALARBLEND_S32( _mm_storeu_si128, _mm_load_si128, _mm_load_si128); } for (; i < nsize; i += 2) { MLIB_C_IMAGESCALARBLEND_S32_2( sl1, sl2, dl); } sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1); sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2); dl = (mlib_s32 *)((mlib_u8 *)dl + dlb); } } else { for (j = 0; j < ysize; j ++) { srcPtr1 = (__m128i *)sl1; srcPtr2 = (__m128i *)sl2; dstPtr = (__m128i *)dl; #ifdef __SUNPRO_C #pragma pipeloop(0) 
#endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 4); i += 4) { MLIB_S_IMAGESCALARBLEND_S32( _mm_storeu_si128, _mm_loadu_si128, _mm_loadu_si128); } for (; i < nsize; i += 2) { MLIB_C_IMAGESCALARBLEND_S32_2( sl1, sl2, dl); } sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1); sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2); dl = (mlib_s32 *)((mlib_u8 *)dl + dlb); } } break; case 3: a0 = -(alpha[0] & MASK) / (mlib_d64)MLIB_S32_MIN; a1 = -(alpha[1] & MASK) / (mlib_d64)MLIB_S32_MIN; a2 = -(alpha[2] & MASK) / (mlib_d64)MLIB_S32_MIN; alphas0 = _mm_set_pd(a1, a0); alphas1 = _mm_set_pd(a0, a2); alphau0 = _mm_set_pd(a2, a1); alphau1 = _mm_set_pd(a1, a0); alphav0 = _mm_set_pd(a0, a2); alphav1 = _mm_set_pd(a2, a1); betas0 = _mm_sub_pd(ones, alphas0); betas1 = _mm_sub_pd(ones, alphas1); betau0 = _mm_sub_pd(ones, alphau0); betau1 = _mm_sub_pd(ones, alphau1); betav0 = _mm_sub_pd(ones, alphav0); betav1 = _mm_sub_pd(ones, alphav1); if ((((mlib_addr)dst | dlb | (mlib_addr)src1 | slb1 | (mlib_addr)src2 | slb2) & 0xf) == 0) { for (j = 0; j < ysize; j ++) { srcPtr1 = (__m128i *)sl1; srcPtr2 = (__m128i *)sl2; dstPtr = (__m128i *)dl; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 12); i += 12) { MLIB_S_IMAGESCALARBLEND3_S32( _mm_store_si128, _mm_load_si128, _mm_load_si128); } for (; i < nsize; i += 3) { MLIB_C_IMAGESCALARBLEND_S32_3( sl1, sl2, dl); } sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1); sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2); dl = (mlib_s32 *)((mlib_u8 *)dl + dlb); } } else if ((((mlib_addr)src1 | slb1 | (mlib_addr)src2 | slb2) & 0xf) == 0) { for (j = 0; j < ysize; j ++) { srcPtr1 = (__m128i *)sl1; srcPtr2 = (__m128i *)sl2; dstPtr = (__m128i *)dl; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 12); i += 12) { MLIB_S_IMAGESCALARBLEND3_S32( _mm_storeu_si128, _mm_load_si128, _mm_load_si128); } for (; i < nsize; i += 3) { MLIB_C_IMAGESCALARBLEND_S32_3( sl1, sl2, dl); } sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1); sl2 
= (mlib_s32 *)((mlib_u8 *)sl2 + slb2); dl = (mlib_s32 *)((mlib_u8 *)dl + dlb); } } else { for (j = 0; j < ysize; j ++) { srcPtr1 = (__m128i *)sl1; srcPtr2 = (__m128i *)sl2; dstPtr = (__m128i *)dl; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 12); i += 12) { MLIB_S_IMAGESCALARBLEND3_S32( _mm_storeu_si128, _mm_loadu_si128, _mm_loadu_si128); } for (; i < nsize; i += 3) { MLIB_C_IMAGESCALARBLEND_S32_3( sl1, sl2, dl); } sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1); sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2); dl = (mlib_s32 *)((mlib_u8 *)dl + dlb); } } break; case 4: a0 = -(alpha[0] & MASK) / (mlib_d64)MLIB_S32_MIN; a1 = -(alpha[1] & MASK) / (mlib_d64)MLIB_S32_MIN; a2 = -(alpha[2] & MASK) / (mlib_d64)MLIB_S32_MIN; a3 = -(alpha[3] & MASK) / (mlib_d64)MLIB_S32_MIN; alphas0 = _mm_set_pd(a1, a0); alphas1 = _mm_set_pd(a3, a2); betas0 = _mm_sub_pd(ones, alphas0); betas1 = _mm_sub_pd(ones, alphas1); if ((((mlib_addr)dst | dlb | (mlib_addr)src1 | slb1 | (mlib_addr)src2 | slb2) & 0xf) == 0) { for (j = 0; j < ysize; j ++) { srcPtr1 = (__m128i *)sl1; srcPtr2 = (__m128i *)sl2; dstPtr = (__m128i *)dl; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 4); i += 4) { MLIB_S_IMAGESCALARBLEND_S32( _mm_store_si128, _mm_load_si128, _mm_load_si128); } sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1); sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2); dl = (mlib_s32 *)((mlib_u8 *)dl + dlb); } } else if ((((mlib_addr)src1 | slb1 | (mlib_addr)src2 | slb2) & 0xf) == 0) { for (j = 0; j < ysize; j ++) { srcPtr1 = (__m128i *)sl1; srcPtr2 = (__m128i *)sl2; dstPtr = (__m128i *)dl; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 4); i += 4) { MLIB_S_IMAGESCALARBLEND_S32( _mm_storeu_si128, _mm_load_si128, _mm_load_si128); } sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1); sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2); dl = (mlib_s32 *)((mlib_u8 *)dl + dlb); } } else { for (j = 0; j < ysize; j ++) { srcPtr1 = 
(__m128i *)sl1; srcPtr2 = (__m128i *)sl2; dstPtr = (__m128i *)dl; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (nsize - 4); i += 4) { MLIB_S_IMAGESCALARBLEND_S32( _mm_storeu_si128, _mm_loadu_si128, _mm_loadu_si128); } sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1); sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2); dl = (mlib_s32 *)((mlib_u8 *)dl + dlb); } } break; } }
int jnrA,jnrB; int j_coord_offsetA,j_coord_offsetB; int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm_set1_pd(fr->epsfac); charge = mdatoms->chargeA; krf = _mm_set1_pd(fr->ic->k_rf);
void matrix_vector_mul_SSE_f48_loop_unrolled (fl48** mat, fl48* &vec) { // TESTING change SIZE to min 8 - but multiple of 8 fl48* result = new fl48[SIZE]; __m128i load_mask = _mm_set_epi8(11, 10, 9, 8, 7, 6, 255, 255, 5, 4, 3, 2, 1, 0, 255, 255); for(unsigned i=0;i<SIZE;i+=8) { // row // requiring 8 at a time - because loop un-roll __m128d running_sum1 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum2 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum3 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum4 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum5 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum6 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum7 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum8 = _mm_set1_pd(0.0); // running sum initially 0 for(unsigned j=0;j<SIZE;j+=2) { // col - requires skipping on 2 at a time __m128i mat_vect = _mm_loadu_si128((__m128i*) &mat[i][j]); // hoping that addresses are as expected - seems like this is the way it's stored mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); __m128i vec_elem = _mm_loadu_si128((__m128i*) &vec[j]); vec_elem = _mm_shuffle_epi8(vec_elem, load_mask); __m128d mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum1 = _mm_add_pd(mult,running_sum1); mat_vect = _mm_loadu_si128((__m128i*) &mat[i+1][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum2 = _mm_add_pd(mult,running_sum2); mat_vect = _mm_loadu_si128((__m128i*) &mat[i+2][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum3 = _mm_add_pd(mult,running_sum3); mat_vect = _mm_loadu_si128((__m128i*) &mat[i+3][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum4 = _mm_add_pd(mult,running_sum4); mat_vect = 
_mm_loadu_si128((__m128i*) &mat[i+4][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum5 = _mm_add_pd(mult,running_sum5); mat_vect = _mm_loadu_si128((__m128i*) &mat[i+5][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum6 = _mm_add_pd(mult,running_sum6); mat_vect = _mm_loadu_si128((__m128i*) &mat[i+6][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum7 = _mm_add_pd(mult,running_sum7); mat_vect = _mm_loadu_si128((__m128i*) &mat[i+7][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum8 = _mm_add_pd(mult,running_sum8); } __m128i mask = _mm_set_epi8(7 ,6 ,5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); __m128i sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum1, mask); running_sum1 = _mm_add_pd(running_sum1,(__m128d)sum_shuffled); sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum2, mask); running_sum2 = _mm_add_pd(running_sum2,(__m128d)sum_shuffled); sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum3, mask); running_sum3 = _mm_add_pd(running_sum3,(__m128d)sum_shuffled); sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum4, mask); running_sum4 = _mm_add_pd(running_sum4,(__m128d)sum_shuffled); sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum5, mask); running_sum5 = _mm_add_pd(running_sum5,(__m128d)sum_shuffled); sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum6, mask); running_sum6 = _mm_add_pd(running_sum6,(__m128d)sum_shuffled); sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum7, mask); running_sum7 = _mm_add_pd(running_sum7,(__m128d)sum_shuffled); sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum8, mask); running_sum8 = _mm_add_pd(running_sum8,(__m128d)sum_shuffled); // mesh them into 4 __m128i mask_first = _mm_set_epi8(255,255,255,255,255,255,255,255, 7 ,6 ,5, 
4, 3, 2, 1, 0); __m128i mask_second = _mm_set_epi8(7 ,6 ,5, 4, 3, 2, 1, 0, 255,255,255,255,255,255,255,255); running_sum1 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum1, mask_first); running_sum2 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum2, mask_second); running_sum1 = (__m128d)_mm_or_si128((__m128i)running_sum1, (__m128i)running_sum2); running_sum3 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum3, mask_first); running_sum4 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum4, mask_second); running_sum2 = (__m128d)_mm_or_si128((__m128i)running_sum3, (__m128i)running_sum4); running_sum5 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum5, mask_first); running_sum6 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum6, mask_second); running_sum3 = (__m128d)_mm_or_si128((__m128i)running_sum6, (__m128i)running_sum5); running_sum7 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum7, mask_first); running_sum8 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum8, mask_second); running_sum4 = (__m128d)_mm_or_si128((__m128i)running_sum8, (__m128i)running_sum7); // RS 1-4 are right and expected here too // rs 5-8 neglected and not required from now __m128i a01_round = convert_double_to_f48_SSE((__m128i)running_sum1); __m128i a23_round = convert_double_to_f48_SSE((__m128i)running_sum2); __m128i a45_round = convert_double_to_f48_SSE((__m128i)running_sum3); __m128i a67_round = convert_double_to_f48_SSE((__m128i)running_sum4); // place them right for memory write __m128i match_mask = _mm_set_epi8(3,2,1,0,255,255,255,255,255,255,255,255,255,255,255,255); // mask used to match the missing spaces __m128i a23_shuffled = _mm_shuffle_epi8((__m128i)a23_round, match_mask); // shuffle the positions required for the space in a01 for a2 a01_round = _mm_or_si128(a01_round,a23_shuffled); a23_round = _mm_srli_si128 (a23_round, 4); // using _mm_srli_si128 instead of _mm_sll_epi64 because the epi64 shitfs witin each double element in the 128 item match_mask = 
_mm_set_epi8(7,6,5,4,3,2,1,0,255,255,255,255,255,255,255,255); // reset the match mask for a4 and small bit of a5 __m128i a45_shuffled = _mm_shuffle_epi8((__m128i)a45_round, match_mask); // shuffle a45 to fit in a23 a23_round = _mm_or_si128(a23_round,a45_shuffled); a45_round = _mm_srli_si128(a45_round, 8); // using _mm_srli_si128 instead of _mm_sll_epi64 because the epi64 shitfs witin each double element in the 128 item match_mask = _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,255,255,255,255); __m128i a67_shuffled = _mm_shuffle_epi8((__m128i)a67_round, match_mask); a45_round = _mm_or_si128(a45_round,a67_shuffled); // WRITE BACK TO MEMORY! _mm_storeu_pd((double*)&result[i], (__m128d)a01_round); _mm_storeu_pd(bofs(&result[i],2), (__m128d)a23_round); _mm_storeu_pd(bofs(&result[i],4), (__m128d)a45_round); } vec = result; }
void mymm_dsymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, const double alpha, const double *A, const int lda, const double *X, const int incX, const double beta, double *Y, const int incY) { // limited implementation assert(Order==CblasRowMajor); assert(Uplo==CblasUpper); assert(N==lda); __builtin_prefetch (Y, 1, 3); __builtin_prefetch (X, 1, 3); int i,j; double temp , reg1; double* unpack; const double *pA=A, *pX=X; double* pY = Y; __m128d mm_beta = _mm_set1_pd(beta); __m128d mm_alpha = _mm_set1_pd(alpha); __m128d mm_reg1 __attribute__((aligned (16))); __m128d mm_reg2 __attribute__((aligned (16))); __m128d mm_temp __attribute__((aligned (16))); posix_memalign((void**)&unpack, 16, 2*sizeof(double)); // y = beta*y for(i=0;i<lda-1;i+=2,pY+=(2*incY)) { mm_reg1 = _mm_loadu_pd(pY); mm_reg1 = _mm_mul_pd(mm_reg1, mm_beta); _mm_storeu_pd( pY, mm_reg1); } for(;i<lda;i++,pY++) (*pY) = beta * (*pY); pY = Y; for(i=0;i<lda;i++,pA+=i,pY+=incY) { pX = X + i*incX; temp = 0.0; mm_temp = _mm_set1_pd(0.0); j=i; for(;j<N-1;j+=2,pA+=2,pX+=(2*incX)) { mm_reg1 = _mm_loadu_pd(pA); mm_reg2 = _mm_loadu_pd(pX); mm_reg1 = _mm_mul_pd(mm_reg1, mm_reg2); mm_reg2 = _mm_mul_pd(mm_alpha, mm_reg1); mm_temp = _mm_add_pd(mm_temp, mm_reg2); } _mm_store_pd(unpack, mm_temp); temp+=unpack[0]; temp+=unpack[1]; for(;j<N;j++,pA++,pX+=incX) temp += alpha * (*pA) * (*pX); (*pY) += temp; } pA = A; pX = X; for(i=0;i<lda;i++,pA+=i,pX+=incX) { reg1 = (*pX); mm_reg1 = _mm_set1_pd(reg1); pA++; pY=Y+(i+1)*incY; j=i+1; for(;j<N-1;j+=2,pA+=2,pY+=(2*incY)) { mm_reg2 = _mm_loadu_pd(pA); mm_reg2 = _mm_mul_pd(mm_reg2, mm_reg1); mm_reg2 = _mm_mul_pd(mm_alpha, mm_reg2); _mm_storeu_pd( pY, _mm_add_pd(_mm_loadu_pd(pY),mm_reg2)); } for(;j<N;j++,pA++,pY+=incY) *pY += alpha * (*pA) * reg1; } }
int vdwioffset0; __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwioffset1; __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; int vdwioffset2; __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm_set1_pd(fr->epsfac); charge = mdatoms->chargeA; krf = _mm_set1_pd(fr->ic->k_rf);
int vdwioffset1; __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; int vdwioffset2; __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; int vdwioffset3; __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm_set1_pd(fr->epsfac); charge = mdatoms->chargeA;
__m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; int vdwioffset3; __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128d one_sixth = _mm_set1_pd(1.0/6.0); __m128d one_twelfth = _mm_set1_pd(1.0/12.0); __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0];
int vdwioffset1; __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1; int vdwioffset2; __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128d one_sixth = _mm_set1_pd(1.0/6.0); __m128d one_twelfth = _mm_set1_pd(1.0/12.0); __m128i vfitab; __m128i ifour = _mm_set1_epi32(4); __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps; real *vftab; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex;
/*
 * Two-tap FIR accumulation on doubles:
 *
 *     pdst[k] += pflt[1] * psrc[k] + pflt[0] * psrc[k + 1],   k = 0 .. n-1
 *
 * pdst  accumulator array, n elements are updated in place.
 * psrc  input samples; elements 0 .. n are read.
 * pflt  the two filter coefficients.
 * n     number of outputs; nothing is read from psrc or written when n <= 0.
 *
 * The main loop uses an aligned 16-byte load on psrc, so one leading output
 * is peeled off in scalar code when psrc is not 16-byte aligned.  This
 * presumes psrc is at least 8-byte aligned, so that stepping one element
 * reaches a 16-byte boundary -- confirm with callers.
 */
void
mlib_FIR_tap2_d64(
    mlib_d64 *pdst,
    const mlib_d64 *psrc,
    mlib_d64 *pflt,
    mlib_s32 n)
{
	mlib_s32 j = 0;
	const mlib_d64 dflt1 = pflt[0];
	const mlib_d64 dflt2 = pflt[1];
	__m128d sdflt1, sdflt2;
	__m128d ssrc1, ssrc2;
	__m128d sacc, sdst;

	/* Without this guard the alignment peel below would still emit one */
	/* output for an empty request. */
	if (n <= 0)
		return;

	/* Scalar peel: brings psrc onto a 16-byte boundary. */
	if ((mlib_addr)psrc & 15) {
		pdst[0] += dflt2 * psrc[0] + dflt1 * psrc[1];
		psrc++;
		pdst++;
		j++;
	}

	sdflt2 = _mm_set1_pd(dflt2);
	sdflt1 = _mm_set1_pd(dflt1);

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (; j < (n - 1); j += 2) {
		/* psrc is aligned here; psrc + 1 and pdst need not be. */
		ssrc1 = _mm_load_pd(psrc);
		ssrc2 = _mm_loadu_pd(psrc + 1);

		sacc = _mm_add_pd(_mm_mul_pd(sdflt2, ssrc1),
		    _mm_mul_pd(sdflt1, ssrc2));

		sdst = _mm_loadu_pd(pdst);
		sdst = _mm_add_pd(sdst, sacc);
		_mm_storeu_pd(pdst, sdst);

		psrc += 2;
		pdst += 2;
	}

	/* The paired loop leaves at most one output to finish. */
	if (j < n)
		pdst[0] += dflt2 * psrc[0] + dflt1 * psrc[1];
}