Ejemplo n.º 1
0
/*
 * 4-tap FIR filter over mlib_d64 samples (SSE2).
 *
 * For every j in [0, n):
 *   pdst[j] = pflt[3] * psrc[j]     + pflt[2] * psrc[j + 1] +
 *             pflt[1] * psrc[j + 2] + pflt[0] * psrc[j + 3]
 *
 *   pdst - output; n elements are written (unaligned stores are used)
 *   psrc - input; elements psrc[0] .. psrc[n + 2] are read
 *   pflt - the four coefficients pflt[0] .. pflt[3]
 *   n    - number of outputs; nothing is read from psrc or written to
 *          pdst when n <= 0
 *
 * Note: the scalar path sums the four products left to right, while the
 * packed path adds them pairwise, so the two paths may round differently.
 */
void
mlib_FIR_tap4f_d64(
    mlib_d64 *pdst,
    const mlib_d64 *psrc,
    mlib_d64 *pflt,
    mlib_s32 n)
{
	mlib_s32 j = 0;
	mlib_d64 src1, src2, src3, src4;
	mlib_d64 dflt1 = pflt[0], dflt2 = pflt[1], dflt3 = pflt[2], dflt4 =
	    pflt[3];
	__m128d sdflt1, sdflt2, sdflt3, sdflt4;
	__m128d ssrc1, ssrc2, ssrc3, ssrc4;
	__m128d smul1, smul2, smul3, smul4;

	/*
	 * Peel one scalar output when psrc is not 16-byte aligned so the
	 * packed loop can use aligned loads at psrc and psrc + 2.
	 * (Assumes psrc is at least 8-byte aligned, so that stepping one
	 * element reaches a 16-byte boundary -- TODO confirm with callers.)
	 * The n > 0 test keeps an empty request from touching pdst[0] and
	 * psrc[3].
	 */
	if (n > 0 && ((mlib_addr)psrc & 15)) {
		src1 = psrc[0];
		src2 = psrc[1];
		src3 = psrc[2];
		src4 = psrc[3];

		pdst[0] =
		    dflt4 * src1 + dflt3 * src2 + dflt2 * src3 + dflt1 * src4;

		psrc++;
		pdst++;
		j++;
	}

	sdflt4 = _mm_set1_pd(dflt4);
	sdflt3 = _mm_set1_pd(dflt3);
	sdflt2 = _mm_set1_pd(dflt2);
	sdflt1 = _mm_set1_pd(dflt1);

	/*
	 * Two outputs per iteration. "j + 1 < n" is the same test as
	 * "j < n - 1" without the signed overflow for the most negative n.
	 */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (; j + 1 < n; j += 2) {
		ssrc1 = _mm_load_pd(psrc);
		ssrc2 = _mm_loadu_pd(psrc + 1);
		ssrc3 = _mm_load_pd(psrc + 2);
		ssrc4 = _mm_loadu_pd(psrc + 3);

		smul1 = _mm_mul_pd(sdflt4, ssrc1);
		smul2 = _mm_mul_pd(sdflt3, ssrc2);
		smul3 = _mm_mul_pd(sdflt2, ssrc3);
		smul4 = _mm_mul_pd(sdflt1, ssrc4);

		smul1 = _mm_add_pd(smul1, smul2);
		smul3 = _mm_add_pd(smul3, smul4);
		smul1 = _mm_add_pd(smul1, smul3);

		_mm_storeu_pd(pdst, smul1);

		psrc += 2;
		pdst += 2;
	}

	/* Scalar tail: whatever the packed loop left over (at most one). */
	if (j < n) {
		src1 = psrc[0];
		src2 = psrc[1];
		src3 = psrc[2];

		for (; j < n; j++) {
			src4 = psrc[3];

			pdst[0] =
			    dflt4 * src1 + dflt3 * src2 + dflt2 * src3 +
			    dflt1 * src4;

			psrc++;
			pdst++;

			/* slide the three-sample history window */
			src1 = src2;
			src2 = src3;
			src3 = src4;
		}
	}
}
    int              jnrA,jnrB;
    int              j_coord_offsetA,j_coord_offsetB;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    int              nvdwtype;
    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
    __m128i          vfitab;
    __m128i          ifour       = _mm_set1_epi32(4);
    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
    real             *vftab;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128i          ewitab;
    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    real             *ewtab;
    __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
    real             rswitch_scalar,d_scalar;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
    int              jnrA,jnrB;
    int              j_coord_offsetA,j_coord_offsetB;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    int              nvdwtype;
    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
    __m128d           c6grid_00;
    __m128d           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
    real             *vdwgridparam;
    __m128d           one_half = _mm_set1_pd(0.5);
    __m128d           minus_one = _mm_set1_pd(-1.0);
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwioffset3;
    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128i          ewitab;
    __m128d          ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    real             *ewtab;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
 int              jnrA,jnrB;
 int              j_coord_offsetA,j_coord_offsetB;
 int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 real             rcutoff_scalar;
 real             *shiftvec,*fshift,*x,*f;
 __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
 int              vdwioffset0;
 __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 int              vdwjidx0A,vdwjidx0B;
 __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
 real             *charge;
 __m128i          gbitab;
 __m128d          vgb,fgb,vgbsum,dvdasum,gbscale,gbtabscale,isaprod,gbqqfactor,gbinvepsdiff,dvdaj,gbeps,dvdatmp;
 __m128d          minushalf = _mm_set1_pd(-0.5);
 real             *invsqrta,*dvda,*gbtab;
 int              nvdwtype;
 __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 int              *vdwtype;
 real             *vdwparam;
 __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
 __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
 __m128i          vfitab;
 __m128i          ifour       = _mm_set1_epi32(4);
 __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
 real             *vftab;
 __m128d          dummy_mask,cutoff_mask;
 __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
 __m128d          one     = _mm_set1_pd(1.0);
 __m128d          two     = _mm_set1_pd(2.0);
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
    __m128i          ewitab;
    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    real             *ewtab;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
    __m128i          ewitab;
    __m128d          ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    real             *ewtab;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
    __m128i          vfitab;
    __m128i          ifour       = _mm_set1_epi32(4);
    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
    real             *vftab;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
    __m128d          rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
    real             rswitch_scalar,d_scalar;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
Ejemplo n.º 11
0
void exchsolutionData_2(unsigned int slot) {
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((!neighbor_isValid[1][0])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S397, S396, S398 */
{
{
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(4.000000e+00);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<3); i1 += 4) {
/* yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<6); i1 += 1) {
yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
{
double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_Solution_2_p1[((i1*8)+2)] = 0.000000e+00;
fieldData_Solution_2_p1[((i1*8)+10)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_Solution_2_p1[((i1*8)+2)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posBegin[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<3); i1 += 4) {
/* xPos = posBegin[0]; */
__m128d vec0 = _mm_load1_pd((&posBegin[0]));
__m128d vec0_2 = _mm_load1_pd((&posBegin[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<6); i1 += 1) {
xPos = posBegin[0];
}
}
}
}
}
if ((!neighbor_isValid[1][1])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S401, S400, S399 */
{
{
{
double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_Solution_2_p1[((i1*8)+6)] = 0.000000e+00;
fieldData_Solution_2_p1[((i1*8)+14)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_Solution_2_p1[((i1*8)+6)] = 0.000000e+00;
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posEnd[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<3); i1 += 4) {
/* xPos = posEnd[0]; */
__m128d vec0 = _mm_load1_pd((&posEnd[0]));
__m128d vec0_2 = _mm_load1_pd((&posEnd[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<6); i1 += 1) {
xPos = posEnd[0];
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(4.000000e+00);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<3); i1 += 4) {
/* yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<6); i1 += 1) {
yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
}
}
}
if ((!neighbor_isValid[1][2])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S404, S403, S402 */
{
{
{
double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_Solution_2_p1[(i2+8)] = 0.000000e+00;
fieldData_Solution_2_p1[(i2+9)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_Solution_2_p1[(i2+8)] = 0.000000e+00;
}
}
{
int i2 = 2;
for (; (i2<=5); i2 += 2) {
xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=6); i2 += 1) {
xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
}
{
int i2 = 2;
for (; (i2<=5); i2 += 2) {
yPos = posBegin[1];
yPos = posBegin[1];
}
for (; (i2<=6); i2 += 1) {
yPos = posBegin[1];
}
}
}
}
}
if ((!neighbor_isValid[1][3])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S407, S406, S405 */
{
{
{
double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_Solution_2_p1[(i2+40)] = 0.000000e+00;
fieldData_Solution_2_p1[(i2+41)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_Solution_2_p1[(i2+40)] = 0.000000e+00;
}
}
{
int i2 = 2;
for (; (i2<=5); i2 += 2) {
xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=6); i2 += 1) {
xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
}
{
int i2 = 2;
for (; (i2<=5); i2 += 2) {
yPos = posEnd[1];
yPos = posEnd[1];
}
for (; (i2<=6); i2 += 1) {
yPos = posEnd[1];
}
}
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Isend(&fieldData_Solution[2][14], 1, mpiDatatype_5_1_8, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Irecv(&fieldData_Solution[2][10], 1, mpiDatatype_5_1_8, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Isend(&fieldData_Solution[2][42], 1, mpiDatatype_1_5_8, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Irecv(&fieldData_Solution[2][10], 1, mpiDatatype_1_5_8, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Isend(&fieldData_Solution[2][3], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]);
reqOutstanding_Send[0] = true;
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Isend(&fieldData_Solution[2][5], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Irecv(&fieldData_Solution[2][1], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Irecv(&fieldData_Solution[2][7], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][1], ((unsigned int)(neighbor_fragCommId[1][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
if (reqOutstanding_Recv[1]) {
waitForMPIReq(&mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[0]) {
waitForMPIReq(&mpiRequest_Send[0]);
reqOutstanding_Send[0] = false;
}
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Isend(&fieldData_Solution[2][17], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]);
reqOutstanding_Send[2] = true;
}
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Isend(&fieldData_Solution[2][33], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Irecv(&fieldData_Solution[2][1], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Irecv(&fieldData_Solution[2][49], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][3], ((unsigned int)(neighbor_fragCommId[1][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
if (reqOutstanding_Recv[3]) {
waitForMPIReq(&mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[2]) {
waitForMPIReq(&mpiRequest_Send[2]);
reqOutstanding_Send[2] = false;
}
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
}
    real             *shiftvec,*fshift,*x,*f;
    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128i          vfitab;
    __m128i          ifour       = _mm_set1_epi32(4);
    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
    real             *vftab;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
/*
 * Build a two-lane vector from tmp9: the low lane is copied from tmp9,
 * the high lane is replaced by 1.0.
 */
__m128d cross(__m128d tmp9)
{
  const __m128d ones = _mm_set1_pd(1.0);

  /* _mm_move_sd: upper lane from the first operand, lower lane from the second. */
  return _mm_move_sd(ones, tmp9);
}
Ejemplo n.º 14
0
/*
 * 3-tap FIR filter over pairs of adjacent mlib_d64 values (SSE2); each
 * iteration filters one pair (presumably the two channels of an interleaved
 * stereo stream -- confirm with callers).
 *
 * For every j in [0, n) and c in {0, 1}:
 *   pdst[2*j + c] = pflt[2] * psrc[2*j + c] +
 *                   pflt[1] * psrc[2*j + 2 + c] +
 *                   pflt[0] * psrc[2*j + 4 + c]
 *
 *   pdst - output; 2 * n elements are written with unaligned stores
 *   psrc - input; elements psrc[0] .. psrc[2*n + 3] are read
 *   pflt - the three coefficients pflt[0] .. pflt[2]
 *   n    - number of pairs to produce
 *
 * A 16-byte aligned psrc takes the aligned-load loop; any other address
 * takes the unaligned-load loop. Both loops compute the same values.
 */
void
mlib_FIR_tap3f_d64s(
    mlib_d64 *pdst,
    const mlib_d64 *psrc,
    mlib_d64 *pflt,
    mlib_s32 n)
{
	const __m128d tap0 = _mm_set1_pd(pflt[0]);
	const __m128d tap1 = _mm_set1_pd(pflt[1]);
	const __m128d tap2 = _mm_set1_pd(pflt[2]);
	const int src_aligned = (((mlib_addr)psrc & 15) == 0);
	__m128d acc;
	mlib_s32 k;

	if (src_aligned) {

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (k = 0; k < n; k++, psrc += 2, pdst += 2) {
			acc = _mm_add_pd(
			    _mm_mul_pd(tap2, _mm_load_pd(psrc)),
			    _mm_mul_pd(tap1, _mm_load_pd(psrc + 2)));
			acc = _mm_add_pd(acc,
			    _mm_mul_pd(tap0, _mm_load_pd(psrc + 4)));
			_mm_storeu_pd(pdst, acc);
		}
	} else {

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (k = 0; k < n; k++, psrc += 2, pdst += 2) {
			acc = _mm_add_pd(
			    _mm_mul_pd(tap2, _mm_loadu_pd(psrc)),
			    _mm_mul_pd(tap1, _mm_loadu_pd(psrc + 2)));
			acc = _mm_add_pd(acc,
			    _mm_mul_pd(tap0, _mm_loadu_pd(psrc + 4)));
			_mm_storeu_pd(pdst, acc);
		}
	}
}
    int              jnrA,jnrB;
    int              j_coord_offsetA,j_coord_offsetB;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    int              nvdwtype;
    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    int              vdwioffset1;
    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    int              vdwioffset2;
    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwioffset3;
    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
    krf              = _mm_set1_pd(fr->ic->k_rf);
    int              vdwioffset1;
    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    int              vdwioffset2;
    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
Ejemplo n.º 19
0
void exchlaplacecoeffData_2(unsigned int slot) {
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((!neighbor_isValid[1][0])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S710, S704, S707, S701, S709, S700, S703, S706, S708, S702, S705 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+394)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+402)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+394)] = 0.000000e+00;
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+226)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+234)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+226)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+170)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+178)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+170)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+58)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+66)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+58)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+450)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+458)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+450)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+114)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+122)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+114)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(4.000000e+00);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<3); i1 += 4) {
/* yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<6); i1 += 1) {
yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+2)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+10)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+2)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+338)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+346)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+338)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+282)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+290)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+282)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posBegin[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<3); i1 += 4) {
/* xPos = posBegin[0]; */
__m128d vec0 = _mm_load1_pd((&posBegin[0]));
__m128d vec0_2 = _mm_load1_pd((&posBegin[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<6); i1 += 1) {
xPos = posBegin[0];
}
}
}
}
}
if ((!neighbor_isValid[1][1])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S716, S719, S713, S721, S715, S718, S712, S711, S720, S714, S717 */
{
{
{
{
{
{
{
{
{
{
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(4.000000e+00);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<3); i1 += 4) {
/* yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<6); i1 += 1) {
yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+454)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+462)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+454)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+230)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+238)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+230)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+118)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+126)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+118)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posEnd[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<3); i1 += 4) {
/* xPos = posEnd[0]; */
__m128d vec0 = _mm_load1_pd((&posEnd[0]));
__m128d vec0_2 = _mm_load1_pd((&posEnd[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<6); i1 += 1) {
xPos = posEnd[0];
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+286)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+294)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+286)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+342)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+350)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+342)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+398)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+406)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+398)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+174)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+182)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+174)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+62)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+70)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+62)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+6)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[((i1*8)+14)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)+6)] = 0.000000e+00;
}
}
}
}
}
if ((!neighbor_isValid[1][2])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S722, S731, S725, S728, S727, S730, S724, S732, S726, S729, S723 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+344)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+345)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+344)] = 0.000000e+00;
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+400)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+401)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+400)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+120)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+121)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+120)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+8)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+9)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+8)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+64)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+65)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+64)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+456)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+457)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+456)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+232)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+233)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+232)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+288)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+289)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+288)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=5); i2 += 2) {
xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=6); i2 += 1) {
xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+176)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+177)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+176)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=5); i2 += 2) {
yPos = posBegin[1];
yPos = posBegin[1];
}
for (; (i2<=6); i2 += 1) {
yPos = posBegin[1];
}
}
}
}
}
if ((!neighbor_isValid[1][3])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S743, S737, S733, S742, S736, S739, S738, S741, S735, S740, S734 */
{
{
{
{
{
{
{
{
{
{
{
int i2 = 2;
for (; (i2<=5); i2 += 2) {
yPos = posEnd[1];
yPos = posEnd[1];
}
for (; (i2<=6); i2 += 1) {
yPos = posEnd[1];
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+376)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+377)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+376)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+488)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+489)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+488)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+40)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+41)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+40)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+208)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+209)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+208)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+152)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+153)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+152)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+320)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+321)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+320)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+432)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+433)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+432)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+96)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+97)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+96)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=5); i2 += 2) {
xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=6); i2 += 1) {
xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
}
{
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_LaplaceCoeff_2_p1[(i2+264)] = 0.000000e+00;
fieldData_LaplaceCoeff_2_p1[(i2+265)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_LaplaceCoeff_2_p1[(i2+264)] = 0.000000e+00;
}
}
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
/* Statements in this Scop: S744 */
for (int i0 = 0; (i0<=8); i0 += 1) {
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][(i0*56)]);
double* buffer_Send_1_p1 = (&buffer_Send[1][(i0*5)]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
buffer_Send_1_p1[(i1-1)] = fieldData_LaplaceCoeff_2_p1[((i1*8)+6)];
buffer_Send_1_p1[i1] = fieldData_LaplaceCoeff_2_p1[((i1*8)+14)];
}
for (; (i1<=5); i1 += 1) {
buffer_Send_1_p1[(i1-1)] = fieldData_LaplaceCoeff_2_p1[((i1*8)+6)];
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Isend(buffer_Send[1], 45, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Irecv(buffer_Recv[0], 45, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
/* Statements in this Scop: S745 */
for (int i0 = 0; (i0<=8); i0 += 1) {
double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i0*5)]);
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][(i0*56)]);
int i1 = 3;
for (; (i1<=6); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)-14)] = buffer_Recv_0_p1[(i1-3)];
fieldData_LaplaceCoeff_2_p1[((i1*8)-6)] = buffer_Recv_0_p1[(i1-2)];
}
for (; (i1<=7); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)-14)] = buffer_Recv_0_p1[(i1-3)];
}
}
}
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Isend(&fieldData_LaplaceCoeff[2][42], 1, mpiDatatype_9_5_56, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Irecv(&fieldData_LaplaceCoeff[2][10], 1, mpiDatatype_9_5_56, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
/* Statements in this Scop: S746 */
for (int i0 = 0; (i0<=8); i0 += 1) {
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][(i0*56)]);
double* buffer_Send_0_p1 = (&buffer_Send[0][(i0*7)]);
int i1 = 0;
for (; (i1<=5); i1 += 2) {
buffer_Send_0_p1[i1] = fieldData_LaplaceCoeff_2_p1[((i1*8)+3)];
buffer_Send_0_p1[(i1+1)] = fieldData_LaplaceCoeff_2_p1[((i1*8)+11)];
}
for (; (i1<=6); i1 += 1) {
buffer_Send_0_p1[i1] = fieldData_LaplaceCoeff_2_p1[((i1*8)+3)];
}
}
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
/* Statements in this Scop: S747 */
for (int i0 = 0; (i0<=8); i0 += 1) {
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][(i0*56)]);
double* buffer_Send_1_p1 = (&buffer_Send[1][(i0*7)]);
int i1 = 0;
for (; (i1<=5); i1 += 2) {
buffer_Send_1_p1[i1] = fieldData_LaplaceCoeff_2_p1[((i1*8)+5)];
buffer_Send_1_p1[(i1+1)] = fieldData_LaplaceCoeff_2_p1[((i1*8)+13)];
}
for (; (i1<=6); i1 += 1) {
buffer_Send_1_p1[i1] = fieldData_LaplaceCoeff_2_p1[((i1*8)+5)];
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Isend(buffer_Send[0], 63, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]);
reqOutstanding_Send[0] = true;
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Isend(buffer_Send[1], 63, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Irecv(buffer_Recv[0], 63, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Irecv(buffer_Recv[1], 63, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)(neighbor_fragCommId[1][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
if (reqOutstanding_Recv[1]) {
waitForMPIReq(&mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
/* Statements in this Scop: S748 */
for (int i0 = 0; (i0<=8); i0 += 1) {
double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i0*7)]);
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][(i0*56)]);
int i1 = 1;
for (; (i1<=6); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)-7)] = buffer_Recv_0_p1[(i1-1)];
fieldData_LaplaceCoeff_2_p1[((i1*8)+1)] = buffer_Recv_0_p1[i1];
}
for (; (i1<=7); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)-7)] = buffer_Recv_0_p1[(i1-1)];
}
}
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
/* Statements in this Scop: S749 */
for (int i0 = 0; (i0<=8); i0 += 1) {
double* buffer_Recv_1_p1 = (&buffer_Recv[1][(i0*7)]);
double* fieldData_LaplaceCoeff_2_p1 = (&fieldData_LaplaceCoeff[2][(i0*56)]);
int i1 = 7;
for (; (i1<=12); i1 += 2) {
fieldData_LaplaceCoeff_2_p1[((i1*8)-49)] = buffer_Recv_1_p1[(i1-7)];
fieldData_LaplaceCoeff_2_p1[((i1*8)-41)] = buffer_Recv_1_p1[(i1-6)];
}
for (; (i1<=13); i1 += 1) {
fieldData_LaplaceCoeff_2_p1[((i1*8)-49)] = buffer_Recv_1_p1[(i1-7)];
}
}
}
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[0]) {
waitForMPIReq(&mpiRequest_Send[0]);
reqOutstanding_Send[0] = false;
}
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Isend(&fieldData_LaplaceCoeff[2][17], 1, mpiDatatype_9_7_56, neighbor_remoteRank[1][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]);
reqOutstanding_Send[2] = true;
}
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Isend(&fieldData_LaplaceCoeff[2][33], 1, mpiDatatype_9_7_56, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Irecv(&fieldData_LaplaceCoeff[2][1], 1, mpiDatatype_9_7_56, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Irecv(&fieldData_LaplaceCoeff[2][49], 1, mpiDatatype_9_7_56, neighbor_remoteRank[1][3], ((unsigned int)(neighbor_fragCommId[1][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
if (reqOutstanding_Recv[3]) {
waitForMPIReq(&mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[2]) {
waitForMPIReq(&mpiRequest_Send[2]);
reqOutstanding_Send[2] = false;
}
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
}
    int              vdwioffset3;
    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128i          vfitab;
    __m128i          ifour       = _mm_set1_epi32(4);
    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
    real             *vftab;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
Ejemplo n.º 21
0
/*
 * Scalar-alpha blend of two S32 images into dst, one code path per channel
 * count (1..4) and per alignment situation.
 *
 * The per-element arithmetic is not visible here: it lives in the
 * MLIB_S_IMAGESCALARBLEND_S32 / MLIB_S_IMAGESCALARBLEND3_S32 (vector) and
 * MLIB_C_IMAGESCALARBLEND_S32_{1,2,3} (scalar tail) macros, defined outside
 * this block.  Presumably they compute alpha * src1 + (1 - alpha) * src2
 * from the alpha and beta vectors prepared below -- confirm against the macro
 * definitions.
 *
 *   dst,  dlb   destination rows and destination row stride in bytes
 *   src1, slb1  first source rows and its row stride in bytes
 *   src2, slb2  second source rows and its row stride in bytes
 *   alpha       per-channel blend factors; alpha[0..nchan-1] are read
 *   xsize       pixels per row (nsize = xsize * nchan elements per row)
 *   ysize       number of rows
 *   nchan       channels per pixel; values other than 1..4 fall through the
 *               switch and nothing is written
 *
 * The locals srcPtr1, srcPtr2, dstPtr, res, sdata1 and sdata2, as well as the
 * alpha and beta vectors, are not referenced directly by any statement below;
 * presumably the macros use them implicitly, so they must keep these names.
 */
void
mlib_s_ImageScalarBlend_s32(
	mlib_s32 *dst,
	mlib_s32 dlb,
	const mlib_s32 *src1,
	mlib_s32 slb1,
	const mlib_s32 *src2,
	mlib_s32 slb2,
	const mlib_s32 *alpha,
	mlib_s32 xsize,
	mlib_s32 ysize,
	mlib_s32 nchan)
{
	mlib_s32 i, j, nsize;
	__m128i *srcPtr1, *srcPtr2, *dstPtr;
	/* Row cursors; advanced by the byte strides at the end of every row. */
	mlib_s32 *dl = dst;
	mlib_s32 *sl1 = (mlib_s32 *)src1, *sl2 = (mlib_s32 *)src2;
	__m128d alphas0, alphas1, alphau0, alphau1, alphav0, alphav1;
	__m128d betas0, betas1, betau0, betau1, betav0, betav1;
	__m128d ones = _mm_set1_pd(1.0f);
	mlib_s32 res, sdata1, sdata2;
	mlib_d64 a0, a1, a2, a3;



	/* Number of S32 elements in one row. */
	nsize = xsize * nchan;

	/*
	 * Every case follows the same pattern:
	 *   1. convert the integer alphas to doubles and build beta = 1 - alpha;
	 *   2. choose aligned or unaligned load/store intrinsics based on the
	 *      low 4 bits of the pointers and strides (all aligned, only the
	 *      sources aligned, or nothing assumed);
	 *   3. per row, run the vector macro on whole groups of elements and
	 *      finish the remainder with the scalar macro.
	 */
	switch (nchan) {
	case 1:
		/* One alpha shared by every element. */
		a0 = -(alpha[0] & MASK) / (mlib_d64)MLIB_S32_MIN;
		alphas0 = _mm_set1_pd(a0);
		alphas1 = _mm_set1_pd(a0);
		betas0  = _mm_sub_pd(ones, alphas0);
		betas1  = _mm_sub_pd(ones, alphas1);
		/* dst and both sources 16-byte aligned, strides multiples of 16. */
		if ((((mlib_addr)dst | dlb |
			(mlib_addr)src1 | slb1 |
			(mlib_addr)src2 | slb2) & 0xf) == 0) {
			for (j = 0; j < ysize; j ++) {
				srcPtr1 = (__m128i *)sl1;
				srcPtr2 = (__m128i *)sl2;
				dstPtr = (__m128i *)dl;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
				for (i = 0; i <= (nsize - 4); i += 4) {
					MLIB_S_IMAGESCALARBLEND_S32(
						_mm_store_si128,
						_mm_load_si128,
						_mm_load_si128);
				}
				/* Scalar tail: up to 3 leftover elements. */
				for (; i < nsize; i++) {
					MLIB_C_IMAGESCALARBLEND_S32_1(
						sl1, sl2, dl);
				}
				sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1);
				sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2);
				dl = (mlib_s32 *)((mlib_u8 *)dl + dlb);
			}
		} else
		/* Only the sources are aligned: aligned loads, unaligned store. */
		if ((((mlib_addr)src1 | slb1 |
			(mlib_addr)src2 | slb2) & 0xf) == 0) {
			for (j = 0; j < ysize; j ++) {
				srcPtr1 = (__m128i *)sl1;
				srcPtr2 = (__m128i *)sl2;
				dstPtr = (__m128i *)dl;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
				for (i = 0; i <= (nsize - 4); i += 4) {
					MLIB_S_IMAGESCALARBLEND_S32(
						_mm_storeu_si128,
						_mm_load_si128,
						_mm_load_si128);
				}
				for (; i < nsize; i++) {
					MLIB_C_IMAGESCALARBLEND_S32_1(
						sl1, sl2, dl);
				}
				sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1);
				sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2);
				dl = (mlib_s32 *)((mlib_u8 *)dl + dlb);
			}
		} else {
			/* No alignment assumed: unaligned loads and store. */
			for (j = 0; j < ysize; j ++) {
				srcPtr1 = (__m128i *)sl1;
				srcPtr2 = (__m128i *)sl2;
				dstPtr = (__m128i *)dl;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
				for (i = 0; i <= (nsize - 4); i += 4) {
					MLIB_S_IMAGESCALARBLEND_S32(
						_mm_storeu_si128,
						_mm_loadu_si128,
						_mm_loadu_si128);
				}
				for (; i < nsize; i++) {
					MLIB_C_IMAGESCALARBLEND_S32_1(
						sl1, sl2, dl);
				}
				sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1);
				sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2);
				dl = (mlib_s32 *)((mlib_u8 *)dl + dlb);
			}
		}
		break;
	case 2:
		/*
		 * Two channels: both alpha vectors hold (a0, a1) with a0 in the
		 * low lane, matching a group of four elements c0 c1 c0 c1.
		 */
		a0 = -(alpha[0] & MASK) / (mlib_d64)MLIB_S32_MIN;
		a1 = -(alpha[1] & MASK) / (mlib_d64)MLIB_S32_MIN;
		alphas0 = _mm_set_pd(a1, a0);
		alphas1 = _mm_set_pd(a1, a0);
		betas0  = _mm_sub_pd(ones, alphas0);
		betas1  = _mm_sub_pd(ones, alphas1);

		/* Fully aligned path. */
		if ((((mlib_addr)dst | dlb |
			(mlib_addr)src1 | slb1 |
			(mlib_addr)src2 | slb2) & 0xf) == 0) {
			for (j = 0; j < ysize; j ++) {
				srcPtr1 = (__m128i *)sl1;
				srcPtr2 = (__m128i *)sl2;
				dstPtr = (__m128i *)dl;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
				for (i = 0; i <= (nsize - 4); i += 4) {
					MLIB_S_IMAGESCALARBLEND_S32(
						_mm_store_si128,
						_mm_load_si128,
						_mm_load_si128);
				}
				/* Tail advances one whole 2-channel pixel per step. */
				for (; i < nsize; i += 2) {
					MLIB_C_IMAGESCALARBLEND_S32_2(
						sl1, sl2, dl);
				}
				sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1);
				sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2);
				dl = (mlib_s32 *)((mlib_u8 *)dl + dlb);
			}
		} else
		/* Sources aligned, destination not. */
		if ((((mlib_addr)src1 | slb1 |
			(mlib_addr)src2 | slb2) & 0xf) == 0) {
			for (j = 0; j < ysize; j ++) {
				srcPtr1 = (__m128i *)sl1;
				srcPtr2 = (__m128i *)sl2;
				dstPtr = (__m128i *)dl;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
				for (i = 0; i <= (nsize - 4); i += 4) {
					MLIB_S_IMAGESCALARBLEND_S32(
						_mm_storeu_si128,
						_mm_load_si128,
						_mm_load_si128);
				}
				for (; i < nsize; i += 2) {
					MLIB_C_IMAGESCALARBLEND_S32_2(
						sl1, sl2, dl);
				}
				sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1);
				sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2);
				dl = (mlib_s32 *)((mlib_u8 *)dl + dlb);
			}
		} else {
			/* Fully unaligned path. */
			for (j = 0; j < ysize; j ++) {
				srcPtr1 = (__m128i *)sl1;
				srcPtr2 = (__m128i *)sl2;
				dstPtr = (__m128i *)dl;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
				for (i = 0; i <= (nsize - 4); i += 4) {
					MLIB_S_IMAGESCALARBLEND_S32(
						_mm_storeu_si128,
						_mm_loadu_si128,
						_mm_loadu_si128);
				}
				for (; i < nsize; i += 2) {
					MLIB_C_IMAGESCALARBLEND_S32_2(
						sl1, sl2, dl);
				}
				sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1);
				sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2);
				dl = (mlib_s32 *)((mlib_u8 *)dl + dlb);
			}
		}
		break;
	case 3:
		/*
		 * Three channels: the channel pattern repeats every 12 elements
		 * (4 pixels), so six 2-lane alpha vectors are prepared.  Taken in
		 * the order s0, s1, u0, u1, v0, v1 (low lane first) they spell
		 * a0 a1 | a2 a0 | a1 a2 | a0 a1 | a2 a0 | a1 a2.
		 */
		a0 = -(alpha[0] & MASK) / (mlib_d64)MLIB_S32_MIN;
		a1 = -(alpha[1] & MASK) / (mlib_d64)MLIB_S32_MIN;
		a2 = -(alpha[2] & MASK) / (mlib_d64)MLIB_S32_MIN;
		alphas0 = _mm_set_pd(a1, a0);
		alphas1 = _mm_set_pd(a0, a2);
		alphau0 = _mm_set_pd(a2, a1);
		alphau1 = _mm_set_pd(a1, a0);
		alphav0 = _mm_set_pd(a0, a2);
		alphav1 = _mm_set_pd(a2, a1);
		betas0  = _mm_sub_pd(ones, alphas0);
		betas1  = _mm_sub_pd(ones, alphas1);
		betau0  = _mm_sub_pd(ones, alphau0);
		betau1  = _mm_sub_pd(ones, alphau1);
		betav0  = _mm_sub_pd(ones, alphav0);
		betav1  = _mm_sub_pd(ones, alphav1);

		/* Fully aligned path. */
		if ((((mlib_addr)dst | dlb |
			(mlib_addr)src1 | slb1 |
			(mlib_addr)src2 | slb2) & 0xf) == 0) {
			for (j = 0; j < ysize; j ++) {
				srcPtr1 = (__m128i *)sl1;
				srcPtr2 = (__m128i *)sl2;
				dstPtr = (__m128i *)dl;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
				/* Vector loop consumes 12 elements (4 pixels) per pass. */
				for (i = 0; i <= (nsize - 12); i += 12) {
					MLIB_S_IMAGESCALARBLEND3_S32(
						_mm_store_si128,
						_mm_load_si128,
						_mm_load_si128);
				}
				/* Tail advances one whole 3-channel pixel per step. */
				for (; i < nsize; i += 3) {
					MLIB_C_IMAGESCALARBLEND_S32_3(
						sl1, sl2, dl);
				}
				sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1);
				sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2);
				dl = (mlib_s32 *)((mlib_u8 *)dl + dlb);
			}
		} else
		/* Sources aligned, destination not. */
		if ((((mlib_addr)src1 | slb1 |
			(mlib_addr)src2 | slb2) & 0xf) == 0) {
			for (j = 0; j < ysize; j ++) {
				srcPtr1 = (__m128i *)sl1;
				srcPtr2 = (__m128i *)sl2;
				dstPtr = (__m128i *)dl;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
				for (i = 0; i <= (nsize - 12); i += 12) {
					MLIB_S_IMAGESCALARBLEND3_S32(
						_mm_storeu_si128,
						_mm_load_si128,
						_mm_load_si128);
				}
				for (; i < nsize; i += 3) {
					MLIB_C_IMAGESCALARBLEND_S32_3(
						sl1, sl2, dl);
				}
				sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1);
				sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2);
				dl = (mlib_s32 *)((mlib_u8 *)dl + dlb);
			}
		} else {
			/* Fully unaligned path. */
			for (j = 0; j < ysize; j ++) {
				srcPtr1 = (__m128i *)sl1;
				srcPtr2 = (__m128i *)sl2;
				dstPtr = (__m128i *)dl;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
				for (i = 0; i <= (nsize - 12); i += 12) {
					MLIB_S_IMAGESCALARBLEND3_S32(
						_mm_storeu_si128,
						_mm_loadu_si128,
						_mm_loadu_si128);
				}
				for (; i < nsize; i += 3) {
					MLIB_C_IMAGESCALARBLEND_S32_3(
						sl1, sl2, dl);
				}
				sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1);
				sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2);
				dl = (mlib_s32 *)((mlib_u8 *)dl + dlb);
			}
		}
		break;

	case 4:
		/*
		 * Four channels: one vector group is exactly one pixel, with
		 * (a0, a1) for the first element pair and (a2, a3) for the second.
		 * nsize = xsize * 4 is a multiple of 4, so no scalar tail is needed.
		 */
		a0 = -(alpha[0] & MASK) / (mlib_d64)MLIB_S32_MIN;
		a1 = -(alpha[1] & MASK) / (mlib_d64)MLIB_S32_MIN;
		a2 = -(alpha[2] & MASK) / (mlib_d64)MLIB_S32_MIN;
		a3 = -(alpha[3] & MASK) / (mlib_d64)MLIB_S32_MIN;
		alphas0 = _mm_set_pd(a1, a0);
		alphas1 = _mm_set_pd(a3, a2);
		betas0  = _mm_sub_pd(ones, alphas0);
		betas1  = _mm_sub_pd(ones, alphas1);

		/* Fully aligned path. */
		if ((((mlib_addr)dst | dlb |
			(mlib_addr)src1 | slb1 |
			(mlib_addr)src2 | slb2) & 0xf) == 0) {
			for (j = 0; j < ysize; j ++) {
				srcPtr1 = (__m128i *)sl1;
				srcPtr2 = (__m128i *)sl2;
				dstPtr = (__m128i *)dl;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
				for (i = 0; i <= (nsize - 4); i += 4) {
					MLIB_S_IMAGESCALARBLEND_S32(
						_mm_store_si128,
						_mm_load_si128,
						_mm_load_si128);
				}
				sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1);
				sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2);
				dl = (mlib_s32 *)((mlib_u8 *)dl + dlb);
			}
		} else
		/* Sources aligned, destination not. */
		if ((((mlib_addr)src1 | slb1 |
			(mlib_addr)src2 | slb2) & 0xf) == 0) {
			for (j = 0; j < ysize; j ++) {
				srcPtr1 = (__m128i *)sl1;
				srcPtr2 = (__m128i *)sl2;
				dstPtr = (__m128i *)dl;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
				for (i = 0; i <= (nsize - 4); i += 4) {
					MLIB_S_IMAGESCALARBLEND_S32(
						_mm_storeu_si128,
						_mm_load_si128,
						_mm_load_si128);
				}
				sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1);
				sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2);
				dl = (mlib_s32 *)((mlib_u8 *)dl + dlb);
			}
		} else {
			/* Fully unaligned path. */
			for (j = 0; j < ysize; j ++) {
				srcPtr1 = (__m128i *)sl1;
				srcPtr2 = (__m128i *)sl2;
				dstPtr = (__m128i *)dl;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
				for (i = 0; i <= (nsize - 4); i += 4) {
					MLIB_S_IMAGESCALARBLEND_S32(
						_mm_storeu_si128,
						_mm_loadu_si128,
						_mm_loadu_si128);
				}
				sl1 = (mlib_s32 *)((mlib_u8 *)sl1 + slb1);
				sl2 = (mlib_s32 *)((mlib_u8 *)sl2 + slb2);
				dl = (mlib_s32 *)((mlib_u8 *)dl + dlb);
			}
		}
		break;
	}
}
    int              jnrA,jnrB;
    int              j_coord_offsetA,j_coord_offsetB;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
    krf              = _mm_set1_pd(fr->ic->k_rf);
Ejemplo n.º 23
0
// Matrix-vector product on packed fl48 values: result = mat * vec, computed
// in double precision with SSE and written to a freshly allocated array that
// replaces `vec`.
//
// Eight matrix rows are processed per outer iteration and two columns per
// inner iteration, so SIZE must be a positive multiple of 8.
//
// NOTE(review): the byte shuffles below only make sense if fl48 is a 6-byte
// type holding the upper 48 bits of an IEEE double -- confirm against the
// fl48 definition.
// NOTE(review): the previous `vec` array is not freed here; ownership of both
// the old and the new array appears to rest with the caller -- confirm.
// NOTE(review): every load fetches 16 bytes although only 12 are used, so the
// final load of each row (and of vec) touches bytes past the 12 consumed.
void matrix_vector_mul_SSE_f48_loop_unrolled (fl48** mat, fl48* &vec)
{
    // TESTING change SIZE to min 8 - but multiple of 8
    fl48* result = new fl48[SIZE];
  // Expands two packed 6-byte values into two 64-bit lanes: source bytes 0-5
  // land in the upper six bytes of the low lane, source bytes 6-11 in the
  // upper six bytes of the high lane; index 255 zeroes the low two bytes of
  // each lane.
  __m128i load_mask = _mm_set_epi8(11, 10, 9, 8, 7, 6, 255, 255,
  			      5, 4, 3, 2, 1, 0, 255, 255);
  for(unsigned i=0;i<SIZE;i+=8) { // row // requiring 8 at a time - because loop un-roll
    // One two-lane accumulator per row i..i+7 (even columns in the low lane,
    // odd columns in the high lane).
    __m128d running_sum1 = _mm_set1_pd(0.0); // running sum initially 0
    __m128d running_sum2 = _mm_set1_pd(0.0); // running sum initially 0
    __m128d running_sum3 = _mm_set1_pd(0.0); // running sum initially 0
    __m128d running_sum4 = _mm_set1_pd(0.0); // running sum initially 0
    __m128d running_sum5 = _mm_set1_pd(0.0); // running sum initially 0
    __m128d running_sum6 = _mm_set1_pd(0.0); // running sum initially 0
    __m128d running_sum7 = _mm_set1_pd(0.0); // running sum initially 0
    __m128d running_sum8 = _mm_set1_pd(0.0); // running sum initially 0

    for(unsigned j=0;j<SIZE;j+=2) { // col - requires skipping on 2 at a time
      __m128i mat_vect = _mm_loadu_si128((__m128i*) &mat[i][j]); // hoping that addresses are as expected - seems like this is the way it's stored
      mat_vect = _mm_shuffle_epi8(mat_vect, load_mask);
      // The vector pair is expanded once and reused for all eight rows.
      __m128i vec_elem = _mm_loadu_si128((__m128i*) &vec[j]);
      vec_elem = _mm_shuffle_epi8(vec_elem, load_mask);
      __m128d mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem);
      running_sum1 = _mm_add_pd(mult,running_sum1);

      mat_vect = _mm_loadu_si128((__m128i*) &mat[i+1][j]);
      mat_vect = _mm_shuffle_epi8(mat_vect, load_mask);
      mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem);
      running_sum2 = _mm_add_pd(mult,running_sum2);

      mat_vect = _mm_loadu_si128((__m128i*) &mat[i+2][j]);
      mat_vect = _mm_shuffle_epi8(mat_vect, load_mask);
      mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem);
      running_sum3 = _mm_add_pd(mult,running_sum3);

      mat_vect = _mm_loadu_si128((__m128i*) &mat[i+3][j]);
      mat_vect = _mm_shuffle_epi8(mat_vect, load_mask);
      mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem);
      running_sum4 = _mm_add_pd(mult,running_sum4);

      mat_vect = _mm_loadu_si128((__m128i*) &mat[i+4][j]);
      mat_vect = _mm_shuffle_epi8(mat_vect, load_mask);
      mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem);
      running_sum5 = _mm_add_pd(mult,running_sum5);

      mat_vect = _mm_loadu_si128((__m128i*) &mat[i+5][j]);
      mat_vect = _mm_shuffle_epi8(mat_vect, load_mask);
      mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem);
      running_sum6 = _mm_add_pd(mult,running_sum6);

      mat_vect = _mm_loadu_si128((__m128i*) &mat[i+6][j]);
      mat_vect = _mm_shuffle_epi8(mat_vect, load_mask);
      mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem);
      running_sum7 = _mm_add_pd(mult,running_sum7);

      mat_vect = _mm_loadu_si128((__m128i*) &mat[i+7][j]);
      mat_vect = _mm_shuffle_epi8(mat_vect, load_mask);
      mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem);
      running_sum8 = _mm_add_pd(mult,running_sum8);
    }
    // Horizontal add: `mask` swaps the two 64-bit lanes, so adding the swapped
    // copy leaves the full row total in both lanes of each accumulator.
    __m128i mask = _mm_set_epi8(7 ,6 ,5, 4, 3, 2, 1, 0,
		      15, 14, 13, 12, 11, 10, 9, 8);
    __m128i sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum1, mask);
    running_sum1 = _mm_add_pd(running_sum1,(__m128d)sum_shuffled);
    sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum2, mask);
    running_sum2 = _mm_add_pd(running_sum2,(__m128d)sum_shuffled);
    sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum3, mask);
    running_sum3 = _mm_add_pd(running_sum3,(__m128d)sum_shuffled);
    sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum4, mask);
    running_sum4 = _mm_add_pd(running_sum4,(__m128d)sum_shuffled);
    sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum5, mask);
    running_sum5 = _mm_add_pd(running_sum5,(__m128d)sum_shuffled);
    sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum6, mask);
    running_sum6 = _mm_add_pd(running_sum6,(__m128d)sum_shuffled);
    sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum7, mask);
    running_sum7 = _mm_add_pd(running_sum7,(__m128d)sum_shuffled);
    sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum8, mask);
    running_sum8 = _mm_add_pd(running_sum8,(__m128d)sum_shuffled);

    // mesh them into 4
    // mask_first keeps the low lane and zeroes the high lane; mask_second
    // moves the low lane into the high lane and zeroes the low lane.  OR-ing
    // one of each packs two row totals into a single register.
    __m128i mask_first = _mm_set_epi8(255,255,255,255,255,255,255,255,
			      7 ,6 ,5, 4, 3, 2, 1, 0);
    __m128i mask_second = _mm_set_epi8(7 ,6 ,5, 4, 3, 2, 1, 0,
			      255,255,255,255,255,255,255,255);

    // running_sum1 <- totals of rows i, i+1
    running_sum1 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum1, mask_first);
    running_sum2 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum2, mask_second);
    running_sum1 = (__m128d)_mm_or_si128((__m128i)running_sum1, (__m128i)running_sum2);

    // running_sum2 <- totals of rows i+2, i+3
    running_sum3 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum3, mask_first);
    running_sum4 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum4, mask_second);
    running_sum2 = (__m128d)_mm_or_si128((__m128i)running_sum3, (__m128i)running_sum4);

    // running_sum3 <- totals of rows i+4, i+5
    running_sum5 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum5, mask_first);
    running_sum6 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum6, mask_second);
    running_sum3 = (__m128d)_mm_or_si128((__m128i)running_sum6, (__m128i)running_sum5);

    // running_sum4 <- totals of rows i+6, i+7
    running_sum7 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum7, mask_first);
    running_sum8 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum8, mask_second);
    running_sum4 = (__m128d)_mm_or_si128((__m128i)running_sum8, (__m128i)running_sum7);

    // RS 1-4 are right and expected here too
    // rs 5-8 neglected and not required from now

    // convert_double_to_f48_SSE is defined elsewhere; the packing below
    // presumably relies on it returning two fl48 values in the low 12 bytes
    // with the top 4 bytes zero -- confirm against its definition.
    __m128i a01_round = convert_double_to_f48_SSE((__m128i)running_sum1);
    __m128i a23_round = convert_double_to_f48_SSE((__m128i)running_sum2);
    __m128i a45_round = convert_double_to_f48_SSE((__m128i)running_sum3);
    __m128i a67_round = convert_double_to_f48_SSE((__m128i)running_sum4);

    // place them right for memory write
    // Goal: 8 results * 6 bytes = 48 contiguous bytes, spread over three
    // 16-byte registers.
    __m128i match_mask = _mm_set_epi8(3,2,1,0,255,255,255,255,255,255,255,255,255,255,255,255); // mask used to match the missing spaces
    __m128i a23_shuffled = _mm_shuffle_epi8((__m128i)a23_round, match_mask); // shuffle the positions required for the space in a01 for a2
    a01_round = _mm_or_si128(a01_round,a23_shuffled);

    // Drop the 4 bytes of a23 that were just merged into a01.
    a23_round = _mm_srli_si128 (a23_round, 4); // using _mm_srli_si128 instead of _mm_sll_epi64 because the epi64 variant shifts within each 64-bit element of the 128-bit register

    match_mask = _mm_set_epi8(7,6,5,4,3,2,1,0,255,255,255,255,255,255,255,255); // reset the match mask for a4 and small bit of a5
    __m128i a45_shuffled = _mm_shuffle_epi8((__m128i)a45_round, match_mask); // shuffle a45 to fit in a23
    a23_round = _mm_or_si128(a23_round,a45_shuffled);

    // Drop the 8 bytes of a45 that were just merged into a23.
    a45_round = _mm_srli_si128(a45_round, 8); // using _mm_srli_si128 instead of _mm_sll_epi64 because the epi64 variant shifts within each 64-bit element of the 128-bit register

    // Bytes 0-11 of a67 fill the upper 12 bytes of the third register.
    match_mask = _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,255,255,255,255);
    __m128i a67_shuffled = _mm_shuffle_epi8((__m128i)a67_round, match_mask);
    a45_round = _mm_or_si128(a45_round,a67_shuffled);
     // WRITE BACK TO MEMORY!
    // bofs is defined elsewhere; presumably it yields the address 16 and 32
    // bytes past &result[i] for the second and third store -- confirm.
    _mm_storeu_pd((double*)&result[i], (__m128d)a01_round);
    _mm_storeu_pd(bofs(&result[i],2), (__m128d)a23_round);
    _mm_storeu_pd(bofs(&result[i],4), (__m128d)a45_round);
  }
  // Hand the freshly allocated result back through the reference parameter.
  vec = result;
}
/*
 * Symmetric matrix-vector product, y := alpha*A*x + beta*y, using SSE2.
 *
 * Limited implementation: only row-major storage with the upper triangle
 * referenced and a dense leading dimension (lda == N) is supported; these
 * preconditions are checked with assert().
 *
 *   Order, Uplo  must be CblasRowMajor / CblasUpper
 *   N            matrix order
 *   alpha, beta  scalar factors
 *   A            N x N matrix; only elements A[i][j] with j >= i are read
 *   lda          leading dimension, must equal N
 *   X, incX      input vector and its element stride
 *   Y, incY      in/out vector and its element stride
 *
 * The two-lane vector loops are used only for unit strides, because they
 * load/store adjacent doubles; any other stride goes through the scalar
 * loops.  Negative strides are not given the BLAS "start from the end"
 * treatment.
 */
void mymm_dsymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
	      const int N, const double alpha, const double *A,
	      const int lda, const double *X, const int incX,
	      const double beta, double *Y, const int incY)
{
	// limited implementation
	assert(Order==CblasRowMajor);
	assert(Uplo==CblasUpper);
	assert(N==lda);

	__builtin_prefetch (Y, 1, 3);	/* Y is read and written */
	__builtin_prefetch (X, 0, 3);	/* X is only read */

	const int unit_x = (incX == 1);
	const int unit_y = (incY == 1);
	const __m128d mm_beta = _mm_set1_pd(beta);
	const __m128d mm_alpha = _mm_set1_pd(alpha);
	const double *pA, *pX;
	double *pY;
	/* Stack scratch for the horizontal sum; no heap allocation needed. */
	double unpack[2];
	__m128d mm_reg1, mm_reg2, mm_temp;
	double temp, reg1;
	int i, j;

	/* Step 1: y = beta*y */
	pY = Y;
	i = 0;
	if (unit_y)
	{
		for(;i<N-1;i+=2,pY+=2)
		{
			mm_reg1 = _mm_loadu_pd(pY);
			mm_reg1 = _mm_mul_pd(mm_reg1, mm_beta);
			_mm_storeu_pd(pY, mm_reg1);
		}
	}
	for(;i<N;i++,pY+=incY)
		(*pY) = beta * (*pY);

	/*
	 * Step 2: diagonal and upper-triangle contribution of each row,
	 *         y[i] += sum_{j>=i} alpha * A[i][j] * x[j].
	 * pA points at A[i][i] on entry to every iteration.
	 */
	pA = A;
	pY = Y;
	for(i=0;i<N;i++)
	{
		pX = X + i*incX;
		temp = 0.0;
		j = i;
		if (unit_x)
		{
			mm_temp = _mm_setzero_pd();
			for(;j<N-1;j+=2,pA+=2,pX+=2)
			{
				mm_reg1 = _mm_loadu_pd(pA);
				mm_reg2 = _mm_loadu_pd(pX);
				mm_reg1 = _mm_mul_pd(mm_reg1, mm_reg2);
				mm_reg2 = _mm_mul_pd(mm_alpha, mm_reg1);
				mm_temp = _mm_add_pd(mm_temp, mm_reg2);
			}
			_mm_storeu_pd(unpack, mm_temp);
			temp += unpack[0];
			temp += unpack[1];
		}
		for(;j<N;j++,pA++,pX+=incX)
			temp += alpha * (*pA) * (*pX);
		(*pY) += temp;

		/* pA is now at A[i+1][0]; skip the i+1 lower-triangle entries. */
		pA += i + 1;
		pY += incY;
	}

	/*
	 * Step 3: mirrored (lower-triangle) contribution,
	 *         y[j] += alpha * A[i][j] * x[i]  for j > i.
	 * The last row has no strictly-upper elements, so it is skipped; this
	 * also avoids forming a pointer beyond the end of Y.
	 */
	pA = A;
	pX = X;
	for(i=0;i<N-1;i++)
	{
		reg1 = (*pX);
		pA++;				/* step over the diagonal element */
		pY = Y + (i+1)*incY;
		j = i+1;
		if (unit_y)
		{
			mm_reg1 = _mm_set1_pd(reg1);
			for(;j<N-1;j+=2,pA+=2,pY+=2)
			{
				mm_reg2 = _mm_loadu_pd(pA);
				mm_reg2 = _mm_mul_pd(mm_reg2, mm_reg1);
				mm_reg2 = _mm_mul_pd(mm_alpha, mm_reg2);
				_mm_storeu_pd(pY, _mm_add_pd(_mm_loadu_pd(pY), mm_reg2));
			}
		}
		for(;j<N;j++,pA++,pY+=incY)
			*pY += alpha * (*pA) * reg1;

		/* pA is now at A[i+1][0]; skip the i+1 lower-triangle entries. */
		pA += i + 1;
		pX += incX;
	}
}
    int              vdwioffset0;
    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwioffset1;
    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    int              vdwioffset2;
    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
    krf              = _mm_set1_pd(fr->ic->k_rf);
    int              jnrA,jnrB;
    int              j_coord_offsetA,j_coord_offsetB;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    int              vdwioffset0;
    __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    int              nvdwtype;
    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    int              vdwioffset1;
    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    int              vdwioffset2;
    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwioffset3;
    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm_set1_pd(fr->epsfac);
    charge           = mdatoms->chargeA;
    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwioffset3;
    __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m128d          dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    int              vdwioffset1;
    __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    int              vdwioffset2;
    __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwjidx0A,vdwjidx0B;
    __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
    __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
    __m128i          vfitab;
    __m128i          ifour       = _mm_set1_epi32(4);
    __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
    real             *vftab;
    __m128d          dummy_mask,cutoff_mask;
    __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
    __m128d          one     = _mm_set1_pd(1.0);
    __m128d          two     = _mm_set1_pd(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
Ejemplo n.º 30
0
/*
 * Accumulating 2-tap FIR filter on doubles:
 *
 *     pdst[k] += pflt[1] * psrc[k] + pflt[0] * psrc[k + 1],   0 <= k < n
 *
 *   pdst  in/out accumulator, n elements
 *   psrc  input samples; elements psrc[0 .. n] are read
 *   pflt  the two filter coefficients
 *   n     number of outputs; n <= 0 is a no-op
 *
 * One scalar step is taken first when psrc is not 16-byte aligned, so that
 * the main loop can use an aligned load for psrc; the psrc + 1 load and all
 * pdst accesses use the unaligned forms.
 */
void
mlib_FIR_tap2_d64(
    mlib_d64 *pdst,
    const mlib_d64 *psrc,
    mlib_d64 *pflt,
    mlib_s32 n)
{
	mlib_s32 j = 0;
	mlib_d64 dflt1 = pflt[0], dflt2 = pflt[1];
	__m128d sdflt1, sdflt2;
	__m128d ssrc1, ssrc2;
	__m128d smul1, smul2;
	__m128d sdst;

	/*
	 * Nothing to produce.  Without this guard the alignment prologue below
	 * would still write pdst[0] and read psrc[1] for a misaligned psrc.
	 */
	if (n <= 0)
		return;

	/* Alignment prologue: peel one output so psrc becomes 16-byte aligned. */
	if ((mlib_addr)psrc & 15) {
		pdst[0] += dflt2 * psrc[0] + dflt1 * psrc[1];

		psrc++;
		pdst++;
		j++;
	}

	sdflt2 = _mm_set1_pd(dflt2);
	sdflt1 = _mm_set1_pd(dflt1);

	/* Main loop: two outputs per iteration. */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (; j < (n - 1); j += 2) {
		ssrc1 = _mm_load_pd(psrc);
		ssrc2 = _mm_loadu_pd(psrc + 1);

		smul1 = _mm_mul_pd(sdflt2, ssrc1);
		smul2 = _mm_mul_pd(sdflt1, ssrc2);

		smul1 = _mm_add_pd(smul1, smul2);

		sdst = _mm_loadu_pd(pdst);
		sdst = _mm_add_pd(sdst, smul1);
		_mm_storeu_pd(pdst, sdst);

		psrc += 2;
		pdst += 2;
	}

	/* The pair loop leaves at most one output to finish in scalar code. */
	if (j < n)
		pdst[0] += dflt2 * psrc[0] + dflt1 * psrc[1];
}