Beispiel #1
0
void UpResidual_GMRF_5() {
exchsolution_gmrfData_5(0);
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
/* Statements in this Scop: S141 */
for (int i0 = iterationOffsetBegin[0][1]; (i0<=(iterationOffsetEnd[0][1]+32)); i0 += 1) {
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][(i0*35)]);
double* fieldData_Residual_GMRF_5_p1 = (&fieldData_Residual_GMRF[5][(i0*35)]);
double* fieldData_Solution_GMRF_5_p1 = (&fieldData_Solution_GMRF[5][(i0*35)]);
double* fieldData_RHS_GMRF_5_p1 = (&fieldData_RHS_GMRF[5][(i0*33)]);
int i1 = (iterationOffsetBegin[0][0]+i0);
for (; (i1<(((iterationOffsetBegin[0][0]+i0)+1)&(~1))); i1 += 1) {
fieldData_Residual_GMRF_5_p1[(i1+38)] = (fieldData_RHS_GMRF_5_p1[i1]-(((((((((fieldData_LaplaceCoeff_GMRF_5_p1[(i1+38)]*fieldData_Solution_GMRF_5_p1[(i1+38)])+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+1298)]*fieldData_Solution_GMRF_5_p1[(i1+39)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+2558)]*fieldData_Solution_GMRF_5_p1[(i1+37)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+3818)]*fieldData_Solution_GMRF_5_p1[(i1+74)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+5078)]*fieldData_Solution_GMRF_5_p1[(i1+2)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+6338)]*fieldData_Solution_GMRF_5_p1[(i1+1)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+7598)]*fieldData_Solution_GMRF_5_p1[(i1+73)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+8858)]*fieldData_Solution_GMRF_5_p1[(i1+3)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+10118)]*fieldData_Solution_GMRF_5_p1[(i1+75)])));
}
for (; (i1<((iterationOffsetEnd[0][0]+i0)+30)); i1 += 4) {
/* fieldData_Residual_GMRF_5_p1[(i1+38)] = (fieldData_RHS_GMRF_5_p1[i1]-(((((((((fieldData_LaplaceCoeff_GMRF_5_p1[(i1+38)]*fieldData_Solution_GMRF_5_p1[(i1+38)])+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+1298)]*fieldData_Solution_GMRF_5_p1[(i1+39)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+2558)]*fieldData_Solution_GMRF_5_p1[(i1+37)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+3818)]*fieldData_Solution_GMRF_5_p1[(i1+74)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+5078)]*fieldData_Solution_GMRF_5_p1[(i1+2)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+6338)]*fieldData_Solution_GMRF_5_p1[(i1+1)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+7598)]*fieldData_Solution_GMRF_5_p1[(i1+73)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+8858)]*fieldData_Solution_GMRF_5_p1[(i1+3)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+10118)]*fieldData_Solution_GMRF_5_p1[(i1+75)]))); */
__m128d vec0 = _mm_loadu_pd((&fieldData_RHS_GMRF_5_p1[i1]));
__m128d vec0_2 = _mm_loadu_pd((&fieldData_RHS_GMRF_5_p1[(i1+2)]));
__m128d vec1 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+10118)]));
__m128d vec1_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+10120)]));
__m128d vec2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+75)]));
__m128d vec2_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+77)]));
__m128d vec3 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+8858)]));
__m128d vec3_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+8860)]));
__m128d vec4 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+3)]));
__m128d vec4_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+5)]));
__m128d vec5 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+7598)]));
__m128d vec5_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+7600)]));
__m128d vec6 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+73)]));
__m128d vec6_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+75)]));
__m128d vec7 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+6338)]));
__m128d vec7_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+6340)]));
__m128d vec8 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+1)]));
__m128d vec8_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+3)]));
__m128d vec9 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+5078)]));
__m128d vec9_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+5080)]));
__m128d vec10 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+2)]));
__m128d vec10_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+4)]));
__m128d vec11 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+3818)]));
__m128d vec11_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+3820)]));
__m128d vec12 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+74)]));
__m128d vec12_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+76)]));
__m128d vec13 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+2558)]));
__m128d vec13_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+2560)]));
__m128d vec14 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+37)]));
__m128d vec14_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+39)]));
__m128d vec15 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+38)]));
__m128d vec15_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+40)]));
__m128d vec16 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+38)]));
__m128d vec16_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+40)]));
__m128d vec17 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+1298)]));
__m128d vec17_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_5_p1[(i1+1300)]));
__m128d vec18 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+39)]));
__m128d vec18_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_5_p1[(i1+41)]));
__m128d vec19;
__m128d vec19_2;
vec19 = _mm_sub_pd(vec0, _mm_add_pd(_mm_mul_pd(vec1, vec2), _mm_add_pd(_mm_mul_pd(vec3, vec4), _mm_add_pd(_mm_mul_pd(vec5, vec6), _mm_add_pd(_mm_mul_pd(vec7, vec8), _mm_add_pd(_mm_mul_pd(vec9, vec10), _mm_add_pd(_mm_mul_pd(vec11, vec12), _mm_add_pd(_mm_mul_pd(vec13, vec14), _mm_add_pd(_mm_mul_pd(vec15, vec16), _mm_mul_pd(vec17, vec18))))))))));
vec19_2 = _mm_sub_pd(vec0_2, _mm_add_pd(_mm_mul_pd(vec1_2, vec2_2), _mm_add_pd(_mm_mul_pd(vec3_2, vec4_2), _mm_add_pd(_mm_mul_pd(vec5_2, vec6_2), _mm_add_pd(_mm_mul_pd(vec7_2, vec8_2), _mm_add_pd(_mm_mul_pd(vec9_2, vec10_2), _mm_add_pd(_mm_mul_pd(vec11_2, vec12_2), _mm_add_pd(_mm_mul_pd(vec13_2, vec14_2), _mm_add_pd(_mm_mul_pd(vec15_2, vec16_2), _mm_mul_pd(vec17_2, vec18_2))))))))));
_mm_storeu_pd((&fieldData_Residual_GMRF_5_p1[(i1+38)]), vec19);
_mm_storeu_pd((&fieldData_Residual_GMRF_5_p1[(i1+40)]), vec19_2);
}
for (; (i1<((iterationOffsetEnd[0][0]+i0)+33)); i1 += 1) {
fieldData_Residual_GMRF_5_p1[(i1+38)] = (fieldData_RHS_GMRF_5_p1[i1]-(((((((((fieldData_LaplaceCoeff_GMRF_5_p1[(i1+38)]*fieldData_Solution_GMRF_5_p1[(i1+38)])+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+1298)]*fieldData_Solution_GMRF_5_p1[(i1+39)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+2558)]*fieldData_Solution_GMRF_5_p1[(i1+37)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+3818)]*fieldData_Solution_GMRF_5_p1[(i1+74)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+5078)]*fieldData_Solution_GMRF_5_p1[(i1+2)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+6338)]*fieldData_Solution_GMRF_5_p1[(i1+1)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+7598)]*fieldData_Solution_GMRF_5_p1[(i1+73)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+8858)]*fieldData_Solution_GMRF_5_p1[(i1+3)]))+(fieldData_LaplaceCoeff_GMRF_5_p1[(i1+10118)]*fieldData_Solution_GMRF_5_p1[(i1+75)])));
}
}
}
}
}
Beispiel #2
0
/*
 * Builds the scr1 coefficient blocks from the scr2 blocks, weighted by fr
 * and scaled by sg.  (The name suggests a multipole-to-local translation
 * along z; that interpretation is not visible in this file -- TODO confirm.)
 *
 * Data is organised in blocks of 16 doubles.  Within a block the first
 * eight and the last eight doubles swap places between input and output:
 * input doubles 8..15 end up in output doubles 0..7 and input doubles 0..7
 * end up in output doubles 8..15.
 *
 * Phase 1: scr1[0..15]  = sum_{j=0..nmultipoles} fr[j] * (block j of scr2),
 *          using only doubles 0..3 and 8..11 of each block; the other
 *          eight output doubles are set to zero.
 * Phase 2: for l = 1..nmultipoles, one output block per l computed like
 *          phase 1 but with weights fr[l..nmultipoles+l] and the result
 *          scaled by sg[l].
 * Phase 3: for 1 <= m <= l <= nmultipoles, one full 16-double output block
 *          with weights fr[m+l..nmultipoles+l], scaled by sg[m+l]; doubles
 *          4..7 and 12..15 of the input blocks are accumulated with a
 *          negative sign.
 *
 * Parameters:
 *   nmultipoles  highest index used for the loops above (>= 0).
 *   scr1         output buffer; accessed with _mm_store_pd at even double
 *                offsets, so it must be 16-byte aligned.
 *   scr2         input buffer; accessed with _mm_load_pd at even double
 *                offsets, so it must be 16-byte aligned.
 *   d2           not used by this function.
 *   fr           weights, read up to index 2*nmultipoles.
 *   sg           scale factors, read up to index 2*nmultipoles.
 */
void m2l_along_z(long long nmultipoles, double *scr1, double *scr2, double *d2, double *fr, double *sg)
{
  int mmmm,mmm,mm,m;
  int i,j,k,l,n,nn;

  __m128d reg00,reg01,reg02,reg03;
  __m128d reg04,reg05,reg06,reg07;
  __m128d reg08,reg09,reg10,reg11;
  __m128d reg12,reg13,reg14,reg15;
  __m128d reg18;                         /* broadcast weight (fr) or scale (sg) */

  (void)d2;                              /* kept for interface compatibility */

  i = -15;

  __m128d regzero = _mm_setzero_pd();

  /* Phase 1 accumulates into reg08, reg09, reg12 and reg13, so exactly
   * these four must start from zero. */
  reg08 = regzero;
  reg09 = regzero;
  reg12 = regzero;
  reg13 = regzero;

  for(j=0;j<=nmultipoles;++j)
  {
    i += 16;

    reg00 = _mm_load_pd(&scr2[i-1]);
    reg01 = _mm_load_pd(&scr2[i+1]);
    reg04 = _mm_load_pd(&scr2[i+7]);
    reg05 = _mm_load_pd(&scr2[i+9]);

    reg18 = _mm_load1_pd(&fr[j]);

    reg08 = _mm_add_pd(reg08,_mm_mul_pd(reg00,reg18));
    reg09 = _mm_add_pd(reg09,_mm_mul_pd(reg01,reg18));
    reg12 = _mm_add_pd(reg12,_mm_mul_pd(reg04,reg18));
    reg13 = _mm_add_pd(reg13,_mm_mul_pd(reg05,reg18));
  }

  _mm_store_pd(&scr1[ 0],reg12);
  _mm_store_pd(&scr1[ 2],reg13);
  _mm_store_pd(&scr1[ 4],regzero);
  _mm_store_pd(&scr1[ 6],regzero);
  _mm_store_pd(&scr1[ 8],reg08);
  _mm_store_pd(&scr1[10],reg09);
  _mm_store_pd(&scr1[12],regzero);
  _mm_store_pd(&scr1[14],regzero);

  /* Phase 2: i walks the output blocks; the step grows by 16 doubles per l. */
  i = 1;

  for(l=1;l<=nmultipoles;++l)
  {
    i += 16 * l;
    j = -15;
    k = nmultipoles+l;

    reg08 = regzero;
    reg09 = regzero;
    reg12 = regzero;
    reg13 = regzero;

    for(m=l;m<=k;++m)
    {
      j += 16;

      reg00 = _mm_load_pd(&scr2[j-1]);
      reg01 = _mm_load_pd(&scr2[j+1]);
      reg04 = _mm_load_pd(&scr2[j+7]);
      reg05 = _mm_load_pd(&scr2[j+9]);

      reg18 = _mm_load1_pd(&fr[m]);

      reg08 = _mm_add_pd(reg08,_mm_mul_pd(reg00,reg18));
      reg09 = _mm_add_pd(reg09,_mm_mul_pd(reg01,reg18));
      reg12 = _mm_add_pd(reg12,_mm_mul_pd(reg04,reg18));
      reg13 = _mm_add_pd(reg13,_mm_mul_pd(reg05,reg18));
    }

    reg18 = _mm_load1_pd(&sg[l]);

    reg12 = _mm_mul_pd(reg12,reg18);
    _mm_store_pd(&scr1[i- 1],reg12);

    reg13 = _mm_mul_pd(reg13,reg18);
    _mm_store_pd(&scr1[i+ 1],reg13);

    _mm_store_pd(&scr1[i+ 3],regzero);
    _mm_store_pd(&scr1[i+ 5],regzero);

    reg08 = _mm_mul_pd(reg08,reg18);
    _mm_store_pd(&scr1[i+ 7],reg08);

    reg09 = _mm_mul_pd(reg09,reg18);
    _mm_store_pd(&scr1[i+ 9],reg09);

    _mm_store_pd(&scr1[i+11],regzero);
    _mm_store_pd(&scr1[i+13],regzero);
  }

  /* Phase 3: n is the scr2 start offset for the current m; it advances by
   * mm doubles per m while mm itself shrinks by one block each time. */
  mm = 16 * nmultipoles;

  i = 1;
  n = mm+1;

  for(m=1;m<=nmultipoles;++m)
  {
    i += 16 * m;
    j = i;

    for(l=m;l<=nmultipoles;++l)
    {

      j += 16 * l;
      nn = n;
      k = m + l;
      mmm = nmultipoles + l;

      reg08 = regzero;
      reg09 = regzero;
      reg10 = regzero;
      reg11 = regzero;
      reg12 = regzero;
      reg13 = regzero;
      reg14 = regzero;
      reg15 = regzero;

      for(mmmm=k;mmmm<=mmm;++mmmm)
      {
        nn += 16;

        reg00 = _mm_load_pd(&scr2[nn- 1]);
        reg01 = _mm_load_pd(&scr2[nn+ 1]);
        reg02 = _mm_load_pd(&scr2[nn+ 3]);
        reg03 = _mm_load_pd(&scr2[nn+ 5]);
        reg04 = _mm_load_pd(&scr2[nn+ 7]);
        reg05 = _mm_load_pd(&scr2[nn+ 9]);
        reg06 = _mm_load_pd(&scr2[nn+11]);
        reg07 = _mm_load_pd(&scr2[nn+13]);

        reg18 = _mm_load1_pd(&fr[mmmm]);

        reg08 = _mm_add_pd(reg08,_mm_mul_pd(reg00,reg18));
        reg09 = _mm_add_pd(reg09,_mm_mul_pd(reg01,reg18));

        /* doubles 4..7 of the input block enter with a negative sign */
        reg10 = _mm_sub_pd(reg10,_mm_mul_pd(reg02,reg18));
        reg11 = _mm_sub_pd(reg11,_mm_mul_pd(reg03,reg18));

        reg12 = _mm_add_pd(reg12,_mm_mul_pd(reg04,reg18));
        reg13 = _mm_add_pd(reg13,_mm_mul_pd(reg05,reg18));

        /* doubles 12..15 of the input block enter with a negative sign */
        reg14 = _mm_sub_pd(reg14,_mm_mul_pd(reg06,reg18));
        reg15 = _mm_sub_pd(reg15,_mm_mul_pd(reg07,reg18));
      }

      reg18 = _mm_load1_pd(&sg[k]);

      reg12 = _mm_mul_pd(reg12,reg18);
      _mm_store_pd(&scr1[j- 1],reg12);

      reg13 = _mm_mul_pd(reg13,reg18);
      _mm_store_pd(&scr1[j+ 1],reg13);

      reg14 = _mm_mul_pd(reg14,reg18);
      _mm_store_pd(&scr1[j+ 3],reg14);

      reg15 = _mm_mul_pd(reg15,reg18);
      _mm_store_pd(&scr1[j+ 5],reg15);

      reg08 = _mm_mul_pd(reg08,reg18);
      _mm_store_pd(&scr1[j+ 7],reg08);

      reg09 = _mm_mul_pd(reg09,reg18);
      _mm_store_pd(&scr1[j+ 9],reg09);

      reg10 = _mm_mul_pd(reg10,reg18);
      _mm_store_pd(&scr1[j+11],reg10);

      reg11 = _mm_mul_pd(reg11,reg18);
      _mm_store_pd(&scr1[j+13],reg11);
    }

    n += mm;
    mm -= 16;
  }
}
/*
 * Approximate 1/sqrt(x) for both double-precision lanes of x.
 *
 * The starting estimate comes from the single-precision _mm_rsqrt_ps
 * instruction (x is narrowed to float and the estimate widened back to
 * double).  It is then refined with two Newton-Raphson iterations of
 *
 *     y <- 0.5 * (y * (3 - x * (y * y)))
 *
 * which bring the estimate to roughly 24 and then roughly 48 bits of
 * accuracy.
 */
static inline __m128d
my_invrsq_pd(__m128d x)
{
	const __m128d k_three = _mm_set1_pd(3.0);
	const __m128d k_half  = _mm_set1_pd(0.5);
	int           step;

	/* Low-precision seed computed in single precision */
	__m128d y = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(x)));

	/* Two Newton-Raphson refinement steps */
	for (step = 0; step < 2; step++)
	{
		__m128d y_sq   = _mm_mul_pd(y, y);
		__m128d x_y_sq = _mm_mul_pd(x, y_sq);
		__m128d corr   = _mm_sub_pd(k_three, x_y_sq);

		y = _mm_mul_pd(k_half, _mm_mul_pd(y, corr));
	}

	return y;
}

/*
 * Extract a single 32-bit integer from a __m128i value: x is shifted right
 * by 4*imm bytes and the low 32 bits of the result are returned, i.e. the
 * 32-bit element with index imm (0..3).  imm must be a compile-time
 * constant because it feeds the byte-shift immediate.
 *
 * NOTE(review): despite the "epi64" in its name this macro does not return
 * a 64-bit quantity.  The name is also the same as that of an SSE4.1
 * intrinsic, so this definition presumably must not be combined with
 * <smmintrin.h> -- confirm against the build.
 */
#define _mm_extract_epi64(x, imm) \
    _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
	
/*
 * Double-precision SSE2 nonbonded kernel: Coulomb interaction plus a
 * tabulated generalized-Born (GB) term, no van der Waals part.
 *
 * For every i entry n in [0, *p_nri) the kernel walks the j atoms
 * jjnr[jindex[n] .. jindex[n+1]-1] two at a time (one pair per SSE lane)
 * and handles a possible last odd j atom with scalar (_sd) instructions.
 * For each i-j pair it
 *   - computes rinv = 1/sqrt(r^2) with my_invrsq_pd(),
 *   - accumulates the Coulomb energy qq*rinv,
 *   - looks up Y,F,G,H in GBtab (4 doubles per table point) and evaluates
 *     the cubic-spline GB energy and force,
 *   - updates dvda[] for the j atom and, after the j loop, for the i atom,
 *   - subtracts the pair force from faction[] of the j atom and adds the
 *     summed force to faction[] of the i atom.
 *
 * Inputs actually read: p_nri, iinr, jindex, jjnr, shift, shiftvec, gid,
 * pos, charge, p_facel, invsqrta, p_gbtabscale, GBtab, work.
 * Outputs written: faction, dvda, Vc[gid[n]], gbdata->gpol[gid[n]],
 * *outeriter (= nri) and *inneriter (= last jindex[n+1] seen, 0 if nri is 0).
 * All other parameters are part of the common kernel signature and are not
 * touched here.
 *
 * work is interpreted as a gmx_gbdata_t*; its gb_epsilon_solvent field
 * scales facel by (1 - 1/gb_epsilon_solvent).
 * GBtab is read with aligned loads at multiples of 4 doubles, so it must be
 * 16-byte aligned.
 */
void nb_kernel400_x86_64_sse2(int *           p_nri,
                    int *           iinr,
                    int *           jindex,
                    int *           jjnr,
                    int *           shift,
                    double *         shiftvec,
                    double *         fshift,
                    int *           gid,
                    double *         pos,
                    double *         faction,
                    double *         charge,
                    double *         p_facel,
                    double *         p_krf,
                    double *         p_crf,
                    double *         Vc,
                    int *           type,
                    int *           p_ntype,
                    double *         vdwparam,
                    double *         Vvdw,
                    double *         p_tabscale,
                    double *         VFtab,
                    double *         invsqrta,
                    double *         dvda,
                    double *         p_gbtabscale,
                    double *         GBtab,
                    int *           p_nthreads,
                    int *           count,
                    void *          mtx,
                    int *           outeriter,
                    int *           inneriter,
                    double *         work)
{
	int           nri,offset;
	int           n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
	double        facel,gbtabscl,vct,vgbt;
	double        shX,shY,shZ,isai_d,dva;
	gmx_gbdata_t *gbdata;
	float *        gpol;

	__m128d       ix,iy,iz,jx,jy,jz;
	__m128d       dx,dy,dz,t1,t2,t3;
	__m128d       fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
	__m128d       q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdaj;
	__m128d       Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d;
	__m128d       xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
	__m128d       fac,gbtabscale;
	__m128i       n0,nnn;

	const __m128d neg    = _mm_set1_pd(-1.0);
	const __m128d half   = _mm_set1_pd(0.5);
	const __m128d two    = _mm_set1_pd(2.0);

	gbdata     = (gmx_gbdata_t *)work;
	gpol       = gbdata->gpol;

	nri        = *p_nri;
	facel      = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent));
	gbtabscl   = *p_gbtabscale;
	nj1        = 0;

	/* Splat variables */
	fac        = _mm_load1_pd(&facel);
	gbtabscale = _mm_load1_pd(&gbtabscl);

	/* Keep compiler happy */
	dvdatmp = _mm_setzero_pd();
	vgb     = _mm_setzero_pd();
	dvdaj   = _mm_setzero_pd();
	isaj    = _mm_setzero_pd();
	vcoul   = _mm_setzero_pd();
	t1      = _mm_setzero_pd();
	t2      = _mm_setzero_pd();
	t3      = _mm_setzero_pd();

	jnr1=jnr2=0;
	j13=j23=0;

	for(n=0;n<nri;n++)
	{
		is3     = 3*shift[n];
		shX     = shiftvec[is3];
		shY     = shiftvec[is3+1];
		shZ     = shiftvec[is3+2];

		nj0     = jindex[n];
		nj1     = jindex[n+1];
		offset  = (nj1-nj0)%2;

		ii      = iinr[n];
		ii3     = ii*3;

		/* Shifted i coordinates: each component uses its own shift component */
		ix      = _mm_set1_pd(shX+pos[ii3+0]);
		iy      = _mm_set1_pd(shY+pos[ii3+1]);
		iz      = _mm_set1_pd(shZ+pos[ii3+2]);
		q       = _mm_set1_pd(charge[ii]);

		iq      = _mm_mul_pd(fac,q);
		isai_d  = invsqrta[ii];
		isai    = _mm_load1_pd(&isai_d);

		fix     = _mm_setzero_pd();
		fiy     = _mm_setzero_pd();
		fiz     = _mm_setzero_pd();
		dvdasum = _mm_setzero_pd();
		vctot   = _mm_setzero_pd();
		vgbtot  = _mm_setzero_pd();

		/* Main loop: two j atoms per iteration, one in each lane */
		for(k=nj0;k<nj1-offset; k+=2)
		{
			jnr1    = jjnr[k];
			jnr2    = jjnr[k+1];

			j13     = jnr1 * 3;
			j23     = jnr2 * 3;

			/* Load coordinates */
			xmm1    = _mm_loadu_pd(pos+j13); /* x1 y1 */
			xmm2    = _mm_loadu_pd(pos+j23); /* x2 y2 */

			xmm5    = _mm_load_sd(pos+j13+2); /* z1 - */
			xmm6    = _mm_load_sd(pos+j23+2); /* z2 - */

			/* transpose */
			jx      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
			jy      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
			jz      = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));

			/* distances */
			dx      = _mm_sub_pd(ix,jx);
			dy      = _mm_sub_pd(iy,jy);
			dz      = _mm_sub_pd(iz,jz);

			rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv    = my_invrsq_pd(rsq11);

			/* Load invsqrta */
			isaj    = _mm_loadl_pd(isaj,invsqrta+jnr1);
			isaj    = _mm_loadh_pd(isaj,invsqrta+jnr2);
			isaprod = _mm_mul_pd(isai,isaj);

			/* Load charges */
			q       = _mm_loadl_pd(q,charge+jnr1);
			q       = _mm_loadh_pd(q,charge+jnr2);
			qq      = _mm_mul_pd(iq,q);

			vcoul   = _mm_mul_pd(qq,rinv);
			fscal   = _mm_mul_pd(vcoul,rinv);
			qq      = _mm_mul_pd(isaprod,qq);
			qq      = _mm_mul_pd(qq,neg);
			gbscale = _mm_mul_pd(isaprod,gbtabscale);

			/* Load dvdaj */
			dvdaj   = _mm_loadl_pd(dvdaj, dvda+jnr1);
			dvdaj   = _mm_loadh_pd(dvdaj, dvda+jnr2);

			/* Table position: integer part n0 selects the point, eps is the fraction */
			r       = _mm_mul_pd(rsq11,rinv);
			rt      = _mm_mul_pd(r,gbscale);
			n0      = _mm_cvttpd_epi32(rt);
			n0d     = _mm_cvtepi32_pd(n0);
			eps     = _mm_sub_pd(rt,n0d);
			eps2    = _mm_mul_pd(eps,eps);

			/* Four doubles (Y,F,G,H) per table point */
			nnn     = _mm_slli_epi64(n0,2);

			xmm1    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */

			Y       = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F       = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G       = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H       = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */

			G       = _mm_mul_pd(G,eps);
			H       = _mm_mul_pd(H,eps2);
			Fp      = _mm_add_pd(F,G);
			Fp      = _mm_add_pd(Fp,H);
			VV      = _mm_mul_pd(Fp,eps);
			VV      = _mm_add_pd(Y,VV);
			H       = _mm_mul_pd(two,H);
			FF      = _mm_add_pd(Fp,G);
			FF      = _mm_add_pd(FF,H);
			vgb     = _mm_mul_pd(qq,VV);
			fijC    = _mm_mul_pd(qq,FF);
			fijC    = _mm_mul_pd(fijC,gbscale);

			dvdatmp = _mm_mul_pd(fijC,r);
			dvdatmp = _mm_add_pd(vgb,dvdatmp);
			dvdatmp = _mm_mul_pd(dvdatmp,neg);
			dvdatmp = _mm_mul_pd(dvdatmp,half);
			dvdasum = _mm_add_pd(dvdasum,dvdatmp);

			xmm1    = _mm_mul_pd(dvdatmp,isaj);
			xmm1    = _mm_mul_pd(xmm1,isaj);
			dvdaj   = _mm_add_pd(dvdaj,xmm1);

			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);
			_mm_storeh_pd(dvda+jnr2,dvdaj);

			vctot   = _mm_add_pd(vctot,vcoul);
			vgbtot  = _mm_add_pd(vgbtot,vgb);

			fscal   = _mm_sub_pd(fijC,fscal);
			fscal   = _mm_mul_pd(fscal,neg);
			fscal   = _mm_mul_pd(fscal,rinv);

			/* calculate partial force terms */
			t1      = _mm_mul_pd(fscal,dx);
			t2      = _mm_mul_pd(fscal,dy);
			t3      = _mm_mul_pd(fscal,dz);

			/* update the i force */
			fix     = _mm_add_pd(fix,t1);
			fiy     = _mm_add_pd(fiy,t2);
			fiz     = _mm_add_pd(fiz,t3);

			/* accumulate forces from memory */
			xmm1    = _mm_loadu_pd(faction+j13); /* fx1 fy1 */
			xmm2    = _mm_loadu_pd(faction+j23); /* fx2 fy2 */

			xmm5    = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
			xmm6    = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */

			/* transpose */
			xmm7    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
			xmm5    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
			xmm6    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */

			/* subtract partial forces */
			xmm5    = _mm_sub_pd(xmm5,t1);
			xmm6    = _mm_sub_pd(xmm6,t2);
			xmm7    = _mm_sub_pd(xmm7,t3);

			xmm1    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
			xmm2    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fx2 fy2 */

			/* store fx and fy */
			_mm_storeu_pd(faction+j13,xmm1);
			_mm_storeu_pd(faction+j23,xmm2);

			/* .. then fz: lane 0 belongs to atom 1, lane 1 to atom 2 */
			_mm_storel_pd(faction+j13+2,xmm7);
			_mm_storeh_pd(faction+j23+2,xmm7);
		}

		/* In double precision, offset can only be either 0 or 1 */
		if(offset!=0)
		{
			/* Single leftover j atom: only lane 0 holds meaningful data.
			 * The _sd operations leave lane 1 of the accumulators untouched.
			 */
			jnr1    = jjnr[k];
			j13     = jnr1*3;

			jx      = _mm_load_sd(pos+j13);
			jy      = _mm_load_sd(pos+j13+1);
			jz      = _mm_load_sd(pos+j13+2);

			isaj    = _mm_load_sd(invsqrta+jnr1);
			isaprod = _mm_mul_sd(isai,isaj);
			dvdaj   = _mm_load_sd(dvda+jnr1);
			q       = _mm_load_sd(charge+jnr1);
			qq      = _mm_mul_sd(iq,q);

			dx      = _mm_sub_sd(ix,jx);
			dy      = _mm_sub_sd(iy,jy);
			dz      = _mm_sub_sd(iz,jz);

			rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv    = my_invrsq_pd(rsq11);

			vcoul   = _mm_mul_sd(qq,rinv);
			fscal   = _mm_mul_sd(vcoul,rinv);
			qq      = _mm_mul_sd(isaprod,qq);
			qq      = _mm_mul_sd(qq,neg);
			gbscale = _mm_mul_sd(isaprod,gbtabscale);

			r       = _mm_mul_sd(rsq11,rinv);
			rt      = _mm_mul_sd(r,gbscale);
			n0      = _mm_cvttpd_epi32(rt);
			n0d     = _mm_cvtepi32_pd(n0);
			eps     = _mm_sub_sd(rt,n0d);
			eps2    = _mm_mul_sd(eps,eps);

			nnn     = _mm_slli_epi64(n0,2);

			/* Lane 1 of nnn is derived from leftover upper-lane data, not
			 * from a real pair, so it must not be used as a table index
			 * (it could point outside GBtab).  The upper-lane table values
			 * never reach any stored result, so zeros are used instead.
			 */
			xmm1    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2    = _mm_setzero_pd();
			xmm3    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4    = _mm_setzero_pd();

			Y       = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
			F       = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
			G       = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0));
			H       = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1));

			G       = _mm_mul_sd(G,eps);
			H       = _mm_mul_sd(H,eps2);
			Fp      = _mm_add_sd(F,G);
			Fp      = _mm_add_sd(Fp,H);
			VV      = _mm_mul_sd(Fp,eps);
			VV      = _mm_add_sd(Y,VV);
			H       = _mm_mul_sd(two,H);
			FF      = _mm_add_sd(Fp,G);
			FF      = _mm_add_sd(FF,H);
			vgb     = _mm_mul_sd(qq,VV);
			fijC    = _mm_mul_sd(qq,FF);
			fijC    = _mm_mul_sd(fijC,gbscale);

			dvdatmp = _mm_mul_sd(fijC,r);
			dvdatmp = _mm_add_sd(vgb,dvdatmp);
			dvdatmp = _mm_mul_sd(dvdatmp,neg);
			dvdatmp = _mm_mul_sd(dvdatmp,half);
			dvdasum = _mm_add_sd(dvdasum,dvdatmp);

			xmm1    = _mm_mul_sd(dvdatmp,isaj);
			xmm1    = _mm_mul_sd(xmm1,isaj);
			dvdaj   = _mm_add_sd(dvdaj,xmm1);

			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);

			vctot   = _mm_add_sd(vctot,vcoul);
			vgbtot  = _mm_add_sd(vgbtot,vgb);

			fscal   = _mm_sub_sd(fijC,fscal);
			fscal   = _mm_mul_sd(fscal,neg);
			fscal   = _mm_mul_sd(fscal,rinv);

			/* calculate partial force terms */
			t1      = _mm_mul_sd(fscal,dx);
			t2      = _mm_mul_sd(fscal,dy);
			t3      = _mm_mul_sd(fscal,dz);

			/* update the i force */
			fix     = _mm_add_sd(fix,t1);
			fiy     = _mm_add_sd(fiy,t2);
			fiz     = _mm_add_sd(fiz,t3);

			/* accumulate forces from memory */
			xmm5    = _mm_load_sd(faction+j13);   /* fx */
			xmm6    = _mm_load_sd(faction+j13+1); /* fy */
			xmm7    = _mm_load_sd(faction+j13+2); /* fz */

			/* subtract partial forces */
			xmm5    = _mm_sub_sd(xmm5,t1);
			xmm6    = _mm_sub_sd(xmm6,t2);
			xmm7    = _mm_sub_sd(xmm7,t3);

			/* store forces */
			_mm_store_sd(faction+j13,xmm5);
			_mm_store_sd(faction+j13+1,xmm6);
			_mm_store_sd(faction+j13+2,xmm7);
		}

		/* fix/fiy/fiz now contain two partial terms (one per lane) that
		 * both should be added to the i particle forces.  The unpack/add
		 * below leaves lane0+lane1 in the upper lane; t1/t2/t3 only act as
		 * filler for the lower lane, which is discarded by the shuffle.
		 */
		t1       = _mm_unpacklo_pd(t1,fix);
		t2       = _mm_unpacklo_pd(t2,fiy);
		t3       = _mm_unpacklo_pd(t3,fiz);

		fix      = _mm_add_pd(fix,t1);
		fiy      = _mm_add_pd(fiy,t2);
		fiz      = _mm_add_pd(fiz,t3);

		fix      = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
		fiy      = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
		fiz      = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));

		/* Load i forces from memory */
		xmm1     = _mm_load_sd(faction+ii3);
		xmm2     = _mm_load_sd(faction+ii3+1);
		xmm3     = _mm_load_sd(faction+ii3+2);

		/* Add to i force */
		fix      = _mm_add_sd(fix,xmm1);
		fiy      = _mm_add_sd(fiy,xmm2);
		fiz      = _mm_add_sd(fiz,xmm3);

		/* store i forces to memory */
		_mm_store_sd(faction+ii3,fix);
		_mm_store_sd(faction+ii3+1,fiy);
		_mm_store_sd(faction+ii3+2,fiz);

		/* now do dvda (same lane reduction; the sum ends up in the upper lane) */
		dvdatmp  = _mm_unpacklo_pd(dvdatmp,dvdasum);
		dvdasum  = _mm_add_pd(dvdasum,dvdatmp);
		_mm_storeh_pd(&dva,dvdasum);
		dvda[ii] = dvda[ii] + dva*isai_d*isai_d;

		ggid     = gid[n];

		/* Coulomb potential */
		vcoul    = _mm_unpacklo_pd(vcoul,vctot);
		vctot    = _mm_add_pd(vctot,vcoul);
		_mm_storeh_pd(&vct,vctot);
		Vc[ggid] = Vc[ggid] + vct;

		/* GB potential */
		vgb      = _mm_unpacklo_pd(vgb,vgbtot);
		vgbtot   = _mm_add_pd(vgbtot,vgb);
		_mm_storeh_pd(&vgbt,vgbtot);
		gpol[ggid] = gpol[ggid] + vgbt;
	}

	*outeriter   = nri;
	*inneriter   = nj1;

}
void exchlaplacecoeff_gmrfData_0(unsigned int slot) {
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((!neighbor_isValid[0][0])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S1053, S1056, S1059, S1050, S1058, S1052, S1055, S1060, S1054, S1057, S1051 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+26)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+32)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+26)] = 0.000000e+00;
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+146)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+152)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+146)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+98)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+104)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+98)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+74)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+80)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+74)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posBegin[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<0); i1 += 4) {
/* xPos = posBegin[0]; */
__m128d vec0 = _mm_load1_pd((&posBegin[0]));
__m128d vec0_2 = _mm_load1_pd((&posBegin[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<3); i1 += 1) {
xPos = posBegin[0];
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+122)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+128)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+122)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+170)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+176)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+170)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+194)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+200)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+194)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/1.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(1.000000e+00);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<0); i1 += 4) {
/* yPos = ((((i1-1)/1.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<3); i1 += 1) {
yPos = ((((i1-1)/1.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+2)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+8)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+2)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+50)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+56)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+50)] = 0.000000e+00;
}
}
}
}
}
if ((!neighbor_isValid[0][1])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S1071, S1065, S1068, S1062, S1070, S1064, S1067, S1061, S1069, S1063, S1066 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+195)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+201)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+195)] = 0.000000e+00;
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+51)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+57)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+51)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+75)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+81)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+75)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+3)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+9)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+3)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+171)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+177)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+171)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/1.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(1.000000e+00);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<0); i1 += 4) {
/* yPos = ((((i1-1)/1.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<3); i1 += 1) {
yPos = ((((i1-1)/1.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+99)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+105)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+99)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+123)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+129)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+123)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+147)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+153)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+147)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posEnd[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<0); i1 += 4) {
/* xPos = posEnd[0]; */
__m128d vec0 = _mm_load1_pd((&posEnd[0]));
__m128d vec0_2 = _mm_load1_pd((&posEnd[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<3); i1 += 1) {
xPos = posEnd[0];
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+27)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+33)] = 0.000000e+00;
}
for (; (i1<=2); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+27)] = 0.000000e+00;
}
}
}
}
}
if ((!neighbor_isValid[0][2])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S1080, S1074, S1077, S1082, S1076, S1079, S1073, S1072, S1081, S1075, S1078 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+126)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+127)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+126)] = 0.000000e+00;
}
}
{
int i2 = 2;
for (; (i2<=2); i2 += 2) {
xPos = ((((i2-2)/1.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/1.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=3); i2 += 1) {
xPos = ((((i2-2)/1.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+198)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+199)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+198)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=2); i2 += 2) {
yPos = posBegin[1];
yPos = posBegin[1];
}
for (; (i2<=3); i2 += 1) {
yPos = posBegin[1];
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+30)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+31)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+30)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+174)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+175)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+174)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+78)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+79)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+78)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+54)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+55)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+54)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+150)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+151)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+150)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+6)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+7)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+6)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+102)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+103)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+102)] = 0.000000e+00;
}
}
}
}
}
if ((!neighbor_isValid[0][3])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S1083, S1092, S1086, S1089, S1088, S1091, S1085, S1090, S1093, S1087, S1084 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+12)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+13)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+12)] = 0.000000e+00;
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+60)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+61)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+60)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+204)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+205)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+204)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+132)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+133)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+132)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+84)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+85)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+84)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=2); i2 += 2) {
yPos = posEnd[1];
yPos = posEnd[1];
}
for (; (i2<=3); i2 += 1) {
yPos = posEnd[1];
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+36)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+37)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+36)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=2); i2 += 2) {
xPos = ((((i2-2)/1.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/1.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=3); i2 += 1) {
xPos = ((((i2-2)/1.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+180)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+181)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+180)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+156)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+157)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+156)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][0]);
int i2 = 2;
for (; (i2<=2); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+108)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+109)] = 0.000000e+00;
}
for (; (i2<=3); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[(i2+108)] = 0.000000e+00;
}
}
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
/* Statements in this Scop: S1094 */
for (int i0 = 0; (i0<=8); i0 += 1) {
double* buffer_Send_1_p1 = (&buffer_Send[1][(i0*2)]);
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][(i0*24)]);
int i1 = 1;
for (; (i1<=1); i1 += 2) {
buffer_Send_1_p1[(i1-1)] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+3)];
buffer_Send_1_p1[i1] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+9)];
}
for (; (i1<=2); i1 += 1) {
buffer_Send_1_p1[(i1-1)] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+3)];
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
MPI_Isend(buffer_Send[1], 18, MPI_DOUBLE, neighbor_remoteRank[0][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
MPI_Irecv(buffer_Recv[0], 18, MPI_DOUBLE, neighbor_remoteRank[0][0], ((unsigned int)(neighbor_fragCommId[0][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
/* Statements in this Scop: S1095 */
for (int i0 = 0; (i0<=8); i0 += 1) {
double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i0*2)]);
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][(i0*24)]);
int i1 = 3;
for (; (i1<=3); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-10)] = buffer_Recv_0_p1[(i1-3)];
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-4)] = buffer_Recv_0_p1[(i1-2)];
}
for (; (i1<=4); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-10)] = buffer_Recv_0_p1[(i1-3)];
}
}
}
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) {
MPI_Isend(&fieldData_LaplaceCoeff_GMRF[0][14], 1, mpiDatatype_9_2_24, neighbor_remoteRank[0][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) {
MPI_Irecv(&fieldData_LaplaceCoeff_GMRF[0][8], 1, mpiDatatype_9_2_24, neighbor_remoteRank[0][2], ((unsigned int)(neighbor_fragCommId[0][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
/* Statements in this Scop: S1096 */
for (int i0 = 0; (i0<=8); i0 += 1) {
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][(i0*24)]);
double* buffer_Send_0_p1 = (&buffer_Send[0][(i0*4)]);
int i1 = 0;
for (; (i1<=2); i1 += 2) {
buffer_Send_0_p1[i1] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+3)];
buffer_Send_0_p1[(i1+1)] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+9)];
}
for (; (i1<=3); i1 += 1) {
buffer_Send_0_p1[i1] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+3)];
}
}
}
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
/* Statements in this Scop: S1097 */
for (int i0 = 0; (i0<=8); i0 += 1) {
double* buffer_Send_1_p1 = (&buffer_Send[1][(i0*4)]);
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][(i0*24)]);
int i1 = 0;
for (; (i1<=2); i1 += 2) {
buffer_Send_1_p1[i1] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+2)];
buffer_Send_1_p1[(i1+1)] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+8)];
}
for (; (i1<=3); i1 += 1) {
buffer_Send_1_p1[i1] = fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+2)];
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
MPI_Isend(buffer_Send[0], 36, MPI_DOUBLE, neighbor_remoteRank[0][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]);
reqOutstanding_Send[0] = true;
}
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
MPI_Isend(buffer_Send[1], 36, MPI_DOUBLE, neighbor_remoteRank[0][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
MPI_Irecv(buffer_Recv[0], 36, MPI_DOUBLE, neighbor_remoteRank[0][0], ((unsigned int)(neighbor_fragCommId[0][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
MPI_Irecv(buffer_Recv[1], 36, MPI_DOUBLE, neighbor_remoteRank[0][1], ((unsigned int)(neighbor_fragCommId[0][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
if (reqOutstanding_Recv[1]) {
waitForMPIReq(&mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
/* Statements in this Scop: S1098 */
for (int i0 = 0; (i0<=8); i0 += 1) {
double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i0*4)]);
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][(i0*24)]);
int i1 = 1;
for (; (i1<=3); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-5)] = buffer_Recv_0_p1[(i1-1)];
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)+1)] = buffer_Recv_0_p1[i1];
}
for (; (i1<=4); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-5)] = buffer_Recv_0_p1[(i1-1)];
}
}
}
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
/* Statements in this Scop: S1099 */
for (int i0 = 0; (i0<=8); i0 += 1) {
double* buffer_Recv_1_p1 = (&buffer_Recv[1][(i0*4)]);
double* fieldData_LaplaceCoeff_GMRF_0_p1 = (&fieldData_LaplaceCoeff_GMRF[0][(i0*24)]);
int i1 = 4;
for (; (i1<=6); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-20)] = buffer_Recv_1_p1[(i1-4)];
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-14)] = buffer_Recv_1_p1[(i1-3)];
}
for (; (i1<=7); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_0_p1[((i1*6)-20)] = buffer_Recv_1_p1[(i1-4)];
}
}
}
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Send[0]) {
waitForMPIReq(&mpiRequest_Send[0]);
reqOutstanding_Send[0] = false;
}
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) {
MPI_Isend(&fieldData_LaplaceCoeff_GMRF[0][13], 1, mpiDatatype_9_4_24, neighbor_remoteRank[0][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]);
reqOutstanding_Send[2] = true;
}
if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) {
MPI_Isend(&fieldData_LaplaceCoeff_GMRF[0][7], 1, mpiDatatype_9_4_24, neighbor_remoteRank[0][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) {
MPI_Irecv(&fieldData_LaplaceCoeff_GMRF[0][1], 1, mpiDatatype_9_4_24, neighbor_remoteRank[0][2], ((unsigned int)(neighbor_fragCommId[0][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) {
MPI_Irecv(&fieldData_LaplaceCoeff_GMRF[0][19], 1, mpiDatatype_9_4_24, neighbor_remoteRank[0][3], ((unsigned int)(neighbor_fragCommId[0][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
if (reqOutstanding_Recv[3]) {
waitForMPIReq(&mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Send[2]) {
waitForMPIReq(&mpiRequest_Send[2]);
reqOutstanding_Send[2] = false;
}
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
}
/*
 * Lane-wise on two packed doubles, compute vqn * (vx1 - vx0) and store the
 * result in vwl[0].
 *
 *   vqn      : packed multiplier
 *   vx0, vx1 : packed operands; the difference taken is vx1 - vx0
 *   vwl      : output array; only element 0 is written
 *
 * Declared `static inline` rather than plain `inline`: in C99/C11 a plain
 * `inline` definition does not emit an external symbol, so any call the
 * compiler chooses not to inline would fail to link.
 */
static inline void vwl_s1( __m128d const vqn, __m128d const vx0, __m128d const vx1, __m128d vwl[] ) {
  __m128d const delta = _mm_sub_pd( vx1, vx0 );

  vwl[0] = _mm_mul_pd( vqn, delta );
}
Beispiel #6
0
namespace nt2 { namespace ext
{
  // Specialisation of `call` for tag::minus_ applied to two SSE
  // double-precision SIMD operands, on tag::cpu_.
  template<class Dummy>
  struct  call< tag::minus_ ( tag::simd_<tag::double_,tag::sse_>
                            , tag::simd_<tag::double_,tag::sse_>
                            )
              , tag::cpu_, Dummy
              >
        : callable
  {
    // Result-type protocol: for a call signature with two arguments of the
    // same type A, result<This(A,A)> inherits from meta::strip<A>.
    template<class Sig>           struct result;
    template<class This,class A>  struct result<This(A,A)> : meta::strip<A> {};

    // NOTE(review): NT2_FUNCTOR_CALL is a project macro not visible here;
    // presumably NT2_FUNCTOR_CALL(2) declares the binary call operator and
    // introduces the arguments a0, a1 and the type A0 -- confirm against its
    // definition.
    NT2_FUNCTOR_CALL(2)
    {
      // Brace-initialise an A0 from the packed-double difference a0 - a1.
      A0 that = { _mm_sub_pd(a0,a1) };
      return that;
    }
  };

  template<class Dummy>
  struct  call< tag::minus_ ( tag::simd_<tag::float_,tag::sse_>
                            , tag::simd_<tag::float_,tag::sse_>
                            )
              , tag::cpu_, Dummy
              >
        : callable
  {
    template<class Sig>           struct result;
    template<class This,class A>  struct result<This(A,A)> : meta::strip<A> {};
Beispiel #7
0
// Build only those entries of the SU(2) sub-block su2_i = subgroup( U*staple^\dagger )
// that are selected by su2_index, then normalise them.
//
//   s0 , s1   : outputs, the two normalised complex sub-block entries
//   scale     : output, the normalisation factor that was applied
//   U, staple : NCNC-element link and staple matrices, read as packed
//               (re,im) pairs through __m128d pointers
//   su2_index : which SU(2) subgroup to extract
//
// _mm_store_pd is used on s0 and s1, so both must be 16-byte aligned.
// NOTE(review): SSE2_MUL_CONJ, SSE2_MULCONJ and SSE2_FMA are project macros
// not visible here; the normalisation comments below assume
// SSE2_FMA( a , b , c ) evaluates a*b + c -- confirm against its definition.
void
only_subgroup( GLU_complex *s0 ,
               GLU_complex *s1 ,
               double *scale ,
               const GLU_complex U[ NCNC ] ,
               const GLU_complex staple[ NCNC ] ,
               const size_t su2_index )
{
  // treat every matrix element as one packed pair of doubles
  const __m128d *link = (const __m128d*)U ;
  const __m128d *stap = (const __m128d*)staple ;

  // unnormalised sub-block entries
  __m128d acc0 , acc1 ;

#if NC == 3
  // The three subgroups pair up matrix rows (0,1), (1,2) and (2,0):
  //
  //    rotation 1          rotation 2           rotation 3
  //  |  s0   s1  0 |     |  1    0   0  |     | s0*  0  -s1 |
  //  | -s1*  s0* 0 |     |  0   s0  s1  |     |  0   1   0  |
  //  |  0     0  1 |     |  0  -s1* s0* |     | s1   0  s0  |
  //
  // ra and rb are the flat offsets of the first element of each paired row:
  // (0,3) , (3,6) and (6,0) for su2_index%3 == 0 , 1 , 2.
  const size_t ra = 3 * ( su2_index % 3 ) ;
  const size_t rb = ( ra + 3 ) % 9 ;

  // temp0 : row ra of U with row ra of the staple
  const __m128d temp0 =
    _mm_add_pd( SSE2_MUL_CONJ( link[ ra + 0 ] , stap[ ra + 0 ] ) ,
                _mm_add_pd( SSE2_MUL_CONJ( link[ ra + 1 ] , stap[ ra + 1 ] ) ,
                            SSE2_MUL_CONJ( link[ ra + 2 ] , stap[ ra + 2 ] ) ) ) ;
  // temp3^* : row rb of U with row rb of the staple
  const __m128d temp3c =
    _mm_add_pd( SSE2_MULCONJ( link[ rb + 0 ] , stap[ rb + 0 ] ) ,
                _mm_add_pd( SSE2_MULCONJ( link[ rb + 1 ] , stap[ rb + 1 ] ) ,
                            SSE2_MULCONJ( link[ rb + 2 ] , stap[ rb + 2 ] ) ) ) ;
  // temp1 : row ra of U with row rb of the staple
  const __m128d temp1 =
    _mm_add_pd( SSE2_MUL_CONJ( link[ ra + 0 ] , stap[ rb + 0 ] ) ,
                _mm_add_pd( SSE2_MUL_CONJ( link[ ra + 1 ] , stap[ rb + 1 ] ) ,
                            SSE2_MUL_CONJ( link[ ra + 2 ] , stap[ rb + 2 ] ) ) ) ;
  // temp2^* : row rb of U with row ra of the staple
  const __m128d temp2c =
    _mm_add_pd( SSE2_MULCONJ( link[ rb + 0 ] , stap[ ra + 0 ] ) ,
                _mm_add_pd( SSE2_MULCONJ( link[ rb + 1 ] , stap[ ra + 1 ] ) ,
                            SSE2_MULCONJ( link[ rb + 2 ] , stap[ ra + 2 ] ) ) ) ;

  acc0 = _mm_add_pd( temp0 , temp3c ) ;
  acc1 = _mm_sub_pd( temp1 , temp2c ) ;
#elif NC == 2
  // single subgroup : row 0 is elements 0,1 and row 1 is elements 2,3
  // temp0
  const __m128d temp0 =
    _mm_add_pd( SSE2_MUL_CONJ( link[ 0 ] , stap[ 0 ] ) ,
                SSE2_MUL_CONJ( link[ 1 ] , stap[ 1 ] ) ) ;
  // temp3^*
  const __m128d temp3c =
    _mm_add_pd( SSE2_MULCONJ( link[ 2 ] , stap[ 2 ] ) ,
                SSE2_MULCONJ( link[ 3 ] , stap[ 3 ] ) ) ;
  // temp1
  const __m128d temp1 =
    _mm_add_pd( SSE2_MUL_CONJ( link[ 0 ] , stap[ 2 ] ) ,
                SSE2_MUL_CONJ( link[ 1 ] , stap[ 3 ] ) ) ;
  // temp2^*
  const __m128d temp2c =
    _mm_add_pd( SSE2_MULCONJ( link[ 2 ] , stap[ 0 ] ) ,
                SSE2_MULCONJ( link[ 3 ] , stap[ 1 ] ) ) ;

  acc0 = _mm_add_pd( temp0 , temp3c ) ;
  acc1 = _mm_sub_pd( temp1 , temp2c ) ;
#else
  // su(N) version : the paired rows come from the precomputed subgroup table
  const size_t ra = NC * ( Latt.su2_data[ su2_index ].idx_a / NC ) ;
  const size_t rb = NC * ( Latt.su2_data[ su2_index ].idx_b % NC ) ;

  // start from zero and accumulate one column at a time
  acc0 = _mm_setzero_pd() ;
  acc1 = _mm_setzero_pd() ;

  size_t k ;
  for( k = 0 ; k < NC ; k++ ) {
    acc0 = _mm_add_pd( acc0 ,
                       _mm_add_pd( SSE2_MUL_CONJ( link[ ra + k ] , stap[ ra + k ] ) ,
                                   SSE2_MULCONJ( link[ rb + k ] , stap[ rb + k ] ) ) ) ;
    acc1 = _mm_add_pd( acc1 ,
                       _mm_sub_pd( SSE2_MUL_CONJ( link[ ra + k ] , stap[ rb + k ] ) ,
                                   SSE2_MULCONJ( link[ rb + k ] , stap[ ra + k ] ) ) ) ;
  }
#endif

  // lane-wise squares of both entries, then add the swapped lanes so that
  // each lane holds the same total
  const __m128d sq = SSE2_FMA( acc0 , acc0 , _mm_mul_pd( acc1 , acc1 ) ) ;
  const __m128d total = _mm_add_pd( sq , _mm_shuffle_pd( sq , sq , 1 ) ) ;

  // reciprocal square root of the total, identical in both lanes
  const __m128d inv = _mm_div_pd( _mm_set1_pd( 1.0 ) , _mm_sqrt_pd( total ) ) ;

  // scale both entries before anything is written back
  const __m128d out0 = _mm_mul_pd( acc0 , inv ) ;
  const __m128d out1 = _mm_mul_pd( acc1 , inv ) ;

  // write the scaled entries and the low lane of the scale factor
  _mm_store_pd( (void*)s0 , out0 ) ;
  _mm_store_pd( (void*)s1 , out1 ) ;
  _mm_store_sd( (void*)scale , inv ) ;

  return ;
}
Beispiel #8
0
int fft5b_(double *a, double *b, double *w, int *m, int *l)
{
    /* static double c51 = .95105651629515357;
    static double c52 = .61803398874989485;
    static double c53 = .55901699437494742;
    static double c54 = .25; */
    static __m128d c51, c52, c53, c54;

    int i, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, j, j0;
    /* double x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, x6, y6, x7, y7,
                x8, y8, x9, y9, x10, y10, wi1, wi2, wi3, wi4, wr1, wr2, wr3, wr4; */
    __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, w1, w2, w3, w4;

    c51 = _mm_set1_pd(0.95105651629515357);
    c52 = _mm_set1_pd(0.61803398874989485);
    c53 = _mm_set1_pd(0.55901699437494742);
    c54 = _mm_set1_pd(0.25);

    for (i = 0; i < *m; i++) {
        i0 = i << 1;
	i1 = i0 + (*m * *l << 1);
	i2 = i1 + (*m * *l << 1);
	i3 = i2 + (*m * *l << 1);
	i4 = i3 + (*m * *l << 1);
	i5 = i << 1;
	i6 = i5 + (*m << 1);
	i7 = i6 + (*m << 1);
	i8 = i7 + (*m << 1);
	i9 = i8 + (*m << 1);
	/* x0 = a[i1] + a[i4];
	y0 = a[i1 + 1] + a[i4 + 1];
	x1 = a[i2] + a[i3];
	y1 = a[i2 + 1] + a[i3 + 1];
	x2 = c51 * (a[i1] - a[i4]);
	y2 = c51 * (a[i1 + 1] - a[i4 + 1]);
	x3 = c51 * (a[i2] - a[i3]);
	y3 = c51 * (a[i2 + 1] - a[i3 + 1]);
	x4 = x0 + x1;
	y4 = y0 + y1;
	x5 = c53 * (x0 - x1);
	y5 = c53 * (y0 - y1);
	x6 = a[i0] - c54 * x4;
	y6 = a[i0 + 1] - c54 * y4;
	x7 = x6 + x5;
	y7 = y6 + y5;
	x8 = x6 - x5;
	y8 = y6 - y5;
	x9 = y2 + c52 * y3;
	y9 = -x2 - c52 * x3;
	x10 = c52 * y2 - y3;
	y10 = x3 - c52 * x2; */
	t1 = _mm_load_pd(&a[i1]);
	t4 = _mm_load_pd(&a[i4]);
	t0 = _mm_add_pd(t1, t4);
	t2 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4));
	t1 = _mm_load_pd(&a[i2]);
	t4 = _mm_load_pd(&a[i3]);
	t3 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4));
	t1 = _mm_add_pd(t1, t4);
	t4 = _mm_add_pd(t0, t1);
	t5 = _mm_mul_pd(c53, _mm_sub_pd(t0, t1));
	t0 = _mm_load_pd(&a[i0]);
	t6 = _mm_sub_pd(t0, _mm_mul_pd(c54, t4));
	t7 = _mm_add_pd(t6, t5);
	t8 = _mm_sub_pd(t6, t5);
	t9 = _mm_xor_pd(_mm_add_pd(t2, _mm_mul_pd(c52, t3)), _mm_set_sd(-0.0));
	t9 = _mm_shuffle_pd(t9, t9, 1);
	t10 = _mm_sub_pd(t3, _mm_mul_pd(c52, t2));
	t10 = _mm_xor_pd(_mm_shuffle_pd(t10, t10, 1), _mm_set_sd(-0.0));
	/* b[i5] = a[i0] + x4;
	b[i5 + 1] = a[i0 + 1] + y4;
	b[i6] = x7 + x9;
	b[i6 + 1] = y7 + y9;
	b[i7] = x8 + x10;
	b[i7 + 1] = y8 + y10;
	b[i8] = x8 - x10;
	b[i8 + 1] = y8 - y10;
	b[i9] = x7 - x9;
	b[i9 + 1] = y7 - y9; */
	_mm_store_pd(&b[i5], _mm_add_pd(t0, t4));
	_mm_store_pd(&b[i6], _mm_add_pd(t7, t9));
	_mm_store_pd(&b[i7], _mm_add_pd(t8, t10));
	_mm_store_pd(&b[i8], _mm_sub_pd(t8, t10));
	_mm_store_pd(&b[i9], _mm_sub_pd(t7, t9));
    }
    for (j = 1; j < *l; j++) {
        j0 = j << 1;
	/* wr1 = w[j0];
	wi1 = w[j0 + 1];
	wr2 = wr1 * wr1 - wi1 * wi1;
	wi2 = wr1 * wi1 + wr1 * wi1;
	wr3 = wr1 * wr2 - wi1 * wi2;
	wi3 = wr1 * wi2 + wi1 * wr2;
	wr4 = wr2 * wr2 - wi2 * wi2;
	wi4 = wr2 * wi2 + wr2 * wi2; */
	w1 = _mm_load_pd(&w[j0]);
	w2 = ZMUL(w1, w1);
	w3 = ZMUL(w1, w2);
	w4 = ZMUL(w2, w2);
	for (i = 0; i < *m; i++) {
	    i0 = (i << 1) + (j * *m << 1);
	    i1 = i0 + (*m * *l << 1);
	    i2 = i1 + (*m * *l << 1);
	    i3 = i2 + (*m * *l << 1);
	    i4 = i3 + (*m * *l << 1);
	    i5 = (i << 1) + (j * *m * 10);
	    i6 = i5 + (*m << 1);
	    i7 = i6 + (*m << 1);
	    i8 = i7 + (*m << 1);
	    i9 = i8 + (*m << 1);
	    /* x0 = a[i1] + a[i4];
	    y0 = a[i1 + 1] + a[i4 + 1];
	    x1 = a[i2] + a[i3];
	    y1 = a[i2 + 1] + a[i3 + 1];
	    x2 = c51 * (a[i1] - a[i4]);
	    y2 = c51 * (a[i1 + 1] - a[i4 + 1]);
	    x3 = c51 * (a[i2] - a[i3]);
	    y3 = c51 * (a[i2 + 1] - a[i3 + 1]);
	    x4 = x0 + x1;
	    y4 = y0 + y1;
	    x5 = c53 * (x0 - x1);
	    y5 = c53 * (y0 - y1);
	    x6 = a[i0] - c54 * x4;
	    y6 = a[i0 + 1] - c54 * y4;
	    x7 = x6 + x5;
	    y7 = y6 + y5;
	    x8 = x6 - x5;
	    y8 = y6 - y5;
	    x9 = y2 + c52 * y3;
	    y9 = -x2 - c52 * x3;
	    x10 = c52 * y2 - y3;
	    y10 = x3 - c52 * x2; */
	    t1 = _mm_load_pd(&a[i1]);
	    t4 = _mm_load_pd(&a[i4]);
	    t0 = _mm_add_pd(t1, t4);
	    t2 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4));
	    t1 = _mm_load_pd(&a[i2]);
	    t4 = _mm_load_pd(&a[i3]);
	    t3 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4));
	    t1 = _mm_add_pd(t1, t4);
	    t4 = _mm_add_pd(t0, t1);
	    t5 = _mm_mul_pd(c53, _mm_sub_pd(t0, t1));
	    t0 = _mm_load_pd(&a[i0]);
	    t6 = _mm_sub_pd(t0, _mm_mul_pd(c54, t4));
	    t7 = _mm_add_pd(t6, t5);
	    t8 = _mm_sub_pd(t6, t5);
	    t9 = _mm_xor_pd(_mm_add_pd(t2, _mm_mul_pd(c52, t3)), _mm_set_sd(-0.0));
	    t9 = _mm_shuffle_pd(t9, t9, 1);
	    t10 = _mm_sub_pd(t3, _mm_mul_pd(c52, t2));
	    t10 = _mm_xor_pd(_mm_shuffle_pd(t10, t10, 1), _mm_set_sd(-0.0));
	    /* b[i5] = a[i0] + x4;
	    b[i5 + 1] = a[i0 + 1] + y4;
	    b[i6] = wr1 * (x7 + x9) - wi1 * (y7 + y9);
	    b[i6 + 1] = wr1 * (y7 + y9) + wi1 * (x7 + x9);
	    b[i7] = wr2 * (x8 + x10) - wi2 * (y8 + y10);
	    b[i7 + 1] = wr2 * (y8 + y10) + wi2 * (x8 + x10);
	    b[i8] = wr3 * (x8 - x10) - wi3 * (y8 - y10);
	    b[i8 + 1] = wr3 * (y8 - y10) + wi3 * (x8 - x10);
	    b[i9] = wr4 * (x7 - x9) - wi4 * (y7 - y9);
	    b[i9 + 1] = wr4 * (y7 - y9) + wi4 * (x7 - x9); */
	    _mm_store_pd(&b[i5], _mm_add_pd(t0, t4));
	    _mm_store_pd(&b[i6], ZMUL(w1, _mm_add_pd(t7, t9)));
	    _mm_store_pd(&b[i7], ZMUL(w2, _mm_add_pd(t8, t10)));
	    _mm_store_pd(&b[i8], ZMUL(w3, _mm_sub_pd(t8, t10)));
	    _mm_store_pd(&b[i9], ZMUL(w4, _mm_sub_pd(t7, t9)));
	}
    }
    return 0;
}
Beispiel #9
0
/*
 * fft8a_ - radix-8 butterfly pass over *l complex columns (SSE2).
 *
 * Complex values are interleaved (re, im) double pairs; each pair is moved
 * with one aligned 128-bit load/store, so a, b and w must be 16-byte aligned.
 *
 *   a  input: 8 groups of *l complex values; group k of column j starts at
 *      a[2 * (j + k * *l)].
 *   b  output: the 8 results of column j are written to b[16*j .. 16*j+15].
 *   w  twiddles: w[2*j], w[2*j+1] is the base factor w1 of column j; the
 *      powers w2..w7 are built from it with ZMUL (complex multiply).
 *   l  number of columns.
 *
 * All eight inputs of a column are loaded before any output is stored.
 * Always returns 0.
 */
int fft8a_(double *a, double *b, double *w, int *l)
{
    /* 1/sqrt(2) in both lanes.  This was a function-level static that got
       re-assigned on every call; a local constant removes that shared write
       (a data race when the kernel runs on several threads at once). */
    const __m128d c81 = _mm_set1_pd(0.70710678118654752);
    /* XOR mask that flips the sign of the low (real) lane only. */
    const __m128d neg_re = _mm_set_sd(-0.0);
    const int n = *l;
    const int step = 2 * n;     /* doubles between consecutive input groups */
    int j;

    for (j = 0; j < n; j++) {
        const double *in = a + 2 * j;
        double *out = b + 16 * j;
        __m128d w1, w2, w3, w4, w5, w6, w7;
        __m128d x0, x1, x2, x3, x4, x5, x6, x7;
        __m128d sum04, dif04, sum26, rot26;
        __m128d sum15, dif15, sum37, dif37;
        __m128d u0, u1, u2, u3;

        /* Twiddle powers w^1 .. w^7. */
        w1 = _mm_load_pd(&w[2 * j]);
        w2 = ZMUL(w1, w1);
        w3 = ZMUL(w1, w2);
        w4 = ZMUL(w2, w2);
        w5 = ZMUL(w2, w3);
        w6 = ZMUL(w3, w3);
        w7 = ZMUL(w3, w4);

        /* Even-numbered inputs. */
        x0 = _mm_load_pd(in);
        x4 = _mm_load_pd(in + 4 * step);
        dif04 = _mm_sub_pd(x0, x4);
        sum04 = _mm_add_pd(x0, x4);
        x2 = _mm_load_pd(in + 2 * step);
        x6 = _mm_load_pd(in + 6 * step);
        sum26 = _mm_add_pd(x2, x6);
        /* (re, im) -> (im, -re), i.e. -i * (x2 - x6). */
        rot26 = _mm_xor_pd(_mm_sub_pd(x2, x6), neg_re);
        rot26 = _mm_shuffle_pd(rot26, rot26, 1);

        u0 = _mm_add_pd(sum04, sum26);
        u1 = _mm_sub_pd(sum04, sum26);

        /* Odd-numbered inputs. */
        x1 = _mm_load_pd(in + step);
        x5 = _mm_load_pd(in + 5 * step);
        dif15 = _mm_sub_pd(x1, x5);
        sum15 = _mm_add_pd(x1, x5);
        x3 = _mm_load_pd(in + 3 * step);
        x7 = _mm_load_pd(in + 7 * step);
        sum37 = _mm_add_pd(x3, x7);
        dif37 = _mm_sub_pd(x3, x7);

        u2 = _mm_add_pd(sum15, sum37);
        /* -i * (sum15 - sum37) */
        u3 = _mm_xor_pd(_mm_sub_pd(sum15, sum37), neg_re);
        u3 = _mm_shuffle_pd(u3, u3, 1);

        /* Outputs 0, 4, 2, 6 (stored in this order). */
        _mm_store_pd(out, _mm_add_pd(u0, u2));
        _mm_store_pd(out + 8, ZMUL(w4, _mm_sub_pd(u0, u2)));
        _mm_store_pd(out + 4, ZMUL(w2, _mm_add_pd(u1, u3)));
        _mm_store_pd(out + 12, ZMUL(w6, _mm_sub_pd(u1, u3)));

        /* dif04 +/- (dif15 - dif37) / sqrt(2) */
        u1 = _mm_mul_pd(c81, _mm_sub_pd(dif15, dif37));
        u0 = _mm_add_pd(dif04, u1);
        u1 = _mm_sub_pd(dif04, u1);
        /* rot26 +/- (-i) * (dif15 + dif37) / sqrt(2) */
        u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(dif15, dif37)), neg_re);
        u3 = _mm_shuffle_pd(u3, u3, 1);
        u2 = _mm_add_pd(rot26, u3);
        u3 = _mm_sub_pd(rot26, u3);

        /* Outputs 1, 5, 3, 7 (stored in this order). */
        _mm_store_pd(out + 2, ZMUL(w1, _mm_add_pd(u0, u2)));
        _mm_store_pd(out + 10, ZMUL(w5, _mm_add_pd(u1, u3)));
        _mm_store_pd(out + 6, ZMUL(w3, _mm_sub_pd(u1, u3)));
        _mm_store_pd(out + 14, ZMUL(w7, _mm_sub_pd(u0, u2)));
    }
    return 0;
}
Beispiel #10
0
/*
 * fft4b_ - radix-4 butterfly pass with *m values per twiddle factor (SSE2).
 *
 * Complex values are interleaved (re, im) double pairs moved with aligned
 * 128-bit loads/stores, so a, b and w must be 16-byte aligned.
 *
 *   a  input: 4 groups of (*m * *l) complex values; element (i, j) of group
 *      k starts at a[2 * (i + j * *m + k * *m * *l)].
 *   b  output: for each j, 4 runs of *m complex values; result r of element
 *      (i, j) goes to b[2 * i + 8 * j * *m + 2 * r * *m].
 *   w  twiddles: w[2*j], w[2*j+1] is the base factor for index j (j >= 1).
 *   m  inner loop length.
 *   l  outer loop length.
 *
 * Always returns 0.
 */
int fft4b_(double *a, double *b, double *w, int *m, int *l)
{
    /* XOR mask that flips the sign of the low (real) lane only. */
    const __m128d neg_re = _mm_set_sd(-0.0);
    const int inner = *m;
    const int outer = *l;
    const int in_step = 2 * (inner * outer);    /* doubles between input groups */
    const int out_step = 2 * inner;             /* doubles between output runs */
    const double *in;
    double *out;
    __m128d x0, x1, x2, x3;
    __m128d sum02, dif02, sum13, rot13;
    __m128d w1, w2, w3;
    int i, j;

    /* j == 0: results are stored without any twiddle multiplication. */
    for (i = 0; i < inner; i++) {
        in = a + 2 * i;
        out = b + 2 * i;

        x0 = _mm_load_pd(in);
        x2 = _mm_load_pd(in + 2 * in_step);
        dif02 = _mm_sub_pd(x0, x2);
        sum02 = _mm_add_pd(x0, x2);
        x1 = _mm_load_pd(in + in_step);
        x3 = _mm_load_pd(in + 3 * in_step);
        sum13 = _mm_add_pd(x1, x3);
        /* (re, im) -> (im, -re), i.e. -i * (x1 - x3). */
        rot13 = _mm_xor_pd(_mm_sub_pd(x1, x3), neg_re);
        rot13 = _mm_shuffle_pd(rot13, rot13, 1);

        _mm_store_pd(out, _mm_add_pd(sum02, sum13));
        _mm_store_pd(out + 2 * out_step, _mm_sub_pd(sum02, sum13));
        _mm_store_pd(out + out_step, _mm_add_pd(dif02, rot13));
        _mm_store_pd(out + 3 * out_step, _mm_sub_pd(dif02, rot13));
    }

    /* j >= 1: same butterfly, outputs 1..3 scaled by w1, w2 = w1^2, w3 = w1^3. */
    for (j = 1; j < outer; j++) {
        w1 = _mm_load_pd(&w[2 * j]);
        w2 = ZMUL(w1, w1);
        w3 = ZMUL(w1, w2);

        for (i = 0; i < inner; i++) {
            in = a + 2 * (i + j * inner);
            out = b + 2 * i + 8 * (j * inner);

            x0 = _mm_load_pd(in);
            x2 = _mm_load_pd(in + 2 * in_step);
            dif02 = _mm_sub_pd(x0, x2);
            sum02 = _mm_add_pd(x0, x2);
            x1 = _mm_load_pd(in + in_step);
            x3 = _mm_load_pd(in + 3 * in_step);
            sum13 = _mm_add_pd(x1, x3);
            rot13 = _mm_xor_pd(_mm_sub_pd(x1, x3), neg_re);
            rot13 = _mm_shuffle_pd(rot13, rot13, 1);

            _mm_store_pd(out, _mm_add_pd(sum02, sum13));
            _mm_store_pd(out + 2 * out_step, ZMUL(w2, _mm_sub_pd(sum02, sum13)));
            _mm_store_pd(out + out_step, ZMUL(w1, _mm_add_pd(dif02, rot13)));
            _mm_store_pd(out + 3 * out_step, ZMUL(w3, _mm_sub_pd(dif02, rot13)));
        }
    }
    return 0;
}
Beispiel #11
0
/*
 * fft5a_ - radix-5 butterfly pass over *l complex columns (SSE2).
 *
 * Complex values are interleaved (re, im) double pairs moved with aligned
 * 128-bit loads/stores, so a, b and w must be 16-byte aligned.
 *
 *   a  input: 5 groups of *l complex values; group k of column j starts at
 *      a[2 * (j + k * *l)].
 *   b  output: the 5 results of column j are written to b[10*j .. 10*j+9].
 *   w  twiddles: w[2*j], w[2*j+1] is the base factor w1 of column j; w2..w4
 *      are built from it with ZMUL (complex multiply).
 *   l  number of columns.
 *
 * All five inputs of a column are loaded before any output is stored.
 * Always returns 0.
 */
int fft5a_(double *a, double *b, double *w, int *l)
{
    /* Butterfly constants, broadcast to both lanes.  They were function-level
       statics re-assigned on every call; locals remove that shared write
       (a data race when the kernel runs on several threads at once). */
    const __m128d c51 = _mm_set1_pd(0.95105651629515357);  /* sin(2*pi/5) */
    const __m128d c52 = _mm_set1_pd(0.61803398874989485);  /* (sqrt(5)-1)/2 */
    const __m128d c53 = _mm_set1_pd(0.55901699437494742);  /* sqrt(5)/4 */
    const __m128d c54 = _mm_set1_pd(0.25);
    /* XOR mask that flips the sign of the low (real) lane only. */
    const __m128d neg_re = _mm_set_sd(-0.0);
    const int n = *l;
    const int step = 2 * n;     /* doubles between consecutive input groups */
    int j;

    for (j = 0; j < n; j++) {
        const double *in = a + 2 * j;
        double *out = b + 10 * j;
        __m128d w1, w2, w3, w4;
        __m128d x0, x1, x2, x3, x4;
        __m128d sum14, dif14, sum23, dif23;
        __m128d total, spread, centre, p, q, rot_p, rot_q;

        /* Twiddle powers w^1 .. w^4. */
        w1 = _mm_load_pd(&w[2 * j]);
        w2 = ZMUL(w1, w1);
        w3 = ZMUL(w1, w2);
        w4 = ZMUL(w2, w2);

        x1 = _mm_load_pd(in + step);
        x4 = _mm_load_pd(in + 4 * step);
        sum14 = _mm_add_pd(x1, x4);
        dif14 = _mm_mul_pd(c51, _mm_sub_pd(x1, x4));
        x2 = _mm_load_pd(in + 2 * step);
        x3 = _mm_load_pd(in + 3 * step);
        dif23 = _mm_mul_pd(c51, _mm_sub_pd(x2, x3));
        sum23 = _mm_add_pd(x2, x3);

        total = _mm_add_pd(sum14, sum23);
        spread = _mm_mul_pd(c53, _mm_sub_pd(sum14, sum23));
        x0 = _mm_load_pd(in);
        centre = _mm_sub_pd(x0, _mm_mul_pd(c54, total));
        p = _mm_add_pd(centre, spread);
        q = _mm_sub_pd(centre, spread);

        /* rot_p = -i * (dif14 + c52 * dif23): negate re, then swap lanes. */
        rot_p = _mm_xor_pd(_mm_add_pd(dif14, _mm_mul_pd(c52, dif23)), neg_re);
        rot_p = _mm_shuffle_pd(rot_p, rot_p, 1);
        /* rot_q = i * (dif23 - c52 * dif14): swap lanes, then negate re. */
        rot_q = _mm_sub_pd(dif23, _mm_mul_pd(c52, dif14));
        rot_q = _mm_xor_pd(_mm_shuffle_pd(rot_q, rot_q, 1), neg_re);

        _mm_store_pd(out, _mm_add_pd(x0, total));
        _mm_store_pd(out + 2, ZMUL(w1, _mm_add_pd(p, rot_p)));
        _mm_store_pd(out + 4, ZMUL(w2, _mm_add_pd(q, rot_q)));
        _mm_store_pd(out + 6, ZMUL(w3, _mm_sub_pd(q, rot_q)));
        _mm_store_pd(out + 8, ZMUL(w4, _mm_sub_pd(p, rot_p)));
    }
    return 0;
}
Beispiel #12
0
/*
 * fft3b_ - radix-3 butterfly pass with *m values per twiddle factor (SSE2).
 *
 * Complex values are interleaved (re, im) double pairs moved with aligned
 * 128-bit loads/stores, so a, b and w must be 16-byte aligned.
 *
 *   a  input: 3 groups of (*m * *l) complex values; element (i, j) of group
 *      k starts at a[2 * (i + j * *m + k * *m * *l)].
 *   b  output: for each j, 3 runs of *m complex values; result r of element
 *      (i, j) goes to b[2 * i + 6 * j * *m + 2 * r * *m].
 *   w  twiddles: w[2*j], w[2*j+1] is the base factor for index j (j >= 1).
 *   m  inner loop length.
 *   l  outer loop length.
 *
 * Always returns 0.
 */
int fft3b_(double *a, double *b, double *w, int *m, int *l)
{
    /* Butterfly constants, broadcast to both lanes.  They were function-level
       statics re-assigned on every call; locals remove that shared write
       (a data race when the kernel runs on several threads at once). */
    const __m128d c31 = _mm_set1_pd(0.86602540378443865);  /* sqrt(3)/2 */
    const __m128d c32 = _mm_set1_pd(0.5);
    /* XOR mask that flips the sign of the low (real) lane only. */
    const __m128d neg_re = _mm_set_sd(-0.0);
    const int inner = *m;
    const int outer = *l;
    const int in_step = 2 * (inner * outer);    /* doubles between input groups */
    const int out_step = 2 * inner;             /* doubles between output runs */
    const double *in;
    double *out;
    __m128d x0, x1, x2;
    __m128d sum12, rot12, rest;
    __m128d w1, w2;
    int i, j;

    /* j == 0: results are stored without any twiddle multiplication. */
    for (i = 0; i < inner; i++) {
        in = a + 2 * i;
        out = b + 2 * i;

        x1 = _mm_load_pd(in + in_step);
        x2 = _mm_load_pd(in + 2 * in_step);
        sum12 = _mm_add_pd(x1, x2);
        /* c31 * (-i) * (x1 - x2): negate re, swap lanes, scale. */
        rot12 = _mm_xor_pd(_mm_sub_pd(x1, x2), neg_re);
        rot12 = _mm_mul_pd(c31, _mm_shuffle_pd(rot12, rot12, 1));
        x0 = _mm_load_pd(in);
        rest = _mm_sub_pd(x0, _mm_mul_pd(c32, sum12));

        _mm_store_pd(out, _mm_add_pd(x0, sum12));
        _mm_store_pd(out + out_step, _mm_add_pd(rest, rot12));
        _mm_store_pd(out + 2 * out_step, _mm_sub_pd(rest, rot12));
    }

    /* j >= 1: same butterfly, outputs 1 and 2 scaled by w1 and w2 = w1^2. */
    for (j = 1; j < outer; j++) {
        w1 = _mm_load_pd(&w[2 * j]);
        w2 = ZMUL(w1, w1);

        for (i = 0; i < inner; i++) {
            in = a + 2 * (i + j * inner);
            out = b + 2 * i + 6 * (j * inner);

            x1 = _mm_load_pd(in + in_step);
            x2 = _mm_load_pd(in + 2 * in_step);
            sum12 = _mm_add_pd(x1, x2);
            rot12 = _mm_xor_pd(_mm_sub_pd(x1, x2), neg_re);
            rot12 = _mm_mul_pd(c31, _mm_shuffle_pd(rot12, rot12, 1));
            x0 = _mm_load_pd(in);
            rest = _mm_sub_pd(x0, _mm_mul_pd(c32, sum12));

            _mm_store_pd(out, _mm_add_pd(x0, sum12));
            _mm_store_pd(out + out_step, ZMUL(w1, _mm_add_pd(rest, rot12)));
            _mm_store_pd(out + 2 * out_step, ZMUL(w2, _mm_sub_pd(rest, rot12)));
        }
    }
    return 0;
}
Beispiel #13
0
// =============================================================================
//
// sse2_vChirpData
// version by: Alex Kan - SSE2 mods (haddsum removal) BH
//   http://tbp.berkeley.edu/~alexkan/seti/
//
// Multiplies every complex sample of cx_DataArray by (c + i*s) and writes the
// product to cx_ChirpDataArray, where c and s are the cosine/sine
// approximations computed below from the per-sample angle
//   (chirp_rate * 0.5 / sample_rate^2) * i^2      (i = sample index).
// The main loop handles four samples per iteration; any remaining samples are
// passed to v_ChirpData.
//
// Parameters:
//   cx_DataArray       input samples (read with aligned 16-byte loads)
//   cx_ChirpDataArray  output samples (written with aligned streaming stores)
//   chirp_rate_ind     0 means "no chirp": the input is copied unchanged
//   chirp_rate         chirp rate used to build the per-sample angle
//   ul_NumDataPoints   number of complex samples
//   sample_rate        sample rate used to build the per-sample angle
//
// Returns 0 on every path.
//
// NOTE(review): the pointer casts and the stride of 4 samples per 8 floats
// assume sah_complex is two consecutive floats (re, im) -- confirm against
// its declaration.
//
int sse2_ChirpData_ak(
  sah_complex * cx_DataArray,
  sah_complex * cx_ChirpDataArray,
  int chirp_rate_ind,
  double chirp_rate,
  int  ul_NumDataPoints,
  double sample_rate
) {
  int i;

  // chirp index 0: plain copy (the FLOP counter below is not touched here)
  if (chirp_rate_ind == 0) {
    memcpy(cx_ChirpDataArray, cx_DataArray,  (int)ul_NumDataPoints * sizeof(sah_complex)  );
    return 0;
  }

  int vEnd;  
  // srate is only used to pick the sign of roundVal; rate holds the same
  // value broadcast to both lanes
  double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
  __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate));
  // adding and then subtracting this value rounds a double to an integer
  // (presumably TWO_TO_52 is 2^52 -- verify against its definition)
  __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);

  // main vectorised loop
  vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);   // largest multiple of 4 <= count
  for (i = 0; i < vEnd; i += 4) {
    const float *data = (const float *) (cx_DataArray + i);
    float *chirped = (float *) (cx_ChirpDataArray + i);
    __m128d di = _mm_set1_pd(i);
    __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di);   // (i, i+1)
    __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di);   // (i+2, i+3)
    __m128d x1, y1;

    __m128 d1, d2;
    __m128 cd1, cd2;
    __m128 td1, td2;
    __m128 x;
    __m128 y;
    __m128 s;
    __m128 c;
    __m128 m;

    // load the signal to be chirped
    prefetchnta((const void *)( data+32 ));
    d1 = _mm_load_ps(data);      // samples i, i+1
    d2 = _mm_load_ps(data+4);    // samples i+2, i+3

    // calculate the input angle: rate * index^2, in double precision
    a1 = _mm_mul_pd(a1, a1);
    a2 = _mm_mul_pd(a2, a2);
    a1 = _mm_mul_pd(a1, rate);
    a2 = _mm_mul_pd(a2, rate);

    // reduce the angle to the range (-0.5, 0.5)
    // (x1/y1 become the angle rounded to an integer, which is then removed)
    x1 = _mm_add_pd(a1, roundVal);
    y1 = _mm_add_pd(a2, roundVal);
    x1 = _mm_sub_pd(x1, roundVal);
    y1 = _mm_sub_pd(y1, roundVal);
    a1 = _mm_sub_pd(a1, x1);
    a2 = _mm_sub_pd(a2, y1);

    // convert pair of packed double into packed single
    // x = (angle[i], angle[i+1], angle[i+2], angle[i+3])
    x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));

    // square to the range [0, 0.25)
    y = _mm_mul_ps(x, x);

    // perform the initial polynomial approximations
    // s = x * (((SS4*y + SS3)*y + SS2)*y + SS1)
    // c = ((CC3*y + CC2)*y + CC1)*y + ONE
    s = _mm_mul_ps(y, SS4);
    c = _mm_mul_ps(y, CC3);            
    s = _mm_add_ps(s, SS3);
    c = _mm_add_ps(c, CC2);
    s = _mm_mul_ps(s, y);
    c = _mm_mul_ps(c, y);
    s = _mm_add_ps(s, SS2);
    c = _mm_add_ps(c, CC1);
    s = _mm_mul_ps(s, y);
    c = _mm_mul_ps(c, y);
    s = _mm_add_ps(s, SS1);
    s = _mm_mul_ps(s, x);
    c = _mm_add_ps(c, ONE);

    // perform first angle doubling
    // x = c^2 - s^2, y = 2*s*c
    x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
    y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

    // calculate scaling factor to correct the magnitude
    //      m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO));
    //      m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO));
    // NOTE(review): vec_recip2 is defined elsewhere; presumably a reciprocal
    // of x^2 + y^2 -- confirm.
    m = vec_recip2(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)));

    // perform second angle doubling
    c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
    s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

    // correct the magnitude (final sine / cosine approximations)
    c = _mm_mul_ps(c, m);
    s = _mm_mul_ps(s, m);

/*    c1 c2 c3 c4
    s1 s2 s3 s4

    R1 i1 R2 I2    R3 i3 R4 i4

    R1 * c1  +  (i1 * s1 * -1)
    i1 * c1  +   R1 * s1  
    R2 * c2  +  (i2 * s2 * -1)
    i2 * c2  +   R2 * s2
*/

    x = d1;
    y = d2;
    // 0xB1 swaps re and im inside each complex: (R1 i1 R2 i2) -> (i1 R1 i2 R2)
    x = _mm_shuffle_ps(x, x, 0xB1);
    y = _mm_shuffle_ps(y, y, 0xB1);
    // NOTE(review): R_NEG is defined elsewhere; for the formula above it must
    // negate the even (real-position) lanes only, i.e. (-1, 1, -1, 1) -- confirm.
    x = _mm_mul_ps(x, R_NEG);
    y = _mm_mul_ps(y, R_NEG);
    cd1 = _mm_shuffle_ps(c, c, 0x50);  // 01 01 00 00 => lanes 0,0,1,1: cosines of samples i, i+1, each duplicated
    cd2 = _mm_shuffle_ps(c, c, 0xfa);  // 11 11 10 10 => lanes 2,2,3,3: cosines of samples i+2, i+3, each duplicated
    td1 = _mm_shuffle_ps(s, s, 0x50);  // same lane selection for the sines
    td2 = _mm_shuffle_ps(s, s, 0xfa);

    cd1 = _mm_mul_ps(cd1, d1);   // (R*c, i*c) per sample
    cd2 = _mm_mul_ps(cd2, d2);
    td1 = _mm_mul_ps(td1, x);    // (-i*s, R*s) per sample, given R_NEG as noted
    td2 = _mm_mul_ps(td2, y);

    cd1 = _mm_add_ps(cd1, td1);  // (R*c - i*s, i*c + R*s)
    cd2 = _mm_add_ps(cd2, td2);

    // store chirped values
    _mm_stream_ps(chirped+0, cd1);
    _mm_stream_ps(chirped+4, cd2);
  }
  // fence after the streaming stores issued in the loop
  _mm_sfence();

  if( i < ul_NumDataPoints) {
    // use original routine to finish up any tailings (max stride-1 elements)
    // NOTE(review): the tail is passed as a fresh array of
    // ul_NumDataPoints-i samples with no sample offset; if v_ChirpData
    // derives the angle from an index starting at 0, the tail samples get the
    // wrong phase. Only reachable when ul_NumDataPoints is not a multiple of
    // 4 -- verify against v_ChirpData.
    v_ChirpData(cx_DataArray+i, cx_ChirpDataArray+i
      , chirp_rate_ind, chirp_rate, ul_NumDataPoints-i, sample_rate);
  }
  analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;

  return 0;
}
Beispiel #14
0
void CalcGravity(int sp, PSpot* allSpot,int* length)
{
	__m128d force1 = _mm_set1_pd(0);
	__m128d force2 = _mm_set1_pd(0);
	
	PSpot* spotSp = &allSpot[sp];

	for(int i=0;i<sp;i++)
	{
		__m128d diff1 = _mm_sub_pd(allSpot[i].pos1, spotSp->pos1);
		__m128d diff2 = _mm_sub_sd(allSpot[i].pos2, spotSp->pos2);

		__m128d r = Length(diff1, diff2);

		if (r.m128d_f64[0]*2 < (spotSp->qmass + allSpot[i].qmass))
		{
			if (allSpot[i].mass > spotSp->mass)
			{
				allSpot[i].heading1 = 
					_mm_add_pd(
						allSpot[i].heading1,
						_mm_mul_pd(
							_mm_sub_pd(spotSp->heading1, allSpot[i].heading1),
							_mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				allSpot[i].heading2 = 
					_mm_add_sd(
						allSpot[i].heading2,
						_mm_mul_sd(
							_mm_sub_sd(spotSp->heading2, allSpot[i].heading2),
							_mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				allSpot[i].mass += spotSp->mass;
				allSpot[i].qmass = pow(allSpot[i].mass, 0.33333);
				spotSp->mass = 0;

				(*length)--;
				PSpot temp;
				temp = allSpot[sp];
				allSpot[sp] = allSpot[*length];
				allSpot[*length] = temp;
				return;
			}
			else
			{
				spotSp->heading1 = 
					_mm_add_pd(
						spotSp->heading1,
						_mm_mul_pd(
							_mm_sub_pd(allSpot[i].heading1, spotSp->heading1),
							_mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				spotSp->heading2 = 
					_mm_add_sd(
						spotSp->heading2,
						_mm_mul_sd(
							_mm_sub_sd(allSpot[i].heading2, spotSp->heading2),
							_mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				spotSp->mass += allSpot[i].mass;
				spotSp->qmass = pow(spotSp->mass, 0.33333);
				allSpot[i].mass = 0;

				(*length)--;
				PSpot temp;
				temp = allSpot[i];
				allSpot[i] = allSpot[*length];
				allSpot[*length] = temp;
				return;
			}
		}

		//float f = (G * spotSp->mass * allSpot[i].mass) / (r.m128d_f64[0] * r.m128d_f64[0] * r.m128d_f64[0]);

		__m128d r1 = r;
		r1.m128d_f64[1] = G;
		__m128d r2 = r;
		r2.m128d_f64[1] = spotSp->mass;
		__m128d r3 = r;
		r3.m128d_f64[1] = allSpot[i].mass;
		__m128d r4 = _mm_mul_pd(_mm_mul_pd(r1, r2), r3);
		__m128d r5 = _mm_shuffle_pd(r4, r4, 3);
		r4 = _mm_shuffle_pd(r4, r4, 0);
		__m128d r6 = _mm_div_pd(r5, r4);

		force1 = _mm_add_pd(force1,_mm_mul_pd(diff1, r6));
		force2 = _mm_add_sd(force2,_mm_mul_sd(diff2, r6));
	}

	for(int i=sp+1;i<*length;i++)
	{
		__m128d diff1 = _mm_sub_pd(allSpot[i].pos1, spotSp->pos1);
		__m128d diff2 = _mm_sub_sd(allSpot[i].pos2, spotSp->pos2);

		__m128d r = Length(diff1, diff2);

		if (r.m128d_f64[0]*2 < (spotSp->qmass + allSpot[i].qmass))
		{
			if (allSpot[i].mass > spotSp->mass)
			{
				allSpot[i].heading1 = 
					_mm_add_pd(
						allSpot[i].heading1,
						_mm_mul_pd(
							_mm_sub_pd(spotSp->heading1, allSpot[i].heading1),
							_mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				allSpot[i].heading2 = 
					_mm_add_sd(
						allSpot[i].heading2,
						_mm_mul_sd(
							_mm_sub_sd(spotSp->heading2, allSpot[i].heading2),
							_mm_set1_pd(spotSp->mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				allSpot[i].mass += spotSp->mass;
				allSpot[i].qmass = pow(allSpot[i].mass, 0.33333);
				spotSp->mass = 0;

				(*length)--;
				PSpot temp;
				temp = allSpot[sp];
				allSpot[sp] = allSpot[*length];
				allSpot[*length] = temp;
				return;
			}
			else
			{
				spotSp->heading1 = 
					_mm_add_pd(
						spotSp->heading1,
						_mm_mul_pd(
							_mm_sub_pd(allSpot[i].heading1, spotSp->heading1),
							_mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				spotSp->heading2 = 
					_mm_add_sd(
						spotSp->heading2,
						_mm_mul_sd(
							_mm_sub_sd(allSpot[i].heading2, spotSp->heading2),
							_mm_set1_pd(allSpot[i].mass / (spotSp->mass + allSpot[i].mass))
						)
					);

				spotSp->mass += allSpot[i].mass;
				spotSp->qmass = pow(spotSp->mass, 0.33333);
				allSpot[i].mass = 0;

				(*length)--;
				PSpot temp;
				temp = allSpot[i];
				allSpot[i] = allSpot[*length];
				allSpot[*length] = temp;
				return;
			}
		}

		//float f = (G * spotSp->mass * allSpot[i].mass) / (r.m128d_f64[0] * r.m128d_f64[0] * r.m128d_f64[0]);

		__m128d r1 = r;
		r1.m128d_f64[1] = G;
		__m128d r2 = r;
		r2.m128d_f64[1] = spotSp->mass;
		__m128d r3 = r;
		r3.m128d_f64[1] = allSpot[i].mass;
		__m128d r4 = _mm_mul_pd(_mm_mul_pd(r1, r2), r3);
		__m128d r5 = _mm_shuffle_pd(r4, r4, 3);
		r4 = _mm_shuffle_pd(r4, r4, 0);
		__m128d r6 = _mm_div_pd(r5, r4);

		force1 = _mm_add_pd(force1,_mm_mul_pd(diff1, r6));
		force2 = _mm_add_sd(force2,_mm_mul_sd(diff2, r6));
	}

	force1 = _mm_div_pd(force1, _mm_set1_pd(spotSp->mass));
	force2 = _mm_div_sd(force2, _mm_set1_pd(spotSp->mass));

	__m128d forcef = Length(force1, force2);

	if (forcef.m128d_f64[0] > 0)
	{
		double gate = 0.001f;
		double step = gate / forcef.m128d_f64[0];

		if (spotSp->process + step < 1)
		{
			spotSp->process += step;
		}
		else
		{
			step = 1 - spotSp->process;
			spotSp->process = 1;
		}

		__m128d stepd = _mm_set1_pd(step);

		spotSp->heading1 = _mm_add_pd(spotSp->heading1,_mm_mul_pd(force1,stepd));
		spotSp->heading2 = _mm_add_sd(spotSp->heading2,_mm_mul_sd(force2,stepd));

		spotSp->pos1 = _mm_add_pd(spotSp->pos1, _mm_mul_pd(spotSp->heading1,stepd));
		spotSp->pos2 = _mm_add_sd(spotSp->pos2, _mm_mul_sd(spotSp->heading2,stepd));
	}
	else
	{
		spotSp->pos1 = _mm_add_pd(spotSp->pos1, spotSp->heading1);
		spotSp->pos2 = _mm_add_sd(spotSp->pos2, spotSp->heading2);
		spotSp->process = 1;
	}
}
void exchlaplacecoeffData_6(unsigned int slot) {
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((!neighbor_isValid[1][0])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S902, S905, S908, S907, S901, S910, S904, S903, S906, S909, S900 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+4558)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+4626)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+4558)] = 0.000000e+00;
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/6.400000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(6.400000e+01);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<63); i1 += 4) {
/* yPos = ((((i1-1)/6.400000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<66); i1 += 1) {
yPos = ((((i1-1)/6.400000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+9114)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+9182)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+9114)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+2)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+70)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+2)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+31894)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+31962)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+31894)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+36450)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+36518)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+36450)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posBegin[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<63); i1 += 4) {
/* xPos = posBegin[0]; */
__m128d vec0 = _mm_load1_pd((&posBegin[0]));
__m128d vec0_2 = _mm_load1_pd((&posBegin[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<66); i1 += 1) {
xPos = posBegin[0];
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+18226)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+18294)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+18226)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+13670)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+13738)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+13670)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+27338)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+27406)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+27338)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+22782)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+22850)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+22782)] = 0.000000e+00;
}
}
}
}
}
if ((!neighbor_isValid[1][1])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S920, S914, S917, S911, S913, S916, S919, S921, S918, S912, S915 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+4622)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+4690)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+4622)] = 0.000000e+00;
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+31958)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+32026)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+31958)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+13734)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+13802)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+13734)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+66)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+134)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+66)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+22846)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+22914)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+22846)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/6.400000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(6.400000e+01);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<63); i1 += 4) {
/* yPos = ((((i1-1)/6.400000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<66); i1 += 1) {
yPos = ((((i1-1)/6.400000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+18290)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+18358)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+18290)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+27402)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+27470)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+27402)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+36514)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+36582)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+36514)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i1 = 1;
for (; (i1<=64); i1 += 2) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+9178)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[((i1*68)+9246)] = 0.000000e+00;
}
for (; (i1<=65); i1 += 1) {
fieldData_LaplaceCoeff_6_p1[((i1*68)+9178)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posEnd[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<63); i1 += 4) {
/* xPos = posEnd[0]; */
__m128d vec0 = _mm_load1_pd((&posEnd[0]));
__m128d vec0_2 = _mm_load1_pd((&posEnd[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<66); i1 += 1) {
xPos = posEnd[0];
}
}
}
}
}
if ((!neighbor_isValid[1][2])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S929, S923, S926, S931, S925, S928, S922, S930, S924, S927, S932 */
{
{
{
{
{
{
{
{
{
{
{
int i2 = 2;
for (; (i2<=65); i2 += 2) {
xPos = ((((i2-2)/6.400000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/6.400000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=66); i2 += 1) {
xPos = ((((i2-2)/6.400000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+18292)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+18293)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+18292)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+36516)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+36517)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+36516)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+31960)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+31961)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+31960)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+68)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+69)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+68)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+9180)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+9181)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+9180)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+22848)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+22849)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+22848)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+27404)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+27405)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+27404)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+4624)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+4625)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+4624)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+13736)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+13737)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+13736)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=65); i2 += 2) {
yPos = posBegin[1];
yPos = posBegin[1];
}
for (; (i2<=66); i2 += 1) {
yPos = posBegin[1];
}
}
}
}
}
if ((!neighbor_isValid[1][3])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S941, S935, S938, S943, S940, S934, S937, S942, S936, S939, S933 */
{
{
{
{
{
{
{
{
{
{
{
int i2 = 2;
for (; (i2<=65); i2 += 2) {
xPos = ((((i2-2)/6.400000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/6.400000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=66); i2 += 1) {
xPos = ((((i2-2)/6.400000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+36312)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+36313)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+36312)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+22644)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+22645)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+22644)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+13532)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+13533)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+13532)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+40868)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+40869)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+40868)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+18088)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+18089)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+18088)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+4420)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+4421)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+4420)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+31756)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+31757)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+31756)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+27200)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+27201)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+27200)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=65); i2 += 2) {
yPos = posEnd[1];
yPos = posEnd[1];
}
for (; (i2<=66); i2 += 1) {
yPos = posEnd[1];
}
}
}
{
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][0]);
int i2 = 2;
for (; (i2<=65); i2 += 2) {
fieldData_LaplaceCoeff_6_p1[(i2+8976)] = 0.000000e+00;
fieldData_LaplaceCoeff_6_p1[(i2+8977)] = 0.000000e+00;
}
for (; (i2<=66); i2 += 1) {
fieldData_LaplaceCoeff_6_p1[(i2+8976)] = 0.000000e+00;
}
}
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
/* Statements in this Scop: S944 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][(i3*4556)]);
double* buffer_Send_1_p1 = (&buffer_Send[1][(i3*65)]);
int i4 = 1;
for (; (i4<=64); i4 += 2) {
buffer_Send_1_p1[(i4-1)] = fieldData_LaplaceCoeff_6_p1[((i4*68)+66)];
buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_6_p1[((i4*68)+134)];
}
for (; (i4<=65); i4 += 1) {
buffer_Send_1_p1[(i4-1)] = fieldData_LaplaceCoeff_6_p1[((i4*68)+66)];
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Isend(buffer_Send[1], 585, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Irecv(buffer_Recv[0], 585, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
/* Statements in this Scop: S945 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][(i3*4556)]);
double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i3*65)]);
int i4 = 3;
for (; (i4<=66); i4 += 2) {
fieldData_LaplaceCoeff_6_p1[((i4*68)-134)] = buffer_Recv_0_p1[(i4-3)];
fieldData_LaplaceCoeff_6_p1[((i4*68)-66)] = buffer_Recv_0_p1[(i4-2)];
}
for (; (i4<=67); i4 += 1) {
fieldData_LaplaceCoeff_6_p1[((i4*68)-134)] = buffer_Recv_0_p1[(i4-3)];
}
}
}
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Isend(&fieldData_LaplaceCoeff[6][4422], 1, mpiDatatype_9_65_4556, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Irecv(&fieldData_LaplaceCoeff[6][70], 1, mpiDatatype_9_65_4556, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
/* Statements in this Scop: S946 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][(i3*4556)]);
double* buffer_Send_0_p1 = (&buffer_Send[0][(i3*67)]);
int i4 = 0;
for (; (i4<=65); i4 += 2) {
buffer_Send_0_p1[i4] = fieldData_LaplaceCoeff_6_p1[((i4*68)+3)];
buffer_Send_0_p1[(i4+1)] = fieldData_LaplaceCoeff_6_p1[((i4*68)+71)];
}
for (; (i4<=66); i4 += 1) {
buffer_Send_0_p1[i4] = fieldData_LaplaceCoeff_6_p1[((i4*68)+3)];
}
}
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
/* Statements in this Scop: S947 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][(i3*4556)]);
double* buffer_Send_1_p1 = (&buffer_Send[1][(i3*67)]);
int i4 = 0;
for (; (i4<=65); i4 += 2) {
buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_6_p1[((i4*68)+65)];
buffer_Send_1_p1[(i4+1)] = fieldData_LaplaceCoeff_6_p1[((i4*68)+133)];
}
for (; (i4<=66); i4 += 1) {
buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_6_p1[((i4*68)+65)];
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Isend(buffer_Send[0], 603, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]);
reqOutstanding_Send[0] = true;
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Isend(buffer_Send[1], 603, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Irecv(buffer_Recv[0], 603, MPI_DOUBLE, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Irecv(buffer_Recv[1], 603, MPI_DOUBLE, neighbor_remoteRank[1][1], ((unsigned int)(neighbor_fragCommId[1][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
if (reqOutstanding_Recv[1]) {
waitForMPIReq(&mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
/* Statements in this Scop: S948 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][(i3*4556)]);
double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i3*67)]);
int i4 = 1;
for (; (i4<=66); i4 += 2) {
fieldData_LaplaceCoeff_6_p1[((i4*68)-67)] = buffer_Recv_0_p1[(i4-1)];
fieldData_LaplaceCoeff_6_p1[((i4*68)+1)] = buffer_Recv_0_p1[i4];
}
for (; (i4<=67); i4 += 1) {
fieldData_LaplaceCoeff_6_p1[((i4*68)-67)] = buffer_Recv_0_p1[(i4-1)];
}
}
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
/* Statements in this Scop: S949 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* buffer_Recv_1_p1 = (&buffer_Recv[1][(i3*67)]);
double* fieldData_LaplaceCoeff_6_p1 = (&fieldData_LaplaceCoeff[6][(i3*4556)]);
int i4 = 67;
for (; (i4<=132); i4 += 2) {
fieldData_LaplaceCoeff_6_p1[((i4*68)-4489)] = buffer_Recv_1_p1[(i4-67)];
fieldData_LaplaceCoeff_6_p1[((i4*68)-4421)] = buffer_Recv_1_p1[(i4-66)];
}
for (; (i4<=133); i4 += 1) {
fieldData_LaplaceCoeff_6_p1[((i4*68)-4489)] = buffer_Recv_1_p1[(i4-67)];
}
}
}
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[0]) {
waitForMPIReq(&mpiRequest_Send[0]);
reqOutstanding_Send[0] = false;
}
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Isend(&fieldData_LaplaceCoeff[6][137], 1, mpiDatatype_9_67_4556, neighbor_remoteRank[1][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]);
reqOutstanding_Send[2] = true;
}
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Isend(&fieldData_LaplaceCoeff[6][4353], 1, mpiDatatype_9_67_4556, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Irecv(&fieldData_LaplaceCoeff[6][1], 1, mpiDatatype_9_67_4556, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Irecv(&fieldData_LaplaceCoeff[6][4489], 1, mpiDatatype_9_67_4556, neighbor_remoteRank[1][3], ((unsigned int)(neighbor_fragCommId[1][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
if (reqOutstanding_Recv[3]) {
waitForMPIReq(&mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[2]) {
waitForMPIReq(&mpiRequest_Send[2]);
reqOutstanding_Send[2] = false;
}
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
}
Beispiel #16
0
/*
 * fft8b_ - one radix-8 butterfly pass over interleaved complex doubles (SSE2).
 *
 * Every __m128d holds a single complex value as (re, im): low lane = real
 * part, high lane = imaginary part.  With M = *m and L = *l, for each
 * j in [0, L) and i in [0, M) the routine
 *
 *   - loads the eight inputs   a[2 * (i + j*M + k*M*L)],  k = 0..7,
 *   - combines them with an 8-point butterfly, and
 *   - stores output k to       b[2 * (i + k*M + 8*j*M)],  k = 0..7.
 *
 * For j >= 1, output k (k = 1..7) is additionally multiplied by w_k, where
 * w_1 is the complex value at w[2*j] and w_2..w_7 are built from it with
 * ZMUL (defined elsewhere in this file; the scalar reference this code was
 * derived from uses an ordinary complex product there).  The j == 0 pass is
 * done by a separate loop that skips those multiplies and never reads w.
 *
 * Parameters (scalars are passed by pointer):
 *   a  input array; read with _mm_load_pd, so it must be 16-byte aligned
 *   b  output array; written with _mm_store_pd, so it must be 16-byte aligned
 *   w  twiddle table of (re, im) pairs, also read with _mm_load_pd
 *   m  inner extent M
 *   l  outer extent L
 *
 * Returns 0 unconditionally.
 *
 * Within one (i, j) step all eight loads are issued before the first store.
 */
int fft8b_(double *a, double *b, double *w, int *m, int *l)
{
    /*
     * 1/sqrt(2) in both lanes.  This is an automatic constant rather than a
     * function-level static that gets rewritten on every call, so concurrent
     * calls do not share any writable state.
     */
    const __m128d c81 = _mm_set1_pd(0.70710678118654752);

    /*
     * Sign bit set in the low lane only.  XOR-ing with it negates the real
     * lane; followed by a lane swap this turns (re, im) into (im, -re),
     * i.e. multiplies the complex value by -i.
     */
    const __m128d neg_lo = _mm_set_sd(-0.0);

    /* Loop-invariant extents and strides, measured in doubles. */
    const int mm = *m;
    const int ll = *l;
    const int in_stride = mm * ll * 2;   /* distance between inputs k and k+1  */
    const int out_stride = mm * 2;       /* distance between outputs k and k+1 */

    int i, j;
    __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, u0, u1, u2, u3;
    __m128d w1, w2, w3, w4, w5, w6, w7;

    /* ---- j == 0: plain butterflies, no twiddle multiplication ---------- */
    for (i = 0; i < mm; i++) {
        const double *in = a + 2 * i;
        double *out = b + 2 * i;

        /* Inputs 0/4 and 2/6: sums, differences, and -i * (in2 - in6). */
        t0 = _mm_load_pd(in);
        t2 = _mm_load_pd(in + 4 * in_stride);
        t1 = _mm_sub_pd(t0, t2);                      /* in0 - in4          */
        t0 = _mm_add_pd(t0, t2);                      /* in0 + in4          */
        t3 = _mm_load_pd(in + 2 * in_stride);
        t4 = _mm_load_pd(in + 6 * in_stride);
        t2 = _mm_add_pd(t3, t4);                      /* in2 + in6          */
        t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), neg_lo);
        t3 = _mm_shuffle_pd(t3, t3, 1);               /* -i * (in2 - in6)   */

        u0 = _mm_add_pd(t0, t2);
        u1 = _mm_sub_pd(t0, t2);

        /* Inputs 1/5 and 3/7: sums and differences. */
        t4 = _mm_load_pd(in + in_stride);
        t6 = _mm_load_pd(in + 5 * in_stride);
        t5 = _mm_sub_pd(t4, t6);                      /* in1 - in5          */
        t4 = _mm_add_pd(t4, t6);                      /* in1 + in5          */
        t7 = _mm_load_pd(in + 3 * in_stride);
        t8 = _mm_load_pd(in + 7 * in_stride);
        t6 = _mm_add_pd(t7, t8);                      /* in3 + in7          */
        t7 = _mm_sub_pd(t7, t8);                      /* in3 - in7          */

        u2 = _mm_add_pd(t4, t6);
        u3 = _mm_xor_pd(_mm_sub_pd(t4, t6), neg_lo);
        u3 = _mm_shuffle_pd(u3, u3, 1);               /* -i * (t4 - t6)     */

        /* Even-numbered outputs. */
        _mm_store_pd(out, _mm_add_pd(u0, u2));
        _mm_store_pd(out + 4 * out_stride, _mm_sub_pd(u0, u2));
        _mm_store_pd(out + 2 * out_stride, _mm_add_pd(u1, u3));
        _mm_store_pd(out + 6 * out_stride, _mm_sub_pd(u1, u3));

        /* Odd-numbered outputs: the 1/sqrt(2)-scaled terms enter here. */
        u1 = _mm_mul_pd(c81, _mm_sub_pd(t5, t7));
        u0 = _mm_add_pd(t1, u1);
        u1 = _mm_sub_pd(t1, u1);
        u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(t5, t7)), neg_lo);
        u3 = _mm_shuffle_pd(u3, u3, 1);               /* -i * c81 * (t5+t7) */
        u2 = _mm_add_pd(t3, u3);
        u3 = _mm_sub_pd(t3, u3);

        _mm_store_pd(out + out_stride, _mm_add_pd(u0, u2));
        _mm_store_pd(out + 5 * out_stride, _mm_add_pd(u1, u3));
        _mm_store_pd(out + 3 * out_stride, _mm_sub_pd(u1, u3));
        _mm_store_pd(out + 7 * out_stride, _mm_sub_pd(u0, u2));
    }

    /* ---- j >= 1: same butterflies, outputs 1..7 scaled by w_1..w_7 ------ */
    for (j = 1; j < ll; j++) {
        /* Twiddle for this j and the products derived from it. */
        w1 = _mm_load_pd(w + 2 * j);
        w2 = ZMUL(w1, w1);
        w3 = ZMUL(w1, w2);
        w4 = ZMUL(w2, w2);
        w5 = ZMUL(w2, w3);
        w6 = ZMUL(w3, w3);
        w7 = ZMUL(w3, w4);

        for (i = 0; i < mm; i++) {
            const double *in = a + (2 * i + (j * mm) * 2);
            double *out = b + (2 * i + (j * mm) * 16);

            /* Inputs 0/4 and 2/6. */
            t0 = _mm_load_pd(in);
            t2 = _mm_load_pd(in + 4 * in_stride);
            t1 = _mm_sub_pd(t0, t2);
            t0 = _mm_add_pd(t0, t2);
            t3 = _mm_load_pd(in + 2 * in_stride);
            t4 = _mm_load_pd(in + 6 * in_stride);
            t2 = _mm_add_pd(t3, t4);
            t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), neg_lo);
            t3 = _mm_shuffle_pd(t3, t3, 1);

            u0 = _mm_add_pd(t0, t2);
            u1 = _mm_sub_pd(t0, t2);

            /* Inputs 1/5 and 3/7. */
            t4 = _mm_load_pd(in + in_stride);
            t6 = _mm_load_pd(in + 5 * in_stride);
            t5 = _mm_sub_pd(t4, t6);
            t4 = _mm_add_pd(t4, t6);
            t7 = _mm_load_pd(in + 3 * in_stride);
            t8 = _mm_load_pd(in + 7 * in_stride);
            t6 = _mm_add_pd(t7, t8);
            t7 = _mm_sub_pd(t7, t8);

            u2 = _mm_add_pd(t4, t6);
            u3 = _mm_xor_pd(_mm_sub_pd(t4, t6), neg_lo);
            u3 = _mm_shuffle_pd(u3, u3, 1);

            /* Even-numbered outputs; output 0 needs no twiddle. */
            _mm_store_pd(out, _mm_add_pd(u0, u2));
            _mm_store_pd(out + 4 * out_stride, ZMUL(w4, _mm_sub_pd(u0, u2)));
            _mm_store_pd(out + 2 * out_stride, ZMUL(w2, _mm_add_pd(u1, u3)));
            _mm_store_pd(out + 6 * out_stride, ZMUL(w6, _mm_sub_pd(u1, u3)));

            /* Odd-numbered outputs. */
            u1 = _mm_mul_pd(c81, _mm_sub_pd(t5, t7));
            u0 = _mm_add_pd(t1, u1);
            u1 = _mm_sub_pd(t1, u1);
            u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(t5, t7)), neg_lo);
            u3 = _mm_shuffle_pd(u3, u3, 1);
            u2 = _mm_add_pd(t3, u3);
            u3 = _mm_sub_pd(t3, u3);

            _mm_store_pd(out + out_stride, ZMUL(w1, _mm_add_pd(u0, u2)));
            _mm_store_pd(out + 5 * out_stride, ZMUL(w5, _mm_add_pd(u1, u3)));
            _mm_store_pd(out + 3 * out_stride, ZMUL(w3, _mm_sub_pd(u1, u3)));
            _mm_store_pd(out + 7 * out_stride, ZMUL(w7, _mm_sub_pd(u0, u2)));
        }
    }

    return 0;
}
/*
 * nb_kernel430_ia32_sse2
 *
 * Double-precision SSE2 nonbonded kernel. For every i-entry n in [0,*p_nri)
 * it walks the j atoms jjnr[jindex[n]] .. jjnr[jindex[n+1]-1], two per
 * iteration (one per SIMD lane) plus an optional single-atom tail, and
 * evaluates three pair terms:
 *
 *   - plain Coulomb:           vcoul = facel*q_i*q_j * rinv
 *   - a tabulated term read from GBtab (4 doubles per table point),
 *     scaled by invsqrta[i]*invsqrta[j] and by
 *     gbfactor = -(1/epsilon_r - 1/gb_epsilon_solvent)
 *   - two tabulated terms read from VFtab (8 doubles per table point),
 *     scaled by c6 and c12 taken from vdwparam
 *
 * Outputs that are accumulated into:
 *   vc[gid[n]]    Coulomb energy
 *   gpol[gid[n]]  GBtab energy term (gpol comes from the gmx_gbdata_t in work)
 *   vvdw[gid[n]]  c6/c12 table energy
 *   dvda[i], dvda[j]  -0.5*(vgb + fijGB*r), scaled by invsqrta^2 of the atom
 *   faction / fshift  forces (written through GMX_MM_DECREMENT_* and
 *                     gmx_mm_update_iforce_1atom_pd; their exact semantics
 *                     are defined elsewhere - TODO confirm)
 *
 * *outeriter receives nri; *inneriter receives the last value of nj1
 * (jindex[nri] when nri > 0, otherwise 0).
 *
 * Parameters that this body never reads: p_krf, p_crf, p_nthreads, count, mtx.
 * The comments in the body assume the tables are 16-byte aligned because
 * _mm_load_pd is used on them.
 */
void nb_kernel430_ia32_sse2(int *           p_nri,
                              int *           iinr,
                              int *           jindex,
                              int *           jjnr,
                              int *           shift,
                              double *         shiftvec,
                              double *         fshift,
                              int *           gid,
                              double *         pos,
                              double *         faction,
                              double *         charge,
                              double *         p_facel,
                              double *         p_krf,
                              double *         p_crf,
                              double *         vc,
                              int *           type,
                              int *           p_ntype,
                              double *         vdwparam,
                              double *         vvdw,
                              double *         p_tabscale,
                              double *         VFtab,
                              double *         invsqrta,
                              double *         dvda,
                              double *         p_gbtabscale,
                              double *         GBtab,
                              int *           p_nthreads,
                              int *           count,
                              void *          mtx,
                              int *           outeriter,
                              int *           inneriter,
                              double *         work)
{
  /* NOTE(review): nthreads, offset, rinvsix, dvdaj and the constant `zero`
   * are declared but never used below; rinvsq is assigned but never read. */
  int           nri,ntype,nthreads;
  int           n,ii,is3,ii3,k,nj0,nj1,ggid;
  double        shX,shY,shZ;
	int			  offset,nti;
  int           jnrA,jnrB;
  int           j3A,j3B;
	int           tjA,tjB;
	gmx_gbdata_t *gbdata;
	double *      gpol;
    
	__m128d  iq,qq,jq,isai;
	__m128d  ix,iy,iz;
	__m128d  jx,jy,jz;
	__m128d  dx,dy,dz;
	__m128d  vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
	__m128d  fix,fiy,fiz,tx,ty,tz,rsq;
	__m128d  rinv,isaj,isaprod;
	__m128d  vcoul,fscal,gbscale,c6,c12;
	__m128d  rinvsq,r,rtab;
	__m128d  eps,Y,F,G,H;
  __m128d  VV,FF,Fp;
	__m128d  vgb,fijGB,dvdatmp;
	__m128d  rinvsix,vvdw6,vvdw12,vvdwtmp;
	__m128d  facel,gbtabscale,dvdaj;
  __m128d  fijD,fijR;
  __m128d  xmm1,tabscale,eps2;
	__m128i  n0, nnn;
    
	
	const __m128d neg        = _mm_set1_pd(-1.0);
	const __m128d zero       = _mm_set1_pd(0.0);
	const __m128d minushalf  = _mm_set1_pd(-0.5);
	const __m128d two        = _mm_set1_pd(2.0);
	
	/* work is reinterpreted as the GB data block; only gpol, epsilon_r and
	 * gb_epsilon_solvent are read from it here. */
	gbdata     = (gmx_gbdata_t *)work;
	gpol       = gbdata->gpol;
    
	nri        = *p_nri;
	ntype      = *p_ntype;
    
  /* Loop-invariant scalars broadcast to both lanes. */
  gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
  gbtabscale = _mm_load1_pd(p_gbtabscale);  
  facel      = _mm_load1_pd(p_facel);
  tabscale   = _mm_load1_pd(p_tabscale);
  
  /* Defensive initialisation: nj1 is reported through *inneriter even when
   * nri == 0, and the vectors below are written by macros before being read. */
  nj1         = 0;
  jnrA = jnrB = 0;
  j3A = j3B   = 0;
  jx          = _mm_setzero_pd();
  jy          = _mm_setzero_pd();
  jz          = _mm_setzero_pd();
  c6          = _mm_setzero_pd();
  c12         = _mm_setzero_pd();
	
	/* Outer loop: one iteration per i-entry of the neighbour list. */
	for(n=0;n<nri;n++)
	{
    /* Shift vector and j-range for this entry. */
    is3              = 3*shift[n];     
    shX              = shiftvec[is3];  
    shY              = shiftvec[is3+1];
    shZ              = shiftvec[is3+2];
    nj0              = jindex[n];      
    nj1              = jindex[n+1];    
    ii               = iinr[n];        
    ii3              = 3*ii;           
		
		/* Shifted i position, broadcast to both lanes. */
		ix               = _mm_set1_pd(shX+pos[ii3+0]);
		iy               = _mm_set1_pd(shY+pos[ii3+1]);
		iz               = _mm_set1_pd(shZ+pos[ii3+2]);
    
		/* i charge pre-multiplied by facel. */
		iq               = _mm_load1_pd(charge+ii);
		iq               = _mm_mul_pd(iq,facel);
    
		isai             = _mm_load1_pd(invsqrta+ii);
    
		/* Row offset into vdwparam: two doubles (c6,c12) per type pair. */
		nti              = 2*ntype*type[ii];
		
		/* Per-i accumulators. */
		vctot            = _mm_setzero_pd();
		vvdwtot          = _mm_setzero_pd();
		vgbtot           = _mm_setzero_pd();
		dvdasum          = _mm_setzero_pd();
		fix              = _mm_setzero_pd();
		fiy              = _mm_setzero_pd();
		fiz              = _mm_setzero_pd();
        
		/* Main inner loop: two j atoms (A in lane 0, B in lane 1) per pass. */
		for(k=nj0;k<nj1-1; k+=2)
		{
			jnrA    = jjnr[k];
			jnrB    = jjnr[k+1];
			
			j3A     = jnrA * 3;
			j3B     = jnrB * 3;
            
      /* Presumably gathers x/y/z of atoms A and B into jx,jy,jz - macro is
       * defined elsewhere. */
      GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
            
			dx           = _mm_sub_pd(ix,jx);
			dy           = _mm_sub_pd(iy,jy);
			dz           = _mm_sub_pd(iz,jz);
            
      /* Helper names suggest rsq = dx^2+dy^2+dz^2 and rinv = 1/sqrt(rsq);
       * both are defined outside this block. */
      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
      
      rinv         = gmx_mm_invsqrt_pd(rsq);
 			rinvsq       = _mm_mul_pd(rinv,rinv);
      
			/***********************************/
			/* INTERACTION SECTION STARTS HERE */
			/***********************************/
			GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
			GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
            
      /* Lennard-Jones */
      tjA          = nti+2*type[jnrA];
			tjB          = nti+2*type[jnrB];
      
      GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
			
			/* Coulomb: vcoul = qq*rinv, and fscal temporarily holds qq*rinv^2. */
			isaprod      = _mm_mul_pd(isai,isaj);
			qq           = _mm_mul_pd(iq,jq);            
			vcoul        = _mm_mul_pd(qq,rinv);
			fscal        = _mm_mul_pd(vcoul,rinv);                                 
      vctot        = _mm_add_pd(vctot,vcoul);
      
      /* Polarization interaction */
			qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
			gbscale      = _mm_mul_pd(isaprod,gbtabscale);
      
 			/* Calculate GB table index */
			r            = _mm_mul_pd(rsq,rinv);
			rtab         = _mm_mul_pd(r,gbscale);
			
			/* n0 = truncated table point, eps = fractional part,
			 * nnn = 4*n0 (four doubles per GB table point). */
			n0		     = _mm_cvttpd_epi32(rtab);
			eps	     	 = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
			nnn		     = _mm_slli_epi32(n0,2);
			
      /* the tables are 16-byte aligned, so we can use _mm_load_pd */			
      /* Each load fetches two consecutive table doubles for one atom; the
       * transpose regroups them so every vector holds one coefficient for
       * both atoms. */
      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
      F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
      GMX_MM_TRANSPOSE2_PD(Y,F);
      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
      H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
      GMX_MM_TRANSPOSE2_PD(G,H);
      
      /* Cubic interpolation:
       *   value      = Y + eps*(F + G*eps + H*eps^2)
       *   derivative = F + 2*G*eps + 3*H*eps^2
       * After these lines Y holds the value and F the derivative. */
      G       = _mm_mul_pd(G,eps);
      H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
      F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
      Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
      F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
      vgb     = _mm_mul_pd(Y, qq);           
      fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
      
      /* dvdatmp = -0.5*(vgb + fijGB*r) */
      dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
      
      vgbtot  = _mm_add_pd(vgbtot, vgb);
      
      /* The unscaled value goes to the i accumulator; the j contribution is
       * scaled by isaj^2 before being added to dvda[jnrA]/dvda[jnrB]. */
      dvdasum = _mm_add_pd(dvdasum, dvdatmp);
      dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
      
      GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
			
      /* Calculate VDW table index */
			/* nnn = 8*n0: eight doubles per VFtab point (two blocks of four). */
			rtab    = _mm_mul_pd(r,tabscale);
			n0      = _mm_cvttpd_epi32(rtab);
			eps     = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
			eps2    = _mm_mul_pd(eps,eps);
			nnn     = _mm_slli_epi32(n0,3);
			
      /* Dispersion */
      /* First table block (offsets +0..+3), scaled by c6. */
      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
      F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
      GMX_MM_TRANSPOSE2_PD(Y,F);
      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
      H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
      GMX_MM_TRANSPOSE2_PD(G,H);
      
      /* Same cubic interpolation as above: VV = value, FF = derivative. */
      G       = _mm_mul_pd(G,eps);
			H       = _mm_mul_pd(H,eps2);
			Fp      = _mm_add_pd(F,G);
			Fp      = _mm_add_pd(Fp,H);
			VV      = _mm_mul_pd(Fp,eps);
			VV      = _mm_add_pd(Y,VV);
			xmm1    = _mm_mul_pd(two,H);
			FF      = _mm_add_pd(Fp,G);
			FF      = _mm_add_pd(FF,xmm1);
			
			vvdw6   = _mm_mul_pd(c6,VV);
			fijD    = _mm_mul_pd(c6,FF);
      
      /* Repulsion */
      /* Second table block (offsets +4..+7), scaled by c12. The original
       * label repeated "Dispersion"; the fijR/c12 naming indicates this is
       * the repulsive part. */
      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
      F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
      GMX_MM_TRANSPOSE2_PD(Y,F);
      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
      H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
      GMX_MM_TRANSPOSE2_PD(G,H);
      
      G       = _mm_mul_pd(G,eps);
			H       = _mm_mul_pd(H,eps2);
			Fp      = _mm_add_pd(F,G);
			Fp      = _mm_add_pd(Fp,H);
			VV      = _mm_mul_pd(Fp,eps);
			VV      = _mm_add_pd(Y,VV);
			xmm1    = _mm_mul_pd(two,H);
			FF      = _mm_add_pd(Fp,G);
			FF      = _mm_add_pd(FF,xmm1);
			
			vvdw12  = _mm_mul_pd(c12,VV);
			fijR    = _mm_mul_pd(c12,FF);
			
			vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
			vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);
      
			/* Combine all terms into the scalar force factor:
			 * fscal = -((fijD+fijR)*tabscale + fijGB - qq*rinv^2) * rinv */
			xmm1    = _mm_add_pd(fijD,fijR);
			xmm1    = _mm_mul_pd(xmm1,tabscale);
			xmm1    = _mm_add_pd(xmm1,fijGB);
			xmm1    = _mm_sub_pd(xmm1,fscal);
			fscal   = _mm_mul_pd(xmm1,neg);
			fscal   = _mm_mul_pd(fscal,rinv);
      
      /***********************************/
			/*  INTERACTION SECTION ENDS HERE  */
			/***********************************/
      
      /* Calculate temporary vectorial force */
      tx           = _mm_mul_pd(fscal,dx);
      ty           = _mm_mul_pd(fscal,dy);
      tz           = _mm_mul_pd(fscal,dz);
      
      /* Increment i atom force */
      fix          = _mm_add_pd(fix,tx);
      fiy          = _mm_add_pd(fiy,ty);
      fiz          = _mm_add_pd(fiz,tz);
      
      /* Store j forces back */
			GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
		}
		
		/* In double precision, offset can only be either 0 or 1 */
		/* i.e. at most one j atom is left over after the paired loop. It is
		 * processed in lane 0 with the scalar (_sd) intrinsics. */
		if(k<nj1)
		{
			jnrA    = jjnr[k];
			j3A     = jnrA * 3;
      
      GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
      
			/* _sd intrinsics operate on lane 0 and copy lane 1 from the first
			 * operand, so lane 1 of dx/dy/dz still holds the i coordinates. */
			dx           = _mm_sub_sd(ix,jx);
			dy           = _mm_sub_sd(iy,jy);
			dz           = _mm_sub_sd(iz,jz);
            
      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
      
      rinv         = gmx_mm_invsqrt_pd(rsq);
 			rinvsq       = _mm_mul_sd(rinv,rinv);
      
      /* The reason for zeroing these variables here is to fix bug 585.
       * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
       * and r1=0, but it should be r1=a[1]. 
       * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
       * To work around it, we zero these variables and use _mm_add_pd (**) instead.
       * Note that the only variables that get affected are the energies, since
       * the total sum needs to be correct.
       */
      vgb          = _mm_setzero_pd();
      vcoul        = _mm_setzero_pd();
      dvdatmp      = _mm_setzero_pd();
      vvdw6        = _mm_setzero_pd();
      vvdw12       = _mm_setzero_pd();

      /***********************************/
			/* INTERACTION SECTION STARTS HERE */
			/***********************************/
			GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
			GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
            
      /* Lennard-Jones */
      tjA          = nti+2*type[jnrA];
      
      GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
			
			/* Note the operand order: jq comes first so that lane 1 of qq is
			 * taken from jq rather than from the broadcast iq. */
			isaprod      = _mm_mul_sd(isai,isaj);
			qq           = _mm_mul_sd(jq,iq);            
			vcoul        = _mm_mul_sd(qq,rinv);
			fscal        = _mm_mul_sd(vcoul,rinv);                                 
      vctot        = _mm_add_pd(vctot,vcoul); /* (**) */
      
      /* Polarization interaction */
			qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
			gbscale      = _mm_mul_sd(isaprod,gbtabscale);
      
 			/* Calculate GB table index */
			r            = _mm_mul_sd(rsq,rinv);
			rtab         = _mm_mul_sd(r,gbscale);
			
			n0		     = _mm_cvttpd_epi32(rtab);
			eps	     	 = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
			nnn		     = _mm_slli_epi32(n0,2);
			
      /* the tables are 16-byte aligned, so we can use _mm_load_pd */			
      /* Only index 0 of nnn is used; the partner vector of each transpose
       * is zero so the unused lane carries zeros. */
      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
      F            = _mm_setzero_pd();
      GMX_MM_TRANSPOSE2_PD(Y,F);
      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
      H            = _mm_setzero_pd();
      GMX_MM_TRANSPOSE2_PD(G,H);
      
      /* Same cubic interpolation as in the paired loop, lane 0 only. */
      G       = _mm_mul_sd(G,eps);
      H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
      F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
      Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
      F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
      vgb     = _mm_mul_sd(Y, qq);           
      fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
      
      dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
      
      vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */
      
      dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
      dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
      
      GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
			
      /* Calculate VDW table index */
			rtab    = _mm_mul_sd(r,tabscale);
			n0      = _mm_cvttpd_epi32(rtab);
			eps     = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
			eps2    = _mm_mul_sd(eps,eps);
			nnn     = _mm_slli_epi32(n0,3);
			
      /* Dispersion */
      /* First table block (offsets +0..+3), scaled by c6. */
      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
      F            = _mm_setzero_pd();
      GMX_MM_TRANSPOSE2_PD(Y,F);
      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
      H            = _mm_setzero_pd();
      GMX_MM_TRANSPOSE2_PD(G,H);
      
      G       = _mm_mul_sd(G,eps);
			H       = _mm_mul_sd(H,eps2);
			Fp      = _mm_add_sd(F,G);
			Fp      = _mm_add_sd(Fp,H);
			VV      = _mm_mul_sd(Fp,eps);
			VV      = _mm_add_sd(Y,VV);
			xmm1    = _mm_mul_sd(two,H);
			FF      = _mm_add_sd(Fp,G);
			FF      = _mm_add_sd(FF,xmm1);
			
			vvdw6   = _mm_mul_sd(c6,VV);
			fijD    = _mm_mul_sd(c6,FF);
      
      /* Repulsion */
      /* Second table block (offsets +4..+7), scaled by c12; the original
       * label repeated "Dispersion". */
      Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
      F            = _mm_setzero_pd();
      GMX_MM_TRANSPOSE2_PD(Y,F);
      G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
      H            = _mm_setzero_pd();
      GMX_MM_TRANSPOSE2_PD(G,H);
      
      G       = _mm_mul_sd(G,eps);
			H       = _mm_mul_sd(H,eps2);
			Fp      = _mm_add_sd(F,G);
			Fp      = _mm_add_sd(Fp,H);
			VV      = _mm_mul_sd(Fp,eps);
			VV      = _mm_add_sd(Y,VV);
			xmm1    = _mm_mul_sd(two,H);
			FF      = _mm_add_sd(Fp,G);
			FF      = _mm_add_sd(FF,xmm1);
			
			vvdw12  = _mm_mul_sd(c12,VV);
			fijR    = _mm_mul_sd(c12,FF);
			
			vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
			vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp); /* (**) */
            
			/* fscal = -((fijD+fijR)*tabscale + fijGB - qq*rinv^2) * rinv */
			xmm1    = _mm_add_sd(fijD,fijR);
			xmm1    = _mm_mul_sd(xmm1,tabscale);
			xmm1    = _mm_add_sd(xmm1,fijGB);
			xmm1    = _mm_sub_sd(xmm1,fscal);
			fscal   = _mm_mul_sd(xmm1,neg);
			fscal   = _mm_mul_sd(fscal,rinv);

      /***********************************/
			/*  INTERACTION SECTION ENDS HERE  */
			/***********************************/
      
      /* Calculate temporary vectorial force */
      tx           = _mm_mul_sd(fscal,dx);
      ty           = _mm_mul_sd(fscal,dy);
      tz           = _mm_mul_sd(fscal,dz);
      
      /* Increment i atom force */
      /* _mm_add_sd leaves lane 1 of fix/fiy/fiz (the lane-B sum from the
       * paired loop) untouched. */
      fix          = _mm_add_sd(fix,tx);
      fiy          = _mm_add_sd(fiy,ty);
      fiz          = _mm_add_sd(fiz,tz);
      
      /* Store j forces back */
			GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
		}
		
    /* Scale the i-side dvda sum by isai^2, then flush all per-i accumulators.
     * The gmx_mm_update_* helpers are defined elsewhere; presumably they
     * reduce both lanes and add into the destination - TODO confirm. */
    dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
    gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
    
    ggid     = gid[n];         
    
    gmx_mm_update_1pot_pd(vctot,vc+ggid);
    gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
    gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
    gmx_mm_update_1pot_pd(vvdwtot,vvdw+ggid);
    
	}
  
	/* Iteration counters reported back to the caller; nj1 is the end index
	 * of the last processed j-range. */
	*outeriter   = nri;            
  *inneriter   = nj1; 	
}
void exchlaplacecoeff_gmrfData_5(unsigned int slot) {
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((!neighbor_isValid[0][0])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S1306, S1309, S1300, S1308, S1302, S1305, S1310, S1304, S1307, S1301, S1303 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+6302)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+6338)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+6302)] = 0.000000e+00;
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+3782)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+3818)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+3782)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+7562)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+7598)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+7562)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+38)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2522)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2558)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2522)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posBegin[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<31); i1 += 4) {
/* xPos = posBegin[0]; */
__m128d vec0 = _mm_load1_pd((&posBegin[0]));
__m128d vec0_2 = _mm_load1_pd((&posBegin[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<34); i1 += 1) {
xPos = posBegin[0];
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+8822)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+8858)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+8822)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+1262)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+1298)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+1262)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/3.200000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(3.200000e+01);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<31); i1 += 4) {
/* yPos = ((((i1-1)/3.200000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<34); i1 += 1) {
yPos = ((((i1-1)/3.200000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+5042)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+5078)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+5042)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+10082)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+10118)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+10082)] = 0.000000e+00;
}
}
}
}
}
if ((!neighbor_isValid[0][1])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S1312, S1320, S1314, S1317, S1311, S1319, S1313, S1316, S1321, S1315, S1318 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+1294)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+1330)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+1294)] = 0.000000e+00;
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+5074)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+5110)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+5074)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+8854)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+8890)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+8854)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2554)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2590)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+2554)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/3.200000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(3.200000e+01);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<31); i1 += 4) {
/* yPos = ((((i1-1)/3.200000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<34); i1 += 1) {
yPos = ((((i1-1)/3.200000e+01)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+34)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+70)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+34)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+3814)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+3850)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+3814)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+7594)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+7630)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+7594)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+6334)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+6370)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+6334)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posEnd[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<31); i1 += 4) {
/* xPos = posEnd[0]; */
__m128d vec0 = _mm_load1_pd((&posEnd[0]));
__m128d vec0_2 = _mm_load1_pd((&posEnd[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<34); i1 += 1) {
xPos = posEnd[0];
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i1 = 1;
for (; (i1<=32); i1 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+10114)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+10150)] = 0.000000e+00;
}
for (; (i1<=33); i1 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i1*36)+10114)] = 0.000000e+00;
}
}
}
}
}
if ((!neighbor_isValid[0][2])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S1327, S1332, S1326, S1329, S1323, S1322, S1331, S1325, S1328, S1330, S1324 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+5076)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+5077)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+5076)] = 0.000000e+00;
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+8856)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+8857)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+8856)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+6336)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+6337)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+6336)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+7596)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+7597)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+7596)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=33); i2 += 2) {
xPos = ((((i2-2)/3.200000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/3.200000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=34); i2 += 1) {
xPos = ((((i2-2)/3.200000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+36)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+37)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+36)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+2556)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+2557)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+2556)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+10116)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+10117)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+10116)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+3816)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+3817)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+3816)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+1296)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+1297)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+1296)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=33); i2 += 2) {
yPos = posBegin[1];
yPos = posBegin[1];
}
for (; (i2<=34); i2 += 1) {
yPos = posBegin[1];
}
}
}
}
}
if ((!neighbor_isValid[0][3])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S1338, S1341, S1335, S1340, S1343, S1337, S1334, S1333, S1342, S1336, S1339 */
{
{
{
{
{
{
{
{
{
{
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+7488)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+7489)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+7488)] = 0.000000e+00;
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+2448)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+2449)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+2448)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+11268)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+11269)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+11268)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=33); i2 += 2) {
yPos = posEnd[1];
yPos = posEnd[1];
}
for (; (i2<=34); i2 += 1) {
yPos = posEnd[1];
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+10008)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+10009)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+10008)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+6228)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+6229)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+6228)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+8748)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+8749)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+8748)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=33); i2 += 2) {
xPos = ((((i2-2)/3.200000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/3.200000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=34); i2 += 1) {
xPos = ((((i2-2)/3.200000e+01)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+3708)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+3709)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+3708)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+1188)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+1189)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+1188)] = 0.000000e+00;
}
}
}
{
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][0]);
int i2 = 2;
for (; (i2<=33); i2 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+4968)] = 0.000000e+00;
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+4969)] = 0.000000e+00;
}
for (; (i2<=34); i2 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[(i2+4968)] = 0.000000e+00;
}
}
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
/* Statements in this Scop: S1344 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][(i3*1260)]);
double* buffer_Send_1_p1 = (&buffer_Send[1][(i3*33)]);
int i4 = 1;
for (; (i4<=32); i4 += 2) {
buffer_Send_1_p1[(i4-1)] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+34)];
buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+70)];
}
for (; (i4<=33); i4 += 1) {
buffer_Send_1_p1[(i4-1)] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+34)];
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
MPI_Isend(buffer_Send[1], 297, MPI_DOUBLE, neighbor_remoteRank[0][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
MPI_Irecv(buffer_Recv[0], 297, MPI_DOUBLE, neighbor_remoteRank[0][0], ((unsigned int)(neighbor_fragCommId[0][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
/* Statements in this Scop: S1345 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][(i3*1260)]);
double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i3*33)]);
int i4 = 3;
for (; (i4<=34); i4 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-70)] = buffer_Recv_0_p1[(i4-3)];
fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-34)] = buffer_Recv_0_p1[(i4-2)];
}
for (; (i4<=35); i4 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-70)] = buffer_Recv_0_p1[(i4-3)];
}
}
}
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) {
MPI_Isend(&fieldData_LaplaceCoeff_GMRF[5][1190], 1, mpiDatatype_9_33_1260, neighbor_remoteRank[0][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) {
MPI_Irecv(&fieldData_LaplaceCoeff_GMRF[5][38], 1, mpiDatatype_9_33_1260, neighbor_remoteRank[0][2], ((unsigned int)(neighbor_fragCommId[0][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
/* Statements in this Scop: S1346 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][(i3*1260)]);
double* buffer_Send_0_p1 = (&buffer_Send[0][(i3*35)]);
int i4 = 0;
for (; (i4<=33); i4 += 2) {
buffer_Send_0_p1[i4] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+3)];
buffer_Send_0_p1[(i4+1)] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+39)];
}
for (; (i4<=34); i4 += 1) {
buffer_Send_0_p1[i4] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+3)];
}
}
}
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
/* Statements in this Scop: S1347 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][(i3*1260)]);
double* buffer_Send_1_p1 = (&buffer_Send[1][(i3*35)]);
int i4 = 0;
for (; (i4<=33); i4 += 2) {
buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+33)];
buffer_Send_1_p1[(i4+1)] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+69)];
}
for (; (i4<=34); i4 += 1) {
buffer_Send_1_p1[i4] = fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+33)];
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
MPI_Isend(buffer_Send[0], 315, MPI_DOUBLE, neighbor_remoteRank[0][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]);
reqOutstanding_Send[0] = true;
}
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
MPI_Isend(buffer_Send[1], 315, MPI_DOUBLE, neighbor_remoteRank[0][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
MPI_Irecv(buffer_Recv[0], 315, MPI_DOUBLE, neighbor_remoteRank[0][0], ((unsigned int)(neighbor_fragCommId[0][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
MPI_Irecv(buffer_Recv[1], 315, MPI_DOUBLE, neighbor_remoteRank[0][1], ((unsigned int)(neighbor_fragCommId[0][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
if (reqOutstanding_Recv[1]) {
waitForMPIReq(&mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
/* Statements in this Scop: S1348 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][(i3*1260)]);
double* buffer_Recv_0_p1 = (&buffer_Recv[0][(i3*35)]);
int i4 = 1;
for (; (i4<=34); i4 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-35)] = buffer_Recv_0_p1[(i4-1)];
fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)+1)] = buffer_Recv_0_p1[i4];
}
for (; (i4<=35); i4 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-35)] = buffer_Recv_0_p1[(i4-1)];
}
}
}
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
/* Statements in this Scop: S1349 */
for (int i3 = 0; (i3<=8); i3 += 1) {
double* fieldData_LaplaceCoeff_GMRF_5_p1 = (&fieldData_LaplaceCoeff_GMRF[5][(i3*1260)]);
double* buffer_Recv_1_p1 = (&buffer_Recv[1][(i3*35)]);
int i4 = 35;
for (; (i4<=68); i4 += 2) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-1225)] = buffer_Recv_1_p1[(i4-35)];
fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-1189)] = buffer_Recv_1_p1[(i4-34)];
}
for (; (i4<=69); i4 += 1) {
fieldData_LaplaceCoeff_GMRF_5_p1[((i4*36)-1225)] = buffer_Recv_1_p1[(i4-35)];
}
}
}
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Send[0]) {
waitForMPIReq(&mpiRequest_Send[0]);
reqOutstanding_Send[0] = false;
}
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) {
MPI_Isend(&fieldData_LaplaceCoeff_GMRF[5][73], 1, mpiDatatype_9_35_1260, neighbor_remoteRank[0][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]);
reqOutstanding_Send[2] = true;
}
if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) {
MPI_Isend(&fieldData_LaplaceCoeff_GMRF[5][1153], 1, mpiDatatype_9_35_1260, neighbor_remoteRank[0][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) {
MPI_Irecv(&fieldData_LaplaceCoeff_GMRF[5][1], 1, mpiDatatype_9_35_1260, neighbor_remoteRank[0][2], ((unsigned int)(neighbor_fragCommId[0][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) {
MPI_Irecv(&fieldData_LaplaceCoeff_GMRF[5][1225], 1, mpiDatatype_9_35_1260, neighbor_remoteRank[0][3], ((unsigned int)(neighbor_fragCommId[0][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
if (reqOutstanding_Recv[3]) {
waitForMPIReq(&mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Send[2]) {
waitForMPIReq(&mpiRequest_Send[2]);
reqOutstanding_Send[2] = false;
}
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
}
Beispiel #19
0
// In-place left multiplication U = su2_i*U, where su2_i is the su2 matrix
// described by ( s0 , s1 ) embedded in suN at position su2_index.
//
// NC == 3 : SSE2 path. U is viewed as nine __m128d (one complex number each);
//           su2_index%3 selects which pair of three-entry groups is mixed.
// NC == 2 : SSE2 path. The first two entries are computed, the last two are
//           rebuilt from them with SSE2_CONJ / SSE_FLIP.
// otherwise the generic shortened_su2_multiply routine does the work.
void
su2_rotate( GLU_complex U[ NCNC ] ,
	    const GLU_complex s0 ,
	    const GLU_complex s1 ,
	    const size_t su2_index )
{
#if NC == 3
  __m128d *u = (__m128d*)U ;
  register const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ;
  register const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ;
  register __m128d top , bot , new_top , new_bot ;
  const size_t which = su2_index%3 ;
  size_t col ;
  if( which == 2 ) {
    // mixes entries 0..2 with entries 6..8, the MULCONJ combination
    // lands in the first group
    for( col = 0 ; col < 3 ; col++ ) {
      top = u[ col ] ;
      bot = u[ col + 6 ] ;
      new_top = _mm_sub_pd( SSE2_MULCONJ( sm0 , top ) ,
			    SSE2_MULCONJ( sm1 , bot ) ) ;
      new_bot = _mm_add_pd( SSE2_MUL( sm0 , bot ) ,
			    SSE2_MUL( sm1 , top ) ) ;
      u[ col ] = new_top ;
      u[ col + 6 ] = new_bot ;
    }
  } else {
    // which == 0 mixes entries 0..2 with 3..5 ,
    // which == 1 mixes entries 3..5 with 6..8
    __m128d *upper = u + 3 * which ;
    __m128d *lower = upper + 3 ;
    for( col = 0 ; col < 3 ; col++ ) {
      top = upper[ col ] ;
      bot = lower[ col ] ;
      new_top = _mm_add_pd( SSE2_MUL( sm0 , top ) ,
			    SSE2_MUL( sm1 , bot ) ) ;
      new_bot = _mm_sub_pd( SSE2_MULCONJ( sm0 , bot ) ,
			    SSE2_MULCONJ( sm1 , top ) ) ;
      upper[ col ] = new_top ;
      lower[ col ] = new_bot ;
    }
  }
#elif NC == 2
  __m128d *u = (__m128d*)U ;
  register const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ;
  register const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ;
  // statement order matters: entries 2 and 3 are derived from the
  // freshly written entries 1 and 0
  u[ 0 ] = _mm_add_pd( SSE2_MUL( sm0 , u[ 0 ] ) ,
		       SSE2_MUL( sm1 , u[ 2 ] ) ) ;
  u[ 1 ] = _mm_add_pd( SSE2_MUL( sm0 , u[ 1 ] ) ,
		       SSE2_MUL( sm1 , u[ 3 ] ) ) ;
  u[ 2 ] = SSE_FLIP( SSE2_CONJ( u[ 1 ] ) ) ;
  u[ 3 ] = SSE2_CONJ( u[ 0 ] ) ;
#else
  // no SIMD specialisation, defer to the generic su2 multiply
  shortened_su2_multiply( U , s0 , s1 , su2_index ) ;
#endif
  return ;
}
Beispiel #20
0
// Noise value at EPoint, clamped to [0, 1], computed with SSE/FMA4 intrinsics.
//
// EPoint          - point at which the noise is evaluated (components X, Y, Z are read).
// noise_generator - kNoiseGen_Perlin delegates to SolidNoise(); kNoiseGen_RangeCorrected
//                   rescales the table-based sum; any other value adds 0.5 to it.
// Returns the clamped noise value.
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator)
{
    DBL x, y, z;
    DBL *mp;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
    DBL sum;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator==kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
            if (sum < 0.0) sum = 0.0;
            if (sum > 1.0) sum = 1.0;

        return sum;
    }

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (long)z; */
    /* JB fix for the range problem */

    // Pack (x, y) into one register and z into the low lane of another.
    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    // For lanes whose sign bit is set (the value itself is the blend mask),
    // (1.0 - EPSILON) is subtracted before the truncating conversion; other
    // lanes are truncated unchanged.
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    // Offsets of the point from its integer lattice coordinates.
    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    // Lattice indices: shifted by NOISE_MIN* and reduced to 12 bits.
    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    // Hashes for the four (x, y) combinations of the lattice cell.
    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    // RTable entries for the eight combinations of (x, y) hash and iz / iz + 1.
    mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];
    DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    // Broadcast each fractional offset to both lanes.
    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    // Same offsets minus one.
    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    // s = f*f*(3 - 2*f) per axis (_mm_nmacc_pd(a, b, c) evaluates c - a*b).
    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    // t = 1 - s, then the pairwise products (tx*ty, tx*sy) and (sx*ty, sx*sy).
    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    // (y offset, y offset - 1) in one register.
    __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm);

    // NOTE(review): mp_t1 ... sum_p are not referenced directly in this function;
    // presumably they are scratch variables used by the INCRSUMP2 macro (defined
    // elsewhere) - confirm before renaming or removing them.
    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm;
    __m128d int_sum1 = _mm_setzero_pd();

    // Four INCRSUMP2 invocations, each given two table pointers and a weight
    // pair; int_sum1 is passed as the last argument every time (presumably the
    // accumulator - confirm against the macro definition).
    s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz);
    INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz);
    INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz);
    INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1);

    // Add the two lanes together; the total ends up in both lanes.
    int_sum1 = _mm_hadd_pd(int_sum1, int_sum1);

    if(noise_generator==kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        // Low lane becomes sum*r2 + r1*r2, i.e. (sum + 1.05242) * 0.48985582.
        const __m128d r2 = _mm_set_sd(0.48985582);
        const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582);
        int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2);
    }
    else
    {
        int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5));
    }

    // Clamp the low lane to [0, 1] and store it as the result.
    int_sum1 = _mm_min_sd(one, int_sum1);
    int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1);
    _mm_store_sd(&sum, int_sum1);

    return (sum);
}
/*
 * Convolve the transmitted signal with the channel taps in desc (SSE2, two
 * output samples per step) and write the result to the receive buffers.
 *
 * For every receive antenna ii and every i in [0, length-dd), with
 * dd = abs(desc->channel_offset):
 *
 *   rx[ii][i+dd] = path_loss * sum_j sum_l tx[j][i-l] * desc->ch[ii+(j*nb_rx)][l]
 *
 * where the product is complex, tx[j][k] is taken as zero for k < 0 and
 * path_loss = 10^(desc->path_loss_dB/20).
 *
 * desc         : channel descriptor; path_loss_dB, channel_offset, nb_rx, nb_tx,
 *                channel_length, ch and convolution are used here
 * tx_sig_re/im : per-transmit-antenna real / imaginary input samples
 * rx_sig_re/im : per-receive-antenna real / imaginary output samples
 * length       : number of samples per antenna
 * keep_channel : when zero, random_channel(desc,0) is called before convolving
 *
 * Compared with the previous revision:
 *  - the zero-padding test is done on the index that is actually loaded
 *    (2*i-l), so taps with i < l <= 2*i are no longer dropped, and the pair
 *    straddling index -1 is built as (0, tx[0]);
 *  - when length-dd is odd, the final sample is computed with scalar code
 *    instead of being left unwritten.
 */
void multipath_channel(channel_desc_t *desc,
                       double **tx_sig_re,
                       double **tx_sig_im,
                       double **rx_sig_re,
                       double **rx_sig_im,
                       uint32_t length,
                       uint8_t keep_channel)
{

  int i,ii,j,l;
  int length1, length2, tail;
  __m128d rx_tmp128_re_f,rx_tmp128_im_f,rx_tmp128_re,rx_tmp128_im, rx_tmp128_1,rx_tmp128_2,rx_tmp128_3,rx_tmp128_4,tx128_re,tx128_im,ch128_x,ch128_y,pathloss128;

  double path_loss = pow(10,desc->path_loss_dB/20);
  int dd = abs(desc->channel_offset);

  pathloss128 = _mm_set1_pd(path_loss);

#ifdef DEBUG_CH
  printf("[CHANNEL] keep = %d : path_loss = %g (%f), nb_rx %d, nb_tx %d, dd %d, len %d \n",keep_channel,path_loss,desc->path_loss_dB,desc->nb_rx,desc->nb_tx,dd,desc->channel_length);
#endif

  if (!keep_channel) {
    random_channel(desc,0);
  }

  start_meas(&desc->convolution);

#ifdef DEBUG_CH

  for (l = 0; l<(int)desc->channel_length; l++) {
    printf("%p (%f,%f) ",desc->ch[0],desc->ch[0][l].x,desc->ch[0][l].y);
  }

  printf("\n");
#endif

  // length1 is the largest even sample count not exceeding length-dd;
  // the SIMD loop produces length1 samples, two at a time
  tail = ((int)length-dd)%2;

  if(tail)
    length1 = ((int)length-dd)-1;
  else
    length1 = ((int)length-dd);

  length2 = length1/2;

  for (i=0; i<length2; i++) { // output samples 2*i and 2*i+1
    for (ii=0; ii<desc->nb_rx; ii++) {
      rx_tmp128_re_f = _mm_setzero_pd();
      rx_tmp128_im_f = _mm_setzero_pd();

      for (j=0; j<desc->nb_tx; j++) {
        for (l = 0; l<(int)desc->channel_length; l++) {
          // index of the input sample that feeds the low lane (output 2*i)
          int first = 2*i-l;

          if (first>=0) {
            // low lane: tx[2*i-l], high lane: tx[2*i-l+1]
            tx128_re = _mm_loadu_pd(&tx_sig_re[j][first]);
            tx128_im = _mm_loadu_pd(&tx_sig_im[j][first]);
          } else if (first==-1) {
            // low lane would need tx[-1] (zero); high lane is tx[0]
            tx128_re = _mm_set_pd(tx_sig_re[j][0],0.0);
            tx128_im = _mm_set_pd(tx_sig_im[j][0],0.0);
          } else {
            // both lanes lie before the start of the signal
            tx128_re = _mm_setzero_pd();
            tx128_im = _mm_setzero_pd();
          }

          ch128_x = _mm_set1_pd(desc->ch[ii+(j*desc->nb_rx)][l].x);
          ch128_y = _mm_set1_pd(desc->ch[ii+(j*desc->nb_rx)][l].y);
          // complex multiply-accumulate:
          //  re += tx.re*ch.x - tx.im*ch.y
          //  im += tx.im*ch.x + tx.re*ch.y
          rx_tmp128_1 = _mm_mul_pd(tx128_re,ch128_x);
          rx_tmp128_2 = _mm_mul_pd(tx128_re,ch128_y);
          rx_tmp128_3 = _mm_mul_pd(tx128_im,ch128_x);
          rx_tmp128_4 = _mm_mul_pd(tx128_im,ch128_y);
          rx_tmp128_re = _mm_sub_pd(rx_tmp128_1,rx_tmp128_4);
          rx_tmp128_im = _mm_add_pd(rx_tmp128_2,rx_tmp128_3);
          rx_tmp128_re_f = _mm_add_pd(rx_tmp128_re_f,rx_tmp128_re);
          rx_tmp128_im_f = _mm_add_pd(rx_tmp128_im_f,rx_tmp128_im);
        } //l
      }  // j

      rx_tmp128_re_f = _mm_mul_pd(rx_tmp128_re_f,pathloss128);
      rx_tmp128_im_f = _mm_mul_pd(rx_tmp128_im_f,pathloss128);
      _mm_storeu_pd(&rx_sig_re[ii][2*i+dd],rx_tmp128_re_f); // max index: length-dd -1 + dd = length -1
      _mm_storeu_pd(&rx_sig_im[ii][2*i+dd],rx_tmp128_im_f);

    } // ii
  } // i

  // odd number of output samples: the last one (index length1) is not covered
  // by the pairwise loop, so compute it with the scalar formula
  if (tail && (length1>=0)) {
    i = length1;

    for (ii=0; ii<desc->nb_rx; ii++) {
      double rx_re = 0.0;
      double rx_im = 0.0;

      for (j=0; j<desc->nb_tx; j++) {
        for (l = 0; l<(int)desc->channel_length; l++) {
          if ((i-l)>=0) {
            double tx_re = tx_sig_re[j][i-l];
            double tx_im = tx_sig_im[j][i-l];
            double ch_x = desc->ch[ii+(j*desc->nb_rx)][l].x;
            double ch_y = desc->ch[ii+(j*desc->nb_rx)][l].y;

            rx_re += (tx_re*ch_x) - (tx_im*ch_y);
            rx_im += (tx_im*ch_x) + (tx_re*ch_y);
          }
        } //l
      }  // j

      rx_sig_re[ii][i+dd] = rx_re*path_loss; // index length-1
      rx_sig_im[ii][i+dd] = rx_im*path_loss;
    } // ii
  }

  stop_meas(&desc->convolution);

}
Beispiel #22
0
// Three-component noise vector at EPoint, computed with SSE/FMA4 intrinsics.
//
// result - receives the three output components (two written through *result,
//          the third through result[Z]).
// EPoint - point at which the noise is evaluated (components X, Y, Z are read).
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint)
{
    DBL x, y, z;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /*ix = (int)x; iy = (int)y; iz = (int)z;
    x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/
                /* JB fix for the range problem */

    // Pack (x, y) into one register and z into the low lane of another.
    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    // For lanes whose sign bit is set (the value itself is the blend mask),
    // (1.0 - EPSILON) is subtracted before the truncating conversion; other
    // lanes are truncated unchanged.
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    // Offsets of the point from its integer lattice coordinates.
    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    // Lattice indices: shifted by NOISE_MIN* and reduced to 12 bits.
    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    // Hashes for the four (x, y) combinations of the lattice cell.
    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    // RTable entries for the eight combinations of (x, y) hash and iz / iz + 1.
    DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];
    DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    // Broadcast each fractional offset to both lanes.
    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    // Same offsets minus one.
    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    // s = f*f*(3 - 2*f) (_mm_nmacc_pd(a, b, c) evaluates c - a*b), t = 1 - s.
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));

    // Pairwise products (tx*ty, tx*sy) and (sx*ty, sx*sy).
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    // The four pairs above multiplied by the z factors tz and sz.
    __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz);
    __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz);
    __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz);

    // NOTE(review): mp_t1 ... sum_p are not referenced directly in this function;
    // presumably they are scratch variables used by the INCRSUMP2 macro (defined
    // elsewhere) - confirm before renaming or removing them.
    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p;
    __m128d sum_X_Y = _mm_setzero_pd();
    __m128d sum__Z = _mm_setzero_pd();

    // Eight INCRSUMP2 invocations with sum_X_Y as last argument: each uses one
    // table pointer and the same pointer advanced by 8, with a single weight
    // duplicated into both lanes.
    // NOTE(review): the +8 / +16 offsets presumably select different sections
    // of a table entry - confirm against the RTable layout.
    __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y);

    // Four INCRSUMP2 invocations with sum__Z as last argument: each uses two
    // different table pointers (advanced by 16) and an undivided weight pair,
    // with (y offset, y offset - 1) as the y argument.
    __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm);
    INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z);
    INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z);

    // Add the two lanes of the Z accumulator together.
    sum__Z = _mm_hadd_pd(sum__Z, sum__Z);

    // Store both lanes of sum_X_Y through *result (assumes *result yields a
    // pointer to the first of two adjacent DBL components - TODO confirm), then
    // the low lane of sum__Z into result[Z].
    _mm_storeu_pd(*result, sum_X_Y);
    _mm_store_sd(&result[Z], sum__Z);
}
void exchsolution_gmrfData_1(unsigned int slot) {
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((!neighbor_isValid[0][0])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S493, S492, S494 */
{
{
{
double* fieldData_Solution_GMRF_1_p1 = (&fieldData_Solution_GMRF[1][0]);
int i1 = 1;
for (; (i1<=2); i1 += 2) {
fieldData_Solution_GMRF_1_p1[((i1*6)+2)] = 0.000000e+00;
fieldData_Solution_GMRF_1_p1[((i1*6)+8)] = 0.000000e+00;
}
for (; (i1<=3); i1 += 1) {
fieldData_Solution_GMRF_1_p1[((i1*6)+2)] = 0.000000e+00;
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posBegin[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<1); i1 += 4) {
/* xPos = posBegin[0]; */
__m128d vec0 = _mm_load1_pd((&posBegin[0]));
__m128d vec0_2 = _mm_load1_pd((&posBegin[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<4); i1 += 1) {
xPos = posBegin[0];
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/2.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(2.000000e+00);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<1); i1 += 4) {
/* yPos = ((((i1-1)/2.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<4); i1 += 1) {
yPos = ((((i1-1)/2.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
}
}
}
if ((!neighbor_isValid[0][1])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S496, S495, S497 */
{
{
{
double* fieldData_Solution_GMRF_1_p1 = (&fieldData_Solution_GMRF[1][0]);
int i1 = 1;
for (; (i1<=2); i1 += 2) {
fieldData_Solution_GMRF_1_p1[((i1*6)+4)] = 0.000000e+00;
fieldData_Solution_GMRF_1_p1[((i1*6)+10)] = 0.000000e+00;
}
for (; (i1<=3); i1 += 1) {
fieldData_Solution_GMRF_1_p1[((i1*6)+4)] = 0.000000e+00;
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posEnd[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<1); i1 += 4) {
/* xPos = posEnd[0]; */
__m128d vec0 = _mm_load1_pd((&posEnd[0]));
__m128d vec0_2 = _mm_load1_pd((&posEnd[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<4); i1 += 1) {
xPos = posEnd[0];
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/2.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(2.000000e+00);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<1); i1 += 4) {
/* yPos = ((((i1-1)/2.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<4); i1 += 1) {
yPos = ((((i1-1)/2.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
}
}
}
if ((!neighbor_isValid[0][2])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S500, S499, S498 */
{
{
{
int i2 = 2;
for (; (i2<=3); i2 += 2) {
xPos = ((((i2-2)/2.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/2.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=4); i2 += 1) {
xPos = ((((i2-2)/2.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
{
double* fieldData_Solution_GMRF_1_p1 = (&fieldData_Solution_GMRF[1][0]);
int i2 = 2;
for (; (i2<=3); i2 += 2) {
fieldData_Solution_GMRF_1_p1[(i2+6)] = 0.000000e+00;
fieldData_Solution_GMRF_1_p1[(i2+7)] = 0.000000e+00;
}
for (; (i2<=4); i2 += 1) {
fieldData_Solution_GMRF_1_p1[(i2+6)] = 0.000000e+00;
}
}
}
{
int i2 = 2;
for (; (i2<=3); i2 += 2) {
yPos = posBegin[1];
yPos = posBegin[1];
}
for (; (i2<=4); i2 += 1) {
yPos = posBegin[1];
}
}
}
}
}
if ((!neighbor_isValid[0][3])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S503, S502, S501 */
{
{
{
int i2 = 2;
for (; (i2<=3); i2 += 2) {
xPos = ((((i2-2)/2.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/2.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=4); i2 += 1) {
xPos = ((((i2-2)/2.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
{
int i2 = 2;
for (; (i2<=3); i2 += 2) {
yPos = posEnd[1];
yPos = posEnd[1];
}
for (; (i2<=4); i2 += 1) {
yPos = posEnd[1];
}
}
}
{
double* fieldData_Solution_GMRF_1_p1 = (&fieldData_Solution_GMRF[1][0]);
int i2 = 2;
for (; (i2<=3); i2 += 2) {
fieldData_Solution_GMRF_1_p1[(i2+18)] = 0.000000e+00;
fieldData_Solution_GMRF_1_p1[(i2+19)] = 0.000000e+00;
}
for (; (i2<=4); i2 += 1) {
fieldData_Solution_GMRF_1_p1[(i2+18)] = 0.000000e+00;
}
}
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
MPI_Isend(&fieldData_Solution_GMRF[1][10], 1, mpiDatatype_3_1_6, neighbor_remoteRank[0][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
MPI_Irecv(&fieldData_Solution_GMRF[1][8], 1, mpiDatatype_3_1_6, neighbor_remoteRank[0][0], ((unsigned int)(neighbor_fragCommId[0][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) {
MPI_Isend(&fieldData_Solution_GMRF[1][20], 1, mpiDatatype_1_3_6, neighbor_remoteRank[0][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) {
MPI_Irecv(&fieldData_Solution_GMRF[1][8], 1, mpiDatatype_1_3_6, neighbor_remoteRank[0][2], ((unsigned int)(neighbor_fragCommId[0][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
MPI_Isend(&fieldData_Solution_GMRF[1][3], 1, mpiDatatype_5_1_6, neighbor_remoteRank[0][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]);
reqOutstanding_Send[0] = true;
}
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
MPI_Isend(&fieldData_Solution_GMRF[1][3], 1, mpiDatatype_5_1_6, neighbor_remoteRank[0][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][0]&&neighbor_isRemote[0][0])) {
MPI_Irecv(&fieldData_Solution_GMRF[1][1], 1, mpiDatatype_5_1_6, neighbor_remoteRank[0][0], ((unsigned int)(neighbor_fragCommId[0][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
if ((neighbor_isValid[0][1]&&neighbor_isRemote[0][1])) {
MPI_Irecv(&fieldData_Solution_GMRF[1][5], 1, mpiDatatype_5_1_6, neighbor_remoteRank[0][1], ((unsigned int)(neighbor_fragCommId[0][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
if (reqOutstanding_Recv[1]) {
waitForMPIReq(&mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Send[0]) {
waitForMPIReq(&mpiRequest_Send[0]);
reqOutstanding_Send[0] = false;
}
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) {
MPI_Isend(&fieldData_Solution_GMRF[1][13], 1, mpiDatatype_1_5_6, neighbor_remoteRank[0][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]);
reqOutstanding_Send[2] = true;
}
if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) {
MPI_Isend(&fieldData_Solution_GMRF[1][13], 1, mpiDatatype_1_5_6, neighbor_remoteRank[0][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[0][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if ((neighbor_isValid[0][2]&&neighbor_isRemote[0][2])) {
MPI_Irecv(&fieldData_Solution_GMRF[1][1], 1, mpiDatatype_1_5_6, neighbor_remoteRank[0][2], ((unsigned int)(neighbor_fragCommId[0][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
if ((neighbor_isValid[0][3]&&neighbor_isRemote[0][3])) {
MPI_Irecv(&fieldData_Solution_GMRF[1][25], 1, mpiDatatype_1_5_6, neighbor_remoteRank[0][3], ((unsigned int)(neighbor_fragCommId[0][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
if (reqOutstanding_Recv[3]) {
waitForMPIReq(&mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
;
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
if (reqOutstanding_Send[2]) {
waitForMPIReq(&mpiRequest_Send[2]);
reqOutstanding_Send[2] = false;
}
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
}
Beispiel #24
0
 /// Difference lhs - rhs of two vector2d values, computed with the
 /// packed-double subtract intrinsic _mm_sub_pd.
 inline const vector2d operator-(const vector2d& lhs, const vector2d& rhs)
 {
     vector2d difference = _mm_sub_pd(lhs, rhs);
     return difference;
 }
 /** Returns a - b, evaluated with the packed-double subtract intrinsic _mm_sub_pd. */
 static forcedinline ParallelType sub (ParallelType a, ParallelType b) noexcept
 {
     const ParallelType difference = _mm_sub_pd (a, b);
     return difference;
 }
Beispiel #26
0
/*
 * Two-row kernel of an upper-triangular matrix-vector product (matrix not
 * transposed). It moves horizontally inside one 4-row panel of A: column j
 * starts at A[4*j], and only rows 0 and 1 of each column are used.
 *
 * It forms
 *     t[0] = A(0,0)*x[0] + A(0,1)*x[1] + sum_{j=2..kmax-1} A(0,j)*x[j]
 *     t[1] =               A(1,1)*x[1] + sum_{j=2..kmax-1} A(1,j)*x[j]
 * The entry below the diagonal, A(1,0), is never read.
 *
 *   kmax  number of columns; the function does nothing when kmax <= 0.
 *         NOTE(review): columns 0 and 1 are always processed, so the code
 *         appears to expect kmax >= 2 -- confirm against the callers.
 *   A     panel data, read with aligned loads (_mm_load_pd), so every
 *         column start must be 16-byte aligned
 *   x     input vector (kmax entries are read)
 *   y     two-entry result, accessed with unaligned loads/stores
 *   alg   0: y = t    1: y += t    any other value (intended: -1): y -= t
 *
 * Only SSE2 intrinsics are used.
 */
void kernel_dtrmv_u_n_2_lib4(int kmax, double *A, double *x, double *y, int alg)
	{

	if(kmax<=0)
		return;

	const int lda = 4; // doubles between two consecutive columns of the panel

	int k;

	__m128d
		ax_temp,
		a_00_10, a_01_11,
		x_0, x_1,
		y_0_1, z_0_1;

	// second col first: its product initialises the accumulator, so y_0_1
	// never has to be zeroed explicitly
	x_0     = _mm_load1_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, x_0 );

	// first col: scalar ops, so only row 0 receives A(0,0)*x[0]
	x_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A[0+lda*0] );
	ax_temp = _mm_mul_sd( a_00_10, x_0 );
	y_0_1   = _mm_add_sd( y_0_1, ax_temp );

	A += 2*lda;
	x += 2;

	// remaining columns, two per iteration
	for(k=2; k<kmax-1; k+=2)
		{

		x_0 = _mm_load1_pd( &x[0] );
		x_1 = _mm_load1_pd( &x[1] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );
		a_01_11 = _mm_load_pd( &A[0+lda*1] );

		ax_temp = _mm_mul_pd( a_00_10, x_0 );
		y_0_1 = _mm_add_pd( y_0_1, ax_temp );
		ax_temp = _mm_mul_pd( a_01_11, x_1 );
		y_0_1 = _mm_add_pd( y_0_1, ax_temp );

		A += 2*lda;
		x += 2;

		}

	// odd column count: one column is left over
	if(kmax%2==1)
		{

		x_0 = _mm_load1_pd( &x[0] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );

		ax_temp = _mm_mul_pd( a_00_10, x_0 );
		y_0_1 = _mm_add_pd( y_0_1, ax_temp );

		}

	// combine with the previous content of y unless it is overwritten (alg==0)
	if(alg!=0)
		{
		z_0_1 = _mm_loadu_pd( &y[0] );

		if(alg==1)
			y_0_1 = _mm_add_pd( z_0_1, y_0_1 );
		else // alg==-1
			y_0_1 = _mm_sub_pd( z_0_1, y_0_1 );
		}

	_mm_storeu_pd(&y[0], y_0_1);

	}
Beispiel #27
0
/*
 * 10th-order recursive (IIR) filter stage: writes nSamples filtered values
 * to output[0 .. nSamples-1].
 *
 * input and output are both read at negative offsets down to -10, so the
 * caller must provide ten valid history samples in front of each pointer.
 *
 * The coefficient layout expected in kernel depends on the build:
 *   HAVE_SSE2  kernel[0..9]   multiply input[0] .. input[-9]
 *              kernel[11]     multiplies input[-10]
 *              kernel[12..21] multiply output[-1] .. output[-10]
 *              kernel is read with _mm_loadr_pd at even indices and must
 *              therefore be 16-byte aligned.
 *              NOTE(review): kernel[10] is not read in this build -- confirm
 *              that the coefficient table really stores the last input tap
 *              at index 11.
 *   otherwise  interleaved: even indices 0..20 multiply input[0..-10],
 *              odd indices 1..19 multiply output[-1..-10].
 */
static void
filterYule(const Float_t* input, Float_t* output, size_t nSamples, const Float_t* kernel)
{
#ifdef HAVE_SSE2

    __m128d acc;
    Float_t lanes[2];   /* filled with an unaligned store: no compiler-specific alignment attribute needed */
    int pair;

    while (nSamples--) {
        /* The first pair of input taps initialises the accumulator. */
        acc = _mm_mul_pd(_mm_loadu_pd(&input[-1]), _mm_loadr_pd(&kernel[0]));
        acc = _mm_sub_pd(acc, _mm_mul_pd(_mm_loadr_pd(&kernel[12]), _mm_loadu_pd(&output[-2])));

        /* Remaining four pairs: add two input taps, then subtract two
           feedback taps. _mm_loadr_pd reverses each coefficient pair so it
           lines up with the samples, which are loaded oldest-first. */
        for (pair = 1; pair < 5; ++pair) {
            const int back = 2 * pair;

            acc = _mm_add_pd(acc, _mm_mul_pd(_mm_loadr_pd(&kernel[back]), _mm_loadu_pd(&input[-1 - back])));
            acc = _mm_sub_pd(acc, _mm_mul_pd(_mm_loadr_pd(&kernel[12 + back]), _mm_loadu_pd(&output[-2 - back])));
        }

        _mm_storeu_pd(lanes, acc);
        *output =  1e-10  /* 1e-10 is a hack to avoid slowdown because of denormals */
                + lanes[0]
                + lanes[1]
                + input [-10] * kernel[11];

        ++output;
        ++input;
    }
#else
    while (nSamples--) {
        *output =  1e-10  /* 1e-10 is a hack to avoid slowdown because of denormals */
                    + input [0]  * kernel[0] - output[-1] * kernel[1]
                    + input [-1] * kernel[2] - output[-2] * kernel[3]
                    + input [-2] * kernel[4] - output[-3] * kernel[5]
                    + input [-3] * kernel[6] - output[-4] * kernel[7]
                    + input [-4] * kernel[8] - output[-5] * kernel[9]
                    + input [-5] * kernel[10] - output[-6] * kernel[11]
                    + input [-6] * kernel[12] - output[-7] * kernel[13]
                    + input [-7] * kernel[14] - output[-8] * kernel[15]
                    + input [-8] * kernel[16] - output[-9] * kernel[17]
                    + input [-9] * kernel[18] - output[-10]* kernel[19]
                    + input [-10]* kernel[20];

        ++output;
        ++input;
    }
#endif
}
Beispiel #28
0
/*
 * Two-column kernel of an upper-triangular matrix-vector product with the
 * matrix transposed. It moves vertically across 4-row panels of A; inside
 * a panel, column c starts at offset 4*c.
 *
 * For each of the floor(kmax/4) full groups of four rows it accumulates
 *     t[c] += sum_{i=0..3} A(i,c) * x[i]        c = 0, 1
 * and then advances A by 4*sda doubles and x by 4 entries. At the position
 * reached after those groups it adds the 2x2 upper triangle
 *     t[0] += A(0,0)*x[0]
 *     t[1] += A(0,1)*x[0] + A(1,1)*x[1]
 * Rows beyond the last full group of four are not visited by the
 * rectangular part.
 *
 *   kmax  number of rows above the triangle (handled in groups of four)
 *   A     panel data, read with aligned loads: 32-byte aligned for the
 *         four-row groups, 16-byte aligned for the triangle
 *   sda   panel stride; consecutive panels are 4*sda doubles apart
 *   x     input vector, read with unaligned loads
 *   y     two-entry result, accessed with unaligned loads/stores
 *   alg   0: y = t    1: y += t    any other value (intended: -1): y -= t
 */
void kernel_dtrmv_u_t_2_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
	{

	const int lda = 4;

	int k;

	__m256d
		tmp0, tmp1,
		a_00_10_20_30, a_01_11_21_31,
		x_0_1_2_3,
		y_00, y_11;

	y_00 = _mm256_setzero_pd();
	y_11 = _mm256_setzero_pd();

	// rectangular part: one group of four rows per iteration, each lane of
	// y_00 / y_11 holding a partial sum for column 0 / column 1
	for(k=0; k<kmax-3; k+=4)
		{

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, tmp0 );
		y_11 = _mm256_add_pd( y_11, tmp1 );

		A += 4 + (sda-1)*lda; // next panel
		x += 4;

		}

	__m128d
		tm0, tm1,
		a_00_10, a_01_11,
		x_0_1,
		y_0, y_1, y_0_1;

	// fold the upper 128-bit half of each accumulator onto the lower half
	tm0 = _mm256_extractf128_pd( y_00, 0x1 );
	tm1 = _mm256_extractf128_pd( y_11, 0x1 );
	y_0 = _mm256_castpd256_pd128( y_00 );
	y_1 = _mm256_castpd256_pd128( y_11 );
	y_0 = _mm_add_pd( y_0, tm0 );
	y_1 = _mm_add_pd( y_1, tm1 );

	// 2x2 triangle: column 0 contributes only A(0,0) (scalar ops), column 1
	// contributes A(0,1) and A(1,1)
	x_0_1 = _mm_loadu_pd( &x[0] );
	a_00_10 = _mm_load_sd( &A[0+lda*0] );
	a_01_11 = _mm_load_pd( &A[0+lda*1] );
	tm0 = _mm_mul_sd( a_00_10, x_0_1 );
	tm1 = _mm_mul_pd( a_01_11, x_0_1 );
	y_0 = _mm_add_sd( y_0, tm0 );
	y_1 = _mm_add_pd( y_1, tm1 );

	// horizontal add: lane 0 = total for column 0, lane 1 = total for column 1
	y_0 = _mm_hadd_pd( y_0, y_1 );

	// combine with the previous content of y unless it is overwritten (alg==0)
	if(alg!=0)
		{
		y_0_1 = _mm_loadu_pd( &y[0] );

		if(alg==1)
			y_0 = _mm_add_pd( y_0_1, y_0 );
		else // alg==-1
			y_0 = _mm_sub_pd( y_0_1, y_0 );
		}

	_mm_storeu_pd(&y[0], y_0);

	}
void exchsolutionData_2(unsigned int slot) {
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((!neighbor_isValid[1][0])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S397, S396, S398 */
{
{
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(4.000000e+00);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<3); i1 += 4) {
/* yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<6); i1 += 1) {
yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
{
double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_Solution_2_p1[((i1*8)+2)] = 0.000000e+00;
fieldData_Solution_2_p1[((i1*8)+10)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_Solution_2_p1[((i1*8)+2)] = 0.000000e+00;
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posBegin[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<3); i1 += 4) {
/* xPos = posBegin[0]; */
__m128d vec0 = _mm_load1_pd((&posBegin[0]));
__m128d vec0_2 = _mm_load1_pd((&posBegin[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<6); i1 += 1) {
xPos = posBegin[0];
}
}
}
}
}
if ((!neighbor_isValid[1][1])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S401, S400, S399 */
{
{
{
double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]);
int i1 = 1;
for (; (i1<=4); i1 += 2) {
fieldData_Solution_2_p1[((i1*8)+6)] = 0.000000e+00;
fieldData_Solution_2_p1[((i1*8)+14)] = 0.000000e+00;
}
for (; (i1<=5); i1 += 1) {
fieldData_Solution_2_p1[((i1*8)+6)] = 0.000000e+00;
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
xPos = posEnd[0];
}
__m128d vec1 = _mm_set1_pd(xPos);
for (; (i1<3); i1 += 4) {
/* xPos = posEnd[0]; */
__m128d vec0 = _mm_load1_pd((&posEnd[0]));
__m128d vec0_2 = _mm_load1_pd((&posEnd[0]));
vec1 = vec0;
vec1 = vec0_2;
}
for (; (i1<6); i1 += 1) {
xPos = posEnd[0];
}
}
}
{
int i1 = 1;
for (; (i1<(2&(~1))); i1 += 1) {
yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
__m128d vec1 = _mm_set1_pd(1.000000e+00);
__m128d vec2 = _mm_set1_pd(4.000000e+00);
__m128d vec5 = _mm_set1_pd(yPos);
for (; (i1<3); i1 += 4) {
/* yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]); */
__m128d vec0 = _mm_set_pd(i1+1,i1);
__m128d vec0_2 = _mm_set_pd(i1+1,i1);
__m128d vec3 = _mm_load1_pd((&posEnd[1]));
__m128d vec3_2 = _mm_load1_pd((&posEnd[1]));
__m128d vec4 = _mm_load1_pd((&posBegin[1]));
__m128d vec4_2 = _mm_load1_pd((&posBegin[1]));
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0, vec1), vec2), _mm_sub_pd(vec3, vec4)), vec4);
vec5 = _mm_add_pd(_mm_mul_pd(_mm_div_pd(_mm_sub_pd(vec0_2, vec1), vec2), _mm_sub_pd(vec3_2, vec4_2)), vec4_2);
}
for (; (i1<6); i1 += 1) {
yPos = ((((i1-1)/4.000000e+00)*(posEnd[1]-posBegin[1]))+posBegin[1]);
}
}
}
}
}
if ((!neighbor_isValid[1][2])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S404, S403, S402 */
{
{
{
double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_Solution_2_p1[(i2+8)] = 0.000000e+00;
fieldData_Solution_2_p1[(i2+9)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_Solution_2_p1[(i2+8)] = 0.000000e+00;
}
}
{
int i2 = 2;
for (; (i2<=5); i2 += 2) {
xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=6); i2 += 1) {
xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
}
{
int i2 = 2;
for (; (i2<=5); i2 += 2) {
yPos = posBegin[1];
yPos = posBegin[1];
}
for (; (i2<=6); i2 += 1) {
yPos = posBegin[1];
}
}
}
}
}
if ((!neighbor_isValid[1][3])) {
{
double xPos;
double yPos;
/* Statements in this Scop: S407, S406, S405 */
{
{
{
double* fieldData_Solution_2_p1 = (&fieldData_Solution[2][0]);
int i2 = 2;
for (; (i2<=5); i2 += 2) {
fieldData_Solution_2_p1[(i2+40)] = 0.000000e+00;
fieldData_Solution_2_p1[(i2+41)] = 0.000000e+00;
}
for (; (i2<=6); i2 += 1) {
fieldData_Solution_2_p1[(i2+40)] = 0.000000e+00;
}
}
{
int i2 = 2;
for (; (i2<=5); i2 += 2) {
xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
xPos = ((((i2-1)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
for (; (i2<=6); i2 += 1) {
xPos = ((((i2-2)/4.000000e+00)*(posEnd[0]-posBegin[0]))+posBegin[0]);
}
}
}
{
int i2 = 2;
for (; (i2<=5); i2 += 2) {
yPos = posEnd[1];
yPos = posEnd[1];
}
for (; (i2<=6); i2 += 1) {
yPos = posEnd[1];
}
}
}
}
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Isend(&fieldData_Solution[2][14], 1, mpiDatatype_5_1_8, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Irecv(&fieldData_Solution[2][10], 1, mpiDatatype_5_1_8, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Isend(&fieldData_Solution[2][42], 1, mpiDatatype_1_5_8, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Irecv(&fieldData_Solution[2][10], 1, mpiDatatype_1_5_8, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Isend(&fieldData_Solution[2][3], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][0], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][0]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[0]);
reqOutstanding_Send[0] = true;
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Isend(&fieldData_Solution[2][5], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][1], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][1]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[1]);
reqOutstanding_Send[1] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][0]&&neighbor_isRemote[1][0])) {
MPI_Irecv(&fieldData_Solution[2][1], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][0], ((unsigned int)(neighbor_fragCommId[1][0]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = true;
}
if ((neighbor_isValid[1][1]&&neighbor_isRemote[1][1])) {
MPI_Irecv(&fieldData_Solution[2][7], 1, mpiDatatype_7_1_8, neighbor_remoteRank[1][1], ((unsigned int)(neighbor_fragCommId[1][1]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[0]) {
waitForMPIReq(&mpiRequest_Recv[0]);
reqOutstanding_Recv[0] = false;
}
if (reqOutstanding_Recv[1]) {
waitForMPIReq(&mpiRequest_Recv[1]);
reqOutstanding_Recv[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[0]) {
waitForMPIReq(&mpiRequest_Send[0]);
reqOutstanding_Send[0] = false;
}
if (reqOutstanding_Send[1]) {
waitForMPIReq(&mpiRequest_Send[1]);
reqOutstanding_Send[1] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Isend(&fieldData_Solution[2][17], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][2], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][2]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[2]);
reqOutstanding_Send[2] = true;
}
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Isend(&fieldData_Solution[2][33], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][3], ((unsigned int)commId << 16) + ((unsigned int)(neighbor_fragCommId[1][3]) & 0x0000ffff), mpiCommunicator, &mpiRequest_Send[3]);
reqOutstanding_Send[3] = true;
}
}
}
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if ((neighbor_isValid[1][2]&&neighbor_isRemote[1][2])) {
MPI_Irecv(&fieldData_Solution[2][1], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][2], ((unsigned int)(neighbor_fragCommId[1][2]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = true;
}
if ((neighbor_isValid[1][3]&&neighbor_isRemote[1][3])) {
MPI_Irecv(&fieldData_Solution[2][49], 1, mpiDatatype_1_7_8, neighbor_remoteRank[1][3], ((unsigned int)(neighbor_fragCommId[1][3]) << 16) + ((unsigned int)commId & 0x0000ffff), mpiCommunicator, &mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = true;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Recv[2]) {
waitForMPIReq(&mpiRequest_Recv[2]);
reqOutstanding_Recv[2] = false;
}
if (reqOutstanding_Recv[3]) {
waitForMPIReq(&mpiRequest_Recv[3]);
reqOutstanding_Recv[3] = false;
}
}
}
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
;
;
}
}
;
;
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
if (reqOutstanding_Send[2]) {
waitForMPIReq(&mpiRequest_Send[2]);
reqOutstanding_Send[2] = false;
}
if (reqOutstanding_Send[3]) {
waitForMPIReq(&mpiRequest_Send[3]);
reqOutstanding_Send[3] = false;
}
}
}
}
Beispiel #30
0
	/// Difference lhs - rhs: each of the two registers reg[0] and reg[1] is
	/// subtracted independently with _mm_sub_pd.
	static inline Simd sub(const Simd& lhs, const Simd& rhs)
	{
		Simd difference;
		for (int half = 0; half < 2; ++half) {
			difference.reg[half] = _mm_sub_pd(lhs.reg[half], rhs.reg[half]);
		}
		return difference;
	}