int
calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, 
                              double *x, double *f, double *fshift, double *shiftvec,
                              int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)						
{
	int    i,k,n,ii,jnr,ii3,is3,nj0,nj1,n0,n1;
	int	   jnrA,jnrB;
    int    j3A,j3B;
	int *  jjnr;
    
	double   rbi,shX,shY,shZ;
	double   *rb;
    
	__m128d ix,iy,iz;
	__m128d jx,jy,jz;
	__m128d fix,fiy,fiz;
	__m128d dx,dy,dz;
    __m128d tx,ty,tz;
    
	__m128d rbai,rbaj,f_gb, f_gb_ai;
	__m128d xmm1,xmm2,xmm3;
	
	const __m128d two = _mm_set1_pd(2.0);
    
	rb     = born->work; 
    
  jjnr   = nl->jjnr;
  
	/* Loop to get the proper form for the Born radius term, sse style */	
  n0 = 0;
  n1 = natoms;
    
	if(gb_algorithm==egbSTILL) 
	{
		for(i=n0;i<n1;i++)
		{
      rbi   = born->bRad[i];
			rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
		}
	}
	else if(gb_algorithm==egbHCT) 
	{
		for(i=n0;i<n1;i++)
		{
      rbi   = born->bRad[i];
			rb[i] = rbi * rbi * dvda[i];
		}
	}
	else if(gb_algorithm==egbOBC) 
	{
		for(i=n0;i<n1;i++)
		{
      rbi   = born->bRad[i];
			rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
		}
	}
    
  jz = _mm_setzero_pd();
  
  n = j3A = j3B = 0;
  
	for(i=0;i<nl->nri;i++)
	{
    ii     = nl->iinr[i];
		ii3	   = ii*3;
    is3    = 3*nl->shift[i];     
    shX    = shiftvec[is3];  
    shY    = shiftvec[is3+1];
    shZ    = shiftvec[is3+2];
    nj0    = nl->jindex[i];      
    nj1    = nl->jindex[i+1];    
    
    ix     = _mm_set1_pd(shX+x[ii3+0]);
		iy     = _mm_set1_pd(shY+x[ii3+1]);
		iz     = _mm_set1_pd(shZ+x[ii3+2]);
    
		rbai   = _mm_load1_pd(rb+ii);			
		fix    = _mm_setzero_pd();
		fiy    = _mm_setzero_pd();
		fiz    = _mm_setzero_pd();	
    
        
    for(k=nj0;k<nj1-1;k+=2)
		{
			jnrA        = jjnr[k];   
			jnrB        = jjnr[k+1];
      
      j3A         = 3*jnrA;  
			j3B         = 3*jnrB;
            
      GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);
      
			dx          = _mm_sub_pd(ix,jx);
			dy          = _mm_sub_pd(iy,jy);
			dz          = _mm_sub_pd(iz,jz);
      
      GMX_MM_LOAD_2VALUES_PD(rb+jnrA,rb+jnrB,rbaj);
      
			/* load chain rule terms for the two j particles */
			f_gb        = _mm_load_pd(dadx);
			dadx += 2;
			f_gb_ai     = _mm_load_pd(dadx);
			dadx += 2;
			
      /* calculate scalar force */
      f_gb    = _mm_mul_pd(f_gb,rbai); 
      f_gb_ai = _mm_mul_pd(f_gb_ai,rbaj);
      f_gb    = _mm_add_pd(f_gb,f_gb_ai);
      
      tx     = _mm_mul_pd(f_gb,dx);
      ty     = _mm_mul_pd(f_gb,dy);
      tz     = _mm_mul_pd(f_gb,dz);
      
      fix    = _mm_add_pd(fix,tx);
      fiy    = _mm_add_pd(fiy,ty);
      fiz    = _mm_add_pd(fiz,tz);
      
      GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A,f+j3B,tx,ty,tz);
		}
    
		/*deal with odd elements */
		if(k<nj1) 
        {
          jnrA        = jjnr[k];   
          j3A         = 3*jnrA;  
          
          GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);
          
          dx          = _mm_sub_sd(ix,jx);
          dy          = _mm_sub_sd(iy,jy);
          dz          = _mm_sub_sd(iz,jz);
          
          GMX_MM_LOAD_1VALUE_PD(rb+jnrA,rbaj);
          
          /* load chain rule terms */
          f_gb        = _mm_load_pd(dadx);
          dadx += 2;
          f_gb_ai     = _mm_load_pd(dadx);
          dadx += 2;
          
          /* calculate scalar force */
          f_gb    = _mm_mul_sd(f_gb,rbai); 
          f_gb_ai = _mm_mul_sd(f_gb_ai,rbaj);
          f_gb    = _mm_add_sd(f_gb,f_gb_ai);
          
          tx     = _mm_mul_sd(f_gb,dx);
          ty     = _mm_mul_sd(f_gb,dy);
          tz     = _mm_mul_sd(f_gb,dz);
          
          fix    = _mm_add_sd(fix,tx);
          fiy    = _mm_add_sd(fiy,ty);
          fiz    = _mm_add_sd(fiz,tz);
          
          GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A,tx,ty,tz);
        } 
    
		/* fix/fiy/fiz now contain two partial force terms, one per SSE lane,
		 * that should all be added to the i particle forces and shift forces.
		 */
 		gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,f+ii3,fshift+is3);
	}	
  
	return 0;	
}
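/* A scalar reference sketch (not part of the original kernel; the helper
 * name is hypothetical, and the egb* constants and ONE_4PI_EPS0 are assumed
 * from the surrounding GROMACS code) of the three prescaling loops above.
 * Each GB flavor folds dV/dalpha_i and its algorithm-specific dalpha/dr
 * factor into one per-atom coefficient rb[i]. */
static void gb_prescale_scalar(int natoms, int gb_algorithm,
                               const double *bRad, const double *drobc,
                               const double *dvda, double *rb)
{
    int i;
    for (i = 0; i < natoms; i++)
    {
        double rbi = bRad[i];
        if (gb_algorithm == egbSTILL)
        {
            rb[i] = (2.0 * rbi * rbi * dvda[i]) / ONE_4PI_EPS0;
        }
        else if (gb_algorithm == egbHCT)
        {
            rb[i] = rbi * rbi * dvda[i];
        }
        else if (gb_algorithm == egbOBC)
        {
            rb[i] = rbi * rbi * drobc[i] * dvda[i];
        }
    }
}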
void nb_kernel400_ia32_sse2(int *           p_nri,
                              int *           iinr,
                              int *           jindex,
                              int *           jjnr,
                              int *           shift,
                              double *         shiftvec,
                              double *         fshift,
                              int *           gid,
                              double *         pos,
                              double *         faction,
                              double *         charge,
                              double *         p_facel,
                              double *         p_krf,
                              double *         p_crf,
                              double *         vc,
                              int *           type,
                              int *           p_ntype,
                              double *         vdwparam,
                              double *         vvdw,
                              double *         p_tabscale,
                              double *         VFtab,
                              double *         invsqrta,
                              double *         dvda,
                              double *         p_gbtabscale,
                              double *         GBtab,
                              int *           p_nthreads,
                              int *           count,
                              void *          mtx,
                              int *           outeriter,
                              int *           inneriter,
                              double *         work)
{
  int           nri,nthreads;
  int           n,ii,is3,ii3,k,nj0,nj1,ggid;
  double        shX,shY,shZ;
  int           jnrA,jnrB;
  int           j3A,j3B;
	gmx_gbdata_t *gbdata;
	double *      gpol;
    
	__m128d  iq,qq,jq,isai;
	__m128d  ix,iy,iz;
	__m128d  jx,jy,jz;
	__m128d  dx,dy,dz;
	__m128d  vctot,vgbtot,dvdasum,gbfactor;
	__m128d  fix,fiy,fiz,tx,ty,tz,rsq;
	__m128d  rinv,isaj,isaprod;
	__m128d  vcoul,fscal,gbscale;
	__m128d  rinvsq,r,rtab;
	__m128d  eps,Y,F,G,H;
	__m128d  vgb,fijGB,dvdatmp;
	__m128d  facel,gbtabscale,dvdaj;
	__m128i  n0, nnn;
	
	const __m128d neg        = _mm_set1_pd(-1.0);
	const __m128d zero       = _mm_set1_pd(0.0);
	const __m128d minushalf  = _mm_set1_pd(-0.5);
	const __m128d two        = _mm_set1_pd(2.0);
	
	gbdata     = (gmx_gbdata_t *)work;
	gpol       = gbdata->gpol;
    
	nri        = *p_nri;
    
  gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
  gbtabscale = _mm_load1_pd(p_gbtabscale);  
  facel      = _mm_load1_pd(p_facel);
  
  nj1         = 0;
  jnrA = jnrB = 0;
  j3A = j3B   = 0;
  jx          = _mm_setzero_pd();
  jy          = _mm_setzero_pd();
  jz          = _mm_setzero_pd();
	
	for(n=0;n<nri;n++)
	{
    is3              = 3*shift[n];     
    shX              = shiftvec[is3];  
    shY              = shiftvec[is3+1];
    shZ              = shiftvec[is3+2];
    nj0              = jindex[n];      
    nj1              = jindex[n+1];    
    ii               = iinr[n];        
    ii3              = 3*ii;           
		
		ix               = _mm_set1_pd(shX+pos[ii3+0]);
		iy               = _mm_set1_pd(shY+pos[ii3+1]);
		iz               = _mm_set1_pd(shZ+pos[ii3+2]);
    
		iq               = _mm_load1_pd(charge+ii);
		iq               = _mm_mul_pd(iq,facel);
    
		isai             = _mm_load1_pd(invsqrta+ii);
        		
		vctot            = _mm_setzero_pd();
		vgbtot           = _mm_setzero_pd();
		dvdasum          = _mm_setzero_pd();
		fix              = _mm_setzero_pd();
		fiy              = _mm_setzero_pd();
		fiz              = _mm_setzero_pd();
                
		for(k=nj0;k<nj1-1; k+=2)
		{
			jnrA    = jjnr[k];
			jnrB    = jjnr[k+1];
			
			j3A     = jnrA * 3;
			j3B     = jnrB * 3;
      
      GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
            
			dx           = _mm_sub_pd(ix,jx);
			dy           = _mm_sub_pd(iy,jy);
			dz           = _mm_sub_pd(iz,jz);
            
      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
      
      rinv         = gmx_mm_invsqrt_pd(rsq);
 			rinvsq       = _mm_mul_pd(rinv,rinv);
      
			/***********************************/
			/* INTERACTION SECTION STARTS HERE */
			/***********************************/
			GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
			GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
            
			isaprod      = _mm_mul_pd(isai,isaj);
			qq           = _mm_mul_pd(iq,jq);            
			vcoul        = _mm_mul_pd(qq,rinv);
			fscal        = _mm_mul_pd(vcoul,rinv);                                 
      vctot        = _mm_add_pd(vctot,vcoul);
            
            /* Polarization interaction */
			qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
			gbscale      = _mm_mul_pd(isaprod,gbtabscale);
            
 			/* Calculate GB table index */
			r            = _mm_mul_pd(rsq,rinv);
			rtab         = _mm_mul_pd(r,gbscale);
			
			n0		     = _mm_cvttpd_epi32(rtab);
			eps	     	 = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
			nnn		     = _mm_slli_epi32(n0,2);
			
      /* the tables are 16-byte aligned, so we can use _mm_load_pd */			
      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
      F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
      GMX_MM_TRANSPOSE2_PD(Y,F);
      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
      H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
      GMX_MM_TRANSPOSE2_PD(G,H);
      
      G       = _mm_mul_pd(G,eps);
      H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
      F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
      Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
      F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
      vgb     = _mm_mul_pd(Y, qq);           
      fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
      
      dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
      
      vgbtot  = _mm_add_pd(vgbtot, vgb);
      
      dvdasum = _mm_add_pd(dvdasum, dvdatmp);
      dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
      
      GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
      
      fscal        = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );
      
      /***********************************/
			/*  INTERACTION SECTION ENDS HERE  */
			/***********************************/
      
      /* Calculate temporary vectorial force */
      tx           = _mm_mul_pd(fscal,dx);
      ty           = _mm_mul_pd(fscal,dy);
      tz           = _mm_mul_pd(fscal,dz);
      
      /* Increment i atom force */
      fix          = _mm_add_pd(fix,tx);
      fiy          = _mm_add_pd(fiy,ty);
      fiz          = _mm_add_pd(fiz,tz);
      
      /* Store j forces back */
			GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
		}
		
		/* In double precision, offset can only be either 0 or 1 */
		if(k<nj1)
		{
			jnrA    = jjnr[k];
			j3A     = jnrA * 3;
      
      GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
      
			dx           = _mm_sub_sd(ix,jx);
			dy           = _mm_sub_sd(iy,jy);
			dz           = _mm_sub_sd(iz,jz);
      
      rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
      
      rinv         = gmx_mm_invsqrt_pd(rsq);
 			rinvsq       = _mm_mul_sd(rinv,rinv);
      
      /* The reason for zeroing these variables here is to fix bug 585.
       * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
       * and r1=0, but it should be r1=a[1].
       * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
       * To work around it, we zero these variables and use _mm_add_pd (**) instead.
       * Note that the only variables affected are the energies, since only
       * their total sum needs to be correct.
       */
      vgb          = _mm_setzero_pd();
      vcoul        = _mm_setzero_pd();
      dvdatmp      = _mm_setzero_pd();
      
			/***********************************/
			/* INTERACTION SECTION STARTS HERE */
			/***********************************/
			GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
			GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
      
			isaprod      = _mm_mul_sd(isai,isaj);
			qq           = _mm_mul_sd(jq,iq);            
			vcoul        = _mm_mul_sd(qq,rinv);
			fscal        = _mm_mul_sd(vcoul,rinv);                                 
      vctot        = _mm_add_pd(vctot,vcoul); /* (**) */
      
      /* Polarization interaction */
			qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
			gbscale      = _mm_mul_sd(isaprod,gbtabscale);
      
 			/* Calculate GB table index */
			r            = _mm_mul_sd(rsq,rinv);
			rtab         = _mm_mul_sd(r,gbscale);
      
			n0		     = _mm_cvttpd_epi32(rtab);
			eps	     	 = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
			nnn		     = _mm_slli_epi32(n0,2);
			
      /* the tables are 16-byte aligned, so we can use _mm_load_pd */			
      Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
      F            = _mm_setzero_pd();
      GMX_MM_TRANSPOSE2_PD(Y,F);
      G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
      H            = _mm_setzero_pd();
      GMX_MM_TRANSPOSE2_PD(G,H);
      
      G       = _mm_mul_sd(G,eps);
      H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
      F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
      Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
      F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
      vgb     = _mm_mul_sd(Y, qq);           
      fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
      
      dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
      
      vgbtot  = _mm_add_pd(vgbtot, vgb); /* (**) */
      
      dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
      dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
      
      GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
			
      fscal        = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );
      
      /***********************************/
			/*  INTERACTION SECTION ENDS HERE  */
			/***********************************/
      
      /* Calculate temporary vectorial force */
      tx           = _mm_mul_sd(fscal,dx);
      ty           = _mm_mul_sd(fscal,dy);
      tz           = _mm_mul_sd(fscal,dz);
      
      /* Increment i atom force */
      fix          = _mm_add_sd(fix,tx);
      fiy          = _mm_add_sd(fiy,ty);
      fiz          = _mm_add_sd(fiz,tz);
      
      /* Store j forces back */
			GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
		}
		
    dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
    gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
    
    ggid     = gid[n];         
    
    gmx_mm_update_1pot_pd(vctot,vc+ggid);
    gmx_mm_update_1pot_pd(vgbtot,gpol+ggid);
    gmx_mm_update_1pot_pd(dvdasum,dvda+ii);
  }
  
	*outeriter   = nri;            
  *inneriter   = nj1; 	
}
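/* A scalar sketch of the table lookup vectorized in both loops above,
 * assuming the usual GROMACS cubic-spline layout of four doubles Y,F,G,H
 * per table point (the helper name is hypothetical):
 *   V(eps)  = Y + F*eps + G*eps^2 + H*eps^3
 *   V'(eps) = F + 2*G*eps + 3*H*eps^2
 * which is exactly what the Y/F/G/H arithmetic computes before V and V'
 * are scaled by qq and qq*gbscale. */
static void gb_table_lookup_scalar(const double *GBtab, double r,
                                   double gbscale, double *vv, double *ff)
{
    double rtab = r * gbscale;
    int    n    = (int)rtab;       /* truncate, like _mm_cvttpd_epi32 */
    double eps  = rtab - n;
    const double *p = GBtab + 4*n; /* nnn = n0<<2 in the SSE code */
    double Y = p[0], F = p[1], G = p[2], H = p[3];

    *vv = Y + eps*(F + eps*(G + eps*H));
    *ff = F + eps*(2.0*G + 3.0*eps*H);
}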
Example #3
void bl1_ddotaxmyv2( int       n,
                     double*   alpha,
                     double*   beta,
                     double*   x, int inc_x,
                     double*   u, int inc_u,
                     double*   rho,
                     double*   y, int inc_y,
                     double*   z, int inc_z )
#if BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_SSE_INTRINSICS
{
	double*   restrict chi1;
	double*   restrict upsilon1;
	double*   restrict psi1;
	double*   restrict zeta1;
	double    rho_c;
	int       i;

	int       n_pre;
	int       n_run;
	int       n_left;

	v2df_t    a1v, b1v;
	v2df_t    rho1v;
	v2df_t    x1v, u1v, y1v, z1v;

	if ( inc_x != 1 ||
	     inc_u != 1 ||
	     inc_y != 1 ||
	     inc_z != 1 ) bl1_abort();

	n_pre = 0;
	if ( ( unsigned long ) z % 16 != 0 )
	{
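		/* z is misaligned, and the main loop can peel only one leading
		 * element for all four vectors at once; x, u and y must therefore
		 * share z's misalignment, otherwise we cannot proceed. */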
		if ( ( unsigned long ) x % 16 == 0 ||
		     ( unsigned long ) u % 16 == 0 ||
		     ( unsigned long ) y % 16 == 0 ) bl1_abort();

		n_pre = 1;
	}

	n_run       = ( n - n_pre ) / 2;
	n_left      = ( n - n_pre ) % 2;

	chi1     = x;
	upsilon1 = u;
	psi1     = y;
	zeta1    = z;

	rho_c   = 0.0;

	if ( n_pre == 1 )
	{
		double   alpha_c = *alpha;
		double   beta_c  = *beta;
		double   chi1_c    = *chi1;
		double   upsilon_c = *upsilon1;

		rho_c  += chi1_c * upsilon_c;
		*psi1  -= alpha_c * chi1_c;
		*zeta1 -= beta_c  * chi1_c;

		chi1     += inc_x;
		upsilon1 += inc_u;
		psi1     += inc_y;
		zeta1    += inc_z;
	}

	a1v.v = _mm_loaddup_pd( ( double* )alpha );
	b1v.v = _mm_loaddup_pd( ( double* )beta );

	rho1v.v = _mm_setzero_pd();

	for ( i = 0; i < n_run; ++i )
	{
		x1v.v = _mm_load_pd( ( double* )chi1 );
		u1v.v = _mm_load_pd( ( double* )upsilon1 );
		y1v.v = _mm_load_pd( ( double* )psi1 );
		z1v.v = _mm_load_pd( ( double* )zeta1 );

		rho1v.v += x1v.v * u1v.v;
		y1v.v   -= a1v.v * x1v.v;
		z1v.v   -= b1v.v * x1v.v;

		_mm_store_pd( ( double* )psi1,  y1v.v );
		_mm_store_pd( ( double* )zeta1, z1v.v );

		chi1     += 2;
		upsilon1 += 2;
		psi1     += 2;
		zeta1    += 2;
	}

	rho_c += rho1v.d[0] + rho1v.d[1];

	if ( n_left > 0 )
	{
		double   alpha_c = *alpha;
		double   beta_c  = *beta;

		for( i = 0; i < n_left; ++i )
		{
			double   chi1_c    = *chi1;
			double   upsilon_c = *upsilon1;

			rho_c  += chi1_c * upsilon_c;
			*psi1  -= alpha_c * chi1_c;
			*zeta1 -= beta_c  * chi1_c;

			chi1     += inc_x;
			upsilon1 += inc_u;
			psi1     += inc_y;
			zeta1    += inc_z;
		}
	}

	*rho = rho_c;
}
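/* What the routine above computes, as a plain scalar reference (a sketch:
 * the _ref name is hypothetical and no alignment peeling is done): one
 * fused pass producing rho = x.u while applying y -= alpha*x and
 * z -= beta*x. */
void bl1_ddotaxmyv2_ref( int n, double* alpha, double* beta,
                         double* x, int inc_x, double* u, int inc_u,
                         double* rho, double* y, int inc_y,
                         double* z, int inc_z )
{
	double rho_c = 0.0;
	int    i;
	for ( i = 0; i < n; ++i )
	{
		rho_c       += x[i*inc_x] * u[i*inc_u];
		y[i*inc_y]  -= *alpha * x[i*inc_x];
		z[i*inc_z]  -= *beta  * x[i*inc_x];
	}
	*rho = rho_c;
}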
Example #4
template <>
inline double lanczos13m53::lanczos_sum_expG_scaled<double>(const double& x)
{
   static const ALIGN16 double coeff[26] = {
         static_cast<double>(0.006061842346248906525783753964555936883222L),
         static_cast<double>(1u),
         static_cast<double>(0.5098416655656676188125178644804694509993L),
         static_cast<double>(66u),
         static_cast<double>(19.51992788247617482847860966235652136208L),
         static_cast<double>(1925u),
         static_cast<double>(449.9445569063168119446858607650988409623L),
         static_cast<double>(32670u),
         static_cast<double>(6955.999602515376140356310115515198987526L),
         static_cast<double>(357423u),
         static_cast<double>(75999.29304014542649875303443598909137092L),
         static_cast<double>(2637558u),
         static_cast<double>(601859.6171681098786670226533699352302507L),
         static_cast<double>(13339535u),
         static_cast<double>(3481712.15498064590882071018964774556468L),
         static_cast<double>(45995730u),
         static_cast<double>(14605578.08768506808414169982791359218571L),
         static_cast<double>(105258076u),
         static_cast<double>(43338889.32467613834773723740590533316085L),
         static_cast<double>(150917976u),
         static_cast<double>(86363131.28813859145546927288977868422342L),
         static_cast<double>(120543840u),
         static_cast<double>(103794043.1163445451906271053616070238554L),
         static_cast<double>(39916800u),
         static_cast<double>(56906521.91347156388090791033559122686859L),
         static_cast<double>(0u)
   };
   register __m128d vx = _mm_load1_pd(&x);
   register __m128d sum_even = _mm_load_pd(coeff);
   register __m128d sum_odd = _mm_load_pd(coeff+2);
   register __m128d nc_odd, nc_even;
   register __m128d vx2 = _mm_mul_pd(vx, vx);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 4);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 6);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 8);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 10);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 12);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 14);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 16);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 18);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 20);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 22);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 24);
   sum_odd = _mm_mul_pd(sum_odd, vx);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_even = _mm_add_pd(sum_even, sum_odd);


   double ALIGN16 t[2];
   _mm_store_pd(t, sum_even);
   
   return t[0] / t[1];
}
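/* The kernel above packs one numerator and one denominator coefficient of
 * the Lanczos rational approximation into each __m128d (coeff[2k] is the
 * k-th numerator coefficient, coeff[2k+1] the k-th denominator coefficient,
 * highest degree first), and splits the degree-12 polynomials into even and
 * odd powers of x for instruction-level parallelism. A scalar sketch of the
 * same scheme (hypothetical helper name): */
static double rational_even_odd(const double coeff[26], double x)
{
   double x2 = x * x;
   double num_e = coeff[0], den_e = coeff[1];   /* even-power Horner */
   double num_o = coeff[2], den_o = coeff[3];   /* odd-power Horner  */
   int k;
   for (k = 1; k <= 5; ++k)
   {
      num_e = num_e * x2 + coeff[4*k];
      den_e = den_e * x2 + coeff[4*k + 1];
      num_o = num_o * x2 + coeff[4*k + 2];
      den_o = den_o * x2 + coeff[4*k + 3];
   }
   num_e = num_e * x2 + coeff[24];              /* last even pair */
   den_e = den_e * x2 + coeff[25];
   return (num_e + num_o * x) / (den_e + den_o * x);
}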
Example #5
template <>
inline double lanczos13m53::lanczos_sum<double>(const double& x)
{
   static const ALIGN16 double coeff[26] = {
      static_cast<double>(2.506628274631000270164908177133837338626L),
      static_cast<double>(1u),
      static_cast<double>(210.8242777515793458725097339207133627117L),
      static_cast<double>(66u),
      static_cast<double>(8071.672002365816210638002902272250613822L),
      static_cast<double>(1925u),
      static_cast<double>(186056.2653952234950402949897160456992822L),
      static_cast<double>(32670u),
      static_cast<double>(2876370.628935372441225409051620849613599L),
      static_cast<double>(357423u),
      static_cast<double>(31426415.58540019438061423162831820536287L),
      static_cast<double>(2637558u),
      static_cast<double>(248874557.8620541565114603864132294232163L),
      static_cast<double>(13339535u),
      static_cast<double>(1439720407.311721673663223072794912393972L),
      static_cast<double>(45995730u),
      static_cast<double>(6039542586.35202800506429164430729792107L),
      static_cast<double>(105258076u),
      static_cast<double>(17921034426.03720969991975575445893111267L),
      static_cast<double>(150917976u),
      static_cast<double>(35711959237.35566804944018545154716670596L),
      static_cast<double>(120543840u),
      static_cast<double>(42919803642.64909876895789904700198885093L),
      static_cast<double>(39916800u),
      static_cast<double>(23531376880.41075968857200767445163675473L),
      static_cast<double>(0u)
   };
   register __m128d vx = _mm_load1_pd(&x);
   register __m128d sum_even = _mm_load_pd(coeff);
   register __m128d sum_odd = _mm_load_pd(coeff+2);
   register __m128d nc_odd, nc_even;
   register __m128d vx2 = _mm_mul_pd(vx, vx);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 4);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 6);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 8);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 10);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 12);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 14);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 16);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 18);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 20);
   sum_odd = _mm_mul_pd(sum_odd, vx2);
   nc_odd = _mm_load_pd(coeff + 22);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_odd = _mm_add_pd(sum_odd, nc_odd);

   sum_even = _mm_mul_pd(sum_even, vx2);
   nc_even = _mm_load_pd(coeff + 24);
   sum_odd = _mm_mul_pd(sum_odd, vx);
   sum_even = _mm_add_pd(sum_even, nc_even);
   sum_even = _mm_add_pd(sum_even, sum_odd);


   double ALIGN16 t[2];
   _mm_store_pd(t, sum_even);
   
   return t[0] / t[1];
}
int main()
{
#ifndef __EMSCRIPTEN__
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
#endif

	printf ("{ \"workload\": %u, \"results\": [\n", N);
	assert(N%2 == 0); // Don't care about the tail for now.
	double *src = get_src_d();//(double*)aligned_alloc(16, N*sizeof(double));
	for(int i = 0; i < N; ++i)
		src[i] = (double)rand() / RAND_MAX;
	double *src2 = get_src2_d();//(double*)aligned_alloc(16, N*sizeof(double));
	for(int i = 0; i < N; ++i)
		src2[i] = (double)rand() / RAND_MAX;
	double *dst = get_dst_d();//(double*)aligned_alloc(16, N*sizeof(double));

	float scalarTime;
	SETCHART("load");
	START();
		for(int i = 0; i < N; ++i)
			dst[i] = src[i];
	ENDSCALAR(checksum_dst(dst), "scalar");

	LS_TEST("_mm_load_pd", _mm_load_pd, 0, _mm_store_pd, double*, 0, 2);
	LS_TEST("_mm_load_pd1", _mm_load_pd1, 1, _mm_store_pd, double*, 0, 2);
	LS_TEST("_mm_load_sd", _mm_load_sd, 1, _mm_store_pd, double*, 0, 2);
	// _mm_load_si128
	LS_TEST("_mm_load1_pd", _mm_load1_pd, 1, _mm_store_pd, double*, 0, 2);

	__m128d tempReg = _mm_set_pd(1.0, 2.0);
	LSH_TEST("_mm_loadh_pd", tempReg, _mm_loadh_pd, double*, 1, _mm_store_pd, double*, 0, 2);
	// _mm_loadl_epi64
	LSH_TEST("_mm_loadl_pd", tempReg, _mm_loadh_pd, double*, 1, _mm_store_pd, double*, 0, 2);

	LS_TEST("_mm_loadr_pd", _mm_loadr_pd, 0, _mm_store_pd, double*, 0, 2);
	LS_TEST("_mm_loadu_pd", _mm_loadu_pd, 1, _mm_store_pd, double*, 0, 2);
	// _mm_loadu_si128

	SETCHART("set");
/*	_mm_set_epi16
	_mm_set_epi32
	_mm_set_epi64
	_mm_set_epi64x
	_mm_set_epi8 */
	SS_TEST_D("_mm_set_pd", _mm_set_pd(src[i+2], src[i+0]));
	//SS_TEST_D("_mm_set_pd1", _mm_set_pd1(src[i]));
	SS_TEST_D("_mm_set_sd", _mm_set_sd(src[i]));
/*	_mm_set1_epi16
	_mm_set1_epi32
	_mm_set1_epi64
	_mm_set1_epi64x
	_mm_set1_epi8 */
	SS_TEST_D("_mm_set1_pd", _mm_set1_pd(src[i]));
/* 	_mm_setr_epi16
	_mm_setr_epi32
	_mm_setr_epi64
	_mm_setr_epi8 */
	SS_TEST_D("_mm_setr_pd", _mm_set_pd(src[i+2], src[i+0]));
	SS_TEST_D("_mm_setzero_pd", _mm_setzero_pd());
//	_mm_setzero_si128

	SETCHART("move");
	// _mm_move_epi64
	SS_TEST_D("_mm_move_sd", _mm_move_sd(_mm_load_pd(src+i), _mm_load_pd(src2+i)));

	SETCHART("store");
	// _mm_maskmoveu_si128
	LS_TEST("_mm_store_pd", _mm_load_pd, 0, _mm_store_pd, double*, 0, 2);
//	LS_TEST("_mm_store_pd1", _mm_load_pd, 0, _mm_store_pd1, double*, 0);
	LS_TEST("_mm_store_sd", _mm_load_pd, 0, _mm_store_sd, double*, 1, 2);
	// _mm_store_si128
	// _mm_store1_pd
	LS64_TEST("_mm_storeh_pi", _mm_load_pd, 0, _mm_storeh_pi, 1, 2);
	// _mm_storel_epi64
	LS64_TEST("_mm_storel_pi", _mm_load_pd, 0, _mm_storel_pi, 1, 2);
	LS_TEST("_mm_storer_pd", _mm_load_pd, 0, _mm_storer_pd, double*, 0, 2);
	LS_TEST("_mm_storeu_pd", _mm_load_pd, 0, _mm_storeu_pd, double*, 1, 2);
	// _mm_storeu_si128
	LS_TEST("_mm_stream_pd", _mm_load_pd, 0, _mm_stream_pd, double*, 0, 2);
	// _mm_stream_si128
	// _mm_stream_si32
	// _mm_stream_si64

	SETCHART("arithmetic");
	// _mm_add_epi16
	// _mm_add_epi32
	// _mm_add_epi64
	// _mm_add_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] += src2[0]; dst[1] += src2[1]; dst[2] += src2[2]; dst[3] += src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar add");
	BINARYOP_TEST_D("_mm_add_pd", _mm_add_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_add_sd", _mm_add_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_adds_epi16
	// _mm_adds_epi8
	// _mm_adds_epu16
	// _mm_adds_epu8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] /= src2[0]; dst[1] /= src2[1]; dst[2] /= src2[2]; dst[3] /= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar div");
	BINARYOP_TEST_D("_mm_div_pd", _mm_div_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_div_sd", _mm_div_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_madd_epi16
	// _mm_mul_epu32
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] *= src2[0]; dst[1] *= src2[1]; dst[2] *= src2[2]; dst[3] *= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar mul");
	BINARYOP_TEST_D("_mm_mul_pd", _mm_mul_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_mul_sd", _mm_mul_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_mulhi_epi16
	// _mm_mulhi_epu16
	// _mm_mullo_epi16
	// _mm_sad_epu8
	// _mm_sub_epi16
	// _mm_sub_epi32
	// _mm_sub_epi64
	// _mm_sub_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] -= src2[0]; dst[1] -= src2[1]; dst[2] -= src2[2]; dst[3] -= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar sub");
	BINARYOP_TEST_D("_mm_sub_pd", _mm_sub_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_sub_sd", _mm_sub_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_subs_epi16
	// _mm_subs_epi8
	// _mm_subs_epu16
	// _mm_subs_epu8

	SETCHART("roots");
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = sqrt(dst[0]); dst[1] = sqrt(dst[1]); dst[2] = sqrt(dst[2]); dst[3] = sqrt(dst[3]); } ENDSCALAR(checksum_dst(dst), "scalar sqrt");
	UNARYOP_TEST_D("_mm_sqrt_pd", _mm_sqrt_pd, _mm_load_pd(src));
//	UNARYOP_TEST_D("_mm_sqrt_sd", _mm_sqrt_sd, _mm_load_pd(src));

	SETCHART("logical");
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd(dcastu(dst[0]) & dcastu(src2[0])); dst[1] = ucastd(dcastu(dst[1]) & dcastu(src2[1])); dst[2] = ucastd(dcastu(dst[2]) & dcastu(src2[2])); dst[3] = ucastd(dcastu(dst[3]) & dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar and");
	BINARYOP_TEST_D("_mm_and_pd", _mm_and_pd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_and_si128
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd((~dcastu(dst[0])) & dcastu(src2[0])); dst[1] = ucastd((~dcastu(dst[1])) & dcastu(src2[1])); dst[2] = ucastd((~dcastu(dst[2])) & dcastu(src2[2])); dst[3] = ucastd((~dcastu(dst[3])) & dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar andnot");
	BINARYOP_TEST_D("_mm_andnot_pd", _mm_andnot_pd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_andnot_si128
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd(dcastu(dst[0]) | dcastu(src2[0])); dst[1] = ucastd(dcastu(dst[1]) | dcastu(src2[1])); dst[2] = ucastd(dcastu(dst[2]) | dcastu(src2[2])); dst[3] = ucastd(dcastu(dst[3]) | dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar or");
	BINARYOP_TEST_D("_mm_or_pd", _mm_or_pd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_or_si128
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd(dcastu(dst[0]) ^ dcastu(src2[0])); dst[1] = ucastd(dcastu(dst[1]) ^ dcastu(src2[1])); dst[2] = ucastd(dcastu(dst[2]) ^ dcastu(src2[2])); dst[3] = ucastd(dcastu(dst[3]) ^ dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar xor");
	BINARYOP_TEST_D("_mm_xor_pd", _mm_xor_pd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_xor_si128

	SETCHART("cmp");
	// _mm_cmpeq_epi16
	// _mm_cmpeq_epi32
	// _mm_cmpeq_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] == src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] == src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] == src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] == src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp==");
	BINARYOP_TEST_D("_mm_cmpeq_pd", _mm_cmpeq_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmpeq_sd", _mm_cmpeq_sd, _mm_load_pd(src), _mm_load_pd(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] >= src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] >= src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] >= src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] >= src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp>=");
	BINARYOP_TEST_D("_mm_cmpge_pd", _mm_cmpge_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmpge_sd", _mm_cmpge_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_cmpgt_epi16
	// _mm_cmpgt_epi32
	// _mm_cmpgt_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] > src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] > src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] > src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] > src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp>");
	BINARYOP_TEST_D("_mm_cmpgt_pd", _mm_cmpgt_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmpgt_sd", _mm_cmpgt_sd, _mm_load_pd(src), _mm_load_pd(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] <= src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] <= src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] <= src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] <= src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp<=");
	BINARYOP_TEST_D("_mm_cmple_pd", _mm_cmple_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmple_sd", _mm_cmple_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_cmplt_epi16
	// _mm_cmplt_epi32
	// _mm_cmplt_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] < src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] < src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] < src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] < src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp<");
	BINARYOP_TEST_D("_mm_cmplt_pd", _mm_cmplt_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmplt_sd", _mm_cmplt_sd, _mm_load_pd(src), _mm_load_pd(src2));

	/*_mm_cmpneq_pd
	_mm_cmpneq_sd
	_mm_cmpnge_pd
	_mm_cmpnge_sd
	_mm_cmpngt_pd
	_mm_cmpngt_sd
	_mm_cmpnle_pd
	_mm_cmpnle_sd
	_mm_cmpnlt_pd
	_mm_cmpnlt_sd*/

	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (!Isnan(dst[0]) && !Isnan(src2[0])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (!Isnan(dst[1]) && !Isnan(src2[1])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (!Isnan(dst[2]) && !Isnan(src2[2])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (!Isnan(dst[3]) && !Isnan(src2[3])) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmpord");
	BINARYOP_TEST_D("_mm_cmpord_pd", _mm_cmpord_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmpord_sd", _mm_cmpord_sd, _mm_load_pd(src), _mm_load_pd(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (Isnan(dst[0]) || Isnan(src2[0])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (Isnan(dst[1]) || Isnan(src2[1])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (Isnan(dst[2]) || Isnan(src2[2])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (Isnan(dst[3]) || Isnan(src2[3])) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmpunord");
	BINARYOP_TEST_D("_mm_cmpunord_pd", _mm_cmpunord_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmpunord_sd", _mm_cmpunord_sd, _mm_load_pd(src), _mm_load_pd(src2));

	SETCHART("max");
	// _mm_max_epi16
	// _mm_max_epu8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = Max(dst[0], src2[0]); dst[1] = Max(dst[1], src2[1]); dst[2] = Max(dst[2], src2[2]); dst[3] = Max(dst[3], src2[3]); } ENDSCALAR(checksum_dst(dst), "scalar max");
	BINARYOP_TEST_D("_mm_max_pd", _mm_max_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_max_sd", _mm_max_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_min_epi16
	// _mm_min_epu8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = Min(dst[0], src2[0]); dst[1] = Min(dst[1], src2[1]); dst[2] = Min(dst[2], src2[2]); dst[3] = Min(dst[3], src2[3]); } ENDSCALAR(checksum_dst(dst), "scalar min");
	BINARYOP_TEST_D("_mm_min_pd", _mm_min_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_min_sd", _mm_min_sd, _mm_load_pd(src), _mm_load_pd(src2));

	SETCHART("shuffle");
	// _mm_extract_epi16
	// _mm_insert_epi16
	// _mm_shuffle_epi32
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[3] = dst[1]; dst[2] = dst[0]; dst[1] = src2[3]; dst[0] = src2[2]; } ENDSCALAR(checksum_dst(dst), "scalar shuffle");
//	BINARYOP_TEST_D("_mm_shuffle_pd", _mm_shuffle_pd, _mm_load_pd(src), _mm_load_pd(src2));
	START();
		__m128d o0 = _mm_load_pd(src);
		__m128d o1 = _mm_load_pd(src2);
		for(int i = 0; i < N; i += 4)
			o0 = _mm_shuffle_pd(o0, o1, _MM_SHUFFLE2(1, 0));
		_mm_store_pd(dst, o0);
	END(checksum_dst(dst), "_mm_shuffle_pd");

	// _mm_shufflehi_epi16
	// _mm_shufflelo_epi16
	// _mm_unpackhi_epi16
	// _mm_unpackhi_epi32
	// _mm_unpackhi_epi64
	// _mm_unpackhi_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = dst[2]; dst[1] = src2[2]; dst[2] = dst[3]; dst[3] = src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar unpackhi_pd");
	BINARYOP_TEST_D("_mm_unpackhi_pd", _mm_unpackhi_pd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_unpacklo_epi16
	// _mm_unpacklo_epi32
	// _mm_unpacklo_epi64
	// _mm_unpacklo_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[2] = dst[1]; dst[1] = dst[0]; dst[0] = src2[0]; dst[3] = src2[1]; } ENDSCALAR(checksum_dst(dst), "scalar unpacklo_pd");
	BINARYOP_TEST_D("_mm_unpacklo_pd", _mm_unpacklo_pd, _mm_load_pd(src), _mm_load_pd(src2));
	printf("]}\n");
/*
	printf("Finished!\n");
	printf("Total time spent in scalar intrinsics: %f msecs.\n", (double)scalarTotalTicks * 1000.0 / ticks_per_sec());
	printf("Total time spent in SSE1 intrinsics: %f msecs.\n", (double)simdTotalTicks * 1000.0 / ticks_per_sec());
	if (scalarTotalTicks > simdTotalTicks)
		printf("SSE1 was %.3fx faster than scalar!\n", (double)scalarTotalTicks / simdTotalTicks);
	else
		printf("SSE1 was %.3fx slower than scalar!\n", (double)simdTotalTicks / scalarTotalTicks);
*/
#ifdef __EMSCRIPTEN__
	fprintf(stderr,"User Agent: %s\n", emscripten_run_script_string("navigator.userAgent"));
	printf("/*Test finished! Now please close Firefox to continue with benchmark_sse2.py.*/\n");
#endif
	exit(0);
}
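/* The scalar reference loops above use ucastd()/dcastu() bit-cast helpers
 * that are not shown in this excerpt; the real definitions live elsewhere
 * in the benchmark source and may differ (for instance in operand width).
 * A typical, strict-aliasing-safe way to write such helpers: */
#include <stdint.h>
#include <string.h>

static inline uint64_t dcastu(double d)   { uint64_t u; memcpy(&u, &d, sizeof u); return u; }
static inline double   ucastd(uint64_t u) { double d;   memcpy(&d, &u, sizeof d); return d; }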
Example #7
/*
 * Subtract off x0 & x1 contribution to all remaining equations using a
 * rank-2 update with mu=2, nu=3, ku=2.  This version is for 16 SSE regs.
 * nu is the # of RHS, ku is the number of equations solved, and mu is
 * unrolled only to enable vectorization & software pipelining of load/use.
 * Loop order is MKN, so that B is kept completely in registers, and
 * C and A are streamed in (and out, for C) from cache during the operation.
 */
ATL_SINLINE void ATL_rk2(ATL_CINT M, const TYPE *pA0, const TYPE *pA1,
                         const TYPE *pB0, const TYPE *pB1,
                         TYPE *C, ATL_CINT ldc0)
{
   ATL_CINT ldc=ldc0+ldc0;
   TYPE *pC0 = C, *pC1 = C+ldc, *pC2 = C+((ldc)<<1);
   ATL_INT i;
   ATL_CINT MM = (M&1) ? M-1 : M-2;
   register __m128d B00, B10, B01, B11, B02, B12;
   register __m128d C00, C01, C02, C10, C11, C12;
   register __m128d A, a;

   B00 = _mm_load_pd(pB0);
   B10 = _mm_load_pd(pB1);
   B01 = _mm_load_pd(pB0+2);
   B11 = _mm_load_pd(pB1+2);
   B02 = _mm_load_pd(pB0+4);
   B12 = _mm_load_pd(pB1+4);    	/* iB12, rB12 */


   C00 = _mm_load_pd(pC0);
   C01 = _mm_load_pd(pC1);
   C02 = _mm_load_pd(pC2);
   A = _mm_load_pd(pA0);                		/* iA00, rA00 */
   for (i=0; i < MM; i += 2, pA0 += 4, pA1 += 4, pC0 += 4, pC1 += 4, pC2 += 4)
   {
      register __m128d b;
/*
 *    K=0, M=[0,1], apply real components of B0x
 */
      b = _mm_movedup_pd(B00);			/* rB00,      rB00 */
      b = _mm_mul_pd(b, A);                     /* iA00*rB00, rA00*rB00 */
      C00 = _mm_add_pd(C00, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA00, iA00 */
      b = _mm_movedup_pd(B01);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
         C10 = _mm_load_pd(pC0+2);
      b = _mm_movedup_pd(B02);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
         A = _mm_load_pd(pA1);                		/* iA01, rA01 */
/*
 *    K=0, M=0, apply imaginary components of B0x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */
      b = _mm_mul_pd(b, a);                     /* rA00*iB00, iA00*iB00 */
      C00 = _mm_addsub_pd(C00, b);
         C11 = _mm_load_pd(pC1+2);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
         C12 = _mm_load_pd(pC2+2);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
/*
 *    K=1, M=0, apply real components of B1x
 */
      b = _mm_movedup_pd(B10);			/* rB10,      rB10 */
      b = _mm_mul_pd(b, A);                     /* iA01*rB10, rA01*rB10 */
      C00 = _mm_add_pd(C00, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA01, iA01 */
      b = _mm_movedup_pd(B11);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
      b = _mm_movedup_pd(B12);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
         A = _mm_load_pd(pA0+2);                /* iA10, rA10 */
/*
 *    K=1, M=0, apply imaginary components of B1x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */
      b = _mm_mul_pd(b, a);                     /* rA01*iB10, iA01*iB10 */
      C00 = _mm_addsub_pd(C00, b);
         _mm_store_pd(pC0, C00);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
         _mm_store_pd(pC1, C01);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
         _mm_store_pd(pC2, C02);
/*
 *    K=0, M=1, apply real components of B0x
 */
      b = _mm_movedup_pd(B00);			/* rB00,      rB00 */
      b = _mm_mul_pd(b, A);                     /* iA10*rB00, rA10*rB00 */
      C10 = _mm_add_pd(C10, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA10, iA10 */
      b = _mm_movedup_pd(B01);
      b = _mm_mul_pd(b, A);
      C11 = _mm_add_pd(C11, b);
         C00 = _mm_load_pd(pC0+4);
      b = _mm_movedup_pd(B02);
      b = _mm_mul_pd(b, A);
      C12 = _mm_add_pd(C12, b);
         A = _mm_load_pd(pA1+2);               		/* iA11, rA11 */
/*
 *    K=0, M=1, apply imaginary components of B0x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */
      b = _mm_mul_pd(b, a);                     /* rA10*iB00, iA10*iB00 */
      C10 = _mm_addsub_pd(C10, b);
         C01 = _mm_load_pd(pC1+4);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
      b = _mm_mul_pd(b, a);
      C11 = _mm_addsub_pd(C11, b);
         C02 = _mm_load_pd(pC2+4);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
      b = _mm_mul_pd(b, a);
      C12 = _mm_addsub_pd(C12, b);
/*
 *    K=1, M=1, apply real components of B1x
 */
      b = _mm_movedup_pd(B10);			/* rB10,      rB10 */
      b = _mm_mul_pd(b, A);                     /* iA11*rB10, rA11*rB10 */
      C10 = _mm_add_pd(C10, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA11, iA11 */
      b = _mm_movedup_pd(B11);
      b = _mm_mul_pd(b, A);
      C11 = _mm_add_pd(C11, b);
      b = _mm_movedup_pd(B12);
      b = _mm_mul_pd(b, A);
      C12 = _mm_add_pd(C12, b);
         A = _mm_load_pd(pA0+4);               		/* iA20, rA20 */
/*
 *    K=1, M=1, apply imaginary components of B1x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */
      b = _mm_mul_pd(b, a);                     /* rA11*iB10, iA11*iB10 */
      C10 = _mm_addsub_pd(C10, b);
         _mm_store_pd(pC0+2, C10);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
      b = _mm_mul_pd(b, a);
      C11 = _mm_addsub_pd(C11, b);
         _mm_store_pd(pC1+2, C11);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
      b = _mm_mul_pd(b, a);
      C12 = _mm_addsub_pd(C12, b);
         _mm_store_pd(pC2+2, C12);
   }
/*
 * Drain pipes
 */
   {
      register __m128d b;
/*
 *    K=0, M=[0,1], apply real components of B0x
 */
      b = _mm_movedup_pd(B00);			/* rB00,      rB00 */
      b = _mm_mul_pd(b, A);                     /* iA00*rB00, rA00*rB00 */
      C00 = _mm_add_pd(C00, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA00, iA00 */
      b = _mm_movedup_pd(B01);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
      b = _mm_movedup_pd(B02);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
         A = _mm_load_pd(pA1);                		/* iA01, rA01 */
/*
 *    K=0, M=0, apply imaginary components of B0x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */
      b = _mm_mul_pd(b, a);                     /* rA00*iB00, iA00*iB00 */
      C00 = _mm_addsub_pd(C00, b);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
/*
 *    K=1, M=0, apply real components of B1x
 */
      b = _mm_movedup_pd(B10);			/* rB10,      rB10 */
      b = _mm_mul_pd(b, A);                     /* iA01*rB10, rA01*rB10 */
      C00 = _mm_add_pd(C00, b);
      a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA01, iA01 */
      b = _mm_movedup_pd(B11);
      b = _mm_mul_pd(b, A);
      C01 = _mm_add_pd(C01, b);
      b = _mm_movedup_pd(B12);
      b = _mm_mul_pd(b, A);
      C02 = _mm_add_pd(C02, b);
/*
 *    K=1, M=0, apply imaginary components of B1x
 */
      b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */
      b = _mm_mul_pd(b, a);                     /* rA01*iB10, iA01*iB10 */
      C00 = _mm_addsub_pd(C00, b);
         _mm_store_pd(pC0, C00);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
      b = _mm_mul_pd(b, a);
      C01 = _mm_addsub_pd(C01, b);
         _mm_store_pd(pC1, C01);
      b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
      b = _mm_mul_pd(b, a);
      C02 = _mm_addsub_pd(C02, b);
         _mm_store_pd(pC2, C02);
      if (!(M&1))
      {
         C10 = _mm_load_pd(pC0+2);
         C11 = _mm_load_pd(pC1+2);
         C12 = _mm_load_pd(pC2+2);
         A = _mm_load_pd(pA0+2);                /* iA10, rA10 */
/*
 *       K=0, M=1, apply real components of B0x
 */
         b = _mm_movedup_pd(B00);			/* rB00,      rB00 */
         b = _mm_mul_pd(b, A);                     /* iA10*rB00, rA10*rB00 */
         C10 = _mm_add_pd(C10, b);
            a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA10, iA10 */
         b = _mm_movedup_pd(B01);
         b = _mm_mul_pd(b, A);
         C11 = _mm_add_pd(C11, b);
         b = _mm_movedup_pd(B02);
         b = _mm_mul_pd(b, A);
         C12 = _mm_add_pd(C12, b);
            A = _mm_load_pd(pA1+2);               		/* iA11, rA11 */
/*
 *       K=0, M=1, apply imaginary components of B0x
 */
         b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */
         b = _mm_mul_pd(b, a);                     /* rA10*iB00, iA10*iB00 */
         C10 = _mm_addsub_pd(C10, b);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE);
         b = _mm_mul_pd(b, a);
         C11 = _mm_addsub_pd(C11, b);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE);
         b = _mm_mul_pd(b, a);
         C12 = _mm_addsub_pd(C12, b);
/*
 *       K=1, M=1, apply real components of B1x
 */
         b = _mm_movedup_pd(B10);			/* rB10,      rB10 */
         b = _mm_mul_pd(b, A);                     /* iA11*rB10, rA11*rB10 */
         C10 = _mm_add_pd(C10, b);
         a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E);  	/* rA11, iA11 */
         b = _mm_movedup_pd(B11);
         b = _mm_mul_pd(b, A);
         C11 = _mm_add_pd(C11, b);
         b = _mm_movedup_pd(B12);
         b = _mm_mul_pd(b, A);
         C12 = _mm_add_pd(C12, b);
/*
 *       K=1, M=1, apply imaginary components of B1x
 */
         b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */
         b = _mm_mul_pd(b, a);                     /* rA11*iB10, iA11*iB10 */
         C10 = _mm_addsub_pd(C10, b);
            _mm_store_pd(pC0+2, C10);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE);
         b = _mm_mul_pd(b, a);
         C11 = _mm_addsub_pd(C11, b);
            _mm_store_pd(pC1+2, C11);
         b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE);
         b = _mm_mul_pd(b, a);
         C12 = _mm_addsub_pd(C12, b);
            _mm_store_pd(pC2+2, C12);
      }
   }
}
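/* Each movedup/shuffle/addsub group in ATL_rk2 above is the SSE3 idiom for
 * one complex multiply-accumulate. With the low lane holding the real part
 * and the high lane the imaginary part (matching the iAxx,rAxx comments),
 * the two steps reduce to this scalar sketch (hypothetical helper name): */
static void cplx_mac(double *cr, double *ci,
                     double ar, double ai, double br, double bi)
{
   /* step 1: b = {br,br}, C += A*b          -> cr += ar*br, ci += ai*br */
   *cr += ar * br;
   *ci += ai * br;
   /* step 2: b = {bi,bi}, C = addsub(C,a*b) with a = {ai,ar} swapped
    *                                        -> cr -= ai*bi, ci += ar*bi */
   *cr -= ai * bi;
   *ci += ar * bi;
}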
void blurRemoveMinMax_(Mat& src, Mat& dest, const int r)
{
	const Size ksize = Size(2*r+1,2*r+1);
	if(src.data!=dest.data)src.copyTo(dest);

	Mat xv;
	Mat nv;
	Mat element = Mat::ones(2*r+1,2*r+1,CV_8U);
	dilate(src,xv,element);
	erode(src,nv,element);

	Mat mind;
	Mat maxd;
	Mat mask;
	absdiff(src,nv,mind);//can move to loop
	absdiff(src,xv,maxd);//
	min(mind,maxd,mask);//

	T* n = nv.ptr<T>(0);
	T* x = xv.ptr<T>(0);
	T* d = dest.ptr<T>(0);
	T* nd = mind.ptr<T>(0);
	T* mk = mask.ptr<T>(0);

	int remsize = src.size().area();

#if CV_SSE4_1
	if(src.depth()==CV_8U)
	{

		const int ssesize = src.size().area()/16;
		remsize = src.size().area()-ssesize*16;	
		for(int i=0;i<ssesize;i++)
		{
			__m128i mmk = _mm_load_si128((__m128i*)mk);
			__m128i mnd = _mm_load_si128((__m128i*)nd);

			__m128i mmn = _mm_load_si128((__m128i*)n);
			__m128i mmx = _mm_load_si128((__m128i*)x);
			__m128i msk =  _mm_cmpeq_epi8(mnd,mmk);
			_mm_store_si128((__m128i*)d,_mm_blendv_epi8(mmx,mmn,msk));
			nd+=16;
			mk+=16;
			d+=16;
			n+=16;
			x+=16;
		}
	}
	else if(src.depth()==CV_16S || src.depth()==CV_16U)
	{

		const int ssesize = src.size().area()/8;
		remsize = src.size().area()-ssesize*8;	
		for(int i=0;i<ssesize;i++)
		{
			__m128i mmk = _mm_load_si128((__m128i*)mk);
			__m128i mnd = _mm_load_si128((__m128i*)nd);

			__m128i mmn = _mm_load_si128((__m128i*)n);
			__m128i mmx = _mm_load_si128((__m128i*)x);
			__m128i msk =  _mm_cmpeq_epi16(mnd,mmk);
			_mm_store_si128((__m128i*)d,_mm_blendv_epi8(mmx,mmn,msk));
			nd+=8;
			mk+=8;
			d+=8;
			n+=8;
			x+=8;
		}
	}
	else if(src.depth()==CV_32F)
	{

		const int ssesize = src.size().area()/4;
		remsize = src.size().area()-ssesize*4;	
		for(int i=0;i<ssesize;i++)
		{
			__m128 mmk = _mm_load_ps((float*)mk);
			__m128 mnd = _mm_load_ps((float*)nd);

			__m128 mmn = _mm_load_ps((float*)n);
			__m128 mmx = _mm_load_ps((float*)x);
			__m128 msk =  _mm_cmpeq_ps(mnd,mmk);
			_mm_store_ps((float*)d,_mm_blendv_ps(mmx,mmn,msk));
			nd+=4;
			mk+=4;
			d+=4;
			n+=4;
			x+=4;
		}
	}
	else if(src.depth()==CV_64F)
	{

		const int ssesize = src.size().area()/2;
		remsize = src.size().area()-ssesize*2;	
		for(int i=0;i<ssesize;i++)
		{
			__m128d mmk = _mm_load_pd((double*)mk);
			__m128d mnd = _mm_load_pd((double*)nd);

			__m128d mmn = _mm_load_pd((double*)n);
			__m128d mmx = _mm_load_pd((double*)x);
			__m128d msk =  _mm_cmpeq_pd(mnd,mmk);
			_mm_store_pd((double*)d,_mm_blendv_pd(mmx,mmn,msk));
			nd+=2;
			mk+=2;
			d+=2;
			n+=2;
			x+=2;
		}
	}
#endif
	for(int i=0;i<remsize;i++)
	{
		{
			if(nd[i]==mk[i])
			{
				d[i]=n[i];
			}
			else
			{
				d[i]=x[i];
			}
		}
	}
}
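/* Every SIMD branch above is the same compare-and-select as the scalar tail
 * loop: d[i] = (nd[i] == mk[i]) ? n[i] : x[i]. A sketch of the pattern for
 * the CV_64F case (assumes SSE4.1 headers, as the CV_SSE4_1 guard implies;
 * the helper name is hypothetical). _mm_cmpeq_pd yields an all-ones lane
 * where equal, and _mm_blendv_pd picks its second operand where the mask
 * is set. */
static inline __m128d select2(const double *nd, const double *mk,
                              const double *n, const double *x)
{
	__m128d msk = _mm_cmpeq_pd(_mm_load_pd(nd), _mm_load_pd(mk));
	return _mm_blendv_pd(_mm_load_pd(x), _mm_load_pd(n), msk);
}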
static inline __m128d
my_invrsq_pd(__m128d x)
{
	const __m128d three = (const __m128d) {3.0, 3.0};
	const __m128d half  = (const __m128d) {0.5, 0.5};
	
	__m128  t  = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
	__m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */
	
	/* First Newton-Raphson step, accuracy is now 24 bits */
	__m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1)))));
	
	/* Return second Newton-Raphson step, accuracy 48 bits */
	return (__m128d) _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2)))));
}
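
/* The two refinement steps above follow from Newton-Raphson applied to
 * f(y) = 1/y^2 - x:
 *   y_{n+1} = y_n - f(y_n)/f'(y_n) = 0.5 * y_n * (3 - x * y_n^2)
 * Each step roughly doubles the number of correct bits, lifting the ~12-bit
 * _mm_rsqrt_ps estimate to ~24 and then ~48 bits, as the comments note. */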

/* to extract single integers from a __m128i datatype */
#define _mm_extract_epi64(x, imm) \
_mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
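/* Note: despite its name, this macro extracts the 32-bit value at byte
 * offset 4*(imm), which is where _mm_slli_epi64(n0,2) leaves the two scaled
 * table indices below; it also shadows the SSE4.1 intrinsic of the same
 * name on compilers that provide it. */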

void nb_kernel430_ia32_sse2(int *           p_nri,
							int *           iinr,
							int *           jindex,
							int *           jjnr,
							int *           shift,
							double *         shiftvec,
							double *         fshift,
							int *           gid,
							double *         pos,
							double *         faction,
							double *         charge,
							double *         p_facel,
							double *         p_krf,
							double *         p_crf,
							double *         Vc,
							int *           type,
							int *           p_ntype,
							double *         vdwparam,
							double *         Vvdw,
							double *         p_tabscale,
							double *         VFtab,
							double *         invsqrta,
							double *         dvda,
							double *         p_gbtabscale,
							double *         GBtab,
							int *           p_nthreads,
							int *           count,
							void *          mtx,
							int *           outeriter,
							int *           inneriter,
							double *         work)
{
	int           nri,ntype,nthreads,offset,tj,tj2,nti;
	int           n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
	double        facel,krf,crf,tabscl,gbtabscl,vct,vdwt,vgbt,nt1,nt2;
	double        shX,shY,shZ,isai_d,dva;
	gmx_gbdata_t *gbdata;
	double *       gpol;

	__m128d       ix,iy,iz,jx,jy,jz;
	__m128d		  dx,dy,dz,t1,t2,t3;
	__m128d		  fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
	__m128d		  q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
	__m128d       Y,F,G,H,Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp,dvdasum,vctot,n0d;
	__m128d		  xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
	__m128d       c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,vgbtot,rinvsq,rinvsix;
	__m128d       fac,tabscale,gbtabscale;
	__m128i       n0,nnn;
	
	const __m128d neg    = {-1.0,-1.0};
	const __m128d zero   = {0.0,0.0};
	const __m128d half   = {0.5,0.5};
	const __m128d two    = {2.0,2.0};
	const __m128d three  = {3.0,3.0};
	const __m128d six    = {6.0,6.0};
	const __m128d twelwe = {12.0,12.0};
	
	const __m128i four   = _mm_set_epi32(4,4,4,4);
	
	gbdata     = (gmx_gbdata_t *)work;
	gpol       = gbdata->gpol;

	nri        = *p_nri;
	ntype      = *p_ntype;
	nthreads   = *p_nthreads; 
    facel      = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent));       
	krf        = *p_krf;
	crf        = *p_crf;
	tabscl     = *p_tabscale;
	gbtabscl   = *p_gbtabscale;
	nj1        = 0;
	
	/* Splat variables */
	fac        = _mm_load1_pd(&facel);
	tabscale   = _mm_load1_pd(&tabscl);
	gbtabscale = _mm_load1_pd(&gbtabscl);
		
	/* Keep compiler happy */
	Vvdwtmp = _mm_setzero_pd();
	Vvdwtot = _mm_setzero_pd();
	dvdatmp = _mm_setzero_pd();
	dvdaj   = _mm_setzero_pd();
	isaj    = _mm_setzero_pd();
	vcoul   = _mm_setzero_pd();
	vgb     = _mm_setzero_pd();
	t1      = _mm_setzero_pd();
	t2      = _mm_setzero_pd();
	t3      = _mm_setzero_pd();
	xmm1    = _mm_setzero_pd();
	xmm2    = _mm_setzero_pd();
	xmm3    = _mm_setzero_pd();
	xmm4    = _mm_setzero_pd();
	jnr1    = jnr2 = 0;
	j13     = j23  = 0;
		
	for(n=0;n<nri;n++)
	{
		is3     = 3*shift[n];
		shX     = shiftvec[is3];
		shY     = shiftvec[is3+1];
		shZ     = shiftvec[is3+2];
		
		nj0     = jindex[n];      
        nj1     = jindex[n+1];  
		offset  = (nj1-nj0)%2;
		
		ii      = iinr[n];
		ii3     = ii*3;
		
		ix      = _mm_set1_pd(shX+pos[ii3+0]);
		iy      = _mm_set1_pd(shY+pos[ii3+1]);
		iz      = _mm_set1_pd(shZ+pos[ii3+2]); 
		q       = _mm_set1_pd(charge[ii]);
		
		iq      = _mm_mul_pd(fac,q); 
		isai_d  = invsqrta[ii];
		isai    = _mm_load1_pd(&isai_d);
		
		nti      = 2*ntype*type[ii];
		
		fix     = _mm_setzero_pd();
		fiy     = _mm_setzero_pd();
		fiz     = _mm_setzero_pd();
		dvdasum = _mm_setzero_pd();
		vctot   = _mm_setzero_pd();
		vgbtot  = _mm_setzero_pd();
		Vvdwtot = _mm_setzero_pd();
		
		for(k=nj0;k<nj1-offset; k+=2)
		{
			jnr1    = jjnr[k];
			jnr2    = jjnr[k+1];
			
			j13     = jnr1 * 3;
			j23     = jnr2 * 3;
			
			/* Load coordinates */
			xmm1    = _mm_loadu_pd(pos+j13); /* x1 y1 */
			xmm2    = _mm_loadu_pd(pos+j23); /* x2 y2 */
			
			xmm5    = _mm_load_sd(pos+j13+2); /* z1 - */
			xmm6    = _mm_load_sd(pos+j23+2); /* z2 - */
			
			/* transpose */
			jx      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
			jy      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
			jz      = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); 
			
			/* distances */
			dx      = _mm_sub_pd(ix,jx);
			dy		= _mm_sub_pd(iy,jy);
			dz		= _mm_sub_pd(iz,jz);
			
			rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv    = my_invrsq_pd(rsq11);
			
			/* Load invsqrta */
			isaj	= _mm_loadl_pd(isaj,invsqrta+jnr1);
			isaj	= _mm_loadh_pd(isaj,invsqrta+jnr2);
			isaprod = _mm_mul_pd(isai,isaj);
			
			/* Load charges */
			q		= _mm_loadl_pd(q,charge+jnr1);
			q		= _mm_loadh_pd(q,charge+jnr2);
			qq		= _mm_mul_pd(iq,q);
			
			vcoul	= _mm_mul_pd(qq,rinv);
			fscal	= _mm_mul_pd(vcoul,rinv);
			qq		= _mm_mul_pd(isaprod,qq);
			qq		= _mm_mul_pd(qq,neg);
			gbscale	= _mm_mul_pd(isaprod,gbtabscale);
			
			/* Load VdW parameters */
			tj      = nti+2*type[jnr1];
			tj2     = nti+2*type[jnr2];
			
			xmm1    = _mm_loadu_pd(vdwparam+tj);
			xmm2    = _mm_loadu_pd(vdwparam+tj2);
			c6      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
			c12     = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
						
			/* Load dvdaj */
			dvdaj	= _mm_loadl_pd(dvdaj, dvda+jnr1);
			dvdaj	= _mm_loadh_pd(dvdaj, dvda+jnr2);
			
			/* Calculate GB table index */
			r		= _mm_mul_pd(rsq11,rinv);
			rt		= _mm_mul_pd(r,gbscale);
			n0		= _mm_cvttpd_epi32(rt);
			n0d		= _mm_cvtepi32_pd(n0);
			eps		= _mm_sub_pd(rt,n0d);
			eps2	= _mm_mul_pd(eps,eps);
			
			/* n0 holds two 32-bit table indices; shift and extract them with
			 * the epi32 variants (the epi64 forms would mix the two lanes)
			 */
			nnn		= _mm_slli_epi32(n0,2);
			
			xmm1	= _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G		= _mm_mul_pd(G,eps);
			H		= _mm_mul_pd(H,eps2);
			Fp		= _mm_add_pd(F,G);
			Fp		= _mm_add_pd(Fp,H);
			VV		= _mm_mul_pd(Fp,eps);
			VV		= _mm_add_pd(Y,VV);
			H		= _mm_mul_pd(two,H);
			FF		= _mm_add_pd(Fp,G);
			FF		= _mm_add_pd(FF,H);
			vgb		= _mm_mul_pd(qq,VV);
			fijC	= _mm_mul_pd(qq,FF);
			fijC	= _mm_mul_pd(fijC,gbscale);
			
			dvdatmp = _mm_mul_pd(fijC,r);
			dvdatmp	= _mm_add_pd(vgb,dvdatmp);
			dvdatmp = _mm_mul_pd(dvdatmp,neg);
			dvdatmp = _mm_mul_pd(dvdatmp,half);
			dvdasum	= _mm_add_pd(dvdasum,dvdatmp);
			
			xmm1	= _mm_mul_pd(dvdatmp,isaj);
			xmm1	= _mm_mul_pd(xmm1,isaj);
			dvdaj	= _mm_add_pd(dvdaj,xmm1);
			
			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);
			_mm_storeh_pd(dvda+jnr2,dvdaj);
			
			vctot	= _mm_add_pd(vctot,vcoul);
			vgbtot	= _mm_add_pd(vgbtot,vgb);
			
			/* Calculate VDW table index */
			rt      = _mm_mul_pd(r,tabscale);
			n0      = _mm_cvttpd_epi32(rt);
			n0d     = _mm_cvtepi32_pd(n0);
			eps     = _mm_sub_pd(rt,n0d);
			eps2    = _mm_mul_pd(eps,eps);
			nnn     = _mm_slli_epi32(n0,3);
			
			/* Tabulated VdW interaction - dispersion */
			xmm1	= _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G       = _mm_mul_pd(G,eps);
			H       = _mm_mul_pd(H,eps2);
			Fp      = _mm_add_pd(F,G);
			Fp      = _mm_add_pd(Fp,H);
			VV      = _mm_mul_pd(Fp,eps);
			VV      = _mm_add_pd(Y,VV);
			xmm1    = _mm_mul_pd(two,H);
			FF      = _mm_add_pd(Fp,G);
			FF      = _mm_add_pd(FF,xmm1);
			
			Vvdw6   = _mm_mul_pd(c6,VV);
			fijD    = _mm_mul_pd(c6,FF);
			
			/* Tabulated VdW interaction - repulsion */
			nnn     = _mm_add_epi32(nnn,four);
			
			xmm1	= _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G       = _mm_mul_pd(G,eps);
			H       = _mm_mul_pd(H,eps2);
			Fp      = _mm_add_pd(F,G);
			Fp      = _mm_add_pd(Fp,H);
			VV      = _mm_mul_pd(Fp,eps);
			VV      = _mm_add_pd(Y,VV);
			xmm1    = _mm_mul_pd(two,H);
			FF      = _mm_add_pd(Fp,G);
			FF      = _mm_add_pd(FF,xmm1);
			
			Vvdw12  = _mm_mul_pd(c12,VV);
			fijR    = _mm_mul_pd(c12,FF);
			
			Vvdwtmp = _mm_add_pd(Vvdw12,Vvdw6);
			Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
			
			xmm1    = _mm_add_pd(fijD,fijR);
			xmm1    = _mm_mul_pd(xmm1,tabscale);
			xmm1    = _mm_add_pd(xmm1,fijC);
			xmm1    = _mm_sub_pd(xmm1,fscal);
			fscal   = _mm_mul_pd(xmm1,neg);
			fscal   = _mm_mul_pd(fscal,rinv);
						
			/* calculate partial force terms */
			t1		= _mm_mul_pd(fscal,dx);
			t2		= _mm_mul_pd(fscal,dy);
			t3		= _mm_mul_pd(fscal,dz);
			
			/* update the i force */
			fix		= _mm_add_pd(fix,t1);
			fiy		= _mm_add_pd(fiy,t2);
			fiz		= _mm_add_pd(fiz,t3);
			
			/* accumulate forces from memory */
			xmm1	= _mm_loadu_pd(faction+j13); /* fx1 fy1 */
			xmm2	= _mm_loadu_pd(faction+j23); /* fx2 fy2 */
			
			xmm5	= _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
			xmm6	= _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
			
			/* transpose */
			xmm7	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
			xmm5	= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
			xmm6	= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
			
			/* subtract partial forces */
			xmm5	= _mm_sub_pd(xmm5,t1);
			xmm6	= _mm_sub_pd(xmm6,t2);
			xmm7	= _mm_sub_pd(xmm7,t3);
			
			xmm1	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
			xmm2	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fx2 fy2 */
			
			/* store fx and fy */
			_mm_storeu_pd(faction+j13,xmm1);
			_mm_storeu_pd(faction+j23,xmm2);
			
			/* .. then fz */
			_mm_storel_pd(faction+j13+2,xmm7);
			_mm_storeh_pd(faction+j23+2,xmm7);
		}
		
		/* In double precision, offset can only be either 0 or 1 */
		if(offset!=0)
		{
			jnr1	= jjnr[k];
			j13		= jnr1*3;
			
			jx      = _mm_load_sd(pos+j13);
			jy      = _mm_load_sd(pos+j13+1);
			jz      = _mm_load_sd(pos+j13+2);
			
			isaj	= _mm_load_sd(invsqrta+jnr1);
			isaprod = _mm_mul_sd(isai,isaj);
			dvdaj	= _mm_load_sd(dvda+jnr1);
			q		= _mm_load_sd(charge+jnr1);
			qq      = _mm_mul_sd(iq,q);
			
			dx      = _mm_sub_sd(ix,jx);
			dy		= _mm_sub_sd(iy,jy);
			dz		= _mm_sub_sd(iz,jz);
			
			rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv    = my_invrsq_pd(rsq11);
			
			vcoul	= _mm_mul_sd(qq,rinv);
			fscal	= _mm_mul_sd(vcoul,rinv);
			qq		= _mm_mul_sd(isaprod,qq);
			qq		= _mm_mul_sd(qq,neg);
			gbscale	= _mm_mul_sd(isaprod,gbtabscale);
			
			/* Load VdW parameters */
			tj      = nti+2*type[jnr1];
			
			c6      = _mm_load_sd(vdwparam+tj);
			c12     = _mm_load_sd(vdwparam+tj+1);
					
			/* Calculate GB table index */
			r		= _mm_mul_sd(rsq11,rinv);
			rt		= _mm_mul_sd(r,gbscale);
			n0		= _mm_cvttpd_epi32(rt);
			n0d		= _mm_cvtepi32_pd(n0);
			eps		= _mm_sub_sd(rt,n0d);
			eps2	= _mm_mul_sd(eps,eps);
			
			nnn		= _mm_slli_epi32(n0,2);
			
			/* only lane 0 holds a valid index here; zero the lane-1 inputs */
			xmm1	= _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
			xmm2	= _mm_setzero_pd();
			xmm3	= _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
			xmm4	= _mm_setzero_pd();
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); 
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); 
			
			G		= _mm_mul_sd(G,eps);
			H		= _mm_mul_sd(H,eps2);
			Fp		= _mm_add_sd(F,G);
			Fp		= _mm_add_sd(Fp,H);
			VV		= _mm_mul_sd(Fp,eps);
			VV		= _mm_add_sd(Y,VV);
			H		= _mm_mul_sd(two,H);
			FF		= _mm_add_sd(Fp,G);
			FF		= _mm_add_sd(FF,H);
			vgb		= _mm_mul_sd(qq,VV);
			fijC	= _mm_mul_sd(qq,FF);
			fijC	= _mm_mul_sd(fijC,gbscale);
			
			dvdatmp = _mm_mul_sd(fijC,r);
			dvdatmp	= _mm_add_sd(vgb,dvdatmp);
			dvdatmp = _mm_mul_sd(dvdatmp,neg);
			dvdatmp = _mm_mul_sd(dvdatmp,half);
			dvdasum	= _mm_add_sd(dvdasum,dvdatmp);
			
			xmm1	= _mm_mul_sd(dvdatmp,isaj);
			xmm1	= _mm_mul_sd(xmm1,isaj);
			dvdaj	= _mm_add_sd(dvdaj,xmm1);
			
			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);
			
			vctot	= _mm_add_sd(vctot,vcoul);
			vgbtot	= _mm_add_sd(vgbtot,vgb);
			
			/* Calculate VDW table index */
			rt      = _mm_mul_sd(r,tabscale);
			n0      = _mm_cvttpd_epi32(rt);
			n0d     = _mm_cvtepi32_pd(n0);
			eps     = _mm_sub_sd(rt,n0d);
			eps2    = _mm_mul_sd(eps,eps);
			nnn     = _mm_slli_epi32(n0,3);
			
			/* Tabulated VdW interaction - dispersion */
			/* only lane 0 is valid in the odd-element tail */
			xmm1	= _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_setzero_pd();
			xmm3	= _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_setzero_pd();
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G       = _mm_mul_sd(G,eps);
			H       = _mm_mul_sd(H,eps2);
			Fp      = _mm_add_sd(F,G);
			Fp      = _mm_add_sd(Fp,H);
			VV      = _mm_mul_sd(Fp,eps);
			VV      = _mm_add_sd(Y,VV);
			xmm1    = _mm_mul_sd(two,H);
			FF      = _mm_add_sd(Fp,G);
			FF      = _mm_add_sd(FF,xmm1);
			
			Vvdw6   = _mm_mul_sd(c6,VV);
			fijD    = _mm_mul_sd(c6,FF);
			
			/* Tabulated VdW interaction - repulsion */
			nnn     = _mm_add_epi32(nnn,four);
			
			xmm1	= _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_setzero_pd();
			xmm3	= _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_setzero_pd();
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G       = _mm_mul_sd(G,eps);
			H       = _mm_mul_sd(H,eps2);
			Fp      = _mm_add_sd(F,G);
			Fp      = _mm_add_sd(Fp,H);
			VV      = _mm_mul_sd(Fp,eps);
			VV      = _mm_add_sd(Y,VV);
			xmm1    = _mm_mul_sd(two,H);
			FF      = _mm_add_sd(Fp,G);
			FF      = _mm_add_sd(FF,xmm1);
			
			Vvdw12  = _mm_mul_sd(c12,VV);
			fijR    = _mm_mul_sd(c12,FF);
			
			Vvdwtmp = _mm_add_sd(Vvdw12,Vvdw6);
			Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp);
			
			xmm1    = _mm_add_sd(fijD,fijR);
			xmm1    = _mm_mul_sd(xmm1,tabscale);
			xmm1    = _mm_add_sd(xmm1,fijC);
			xmm1    = _mm_sub_sd(xmm1,fscal);
			fscal   = _mm_mul_sd(xmm1,neg);
			fscal   = _mm_mul_sd(fscal,rinv);
			
			/* calculate partial force terms */
			t1		= _mm_mul_sd(fscal,dx);
			t2		= _mm_mul_sd(fscal,dy);
			t3		= _mm_mul_sd(fscal,dz);
			
			/* update the i force */
			fix		= _mm_add_sd(fix,t1);
			fiy		= _mm_add_sd(fiy,t2);
			fiz		= _mm_add_sd(fiz,t3);
			
			/* accumulate forces from memory */
			xmm5	= _mm_load_sd(faction+j13);   /* fx */
			xmm6    = _mm_load_sd(faction+j13+1); /* fy */
			xmm7    = _mm_load_sd(faction+j13+2); /* fz */
			
			/* subtract partial forces */
			xmm5	= _mm_sub_sd(xmm5,t1);
			xmm6	= _mm_sub_sd(xmm6,t2);
			xmm7	= _mm_sub_sd(xmm7,t3);
			
			/* store forces */
			_mm_store_sd(faction+j13,xmm5);
			_mm_store_sd(faction+j13+1,xmm6);
			_mm_store_sd(faction+j13+2,xmm7);
		}
		
		/* fix/fiy/fiz each hold two partial sums (one per SSE lane) that
		 * both need to be added to the i particle force; the unpacklo/add
		 * sequence below leaves the complete sum in the upper lane.
		 */
		t1		 = _mm_unpacklo_pd(t1,fix);
		t2		 = _mm_unpacklo_pd(t2,fiy);
		t3		 = _mm_unpacklo_pd(t3,fiz);
		
		fix		 = _mm_add_pd(fix,t1);
		fiy		 = _mm_add_pd(fiy,t2);
		fiz		 = _mm_add_pd(fiz,t3);
		
		fix      = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
		fiy      = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
		fiz      = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
		
		/* Load i forces from memory */
		xmm1     = _mm_load_sd(faction+ii3);
		xmm2     = _mm_load_sd(faction+ii3+1);
		xmm3     = _mm_load_sd(faction+ii3+2);
		
		/* Add to i force */
		fix      = _mm_add_sd(fix,xmm1);
		fiy      = _mm_add_sd(fiy,xmm2);
		fiz      = _mm_add_sd(fiz,xmm3);
		
		/* store i forces to memory */
		_mm_store_sd(faction+ii3,fix);
		_mm_store_sd(faction+ii3+1,fiy);
		_mm_store_sd(faction+ii3+2,fiz);
		
		/* now do dvda */
		dvdatmp  = _mm_unpacklo_pd(dvdatmp,dvdasum);
		dvdasum  = _mm_add_pd(dvdasum,dvdatmp);
		_mm_storeh_pd(&dva,dvdasum);
		dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
		
		ggid	 = gid[n];
		
		/* Coulomb potential */
		vcoul	 = _mm_unpacklo_pd(vcoul,vctot);
		vctot	 = _mm_add_pd(vctot,vcoul);
		_mm_storeh_pd(&vct,vctot);
		Vc[ggid] = Vc[ggid] + vct;
		
		/* VdW potential */
		Vvdwtmp	 = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot);
		Vvdwtot	 = _mm_add_pd(Vvdwtot,Vvdwtmp);
		_mm_storeh_pd(&vdwt,Vvdwtot);
		Vvdw[ggid] = Vvdw[ggid] + vdwt;
		
		/* GB potential */
		vgb  	 = _mm_unpacklo_pd(vgb,vgbtot);
		vgbtot	 = _mm_add_pd(vgbtot,vgb);
		_mm_storeh_pd(&vgbt,vgbtot);
		gpol[ggid] = gpol[ggid] + vgbt;
	}
	
	*outeriter   = nri;
	*inneriter   = nj1;
}
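
Every table lookup above evaluates the same cubic-spline form: each table point stores four doubles Y, F, G, H, and both the interpolated potential and its derivative follow from the fractional index eps. A scalar sketch of what each load/shuffle/mul-add block computes per lane (hypothetical helper, for illustration only):

/* Scalar equivalent of the Y/F/G/H table evaluation used in the SSE2
 * blocks above. tab holds 4 doubles per table point; eps is the
 * fractional part of the scaled distance. */
static void spline_eval(const double *tab, int n0, double eps,
                        double *VV, double *FF)
{
    double Y  = tab[4*n0+0];
    double F  = tab[4*n0+1];
    double Ge = tab[4*n0+2] * eps;        /* G*eps   */
    double He = tab[4*n0+3] * eps * eps;  /* H*eps^2 */
    double Fp = F + Ge + He;
    *VV = Y + Fp * eps;        /* interpolated potential         */
    *FF = Fp + Ge + 2.0 * He;  /* derivative term used for fscal */
}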
Example #10
void kernel_dgemv_t_8_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
	{
	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );
	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	double *tA, *tx;

	int k;
	int ka = kmax; // number of elements from the aligned position
	
	__m256d
		aaxx_temp, 
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0_1_2_3,
		y_00, y_11, y_22, y_33, y_44, y_55, y_66, y_77;
	
	__m128d
		ax_temp,
		a_00_10, a_01_11, a_02_12, a_03_13,
		x_0_1,
		y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;
	
	y_00 = _mm256_setzero_pd();
	y_11 = _mm256_setzero_pd();
	y_22 = _mm256_setzero_pd();
	y_33 = _mm256_setzero_pd();
	y_44 = _mm256_setzero_pd();
	y_55 = _mm256_setzero_pd();
	y_66 = _mm256_setzero_pd();
	y_77 = _mm256_setzero_pd();
	
	y_0 = _mm256_castpd256_pd128(y_00);
	y_1 = _mm256_castpd256_pd128(y_11);
	y_2 = _mm256_castpd256_pd128(y_22);
	y_3 = _mm256_castpd256_pd128(y_33);
	y_4 = _mm256_castpd256_pd128(y_44);
	y_5 = _mm256_castpd256_pd128(y_55);
	y_6 = _mm256_castpd256_pd128(y_66);
	y_7 = _mm256_castpd256_pd128(y_77);

	k = lda*(ka/lda);
	tA = A + (ka/lda)*sda*lda;
	tx = x + (ka/lda)*lda;

	if(ka-k>0) // it can be only ka-k = {1, 2, 3}
		{
		if((ka-k)>=2)
			{
		
			x_0_1 = _mm_load_pd( &tx[0] );

			a_00_10 = _mm_load_pd( &tA[0+lda*0] );
			a_01_11 = _mm_load_pd( &tA[0+lda*1] );
			a_02_12 = _mm_load_pd( &tA[0+lda*2] );
			a_03_13 = _mm_load_pd( &tA[0+lda*3] );

			ax_temp = _mm_mul_pd( a_00_10, x_0_1 );	
			y_0 = _mm_add_pd (y_0, ax_temp );
			ax_temp = _mm_mul_pd( a_01_11, x_0_1 );	
			y_1 = _mm_add_pd (y_1, ax_temp );
			ax_temp = _mm_mul_pd( a_02_12, x_0_1 );	
			y_2 = _mm_add_pd (y_2, ax_temp );
			ax_temp = _mm_mul_pd( a_03_13, x_0_1 );	
			y_3 = _mm_add_pd (y_3, ax_temp );
		
			a_00_10 = _mm_load_pd( &tA[0+lda*4] );
			a_01_11 = _mm_load_pd( &tA[0+lda*5] );
			a_02_12 = _mm_load_pd( &tA[0+lda*6] );
			a_03_13 = _mm_load_pd( &tA[0+lda*7] );

			ax_temp = _mm_mul_pd( a_00_10, x_0_1 );	
			y_4 = _mm_add_pd (y_4, ax_temp );
			ax_temp = _mm_mul_pd( a_01_11, x_0_1 );	
			y_5 = _mm_add_pd (y_5, ax_temp );
			ax_temp = _mm_mul_pd( a_02_12, x_0_1 );	
			y_6 = _mm_add_pd (y_6, ax_temp );
			ax_temp = _mm_mul_pd( a_03_13, x_0_1 );	
			y_7 = _mm_add_pd (y_7, ax_temp );
		
			tA += 2;
			tx += 2;
			k+=2;
		
			}

		if((ka-k)==1)
			{
		
			x_0_1 = _mm_load_sd( &tx[0] );

			a_00_10 = _mm_load_sd( &tA[0+lda*0] );
			a_01_11 = _mm_load_sd( &tA[0+lda*1] );
			a_02_12 = _mm_load_sd( &tA[0+lda*2] );
			a_03_13 = _mm_load_sd( &tA[0+lda*3] );

			ax_temp = _mm_mul_sd( a_00_10, x_0_1 );	
			y_0 = _mm_add_sd (y_0, ax_temp );
			ax_temp = _mm_mul_sd( a_01_11, x_0_1 );	
			y_1 = _mm_add_sd (y_1, ax_temp );
			ax_temp = _mm_mul_sd( a_02_12, x_0_1 );	
			y_2 = _mm_add_sd (y_2, ax_temp );
			ax_temp = _mm_mul_sd( a_03_13, x_0_1 );	
			y_3 = _mm_add_sd (y_3, ax_temp );
		
			a_00_10 = _mm_load_sd( &tA[0+lda*4] );
			a_01_11 = _mm_load_sd( &tA[0+lda*5] );
			a_02_12 = _mm_load_sd( &tA[0+lda*6] );
			a_03_13 = _mm_load_sd( &tA[0+lda*7] );

			ax_temp = _mm_mul_sd( a_00_10, x_0_1 );	
			y_4 = _mm_add_sd (y_4, ax_temp );
			ax_temp = _mm_mul_sd( a_01_11, x_0_1 );	
			y_5 = _mm_add_sd (y_5, ax_temp );
			ax_temp = _mm_mul_sd( a_02_12, x_0_1 );	
			y_6 = _mm_add_sd (y_6, ax_temp );
			ax_temp = _mm_mul_sd( a_03_13, x_0_1 );	
			y_7 = _mm_add_sd (y_7, ax_temp );
		
			tA += 1;
			tx += 1;
			k++;
		
			}

		}

	y_00 = _mm256_castpd128_pd256(y_0);
	y_11 = _mm256_castpd128_pd256(y_1);
	y_22 = _mm256_castpd128_pd256(y_2);
	y_33 = _mm256_castpd128_pd256(y_3);
	y_44 = _mm256_castpd128_pd256(y_4);
	y_55 = _mm256_castpd128_pd256(y_5);
	y_66 = _mm256_castpd128_pd256(y_6);
	y_77 = _mm256_castpd128_pd256(y_7);
		
	k=0;
	for(; k<ka-7; k+=8)
		{

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_11 = _mm256_add_pd( y_11, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_33 = _mm256_add_pd( y_33, aaxx_temp );
	
		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_44 = _mm256_add_pd( y_44, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_55 = _mm256_add_pd( y_55, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_66 = _mm256_add_pd( y_66, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_77 = _mm256_add_pd( y_77, aaxx_temp );

		A += 4 + (sda-1)*lda;
		x += 4;


		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_11 = _mm256_add_pd( y_11, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_33 = _mm256_add_pd( y_33, aaxx_temp );
	
		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_44 = _mm256_add_pd( y_44, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_55 = _mm256_add_pd( y_55, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_66 = _mm256_add_pd( y_66, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_77 = _mm256_add_pd( y_77, aaxx_temp );

		A += 4 + (sda-1)*lda;
		x += 4;

		}
	for(; k<ka-3; k+=4)
		{

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_00 = _mm256_add_pd( y_00, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_11 = _mm256_add_pd( y_11, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_22 = _mm256_add_pd( y_22, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_33 = _mm256_add_pd( y_33, aaxx_temp );
	
		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );
		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );
	
		aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );
		y_44 = _mm256_add_pd( y_44, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );
		y_55 = _mm256_add_pd( y_55, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );
		y_66 = _mm256_add_pd( y_66, aaxx_temp );
		aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );
		y_77 = _mm256_add_pd( y_77, aaxx_temp );

		A += 4 + (sda-1)*lda;
		x += 4;

		}
		
	__m256d
		y_0_1_2_3, y_4_5_6_7;

	y_00 = _mm256_hadd_pd(y_00, y_11);
	y_22 = _mm256_hadd_pd(y_22, y_33);
	y_44 = _mm256_hadd_pd(y_44, y_55);
	y_66 = _mm256_hadd_pd(y_66, y_77);

	y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 );	
	y_00 = _mm256_permute2f128_pd(y_22, y_00, 19);	
	y_55 = _mm256_permute2f128_pd(y_66, y_44, 2 );	
	y_44 = _mm256_permute2f128_pd(y_66, y_44, 19);	

	y_00 = _mm256_add_pd( y_00, y_11 );
	y_44 = _mm256_add_pd( y_44, y_55 );

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_00);
		_mm256_storeu_pd(&y[4], y_44);
		}
	else if(alg==1)
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		y_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_44 );

		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}
	else // alg==-1
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		y_4_5_6_7 = _mm256_loadu_pd( &y[4] );
	
		y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 );
		y_4_5_6_7 = _mm256_sub_pd( y_4_5_6_7, y_44 );
	
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}

	}
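
The hadd/permute2f128 sequence above reduces the eight column accumulators to the two output vectors. As a cross-check, here is a plain-C version of what the kernel computes; the panel storage (sda/lda) is ignored, so this illustrates the math, not the memory layout:

/* Reference (non-SIMD) transposed matrix-vector product:
 * y[j] (=, +=, -=) sum_k A[k][j] * x[k] for j = 0..7, per the alg flag. */
static void dgemv_t_8_ref(int kmax, const double *A, int lda_ref,
                          const double *x, double *y, int alg)
{
    for (int j = 0; j < 8; j++) {
        double acc = 0.0;
        for (int k = 0; k < kmax; k++)
            acc += A[k*lda_ref + j] * x[k];
        if      (alg == 0) y[j]  = acc;
        else if (alg == 1) y[j] += acc;
        else               y[j] -= acc;  /* alg == -1 */
    }
}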
Example #11
__m128d test (double *e)
{
  return _mm_load_pd (e); 
}
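
The one-liner above only works if e is 16-byte aligned: _mm_load_pd is undefined (and typically faults) on unaligned addresses, while _mm_loadu_pd accepts any pointer. A minimal contrast, assuming a GCC-style alignment attribute as used elsewhere in these examples:

#include <emmintrin.h>

static double aligned_buf[2] __attribute__((aligned(16))) = { 1.0, 2.0 };

__m128d load_contrast(const double *maybe_unaligned)
{
    __m128d a = _mm_load_pd(aligned_buf);       /* requires 16-byte alignment */
    __m128d b = _mm_loadu_pd(maybe_unaligned);  /* any alignment is fine      */
    return _mm_add_pd(a, b);
}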
Example #12
// it moves horizontally inside a block
void kernel_dgemv_n_2_lib4(int kmax, double *A, double *x, double *y, int alg)
	{
	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	int k;

	__m128d
		ax_temp,
		a_00_10, a_01_11, a_02_12, a_03_13,
		x_0, x_1, x_2, x_3,
		y_0_1, y_0_1_b, y_0_1_c, y_0_1_d, z_0_1;
	
	y_0_1 = _mm_setzero_pd();	
	y_0_1_b = _mm_setzero_pd();	
	y_0_1_c = _mm_setzero_pd();	
	y_0_1_d = _mm_setzero_pd();	

	k=0;
	for(; k<kmax-3; k+=4)
		{

		x_0 = _mm_loaddup_pd( &x[0] );
		x_1 = _mm_loaddup_pd( &x[1] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );
		a_01_11 = _mm_load_pd( &A[0+lda*1] );

		x_2 = _mm_loaddup_pd( &x[2] );
		x_3 = _mm_loaddup_pd( &x[3] );

		a_02_12 = _mm_load_pd( &A[0+lda*2] );
		a_03_13 = _mm_load_pd( &A[0+lda*3] );

		ax_temp = _mm_mul_pd( a_00_10, x_0 );
		y_0_1 = _mm_add_pd( y_0_1, ax_temp );
		ax_temp = _mm_mul_pd( a_01_11, x_1 );
		y_0_1_b = _mm_add_pd( y_0_1_b, ax_temp );

		ax_temp = _mm_mul_pd( a_02_12, x_2 );
		y_0_1_c = _mm_add_pd( y_0_1_c, ax_temp );
		ax_temp = _mm_mul_pd( a_03_13, x_3 );
		y_0_1_d = _mm_add_pd( y_0_1_d, ax_temp );
		
		A += 4*lda;
		x += 4;

		}

	y_0_1 = _mm_add_pd( y_0_1, y_0_1_c );
	y_0_1_b = _mm_add_pd( y_0_1_b, y_0_1_d );

	if(kmax%4>=2)
		{

		x_0 = _mm_loaddup_pd( &x[0] );
		x_1 = _mm_loaddup_pd( &x[1] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );
		a_01_11 = _mm_load_pd( &A[0+lda*1] );

		ax_temp = _mm_mul_pd( a_00_10, x_0 );
		y_0_1 = _mm_add_pd( y_0_1, ax_temp );
		ax_temp = _mm_mul_pd( a_01_11, x_1 );
		y_0_1_b = _mm_add_pd( y_0_1_b, ax_temp );

		A += 2*lda;
		x += 2;

		}

	y_0_1 = _mm_add_pd( y_0_1, y_0_1_b );

	if(kmax%2==1)
		{

		x_0 = _mm_loaddup_pd( &x[0] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );

		ax_temp = _mm_mul_pd( a_00_10, x_0 );
		y_0_1 = _mm_add_pd( y_0_1, ax_temp );

		}

	if(alg==0)
		{
		_mm_storeu_pd(&y[0], y_0_1);
		}
	else if(alg==1)
		{
		z_0_1 = _mm_loadu_pd( &y[0] );

		z_0_1 = _mm_add_pd( z_0_1, y_0_1 );

		_mm_storeu_pd(&y[0], z_0_1);
		}
	else // alg==-1
		{
		z_0_1 = _mm_loadu_pd( &y[0] );

		z_0_1 = _mm_sub_pd( z_0_1, y_0_1 );

		_mm_storeu_pd(&y[0], z_0_1);
		}

	}
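
The four accumulators (y_0_1 through y_0_1_d) exist only to break the add dependency chain; they are folded together before the store. A scalar reference of the computation, with the same alg convention (illustrative only):

/* Reference version of the 2-row gemv_n kernel above, moving
 * horizontally inside one 4-row panel (lda = 4). */
static void dgemv_n_2_ref(int kmax, const double *A, const double *x,
                          double *y, int alg)
{
    const int lda = 4;
    double acc0 = 0.0, acc1 = 0.0;
    for (int k = 0; k < kmax; k++) {
        acc0 += A[0 + lda*k] * x[k];
        acc1 += A[1 + lda*k] * x[k];
    }
    if      (alg == 0) { y[0]  = acc0; y[1]  = acc1; }
    else if (alg == 1) { y[0] += acc0; y[1] += acc1; }
    else               { y[0] -= acc0; y[1] -= acc1; }  /* alg == -1 */
}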
Example #13
void SpringEmbedderFRExact::mainStep_sse3(ArrayGraph &C)
{
//#if (defined(OGDF_ARCH_X86) || defined(OGDF_ARCH_X64)) && !(defined(__GNUC__) && !defined(__SSE3__))
#ifdef OGDF_SSE3_EXTENSIONS

	const int n            = C.numberOfNodes();

#ifdef _OPENMP
	const int work = 256;
	const int nThreadsRep  = min(omp_get_max_threads(), 1 + n*n/work);
	const int nThreadsPrev = min(omp_get_max_threads(), 1 + n  /work);
#endif

	const double k       = m_idealEdgeLength;
	const double kSquare = k*k;
	const double c_rep   = 0.052 * kSquare; // factor for repulsive forces as suggested by Warshal

	const double minDist       = 10e-6; // 100*DBL_EPSILON
	const double minDistSquare = minDist*minDist;

	double *disp_x = (double*) System::alignedMemoryAlloc16(n*sizeof(double));
	double *disp_y = (double*) System::alignedMemoryAlloc16(n*sizeof(double));

	__m128d mm_kSquare       = _mm_set1_pd(kSquare);
	__m128d mm_minDist       = _mm_set1_pd(minDist);
	__m128d mm_minDistSquare = _mm_set1_pd(minDistSquare);
	__m128d mm_c_rep         = _mm_set1_pd(c_rep);

	#pragma omp parallel num_threads(nThreadsRep)
	{
		double tx = m_txNull;
		double ty = m_tyNull;
		int cF = 1;

		for(int i = 1; i <= m_iterations; i++)
		{
			// repulsive forces

			#pragma omp for
			for(int v = 0; v < n; ++v)
			{
				__m128d mm_disp_xv = _mm_setzero_pd();
				__m128d mm_disp_yv = _mm_setzero_pd();

				__m128d mm_xv = _mm_set1_pd(C.m_x[v]);
				__m128d mm_yv = _mm_set1_pd(C.m_y[v]);

				int u;
				for(u = 0; u+1 < v; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
				}
				int uStart = u+2;
				if(u == v) ++u;
				if(u < n) {
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
				}

				for(u = uStart; u < n; u += 2)
				{
					__m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_pd(mm_minDistSquare,
						_mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare)));
				}
				if(u < n) {
					__m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u]));
					__m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u]));

					__m128d mm_distSquare = _mm_max_sd(mm_minDistSquare,
						_mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y))
					);

					__m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare);
					mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t));
					mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t));
					//mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare)));
					//mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare)));
				}

				mm_disp_xv = _mm_hadd_pd(mm_disp_xv,mm_disp_xv);
				mm_disp_yv = _mm_hadd_pd(mm_disp_yv,mm_disp_yv);

				_mm_store_sd(&disp_x[v], _mm_mul_sd(mm_disp_xv, mm_c_rep));
				_mm_store_sd(&disp_y[v], _mm_mul_sd(mm_disp_yv, mm_c_rep));
			}

			// attractive forces

			#pragma omp single
			for(int e = 0; e < C.numberOfEdges(); ++e)
			{
				int v = C.m_src[e];
				int u = C.m_tgt[e];

				double delta_x = C.m_x[v] - C.m_x[u];
				double delta_y = C.m_y[v] - C.m_y[u];

				double dist = max(minDist, sqrt(delta_x*delta_x + delta_y*delta_y));

				disp_x[v] -= delta_x * dist / k;
				disp_y[v] -= delta_y * dist / k;

				disp_x[u] += delta_x * dist / k;
				disp_y[u] += delta_y * dist / k;
			}

			// limit the maximum displacement to the temperature (m_tx,m_ty)

			__m128d mm_tx = _mm_set1_pd(tx);
			__m128d mm_ty = _mm_set1_pd(ty);

			#pragma omp for nowait
			for(int v = 0; v < n-1; v += 2)
			{
				__m128d mm_disp_xv = _mm_load_pd(&disp_x[v]);
				__m128d mm_disp_yv = _mm_load_pd(&disp_y[v]);

				__m128d mm_dist = _mm_max_pd(mm_minDist, _mm_sqrt_pd(
					_mm_add_pd(_mm_mul_pd(mm_disp_xv,mm_disp_xv),_mm_mul_pd(mm_disp_yv,mm_disp_yv))
				));

				_mm_store_pd(&C.m_x[v], _mm_add_pd(_mm_load_pd(&C.m_x[v]),
					_mm_mul_pd(_mm_div_pd(mm_disp_xv, mm_dist), _mm_min_pd(mm_dist,mm_tx))
				));
				_mm_store_pd(&C.m_y[v], _mm_add_pd(_mm_load_pd(&C.m_y[v]),
					_mm_mul_pd(_mm_div_pd(mm_disp_yv, mm_dist), _mm_min_pd(mm_dist,mm_ty))
				));
			}
			#pragma omp single nowait
			{
				if(n % 2) {
					int v = n-1;
					double dist = max(minDist, sqrt(disp_x[v]*disp_x[v] + disp_y[v]*disp_y[v]));

					C.m_x[v] += disp_x[v] / dist * min(dist,tx);
					C.m_y[v] += disp_y[v] / dist * min(dist,ty);
				}
			}

			cool(tx,ty,cF);

			#pragma omp barrier
		}
	}

	System::alignedMemoryFree(disp_x);
	System::alignedMemoryFree(disp_y);

#else
	mainStep(C);
#endif
}
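
The SSE3 path above vectorizes the usual Fruchterman-Reingold update two nodes at a time. The per-pair repulsive contribution reduces to the scalar form below (names mirror the kernel; the accumulated displacement is scaled by c_rep once per node afterwards):

/* Scalar equivalent of one repulsive-force accumulation from the loop above. */
static void repulse(double xv, double yv, double xu, double yu,
                    double weight_u, double minDistSquare,
                    double *disp_xv, double *disp_yv)
{
    double dx = xv - xu;
    double dy = yv - yu;
    double d2 = dx*dx + dy*dy;
    if (d2 < minDistSquare)
        d2 = minDistSquare;          /* the _mm_max_pd clamp */
    double t = weight_u / d2;
    *disp_xv += dx * t;
    *disp_yv += dy * t;
}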
Example #14
AABB3d TriangleItemHandler::clip(
    const size_t                        item_index,
    const size_t                        dimension,
    const double                        slab_min,
    const double                        slab_max) const
{
    const TriangleVertexInfo& vertex_info = m_triangle_vertex_infos[item_index];

    if (vertex_info.m_motion_segment_count > 0)
    {
        AABB3d triangle_bbox = m_triangle_bboxes[item_index];

        if (triangle_bbox.min[dimension] < slab_min)
            triangle_bbox.min[dimension] = slab_min;

        if (triangle_bbox.max[dimension] > slab_max)
            triangle_bbox.max[dimension] = slab_max;

        return triangle_bbox;
    }

#ifdef APPLESEED_USE_SSE

    APPLESEED_SIMD4_ALIGN const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    APPLESEED_SIMD4_ALIGN const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    APPLESEED_SIMD4_ALIGN const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    const double v0d = v0[dimension];
    const double v1d = v1[dimension];
    const double v2d = v2[dimension];

    const int v0_ge_min = v0d >= slab_min ? 1 : 0;
    const int v0_le_max = v0d <= slab_max ? 1 : 0;
    const int v1_ge_min = v1d >= slab_min ? 1 : 0;
    const int v1_le_max = v1d <= slab_max ? 1 : 0;
    const int v2_ge_min = v2d >= slab_min ? 1 : 0;
    const int v2_le_max = v2d <= slab_max ? 1 : 0;

    __m128d bbox_min_xy = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_min_zz = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_max_xy = _mm_set1_pd(-numeric_limits<double>::max());
    __m128d bbox_max_zz = _mm_set1_pd(-numeric_limits<double>::max());

    const __m128d v0_xy = _mm_load_pd(&v0.x);
    const __m128d v0_zz = _mm_set1_pd(v0.z);
    const __m128d v1_xy = _mm_load_pd(&v1.x);
    const __m128d v1_zz = _mm_set1_pd(v1.z);
    const __m128d v2_xy = _mm_load_pd(&v2.x);
    const __m128d v2_zz = _mm_set1_pd(v2.z);

    if (v0_ge_min & v0_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v0_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v0_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v0_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v0_zz);
    }

    if (v1_ge_min & v1_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v1_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v1_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v1_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v1_zz);
    }

    if (v2_ge_min & v2_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v2_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v2_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v2_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v2_zz);
    }

    const int v0v1_cross_min = v0_ge_min ^ v1_ge_min;
    const int v0v1_cross_max = v0_le_max ^ v1_le_max;
    const int v1v2_cross_min = v1_ge_min ^ v2_ge_min;
    const int v1v2_cross_max = v1_le_max ^ v2_le_max;
    const int v2v0_cross_min = v2_ge_min ^ v0_ge_min;
    const int v2v0_cross_max = v2_le_max ^ v0_le_max;

    if (v0v1_cross_min | v0v1_cross_max)
    {
        const double rcp_v0v1 = 1.0 / (v1[dimension] - v0[dimension]);

        if (v0v1_cross_min)
        {
            const double t = (slab_min - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v0v1_cross_max)
        {
            const double t = (slab_max - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    if (v1v2_cross_min | v1v2_cross_max)
    {
        const double rcp_v1v2 = 1.0 / (v2[dimension] - v1[dimension]);

        if (v1v2_cross_min)
        {
            const double t = (slab_min - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v1v2_cross_max)
        {
            const double t = (slab_max - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    if (v2v0_cross_min | v2v0_cross_max)
    {
        const double rcp_v2v0 = 1.0 / (v0[dimension] - v2[dimension]);

        if (v2v0_cross_min)
        {
            const double t = (slab_min - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v2v0_cross_max)
        {
            const double t = (slab_max - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    APPLESEED_SIMD4_ALIGN AABB3d bbox;

    _mm_store_pd(&bbox.min.x, bbox_min_xy);
    _mm_store_sd(&bbox.min.z, bbox_min_zz);
    _mm_storeu_pd(&bbox.max.x, bbox_max_xy);
    _mm_store_sd(&bbox.max.z, bbox_max_zz);

    if (bbox.min[dimension] < slab_min)
        bbox.min[dimension] = slab_min;

    if (bbox.max[dimension] > slab_max)
        bbox.max[dimension] = slab_max;

#else

    const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    const int v0_ge_min = v0[dimension] >= slab_min ? 1 : 0;
    const int v0_le_max = v0[dimension] <= slab_max ? 1 : 0;
    const int v1_ge_min = v1[dimension] >= slab_min ? 1 : 0;
    const int v1_le_max = v1[dimension] <= slab_max ? 1 : 0;
    const int v2_ge_min = v2[dimension] >= slab_min ? 1 : 0;
    const int v2_le_max = v2[dimension] <= slab_max ? 1 : 0;

    AABB3d bbox;
    bbox.invalidate();

    if (v0_ge_min & v0_le_max)
        bbox.insert(v0);

    if (v1_ge_min & v1_le_max)
        bbox.insert(v1);

    if (v2_ge_min & v2_le_max)
        bbox.insert(v2);

    if (v0_ge_min != v1_ge_min)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_min));

    if (v0_le_max != v1_le_max)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_max));

    if (v1_ge_min != v2_ge_min)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_min));

    if (v1_le_max != v2_le_max)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_max));

    if (v2_ge_min != v0_ge_min)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_min));

    if (v2_le_max != v0_le_max)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_max));

#endif

    return bbox;
}
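
Both branches hinge on the same edge/slab-plane intersection: if an edge crosses a plane at parameter t = (plane - v0[d]) / (v1[d] - v0[d]), the crossing point is the linear blend (1-t)*v0 + t*v1, which is exactly what the _mm_set1_pd(t) / _mm_set1_pd(1.0 - t) pairs compute. A scalar sketch of the helper the non-SSE branch calls (signature inferred from the call sites):

static Vector3d segment_plane_intersection_sketch(
    const Vector3d&  v0,
    const Vector3d&  v1,
    const size_t     dimension,
    const double     plane)
{
    const double t = (plane - v0[dimension]) / (v1[dimension] - v0[dimension]);
    return (1.0 - t) * v0 + t * v1;  /* point on the edge, lying in the plane */
}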
Example #15
void bli_daxpyf_int_var1
     (
       conj_t  conja,
       conj_t  conjx,
       dim_t   m,
       dim_t   b_n,
       double* alpha,
       double* a, inc_t inca, inc_t lda,
       double* x, inc_t incx,
       double* y, inc_t incy,
       cntx_t* cntx
     )
{
	double*  restrict alpha_cast = alpha;
	double*  restrict a_cast = a;
	double*  restrict x_cast = x;
	double*  restrict y_cast = y;
	dim_t             i;

	const dim_t       n_elem_per_reg = 2;
	const dim_t       n_iter_unroll  = 2;

	dim_t             m_pre;
	dim_t             m_run;
	dim_t             m_left;

    double*  restrict a0;
    double*  restrict a1;
    double*  restrict a2;
    double*  restrict a3;
    double*  restrict y0;
    double            a0c, a1c, a2c, a3c;
    double            chi0, chi1, chi2, chi3;

	v2df_t            a00v, a01v, a02v, a03v, y0v;
	v2df_t            a10v, a11v, a12v, a13v, y1v;
	v2df_t            chi0v, chi1v, chi2v, chi3v;

	bool_t            use_ref = FALSE;


	if ( bli_zero_dim2( m, b_n ) ) return;

	m_pre = 0;

	// If there is anything that would interfere with our use of aligned
	// vector loads/stores, call the reference implementation.
	if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx ) )
	{
		use_ref = TRUE;
	}
	else if ( inca != 1 || incx != 1 || incy != 1 ||
	          bli_is_unaligned_to( lda*sizeof(double), 16 ) )
	{
		use_ref = TRUE;
	}
	else if ( bli_is_unaligned_to( a, 16 ) ||
	          bli_is_unaligned_to( y, 16 ) )
	{
		use_ref = TRUE;

		if ( bli_is_unaligned_to( a, 16 ) &&
		     bli_is_unaligned_to( y, 16 ) )
		{
			use_ref = FALSE;
			m_pre   = 1;
		}
	}

	// Call the reference implementation if needed.
	if ( use_ref == TRUE )
	{
		BLIS_DAXPYF_KERNEL_REF( conja,
		                        conjx,
		                        m,
		                        b_n,
		                        alpha_cast,
		                        a_cast, inca, lda,
		                        x_cast, incx,
		                        y_cast, incy,
		                        cntx );
		return;
	}


	m_run       = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
	m_left      = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );

	a0   = a_cast + 0*lda;
	a1   = a_cast + 1*lda;
	a2   = a_cast + 2*lda;
	a3   = a_cast + 3*lda;
	y0   = y_cast;

	chi0 = *(x_cast + 0*incx);
	chi1 = *(x_cast + 1*incx);
	chi2 = *(x_cast + 2*incx);
	chi3 = *(x_cast + 3*incx);

	PASTEMAC2(d,d,scals)( *alpha_cast, chi0 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi1 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi2 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi3 );

	if ( m_pre == 1 )
	{
		a0c = *a0;
		a1c = *a1;
		a2c = *a2;
		a3c = *a3;

		*y0 += chi0 * a0c + 
		       chi1 * a1c + 
		       chi2 * a2c + 
		       chi3 * a3c;

		a0 += inca;
		a1 += inca;
		a2 += inca;
		a3 += inca;
		y0 += incy;
	}

	chi0v.v = _mm_loaddup_pd( ( double* )&chi0 );
	chi1v.v = _mm_loaddup_pd( ( double* )&chi1 );
	chi2v.v = _mm_loaddup_pd( ( double* )&chi2 );
	chi3v.v = _mm_loaddup_pd( ( double* )&chi3 );

	for ( i = 0; i < m_run; ++i )
	{
		y0v.v = _mm_load_pd( ( double* )(y0 + 0*n_elem_per_reg) );

		a00v.v = _mm_load_pd( ( double* )(a0 + 0*n_elem_per_reg) );
		a01v.v = _mm_load_pd( ( double* )(a1 + 0*n_elem_per_reg) );

		y0v.v += chi0v.v * a00v.v;
		y0v.v += chi1v.v * a01v.v;

		a02v.v = _mm_load_pd( ( double* )(a2 + 0*n_elem_per_reg) );
		a03v.v = _mm_load_pd( ( double* )(a3 + 0*n_elem_per_reg) );

		y0v.v += chi2v.v * a02v.v;
		y0v.v += chi3v.v * a03v.v;

		_mm_store_pd( ( double* )(y0 + 0*n_elem_per_reg), y0v.v );


		y1v.v = _mm_load_pd( ( double* )(y0 + 1*n_elem_per_reg) );

		a10v.v = _mm_load_pd( ( double* )(a0 + 1*n_elem_per_reg) );
		a11v.v = _mm_load_pd( ( double* )(a1 + 1*n_elem_per_reg) );

		y1v.v += chi0v.v * a10v.v;
		y1v.v += chi1v.v * a11v.v;

		a12v.v = _mm_load_pd( ( double* )(a2 + 1*n_elem_per_reg) );
		a13v.v = _mm_load_pd( ( double* )(a3 + 1*n_elem_per_reg) );

		y1v.v += chi2v.v * a12v.v;
		y1v.v += chi3v.v * a13v.v;

		_mm_store_pd( ( double* )(y0 + 1*n_elem_per_reg), y1v.v );


		a0 += n_elem_per_reg * n_iter_unroll;
		a1 += n_elem_per_reg * n_iter_unroll;
		a2 += n_elem_per_reg * n_iter_unroll;
		a3 += n_elem_per_reg * n_iter_unroll;
		y0 += n_elem_per_reg * n_iter_unroll;
	}

	if ( m_left > 0 )
	{
		for ( i = 0; i < m_left; ++i )
		{
			a0c = *a0;
			a1c = *a1;
			a2c = *a2;
			a3c = *a3;

			*y0 += chi0 * a0c + 
			       chi1 * a1c + 
			       chi2 * a2c + 
			       chi3 * a3c;

			a0 += inca;
			a1 += inca;
			a2 += inca;
			a3 += inca;
			y0 += incy;
		}
	}
}
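
axpyf fuses b_n axpy updates into one pass: y += alpha * (x[0]*a0 + x[1]*a1 + x[2]*a2 + x[3]*a3) for the four columns a0..a3. A plain-C reference for the b_n = 4, unit-stride, real-double case handled by the vectorized path (conjugation is a no-op for real doubles):

static void daxpyf_4_ref(int m, double alpha, const double *a, int lda,
                         const double *x, double *y)
{
    for (int i = 0; i < m; i++)
        y[i] += alpha * ( x[0]*a[i + 0*lda]
                        + x[1]*a[i + 1*lda]
                        + x[2]*a[i + 2*lda]
                        + x[3]*a[i + 3*lda] );
}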
Example #16
void nb_kernel430_x86_64_sse2(int *           p_nri,
                              int *           iinr,
                              int *           jindex,
                              int *           jjnr,
                              int *           shift,
                              double *         shiftvec,
                              double *         fshift,
                              int *           gid,
                              double *         pos,
                              double *         faction,
                              double *         charge,
                              double *         p_facel,
                              double *         p_krf,
                              double *         p_crf,
                              double *         vc,
                              int *           type,
                              int *           p_ntype,
                              double *         vdwparam,
                              double *         vvdw,
                              double *         p_tabscale,
                              double *         VFtab,
                              double *         invsqrta,
                              double *         dvda,
                              double *         p_gbtabscale,
                              double *         GBtab,
                              int *           p_nthreads,
                              int *           count,
                              void *          mtx,
                              int *           outeriter,
                              int *           inneriter,
                              double *         work)
{
    int           nri,ntype,nthreads;
    int           n,ii,is3,ii3,k,nj0,nj1,ggid;
    double        shX,shY,shZ;
	int			  offset,nti;
    int           jnrA,jnrB;
    int           j3A,j3B;
	int           tjA,tjB;
	gmx_gbdata_t *gbdata;
	double *      gpol;
    
	__m128d  iq,qq,jq,isai;
	__m128d  ix,iy,iz;
	__m128d  jx,jy,jz;
	__m128d  dx,dy,dz;
	__m128d  vctot,vvdwtot,vgbtot,dvdasum,gbfactor;
	__m128d  fix,fiy,fiz,tx,ty,tz,rsq;
	__m128d  rinv,isaj,isaprod;
	__m128d  vcoul,fscal,gbscale,c6,c12;
	__m128d  rinvsq,r,rtab;
	__m128d  eps,Y,F,G,H;
    __m128d  VV,FF,Fp;
	__m128d  vgb,fijGB,dvdatmp;
	__m128d  rinvsix,vvdw6,vvdw12,vvdwtmp;
	__m128d  facel,gbtabscale,dvdaj;
    __m128d  fijD,fijR;
    __m128d  xmm1,tabscale,eps2;
	__m128i  n0, nnn;
    
	
	const __m128d neg        = _mm_set1_pd(-1.0);
	const __m128d zero       = _mm_set1_pd(0.0);
	const __m128d minushalf  = _mm_set1_pd(-0.5);
	const __m128d two        = _mm_set1_pd(2.0);
	
	gbdata     = (gmx_gbdata_t *)work;
	gpol       = gbdata->gpol;
    
	nri        = *p_nri;
	ntype      = *p_ntype;
    
    gbfactor   = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent)));     
    gbtabscale = _mm_load1_pd(p_gbtabscale);  
    facel      = _mm_load1_pd(p_facel);
    tabscale   = _mm_load1_pd(p_tabscale);
    
    nj1         = 0;
    jnrA = jnrB = 0;
    j3A = j3B   = 0;
    jx          = _mm_setzero_pd();
    jy          = _mm_setzero_pd();
    jz          = _mm_setzero_pd();
    c6          = _mm_setzero_pd();
    c12         = _mm_setzero_pd();
	
	for(n=0;n<nri;n++)
	{
        is3              = 3*shift[n];     
        shX              = shiftvec[is3];  
        shY              = shiftvec[is3+1];
        shZ              = shiftvec[is3+2];
        nj0              = jindex[n];      
        nj1              = jindex[n+1];    
        ii               = iinr[n];        
        ii3              = 3*ii;           
		
		ix               = _mm_set1_pd(shX+pos[ii3+0]);
		iy               = _mm_set1_pd(shY+pos[ii3+1]);
		iz               = _mm_set1_pd(shZ+pos[ii3+2]);
        
		iq               = _mm_load1_pd(charge+ii);
		iq               = _mm_mul_pd(iq,facel);
        
		isai             = _mm_load1_pd(invsqrta+ii);
        
		nti              = 2*ntype*type[ii];
		
		vctot            = _mm_setzero_pd();
		vvdwtot          = _mm_setzero_pd();
		vgbtot           = _mm_setzero_pd();
		dvdasum          = _mm_setzero_pd();
		fix              = _mm_setzero_pd();
		fiy              = _mm_setzero_pd();
		fiz              = _mm_setzero_pd();
        
		for(k=nj0;k<nj1-1; k+=2)
		{
			jnrA    = jjnr[k];
			jnrB    = jjnr[k+1];
			
			j3A     = jnrA * 3;
			j3B     = jnrB * 3;
            
            GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz);
            
			dx           = _mm_sub_pd(ix,jx);
			dy           = _mm_sub_pd(iy,jy);
			dz           = _mm_sub_pd(iz,jz);
            
            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
            
            rinv         = gmx_mm_invsqrt_pd(rsq);
 			rinvsq       = _mm_mul_pd(rinv,rinv);
            
			/***********************************/
			/* INTERACTION SECTION STARTS HERE */
			/***********************************/
			GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq);
			GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj);
            
            /* Lennard-Jones */
            tjA          = nti+2*type[jnrA];
			tjB          = nti+2*type[jnrB];
            
            GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12);
			
			isaprod      = _mm_mul_pd(isai,isaj);
			qq           = _mm_mul_pd(iq,jq);            
			vcoul        = _mm_mul_pd(qq,rinv);
			fscal        = _mm_mul_pd(vcoul,rinv);                                 
            vctot        = _mm_add_pd(vctot,vcoul);
            
            /* Polarization interaction */
			qq           = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor));
			gbscale      = _mm_mul_pd(isaprod,gbtabscale);
            
 			/* Calculate GB table index */
			r            = _mm_mul_pd(rsq,rinv);
			rtab         = _mm_mul_pd(r,gbscale);
			
			n0		     = _mm_cvttpd_epi32(rtab);
			eps	     	 = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
			nnn		     = _mm_slli_epi32(n0,2);
			
            /* the tables are 16-byte aligned, so we can use _mm_load_pd */			
            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
            F            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1)));
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
            H            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2);
            GMX_MM_TRANSPOSE2_PD(G,H);
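            /* after the transposes: Y={Y1,Y2}, F={F1,F2}, G={G1,G2},
             * H={H1,H2}, i.e. the spline coefficients for both j atoms */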
            
            G       = _mm_mul_pd(G,eps);
            H       = _mm_mul_pd(H, _mm_mul_pd(eps,eps) );
            F       = _mm_add_pd(F, _mm_add_pd( G , H ) );
            Y       = _mm_add_pd(Y, _mm_mul_pd(F, eps));
            F       = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two)));
            vgb     = _mm_mul_pd(Y, qq);           
            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
            
            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
            
            vgbtot  = _mm_add_pd(vgbtot, vgb);
            
            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));
            
            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);
			
            /* Calculate VDW table index */
			rtab    = _mm_mul_pd(r,tabscale);
			n0      = _mm_cvttpd_epi32(rtab);
			eps     = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0));
			eps2    = _mm_mul_pd(eps,eps);
			nnn     = _mm_slli_epi32(n0,3);
			
            /* Dispersion */
            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
            F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1)));
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
            H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2);
            GMX_MM_TRANSPOSE2_PD(G,H);
            
            G       = _mm_mul_pd(G,eps);
			H       = _mm_mul_pd(H,eps2);
			Fp      = _mm_add_pd(F,G);
			Fp      = _mm_add_pd(Fp,H);
			VV      = _mm_mul_pd(Fp,eps);
			VV      = _mm_add_pd(Y,VV);
			xmm1    = _mm_mul_pd(two,H);
			FF      = _mm_add_pd(Fp,G);
			FF      = _mm_add_pd(FF,xmm1);
			
			vvdw6   = _mm_mul_pd(c6,VV);
			fijD    = _mm_mul_pd(c6,FF);
            
            /* Repulsion */
            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
            F            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4);
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
            H            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6);
            GMX_MM_TRANSPOSE2_PD(G,H);
            
            G       = _mm_mul_pd(G,eps);
			H       = _mm_mul_pd(H,eps2);
			Fp      = _mm_add_pd(F,G);
			Fp      = _mm_add_pd(Fp,H);
			VV      = _mm_mul_pd(Fp,eps);
			VV      = _mm_add_pd(Y,VV);
			xmm1    = _mm_mul_pd(two,H);
			FF      = _mm_add_pd(Fp,G);
			FF      = _mm_add_pd(FF,xmm1);
			
			vvdw12  = _mm_mul_pd(c12,VV);
			fijR    = _mm_mul_pd(c12,FF);
			
			vvdwtmp = _mm_add_pd(vvdw12,vvdw6);
			vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp);
            
			xmm1    = _mm_add_pd(fijD,fijR);
			xmm1    = _mm_mul_pd(xmm1,tabscale);
			xmm1    = _mm_add_pd(xmm1,fijGB);
			xmm1    = _mm_sub_pd(xmm1,fscal);
			fscal   = _mm_mul_pd(xmm1,neg);
			fscal   = _mm_mul_pd(fscal,rinv);
            
            /***********************************/
			/*  INTERACTION SECTION ENDS HERE  */
			/***********************************/
            
            /* Calculate temporary vectorial force */
            tx           = _mm_mul_pd(fscal,dx);
            ty           = _mm_mul_pd(fscal,dy);
            tz           = _mm_mul_pd(fscal,dz);
            
            /* Increment i atom force */
            fix          = _mm_add_pd(fix,tx);
            fiy          = _mm_add_pd(fiy,ty);
            fiz          = _mm_add_pd(fiz,tz);
            
            /* Store j forces back */
			GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
		}
		
		/* In double precision, offset can only be either 0 or 1 */
		if(k<nj1)
		{
			jnrA    = jjnr[k];
			j3A     = jnrA * 3;
            
            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);
            
			dx           = _mm_sub_sd(ix,jx);
			dy           = _mm_sub_sd(iy,jy);
			dz           = _mm_sub_sd(iz,jz);
            
            rsq          = gmx_mm_calc_rsq_pd(dx,dy,dz);
            
            rinv         = gmx_mm_invsqrt_pd(rsq);
 			rinvsq       = _mm_mul_sd(rinv,rinv);
            
			/***********************************/
			/* INTERACTION SECTION STARTS HERE */
			/***********************************/
			GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
			GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);
            
            /* Lennard-Jones */
            tjA          = nti+2*type[jnrA];
            
            GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12);
			
			isaprod      = _mm_mul_sd(isai,isaj);
			qq           = _mm_mul_sd(iq,jq);            
			vcoul        = _mm_mul_sd(qq,rinv);
			fscal        = _mm_mul_sd(vcoul,rinv);                                 
            vctot        = _mm_add_sd(vctot,vcoul);
            
            /* Polarization interaction */
			qq           = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
			gbscale      = _mm_mul_sd(isaprod,gbtabscale);
            
 			/* Calculate GB table index */
			r            = _mm_mul_sd(rsq,rinv);
			rtab         = _mm_mul_sd(r,gbscale);
			
			n0		     = _mm_cvttpd_epi32(rtab);
			eps	     	 = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
			nnn		     = _mm_slli_epi32(n0,2);
			
            /* the tables are 16-byte aligned, so we can use _mm_load_pd */			
            Y            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); 
            F            = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G            = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); 
            H            = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(G,H);
            
            G       = _mm_mul_sd(G,eps);
            H       = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
            F       = _mm_add_sd(F, _mm_add_sd( G , H ) );
            Y       = _mm_add_sd(Y, _mm_mul_sd(F, eps));
            F       = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));
            vgb     = _mm_mul_sd(Y, qq);           
            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
            
            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);
            
            vgbtot  = _mm_add_sd(vgbtot, vgb);
            
            dvdasum = _mm_add_sd(dvdasum, dvdatmp);
            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));
            
            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);
			
            /* Calculate VDW table index */
			rtab    = _mm_mul_sd(r,tabscale);
			n0      = _mm_cvttpd_epi32(rtab);
			eps     = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
			eps2    = _mm_mul_sd(eps,eps);
			nnn     = _mm_slli_epi32(n0,3);
			
            /* Dispersion */
            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); 
            F            = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); 
            H            = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(G,H);
            
            G       = _mm_mul_sd(G,eps);
			H       = _mm_mul_sd(H,eps2);
			Fp      = _mm_add_sd(F,G);
			Fp      = _mm_add_sd(Fp,H);
			VV      = _mm_mul_sd(Fp,eps);
			VV      = _mm_add_sd(Y,VV);
			xmm1    = _mm_mul_sd(two,H);
			FF      = _mm_add_sd(Fp,G);
			FF      = _mm_add_sd(FF,xmm1);
			
			vvdw6   = _mm_mul_sd(c6,VV);
			fijD    = _mm_mul_sd(c6,FF);
            
            /* Repulsion (table offsets +4 and +6) */
            Y            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); 
            F            = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G            = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); 
            H            = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(G,H);
            
            G       = _mm_mul_sd(G,eps);
			H       = _mm_mul_sd(H,eps2);
			Fp      = _mm_add_sd(F,G);
			Fp      = _mm_add_sd(Fp,H);
			VV      = _mm_mul_sd(Fp,eps);
			VV      = _mm_add_sd(Y,VV);
			xmm1    = _mm_mul_sd(two,H);
			FF      = _mm_add_sd(Fp,G);
			FF      = _mm_add_sd(FF,xmm1);
			
			vvdw12  = _mm_mul_sd(c12,VV);
			fijR    = _mm_mul_sd(c12,FF);
			
			vvdwtmp = _mm_add_sd(vvdw12,vvdw6);
			vvdwtot = _mm_add_sd(vvdwtot,vvdwtmp);
            
			xmm1    = _mm_add_sd(fijD,fijR);
			xmm1    = _mm_mul_sd(xmm1,tabscale);
			xmm1    = _mm_add_sd(xmm1,fijGB);
			xmm1    = _mm_sub_sd(xmm1,fscal);
			fscal   = _mm_mul_sd(xmm1,neg);
			fscal   = _mm_mul_sd(fscal,rinv);

            /***********************************/
			/*  INTERACTION SECTION ENDS HERE  */
			/***********************************/
            
            /* Calculate temporary vectorial force */
            tx           = _mm_mul_sd(fscal,dx);
            ty           = _mm_mul_sd(fscal,dy);
            tz           = _mm_mul_sd(fscal,dz);
            
            /* Increment i atom force */
            fix          = _mm_add_sd(fix,tx);
            fiy          = _mm_add_sd(fiy,ty);
            fiz          = _mm_add_sd(fiz,tz);
            
            /* Store j forces back */
			GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz);
		}
		
        dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai));
        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3);
        
        ggid     = gid[n];         
        
        gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid);
        gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii);
	}
    
	*outeriter   = nri;            
    *inneriter   = nj1; 	
}
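
For reference, the table lookups in the kernel above evaluate a cubic spline stored as (Y,F,G,H) quadruplets per table point (four doubles per point for the GB table, hence the shift by 2; eight per point for the combined dispersion/repulsion table, hence the shift by 3). Below is a minimal scalar sketch of the same interpolation; the names table_interp and r_scaled are illustrative, not part of the GROMACS API:

/* Scalar sketch of the cubic-spline table interpolation used in the
 * SIMD kernel above. Assumes four doubles (Y,F,G,H) per table point,
 * where Y is the potential and F,G,H are spline coefficients.
 * 'r_scaled' is r*tabscale. */
static void table_interp(const double *tab, double r_scaled,
                         double *V, double *dVde)
{
    int    n   = (int)r_scaled;          /* table index              */
    double eps = r_scaled - n;           /* fractional part in [0,1) */
    const double *t = tab + 4*n;
    double Y = t[0], F = t[1], G = t[2], H = t[3];

    double Fp = F + G*eps + H*eps*eps;
    *V    = Y + Fp*eps;                  /* V     = Y + F*e + G*e^2 + H*e^3 */
    *dVde = Fp + G*eps + 2*H*eps*eps;    /* dV/de = F + 2G*e + 3H*e^2       */
}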
Example #17
void do_matrix_mult(int lda, double* A, double* B, double* C)
{

	/*
	 * C is held as a 4x4 block in eight SSE2 registers and updated
	 * in place; lda is the leading dimension of all three matrices.
	 */
	__m128d c1 = _mm_load_pd(C);           //loads C[0,0] and C[0,1]
	__m128d c2 = _mm_load_pd(C+2);         //loads C[0,2] and C[0,3]
	__m128d c3 = _mm_load_pd(C+lda);       //loads C[1,0] and C[1,1]
	__m128d c4 = _mm_load_pd(C+lda+2);     //loads C[1,2] and C[1,3]
	__m128d c5 = _mm_load_pd(C+2*lda);     //loads C[2,0] and C[2,1]
	__m128d c6 = _mm_load_pd(C+2*lda+2);   //loads C[2,2] and C[2,3]
	__m128d c7 = _mm_load_pd(C+3*lda);     //loads C[3,0] and C[3,1]
	__m128d c8 = _mm_load_pd(C+3*lda+2);   //loads C[3,2] and C[3,3]

	for(int l=0; l<4; l++)
	{
		__m128d a1 = _mm_load1_pd(A+l);         //broadcast A[0,l]
		__m128d a2 = _mm_load1_pd(A+l+lda);     //broadcast A[1,l]
		__m128d a3 = _mm_load1_pd(A+l+2*lda);   //broadcast A[2,l]
		__m128d a4 = _mm_load1_pd(A+l+3*lda);   //broadcast A[3,l]
		__m128d b1 = _mm_load_pd(B+l*lda);      //B[l,0] and B[l,1]
		__m128d b2 = _mm_load_pd(B+l*lda+2);    //B[l,2] and B[l,3]
		c1 = _mm_add_pd(c1, _mm_mul_pd(a1, b1));
		c2 = _mm_add_pd(c2, _mm_mul_pd(a1, b2));
		c3 = _mm_add_pd(c3, _mm_mul_pd(a2, b1));
		c4 = _mm_add_pd(c4, _mm_mul_pd(a2, b2));
		c5 = _mm_add_pd(c5, _mm_mul_pd(a3, b1));
		c6 = _mm_add_pd(c6, _mm_mul_pd(a3, b2));
		c7 = _mm_add_pd(c7, _mm_mul_pd(a4, b1));
		c8 = _mm_add_pd(c8, _mm_mul_pd(a4, b2));

	}

	_mm_store_pd(C, c1);
	_mm_store_pd(C+2, c2);
	_mm_store_pd(C+lda, c3);
	_mm_store_pd(C+2+lda, c4);
	_mm_store_pd(C+2*lda, c5);
	_mm_store_pd(C+2+2*lda, c6);
	_mm_store_pd(C+3*lda, c7);
	_mm_store_pd(C+2+3*lda, c8);

}
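
A plain scalar reference is useful for checking the SSE2 kernel above. This is a sketch under the same indexing convention as the comments (element [i,j] at offset i*lda + j); matmul4_ref is an illustrative name, not part of the original code:

/* Scalar reference for the 4x4 SSE2 kernel above: C += A*B on a
 * 4x4 block, element [i,j] stored at offset i*lda + j. Illustrative
 * only -- handy for validating do_matrix_mult. */
static void matmul4_ref(int lda, const double *A, const double *B, double *C)
{
	for (int i = 0; i < 4; i++)
		for (int j = 0; j < 4; j++)
		{
			double sum = C[i*lda + j];
			for (int l = 0; l < 4; l++)
				sum += A[i*lda + l] * B[l*lda + j];
			C[i*lda + j] = sum;
		}
}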
Example #18
// p[i] += s[i] for all n elements
COREARRAY_DLL_DEFAULT void vec_f64_add(double *p, const double *s, size_t n)
{
#if defined(COREARRAY_SIMD_AVX)

	/* Peel leading elements until p reaches 32-byte alignment;
	   the case labels fall through deliberately. */
	switch ((size_t)p & 0x1F)
	{
	case 0x08:
		if (n > 0) { (*p++) += (*s++); n--; }
		/* fall through */
	case 0x10:
		if (n > 0) { (*p++) += (*s++); n--; }
		/* fall through */
	case 0x18:
		if (n > 0) { (*p++) += (*s++); n--; }
		/* fall through */
	case 0x00:
		for (; n >= 4; n-=4)
		{
			_mm256_store_pd(p, _mm256_add_pd(_mm256_load_pd(p), _mm256_loadu_pd(s)));
			p += 4; s += 4;
		}
		if (n >= 2)
		{
			_mm_store_pd(p, _mm_add_pd(_mm_load_pd(p), _mm_loadu_pd(s)));
			p += 2; s += 2; n -= 2;
		}
		break;
	default:
		for (; n >= 4; n-=4)
		{
			_mm256_storeu_pd(p, _mm256_add_pd(_mm256_loadu_pd(p), _mm256_loadu_pd(s)));
			p += 4; s += 4;
		}
		if (n >= 2)
		{
			_mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p), _mm_loadu_pd(s)));
			p += 2; s += 2; n -= 2;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)

	/* Peel one element if p is 8 mod 16, so the main loop can use
	   aligned stores; the case label falls through deliberately. */
	switch ((size_t)p & 0x0F)
	{
	case 0x08:
		if (n > 0) { (*p++) += (*s++); n--; }
		/* fall through */
	case 0x00:
		for (; n >= 2; n-=2)
		{
			_mm_store_pd(p, _mm_add_pd(_mm_load_pd(p), _mm_loadu_pd(s)));
			p += 2; s += 2;
		}
		break;
	default:
		for (; n >= 2; n-=2)
		{
			_mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p), _mm_loadu_pd(s)));
			p += 2; s += 2;
		}
	}

#endif

	/* scalar tail, and the whole loop when no SIMD path is compiled */
	for (; n > 0; n--) (*p++) += (*s++);
}
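
A minimal usage sketch for vec_f64_add; p may have any alignment, since the switch above either peels scalar elements up to an aligned boundary or falls back to unaligned loads and stores:

/* Illustrative caller: the arrays need no special alignment. */
double p[7] = {1, 1, 1, 1, 1, 1, 1};
double s[7] = {1, 2, 3, 4, 5, 6, 7};
vec_f64_add(p, s, 7);   /* p becomes {2, 3, 4, 5, 6, 7, 8} */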