int calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda,
                                  double *x, double *f, double *fshift, double *shiftvec,
                                  int gb_algorithm, gmx_genborn_t *born, t_mdatoms *md)
{
    int    i,k,n,ii,jnr,ii3,is3,nj0,nj1,n0,n1;
    int    jnrA,jnrB;
    int    j3A,j3B;
    int *  jjnr;

    double rbi,shX,shY,shZ;
    double *rb;

    __m128d ix,iy,iz;
    __m128d jx,jy,jz;
    __m128d fix,fiy,fiz;
    __m128d dx,dy,dz;
    __m128d tx,ty,tz;

    __m128d rbai,rbaj,f_gb, f_gb_ai;
    __m128d xmm1,xmm2,xmm3;

    const __m128d two = _mm_set1_pd(2.0);

    rb   = born->work;
    jjnr = nl->jjnr;

    /* Loop to get the proper form for the Born radius term, sse style */
    n0 = 0;
    n1 = natoms;

    if(gb_algorithm==egbSTILL)
    {
        for(i=n0;i<n1;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
        }
    }
    else if(gb_algorithm==egbHCT)
    {
        for(i=n0;i<n1;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * dvda[i];
        }
    }
    else if(gb_algorithm==egbOBC)
    {
        for(i=n0;i<n1;i++)
        {
            rbi   = born->bRad[i];
            rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
        }
    }

    jz = _mm_setzero_pd();

    n = j3A = j3B = 0;

    for(i=0;i<nl->nri;i++)
    {
        ii  = nl->iinr[i];
        ii3 = ii*3;
        is3 = 3*nl->shift[i];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];
        nj0 = nl->jindex[i];
        nj1 = nl->jindex[i+1];

        ix = _mm_set1_pd(shX+x[ii3+0]);
        iy = _mm_set1_pd(shY+x[ii3+1]);
        iz = _mm_set1_pd(shZ+x[ii3+2]);

        rbai = _mm_load1_pd(rb+ii);
        fix  = _mm_setzero_pd();
        fiy  = _mm_setzero_pd();
        fiz  = _mm_setzero_pd();

        for(k=nj0;k<nj1-1;k+=2)
        {
            jnrA = jjnr[k];
            jnrB = jjnr[k+1];

            j3A = 3*jnrA;
            j3B = 3*jnrB;

            GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz);

            dx = _mm_sub_pd(ix,jx);
            dy = _mm_sub_pd(iy,jy);
            dz = _mm_sub_pd(iz,jz);

            GMX_MM_LOAD_2VALUES_PD(rb+jnrA,rb+jnrB,rbaj);

            /* load chain rule terms for the two j atoms */
            f_gb    = _mm_load_pd(dadx);
            dadx += 2;
            f_gb_ai = _mm_load_pd(dadx);
            dadx += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_pd(f_gb,rbai);
            f_gb_ai = _mm_mul_pd(f_gb_ai,rbaj);
            f_gb    = _mm_add_pd(f_gb,f_gb_ai);

            tx = _mm_mul_pd(f_gb,dx);
            ty = _mm_mul_pd(f_gb,dy);
            tz = _mm_mul_pd(f_gb,dz);

            fix = _mm_add_pd(fix,tx);
            fiy = _mm_add_pd(fiy,ty);
            fiz = _mm_add_pd(fiz,tz);

            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(f+j3A,f+j3B,tx,ty,tz);
        }

        /* deal with odd elements */
        if(k<nj1)
        {
            jnrA = jjnr[k];
            j3A  = 3*jnrA;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz);

            dx = _mm_sub_sd(ix,jx);
            dy = _mm_sub_sd(iy,jy);
            dz = _mm_sub_sd(iz,jz);

            GMX_MM_LOAD_1VALUE_PD(rb+jnrA,rbaj);

            /* load chain rule terms */
            f_gb    = _mm_load_pd(dadx);
            dadx += 2;
            f_gb_ai = _mm_load_pd(dadx);
            dadx += 2;

            /* calculate scalar force */
            f_gb    = _mm_mul_sd(f_gb,rbai);
            f_gb_ai = _mm_mul_sd(f_gb_ai,rbaj);
            f_gb    = _mm_add_sd(f_gb,f_gb_ai);

            tx = _mm_mul_sd(f_gb,dx);
            ty = _mm_mul_sd(f_gb,dy);
            tz = _mm_mul_sd(f_gb,dz);

            fix = _mm_add_sd(fix,tx);
            fiy = _mm_add_sd(fiy,ty);
            fiz = _mm_add_sd(fiz,tz);

            GMX_MM_DECREMENT_1RVEC_1POINTER_PD(f+j3A,tx,ty,tz);
        }

        /* fix/fiy/fiz now contain two partial force terms, which should all be
         * added to the i particle forces and shift forces.
         */
        gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,f+ii3,fshift+is3);
    }

    return 0;
}
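/*
 * For reference: a scalar sketch of the per-pair force the vectorized loop
 * above accumulates. The kernel streams dadx two j-partners at a time (two
 * i-side terms, then two j-side terms); the sketch collapses this to a single
 * pair for clarity, so only the layout, not the arithmetic, is simplified.
 */
static void gb_chainrule_pair_ref(int ii, int jnr, const double *x, double *f,
                                  const double *rb, const double *dadx_i,
                                  const double *dadx_j)
{
    double dx = x[3*ii+0] - x[3*jnr+0];
    double dy = x[3*ii+1] - x[3*jnr+1];
    double dz = x[3*ii+2] - x[3*jnr+2];

    /* chain-rule scalar force: i-side term times rb[ii], j-side term times rb[jnr] */
    double fgb = (*dadx_i)*rb[ii] + (*dadx_j)*rb[jnr];

    f[3*ii+0] += fgb*dx;   f[3*jnr+0] -= fgb*dx;
    f[3*ii+1] += fgb*dy;   f[3*jnr+1] -= fgb*dy;
    f[3*ii+2] += fgb*dz;   f[3*jnr+2] -= fgb*dz;
}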
void nb_kernel400_ia32_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr, int * shift, double * shiftvec, double * fshift, int * gid, double * pos, double * faction, double * charge, double * p_facel, double * p_krf, double * p_crf, double * vc, int * type, int * p_ntype, double * vdwparam, double * vvdw, double * p_tabscale, double * VFtab, double * invsqrta, double * dvda, double * p_gbtabscale, double * GBtab, int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, double * work) { int nri,nthreads; int n,ii,is3,ii3,k,nj0,nj1,ggid; double shX,shY,shZ; int jnrA,jnrB; int j3A,j3B; gmx_gbdata_t *gbdata; double * gpol; __m128d iq,qq,jq,isai; __m128d ix,iy,iz; __m128d jx,jy,jz; __m128d dx,dy,dz; __m128d vctot,vgbtot,dvdasum,gbfactor; __m128d fix,fiy,fiz,tx,ty,tz,rsq; __m128d rinv,isaj,isaprod; __m128d vcoul,fscal,gbscale; __m128d rinvsq,r,rtab; __m128d eps,Y,F,G,H; __m128d vgb,fijGB,dvdatmp; __m128d facel,gbtabscale,dvdaj; __m128i n0, nnn; const __m128d neg = _mm_set1_pd(-1.0); const __m128d zero = _mm_set1_pd(0.0); const __m128d minushalf = _mm_set1_pd(-0.5); const __m128d two = _mm_set1_pd(2.0); gbdata = (gmx_gbdata_t *)work; gpol = gbdata->gpol; nri = *p_nri; gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent))); gbtabscale = _mm_load1_pd(p_gbtabscale); facel = _mm_load1_pd(p_facel); nj1 = 0; jnrA = jnrB = 0; j3A = j3B = 0; jx = _mm_setzero_pd(); jy = _mm_setzero_pd(); jz = _mm_setzero_pd(); for(n=0;n<nri;n++) { is3 = 3*shift[n]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = jindex[n]; nj1 = jindex[n+1]; ii = iinr[n]; ii3 = 3*ii; ix = _mm_set1_pd(shX+pos[ii3+0]); iy = _mm_set1_pd(shY+pos[ii3+1]); iz = _mm_set1_pd(shZ+pos[ii3+2]); iq = _mm_load1_pd(charge+ii); iq = _mm_mul_pd(iq,facel); isai = _mm_load1_pd(invsqrta+ii); vctot = _mm_setzero_pd(); vgbtot = _mm_setzero_pd(); dvdasum = _mm_setzero_pd(); fix = _mm_setzero_pd(); fiy = _mm_setzero_pd(); fiz = _mm_setzero_pd(); for(k=nj0;k<nj1-1; k+=2) { jnrA = jjnr[k]; jnrB = jjnr[k+1]; j3A = jnrA * 3; j3B = jnrB * 3; GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz); dx = _mm_sub_pd(ix,jx); dy = _mm_sub_pd(iy,jy); dz = _mm_sub_pd(iz,jz); rsq = gmx_mm_calc_rsq_pd(dx,dy,dz); rinv = gmx_mm_invsqrt_pd(rsq); rinvsq = _mm_mul_pd(rinv,rinv); /***********************************/ /* INTERACTION SECTION STARTS HERE */ /***********************************/ GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq); GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj); isaprod = _mm_mul_pd(isai,isaj); qq = _mm_mul_pd(iq,jq); vcoul = _mm_mul_pd(qq,rinv); fscal = _mm_mul_pd(vcoul,rinv); vctot = _mm_add_pd(vctot,vcoul); /* Polarization interaction */ qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor)); gbscale = _mm_mul_pd(isaprod,gbtabscale); /* Calculate GB table index */ r = _mm_mul_pd(rsq,rinv); rtab = _mm_mul_pd(r,gbscale); n0 = _mm_cvttpd_epi32(rtab); eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0)); nnn = _mm_slli_epi32(n0,2); /* the tables are 16-byte aligned, so we can use _mm_load_pd */ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); F = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))); GMX_MM_TRANSPOSE2_PD(Y,F); G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2); GMX_MM_TRANSPOSE2_PD(G,H); G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) ); F = _mm_add_pd(F, _mm_add_pd( G , H ) ); Y = _mm_add_pd(Y, _mm_mul_pd(F, eps)); F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two))); vgb = _mm_mul_pd(Y, 
qq);
            fijGB   = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale));
            dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf);
            vgbtot  = _mm_add_pd(vgbtot, vgb);

            dvdasum = _mm_add_pd(dvdasum, dvdatmp);
            dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj));

            GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp);

            fscal = _mm_mul_pd( _mm_sub_pd( fscal, fijGB),rinv );

            /***********************************/
            /*  INTERACTION SECTION ENDS HERE  */
            /***********************************/

            /* Calculate temporary vectorial force */
            tx = _mm_mul_pd(fscal,dx);
            ty = _mm_mul_pd(fscal,dy);
            tz = _mm_mul_pd(fscal,dz);

            /* Increment i atom force */
            fix = _mm_add_pd(fix,tx);
            fiy = _mm_add_pd(fiy,ty);
            fiz = _mm_add_pd(fiz,tz);

            /* Store j forces back */
            GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz);
        }

        /* In double precision, offset can only be either 0 or 1 */
        if(k<nj1)
        {
            jnrA = jjnr[k];
            j3A  = jnrA * 3;

            GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz);

            dx = _mm_sub_sd(ix,jx);
            dy = _mm_sub_sd(iy,jy);
            dz = _mm_sub_sd(iz,jz);

            rsq    = gmx_mm_calc_rsq_pd(dx,dy,dz);
            rinv   = gmx_mm_invsqrt_pd(rsq);
            rinvsq = _mm_mul_sd(rinv,rinv);

            /* The reason for zeroing these variables here is to fix bug 585.
             * What happens is that __m128d _mm_add_sd(a,b) gives back r0=a[0]+b[0],
             * and r1=0, but it should be r1=a[1].
             * This might be a compiler issue (tested with gcc-4.1.3 and -O3).
             * To work around it, we zero these variables and use _mm_add_pd (**) instead.
             * Note that the only variables that get affected are the energies, since
             * the total sum needs to be correct.
             */
            vgb     = _mm_setzero_pd();
            vcoul   = _mm_setzero_pd();
            dvdatmp = _mm_setzero_pd();

            /***********************************/
            /* INTERACTION SECTION STARTS HERE */
            /***********************************/

            GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq);
            GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj);

            isaprod = _mm_mul_sd(isai,isaj);
            qq      = _mm_mul_sd(jq,iq);
            vcoul   = _mm_mul_sd(qq,rinv);
            fscal   = _mm_mul_sd(vcoul,rinv);
            vctot   = _mm_add_pd(vctot,vcoul); /* (**) */

            /* Polarization interaction */
            qq      = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor));
            gbscale = _mm_mul_sd(isaprod,gbtabscale);

            /* Calculate GB table index */
            r    = _mm_mul_sd(rsq,rinv);
            rtab = _mm_mul_sd(r,gbscale);
            n0   = _mm_cvttpd_epi32(rtab);
            eps  = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0));
            nnn  = _mm_slli_epi32(n0,2);

            /* the tables are 16-byte aligned, so we can use _mm_load_pd */
            Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0)));
            F = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(Y,F);
            G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2);
            H = _mm_setzero_pd();
            GMX_MM_TRANSPOSE2_PD(G,H);

            G = _mm_mul_sd(G,eps);
            H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) );
            F = _mm_add_sd(F, _mm_add_sd( G , H ) );
            Y = _mm_add_sd(Y, _mm_mul_sd(F, eps));
            F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two)));

            vgb     = _mm_mul_sd(Y, qq);
            fijGB   = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale));
            dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf);

            vgbtot  = _mm_add_pd(vgbtot, vgb);      /* (**) */
            dvdasum = _mm_add_pd(dvdasum, dvdatmp); /* (**) */
            dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj));

            GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp);

            fscal = _mm_mul_sd( _mm_sub_sd( fscal, fijGB),rinv );

            /***********************************/
            /*  INTERACTION SECTION ENDS HERE  */
            /***********************************/

            /* Calculate temporary vectorial force */
            tx = _mm_mul_sd(fscal,dx);
            ty = _mm_mul_sd(fscal,dy);
            tz = _mm_mul_sd(fscal,dz);

            /* Increment i atom force */
            fix = _mm_add_sd(fix,tx);
            fiy = _mm_add_sd(fiy,ty);
            fiz = _mm_add_sd(fiz,tz);

            /* Store j forces back */
GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz); } dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai)); gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3); ggid = gid[n]; gmx_mm_update_1pot_pd(vctot,vc+ggid); gmx_mm_update_1pot_pd(vgbtot,gpol+ggid); gmx_mm_update_1pot_pd(dvdasum,dvda+ii); } *outeriter = nri; *inneriter = nj1; }
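/*
 * For reference: a scalar sketch of the cubic-spline table lookup the kernel
 * above vectorizes, assuming the 4-doubles-per-point (Y,F,G,H) layout implied
 * by the index shift nnn = n0<<2. Returns the potential value VV and writes
 * the force-table value FF that the kernel multiplies into fijGB.
 */
static double gb_table_ref(const double *GBtab, double r, double gbscale,
                           double *FF)
{
    double        rtab = r*gbscale;
    int           n    = (int)rtab;   /* truncation, as _mm_cvttpd_epi32 */
    double        eps  = rtab - n;
    const double *p    = GBtab + 4*n;
    double        Y    = p[0], F = p[1], G = p[2], H = p[3];

    double Geps  = G*eps;
    double Heps2 = H*eps*eps;
    double Fp    = F + Geps + Heps2;

    *FF = Fp + Geps + 2.0*Heps2;  /* same as the F + (G + 2H) step above */
    return Y + eps*Fp;            /* same as the Y + F*eps step above   */
}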
void bl1_ddotaxmyv2( int n, double* alpha, double* beta, double* x, int inc_x, double* u, int inc_u, double* rho, double* y, int inc_y, double* z, int inc_z ) #if BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_SSE_INTRINSICS { double* restrict chi1; double* restrict upsilon1; double* restrict psi1; double* restrict zeta1; double rho_c; int i; int n_pre; int n_run; int n_left; v2df_t a1v, b1v; v2df_t rho1v; v2df_t x1v, u1v, y1v, z1v; if ( inc_x != 1 || inc_u != 1 || inc_y != 1 || inc_z != 1 ) bl1_abort(); n_pre = 0; if ( ( unsigned long ) z % 16 != 0 ) { if ( ( unsigned long ) x % 16 == 0 || ( unsigned long ) u % 16 == 0 || ( unsigned long ) y % 16 == 0 ) bl1_abort(); n_pre = 1; } n_run = ( n - n_pre ) / 2; n_left = ( n - n_pre ) % 2; chi1 = x; upsilon1 = u; psi1 = y; zeta1 = z; rho_c = 0.0; if ( n_pre == 1 ) { double alpha_c = *alpha; double beta_c = *beta; double chi1_c = *chi1; double upsilon_c = *upsilon1; rho_c += chi1_c * upsilon_c; *psi1 -= alpha_c * chi1_c; *zeta1 -= beta_c * chi1_c; chi1 += inc_x; upsilon1 += inc_u; psi1 += inc_y; zeta1 += inc_z; } a1v.v = _mm_loaddup_pd( ( double* )alpha ); b1v.v = _mm_loaddup_pd( ( double* )beta ); rho1v.v = _mm_setzero_pd(); for ( i = 0; i < n_run; ++i ) { x1v.v = _mm_load_pd( ( double* )chi1 ); u1v.v = _mm_load_pd( ( double* )upsilon1 ); y1v.v = _mm_load_pd( ( double* )psi1 ); z1v.v = _mm_load_pd( ( double* )zeta1 ); rho1v.v += x1v.v * u1v.v; y1v.v -= a1v.v * x1v.v; z1v.v -= b1v.v * x1v.v; _mm_store_pd( ( double* )psi1, y1v.v ); _mm_store_pd( ( double* )zeta1, z1v.v ); chi1 += 2; upsilon1 += 2; psi1 += 2; zeta1 += 2; } rho_c += rho1v.d[0] + rho1v.d[1]; if ( n_left > 0 ) { double alpha_c = *alpha; double beta_c = *beta; for( i = 0; i < n_left; ++i ) { double chi1_c = *chi1; double upsilon_c = *upsilon1; rho_c += chi1_c * upsilon_c; *psi1 -= alpha_c * chi1_c; *zeta1 -= beta_c * chi1_c; chi1 += inc_x; upsilon1 += inc_u; psi1 += inc_y; zeta1 += inc_z; } } *rho = rho_c; }
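/*
 * For reference: the fused operation bl1_ddotaxmyv2 computes, written as
 * plain scalar C. One pass produces the dot product rho = x'u while updating
 * y -= alpha*x and z -= beta*x; the SSE2 path above does exactly this two
 * elements per iteration on 16-byte-aligned, unit-stride data.
 */
static void ddotaxmyv2_ref( int n, double alpha, double beta,
                            const double* x, const double* u,
                            double* rho, double* y, double* z )
{
	double rho_c = 0.0;
	int    i;

	for ( i = 0; i < n; ++i )
	{
		rho_c += x[i] * u[i];   /* dot-product accumulation */
		y[i]  -= alpha * x[i];  /* first axpy-style update  */
		z[i]  -= beta  * x[i];  /* second axpy-style update */
	}

	*rho = rho_c;
}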
inline double lanczos13m53::lanczos_sum_expG_scaled<double>(const double& x) { static const ALIGN16 double coeff[26] = { static_cast<double>(0.006061842346248906525783753964555936883222L), static_cast<double>(1u), static_cast<double>(0.5098416655656676188125178644804694509993L), static_cast<double>(66u), static_cast<double>(19.51992788247617482847860966235652136208L), static_cast<double>(1925u), static_cast<double>(449.9445569063168119446858607650988409623L), static_cast<double>(32670u), static_cast<double>(6955.999602515376140356310115515198987526L), static_cast<double>(357423u), static_cast<double>(75999.29304014542649875303443598909137092L), static_cast<double>(2637558u), static_cast<double>(601859.6171681098786670226533699352302507L), static_cast<double>(13339535u), static_cast<double>(3481712.15498064590882071018964774556468L), static_cast<double>(45995730u), static_cast<double>(14605578.08768506808414169982791359218571L), static_cast<double>(105258076u), static_cast<double>(43338889.32467613834773723740590533316085L), static_cast<double>(150917976u), static_cast<double>(86363131.28813859145546927288977868422342L), static_cast<double>(120543840u), static_cast<double>(103794043.1163445451906271053616070238554L), static_cast<double>(39916800u), static_cast<double>(56906521.91347156388090791033559122686859L), static_cast<double>(0u) }; register __m128d vx = _mm_load1_pd(&x); register __m128d sum_even = _mm_load_pd(coeff); register __m128d sum_odd = _mm_load_pd(coeff+2); register __m128d nc_odd, nc_even; register __m128d vx2 = _mm_mul_pd(vx, vx); sum_even = _mm_mul_pd(sum_even, vx2); nc_even = _mm_load_pd(coeff + 4); sum_odd = _mm_mul_pd(sum_odd, vx2); nc_odd = _mm_load_pd(coeff + 6); sum_even = _mm_add_pd(sum_even, nc_even); sum_odd = _mm_add_pd(sum_odd, nc_odd); sum_even = _mm_mul_pd(sum_even, vx2); nc_even = _mm_load_pd(coeff + 8); sum_odd = _mm_mul_pd(sum_odd, vx2); nc_odd = _mm_load_pd(coeff + 10); sum_even = _mm_add_pd(sum_even, nc_even); sum_odd = _mm_add_pd(sum_odd, nc_odd); sum_even = _mm_mul_pd(sum_even, vx2); nc_even = _mm_load_pd(coeff + 12); sum_odd = _mm_mul_pd(sum_odd, vx2); nc_odd = _mm_load_pd(coeff + 14); sum_even = _mm_add_pd(sum_even, nc_even); sum_odd = _mm_add_pd(sum_odd, nc_odd); sum_even = _mm_mul_pd(sum_even, vx2); nc_even = _mm_load_pd(coeff + 16); sum_odd = _mm_mul_pd(sum_odd, vx2); nc_odd = _mm_load_pd(coeff + 18); sum_even = _mm_add_pd(sum_even, nc_even); sum_odd = _mm_add_pd(sum_odd, nc_odd); sum_even = _mm_mul_pd(sum_even, vx2); nc_even = _mm_load_pd(coeff + 20); sum_odd = _mm_mul_pd(sum_odd, vx2); nc_odd = _mm_load_pd(coeff + 22); sum_even = _mm_add_pd(sum_even, nc_even); sum_odd = _mm_add_pd(sum_odd, nc_odd); sum_even = _mm_mul_pd(sum_even, vx2); nc_even = _mm_load_pd(coeff + 24); sum_odd = _mm_mul_pd(sum_odd, vx); sum_even = _mm_add_pd(sum_even, nc_even); sum_even = _mm_add_pd(sum_even, sum_odd); double ALIGN16 t[2]; _mm_store_pd(t, sum_even); return t[0] / t[1]; }
inline double lanczos13m53::lanczos_sum<double>(const double& x) { static const ALIGN16 double coeff[26] = { static_cast<double>(2.506628274631000270164908177133837338626L), static_cast<double>(1u), static_cast<double>(210.8242777515793458725097339207133627117L), static_cast<double>(66u), static_cast<double>(8071.672002365816210638002902272250613822L), static_cast<double>(1925u), static_cast<double>(186056.2653952234950402949897160456992822L), static_cast<double>(32670u), static_cast<double>(2876370.628935372441225409051620849613599L), static_cast<double>(357423u), static_cast<double>(31426415.58540019438061423162831820536287L), static_cast<double>(2637558u), static_cast<double>(248874557.8620541565114603864132294232163L), static_cast<double>(13339535u), static_cast<double>(1439720407.311721673663223072794912393972L), static_cast<double>(45995730u), static_cast<double>(6039542586.35202800506429164430729792107L), static_cast<double>(105258076u), static_cast<double>(17921034426.03720969991975575445893111267L), static_cast<double>(150917976u), static_cast<double>(35711959237.35566804944018545154716670596L), static_cast<double>(120543840u), static_cast<double>(42919803642.64909876895789904700198885093L), static_cast<double>(39916800u), static_cast<double>(23531376880.41075968857200767445163675473L), static_cast<double>(0u) }; register __m128d vx = _mm_load1_pd(&x); register __m128d sum_even = _mm_load_pd(coeff); register __m128d sum_odd = _mm_load_pd(coeff+2); register __m128d nc_odd, nc_even; register __m128d vx2 = _mm_mul_pd(vx, vx); sum_even = _mm_mul_pd(sum_even, vx2); nc_even = _mm_load_pd(coeff + 4); sum_odd = _mm_mul_pd(sum_odd, vx2); nc_odd = _mm_load_pd(coeff + 6); sum_even = _mm_add_pd(sum_even, nc_even); sum_odd = _mm_add_pd(sum_odd, nc_odd); sum_even = _mm_mul_pd(sum_even, vx2); nc_even = _mm_load_pd(coeff + 8); sum_odd = _mm_mul_pd(sum_odd, vx2); nc_odd = _mm_load_pd(coeff + 10); sum_even = _mm_add_pd(sum_even, nc_even); sum_odd = _mm_add_pd(sum_odd, nc_odd); sum_even = _mm_mul_pd(sum_even, vx2); nc_even = _mm_load_pd(coeff + 12); sum_odd = _mm_mul_pd(sum_odd, vx2); nc_odd = _mm_load_pd(coeff + 14); sum_even = _mm_add_pd(sum_even, nc_even); sum_odd = _mm_add_pd(sum_odd, nc_odd); sum_even = _mm_mul_pd(sum_even, vx2); nc_even = _mm_load_pd(coeff + 16); sum_odd = _mm_mul_pd(sum_odd, vx2); nc_odd = _mm_load_pd(coeff + 18); sum_even = _mm_add_pd(sum_even, nc_even); sum_odd = _mm_add_pd(sum_odd, nc_odd); sum_even = _mm_mul_pd(sum_even, vx2); nc_even = _mm_load_pd(coeff + 20); sum_odd = _mm_mul_pd(sum_odd, vx2); nc_odd = _mm_load_pd(coeff + 22); sum_even = _mm_add_pd(sum_even, nc_even); sum_odd = _mm_add_pd(sum_odd, nc_odd); sum_even = _mm_mul_pd(sum_even, vx2); nc_even = _mm_load_pd(coeff + 24); sum_odd = _mm_mul_pd(sum_odd, vx); sum_even = _mm_add_pd(sum_even, nc_even); sum_even = _mm_add_pd(sum_even, sum_odd); double ALIGN16 t[2]; _mm_store_pd(t, sum_even); return t[0] / t[1]; }
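/*
 * For reference: both lanczos_sum variants above evaluate the same kind of
 * rational function. The interleaved coeff[] table pairs each numerator
 * coefficient with a denominator coefficient so one __m128d carries
 * (num, den), and the even/odd register split (sum_even/sum_odd stepped by
 * vx2) only shortens the dependency chain. A plain scalar Horner sketch with
 * num[i] = coeff[2*i] and den[i] = coeff[2*i+1], i = 0..12 (highest degree
 * first), computes the identical value:
 */
static double lanczos_ratio_ref(const double *num, const double *den,
                                int ncoef, double x)
{
    double p = num[0], q = den[0];
    for (int i = 1; i < ncoef; ++i)
    {
        p = p*x + num[i];   /* numerator Horner step   */
        q = q*x + den[i];   /* denominator Horner step */
    }
    return p/q;             /* same t[0]/t[1] division as above */
}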
int main()
{
#ifndef __EMSCRIPTEN__
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
#endif
	printf ("{ \"workload\": %u, \"results\": [\n", N);
	assert(N%2 == 0); // Don't care about the tail for now.
	double *src = get_src_d();
	for(int i = 0; i < N; ++i)
		src[i] = (double)rand() / RAND_MAX;
	double *src2 = get_src2_d();
	for(int i = 0; i < N; ++i)
		src2[i] = (double)rand() / RAND_MAX;
	double *dst = get_dst_d();
	float scalarTime;

	SETCHART("load");
	START();
	for(int i = 0; i < N; ++i)
		dst[i] = src[i];
	ENDSCALAR(checksum_dst(dst), "scalar");
	LS_TEST("_mm_load_pd", _mm_load_pd, 0, _mm_store_pd, double*, 0, 2);
	LS_TEST("_mm_load_pd1", _mm_load_pd1, 1, _mm_store_pd, double*, 0, 2);
	LS_TEST("_mm_load_sd", _mm_load_sd, 1, _mm_store_pd, double*, 0, 2);
	// _mm_load_si128
	LS_TEST("_mm_load1_pd", _mm_load1_pd, 1, _mm_store_pd, double*, 0, 2);
	__m128d tempReg = _mm_set_pd(1.0, 2.0);
	LSH_TEST("_mm_loadh_pd", tempReg, _mm_loadh_pd, double*, 1, _mm_store_pd, double*, 0, 2);
	// _mm_loadl_epi64
	LSH_TEST("_mm_loadl_pd", tempReg, _mm_loadl_pd, double*, 1, _mm_store_pd, double*, 0, 2);
	LS_TEST("_mm_loadr_pd", _mm_loadr_pd, 0, _mm_store_pd, double*, 0, 2);
	LS_TEST("_mm_loadu_pd", _mm_loadu_pd, 1, _mm_store_pd, double*, 0, 2);
	// _mm_loadu_si128

	SETCHART("set");
	/* _mm_set_epi16 _mm_set_epi32 _mm_set_epi64 _mm_set_epi64x _mm_set_epi8 */
	SS_TEST_D("_mm_set_pd", _mm_set_pd(src[i+2], src[i+0]));
	//SS_TEST_D("_mm_set_pd1", _mm_set_pd1(src[i]));
	SS_TEST_D("_mm_set_sd", _mm_set_sd(src[i]));
	/* _mm_set1_epi16 _mm_set1_epi32 _mm_set1_epi64 _mm_set1_epi64x _mm_set1_epi8 */
	SS_TEST_D("_mm_set1_pd", _mm_set1_pd(src[i]));
	/* _mm_setr_epi16 _mm_setr_epi32 _mm_setr_epi64 _mm_setr_epi8 */
	SS_TEST_D("_mm_setr_pd", _mm_setr_pd(src[i+0], src[i+2]));
	SS_TEST_D("_mm_setzero_pd", _mm_setzero_pd()); // _mm_setzero_si128

	SETCHART("move");
	// _mm_move_epi64
	SS_TEST_D("_mm_move_sd", _mm_move_sd(_mm_load_pd(src+i), _mm_load_pd(src2+i)));

	SETCHART("store");
	// _mm_maskmoveu_si128
	LS_TEST("_mm_store_pd", _mm_load_pd, 0, _mm_store_pd, double*, 0, 2);
	// LS_TEST("_mm_store_pd1", _mm_load_pd, 0, _mm_store_pd1, double*, 0);
	LS_TEST("_mm_store_sd", _mm_load_pd, 0, _mm_store_sd, double*, 1, 2);
	// _mm_store_si128
	// _mm_store1_pd
	LS64_TEST("_mm_storeh_pi", _mm_load_pd, 0, _mm_storeh_pi, 1, 2);
	// _mm_storel_epi64
	LS64_TEST("_mm_storel_pi", _mm_load_pd, 0, _mm_storel_pi, 1, 2);
	LS_TEST("_mm_storer_pd", _mm_load_pd, 0, _mm_storer_pd, double*, 0, 2);
	LS_TEST("_mm_storeu_pd", _mm_load_pd, 0, _mm_storeu_pd, double*, 1, 2);
	// _mm_storeu_si128
	LS_TEST("_mm_stream_pd", _mm_load_pd, 0, _mm_stream_pd, double*, 0, 2);
	// _mm_stream_si128
	// _mm_stream_si32
	// _mm_stream_si64

	SETCHART("arithmetic");
	// _mm_add_epi16 // _mm_add_epi32 // _mm_add_epi64 // _mm_add_epi8
	START();
	dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
	for(int i = 0; i < N; ++i)
	{
		dst[0] += src2[0]; dst[1] += src2[1]; dst[2] += src2[2]; dst[3] += src2[3];
	}
	ENDSCALAR(checksum_dst(dst), "scalar add");
	BINARYOP_TEST_D("_mm_add_pd", _mm_add_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_add_sd", _mm_add_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_adds_epi16 // _mm_adds_epi8 // _mm_adds_epu16 // _mm_adds_epu8
	START();
	dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
	for(int i = 0; i < N; ++i)
	{
		dst[0] /= src2[0]; dst[1] /= src2[1]; dst[2] /= src2[2]; dst[3] /= src2[3];
	}
	ENDSCALAR(checksum_dst(dst), "scalar
div"); BINARYOP_TEST_D("_mm_div_pd", _mm_div_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_div_sd", _mm_div_sd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_madd_epi16 // _mm_mul_epu32 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] *= src2[0]; dst[1] *= src2[1]; dst[2] *= src2[2]; dst[3] *= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar mul"); BINARYOP_TEST_D("_mm_mul_pd", _mm_mul_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_mul_sd", _mm_mul_sd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_mulhi_epi16 // _mm_mulhi_epu16 // _mm_mullo_epi16 // _mm_sad_epu8 // _mm_sub_epi16 // _mm_sub_epi32 // _mm_sub_epi64 // _mm_sub_epi8 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] -= src2[0]; dst[1] -= src2[1]; dst[2] -= src2[2]; dst[3] -= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar sub"); BINARYOP_TEST_D("_mm_sub_pd", _mm_sub_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_sub_sd", _mm_sub_sd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_subs_epi16 // _mm_subs_epi8 // _mm_subs_epu16 // _mm_subs_epu8 SETCHART("roots"); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = sqrt(dst[0]); dst[1] = sqrt(dst[1]); dst[2] = sqrt(dst[2]); dst[3] = sqrt(dst[3]); } ENDSCALAR(checksum_dst(dst), "scalar sqrt"); UNARYOP_TEST_D("_mm_sqrt_pd", _mm_sqrt_pd, _mm_load_pd(src)); // UNARYOP_TEST_D("_mm_sqrt_sd", _mm_sqrt_sd, _mm_load_pd(src)); SETCHART("logical"); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd(dcastu(dst[0]) & dcastu(src2[0])); dst[1] = ucastd(dcastu(dst[1]) & dcastu(src2[1])); dst[2] = ucastd(dcastu(dst[2]) & dcastu(src2[2])); dst[3] = ucastd(dcastu(dst[3]) & dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar and"); BINARYOP_TEST_D("_mm_and_pd", _mm_and_pd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_and_si128 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd((~dcastu(dst[0])) & dcastu(src2[0])); dst[1] = ucastd((~dcastu(dst[1])) & dcastu(src2[1])); dst[2] = ucastd((~dcastu(dst[2])) & dcastu(src2[2])); dst[3] = ucastd((~dcastu(dst[3])) & dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar andnot"); BINARYOP_TEST_D("_mm_andnot_pd", _mm_andnot_pd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_andnot_si128 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd(dcastu(dst[0]) | dcastu(src2[0])); dst[1] = ucastd(dcastu(dst[1]) | dcastu(src2[1])); dst[2] = ucastd(dcastu(dst[2]) | dcastu(src2[2])); dst[3] = ucastd(dcastu(dst[3]) | dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar or"); BINARYOP_TEST_D("_mm_or_pd", _mm_or_pd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_or_si128 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd(dcastu(dst[0]) ^ dcastu(src2[0])); dst[1] = ucastd(dcastu(dst[1]) ^ dcastu(src2[1])); dst[2] = ucastd(dcastu(dst[2]) ^ dcastu(src2[2])); dst[3] = ucastd(dcastu(dst[3]) ^ dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar xor"); BINARYOP_TEST_D("_mm_xor_pd", _mm_xor_pd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_xor_si128 SETCHART("cmp"); // _mm_cmpeq_epi16 // _mm_cmpeq_epi32 // _mm_cmpeq_epi8 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] 
= src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] == src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] == src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] == src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] == src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp=="); BINARYOP_TEST_D("_mm_cmpeq_pd", _mm_cmpeq_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_cmpeq_sd", _mm_cmpeq_sd, _mm_load_pd(src), _mm_load_pd(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] >= src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] >= src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] >= src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] >= src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp>="); BINARYOP_TEST_D("_mm_cmpge_pd", _mm_cmpge_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_cmpge_sd", _mm_cmpge_sd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_cmpgt_epi16 // _mm_cmpgt_epi32 // _mm_cmpgt_epi8 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] > src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] > src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] > src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] > src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp>"); BINARYOP_TEST_D("_mm_cmpgt_pd", _mm_cmpgt_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_cmpgt_sd", _mm_cmpgt_sd, _mm_load_pd(src), _mm_load_pd(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] <= src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] <= src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] <= src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] <= src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp<="); BINARYOP_TEST_D("_mm_cmple_pd", _mm_cmple_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_cmple_sd", _mm_cmple_sd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_cmplt_epi16 // _mm_cmplt_epi32 // _mm_cmplt_epi8 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] < src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] < src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] < src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] < src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp<"); BINARYOP_TEST_D("_mm_cmplt_pd", _mm_cmplt_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_cmplt_sd", _mm_cmplt_sd, _mm_load_pd(src), _mm_load_pd(src2)); /*_mm_cmpneq_pd _mm_cmpneq_sd _mm_cmpnge_pd _mm_cmpnge_sd _mm_cmpngt_pd _mm_cmpngt_sd _mm_cmpnle_pd _mm_cmpnle_sd _mm_cmpnlt_pd _mm_cmpnlt_sd*/ START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (!Isnan(dst[0]) && !Isnan(src2[0])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (!Isnan(dst[1]) && !Isnan(src2[1])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (!Isnan(dst[2]) && !Isnan(src2[2])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (!Isnan(dst[3]) && !Isnan(src2[3])) ? 
ucastd(0xFFFFFFFFU) : 0.f;
	}
	ENDSCALAR(checksum_dst(dst), "scalar cmpord");
	BINARYOP_TEST_D("_mm_cmpord_pd", _mm_cmpord_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmpord_sd", _mm_cmpord_sd, _mm_load_pd(src), _mm_load_pd(src2));
	START();
	dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
	for(int i = 0; i < N; ++i)
	{
		dst[0] = (Isnan(dst[0]) || Isnan(src2[0])) ? ucastd(0xFFFFFFFFU) : 0.f;
		dst[1] = (Isnan(dst[1]) || Isnan(src2[1])) ? ucastd(0xFFFFFFFFU) : 0.f;
		dst[2] = (Isnan(dst[2]) || Isnan(src2[2])) ? ucastd(0xFFFFFFFFU) : 0.f;
		dst[3] = (Isnan(dst[3]) || Isnan(src2[3])) ? ucastd(0xFFFFFFFFU) : 0.f;
	}
	ENDSCALAR(checksum_dst(dst), "scalar cmpunord");
	BINARYOP_TEST_D("_mm_cmpunord_pd", _mm_cmpunord_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmpunord_sd", _mm_cmpunord_sd, _mm_load_pd(src), _mm_load_pd(src2));

	SETCHART("max");
	// _mm_max_epi16 // _mm_max_epu8
	START();
	dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
	for(int i = 0; i < N; ++i)
	{
		dst[0] = Max(dst[0], src2[0]); dst[1] = Max(dst[1], src2[1]); dst[2] = Max(dst[2], src2[2]); dst[3] = Max(dst[3], src2[3]);
	}
	ENDSCALAR(checksum_dst(dst), "scalar max");
	BINARYOP_TEST_D("_mm_max_pd", _mm_max_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_max_sd", _mm_max_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_min_epi16 // _mm_min_epu8
	START();
	dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
	for(int i = 0; i < N; ++i)
	{
		dst[0] = Min(dst[0], src2[0]); dst[1] = Min(dst[1], src2[1]); dst[2] = Min(dst[2], src2[2]); dst[3] = Min(dst[3], src2[3]);
	}
	ENDSCALAR(checksum_dst(dst), "scalar min");
	BINARYOP_TEST_D("_mm_min_pd", _mm_min_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_min_sd", _mm_min_sd, _mm_load_pd(src), _mm_load_pd(src2));

	SETCHART("shuffle");
	// _mm_extract_epi16 // _mm_insert_epi16 // _mm_shuffle_epi32
	START();
	dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
	for(int i = 0; i < N; ++i)
	{
		dst[3] = dst[1]; dst[2] = dst[0]; dst[1] = src2[3]; dst[0] = src2[2];
	}
	ENDSCALAR(checksum_dst(dst), "scalar shuffle");
	// BINARYOP_TEST_D("_mm_shuffle_pd", _mm_shuffle_pd, _mm_load_pd(src), _mm_load_pd(src2));
	START();
	__m128d o0 = _mm_load_pd(src);
	__m128d o1 = _mm_load_pd(src2);
	for(int i = 0; i < N; i += 4)
		o0 = _mm_shuffle_pd(o0, o1, _MM_SHUFFLE2(1, 0));
	_mm_store_pd(dst, o0);
	END(checksum_dst(dst), "_mm_shuffle_pd");
	// _mm_shufflehi_epi16 // _mm_shufflelo_epi16
	// _mm_unpackhi_epi16 // _mm_unpackhi_epi32 // _mm_unpackhi_epi64 // _mm_unpackhi_epi8
	START();
	dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
	for(int i = 0; i < N; ++i)
	{
		dst[0] = dst[2]; dst[1] = src2[2]; dst[2] = dst[3]; dst[3] = src2[3];
	}
	ENDSCALAR(checksum_dst(dst), "scalar unpackhi_pd");
	BINARYOP_TEST_D("_mm_unpackhi_pd", _mm_unpackhi_pd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_unpacklo_epi16 // _mm_unpacklo_epi32 // _mm_unpacklo_epi64 // _mm_unpacklo_epi8
	START();
	dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
	for(int i = 0; i < N; ++i)
	{
		dst[2] = dst[1]; dst[1] = dst[0]; dst[0] = src2[0]; dst[3] = src2[1];
	}
	ENDSCALAR(checksum_dst(dst), "scalar unpacklo_pd");
	BINARYOP_TEST_D("_mm_unpacklo_pd", _mm_unpacklo_pd, _mm_load_pd(src), _mm_load_pd(src2));
	printf("]}\n");
	/*
	printf("Finished!\n");
	printf("Total time spent in scalar intrinsics: %f msecs.\n", (double)scalarTotalTicks * 1000.0 / ticks_per_sec());
	printf("Total time spent in SSE2 intrinsics: %f msecs.\n", (double)simdTotalTicks * 1000.0 / ticks_per_sec());
	if (scalarTotalTicks > simdTotalTicks)
		printf("SSE2 was %.3fx faster than scalar!\n", (double)scalarTotalTicks / simdTotalTicks);
	else
		printf("SSE2 was %.3fx slower than scalar!\n", (double)simdTotalTicks / scalarTotalTicks);
	*/
#ifdef __EMSCRIPTEN__
	fprintf(stderr,"User Agent: %s\n", emscripten_run_script_string("navigator.userAgent"));
	printf("/*Test finished! Now please close Firefox to continue with benchmark_sse2.py.*/\n");
#endif
	exit(0);
}
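/*
 * For reference: the timing macros used in main() are defined elsewhere in
 * this harness, so the sketch below only shows the shape a binary-op test
 * presumably has; the macro name and body here are assumptions, not the real
 * BINARYOP_TEST_D. START()/END()/checksum_dst() are the harness's own.
 */
#define BINARYOP_TEST_D_SKETCH(name, op, a, b) \
	do { \
		__m128d r = (a); \
		START();                      /* begin timing */ \
		for(int i = 0; i < N; ++i) \
			r = op(r, (b));           /* chain through r so the op is not optimized away */ \
		_mm_store_pd(dst, r); \
		END(checksum_dst(dst), name); /* stop timing, emit one result row */ \
	} while(0)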
/* * Subtract off x0 & x1 contribution to all remaining equations using a * rank-2 update with mu=2, nu=3, ku=2. This version is for 16 SSE regs. * nu is the # of RHS, ku is the number of equations solved, and mu is * unrolled only to enable vectorization & software pipelining of load/use. * Loop order is MKN, so that B is kept completely in registers, and * C and A are streamed in (and out, for C) from cache during the operation. */ ATL_SINLINE void ATL_rk2(ATL_CINT M, const TYPE *pA0, const TYPE *pA1, const TYPE *pB0, const TYPE *pB1, TYPE *C, ATL_CINT ldc0) { ATL_CINT ldc=ldc0+ldc0; TYPE *pC0 = C, *pC1 = C+ldc, *pC2 = C+((ldc)<<1); ATL_INT i; ATL_CINT MM = (M&1) ? M-1 : M-2; register __m128d B00, B10, B01, B11, B02, B12; register __m128d C00, C01, C02, C10, C11, C12; register __m128d A, a; B00 = _mm_load_pd(pB0); B10 = _mm_load_pd(pB1); B01 = _mm_load_pd(pB0+2); B11 = _mm_load_pd(pB1+2); B02 = _mm_load_pd(pB0+4); B12 = _mm_load_pd(pB1+4); /* iB12, rB12 */ C00 = _mm_load_pd(pC0); C01 = _mm_load_pd(pC1); C02 = _mm_load_pd(pC2); A = _mm_load_pd(pA0); /* iA00, rA00 */ for (i=0; i < MM; i += 2, pA0 += 4, pA1 += 4, pC0 += 4, pC1 += 4, pC2 += 4) { register __m128d b; /* * K=0, M=[0,1], apply real components of B0x */ b = _mm_movedup_pd(B00); /* rB00, rB00 */ b = _mm_mul_pd(b, A); /* iA00*rB00, rA00*rB00 */ C00 = _mm_add_pd(C00, b); a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E); /* rA00, iA00 */ b = _mm_movedup_pd(B01); b = _mm_mul_pd(b, A); C01 = _mm_add_pd(C01, b); C10 = _mm_load_pd(pC0+2); b = _mm_movedup_pd(B02); b = _mm_mul_pd(b, A); C02 = _mm_add_pd(C02, b); A = _mm_load_pd(pA1); /* iA01, rA01 */ /* * K=0, M=0, apply imaginary components of B0x */ b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */ b = _mm_mul_pd(b, a); /* rA00*iB00, iA00*iB00 */ C00 = _mm_addsub_pd(C00, b); C11 = _mm_load_pd(pC1+2); b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE); b = _mm_mul_pd(b, a); C01 = _mm_addsub_pd(C01, b); C12 = _mm_load_pd(pC2+2); b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE); b = _mm_mul_pd(b, a); C02 = _mm_addsub_pd(C02, b); /* * K=1, M=0, apply real components of B1x */ b = _mm_movedup_pd(B10); /* rB10, rB10 */ b = _mm_mul_pd(b, A); /* iA01*rB10, rA01*rB10 */ C00 = _mm_add_pd(C00, b); a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E); /* rA01, iA01 */ b = _mm_movedup_pd(B11); b = _mm_mul_pd(b, A); C01 = _mm_add_pd(C01, b); b = _mm_movedup_pd(B12); b = _mm_mul_pd(b, A); C02 = _mm_add_pd(C02, b); A = _mm_load_pd(pA0+2); /* iA10, rA10 */ /* * K=1, M=0, apply imaginary components of B1x */ b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */ b = _mm_mul_pd(b, a); /* rA01*iB10, iA01*iB10 */ C00 = _mm_addsub_pd(C00, b); _mm_store_pd(pC0, C00); b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE); b = _mm_mul_pd(b, a); C01 = _mm_addsub_pd(C01, b); _mm_store_pd(pC1, C01); b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE); b = _mm_mul_pd(b, a); C02 = _mm_addsub_pd(C02, b); _mm_store_pd(pC2, C02); /* * K=0, M=1, apply real components of B0x */ b = _mm_movedup_pd(B00); /* rB00, rB00 */ b = _mm_mul_pd(b, A); /* iA10*rB00, rA10*rB00 */ C10 = _mm_add_pd(C10, b); a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E); /* rA10, iA10 */ b = _mm_movedup_pd(B01); b = _mm_mul_pd(b, A); C11 = _mm_add_pd(C11, b); C00 = _mm_load_pd(pC0+4); b = _mm_movedup_pd(B02); b = _mm_mul_pd(b, A); C12 = _mm_add_pd(C12, b); A = _mm_load_pd(pA1+2); /* iA11, rA11 */ /* * K=0, M=1, apply imaginary components of B0x */ b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */ b = 
_mm_mul_pd(b, a); /* rA10*iB00, iA10*iB00 */ C10 = _mm_addsub_pd(C10, b); C01 = _mm_load_pd(pC1+4); b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE); b = _mm_mul_pd(b, a); C11 = _mm_addsub_pd(C11, b); C02 = _mm_load_pd(pC2+4); b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE); b = _mm_mul_pd(b, a); C12 = _mm_addsub_pd(C12, b); /* * K=1, M=1, apply real components of B1x */ b = _mm_movedup_pd(B10); /* rB10, rB10 */ b = _mm_mul_pd(b, A); /* iA11*rB10, rA11*rB10 */ C10 = _mm_add_pd(C10, b); a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E); /* rA11, iA11 */ b = _mm_movedup_pd(B11); b = _mm_mul_pd(b, A); C11 = _mm_add_pd(C11, b); b = _mm_movedup_pd(B12); b = _mm_mul_pd(b, A); C12 = _mm_add_pd(C12, b); A = _mm_load_pd(pA0+4); /* iA20, rA20 */ /* * K=1, M=1, apply imaginary components of B1x */ b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */ b = _mm_mul_pd(b, a); /* rA11*iB10, iA11*iB10 */ C10 = _mm_addsub_pd(C10, b); _mm_store_pd(pC0+2, C10); b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE); b = _mm_mul_pd(b, a); C11 = _mm_addsub_pd(C11, b); _mm_store_pd(pC1+2, C11); b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE); b = _mm_mul_pd(b, a); C12 = _mm_addsub_pd(C12, b); _mm_store_pd(pC2+2, C12); } /* * Drain pipes */ { register __m128d b; /* * K=0, M=[0,1], apply real components of B0x */ b = _mm_movedup_pd(B00); /* rB00, rB00 */ b = _mm_mul_pd(b, A); /* iA00*rB00, rA00*rB00 */ C00 = _mm_add_pd(C00, b); a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E); /* rA00, iA00 */ b = _mm_movedup_pd(B01); b = _mm_mul_pd(b, A); C01 = _mm_add_pd(C01, b); b = _mm_movedup_pd(B02); b = _mm_mul_pd(b, A); C02 = _mm_add_pd(C02, b); A = _mm_load_pd(pA1); /* iA01, rA01 */ /* * K=0, M=0, apply imaginary components of B0x */ b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */ b = _mm_mul_pd(b, a); /* rA00*iB00, iA00*iB00 */ C00 = _mm_addsub_pd(C00, b); b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE); b = _mm_mul_pd(b, a); C01 = _mm_addsub_pd(C01, b); b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE); b = _mm_mul_pd(b, a); C02 = _mm_addsub_pd(C02, b); /* * K=1, M=0, apply real components of B1x */ b = _mm_movedup_pd(B10); /* rB10, rB10 */ b = _mm_mul_pd(b, A); /* iA01*rB10, rA01*rB10 */ C00 = _mm_add_pd(C00, b); a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E); /* rA01, iA01 */ b = _mm_movedup_pd(B11); b = _mm_mul_pd(b, A); C01 = _mm_add_pd(C01, b); b = _mm_movedup_pd(B12); b = _mm_mul_pd(b, A); C02 = _mm_add_pd(C02, b); /* * K=1, M=0, apply imaginary components of B1x */ b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */ b = _mm_mul_pd(b, a); /* rA01*iB10, iA01*iB10 */ C00 = _mm_addsub_pd(C00, b); _mm_store_pd(pC0, C00); b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE); b = _mm_mul_pd(b, a); C01 = _mm_addsub_pd(C01, b); _mm_store_pd(pC1, C01); b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE); b = _mm_mul_pd(b, a); C02 = _mm_addsub_pd(C02, b); _mm_store_pd(pC2, C02); if (!(M&1)) { C10 = _mm_load_pd(pC0+2); C11 = _mm_load_pd(pC1+2); C12 = _mm_load_pd(pC2+2); A = _mm_load_pd(pA0+2); /* iA10, rA10 */ /* * K=0, M=1, apply real components of B0x */ b = _mm_movedup_pd(B00); /* rB00, rB00 */ b = _mm_mul_pd(b, A); /* iA10*rB00, rA10*rB00 */ C10 = _mm_add_pd(C10, b); a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E); /* rA10, iA10 */ b = _mm_movedup_pd(B01); b = _mm_mul_pd(b, A); C11 = _mm_add_pd(C11, b); b = _mm_movedup_pd(B02); b = _mm_mul_pd(b, A); C12 = _mm_add_pd(C12, b); A = _mm_load_pd(pA1+2); /* iA11, rA11 */ /* * K=0, M=1, apply imaginary components of 
B0x */ b = (__m128d)_mm_shuffle_epi32((__m128i)B00, 0xEE); /* iB00, iB00 */ b = _mm_mul_pd(b, a); /* rA10*iB00, iA10*iB00 */ C10 = _mm_addsub_pd(C10, b); b = (__m128d)_mm_shuffle_epi32((__m128i)B01, 0xEE); b = _mm_mul_pd(b, a); C11 = _mm_addsub_pd(C11, b); b = (__m128d)_mm_shuffle_epi32((__m128i)B02, 0xEE); b = _mm_mul_pd(b, a); C12 = _mm_addsub_pd(C12, b); /* * K=1, M=1, apply real components of B1x */ b = _mm_movedup_pd(B10); /* rB10, rB10 */ b = _mm_mul_pd(b, A); /* iA11*rB10, rA11*rB10 */ C10 = _mm_add_pd(C10, b); a = (__m128d)_mm_shuffle_epi32((__m128i)A, 0x4E); /* rA11, iA11 */ b = _mm_movedup_pd(B11); b = _mm_mul_pd(b, A); C11 = _mm_add_pd(C11, b); b = _mm_movedup_pd(B12); b = _mm_mul_pd(b, A); C12 = _mm_add_pd(C12, b); /* * K=1, M=1, apply imaginary components of B1x */ b = (__m128d)_mm_shuffle_epi32((__m128i)B10, 0xEE); /* iB10, iB10 */ b = _mm_mul_pd(b, a); /* rA11*iB10, iA11*iB10 */ C10 = _mm_addsub_pd(C10, b); _mm_store_pd(pC0+2, C10); b = (__m128d)_mm_shuffle_epi32((__m128i)B11, 0xEE); b = _mm_mul_pd(b, a); C11 = _mm_addsub_pd(C11, b); _mm_store_pd(pC1+2, C11); b = (__m128d)_mm_shuffle_epi32((__m128i)B12, 0xEE); b = _mm_mul_pd(b, a); C12 = _mm_addsub_pd(C12, b); _mm_store_pd(pC2+2, C12); } } }
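/*
 * For reference: the complex multiply-accumulate pattern each block of
 * ATL_rk2 implements. With registers holding (real, imag) pairs as loaded,
 * _mm_movedup_pd broadcasts Re(B), the 0xEE shuffle broadcasts Im(B), the
 * 0x4E shuffle swaps A's halves, and _mm_addsub_pd supplies the sign flip
 * from i*i = -1. Per element pair this is simply C += A*B:
 */
static void cmla_ref(double *c, const double *a, const double *b)
{
   /* c, a, b are (real, imag) pairs */
   c[0] += a[0]*b[0] - a[1]*b[1]; /* real part: the add_pd plus the addsub low lane  */
   c[1] += a[1]*b[0] + a[0]*b[1]; /* imag part: the add_pd plus the addsub high lane */
}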
template <class T>
void blurRemoveMinMax_(Mat& src, Mat& dest, const int r)
{
	const Size ksize = Size(2*r+1, 2*r+1);
	if(src.data!=dest.data) src.copyTo(dest);

	Mat xv;
	Mat nv;
	Mat element = Mat::ones(2*r+1, 2*r+1, CV_8U);
	dilate(src, xv, element);
	erode(src, nv, element);

	Mat mind;
	Mat maxd;
	Mat mask;
	absdiff(src, nv, mind); // can move to loop
	absdiff(src, xv, maxd); //
	min(mind, maxd, mask);  //

	T* n  = nv.ptr<T>(0);
	T* x  = xv.ptr<T>(0);
	T* d  = dest.ptr<T>(0);
	T* nd = mind.ptr<T>(0);
	T* mk = mask.ptr<T>(0);
	int remsize = src.size().area();

#if CV_SSE4_1
	if(src.depth()==CV_8U)
	{
		const int ssesize = src.size().area()/16;
		remsize = src.size().area()-ssesize*16;
		for(int i=0;i<ssesize;i++)
		{
			__m128i mmk = _mm_load_si128((__m128i*)mk);
			__m128i mnd = _mm_load_si128((__m128i*)nd);
			__m128i mmn = _mm_load_si128((__m128i*)n);
			__m128i mmx = _mm_load_si128((__m128i*)x);
			__m128i msk = _mm_cmpeq_epi8(mnd,mmk);
			_mm_store_si128((__m128i*)d,_mm_blendv_epi8(mmx,mmn,msk));
			nd+=16; mk+=16; d+=16; n+=16; x+=16;
		}
	}
	else if(src.depth()==CV_16S || src.depth()==CV_16U)
	{
		const int ssesize = src.size().area()/8;
		remsize = src.size().area()-ssesize*8;
		for(int i=0;i<ssesize;i++)
		{
			__m128i mmk = _mm_load_si128((__m128i*)mk);
			__m128i mnd = _mm_load_si128((__m128i*)nd);
			__m128i mmn = _mm_load_si128((__m128i*)n);
			__m128i mmx = _mm_load_si128((__m128i*)x);
			__m128i msk = _mm_cmpeq_epi16(mnd,mmk);
			_mm_store_si128((__m128i*)d,_mm_blendv_epi8(mmx,mmn,msk));
			nd+=8; mk+=8; d+=8; n+=8; x+=8;
		}
	}
	else if(src.depth()==CV_32F)
	{
		const int ssesize = src.size().area()/4;
		remsize = src.size().area()-ssesize*4;
		for(int i=0;i<ssesize;i++)
		{
			__m128 mmk = _mm_load_ps((float*)mk);
			__m128 mnd = _mm_load_ps((float*)nd);
			__m128 mmn = _mm_load_ps((float*)n);
			__m128 mmx = _mm_load_ps((float*)x);
			__m128 msk = _mm_cmpeq_ps(mnd,mmk);
			_mm_store_ps((float*)d,_mm_blendv_ps(mmx,mmn,msk));
			nd+=4; mk+=4; d+=4; n+=4; x+=4;
		}
	}
	else if(src.depth()==CV_64F)
	{
		const int ssesize = src.size().area()/2;
		remsize = src.size().area()-ssesize*2;
		for(int i=0;i<ssesize;i++)
		{
			__m128d mmk = _mm_load_pd((double*)mk);
			__m128d mnd = _mm_load_pd((double*)nd);
			__m128d mmn = _mm_load_pd((double*)n);
			__m128d mmx = _mm_load_pd((double*)x);
			__m128d msk = _mm_cmpeq_pd(mnd,mmk);
			_mm_store_pd((double*)d,_mm_blendv_pd(mmx,mmn,msk));
			nd+=2; mk+=2; d+=2; n+=2; x+=2;
		}
	}
#endif
	for(int i=0;i<remsize;i++)
	{
		if(nd[i]==mk[i])
			d[i]=n[i];
		else
			d[i]=x[i];
	}
}
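/*
 * For reference: the per-pixel rule every SIMD branch above implements.
 * nv/xv hold the local min (erode) and local max (dilate) over the kernel
 * window; each output pixel snaps to whichever extreme is nearer in value,
 * and comparing mind against min(mind,maxd) before the blendv select is
 * exactly this choice. A scalar sketch for the double path (the other depths
 * follow the same rule elementwise; fabs needs <math.h>):
 */
static double remove_minmax_pixel_ref(double s, double local_min, double local_max)
{
	double dmin = fabs(s - local_min); /* mind: |src - erode(src)|  */
	double dmax = fabs(s - local_max); /* maxd: |src - dilate(src)| */
	return (dmin <= dmax) ? local_min : local_max; /* ties pick the min side */
}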
static inline __m128d
my_invrsq_pd(__m128d x)
{
	const __m128d three = (const __m128d) {3.0, 3.0};
	const __m128d half  = (const __m128d) {0.5, 0.5};

	__m128  t  = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
	__m128d t1 = _mm_cvtps_pd(t);               /* Convert back to double precision */

	/* First Newton-Raphson step, accuracy is now 24 bits */
	__m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1)))));

	/* Return second Newton-Raphson step, accuracy 48 bits */
	return (__m128d) _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2)))));
}

/* to extract single integers from a __m128i datatype */
#define _mm_extract_epi64(x, imm) \
	_mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))

void nb_kernel430_ia32_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr,
                            int * shift, double * shiftvec, double * fshift,
                            int * gid, double * pos, double * faction,
                            double * charge, double * p_facel, double * p_krf,
                            double * p_crf, double * Vc, int * type,
                            int * p_ntype, double * vdwparam, double * Vvdw,
                            double * p_tabscale, double * VFtab,
                            double * invsqrta, double * dvda,
                            double * p_gbtabscale, double * GBtab,
                            int * p_nthreads, int * count, void * mtx,
                            int * outeriter, int * inneriter, double * work)
{
	int           nri,ntype,nthreads,offset,tj,tj2,nti;
	int           n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
	double        facel,krf,crf,tabscl,gbtabscl,vct,vdwt,vgbt,nt1,nt2;
	double        shX,shY,shZ,isai_d,dva;
	gmx_gbdata_t *gbdata;
	double *      gpol;

	__m128d ix,iy,iz,jx,jy,jz;
	__m128d dx,dy,dz,t1,t2,t3;
	__m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
	__m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
	__m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp,dvdasum,vctot,n0d;
	__m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
	__m128d c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,vgbtot,rinvsq,rinvsix;
	__m128d fac,tabscale,gbtabscale;
	__m128i n0,nnn;

	const __m128d neg    = {-1.0, -1.0};
	const __m128d zero   = { 0.0,  0.0};
	const __m128d half   = { 0.5,  0.5};
	const __m128d two    = { 2.0,  2.0};
	const __m128d three  = { 3.0,  3.0};
	const __m128d six    = { 6.0,  6.0};
	const __m128d twelve = {12.0, 12.0};

	const __m128i four = _mm_set_epi32(4,4,4,4);

	gbdata = (gmx_gbdata_t *)work;
	gpol   = gbdata->gpol;

	nri      = *p_nri;
	ntype    = *p_ntype;
	nthreads = *p_nthreads;
	facel    = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent));
	krf      = *p_krf;
	crf      = *p_crf;
	tabscl   = *p_tabscale;
	gbtabscl = *p_gbtabscale;
	nj1      = 0;

	/* Splat variables */
	fac        = _mm_load1_pd(&facel);
	tabscale   = _mm_load1_pd(&tabscl);
	gbtabscale = _mm_load1_pd(&gbtabscl);

	/* Keep compiler happy */
	Vvdwtmp = _mm_setzero_pd();
	Vvdwtot = _mm_setzero_pd();
	dvdatmp = _mm_setzero_pd();
	dvdaj   = _mm_setzero_pd();
	isaj    = _mm_setzero_pd();
	vcoul   = _mm_setzero_pd();
	vgb     = _mm_setzero_pd();
	t1      = _mm_setzero_pd();
	t2      = _mm_setzero_pd();
	t3      = _mm_setzero_pd();
	xmm1    = _mm_setzero_pd();
	xmm2    = _mm_setzero_pd();
	xmm3    = _mm_setzero_pd();
	xmm4    = _mm_setzero_pd();

	jnr1 = jnr2 = 0;
	j13  = j23  = 0;

	for(n=0;n<nri;n++)
	{
		is3 = 3*shift[n];
		shX = shiftvec[is3];
		shY = shiftvec[is3+1];
		shZ = shiftvec[is3+2];

		nj0    = jindex[n];
		nj1    = jindex[n+1];
		offset = (nj1-nj0)%2;

		ii  = iinr[n];
		ii3 = ii*3;

		ix = _mm_set1_pd(shX+pos[ii3+0]);
		iy = _mm_set1_pd(shY+pos[ii3+1]);
		iz = _mm_set1_pd(shZ+pos[ii3+2]);

		q      = _mm_set1_pd(charge[ii]);
		iq     = _mm_mul_pd(fac,q);
		isai_d = invsqrta[ii];
		isai   = _mm_load1_pd(&isai_d);

		nti = 2*ntype*type[ii];

		fix = _mm_setzero_pd();
		fiy = _mm_setzero_pd();
		fiz = _mm_setzero_pd();

		dvdasum = _mm_setzero_pd();
		vctot   = _mm_setzero_pd();
		vgbtot =
_mm_setzero_pd(); Vvdwtot = _mm_setzero_pd(); for(k=nj0;k<nj1-offset; k+=2) { jnr1 = jjnr[k]; jnr2 = jjnr[k+1]; j13 = jnr1 * 3; j23 = jnr2 * 3; /* Load coordinates */ xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */ xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */ xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */ xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */ /* transpose */ jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* distances */ dx = _mm_sub_pd(ix,jx); dy = _mm_sub_pd(iy,jy); dz = _mm_sub_pd(iz,jz); rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) ); rinv = my_invrsq_pd(rsq11); /* Load invsqrta */ isaj = _mm_loadl_pd(isaj,invsqrta+jnr1); isaj = _mm_loadh_pd(isaj,invsqrta+jnr2); isaprod = _mm_mul_pd(isai,isaj); /* Load charges */ q = _mm_loadl_pd(q,charge+jnr1); q = _mm_loadh_pd(q,charge+jnr2); qq = _mm_mul_pd(iq,q); vcoul = _mm_mul_pd(qq,rinv); fscal = _mm_mul_pd(vcoul,rinv); qq = _mm_mul_pd(isaprod,qq); qq = _mm_mul_pd(qq,neg); gbscale = _mm_mul_pd(isaprod,gbtabscale); /* Load VdW parameters */ tj = nti+2*type[jnr1]; tj2 = nti+2*type[jnr2]; xmm1 = _mm_loadu_pd(vdwparam+tj); xmm2 = _mm_loadu_pd(vdwparam+tj2); c6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); c12 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* Load dvdaj */ dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1); dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2); /* Calculate GB table index */ r = _mm_mul_pd(rsq11,rinv); rt = _mm_mul_pd(r,gbscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_pd(rt,n0d); eps2 = _mm_mul_pd(eps,eps); nnn = _mm_slli_epi64(n0,2); xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); H = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,H); vgb = _mm_mul_pd(qq,VV); fijC = _mm_mul_pd(qq,FF); fijC = _mm_mul_pd(fijC,gbscale); dvdatmp = _mm_mul_pd(fijC,r); dvdatmp = _mm_add_pd(vgb,dvdatmp); dvdatmp = _mm_mul_pd(dvdatmp,neg); dvdatmp = _mm_mul_pd(dvdatmp,half); dvdasum = _mm_add_pd(dvdasum,dvdatmp); xmm1 = _mm_mul_pd(dvdatmp,isaj); xmm1 = _mm_mul_pd(xmm1,isaj); dvdaj = _mm_add_pd(dvdaj,xmm1); /* store dvda */ _mm_storel_pd(dvda+jnr1,dvdaj); _mm_storeh_pd(dvda+jnr2,dvdaj); vctot = _mm_add_pd(vctot,vcoul); vgbtot = _mm_add_pd(vgbtot,vgb); /* Calculate VDW table index */ rt = _mm_mul_pd(r,tabscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_pd(rt,n0d); eps2 = _mm_mul_pd(eps,eps); nnn = _mm_slli_epi32(n0,3); /* Tabulated VdW interaction - dispersion */ xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = 
_mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */

			G  = _mm_mul_pd(G,eps);
			H  = _mm_mul_pd(H,eps2);
			Fp = _mm_add_pd(F,G);
			Fp = _mm_add_pd(Fp,H);
			VV = _mm_mul_pd(Fp,eps);
			VV = _mm_add_pd(Y,VV);
			xmm1 = _mm_mul_pd(two,H);
			FF = _mm_add_pd(Fp,G);
			FF = _mm_add_pd(FF,xmm1);

			Vvdw6 = _mm_mul_pd(c6,VV);
			fijD  = _mm_mul_pd(c6,FF);

			/* Tabulated VdW interaction - repulsion */
			nnn  = _mm_add_epi32(nnn,four);
			xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */

			Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */

			G  = _mm_mul_pd(G,eps);
			H  = _mm_mul_pd(H,eps2);
			Fp = _mm_add_pd(F,G);
			Fp = _mm_add_pd(Fp,H);
			VV = _mm_mul_pd(Fp,eps);
			VV = _mm_add_pd(Y,VV);
			xmm1 = _mm_mul_pd(two,H);
			FF = _mm_add_pd(Fp,G);
			FF = _mm_add_pd(FF,xmm1);

			Vvdw12 = _mm_mul_pd(c12,VV);
			fijR   = _mm_mul_pd(c12,FF);

			Vvdwtmp = _mm_add_pd(Vvdw12,Vvdw6);
			Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);

			xmm1  = _mm_add_pd(fijD,fijR);
			xmm1  = _mm_mul_pd(xmm1,tabscale);
			xmm1  = _mm_add_pd(xmm1,fijC);
			xmm1  = _mm_sub_pd(xmm1,fscal);
			fscal = _mm_mul_pd(xmm1,neg);
			fscal = _mm_mul_pd(fscal,rinv);

			/* calculate partial force terms */
			t1 = _mm_mul_pd(fscal,dx);
			t2 = _mm_mul_pd(fscal,dy);
			t3 = _mm_mul_pd(fscal,dz);

			/* update the i force */
			fix = _mm_add_pd(fix,t1);
			fiy = _mm_add_pd(fiy,t2);
			fiz = _mm_add_pd(fiz,t3);

			/* accumulate forces from memory */
			xmm1 = _mm_loadu_pd(faction+j13);   /* fx1 fy1 */
			xmm2 = _mm_loadu_pd(faction+j23);   /* fx2 fy2 */
			xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
			xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */

			/* transpose */
			xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
			xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
			xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */

			/* subtract partial forces */
			xmm5 = _mm_sub_pd(xmm5,t1);
			xmm6 = _mm_sub_pd(xmm6,t2);
			xmm7 = _mm_sub_pd(xmm7,t3);

			xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
			xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fx2 fy2 */

			/* store fx and fy */
			_mm_storeu_pd(faction+j13,xmm1);
			_mm_storeu_pd(faction+j23,xmm2);

			/* .. then fz */
			_mm_storel_pd(faction+j13+2,xmm7);
			_mm_storeh_pd(faction+j23+2,xmm7);
		}

		/* In double precision, offset can only be either 0 or 1 */
		if(offset!=0)
		{
			jnr1 = jjnr[k];
			j13  = jnr1*3;

			jx = _mm_load_sd(pos+j13);
			jy = _mm_load_sd(pos+j13+1);
			jz = _mm_load_sd(pos+j13+2);

			isaj    = _mm_load_sd(invsqrta+jnr1);
			isaprod = _mm_mul_sd(isai,isaj);
			dvdaj   = _mm_load_sd(dvda+jnr1);
			q       = _mm_load_sd(charge+jnr1);
			qq      = _mm_mul_sd(iq,q);

			dx = _mm_sub_sd(ix,jx);
			dy = _mm_sub_sd(iy,jy);
			dz = _mm_sub_sd(iz,jz);

			rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv  = my_invrsq_pd(rsq11);

			vcoul = _mm_mul_sd(qq,rinv);
			fscal = _mm_mul_sd(vcoul,rinv);

			qq      = _mm_mul_sd(isaprod,qq);
			qq      = _mm_mul_sd(qq,neg);
			gbscale = _mm_mul_sd(isaprod,gbtabscale);

			/* Load VdW parameters */
			tj  = nti+2*type[jnr1];
			c6  = _mm_load_sd(vdwparam+tj);
			c12 = _mm_load_sd(vdwparam+tj+1);

			/* Calculate GB table index */
			r    = _mm_mul_sd(rsq11,rinv);
			rt   = _mm_mul_sd(r,gbscale);
			n0   = _mm_cvttpd_epi32(rt);
			n0d  = _mm_cvtepi32_pd(n0);
			eps  = _mm_sub_sd(rt,n0d);
			eps2 = _mm_mul_sd(eps,eps);
			nnn  = _mm_slli_epi64(n0,2);

			xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));
			xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));
			xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2);
			xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2);

			Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
			F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
			G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0));
			H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1));

			G  = _mm_mul_sd(G,eps);
			H  = _mm_mul_sd(H,eps2);
			Fp = _mm_add_sd(F,G);
			Fp = _mm_add_sd(Fp,H);
			VV = _mm_mul_sd(Fp,eps);
			VV = _mm_add_sd(Y,VV);
			H  = _mm_mul_sd(two,H);
			FF = _mm_add_sd(Fp,G);
			FF = _mm_add_sd(FF,H);

			vgb  = _mm_mul_sd(qq,VV);
			fijC = _mm_mul_sd(qq,FF);
			fijC = _mm_mul_sd(fijC,gbscale);

			dvdatmp = _mm_mul_sd(fijC,r);
			dvdatmp = _mm_add_sd(vgb,dvdatmp);
			dvdatmp = _mm_mul_sd(dvdatmp,neg);
			dvdatmp = _mm_mul_sd(dvdatmp,half);
			dvdasum = _mm_add_sd(dvdasum,dvdatmp);

			xmm1  = _mm_mul_sd(dvdatmp,isaj);
			xmm1  = _mm_mul_sd(xmm1,isaj);
			dvdaj = _mm_add_sd(dvdaj,xmm1);

			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);

			vctot  = _mm_add_sd(vctot,vcoul);
			vgbtot = _mm_add_sd(vgbtot,vgb);

			/* Calculate VDW table index */
			rt   = _mm_mul_sd(r,tabscale);
			n0   = _mm_cvttpd_epi32(rt);
			n0d  = _mm_cvtepi32_pd(n0);
			eps  = _mm_sub_sd(rt,n0d);
			eps2 = _mm_mul_sd(eps,eps);
			nnn  = _mm_slli_epi32(n0,3);

			/* Tabulated VdW interaction - dispersion */
			xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */

			Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */

			G  = _mm_mul_sd(G,eps);
			H  = _mm_mul_sd(H,eps2);
			Fp = _mm_add_sd(F,G);
			Fp = _mm_add_sd(Fp,H);
			VV = _mm_mul_sd(Fp,eps);
			VV = _mm_add_sd(Y,VV);
			xmm1 = _mm_mul_sd(two,H);
			FF = _mm_add_sd(Fp,G);
			FF = _mm_add_sd(FF,xmm1);

			Vvdw6 = _mm_mul_sd(c6,VV);
			fijD  = _mm_mul_sd(c6,FF);

			/* Tabulated VdW interaction - repulsion */
			nnn  = _mm_add_epi32(nnn,four);
			xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */

			Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */

			G  = _mm_mul_sd(G,eps);
			H  = _mm_mul_sd(H,eps2);
			Fp = _mm_add_sd(F,G);
			Fp = _mm_add_sd(Fp,H);
			VV = _mm_mul_sd(Fp,eps);
			VV = _mm_add_sd(Y,VV);
			xmm1 = _mm_mul_sd(two,H);
			FF = _mm_add_sd(Fp,G);
			FF = _mm_add_sd(FF,xmm1);

			Vvdw12 = _mm_mul_sd(c12,VV);
			fijR   = _mm_mul_sd(c12,FF);

			Vvdwtmp = _mm_add_sd(Vvdw12,Vvdw6);
			Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp);

			xmm1  = _mm_add_sd(fijD,fijR);
			xmm1  = _mm_mul_sd(xmm1,tabscale);
			xmm1  = _mm_add_sd(xmm1,fijC);
			xmm1  = _mm_sub_sd(xmm1,fscal);
			fscal = _mm_mul_sd(xmm1,neg);
			fscal = _mm_mul_sd(fscal,rinv);

			/* calculate partial force terms */
			t1 = _mm_mul_sd(fscal,dx);
			t2 = _mm_mul_sd(fscal,dy);
			t3 = _mm_mul_sd(fscal,dz);

			/* update the i force */
			fix = _mm_add_sd(fix,t1);
			fiy = _mm_add_sd(fiy,t2);
			fiz = _mm_add_sd(fiz,t3);

			/* accumulate forces from memory */
			xmm5 = _mm_load_sd(faction+j13);   /* fx */
			xmm6 = _mm_load_sd(faction+j13+1); /* fy */
			xmm7 = _mm_load_sd(faction+j13+2); /* fz */

			/* subtract partial forces */
			xmm5 = _mm_sub_sd(xmm5,t1);
			xmm6 = _mm_sub_sd(xmm6,t2);
			xmm7 = _mm_sub_sd(xmm7,t3);

			/* store forces */
			_mm_store_sd(faction+j13,xmm5);
			_mm_store_sd(faction+j13+1,xmm6);
			_mm_store_sd(faction+j13+2,xmm7);
		}

		/* fix/fiy/fiz now contain two partial terms, which should all be
		 * added to the i particle forces
		 */
		t1 = _mm_unpacklo_pd(t1,fix);
		t2 = _mm_unpacklo_pd(t2,fiy);
		t3 = _mm_unpacklo_pd(t3,fiz);

		fix = _mm_add_pd(fix,t1);
		fiy = _mm_add_pd(fiy,t2);
		fiz = _mm_add_pd(fiz,t3);

		fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
		fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
		fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));

		/* Load i forces from memory */
		xmm1 = _mm_load_sd(faction+ii3);
		xmm2 = _mm_load_sd(faction+ii3+1);
		xmm3 = _mm_load_sd(faction+ii3+2);

		/* Add to i force */
		fix = _mm_add_sd(fix,xmm1);
		fiy = _mm_add_sd(fiy,xmm2);
		fiz = _mm_add_sd(fiz,xmm3);

		/* store i forces to memory */
		_mm_store_sd(faction+ii3,fix);
		_mm_store_sd(faction+ii3+1,fiy);
		_mm_store_sd(faction+ii3+2,fiz);

		/* now do dvda */
		dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum);
		dvdasum = _mm_add_pd(dvdasum,dvdatmp);
		_mm_storeh_pd(&dva,dvdasum);
		dvda[ii] = dvda[ii] + dva*isai_d*isai_d;

		ggid = gid[n];

		/* Coulomb potential */
		vcoul = _mm_unpacklo_pd(vcoul,vctot);
		vctot = _mm_add_pd(vctot,vcoul);
		_mm_storeh_pd(&vct,vctot);
		Vc[ggid] = Vc[ggid] + vct;

		/* VdW potential */
		Vvdwtmp = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot);
		Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
		_mm_storeh_pd(&vdwt,Vvdwtot);
		Vvdw[ggid] = Vvdw[ggid] + vdwt;

		/* GB potential */
		vgb    = _mm_unpacklo_pd(vgb,vgbtot);
		vgbtot = _mm_add_pd(vgbtot,vgb);
		_mm_storeh_pd(&vgbt,vgbtot);
		gpol[ggid] = gpol[ggid] + vgbt;
	}

	*outeriter = nri;
	*inneriter = nj1;
}
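/*
 * For reference: a scalar model of my_invrsq_pd used throughout the kernel
 * above. The SSE version seeds from the ~12-bit single-precision rsqrt and
 * refines with two Newton-Raphson steps, y <- 0.5*y*(3 - x*y*y); the scalar
 * seed below (1/sqrtf) is a stand-in for _mm_rsqrt_ps, so only the step
 * structure, not the exact seed accuracy, matches.
 */
#include <math.h>

static double invrsq_ref(double x)
{
	double y = (double)(1.0f/sqrtf((float)x)); /* stand-in for the rsqrt seed */
	y = 0.5*y*(3.0 - x*y*y);                   /* first NR step,  ~24 bits */
	y = 0.5*y*(3.0 - x*y*y);                   /* second NR step, ~48 bits */
	return y;
}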
void kernel_dgemv_t_8_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; __builtin_prefetch( A + 0*lda ); __builtin_prefetch( A + 2*lda ); __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); double *tA, *tx; int k; int ka = kmax; // number of elements from the aligned position __m256d aaxx_temp, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0_1_2_3, y_00, y_11, y_22, y_33, y_44, y_55, y_66, y_77; __m128d ax_temp, a_00_10, a_01_11, a_02_12, a_03_13, x_0_1, y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7; y_00 = _mm256_setzero_pd(); y_11 = _mm256_setzero_pd(); y_22 = _mm256_setzero_pd(); y_33 = _mm256_setzero_pd(); y_44 = _mm256_setzero_pd(); y_55 = _mm256_setzero_pd(); y_66 = _mm256_setzero_pd(); y_77 = _mm256_setzero_pd(); y_0 = _mm256_castpd256_pd128(y_00); y_1 = _mm256_castpd256_pd128(y_11); y_2 = _mm256_castpd256_pd128(y_22); y_3 = _mm256_castpd256_pd128(y_33); y_4 = _mm256_castpd256_pd128(y_44); y_5 = _mm256_castpd256_pd128(y_55); y_6 = _mm256_castpd256_pd128(y_66); y_7 = _mm256_castpd256_pd128(y_77); k = lda*(ka/lda); tA = A + (ka/lda)*sda*lda; tx = x + (ka/lda)*lda; if(ka-k>0) // it can be only ka-k = {1, 2, 3} { if((ka-k)>=2) { x_0_1 = _mm_load_pd( &tx[0] ); a_00_10 = _mm_load_pd( &tA[0+lda*0] ); a_01_11 = _mm_load_pd( &tA[0+lda*1] ); a_02_12 = _mm_load_pd( &tA[0+lda*2] ); a_03_13 = _mm_load_pd( &tA[0+lda*3] ); ax_temp = _mm_mul_pd( a_00_10, x_0_1 ); y_0 = _mm_add_pd (y_0, ax_temp ); ax_temp = _mm_mul_pd( a_01_11, x_0_1 ); y_1 = _mm_add_pd (y_1, ax_temp ); ax_temp = _mm_mul_pd( a_02_12, x_0_1 ); y_2 = _mm_add_pd (y_2, ax_temp ); ax_temp = _mm_mul_pd( a_03_13, x_0_1 ); y_3 = _mm_add_pd (y_3, ax_temp ); a_00_10 = _mm_load_pd( &tA[0+lda*4] ); a_01_11 = _mm_load_pd( &tA[0+lda*5] ); a_02_12 = _mm_load_pd( &tA[0+lda*6] ); a_03_13 = _mm_load_pd( &tA[0+lda*7] ); ax_temp = _mm_mul_pd( a_00_10, x_0_1 ); y_4 = _mm_add_pd (y_4, ax_temp ); ax_temp = _mm_mul_pd( a_01_11, x_0_1 ); y_5 = _mm_add_pd (y_5, ax_temp ); ax_temp = _mm_mul_pd( a_02_12, x_0_1 ); y_6 = _mm_add_pd (y_6, ax_temp ); ax_temp = _mm_mul_pd( a_03_13, x_0_1 ); y_7 = _mm_add_pd (y_7, ax_temp ); tA += 2; tx += 2; k+=2; } if((ka-k)==1) { x_0_1 = _mm_load_sd( &tx[0] ); a_00_10 = _mm_load_sd( &tA[0+lda*0] ); a_01_11 = _mm_load_sd( &tA[0+lda*1] ); a_02_12 = _mm_load_sd( &tA[0+lda*2] ); a_03_13 = _mm_load_sd( &tA[0+lda*3] ); ax_temp = _mm_mul_sd( a_00_10, x_0_1 ); y_0 = _mm_add_sd (y_0, ax_temp ); ax_temp = _mm_mul_sd( a_01_11, x_0_1 ); y_1 = _mm_add_sd (y_1, ax_temp ); ax_temp = _mm_mul_sd( a_02_12, x_0_1 ); y_2 = _mm_add_sd (y_2, ax_temp ); ax_temp = _mm_mul_sd( a_03_13, x_0_1 ); y_3 = _mm_add_sd (y_3, ax_temp ); a_00_10 = _mm_load_sd( &tA[0+lda*4] ); a_01_11 = _mm_load_sd( &tA[0+lda*5] ); a_02_12 = _mm_load_sd( &tA[0+lda*6] ); a_03_13 = _mm_load_sd( &tA[0+lda*7] ); ax_temp = _mm_mul_sd( a_00_10, x_0_1 ); y_4 = _mm_add_sd (y_4, ax_temp ); ax_temp = _mm_mul_sd( a_01_11, x_0_1 ); y_5 = _mm_add_sd (y_5, ax_temp ); ax_temp = _mm_mul_sd( a_02_12, x_0_1 ); y_6 = _mm_add_sd (y_6, ax_temp ); ax_temp = _mm_mul_sd( a_03_13, x_0_1 ); y_7 = _mm_add_sd (y_7, ax_temp ); tA += 1; tx += 1; k++; } } y_00 = _mm256_castpd128_pd256(y_0); y_11 = _mm256_castpd128_pd256(y_1); y_22 = _mm256_castpd128_pd256(y_2); y_33 = _mm256_castpd128_pd256(y_3); y_44 = _mm256_castpd128_pd256(y_4); y_55 = _mm256_castpd128_pd256(y_5); y_66 = _mm256_castpd128_pd256(y_6); y_77 = _mm256_castpd128_pd256(y_7); k=0; for(; k<ka-7; k+=8) { __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda );
x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_55 = _mm256_add_pd( y_55, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_77 = _mm256_add_pd( y_77, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_55 = _mm256_add_pd( y_55, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_77 = _mm256_add_pd( y_77, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } for(; k<ka-3; k+=4) { __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = 
_mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_55 = _mm256_add_pd( y_55, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_77 = _mm256_add_pd( y_77, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } __m256d y_0_1_2_3, y_4_5_6_7; y_00 = _mm256_hadd_pd(y_00, y_11); y_22 = _mm256_hadd_pd(y_22, y_33); y_44 = _mm256_hadd_pd(y_44, y_55); y_66 = _mm256_hadd_pd(y_66, y_77); y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 ); y_00 = _mm256_permute2f128_pd(y_22, y_00, 19); y_55 = _mm256_permute2f128_pd(y_66, y_44, 2 ); y_44 = _mm256_permute2f128_pd(y_66, y_44, 19); y_00 = _mm256_add_pd( y_00, y_11 ); y_44 = _mm256_add_pd( y_44, y_55 ); if(alg==0) { _mm256_storeu_pd(&y[0], y_00); _mm256_storeu_pd(&y[4], y_44); } else if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_sub_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } }
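/* Plain-C reference (a sketch under the assumption of the usual "lib4" panel-major
 * layout: 4-row panels, element (row,col) at row + 4*col inside a panel, consecutive
 * panels 4*sda doubles apart) for what kernel_dgemv_t_8_lib4 computes: eight dot
 * products y[0..7] = A^T * x, combined with y according to alg (0 = store, 1 = add,
 * -1 = subtract). Not part of the library. */
void kernel_dgemv_t_8_ref(int kmax, double *A, int sda, double *x, double *y, int alg)
{
    for(int j=0; j<8; j++)
    {
        double t = 0.0;
        for(int k=0; k<kmax; k++)
            t += A[(k/4)*4*sda + k%4 + 4*j] * x[k];  /* element (k,j) */
        if(alg==0)      y[j]  = t;
        else if(alg==1) y[j] += t;
        else            y[j] -= t;
    }
}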
__m128d test (double *e) { return _mm_load_pd (e); }
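/* Note on the one-liner above: _mm_load_pd requires e to be 16-byte aligned, and the
 * access is undefined otherwise; _mm_loadu_pd lifts that requirement at some cost on
 * older CPUs. A hedged illustration (load_any is not a standard name): */
#include <emmintrin.h>
__m128d load_any(const double *e, int aligned)
{
    return aligned ? _mm_load_pd(e)     /* e must be 16-byte aligned */
                   : _mm_loadu_pd(e);   /* any address is fine       */
}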
// it moves horizontally inside a block void kernel_dgemv_n_2_lib4(int kmax, double *A, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; int k; __m128d ax_temp, a_00_10, a_01_11, a_02_12, a_03_13, x_0, x_1, x_2, x_3, y_0_1, y_0_1_b, y_0_1_c, y_0_1_d, z_0_1; y_0_1 = _mm_setzero_pd(); y_0_1_b = _mm_setzero_pd(); y_0_1_c = _mm_setzero_pd(); y_0_1_d = _mm_setzero_pd(); k=0; for(; k<kmax-3; k+=4) { x_0 = _mm_loaddup_pd( &x[0] ); x_1 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); a_01_11 = _mm_load_pd( &A[0+lda*1] ); x_2 = _mm_loaddup_pd( &x[2] ); x_3 = _mm_loaddup_pd( &x[3] ); a_02_12 = _mm_load_pd( &A[0+lda*2] ); a_03_13 = _mm_load_pd( &A[0+lda*3] ); ax_temp = _mm_mul_pd( a_00_10, x_0 ); y_0_1 = _mm_add_pd( y_0_1, ax_temp ); ax_temp = _mm_mul_pd( a_01_11, x_1 ); y_0_1_b = _mm_add_pd( y_0_1_b, ax_temp ); ax_temp = _mm_mul_pd( a_02_12, x_2 ); y_0_1_c = _mm_add_pd( y_0_1_c, ax_temp ); ax_temp = _mm_mul_pd( a_03_13, x_3 ); y_0_1_d = _mm_add_pd( y_0_1_d, ax_temp ); A += 4*lda; x += 4; } y_0_1 = _mm_add_pd( y_0_1, y_0_1_c ); y_0_1_b = _mm_add_pd( y_0_1_b, y_0_1_d ); if(kmax%4>=2) { x_0 = _mm_loaddup_pd( &x[0] ); x_1 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); a_01_11 = _mm_load_pd( &A[0+lda*1] ); ax_temp = _mm_mul_pd( a_00_10, x_0 ); y_0_1 = _mm_add_pd( y_0_1, ax_temp ); ax_temp = _mm_mul_pd( a_01_11, x_1 ); y_0_1_b = _mm_add_pd( y_0_1_b, ax_temp ); A += 2*lda; x += 2; } y_0_1 = _mm_add_pd( y_0_1, y_0_1_b ); if(kmax%2==1) { x_0 = _mm_loaddup_pd( &x[0] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); ax_temp = _mm_mul_pd( a_00_10, x_0 ); y_0_1 = _mm_add_pd( y_0_1, ax_temp ); } if(alg==0) { _mm_storeu_pd(&y[0], y_0_1); } else if(alg==1) { z_0_1 = _mm_loadu_pd( &y[0] ); z_0_1 = _mm_add_pd( z_0_1, y_0_1 ); _mm_storeu_pd(&y[0], z_0_1); } else // alg==-1 { z_0_1 = _mm_loadu_pd( &y[0] ); z_0_1 = _mm_sub_pd( z_0_1, y_0_1 ); _mm_storeu_pd(&y[0], z_0_1); } }
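/* Reference semantics for kernel_dgemv_n_2_lib4, as a sketch (same assumed lib4
 * layout: inside one 4-row panel, element (row,col) sits at A[row + 4*col]): it
 * produces the first two entries of y = A*x for that panel, with the usual alg
 * store/add/subtract convention. */
void kernel_dgemv_n_2_ref(int kmax, double *A, double *x, double *y, int alg)
{
    double t0 = 0.0, t1 = 0.0;
    for(int k=0; k<kmax; k++)
    {
        t0 += A[0 + 4*k] * x[k];   /* row 0 of the panel */
        t1 += A[1 + 4*k] * x[k];   /* row 1 of the panel */
    }
    if(alg==0)      { y[0]  = t0; y[1]  = t1; }
    else if(alg==1) { y[0] += t0; y[1] += t1; }
    else            { y[0] -= t0; y[1] -= t1; }
}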
void SpringEmbedderFRExact::mainStep_sse3(ArrayGraph &C) { //#if (defined(OGDF_ARCH_X86) || defined(OGDF_ARCH_X64)) && !(defined(__GNUC__) && !defined(__SSE3__)) #ifdef OGDF_SSE3_EXTENSIONS const int n = C.numberOfNodes(); #ifdef _OPENMP const int work = 256; const int nThreadsRep = min(omp_get_max_threads(), 1 + n*n/work); const int nThreadsPrev = min(omp_get_max_threads(), 1 + n /work); #endif const double k = m_idealEdgeLength; const double kSquare = k*k; const double c_rep = 0.052 * kSquare; // 0.2 = factor for repulsive forces as suggested by Warshal const double minDist = 10e-6;//100*DBL_EPSILON; const double minDistSquare = minDist*minDist; double *disp_x = (double*) System::alignedMemoryAlloc16(n*sizeof(double)); double *disp_y = (double*) System::alignedMemoryAlloc16(n*sizeof(double)); __m128d mm_kSquare = _mm_set1_pd(kSquare); __m128d mm_minDist = _mm_set1_pd(minDist); __m128d mm_minDistSquare = _mm_set1_pd(minDistSquare); __m128d mm_c_rep = _mm_set1_pd(c_rep); #pragma omp parallel num_threads(nThreadsRep) { double tx = m_txNull; double ty = m_tyNull; int cF = 1; for(int i = 1; i <= m_iterations; i++) { // repulsive forces #pragma omp for for(int v = 0; v < n; ++v) { __m128d mm_disp_xv = _mm_setzero_pd(); __m128d mm_disp_yv = _mm_setzero_pd(); __m128d mm_xv = _mm_set1_pd(C.m_x[v]); __m128d mm_yv = _mm_set1_pd(C.m_y[v]); int u; for(u = 0; u+1 < v; u += 2) { __m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_pd(mm_minDistSquare, _mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare))); } int uStart = u+2; if(u == v) ++u; if(u < n) { __m128d mm_delta_x = _mm_sub_sd(mm_xv, _mm_load_sd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_sd(mm_minDistSquare, _mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare))); } for(u = uStart; u < n; u += 2) { __m128d mm_delta_x = _mm_sub_pd(mm_xv, _mm_load_pd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_pd(mm_yv, _mm_load_pd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_pd(mm_minDistSquare, _mm_add_pd(_mm_mul_pd(mm_delta_x,mm_delta_x),_mm_mul_pd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_pd(_mm_load_pd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_pd(mm_disp_xv, _mm_mul_pd(mm_delta_x, _mm_div_pd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_pd(mm_disp_yv, _mm_mul_pd(mm_delta_y, _mm_div_pd(mm_kSquare,mm_distSquare))); } if(u < n) { __m128d mm_delta_x = _mm_sub_sd(mm_xv, 
_mm_load_sd(&C.m_x[u])); __m128d mm_delta_y = _mm_sub_sd(mm_yv, _mm_load_sd(&C.m_y[u])); __m128d mm_distSquare = _mm_max_sd(mm_minDistSquare, _mm_add_sd(_mm_mul_sd(mm_delta_x,mm_delta_x),_mm_mul_sd(mm_delta_y,mm_delta_y)) ); __m128d mm_t = _mm_div_sd(_mm_load_sd(&C.m_nodeWeight[u]), mm_distSquare); mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, mm_t)); mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, mm_t)); //mm_disp_xv = _mm_add_sd(mm_disp_xv, _mm_mul_sd(mm_delta_x, _mm_div_sd(mm_kSquare,mm_distSquare))); //mm_disp_yv = _mm_add_sd(mm_disp_yv, _mm_mul_sd(mm_delta_y, _mm_div_sd(mm_kSquare,mm_distSquare))); } mm_disp_xv = _mm_hadd_pd(mm_disp_xv,mm_disp_xv); mm_disp_yv = _mm_hadd_pd(mm_disp_yv,mm_disp_yv); _mm_store_sd(&disp_x[v], _mm_mul_sd(mm_disp_xv, mm_c_rep)); _mm_store_sd(&disp_y[v], _mm_mul_sd(mm_disp_yv, mm_c_rep)); } // attractive forces #pragma omp single for(int e = 0; e < C.numberOfEdges(); ++e) { int v = C.m_src[e]; int u = C.m_tgt[e]; double delta_x = C.m_x[v] - C.m_x[u]; double delta_y = C.m_y[v] - C.m_y[u]; double dist = max(minDist, sqrt(delta_x*delta_x + delta_y*delta_y)); disp_x[v] -= delta_x * dist / k; disp_y[v] -= delta_y * dist / k; disp_x[u] += delta_x * dist / k; disp_y[u] += delta_y * dist / k; } // limit the maximum displacement to the temperature (m_tx,m_ty) __m128d mm_tx = _mm_set1_pd(tx); __m128d mm_ty = _mm_set1_pd(ty); #pragma omp for nowait for(int v = 0; v < n-1; v += 2) { __m128d mm_disp_xv = _mm_load_pd(&disp_x[v]); __m128d mm_disp_yv = _mm_load_pd(&disp_y[v]); __m128d mm_dist = _mm_max_pd(mm_minDist, _mm_sqrt_pd( _mm_add_pd(_mm_mul_pd(mm_disp_xv,mm_disp_xv),_mm_mul_pd(mm_disp_yv,mm_disp_yv)) )); _mm_store_pd(&C.m_x[v], _mm_add_pd(_mm_load_pd(&C.m_x[v]), _mm_mul_pd(_mm_div_pd(mm_disp_xv, mm_dist), _mm_min_pd(mm_dist,mm_tx)) )); _mm_store_pd(&C.m_y[v], _mm_add_pd(_mm_load_pd(&C.m_y[v]), _mm_mul_pd(_mm_div_pd(mm_disp_yv, mm_dist), _mm_min_pd(mm_dist,mm_ty)) )); } #pragma omp single nowait { if(n % 2) { int v = n-1; double dist = max(minDist, sqrt(disp_x[v]*disp_x[v] + disp_y[v]*disp_y[v])); C.m_x[v] += disp_x[v] / dist * min(dist,tx); C.m_y[v] += disp_y[v] / dist * min(dist,ty); } } cool(tx,ty,cF); #pragma omp barrier } } System::alignedMemoryFree(disp_x); System::alignedMemoryFree(disp_y); #else mainStep(C); #endif }
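/* Scalar sketch of one Fruchterman-Reingold iteration as vectorized above (OpenMP,
 * cooling and the SSE3 horizontal reductions omitted; all names here are
 * illustrative, not OGDF API). Repulsion weights nodeWeight/dist^2 scaled by c_rep,
 * attraction dist/k per edge, and the displacement is clamped to the temperature
 * (tx,ty). */
#include <math.h>
static void fr_iteration_ref(int n, const double *weight, double *x, double *y,
                             double *dx, double *dy,
                             int nEdges, const int *src, const int *tgt,
                             double k, double c_rep, double minDist,
                             double tx, double ty)
{
    double minDist2 = minDist*minDist;
    for (int v = 0; v < n; ++v)                 /* repulsive forces */
    {
        double fx = 0.0, fy = 0.0;
        for (int u = 0; u < n; ++u)
        {
            if (u == v) continue;
            double ddx = x[v]-x[u], ddy = y[v]-y[u];
            double d2 = ddx*ddx + ddy*ddy;
            if (d2 < minDist2) d2 = minDist2;
            fx += ddx * weight[u] / d2;
            fy += ddy * weight[u] / d2;
        }
        dx[v] = fx * c_rep;
        dy[v] = fy * c_rep;
    }
    for (int e = 0; e < nEdges; ++e)            /* attractive forces */
    {
        int v = src[e], u = tgt[e];
        double ddx = x[v]-x[u], ddy = y[v]-y[u];
        double d = sqrt(ddx*ddx + ddy*ddy);
        if (d < minDist) d = minDist;
        dx[v] -= ddx*d/k; dy[v] -= ddy*d/k;
        dx[u] += ddx*d/k; dy[u] += ddy*d/k;
    }
    for (int v = 0; v < n; ++v)                 /* move, limited by temperature */
    {
        double d = sqrt(dx[v]*dx[v] + dy[v]*dy[v]);
        if (d < minDist) d = minDist;
        x[v] += dx[v]/d * (d < tx ? d : tx);
        y[v] += dy[v]/d * (d < ty ? d : ty);
    }
}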
AABB3d TriangleItemHandler::clip( const size_t item_index, const size_t dimension, const double slab_min, const double slab_max) const { const TriangleVertexInfo& vertex_info = m_triangle_vertex_infos[item_index]; if (vertex_info.m_motion_segment_count > 0) { AABB3d triangle_bbox = m_triangle_bboxes[item_index]; if (triangle_bbox.min[dimension] < slab_min) triangle_bbox.min[dimension] = slab_min; if (triangle_bbox.max[dimension] > slab_max) triangle_bbox.max[dimension] = slab_max; return triangle_bbox; } #ifdef APPLESEED_USE_SSE APPLESEED_SIMD4_ALIGN const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]); APPLESEED_SIMD4_ALIGN const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]); APPLESEED_SIMD4_ALIGN const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]); const double v0d = v0[dimension]; const double v1d = v1[dimension]; const double v2d = v2[dimension]; const int v0_ge_min = v0d >= slab_min ? 1 : 0; const int v0_le_max = v0d <= slab_max ? 1 : 0; const int v1_ge_min = v1d >= slab_min ? 1 : 0; const int v1_le_max = v1d <= slab_max ? 1 : 0; const int v2_ge_min = v2d >= slab_min ? 1 : 0; const int v2_le_max = v2d <= slab_max ? 1 : 0; __m128d bbox_min_xy = _mm_set1_pd(+numeric_limits<double>::max()); __m128d bbox_min_zz = _mm_set1_pd(+numeric_limits<double>::max()); __m128d bbox_max_xy = _mm_set1_pd(-numeric_limits<double>::max()); __m128d bbox_max_zz = _mm_set1_pd(-numeric_limits<double>::max()); const __m128d v0_xy = _mm_load_pd(&v0.x); const __m128d v0_zz = _mm_set1_pd(v0.z); const __m128d v1_xy = _mm_load_pd(&v1.x); const __m128d v1_zz = _mm_set1_pd(v1.z); const __m128d v2_xy = _mm_load_pd(&v2.x); const __m128d v2_zz = _mm_set1_pd(v2.z); if (v0_ge_min & v0_le_max) { bbox_min_xy = _mm_min_pd(bbox_min_xy, v0_xy); bbox_max_xy = _mm_max_pd(bbox_max_xy, v0_xy); bbox_min_zz = _mm_min_pd(bbox_min_zz, v0_zz); bbox_max_zz = _mm_max_pd(bbox_max_zz, v0_zz); } if (v1_ge_min & v1_le_max) { bbox_min_xy = _mm_min_pd(bbox_min_xy, v1_xy); bbox_max_xy = _mm_max_pd(bbox_max_xy, v1_xy); bbox_min_zz = _mm_min_pd(bbox_min_zz, v1_zz); bbox_max_zz = _mm_max_pd(bbox_max_zz, v1_zz); } if (v2_ge_min & v2_le_max) { bbox_min_xy = _mm_min_pd(bbox_min_xy, v2_xy); bbox_max_xy = _mm_max_pd(bbox_max_xy, v2_xy); bbox_min_zz = _mm_min_pd(bbox_min_zz, v2_zz); bbox_max_zz = _mm_max_pd(bbox_max_zz, v2_zz); } const int v0v1_cross_min = v0_ge_min ^ v1_ge_min; const int v0v1_cross_max = v0_le_max ^ v1_le_max; const int v1v2_cross_min = v1_ge_min ^ v2_ge_min; const int v1v2_cross_max = v1_le_max ^ v2_le_max; const int v2v0_cross_min = v2_ge_min ^ v0_ge_min; const int v2v0_cross_max = v2_le_max ^ v0_le_max; if (v0v1_cross_min | v0v1_cross_max) { const double rcp_v0v1 = 1.0 / (v1[dimension] - v0[dimension]); if (v0v1_cross_min) { const double t = (slab_min - v0[dimension]) * rcp_v0v1; assert(t >= 0.0 && t <= 1.0); const __m128d mt = _mm_set1_pd(t); const __m128d mt1 = _mm_set1_pd(1.0 - t); const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt)); const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt)); bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy); bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy); bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz); bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz); } if (v0v1_cross_max) { const double t = (slab_max - v0[dimension]) * rcp_v0v1; assert(t >= 0.0 && t <= 1.0); const __m128d mt = _mm_set1_pd(t); const __m128d mt1 = _mm_set1_pd(1.0 - t); const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, 
mt)); const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt)); bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy); bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy); bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz); bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz); } } if (v1v2_cross_min | v1v2_cross_max) { const double rcp_v1v2 = 1.0 / (v2[dimension] - v1[dimension]); if (v1v2_cross_min) { const double t = (slab_min - v1[dimension]) * rcp_v1v2; assert(t >= 0.0 && t <= 1.0); const __m128d mt = _mm_set1_pd(t); const __m128d mt1 = _mm_set1_pd(1.0 - t); const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt)); const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt)); bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy); bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy); bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz); bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz); } if (v1v2_cross_max) { const double t = (slab_max - v1[dimension]) * rcp_v1v2; assert(t >= 0.0 && t <= 1.0); const __m128d mt = _mm_set1_pd(t); const __m128d mt1 = _mm_set1_pd(1.0 - t); const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt)); const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt)); bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy); bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy); bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz); bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz); } } if (v2v0_cross_min | v2v0_cross_max) { const double rcp_v2v0 = 1.0 / (v0[dimension] - v2[dimension]); if (v2v0_cross_min) { const double t = (slab_min - v2[dimension]) * rcp_v2v0; assert(t >= 0.0 && t <= 1.0); const __m128d mt = _mm_set1_pd(t); const __m128d mt1 = _mm_set1_pd(1.0 - t); const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt)); const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt)); bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy); bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy); bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz); bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz); } if (v2v0_cross_max) { const double t = (slab_max - v2[dimension]) * rcp_v2v0; assert(t >= 0.0 && t <= 1.0); const __m128d mt = _mm_set1_pd(t); const __m128d mt1 = _mm_set1_pd(1.0 - t); const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt)); const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt)); bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy); bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy); bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz); bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz); } } APPLESEED_SIMD4_ALIGN AABB3d bbox; _mm_store_pd(&bbox.min.x, bbox_min_xy); _mm_store_sd(&bbox.min.z, bbox_min_zz); _mm_storeu_pd(&bbox.max.x, bbox_max_xy); _mm_store_sd(&bbox.max.z, bbox_max_zz); if (bbox.min[dimension] < slab_min) bbox.min[dimension] = slab_min; if (bbox.max[dimension] > slab_max) bbox.max[dimension] = slab_max; #else const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]); const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]); const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]); const int v0_ge_min = v0[dimension] >= slab_min ? 1 : 0; const int v0_le_max = v0[dimension] <= slab_max ? 1 : 0; const int v1_ge_min = v1[dimension] >= slab_min ? 1 : 0; const int v1_le_max = v1[dimension] <= slab_max ? 1 : 0; const int v2_ge_min = v2[dimension] >= slab_min ? 1 : 0; const int v2_le_max = v2[dimension] <= slab_max ? 
1 : 0; AABB3d bbox; bbox.invalidate(); if (v0_ge_min & v0_le_max) bbox.insert(v0); if (v1_ge_min & v1_le_max) bbox.insert(v1); if (v2_ge_min & v2_le_max) bbox.insert(v2); if (v0_ge_min != v1_ge_min) bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_min)); if (v0_le_max != v1_le_max) bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_max)); if (v1_ge_min != v2_ge_min) bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_min)); if (v1_le_max != v2_le_max) bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_max)); if (v2_ge_min != v0_ge_min) bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_min)); if (v2_le_max != v0_le_max) bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_max)); #endif return bbox; }
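/* segment_plane_intersection() is referenced by the fallback path above but not
 * defined in this excerpt. A plausible definition (an assumption, written in plain C
 * with a hypothetical vec3d type) is the linear interpolation of the two endpoints at
 * the parameter where coordinate dim reaches the slab plane, i.e. the same
 * (1-t)*v0 + t*v1 blend the SSE path computes: */
typedef struct { double v[3]; } vec3d;
static vec3d segment_plane_intersection_ref(vec3d a, vec3d b, int dim, double plane)
{
    double t = (plane - a.v[dim]) / (b.v[dim] - a.v[dim]);
    vec3d p;
    for (int i = 0; i < 3; ++i)
        p.v[i] = (1.0 - t)*a.v[i] + t*b.v[i];
    return p;
}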
void bli_daxpyf_int_var1 ( conj_t conja, conj_t conjx, dim_t m, dim_t b_n, double* alpha, double* a, inc_t inca, inc_t lda, double* x, inc_t incx, double* y, inc_t incy, cntx_t* cntx ) { double* restrict alpha_cast = alpha; double* restrict a_cast = a; double* restrict x_cast = x; double* restrict y_cast = y; dim_t i; const dim_t n_elem_per_reg = 2; const dim_t n_iter_unroll = 2; dim_t m_pre; dim_t m_run; dim_t m_left; double* restrict a0; double* restrict a1; double* restrict a2; double* restrict a3; double* restrict y0; double a0c, a1c, a2c, a3c; double chi0, chi1, chi2, chi3; v2df_t a00v, a01v, a02v, a03v, y0v; v2df_t a10v, a11v, a12v, a13v, y1v; v2df_t chi0v, chi1v, chi2v, chi3v; bool_t use_ref = FALSE; if ( bli_zero_dim2( m, b_n ) ) return; m_pre = 0; // If there is anything that would interfere with our use of aligned // vector loads/stores, call the reference implementation. if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx ) ) { use_ref = TRUE; } else if ( inca != 1 || incx != 1 || incy != 1 || bli_is_unaligned_to( lda*sizeof(double), 16 ) ) { use_ref = TRUE; } else if ( bli_is_unaligned_to( a, 16 ) || bli_is_unaligned_to( y, 16 ) ) { use_ref = TRUE; if ( bli_is_unaligned_to( a, 16 ) && bli_is_unaligned_to( y, 16 ) ) { use_ref = FALSE; m_pre = 1; } } // Call the reference implementation if needed. if ( use_ref == TRUE ) { BLIS_DAXPYF_KERNEL_REF( conja, conjx, m, b_n, alpha_cast, a_cast, inca, lda, x_cast, incx, y_cast, incy, cntx ); return; } m_run = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll ); m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll ); a0 = a_cast + 0*lda; a1 = a_cast + 1*lda; a2 = a_cast + 2*lda; a3 = a_cast + 3*lda; y0 = y_cast; chi0 = *(x_cast + 0*incx); chi1 = *(x_cast + 1*incx); chi2 = *(x_cast + 2*incx); chi3 = *(x_cast + 3*incx); PASTEMAC2(d,d,scals)( *alpha_cast, chi0 ); PASTEMAC2(d,d,scals)( *alpha_cast, chi1 ); PASTEMAC2(d,d,scals)( *alpha_cast, chi2 ); PASTEMAC2(d,d,scals)( *alpha_cast, chi3 ); if ( m_pre == 1 ) { a0c = *a0; a1c = *a1; a2c = *a2; a3c = *a3; *y0 += chi0 * a0c + chi1 * a1c + chi2 * a2c + chi3 * a3c; a0 += inca; a1 += inca; a2 += inca; a3 += inca; y0 += incy; } chi0v.v = _mm_loaddup_pd( ( double* )&chi0 ); chi1v.v = _mm_loaddup_pd( ( double* )&chi1 ); chi2v.v = _mm_loaddup_pd( ( double* )&chi2 ); chi3v.v = _mm_loaddup_pd( ( double* )&chi3 ); for ( i = 0; i < m_run; ++i ) { y0v.v = _mm_load_pd( ( double* )(y0 + 0*n_elem_per_reg) ); a00v.v = _mm_load_pd( ( double* )(a0 + 0*n_elem_per_reg) ); a01v.v = _mm_load_pd( ( double* )(a1 + 0*n_elem_per_reg) ); y0v.v += chi0v.v * a00v.v; y0v.v += chi1v.v * a01v.v; a02v.v = _mm_load_pd( ( double* )(a2 + 0*n_elem_per_reg) ); a03v.v = _mm_load_pd( ( double* )(a3 + 0*n_elem_per_reg) ); y0v.v += chi2v.v * a02v.v; y0v.v += chi3v.v * a03v.v; _mm_store_pd( ( double* )(y0 + 0*n_elem_per_reg), y0v.v ); y1v.v = _mm_load_pd( ( double* )(y0 + 1*n_elem_per_reg) ); a10v.v = _mm_load_pd( ( double* )(a0 + 1*n_elem_per_reg) ); a11v.v = _mm_load_pd( ( double* )(a1 + 1*n_elem_per_reg) ); y1v.v += chi0v.v * a10v.v; y1v.v += chi1v.v * a11v.v; a12v.v = _mm_load_pd( ( double* )(a2 + 1*n_elem_per_reg) ); a13v.v = _mm_load_pd( ( double* )(a3 + 1*n_elem_per_reg) ); y1v.v += chi2v.v * a12v.v; y1v.v += chi3v.v * a13v.v; _mm_store_pd( ( double* )(y0 + 1*n_elem_per_reg), y1v.v ); a0 += n_elem_per_reg * n_iter_unroll; a1 += n_elem_per_reg * n_iter_unroll; a2 += n_elem_per_reg * n_iter_unroll; a3 += n_elem_per_reg * n_iter_unroll; y0 += n_elem_per_reg * n_iter_unroll; } if ( m_left > 0 ) { for ( i = 0; i < 
m_left; ++i ) { a0c = *a0; a1c = *a1; a2c = *a2; a3c = *a3; *y0 += chi0 * a0c + chi1 * a1c + chi2 * a2c + chi3 * a3c; a0 += inca; a1 += inca; a2 += inca; a3 += inca; y0 += incy; } } }
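/* What the fused kernel above computes, written as plain loops (a reference sketch;
 * b_n is taken to be the fusing factor 4 that the vectorized path assumes):
 * y := y + alpha * ( chi0*a0 + chi1*a1 + chi2*a2 + chi3*a3 ), where a0..a3 are the
 * first four columns of a. */
static void daxpyf_ref(int m, double alpha, const double *a, int inca, int lda,
                       const double *x, int incx, double *y, int incy)
{
    for (int i = 0; i < m; ++i)
    {
        double t = 0.0;
        for (int j = 0; j < 4; ++j)
            t += a[i*inca + j*lda] * x[j*incx];
        y[i*incy] += alpha * t;
    }
}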
void nb_kernel430_x86_64_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr, int * shift, double * shiftvec, double * fshift, int * gid, double * pos, double * faction, double * charge, double * p_facel, double * p_krf, double * p_crf, double * vc, int * type, int * p_ntype, double * vdwparam, double * vvdw, double * p_tabscale, double * VFtab, double * invsqrta, double * dvda, double * p_gbtabscale, double * GBtab, int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, double * work) { int nri,ntype,nthreads; int n,ii,is3,ii3,k,nj0,nj1,ggid; double shX,shY,shZ; int offset,nti; int jnrA,jnrB; int j3A,j3B; int tjA,tjB; gmx_gbdata_t *gbdata; double * gpol; __m128d iq,qq,jq,isai; __m128d ix,iy,iz; __m128d jx,jy,jz; __m128d dx,dy,dz; __m128d vctot,vvdwtot,vgbtot,dvdasum,gbfactor; __m128d fix,fiy,fiz,tx,ty,tz,rsq; __m128d rinv,isaj,isaprod; __m128d vcoul,fscal,gbscale,c6,c12; __m128d rinvsq,r,rtab; __m128d eps,Y,F,G,H; __m128d VV,FF,Fp; __m128d vgb,fijGB,dvdatmp; __m128d rinvsix,vvdw6,vvdw12,vvdwtmp; __m128d facel,gbtabscale,dvdaj; __m128d fijD,fijR; __m128d xmm1,tabscale,eps2; __m128i n0, nnn; const __m128d neg = _mm_set1_pd(-1.0); const __m128d zero = _mm_set1_pd(0.0); const __m128d minushalf = _mm_set1_pd(-0.5); const __m128d two = _mm_set1_pd(2.0); gbdata = (gmx_gbdata_t *)work; gpol = gbdata->gpol; nri = *p_nri; ntype = *p_ntype; gbfactor = _mm_set1_pd( - ((1.0/gbdata->epsilon_r) - (1.0/gbdata->gb_epsilon_solvent))); gbtabscale = _mm_load1_pd(p_gbtabscale); facel = _mm_load1_pd(p_facel); tabscale = _mm_load1_pd(p_tabscale); nj1 = 0; jnrA = jnrB = 0; j3A = j3B = 0; jx = _mm_setzero_pd(); jy = _mm_setzero_pd(); jz = _mm_setzero_pd(); c6 = _mm_setzero_pd(); c12 = _mm_setzero_pd(); for(n=0;n<nri;n++) { is3 = 3*shift[n]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = jindex[n]; nj1 = jindex[n+1]; ii = iinr[n]; ii3 = 3*ii; ix = _mm_set1_pd(shX+pos[ii3+0]); iy = _mm_set1_pd(shY+pos[ii3+1]); iz = _mm_set1_pd(shZ+pos[ii3+2]); iq = _mm_load1_pd(charge+ii); iq = _mm_mul_pd(iq,facel); isai = _mm_load1_pd(invsqrta+ii); nti = 2*ntype*type[ii]; vctot = _mm_setzero_pd(); vvdwtot = _mm_setzero_pd(); vgbtot = _mm_setzero_pd(); dvdasum = _mm_setzero_pd(); fix = _mm_setzero_pd(); fiy = _mm_setzero_pd(); fiz = _mm_setzero_pd(); for(k=nj0;k<nj1-1; k+=2) { jnrA = jjnr[k]; jnrB = jjnr[k+1]; j3A = jnrA * 3; j3B = jnrB * 3; GMX_MM_LOAD_1RVEC_2POINTERS_PD(pos+j3A,pos+j3B,jx,jy,jz); dx = _mm_sub_pd(ix,jx); dy = _mm_sub_pd(iy,jy); dz = _mm_sub_pd(iz,jz); rsq = gmx_mm_calc_rsq_pd(dx,dy,dz); rinv = gmx_mm_invsqrt_pd(rsq); rinvsq = _mm_mul_pd(rinv,rinv); /***********************************/ /* INTERACTION SECTION STARTS HERE */ /***********************************/ GMX_MM_LOAD_2VALUES_PD(charge+jnrA,charge+jnrB,jq); GMX_MM_LOAD_2VALUES_PD(invsqrta+jnrA,invsqrta+jnrB,isaj); /* Lennard-Jones */ tjA = nti+2*type[jnrA]; tjB = nti+2*type[jnrB]; GMX_MM_LOAD_2PAIRS_PD(vdwparam+tjA,vdwparam+tjB,c6,c12); isaprod = _mm_mul_pd(isai,isaj); qq = _mm_mul_pd(iq,jq); vcoul = _mm_mul_pd(qq,rinv); fscal = _mm_mul_pd(vcoul,rinv); vctot = _mm_add_pd(vctot,vcoul); /* Polarization interaction */ qq = _mm_mul_pd(qq,_mm_mul_pd(isaprod,gbfactor)); gbscale = _mm_mul_pd(isaprod,gbtabscale); /* Calculate GB table index */ r = _mm_mul_pd(rsq,rinv); rtab = _mm_mul_pd(r,gbscale); n0 = _mm_cvttpd_epi32(rtab); eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0)); nnn = _mm_slli_epi32(n0,2); /* the tables are 16-byte aligned, so we can use _mm_load_pd */ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); F = 
_mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))); GMX_MM_TRANSPOSE2_PD(Y,F); G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); H = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,1))+2); GMX_MM_TRANSPOSE2_PD(G,H); G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H, _mm_mul_pd(eps,eps) ); F = _mm_add_pd(F, _mm_add_pd( G , H ) ); Y = _mm_add_pd(Y, _mm_mul_pd(F, eps)); F = _mm_add_pd(F, _mm_add_pd(G , _mm_mul_pd(H,two))); vgb = _mm_mul_pd(Y, qq); fijGB = _mm_mul_pd(F, _mm_mul_pd(qq,gbscale)); dvdatmp = _mm_mul_pd(_mm_add_pd(vgb, _mm_mul_pd(fijGB,r)) , minushalf); vgbtot = _mm_add_pd(vgbtot, vgb); dvdasum = _mm_add_pd(dvdasum, dvdatmp); dvdatmp = _mm_mul_pd(dvdatmp, _mm_mul_pd(isaj,isaj)); GMX_MM_INCREMENT_2VALUES_PD(dvda+jnrA,dvda+jnrB,dvdatmp); /* Calculate VDW table index */ rtab = _mm_mul_pd(r,tabscale); n0 = _mm_cvttpd_epi32(rtab); eps = _mm_sub_pd(rtab,_mm_cvtepi32_pd(n0)); eps2 = _mm_mul_pd(eps,eps); nnn = _mm_slli_epi32(n0,3); /* Dispersion */ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))); GMX_MM_TRANSPOSE2_PD(Y,F); G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+2); GMX_MM_TRANSPOSE2_PD(G,H); G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); xmm1 = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,xmm1); vvdw6 = _mm_mul_pd(c6,VV); fijD = _mm_mul_pd(c6,FF); /* Repulsion */ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); F = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+4); GMX_MM_TRANSPOSE2_PD(Y,F); G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); H = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,1))+6); GMX_MM_TRANSPOSE2_PD(G,H); G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); xmm1 = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,xmm1); vvdw12 = _mm_mul_pd(c12,VV); fijR = _mm_mul_pd(c12,FF); vvdwtmp = _mm_add_pd(vvdw12,vvdw6); vvdwtot = _mm_add_pd(vvdwtot,vvdwtmp); xmm1 = _mm_add_pd(fijD,fijR); xmm1 = _mm_mul_pd(xmm1,tabscale); xmm1 = _mm_add_pd(xmm1,fijGB); xmm1 = _mm_sub_pd(xmm1,fscal); fscal = _mm_mul_pd(xmm1,neg); fscal = _mm_mul_pd(fscal,rinv); /***********************************/ /* INTERACTION SECTION ENDS HERE */ /***********************************/ /* Calculate temporary vectorial force */ tx = _mm_mul_pd(fscal,dx); ty = _mm_mul_pd(fscal,dy); tz = _mm_mul_pd(fscal,dz); /* Increment i atom force */ fix = _mm_add_pd(fix,tx); fiy = _mm_add_pd(fiy,ty); fiz = _mm_add_pd(fiz,tz); /* Store j forces back */ GMX_MM_DECREMENT_1RVEC_2POINTERS_PD(faction+j3A,faction+j3B,tx,ty,tz); } /* In double precision, offset can only be either 0 or 1 */ if(k<nj1) { jnrA = jjnr[k]; j3A = jnrA * 3; GMX_MM_LOAD_1RVEC_1POINTER_PD(pos+j3A,jx,jy,jz); dx = _mm_sub_sd(ix,jx); dy = _mm_sub_sd(iy,jy); dz = _mm_sub_sd(iz,jz); rsq = gmx_mm_calc_rsq_pd(dx,dy,dz); rinv = gmx_mm_invsqrt_pd(rsq); rinvsq = _mm_mul_sd(rinv,rinv); /***********************************/ /* INTERACTION SECTION STARTS HERE */ /***********************************/ GMX_MM_LOAD_1VALUE_PD(charge+jnrA,jq); GMX_MM_LOAD_1VALUE_PD(invsqrta+jnrA,isaj); /* Lennard-Jones */ tjA = nti+2*type[jnrA]; GMX_MM_LOAD_1PAIR_PD(vdwparam+tjA,c6,c12); isaprod = _mm_mul_sd(isai,isaj); qq = _mm_mul_sd(iq,jq); vcoul = _mm_mul_sd(qq,rinv); fscal = _mm_mul_sd(vcoul,rinv); vctot = _mm_add_sd(vctot,vcoul); /* Polarization interaction */
qq = _mm_mul_sd(qq,_mm_mul_sd(isaprod,gbfactor)); gbscale = _mm_mul_sd(isaprod,gbtabscale); /* Calculate GB table index */ r = _mm_mul_sd(rsq,rinv); rtab = _mm_mul_sd(r,gbscale); n0 = _mm_cvttpd_epi32(rtab); eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0)); nnn = _mm_slli_epi32(n0,2); /* the tables are 16-byte aligned, so we can use _mm_load_pd */ Y = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))); F = _mm_setzero_pd(); GMX_MM_TRANSPOSE2_PD(Y,F); G = _mm_load_pd(GBtab+(gmx_mm_extract_epi32(nnn,0))+2); H = _mm_setzero_pd(); GMX_MM_TRANSPOSE2_PD(G,H); G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H, _mm_mul_sd(eps,eps) ); F = _mm_add_sd(F, _mm_add_sd( G , H ) ); Y = _mm_add_sd(Y, _mm_mul_sd(F, eps)); F = _mm_add_sd(F, _mm_add_sd(G , _mm_mul_sd(H,two))); vgb = _mm_mul_sd(Y, qq); fijGB = _mm_mul_sd(F, _mm_mul_sd(qq,gbscale)); dvdatmp = _mm_mul_sd(_mm_add_sd(vgb, _mm_mul_sd(fijGB,r)) , minushalf); vgbtot = _mm_add_sd(vgbtot, vgb); dvdasum = _mm_add_sd(dvdasum, dvdatmp); dvdatmp = _mm_mul_sd(dvdatmp, _mm_mul_sd(isaj,isaj)); GMX_MM_INCREMENT_1VALUE_PD(dvda+jnrA,dvdatmp); /* Calculate VDW table index */ rtab = _mm_mul_sd(r,tabscale); n0 = _mm_cvttpd_epi32(rtab); eps = _mm_sub_sd(rtab,_mm_cvtepi32_pd(n0)); eps2 = _mm_mul_sd(eps,eps); nnn = _mm_slli_epi32(n0,3); /* Dispersion */ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))); F = _mm_setzero_pd(); GMX_MM_TRANSPOSE2_PD(Y,F); G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+2); H = _mm_setzero_pd(); GMX_MM_TRANSPOSE2_PD(G,H); G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); xmm1 = _mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,xmm1); vvdw6 = _mm_mul_sd(c6,VV); fijD = _mm_mul_sd(c6,FF); /* Repulsion */ Y = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+4); F = _mm_setzero_pd(); GMX_MM_TRANSPOSE2_PD(Y,F); G = _mm_load_pd(VFtab+(gmx_mm_extract_epi32(nnn,0))+6); H = _mm_setzero_pd(); GMX_MM_TRANSPOSE2_PD(G,H); G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); xmm1 = _mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,xmm1); vvdw12 = _mm_mul_sd(c12,VV); fijR = _mm_mul_sd(c12,FF); vvdwtmp = _mm_add_sd(vvdw12,vvdw6); vvdwtot = _mm_add_sd(vvdwtot,vvdwtmp); xmm1 = _mm_add_sd(fijD,fijR); xmm1 = _mm_mul_sd(xmm1,tabscale); xmm1 = _mm_add_sd(xmm1,fijGB); xmm1 = _mm_sub_sd(xmm1,fscal); fscal = _mm_mul_sd(xmm1,neg); fscal = _mm_mul_sd(fscal,rinv); /***********************************/ /* INTERACTION SECTION ENDS HERE */ /***********************************/ /* Calculate temporary vectorial force */ tx = _mm_mul_sd(fscal,dx); ty = _mm_mul_sd(fscal,dy); tz = _mm_mul_sd(fscal,dz); /* Increment i atom force */ fix = _mm_add_sd(fix,tx); fiy = _mm_add_sd(fiy,ty); fiz = _mm_add_sd(fiz,tz); /* Store j forces back */ GMX_MM_DECREMENT_1RVEC_1POINTER_PD(faction+j3A,tx,ty,tz); } dvdasum = _mm_mul_pd(dvdasum, _mm_mul_pd(isai,isai)); gmx_mm_update_iforce_1atom_pd(&fix,&fiy,&fiz,faction+ii3,fshift+is3); ggid = gid[n]; gmx_mm_update_2pot_pd(vctot,vc+ggid,vvdwtot,vvdw+ggid); gmx_mm_update_2pot_pd(vgbtot,gpol+ggid,dvdasum,dvda+ii); } *outeriter = nri; *inneriter = nj1; }
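/* Summary of the force assembly at the end of both loops above: with the bare
 * Coulomb term fscal0 = vcoul*rinv, the kernel forms
 *
 *   fscal = -( (fijD + fijR)*tabscale + fijGB - fscal0 ) * rinv
 *
 * fijD and fijR are derivatives with respect to the table coordinate and therefore
 * still need the table scale, fijGB was already multiplied by gbscale, and the final
 * rinv turns the radial derivative into the prefactor applied to (dx,dy,dz). */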
void do_matrix_mult(int lda, double* A, double* B, double* C)
{
	/* C and A are 4x4 blocks stored row-major with row stride lda; B is a 4x4
	 * block with rows lda apart as well. Computes C += A*B. */
	__m128d c1 = _mm_load_pd(C);          //loading C[0,0] and C[0,1]
	__m128d c2 = _mm_load_pd(C+2);        //likewise, we are loading C[0,2] and C[0,3] here
	__m128d c3 = _mm_load_pd(C+lda);      //likewise, we are loading C[1,0] and C[1,1] here
	__m128d c4 = _mm_load_pd(C+lda+2);    //likewise, we are loading C[1,2] and C[1,3] here
	__m128d c5 = _mm_load_pd(C+2*lda);    //likewise, we are loading C[2,0] and C[2,1] here
	__m128d c6 = _mm_load_pd(C+2*lda+2);  //likewise, we are loading C[2,2] and C[2,3] here
	__m128d c7 = _mm_load_pd(C+3*lda);    //likewise, we are loading C[3,0] and C[3,1] here
	__m128d c8 = _mm_load_pd(C+3*lda+2);  //likewise, we are loading C[3,2] and C[3,3] here
	for(int l=0; l<4; l+=1)
	{
		__m128d a1 = _mm_load1_pd(A+l);        //broadcast A[0,l]
		__m128d a2 = _mm_load1_pd(A+l+lda);    //broadcast A[1,l]
		__m128d a3 = _mm_load1_pd(A+l+2*lda);  //broadcast A[2,l]
		__m128d a4 = _mm_load1_pd(A+l+3*lda);  //broadcast A[3,l]
		__m128d b1 = _mm_load_pd(B+l*lda);     //row l of B, columns 0..1
		__m128d b2 = _mm_load_pd(B+l*lda+2);   //row l of B, columns 2..3
		c1 = _mm_add_pd(c1, _mm_mul_pd(a1, b1));
		c2 = _mm_add_pd(c2, _mm_mul_pd(a1, b2));
		c3 = _mm_add_pd(c3, _mm_mul_pd(a2, b1));
		c4 = _mm_add_pd(c4, _mm_mul_pd(a2, b2));
		c5 = _mm_add_pd(c5, _mm_mul_pd(a3, b1));
		c6 = _mm_add_pd(c6, _mm_mul_pd(a3, b2));
		c7 = _mm_add_pd(c7, _mm_mul_pd(a4, b1));
		c8 = _mm_add_pd(c8, _mm_mul_pd(a4, b2));
	}
	_mm_store_pd(C, c1);
	_mm_store_pd(C+2, c2);
	_mm_store_pd(C+lda, c3);
	_mm_store_pd(C+2+lda, c4);
	_mm_store_pd(C+2*lda, c5);
	_mm_store_pd(C+2+2*lda, c6);
	_mm_store_pd(C+3*lda, c7);
	_mm_store_pd(C+2+3*lda, c8);
}
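/* Usage sketch (hypothetical driver, not from the original source): tiling a full
 * n x n product (n a multiple of 4, lda = n) out of do_matrix_mult calls could look
 * like this; note _mm_load_pd/_mm_store_pd require each block pointer to stay
 * 16-byte aligned, so the base arrays must be 16-byte aligned and lda even. */
void square_dgemm_ref(int lda, double *A, double *B, double *C)
{
    for (int i = 0; i < lda; i += 4)
        for (int j = 0; j < lda; j += 4)
            for (int k = 0; k < lda; k += 4)
                do_matrix_mult(lda, A + i*lda + k, B + k*lda + j, C + i*lda + j);
}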
// add the n elements of *s to *p, element by element
COREARRAY_DLL_DEFAULT void vec_f64_add(double *p, const double *s, size_t n)
{
#if defined(COREARRAY_SIMD_AVX)
	// peel leading elements until p is 32-byte aligned, then use aligned stores
	switch ((size_t)p & 0x1F)
	{
	case 0x08:
		if (n > 0) { (*p++) += (*s++); n--; }
		// fall through
	case 0x10:
		if (n > 0) { (*p++) += (*s++); n--; }
		// fall through
	case 0x18:
		if (n > 0) { (*p++) += (*s++); n--; }
		// fall through
	case 0x00:
		for (; n >= 4; n-=4)
		{
			_mm256_store_pd(p, _mm256_add_pd(_mm256_load_pd(p), _mm256_loadu_pd(s)));
			p += 4; s += 4;
		}
		if (n >= 2)
		{
			_mm_store_pd(p, _mm_add_pd(_mm_load_pd(p), _mm_loadu_pd(s)));
			p += 2; s += 2; n -= 2;
		}
		break;
	default:
		// p is not even 8-byte aligned: use unaligned loads/stores throughout
		for (; n >= 4; n-=4)
		{
			_mm256_storeu_pd(p, _mm256_add_pd(_mm256_loadu_pd(p), _mm256_loadu_pd(s)));
			p += 4; s += 4;
		}
		if (n >= 2)
		{
			_mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p), _mm_loadu_pd(s)));
			p += 2; s += 2; n -= 2;
		}
	}
#elif defined(COREARRAY_SIMD_SSE2)
	// peel one leading element if needed so that p is 16-byte aligned
	switch ((size_t)p & 0x0F)
	{
	case 0x08:
		if (n > 0) { (*p++) += (*s++); n--; }
		// fall through
	case 0x00:
		for (; n >= 2; n-=2)
		{
			_mm_store_pd(p, _mm_add_pd(_mm_load_pd(p), _mm_loadu_pd(s)));
			p += 2; s += 2;
		}
		break;
	default:
		for (; n >= 2; n-=2)
		{
			_mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p), _mm_loadu_pd(s)));
			p += 2; s += 2;
		}
	}
#endif
	// scalar tail (and the whole job when no SIMD is enabled)
	for (; n > 0; n--) (*p++) += (*s++);
}
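/* The switches above are an alignment prologue: they peel at most three (AVX) or one
 * (SSE2) leading element so that p lands on a 32- or 16-byte boundary, after which
 * the aligned-store fast path is valid; only p's alignment matters since s is always
 * read with unaligned loads. The whole routine is equivalent to this scalar
 * reference: */
#include <stddef.h>
void vec_f64_add_ref(double *p, const double *s, size_t n)
{
    for (size_t i = 0; i < n; i++)
        p[i] += s[i];
}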