__m128d test_mm_min_sd(__m128d A, __m128d B) { // DAG-LABEL: test_mm_min_sd // DAG: call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}) // // ASM-LABEL: test_mm_min_sd // ASM: minsd return _mm_min_sd(A, B); }
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator) { DBL x, y, z; DBL *mp; int ix, iy, iz; int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash; DBL sum; // TODO FIXME - global statistics reference // Stats[Calls_To_Noise]++; if (noise_generator==kNoiseGen_Perlin) { // The 1.59 and 0.985 are to correct for some biasing problems with // the random # generator used to create the noise tables. Final // range of values is about 5.0e-4 below 0.0 and above 1.0. Mean // value is 0.49 (ideally it would be 0.5). sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985); // Clamp final value to 0-1 range if (sum < 0.0) sum = 0.0; if (sum > 1.0) sum = 1.0; return sum; } x = EPoint[X]; y = EPoint[Y]; z = EPoint[Z]; /* its equivalent integer lattice point. */ /* ix = (int)x; iy = (int)y; iz = (long)z; */ /* JB fix for the range problem */ __m128d xy = _mm_setr_pd(x, y); __m128d zn = _mm_set_sd(z); __m128d epsy = _mm_set1_pd(1.0 - EPSILON); __m128d xy_e = _mm_sub_pd(xy, epsy); __m128d zn_e = _mm_sub_sd(zn, epsy); __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy)); __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn)); __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0); __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ); __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy)); __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn)); const __m128i fff = _mm_set1_epi32(0xfff); __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff); __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff); ix = _mm_extract_epi32(i_xy, 0); iy = _mm_extract_epi32(i_xy, 1); iz = _mm_extract_epi32(i_zn, 0); ixiy_hash = Hash2d(ix, iy); jxiy_hash = Hash2d(ix + 1, iy); ixjy_hash = Hash2d(ix, iy + 1); jxjy_hash = Hash2d(ix + 1, iy + 1); mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)]; DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)]; DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)]; DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)]; DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)]; DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)]; DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)]; DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)]; const __m128d three = _mm_set1_pd(3.0); const __m128d two = _mm_set1_pd(2.0); const __m128d one = _mm_set1_pd(1.0); __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy); __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy); __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn); __m128d jx_mm = _mm_sub_pd(ix_mm, one); __m128d jy_mm = _mm_sub_pd(iy_mm, one); __m128d jz_mm = _mm_sub_pd(iz_mm, one); __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three)); __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three)); __m128d mm_tz = _mm_sub_pd(one, mm_sz); __m128d mm_txy = _mm_sub_pd(one, mm_sxy); __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy); __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy); __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy); __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm); __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm; __m128d int_sum1 = _mm_setzero_pd(); s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz); INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1); s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz); INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1); s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz); INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1); s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz); INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1); int_sum1 = _mm_hadd_pd(int_sum1, int_sum1); if(noise_generator==kNoiseGen_RangeCorrected) { /* details of range here: Min, max: -1.05242, 0.988997 Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828 We want to change it to as close to [0,1] as possible. */ const __m128d r2 = _mm_set_sd(0.48985582); const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582); int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2); } else { int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5)); } int_sum1 = _mm_min_sd(one, int_sum1); int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1); _mm_store_sd(&sum, int_sum1); return (sum); }
test (__m128d s1, __m128d s2) { return _mm_min_sd (s1, s2); }
int calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top, const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born) { int i,k,n,ii,is3,ii3,nj0,nj1,offset; int jnrA,jnrB,j3A,j3B; int *mdtype; double shX,shY,shZ; int *jjnr; double *shiftvec; double gpi_ai,gpi2; double factor; double *gb_radius; double *vsolv; double *work; double *dadx; __m128d ix,iy,iz; __m128d jx,jy,jz; __m128d dx,dy,dz; __m128d tx,ty,tz; __m128d rsq,rinv,rinv2,rinv4,rinv6; __m128d ratio,gpi,rai,raj,vai,vaj,rvdw; __m128d ccf,dccf,theta,cosq,term,sinq,res,prod,prod_ai,tmp; __m128d mask,icf4,icf6,mask_cmp; const __m128d half = _mm_set1_pd(0.5); const __m128d three = _mm_set1_pd(3.0); const __m128d one = _mm_set1_pd(1.0); const __m128d two = _mm_set1_pd(2.0); const __m128d zero = _mm_set1_pd(0.0); const __m128d four = _mm_set1_pd(4.0); const __m128d still_p5inv = _mm_set1_pd(STILL_P5INV); const __m128d still_pip5 = _mm_set1_pd(STILL_PIP5); const __m128d still_p4 = _mm_set1_pd(STILL_P4); factor = 0.5 * ONE_4PI_EPS0; gb_radius = born->gb_radius; vsolv = born->vsolv; work = born->gpol_still_work; jjnr = nl->jjnr; shiftvec = fr->shift_vec[0]; dadx = fr->dadx; jnrA = jnrB = 0; jx = _mm_setzero_pd(); jy = _mm_setzero_pd(); jz = _mm_setzero_pd(); n = 0; for(i=0;i<natoms;i++) { work[i]=0; } for(i=0;i<nl->nri;i++) { ii = nl->iinr[i]; ii3 = ii*3; is3 = 3*nl->shift[i]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = nl->jindex[i]; nj1 = nl->jindex[i+1]; ix = _mm_set1_pd(shX+x[ii3+0]); iy = _mm_set1_pd(shY+x[ii3+1]); iz = _mm_set1_pd(shZ+x[ii3+2]); /* Polarization energy for atom ai */ gpi = _mm_setzero_pd(); rai = _mm_load1_pd(gb_radius+ii); prod_ai = _mm_set1_pd(STILL_P4*vsolv[ii]); for(k=nj0;k<nj1-1;k+=2) { jnrA = jjnr[k]; jnrB = jjnr[k+1]; j3A = 3*jnrA; j3B = 3*jnrB; GMX_MM_LOAD_1RVEC_2POINTERS_PD(x+j3A,x+j3B,jx,jy,jz); GMX_MM_LOAD_2VALUES_PD(gb_radius+jnrA,gb_radius+jnrB,raj); GMX_MM_LOAD_2VALUES_PD(vsolv+jnrA,vsolv+jnrB,vaj); dx = _mm_sub_pd(ix,jx); dy = _mm_sub_pd(iy,jy); dz = _mm_sub_pd(iz,jz); rsq = gmx_mm_calc_rsq_pd(dx,dy,dz); rinv = gmx_mm_invsqrt_pd(rsq); rinv2 = _mm_mul_pd(rinv,rinv); rinv4 = _mm_mul_pd(rinv2,rinv2); rinv6 = _mm_mul_pd(rinv4,rinv2); rvdw = _mm_add_pd(rai,raj); ratio = _mm_mul_pd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw))); mask_cmp = _mm_cmple_pd(ratio,still_p5inv); /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */ if( 0 == _mm_movemask_pd(mask_cmp) ) { /* if ratio>still_p5inv for ALL elements */ ccf = one; dccf = _mm_setzero_pd(); } else { ratio = _mm_min_pd(ratio,still_p5inv); theta = _mm_mul_pd(ratio,still_pip5); gmx_mm_sincos_pd(theta,&sinq,&cosq); term = _mm_mul_pd(half,_mm_sub_pd(one,cosq)); ccf = _mm_mul_pd(term,term); dccf = _mm_mul_pd(_mm_mul_pd(two,term), _mm_mul_pd(sinq,theta)); } prod = _mm_mul_pd(still_p4,vaj); icf4 = _mm_mul_pd(ccf,rinv4); icf6 = _mm_mul_pd( _mm_sub_pd( _mm_mul_pd(four,ccf),dccf), rinv6); GMX_MM_INCREMENT_2VALUES_PD(work+jnrA,work+jnrB,_mm_mul_pd(prod_ai,icf4)); gpi = _mm_add_pd(gpi, _mm_mul_pd(prod,icf4) ); _mm_store_pd(dadx,_mm_mul_pd(prod,icf6)); dadx+=2; _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6)); dadx+=2; } if(k<nj1) { jnrA = jjnr[k]; j3A = 3*jnrA; GMX_MM_LOAD_1RVEC_1POINTER_PD(x+j3A,jx,jy,jz); GMX_MM_LOAD_1VALUE_PD(gb_radius+jnrA,raj); GMX_MM_LOAD_1VALUE_PD(vsolv+jnrA,vaj); dx = _mm_sub_sd(ix,jx); dy = _mm_sub_sd(iy,jy); dz = _mm_sub_sd(iz,jz); rsq = gmx_mm_calc_rsq_pd(dx,dy,dz); rinv = gmx_mm_invsqrt_pd(rsq); rinv2 = _mm_mul_sd(rinv,rinv); rinv4 = _mm_mul_sd(rinv2,rinv2); rinv6 = _mm_mul_sd(rinv4,rinv2); rvdw = _mm_add_sd(rai,raj); ratio = _mm_mul_sd(rsq, gmx_mm_inv_pd( _mm_mul_pd(rvdw,rvdw))); mask_cmp = _mm_cmple_sd(ratio,still_p5inv); /* gmx_mm_sincos_pd() is quite expensive, so avoid calculating it if we can! */ if( 0 == _mm_movemask_pd(mask_cmp) ) { /* if ratio>still_p5inv for ALL elements */ ccf = one; dccf = _mm_setzero_pd(); } else { ratio = _mm_min_sd(ratio,still_p5inv); theta = _mm_mul_sd(ratio,still_pip5); gmx_mm_sincos_pd(theta,&sinq,&cosq); term = _mm_mul_sd(half,_mm_sub_sd(one,cosq)); ccf = _mm_mul_sd(term,term); dccf = _mm_mul_sd(_mm_mul_sd(two,term), _mm_mul_sd(sinq,theta)); } prod = _mm_mul_sd(still_p4,vaj); icf4 = _mm_mul_sd(ccf,rinv4); icf6 = _mm_mul_sd( _mm_sub_sd( _mm_mul_sd(four,ccf),dccf), rinv6); GMX_MM_INCREMENT_1VALUE_PD(work+jnrA,_mm_mul_sd(prod_ai,icf4)); gpi = _mm_add_sd(gpi, _mm_mul_sd(prod,icf4) ); _mm_store_pd(dadx,_mm_mul_pd(prod,icf6)); dadx+=2; _mm_store_pd(dadx,_mm_mul_pd(prod_ai,icf6)); dadx+=2; } gmx_mm_update_1pot_pd(gpi,work+ii); } /* Sum up the polarization energy from other nodes */ if(PARTDECOMP(cr)) { gmx_sum(natoms, work, cr); } else if(DOMAINDECOMP(cr)) { dd_atom_sum_real(cr->dd, work); } /* Compute the radii */ for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */ { if(born->use[i] != 0) { gpi_ai = born->gpol[i] + work[i]; /* add gpi to the initial pol energy gpi_ai*/ gpi2 = gpi_ai * gpi_ai; born->bRad[i] = factor*gmx_invsqrt(gpi2); fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]); } } /* Extra (local) communication required for DD */ if(DOMAINDECOMP(cr)) { dd_atom_spread_real(cr->dd, born->bRad); dd_atom_spread_real(cr->dd, fr->invsqrta); } return 0; }