real calc_ewaldcoeff(real rc, real dtol) { real x = 5, low, high; int n, i = 0; do { i++; x *= 2; } while (gmx_erfc(x*rc) > dtol); n = i+60; /* search tolerance is 2^-60 */ low = 0; high = x; for (i = 0; i < n; i++) { x = (low+high)/2; if (gmx_erfc(x*rc) > dtol) { low = x; } else { high = x; } } return x; }
real calc_ewaldcoeff_q(real rc, real rtol) { real beta = 5, low, high; int n, i = 0; do { i++; beta *= 2; } while (gmx_erfc(beta*rc) > rtol); /* Do a binary search with tolerance 2^-60 */ n = i+60; low = 0; high = beta; for (i = 0; i < n; i++) { beta = (low+high)/2; if (gmx_erfc(beta*rc) > rtol) { low = beta; } else { high = beta; } } return beta; }
/* The scale (1/spacing) for third order spline interpolation * of the Ewald mesh contribution which needs to be subtracted * from the non-bonded interactions. */ real ewald_spline3_table_scale(real ewaldcoeff,real rc) { double erf_x_d3=1.0522; /* max of (erf(x)/x)''' */ double ftol,etol; double sc_f,sc_e; /* Force tolerance: single precision accuracy */ ftol = GMX_FLOAT_EPS; sc_f = sqrt(erf_x_d3/(6*4*ftol*ewaldcoeff))*ewaldcoeff; /* Energy tolerance: 10x more accurate than the cut-off jump */ etol = 0.1*gmx_erfc(ewaldcoeff*rc); etol = max(etol,GMX_REAL_EPS); sc_e = pow(erf_x_d3/(6*12*sqrt(3)*etol),1.0/3.0)*ewaldcoeff; return max(sc_f,sc_e); }
static void approx_2dof(real s2, real x, real *shift, real *scale) { /* A particle with 1 DOF constrained has 2 DOFs instead of 3. * This code is also used for particles with multiple constraints, * in which case we overestimate the displacement. * The 2DOF distribution is sqrt(pi/2)*erfc(r/(sqrt(2)*s))/(2*s). * We approximate this with scale*Gaussian(s,r+shift), * by matching the distribution value and derivative at x. * This is a tight overestimate for all r>=0 at any s and x. */ real ex, er; ex = exp(-x*x/(2*s2)); er = gmx_erfc(x/sqrt(2*s2)); *shift = -x + sqrt(2*s2/M_PI)*ex/er; *scale = 0.5*M_PI*exp(ex*ex/(M_PI*er*er))*er; }
gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d) { int kx; real mk; for (kx = start; kx < end; kx++) { d[kx] = 1.0/d[kx]; } for (kx = start; kx < end; kx++) { r[kx] = exp(r[kx]); } for (kx = start; kx < end; kx++) { mk = tmp2[kx]; tmp2[kx] = sqrt(M_PI)*mk*gmx_erfc(mk); } }
/* Estimate the error of the SPME Ewald sum. This estimate is based upon * a) a homogeneous distribution of the charges * b) a total charge of zero. */ static void estimate_PME_error(t_inputinfo *info, t_state *state, gmx_mtop_t *mtop, FILE *fp_out, gmx_bool bVerbose, unsigned int seed, t_commrec *cr) { rvec *x=NULL; /* The coordinates */ real *q=NULL; /* The charges */ real edir=0.0; /* real space error */ real erec=0.0; /* reciprocal space error */ real derr=0.0; /* difference of real and reciprocal space error */ real derr0=0.0; /* difference of real and reciprocal space error */ real beta=0.0; /* splitting parameter beta */ real beta0=0.0; /* splitting parameter beta */ int ncharges; /* The number of atoms with charges */ int nsamples; /* The number of samples used for the calculation of the * self-energy error term */ int i=0; if (MASTER(cr)) fprintf(fp_out, "\n--- PME ERROR ESTIMATE ---\n"); /* Prepare an x and q array with only the charged atoms */ ncharges = prepare_x_q(&q, &x, mtop, state->x, cr); if (MASTER(cr)) { calc_q2all(mtop, &(info->q2all), &(info->q2allnr)); info->ewald_rtol[0]=gmx_erfc(info->rcoulomb[0]*info->ewald_beta[0]); /* Write some info to log file */ fprintf(fp_out, "Box volume : %g nm^3\n", info->volume); fprintf(fp_out, "Number of charged atoms : %d (total atoms %d)\n",ncharges, info->natoms); fprintf(fp_out, "Coulomb radius : %g nm\n", info->rcoulomb[0]); fprintf(fp_out, "Ewald_rtol : %g\n", info->ewald_rtol[0]); fprintf(fp_out, "Ewald parameter beta : %g\n", info->ewald_beta[0]); fprintf(fp_out, "Interpolation order : %d\n", info->pme_order[0]); fprintf(fp_out, "Fourier grid (nx,ny,nz) : %d x %d x %d\n", info->nkx[0],info->nky[0],info->nkz[0]); fflush(fp_out); } if (PAR(cr)) bcast_info(info, cr); /* Calculate direct space error */ info->e_dir[0] = estimate_direct(info); /* Calculate reciprocal space error */ info->e_rec[0] = estimate_reciprocal(info, x, q, ncharges, fp_out, bVerbose, seed, &nsamples, cr); if (PAR(cr)) bcast_info(info, cr); if (MASTER(cr)) { fprintf(fp_out, "Direct space error est. : %10.3e kJ/(mol*nm)\n", info->e_dir[0]); fprintf(fp_out, "Reciprocal sp. err. est.: %10.3e kJ/(mol*nm)\n", info->e_rec[0]); fprintf(fp_out, "Self-energy error term was estimated using %d samples\n", nsamples); fflush(fp_out); fprintf(stderr, "Direct space error est. : %10.3e kJ/(mol*nm)\n", info->e_dir[0]); fprintf(stderr, "Reciprocal sp. err. est.: %10.3e kJ/(mol*nm)\n", info->e_rec[0]); } i=0; if (info->bTUNE) { if(MASTER(cr)) fprintf(stderr,"Starting tuning ...\n"); edir=info->e_dir[0]; erec=info->e_rec[0]; derr0=edir-erec; beta0=info->ewald_beta[0]; if (derr>0.0) info->ewald_beta[0]+=0.1; else info->ewald_beta[0]-=0.1; info->e_dir[0] = estimate_direct(info); info->e_rec[0] = estimate_reciprocal(info, x, q, ncharges, fp_out, bVerbose, seed, &nsamples, cr); if (PAR(cr)) bcast_info(info, cr); edir=info->e_dir[0]; erec=info->e_rec[0]; derr=edir-erec; while ( fabs(derr/min(erec,edir)) > 1e-4) { beta=info->ewald_beta[0]; beta-=derr*(info->ewald_beta[0]-beta0)/(derr-derr0); beta0=info->ewald_beta[0]; info->ewald_beta[0]=beta; derr0=derr; info->e_dir[0] = estimate_direct(info); info->e_rec[0] = estimate_reciprocal(info, x, q, ncharges, fp_out, bVerbose, seed, &nsamples, cr); if (PAR(cr)) bcast_info(info, cr); edir=info->e_dir[0]; erec=info->e_rec[0]; derr=edir-erec; if (MASTER(cr)) { i++; fprintf(stderr,"difference between real and rec. space error (step %d): %g\n",i,fabs(derr)); fprintf(stderr,"old beta: %f\n",beta0); fprintf(stderr,"new beta: %f\n",beta); } } info->ewald_rtol[0]=gmx_erfc(info->rcoulomb[0]*info->ewald_beta[0]); if (MASTER(cr)) { /* Write some info to log file */ fflush(fp_out); fprintf(fp_out, "========= After tuning ========\n"); fprintf(fp_out, "Direct space error est. : %10.3e kJ/(mol*nm)\n", info->e_dir[0]); fprintf(fp_out, "Reciprocal sp. err. est.: %10.3e kJ/(mol*nm)\n", info->e_rec[0]); fprintf(stderr, "Direct space error est. : %10.3e kJ/(mol*nm)\n", info->e_dir[0]); fprintf(stderr, "Reciprocal sp. err. est.: %10.3e kJ/(mol*nm)\n", info->e_rec[0]); fprintf(fp_out, "Ewald_rtol : %g\n", info->ewald_rtol[0]); fprintf(fp_out, "Ewald parameter beta : %g\n", info->ewald_beta[0]); fflush(fp_out); } } }
void calc_verlet_buffer_size(const gmx_mtop_t *mtop, real boxvol, const t_inputrec *ir, real reference_temperature, const verletbuf_list_setup_t *list_setup, int *n_nonlin_vsite, real *rlist) { double resolution; char *env; real particle_distance; real nb_clust_frac_pairs_not_in_list_at_cutoff; verletbuf_atomtype_t *att = NULL; int natt = -1, i; double reppow; real md1_ljd, d2_ljd, md3_ljd; real md1_ljr, d2_ljr, md3_ljr; real md1_el, d2_el; real elfac; real kT_fac, mass_min; int ib0, ib1, ib; real rb, rl; real drift; if (reference_temperature < 0) { if (EI_MD(ir->eI) && ir->etc == etcNO) { /* This case should be handled outside calc_verlet_buffer_size */ gmx_incons("calc_verlet_buffer_size called with an NVE ensemble and reference_temperature < 0"); } /* We use the maximum temperature with multiple T-coupl groups. * We could use a per particle temperature, but since particles * interact, this might underestimate the buffer size. */ reference_temperature = 0; for (i = 0; i < ir->opts.ngtc; i++) { if (ir->opts.tau_t[i] >= 0) { reference_temperature = max(reference_temperature, ir->opts.ref_t[i]); } } } /* Resolution of the buffer size */ resolution = 0.001; env = getenv("GMX_VERLET_BUFFER_RES"); if (env != NULL) { sscanf(env, "%lf", &resolution); } /* In an atom wise pair-list there would be no pairs in the list * beyond the pair-list cut-off. * However, we use a pair-list of groups vs groups of atoms. * For groups of 4 atoms, the parallelism of SSE instructions, only * 10% of the atoms pairs are not in the list just beyond the cut-off. * As this percentage increases slowly compared to the decrease of the * Gaussian displacement distribution over this range, we can simply * reduce the drift by this fraction. * For larger groups, e.g. of 8 atoms, this fraction will be lower, * so then buffer size will be on the conservative (large) side. * * Note that the formulas used here do not take into account * cancellation of errors which could occur by missing both * attractive and repulsive interactions. * * The only major assumption is homogeneous particle distribution. * For an inhomogeneous system, such as a liquid-vapor system, * the buffer will be underestimated. The actual energy drift * will be higher by the factor: local/homogeneous particle density. * * The results of this estimate have been checked againt simulations. * In most cases the real drift differs by less than a factor 2. */ /* Worst case assumption: HCP packing of particles gives largest distance */ particle_distance = pow(boxvol*sqrt(2)/mtop->natoms, 1.0/3.0); get_verlet_buffer_atomtypes(mtop, &att, &natt, n_nonlin_vsite); assert(att != NULL && natt >= 0); if (debug) { fprintf(debug, "particle distance assuming HCP packing: %f nm\n", particle_distance); fprintf(debug, "energy drift atom types: %d\n", natt); } reppow = mtop->ffparams.reppow; md1_ljd = 0; d2_ljd = 0; md3_ljd = 0; md1_ljr = 0; d2_ljr = 0; md3_ljr = 0; if (ir->vdwtype == evdwCUT) { real sw_range, md3_pswf; switch (ir->vdw_modifier) { case eintmodNONE: case eintmodPOTSHIFT: /* -dV/dr of -r^-6 and r^-reppow */ md1_ljd = -6*pow(ir->rvdw, -7.0); md1_ljr = reppow*pow(ir->rvdw, -(reppow+1)); /* The contribution of the higher derivatives is negligible */ break; case eintmodFORCESWITCH: /* At the cut-off: V=V'=V''=0, so we use only V''' */ md3_ljd = -md3_force_switch(6.0, ir->rvdw_switch, ir->rvdw); md3_ljr = md3_force_switch(reppow, ir->rvdw_switch, ir->rvdw); break; case eintmodPOTSWITCH: /* At the cut-off: V=V'=V''=0. * V''' is given by the original potential times * the third derivative of the switch function. */ sw_range = ir->rvdw - ir->rvdw_switch; md3_pswf = 60.0*pow(sw_range, -3.0); md3_ljd = -pow(ir->rvdw, -6.0 )*md3_pswf; md3_ljr = pow(ir->rvdw, -reppow)*md3_pswf; break; default: gmx_incons("Unimplemented VdW modifier"); } } else if (EVDW_PME(ir->vdwtype)) { real b, r, br, br2, br4, br6; b = calc_ewaldcoeff_lj(ir->rvdw, ir->ewald_rtol_lj); r = ir->rvdw; br = b*r; br2 = br*br; br4 = br2*br2; br6 = br4*br2; /* -dV/dr of g(br)*r^-6 [where g(x) = exp(-x^2)(1+x^2+x^4/2), see LJ-PME equations in manual] and r^-reppow */ md1_ljd = -exp(-br2)*(br6 + 3.0*br4 + 6.0*br2 + 6.0)*pow(r, -7.0); md1_ljr = reppow*pow(r, -(reppow+1)); /* The contribution of the higher derivatives is negligible */ } else { gmx_fatal(FARGS, "Energy drift calculation is only implemented for plain cut-off Lennard-Jones interactions"); } elfac = ONE_4PI_EPS0/ir->epsilon_r; /* Determine md=-dV/dr and dd=d^2V/dr^2 */ md1_el = 0; d2_el = 0; if (ir->coulombtype == eelCUT || EEL_RF(ir->coulombtype)) { real eps_rf, k_rf; if (ir->coulombtype == eelCUT) { eps_rf = 1; k_rf = 0; } else { eps_rf = ir->epsilon_rf/ir->epsilon_r; if (eps_rf != 0) { k_rf = pow(ir->rcoulomb, -3.0)*(eps_rf - ir->epsilon_r)/(2*eps_rf + ir->epsilon_r); } else { /* epsilon_rf = infinity */ k_rf = 0.5*pow(ir->rcoulomb, -3.0); } } if (eps_rf > 0) { md1_el = elfac*(pow(ir->rcoulomb, -2.0) - 2*k_rf*ir->rcoulomb); } d2_el = elfac*(2*pow(ir->rcoulomb, -3.0) + 2*k_rf); } else if (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) { real b, rc, br; b = calc_ewaldcoeff_q(ir->rcoulomb, ir->ewald_rtol); rc = ir->rcoulomb; br = b*rc; md1_el = elfac*(b*exp(-br*br)*M_2_SQRTPI/rc + gmx_erfc(br)/(rc*rc)); d2_el = elfac/(rc*rc)*(2*b*(1 + br*br)*exp(-br*br)*M_2_SQRTPI + 2*gmx_erfc(br)/rc); } else { gmx_fatal(FARGS, "Energy drift calculation is only implemented for Reaction-Field and Ewald electrostatics"); } /* Determine the variance of the atomic displacement * over nstlist-1 steps: kT_fac * For inertial dynamics (not Brownian dynamics) the mass factor * is not included in kT_fac, it is added later. */ if (ir->eI == eiBD) { /* Get the displacement distribution from the random component only. * With accurate integration the systematic (force) displacement * should be negligible (unless nstlist is extremely large, which * you wouldn't do anyhow). */ kT_fac = 2*BOLTZ*reference_temperature*(ir->nstlist-1)*ir->delta_t; if (ir->bd_fric > 0) { /* This is directly sigma^2 of the displacement */ kT_fac /= ir->bd_fric; /* Set the masses to 1 as kT_fac is the full sigma^2, * but we divide by m in ener_drift(). */ for (i = 0; i < natt; i++) { att[i].prop.mass = 1; } } else { real tau_t; /* Per group tau_t is not implemented yet, use the maximum */ tau_t = ir->opts.tau_t[0]; for (i = 1; i < ir->opts.ngtc; i++) { tau_t = max(tau_t, ir->opts.tau_t[i]); } kT_fac *= tau_t; /* This kT_fac needs to be divided by the mass to get sigma^2 */ } } else { kT_fac = BOLTZ*reference_temperature*sqr((ir->nstlist-1)*ir->delta_t); } mass_min = att[0].prop.mass; for (i = 1; i < natt; i++) { mass_min = min(mass_min, att[i].prop.mass); } if (debug) { fprintf(debug, "md1_ljd %9.2e d2_ljd %9.2e md3_ljd %9.2e\n", md1_ljd, d2_ljd, md3_ljd); fprintf(debug, "md1_ljr %9.2e d2_ljr %9.2e md3_ljr %9.2e\n", md1_ljr, d2_ljr, md3_ljr); fprintf(debug, "md1_el %9.2e d2_el %9.2e\n", md1_el, d2_el); fprintf(debug, "sqrt(kT_fac) %f\n", sqrt(kT_fac)); fprintf(debug, "mass_min %f\n", mass_min); } /* Search using bisection */ ib0 = -1; /* The drift will be neglible at 5 times the max sigma */ ib1 = (int)(5*2*sqrt(kT_fac/mass_min)/resolution) + 1; while (ib1 - ib0 > 1) { ib = (ib0 + ib1)/2; rb = ib*resolution; rl = max(ir->rvdw, ir->rcoulomb) + rb; /* Calculate the average energy drift at the last step * of the nstlist steps at which the pair-list is used. */ drift = ener_drift(att, natt, &mtop->ffparams, kT_fac, md1_ljd, d2_ljd, md3_ljd, md1_ljr, d2_ljr, md3_ljr, md1_el, d2_el, rb, rl, boxvol); /* Correct for the fact that we are using a Ni x Nj particle pair list * and not a 1 x 1 particle pair list. This reduces the drift. */ /* We don't have a formula for 8 (yet), use 4 which is conservative */ nb_clust_frac_pairs_not_in_list_at_cutoff = surface_frac(min(list_setup->cluster_size_i, 4), particle_distance, rl)* surface_frac(min(list_setup->cluster_size_j, 4), particle_distance, rl); drift *= nb_clust_frac_pairs_not_in_list_at_cutoff; /* Convert the drift to drift per unit time per atom */ drift /= ir->nstlist*ir->delta_t*mtop->natoms; if (debug) { fprintf(debug, "ib %3d %3d %3d rb %.3f %dx%d fac %.3f drift %.1e\n", ib0, ib, ib1, rb, list_setup->cluster_size_i, list_setup->cluster_size_j, nb_clust_frac_pairs_not_in_list_at_cutoff, drift); } if (fabs(drift) > ir->verletbuf_tol) { ib0 = ib; } else { ib1 = ib; } } sfree(att); *rlist = max(ir->rvdw, ir->rcoulomb) + ib1*resolution; }
static real ener_drift(const verletbuf_atomtype_t *att, int natt, const gmx_ffparams_t *ffp, real kT_fac, real md1_ljd, real d2_ljd, real md3_ljd, real md1_ljr, real d2_ljr, real md3_ljr, real md1_el, real d2_el, real r_buffer, real rlist, real boxvol) { /* Erfc(8)=1e-29, use this limit so we have some space for arithmetic * on the result when using float precision. */ const real erfc_arg_max = 8.0; double drift_tot, pot1, pot2, pot3, pot; int i, j; real s2i_2d, s2i_3d, s2j_2d, s2j_3d, s2, s; int ti, tj; real md1, d2, md3; real sc_fac, rsh, rsh2; double c_exp, c_erfc; drift_tot = 0; /* Loop over the different atom type pairs */ for (i = 0; i < natt; i++) { get_atom_sigma2(kT_fac, &att[i].prop, &s2i_2d, &s2i_3d); ti = att[i].prop.type; for (j = i; j < natt; j++) { get_atom_sigma2(kT_fac, &att[j].prop, &s2j_2d, &s2j_3d); tj = att[j].prop.type; /* Add up the up to four independent variances */ s2 = s2i_2d + s2i_3d + s2j_2d + s2j_3d; /* Note that attractive and repulsive potentials for individual * pairs will partially cancel. */ /* -dV/dr at the cut-off for LJ + Coulomb */ md1 = md1_ljd*ffp->iparams[ti*ffp->atnr+tj].lj.c6 + md1_ljr*ffp->iparams[ti*ffp->atnr+tj].lj.c12 + md1_el*att[i].prop.q*att[j].prop.q; /* d2V/dr2 at the cut-off for LJ + Coulomb */ d2 = d2_ljd*ffp->iparams[ti*ffp->atnr+tj].lj.c6 + d2_ljr*ffp->iparams[ti*ffp->atnr+tj].lj.c12 + d2_el*att[i].prop.q*att[j].prop.q; /* -d3V/dr3 at the cut-off for LJ, we neglect Coulomb */ md3 = md3_ljd*ffp->iparams[ti*ffp->atnr+tj].lj.c6 + md3_ljr*ffp->iparams[ti*ffp->atnr+tj].lj.c12; rsh = r_buffer; sc_fac = 1.0; if (rsh*rsh > 2*s2*erfc_arg_max*erfc_arg_max) { /* Erfc might run out of float and become 0, somewhat before * c_exp becomes 0. To avoid this and to avoid NaN in * approx_2dof, we set both c_expc and c_erfc to zero. * In any relevant case this has no effect on the results, * since c_exp < 6e-29, so the displacement is completely * negligible for such atom pairs (and an overestimate). * In nearly all use cases, there will be other atom * pairs that contribute much more to the total, so zeroing * this particular contribution has no effect at all. */ c_exp = 0; c_erfc = 0; } else { /* For constraints: adapt r and scaling for the Gaussian */ if (att[i].prop.bConstr) { real sh, sc; approx_2dof(s2i_2d, r_buffer*s2i_2d/s2, &sh, &sc); rsh += sh; sc_fac *= sc; } if (att[j].prop.bConstr) { real sh, sc; approx_2dof(s2j_2d, r_buffer*s2j_2d/s2, &sh, &sc); rsh += sh; sc_fac *= sc; } /* Exact contribution of an atom pair with Gaussian displacement * with sigma s to the energy drift for a potential with * derivative -md and second derivative dd at the cut-off. * The only catch is that for potentials that change sign * near the cut-off there could be an unlucky compensation * of positive and negative energy drift. * Such potentials are extremely rare though. * * Note that pot has unit energy*length, as the linear * atom density still needs to be put in. */ c_exp = exp(-rsh*rsh/(2*s2))/sqrt(2*M_PI); c_erfc = 0.5*gmx_erfc(rsh/(sqrt(2*s2))); } s = sqrt(s2); rsh2 = rsh*rsh; pot1 = sc_fac* md1/2*((rsh2 + s2)*c_erfc - rsh*s*c_exp); pot2 = sc_fac* d2/6*(s*(rsh2 + 2*s2)*c_exp - rsh*(rsh2 + 3*s2)*c_erfc); pot3 = sc_fac* md3/24*((rsh2*rsh2 + 6*rsh2*s2 + 3*s2*s2)*c_erfc - rsh*s*(rsh2 + 5*s2)*c_exp); pot = pot1 + pot2 + pot3; if (gmx_debug_at) { fprintf(debug, "n %d %d d s %.3f %.3f %.3f %.3f con %d -d1 %8.1e d2 %8.1e -d3 %8.1e pot1 %8.1e pot2 %8.1e pot3 %8.1e pot %8.1e\n", att[i].n, att[j].n, sqrt(s2i_2d), sqrt(s2i_3d), sqrt(s2j_2d), sqrt(s2j_3d), att[i].prop.bConstr+att[j].prop.bConstr, md1, d2, md3, pot1, pot2, pot3, pot); } /* Multiply by the number of atom pairs */ if (j == i) { pot *= (double)att[i].n*(att[i].n - 1)/2; } else { pot *= (double)att[i].n*att[j].n; } /* We need the line density to get the energy drift of the system. * The effective average r^2 is close to (rlist+sigma)^2. */ pot *= 4*M_PI*sqr(rlist + s)/boxvol; /* Add the unsigned drift to avoid cancellation of errors */ drift_tot += fabs(pot); } } return drift_tot; }
gmx_bool pme_load_balance(pme_load_balancing_t pme_lb, t_commrec *cr, FILE *fp_err, FILE *fp_log, t_inputrec *ir, t_state *state, double cycles, interaction_const_t *ic, struct nonbonded_verlet_t *nbv, struct gmx_pme_t ** pmedata, gmx_int64_t step) { gmx_bool OK; pme_setup_t *set; double cycles_fast; char buf[STRLEN], sbuf[22]; real rtab; gmx_bool bUsesSimpleTables = TRUE; if (pme_lb->stage == pme_lb->nstage) { return FALSE; } if (PAR(cr)) { gmx_sumd(1, &cycles, cr); cycles /= cr->nnodes; } set = &pme_lb->setup[pme_lb->cur]; set->count++; rtab = ir->rlistlong + ir->tabext; if (set->count % 2 == 1) { /* Skip the first cycle, because the first step after a switch * is much slower due to allocation and/or caching effects. */ return TRUE; } sprintf(buf, "step %4s: ", gmx_step_str(step, sbuf)); print_grid(fp_err, fp_log, buf, "timed with", set, cycles); if (set->count <= 2) { set->cycles = cycles; } else { if (cycles*PME_LB_ACCEL_TOL < set->cycles && pme_lb->stage == pme_lb->nstage - 1) { /* The performance went up a lot (due to e.g. DD load balancing). * Add a stage, keep the minima, but rescan all setups. */ pme_lb->nstage++; if (debug) { fprintf(debug, "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n" "Increased the number stages to %d" " and ignoring the previous performance\n", set->grid[XX], set->grid[YY], set->grid[ZZ], cycles*1e-6, set->cycles*1e-6, PME_LB_ACCEL_TOL, pme_lb->nstage); } } set->cycles = min(set->cycles, cycles); } if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles) { pme_lb->fastest = pme_lb->cur; if (DOMAINDECOMP(cr)) { /* We found a new fastest setting, ensure that with subsequent * shorter cut-off's the dynamic load balancing does not make * the use of the current cut-off impossible. This solution is * a trade-off, as the PME load balancing and DD domain size * load balancing can interact in complex ways. * With the Verlet kernels, DD load imbalance will usually be * mainly due to bonded interaction imbalance, which will often * quickly push the domain boundaries beyond the limit for the * optimal, PME load balanced, cut-off. But it could be that * better overal performance can be obtained with a slightly * shorter cut-off and better DD load balancing. */ change_dd_dlb_cutoff_limit(cr); } } cycles_fast = pme_lb->setup[pme_lb->fastest].cycles; /* Check in stage 0 if we should stop scanning grids. * Stop when the time is more than SLOW_FAC longer than the fastest. */ if (pme_lb->stage == 0 && pme_lb->cur > 0 && cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC) { pme_lb->n = pme_lb->cur + 1; /* Done with scanning, go to stage 1 */ switch_to_stage1(pme_lb); } if (pme_lb->stage == 0) { int gridsize_start; gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ]; do { if (pme_lb->cur+1 < pme_lb->n) { /* We had already generated the next setup */ OK = TRUE; } else { /* Find the next setup */ OK = pme_loadbal_increase_cutoff(pme_lb, ir->pme_order, cr->dd); if (!OK) { pme_lb->elimited = epmelblimPMEGRID; } } if (OK && ir->ePBC != epbcNONE) { OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlistlong) <= max_cutoff2(ir->ePBC, state->box)); if (!OK) { pme_lb->elimited = epmelblimBOX; } } if (OK) { pme_lb->cur++; if (DOMAINDECOMP(cr)) { OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistlong); if (!OK) { /* Failed: do not use this setup */ pme_lb->cur--; pme_lb->elimited = epmelblimDD; } } } if (!OK) { /* We hit the upper limit for the cut-off, * the setup should not go further than cur. */ pme_lb->n = pme_lb->cur + 1; print_loadbal_limited(fp_err, fp_log, step, pme_lb); /* Switch to the next stage */ switch_to_stage1(pme_lb); } } while (OK && !(pme_lb->setup[pme_lb->cur].grid[XX]* pme_lb->setup[pme_lb->cur].grid[YY]* pme_lb->setup[pme_lb->cur].grid[ZZ] < gridsize_start*PME_LB_GRID_SCALE_FAC && pme_lb->setup[pme_lb->cur].grid_efficiency < pme_lb->setup[pme_lb->cur-1].grid_efficiency*PME_LB_GRID_EFFICIENCY_REL_FAC)); } if (pme_lb->stage > 0 && pme_lb->end == 1) { pme_lb->cur = 0; pme_lb->stage = pme_lb->nstage; } else if (pme_lb->stage > 0 && pme_lb->end > 1) { /* If stage = nstage-1: * scan over all setups, rerunning only those setups * which are not much slower than the fastest * else: * use the next setup */ do { pme_lb->cur++; if (pme_lb->cur == pme_lb->end) { pme_lb->stage++; pme_lb->cur = pme_lb->start; } } while (pme_lb->stage == pme_lb->nstage - 1 && pme_lb->setup[pme_lb->cur].count > 0 && pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC); if (pme_lb->stage == pme_lb->nstage) { /* We are done optimizing, use the fastest setup we found */ pme_lb->cur = pme_lb->fastest; } } if (DOMAINDECOMP(cr) && pme_lb->stage > 0) { OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistlong); if (!OK) { /* Failsafe solution */ if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage) { pme_lb->stage--; } pme_lb->fastest = 0; pme_lb->start = 0; pme_lb->end = pme_lb->cur; pme_lb->cur = pme_lb->start; pme_lb->elimited = epmelblimDD; print_loadbal_limited(fp_err, fp_log, step, pme_lb); } } /* Change the Coulomb cut-off and the PME grid */ set = &pme_lb->setup[pme_lb->cur]; ic->rcoulomb = set->rcut_coulomb; ic->rlist = set->rlist; ic->rlistlong = set->rlistlong; ir->nstcalclr = set->nstcalclr; ic->ewaldcoeff_q = set->ewaldcoeff_q; /* TODO: centralize the code that sets the potentials shifts */ if (ic->coulomb_modifier == eintmodPOTSHIFT) { ic->sh_ewald = gmx_erfc(ic->ewaldcoeff_q*ic->rcoulomb); } if (EVDW_PME(ic->vdwtype)) { /* We have PME for both Coulomb and VdW, set rvdw equal to rcoulomb */ ic->rvdw = set->rcut_coulomb; ic->ewaldcoeff_lj = set->ewaldcoeff_lj; if (ic->vdw_modifier == eintmodPOTSHIFT) { real crc2; ic->dispersion_shift.cpot = -pow(ic->rvdw, -6.0); ic->repulsion_shift.cpot = -pow(ic->rvdw, -12.0); ic->sh_invrc6 = -ic->dispersion_shift.cpot; crc2 = sqr(ic->ewaldcoeff_lj*ic->rvdw); ic->sh_lj_ewald = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0); } } bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0); nbnxn_gpu_pme_loadbal_update_param(nbv, ic); /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore * also sharing texture references. To keep the code simple, we don't * treat texture references as shared resources, but this means that * the coulomb_tab texture ref will get updated by multiple threads. * Hence, to ensure that the non-bonded kernels don't start before all * texture binding operations are finished, we need to wait for all ranks * to arrive here before continuing. * * Note that we could omit this barrier if GPUs are not shared (or * texture objects are used), but as this is initialization code, there * is not point in complicating things. */ #ifdef GMX_THREAD_MPI if (PAR(cr) && use_GPU(nbv)) { gmx_barrier(cr); } #endif /* GMX_THREAD_MPI */ /* Usually we won't need the simple tables with GPUs. * But we do with hybrid acceleration and with free energy. * To avoid bugs, we always re-initialize the simple tables here. */ init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab); if (cr->duty & DUTY_PME) { if (pme_lb->setup[pme_lb->cur].pmedata == NULL) { /* Generate a new PME data structure, * copying part of the old pointers. */ gmx_pme_reinit(&set->pmedata, cr, pme_lb->setup[0].pmedata, ir, set->grid); } *pmedata = set->pmedata; } else { /* Tell our PME-only node to switch grid */ gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff_q, set->ewaldcoeff_lj); } if (debug) { print_grid(NULL, debug, "", "switched to", set, -1); } if (pme_lb->stage == pme_lb->nstage) { print_grid(fp_err, fp_log, "", "optimal", set, -1); } return TRUE; }
static void fill_table(t_tabledata *td,int tp,const t_forcerec *fr) { /* Fill the table according to the formulas in the manual. * In principle, we only need the potential and the second * derivative, but then we would have to do lots of calculations * in the inner loop. By precalculating some terms (see manual) * we get better eventual performance, despite a larger table. * * Since some of these higher-order terms are very small, * we always use double precision to calculate them here, in order * to avoid unnecessary loss of precision. */ #ifdef DEBUG_SWITCH FILE *fp; #endif int i; double reppow,p; double r1,rc,r12,r13; double r,r2,r6,rc6; double expr,Vtab,Ftab; /* Parameters for David's function */ double A=0,B=0,C=0,A_3=0,B_4=0; /* Parameters for the switching function */ double ksw,swi,swi1; /* Temporary parameters */ gmx_bool bSwitch,bShift; double ewc=fr->ewaldcoeff; double isp= 0.564189583547756; bSwitch = ((tp == etabLJ6Switch) || (tp == etabLJ12Switch) || (tp == etabCOULSwitch) || (tp == etabEwaldSwitch) || (tp == etabEwaldUserSwitch)); bShift = ((tp == etabLJ6Shift) || (tp == etabLJ12Shift) || (tp == etabShift)); reppow = fr->reppow; if (tprops[tp].bCoulomb) { r1 = fr->rcoulomb_switch; rc = fr->rcoulomb; } else { r1 = fr->rvdw_switch; rc = fr->rvdw; } if (bSwitch) ksw = 1.0/(pow5(rc-r1)); else ksw = 0.0; if (bShift) { if (tp == etabShift) p = 1; else if (tp == etabLJ6Shift) p = 6; else p = reppow; A = p * ((p+1)*r1-(p+4)*rc)/(pow(rc,p+2)*pow2(rc-r1)); B = -p * ((p+1)*r1-(p+3)*rc)/(pow(rc,p+2)*pow3(rc-r1)); C = 1.0/pow(rc,p)-A/3.0*pow3(rc-r1)-B/4.0*pow4(rc-r1); if (tp == etabLJ6Shift) { A=-A; B=-B; C=-C; } A_3=A/3.0; B_4=B/4.0; } if (debug) { fprintf(debug,"Setting up tables\n"); fflush(debug); } #ifdef DEBUG_SWITCH fp=xvgropen("switch.xvg","switch","r","s"); #endif for(i=td->nx0; (i<td->nx); i++) { r = td->x[i]; r2 = r*r; r6 = 1.0/(r2*r2*r2); if (gmx_within_tol(reppow,12.0,10*GMX_DOUBLE_EPS)) { r12 = r6*r6; } else { r12 = pow(r,-reppow); } Vtab = 0.0; Ftab = 0.0; if (bSwitch) { /* swi is function, swi1 1st derivative and swi2 2nd derivative */ /* The switch function is 1 for r<r1, 0 for r>rc, and smooth for * r1<=r<=rc. The 1st and 2nd derivatives are both zero at * r1 and rc. * ksw is just the constant 1/(rc-r1)^5, to save some calculations... */ if(r<=r1) { swi = 1.0; swi1 = 0.0; } else if (r>=rc) { swi = 0.0; swi1 = 0.0; } else { swi = 1 - 10*pow3(r-r1)*ksw*pow2(rc-r1) + 15*pow4(r-r1)*ksw*(rc-r1) - 6*pow5(r-r1)*ksw; swi1 = -30*pow2(r-r1)*ksw*pow2(rc-r1) + 60*pow3(r-r1)*ksw*(rc-r1) - 30*pow4(r-r1)*ksw; } } else { /* not really needed, but avoids compiler warnings... */ swi = 1.0; swi1 = 0.0; } #ifdef DEBUG_SWITCH fprintf(fp,"%10g %10g %10g %10g\n",r,swi,swi1,swi2); #endif rc6 = rc*rc*rc; rc6 = 1.0/(rc6*rc6); switch (tp) { case etabLJ6: /* Dispersion */ Vtab = -r6; Ftab = 6.0*Vtab/r; break; case etabLJ6Switch: case etabLJ6Shift: /* Dispersion */ if (r < rc) { Vtab = -r6; Ftab = 6.0*Vtab/r; } break; case etabLJ12: /* Repulsion */ Vtab = r12; Ftab = reppow*Vtab/r; break; case etabLJ12Switch: case etabLJ12Shift: /* Repulsion */ if (r < rc) { Vtab = r12; Ftab = reppow*Vtab/r; } break; case etabLJ6Encad: if(r < rc) { Vtab = -(r6-6.0*(rc-r)*rc6/rc-rc6); Ftab = -(6.0*r6/r-6.0*rc6/rc); } else { /* r>rc */ Vtab = 0; Ftab = 0; } break; case etabLJ12Encad: if(r < rc) { Vtab = r12-12.0*(rc-r)*rc6*rc6/rc-1.0*rc6*rc6; Ftab = 12.0*r12/r-12.0*rc6*rc6/rc; } else { /* r>rc */ Vtab = 0; Ftab = 0; } break; case etabCOUL: Vtab = 1.0/r; Ftab = 1.0/r2; break; case etabCOULSwitch: case etabShift: if (r < rc) { Vtab = 1.0/r; Ftab = 1.0/r2; } break; case etabEwald: case etabEwaldSwitch: Vtab = gmx_erfc(ewc*r)/r; Ftab = gmx_erfc(ewc*r)/r2+2*exp(-(ewc*ewc*r2))*ewc*isp/r; break; case etabEwaldUser: case etabEwaldUserSwitch: /* Only calculate minus the reciprocal space contribution */ Vtab = -gmx_erf(ewc*r)/r; Ftab = -gmx_erf(ewc*r)/r2+2*exp(-(ewc*ewc*r2))*ewc*isp/r; break; case etabRF: case etabRF_ZERO: Vtab = 1.0/r + fr->k_rf*r2 - fr->c_rf; Ftab = 1.0/r2 - 2*fr->k_rf*r; if (tp == etabRF_ZERO && r >= rc) { Vtab = 0; Ftab = 0; } break; case etabEXPMIN: expr = exp(-r); Vtab = expr; Ftab = expr; break; case etabCOULEncad: if(r < rc) { Vtab = 1.0/r-(rc-r)/(rc*rc)-1.0/rc; Ftab = 1.0/r2-1.0/(rc*rc); } else { /* r>rc */ Vtab = 0; Ftab = 0; } break; default: gmx_fatal(FARGS,"Table type %d not implemented yet. (%s,%d)", tp,__FILE__,__LINE__); } if (bShift) { /* Normal coulomb with cut-off correction for potential */ if (r < rc) { Vtab -= C; /* If in Shifting range add something to it */ if (r > r1) { r12 = (r-r1)*(r-r1); r13 = (r-r1)*r12; Vtab += - A_3*r13 - B_4*r12*r12; Ftab += A*r12 + B*r13; } } } if (ETAB_USER(tp)) { Vtab += td->v[i]; Ftab += td->f[i]; } if ((r > r1) && bSwitch) { Ftab = Ftab*swi - Vtab*swi1; Vtab = Vtab*swi; } /* Convert to single precision when we store to mem */ td->v[i] = Vtab; td->f[i] = Ftab; } /* Continue the table linearly from nx0 to 0. * These values are only required for energy minimization with overlap or TPI. */ for(i=td->nx0-1; i>=0; i--) { td->v[i] = td->v[i+1] + td->f[i+1]*(td->x[i+1] - td->x[i]); td->f[i] = td->f[i+1]; } #ifdef DEBUG_SWITCH gmx_fio_fclose(fp); #endif }
static real ener_drift(const verletbuf_atomtype_t *att, int natt, const gmx_ffparams_t *ffp, real kT_fac, real md_ljd, real md_ljr, real md_el, real dd_el, real r_buffer, real rlist, real boxvol) { double drift_tot, pot1, pot2, pot; int i, j; real s2i, s2j, s2, s; int ti, tj; real md, dd; real sc_fac, rsh; double c_exp, c_erfc; drift_tot = 0; /* Loop over the different atom type pairs */ for (i = 0; i < natt; i++) { s2i = kT_fac/att[i].mass; ti = att[i].type; for (j = i; j < natt; j++) { s2j = kT_fac/att[j].mass; tj = att[j].type; /* Note that attractive and repulsive potentials for individual * pairs will partially cancel. */ /* -dV/dr at the cut-off for LJ + Coulomb */ md = md_ljd*ffp->iparams[ti*ffp->atnr+tj].lj.c6 + md_ljr*ffp->iparams[ti*ffp->atnr+tj].lj.c12 + md_el*att[i].q*att[j].q; /* d2V/dr2 at the cut-off for Coulomb, we neglect LJ */ dd = dd_el*att[i].q*att[j].q; s2 = s2i + s2j; rsh = r_buffer; sc_fac = 1.0; /* For constraints: adapt r and scaling for the Gaussian */ if (att[i].con) { real sh, sc; approx_2dof(s2i, r_buffer*s2i/s2, &sh, &sc); rsh += sh; sc_fac *= sc; } if (att[j].con) { real sh, sc; approx_2dof(s2j, r_buffer*s2j/s2, &sh, &sc); rsh += sh; sc_fac *= sc; } /* Exact contribution of an atom pair with Gaussian displacement * with sigma s to the energy drift for a potential with * derivative -md and second derivative dd at the cut-off. * The only catch is that for potentials that change sign * near the cut-off there could be an unlucky compensation * of positive and negative energy drift. * Such potentials are extremely rare though. * * Note that pot has unit energy*length, as the linear * atom density still needs to be put in. */ c_exp = exp(-rsh*rsh/(2*s2))/sqrt(2*M_PI); c_erfc = 0.5*gmx_erfc(rsh/(sqrt(2*s2))); s = sqrt(s2); pot1 = sc_fac* md/2*((rsh*rsh + s2)*c_erfc - rsh*s*c_exp); pot2 = sc_fac* dd/6*(s*(rsh*rsh + 2*s2)*c_exp - rsh*(rsh*rsh + 3*s2)*c_erfc); pot = pot1 + pot2; if (gmx_debug_at) { fprintf(debug, "n %d %d d s %.3f %.3f con %d md %8.1e dd %8.1e pot1 %8.1e pot2 %8.1e pot %8.1e\n", att[i].n, att[j].n, sqrt(s2i), sqrt(s2j), att[i].con+att[j].con, md, dd, pot1, pot2, pot); } /* Multiply by the number of atom pairs */ if (j == i) { pot *= (double)att[i].n*(att[i].n - 1)/2; } else { pot *= (double)att[i].n*att[j].n; } /* We need the line density to get the energy drift of the system. * The effective average r^2 is close to (rlist+sigma)^2. */ pot *= 4*M_PI*sqr(rlist + s)/boxvol; /* Add the unsigned drift to avoid cancellation of errors */ drift_tot += fabs(pot); } } return drift_tot; }