static void pp_verlet_load(gmx_mtop_t *mtop, t_inputrec *ir, matrix box,
                           int *nq_tot, int *nlj_tot,
                           double *cost_pp,
                           gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed)
{
    t_atom        *atom;
    int            mb, nmol, atnr, a, nqlj, nq, nlj;
    gmx_bool       bQRF;
    t_iparams     *iparams;
    gmx_moltype_t *molt;
    real           r_eff;
    double         c_qlj, c_q, c_lj;
    double         nppa;
    int            j_cluster_size;
    /* Conversion factor for reference vs SIMD kernel performance.
     * The factor is about right for SSE2/4, but should be 2 higher for AVX256.
     */
#ifdef GMX_DOUBLE
    const real     nbnxn_refkernel_fac = 4.0;
#else
    const real     nbnxn_refkernel_fac = 8.0;
#endif

    bQRF = (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT);

    iparams           = mtop->ffparams.iparams;
    atnr              = mtop->ffparams.atnr;
    nqlj              = 0;
    nq                = 0;
    *bChargePerturbed = FALSE;
    *bTypePerturbed   = FALSE;
    for (mb = 0; mb < mtop->nmolblock; mb++)
    {
        molt = &mtop->moltype[mtop->molblock[mb].type];
        atom = molt->atoms.atom;
        nmol = mtop->molblock[mb].nmol;
        for (a = 0; a < molt->atoms.nr; a++)
        {
            if (atom[a].q != 0 || atom[a].qB != 0)
            {
                if (iparams[(atnr+1)*atom[a].type].lj.c6 != 0 ||
                    iparams[(atnr+1)*atom[a].type].lj.c12 != 0)
                {
                    nqlj += nmol;
                }
                else
                {
                    nq += nmol;
                }
            }
            if (atom[a].q != atom[a].qB)
            {
                *bChargePerturbed = TRUE;
            }
            if (atom[a].type != atom[a].typeB)
            {
                *bTypePerturbed = TRUE;
            }
        }
    }

    nlj = mtop->natoms - nqlj - nq;

    *nq_tot  = nqlj + nq;
    *nlj_tot = nqlj + nlj;

    /* Effective cut-off for cluster pair list of 4x4 or 4x8 atoms.
     * This choice should match the one of pick_nbnxn_kernel_cpu().
     * TODO: Make this function use pick_nbnxn_kernel_cpu().
     */
#if defined GMX_SIMD_HAVE_REAL && ((GMX_SIMD_REAL_WIDTH == 8 && defined GMX_SIMD_HAVE_FMA) || GMX_SIMD_REAL_WIDTH > 8)
    j_cluster_size = 8;
#else
    j_cluster_size = 4;
#endif
    r_eff = ir->rlist + nbnxn_get_rlist_effective_inc(j_cluster_size, mtop->natoms/det(box));

    /* The average number of pairs per atom */
    nppa = 0.5*4/3*M_PI*r_eff*r_eff*r_eff*mtop->natoms/det(box);

    if (debug)
    {
        fprintf(debug, "nqlj %d nq %d nlj %d rlist %.3f r_eff %.3f pairs per atom %.1f\n",
                nqlj, nq, nlj, ir->rlist, r_eff, nppa);
    }

    /* Determine the cost per pair interaction */
    c_qlj = (bQRF ? c_nbnxn_qrf_lj : c_nbnxn_qexp_lj);
    c_q   = (bQRF ? c_nbnxn_qrf    : c_nbnxn_qexp);
    c_lj  = c_nbnxn_lj;
    if (ir->vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir->vdwtype))
    {
        c_qlj += c_nbnxn_ljexp_add;
        c_lj  += c_nbnxn_ljexp_add;
    }
    if (EVDW_PME(ir->vdwtype) && ir->ljpme_combination_rule == eljpmeLB)
    {
        /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */
        c_qlj *= nbnxn_refkernel_fac;
        c_q   *= nbnxn_refkernel_fac;
        c_lj  *= nbnxn_refkernel_fac;
    }

    /* For the PP non-bonded cost it is (unrealistically) assumed
     * that all atoms are distributed homogeneously in space.
     */
    *cost_pp = (nqlj*c_qlj + nq*c_q + nlj*c_lj)*nppa;

    *cost_pp *= simd_cycle_factor(bHaveSIMD);
}
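/* The cost model above hinges on one quantity: the average number of
 * pair interactions per atom under the (unrealistic) assumption of a
 * homogeneous atom density. The sketch below reproduces just that
 * estimate in isolation; it is an illustrative helper, not part of the
 * GROMACS API, and the example numbers in the trailing comment are
 * assumed values, not defaults from the code above.
 */
#include <math.h>

static double pairs_per_atom_sketch(double natoms, double volume, double r_eff)
{
    /* Half the atoms inside a sphere of radius r_eff at number density
     * natoms/volume; the factor 0.5 counts each pair only once.
     */
    return 0.5*4.0/3.0*M_PI*r_eff*r_eff*r_eff*natoms/volume;
}

/* Example: pairs_per_atom_sketch(24000, 240.0, 1.03) ~= 229 pairs per atom
 * for a hypothetical 24000-atom system in a 240 nm^3 box (roughly the
 * atom density of water) with an effective list radius of 1.03 nm.
 */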
/* Try to increase nstlist when using the Verlet cut-off scheme */
static void increase_nstlist(FILE *fp, t_commrec *cr,
                             t_inputrec *ir, int nstlist_cmdline,
                             const gmx_mtop_t *mtop, matrix box,
                             gmx_bool bGPU)
{
    float                  listfac_ok, listfac_max;
    int                    nstlist_orig, nstlist_prev;
    verletbuf_list_setup_t ls;
    real                   rlistWithReferenceNstlist, rlist_inc, rlist_ok, rlist_max;
    real                   rlist_new, rlist_prev;
    size_t                 nstlist_ind = 0;
    t_state                state_tmp;
    gmx_bool               bBox, bDD, bCont;
    const char            *nstl_gpu = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
    const char            *nve_err  = "Can not increase nstlist because an NVE ensemble is used";
    const char            *vbd_err  = "Can not increase nstlist because verlet-buffer-tolerance is not set or used";
    const char            *box_err  = "Can not increase nstlist because the box is too small";
    const char            *dd_err   = "Can not increase nstlist because of domain decomposition limitations";
    char                   buf[STRLEN];
    const float            oneThird = 1.0f / 3.0f;

    if (nstlist_cmdline <= 0)
    {
        if (ir->nstlist == 1)
        {
            /* The user probably set nstlist=1 for a reason,
             * don't mess with the settings.
             */
            return;
        }

        if (fp != NULL && bGPU && ir->nstlist < nstlist_try[0])
        {
            fprintf(fp, nstl_gpu, ir->nstlist);
        }
        nstlist_ind = 0;
        while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind])
        {
            nstlist_ind++;
        }
        if (nstlist_ind == NNSTL)
        {
            /* There are no larger nstlist values to try */
            return;
        }
    }

    if (EI_MD(ir->eI) && ir->etc == etcNO)
    {
        if (MASTER(cr))
        {
            fprintf(stderr, "%s\n", nve_err);
        }
        if (fp != NULL)
        {
            fprintf(fp, "%s\n", nve_err);
        }

        return;
    }

    if (ir->verletbuf_tol == 0 && bGPU)
    {
        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
    }

    if (ir->verletbuf_tol < 0)
    {
        if (MASTER(cr))
        {
            fprintf(stderr, "%s\n", vbd_err);
        }
        if (fp != NULL)
        {
            fprintf(fp, "%s\n", vbd_err);
        }

        return;
    }

    if (bGPU)
    {
        listfac_ok  = nbnxn_gpu_listfac_ok;
        listfac_max = nbnxn_gpu_listfac_max;
    }
    else
    {
        listfac_ok  = nbnxn_cpu_listfac_ok;
        listfac_max = nbnxn_cpu_listfac_max;
    }

    nstlist_orig = ir->nstlist;
    if (nstlist_cmdline > 0)
    {
        if (fp)
        {
            sprintf(buf, "Getting nstlist=%d from command line option",
                    nstlist_cmdline);
            fprintf(fp, "%s\n", buf);
        }
        ir->nstlist = nstlist_cmdline;
    }

    verletbuf_get_list_setup(TRUE, bGPU, &ls);

    /* Allow rlist to make the list a given factor larger than the list
     * would be with the reference value for nstlist (10).
     */
    nstlist_prev = ir->nstlist;
    ir->nstlist  = nbnxnReferenceNstlist;
    calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL,
                            &rlistWithReferenceNstlist);
    ir->nstlist  = nstlist_prev;

    /* Determine the pair list size increase due to zero interactions */
    rlist_inc = nbnxn_get_rlist_effective_inc(ls.cluster_size_j,
                                              mtop->natoms/det(box));
    rlist_ok  = (rlistWithReferenceNstlist + rlist_inc)*pow(listfac_ok, oneThird) - rlist_inc;
    rlist_max = (rlistWithReferenceNstlist + rlist_inc)*pow(listfac_max, oneThird) - rlist_inc;
    if (debug)
    {
        fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n",
                rlist_inc, rlist_ok, rlist_max);
    }

    nstlist_prev = nstlist_orig;
    rlist_prev   = ir->rlist;
    do
    {
        if (nstlist_cmdline <= 0)
        {
            ir->nstlist = nstlist_try[nstlist_ind];
        }

        /* Set the pair-list buffer size in ir */
        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);

        /* Does rlist fit in the box? */
        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
        bDD  = TRUE;
        if (bBox && DOMAINDECOMP(cr))
        {
            /* Check if rlist fits in the domain decomposition */
            if (inputrec2nboundeddim(ir) < DIM)
            {
                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
            }
            copy_mat(box, state_tmp.box);
            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
        }

        if (debug)
        {
            fprintf(debug, "nstlist %d rlist %.3f bBox %d bDD %d\n",
                    ir->nstlist, rlist_new, bBox, bDD);
        }

        bCont = FALSE;

        if (nstlist_cmdline <= 0)
        {
            if (bBox && bDD && rlist_new <= rlist_max)
            {
                /* Increase nstlist */
                nstlist_prev = ir->nstlist;
                rlist_prev   = rlist_new;
                bCont        = (nstlist_ind+1 < NNSTL && rlist_new < rlist_ok);
            }
            else
            {
                /* Stick with the previous nstlist */
                ir->nstlist = nstlist_prev;
                rlist_new   = rlist_prev;
                bBox        = TRUE;
                bDD         = TRUE;
            }
        }

        nstlist_ind++;
    }
    while (bCont);

    if (!bBox || !bDD)
    {
        gmx_warning(!bBox ? box_err : dd_err);
        if (fp != NULL)
        {
            fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
        }
        ir->nstlist = nstlist_orig;
    }
    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
    {
        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
                nstlist_orig, ir->nstlist, ir->rlist, rlist_new);
        if (MASTER(cr))
        {
            fprintf(stderr, "%s\n\n", buf);
        }
        if (fp != NULL)
        {
            fprintf(fp, "%s\n\n", buf);
        }
        ir->rlist     = rlist_new;
        ir->rlistlong = rlist_new;
    }
}
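/* The rlist_ok/rlist_max bounds above come from a simple volume argument:
 * letting the pair list grow by a factor listfac relative to the
 * reference-nstlist list means the effective list radius (the cut-off
 * plus the zero-interaction increment rlist_inc) grows by listfac^(1/3).
 * Below is a minimal sketch of that scaling with illustrative inputs;
 * the helper name and the factors in the example are assumptions for
 * demonstration, not the GROMACS listfac constants.
 */
#include <math.h>

static double scaled_rlist_sketch(double rlistRef, double rlistInc, double listfac)
{
    /* Scale the effective radius by the cube root of the allowed list
     * volume factor, then strip the increment again to get a cut-off.
     */
    return (rlistRef + rlistInc)*pow(listfac, 1.0/3.0) - rlistInc;
}

/* Example: with rlistRef = 1.0 nm and rlistInc = 0.08 nm (assumed),
 * scaled_rlist_sketch(1.0, 0.08, 1.2)  ~= 1.068 nm and
 * scaled_rlist_sketch(1.0, 0.08, 1.35) ~= 1.114 nm,
 * which is how a tolerated list-size factor translates into the
 * rlist_ok and rlist_max cut-off bounds used in the tuning loop.
 */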