gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
                          t_commrec           *cr,
                          FILE                *fp_err,
                          FILE                *fp_log,
                          t_inputrec          *ir,
                          t_state             *state,
                          double               cycles,
                          interaction_const_t *ic,
                          nonbonded_verlet_t  *nbv,
                          gmx_pme_t           *pmedata,
                          gmx_large_int_t      step)
{
    gmx_bool     OK;
    pme_setup_t *set;
    double       cycles_fast;
    char         buf[STRLEN], sbuf[22];
    real         rtab;
    gmx_bool     bUsesSimpleTables = TRUE;

    if (pme_lb->stage == pme_lb->nstage)
    {
        return FALSE;
    }

    if (PAR(cr))
    {
        gmx_sumd(1, &cycles, cr);
        cycles /= cr->nnodes;
    }

    set = &pme_lb->setup[pme_lb->cur];
    set->count++;

    rtab = ir->rlistlong + ir->tabext;

    if (set->count % 2 == 1)
    {
        /* Skip the first cycle, because the first step after a switch
         * is much slower due to allocation and/or caching effects.
         */
        return TRUE;
    }

    sprintf(buf, "step %4s: ", gmx_step_str(step, sbuf));
    print_grid(fp_err, fp_log, buf, "timed with", set, cycles);

    if (set->count <= 2)
    {
        set->cycles = cycles;
    }
    else
    {
        if (cycles*PME_LB_ACCEL_TOL < set->cycles &&
            pme_lb->stage == pme_lb->nstage - 1)
        {
            /* The performance went up a lot (due to e.g. DD load balancing).
             * Add a stage, keep the minima, but rescan all setups.
             */
            pme_lb->nstage++;

            if (debug)
            {
                fprintf(debug, "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
                        "Increased the number of stages to %d"
                        " and ignoring the previous performance\n",
                        set->grid[XX], set->grid[YY], set->grid[ZZ],
                        cycles*1e-6, set->cycles*1e-6, PME_LB_ACCEL_TOL,
                        pme_lb->nstage);
            }
        }
        set->cycles = min(set->cycles, cycles);
    }

    if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles)
    {
        pme_lb->fastest = pme_lb->cur;

        if (DOMAINDECOMP(cr))
        {
            /* We found a new fastest setting, ensure that with subsequent
             * shorter cut-offs the dynamic load balancing does not make
             * the use of the current cut-off impossible. This solution is
             * a trade-off, as the PME load balancing and DD domain size
             * load balancing can interact in complex ways.
             * With the Verlet kernels, DD load imbalance will usually be
             * mainly due to bonded interaction imbalance, which will often
             * quickly push the domain boundaries beyond the limit for the
             * optimal, PME load balanced, cut-off. But it could be that
             * better overall performance can be obtained with a slightly
             * shorter cut-off and better DD load balancing.
             */
            change_dd_dlb_cutoff_limit(cr);
        }
    }
    cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;

    /* Check in stage 0 if we should stop scanning grids.
     * Stop when the time is more than SLOW_FAC longer than the fastest.
     */
    if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
        cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
    {
        pme_lb->n = pme_lb->cur + 1;
        /* Done with scanning, go to stage 1 */
        switch_to_stage1(pme_lb);
    }

    if (pme_lb->stage == 0)
    {
        int gridsize_start;

        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];

        do
        {
            if (pme_lb->cur+1 < pme_lb->n)
            {
                /* We had already generated the next setup */
                OK = TRUE;
            }
            else
            {
                /* Find the next setup */
                OK = pme_loadbal_increase_cutoff(pme_lb, ir->pme_order);
            }

            if (OK && ir->ePBC != epbcNONE)
            {
                OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlistlong)
                      <= max_cutoff2(ir->ePBC, state->box));
                if (!OK)
                {
                    pme_lb->elimited = epmelblimBOX;
                }
            }

            if (OK)
            {
                pme_lb->cur++;

                if (DOMAINDECOMP(cr))
                {
                    OK = change_dd_cutoff(cr, state, ir,
                                          pme_lb->setup[pme_lb->cur].rlistlong);
                    if (!OK)
                    {
                        /* Failed: do not use this setup */
                        pme_lb->cur--;
                        pme_lb->elimited = epmelblimDD;
                    }
                }
            }
            if (!OK)
            {
                /* We hit the upper limit for the cut-off,
                 * the setup should not go further than cur.
                 */
                pme_lb->n = pme_lb->cur + 1;
                print_loadbal_limited(fp_err, fp_log, step, pme_lb);
                /* Switch to the next stage */
                switch_to_stage1(pme_lb);
            }
        }
        while (OK &&
               !(pme_lb->setup[pme_lb->cur].grid[XX]*
                 pme_lb->setup[pme_lb->cur].grid[YY]*
                 pme_lb->setup[pme_lb->cur].grid[ZZ] <
                 gridsize_start*PME_LB_GRID_SCALE_FAC
                 &&
                 pme_lb->setup[pme_lb->cur].grid_efficiency <
                 pme_lb->setup[pme_lb->cur-1].grid_efficiency*PME_LB_GRID_EFFICIENCY_REL_FAC));
    }

    if (pme_lb->stage > 0 && pme_lb->end == 1)
    {
        pme_lb->cur   = 0;
        pme_lb->stage = pme_lb->nstage;
    }
    else if (pme_lb->stage > 0 && pme_lb->end > 1)
    {
        /* If stage = nstage-1:
         *   scan over all setups, rerunning only those setups
         *   which are not much slower than the fastest
         * else:
         *   use the next setup
         */
        do
        {
            pme_lb->cur++;
            if (pme_lb->cur == pme_lb->end)
            {
                pme_lb->stage++;
                pme_lb->cur = pme_lb->start;
            }
        }
        while (pme_lb->stage == pme_lb->nstage - 1 &&
               pme_lb->setup[pme_lb->cur].count > 0 &&
               pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC);

        if (pme_lb->stage == pme_lb->nstage)
        {
            /* We are done optimizing, use the fastest setup we found */
            pme_lb->cur = pme_lb->fastest;
        }
    }

    if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
    {
        OK = change_dd_cutoff(cr, state, ir,
                              pme_lb->setup[pme_lb->cur].rlistlong);
        if (!OK)
        {
            /* Failsafe solution */
            if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
            {
                pme_lb->stage--;
            }
            pme_lb->fastest  = 0;
            pme_lb->start    = 0;
            pme_lb->end      = pme_lb->cur;
            pme_lb->cur      = pme_lb->start;
            pme_lb->elimited = epmelblimDD;
            print_loadbal_limited(fp_err, fp_log, step, pme_lb);
        }
    }

    /* Change the Coulomb cut-off and the PME grid */

    set = &pme_lb->setup[pme_lb->cur];

    ic->rcoulomb   = set->rcut_coulomb;
    ic->rlist      = set->rlist;
    ic->rlistlong  = set->rlistlong;
    ir->nstcalclr  = set->nstcalclr;
    ic->ewaldcoeff = set->ewaldcoeff;

    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
    if (pme_lb->cutoff_scheme == ecutsVERLET &&
        nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
    {
        nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
    }
    else
    {
        init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab);
    }

    if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->ngrp > 1)
    {
        init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab);
    }

    if (cr->duty & DUTY_PME)
    {
        if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
        {
            /* Generate a new PME data structure,
             * copying part of the old pointers.
             */
            gmx_pme_reinit(&set->pmedata,
                           cr, pme_lb->setup[0].pmedata, ir,
                           set->grid);
        }
        *pmedata = set->pmedata;
    }
    else
    {
        /* Tell our PME-only node to switch grid */
        gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff);
    }

    if (debug)
    {
        print_grid(NULL, debug, "", "switched to", set, -1);
    }

    if (pme_lb->stage == pme_lb->nstage)
    {
        print_grid(fp_err, fp_log, "", "optimal", set, -1);
    }

    return TRUE;
}
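/*
 * Illustrative sketch, not part of the original source: the pme_setup_t
 * members that the balancer above reads and writes, inferred from its usage.
 * The real struct is defined elsewhere in the PME load-balancing code; the
 * exact types and any additional members shown here are assumptions.
 */
#if 0
typedef struct {
    real       rcut_coulomb;    /* Coulomb cut-off for this setup              */
    real       rlist;           /* pair-list cut-off                           */
    real       rlistlong;       /* long-range pair-list cut-off                */
    int        nstcalclr;       /* interval for long-range force evaluation    */
    int        grid[DIM];       /* PME grid dimensions in x, y, z              */
    real       grid_efficiency; /* relative efficiency of this grid            */
    real       ewaldcoeff;      /* Ewald splitting coefficient                 */
    gmx_pme_t  pmedata;         /* PME data (re)initialized for this grid      */
    int        count;           /* number of times this setup has been timed   */
    double     cycles;          /* best cycle count measured for this setup    */
} pme_setup_t;
#endif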
gmx_bool pme_load_balance(pme_load_balancing_t       pme_lb,
                          t_commrec                 *cr,
                          FILE                      *fp_err,
                          FILE                      *fp_log,
                          t_inputrec                *ir,
                          t_state                   *state,
                          double                     cycles,
                          interaction_const_t       *ic,
                          struct nonbonded_verlet_t *nbv,
                          struct gmx_pme_t         **pmedata,
                          gmx_int64_t                step)
{
    gmx_bool     OK;
    pme_setup_t *set;
    double       cycles_fast;
    char         buf[STRLEN], sbuf[22];
    real         rtab;
    gmx_bool     bUsesSimpleTables = TRUE;

    if (pme_lb->stage == pme_lb->nstage)
    {
        return FALSE;
    }

    if (PAR(cr))
    {
        gmx_sumd(1, &cycles, cr);
        cycles /= cr->nnodes;
    }

    set = &pme_lb->setup[pme_lb->cur];
    set->count++;

    rtab = ir->rlistlong + ir->tabext;

    if (set->count % 2 == 1)
    {
        /* Skip the first cycle, because the first step after a switch
         * is much slower due to allocation and/or caching effects.
         */
        return TRUE;
    }

    sprintf(buf, "step %4s: ", gmx_step_str(step, sbuf));
    print_grid(fp_err, fp_log, buf, "timed with", set, cycles);

    if (set->count <= 2)
    {
        set->cycles = cycles;
    }
    else
    {
        if (cycles*PME_LB_ACCEL_TOL < set->cycles &&
            pme_lb->stage == pme_lb->nstage - 1)
        {
            /* The performance went up a lot (due to e.g. DD load balancing).
             * Add a stage, keep the minima, but rescan all setups.
             */
            pme_lb->nstage++;

            if (debug)
            {
                fprintf(debug, "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
                        "Increased the number of stages to %d"
                        " and ignoring the previous performance\n",
                        set->grid[XX], set->grid[YY], set->grid[ZZ],
                        cycles*1e-6, set->cycles*1e-6, PME_LB_ACCEL_TOL,
                        pme_lb->nstage);
            }
        }
        set->cycles = min(set->cycles, cycles);
    }

    if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles)
    {
        pme_lb->fastest = pme_lb->cur;

        if (DOMAINDECOMP(cr))
        {
            /* We found a new fastest setting, ensure that with subsequent
             * shorter cut-offs the dynamic load balancing does not make
             * the use of the current cut-off impossible. This solution is
             * a trade-off, as the PME load balancing and DD domain size
             * load balancing can interact in complex ways.
             * With the Verlet kernels, DD load imbalance will usually be
             * mainly due to bonded interaction imbalance, which will often
             * quickly push the domain boundaries beyond the limit for the
             * optimal, PME load balanced, cut-off. But it could be that
             * better overall performance can be obtained with a slightly
             * shorter cut-off and better DD load balancing.
             */
            change_dd_dlb_cutoff_limit(cr);
        }
    }
    cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;

    /* Check in stage 0 if we should stop scanning grids.
     * Stop when the time is more than SLOW_FAC longer than the fastest.
     */
    if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
        cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
    {
        pme_lb->n = pme_lb->cur + 1;
        /* Done with scanning, go to stage 1 */
        switch_to_stage1(pme_lb);
    }

    if (pme_lb->stage == 0)
    {
        int gridsize_start;

        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];

        do
        {
            if (pme_lb->cur+1 < pme_lb->n)
            {
                /* We had already generated the next setup */
                OK = TRUE;
            }
            else
            {
                /* Find the next setup */
                OK = pme_loadbal_increase_cutoff(pme_lb, ir->pme_order, cr->dd);

                if (!OK)
                {
                    pme_lb->elimited = epmelblimPMEGRID;
                }
            }

            if (OK && ir->ePBC != epbcNONE)
            {
                OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlistlong)
                      <= max_cutoff2(ir->ePBC, state->box));
                if (!OK)
                {
                    pme_lb->elimited = epmelblimBOX;
                }
            }

            if (OK)
            {
                pme_lb->cur++;

                if (DOMAINDECOMP(cr))
                {
                    OK = change_dd_cutoff(cr, state, ir,
                                          pme_lb->setup[pme_lb->cur].rlistlong);
                    if (!OK)
                    {
                        /* Failed: do not use this setup */
                        pme_lb->cur--;
                        pme_lb->elimited = epmelblimDD;
                    }
                }
            }
            if (!OK)
            {
                /* We hit the upper limit for the cut-off,
                 * the setup should not go further than cur.
                 */
                pme_lb->n = pme_lb->cur + 1;
                print_loadbal_limited(fp_err, fp_log, step, pme_lb);
                /* Switch to the next stage */
                switch_to_stage1(pme_lb);
            }
        }
        while (OK &&
               !(pme_lb->setup[pme_lb->cur].grid[XX]*
                 pme_lb->setup[pme_lb->cur].grid[YY]*
                 pme_lb->setup[pme_lb->cur].grid[ZZ] <
                 gridsize_start*PME_LB_GRID_SCALE_FAC
                 &&
                 pme_lb->setup[pme_lb->cur].grid_efficiency <
                 pme_lb->setup[pme_lb->cur-1].grid_efficiency*PME_LB_GRID_EFFICIENCY_REL_FAC));
    }

    if (pme_lb->stage > 0 && pme_lb->end == 1)
    {
        pme_lb->cur   = 0;
        pme_lb->stage = pme_lb->nstage;
    }
    else if (pme_lb->stage > 0 && pme_lb->end > 1)
    {
        /* If stage = nstage-1:
         *   scan over all setups, rerunning only those setups
         *   which are not much slower than the fastest
         * else:
         *   use the next setup
         */
        do
        {
            pme_lb->cur++;
            if (pme_lb->cur == pme_lb->end)
            {
                pme_lb->stage++;
                pme_lb->cur = pme_lb->start;
            }
        }
        while (pme_lb->stage == pme_lb->nstage - 1 &&
               pme_lb->setup[pme_lb->cur].count > 0 &&
               pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC);

        if (pme_lb->stage == pme_lb->nstage)
        {
            /* We are done optimizing, use the fastest setup we found */
            pme_lb->cur = pme_lb->fastest;
        }
    }

    if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
    {
        OK = change_dd_cutoff(cr, state, ir,
                              pme_lb->setup[pme_lb->cur].rlistlong);
        if (!OK)
        {
            /* Failsafe solution */
            if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
            {
                pme_lb->stage--;
            }
            pme_lb->fastest  = 0;
            pme_lb->start    = 0;
            pme_lb->end      = pme_lb->cur;
            pme_lb->cur      = pme_lb->start;
            pme_lb->elimited = epmelblimDD;
            print_loadbal_limited(fp_err, fp_log, step, pme_lb);
        }
    }

    /* Change the Coulomb cut-off and the PME grid */

    set = &pme_lb->setup[pme_lb->cur];

    ic->rcoulomb     = set->rcut_coulomb;
    ic->rlist        = set->rlist;
    ic->rlistlong    = set->rlistlong;
    ir->nstcalclr    = set->nstcalclr;
    ic->ewaldcoeff_q = set->ewaldcoeff_q;
    /* TODO: centralize the code that sets the potentials shifts */
    if (ic->coulomb_modifier == eintmodPOTSHIFT)
    {
        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff_q*ic->rcoulomb);
    }
    if (EVDW_PME(ic->vdwtype))
    {
        /* We have PME for both Coulomb and VdW, set rvdw equal to rcoulomb */
        ic->rvdw          = set->rcut_coulomb;
        ic->ewaldcoeff_lj = set->ewaldcoeff_lj;
        if (ic->vdw_modifier == eintmodPOTSHIFT)
        {
            real crc2;

            ic->dispersion_shift.cpot = -pow(ic->rvdw, -6.0);
            ic->repulsion_shift.cpot  = -pow(ic->rvdw, -12.0);
            ic->sh_invrc6             = -ic->dispersion_shift.cpot;
            crc2                      = sqr(ic->ewaldcoeff_lj*ic->rvdw);
            ic->sh_lj_ewald           = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0);
        }
    }

    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
    nbnxn_gpu_pme_loadbal_update_param(nbv, ic);

    /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
     * also sharing texture references. To keep the code simple, we don't
     * treat texture references as shared resources, but this means that
     * the coulomb_tab texture ref will get updated by multiple threads.
     * Hence, to ensure that the non-bonded kernels don't start before all
     * texture binding operations are finished, we need to wait for all ranks
     * to arrive here before continuing.
     *
     * Note that we could omit this barrier if GPUs are not shared (or
     * texture objects are used), but as this is initialization code, there
     * is no point in complicating things.
     */
#ifdef GMX_THREAD_MPI
    if (PAR(cr) && use_GPU(nbv))
    {
        gmx_barrier(cr);
    }
#endif  /* GMX_THREAD_MPI */

    /* Usually we won't need the simple tables with GPUs.
     * But we do with hybrid acceleration and with free energy.
     * To avoid bugs, we always re-initialize the simple tables here.
     */
    init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab);

    if (cr->duty & DUTY_PME)
    {
        if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
        {
            /* Generate a new PME data structure,
             * copying part of the old pointers.
             */
            gmx_pme_reinit(&set->pmedata,
                           cr, pme_lb->setup[0].pmedata, ir,
                           set->grid);
        }
        *pmedata = set->pmedata;
    }
    else
    {
        /* Tell our PME-only node to switch grid */
        gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff_q, set->ewaldcoeff_lj);
    }

    if (debug)
    {
        print_grid(NULL, debug, "", "switched to", set, -1);
    }

    if (pme_lb->stage == pme_lb->nstage)
    {
        print_grid(fp_err, fp_log, "", "optimal", set, -1);
    }

    return TRUE;
}
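/*
 * Illustrative sketch, not part of the original source: how a caller in the
 * MD loop might drive the balancer above, calling it with the cycle count
 * accumulated since the previous tuning call until it reports that tuning is
 * finished. Only the pme_load_balance() signature is taken from the code
 * above; the surrounding names (pme_loadbal, cycles_pmes, bPMETuneRunning,
 * fr->ic, fr->nbv, fr->pmedata) are assumptions for illustration.
 */
#if 0
if (bPMETuneRunning)
{
    /* cycles_pmes: PME/non-bonded cycles measured since the last call */
    if (!pme_load_balance(pme_loadbal, cr, fp_err, fp_log,
                          ir, state, cycles_pmes,
                          fr->ic, fr->nbv, &fr->pmedata, step))
    {
        /* The balancer has settled on its optimal setup; stop tuning */
        bPMETuneRunning = FALSE;
    }
    cycles_pmes = 0;
}
#endif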
/* Try to increase nstlist when using the Verlet cut-off scheme */
static void increase_nstlist(FILE *fp, t_commrec *cr,
                             t_inputrec *ir, int nstlist_cmdline,
                             const gmx_mtop_t *mtop, matrix box,
                             gmx_bool bGPU)
{
    float                  listfac_ok, listfac_max;
    int                    nstlist_orig, nstlist_prev;
    verletbuf_list_setup_t ls;
    real                   rlistWithReferenceNstlist, rlist_inc, rlist_ok, rlist_max;
    real                   rlist_new, rlist_prev;
    size_t                 nstlist_ind = 0;
    t_state                state_tmp;
    gmx_bool               bBox, bDD, bCont;
    const char            *nstl_gpu = "\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
    const char            *nve_err  = "Can not increase nstlist because an NVE ensemble is used";
    const char            *vbd_err  = "Can not increase nstlist because verlet-buffer-tolerance is not set or used";
    const char            *box_err  = "Can not increase nstlist because the box is too small";
    const char            *dd_err   = "Can not increase nstlist because of domain decomposition limitations";
    char                   buf[STRLEN];
    const float            oneThird = 1.0f / 3.0f;

    if (nstlist_cmdline <= 0)
    {
        if (ir->nstlist == 1)
        {
            /* The user probably set nstlist=1 for a reason,
             * don't mess with the settings.
             */
            return;
        }

        if (fp != NULL && bGPU && ir->nstlist < nstlist_try[0])
        {
            fprintf(fp, nstl_gpu, ir->nstlist);
        }
        nstlist_ind = 0;
        while (nstlist_ind < NNSTL && ir->nstlist >= nstlist_try[nstlist_ind])
        {
            nstlist_ind++;
        }
        if (nstlist_ind == NNSTL)
        {
            /* There are no larger nstlist values to try */
            return;
        }
    }

    if (EI_MD(ir->eI) && ir->etc == etcNO)
    {
        if (MASTER(cr))
        {
            fprintf(stderr, "%s\n", nve_err);
        }
        if (fp != NULL)
        {
            fprintf(fp, "%s\n", nve_err);
        }

        return;
    }

    if (ir->verletbuf_tol == 0 && bGPU)
    {
        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
    }

    if (ir->verletbuf_tol < 0)
    {
        if (MASTER(cr))
        {
            fprintf(stderr, "%s\n", vbd_err);
        }
        if (fp != NULL)
        {
            fprintf(fp, "%s\n", vbd_err);
        }

        return;
    }

    if (bGPU)
    {
        listfac_ok  = nbnxn_gpu_listfac_ok;
        listfac_max = nbnxn_gpu_listfac_max;
    }
    else
    {
        listfac_ok  = nbnxn_cpu_listfac_ok;
        listfac_max = nbnxn_cpu_listfac_max;
    }

    nstlist_orig = ir->nstlist;
    if (nstlist_cmdline > 0)
    {
        if (fp)
        {
            sprintf(buf, "Getting nstlist=%d from command line option",
                    nstlist_cmdline);
        }
        ir->nstlist = nstlist_cmdline;
    }

    verletbuf_get_list_setup(TRUE, bGPU, &ls);

    /* Allow rlist to make the list a given factor larger than the list
     * would be with the reference value for nstlist (10).
     */
    nstlist_prev = ir->nstlist;
    ir->nstlist  = nbnxnReferenceNstlist;
    calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL,
                            &rlistWithReferenceNstlist);
    ir->nstlist  = nstlist_prev;

    /* Determine the pair list size increase due to zero interactions */
    rlist_inc = nbnxn_get_rlist_effective_inc(ls.cluster_size_j,
                                              mtop->natoms/det(box));
    rlist_ok  = (rlistWithReferenceNstlist + rlist_inc)*pow(listfac_ok, oneThird) - rlist_inc;
    rlist_max = (rlistWithReferenceNstlist + rlist_inc)*pow(listfac_max, oneThird) - rlist_inc;
    if (debug)
    {
        fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n",
                rlist_inc, rlist_ok, rlist_max);
    }

    nstlist_prev = nstlist_orig;
    rlist_prev   = ir->rlist;
    do
    {
        if (nstlist_cmdline <= 0)
        {
            ir->nstlist = nstlist_try[nstlist_ind];
        }

        /* Set the pair-list buffer size in ir */
        calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);

        /* Does rlist fit in the box? */
        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
        bDD  = TRUE;
        if (bBox && DOMAINDECOMP(cr))
        {
            /* Check if rlist fits in the domain decomposition */
            if (inputrec2nboundeddim(ir) < DIM)
            {
                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
            }
            copy_mat(box, state_tmp.box);
            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
        }

        if (debug)
        {
            fprintf(debug, "nstlist %d rlist %.3f bBox %d bDD %d\n",
                    ir->nstlist, rlist_new, bBox, bDD);
        }

        bCont = FALSE;

        if (nstlist_cmdline <= 0)
        {
            if (bBox && bDD && rlist_new <= rlist_max)
            {
                /* Increase nstlist */
                nstlist_prev = ir->nstlist;
                rlist_prev   = rlist_new;
                bCont        = (nstlist_ind+1 < NNSTL && rlist_new < rlist_ok);
            }
            else
            {
                /* Stick with the previous nstlist */
                ir->nstlist = nstlist_prev;
                rlist_new   = rlist_prev;
                bBox        = TRUE;
                bDD         = TRUE;
            }
        }

        nstlist_ind++;
    }
    while (bCont);

    if (!bBox || !bDD)
    {
        gmx_warning(!bBox ? box_err : dd_err);
        if (fp != NULL)
        {
            fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
        }
        ir->nstlist = nstlist_orig;
    }
    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
    {
        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
                nstlist_orig, ir->nstlist,
                ir->rlist, rlist_new);
        if (MASTER(cr))
        {
            fprintf(stderr, "%s\n\n", buf);
        }
        if (fp != NULL)
        {
            fprintf(fp, "%s\n\n", buf);
        }
        ir->rlist     = rlist_new;
        ir->rlistlong = rlist_new;
    }
}
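/*
 * Illustrative sketch, not part of the original source: how increase_nstlist()
 * might be invoked during run setup when the Verlet cut-off scheme is active.
 * Only the increase_nstlist() signature is taken from the code above; the
 * surrounding names (fplog, nstlist_cmdline, bUseGPU) are assumptions for
 * illustration.
 */
#if 0
if (ir->cutoff_scheme == ecutsVERLET)
{
    /* Try to raise nstlist (and correspondingly rlist) for better
     * performance, unless the user pinned nstlist on the command line.
     */
    increase_nstlist(fplog, cr, ir, nstlist_cmdline,
                     mtop, state->box, bUseGPU);
}
#endif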