gmx_bool pme_load_balance(pme_load_balancing_t pme_lb, t_commrec *cr, FILE *fp_err, FILE *fp_log, t_inputrec *ir, t_state *state, double cycles, interaction_const_t *ic, struct nonbonded_verlet_t *nbv, struct gmx_pme_t ** pmedata, gmx_int64_t step) { gmx_bool OK; pme_setup_t *set; double cycles_fast; char buf[STRLEN], sbuf[22]; real rtab; gmx_bool bUsesSimpleTables = TRUE; if (pme_lb->stage == pme_lb->nstage) { return FALSE; } if (PAR(cr)) { gmx_sumd(1, &cycles, cr); cycles /= cr->nnodes; } set = &pme_lb->setup[pme_lb->cur]; set->count++; rtab = ir->rlistlong + ir->tabext; if (set->count % 2 == 1) { /* Skip the first cycle, because the first step after a switch * is much slower due to allocation and/or caching effects. */ return TRUE; } sprintf(buf, "step %4s: ", gmx_step_str(step, sbuf)); print_grid(fp_err, fp_log, buf, "timed with", set, cycles); if (set->count <= 2) { set->cycles = cycles; } else { if (cycles*PME_LB_ACCEL_TOL < set->cycles && pme_lb->stage == pme_lb->nstage - 1) { /* The performance went up a lot (due to e.g. DD load balancing). * Add a stage, keep the minima, but rescan all setups. */ pme_lb->nstage++; if (debug) { fprintf(debug, "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n" "Increased the number stages to %d" " and ignoring the previous performance\n", set->grid[XX], set->grid[YY], set->grid[ZZ], cycles*1e-6, set->cycles*1e-6, PME_LB_ACCEL_TOL, pme_lb->nstage); } } set->cycles = min(set->cycles, cycles); } if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles) { pme_lb->fastest = pme_lb->cur; if (DOMAINDECOMP(cr)) { /* We found a new fastest setting, ensure that with subsequent * shorter cut-off's the dynamic load balancing does not make * the use of the current cut-off impossible. This solution is * a trade-off, as the PME load balancing and DD domain size * load balancing can interact in complex ways. * With the Verlet kernels, DD load imbalance will usually be * mainly due to bonded interaction imbalance, which will often * quickly push the domain boundaries beyond the limit for the * optimal, PME load balanced, cut-off. But it could be that * better overal performance can be obtained with a slightly * shorter cut-off and better DD load balancing. */ change_dd_dlb_cutoff_limit(cr); } } cycles_fast = pme_lb->setup[pme_lb->fastest].cycles; /* Check in stage 0 if we should stop scanning grids. * Stop when the time is more than SLOW_FAC longer than the fastest. */ if (pme_lb->stage == 0 && pme_lb->cur > 0 && cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC) { pme_lb->n = pme_lb->cur + 1; /* Done with scanning, go to stage 1 */ switch_to_stage1(pme_lb); } if (pme_lb->stage == 0) { int gridsize_start; gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ]; do { if (pme_lb->cur+1 < pme_lb->n) { /* We had already generated the next setup */ OK = TRUE; } else { /* Find the next setup */ OK = pme_loadbal_increase_cutoff(pme_lb, ir->pme_order, cr->dd); if (!OK) { pme_lb->elimited = epmelblimPMEGRID; } } if (OK && ir->ePBC != epbcNONE) { OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlistlong) <= max_cutoff2(ir->ePBC, state->box)); if (!OK) { pme_lb->elimited = epmelblimBOX; } } if (OK) { pme_lb->cur++; if (DOMAINDECOMP(cr)) { OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistlong); if (!OK) { /* Failed: do not use this setup */ pme_lb->cur--; pme_lb->elimited = epmelblimDD; } } } if (!OK) { /* We hit the upper limit for the cut-off, * the setup should not go further than cur. */ pme_lb->n = pme_lb->cur + 1; print_loadbal_limited(fp_err, fp_log, step, pme_lb); /* Switch to the next stage */ switch_to_stage1(pme_lb); } } while (OK && !(pme_lb->setup[pme_lb->cur].grid[XX]* pme_lb->setup[pme_lb->cur].grid[YY]* pme_lb->setup[pme_lb->cur].grid[ZZ] < gridsize_start*PME_LB_GRID_SCALE_FAC && pme_lb->setup[pme_lb->cur].grid_efficiency < pme_lb->setup[pme_lb->cur-1].grid_efficiency*PME_LB_GRID_EFFICIENCY_REL_FAC)); } if (pme_lb->stage > 0 && pme_lb->end == 1) { pme_lb->cur = 0; pme_lb->stage = pme_lb->nstage; } else if (pme_lb->stage > 0 && pme_lb->end > 1) { /* If stage = nstage-1: * scan over all setups, rerunning only those setups * which are not much slower than the fastest * else: * use the next setup */ do { pme_lb->cur++; if (pme_lb->cur == pme_lb->end) { pme_lb->stage++; pme_lb->cur = pme_lb->start; } } while (pme_lb->stage == pme_lb->nstage - 1 && pme_lb->setup[pme_lb->cur].count > 0 && pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC); if (pme_lb->stage == pme_lb->nstage) { /* We are done optimizing, use the fastest setup we found */ pme_lb->cur = pme_lb->fastest; } } if (DOMAINDECOMP(cr) && pme_lb->stage > 0) { OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistlong); if (!OK) { /* Failsafe solution */ if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage) { pme_lb->stage--; } pme_lb->fastest = 0; pme_lb->start = 0; pme_lb->end = pme_lb->cur; pme_lb->cur = pme_lb->start; pme_lb->elimited = epmelblimDD; print_loadbal_limited(fp_err, fp_log, step, pme_lb); } } /* Change the Coulomb cut-off and the PME grid */ set = &pme_lb->setup[pme_lb->cur]; ic->rcoulomb = set->rcut_coulomb; ic->rlist = set->rlist; ic->rlistlong = set->rlistlong; ir->nstcalclr = set->nstcalclr; ic->ewaldcoeff_q = set->ewaldcoeff_q; /* TODO: centralize the code that sets the potentials shifts */ if (ic->coulomb_modifier == eintmodPOTSHIFT) { ic->sh_ewald = gmx_erfc(ic->ewaldcoeff_q*ic->rcoulomb); } if (EVDW_PME(ic->vdwtype)) { /* We have PME for both Coulomb and VdW, set rvdw equal to rcoulomb */ ic->rvdw = set->rcut_coulomb; ic->ewaldcoeff_lj = set->ewaldcoeff_lj; if (ic->vdw_modifier == eintmodPOTSHIFT) { real crc2; ic->dispersion_shift.cpot = -pow(ic->rvdw, -6.0); ic->repulsion_shift.cpot = -pow(ic->rvdw, -12.0); ic->sh_invrc6 = -ic->dispersion_shift.cpot; crc2 = sqr(ic->ewaldcoeff_lj*ic->rvdw); ic->sh_lj_ewald = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0); } } bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0); nbnxn_gpu_pme_loadbal_update_param(nbv, ic); /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore * also sharing texture references. To keep the code simple, we don't * treat texture references as shared resources, but this means that * the coulomb_tab texture ref will get updated by multiple threads. * Hence, to ensure that the non-bonded kernels don't start before all * texture binding operations are finished, we need to wait for all ranks * to arrive here before continuing. * * Note that we could omit this barrier if GPUs are not shared (or * texture objects are used), but as this is initialization code, there * is not point in complicating things. */ #ifdef GMX_THREAD_MPI if (PAR(cr) && use_GPU(nbv)) { gmx_barrier(cr); } #endif /* GMX_THREAD_MPI */ /* Usually we won't need the simple tables with GPUs. * But we do with hybrid acceleration and with free energy. * To avoid bugs, we always re-initialize the simple tables here. */ init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab); if (cr->duty & DUTY_PME) { if (pme_lb->setup[pme_lb->cur].pmedata == NULL) { /* Generate a new PME data structure, * copying part of the old pointers. */ gmx_pme_reinit(&set->pmedata, cr, pme_lb->setup[0].pmedata, ir, set->grid); } *pmedata = set->pmedata; } else { /* Tell our PME-only node to switch grid */ gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff_q, set->ewaldcoeff_lj); } if (debug) { print_grid(NULL, debug, "", "switched to", set, -1); } if (pme_lb->stage == pme_lb->nstage) { print_grid(fp_err, fp_log, "", "optimal", set, -1); } return TRUE; }
void pme_loadbal_do(pme_load_balancing_t *pme_lb, t_commrec *cr, FILE *fp_err, FILE *fp_log, t_inputrec *ir, t_forcerec *fr, t_state *state, gmx_wallcycle_t wcycle, gmx_int64_t step, gmx_int64_t step_rel, gmx_bool *bPrinting) { int n_prev; double cycles_prev; assert(pme_lb != NULL); if (!pme_lb->bActive) { return; } n_prev = pme_lb->cycles_n; cycles_prev = pme_lb->cycles_c; wallcycle_get(wcycle, ewcSTEP, &pme_lb->cycles_n, &pme_lb->cycles_c); if (pme_lb->cycles_n == 0) { /* Before the first step we haven't done any steps yet */ return; } /* Sanity check, we expect nstlist cycle counts */ if (pme_lb->cycles_n - n_prev != ir->nstlist) { /* We could return here, but it's safer to issue and error and quit */ gmx_incons("pme_loadbal_do called at an interval != nstlist"); } /* PME grid + cut-off optimization with GPUs or PME ranks */ if (!pme_lb->bBalance && pme_lb->bSepPMERanks) { if (DDMASTER(cr->dd)) { /* PME rank load is too high, start tuning */ pme_lb->bBalance = (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor); } dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance); if (pme_lb->bBalance && use_GPU(fr->nbv) && DOMAINDECOMP(cr) && pme_lb->bSepPMERanks) { /* Lock DLB=auto to off (does nothing when DLB=yes/no). * With GPUs + separate PME ranks, we don't want DLB. * This could happen when we scan coarse grids and * it would then never be turned off again. * This would hurt performance at the final, optimal * grid spacing, where DLB almost never helps. * Also, DLB can limit the cut-off for PME tuning. */ dd_dlb_set_lock(cr->dd, TRUE); } } if (pme_lb->bBalance) { /* init_step might not be a multiple of nstlist, * but the first cycle is always skipped anyhow. */ pme_lb->bBalance = pme_load_balance(pme_lb, cr, fp_err, fp_log, ir, state, pme_lb->cycles_c - cycles_prev, fr->ic, fr->nbv, &fr->pmedata, step); /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */ fr->ewaldcoeff_q = fr->ic->ewaldcoeff_q; fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj; fr->rlist = fr->ic->rlist; fr->rlistlong = fr->ic->rlistlong; fr->rcoulomb = fr->ic->rcoulomb; fr->rvdw = fr->ic->rvdw; if (ir->eDispCorr != edispcNO) { calc_enervirdiff(NULL, ir->eDispCorr, fr); } if (!pme_lb->bBalance && DOMAINDECOMP(cr) && dd_dlb_is_locked(cr->dd)) { /* Unlock the DLB=auto, DLB is allowed to activate * (but we don't expect it to activate in most cases). */ dd_dlb_set_lock(cr->dd, FALSE); } } if (!pme_lb->bBalance && (!pme_lb->bSepPMERanks || (step_rel <= PMETunePeriod*ir->nstlist))) { /* We have just deactivated the balancing and we're not measuring PP/PME * imbalance during the first 50*nstlist steps: deactivate the tuning. */ pme_lb->bActive = FALSE; } *bPrinting = pme_lb->bBalance; }