void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, t_commrec *cr, FILE *fp_log, const t_inputrec *ir, matrix box, const interaction_const_t *ic, struct gmx_pme_t *pmedata, gmx_bool bUseGPU, gmx_bool *bPrinting) { pme_load_balancing_t *pme_lb; real spm, sp; int d; snew(pme_lb, 1); pme_lb->bSepPMERanks = !(cr->duty & DUTY_PME); /* Initially we turn on balancing directly on based on PP/PME imbalance */ pme_lb->bTriggerOnDLB = FALSE; /* Any number of stages >= 2 is supported */ pme_lb->nstage = 2; pme_lb->cutoff_scheme = ir->cutoff_scheme; if (pme_lb->cutoff_scheme == ecutsVERLET) { pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb; pme_lb->rbuf_vdw = pme_lb->rbuf_coulomb; } else { if (ic->rcoulomb > ic->rlist) { pme_lb->rbuf_coulomb = ic->rlistlong - ic->rcoulomb; } else { pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb; } if (ic->rvdw > ic->rlist) { pme_lb->rbuf_vdw = ic->rlistlong - ic->rvdw; } else { pme_lb->rbuf_vdw = ic->rlist - ic->rvdw; } } copy_mat(box, pme_lb->box_start); if (ir->ePBC == epbcXY && ir->nwall == 2) { svmul(ir->wall_ewald_zfac, pme_lb->box_start[ZZ], pme_lb->box_start[ZZ]); } pme_lb->n = 1; snew(pme_lb->setup, pme_lb->n); pme_lb->rcut_vdw = ic->rvdw; pme_lb->rcut_coulomb_start = ir->rcoulomb; pme_lb->nstcalclr_start = ir->nstcalclr; pme_lb->cur = 0; pme_lb->setup[0].rcut_coulomb = ic->rcoulomb; pme_lb->setup[0].rlist = ic->rlist; pme_lb->setup[0].rlistlong = ic->rlistlong; pme_lb->setup[0].nstcalclr = ir->nstcalclr; pme_lb->setup[0].grid[XX] = ir->nkx; pme_lb->setup[0].grid[YY] = ir->nky; pme_lb->setup[0].grid[ZZ] = ir->nkz; pme_lb->setup[0].ewaldcoeff_q = ic->ewaldcoeff_q; pme_lb->setup[0].ewaldcoeff_lj = ic->ewaldcoeff_lj; pme_lb->setup[0].pmedata = pmedata; spm = 0; for (d = 0; d < DIM; d++) { sp = norm(pme_lb->box_start[d])/pme_lb->setup[0].grid[d]; if (sp > spm) { spm = sp; } } pme_lb->setup[0].spacing = spm; if (ir->fourier_spacing > 0) { pme_lb->cut_spacing = ir->rcoulomb/ir->fourier_spacing; } else { pme_lb->cut_spacing = ir->rcoulomb/pme_lb->setup[0].spacing; } pme_lb->stage = 0; pme_lb->fastest = 0; pme_lb->lower_limit = 0; pme_lb->start = 0; pme_lb->end = 0; pme_lb->elimited = epmelblimNO; pme_lb->cycles_n = 0; pme_lb->cycles_c = 0; /* Tune with GPUs and/or separate PME ranks. * When running only on a CPU without PME ranks, PME tuning will only help * with small numbers of atoms in the cut-off sphere. */ pme_lb->bActive = (wallcycle_have_counter() && (bUseGPU || pme_lb->bSepPMERanks)); /* With GPUs and no separate PME ranks we can't measure the PP/PME * imbalance, so we start balancing right away. * Otherwise we only start balancing after we observe imbalance. */ pme_lb->bBalance = (pme_lb->bActive && (bUseGPU && !pme_lb->bSepPMERanks)); pme_lb->step_rel_stop = PMETunePeriod*ir->nstlist; /* Delay DD load balancing when GPUs are used */ if (pme_lb->bActive && DOMAINDECOMP(cr) && cr->dd->nnodes > 1 && bUseGPU) { /* Lock DLB=auto to off (does nothing when DLB=yes/no. * With GPUs and separate PME nodes, we want to first * do PME tuning without DLB, since DLB might limit * the cut-off, which never improves performance. * We allow for DLB + PME tuning after a first round of tuning. */ dd_dlb_lock(cr->dd); if (dd_dlb_is_locked(cr->dd)) { md_print_warn(cr, fp_log, "NOTE: DLB will not turn on during the first phase of PME tuning\n"); } } *pme_lb_p = pme_lb; *bPrinting = pme_lb->bBalance; }
void pme_loadbal_do(pme_load_balancing_t *pme_lb, t_commrec *cr, FILE *fp_err, FILE *fp_log, t_inputrec *ir, t_forcerec *fr, t_state *state, gmx_wallcycle_t wcycle, gmx_int64_t step, gmx_int64_t step_rel, gmx_bool *bPrinting) { int n_prev; double cycles_prev; assert(pme_lb != NULL); if (!pme_lb->bActive) { return; } n_prev = pme_lb->cycles_n; cycles_prev = pme_lb->cycles_c; wallcycle_get(wcycle, ewcSTEP, &pme_lb->cycles_n, &pme_lb->cycles_c); if (pme_lb->cycles_n == 0) { /* Before the first step we haven't done any steps yet */ return; } /* Sanity check, we expect nstlist cycle counts */ if (pme_lb->cycles_n - n_prev != ir->nstlist) { /* We could return here, but it's safer to issue and error and quit */ gmx_incons("pme_loadbal_do called at an interval != nstlist"); } /* PME grid + cut-off optimization with GPUs or PME ranks */ if (!pme_lb->bBalance && pme_lb->bSepPMERanks) { if (pme_lb->bTriggerOnDLB) { pme_lb->bBalance = dd_dlb_is_on(cr->dd); } /* We should ignore the first timing to avoid timing allocation * overhead. And since the PME load balancing is called just * before DD repartitioning, the ratio returned by dd_pme_f_ratio * is not over the last nstlist steps, but the nstlist steps before * that. So the first useful ratio is available at step_rel=3*nstlist. */ else if (step_rel >= 3*ir->nstlist) { if (DDMASTER(cr->dd)) { /* If PME rank load is too high, start tuning */ pme_lb->bBalance = (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor); } dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance); } pme_lb->bActive = (pme_lb->bBalance || step_rel <= pme_lb->step_rel_stop); } /* The location in the code of this balancing termination is strange. * You would expect to have it after the call to pme_load_balance() * below, since there pme_lb->stage is updated. * But when terminating directly after deciding on and selecting the * optimal setup, DLB will turn on right away if it was locked before. * This might be due to PME reinitialization. So we check stage here * to allow for another nstlist steps with DLB locked to stabilize * the performance. */ if (pme_lb->bBalance && pme_lb->stage == pme_lb->nstage) { pme_lb->bBalance = FALSE; if (DOMAINDECOMP(cr) && dd_dlb_is_locked(cr->dd)) { /* Unlock the DLB=auto, DLB is allowed to activate */ dd_dlb_unlock(cr->dd); md_print_warn(cr, fp_log, "NOTE: DLB can now turn on, when beneficial\n"); /* We don't deactivate the tuning yet, since we will balance again * after DLB gets turned on, if it does within PMETune_period. */ continue_pme_loadbal(pme_lb, TRUE); pme_lb->bTriggerOnDLB = TRUE; pme_lb->step_rel_stop = step_rel + PMETunePeriod*ir->nstlist; } else { /* We're completely done with PME tuning */ pme_lb->bActive = FALSE; } if (DOMAINDECOMP(cr)) { /* Set the cut-off limit to the final selected cut-off, * so we don't have artificial DLB limits. * This also ensures that we won't disable the currently * optimal setting during a second round of PME balancing. */ set_dd_dlb_max_cutoff(cr, fr->ic->rlistlong); } } if (pme_lb->bBalance) { /* We might not have collected nstlist steps in cycles yet, * since init_step might not be a multiple of nstlist, * but the first data collected is skipped anyhow. */ pme_load_balance(pme_lb, cr, fp_err, fp_log, ir, state, pme_lb->cycles_c - cycles_prev, fr->ic, fr->nbv, &fr->pmedata, step); /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */ fr->ewaldcoeff_q = fr->ic->ewaldcoeff_q; fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj; fr->rlist = fr->ic->rlist; fr->rlistlong = fr->ic->rlistlong; fr->rcoulomb = fr->ic->rcoulomb; fr->rvdw = fr->ic->rvdw; if (ir->eDispCorr != edispcNO) { calc_enervirdiff(NULL, ir->eDispCorr, fr); } } if (!pme_lb->bBalance && (!pme_lb->bSepPMERanks || step_rel > pme_lb->step_rel_stop)) { /* We have just deactivated the balancing and we're not measuring PP/PME * imbalance during the first steps of the run: deactivate the tuning. */ pme_lb->bActive = FALSE; } if (!(pme_lb->bActive) && DOMAINDECOMP(cr) && dd_dlb_is_locked(cr->dd)) { /* Make sure DLB is allowed when we deactivate PME tuning */ dd_dlb_unlock(cr->dd); md_print_warn(cr, fp_log, "NOTE: DLB can now turn on, when beneficial\n"); } *bPrinting = pme_lb->bBalance; }
void pme_loadbal_do(pme_load_balancing_t *pme_lb, t_commrec *cr, FILE *fp_err, FILE *fp_log, t_inputrec *ir, t_forcerec *fr, t_state *state, gmx_wallcycle_t wcycle, gmx_int64_t step, gmx_int64_t step_rel, gmx_bool *bPrinting) { int n_prev; double cycles_prev; assert(pme_lb != NULL); if (!pme_lb->bActive) { return; } n_prev = pme_lb->cycles_n; cycles_prev = pme_lb->cycles_c; wallcycle_get(wcycle, ewcSTEP, &pme_lb->cycles_n, &pme_lb->cycles_c); if (pme_lb->cycles_n == 0) { /* Before the first step we haven't done any steps yet */ return; } /* Sanity check, we expect nstlist cycle counts */ if (pme_lb->cycles_n - n_prev != ir->nstlist) { /* We could return here, but it's safer to issue and error and quit */ gmx_incons("pme_loadbal_do called at an interval != nstlist"); } /* PME grid + cut-off optimization with GPUs or PME ranks */ if (!pme_lb->bBalance && pme_lb->bSepPMERanks) { if (DDMASTER(cr->dd)) { /* PME rank load is too high, start tuning */ pme_lb->bBalance = (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor); } dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance); if (pme_lb->bBalance && use_GPU(fr->nbv) && DOMAINDECOMP(cr) && pme_lb->bSepPMERanks) { /* Lock DLB=auto to off (does nothing when DLB=yes/no). * With GPUs + separate PME ranks, we don't want DLB. * This could happen when we scan coarse grids and * it would then never be turned off again. * This would hurt performance at the final, optimal * grid spacing, where DLB almost never helps. * Also, DLB can limit the cut-off for PME tuning. */ dd_dlb_set_lock(cr->dd, TRUE); } } if (pme_lb->bBalance) { /* init_step might not be a multiple of nstlist, * but the first cycle is always skipped anyhow. */ pme_lb->bBalance = pme_load_balance(pme_lb, cr, fp_err, fp_log, ir, state, pme_lb->cycles_c - cycles_prev, fr->ic, fr->nbv, &fr->pmedata, step); /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */ fr->ewaldcoeff_q = fr->ic->ewaldcoeff_q; fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj; fr->rlist = fr->ic->rlist; fr->rlistlong = fr->ic->rlistlong; fr->rcoulomb = fr->ic->rcoulomb; fr->rvdw = fr->ic->rvdw; if (ir->eDispCorr != edispcNO) { calc_enervirdiff(NULL, ir->eDispCorr, fr); } if (!pme_lb->bBalance && DOMAINDECOMP(cr) && dd_dlb_is_locked(cr->dd)) { /* Unlock the DLB=auto, DLB is allowed to activate * (but we don't expect it to activate in most cases). */ dd_dlb_set_lock(cr->dd, FALSE); } } if (!pme_lb->bBalance && (!pme_lb->bSepPMERanks || (step_rel <= PMETunePeriod*ir->nstlist))) { /* We have just deactivated the balancing and we're not measuring PP/PME * imbalance during the first 50*nstlist steps: deactivate the tuning. */ pme_lb->bActive = FALSE; } *bPrinting = pme_lb->bBalance; }