void dd_collect_state(gmx_domdec_t *dd,
                      const t_state *state_local, t_state *state)
{
    int nh = state_local->nhchainlength;

    if (DDMASTER(dd))
    {
        GMX_RELEASE_ASSERT(state->nhchainlength == nh,
                           "The global and local Nose-Hoover chain lengths should match");

        for (int i = 0; i < efptNR; i++)
        {
            state->lambda[i] = state_local->lambda[i];
        }
        state->fep_state = state_local->fep_state;
        state->veta      = state_local->veta;
        state->vol0      = state_local->vol0;
        copy_mat(state_local->box, state->box);
        copy_mat(state_local->boxv, state->boxv);
        copy_mat(state_local->svir_prev, state->svir_prev);
        copy_mat(state_local->fvir_prev, state->fvir_prev);
        copy_mat(state_local->pres_prev, state->pres_prev);

        for (int i = 0; i < state_local->ngtc; i++)
        {
            for (int j = 0; j < nh; j++)
            {
                state->nosehoover_xi[i*nh+j]  = state_local->nosehoover_xi[i*nh+j];
                state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
            }
            state->therm_integral[i] = state_local->therm_integral[i];
        }
        for (int i = 0; i < state_local->nnhpres; i++)
        {
            for (int j = 0; j < nh; j++)
            {
                state->nhpres_xi[i*nh+j]  = state_local->nhpres_xi[i*nh+j];
                state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
            }
        }
        state->baros_integral = state_local->baros_integral;
    }
    if (state_local->flags & (1 << estX))
    {
        gmx::ArrayRef<gmx::RVec> globalXRef = state ? makeArrayRef(state->x) : gmx::EmptyArrayRef();
        dd_collect_vec(dd, state_local, makeConstArrayRef(state_local->x), globalXRef);
    }
    if (state_local->flags & (1 << estV))
    {
        gmx::ArrayRef<gmx::RVec> globalVRef = state ? makeArrayRef(state->v) : gmx::EmptyArrayRef();
        dd_collect_vec(dd, state_local, makeConstArrayRef(state_local->v), globalVRef);
    }
    if (state_local->flags & (1 << estCGP))
    {
        gmx::ArrayRef<gmx::RVec> globalCgpRef = state ? makeArrayRef(state->cg_p) : gmx::EmptyArrayRef();
        dd_collect_vec(dd, state_local, makeConstArrayRef(state_local->cg_p), globalCgpRef);
    }
}
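/*
 * A minimal standalone sketch (not GROMACS code) of the bit-flag test used
 * above: each state entry has an enum index, and the "flags" word has the
 * matching bit set when that entry is present.  The enum names here are
 * hypothetical stand-ins for estX/estV/estCGP.
 */
#include <cstdio>

enum StateEntry { entryX, entryV, entryCGP };   /* hypothetical indices */

static bool hasEntry(int flags, StateEntry e)
{
    return (flags & (1 << e)) != 0;
}

int main()
{
    int flags = (1 << entryX) | (1 << entryV);  /* x and v present, cg_p not */

    std::printf("x: %d  v: %d  cg_p: %d\n",
                hasEntry(flags, entryX), hasEntry(flags, entryV), hasEntry(flags, entryCGP));
    return 0;
}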
void dd_bcastc(gmx_domdec_t *dd, int nbytes, void *src, void *dest)
{
    if (DDMASTER(dd))
    {
        memcpy(dest, src, nbytes);
    }
#ifdef GMX_MPI
    MPI_Bcast(dest, nbytes, MPI_BYTE,
              DDMASTERRANK(dd), dd->mpi_comm_all);
#endif
}
void dd_bcastc(const gmx_domdec_t *dd, int nbytes, void *src, void *dest)
{
    if (DDMASTER(dd) || dd->nnodes == 1)
    {
        memcpy(dest, src, nbytes);
    }
#if GMX_MPI
    if (dd->nnodes > 1)
    {
        MPI_Bcast(dest, nbytes, MPI_BYTE,
                  DDMASTERRANK(dd), dd->mpi_comm_all);
    }
#endif
}
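/*
 * Minimal standalone MPI sketch (not GROMACS code) of the idiom used in
 * dd_bcastc() above: the root rank fills its destination buffer with memcpy
 * and MPI_Bcast then replicates that buffer on every other rank.
 */
#include <cstdio>
#include <cstring>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    const int root    = 0;
    double    src[3]  = { 1.0, 2.0, 3.0 };   /* only meaningful on the root */
    double    dest[3] = { 0.0, 0.0, 0.0 };

    if (rank == root)
    {
        std::memcpy(dest, src, sizeof(dest));
    }
    /* Byte-wise broadcast, mirroring the nbytes/MPI_BYTE style above */
    MPI_Bcast(dest, sizeof(dest), MPI_BYTE, root, MPI_COMM_WORLD);

    std::printf("rank %d: %g %g %g\n", rank, dest[0], dest[1], dest[2]);

    MPI_Finalize();
    return 0;
}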
static void dd_collect_vec_gatherv(gmx_domdec_t                  *dd,
                                   gmx::ArrayRef<const gmx::RVec> lv,
                                   gmx::ArrayRef<gmx::RVec>       v)
{
    int *recvCounts    = nullptr;
    int *displacements = nullptr;

    if (DDMASTER(dd))
    {
        get_commbuffer_counts(dd->ma.get(), &recvCounts, &displacements);
    }

    const int numHomeAtoms = dd->comm->atomRanges.numHomeAtoms();
    dd_gatherv(dd, numHomeAtoms*sizeof(rvec), lv.data(), recvCounts, displacements,
               DDMASTER(dd) ? dd->ma->rvecBuffer.data() : nullptr);

    if (DDMASTER(dd))
    {
        const AtomDistribution &ma     = *dd->ma;
        const t_block          &cgs_gl = dd->comm->cgs_gl;

        int bufferAtom = 0;
        for (int rank = 0; rank < dd->nnodes; rank++)
        {
            const auto &domainGroups = ma.domainGroups[rank];
            for (const int &cg : domainGroups.atomGroups)
            {
                for (int globalAtom = cgs_gl.index[cg]; globalAtom < cgs_gl.index[cg + 1]; globalAtom++)
                {
                    copy_rvec(ma.rvecBuffer[bufferAtom++], v[globalAtom]);
                }
            }
        }
    }
}
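/*
 * Minimal standalone MPI sketch (not GROMACS code) of the gatherv pattern in
 * dd_collect_vec_gatherv(): the root first learns how much each rank will
 * send, builds displacements from those counts, and then gathers the
 * variable-length contributions into a single buffer.
 */
#include <cstdio>
#include <vector>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int root = 0;

    /* Each rank contributes rank+1 values (a stand-in for numHomeAtoms) */
    int                 sendCount = rank + 1;
    std::vector<double> local(sendCount, static_cast<double>(rank));

    /* The root collects the per-rank counts */
    std::vector<int> recvCounts(rank == root ? size : 0);
    MPI_Gather(&sendCount, 1, MPI_INT,
               recvCounts.data(), 1, MPI_INT, root, MPI_COMM_WORLD);

    /* The root turns counts into displacements and sizes the receive buffer */
    std::vector<int>    displacements;
    std::vector<double> global;
    if (rank == root)
    {
        displacements.resize(size);
        int offset = 0;
        for (int r = 0; r < size; r++)
        {
            displacements[r] = offset;
            offset          += recvCounts[r];
        }
        global.resize(offset);
    }

    MPI_Gatherv(local.data(), sendCount, MPI_DOUBLE,
                global.data(), recvCounts.data(), displacements.data(),
                MPI_DOUBLE, root, MPI_COMM_WORLD);

    if (rank == root)
    {
        std::printf("gathered %zu values on the root\n", global.size());
    }

    MPI_Finalize();
    return 0;
}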
void set_ddbox(gmx_domdec_t *dd, gmx_bool bMasterState, t_commrec *cr_sum,
               t_inputrec *ir, matrix box,
               gmx_bool bCalcUnboundedSize, t_block *cgs, rvec *x,
               gmx_ddbox_t *ddbox)
{
    if (!bMasterState || DDMASTER(dd))
    {
        low_set_ddbox(ir, &dd->nc, box, bCalcUnboundedSize,
                      bMasterState ? cgs->nr : dd->ncg_home, cgs, x,
                      bMasterState ? NULL : cr_sum,
                      ddbox);
    }

    if (bMasterState)
    {
        dd_bcast(dd, sizeof(gmx_ddbox_t), ddbox);
    }
}
void pme_loadbal_do(pme_load_balancing_t *pme_lb,
                    t_commrec            *cr,
                    FILE                 *fp_err,
                    FILE                 *fp_log,
                    t_inputrec           *ir,
                    t_forcerec           *fr,
                    t_state              *state,
                    gmx_wallcycle_t       wcycle,
                    gmx_int64_t           step,
                    gmx_int64_t           step_rel,
                    gmx_bool             *bPrinting)
{
    int    n_prev;
    double cycles_prev;

    assert(pme_lb != NULL);

    if (!pme_lb->bActive)
    {
        return;
    }

    n_prev      = pme_lb->cycles_n;
    cycles_prev = pme_lb->cycles_c;
    wallcycle_get(wcycle, ewcSTEP, &pme_lb->cycles_n, &pme_lb->cycles_c);
    if (pme_lb->cycles_n == 0)
    {
        /* Before the first step we haven't done any steps yet */
        return;
    }
    /* Sanity check, we expect nstlist cycle counts */
    if (pme_lb->cycles_n - n_prev != ir->nstlist)
    {
        /* We could return here, but it's safer to issue an error and quit */
        gmx_incons("pme_loadbal_do called at an interval != nstlist");
    }

    /* PME grid + cut-off optimization with GPUs or PME ranks */
    if (!pme_lb->bBalance && pme_lb->bSepPMERanks)
    {
        if (pme_lb->bTriggerOnDLB)
        {
            pme_lb->bBalance = dd_dlb_is_on(cr->dd);
        }
        /* We should ignore the first timing to avoid timing allocation
         * overhead. And since the PME load balancing is called just
         * before DD repartitioning, the ratio returned by dd_pme_f_ratio
         * is not over the last nstlist steps, but the nstlist steps before
         * that. So the first useful ratio is available at step_rel=3*nstlist.
         */
        else if (step_rel >= 3*ir->nstlist)
        {
            if (DDMASTER(cr->dd))
            {
                /* If PME rank load is too high, start tuning */
                pme_lb->bBalance =
                    (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
            }
            dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance);
        }

        pme_lb->bActive = (pme_lb->bBalance ||
                           step_rel <= pme_lb->step_rel_stop);
    }

    /* The location in the code of this balancing termination is strange.
     * You would expect to have it after the call to pme_load_balance()
     * below, since there pme_lb->stage is updated.
     * But when terminating directly after deciding on and selecting the
     * optimal setup, DLB will turn on right away if it was locked before.
     * This might be due to PME reinitialization. So we check stage here
     * to allow for another nstlist steps with DLB locked to stabilize
     * the performance.
     */
    if (pme_lb->bBalance && pme_lb->stage == pme_lb->nstage)
    {
        pme_lb->bBalance = FALSE;

        if (DOMAINDECOMP(cr) && dd_dlb_is_locked(cr->dd))
        {
            /* Unlock the DLB=auto, DLB is allowed to activate */
            dd_dlb_unlock(cr->dd);
            md_print_warn(cr, fp_log, "NOTE: DLB can now turn on, when beneficial\n");

            /* We don't deactivate the tuning yet, since we will balance again
             * after DLB gets turned on, if it does within PMETunePeriod.
             */
            continue_pme_loadbal(pme_lb, TRUE);
            pme_lb->bTriggerOnDLB = TRUE;
            pme_lb->step_rel_stop = step_rel + PMETunePeriod*ir->nstlist;
        }
        else
        {
            /* We're completely done with PME tuning */
            pme_lb->bActive = FALSE;
        }

        if (DOMAINDECOMP(cr))
        {
            /* Set the cut-off limit to the final selected cut-off,
             * so we don't have artificial DLB limits.
             * This also ensures that we won't disable the currently
             * optimal setting during a second round of PME balancing.
             */
            set_dd_dlb_max_cutoff(cr, fr->ic->rlistlong);
        }
    }

    if (pme_lb->bBalance)
    {
        /* We might not have collected nstlist steps in cycles yet,
         * since init_step might not be a multiple of nstlist,
         * but the first data collected is skipped anyhow.
         */
        pme_load_balance(pme_lb, cr, fp_err, fp_log, ir, state,
                         pme_lb->cycles_c - cycles_prev,
                         fr->ic, fr->nbv, &fr->pmedata,
                         step);

        /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
        fr->ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
        fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
        fr->rlist         = fr->ic->rlist;
        fr->rlistlong     = fr->ic->rlistlong;
        fr->rcoulomb      = fr->ic->rcoulomb;
        fr->rvdw          = fr->ic->rvdw;

        if (ir->eDispCorr != edispcNO)
        {
            calc_enervirdiff(NULL, ir->eDispCorr, fr);
        }
    }

    if (!pme_lb->bBalance &&
        (!pme_lb->bSepPMERanks || step_rel > pme_lb->step_rel_stop))
    {
        /* We have just deactivated the balancing and we're not measuring PP/PME
         * imbalance during the first steps of the run: deactivate the tuning.
         */
        pme_lb->bActive = FALSE;
    }

    if (!(pme_lb->bActive) && DOMAINDECOMP(cr) && dd_dlb_is_locked(cr->dd))
    {
        /* Make sure DLB is allowed when we deactivate PME tuning */
        dd_dlb_unlock(cr->dd);
        md_print_warn(cr, fp_log, "NOTE: DLB can now turn on, when beneficial\n");
    }

    *bPrinting = pme_lb->bBalance;
}
void pme_loadbal_do(pme_load_balancing_t *pme_lb,
                    t_commrec            *cr,
                    FILE                 *fp_err,
                    FILE                 *fp_log,
                    t_inputrec           *ir,
                    t_forcerec           *fr,
                    t_state              *state,
                    gmx_wallcycle_t       wcycle,
                    gmx_int64_t           step,
                    gmx_int64_t           step_rel,
                    gmx_bool             *bPrinting)
{
    int    n_prev;
    double cycles_prev;

    assert(pme_lb != NULL);

    if (!pme_lb->bActive)
    {
        return;
    }

    n_prev      = pme_lb->cycles_n;
    cycles_prev = pme_lb->cycles_c;
    wallcycle_get(wcycle, ewcSTEP, &pme_lb->cycles_n, &pme_lb->cycles_c);
    if (pme_lb->cycles_n == 0)
    {
        /* Before the first step we haven't done any steps yet */
        return;
    }
    /* Sanity check, we expect nstlist cycle counts */
    if (pme_lb->cycles_n - n_prev != ir->nstlist)
    {
        /* We could return here, but it's safer to issue an error and quit */
        gmx_incons("pme_loadbal_do called at an interval != nstlist");
    }

    /* PME grid + cut-off optimization with GPUs or PME ranks */
    if (!pme_lb->bBalance && pme_lb->bSepPMERanks)
    {
        if (DDMASTER(cr->dd))
        {
            /* If PME rank load is too high, start tuning */
            pme_lb->bBalance =
                (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
        }
        dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance);

        if (pme_lb->bBalance &&
            use_GPU(fr->nbv) && DOMAINDECOMP(cr) &&
            pme_lb->bSepPMERanks)
        {
            /* Lock DLB=auto to off (does nothing when DLB=yes/no).
             * With GPUs + separate PME ranks, we don't want DLB.
             * This could happen when we scan coarse grids and
             * it would then never be turned off again.
             * This would hurt performance at the final, optimal
             * grid spacing, where DLB almost never helps.
             * Also, DLB can limit the cut-off for PME tuning.
             */
            dd_dlb_set_lock(cr->dd, TRUE);
        }
    }

    if (pme_lb->bBalance)
    {
        /* init_step might not be a multiple of nstlist,
         * but the first cycle is always skipped anyhow.
         */
        pme_lb->bBalance =
            pme_load_balance(pme_lb, cr, fp_err, fp_log, ir, state,
                             pme_lb->cycles_c - cycles_prev,
                             fr->ic, fr->nbv, &fr->pmedata,
                             step);

        /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
        fr->ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
        fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
        fr->rlist         = fr->ic->rlist;
        fr->rlistlong     = fr->ic->rlistlong;
        fr->rcoulomb      = fr->ic->rcoulomb;
        fr->rvdw          = fr->ic->rvdw;

        if (ir->eDispCorr != edispcNO)
        {
            calc_enervirdiff(NULL, ir->eDispCorr, fr);
        }

        if (!pme_lb->bBalance &&
            DOMAINDECOMP(cr) &&
            dd_dlb_is_locked(cr->dd))
        {
            /* Unlock the DLB=auto, DLB is allowed to activate
             * (but we don't expect it to activate in most cases).
             */
            dd_dlb_set_lock(cr->dd, FALSE);
        }
    }

    if (!pme_lb->bBalance &&
        (!pme_lb->bSepPMERanks || (step_rel <= PMETunePeriod*ir->nstlist)))
    {
        /* We have just deactivated the balancing and we're not measuring PP/PME
         * imbalance during the first 50*nstlist steps: deactivate the tuning.
         */
        pme_lb->bActive = FALSE;
    }

    *bPrinting = pme_lb->bBalance;
}
static void dd_collect_cg(gmx_domdec_t *dd, const t_state *state_local)
{
    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
    {
        /* The master has the correct distribution */
        return;
    }

    gmx::ArrayRef<const int> atomGroups;
    int                      nat_home = 0;

    if (state_local->ddp_count == dd->ddp_count)
    {
        /* The local state and DD are in sync, use the DD indices */
        atomGroups = gmx::constArrayRefFromArray(dd->globalAtomGroupIndices.data(), dd->ncg_home);
        nat_home   = dd->comm->atomRanges.numHomeAtoms();
    }
    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
    {
        /* The DD is out of sync with the local state, but we have stored
         * the cg indices with the local state, so we can use those.
         */
        const t_block &cgs_gl = dd->comm->cgs_gl;

        atomGroups = state_local->cg_gl;
        nat_home   = 0;
        for (const int &i : atomGroups)
        {
            nat_home += cgs_gl.index[i + 1] - cgs_gl.index[i];
        }
    }
    else
    {
        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
    }

    AtomDistribution *ma = dd->ma.get();

    /* Collect the charge group and atom counts on the master */
    int localBuffer[2] = { static_cast<int>(atomGroups.size()), nat_home };
    dd_gather(dd, 2*sizeof(int), localBuffer,
              DDMASTER(dd) ? ma->intBuffer.data() : nullptr);

    if (DDMASTER(dd))
    {
        int groupOffset = 0;
        for (int rank = 0; rank < dd->nnodes; rank++)
        {
            auto &domainGroups      = ma->domainGroups[rank];
            int   numGroups         = ma->intBuffer[2*rank];
            domainGroups.atomGroups = gmx::constArrayRefFromArray(ma->atomGroups.data() + groupOffset, numGroups);
            domainGroups.numAtoms   = ma->intBuffer[2*rank + 1];

            groupOffset            += numGroups;
        }

        if (debug)
        {
            fprintf(debug, "Initial charge group distribution: ");
            for (int rank = 0; rank < dd->nnodes; rank++)
            {
                fprintf(debug, " %td", ma->domainGroups[rank].atomGroups.size());
            }
            fprintf(debug, "\n");
        }

        /* Make byte counts and indices */
        int offset = 0;
        for (int rank = 0; rank < dd->nnodes; rank++)
        {
            int numGroups                    = ma->domainGroups[rank].atomGroups.size();
            ma->intBuffer[rank]              = numGroups*sizeof(int);
            ma->intBuffer[dd->nnodes + rank] = offset*sizeof(int);
            offset                          += numGroups;
        }
    }

    /* Collect the charge group indices on the master */
    dd_gatherv(dd,
               atomGroups.size()*sizeof(int), atomGroups.data(),
               DDMASTER(dd) ? ma->intBuffer.data() : nullptr,
               DDMASTER(dd) ? ma->intBuffer.data() + dd->nnodes : nullptr,
               DDMASTER(dd) ? ma->atomGroups.data() : nullptr);

    dd->comm->master_cg_ddp_count = state_local->ddp_count;
}
static void dd_collect_vec_sendrecv(gmx_domdec_t                  *dd,
                                    gmx::ArrayRef<const gmx::RVec> lv,
                                    gmx::ArrayRef<gmx::RVec>       v)
{
    if (!DDMASTER(dd))
    {
#if GMX_MPI
        const int numHomeAtoms = dd->comm->atomRanges.numHomeAtoms();
        MPI_Send(const_cast<void *>(static_cast<const void *>(lv.data())),
                 numHomeAtoms*sizeof(rvec), MPI_BYTE,
                 dd->masterrank, dd->rank, dd->mpi_comm_all);
#endif
    }
    else
    {
        AtomDistribution &ma = *dd->ma;

        /* Copy the master coordinates to the global array */
        const t_block &cgs_gl = dd->comm->cgs_gl;

        int rank      = dd->masterrank;
        int localAtom = 0;
        for (const int &i : ma.domainGroups[rank].atomGroups)
        {
            for (int globalAtom = cgs_gl.index[i]; globalAtom < cgs_gl.index[i + 1]; globalAtom++)
            {
                copy_rvec(lv[localAtom++], v[globalAtom]);
            }
        }

        for (int rank = 0; rank < dd->nnodes; rank++)
        {
            if (rank != dd->rank)
            {
                const auto &domainGroups = ma.domainGroups[rank];

                GMX_RELEASE_ASSERT(v.data() != ma.rvecBuffer.data(),
                                   "We need different communication and return buffers");

                /* When we send/recv instead of scatter/gather, we might need
                 * to increase the communication buffer size here.
                 */
                if (static_cast<size_t>(domainGroups.numAtoms) > ma.rvecBuffer.size())
                {
                    ma.rvecBuffer.resize(domainGroups.numAtoms);
                }

#if GMX_MPI
                MPI_Recv(ma.rvecBuffer.data(), domainGroups.numAtoms*sizeof(rvec), MPI_BYTE,
                         rank, rank, dd->mpi_comm_all, MPI_STATUS_IGNORE);
#endif
                int localAtom = 0;
                for (const int &cg : domainGroups.atomGroups)
                {
                    for (int globalAtom = cgs_gl.index[cg]; globalAtom < cgs_gl.index[cg + 1]; globalAtom++)
                    {
                        copy_rvec(ma.rvecBuffer[localAtom++], v[globalAtom]);
                    }
                }
            }
        }
    }
}
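/*
 * Minimal standalone MPI sketch (not GROMACS code) of the send/receive
 * fallback in dd_collect_vec_sendrecv(): every non-root rank sends its chunk
 * to the root, while the root copies its own chunk and receives the others
 * one rank at a time.  The fixed chunk size is an assumption for brevity.
 */
#include <algorithm>
#include <cstdio>
#include <vector>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int           root  = 0;
    const int           chunk = 4;   /* fixed per-rank contribution for simplicity */
    std::vector<double> local(chunk, static_cast<double>(rank));

    if (rank != root)
    {
        /* Tag the message with the sender's rank, as dd_collect_vec_sendrecv does */
        MPI_Send(local.data(), chunk, MPI_DOUBLE, root, rank, MPI_COMM_WORLD);
    }
    else
    {
        std::vector<double> global(static_cast<size_t>(size)*chunk);

        /* The root's own contribution is a plain copy */
        std::copy(local.begin(), local.end(), global.begin());

        for (int r = 0; r < size; r++)
        {
            if (r != root)
            {
                MPI_Recv(global.data() + static_cast<size_t>(r)*chunk, chunk, MPI_DOUBLE,
                         r, r, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            }
        }
        std::printf("root assembled %zu values\n", global.size());
    }

    MPI_Finalize();
    return 0;
}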