/* Set up the wall-cycle accounting structure for a run.
 *
 * fplog         log file that receives notes about optional modes; may be NULL
 * resetstep     stored in the reset_counters field
 * cr            communication record, only read when GMX_MPI is defined
 * nthreads_pp   stored in the nthreads_pp field
 * nthreads_pme  stored in the nthreads_pme field
 *
 * Returns NULL when wallcycle_have_counter() reports no counter,
 * otherwise the newly allocated structure.
 *
 * Environment variables consulted:
 *   GMX_CYCLE_BARRIER  (GMX_MPI builds with PAR(cr) only) switches on wc_barrier
 *   GMX_CYCLE_ALL      allocates the ewcNR*ewcNR wcc_all array
 */
gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec gmx_unused *cr, int nthreads_pp, int nthreads_pme)
{
    gmx_wallcycle_t wc;

    /* Nothing to set up when no cycle counter is available */
    if (!wallcycle_have_counter())
    {
        return NULL;
    }

    snew(wc, 1);

    /* Settings handed in by the caller */
    wc->reset_counters   = resetstep;
    wc->nthreads_pp      = nthreads_pp;
    wc->nthreads_pme     = nthreads_pme;

    /* Everything else starts out empty or switched off */
    wc->haveInvalidCount = FALSE;
    wc->wc_barrier       = FALSE;
    wc->wc_depth         = 0;
    wc->ewc_prev         = -1;
    wc->wcc_all          = NULL;
    wc->cycles_sum       = NULL;
#ifdef DEBUG_WCYCLE
    wc->count_depth      = 0;
#endif

#ifdef GMX_MPI
    if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != NULL)
    {
        if (fplog != NULL)
        {
            fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
        }
        /* Remember the communicator together with the barrier flag */
        wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
        wc->wc_barrier       = TRUE;
    }
#endif

    /* One entry per counter; the pairwise table is only created on request */
    snew(wc->wcc, ewcNR);
    if (getenv("GMX_CYCLE_ALL") != NULL)
    {
        if (fplog != NULL)
        {
            fprintf(fplog, "\nWill time all the code during the run\n\n");
        }
        snew(wc->wcc_all, ewcNR*ewcNR);
    }

#ifdef GMX_CYCLE_SUBCOUNTERS
    snew(wc->wcsc, ewcsNR);
#endif

    return wc;
}
/* Create the wall-cycle counting data structure.
 *
 * fplog  log file for notes about the chosen settings; may be NULL
 * cr     communication record, only consulted in GMX_MPI builds
 *
 * Returns NULL when wallcycle_have_counter() reports that no cycle
 * counter is available, otherwise a newly allocated structure.
 *
 * Environment variables consulted:
 *   GMX_CYCLE_BARRIER   (GMX_MPI builds with PAR(cr) only) switches on wc_barrier
 *   GMX_CYCLE_ALL       allocates the ewcNR*ewcNR wcc_all array
 *   GMX_RESET_COUNTERS  parsed with gmx_step_pfmt into reset_counters
 */
gmx_wallcycle_t wallcycle_init(FILE *fplog,t_commrec *cr)
{
    gmx_wallcycle_t wc;
    char           *env_ptr;

    if (!wallcycle_have_counter())
    {
        return NULL;
    }

    snew(wc,1);

    wc->wc_barrier = FALSE;
    wc->wcc_all    = NULL;
    wc->wc_depth   = 0;
    wc->ewc_prev   = -1;

#ifdef GMX_MPI
    if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != NULL)
    {
        if (fplog)
        {
            fprintf(fplog,"\nWill call MPI_Barrier before each cycle start/stop call\n\n");
        }
        wc->wc_barrier       = TRUE;
        wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
    }
#endif

    snew(wc->wcc,ewcNR);
    if (getenv("GMX_CYCLE_ALL") != NULL)
    {
        if (fplog)
        {
            fprintf(fplog,"\nWill time all the code during the run\n\n");
        }
        snew(wc->wcc_all,ewcNR*ewcNR);
    }

    /* Read variable GMX_RESET_COUNTERS from the environment.
     * reset_counters stays at -1 when the variable is unset or when
     * its value cannot be parsed.
     */
    wc->reset_counters = -1;
    env_ptr = getenv("GMX_RESET_COUNTERS");
    if (env_ptr != NULL &&
        sscanf(env_ptr,gmx_step_pfmt,&wc->reset_counters) != 1)
    {
        /* Restore the default explicitly instead of relying on sscanf
         * having left the target untouched, and tell the user that the
         * setting was not applied.
         */
        wc->reset_counters = -1;
        if (fplog)
        {
            fprintf(fplog,"\nIgnoring invalid value '%s' of GMX_RESET_COUNTERS\n\n",env_ptr);
        }
    }

    return wc;
}
/* Create the wall-cycle counting data structure.
 *
 * fplog      log file for notes about the chosen settings; may be NULL
 * resetstep  stored in the reset_counters field
 * cr         communication record, only consulted in GMX_MPI builds
 *
 * Returns NULL when wallcycle_have_counter() reports that no cycle
 * counter is available, otherwise a newly allocated structure.
 *
 * Environment variables consulted:
 *   GMX_CYCLE_BARRIER  (GMX_MPI builds with PAR(cr) only) switches on wc_barrier
 *   GMX_CYCLE_ALL      allocates the ewcNR*ewcNR wcc_all array; in builds
 *                      with GMX_THREADS defined this is a fatal error instead
 */
gmx_wallcycle_t wallcycle_init(FILE *fplog,int resetstep,t_commrec *cr)
{
    gmx_wallcycle_t wc;

    if (!wallcycle_have_counter())
    {
        return NULL;
    }

    snew(wc,1);

    wc->wc_barrier     = FALSE;
    wc->wcc_all        = NULL;
    wc->wc_depth       = 0;
    wc->ewc_prev       = -1;
    wc->reset_counters = resetstep;

#ifdef GMX_MPI
    if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != NULL)
    {
        if (fplog)
        {
            fprintf(fplog,"\nWill call MPI_Barrier before each cycle start/stop call\n\n");
        }
        wc->wc_barrier       = TRUE;
        wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
    }
#endif

    snew(wc->wcc,ewcNR);
    if (getenv("GMX_CYCLE_ALL") != NULL)
    {
        /* Only one of the two branches may be compiled in: announcing the
         * timing mode and then aborting with a message about threaded
         * code makes no sense in a build without threads.
         */
#ifndef GMX_THREADS
        if (fplog)
        {
            fprintf(fplog,"\nWill time all the code during the run\n\n");
        }
        snew(wc->wcc_all,ewcNR*ewcNR);
#else
        gmx_fatal(FARGS, "GMX_CYCLE_ALL is incompatible with threaded code");
#endif
    }

    return wc;
}
void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, t_commrec *cr, FILE *fp_log, const t_inputrec *ir, matrix box, const interaction_const_t *ic, struct gmx_pme_t *pmedata, gmx_bool bUseGPU, gmx_bool *bPrinting) { pme_load_balancing_t *pme_lb; real spm, sp; int d; snew(pme_lb, 1); pme_lb->bSepPMERanks = !(cr->duty & DUTY_PME); /* Initially we turn on balancing directly on based on PP/PME imbalance */ pme_lb->bTriggerOnDLB = FALSE; /* Any number of stages >= 2 is supported */ pme_lb->nstage = 2; pme_lb->cutoff_scheme = ir->cutoff_scheme; if (pme_lb->cutoff_scheme == ecutsVERLET) { pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb; pme_lb->rbuf_vdw = pme_lb->rbuf_coulomb; } else { if (ic->rcoulomb > ic->rlist) { pme_lb->rbuf_coulomb = ic->rlistlong - ic->rcoulomb; } else { pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb; } if (ic->rvdw > ic->rlist) { pme_lb->rbuf_vdw = ic->rlistlong - ic->rvdw; } else { pme_lb->rbuf_vdw = ic->rlist - ic->rvdw; } } copy_mat(box, pme_lb->box_start); if (ir->ePBC == epbcXY && ir->nwall == 2) { svmul(ir->wall_ewald_zfac, pme_lb->box_start[ZZ], pme_lb->box_start[ZZ]); } pme_lb->n = 1; snew(pme_lb->setup, pme_lb->n); pme_lb->rcut_vdw = ic->rvdw; pme_lb->rcut_coulomb_start = ir->rcoulomb; pme_lb->nstcalclr_start = ir->nstcalclr; pme_lb->cur = 0; pme_lb->setup[0].rcut_coulomb = ic->rcoulomb; pme_lb->setup[0].rlist = ic->rlist; pme_lb->setup[0].rlistlong = ic->rlistlong; pme_lb->setup[0].nstcalclr = ir->nstcalclr; pme_lb->setup[0].grid[XX] = ir->nkx; pme_lb->setup[0].grid[YY] = ir->nky; pme_lb->setup[0].grid[ZZ] = ir->nkz; pme_lb->setup[0].ewaldcoeff_q = ic->ewaldcoeff_q; pme_lb->setup[0].ewaldcoeff_lj = ic->ewaldcoeff_lj; pme_lb->setup[0].pmedata = pmedata; spm = 0; for (d = 0; d < DIM; d++) { sp = norm(pme_lb->box_start[d])/pme_lb->setup[0].grid[d]; if (sp > spm) { spm = sp; } } pme_lb->setup[0].spacing = spm; if (ir->fourier_spacing > 0) { pme_lb->cut_spacing = ir->rcoulomb/ir->fourier_spacing; } else { pme_lb->cut_spacing = 
ir->rcoulomb/pme_lb->setup[0].spacing; } pme_lb->stage = 0; pme_lb->fastest = 0; pme_lb->lower_limit = 0; pme_lb->start = 0; pme_lb->end = 0; pme_lb->elimited = epmelblimNO; pme_lb->cycles_n = 0; pme_lb->cycles_c = 0; /* Tune with GPUs and/or separate PME ranks. * When running only on a CPU without PME ranks, PME tuning will only help * with small numbers of atoms in the cut-off sphere. */ pme_lb->bActive = (wallcycle_have_counter() && (bUseGPU || pme_lb->bSepPMERanks)); /* With GPUs and no separate PME ranks we can't measure the PP/PME * imbalance, so we start balancing right away. * Otherwise we only start balancing after we observe imbalance. */ pme_lb->bBalance = (pme_lb->bActive && (bUseGPU && !pme_lb->bSepPMERanks)); pme_lb->step_rel_stop = PMETunePeriod*ir->nstlist; /* Delay DD load balancing when GPUs are used */ if (pme_lb->bActive && DOMAINDECOMP(cr) && cr->dd->nnodes > 1 && bUseGPU) { /* Lock DLB=auto to off (does nothing when DLB=yes/no. * With GPUs and separate PME nodes, we want to first * do PME tuning without DLB, since DLB might limit * the cut-off, which never improves performance. * We allow for DLB + PME tuning after a first round of tuning. */ dd_dlb_lock(cr->dd); if (dd_dlb_is_locked(cr->dd)) { md_print_warn(cr, fp_log, "NOTE: DLB will not turn on during the first phase of PME tuning\n"); } } *pme_lb_p = pme_lb; *bPrinting = pme_lb->bBalance; }
void pme_loadbal_init(pme_load_balancing_t **pme_lb_p, const t_inputrec *ir, matrix box, const interaction_const_t *ic, struct gmx_pme_t *pmedata, gmx_bool bUseGPU, gmx_bool bSepPMERanks, gmx_bool *bPrinting) { pme_load_balancing_t *pme_lb; real spm, sp; int d; snew(pme_lb, 1); pme_lb->bSepPMERanks = bSepPMERanks; /* Any number of stages >= 2 is supported */ pme_lb->nstage = 2; pme_lb->cutoff_scheme = ir->cutoff_scheme; if (pme_lb->cutoff_scheme == ecutsVERLET) { pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb; pme_lb->rbuf_vdw = pme_lb->rbuf_coulomb; } else { if (ic->rcoulomb > ic->rlist) { pme_lb->rbuf_coulomb = ic->rlistlong - ic->rcoulomb; } else { pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb; } if (ic->rvdw > ic->rlist) { pme_lb->rbuf_vdw = ic->rlistlong - ic->rvdw; } else { pme_lb->rbuf_vdw = ic->rlist - ic->rvdw; } } copy_mat(box, pme_lb->box_start); if (ir->ePBC == epbcXY && ir->nwall == 2) { svmul(ir->wall_ewald_zfac, pme_lb->box_start[ZZ], pme_lb->box_start[ZZ]); } pme_lb->n = 1; snew(pme_lb->setup, pme_lb->n); pme_lb->rcut_vdw = ic->rvdw; pme_lb->rcut_coulomb_start = ir->rcoulomb; pme_lb->nstcalclr_start = ir->nstcalclr; pme_lb->cur = 0; pme_lb->setup[0].rcut_coulomb = ic->rcoulomb; pme_lb->setup[0].rlist = ic->rlist; pme_lb->setup[0].rlistlong = ic->rlistlong; pme_lb->setup[0].nstcalclr = ir->nstcalclr; pme_lb->setup[0].grid[XX] = ir->nkx; pme_lb->setup[0].grid[YY] = ir->nky; pme_lb->setup[0].grid[ZZ] = ir->nkz; pme_lb->setup[0].ewaldcoeff_q = ic->ewaldcoeff_q; pme_lb->setup[0].ewaldcoeff_lj = ic->ewaldcoeff_lj; pme_lb->setup[0].pmedata = pmedata; spm = 0; for (d = 0; d < DIM; d++) { sp = norm(pme_lb->box_start[d])/pme_lb->setup[0].grid[d]; if (sp > spm) { spm = sp; } } pme_lb->setup[0].spacing = spm; if (ir->fourier_spacing > 0) { pme_lb->cut_spacing = ir->rcoulomb/ir->fourier_spacing; } else { pme_lb->cut_spacing = ir->rcoulomb/pme_lb->setup[0].spacing; } pme_lb->stage = 0; pme_lb->fastest = 0; pme_lb->start = 0; pme_lb->end = 0; pme_lb->elimited = 
epmelblimNO; pme_lb->cycles_n = 0; pme_lb->cycles_c = 0; /* Tune with GPUs and/or separate PME ranks. * When running only on a CPU without PME ranks, PME tuning will only help * with small numbers of atoms in the cut-off sphere. */ pme_lb->bActive = (wallcycle_have_counter() && (bUseGPU || bSepPMERanks)); /* With GPUs and no separate PME ranks we can't measure the PP/PME * imbalance, so we start balancing right away. * Otherwise we only start balancing after we observe imbalance. */ pme_lb->bBalance = (pme_lb->bActive && (bUseGPU && !bSepPMERanks)); *pme_lb_p = pme_lb; *bPrinting = pme_lb->bBalance; }