void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme) { if (wc == NULL) { return; } for (int i = 0; i < ewcNR; i++) { if (is_pme_counter(i) || (i == ewcRUN && isPmeRank)) { wc->wcc[i].c *= nthreads_pme; if (wc->wcc_all) { for (int j = 0; j < ewcNR; j++) { wc->wcc_all[i*ewcNR+j].c *= nthreads_pme; } } } else { wc->wcc[i].c *= nthreads_pp; if (wc->wcc_all) { for (int j = 0; j < ewcNR; j++) { wc->wcc_all[i*ewcNR+j].c *= nthreads_pp; } } } } if (useCycleSubcounters && wc->wcsc && !isPmeRank) { for (int i = 0; i < ewcsNR; i++) { wc->wcsc[i].c *= nthreads_pp; } } }
void wallcycle_print(FILE *fplog, int nnodes, int npme, int nth_pp, int nth_pme, double realtime, gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum, struct gmx_wallclock_gpu_t *gpu_t) { double tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k; double c2t, c2t_pp, c2t_pme = 0; int i, j, npp, nth_tot; char buf[STRLEN]; const char *hline = "-----------------------------------------------------------------------------"; if (wc == NULL) { return; } GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0"); GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0"); GMX_ASSERT(nnodes > 0, "Number of nodes must be >0"); GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative"); npp = nnodes - npme; /* npme is the number of PME-only ranks used, and we always do PP work */ GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0"); nth_tot = npp*nth_pp + npme*nth_pme; /* When using PME-only nodes, the next line is valid for both PP-only and PME-only nodes because they started ewcRUN at the same time. */ tot = cyc_sum[ewcRUN]; tot_for_pp = 0; if (tot <= 0.0) { /* TODO This is heavy handed, but until someone reworks the code so that it is provably robust with respect to non-positive values for all possible timer and cycle counters, there is less value gained from printing whatever timing data might still be sensible for some non-Jenkins run, than is lost from diagnosing Jenkins FP exceptions on runs about whose execution time we don't care. */ md_print_warn(NULL, fplog, "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a time accounting\n", tot); return; } if (wc->haveInvalidCount) { md_print_warn(NULL, fplog, "%s\n", "NOTE: Detected invalid cycle counts, probably because threads moved between CPU cores that do not have synchronized cycle counters. Will not print the cycle accounting."); return; } /* Conversion factor from cycles to seconds */ c2t = realtime/tot; c2t_pp = c2t * nth_tot / static_cast<double>(npp*nth_pp); if (npme > 0) { c2t_pme = c2t * nth_tot / static_cast<double>(npme*nth_pme); } else { c2t_pme = 0; } fprintf(fplog, "\n R E A L C Y C L E A N D T I M E A C C O U N T I N G\n\n"); print_header(fplog, npp, nth_pp, npme, nth_pme); fprintf(fplog, "%s\n", hline); for (i = ewcPPDURINGPME+1; i < ewcNR; i++) { if (is_pme_subcounter(i)) { /* Do not count these at all */ } else if (npme > 0 && is_pme_counter(i)) { /* Print timing information for PME-only nodes, but add an * asterisk so the reader of the table can know that the * walltimes are not meant to add up. The asterisk still * fits in the required maximum of 19 characters. */ char buffer[STRLEN]; snprintf(buffer, STRLEN, "%s *", wcn[i]); print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot); } else { /* Print timing information when it is for a PP or PP+PME node */ print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot); tot_for_pp += cyc_sum[i]; } } if (wc->wcc_all != NULL) { for (i = 0; i < ewcNR; i++) { for (j = 0; j < ewcNR; j++) { snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]); print_cycles(fplog, c2t_pp, buf, npp, nth_pp, wc->wcc_all[i*ewcNR+j].n, wc->wcc_all[i*ewcNR+j].c, tot); } } } tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot); print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot); fprintf(fplog, "%s\n", hline); print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot); fprintf(fplog, "%s\n", hline); if (npme > 0) { fprintf(fplog, "(*) Note that with separate PME ranks, the walltime column actually sums to\n" " twice the total reported, but the cycle count total and %% are correct.\n" "%s\n", hline); } if (wc->wcc[ewcPMEMESH].n > 0) { fprintf(fplog, " Breakdown of PME mesh computation\n"); fprintf(fplog, "%s\n", hline); for (i = ewcPPDURINGPME+1; i < ewcNR; i++) { if (is_pme_subcounter(i)) { print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i], npme > 0 ? npme : npp, nth_pme, wc->wcc[i].n, cyc_sum[i], tot); } } fprintf(fplog, "%s\n", hline); } if (useCycleSubcounters && wc->wcsc) { fprintf(fplog, " Breakdown of PP computation\n"); fprintf(fplog, "%s\n", hline); for (i = 0; i < ewcsNR; i++) { print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR+i], tot); } fprintf(fplog, "%s\n", hline); } /* print GPU timing summary */ if (gpu_t) { const char *k_log_str[2][2] = { {"Nonbonded F kernel", "Nonbonded F+ene k."}, {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."} }; tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t; /* add up the kernel timings */ tot_k = 0.0; for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { tot_k += gpu_t->ktime[i][j].t; } } tot_gpu += tot_k; tot_cpu_overlap = wc->wcc[ewcFORCE].c; if (wc->wcc[ewcPMEMESH].n > 0) { tot_cpu_overlap += wc->wcc[ewcPMEMESH].c; } tot_cpu_overlap *= realtime*1000/tot; /* convert s to ms */ fprintf(fplog, "\n GPU timings\n%s\n", hline); fprintf(fplog, " Computing: Count Wall t (s) ms/step %c\n", '%'); fprintf(fplog, "%s\n", hline); print_gputimes(fplog, "Pair list H2D", gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu); print_gputimes(fplog, "X / q H2D", gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu); for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { if (gpu_t->ktime[i][j].c) { print_gputimes(fplog, k_log_str[i][j], gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu); } } } print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu); fprintf(fplog, "%s\n", hline); print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu); fprintf(fplog, "%s\n", hline); gpu_cpu_ratio = tot_gpu/tot_cpu_overlap; if (gpu_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0) { fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n", tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n, gpu_cpu_ratio); } /* only print notes related to CPU-GPU load balance with PME */ if (wc->wcc[ewcPMEMESH].n > 0) { fprintf(fplog, "For optimal performance this ratio should be close to 1!\n"); /* print note if the imbalance is high with PME case in which * CPU-GPU load balancing is possible */ if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2) { /* Only the sim master calls this function, so always print to stderr */ if (gpu_cpu_ratio < 0.75) { if (npp > 1) { /* The user could have used -notunepme, * but we currently can't check that here. */ md_print_warn(NULL, fplog, "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n" " performance loss. Maybe the domain decomposition limits the PME tuning.\n" " In that case, try setting the DD grid manually (-dd) or lowering -dds."); } else { /* We should not end up here, unless the box is * too small for increasing the cut-off for PME tuning. */ md_print_warn(NULL, fplog, "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n" " performance loss."); } } if (gpu_cpu_ratio > 1.2) { md_print_warn(NULL, fplog, "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n" " performance loss, consider using a shorter cut-off and a finer PME grid."); } } } } if (wc->wc_barrier) { md_print_warn(NULL, fplog, "MPI_Barrier was called before each cycle start/stop\n" "call, so timings are not those of real runs.\n"); } if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot*0.1 || cyc_sum[ewcNS] > tot*0.1)) { /* Only the sim master calls this function, so always print to stderr */ if (wc->wcc[ewcDOMDEC].n == 0) { md_print_warn(NULL, fplog, "NOTE: %d %% of the run time was spent in pair search,\n" " you might want to increase nstlist (this has no effect on accuracy)\n", (int)(100*cyc_sum[ewcNS]/tot+0.5)); } else { md_print_warn(NULL, fplog, "NOTE: %d %% of the run time was spent in domain decomposition,\n" " %d %% of the run time was spent in pair search,\n" " you might want to increase nstlist (this has no effect on accuracy)\n", (int)(100*cyc_sum[ewcDOMDEC]/tot+0.5), (int)(100*cyc_sum[ewcNS]/tot+0.5)); } } if (cyc_sum[ewcMoveE] > tot*0.05) { /* Only the sim master calls this function, so always print to stderr */ md_print_warn(NULL, fplog, "NOTE: %d %% of the run time was spent communicating energies,\n" " you might want to use the -gcom option of mdrun\n", (int)(100*cyc_sum[ewcMoveE]/tot+0.5)); } }
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime, gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t) { double *cyc_sum; double tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k; double c2t, c2t_pp, c2t_pme; int i, j, npp, nth_pp, nth_pme, nth_tot; char buf[STRLEN]; const char *hline = "-----------------------------------------------------------------------------"; if (wc == NULL) { return; } nth_pp = wc->nthreads_pp; nth_pme = wc->nthreads_pme; cyc_sum = wc->cycles_sum; npp = nnodes - npme; nth_tot = npp*nth_pp + npme*nth_pme; /* When using PME-only nodes, the next line is valid for both PP-only and PME-only nodes because they started ewcRUN at the same time. */ tot = cyc_sum[ewcRUN]; tot_for_pp = 0; /* Conversion factor from cycles to seconds */ if (tot > 0) { c2t = realtime/tot; c2t_pp = c2t * nth_tot / (double) (npp*nth_pp); c2t_pme = c2t * nth_tot / (double) (npme*nth_pme); } else { c2t = 0; c2t_pp = 0; c2t_pme = 0; } fprintf(fplog, "\n R E A L C Y C L E A N D T I M E A C C O U N T I N G\n\n"); print_header(fplog, npp, nth_pp, npme, nth_pme); fprintf(fplog, "%s\n", hline); for (i = ewcPPDURINGPME+1; i < ewcNR; i++) { if (is_pme_subcounter(i)) { /* Do not count these at all */ } else if (npme > 0 && is_pme_counter(i)) { /* Print timing information for PME-only nodes, but add an * asterisk so the reader of the table can know that the * walltimes are not meant to add up. The asterisk still * fits in the required maximum of 19 characters. */ char buffer[STRLEN]; snprintf(buffer, STRLEN, "%s *", wcn[i]); print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot); } else { /* Print timing information when it is for a PP or PP+PME node */ print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot); tot_for_pp += cyc_sum[i]; } } if (wc->wcc_all != NULL) { for (i = 0; i < ewcNR; i++) { for (j = 0; j < ewcNR; j++) { snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]); print_cycles(fplog, c2t_pp, buf, npp, nth_pp, wc->wcc_all[i*ewcNR+j].n, wc->wcc_all[i*ewcNR+j].c, tot); } } } tot_for_rest = tot * (npp * nth_pp) / (double) nth_tot; print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot); fprintf(fplog, "%s\n", hline); print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot); fprintf(fplog, "%s\n", hline); if (npme > 0) { fprintf(fplog, "(*) Note that with separate PME nodes, the walltime column actually sums to\n" " twice the total reported, but the cycle count total and %% are correct.\n" "%s\n", hline); } if (wc->wcc[ewcPMEMESH].n > 0) { fprintf(fplog, " Breakdown of PME mesh computation\n"); fprintf(fplog, "%s\n", hline); for (i = ewcPPDURINGPME+1; i < ewcNR; i++) { if (is_pme_subcounter(i)) { print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i], npme > 0 ? npme : npp, nth_pme, wc->wcc[i].n, cyc_sum[i], tot); } } fprintf(fplog, "%s\n", hline); } #ifdef GMX_CYCLE_SUBCOUNTERS fprintf(fplog, " Breakdown of PP computation\n"); fprintf(fplog, "%s\n", hline); for (i = 0; i < ewcsNR; i++) { print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR+i], tot); } fprintf(fplog, "%s\n", hline); #endif /* print GPU timing summary */ if (gpu_t) { const char *k_log_str[2][2] = { {"Nonbonded F kernel", "Nonbonded F+ene k."}, {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."} }; tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t; /* add up the kernel timings */ tot_k = 0.0; for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { tot_k += gpu_t->ktime[i][j].t; } } tot_gpu += tot_k; tot_cpu_overlap = wc->wcc[ewcFORCE].c; if (wc->wcc[ewcPMEMESH].n > 0) { tot_cpu_overlap += wc->wcc[ewcPMEMESH].c; } tot_cpu_overlap *= realtime*1000/tot; /* convert s to ms */ fprintf(fplog, "\n GPU timings\n%s\n", hline); fprintf(fplog, " Computing: Count Wall t (s) ms/step %c\n", '%'); fprintf(fplog, "%s\n", hline); print_gputimes(fplog, "Pair list H2D", gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu); print_gputimes(fplog, "X / q H2D", gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu); for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { if (gpu_t->ktime[i][j].c) { print_gputimes(fplog, k_log_str[i][j], gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu); } } } print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu); fprintf(fplog, "%s\n", hline); print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu); fprintf(fplog, "%s\n", hline); gpu_cpu_ratio = tot_gpu/tot_cpu_overlap; fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n", tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n, gpu_cpu_ratio); /* only print notes related to CPU-GPU load balance with PME */ if (wc->wcc[ewcPMEMESH].n > 0) { fprintf(fplog, "For optimal performance this ratio should be close to 1!\n"); /* print note if the imbalance is high with PME case in which * CPU-GPU load balancing is possible */ if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2) { /* Only the sim master calls this function, so always print to stderr */ if (gpu_cpu_ratio < 0.75) { if (npp > 1) { /* The user could have used -notunepme, * but we currently can't check that here. */ md_print_warn(NULL, fplog, "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n" " performance loss. Maybe the domain decomposition limits the PME tuning.\n" " In that case, try setting the DD grid manually (-dd) or lowering -dds."); } else { /* We should not end up here, unless the box is * too small for increasing the cut-off for PME tuning. */ md_print_warn(NULL, fplog, "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n" " performance loss."); } } if (gpu_cpu_ratio > 1.2) { md_print_warn(NULL, fplog, "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n" " performance loss, consider using a shorter cut-off and a finer PME grid."); } } } } if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot*0.1 || cyc_sum[ewcNS] > tot*0.1)) { /* Only the sim master calls this function, so always print to stderr */ if (wc->wcc[ewcDOMDEC].n == 0) { md_print_warn(NULL, fplog, "NOTE: %d %% of the run time was spent in pair search,\n" " you might want to increase nstlist (this has no effect on accuracy)\n", (int)(100*cyc_sum[ewcNS]/tot+0.5)); } else { md_print_warn(NULL, fplog, "NOTE: %d %% of the run time was spent in domain decomposition,\n" " %d %% of the run time was spent in pair search,\n" " you might want to increase nstlist (this has no effect on accuracy)\n", (int)(100*cyc_sum[ewcDOMDEC]/tot+0.5), (int)(100*cyc_sum[ewcNS]/tot+0.5)); } } if (cyc_sum[ewcMoveE] > tot*0.05) { /* Only the sim master calls this function, so always print to stderr */ md_print_warn(NULL, fplog, "NOTE: %d %% of the run time was spent communicating energies,\n" " you might want to use the -gcom option of mdrun\n", (int)(100*cyc_sum[ewcMoveE]/tot+0.5)); } }
void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc) { wallcc_t *wcc; double cycles[ewcNR+ewcsNR]; double cycles_n[ewcNR+ewcsNR], buf[ewcNR+ewcsNR], *cyc_all, *buf_all; int i, j; int nsum; if (wc == NULL) { return; } snew(wc->cycles_sum, ewcNR+ewcsNR); wcc = wc->wcc; for (i = 0; i < ewcNR; i++) { if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME)) { wcc[i].c *= wc->nthreads_pme; if (wc->wcc_all) { for (j = 0; j < ewcNR; j++) { wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pme; } } } else { wcc[i].c *= wc->nthreads_pp; if (wc->wcc_all) { for (j = 0; j < ewcNR; j++) { wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pp; } } } } if (wcc[ewcDDCOMMLOAD].n > 0) { wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMLOAD].c; } if (wcc[ewcDDCOMMBOUND].n > 0) { wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMBOUND].c; } if (wcc[ewcPME_FFTCOMM].n > 0) { wcc[ewcPME_FFT].c -= wcc[ewcPME_FFTCOMM].c; } if (cr->npmenodes == 0) { /* All nodes do PME (or no PME at all) */ if (wcc[ewcPMEMESH].n > 0) { wcc[ewcFORCE].c -= wcc[ewcPMEMESH].c; } } else { /* The are PME-only nodes */ if (wcc[ewcPMEMESH].n > 0) { /* This must be a PME only node, calculate the Wait + Comm. time */ wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c; } } /* Store the cycles in a double buffer for summing */ for (i = 0; i < ewcNR; i++) { cycles_n[i] = (double)wcc[i].n; cycles[i] = (double)wcc[i].c; } nsum = ewcNR; #ifdef GMX_CYCLE_SUBCOUNTERS for (i = 0; i < ewcsNR; i++) { wc->wcsc[i].c *= wc->nthreads_pp; cycles_n[ewcNR+i] = (double)wc->wcsc[i].n; cycles[ewcNR+i] = (double)wc->wcsc[i].c; } nsum += ewcsNR; #endif #ifdef GMX_MPI if (cr->nnodes > 1) { MPI_Allreduce(cycles_n, buf, nsum, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim); for (i = 0; i < ewcNR; i++) { wcc[i].n = (int)(buf[i] + 0.5); } #ifdef GMX_CYCLE_SUBCOUNTERS for (i = 0; i < ewcsNR; i++) { wc->wcsc[i].n = (int)(buf[ewcNR+i] + 0.5); } #endif MPI_Allreduce(cycles, wc->cycles_sum, nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim); if (wc->wcc_all != NULL) { snew(cyc_all, ewcNR*ewcNR); snew(buf_all, ewcNR*ewcNR); for (i = 0; i < ewcNR*ewcNR; i++) { cyc_all[i] = wc->wcc_all[i].c; } MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim); for (i = 0; i < ewcNR*ewcNR; i++) { wc->wcc_all[i].c = buf_all[i]; } sfree(buf_all); sfree(cyc_all); } } else #endif { for (i = 0; i < nsum; i++) { wc->cycles_sum[i] = cycles[i]; } } }
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime, gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t) { double *cycles; double c2t, tot, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, sum, tot_k; int i, j, npp, nth_pp, nth_pme, nth_tot; char buf[STRLEN]; const char *hline = "-----------------------------------------------------------------------------"; if (wc == NULL) { return; } nth_pp = wc->nthreads_pp; nth_pme = wc->nthreads_pme; cycles = wc->cycles_sum; if (npme > 0) { npp = nnodes - npme; nth_tot = npp*nth_pp + npme*nth_pme; } else { npp = nnodes; npme = nnodes; nth_tot = npp*nth_pp; } tot = cycles[ewcRUN]; /* Conversion factor from cycles to seconds */ if (tot > 0) { c2t = realtime/tot; } else { c2t = 0; } fprintf(fplog, "\n R E A L C Y C L E A N D T I M E A C C O U N T I N G\n\n"); fprintf(fplog, " Computing: Nodes Th. Count Wall t (s) G-Cycles %c\n", '%'); fprintf(fplog, "%s\n", hline); sum = 0; for (i = ewcPPDURINGPME+1; i < ewcNR; i++) { if (!is_pme_subcounter(i)) { print_cycles(fplog, c2t, wcn[i], nth_tot, is_pme_counter(i) ? npme : npp, is_pme_counter(i) ? nth_pme : nth_pp, wc->wcc[i].n, cycles[i], tot); sum += cycles[i]; } } if (wc->wcc_all != NULL) { for (i = 0; i < ewcNR; i++) { for (j = 0; j < ewcNR; j++) { snprintf(buf, 9, "%-9s", wcn[i]); buf[9] = ' '; snprintf(buf+10, 9, "%-9s", wcn[j]); buf[19] = '\0'; print_cycles(fplog, c2t, buf, nth_tot, is_pme_counter(i) ? npme : npp, is_pme_counter(i) ? nth_pme : nth_pp, wc->wcc_all[i*ewcNR+j].n, wc->wcc_all[i*ewcNR+j].c, tot); } } } print_cycles(fplog, c2t, "Rest", nth_tot, npp, -1, 0, tot-sum, tot); fprintf(fplog, "%s\n", hline); print_cycles(fplog, c2t, "Total", nth_tot, nnodes, -1, 0, tot, tot); fprintf(fplog, "%s\n", hline); if (wc->wcc[ewcPMEMESH].n > 0) { fprintf(fplog, "%s\n", hline); for (i = ewcPPDURINGPME+1; i < ewcNR; i++) { if (is_pme_subcounter(i)) { print_cycles(fplog, c2t, wcn[i], nth_tot, is_pme_counter(i) ? npme : npp, is_pme_counter(i) ? nth_pme : nth_pp, wc->wcc[i].n, cycles[i], tot); } } fprintf(fplog, "%s\n", hline); } #ifdef GMX_CYCLE_SUBCOUNTERS fprintf(fplog, "%s\n", hline); for (i = 0; i < ewcsNR; i++) { print_cycles(fplog, c2t, wcsn[i], nth_tot, npp, nth_pp, wc->wcsc[i].n, cycles[ewcNR+i], tot); } fprintf(fplog, "%s\n", hline); #endif /* print GPU timing summary */ if (gpu_t) { const char *k_log_str[2][2] = { {"Nonbonded F kernel", "Nonbonded F+ene k."}, {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."} }; tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t; /* add up the kernel timings */ tot_k = 0.0; for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { tot_k += gpu_t->ktime[i][j].t; } } tot_gpu += tot_k; tot_cpu_overlap = wc->wcc[ewcFORCE].c; if (wc->wcc[ewcPMEMESH].n > 0) { tot_cpu_overlap += wc->wcc[ewcPMEMESH].c; } tot_cpu_overlap *= c2t * 1000; /* convert s to ms */ fprintf(fplog, "\n GPU timings\n%s\n", hline); fprintf(fplog, " Computing: Count Wall t (s) ms/step %c\n", '%'); fprintf(fplog, "%s\n", hline); print_gputimes(fplog, "Pair list H2D", gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu); print_gputimes(fplog, "X / q H2D", gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu); for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { if (gpu_t->ktime[i][j].c) { print_gputimes(fplog, k_log_str[i][j], gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu); } } } print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu); fprintf(fplog, "%s\n", hline); print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu); fprintf(fplog, "%s\n", hline); gpu_cpu_ratio = tot_gpu/tot_cpu_overlap; fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n", tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n, gpu_cpu_ratio); /* only print notes related to CPU-GPU load balance with PME */ if (wc->wcc[ewcPMEMESH].n > 0) { fprintf(fplog, "For optimal performance this ratio should be close to 1!\n"); /* print note if the imbalance is high with PME case in which * CPU-GPU load balancing is possible */ if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2) { /* Only the sim master calls this function, so always print to stderr */ if (gpu_cpu_ratio < 0.75) { if (npp > 1) { /* The user could have used -notunepme, * but we currently can't check that here. */ md_print_warn(NULL, fplog, "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n" " performance loss. Maybe the domain decomposition limits the PME tuning.\n" " In that case, try setting the DD grid manually (-dd) or lowering -dds."); } else { /* We should not end up here, unless the box is * too small for increasing the cut-off for PME tuning. */ md_print_warn(NULL, fplog, "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n" " performance loss."); } } if (gpu_cpu_ratio > 1.2) { md_print_warn(NULL, fplog, "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n" " performance loss, consider using a shorter cut-off and a finer PME grid."); } } } } if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cycles[ewcDOMDEC] > tot*0.1 || cycles[ewcNS] > tot*0.1)) { /* Only the sim master calls this function, so always print to stderr */ if (wc->wcc[ewcDOMDEC].n == 0) { md_print_warn(NULL, fplog, "NOTE: %d %% of the run time was spent in pair search,\n" " you might want to increase nstlist (this has no effect on accuracy)\n", (int)(100*cycles[ewcNS]/tot+0.5)); } else { md_print_warn(NULL, fplog, "NOTE: %d %% of the run time was spent in domain decomposition,\n" " %d %% of the run time was spent in pair search,\n" " you might want to increase nstlist (this has no effect on accuracy)\n", (int)(100*cycles[ewcDOMDEC]/tot+0.5), (int)(100*cycles[ewcNS]/tot+0.5)); } } if (cycles[ewcMoveE] > tot*0.05) { /* Only the sim master calls this function, so always print to stderr */ md_print_warn(NULL, fplog, "NOTE: %d %% of the run time was spent communicating energies,\n" " you might want to use the -gcom option of mdrun\n", (int)(100*cycles[ewcMoveE]/tot+0.5)); } }
void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc) { wallcc_t *wcc; double cycles[ewcNR+ewcsNR]; #ifdef GMX_MPI double cycles_n[ewcNR+ewcsNR+1]; double buf[ewcNR+ewcsNR]; double *buf_all, *cyc_all; #endif int i, j; int nsum; if (wc == NULL) { return; } snew(wc->cycles_sum, ewcNR+ewcsNR); wcc = wc->wcc; /* The GPU wait estimate counter is used for load balancing only * and will mess up the total due to double counting: clear it. */ wcc[ewcWAIT_GPU_NB_L_EST].n = 0; wcc[ewcWAIT_GPU_NB_L_EST].c = 0; for (i = 0; i < ewcNR; i++) { if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME)) { wcc[i].c *= wc->nthreads_pme; if (wc->wcc_all) { for (j = 0; j < ewcNR; j++) { wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pme; } } } else { wcc[i].c *= wc->nthreads_pp; if (wc->wcc_all) { for (j = 0; j < ewcNR; j++) { wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pp; } } } } subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD); subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND); subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM); if (cr->npmenodes == 0) { /* All nodes do PME (or no PME at all) */ subtract_cycles(wcc, ewcFORCE, ewcPMEMESH); } else { /* The are PME-only nodes */ if (wcc[ewcPMEMESH].n > 0) { /* This must be a PME only node, calculate the Wait + Comm. time */ GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c, "Total run ticks must be greater than PME-only ticks"); wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c; } } /* Store the cycles in a double buffer for summing */ for (i = 0; i < ewcNR; i++) { #ifdef GMX_MPI cycles_n[i] = static_cast<double>(wcc[i].n); #endif cycles[i] = static_cast<double>(wcc[i].c); } nsum = ewcNR; #ifdef GMX_CYCLE_SUBCOUNTERS for (i = 0; i < ewcsNR; i++) { wc->wcsc[i].c *= wc->nthreads_pp; #ifdef GMX_MPI cycles_n[ewcNR+i] = static_cast<double>(wc->wcsc[i].n); #endif cycles[ewcNR+i] = static_cast<double>(wc->wcsc[i].c); } nsum += ewcsNR; #endif #ifdef GMX_MPI if (cr->nnodes > 1) { cycles_n[nsum] = (wc->haveInvalidCount > 0 ? 1 : 0); MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim); for (i = 0; i < ewcNR; i++) { wcc[i].n = static_cast<int>(buf[i] + 0.5); } wc->haveInvalidCount = (buf[nsum] > 0); #ifdef GMX_CYCLE_SUBCOUNTERS for (i = 0; i < ewcsNR; i++) { wc->wcsc[i].n = static_cast<int>(buf[ewcNR+i] + 0.5); } #endif MPI_Allreduce(cycles, wc->cycles_sum, nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim); if (wc->wcc_all != NULL) { snew(cyc_all, ewcNR*ewcNR); snew(buf_all, ewcNR*ewcNR); for (i = 0; i < ewcNR*ewcNR; i++) { cyc_all[i] = wc->wcc_all[i].c; } MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim); for (i = 0; i < ewcNR*ewcNR; i++) { wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]); } sfree(buf_all); sfree(cyc_all); } } else #endif { for (i = 0; i < nsum; i++) { wc->cycles_sum[i] = cycles[i]; } } }