void thread3_proc()
{
    int cycles = 0;

    while (!sufficient_time())
    {
        t1();
        t3();
        ++cycles;
    }
    print_cycles(cycles);
}
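/* thread3_proc() above relies on helpers that are not shown in this
 * excerpt. A minimal, self-contained sketch of plausible stand-ins
 * follows so the loop can be compiled and run; sufficient_time(), t1(),
 * t3() and this one-argument print_cycles() are hypothetical stubs,
 * unrelated to the GROMACS print_cycles() row printer used further
 * below.
 */
#include <stdio.h>
#include <time.h>

static time_t       start_time;           /* set before the thread starts */
static const double budget_seconds = 1.0; /* illustrative time budget     */

static int sufficient_time(void)
{
    /* True once the wall-clock budget has elapsed. */
    return difftime(time(NULL), start_time) >= budget_seconds;
}

static void t1(void) { /* first timed task (placeholder) */ }
static void t3(void) { /* second timed task (placeholder) */ }

static void print_cycles(int cycles)
{
    printf("completed %d loop iterations\n", cycles);
}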
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
                     gmx_wallcycle_t wc, double cycles[])
{
    double      c2t, tot, sum;
    int         i, j, npp;
    char        buf[STRLEN];
    const char *myline = "-----------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    if (npme > 0)
    {
        npp = nnodes - npme;
    }
    else
    {
        npp  = nnodes;
        npme = nnodes;
    }
    tot = cycles[ewcRUN];
    /* Conversion factor from cycles to seconds */
    if (tot > 0)
    {
        c2t = nnodes*realtime/tot;
    }
    else
    {
        c2t = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    fprintf(fplog, " Computing:         Nodes     Number     G-Cycles    Seconds     %c\n", '%');
    fprintf(fplog, "%s\n", myline);
    sum = 0;
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (!subdivision(i))
        {
            print_cycles(fplog, c2t, wcn[i],
                         (i == ewcPMEMESH || i == ewcPMEWAITCOMM) ? npme : npp,
                         wc->wcc[i].n, cycles[i], tot);
            sum += cycles[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                sprintf(buf, "%-9s", wcn[i]);
                buf[9] = ' ';
                sprintf(buf+10, "%-9s", wcn[j]);
                buf[19] = '\0';
                print_cycles(fplog, c2t, buf,
                             (i == ewcPMEMESH || i == ewcPMEWAITCOMM) ? npme : npp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    print_cycles(fplog, c2t, "Rest", npp, 0, tot-sum, tot);
    fprintf(fplog, "%s\n", myline);
    print_cycles(fplog, c2t, "Total", nnodes, 0, tot, tot);
    fprintf(fplog, "%s\n", myline);

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, "%s\n", myline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (subdivision(i))
            {
                /* Note: the range test below needs &&; with || it would
                 * be true for every counter. */
                print_cycles(fplog, c2t, wcn[i],
                             (i >= ewcPMEMESH && i <= ewcPME_SOLVE) ? npme : npp,
                             wc->wcc[i].n, cycles[i], tot);
            }
        }
        fprintf(fplog, "%s\n", myline);
    }

    if (cycles[ewcMoveE] > tot*0.05)
    {
        sprintf(buf,
                "NOTE: %d %% of the run time was spent communicating energies,\n"
                "      you might want to use the -gcom option of mdrun\n",
                (int)(100*cycles[ewcMoveE]/tot + 0.5));
        if (fplog)
        {
            fprintf(fplog, "\n%s\n", buf);
        }
        /* Only the sim master calls this function, so always print to stderr */
        fprintf(stderr, "\n%s\n", buf);
    }
}
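/* print_cycles() is used above but not defined in this excerpt. Below is
 * a minimal sketch of a row printer matching the older call sites above
 * (name, node count, call count, cycle count, total cycles), under the
 * assumption that c2t converts cycles to aggregate seconds across nodes;
 * the real GROMACS routine differs in formatting details.
 */
static void print_cycles(FILE *fplog, double c2t, const char *name,
                         int nnodes, int n, double c, double tot)
{
    if (c <= 0)
    {
        return; /* nothing recorded for this counter */
    }
    fprintf(fplog, " %-19s %5d %10d %12.3f %10.1f   %5.1f\n",
            name, nnodes, n, c*1e-9, c*c2t, 100.0*c/tot);
}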
void wallcycle_print(FILE *fplog, int nnodes, int npme,
                     int nth_pp, int nth_pme, double realtime,
                     gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum,
                     struct gmx_wallclock_gpu_t *gpu_t)
{
    double      tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k;
    double      c2t, c2t_pp, c2t_pme = 0;
    int         i, j, npp, nth_tot;
    char        buf[STRLEN];
    const char *hline = "-----------------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
    GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
    GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
    GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
    npp = nnodes - npme;
    /* npme is the number of PME-only ranks used, and we always do PP work */
    GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");

    nth_tot = npp*nth_pp + npme*nth_pme;

    /* When using PME-only nodes, the next line is valid for both
       PP-only and PME-only nodes because they started ewcRUN at the
       same time. */
    tot        = cyc_sum[ewcRUN];
    tot_for_pp = 0;

    if (tot <= 0.0)
    {
        /* TODO This is heavy handed, but until someone reworks the
           code so that it is provably robust with respect to
           non-positive values for all possible timer and cycle
           counters, there is less value gained from printing whatever
           timing data might still be sensible for some non-Jenkins
           run, than is lost from diagnosing Jenkins FP exceptions on
           runs about whose execution time we don't care. */
        md_print_warn(NULL, fplog,
                      "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a time accounting\n",
                      tot);
        return;
    }

    if (wc->haveInvalidCount)
    {
        md_print_warn(NULL, fplog, "%s\n",
                      "NOTE: Detected invalid cycle counts, probably because threads moved between CPU cores that do not have synchronized cycle counters. Will not print the cycle accounting.");
        return;
    }

    /* Conversion factor from cycles to seconds */
    c2t    = realtime/tot;
    c2t_pp = c2t * nth_tot / static_cast<double>(npp*nth_pp);
    if (npme > 0)
    {
        c2t_pme = c2t * nth_tot / static_cast<double>(npme*nth_pme);
    }
    else
    {
        c2t_pme = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    print_header(fplog, npp, nth_pp, npme, nth_pme);

    fprintf(fplog, "%s\n", hline);
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (is_pme_subcounter(i))
        {
            /* Do not count these at all */
        }
        else if (npme > 0 && is_pme_counter(i))
        {
            /* Print timing information for PME-only nodes, but add an
             * asterisk so the reader of the table can know that the
             * walltimes are not meant to add up. The asterisk still
             * fits in the required maximum of 19 characters. */
            char buffer[STRLEN];
            snprintf(buffer, STRLEN, "%s *", wcn[i]);
            print_cycles(fplog, c2t_pme, buffer,
                         npme, nth_pme,
                         wc->wcc[i].n, cyc_sum[i], tot);
        }
        else
        {
            /* Print timing information when it is for a PP or PP+PME
               node */
            print_cycles(fplog, c2t_pp, wcn[i],
                         npp, nth_pp,
                         wc->wcc[i].n, cyc_sum[i], tot);
            tot_for_pp += cyc_sum[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
                print_cycles(fplog, c2t_pp, buf,
                             npp, nth_pp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
    print_cycles(fplog, c2t_pp, "Rest",
                 npp, nth_pp,
                 -1, tot_for_rest - tot_for_pp, tot);
    fprintf(fplog, "%s\n", hline);
    print_cycles(fplog, c2t, "Total",
                 npp, nth_pp,
                 -1, tot, tot);
    fprintf(fplog, "%s\n", hline);

    if (npme > 0)
    {
        fprintf(fplog,
                "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
                "    twice the total reported, but the cycle count total and %% are correct.\n"
                "%s\n", hline);
    }

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, " Breakdown of PME mesh computation\n");
        fprintf(fplog, "%s\n", hline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (is_pme_subcounter(i))
            {
                print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i],
                             npme > 0 ? npme : npp, nth_pme,
                             wc->wcc[i].n, cyc_sum[i], tot);
            }
        }
        fprintf(fplog, "%s\n", hline);
    }

    if (useCycleSubcounters && wc->wcsc)
    {
        fprintf(fplog, " Breakdown of PP computation\n");
        fprintf(fplog, "%s\n", hline);
        for (i = 0; i < ewcsNR; i++)
        {
            print_cycles(fplog, c2t_pp, wcsn[i],
                         npp, nth_pp,
                         wc->wcsc[i].n, cyc_sum[ewcNR+i], tot);
        }
        fprintf(fplog, "%s\n", hline);
    }

    /* print GPU timing summary */
    if (gpu_t)
    {
        const char *k_log_str[2][2] = {
            {"Nonbonded F kernel", "Nonbonded F+ene k."},
            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
        };

        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;

        /* add up the kernel timings */
        tot_k = 0.0;
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                tot_k += gpu_t->ktime[i][j].t;
            }
        }
        tot_gpu += tot_k;

        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
        }
        tot_cpu_overlap *= realtime*1000/tot; /* convert s to ms */

        fprintf(fplog, "\n GPU timings\n%s\n", hline);
        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Pair list H2D",
                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
        print_gputimes(fplog, "X / q H2D",
                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (gpu_t->ktime[i][j].c)
                {
                    print_gputimes(fplog, k_log_str[i][j],
                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
                }
            }
        }

        print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
        fprintf(fplog, "%s\n", hline);

        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
        if (gpu_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
        {
            fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
                    tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
                    gpu_cpu_ratio);
        }

        /* only print notes related to CPU-GPU load balance with PME */
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");
            /* print note if the imbalance is high with PME case in which
             * CPU-GPU load balancing is possible */
            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
            {
                /* Only the sim master calls this function, so always print to stderr */
                if (gpu_cpu_ratio < 0.75)
                {
                    if (npp > 1)
                    {
                        /* The user could have used -notunepme,
                         * but we currently can't check that here. */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
                    }
                    else
                    {
                        /* We should not end up here, unless the box is
                         * too small for increasing the cut-off for PME tuning. */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss.");
                    }
                }
                if (gpu_cpu_ratio > 1.2)
                {
                    md_print_warn(NULL, fplog,
                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
                }
            }
        }
    }

    if (wc->wc_barrier)
    {
        md_print_warn(NULL, fplog,
                      "MPI_Barrier was called before each cycle start/stop\n"
                      "call, so timings are not those of real runs.\n");
    }

    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
        (cyc_sum[ewcDOMDEC] > tot*0.1 ||
         cyc_sum[ewcNS] > tot*0.1))
    {
        /* Only the sim master calls this function, so always print to stderr */
        if (wc->wcc[ewcDOMDEC].n == 0)
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
        else
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
                          "      %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcDOMDEC]/tot+0.5),
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
    }

    if (cyc_sum[ewcMoveE] > tot*0.05)
    {
        /* Only the sim master calls this function, so always print to stderr */
        md_print_warn(NULL, fplog,
                      "NOTE: %d %% of the run time was spent communicating energies,\n"
                      "      you might want to use the -gcom option of mdrun\n",
                      (int)(100*cyc_sum[ewcMoveE]/tot+0.5));
    }
}
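/* A worked example of the conversion factors computed in the version
 * above, with illustrative numbers only: 6 ranks of which 2 are
 * PME-only, 4 PP threads per PP rank and 2 PME threads per PME rank.
 * tot is the cycle count summed over all threads, so the global factor
 * c2t is rescaled by the share of threads each rank class owns to
 * recover walltime for that class alone. This is a standalone sketch,
 * not GROMACS code.
 */
#include <stdio.h>

int main(void)
{
    int    nnodes   = 6, npme = 2, nth_pp = 4, nth_pme = 2;
    int    npp      = nnodes - npme;             /* 4 PP ranks       */
    int    nth_tot  = npp*nth_pp + npme*nth_pme; /* 20 threads total */
    double realtime = 100.0;                     /* seconds          */
    double tot      = 2.0e12;                    /* summed cycles    */

    double c2t     = realtime/tot;
    double c2t_pp  = c2t * nth_tot / (double)(npp*nth_pp);
    double c2t_pme = c2t * nth_tot / (double)(npme*nth_pme);

    printf("c2t=%g  c2t_pp=%g  c2t_pme=%g\n", c2t, c2t_pp, c2t_pme);
    return 0;
}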
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
                     gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
{
    double     *cyc_sum;
    double      tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k;
    double      c2t, c2t_pp, c2t_pme;
    int         i, j, npp, nth_pp, nth_pme, nth_tot;
    char        buf[STRLEN];
    const char *hline = "-----------------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    nth_pp  = wc->nthreads_pp;
    nth_pme = wc->nthreads_pme;

    cyc_sum = wc->cycles_sum;

    npp     = nnodes - npme;
    nth_tot = npp*nth_pp + npme*nth_pme;

    /* When using PME-only nodes, the next line is valid for both
       PP-only and PME-only nodes because they started ewcRUN at the
       same time. */
    tot        = cyc_sum[ewcRUN];
    tot_for_pp = 0;

    /* Conversion factor from cycles to seconds */
    if (tot > 0)
    {
        c2t     = realtime/tot;
        c2t_pp  = c2t * nth_tot / (double) (npp*nth_pp);
        c2t_pme = c2t * nth_tot / (double) (npme*nth_pme);
    }
    else
    {
        c2t     = 0;
        c2t_pp  = 0;
        c2t_pme = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    print_header(fplog, npp, nth_pp, npme, nth_pme);

    fprintf(fplog, "%s\n", hline);
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (is_pme_subcounter(i))
        {
            /* Do not count these at all */
        }
        else if (npme > 0 && is_pme_counter(i))
        {
            /* Print timing information for PME-only nodes, but add an
             * asterisk so the reader of the table can know that the
             * walltimes are not meant to add up. The asterisk still
             * fits in the required maximum of 19 characters. */
            char buffer[STRLEN];
            snprintf(buffer, STRLEN, "%s *", wcn[i]);
            print_cycles(fplog, c2t_pme, buffer,
                         npme, nth_pme,
                         wc->wcc[i].n, cyc_sum[i], tot);
        }
        else
        {
            /* Print timing information when it is for a PP or PP+PME
               node */
            print_cycles(fplog, c2t_pp, wcn[i],
                         npp, nth_pp,
                         wc->wcc[i].n, cyc_sum[i], tot);
            tot_for_pp += cyc_sum[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
                print_cycles(fplog, c2t_pp, buf,
                             npp, nth_pp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    tot_for_rest = tot * (npp * nth_pp) / (double) nth_tot;
    print_cycles(fplog, c2t_pp, "Rest",
                 npp, nth_pp,
                 -1, tot_for_rest - tot_for_pp, tot);
    fprintf(fplog, "%s\n", hline);
    print_cycles(fplog, c2t, "Total",
                 npp, nth_pp,
                 -1, tot, tot);
    fprintf(fplog, "%s\n", hline);

    if (npme > 0)
    {
        fprintf(fplog,
                "(*) Note that with separate PME nodes, the walltime column actually sums to\n"
                "    twice the total reported, but the cycle count total and %% are correct.\n"
                "%s\n", hline);
    }

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, " Breakdown of PME mesh computation\n");
        fprintf(fplog, "%s\n", hline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (is_pme_subcounter(i))
            {
                print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i],
                             npme > 0 ? npme : npp, nth_pme,
                             wc->wcc[i].n, cyc_sum[i], tot);
            }
        }
        fprintf(fplog, "%s\n", hline);
    }

#ifdef GMX_CYCLE_SUBCOUNTERS
    fprintf(fplog, " Breakdown of PP computation\n");
    fprintf(fplog, "%s\n", hline);
    for (i = 0; i < ewcsNR; i++)
    {
        print_cycles(fplog, c2t_pp, wcsn[i],
                     npp, nth_pp,
                     wc->wcsc[i].n, cyc_sum[ewcNR+i], tot);
    }
    fprintf(fplog, "%s\n", hline);
#endif

    /* print GPU timing summary */
    if (gpu_t)
    {
        const char *k_log_str[2][2] = {
            {"Nonbonded F kernel", "Nonbonded F+ene k."},
            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
        };

        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;

        /* add up the kernel timings */
        tot_k = 0.0;
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                tot_k += gpu_t->ktime[i][j].t;
            }
        }
        tot_gpu += tot_k;

        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
        }
        tot_cpu_overlap *= realtime*1000/tot; /* convert s to ms */

        fprintf(fplog, "\n GPU timings\n%s\n", hline);
        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Pair list H2D",
                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
        print_gputimes(fplog, "X / q H2D",
                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (gpu_t->ktime[i][j].c)
                {
                    print_gputimes(fplog, k_log_str[i][j],
                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
                }
            }
        }

        print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
        fprintf(fplog, "%s\n", hline);

        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
        fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
                tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
                gpu_cpu_ratio);

        /* only print notes related to CPU-GPU load balance with PME */
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");
            /* print note if the imbalance is high with PME case in which
             * CPU-GPU load balancing is possible */
            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
            {
                /* Only the sim master calls this function, so always print to stderr */
                if (gpu_cpu_ratio < 0.75)
                {
                    if (npp > 1)
                    {
                        /* The user could have used -notunepme,
                         * but we currently can't check that here. */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
                    }
                    else
                    {
                        /* We should not end up here, unless the box is
                         * too small for increasing the cut-off for PME tuning. */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss.");
                    }
                }
                if (gpu_cpu_ratio > 1.2)
                {
                    md_print_warn(NULL, fplog,
                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
                }
            }
        }
    }

    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
        (cyc_sum[ewcDOMDEC] > tot*0.1 ||
         cyc_sum[ewcNS] > tot*0.1))
    {
        /* Only the sim master calls this function, so always print to stderr */
        if (wc->wcc[ewcDOMDEC].n == 0)
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
        else
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
                          "      %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcDOMDEC]/tot+0.5),
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
    }

    if (cyc_sum[ewcMoveE] > tot*0.05)
    {
        /* Only the sim master calls this function, so always print to stderr */
        md_print_warn(NULL, fplog,
                      "NOTE: %d %% of the run time was spent communicating energies,\n"
                      "      you might want to use the -gcom option of mdrun\n",
                      (int)(100*cyc_sum[ewcMoveE]/tot+0.5));
    }
}
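/* print_gputimes() is used by the GPU summaries above but not defined in
 * this excerpt. A minimal sketch matching the printed header (Count,
 * Wall t (s), ms/step, %), assuming t accumulates milliseconds over c
 * kernel launches or transfers; the actual GROMACS helper formats blank
 * fields differently for rows without a meaningful per-step average.
 */
static void print_gputimes(FILE *fplog, const char *name,
                           int c, double t, double tot_t)
{
    if (c > 0)
    {
        fprintf(fplog, " %-29s %10d %12.3f %12.3f   %5.1f\n",
                name, c, t/1000.0, t/c, 100.0*t/tot_t);
    }
    else
    {
        /* no event count recorded; print the wall time and share only */
        fprintf(fplog, " %-29s %10s %12.3f %12s   %5.1f\n",
                name, "", t/1000.0, "", 100.0*t/tot_t);
    }
}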
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
                     gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
{
    double     *cycles;
    double      c2t, tot, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, sum, tot_k;
    int         i, j, npp, nth_pp, nth_pme, nth_tot;
    char        buf[STRLEN];
    const char *hline = "-----------------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    nth_pp  = wc->nthreads_pp;
    nth_pme = wc->nthreads_pme;

    cycles = wc->cycles_sum;

    if (npme > 0)
    {
        npp     = nnodes - npme;
        nth_tot = npp*nth_pp + npme*nth_pme;
    }
    else
    {
        npp     = nnodes;
        npme    = nnodes;
        nth_tot = npp*nth_pp;
    }

    tot = cycles[ewcRUN];

    /* Conversion factor from cycles to seconds */
    if (tot > 0)
    {
        c2t = realtime/tot;
    }
    else
    {
        c2t = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    fprintf(fplog, " Computing:         Nodes   Th.     Count  Wall t (s)     G-Cycles       %c\n", '%');
    fprintf(fplog, "%s\n", hline);
    sum = 0;
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (!is_pme_subcounter(i))
        {
            print_cycles(fplog, c2t, wcn[i], nth_tot,
                         is_pme_counter(i) ? npme : npp,
                         is_pme_counter(i) ? nth_pme : nth_pp,
                         wc->wcc[i].n, cycles[i], tot);
            sum += cycles[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                /* Buffer sizes of 10 keep the full 9-character fields;
                 * with size 9, snprintf would truncate to 8 characters
                 * and leave a stray terminator at buf[8]. */
                snprintf(buf, 10, "%-9s", wcn[i]);
                buf[9] = ' ';
                snprintf(buf+10, 10, "%-9s", wcn[j]);
                buf[19] = '\0';
                print_cycles(fplog, c2t, buf, nth_tot,
                             is_pme_counter(i) ? npme : npp,
                             is_pme_counter(i) ? nth_pme : nth_pp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    print_cycles(fplog, c2t, "Rest", nth_tot, npp, -1, 0, tot-sum, tot);
    fprintf(fplog, "%s\n", hline);
    print_cycles(fplog, c2t, "Total", nth_tot, nnodes, -1, 0, tot, tot);
    fprintf(fplog, "%s\n", hline);

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, "%s\n", hline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (is_pme_subcounter(i))
            {
                print_cycles(fplog, c2t, wcn[i], nth_tot,
                             is_pme_counter(i) ? npme : npp,
                             is_pme_counter(i) ? nth_pme : nth_pp,
                             wc->wcc[i].n, cycles[i], tot);
            }
        }
        fprintf(fplog, "%s\n", hline);
    }

#ifdef GMX_CYCLE_SUBCOUNTERS
    fprintf(fplog, "%s\n", hline);
    for (i = 0; i < ewcsNR; i++)
    {
        print_cycles(fplog, c2t, wcsn[i], nth_tot,
                     npp, nth_pp,
                     wc->wcsc[i].n, cycles[ewcNR+i], tot);
    }
    fprintf(fplog, "%s\n", hline);
#endif

    /* print GPU timing summary */
    if (gpu_t)
    {
        const char *k_log_str[2][2] = {
            {"Nonbonded F kernel", "Nonbonded F+ene k."},
            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
        };

        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;

        /* add up the kernel timings */
        tot_k = 0.0;
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                tot_k += gpu_t->ktime[i][j].t;
            }
        }
        tot_gpu += tot_k;

        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
        }
        tot_cpu_overlap *= c2t * 1000; /* convert s to ms */

        fprintf(fplog, "\n GPU timings\n%s\n", hline);
        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Pair list H2D",
                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
        print_gputimes(fplog, "X / q H2D",
                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (gpu_t->ktime[i][j].c)
                {
                    print_gputimes(fplog, k_log_str[i][j],
                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
                }
            }
        }

        print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
        fprintf(fplog, "%s\n", hline);

        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
        fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
                tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
                gpu_cpu_ratio);

        /* only print notes related to CPU-GPU load balance with PME */
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");
            /* print note if the imbalance is high with PME case in which
             * CPU-GPU load balancing is possible */
            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
            {
                /* Only the sim master calls this function, so always print to stderr */
                if (gpu_cpu_ratio < 0.75)
                {
                    if (npp > 1)
                    {
                        /* The user could have used -notunepme,
                         * but we currently can't check that here. */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
                    }
                    else
                    {
                        /* We should not end up here, unless the box is
                         * too small for increasing the cut-off for PME tuning. */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss.");
                    }
                }
                if (gpu_cpu_ratio > 1.2)
                {
                    md_print_warn(NULL, fplog,
                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
                }
            }
        }
    }

    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
        (cycles[ewcDOMDEC] > tot*0.1 ||
         cycles[ewcNS] > tot*0.1))
    {
        /* Only the sim master calls this function, so always print to stderr */
        if (wc->wcc[ewcDOMDEC].n == 0)
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cycles[ewcNS]/tot+0.5));
        }
        else
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
                          "      %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cycles[ewcDOMDEC]/tot+0.5),
                          (int)(100*cycles[ewcNS]/tot+0.5));
        }
    }

    if (cycles[ewcMoveE] > tot*0.05)
    {
        /* Only the sim master calls this function, so always print to stderr */
        md_print_warn(NULL, fplog,
                      "NOTE: %d %% of the run time was spent communicating energies,\n"
                      "      you might want to use the -gcom option of mdrun\n",
                      (int)(100*cycles[ewcMoveE]/tot+0.5));
    }
}
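/* A sketch of how a driver might invoke the printer at the end of a run:
 * reduce the per-rank counters, then print once on the simulation
 * master. finish_timing() and its parameters are hypothetical; only the
 * wallcycle_print() signature is taken from the last version above, and
 * the cross-rank reduction step is deliberately omitted.
 */
static void finish_timing(FILE *fplog, int nnodes, int npme,
                          double elapsed_seconds, gmx_wallcycle_t wc,
                          wallclock_gpu_t *gpu_t, int is_master)
{
    /* Counters must already be summed across ranks before printing. */
    if (is_master)
    {
        wallcycle_print(fplog, nnodes, npme, elapsed_seconds, wc, gpu_t);
    }
}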