/* TODO Make an object for this function to return, containing some
 * vectors of something like wallcc_t for the summed wcc, wcc_all and
 * wcsc, AND the original wcc for rank 0.
 *
 * The GPU timing is reported only for rank 0, so we want to preserve
 * the original wcycle on that rank. Rank 0 also reports the global
 * counts before that, so needs something to contain the global data
 * without over-writing the rank-0 data. The current implementation
 * uses cycles_sum to manage this, which works OK now because wcsc and
 * wcc_all are unused by the GPU reporting, but it is not satisfactory
 * for the future. Also, there's no need for MPI_Allreduce, since
 * only MASTERRANK uses any of the results. */
/* Sums the main cycle counters over all MPI ranks and returns the
 * per-counter totals as doubles. Also (over GMX_MPI) reduces the
 * per-counter call counts with MPI_MAX, the pairwise wcc_all matrix
 * with MPI_SUM, and propagates the haveInvalidCount flag to all
 * ranks. The per-rank wcc counters are first adjusted so that nested
 * counters are not double-counted. */
WallcycleCounts wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
{
    WallcycleCounts cycles_sum;
    wallcc_t       *wcc;
    /* Cycle counts converted to double so one MPI_SUM reduction
     * covers main counters and (optionally) subcounters. */
    double          cycles[ewcNR+ewcsNR];
#if GMX_MPI
    /* Call counts plus one extra slot for the haveInvalidCount flag,
     * reduced together with a single MPI_MAX. */
    double          cycles_n[ewcNR+ewcsNR+1];
#endif
    int             i;
    int             nsum;

    if (wc == NULL)
    {
        /* Default construction of std::array of non-class T can leave
           the values indeterminate, just like a C array, and icc
           warns about it. */
        cycles_sum.fill(0);
        return cycles_sum;
    }

    wcc = wc->wcc;

    /* Remove nested-counter contributions so parent counters report
     * only their own time (avoids double counting in the report). */
    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);

    subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);

    if (cr->npmenodes == 0)
    {
        /* All nodes do PME (or no PME at all) */
        subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
    }
    else
    {
        /* There are PME-only nodes */
        if (wcc[ewcPMEMESH].n > 0)
        {
            /* This must be a PME only node, calculate the Wait + Comm. time */
            GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c, "Total run ticks must be greater than PME-only ticks");
            /* Attribute all non-PME run time on this rank to waiting/
             * communicating with the PP ranks. */
            wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
        }
    }

    /* Store the cycles in a double buffer for summing */
    for (i = 0; i < ewcNR; i++)
    {
#if GMX_MPI
        cycles_n[i] = static_cast<double>(wcc[i].n);
#endif
        cycles[i]   = static_cast<double>(wcc[i].c);
    }
    nsum = ewcNR;
    /* Subcounters are optional; only include them when allocated. */
    if (wc->wcsc)
    {
        for (i = 0; i < ewcsNR; i++)
        {
#if GMX_MPI
            cycles_n[ewcNR+i] = static_cast<double>(wc->wcsc[i].n);
#endif
            cycles[ewcNR+i]   = static_cast<double>(wc->wcsc[i].c);
        }
        nsum += ewcsNR;
    }

#if GMX_MPI
    if (cr->nnodes > 1)
    {
        double buf[ewcNR+ewcsNR+1];

        // TODO this code is used only at the end of the run, so we
        // can just do a simple reduce of haveInvalidCount in
        // wallcycle_print, and avoid bugs
        cycles_n[nsum] = (wc->haveInvalidCount > 0 ? 1 : 0);
        // TODO Use MPI_Reduce
        /* MPI_MAX over call counts: ranks can differ (e.g. load
         * balancing), report the maximum observed count. */
        MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX,
                      cr->mpi_comm_mysim);
        for (i = 0; i < ewcNR; i++)
        {
            /* +0.5 rounds the reduced double back to the nearest int */
            wcc[i].n = static_cast<int>(buf[i] + 0.5);
        }
        wc->haveInvalidCount = (buf[nsum] > 0);
        if (wc->wcsc)
        {
            for (i = 0; i < ewcsNR; i++)
            {
                wc->wcsc[i].n = static_cast<int>(buf[ewcNR+i] + 0.5);
            }
        }

        // TODO Use MPI_Reduce
        /* MPI_SUM over cycle counts gives the global totals returned
         * to the caller. */
        MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);

        if (wc->wcc_all != NULL)
        {
            double *buf_all, *cyc_all;

            /* Reduce the full ewcNR x ewcNR pairwise-overlap matrix. */
            snew(cyc_all, ewcNR*ewcNR);
            snew(buf_all, ewcNR*ewcNR);
            for (i = 0; i < ewcNR*ewcNR; i++)
            {
                cyc_all[i] = wc->wcc_all[i].c;
            }
            // TODO Use MPI_Reduce
            MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM,
                          cr->mpi_comm_mysim);
            for (i = 0; i < ewcNR*ewcNR; i++)
            {
                wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
            }
            sfree(buf_all);
            sfree(cyc_all);
        }
    }
    else
#endif
    {
        /* Serial run (or MPI disabled): the local cycles are already
         * the global totals. */
        for (i = 0; i < nsum; i++)
        {
            cycles_sum[i] = cycles[i];
        }
    }

    return cycles_sum;
}
void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc) { wallcc_t *wcc; double cycles[ewcNR+ewcsNR]; double cycles_n[ewcNR+ewcsNR], buf[ewcNR+ewcsNR], *cyc_all, *buf_all; int i, j; int nsum; if (wc == NULL) { return; } snew(wc->cycles_sum, ewcNR+ewcsNR); wcc = wc->wcc; /* The GPU wait estimate counter is used for load balancing only * and will mess up the total due to double counting: clear it. */ wcc[ewcWAIT_GPU_NB_L_EST].n = 0; wcc[ewcWAIT_GPU_NB_L_EST].c = 0; for (i = 0; i < ewcNR; i++) { if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME)) { wcc[i].c *= wc->nthreads_pme; if (wc->wcc_all) { for (j = 0; j < ewcNR; j++) { wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pme; } } } else { wcc[i].c *= wc->nthreads_pp; if (wc->wcc_all) { for (j = 0; j < ewcNR; j++) { wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pp; } } } } subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD); subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND); subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM); if (cr->npmenodes == 0) { /* All nodes do PME (or no PME at all) */ subtract_cycles(wcc, ewcFORCE, ewcPMEMESH); } else { /* The are PME-only nodes */ if (wcc[ewcPMEMESH].n > 0) { /* This must be a PME only node, calculate the Wait + Comm. 
time */ assert(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c); wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c; } } /* Store the cycles in a double buffer for summing */ for (i = 0; i < ewcNR; i++) { cycles_n[i] = (double)wcc[i].n; cycles[i] = (double)wcc[i].c; } nsum = ewcNR; #ifdef GMX_CYCLE_SUBCOUNTERS for (i = 0; i < ewcsNR; i++) { wc->wcsc[i].c *= wc->nthreads_pp; cycles_n[ewcNR+i] = (double)wc->wcsc[i].n; cycles[ewcNR+i] = (double)wc->wcsc[i].c; } nsum += ewcsNR; #endif #ifdef GMX_MPI if (cr->nnodes > 1) { MPI_Allreduce(cycles_n, buf, nsum, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim); for (i = 0; i < ewcNR; i++) { wcc[i].n = (int)(buf[i] + 0.5); } #ifdef GMX_CYCLE_SUBCOUNTERS for (i = 0; i < ewcsNR; i++) { wc->wcsc[i].n = (int)(buf[ewcNR+i] + 0.5); } #endif MPI_Allreduce(cycles, wc->cycles_sum, nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim); if (wc->wcc_all != NULL) { snew(cyc_all, ewcNR*ewcNR); snew(buf_all, ewcNR*ewcNR); for (i = 0; i < ewcNR*ewcNR; i++) { cyc_all[i] = wc->wcc_all[i].c; } MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim); for (i = 0; i < ewcNR*ewcNR; i++) { wc->wcc_all[i].c = buf_all[i]; } sfree(buf_all); sfree(cyc_all); } } else #endif { for (i = 0; i < nsum; i++) { wc->cycles_sum[i] = cycles[i]; } } }