Code example #1
File: wallcycle.cpp  Project: MelroLeandro/gromacs
/* TODO Make an object for this function to return, containing some
 * vectors of something like wallcc_t for the summed wcc, wcc_all and
 * wcsc, AND the original wcc for rank 0.
 *
 * The GPU timing is reported only for rank 0, so we want to preserve
 * the original wcycle on that rank. Rank 0 also reports the global
 * counts before that, so needs something to contain the global data
 * without over-writing the rank-0 data. The current implementation
 * uses cycles_sum to manage this, which works OK now because wcsc and
 * wcc_all are unused by the GPU reporting, but it is not satisfactory
 * for the future. Also, there's no need for MPI_Allreduce, since
 * only MASTERRANK uses any of the results. */
WallcycleCounts wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
{
    WallcycleCounts cycles_sum;
    wallcc_t       *wcc;
    double          cycles[ewcNR+ewcsNR];
#if GMX_MPI
    double          cycles_n[ewcNR+ewcsNR+1];
#endif
    int             i;
    int             nsum;

    if (wc == NULL)
    {
        /* Default construction of std::array of non-class T can leave
           the values indeterminate, just like a C array, and icc
           warns about it. */
        cycles_sum.fill(0);
        return cycles_sum;
    }

    wcc = wc->wcc;

    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);

    subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);

    if (cr->npmenodes == 0)
    {
        /* All nodes do PME (or no PME at all) */
        subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
    }
    else
    {
        /* There are PME-only nodes */
        if (wcc[ewcPMEMESH].n > 0)
        {
            /* This must be a PME only node, calculate the Wait + Comm. time */
            GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c, "Total run ticks must be at least as large as the PME-only ticks");
            wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
        }
    }

    /* Store the cycles in a double buffer for summing */
    for (i = 0; i < ewcNR; i++)
    {
#if GMX_MPI
        cycles_n[i] = static_cast<double>(wcc[i].n);
#endif
        cycles[i]   = static_cast<double>(wcc[i].c);
    }
    nsum = ewcNR;
    if (wc->wcsc)
    {
        for (i = 0; i < ewcsNR; i++)
        {
#if GMX_MPI
            cycles_n[ewcNR+i] = static_cast<double>(wc->wcsc[i].n);
#endif
            cycles[ewcNR+i]   = static_cast<double>(wc->wcsc[i].c);
        }
        nsum += ewcsNR;
    }

#if GMX_MPI
    if (cr->nnodes > 1)
    {
        double buf[ewcNR+ewcsNR+1];

        // TODO this code is used only at the end of the run, so we
        // can just do a simple reduce of haveInvalidCount in
        // wallcycle_print, and avoid bugs
        cycles_n[nsum] = (wc->haveInvalidCount > 0 ? 1 : 0);
        // TODO Use MPI_Reduce
        MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX,
                      cr->mpi_comm_mysim);
        for (i = 0; i < ewcNR; i++)
        {
            wcc[i].n = static_cast<int>(buf[i] + 0.5);
        }
        wc->haveInvalidCount = (buf[nsum] > 0);
        if (wc->wcsc)
        {
            for (i = 0; i < ewcsNR; i++)
            {
                wc->wcsc[i].n = static_cast<int>(buf[ewcNR+i] + 0.5);
            }
        }

        // TODO Use MPI_Reduce
        MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);

        if (wc->wcc_all != NULL)
        {
            double *buf_all, *cyc_all;

            snew(cyc_all, ewcNR*ewcNR);
            snew(buf_all, ewcNR*ewcNR);
            for (i = 0; i < ewcNR*ewcNR; i++)
            {
                cyc_all[i] = wc->wcc_all[i].c;
            }
            // TODO Use MPI_Reduce
            MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM,
                          cr->mpi_comm_mysim);
            for (i = 0; i < ewcNR*ewcNR; i++)
            {
                wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
            }
            sfree(buf_all);
            sfree(cyc_all);
        }
    }
    else
#endif
    {
        for (i = 0; i < nsum; i++)
        {
            cycles_sum[i] = cycles[i];
        }
    }

    return cycles_sum;
}
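
The TODO comment at the top of example #1 sketches a refactoring: return an object that bundles the summed main counters, the summed wcc_all matrix and the summed sub-counters together with rank 0's original counters, and replace the MPI_Allreduce calls with MPI_Reduce, since only the master rank consumes the results. Below is a minimal C++ sketch of what that could look like; the names (WallcycleSums, reduceCyclesToMaster) and the stubbed wallcc_t are hypothetical and are not the GROMACS API.

// Minimal sketch, assuming hypothetical names; wallcc_t is stubbed so the
// snippet is self-contained. This is not the actual GROMACS implementation.
#include <mpi.h>

#include <vector>

namespace sketch
{

// Stand-in for GROMACS' wallcc_t: a call count and an accumulated cycle count.
struct wallcc_t
{
    int    n = 0;
    double c = 0; // double here for simplicity; GROMACS uses gmx_cycles_t
};

// Hypothetical return object from the TODO: summed counters over all ranks,
// plus rank 0's unsummed counters so GPU reporting can keep using them.
struct WallcycleSums
{
    std::vector<wallcc_t> wcc;      // summed main counters
    std::vector<wallcc_t> wccAll;   // summed ewcNR*ewcNR counter matrix
    std::vector<wallcc_t> wcsc;     // summed sub-counters
    std::vector<wallcc_t> rank0Wcc; // rank 0's original, unsummed counters
};

// Hypothetical helper: sum cycle counts onto the master rank only, instead
// of MPI_Allreduce, since only the master rank prints the report.
inline void reduceCyclesToMaster(const std::vector<double> &localCycles,
                                 std::vector<double>       *summedCycles,
                                 MPI_Comm                   comm)
{
    summedCycles->assign(localCycles.size(), 0.0);
    // The receive buffer is only significant on the root rank.
    MPI_Reduce(localCycles.data(), summedCycles->data(),
               static_cast<int>(localCycles.size()),
               MPI_DOUBLE, MPI_SUM, 0, comm);
}

} // namespace sketch

With such a helper, rank 0 would fill WallcycleSums from the reduced buffers while keeping its own counters in rank0Wcc, which is what the comment means by not over-writing the rank-0 data.
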
Code example #2
File: wallcycle.c  Project: JehandadKhan/gromacs
void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
{
    wallcc_t *wcc;
    double    cycles[ewcNR+ewcsNR];
    double    cycles_n[ewcNR+ewcsNR], buf[ewcNR+ewcsNR], *cyc_all, *buf_all;
    int       i, j;
    int       nsum;

    if (wc == NULL)
    {
        return;
    }

    snew(wc->cycles_sum, ewcNR+ewcsNR);

    wcc = wc->wcc;

    /* The GPU wait estimate counter is used for load balancing only
     * and will mess up the total due to double counting: clear it.
     */
    wcc[ewcWAIT_GPU_NB_L_EST].n = 0;
    wcc[ewcWAIT_GPU_NB_L_EST].c = 0;

    for (i = 0; i < ewcNR; i++)
    {
        if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME))
        {
            wcc[i].c *= wc->nthreads_pme;

            if (wc->wcc_all)
            {
                for (j = 0; j < ewcNR; j++)
                {
                    wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pme;
                }
            }
        }
        else
        {
            wcc[i].c *= wc->nthreads_pp;

            if (wc->wcc_all)
            {
                for (j = 0; j < ewcNR; j++)
                {
                    wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pp;
                }
            }
        }
    }

    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);

    subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);

    if (cr->npmenodes == 0)
    {
        /* All nodes do PME (or no PME at all) */
        subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
    }
    else
    {
        /* There are PME-only nodes */
        if (wcc[ewcPMEMESH].n > 0)
        {
            /* This must be a PME only node, calculate the Wait + Comm. time */
            assert(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c);
            wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
        }
    }

    /* Store the cycles in a double buffer for summing */
    for (i = 0; i < ewcNR; i++)
    {
        cycles_n[i] = (double)wcc[i].n;
        cycles[i]   = (double)wcc[i].c;
    }
    nsum = ewcNR;
#ifdef GMX_CYCLE_SUBCOUNTERS
    for (i = 0; i < ewcsNR; i++)
    {
        wc->wcsc[i].c    *= wc->nthreads_pp;
        cycles_n[ewcNR+i] = (double)wc->wcsc[i].n;
        cycles[ewcNR+i]   = (double)wc->wcsc[i].c;
    }
    nsum += ewcsNR;
#endif

#ifdef GMX_MPI
    if (cr->nnodes > 1)
    {
        MPI_Allreduce(cycles_n, buf, nsum, MPI_DOUBLE, MPI_MAX,
                      cr->mpi_comm_mysim);
        for (i = 0; i < ewcNR; i++)
        {
            wcc[i].n = (int)(buf[i] + 0.5);
        }
#ifdef GMX_CYCLE_SUBCOUNTERS
        for (i = 0; i < ewcsNR; i++)
        {
            wc->wcsc[i].n = (int)(buf[ewcNR+i] + 0.5);
        }
#endif

        MPI_Allreduce(cycles, wc->cycles_sum, nsum, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);

        if (wc->wcc_all != NULL)
        {
            snew(cyc_all, ewcNR*ewcNR);
            snew(buf_all, ewcNR*ewcNR);
            for (i = 0; i < ewcNR*ewcNR; i++)
            {
                cyc_all[i] = wc->wcc_all[i].c;
            }
            MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM,
                          cr->mpi_comm_mysim);
            for (i = 0; i < ewcNR*ewcNR; i++)
            {
                wc->wcc_all[i].c = buf_all[i];
            }
            sfree(buf_all);
            sfree(cyc_all);
        }
    }
    else
#endif
    {
        for (i = 0; i < nsum; i++)
        {
            wc->cycles_sum[i] = cycles[i];
        }
    }
}
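
Example #2 additionally multiplies each counter by the number of OpenMP threads before the MPI reduction: PME counters (and the RUN counter on PME-only ranks) are scaled by nthreads_pme, everything else by nthreads_pp, so that the MPI_SUM over ranks approximates total cycles over all threads rather than only each rank's main thread. A minimal sketch of that scaling step follows, with hypothetical names and a stubbed predicate in place of is_pme_counter().

// Minimal sketch of the per-rank thread scaling in example #2. The names
// (scaleByThreadCount, isPmeCounter) are hypothetical, not the GROMACS API,
// and the special case for the RUN counter on PME-only ranks is omitted.
#include <cstddef>

#include <vector>

struct Counter
{
    double cycles = 0; // cycles accumulated on the rank's main thread
};

// Placeholder for is_pme_counter(i): pretend counter 0 is a PME counter.
inline bool isPmeCounter(std::size_t index)
{
    return index == 0;
}

// Scale each counter by the thread count that worked under it, so a later
// MPI_SUM over ranks yields approximate total thread-cycles.
inline void scaleByThreadCount(std::vector<Counter> *counters,
                               int nthreadsPme, int nthreadsPp)
{
    for (std::size_t i = 0; i < counters->size(); ++i)
    {
        (*counters)[i].cycles *= isPmeCounter(i) ? nthreadsPme : nthreadsPp;
    }
}
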