Example #1
0
void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
{
    if (wc == NULL)
    {
        return;
    }

    for (int i = 0; i < ewcNR; i++)
    {
        if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
        {
            wc->wcc[i].c *= nthreads_pme;

            if (wc->wcc_all)
            {
                for (int j = 0; j < ewcNR; j++)
                {
                    wc->wcc_all[i*ewcNR+j].c *= nthreads_pme;
                }
            }
        }
        else
        {
            wc->wcc[i].c *= nthreads_pp;

            if (wc->wcc_all)
            {
                for (int j = 0; j < ewcNR; j++)
                {
                    wc->wcc_all[i*ewcNR+j].c *= nthreads_pp;
                }
            }
        }
    }
    if (useCycleSubcounters && wc->wcsc && !isPmeRank)
    {
        for (int i = 0; i < ewcsNR; i++)
        {
            wc->wcsc[i].c *= nthreads_pp;
        }
    }
}
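
The scaling above turns cycles measured on one (master) thread per rank into an estimate of the cycles spent by all threads of that rank. A minimal standalone sketch of the same idea, using a hypothetical counter struct rather than the GROMACS wallcc_t type:

/* Hypothetical sketch: multiply per-thread cycle counts by the number of
 * threads that executed the timed region, as wallcycle_scale_by_num_threads
 * does for the PP and PME counters above. */
struct ThreadCounter
{
    double c;    /* cycles measured on one thread of the rank */
};

static void scale_by_threads(ThreadCounter *counters, int n, int nthreads)
{
    for (int i = 0; i < n; i++)
    {
        counters[i].c *= nthreads;    /* approximate the sum over all threads */
    }
}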
Example #2
0
void wallcycle_print(FILE *fplog, int nnodes, int npme,
                     int nth_pp, int nth_pme, double realtime,
                     gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum,
                     struct gmx_wallclock_gpu_t *gpu_t)
{
    double      tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k;
    double      c2t, c2t_pp, c2t_pme = 0;
    int         i, j, npp, nth_tot;
    char        buf[STRLEN];
    const char *hline = "-----------------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
    GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
    GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
    GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
    npp     = nnodes - npme;
    /* npme is the number of PME-only ranks used, and we always do PP work */
    GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");

    nth_tot = npp*nth_pp + npme*nth_pme;

    /* When using PME-only nodes, the next line is valid for both
       PP-only and PME-only nodes because they started ewcRUN at the
       same time. */
    tot        = cyc_sum[ewcRUN];
    tot_for_pp = 0;

    if (tot <= 0.0)
    {
        /* TODO This is heavy handed, but until someone reworks the
           code so that it is provably robust with respect to
           non-positive values for all possible timer and cycle
           counters, there is less value gained from printing whatever
           timing data might still be sensible for some non-Jenkins
           run, than is lost from diagnosing Jenkins FP exceptions on
           runs about whose execution time we don't care. */
        md_print_warn(NULL, fplog, "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a time accounting\n", tot);
        return;
    }

    if (wc->haveInvalidCount)
    {
        md_print_warn(NULL, fplog, "%s\n",
                      "NOTE: Detected invalid cycle counts, probably because threads moved between CPU cores that do not have synchronized cycle counters. Will not print the cycle accounting.");
        return;
    }


    /* Conversion factor from cycles to seconds */
    c2t     = realtime/tot;
    c2t_pp  = c2t * nth_tot / static_cast<double>(npp*nth_pp);
    if (npme > 0)
    {
        c2t_pme = c2t * nth_tot / static_cast<double>(npme*nth_pme);
    }
    else
    {
        c2t_pme = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    print_header(fplog, npp, nth_pp, npme, nth_pme);

    fprintf(fplog, "%s\n", hline);
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (is_pme_subcounter(i))
        {
            /* Do not count these at all */
        }
        else if (npme > 0 && is_pme_counter(i))
        {
            /* Print timing information for PME-only nodes, but add an
             * asterisk so the reader of the table can know that the
             * walltimes are not meant to add up. The asterisk still
             * fits in the required maximum of 19 characters. */
            char buffer[STRLEN];
            snprintf(buffer, STRLEN, "%s *", wcn[i]);
            print_cycles(fplog, c2t_pme, buffer,
                         npme, nth_pme,
                         wc->wcc[i].n, cyc_sum[i], tot);
        }
        else
        {
            /* Print timing information when it is for a PP or PP+PME
               node */
            print_cycles(fplog, c2t_pp, wcn[i],
                         npp, nth_pp,
                         wc->wcc[i].n, cyc_sum[i], tot);
            tot_for_pp += cyc_sum[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
                print_cycles(fplog, c2t_pp, buf,
                             npp, nth_pp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
    print_cycles(fplog, c2t_pp, "Rest",
                 npp, nth_pp,
                 -1, tot_for_rest - tot_for_pp, tot);
    fprintf(fplog, "%s\n", hline);
    print_cycles(fplog, c2t, "Total",
                 npp, nth_pp,
                 -1, tot, tot);
    fprintf(fplog, "%s\n", hline);

    if (npme > 0)
    {
        fprintf(fplog,
                "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
                "    twice the total reported, but the cycle count total and %% are correct.\n"
                "%s\n", hline);
    }

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, " Breakdown of PME mesh computation\n");
        fprintf(fplog, "%s\n", hline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (is_pme_subcounter(i))
            {
                print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i],
                             npme > 0 ? npme : npp, nth_pme,
                             wc->wcc[i].n, cyc_sum[i], tot);
            }
        }
        fprintf(fplog, "%s\n", hline);
    }

    if (useCycleSubcounters && wc->wcsc)
    {
        fprintf(fplog, " Breakdown of PP computation\n");
        fprintf(fplog, "%s\n", hline);
        for (i = 0; i < ewcsNR; i++)
        {
            print_cycles(fplog, c2t_pp, wcsn[i],
                         npp, nth_pp,
                         wc->wcsc[i].n, cyc_sum[ewcNR+i], tot);
        }
        fprintf(fplog, "%s\n", hline);
    }

    /* print GPU timing summary */
    if (gpu_t)
    {
        const char *k_log_str[2][2] = {
            {"Nonbonded F kernel", "Nonbonded F+ene k."},
            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
        };

        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;

        /* add up the kernel timings */
        tot_k = 0.0;
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                tot_k += gpu_t->ktime[i][j].t;
            }
        }
        tot_gpu += tot_k;

        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
        }
        tot_cpu_overlap *= realtime*1000/tot; /* convert cycles to ms */

        fprintf(fplog, "\n GPU timings\n%s\n", hline);
        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Pair list H2D",
                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
        print_gputimes(fplog, "X / q H2D",
                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (gpu_t->ktime[i][j].c)
                {
                    print_gputimes(fplog, k_log_str[i][j],
                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
                }
            }
        }

        print_gputimes(fplog, "F D2H",  gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
        fprintf(fplog, "%s\n", hline);

        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
        if (gpu_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
        {
            fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
                    tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
                    gpu_cpu_ratio);
        }

        /* only print notes related to CPU-GPU load balance with PME */
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");

            /* print note if the imbalance is high with PME case in which
             * CPU-GPU load balancing is possible */
            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
            {
                /* Only the sim master calls this function, so always print to stderr */
                if (gpu_cpu_ratio < 0.75)
                {
                    if (npp > 1)
                    {
                        /* The user could have used -notunepme,
                         * but we currently can't check that here.
                         */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
                    }
                    else
                    {
                        /* We should not end up here, unless the box is
                         * too small for increasing the cut-off for PME tuning.
                         */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss.");
                    }
                }
                if (gpu_cpu_ratio > 1.2)
                {
                    md_print_warn(NULL, fplog,
                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
                }
            }
        }
    }

    if (wc->wc_barrier)
    {
        md_print_warn(NULL, fplog,
                      "MPI_Barrier was called before each cycle start/stop\n"
                      "call, so timings are not those of real runs.\n");
    }

    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
        (cyc_sum[ewcDOMDEC] > tot*0.1 ||
         cyc_sum[ewcNS] > tot*0.1))
    {
        /* Only the sim master calls this function, so always print to stderr */
        if (wc->wcc[ewcDOMDEC].n == 0)
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
        else
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
                          "      %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcDOMDEC]/tot+0.5),
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
    }

    if (cyc_sum[ewcMoveE] > tot*0.05)
    {
        /* Only the sim master calls this function, so always print to stderr */
        md_print_warn(NULL, fplog,
                      "NOTE: %d %% of the run time was spent communicating energies,\n"
                      "      you might want to use the -gcom option of mdrun\n",
                      (int)(100*cyc_sum[ewcMoveE]/tot+0.5));
    }
}
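
As a worked illustration of the cycles-to-seconds conversion in the function above, the following self-contained program plugs assumed numbers into the same c2t, c2t_pp and c2t_pme expressions; only the input values are made up.

#include <cstdio>

int main()
{
    /* Assumed inputs, for illustration only. */
    const double realtime = 120.0;              /* wall-clock seconds              */
    const double tot      = 6.0e11;             /* summed ewcRUN cycles            */
    const int    npp      = 6, nth_pp  = 4;     /* PP ranks and threads per rank   */
    const int    npme     = 2, nth_pme = 2;     /* PME ranks and threads per rank  */
    const int    nth_tot  = npp*nth_pp + npme*nth_pme;

    /* Same expressions as in wallcycle_print above. */
    const double c2t     = realtime/tot;
    const double c2t_pp  = c2t * nth_tot / static_cast<double>(npp*nth_pp);
    const double c2t_pme = c2t * nth_tot / static_cast<double>(npme*nth_pme);

    std::printf("c2t = %g s/cycle, c2t_pp = %g, c2t_pme = %g\n",
                c2t, c2t_pp, c2t_pme);
    return 0;
}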
Example #3
0
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
                     gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
{
    double     *cyc_sum;
    double      tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k;
    double      c2t, c2t_pp, c2t_pme;
    int         i, j, npp, nth_pp, nth_pme, nth_tot;
    char        buf[STRLEN];
    const char *hline = "-----------------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    nth_pp  = wc->nthreads_pp;
    nth_pme = wc->nthreads_pme;

    cyc_sum = wc->cycles_sum;

    npp     = nnodes - npme;
    nth_tot = npp*nth_pp + npme*nth_pme;

    /* When using PME-only nodes, the next line is valid for both
       PP-only and PME-only nodes because they started ewcRUN at the
       same time. */
    tot        = cyc_sum[ewcRUN];
    tot_for_pp = 0;

    /* Conversion factor from cycles to seconds */
    if (tot > 0)
    {
        c2t     = realtime/tot;
        c2t_pp  = c2t * nth_tot / (double) (npp*nth_pp);
        c2t_pme = c2t * nth_tot / (double) (npme*nth_pme);
    }
    else
    {
        c2t     = 0;
        c2t_pp  = 0;
        c2t_pme = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    print_header(fplog, npp, nth_pp, npme, nth_pme);

    fprintf(fplog, "%s\n", hline);
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (is_pme_subcounter(i))
        {
            /* Do not count these at all */
        }
        else if (npme > 0 && is_pme_counter(i))
        {
            /* Print timing information for PME-only nodes, but add an
             * asterisk so the reader of the table can know that the
             * walltimes are not meant to add up. The asterisk still
             * fits in the required maximum of 19 characters. */
            char buffer[STRLEN];
            snprintf(buffer, STRLEN, "%s *", wcn[i]);
            print_cycles(fplog, c2t_pme, buffer,
                         npme, nth_pme,
                         wc->wcc[i].n, cyc_sum[i], tot);
        }
        else
        {
            /* Print timing information when it is for a PP or PP+PME
               node */
            print_cycles(fplog, c2t_pp, wcn[i],
                         npp, nth_pp,
                         wc->wcc[i].n, cyc_sum[i], tot);
            tot_for_pp += cyc_sum[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
                print_cycles(fplog, c2t_pp, buf,
                             npp, nth_pp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    tot_for_rest = tot * (npp * nth_pp) / (double) nth_tot;
    print_cycles(fplog, c2t_pp, "Rest",
                 npp, nth_pp,
                 -1, tot_for_rest - tot_for_pp, tot);
    fprintf(fplog, "%s\n", hline);
    print_cycles(fplog, c2t, "Total",
                 npp, nth_pp,
                 -1, tot, tot);
    fprintf(fplog, "%s\n", hline);

    if (npme > 0)
    {
        fprintf(fplog,
                "(*) Note that with separate PME nodes, the walltime column actually sums to\n"
                "    twice the total reported, but the cycle count total and %% are correct.\n"
                "%s\n", hline);
    }

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, " Breakdown of PME mesh computation\n");
        fprintf(fplog, "%s\n", hline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (is_pme_subcounter(i))
            {
                print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i],
                             npme > 0 ? npme : npp, nth_pme,
                             wc->wcc[i].n, cyc_sum[i], tot);
            }
        }
        fprintf(fplog, "%s\n", hline);
    }

#ifdef GMX_CYCLE_SUBCOUNTERS
    fprintf(fplog, " Breakdown of PP computation\n");
    fprintf(fplog, "%s\n", hline);
    for (i = 0; i < ewcsNR; i++)
    {
        print_cycles(fplog, c2t_pp, wcsn[i],
                     npp, nth_pp,
                     wc->wcsc[i].n, cyc_sum[ewcNR+i], tot);
    }
    fprintf(fplog, "%s\n", hline);
#endif

    /* print GPU timing summary */
    if (gpu_t)
    {
        const char *k_log_str[2][2] = {
            {"Nonbonded F kernel", "Nonbonded F+ene k."},
            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
        };

        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;

        /* add up the kernel timings */
        tot_k = 0.0;
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                tot_k += gpu_t->ktime[i][j].t;
            }
        }
        tot_gpu += tot_k;

        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
        }
        tot_cpu_overlap *= realtime*1000/tot; /* convert cycles to ms */

        fprintf(fplog, "\n GPU timings\n%s\n", hline);
        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Pair list H2D",
                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
        print_gputimes(fplog, "X / q H2D",
                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (gpu_t->ktime[i][j].c)
                {
                    print_gputimes(fplog, k_log_str[i][j],
                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
                }
            }
        }

        print_gputimes(fplog, "F D2H",  gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
        fprintf(fplog, "%s\n", hline);

        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
        fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
                tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
                gpu_cpu_ratio);

        /* only print notes related to CPU-GPU load balance with PME */
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");

            /* print note if the imbalance is high with PME case in which
             * CPU-GPU load balancing is possible */
            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
            {
                /* Only the sim master calls this function, so always print to stderr */
                if (gpu_cpu_ratio < 0.75)
                {
                    if (npp > 1)
                    {
                        /* The user could have used -notunepme,
                         * but we currently can't check that here.
                         */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
                    }
                    else
                    {
                        /* We should not end up here, unless the box is
                         * too small for increasing the cut-off for PME tuning.
                         */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss.");
                    }
                }
                if (gpu_cpu_ratio > 1.2)
                {
                    md_print_warn(NULL, fplog,
                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
                }
            }
        }
    }

    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
        (cyc_sum[ewcDOMDEC] > tot*0.1 ||
         cyc_sum[ewcNS] > tot*0.1))
    {
        /* Only the sim master calls this function, so always print to stderr */
        if (wc->wcc[ewcDOMDEC].n == 0)
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
        else
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
                          "      %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcDOMDEC]/tot+0.5),
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
    }

    if (cyc_sum[ewcMoveE] > tot*0.05)
    {
        /* Only the sim master calls this function, so always print to stderr */
        md_print_warn(NULL, fplog,
                      "NOTE: %d %% of the run time was spent communicating energies,\n"
                      "      you might want to use the -gcom option of mdrun\n",
                      (int)(100*cyc_sum[ewcMoveE]/tot+0.5));
    }
}
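
The "Rest" row printed above is derived rather than measured: the grand total is rescaled to the PP threads' share and the explicitly counted PP cycles are subtracted. A minimal sketch of that bookkeeping, with illustrative names rather than the GROMACS API:

/* Sketch: cycles attributed to "Rest" on the PP ranks, following the
 * tot_for_rest / tot_for_pp arithmetic in wallcycle_print. */
static double rest_cycles(double tot, double tot_for_pp,
                          int npp, int nth_pp, int nth_tot)
{
    /* Share of the grand total that belongs to the PP threads. */
    double tot_for_rest = tot * (npp * nth_pp) / static_cast<double>(nth_tot);

    /* Whatever the explicit PP counters did not capture. */
    return tot_for_rest - tot_for_pp;
}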
Example #4
0
void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
{
    wallcc_t *wcc;
    double    cycles[ewcNR+ewcsNR];
    double    cycles_n[ewcNR+ewcsNR], buf[ewcNR+ewcsNR], *cyc_all, *buf_all;
    int       i, j;
    int       nsum;

    if (wc == NULL)
    {
        return;
    }

    snew(wc->cycles_sum, ewcNR+ewcsNR);

    wcc = wc->wcc;

    for (i = 0; i < ewcNR; i++)
    {
        if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME))
        {
            wcc[i].c *= wc->nthreads_pme;

            if (wc->wcc_all)
            {
                for (j = 0; j < ewcNR; j++)
                {
                    wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pme;
                }
            }
        }
        else
        {
            wcc[i].c *= wc->nthreads_pp;

            if (wc->wcc_all)
            {
                for (j = 0; j < ewcNR; j++)
                {
                    wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pp;
                }
            }
        }
    }

    if (wcc[ewcDDCOMMLOAD].n > 0)
    {
        wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMLOAD].c;
    }
    if (wcc[ewcDDCOMMBOUND].n > 0)
    {
        wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMBOUND].c;
    }
    if (wcc[ewcPME_FFTCOMM].n > 0)
    {
        wcc[ewcPME_FFT].c -= wcc[ewcPME_FFTCOMM].c;
    }

    if (cr->npmenodes == 0)
    {
        /* All nodes do PME (or no PME at all) */
        if (wcc[ewcPMEMESH].n > 0)
        {
            wcc[ewcFORCE].c -= wcc[ewcPMEMESH].c;
        }
    }
    else
    {
        /* There are PME-only nodes */
        if (wcc[ewcPMEMESH].n > 0)
        {
            /* This must be a PME only node, calculate the Wait + Comm. time */
            wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
        }
    }

    /* Store the cycles in a double buffer for summing */
    for (i = 0; i < ewcNR; i++)
    {
        cycles_n[i] = (double)wcc[i].n;
        cycles[i]   = (double)wcc[i].c;
    }
    nsum = ewcNR;
#ifdef GMX_CYCLE_SUBCOUNTERS
    for (i = 0; i < ewcsNR; i++)
    {
        wc->wcsc[i].c    *= wc->nthreads_pp;
        cycles_n[ewcNR+i] = (double)wc->wcsc[i].n;
        cycles[ewcNR+i]   = (double)wc->wcsc[i].c;
    }
    nsum += ewcsNR;
#endif

#ifdef GMX_MPI
    if (cr->nnodes > 1)
    {
        MPI_Allreduce(cycles_n, buf, nsum, MPI_DOUBLE, MPI_MAX,
                      cr->mpi_comm_mysim);
        for (i = 0; i < ewcNR; i++)
        {
            wcc[i].n = (int)(buf[i] + 0.5);
        }
#ifdef GMX_CYCLE_SUBCOUNTERS
        for (i = 0; i < ewcsNR; i++)
        {
            wc->wcsc[i].n = (int)(buf[ewcNR+i] + 0.5);
        }
#endif

        MPI_Allreduce(cycles, wc->cycles_sum, nsum, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);

        if (wc->wcc_all != NULL)
        {
            snew(cyc_all, ewcNR*ewcNR);
            snew(buf_all, ewcNR*ewcNR);
            for (i = 0; i < ewcNR*ewcNR; i++)
            {
                cyc_all[i] = wc->wcc_all[i].c;
            }
            MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM,
                          cr->mpi_comm_mysim);
            for (i = 0; i < ewcNR*ewcNR; i++)
            {
                wc->wcc_all[i].c = buf_all[i];
            }
            sfree(buf_all);
            sfree(cyc_all);
        }
    }
    else
#endif
    {
        for (i = 0; i < nsum; i++)
        {
            wc->cycles_sum[i] = cycles[i];
        }
    }
}
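
The MPI reduction above combines call counts with MPI_MAX and cycle totals with MPI_SUM across the ranks of the simulation. A minimal, self-contained sketch of that pattern; the helper name and the in-place reduction of the counts are assumptions, not the GROMACS code:

#include <mpi.h>

/* Sketch: reduce per-rank counter data as wallcycle_sum does: the call
 * count becomes the maximum over ranks, the cycles become the sum. */
static void reduce_counters(double *counts, const double *cycles,
                            double *cycles_sum, int n, MPI_Comm comm)
{
    /* Maximum call count over all ranks, reduced in place. */
    MPI_Allreduce(MPI_IN_PLACE, counts, n, MPI_DOUBLE, MPI_MAX, comm);

    /* Sum of cycles over all ranks, available on every rank afterwards. */
    MPI_Allreduce(cycles, cycles_sum, n, MPI_DOUBLE, MPI_SUM, comm);
}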
Example #5
0
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
                     gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
{
    double     *cycles;
    double      c2t, tot, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, sum, tot_k;
    int         i, j, npp, nth_pp, nth_pme, nth_tot;
    char        buf[STRLEN];
    const char *hline = "-----------------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    nth_pp  = wc->nthreads_pp;
    nth_pme = wc->nthreads_pme;

    cycles = wc->cycles_sum;

    if (npme > 0)
    {
        npp = nnodes - npme;

        nth_tot = npp*nth_pp + npme*nth_pme;
    }
    else
    {
        npp  = nnodes;
        npme = nnodes;

        nth_tot = npp*nth_pp;
    }

    tot = cycles[ewcRUN];

    /* Conversion factor from cycles to seconds */
    if (tot > 0)
    {
        c2t = realtime/tot;
    }
    else
    {
        c2t = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    fprintf(fplog, " Computing:         Nodes   Th.     Count  Wall t (s)     G-Cycles       %c\n", '%');
    fprintf(fplog, "%s\n", hline);
    sum = 0;
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (!is_pme_subcounter(i))
        {
            print_cycles(fplog, c2t, wcn[i], nth_tot,
                         is_pme_counter(i) ? npme : npp,
                         is_pme_counter(i) ? nth_pme : nth_pp,
                         wc->wcc[i].n, cycles[i], tot);
            sum += cycles[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                snprintf(buf, 9, "%-9s", wcn[i]);
                buf[9] = ' ';
                snprintf(buf+10, 9, "%-9s", wcn[j]);
                buf[19] = '\0';
                print_cycles(fplog, c2t, buf, nth_tot,
                             is_pme_counter(i) ? npme : npp,
                             is_pme_counter(i) ? nth_pme : nth_pp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    print_cycles(fplog, c2t, "Rest", nth_tot, npp, -1, 0, tot-sum, tot);
    fprintf(fplog, "%s\n", hline);
    print_cycles(fplog, c2t, "Total", nth_tot, nnodes, -1, 0, tot, tot);
    fprintf(fplog, "%s\n", hline);

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, "%s\n", hline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (is_pme_subcounter(i))
            {
                print_cycles(fplog, c2t, wcn[i], nth_tot,
                             is_pme_counter(i) ? npme : npp,
                             is_pme_counter(i) ? nth_pme : nth_pp,
                             wc->wcc[i].n, cycles[i], tot);
            }
        }
        fprintf(fplog, "%s\n", hline);
    }

#ifdef GMX_CYCLE_SUBCOUNTERS
    fprintf(fplog, "%s\n", hline);
    for (i = 0; i < ewcsNR; i++)
    {
        print_cycles(fplog, c2t, wcsn[i], nth_tot, npp, nth_pp,
                     wc->wcsc[i].n, cycles[ewcNR+i], tot);
    }
    fprintf(fplog, "%s\n", hline);
#endif

    /* print GPU timing summary */
    if (gpu_t)
    {
        const char *k_log_str[2][2] = {
            {"Nonbonded F kernel", "Nonbonded F+ene k."},
            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
        };

        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;

        /* add up the kernel timings */
        tot_k = 0.0;
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                tot_k += gpu_t->ktime[i][j].t;
            }
        }
        tot_gpu += tot_k;

        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
        }
        tot_cpu_overlap *= c2t * 1000; /* convert cycles to ms */

        fprintf(fplog, "\n GPU timings\n%s\n", hline);
        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Pair list H2D",
                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
        print_gputimes(fplog, "X / q H2D",
                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (gpu_t->ktime[i][j].c)
                {
                    print_gputimes(fplog, k_log_str[i][j],
                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
                }
            }
        }

        print_gputimes(fplog, "F D2H",  gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
        fprintf(fplog, "%s\n", hline);

        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
        fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
                tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
                gpu_cpu_ratio);

        /* only print notes related to CPU-GPU load balance with PME */
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");

            /* print note if the imbalance is high with PME case in which
             * CPU-GPU load balancing is possible */
            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
            {
                /* Only the sim master calls this function, so always print to stderr */
                if (gpu_cpu_ratio < 0.75)
                {
                    if (npp > 1)
                    {
                        /* The user could have used -notunepme,
                         * but we currently can't check that here.
                         */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
                    }
                    else
                    {
                        /* We should not end up here, unless the box is
                         * too small for increasing the cut-off for PME tuning.
                         */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss.");
                    }
                }
                if (gpu_cpu_ratio > 1.2)
                {
                    md_print_warn(NULL, fplog,
                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
                }
            }
        }
    }

    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
        (cycles[ewcDOMDEC] > tot*0.1 ||
         cycles[ewcNS] > tot*0.1))
    {
        /* Only the sim master calls this function, so always print to stderr */
        if (wc->wcc[ewcDOMDEC].n == 0)
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cycles[ewcNS]/tot+0.5));
        }
        else
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
                          "      %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cycles[ewcDOMDEC]/tot+0.5),
                          (int)(100*cycles[ewcNS]/tot+0.5));
        }
    }

    if (cycles[ewcMoveE] > tot*0.05)
    {
        /* Only the sim master calls this function, so always print to stderr */
        md_print_warn(NULL, fplog,
                      "NOTE: %d %% of the run time was spent communicating energies,\n"
                      "      you might want to use the -gcom option of mdrun\n",
                      (int)(100*cycles[ewcMoveE]/tot+0.5));
    }
}
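
In the GPU timing summary above, the CPU time that overlaps with the GPU is reconstructed from the force (and, when PME runs on the CPU, the PME mesh) cycle counters and converted to milliseconds before the GPU/CPU ratio is formed. A small worked sketch with assumed numbers:

#include <cstdio>

int main()
{
    /* Assumed inputs, for illustration only. */
    const double c2t            = 2.5e-10;   /* seconds per cycle                 */
    const double force_cycles   = 2.0e11;    /* ewcFORCE cycles                   */
    const double pmemesh_cycles = 1.0e11;    /* ewcPMEMESH cycles (CPU PME)       */
    const double tot_gpu        = 70000.0;   /* summed GPU timings in ms          */

    /* CPU work overlapping the GPU, converted from cycles to ms. */
    const double tot_cpu_overlap = (force_cycles + pmemesh_cycles) * c2t * 1000;

    /* Ratio reported as "Force evaluation time GPU/CPU". */
    const double gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;

    std::printf("CPU overlap = %.1f ms, GPU/CPU = %.3f\n",
                tot_cpu_overlap, gpu_cpu_ratio);
    return 0;
}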
Example #6
0
void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
{
    wallcc_t *wcc;
    double    cycles[ewcNR+ewcsNR];
#ifdef GMX_MPI
    double    cycles_n[ewcNR+ewcsNR+1];
    double    buf[ewcNR+ewcsNR];
    double   *buf_all, *cyc_all;
#endif
    int       i, j;
    int       nsum;

    if (wc == NULL)
    {
        return;
    }

    snew(wc->cycles_sum, ewcNR+ewcsNR);

    wcc = wc->wcc;

    /* The GPU wait estimate counter is used for load balancing only
     * and will mess up the total due to double counting: clear it.
     */
    wcc[ewcWAIT_GPU_NB_L_EST].n = 0;
    wcc[ewcWAIT_GPU_NB_L_EST].c = 0;

    for (i = 0; i < ewcNR; i++)
    {
        if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME))
        {
            wcc[i].c *= wc->nthreads_pme;

            if (wc->wcc_all)
            {
                for (j = 0; j < ewcNR; j++)
                {
                    wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pme;
                }
            }
        }
        else
        {
            wcc[i].c *= wc->nthreads_pp;

            if (wc->wcc_all)
            {
                for (j = 0; j < ewcNR; j++)
                {
                    wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pp;
                }
            }
        }
    }

    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);

    subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);

    if (cr->npmenodes == 0)
    {
        /* All nodes do PME (or no PME at all) */
        subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
    }
    else
    {
        /* There are PME-only nodes */
        if (wcc[ewcPMEMESH].n > 0)
        {
            /* This must be a PME only node, calculate the Wait + Comm. time */
            GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c, "Total run ticks must be greater than PME-only ticks");
            wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
        }
    }

    /* Store the cycles in a double buffer for summing */
    for (i = 0; i < ewcNR; i++)
    {
#ifdef GMX_MPI
        cycles_n[i] = static_cast<double>(wcc[i].n);
#endif
        cycles[i]   = static_cast<double>(wcc[i].c);
    }
    nsum = ewcNR;
#ifdef GMX_CYCLE_SUBCOUNTERS
    for (i = 0; i < ewcsNR; i++)
    {
        wc->wcsc[i].c    *= wc->nthreads_pp;
#ifdef GMX_MPI
        cycles_n[ewcNR+i] = static_cast<double>(wc->wcsc[i].n);
#endif
        cycles[ewcNR+i]   = static_cast<double>(wc->wcsc[i].c);
    }
    nsum += ewcsNR;
#endif

#ifdef GMX_MPI
    if (cr->nnodes > 1)
    {
        cycles_n[nsum] = (wc->haveInvalidCount > 0 ? 1 : 0);
        MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX,
                      cr->mpi_comm_mysim);
        for (i = 0; i < ewcNR; i++)
        {
            wcc[i].n = static_cast<int>(buf[i] + 0.5);
        }
        wc->haveInvalidCount = (buf[nsum] > 0);
#ifdef GMX_CYCLE_SUBCOUNTERS
        for (i = 0; i < ewcsNR; i++)
        {
            wc->wcsc[i].n = static_cast<int>(buf[ewcNR+i] + 0.5);
        }
#endif

        MPI_Allreduce(cycles, wc->cycles_sum, nsum, MPI_DOUBLE, MPI_SUM,
                      cr->mpi_comm_mysim);

        if (wc->wcc_all != NULL)
        {
            snew(cyc_all, ewcNR*ewcNR);
            snew(buf_all, ewcNR*ewcNR);
            for (i = 0; i < ewcNR*ewcNR; i++)
            {
                cyc_all[i] = wc->wcc_all[i].c;
            }
            MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM,
                          cr->mpi_comm_mysim);
            for (i = 0; i < ewcNR*ewcNR; i++)
            {
                wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
            }
            sfree(buf_all);
            sfree(cyc_all);
        }
    }
    else
#endif
    {
        for (i = 0; i < nsum; i++)
        {
            wc->cycles_sum[i] = cycles[i];
        }
    }
}
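
subtract_cycles is defined elsewhere in this file; judging from the inline form in Example #4, it removes a sub-counter's cycles from its parent counter only when the sub-counter actually ran. A sketch consistent with that behavior, using a simplified counter struct in place of wallcc_t:

/* Sketch of the subtract_cycles behavior implied by Example #4: only
 * adjust the main counter if the sub-counter was used at least once. */
struct SubCounter
{
    int    n;    /* number of times the counter was started/stopped */
    double c;    /* accumulated cycles                              */
};

static void subtract_cycles_sketch(SubCounter *wcc, int ewc_main, int ewc_sub)
{
    if (wcc[ewc_sub].n > 0)
    {
        wcc[ewc_main].c -= wcc[ewc_sub].c;
    }
}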