void thread3_proc()
{
    int cycles = 0;

    while (!sufficient_time())
    {
        t1();
        t3();
        ++cycles;
    }
    print_cycles(cycles);
}
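/* thread3_proc() above relies on helpers that are not shown in this
 * excerpt. A minimal, self-contained sketch of plausible stand-ins
 * follows so the loop can be compiled and run; sufficient_time(), t1(),
 * t3() and this one-argument print_cycles() are hypothetical stubs,
 * unrelated to the GROMACS print_cycles() row printer used further
 * below.
 */
#include <stdio.h>
#include <time.h>

static time_t       start_time;           /* set before the thread starts */
static const double budget_seconds = 1.0; /* illustrative time budget     */

static int sufficient_time(void)
{
    /* True once the wall-clock budget has elapsed. */
    return difftime(time(NULL), start_time) >= budget_seconds;
}

static void t1(void) { /* first timed task (placeholder) */ }
static void t3(void) { /* second timed task (placeholder) */ }

static void print_cycles(int cycles)
{
    printf("completed %d loop iterations\n", cycles);
}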
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
                     gmx_wallcycle_t wc, double cycles[])
{
    double      c2t, tot, sum;
    int         i, j, npp;
    char        buf[STRLEN];
    const char *myline = "-----------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    if (npme > 0)
    {
        npp = nnodes - npme;
    }
    else
    {
        npp  = nnodes;
        npme = nnodes;
    }
    tot = cycles[ewcRUN];
    /* Conversion factor from cycles to seconds */
    if (tot > 0)
    {
        c2t = nnodes*realtime/tot;
    }
    else
    {
        c2t = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    fprintf(fplog, " Computing:         Nodes     Number     G-Cycles    Seconds     %c\n", '%');
    fprintf(fplog, "%s\n", myline);
    sum = 0;
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (!subdivision(i))
        {
            print_cycles(fplog, c2t, wcn[i],
                         (i == ewcPMEMESH || i == ewcPMEWAITCOMM) ? npme : npp,
                         wc->wcc[i].n, cycles[i], tot);
            sum += cycles[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                sprintf(buf, "%-9s", wcn[i]);
                buf[9] = ' ';
                sprintf(buf+10, "%-9s", wcn[j]);
                buf[19] = '\0';
                print_cycles(fplog, c2t, buf,
                             (i == ewcPMEMESH || i == ewcPMEWAITCOMM) ? npme : npp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    print_cycles(fplog, c2t, "Rest", npp, 0, tot-sum, tot);
    fprintf(fplog, "%s\n", myline);
    print_cycles(fplog, c2t, "Total", nnodes, 0, tot, tot);
    fprintf(fplog, "%s\n", myline);

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, "%s\n", myline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (subdivision(i))
            {
                /* Note: the range test below needs &&; with || it would
                 * be true for every counter. */
                print_cycles(fplog, c2t, wcn[i],
                             (i >= ewcPMEMESH && i <= ewcPME_SOLVE) ? npme : npp,
                             wc->wcc[i].n, cycles[i], tot);
            }
        }
        fprintf(fplog, "%s\n", myline);
    }

    if (cycles[ewcMoveE] > tot*0.05)
    {
        sprintf(buf,
                "NOTE: %d %% of the run time was spent communicating energies,\n"
                "      you might want to use the -gcom option of mdrun\n",
                (int)(100*cycles[ewcMoveE]/tot + 0.5));
        if (fplog)
        {
            fprintf(fplog, "\n%s\n", buf);
        }
        /* Only the sim master calls this function, so always print to stderr */
        fprintf(stderr, "\n%s\n", buf);
    }
}
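/* print_cycles() is used above but not defined in this excerpt. Below is
 * a minimal sketch of a row printer matching the older call sites above
 * (name, node count, call count, cycle count, total cycles), under the
 * assumption that c2t converts cycles to aggregate seconds across nodes;
 * the real GROMACS routine differs in formatting details.
 */
static void print_cycles(FILE *fplog, double c2t, const char *name,
                         int nnodes, int n, double c, double tot)
{
    if (c <= 0)
    {
        return; /* nothing recorded for this counter */
    }
    fprintf(fplog, " %-19s %5d %10d %12.3f %10.1f   %5.1f\n",
            name, nnodes, n, c*1e-9, c*c2t, 100.0*c/tot);
}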
void wallcycle_print(FILE *fplog, int nnodes, int npme,
                     int nth_pp, int nth_pme, double realtime,
                     gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum,
                     struct gmx_wallclock_gpu_t *gpu_t)
{
    double      tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k;
    double      c2t, c2t_pp, c2t_pme = 0;
    int         i, j, npp, nth_tot;
    char        buf[STRLEN];
    const char *hline = "-----------------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
    GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
    GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
    GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
    npp = nnodes - npme;
    /* npme is the number of PME-only ranks used, and we always do PP work */
    GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");

    nth_tot = npp*nth_pp + npme*nth_pme;

    /* When using PME-only nodes, the next line is valid for both
       PP-only and PME-only nodes because they started ewcRUN at the
       same time. */
    tot        = cyc_sum[ewcRUN];
    tot_for_pp = 0;

    if (tot <= 0.0)
    {
        /* TODO This is heavy handed, but until someone reworks the
           code so that it is provably robust with respect to
           non-positive values for all possible timer and cycle
           counters, there is less value gained from printing whatever
           timing data might still be sensible for some non-Jenkins
           run, than is lost from diagnosing Jenkins FP exceptions on
           runs about whose execution time we don't care. */
        md_print_warn(NULL, fplog,
                      "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a time accounting\n",
                      tot);
        return;
    }

    if (wc->haveInvalidCount)
    {
        md_print_warn(NULL, fplog, "%s\n",
                      "NOTE: Detected invalid cycle counts, probably because threads moved between CPU cores that do not have synchronized cycle counters. Will not print the cycle accounting.");
        return;
    }

    /* Conversion factor from cycles to seconds */
    c2t    = realtime/tot;
    c2t_pp = c2t * nth_tot / static_cast<double>(npp*nth_pp);
    if (npme > 0)
    {
        c2t_pme = c2t * nth_tot / static_cast<double>(npme*nth_pme);
    }
    else
    {
        c2t_pme = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    print_header(fplog, npp, nth_pp, npme, nth_pme);

    fprintf(fplog, "%s\n", hline);
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (is_pme_subcounter(i))
        {
            /* Do not count these at all */
        }
        else if (npme > 0 && is_pme_counter(i))
        {
            /* Print timing information for PME-only nodes, but add an
             * asterisk so the reader of the table can know that the
             * walltimes are not meant to add up. The asterisk still
             * fits in the required maximum of 19 characters. */
            char buffer[STRLEN];
            snprintf(buffer, STRLEN, "%s *", wcn[i]);
            print_cycles(fplog, c2t_pme, buffer,
                         npme, nth_pme,
                         wc->wcc[i].n, cyc_sum[i], tot);
        }
        else
        {
            /* Print timing information when it is for a PP or PP+PME
               node */
            print_cycles(fplog, c2t_pp, wcn[i],
                         npp, nth_pp,
                         wc->wcc[i].n, cyc_sum[i], tot);
            tot_for_pp += cyc_sum[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
                print_cycles(fplog, c2t_pp, buf,
                             npp, nth_pp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
    print_cycles(fplog, c2t_pp, "Rest",
                 npp, nth_pp,
                 -1, tot_for_rest - tot_for_pp, tot);
    fprintf(fplog, "%s\n", hline);
    print_cycles(fplog, c2t, "Total",
                 npp, nth_pp,
                 -1, tot, tot);
    fprintf(fplog, "%s\n", hline);

    if (npme > 0)
    {
        fprintf(fplog,
                "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
                "    twice the total reported, but the cycle count total and %% are correct.\n"
                "%s\n", hline);
    }

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, " Breakdown of PME mesh computation\n");
        fprintf(fplog, "%s\n", hline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (is_pme_subcounter(i))
            {
                print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i],
                             npme > 0 ? npme : npp, nth_pme,
                             wc->wcc[i].n, cyc_sum[i], tot);
            }
        }
        fprintf(fplog, "%s\n", hline);
    }

    if (useCycleSubcounters && wc->wcsc)
    {
        fprintf(fplog, " Breakdown of PP computation\n");
        fprintf(fplog, "%s\n", hline);
        for (i = 0; i < ewcsNR; i++)
        {
            print_cycles(fplog, c2t_pp, wcsn[i],
                         npp, nth_pp,
                         wc->wcsc[i].n, cyc_sum[ewcNR+i], tot);
        }
        fprintf(fplog, "%s\n", hline);
    }

    /* print GPU timing summary */
    if (gpu_t)
    {
        const char *k_log_str[2][2] = {
            {"Nonbonded F kernel", "Nonbonded F+ene k."},
            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
        };

        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;

        /* add up the kernel timings */
        tot_k = 0.0;
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                tot_k += gpu_t->ktime[i][j].t;
            }
        }
        tot_gpu += tot_k;

        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
        }
        tot_cpu_overlap *= realtime*1000/tot; /* convert s to ms */

        fprintf(fplog, "\n GPU timings\n%s\n", hline);
        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Pair list H2D",
                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
        print_gputimes(fplog, "X / q H2D",
                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (gpu_t->ktime[i][j].c)
                {
                    print_gputimes(fplog, k_log_str[i][j],
                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
                }
            }
        }

        print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
        fprintf(fplog, "%s\n", hline);

        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
        if (gpu_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
        {
            fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
                    tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
                    gpu_cpu_ratio);
        }

        /* only print notes related to CPU-GPU load balance with PME */
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");
            /* print note if the imbalance is high with PME case in which
             * CPU-GPU load balancing is possible */
            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
            {
                /* Only the sim master calls this function, so always print to stderr */
                if (gpu_cpu_ratio < 0.75)
                {
                    if (npp > 1)
                    {
                        /* The user could have used -notunepme,
                         * but we currently can't check that here. */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
                    }
                    else
                    {
                        /* We should not end up here, unless the box is
                         * too small for increasing the cut-off for PME tuning. */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss.");
                    }
                }
                if (gpu_cpu_ratio > 1.2)
                {
                    md_print_warn(NULL, fplog,
                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
                }
            }
        }
    }

    if (wc->wc_barrier)
    {
        md_print_warn(NULL, fplog,
                      "MPI_Barrier was called before each cycle start/stop\n"
                      "call, so timings are not those of real runs.\n");
    }

    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
        (cyc_sum[ewcDOMDEC] > tot*0.1 ||
         cyc_sum[ewcNS] > tot*0.1))
    {
        /* Only the sim master calls this function, so always print to stderr */
        if (wc->wcc[ewcDOMDEC].n == 0)
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
        else
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
                          "      %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcDOMDEC]/tot+0.5),
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
    }

    if (cyc_sum[ewcMoveE] > tot*0.05)
    {
        /* Only the sim master calls this function, so always print to stderr */
        md_print_warn(NULL, fplog,
                      "NOTE: %d %% of the run time was spent communicating energies,\n"
                      "      you might want to use the -gcom option of mdrun\n",
                      (int)(100*cyc_sum[ewcMoveE]/tot+0.5));
    }
}
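/* A worked example of the conversion factors computed in the version
 * above, with illustrative numbers only: 6 ranks of which 2 are
 * PME-only, 4 PP threads per PP rank and 2 PME threads per PME rank.
 * tot is the cycle count summed over all threads, so the global factor
 * c2t is rescaled by the share of threads each rank class owns to
 * recover walltime for that class alone. This is a standalone sketch,
 * not GROMACS code.
 */
#include <stdio.h>

int main(void)
{
    int    nnodes   = 6, npme = 2, nth_pp = 4, nth_pme = 2;
    int    npp      = nnodes - npme;             /* 4 PP ranks       */
    int    nth_tot  = npp*nth_pp + npme*nth_pme; /* 20 threads total */
    double realtime = 100.0;                     /* seconds          */
    double tot      = 2.0e12;                    /* summed cycles    */

    double c2t     = realtime/tot;
    double c2t_pp  = c2t * nth_tot / (double)(npp*nth_pp);
    double c2t_pme = c2t * nth_tot / (double)(npme*nth_pme);

    printf("c2t=%g  c2t_pp=%g  c2t_pme=%g\n", c2t, c2t_pp, c2t_pme);
    return 0;
}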
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
                     gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
{
    double     *cyc_sum;
    double      tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k;
    double      c2t, c2t_pp, c2t_pme;
    int         i, j, npp, nth_pp, nth_pme, nth_tot;
    char        buf[STRLEN];
    const char *hline = "-----------------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    nth_pp  = wc->nthreads_pp;
    nth_pme = wc->nthreads_pme;

    cyc_sum = wc->cycles_sum;

    npp     = nnodes - npme;
    nth_tot = npp*nth_pp + npme*nth_pme;

    /* When using PME-only nodes, the next line is valid for both
       PP-only and PME-only nodes because they started ewcRUN at the
       same time. */
    tot        = cyc_sum[ewcRUN];
    tot_for_pp = 0;

    /* Conversion factor from cycles to seconds */
    if (tot > 0)
    {
        c2t     = realtime/tot;
        c2t_pp  = c2t * nth_tot / (double) (npp*nth_pp);
        c2t_pme = c2t * nth_tot / (double) (npme*nth_pme);
    }
    else
    {
        c2t     = 0;
        c2t_pp  = 0;
        c2t_pme = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    print_header(fplog, npp, nth_pp, npme, nth_pme);

    fprintf(fplog, "%s\n", hline);
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (is_pme_subcounter(i))
        {
            /* Do not count these at all */
        }
        else if (npme > 0 && is_pme_counter(i))
        {
            /* Print timing information for PME-only nodes, but add an
             * asterisk so the reader of the table can know that the
             * walltimes are not meant to add up. The asterisk still
             * fits in the required maximum of 19 characters. */
            char buffer[STRLEN];
            snprintf(buffer, STRLEN, "%s *", wcn[i]);
            print_cycles(fplog, c2t_pme, buffer,
                         npme, nth_pme,
                         wc->wcc[i].n, cyc_sum[i], tot);
        }
        else
        {
            /* Print timing information when it is for a PP or PP+PME
               node */
            print_cycles(fplog, c2t_pp, wcn[i],
                         npp, nth_pp,
                         wc->wcc[i].n, cyc_sum[i], tot);
            tot_for_pp += cyc_sum[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
                print_cycles(fplog, c2t_pp, buf,
                             npp, nth_pp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    tot_for_rest = tot * (npp * nth_pp) / (double) nth_tot;
    print_cycles(fplog, c2t_pp, "Rest",
                 npp, nth_pp,
                 -1, tot_for_rest - tot_for_pp, tot);
    fprintf(fplog, "%s\n", hline);
    print_cycles(fplog, c2t, "Total",
                 npp, nth_pp,
                 -1, tot, tot);
    fprintf(fplog, "%s\n", hline);

    if (npme > 0)
    {
        fprintf(fplog,
                "(*) Note that with separate PME nodes, the walltime column actually sums to\n"
                "    twice the total reported, but the cycle count total and %% are correct.\n"
                "%s\n", hline);
    }

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, " Breakdown of PME mesh computation\n");
        fprintf(fplog, "%s\n", hline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (is_pme_subcounter(i))
            {
                print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i],
                             npme > 0 ? npme : npp, nth_pme,
                             wc->wcc[i].n, cyc_sum[i], tot);
            }
        }
        fprintf(fplog, "%s\n", hline);
    }

#ifdef GMX_CYCLE_SUBCOUNTERS
    fprintf(fplog, " Breakdown of PP computation\n");
    fprintf(fplog, "%s\n", hline);
    for (i = 0; i < ewcsNR; i++)
    {
        print_cycles(fplog, c2t_pp, wcsn[i],
                     npp, nth_pp,
                     wc->wcsc[i].n, cyc_sum[ewcNR+i], tot);
    }
    fprintf(fplog, "%s\n", hline);
#endif

    /* print GPU timing summary */
    if (gpu_t)
    {
        const char *k_log_str[2][2] = {
            {"Nonbonded F kernel", "Nonbonded F+ene k."},
            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
        };

        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;

        /* add up the kernel timings */
        tot_k = 0.0;
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                tot_k += gpu_t->ktime[i][j].t;
            }
        }
        tot_gpu += tot_k;

        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
        }
        tot_cpu_overlap *= realtime*1000/tot; /* convert s to ms */

        fprintf(fplog, "\n GPU timings\n%s\n", hline);
        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Pair list H2D",
                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
        print_gputimes(fplog, "X / q H2D",
                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (gpu_t->ktime[i][j].c)
                {
                    print_gputimes(fplog, k_log_str[i][j],
                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
                }
            }
        }

        print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
        fprintf(fplog, "%s\n", hline);

        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
        fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
                tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
                gpu_cpu_ratio);

        /* only print notes related to CPU-GPU load balance with PME */
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");
            /* print note if the imbalance is high with PME case in which
             * CPU-GPU load balancing is possible */
            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
            {
                /* Only the sim master calls this function, so always print to stderr */
                if (gpu_cpu_ratio < 0.75)
                {
                    if (npp > 1)
                    {
                        /* The user could have used -notunepme,
                         * but we currently can't check that here. */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
                    }
                    else
                    {
                        /* We should not end up here, unless the box is
                         * too small for increasing the cut-off for PME tuning. */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss.");
                    }
                }
                if (gpu_cpu_ratio > 1.2)
                {
                    md_print_warn(NULL, fplog,
                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
                }
            }
        }
    }

    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
        (cyc_sum[ewcDOMDEC] > tot*0.1 ||
         cyc_sum[ewcNS] > tot*0.1))
    {
        /* Only the sim master calls this function, so always print to stderr */
        if (wc->wcc[ewcDOMDEC].n == 0)
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
        else
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
                          "      %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcDOMDEC]/tot+0.5),
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
    }

    if (cyc_sum[ewcMoveE] > tot*0.05)
    {
        /* Only the sim master calls this function, so always print to stderr */
        md_print_warn(NULL, fplog,
                      "NOTE: %d %% of the run time was spent communicating energies,\n"
                      "      you might want to use the -gcom option of mdrun\n",
                      (int)(100*cyc_sum[ewcMoveE]/tot+0.5));
    }
}
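/* print_gputimes() is used by the GPU summaries above but not defined in
 * this excerpt. A minimal sketch matching the printed header (Count,
 * Wall t (s), ms/step, %), assuming t accumulates milliseconds over c
 * kernel launches or transfers; the actual GROMACS helper formats blank
 * fields differently for rows without a meaningful per-step average.
 */
static void print_gputimes(FILE *fplog, const char *name,
                           int c, double t, double tot_t)
{
    if (c > 0)
    {
        fprintf(fplog, " %-29s %10d %12.3f %12.3f   %5.1f\n",
                name, c, t/1000.0, t/c, 100.0*t/tot_t);
    }
    else
    {
        /* no event count recorded; print the wall time and share only */
        fprintf(fplog, " %-29s %10s %12.3f %12s   %5.1f\n",
                name, "", t/1000.0, "", 100.0*t/tot_t);
    }
}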
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
                     gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
{
    double     *cycles;
    double      c2t, tot, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, sum, tot_k;
    int         i, j, npp, nth_pp, nth_pme, nth_tot;
    char        buf[STRLEN];
    const char *hline = "-----------------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    nth_pp  = wc->nthreads_pp;
    nth_pme = wc->nthreads_pme;

    cycles = wc->cycles_sum;

    if (npme > 0)
    {
        npp     = nnodes - npme;
        nth_tot = npp*nth_pp + npme*nth_pme;
    }
    else
    {
        npp     = nnodes;
        npme    = nnodes;
        nth_tot = npp*nth_pp;
    }

    tot = cycles[ewcRUN];

    /* Conversion factor from cycles to seconds */
    if (tot > 0)
    {
        c2t = realtime/tot;
    }
    else
    {
        c2t = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    fprintf(fplog, " Computing:         Nodes   Th.     Count  Wall t (s)     G-Cycles       %c\n", '%');
    fprintf(fplog, "%s\n", hline);
    sum = 0;
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (!is_pme_subcounter(i))
        {
            print_cycles(fplog, c2t, wcn[i], nth_tot,
                         is_pme_counter(i) ? npme : npp,
                         is_pme_counter(i) ? nth_pme : nth_pp,
                         wc->wcc[i].n, cycles[i], tot);
            sum += cycles[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                /* Buffer sizes of 10 keep the full 9-character fields;
                 * with size 9, snprintf would truncate to 8 characters
                 * and leave a stray terminator at buf[8]. */
                snprintf(buf, 10, "%-9s", wcn[i]);
                buf[9] = ' ';
                snprintf(buf+10, 10, "%-9s", wcn[j]);
                buf[19] = '\0';
                print_cycles(fplog, c2t, buf, nth_tot,
                             is_pme_counter(i) ? npme : npp,
                             is_pme_counter(i) ? nth_pme : nth_pp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    print_cycles(fplog, c2t, "Rest", nth_tot, npp, -1, 0, tot-sum, tot);
    fprintf(fplog, "%s\n", hline);
    print_cycles(fplog, c2t, "Total", nth_tot, nnodes, -1, 0, tot, tot);
    fprintf(fplog, "%s\n", hline);

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, "%s\n", hline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (is_pme_subcounter(i))
            {
                print_cycles(fplog, c2t, wcn[i], nth_tot,
                             is_pme_counter(i) ? npme : npp,
                             is_pme_counter(i) ? nth_pme : nth_pp,
                             wc->wcc[i].n, cycles[i], tot);
            }
        }
        fprintf(fplog, "%s\n", hline);
    }

#ifdef GMX_CYCLE_SUBCOUNTERS
    fprintf(fplog, "%s\n", hline);
    for (i = 0; i < ewcsNR; i++)
    {
        print_cycles(fplog, c2t, wcsn[i], nth_tot,
                     npp, nth_pp,
                     wc->wcsc[i].n, cycles[ewcNR+i], tot);
    }
    fprintf(fplog, "%s\n", hline);
#endif

    /* print GPU timing summary */
    if (gpu_t)
    {
        const char *k_log_str[2][2] = {
            {"Nonbonded F kernel", "Nonbonded F+ene k."},
            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
        };

        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;

        /* add up the kernel timings */
        tot_k = 0.0;
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                tot_k += gpu_t->ktime[i][j].t;
            }
        }
        tot_gpu += tot_k;

        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
        }
        tot_cpu_overlap *= c2t * 1000; /* convert s to ms */

        fprintf(fplog, "\n GPU timings\n%s\n", hline);
        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Pair list H2D",
                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
        print_gputimes(fplog, "X / q H2D",
                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (gpu_t->ktime[i][j].c)
                {
                    print_gputimes(fplog, k_log_str[i][j],
                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
                }
            }
        }

        print_gputimes(fplog, "F D2H", gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
        fprintf(fplog, "%s\n", hline);

        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
        fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
                tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
                gpu_cpu_ratio);

        /* only print notes related to CPU-GPU load balance with PME */
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");
            /* print note if the imbalance is high with PME case in which
             * CPU-GPU load balancing is possible */
            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
            {
                /* Only the sim master calls this function, so always print to stderr */
                if (gpu_cpu_ratio < 0.75)
                {
                    if (npp > 1)
                    {
                        /* The user could have used -notunepme,
                         * but we currently can't check that here. */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
                    }
                    else
                    {
                        /* We should not end up here, unless the box is
                         * too small for increasing the cut-off for PME tuning. */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss.");
                    }
                }
                if (gpu_cpu_ratio > 1.2)
                {
                    md_print_warn(NULL, fplog,
                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
                }
            }
        }
    }

    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
        (cycles[ewcDOMDEC] > tot*0.1 ||
         cycles[ewcNS] > tot*0.1))
    {
        /* Only the sim master calls this function, so always print to stderr */
        if (wc->wcc[ewcDOMDEC].n == 0)
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cycles[ewcNS]/tot+0.5));
        }
        else
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
                          "      %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cycles[ewcDOMDEC]/tot+0.5),
                          (int)(100*cycles[ewcNS]/tot+0.5));
        }
    }

    if (cycles[ewcMoveE] > tot*0.05)
    {
        /* Only the sim master calls this function, so always print to stderr */
        md_print_warn(NULL, fplog,
                      "NOTE: %d %% of the run time was spent communicating energies,\n"
                      "      you might want to use the -gcom option of mdrun\n",
                      (int)(100*cycles[ewcMoveE]/tot+0.5));
    }
}
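/* A sketch of how a driver might invoke the printer at the end of a run:
 * reduce the per-rank counters, then print once on the simulation
 * master. finish_timing() and its parameters are hypothetical; only the
 * wallcycle_print() signature is taken from the last version above, and
 * the cross-rank reduction step is deliberately omitted.
 */
static void finish_timing(FILE *fplog, int nnodes, int npme,
                          double elapsed_seconds, gmx_wallcycle_t wc,
                          wallclock_gpu_t *gpu_t, int is_master)
{
    /* Counters must already be summed across ranks before printing. */
    if (is_master)
    {
        wallcycle_print(fplog, nnodes, npme, elapsed_seconds, wc, gpu_t);
    }
}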