Example #1
static void print_gpu_detection_stats(FILE                 *fplog,
                                      const gmx_gpu_info_t *gpu_info,
                                      const t_commrec      *cr)
{
    char onhost[266], stmp[STRLEN];
    int  ngpu;

    if (!gpu_info->bDetectGPUs)
    {
        /* We skipped the detection, so don't print detection stats */
        return;
    }

    ngpu = gpu_info->ncuda_dev;

#if defined GMX_MPI && !defined GMX_THREAD_MPI
    /* We only print the detection on one, of possibly multiple, nodes */
    strncpy(onhost, " on host ", 10);
    gmx_gethostname(onhost+9, 256);
#else
    /* We detect all relevant GPUs */
    strncpy(onhost, "", 1);
#endif

    if (ngpu > 0)
    {
        sprint_gpus(stmp, gpu_info);
        md_print_warn(cr, fplog, "%d GPU%s detected%s:\n%s\n",
                      ngpu, (ngpu > 1) ? "s" : "", onhost, stmp);
    }
    else
    {
        md_print_warn(cr, fplog, "No GPUs detected%s\n", onhost);
    }
}
static int getMaxGpuUsable(FILE *fplog, const t_commrec *cr, const gmx_hw_info_t *hwinfo, int cutoff_scheme)
{
    /* This code relies on the fact that GPUs are not detected when GPU
     * acceleration was disabled at run time by the user.
     */
    if (cutoff_scheme == ecutsVERLET &&
        hwinfo->gpu_info.n_dev_compatible > 0)
    {
        if (gmx_multiple_gpu_per_node_supported())
        {
            return hwinfo->gpu_info.n_dev_compatible;
        }
        else
        {
            if (hwinfo->gpu_info.n_dev_compatible > 1)
            {
                md_print_warn(cr, fplog, "More than one compatible GPU is available, but GROMACS can only use one of them. Using a single thread-MPI rank.\n");
            }
            return 1;
        }
    }
    else
    {
        return 0;
    }
}
Example #3
/* Give a suitable fatal error or warning if the build configuration
   and runtime CPU do not match. */
static void
check_use_of_rdtscp_on_this_cpu(FILE                *fplog,
                                const t_commrec     *cr,
                                const gmx_hw_info_t *hwinfo)
{
    gmx_bool bCpuHasRdtscp, bBinaryUsesRdtscp;
#ifdef HAVE_RDTSCP
    bBinaryUsesRdtscp = TRUE;
#else
    bBinaryUsesRdtscp = FALSE;
#endif

    bCpuHasRdtscp = gmx_cpuid_feature(hwinfo->cpuid_info, GMX_CPUID_FEATURE_X86_RDTSCP);

    if (!bCpuHasRdtscp && bBinaryUsesRdtscp)
    {
        gmx_fatal(FARGS, "The %s executable was compiled to use the rdtscp CPU instruction. "
                  "However, this is not supported by the current hardware and continuing would lead to a crash. "
                  "Please rebuild GROMACS with the GMX_USE_RDTSCP=OFF CMake option.",
                  ShortProgram());
    }

    if (bCpuHasRdtscp && !bBinaryUsesRdtscp)
    {
        md_print_warn(cr, fplog, "The current CPU can measure timings more accurately than the code in\n"
                      "%s was configured to use. This might affect your simulation\n"
                      "speed as accurate timings are needed for load-balancing.\n"
                      "Please consider rebuilding %s with the GMX_USE_RDTSCP=OFF CMake option.\n",
                      ShortProgram(), ShortProgram());
    }
}
/* Give a suitable fatal error or warning if the build configuration
   and runtime CPU do not match. */
static void
check_use_of_rdtscp_on_this_cpu(FILE                  *fplog,
                                const t_commrec       *cr,
                                const gmx::CpuInfo    &cpuInfo)
{
#ifdef HAVE_RDTSCP
    bool binaryUsesRdtscp = true;
#else
    bool binaryUsesRdtscp = false;
#endif

    const char *programName = gmx::getProgramContext().displayName();

    if (cpuInfo.supportLevel() < gmx::CpuInfo::SupportLevel::Features)
    {
        if (binaryUsesRdtscp)
        {
            md_print_warn(cr, fplog, "The %s executable was compiled to use the rdtscp CPU instruction. "
                          "We cannot detect the features of your current CPU, but will proceed anyway. "
                          "If you get a crash, rebuild GROMACS with the GMX_USE_RDTSCP=OFF CMake option.",
                          programName);
        }
    }
    else
    {
        bool cpuHasRdtscp = cpuInfo.feature(gmx::CpuInfo::Feature::X86_Rdtscp);

        if (!cpuHasRdtscp && binaryUsesRdtscp)
        {
            gmx_fatal(FARGS, "The %s executable was compiled to use the rdtscp CPU instruction. "
                      "However, this is not supported by the current hardware and continuing would lead to a crash. "
                      "Please rebuild GROMACS with the GMX_USE_RDTSCP=OFF CMake option.",
                      programName);
        }

        if (cpuHasRdtscp && !binaryUsesRdtscp)
        {
            md_print_warn(cr, fplog, "The current CPU can measure timings more accurately than the code in\n"
                          "%s was configured to use. This might affect your simulation\n"
                          "speed as accurate timings are needed for load-balancing.\n"
                          "Please consider rebuilding %s with the GMX_USE_RDTSCP=ON CMake option.\n",
                          programName, programName);
        }
    }
}
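Both variants compare a compile-time choice (HAVE_RDTSCP) against what the running CPU reports. As a minimal illustration of the runtime side of that check, the sketch below (not GROMACS code) reads the RDTSCP bit directly via CPUID leaf 0x80000001, EDX bit 27, using the GCC/Clang <cpuid.h> helper; the GROMACS machinery such as gmx::CpuInfo and md_print_warn is assumed, not reproduced.

/* Illustrative sketch only: runtime RDTSCP detection on x86 with GCC/Clang.
 * RDTSCP support is reported in CPUID.80000001H:EDX bit 27. */
#include <cpuid.h>
#include <stdio.h>

static int cpu_has_rdtscp(void)
{
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

    /* __get_cpuid returns 0 if the requested leaf is unavailable */
    if (!__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
    {
        return 0;
    }
    return (edx >> 27) & 1;
}

int main(void)
{
    /* Mirrors the mismatch logic above: the binary's compile-time choice
     * is compared against what the CPU actually offers. */
#ifdef HAVE_RDTSCP
    int binaryUsesRdtscp = 1;
#else
    int binaryUsesRdtscp = 0;
#endif
    printf("CPU has rdtscp: %d, binary uses rdtscp: %d\n",
           cpu_has_rdtscp(), binaryUsesRdtscp);
    return 0;
}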
Example #5
void check_ir_old_tpx_versions(t_commrec *cr, FILE *fplog,
                               t_inputrec *ir, gmx_mtop_t *mtop)
{
    /* Check required for old tpx files */
    if (IR_TWINRANGE(*ir) && ir->nstlist > 1 &&
        ir->nstcalcenergy % ir->nstlist != 0)
    {
        md_print_warn(cr, fplog, "Old tpr file with twin-range settings: modifying energy calculation and/or T/P-coupling frequencies\n");

        if (gmx_mtop_ftype_count(mtop, F_CONSTR) +
            gmx_mtop_ftype_count(mtop, F_CONSTRNC) > 0 &&
            ir->eConstrAlg == econtSHAKE)
        {
            md_print_warn(cr, fplog, "With twin-range cut-off's and SHAKE the virial and pressure are incorrect\n");
            if (ir->epc != epcNO)
            {
                gmx_fatal(FARGS, "Can not do pressure coupling with twin-range cut-off's and SHAKE");
            }
        }
        check_nst_param(fplog, cr, "nstlist", ir->nstlist,
                        "nstcalcenergy", &ir->nstcalcenergy);
        if (ir->epc != epcNO)
        {
            check_nst_param(fplog, cr, "nstlist", ir->nstlist,
                            "nstpcouple", &ir->nstpcouple);
        }
        check_nst_param(fplog, cr, "nstcalcenergy", ir->nstcalcenergy,
                        "nstenergy", &ir->nstenergy);
        check_nst_param(fplog, cr, "nstcalcenergy", ir->nstcalcenergy,
                        "nstlog", &ir->nstlog);
        if (ir->efep != efepNO)
        {
            check_nst_param(fplog, cr, "nstcalcenergy", ir->nstcalcenergy,
                            "nstdhdl", &ir->fepvals->nstdhdl);
        }
    }

    if (EI_VV(ir->eI) && IR_TWINRANGE(*ir) && ir->nstlist > 1)
    {
        gmx_fatal(FARGS, "Twin-range multiple time stepping does not work with integrator %s.", ei_names[ir->eI]);
    }
}
Example #6
void check_nst_param(FILE *fplog, t_commrec *cr,
                     const char *desc_nst, int nst,
                     const char *desc_p, int *p)
{
    if (*p > 0 && *p % nst != 0)
    {
        /* Round up to the next multiple of nst */
        *p = ((*p)/nst + 1)*nst;
        md_print_warn(cr, fplog,
                      "NOTE: %s changes %s to %d\n", desc_nst, desc_p, *p);
    }
}
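check_nst_param rounds an interval up to the next multiple of nst with plain integer arithmetic. A self-contained sketch of just that step, with made-up values (nst = 10, p = 25, not taken from GROMACS), is shown below.

/* Sketch of the round-up-to-next-multiple arithmetic used above.
 * With nst = 10 and p = 25: 25/10 = 2, (2+1)*10 = 30. Values that are
 * already multiples (or <= 0) are left unchanged by the check. */
#include <stdio.h>

int main(void)
{
    int nst = 10;   /* e.g. nstlist, hypothetical value */
    int p   = 25;   /* e.g. nstcalcenergy, hypothetical value */

    if (p > 0 && p % nst != 0)
    {
        p = (p/nst + 1)*nst;
    }
    printf("adjusted value: %d\n", p);   /* prints 30 */
    return 0;
}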
/*! \brief Print all load-balancing settings */
static void print_pme_loadbal_settings(pme_load_balancing_t *pme_lb,
                                       t_commrec            *cr,
                                       FILE                 *fplog,
                                       gmx_bool              bNonBondedOnGPU)
{
    double     pp_ratio, grid_ratio;
    real       pp_ratio_temporary;

    pp_ratio_temporary = pme_loadbal_rlist(&pme_lb->setup[pme_lb->cur])/pme_loadbal_rlist(&pme_lb->setup[0]);
    pp_ratio           = std::pow(static_cast<double>(pp_ratio_temporary), 3.0);
    grid_ratio         = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
        (double)pme_grid_points(&pme_lb->setup[0]);

    fprintf(fplog, "\n");
    fprintf(fplog, "       P P   -   P M E   L O A D   B A L A N C I N G\n");
    fprintf(fplog, "\n");
    /* Here we only warn when the optimal setting is the last one */
    if (pme_lb->elimited != epmelblimNO &&
        pme_lb->cur == pme_loadbal_end(pme_lb)-1)
    {
        fprintf(fplog, " NOTE: The PP/PME load balancing was limited by the %s,\n",
                pmelblim_str[pme_lb->elimited]);
        fprintf(fplog, "       you might not have reached a good load balance.\n");
        if (pme_lb->elimited == epmelblimDD)
        {
            fprintf(fplog, "       Try different mdrun -dd settings or lower the -dds value.\n");
        }
        fprintf(fplog, "\n");
    }
    fprintf(fplog, " PP/PME load balancing changed the cut-off and PME settings:\n");
    fprintf(fplog, "           particle-particle                    PME\n");
    fprintf(fplog, "            rcoulomb  rlist            grid      spacing   1/beta\n");
    print_pme_loadbal_setting(fplog, "initial", &pme_lb->setup[0]);
    print_pme_loadbal_setting(fplog, "final", &pme_lb->setup[pme_lb->cur]);
    fprintf(fplog, " cost-ratio           %4.2f             %4.2f\n",
            pp_ratio, grid_ratio);
    fprintf(fplog, " (note that these numbers concern only part of the total PP and PME load)\n");

    if (pp_ratio > 1.5 && !bNonBondedOnGPU)
    {
        md_print_warn(cr, fplog,
                      "NOTE: PME load balancing increased the non-bonded workload by more than 50%%.\n"
                      "      For better performance, use (more) PME ranks (mdrun -npme),\n"
                      "      or if you are beyond the scaling limit, use fewer total ranks (or nodes).\n");
    }
    else
    {
        fprintf(fplog, "\n");
    }
}
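The pp_ratio printed above is the cube of the rlist ratio because the non-bonded pair workload scales roughly with the volume of the cut-off sphere. A standalone sketch of that arithmetic, with hypothetical radii rather than values from a real run:

/* Sketch: non-bonded cost ratio as the cube of the cut-off ratio,
 * as in pp_ratio above. The radii are made-up example values. */
#include <math.h>
#include <stdio.h>

int main(void)
{
    double rlist_initial = 1.0;   /* nm, hypothetical */
    double rlist_final   = 1.2;   /* nm, hypothetical */
    double pp_ratio      = pow(rlist_final/rlist_initial, 3.0);

    /* 1.2^3 = 1.728, i.e. about 73% more pair work; the 1.5 threshold
     * above would trigger the "more than 50%" note on CPU-only runs. */
    printf(" cost-ratio           %4.2f\n", pp_ratio);
    return 0;
}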
Example #8
/* Override the value in inputrec with value passed on the command line (if any) */
static void override_nsteps_cmdline(FILE            *fplog,
                                    gmx_int64_t      nsteps_cmdline,
                                    t_inputrec      *ir,
                                    const t_commrec *cr)
{
    assert(ir);
    assert(cr);

    /* override with anything else than the default -2 */
    if (nsteps_cmdline > -2)
    {
        char sbuf_steps[STEPSTRSIZE];
        char sbuf_msg[STRLEN];

        ir->nsteps = nsteps_cmdline;
        if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1)
        {
            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
                    gmx_step_str(nsteps_cmdline, sbuf_steps),
                    fabs(nsteps_cmdline*ir->delta_t));
        }
        else
        {
            sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
                    gmx_step_str(nsteps_cmdline, sbuf_steps));
        }

        md_print_warn(cr, fplog, "%s\n", sbuf_msg);
    }
    else if (nsteps_cmdline < -2)
    {
        gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %d",
                  nsteps_cmdline);
    }
    /* Do nothing if nsteps_cmdline == -2 */
}
Example #9
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
                     gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
{
    double     *cyc_sum;
    double      tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k;
    double      c2t, c2t_pp, c2t_pme;
    int         i, j, npp, nth_pp, nth_pme, nth_tot;
    char        buf[STRLEN];
    const char *hline = "-----------------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    nth_pp  = wc->nthreads_pp;
    nth_pme = wc->nthreads_pme;

    cyc_sum = wc->cycles_sum;

    npp     = nnodes - npme;
    nth_tot = npp*nth_pp + npme*nth_pme;

    /* When using PME-only nodes, the next line is valid for both
       PP-only and PME-only nodes because they started ewcRUN at the
       same time. */
    tot        = cyc_sum[ewcRUN];
    tot_for_pp = 0;

    /* Conversion factor from cycles to seconds */
    if (tot > 0)
    {
        c2t     = realtime/tot;
        c2t_pp  = c2t * nth_tot / (double) (npp*nth_pp);
        c2t_pme = (npme > 0) ? c2t * nth_tot / (double) (npme*nth_pme) : 0;
    }
    else
    {
        c2t     = 0;
        c2t_pp  = 0;
        c2t_pme = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    print_header(fplog, npp, nth_pp, npme, nth_pme);

    fprintf(fplog, "%s\n", hline);
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (is_pme_subcounter(i))
        {
            /* Do not count these at all */
        }
        else if (npme > 0 && is_pme_counter(i))
        {
            /* Print timing information for PME-only nodes, but add an
             * asterisk so the reader of the table can know that the
             * walltimes are not meant to add up. The asterisk still
             * fits in the required maximum of 19 characters. */
            char buffer[STRLEN];
            snprintf(buffer, STRLEN, "%s *", wcn[i]);
            print_cycles(fplog, c2t_pme, buffer,
                         npme, nth_pme,
                         wc->wcc[i].n, cyc_sum[i], tot);
        }
        else
        {
            /* Print timing information when it is for a PP or PP+PME
               node */
            print_cycles(fplog, c2t_pp, wcn[i],
                         npp, nth_pp,
                         wc->wcc[i].n, cyc_sum[i], tot);
            tot_for_pp += cyc_sum[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
                print_cycles(fplog, c2t_pp, buf,
                             npp, nth_pp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    tot_for_rest = tot * (npp * nth_pp) / (double) nth_tot;
    print_cycles(fplog, c2t_pp, "Rest",
                 npp, nth_pp,
                 -1, tot_for_rest - tot_for_pp, tot);
    fprintf(fplog, "%s\n", hline);
    print_cycles(fplog, c2t, "Total",
                 npp, nth_pp,
                 -1, tot, tot);
    fprintf(fplog, "%s\n", hline);

    if (npme > 0)
    {
        fprintf(fplog,
                "(*) Note that with separate PME nodes, the walltime column actually sums to\n"
                "    twice the total reported, but the cycle count total and %% are correct.\n"
                "%s\n", hline);
    }

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, " Breakdown of PME mesh computation\n");
        fprintf(fplog, "%s\n", hline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (is_pme_subcounter(i))
            {
                print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i],
                             npme > 0 ? npme : npp, nth_pme,
                             wc->wcc[i].n, cyc_sum[i], tot);
            }
        }
        fprintf(fplog, "%s\n", hline);
    }

#ifdef GMX_CYCLE_SUBCOUNTERS
    fprintf(fplog, " Breakdown of PP computation\n");
    fprintf(fplog, "%s\n", hline);
    for (i = 0; i < ewcsNR; i++)
    {
        print_cycles(fplog, c2t_pp, wcsn[i],
                     npp, nth_pp,
                     wc->wcsc[i].n, cyc_sum[ewcNR+i], tot);
    }
    fprintf(fplog, "%s\n", hline);
#endif

    /* print GPU timing summary */
    if (gpu_t)
    {
        const char *k_log_str[2][2] = {
            {"Nonbonded F kernel", "Nonbonded F+ene k."},
            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
        };

        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;

        /* add up the kernel timings */
        tot_k = 0.0;
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                tot_k += gpu_t->ktime[i][j].t;
            }
        }
        tot_gpu += tot_k;

        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
        }
        tot_cpu_overlap *= realtime*1000/tot; /* convert s to ms */

        fprintf(fplog, "\n GPU timings\n%s\n", hline);
        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Pair list H2D",
                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
        print_gputimes(fplog, "X / q H2D",
                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (gpu_t->ktime[i][j].c)
                {
                    print_gputimes(fplog, k_log_str[i][j],
                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
                }
            }
        }

        print_gputimes(fplog, "F D2H",  gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
        fprintf(fplog, "%s\n", hline);

        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
        fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
                tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
                gpu_cpu_ratio);

        /* only print notes related to CPU-GPU load balance with PME */
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");

            /* print note if the imbalance is high with PME case in which
             * CPU-GPU load balancing is possible */
            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
            {
                /* Only the sim master calls this function, so always print to stderr */
                if (gpu_cpu_ratio < 0.75)
                {
                    if (npp > 1)
                    {
                        /* The user could have used -notunepme,
                         * but we currently can't check that here.
                         */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
                    }
                    else
                    {
                        /* We should not end up here, unless the box is
                         * too small for increasing the cut-off for PME tuning.
                         */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss.");
                    }
                }
                if (gpu_cpu_ratio > 1.2)
                {
                    md_print_warn(NULL, fplog,
                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
                }
            }
        }
    }

    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
        (cyc_sum[ewcDOMDEC] > tot*0.1 ||
         cyc_sum[ewcNS] > tot*0.1))
    {
        /* Only the sim master calls this function, so always print to stderr */
        if (wc->wcc[ewcDOMDEC].n == 0)
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
        else
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
                          "      %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcDOMDEC]/tot+0.5),
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
    }

    if (cyc_sum[ewcMoveE] > tot*0.05)
    {
        /* Only the sim master calls this function, so always print to stderr */
        md_print_warn(NULL, fplog,
                      "NOTE: %d %% of the run time was spent communicating energies,\n"
                      "      you might want to use the -gcom option of mdrun\n",
                      (int)(100*cyc_sum[ewcMoveE]/tot+0.5));
    }
}
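The conversion factors near the top of wallcycle_print turn thread-summed cycle counts back into wall time: c2t is seconds per summed cycle, while c2t_pp and c2t_pme rescale by the share of threads doing PP or PME work. A standalone sketch of that arithmetic, with hypothetical rank, thread and cycle counts:

/* Sketch of the cycles-to-seconds conversion used in wallcycle_print.
 * All counts below are hypothetical. */
#include <stdio.h>

int main(void)
{
    int    npp  = 6, nth_pp  = 4;    /* PP ranks and threads per PP rank   */
    int    npme = 2, nth_pme = 2;    /* PME ranks and threads per PME rank */
    double realtime = 120.0;         /* wall time of the run in seconds    */
    double tot      = 3.0e12;        /* thread-summed cycle count (ewcRUN) */

    int    nth_tot = npp*nth_pp + npme*nth_pme;
    double c2t     = realtime/tot;                        /* s per summed cycle */
    double c2t_pp  = c2t*nth_tot/(double)(npp*nth_pp);    /* for PP counters    */
    double c2t_pme = c2t*nth_tot/(double)(npme*nth_pme);  /* for PME counters   */

    printf("c2t %g  c2t_pp %g  c2t_pme %g\n", c2t, c2t_pp, c2t_pme);
    return 0;
}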
Example #10
static int
get_thread_affinity_layout(FILE *fplog,
                           const t_commrec *cr,
                           const gmx_hw_info_t * hwinfo,
                           int nthreads,
                           int pin_offset, int * pin_stride,
                           const int **locality_order)
{
    int         nhwthreads, npkg, ncores, nhwthreads_per_core, rc;
    const int * pkg_id;
    const int * core_id;
    const int * hwthread_id;
    gmx_bool    bPickPinStride;

    if (pin_offset < 0)
    {
        gmx_fatal(FARGS, "Negative thread pinning offset requested");
    }
    if (*pin_stride < 0)
    {
        gmx_fatal(FARGS, "Negative thread pinning stride requested");
    }

    rc = gmx_cpuid_topology(hwinfo->cpuid_info, &nhwthreads, &npkg, &ncores,
                            &nhwthreads_per_core,
                            &pkg_id, &core_id, &hwthread_id, locality_order);

    if (rc != 0)
    {
        /* topology information not available or invalid, ignore it */
        nhwthreads      = hwinfo->nthreads_hw_avail;
        *locality_order = NULL;

        if (nhwthreads <= 0)
        {
            /* We don't know anything about the hardware, don't pin */
            md_print_warn(cr, fplog,
                          "NOTE: We don't know how many logical cores we have, will not pin threads");

            return -1;
        }
    }

    if (nthreads > nhwthreads)
    {
        /* We are oversubscribing, don't pin */
        md_print_warn(NULL, fplog,
                      "WARNING: Oversubscribing the CPU, will not pin threads");

        return -1;
    }

    if (pin_offset + nthreads > nhwthreads)
    {
        /* We are oversubscribing, don't pin */
        md_print_warn(NULL, fplog,
                      "WARNING: The requested pin offset is too large for the available logical cores,\n"
                      "         will not pin threads");

        return -1;
    }


    /* do we need to choose the pinning stride? */
    bPickPinStride = (*pin_stride == 0);

    if (bPickPinStride)
    {
        if (rc == 0 && pin_offset + nthreads*nhwthreads_per_core <= nhwthreads)
        {
            /* Put one thread on each physical core */
            *pin_stride = nhwthreads_per_core;
        }
        else
        {
            /* We don't know if we have SMT, and if we do, we don't know
             * if hw threads in the same physical core are consecutive.
             * Without SMT the pinning layout should not matter too much,
             * so we assume a consecutive layout and maximally spread out
             * the threads at equal threads per core.
             * Note that IBM is the major non-x86 case with cpuid support
             * and probably threads are already pinned by the queuing system,
             * so we wouldn't end up here in the first place.
             */
            *pin_stride = (nhwthreads - pin_offset)/nthreads;
        }
    }
    else
    {
        /* Check the placement of the thread with the largest index to make sure
         * that the offset & stride doesn't cause pinning beyond the last hardware thread. */
        if (pin_offset + (nthreads-1)*(*pin_stride) >= nhwthreads)
        {
            /* We are oversubscribing, don't pin */
            md_print_warn(NULL, fplog,
                          "WARNING: The requested pinning stride is too large for the available logical cores,\n"
                          "         will not pin threads");

            return -1;
        }
    }

    if (fplog != NULL)
    {
        fprintf(fplog, "Pinning threads with a%s logical core stride of %d\n",
                bPickPinStride ? "n auto-selected" : " user-specified",
                *pin_stride);
    }

    return 0;
}
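The offset and stride chosen above define a linear mapping from thread index to logical core, core = offset + thread_id*stride, optionally remapped through locality_order. A standalone sketch of the stride choice and the resulting mapping, using hypothetical hardware numbers:

/* Sketch: how the pin offset and stride selected above map threads to
 * logical cores. With 16 hardware threads, 2 per core, 8 threads and
 * offset 0 (all hypothetical), the stride of 2 gives one thread per
 * physical core. */
#include <stdio.h>

int main(void)
{
    int nhwthreads = 16, nhwthreads_per_core = 2;
    int nthreads   = 8,  pin_offset          = 0;
    int pin_stride, t;

    if (pin_offset + nthreads*nhwthreads_per_core <= nhwthreads)
    {
        pin_stride = nhwthreads_per_core;                 /* one thread per physical core */
    }
    else
    {
        pin_stride = (nhwthreads - pin_offset)/nthreads;  /* spread threads out evenly */
    }

    for (t = 0; t < nthreads; t++)
    {
        printf("thread %d -> logical core %d\n", t, pin_offset + t*pin_stride);
    }
    return 0;
}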
Example #11
gmx_bool done_iterating(const t_commrec *cr, FILE *fplog, int nsteps, gmx_iterate_t *iterate, gmx_bool bFirstIterate, real fom, real *newf)
{
    /* monitor convergence, and use a secant search to propose new
       values.
                                                                  x_{i} - x_{i-1}
       The secant method computes x_{i+1} = x_{i} - f(x_{i}) * ---------------------
                                                                f(x_{i}) - f(x_{i-1})

       The function we are trying to zero is fom-x, where fom is the
       "figure of merit" which is the pressure (or the veta value) we
       would get by putting in an old value of the pressure or veta into
       the incrementor function for the step or half step.  I have
       verified that this gives the same answer as self consistent
       iteration, usually in many fewer steps, especially for small tau_p.

       We could possibly eliminate an iteration with proper use
       of the value from the previous step, but that would take a bit
       more bookkeeping, especially for veta, since tests indicate the
       function of veta on the last step is not sufficiently close to
       guarantee convergence this step. This is
       good enough for now.  On my tests, I could use tau_p down to
       0.02, which is smaller than would ever be necessary in
       practice. Generally, 3-5 iterations will be sufficient */

    real     relerr, err, xmin;
    int      i;
    gmx_bool incycle;

    if (bFirstIterate)
    {
        iterate->x     = fom;
        iterate->f     = fom-iterate->x;
        iterate->xprev = 0;
        iterate->fprev = 0;
        *newf          = fom;
    }
    else
    {
        iterate->f = fom-iterate->x; /* we want to zero this difference */
        if ((iterate->iter_i > 1) && (iterate->iter_i < MAXITERCONST))
        {
            if (iterate->f == iterate->fprev)
            {
                *newf = iterate->f;
            }
            else
            {
                *newf = iterate->x - (iterate->x-iterate->xprev)*(iterate->f)/(iterate->f-iterate->fprev);
            }
        }
        else
        {
            /* just use self-consistent iteration the first step to initialize, or
               if it's not converging (which happens occasionally -- need to investigate why) */
            *newf = fom;
        }
    }
    /* Consider a slight shortcut allowing us to exit one sooner -- we check the
       difference between the closest of x and xprev to the new
       value. To be 100% certain, we should check the difference between
       the last result, and the previous result, or

       relerr = (fabs((x-xprev)/fom));

       but this is pretty much never necessary under typical conditions.
       Checking numerically, it seems to lead to almost exactly the same
       trajectories, but there are small differences out a few decimal
       places in the pressure, and eventually in the v_eta, but it could
       save an iteration.

       if (fabs(*newf-x) < fabs(*newf - xprev)) { xmin = x;} else { xmin = xprev;}
       relerr = (fabs((*newf-xmin) / *newf));
     */

    err    = fabs((iterate->f-iterate->fprev));
    relerr = fabs(err/fom);

    iterate->allrelerr[iterate->iter_i] = relerr;

    if (iterate->iter_i > 0)
    {
        if (debug)
        {
            fprintf(debug, "Iterating NPT constraints: %6i %20.12f%14.6g%20.12f\n",
                    iterate->iter_i, fom, relerr, *newf);
        }

        if ((relerr < CONVERGEITER) || (err < CONVERGEITER) || (fom == 0) || ((iterate->x == iterate->xprev) && iterate->iter_i > 1))
        {
            iterate->bIterationActive = FALSE;
            if (debug)
            {
                fprintf(debug, "Iterating NPT constraints: CONVERGED\n");
            }
            return TRUE;
        }
        if (iterate->iter_i > MAXITERCONST)
        {
            if (relerr < CLOSE_ENOUGH)
            {
                incycle = FALSE;
                for (i = 1; i < CYCLEMAX; i++)
                {
                    if ((iterate->allrelerr[iterate->iter_i-(1+i)] == iterate->allrelerr[iterate->iter_i-1]) &&
                        (iterate->allrelerr[iterate->iter_i-(1+i)] == iterate->allrelerr[iterate->iter_i-(1+2*i)]))
                    {
                        incycle = TRUE;
                        if (debug)
                        {
                            fprintf(debug, "Exiting from an NPT iterating cycle of length %d\n", i);
                        }
                        break;
                    }
                }

                if (incycle)
                {
                    /* step 1: trapped in a numerical attractor */
                    /* we are trapped in a numerical attractor, and can't converge any more, and are close to the final result.
                       Better to give up convergence here than have the simulation die.
                     */
                    iterate->num_close++;
                    iterate->bIterationActive = FALSE;
                    return TRUE;
                }
                else
                {
                    /* Step #2: test if we are reasonably close for other reasons, then monitor the number.  If not, die */

                    /* how many close calls have we had?  If less than a few, we're OK */
                    if (iterate->num_close < MAX_NUMBER_CLOSE)
                    {
                        md_print_warn(cr, fplog, "Slight numerical convergence deviation with NPT at step %d, relative error only %10.5g, likely not a problem, continuing\n", nsteps, relerr);
                        iterate->num_close++;
                        iterate->bIterationActive = FALSE;
                        return TRUE;
                        /* if more than a few, check the total fraction.  If too high, die. */
                    }
                    else if (iterate->num_close/(double)nsteps > FRACTION_CLOSE)
                    {
                        gmx_fatal(FARGS, "Could not converge NPT constraints, too many exceptions (%d%%\n", iterate->num_close/(double)nsteps);
                    }
                }
            }
            else
            {
                gmx_fatal(FARGS, "Could not converge NPT constraints\n");
            }
        }
    }

    iterate->xprev = iterate->x;
    iterate->x     = *newf;
    iterate->fprev = iterate->f;
    iterate->iter_i++;

    return FALSE;
}
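The comment at the top of done_iterating describes the secant update x_{i+1} = x_i - f(x_i)*(x_i - x_{i-1})/(f(x_i) - f(x_{i-1})). The sketch below applies that update to a toy function f(x) = x*x - 2; it illustrates only the iteration, not the NPT bookkeeping, and the tolerances are arbitrary.

/* Sketch of the secant iteration described above, applied to the toy
 * function f(x) = x*x - 2, whose root is sqrt(2) ~= 1.414214. */
#include <math.h>
#include <stdio.h>

static double f(double x)
{
    return x*x - 2.0;
}

int main(void)
{
    double xprev = 1.0, x = 2.0;   /* two starting guesses */
    int    i;

    for (i = 0; i < 20 && fabs(f(x)) > 1e-12; i++)
    {
        double denom = f(x) - f(xprev);
        double xnew;

        if (denom == 0.0)
        {
            break;   /* flat secant; the code above falls back to self-consistent iteration */
        }
        xnew  = x - f(x)*(x - xprev)/denom;
        xprev = x;
        x     = xnew;
    }
    printf("root ~ %.12f after %d iterations\n", x, i);
    return 0;
}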
Example #12
/* Set CPU affinity. Can be important for performance.
   On some systems (e.g. Cray) CPU Affinity is set by default.
   But default assigning doesn't work (well) with only some ranks
   having threads. This causes very low performance.
   External tools have cumbersome syntax for setting affinity
   in the case that only some ranks have threads.
   Thus it is important that GROMACS sets the affinity internally
   if only PME is using threads.
 */
void
gmx_set_thread_affinity(FILE                *fplog,
                        const t_commrec     *cr,
                        gmx_hw_opt_t        *hw_opt,
                        int                  nthreads_pme,
                        const gmx_hw_info_t *hwinfo,
                        const t_inputrec    *inputrec)
{
    int        nth_affinity_set, thread_id_node, thread_id,
               nthread_local, nthread_node, nthread_hw_max, nphyscore;
    int        offset;
    const int *locality_order;
    int        rc;

    if (hw_opt->thread_affinity == threadaffOFF)
    {
        /* Nothing to do */
        return;
    }

    /* If the tMPI thread affinity setting is not supported, encourage the user
     * to report it, as it's either a bug or an exotic platform which we might
     * want to support. */
    if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
    {
        /* we know Mac OS doesn't support setting thread affinity, so there's
           no point in warning the user in that case. In any other case
           the user might be able to do something about it. */
#ifndef __APPLE__
        md_print_warn(NULL, fplog,
                      "Can not set thread affinities on the current platform. On NUMA systems this\n"
                      "can cause performance degradation. If you think your platform should support\n"
                      "setting affinities, contact the GROMACS developers.");
#endif  /* __APPLE__ */
        return;
    }

    /* threads on this MPI process or TMPI thread */
    if (cr->duty & DUTY_PP)
    {
        nthread_local = gmx_omp_nthreads_get(emntNonbonded);
    }
    else
    {
        nthread_local = gmx_omp_nthreads_get(emntPME);
    }

    /* map the current process to cores */
    thread_id_node = 0;
    nthread_node   = nthread_local;
#ifdef GMX_MPI
    if (PAR(cr) || MULTISIM(cr))
    {
        /* We need a prefix sum (scan) of the thread counts within this
         * compute node.
         */
        MPI_Comm comm_intra;

        MPI_Comm_split(MPI_COMM_WORLD, gmx_hostname_num(), cr->rank_intranode,
                       &comm_intra);
        MPI_Scan(&nthread_local, &thread_id_node, 1, MPI_INT, MPI_SUM, comm_intra);
        /* MPI_Scan is inclusive, but here we need exclusive */
        thread_id_node -= nthread_local;
        /* Get the total number of threads on this physical node */
        MPI_Allreduce(&nthread_local, &nthread_node, 1, MPI_INT, MPI_SUM, comm_intra);
        MPI_Comm_free(&comm_intra);
    }
#endif

    if (hw_opt->thread_affinity == threadaffAUTO &&
        nthread_node != hwinfo->nthreads_hw_avail)
    {
        if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail)
        {
            md_print_warn(cr, fplog,
                          "NOTE: The number of threads is not equal to the number of (logical) cores\n"
                          "      and the -pin option is set to auto: will not pin thread to cores.\n"
                          "      This can lead to significant performance degradation.\n"
                          "      Consider using -pin on (and -pinoffset in case you run multiple jobs).\n");
        }

        return;
    }

    offset = 0;
    if (hw_opt->core_pinning_offset != 0)
    {
        offset = hw_opt->core_pinning_offset;
        md_print_info(cr, fplog, "Applying core pinning offset %d\n", offset);
    }

    rc = get_thread_affinity_layout(fplog, cr, hwinfo,
                                    nthread_node,
                                    offset, &hw_opt->core_pinning_stride,
                                    &locality_order);

    if (rc != 0)
    {
        /* Incompatible layout, don't pin, warning was already issued */
        return;
    }

    /* Set the per-thread affinity. In order to be able to check the success
     * of affinity settings, we will set nth_affinity_set to 1 on threads
     * where the affinity setting succeeded and to 0 where it failed.
     * Reducing these 0/1 values over the threads will give the total number
     * of threads on which we succeeded.
     */
    nth_affinity_set = 0;
#pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
    reduction(+:nth_affinity_set)
    {
        int      index, core;
        gmx_bool setaffinity_ret;

        thread_id       = gmx_omp_get_thread_num();
        thread_id_node += thread_id;
        index           = offset + thread_id_node*hw_opt->core_pinning_stride;
        if (locality_order != NULL)
        {
            core = locality_order[index];
        }
        else
        {
            core = index;
        }

        setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);

        /* store the per-thread success-values of the setaffinity */
        nth_affinity_set = (setaffinity_ret == 0);

        if (debug)
        {
            fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
                    cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
        }
    }

    if (nth_affinity_set > nthread_local)
    {
        char msg[STRLEN];

        sprintf(msg, "Looks like we have set affinity for more threads than "
                "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
        gmx_incons(msg);
    }
    else
    {
        /* check & warn if some threads failed to set their affinities */
        if (nth_affinity_set != nthread_local)
        {
            char sbuf1[STRLEN], sbuf2[STRLEN];

            /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
            sbuf1[0] = sbuf2[0] = '\0';
            /* Only add rank info if we have more than one rank. */
            if (cr->nnodes > 1)
            {
#ifdef GMX_MPI
#ifdef GMX_THREAD_MPI
                sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid);
#else           /* GMX_LIB_MPI */
                sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
#endif
#endif          /* GMX_MPI */
            }

            if (nthread_local > 1)
            {
                sprintf(sbuf2, "for %d/%d thread%s ",
                        nthread_local - nth_affinity_set, nthread_local,
                        nthread_local > 1 ? "s" : "");
            }

            md_print_warn(NULL, fplog,
                          "WARNING: %sAffinity setting %sfailed.\n"
                          "         This can cause performance degradation! If you think your setting are\n"
                          "         correct, contact the GROMACS developers.",
                          sbuf1, sbuf2);
        }
    }
    return;
}
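The MPI_Scan above gives an inclusive prefix sum of the per-rank thread counts; subtracting the local count turns it into the exclusive prefix that serves as this rank's first thread index on the node. A minimal standalone MPI sketch of that pattern (the intra-node communicator split used by GROMACS is omitted, and the thread counts are hypothetical):

/* Sketch of the inclusive-to-exclusive scan trick used above to obtain
 * a per-rank starting thread index. Build with an MPI compiler wrapper. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, nthread_local, thread_id_start, nthread_total;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    nthread_local = 2 + rank;   /* hypothetical per-rank thread count */

    /* Inclusive prefix sum ... */
    MPI_Scan(&nthread_local, &thread_id_start, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    /* ... made exclusive by removing this rank's own contribution */
    thread_id_start -= nthread_local;

    /* Total thread count, as with the MPI_Allreduce above */
    MPI_Allreduce(&nthread_local, &nthread_total, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    printf("rank %d: threads start at %d of %d\n", rank, thread_id_start, nthread_total);
    MPI_Finalize();
    return 0;
}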
Example #13
/* Check the process affinity mask and if it is found to be non-zero,
 * will honor it and disable mdrun internal affinity setting.
 * Note that this will only work on Linux as we use a GNU feature.
 */
void
gmx_check_thread_affinity_set(FILE *fplog, const t_commrec *cr,
                              gmx_hw_opt_t *hw_opt, int ncpus,
                              gmx_bool bAfterOpenmpInit)
{
#ifdef HAVE_SCHED_GETAFFINITY
    cpu_set_t mask_current;
    int       i, ret, cpu_count, cpu_set;
    gmx_bool  bAllSet;

    assert(hw_opt);
    if (hw_opt->thread_affinity == threadaffOFF)
    {
        /* internal affinity setting is off, don't bother checking process affinity */
        return;
    }

    CPU_ZERO(&mask_current);
    if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
    {
        /* failed to query affinity mask, will just return */
        if (debug)
        {
            fprintf(debug, "Failed to query affinity mask (error %d)", ret);
        }
        return;
    }

    /* Before proceeding with the actual check, make sure that the number of
     * detected CPUs is >= the CPUs in the current set.
     * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
#ifdef CPU_COUNT
    if (ncpus < CPU_COUNT(&mask_current))
    {
        if (debug)
        {
            fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
                    ncpus, CPU_COUNT(&mask_current));
        }
        return;
    }
#endif /* CPU_COUNT */

    bAllSet = TRUE;
    for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
    {
        bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
    }

    if (!bAllSet)
    {
        if (hw_opt->thread_affinity == threadaffAUTO)
        {
            if (!bAfterOpenmpInit)
            {
                md_print_warn(cr, fplog,
                              "Non-default thread affinity set, disabling internal thread affinity");
            }
            else
            {
                md_print_warn(cr, fplog,
                              "Non-default thread affinity set probably by the OpenMP library,\n"
                              "disabling internal thread affinity");
            }
            hw_opt->thread_affinity = threadaffOFF;
        }
        else
        {
            /* Only warn once, at the last check (bAfterOpenmpInit==TRUE) */
            if (bAfterOpenmpInit)
            {
                md_print_warn(cr, fplog,
                              "Overriding thread affinity set outside %s\n",
                              ShortProgram());
            }
        }

        if (debug)
        {
            fprintf(debug, "Non-default affinity mask found\n");
        }
    }
    else
    {
        if (debug)
        {
            fprintf(debug, "Default affinity mask found\n");
        }
    }
#endif /* HAVE_SCHED_GETAFFINITY */
}
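For reference, a minimal standalone sketch of the Linux affinity query used above: it reads the process mask with sched_getaffinity and counts the CPUs in it. _GNU_SOURCE is needed for the CPU_* macros; this is an illustration, not GROMACS code.

/* Sketch: query the process affinity mask on Linux and count its CPUs,
 * as the check above does before deciding whether to pin threads. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
    cpu_set_t mask;
    int       i, nset = 0;

    CPU_ZERO(&mask);
    if (sched_getaffinity(0, sizeof(cpu_set_t), &mask) != 0)
    {
        perror("sched_getaffinity");
        return 1;
    }
    for (i = 0; i < CPU_SETSIZE; i++)
    {
        nset += (CPU_ISSET(i, &mask) != 0);
    }
    /* On glibc >= 2.6, CPU_COUNT(&mask) gives the same number directly */
    printf("process is allowed to run on %d logical CPUs\n", nset);
    return 0;
}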
/*! \brief Process the timings and try to adjust the PME grid and Coulomb cut-off
 *
 * The adjustment is done to generate a different non-bonded PP and PME load.
 * With separate PME ranks (PP and PME on different processes) or with
 * a GPU (PP on GPU, PME on CPU), PP and PME run on different resources
 * and changing the load will affect the load balance and performance.
 * The total time for a set of integration steps is monitored and a range
 * of grid/cut-off setups is scanned. After calling pme_load_balance many
 * times and acquiring enough statistics, the best performing setup is chosen.
 * Here we try to take into account fluctuations and changes due to external
 * factors as well as DD load balancing.
 */
static void
pme_load_balance(pme_load_balancing_t      *pme_lb,
                 t_commrec                 *cr,
                 FILE                      *fp_err,
                 FILE                      *fp_log,
                 t_inputrec                *ir,
                 t_state                   *state,
                 double                     cycles,
                 interaction_const_t       *ic,
                 struct nonbonded_verlet_t *nbv,
                 struct gmx_pme_t **        pmedata,
                 gmx_int64_t                step)
{
    gmx_bool     OK;
    pme_setup_t *set;
    double       cycles_fast;
    char         buf[STRLEN], sbuf[22];
    real         rtab;

    if (PAR(cr))
    {
        gmx_sumd(1, &cycles, cr);
        cycles /= cr->nnodes;
    }

    set = &pme_lb->setup[pme_lb->cur];
    set->count++;

    rtab = ir->rlistlong + ir->tabext;

    if (set->count % 2 == 1)
    {
        /* Skip the first cycle, because the first step after a switch
         * is much slower due to allocation and/or caching effects.
         */
        return;
    }

    sprintf(buf, "step %4s: ", gmx_step_str(step, sbuf));
    print_grid(fp_err, fp_log, buf, "timed with", set, cycles);

    if (set->count <= 2)
    {
        set->cycles = cycles;
    }
    else
    {
        if (cycles*maxFluctuationAccepted < set->cycles &&
            pme_lb->stage == pme_lb->nstage - 1)
        {
            /* The performance went up a lot (due to e.g. DD load balancing).
             * Add a stage, keep the minima, but rescan all setups.
             */
            pme_lb->nstage++;

            if (debug)
            {
                fprintf(debug, "The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
                        "Increased the number stages to %d"
                        " and ignoring the previous performance\n",
                        set->grid[XX], set->grid[YY], set->grid[ZZ],
                        set->cycles*1e-6, cycles*1e-6, maxFluctuationAccepted,
                        pme_lb->nstage);
            }
        }
        set->cycles = std::min(set->cycles, cycles);
    }

    if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles)
    {
        pme_lb->fastest = pme_lb->cur;

        if (DOMAINDECOMP(cr))
        {
            /* We found a new fastest setting, ensure that with subsequent
             * shorter cut-offs the dynamic load balancing does not make
             * the use of the current cut-off impossible. This solution is
             * a trade-off, as the PME load balancing and DD domain size
             * load balancing can interact in complex ways.
             * With the Verlet kernels, DD load imbalance will usually be
             * mainly due to bonded interaction imbalance, which will often
             * quickly push the domain boundaries beyond the limit for the
             * optimal, PME load balanced, cut-off. But it could be that
             * better overall performance can be obtained with a slightly
             * shorter cut-off and better DD load balancing.
             */
            set_dd_dlb_max_cutoff(cr, pme_lb->setup[pme_lb->fastest].rlistlong);
        }
    }
    cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;

    /* Check in stage 0 if we should stop scanning grids.
     * Stop when the time is more than maxRelativeSlowdownAccepted times that of the fastest.
     */
    if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
        cycles > pme_lb->setup[pme_lb->fastest].cycles*maxRelativeSlowdownAccepted)
    {
        pme_lb->n = pme_lb->cur + 1;
        /* Done with scanning, go to stage 1 */
        switch_to_stage1(pme_lb);
    }

    if (pme_lb->stage == 0)
    {
        int gridsize_start;

        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];

        do
        {
            if (pme_lb->cur+1 < pme_lb->n)
            {
                /* We had already generated the next setup */
                OK = TRUE;
            }
            else
            {
                /* Find the next setup */
                OK = pme_loadbal_increase_cutoff(pme_lb, ir->pme_order, cr->dd);

                if (!OK)
                {
                    pme_lb->elimited = epmelblimPMEGRID;
                }
            }

            if (OK && ir->ePBC != epbcNONE)
            {
                OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlistlong)
                      <= max_cutoff2(ir->ePBC, state->box));
                if (!OK)
                {
                    pme_lb->elimited = epmelblimBOX;
                }
            }

            if (OK)
            {
                pme_lb->cur++;

                if (DOMAINDECOMP(cr))
                {
                    OK = change_dd_cutoff(cr, state, ir,
                                          pme_lb->setup[pme_lb->cur].rlistlong);
                    if (!OK)
                    {
                        /* Failed: do not use this setup */
                        pme_lb->cur--;
                        pme_lb->elimited = epmelblimDD;
                    }
                }
            }
            if (!OK)
            {
                /* We hit the upper limit for the cut-off,
                 * the setup should not go further than cur.
                 */
                pme_lb->n = pme_lb->cur + 1;
                print_loadbal_limited(fp_err, fp_log, step, pme_lb);
                /* Switch to the next stage */
                switch_to_stage1(pme_lb);
            }
        }
        while (OK &&
               !(pme_lb->setup[pme_lb->cur].grid[XX]*
                 pme_lb->setup[pme_lb->cur].grid[YY]*
                 pme_lb->setup[pme_lb->cur].grid[ZZ] <
                 gridsize_start*gridScaleFactor
                 &&
                 pme_lb->setup[pme_lb->cur].grid_efficiency <
                 pme_lb->setup[pme_lb->cur-1].grid_efficiency*relativeEfficiencyFactor));
    }

    if (pme_lb->stage > 0 && pme_lb->end == 1)
    {
        pme_lb->cur   = pme_lb->lower_limit;
        pme_lb->stage = pme_lb->nstage;
    }
    else if (pme_lb->stage > 0 && pme_lb->end > 1)
    {
        /* If stage = nstage-1:
         *   scan over all setups, rerunning only those setups
         *   which are not much slower than the fastest
         * else:
         *   use the next setup
         * Note that we loop backward to minimize the risk of the cut-off
         * getting limited by DD DLB, since the DLB cut-off limit is set
         * to the fastest PME setup.
         */
        do
        {
            pme_lb->cur--;
            if (pme_lb->cur == pme_lb->start)
            {
                pme_lb->stage++;

                pme_lb->cur = pme_lb->end - 1;
            }
        }
        while (pme_lb->stage == pme_lb->nstage - 1 &&
               pme_lb->setup[pme_lb->cur].count > 0 &&
               pme_lb->setup[pme_lb->cur].cycles > cycles_fast*maxRelativeSlowdownAccepted);

        if (pme_lb->stage == pme_lb->nstage)
        {
            /* We are done optimizing, use the fastest setup we found */
            pme_lb->cur = pme_lb->fastest;
        }
    }

    if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
    {
        OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistlong);
        if (!OK)
        {
            /* For some reason the chosen cut-off is incompatible with DD.
             * We should continue scanning a more limited range of cut-offs.
             */
            if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
            {
                /* stage=nstage says we're finished, but we should continue
                 * balancing, so we set back stage which was just incremented.
                 */
                pme_lb->stage--;
            }
            if (pme_lb->cur <= pme_lb->fastest)
            {
                /* This should not happen, as we set limits on the DLB bounds.
                 * But we implement a complete failsafe solution anyhow.
                 */
                md_print_warn(cr, fp_log, "The fastest PP/PME load balancing setting (cutoff %.3f nm) is no longer available due to DD DLB or box size limitations\n");
                pme_lb->fastest = pme_lb->lower_limit;
                pme_lb->start   = pme_lb->lower_limit;
            }
            /* Limit the range to below the current cut-off, scan from start */
            pme_lb->end         = pme_lb->cur;
            pme_lb->cur         = pme_lb->start;
            pme_lb->elimited    = epmelblimDD;
            print_loadbal_limited(fp_err, fp_log, step, pme_lb);
        }
    }

    /* Change the Coulomb cut-off and the PME grid */

    set = &pme_lb->setup[pme_lb->cur];

    ic->rcoulomb     = set->rcut_coulomb;
    ic->rlist        = set->rlist;
    ic->rlistlong    = set->rlistlong;
    ir->nstcalclr    = set->nstcalclr;
    ic->ewaldcoeff_q = set->ewaldcoeff_q;
    /* TODO: centralize the code that sets the potentials shifts */
    if (ic->coulomb_modifier == eintmodPOTSHIFT)
    {
        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff_q*ic->rcoulomb);
    }
    if (EVDW_PME(ic->vdwtype))
    {
        /* We have PME for both Coulomb and VdW, set rvdw equal to rcoulomb */
        ic->rvdw            = set->rcut_coulomb;
        ic->ewaldcoeff_lj   = set->ewaldcoeff_lj;
        if (ic->vdw_modifier == eintmodPOTSHIFT)
        {
            real       crc2;

            ic->dispersion_shift.cpot = -std::pow(static_cast<double>(ic->rvdw), -6.0);
            ic->repulsion_shift.cpot  = -std::pow(static_cast<double>(ic->rvdw), -12.0);
            ic->sh_invrc6             = -ic->dispersion_shift.cpot;
            crc2                      = sqr(ic->ewaldcoeff_lj*ic->rvdw);
            ic->sh_lj_ewald           = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*std::pow(static_cast<double>(ic->rvdw), -6.0);
        }
    }

    /* We always re-initialize the tables whether they are used or not */
    init_interaction_const_tables(NULL, ic, rtab);

    nbnxn_gpu_pme_loadbal_update_param(nbv, ic);

    /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
     * also sharing texture references. To keep the code simple, we don't
     * treat texture references as shared resources, but this means that
     * the coulomb_tab texture ref will get updated by multiple threads.
     * Hence, to ensure that the non-bonded kernels don't start before all
     * texture binding operations are finished, we need to wait for all ranks
     * to arrive here before continuing.
     *
     * Note that we could omit this barrier if GPUs are not shared (or
     * texture objects are used), but as this is initialization code, there
     * is no point in complicating things.
     */
#ifdef GMX_THREAD_MPI
    if (PAR(cr) && use_GPU(nbv))
    {
        gmx_barrier(cr);
    }
#endif  /* GMX_THREAD_MPI */

    if (!pme_lb->bSepPMERanks)
    {
        if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
        {
            /* Generate a new PME data structure,
             * copying part of the old pointers.
             */
            gmx_pme_reinit(&set->pmedata,
                           cr, pme_lb->setup[0].pmedata, ir,
                           set->grid);
        }
        *pmedata = set->pmedata;
    }
    else
    {
        /* Tell our PME-only rank to switch grid */
        gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff_q, set->ewaldcoeff_lj);
    }

    if (debug)
    {
        print_grid(NULL, debug, "", "switched to", set, -1);
    }

    if (pme_lb->stage == pme_lb->nstage)
    {
        print_grid(fp_err, fp_log, "", "optimal", set, -1);
    }
}
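When the cut-off changes under potential-shift modifiers, the shift constants are recomputed from the new radius: the Ewald Coulomb shift is erfc(beta_q*rc), the plain dispersion and repulsion shifts are -rc^-6 and -rc^-12, and the LJ-PME shift combines an exponential/polynomial factor with rc^-6, as in the assignments above. A standalone sketch of just those formulas, with hypothetical parameter values:

/* Sketch of the potential-shift constants recomputed above when the
 * cut-off changes. All parameter values are hypothetical. */
#include <math.h>
#include <stdio.h>

int main(void)
{
    double rc      = 1.2;   /* new Coulomb/VdW cut-off, nm          */
    double beta_q  = 2.6;   /* Ewald coefficient for Coulomb, 1/nm  */
    double beta_lj = 2.0;   /* Ewald coefficient for LJ-PME, 1/nm   */

    double sh_ewald    = erfc(beta_q*rc);
    double disp_shift  = -pow(rc, -6.0);
    double rep_shift   = -pow(rc, -12.0);
    double crc2        = (beta_lj*rc)*(beta_lj*rc);
    double sh_lj_ewald = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(rc, -6.0);

    printf("sh_ewald %g  dispersion %g  repulsion %g  sh_lj_ewald %g\n",
           sh_ewald, disp_shift, rep_shift, sh_lj_ewald);
    return 0;
}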
Example #15
static int
get_thread_affinity_layout(FILE *fplog,
                           const t_commrec *cr,
                           const gmx_hw_info_t * hwinfo,
                           int nthreads,
                           int pin_offset, int * pin_stride,
                           const int **locality_order)
{
    int         nhwthreads, npkg, ncores, nhwthreads_per_core, rc;
    const int * pkg_id;
    const int * core_id;
    const int * hwthread_id;

    if (pin_offset < 0)
    {
        gmx_fatal(FARGS, "Negative thread pinning offset requested");
    }
    if (*pin_stride < 0)
    {
        gmx_fatal(FARGS, "Negative thread pinning stride requested");
    }

    rc = gmx_cpuid_topology(hwinfo->cpuid_info, &nhwthreads, &npkg, &ncores,
                            &nhwthreads_per_core,
                            &pkg_id, &core_id, &hwthread_id, locality_order);

    if (rc != 0)
    {
        nhwthreads      = hwinfo->nthreads_hw_avail;
        *locality_order = NULL;

        if (nhwthreads <= 0)
        {
            /* We don't know anything about the hardware, don't pin */
            md_print_warn(cr, fplog,
                          "We don't know how many logical cores we have, will not pin threads");

            return -1;
        }
    }

    if (pin_offset + nthreads > nhwthreads)
    {
        /* We are oversubscribing, don't pin */
        md_print_warn(NULL, fplog,
                      "More threads requested than available logical cores, will not pin threads");

        return -1;
    }

    /* Check if we need to choose the pinning stride */
    if (*pin_stride == 0)
    {
        if (rc == 0 && pin_offset + nthreads*nhwthreads_per_core <= nhwthreads)
        {
            /* Put one thread on each physical core */
            *pin_stride = nhwthreads_per_core;
        }
        else
        {
            /* We don't know if we have SMT, and if we do, we don't know
             * if hw threads in the same physical core are consecutive.
             * Without SMT the pinning layout should not matter too much,
             * so we assume a consecutive layout and maximally spread out
             * the threads at equal threads per core.
             * Note that IBM is the major non-x86 case with cpuid support
             * and probably threads are already pinned by the queuing system,
             * so we wouldn't end up here in the first place.
             */
            *pin_stride = (nhwthreads - pin_offset)/nthreads;
        }

        if (fplog != NULL)
        {
            fprintf(fplog, "Pinning threads with a logical core stride of %d\n",
                    *pin_stride);
        }
    }
    else
    {
        if (pin_offset + nthreads*(*pin_stride) > nhwthreads)
        {
            /* We are oversubscribing, don't pin */
            md_print_warn(NULL, fplog,
                          "The requested pinning stride is too large for the available logical cores, will not pin threads");

            return -1;
        }
    }

    return 0;
}
Example #16
void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
                     gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
{
    double     *cycles;
    double      c2t, tot, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, sum, tot_k;
    int         i, j, npp, nth_pp, nth_pme, nth_tot;
    char        buf[STRLEN];
    const char *hline = "-----------------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    nth_pp  = wc->nthreads_pp;
    nth_pme = wc->nthreads_pme;

    cycles = wc->cycles_sum;

    if (npme > 0)
    {
        npp = nnodes - npme;

        nth_tot = npp*nth_pp + npme*nth_pme;
    }
    else
    {
        npp  = nnodes;
        npme = nnodes;

        nth_tot = npp*nth_pp;
    }

    tot = cycles[ewcRUN];

    /* Conversion factor from cycles to seconds */
    if (tot > 0)
    {
        c2t = realtime/tot;
    }
    else
    {
        c2t = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    fprintf(fplog, " Computing:         Nodes   Th.     Count  Wall t (s)     G-Cycles       %c\n", '%');
    fprintf(fplog, "%s\n", hline);
    sum = 0;
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (!is_pme_subcounter(i))
        {
            print_cycles(fplog, c2t, wcn[i], nth_tot,
                         is_pme_counter(i) ? npme : npp,
                         is_pme_counter(i) ? nth_pme : nth_pp,
                         wc->wcc[i].n, cycles[i], tot);
            sum += cycles[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
                print_cycles(fplog, c2t, buf, nth_tot,
                             is_pme_counter(i) ? npme : npp,
                             is_pme_counter(i) ? nth_pme : nth_pp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    print_cycles(fplog, c2t, "Rest", nth_tot, npp, -1, 0, tot-sum, tot);
    fprintf(fplog, "%s\n", hline);
    print_cycles(fplog, c2t, "Total", nth_tot, nnodes, -1, 0, tot, tot);
    fprintf(fplog, "%s\n", hline);

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, "%s\n", hline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (is_pme_subcounter(i))
            {
                print_cycles(fplog, c2t, wcn[i], nth_tot,
                             is_pme_counter(i) ? npme : npp,
                             is_pme_counter(i) ? nth_pme : nth_pp,
                             wc->wcc[i].n, cycles[i], tot);
            }
        }
        fprintf(fplog, "%s\n", hline);
    }

#ifdef GMX_CYCLE_SUBCOUNTERS
    fprintf(fplog, "%s\n", hline);
    for (i = 0; i < ewcsNR; i++)
    {
        print_cycles(fplog, c2t, wcsn[i], nth_tot, npp, nth_pp,
                     wc->wcsc[i].n, cycles[ewcNR+i], tot);
    }
    fprintf(fplog, "%s\n", hline);
#endif

    /* print GPU timing summary */
    if (gpu_t)
    {
        const char *k_log_str[2][2] = {
            {"Nonbonded F kernel", "Nonbonded F+ene k."},
            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
        };

        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;

        /* add up the kernel timings */
        tot_k = 0.0;
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                tot_k += gpu_t->ktime[i][j].t;
            }
        }
        tot_gpu += tot_k;

        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
        }
        tot_cpu_overlap *= c2t * 1000; /* convert s to ms */

        fprintf(fplog, "\n GPU timings\n%s\n", hline);
        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Pair list H2D",
                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
        print_gputimes(fplog, "X / q H2D",
                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (gpu_t->ktime[i][j].c)
                {
                    print_gputimes(fplog, k_log_str[i][j],
                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
                }
            }
        }

        print_gputimes(fplog, "F D2H",  gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
        fprintf(fplog, "%s\n", hline);

        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
        fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
                tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
                gpu_cpu_ratio);

        /* only print notes related to CPU-GPU load balance with PME */
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");

            /* print note if the imbalance is high with PME case in which
             * CPU-GPU load balancing is possible */
            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
            {
                /* Only the sim master calls this function, so always print to stderr */
                if (gpu_cpu_ratio < 0.75)
                {
                    if (npp > 1)
                    {
                        /* The user could have used -notunepme,
                         * but we currently can't check that here.
                         */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
                    }
                    else
                    {
                        /* We should not end up here, unless the box is
                         * too small for increasing the cut-off for PME tuning.
                         */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss.");
                    }
                }
                if (gpu_cpu_ratio > 1.2)
                {
                    md_print_warn(NULL, fplog,
                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
                }
            }
        }
    }

    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
        (cycles[ewcDOMDEC] > tot*0.1 ||
         cycles[ewcNS] > tot*0.1))
    {
        /* Only the sim master calls this function, so always print to stderr */
        if (wc->wcc[ewcDOMDEC].n == 0)
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cycles[ewcNS]/tot+0.5));
        }
        else
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
                          "      %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cycles[ewcDOMDEC]/tot+0.5),
                          (int)(100*cycles[ewcNS]/tot+0.5));
        }
    }

    if (cycles[ewcMoveE] > tot*0.05)
    {
        /* Only the sim master calls this function, so always print to stderr */
        md_print_warn(NULL, fplog,
                      "NOTE: %d %% of the run time was spent communicating energies,\n"
                      "      you might want to use the -gcom option of mdrun\n",
                      (int)(100*cycles[ewcMoveE]/tot+0.5));
    }
}
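
As a hedged illustration of the accounting that wallcycle_print() performs: each counter's wall time is its cycle count times c2t = realtime/total_cycles, and the percentage column is its share of the total cycles. The numbers below are made up and the snippet is not part of GROMACS.

#include <cstdio>

int main()
{
    /* Hypothetical totals: 1.2e9 cycles recorded over 2.5 s of wall time */
    double totalCycles   = 1.2e9;
    double realtime      = 2.5;
    double c2t           = realtime/totalCycles;  /* seconds per cycle */

    /* One counter that accumulated 3.0e8 of those cycles */
    double counterCycles = 3.0e8;
    std::printf("Wall t (s): %.3f   %%: %.1f\n",
                counterCycles*c2t, 100.0*counterCycles/totalCycles);
    return 0;
}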
void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                        const gmx_hw_opt_t  *hw_opt,
                                        gmx_bool             bNtOmpOptionSet,
                                        t_commrec           *cr,
                                        FILE                *fplog)
{
#if defined GMX_OPENMP && defined GMX_MPI
    int         nth_omp_min, nth_omp_max, ngpu;
    char        buf[1000];
#ifdef GMX_THREAD_MPI
    const char *mpi_option = " (option -ntmpi)";
#else
    const char *mpi_option = "";
#endif

    /* This function should be called after thread-MPI (when configured) and
     * OpenMP have been initialized. Check that here.
     */
#ifdef GMX_THREAD_MPI
    GMX_RELEASE_ASSERT(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max, "Inconsistent OpenMP thread count default values");
    GMX_RELEASE_ASSERT(hw_opt->nthreads_tmpi >= 1, "Must have at least one thread-MPI rank");
#endif
    GMX_RELEASE_ASSERT(gmx_omp_nthreads_get(emntDefault) >= 1, "Must have at least one OpenMP thread");

    nth_omp_min = gmx_omp_nthreads_get(emntDefault);
    nth_omp_max = gmx_omp_nthreads_get(emntDefault);
    ngpu        = hw_opt->gpu_opt.n_dev_use;

    /* Thread-MPI seems to have a bug with reduce on 1 node, so use a cond. */
    if (cr->nnodes + cr->npmenodes > 1)
    {
        int count[3], count_max[3];

        count[0] = -nth_omp_min;
        count[1] =  nth_omp_max;
        count[2] =  ngpu;

        MPI_Allreduce(count, count_max, 3, MPI_INT, MPI_MAX, cr->mpi_comm_mysim);

        /* In case of an inhomogeneous run setup we use the maximum counts */
        nth_omp_min = -count_max[0];
        nth_omp_max =  count_max[1];
        ngpu        =  count_max[2];
    }

    int nthreads_omp_mpi_ok_min;

    if (ngpu == 0)
    {
        nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_cpu;
    }
    else
    {
        /* With GPUs we set the minimum number of OpenMP threads to 2 to catch
         * cases where the user specifies #ranks == #cores.
         */
        nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_gpu;
    }

    if (DOMAINDECOMP(cr) && cr->nnodes > 1)
    {
        if (nth_omp_max < nthreads_omp_mpi_ok_min ||
            (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
             nth_omp_max > nthreads_omp_mpi_ok_max))
        {
            /* Note that we print target_max here, not ok_max */
            sprintf(buf, "Your choice of number of MPI ranks and amount of resources results in using %d OpenMP threads per rank, which is most likely inefficient. The optimum is usually between %d and %d threads per rank.",
                    nth_omp_max,
                    nthreads_omp_mpi_ok_min,
                    nthreads_omp_mpi_target_max);

            if (bNtOmpOptionSet)
            {
                md_print_warn(cr, fplog, "NOTE: %s\n", buf);
            }
            else
            {
                /* This fatal error, and the one below, is nasty, but it's
                 * probably the only way to ensure that all users don't waste
                 * a lot of resources, since many users don't read logs/stderr.
                 */
                gmx_fatal(FARGS, "%s If you want to run with this setup, specify the -ntomp option. But we suggest to change the number of MPI ranks%s.", buf, mpi_option);
            }
        }
    }
    else
    {
        /* No domain decomposition (or only one domain) */
        if (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
            nth_omp_max > nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
        {
            /* To arrive here, the user/system set #ranks and/or #OMPthreads */
            gmx_bool bEnvSet;
            char     buf2[256];

            bEnvSet = (getenv("OMP_NUM_THREADS") != NULL);

            if (bNtOmpOptionSet || bEnvSet)
            {
                sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
            }
            else
            {
                sprintf(buf2, "Your choice of %d MPI rank%s and the use of %d total threads %sleads to the use of %d OpenMP threads",
                        cr->nnodes + cr->npmenodes,
                        cr->nnodes + cr->npmenodes == 1 ? "" : "s",
                        hw_opt->nthreads_tot > 0 ? hw_opt->nthreads_tot : hwinfo->nthreads_hw_avail,
                        hwinfo->nphysicalnode > 1 ? "on a node " : "",
                        nth_omp_max);
            }
            sprintf(buf, "%s, whereas we expect the optimum to be with more MPI ranks with %d to %d OpenMP threads.",
                    buf2, nthreads_omp_mpi_ok_min, nthreads_omp_mpi_target_max);

            /* We can not quit with a fatal error when OMP_NUM_THREADS is set
             * with different values per rank or node, since in that case
             * the user can not set -ntomp to override the error.
             */
            if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
            {
                md_print_warn(cr, fplog, "NOTE: %s\n", buf);
            }
            else
            {
                gmx_fatal(FARGS, "%s If you want to run with this many OpenMP threads, specify the -ntomp option. But we suggest to increase the number of MPI ranks%s.", buf, mpi_option);
            }
        }
    }
#else /* GMX_OPENMP && GMX_MPI */
      /* No OpenMP and/or MPI: it doesn't make much sense to check */
    GMX_UNUSED_VALUE(hw_opt);
    GMX_UNUSED_VALUE(bNtOmpOptionSet);
    /* Check if we have more than 1 physical core, if detected,
     * or more than 1 hardware thread if physical cores were not detected.
     */
#if !(defined GMX_OPENMP) && !(defined GMX_MPI)
    if ((hwinfo->ncore > 1) ||
        (hwinfo->ncore == 0 && hwinfo->nthreads_hw_avail > 1))
    {
        md_print_warn(cr, fplog, "NOTE: GROMACS was compiled without OpenMP and (thread-)MPI support, can only use a single CPU core\n");
    }
#else
    GMX_UNUSED_VALUE(hwinfo);
    GMX_UNUSED_VALUE(cr);
    GMX_UNUSED_VALUE(fplog);
#endif

#endif /* GMX_OPENMP && GMX_MPI */
}
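
The reduction above obtains both the global minimum and maximum OpenMP thread counts with a single MPI_MAX reduction by negating the value whose minimum is wanted. A minimal standalone MPI sketch of that trick (illustrative values, not GROMACS code):

#include <cstdio>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int nthOmp       = 2 + rank;              /* some per-rank value */
    int count[2]     = { -nthOmp, nthOmp };   /* negate the value whose minimum we want */
    int count_max[2];

    MPI_Allreduce(count, count_max, 2, MPI_INT, MPI_MAX, MPI_COMM_WORLD);

    if (rank == 0)
    {
        std::printf("min = %d, max = %d\n", -count_max[0], count_max[1]);
    }

    MPI_Finalize();
    return 0;
}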
Example #18
static int
get_thread_affinity_layout(FILE *fplog,
                           const t_commrec *cr,
                           const gmx_hw_info_t * hwinfo,
                           int   threads,
                           int pin_offset, int * pin_stride,
                           int **localityOrder)
{
    int                          hwThreads;
    int                          hwThreadsPerCore = 0;
    bool                         bPickPinStride;
    bool                         haveTopology;
    bool                         invalidLayout;

    const gmx::HardwareTopology &hwTop = *hwinfo->hardwareTopology;

    haveTopology = (hwTop.supportLevel() >= gmx::HardwareTopology::SupportLevel::Basic);

    if (pin_offset < 0)
    {
        gmx_fatal(FARGS, "Negative thread pinning offset requested");
    }
    if (*pin_stride < 0)
    {
        gmx_fatal(FARGS, "Negative thread pinning stride requested");
    }

    if (haveTopology)
    {
        hwThreads           = hwTop.machine().logicalProcessorCount;
        // Just use the value for the first core
        hwThreadsPerCore    = hwTop.machine().sockets[0].cores[0].hwThreads.size();
        snew(*localityOrder, hwThreads);
        int i = 0;
        for (auto &s : hwTop.machine().sockets)
        {
            for (auto &c : s.cores)
            {
                for (auto &t : c.hwThreads)
                {
                    (*localityOrder)[i++] = t.logicalProcessorId;
                }
            }
        }
    }
    else
    {
        /* topology information not available or invalid, ignore it */
        hwThreads       = hwinfo->nthreads_hw_avail;
        *localityOrder  = NULL;

        if (hwThreads <= 0)
        {
            /* We don't know anything about the hardware, don't pin */
            md_print_warn(cr, fplog,
                          "NOTE: No information on available cores, thread pinning disabled.\n");

            return -1;
        }
    }

    invalidLayout = (threads > hwThreads);
    if (invalidWithinSimulation(cr, invalidLayout))
    {
        /* We are oversubscribing, don't pin */
        md_print_warn(cr, fplog,
                      "NOTE: Oversubscribing the CPU, will not pin threads.\n");
    }
    if (invalidLayout)
    {
        return -1;
    }

    invalidLayout = (pin_offset + threads > hwThreads);
    if (invalidWithinSimulation(cr, invalidLayout))
    {
        /* We are oversubscribing, don't pin */
        md_print_warn(cr, fplog,
                      "WARNING: Requested offset too large for available cores, thread pinning disabled.\n");

    }
    if (invalidLayout)
    {
        return -1;
    }


    /* do we need to choose the pinning stride? */
    bPickPinStride = (*pin_stride == 0);

    if (bPickPinStride)
    {
        if (haveTopology && pin_offset + threads*hwThreadsPerCore <= hwThreads)
        {
            /* Put one thread on each physical core */
            *pin_stride = hwThreadsPerCore;
        }
        else
        {
            /* We don't know if we have SMT, and if we do, we don't know
             * if hw threads in the same physical core are consecutive.
             * Without SMT the pinning layout should not matter too much,
             * so we assume a consecutive layout and maximally spread out
             * the threads at equal threads per core.
             * Note that IBM is the major non-x86 case with cpuid support
             * and probably threads are already pinned by the queuing system,
             * so we wouldn't end up here in the first place.
             */
            *pin_stride = (hwThreads - pin_offset)/threads;
        }
    }
    else
    {
        /* Check the placement of the thread with the largest index to make sure
         * that the offset & stride doesn't cause pinning beyond the last hardware thread. */
        invalidLayout = (pin_offset + (threads-1)*(*pin_stride) >= hwThreads);
        if (invalidWithinSimulation(cr, invalidLayout))
        {
            /* We are oversubscribing, don't pin */
            md_print_warn(cr, fplog,
                          "WARNING: Requested stride too large for available cores, thread pinning disabled.\n");

        }
        if (invalidLayout)
        {
            return -1;
        }
    }

    if (fplog != NULL)
    {
        fprintf(fplog, "Pinning threads with a%s logical core stride of %d\n",
                bPickPinStride ? "n auto-selected" : " user-specified",
                *pin_stride);
    }

    return 0;
}
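
A self-contained sketch of how a locality order like the one above can be built: walk sockets, then cores, then hardware threads, so that logical processors sharing a core end up adjacent in the array. The Core and Socket structs are simplified stand-ins, not the real gmx::HardwareTopology API.

#include <cstdio>
#include <vector>

/* Simplified stand-ins for the hardware-topology types */
struct Core   { std::vector<int> hwThreads; };  /* logical processor IDs */
struct Socket { std::vector<Core> cores; };

int main()
{
    /* One socket, two cores, 2-way SMT with interleaved OS numbering */
    Socket s;
    s.cores.push_back({ {0, 2} });  /* core 0: hw threads 0 and 2 */
    s.cores.push_back({ {1, 3} });  /* core 1: hw threads 1 and 3 */
    std::vector<Socket> sockets = { s };

    /* Socket -> core -> hw thread order keeps threads of a core adjacent */
    std::vector<int> localityOrder;
    for (const auto &sock : sockets)
    {
        for (const auto &c : sock.cores)
        {
            for (int id : c.hwThreads)
            {
                localityOrder.push_back(id);
            }
        }
    }

    for (int id : localityOrder)
    {
        std::printf("%d ", id);  /* prints: 0 2 1 3 */
    }
    std::printf("\n");
    return 0;
}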
Example #19
int mdrunner(gmx_hw_opt_t *hw_opt,
             FILE *fplog, t_commrec *cr, int nfile,
             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
             gmx_bool bCompact, int nstglobalcomm,
             ivec ddxyz, int dd_node_order, real rdd, real rconstr,
             const char *dddlb_opt, real dlb_scale,
             const char *ddcsx, const char *ddcsy, const char *ddcsz,
             const char *nbpu_opt, int nstlist_cmdline,
             gmx_int64_t nsteps_cmdline, int nstepout, int resetstep,
             int gmx_unused nmultisim, int repl_ex_nst, int repl_ex_nex,
             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
             int imdport, unsigned long Flags)
{
    gmx_bool                  bForceUseGPU, bTryUseGPU, bRerunMD;
    t_inputrec               *inputrec;
    t_state                  *state = NULL;
    matrix                    box;
    gmx_ddbox_t               ddbox = {0};
    int                       npme_major, npme_minor;
    t_nrnb                   *nrnb;
    gmx_mtop_t               *mtop          = NULL;
    t_mdatoms                *mdatoms       = NULL;
    t_forcerec               *fr            = NULL;
    t_fcdata                 *fcd           = NULL;
    real                      ewaldcoeff_q  = 0;
    real                      ewaldcoeff_lj = 0;
    struct gmx_pme_t        **pmedata       = NULL;
    gmx_vsite_t              *vsite         = NULL;
    gmx_constr_t              constr;
    int                       nChargePerturbed = -1, nTypePerturbed = 0, status;
    gmx_wallcycle_t           wcycle;
    gmx_bool                  bReadEkin;
    gmx_walltime_accounting_t walltime_accounting = NULL;
    int                       rc;
    gmx_int64_t               reset_counters;
    gmx_edsam_t               ed           = NULL;
    int                       nthreads_pme = 1;
    int                       nthreads_pp  = 1;
    gmx_membed_t              membed       = NULL;
    gmx_hw_info_t            *hwinfo       = NULL;
    /* The master rank decides early on bUseGPU and broadcasts this later */
    gmx_bool                  bUseGPU      = FALSE;

    /* CAUTION: threads may be started later on in this function, so
       cr doesn't reflect the final parallel state right now */
    snew(inputrec, 1);
    snew(mtop, 1);

    if (Flags & MD_APPENDFILES)
    {
        fplog = NULL;
    }

    bRerunMD     = (Flags & MD_RERUN);
    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;

    /* Detect hardware, gather information. This is an operation that is
     * global for this process (MPI rank). */
    hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU);

    gmx_print_detected_hardware(fplog, cr, hwinfo);

    if (fplog != NULL)
    {
        /* Print references after all software/hardware printing */
        please_cite(fplog, "Abraham2015");
        please_cite(fplog, "Pall2015");
        please_cite(fplog, "Pronk2013");
        please_cite(fplog, "Hess2008b");
        please_cite(fplog, "Spoel2005a");
        please_cite(fplog, "Lindahl2001a");
        please_cite(fplog, "Berendsen95a");
    }

    snew(state, 1);
    if (SIMMASTER(cr))
    {
        /* Read (nearly) all data required for the simulation */
        read_tpx_state(ftp2fn(efTPR, nfile, fnm), inputrec, state, NULL, mtop);

        if (inputrec->cutoff_scheme == ecutsVERLET)
        {
            /* Here the master rank decides if all ranks will use GPUs */
            bUseGPU = (hwinfo->gpu_info.n_dev_compatible > 0 ||
                       getenv("GMX_EMULATE_GPU") != NULL);

            /* TODO add GPU kernels for this and replace this check by:
             * (bUseGPU && (ir->vdwtype == evdwPME &&
             *               ir->ljpme_combination_rule == eljpmeLB))
             * update the message text and the content of nbnxn_acceleration_supported.
             */
            if (bUseGPU &&
                !nbnxn_gpu_acceleration_supported(fplog, cr, inputrec, bRerunMD))
            {
                /* Fallback message printed by nbnxn_acceleration_supported */
                if (bForceUseGPU)
                {
                    gmx_fatal(FARGS, "GPU acceleration requested, but not supported with the given input settings");
                }
                bUseGPU = FALSE;
            }

            prepare_verlet_scheme(fplog, cr,
                                  inputrec, nstlist_cmdline, mtop, state->box,
                                  bUseGPU);
        }
        else
        {
            if (nstlist_cmdline > 0)
            {
                gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
            }

            if (hwinfo->gpu_info.n_dev_compatible > 0)
            {
                md_print_warn(cr, fplog,
                              "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
                              "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n");
            }

            if (bForceUseGPU)
            {
                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
            }

#ifdef GMX_TARGET_BGQ
            md_print_warn(cr, fplog,
                          "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
                          "      BlueGene/Q. You will observe better performance from using the\n"
                          "      Verlet cut-off scheme.\n");
#endif
        }

        if (inputrec->eI == eiSD2)
        {
            md_print_warn(cr, fplog, "The stochastic dynamics integrator %s is deprecated, since\n"
                          "it is slower than integrator %s and is slightly less accurate\n"
                          "with constraints. Use the %s integrator.",
                          ei_names[inputrec->eI], ei_names[eiSD1], ei_names[eiSD1]);
        }
    }

    /* Check and update the hardware options for internal consistency */
    check_and_update_hw_opt_1(hw_opt, cr);

    /* Early check for externally set process affinity. */
    gmx_check_thread_affinity_set(fplog, cr,
                                  hw_opt, hwinfo->nthreads_hw_avail, FALSE);

#ifdef GMX_THREAD_MPI
    if (SIMMASTER(cr))
    {
        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
        {
            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks");
        }

        /* Since the master knows the cut-off scheme, update hw_opt for this.
         * This is done later for normal MPI and also once more with tMPI
         * for all tMPI ranks.
         */
        check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);

        /* NOW the threads will be started: */
        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
                                                 hw_opt,
                                                 inputrec, mtop,
                                                 cr, fplog, bUseGPU);

        if (hw_opt->nthreads_tmpi > 1)
        {
            t_commrec *cr_old       = cr;
            /* now start the threads. */
            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
                                        oenv, bVerbose, bCompact, nstglobalcomm,
                                        ddxyz, dd_node_order, rdd, rconstr,
                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
                                        nbpu_opt, nstlist_cmdline,
                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
                                        cpt_period, max_hours,
                                        Flags);
            /* the main thread continues here with a new cr. We don't deallocate
               the old cr because other threads may still be reading it. */
            if (cr == NULL)
            {
                gmx_comm("Failed to spawn threads");
            }
        }
    }
#endif
    /* END OF CAUTION: cr is now reliable */

    /* g_membed initialisation *
     * Because we change the mtop, init_membed is called before the init_parallel *
     * (in case we ever want to make it run in parallel) */
    if (opt2bSet("-membed", nfile, fnm))
    {
        if (MASTER(cr))
        {
            fprintf(stderr, "Initializing membed");
        }
        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
    }

    if (PAR(cr))
    {
        /* now broadcast everything to the non-master nodes/threads: */
        init_parallel(cr, inputrec, mtop);

        /* The master rank decided on the use of GPUs,
         * broadcast this information to all ranks.
         */
        gmx_bcast_sim(sizeof(bUseGPU), &bUseGPU, cr);
    }

    if (fplog != NULL)
    {
        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
        fprintf(fplog, "\n");
    }

    /* now make sure the state is initialized and propagated */
    set_state_entries(state, inputrec);

    /* A parallel command line option consistency check that we can
       only do after any threads have started. */
    if (!PAR(cr) &&
        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
    {
        gmx_fatal(FARGS,
                  "The -dd or -npme option request a parallel simulation, "
#ifndef GMX_MPI
                  "but %s was compiled without threads or MPI enabled"
#else
#ifdef GMX_THREAD_MPI
                  "but the number of threads (option -nt) is 1"
#else
                  "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec"
#endif
#endif
                  , output_env_get_program_display_name(oenv)
                  );
    }

    if (bRerunMD &&
        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
    {
        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
    }

    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr))
    {
        gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank");
    }

    if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
    {
        if (cr->npmenodes > 0)
        {
            gmx_fatal_collective(FARGS, cr, NULL,
                                 "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
        }

        cr->npmenodes = 0;
    }

    if (bUseGPU && cr->npmenodes < 0)
    {
        /* With GPUs we don't automatically use PME-only ranks. PME ranks can
         * improve performance with many threads per GPU, since our OpenMP
         * scaling is bad, but it's difficult to automate the setup.
         */
        cr->npmenodes = 0;
    }

#ifdef GMX_FAHCORE
    if (MASTER(cr))
    {
        fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
    }
#endif

    /* NMR restraints must be initialized before load_checkpoint,
     * since with time averaging the history is added to t_state.
     * For proper consistency check we therefore need to extend
     * t_state here.
     * So the PME-only nodes (if present) will also initialize
     * the distance restraints.
     */
    snew(fcd, 1);

    /* This needs to be called before read_checkpoint to extend the state */
    init_disres(fplog, mtop, inputrec, cr, fcd, state, repl_ex_nst > 0);

    init_orires(fplog, mtop, state->x, inputrec, cr, &(fcd->orires),
                state);

    if (DEFORM(*inputrec))
    {
        /* Store the deform reference box before reading the checkpoint */
        if (SIMMASTER(cr))
        {
            copy_mat(state->box, box);
        }
        if (PAR(cr))
        {
            gmx_bcast(sizeof(box), box, cr);
        }
        /* Because we do not have the update struct available yet
         * in which the reference values should be stored,
         * we store them temporarily in static variables.
         * This should be thread safe, since they are only written once
         * and with identical values.
         */
        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
        deform_init_init_step_tpx = inputrec->init_step;
        copy_mat(box, deform_init_box_tpx);
        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
    }

    if (opt2bSet("-cpi", nfile, fnm))
    {
        /* Check if checkpoint file exists before doing continuation.
         * This way we can use identical input options for the first and subsequent runs...
         */
        if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
        {
            load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
                            cr, ddxyz,
                            inputrec, state, &bReadEkin,
                            (Flags & MD_APPENDFILES),
                            (Flags & MD_APPENDFILESSET));

            if (bReadEkin)
            {
                Flags |= MD_READ_EKIN;
            }
        }
    }

    if (MASTER(cr) && (Flags & MD_APPENDFILES))
    {
        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr,
                     Flags, &fplog);
    }

    /* override nsteps with value from cmdline */
    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);

    if (SIMMASTER(cr))
    {
        copy_mat(state->box, box);
    }

    if (PAR(cr))
    {
        gmx_bcast(sizeof(box), box, cr);
    }

    /* Essential dynamics */
    if (opt2bSet("-ei", nfile, fnm))
    {
        /* Open input and output files, allocate space for ED data structure */
        ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
    }

    if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
                     inputrec->eI == eiNM))
    {
        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
                                           dddlb_opt, dlb_scale,
                                           ddcsx, ddcsy, ddcsz,
                                           mtop, inputrec,
                                           box, state->x,
                                           &ddbox, &npme_major, &npme_minor);

        make_dd_communicators(fplog, cr, dd_node_order);

        /* Set overallocation to avoid frequent reallocation of arrays */
        set_over_alloc_dd(TRUE);
    }
    else
    {
        /* PME, if used, is done on all nodes with 1D decomposition */
        cr->npmenodes = 0;
        cr->duty      = (DUTY_PP | DUTY_PME);
        npme_major    = 1;
        npme_minor    = 1;

        if (inputrec->ePBC == epbcSCREW)
        {
            gmx_fatal(FARGS,
                      "pbc=%s is only implemented with domain decomposition",
                      epbc_names[inputrec->ePBC]);
        }
    }

    if (PAR(cr))
    {
        /* After possible communicator splitting in make_dd_communicators.
         * we can set up the intra/inter node communication.
         */
        gmx_setup_nodecomm(fplog, cr);
    }

    /* Initialize per-physical-node MPI process/thread ID and counters. */
    gmx_init_intranode_counters(cr);
#ifdef GMX_MPI
    if (MULTISIM(cr))
    {
        md_print_info(cr, fplog,
                      "This is simulation %d out of %d running as a composite GROMACS\n"
                      "multi-simulation job. Setup for this simulation:\n\n",
                      cr->ms->sim, cr->ms->nsim);
    }
    md_print_info(cr, fplog, "Using %d MPI %s\n",
                  cr->nnodes,
#ifdef GMX_THREAD_MPI
                  cr->nnodes == 1 ? "thread" : "threads"
#else
                  cr->nnodes == 1 ? "process" : "processes"
#endif
                  );
    fflush(stderr);
#endif

    /* Check and update hw_opt for the cut-off scheme */
    check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);

    /* Check and update hw_opt for the number of MPI ranks */
    check_and_update_hw_opt_3(hw_opt);

    gmx_omp_nthreads_init(fplog, cr,
                          hwinfo->nthreads_hw_avail,
                          hw_opt->nthreads_omp,
                          hw_opt->nthreads_omp_pme,
                          (cr->duty & DUTY_PP) == 0,
                          inputrec->cutoff_scheme == ecutsVERLET);

#ifndef NDEBUG
    if (integrator[inputrec->eI].func != do_tpi &&
        inputrec->cutoff_scheme == ecutsVERLET)
    {
        gmx_feenableexcept();
    }
#endif

    if (bUseGPU)
    {
        /* Select GPU id's to use */
        gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU,
                           &hw_opt->gpu_opt);
    }
    else
    {
        /* Ignore (potentially) manually selected GPUs */
        hw_opt->gpu_opt.n_dev_use = 0;
    }

    /* check consistency across ranks of things like SIMD
     * support and number of GPUs selected */
    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);

    /* Now that we know the setup is consistent, check for efficiency */
    check_resource_division_efficiency(hwinfo, hw_opt, Flags & MD_NTOMPSET,
                                       cr, fplog);

    if (DOMAINDECOMP(cr))
    {
        /* When we share GPUs over ranks, we need to know this for the DLB */
        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
    }

    /* getting number of PP/PME threads
       PME: env variable should be read only on one node to make sure it is
       identical everywhere;
     */
    /* TODO nthreads_pp is only used for pinning threads.
     * This is a temporary solution until we have a hw topology library.
     */
    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
    nthreads_pme = gmx_omp_nthreads_get(emntPME);

    wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);

    if (PAR(cr))
    {
        /* Master synchronizes its value of reset_counters with all nodes
         * including PME only nodes */
        reset_counters = wcycle_get_reset_counters(wcycle);
        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
        wcycle_set_reset_counters(wcycle, reset_counters);
    }

    snew(nrnb, 1);
    if (cr->duty & DUTY_PP)
    {
        bcast_state(cr, state);

        /* Initiate forcerecord */
        fr          = mk_forcerec();
        fr->hwinfo  = hwinfo;
        fr->gpu_opt = &hw_opt->gpu_opt;
        init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box,
                      opt2fn("-table", nfile, fnm),
                      opt2fn("-tabletf", nfile, fnm),
                      opt2fn("-tablep", nfile, fnm),
                      opt2fn("-tableb", nfile, fnm),
                      nbpu_opt,
                      FALSE,
                      pforce);

        /* version for PCA_NOT_READ_NODE (see md.c) */
        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
           "nofile","nofile","nofile","nofile",FALSE,pforce);
         */

        /* Initialize QM-MM */
        if (fr->bQMMM)
        {
            init_QMMMrec(cr, mtop, inputrec, fr);
        }

        /* Initialize the mdatoms structure.
         * mdatoms is not filled with atom data,
         * as this can not be done now with domain decomposition.
         */
        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);

        /* Initialize the virtual site communication */
        vsite = init_vsite(mtop, cr, FALSE);

        calc_shifts(box, fr->shift_vec);

        /* With periodic molecules the charge groups should be whole at start up
         * and the virtual sites should not be far from their proper positions.
         */
        if (!inputrec->bContinuation && MASTER(cr) &&
            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
        {
            /* Make molecules whole at start of run */
            if (fr->ePBC != epbcNONE)
            {
                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
            }
            if (vsite)
            {
                /* Correct initial vsite positions are required
                 * for the initial distribution in the domain decomposition
                 * and for the initial shell prediction.
                 */
                construct_vsites_mtop(vsite, mtop, state->x);
            }
        }

        if (EEL_PME(fr->eeltype) || EVDW_PME(fr->vdwtype))
        {
            ewaldcoeff_q  = fr->ewaldcoeff_q;
            ewaldcoeff_lj = fr->ewaldcoeff_lj;
            pmedata       = &fr->pmedata;
        }
        else
        {
            pmedata = NULL;
        }
    }
    else
    {
        /* This is a PME only node */

        /* We don't need the state */
        done_state(state);

        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
        snew(pmedata, 1);
    }

    if (hw_opt->thread_affinity != threadaffOFF)
    {
        /* Before setting affinity, check whether the affinity has changed
         * - which indicates that probably the OpenMP library has changed it
         * since we first checked.
         */
        gmx_check_thread_affinity_set(fplog, cr,
                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);

        /* Set the CPU affinity */
        gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
    }

    /* Initiate PME if necessary,
     * either on all nodes or on dedicated PME nodes only. */
    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
    {
        if (mdatoms)
        {
            nChargePerturbed = mdatoms->nChargePerturbed;
            if (EVDW_PME(inputrec->vdwtype))
            {
                nTypePerturbed   = mdatoms->nTypePerturbed;
            }
        }
        if (cr->npmenodes > 0)
        {
            /* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/
            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
            gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
        }

        if (cr->duty & DUTY_PME)
        {
            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
                                  mtop ? mtop->natoms : 0, nChargePerturbed, nTypePerturbed,
                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
            if (status != 0)
            {
                gmx_fatal(FARGS, "Error %d initializing PME", status);
            }
        }
    }


    if (integrator[inputrec->eI].func == do_md)
    {
        /* Turn on signal handling on all nodes */
        /*
         * A user signal from the PME nodes (if any)
         * is communicated to the PP nodes.
         */
        signal_handler_install();
    }

    if (cr->duty & DUTY_PP)
    {
        /* Assumes uniform use of the number of OpenMP threads */
        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));

        if (inputrec->bPull)
        {
            /* Initialize pull code */
            inputrec->pull_work =
                init_pull(fplog, inputrec->pull, inputrec, nfile, fnm,
                          mtop, cr, oenv, inputrec->fepvals->init_lambda,
                          EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
        }

        if (inputrec->bRot)
        {
            /* Initialize enforced rotation code */
            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
                     bVerbose, Flags);
        }

        if (inputrec->eSwapCoords != eswapNO)
        {
            /* Initialize ion swapping code */
            init_swapcoords(fplog, bVerbose, inputrec, opt2fn_master("-swap", nfile, fnm, cr),
                            mtop, state->x, state->box, &state->swapstate, cr, oenv, Flags);
        }

        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);

        if (DOMAINDECOMP(cr))
        {
            GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
            dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);

            set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox);

            setup_dd_grid(fplog, cr->dd);
        }

        /* Now do whatever the user wants us to do (how flexible...) */
        integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
                                      oenv, bVerbose, bCompact,
                                      nstglobalcomm,
                                      vsite, constr,
                                      nstepout, inputrec, mtop,
                                      fcd, state,
                                      mdatoms, nrnb, wcycle, ed, fr,
                                      repl_ex_nst, repl_ex_nex, repl_ex_seed,
                                      membed,
                                      cpt_period, max_hours,
                                      imdport,
                                      Flags,
                                      walltime_accounting);

        if (inputrec->bPull)
        {
            finish_pull(inputrec->pull_work);
        }

        if (inputrec->bRot)
        {
            finish_rot(inputrec->rot);
        }

    }
    else
    {
        GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
        /* do PME only */
        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, ewaldcoeff_q, ewaldcoeff_lj, inputrec);
    }

    wallcycle_stop(wcycle, ewcRUN);

    /* Finish up, write some stuff
     * if rerunMD, don't write last frame again
     */
    finish_run(fplog, cr,
               inputrec, nrnb, wcycle, walltime_accounting,
               fr ? fr->nbv : NULL,
               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));


    /* Free GPU memory and context */
    free_gpu_resources(fr, cr, &hwinfo->gpu_info, fr ? fr->gpu_opt : NULL);

    if (opt2bSet("-membed", nfile, fnm))
    {
        sfree(membed);
    }

    gmx_hardware_info_free(hwinfo);

    /* Does what it says */
    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());
    walltime_accounting_destroy(walltime_accounting);

    /* PLUMED */
    if(plumedswitch){
      plumed_finalize(plumedmain);
    }
    /* END PLUMED */

    /* Close logfile already here if we were appending to it */
    if (MASTER(cr) && (Flags & MD_APPENDFILES))
    {
        gmx_log_close(fplog);
    }

    rc = (int)gmx_get_stop_condition();

    done_ed(&ed);

#ifdef GMX_THREAD_MPI
    /* we need to join all threads. The sub-threads join when they
       exit this function, but the master thread needs to be told to
       wait for that. */
    if (PAR(cr) && MASTER(cr))
    {
        tMPI_Finalize();
    }
#endif

    return rc;
}
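
One pattern worth noting in mdrunner() is that the master rank decides on GPU use and the decision is broadcast before any dependent setup. A minimal standalone MPI sketch of that decide-then-broadcast pattern (illustrative only; gmx_bcast_sim broadcasts within the simulation communicator, while this sketch uses MPI_COMM_WORLD for simplicity):

#include <cstdio>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Only rank 0 inspects the "hardware" and makes the decision; the result
     * is then broadcast so that every rank takes the same code path. */
    int useGpu = 0;
    if (rank == 0)
    {
        useGpu = 1;  /* pretend a compatible GPU was detected */
    }
    MPI_Bcast(&useGpu, 1, MPI_INT, 0, MPI_COMM_WORLD);

    std::printf("rank %d: useGpu = %d\n", rank, useGpu);

    MPI_Finalize();
    return 0;
}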
Example #20
/* Set CPU affinity. Can be important for performance.
   On some systems (e.g. Cray) CPU Affinity is set by default.
   But the default assignment does not work (well) when only some ranks
   have threads, which causes very low performance.
   External tools have cumbersome syntax for setting affinity
   in the case that only some ranks have threads.
   Thus it is important that GROMACS sets the affinity internally
   if only PME is using threads.
 */
void
gmx_set_thread_affinity(FILE                *fplog,
                        const t_commrec     *cr,
                        const gmx_hw_opt_t  *hw_opt,
                        const gmx_hw_info_t *hwinfo)
{
    int        nth_affinity_set, thread0_id_node,
               nthread_local, nthread_node;
    int        offset;
    int *      localityOrder = nullptr;
    int        rc;

    if (hw_opt->thread_affinity == threadaffOFF)
    {
        /* Nothing to do */
        return;
    }

    /* If the tMPI thread affinity setting is not supported, encourage the user
     * to report it as it's either a bug or an exotic platform which we might
     * want to support. */
    if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
    {
        /* we know Mac OS & BlueGene do not support setting thread affinity, so there's
           no point in warning the user in that case. In any other case
           the user might be able to do something about it. */
#if !defined(__APPLE__) && !defined(__bg__)
        md_print_warn(cr, fplog,
                      "NOTE: Cannot set thread affinities on the current platform.\n");
#endif  /* !__APPLE__ && !__bg__ */
        return;
    }

    /* threads on this MPI process or TMPI thread */
    if (cr->duty & DUTY_PP)
    {
        nthread_local = gmx_omp_nthreads_get(emntNonbonded);
    }
    else
    {
        nthread_local = gmx_omp_nthreads_get(emntPME);
    }

    /* map the current process to cores */
    thread0_id_node = 0;
    nthread_node    = nthread_local;
#if GMX_MPI
    if (PAR(cr) || MULTISIM(cr))
    {
        /* We need to determine a scan of the thread counts in this
         * compute node.
         */
        MPI_Comm comm_intra;

        MPI_Comm_split(MPI_COMM_WORLD,
                       gmx_physicalnode_id_hash(), cr->rank_intranode,
                       &comm_intra);
        MPI_Scan(&nthread_local, &thread0_id_node, 1, MPI_INT, MPI_SUM, comm_intra);
        /* MPI_Scan is inclusive, but here we need exclusive */
        thread0_id_node -= nthread_local;
        /* Get the total number of threads on this physical node */
        MPI_Allreduce(&nthread_local, &nthread_node, 1, MPI_INT, MPI_SUM, comm_intra);
        MPI_Comm_free(&comm_intra);
    }
#endif

    if (hw_opt->thread_affinity == threadaffAUTO &&
        nthread_node != hwinfo->nthreads_hw_avail)
    {
        if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail)
        {
            md_print_warn(cr, fplog,
                          "NOTE: The number of threads is not equal to the number of (logical) cores\n"
                          "      and the -pin option is set to auto: will not pin thread to cores.\n"
                          "      This can lead to significant performance degradation.\n"
                          "      Consider using -pin on (and -pinoffset in case you run multiple jobs).\n");
        }

        return;
    }

    offset = 0;
    if (hw_opt->core_pinning_offset != 0)
    {
        offset = hw_opt->core_pinning_offset;
        md_print_info(cr, fplog, "Applying core pinning offset %d\n", offset);
    }

    int core_pinning_stride = hw_opt->core_pinning_stride;
    rc = get_thread_affinity_layout(fplog, cr, hwinfo,
                                    nthread_node,
                                    offset, &core_pinning_stride,
                                    &localityOrder);
    gmx::scoped_guard_sfree localityOrderGuard(localityOrder);

    if (rc != 0)
    {
        /* Incompatible layout, don't pin, warning was already issued */
        return;
    }

    /* Set the per-thread affinity. In order to be able to check the success
     * of affinity settings, we will set nth_affinity_set to 1 on threads
     * where the affinity setting succeeded and to 0 where it failed.
     * Reducing these 0/1 values over the threads will give the total number
     * of threads on which we succeeded.
     */

    // To avoid warnings from the static analyzer we initialize nth_affinity_set
    // to zero outside the OpenMP block, and then add to it inside the block.
    // The value will still always be 0 or 1 from each thread.
    nth_affinity_set = 0;
#pragma omp parallel num_threads(nthread_local) reduction(+:nth_affinity_set)
    {
        try
        {
            int      thread_id, thread_id_node;
            int      index, core;
            gmx_bool setaffinity_ret;

            thread_id      = gmx_omp_get_thread_num();
            thread_id_node = thread0_id_node + thread_id;
            index          = offset + thread_id_node*core_pinning_stride;
            if (localityOrder != nullptr)
            {
                core = localityOrder[index];
            }
            else
            {
                core = index;
            }

            setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);

            /* store the per-thread success-values of the setaffinity */
            nth_affinity_set += (setaffinity_ret == 0);

            if (debug)
            {
                fprintf(debug, "On rank %2d, thread %2d, index %2d, core %2d the affinity setting returned %d\n",
                        cr->nodeid, gmx_omp_get_thread_num(), index, core, setaffinity_ret);
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (nth_affinity_set > nthread_local)
    {
        char msg[STRLEN];

        sprintf(msg, "Looks like we have set affinity for more threads than "
                "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
        gmx_incons(msg);
    }
    else
    {
        /* check & warn if some threads failed to set their affinities */
        const bool allAffinitiesSet = (nth_affinity_set == nthread_local);
        if (!allAffinitiesSet)
        {
            char sbuf1[STRLEN], sbuf2[STRLEN];

            /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
            sbuf1[0] = sbuf2[0] = '\0';
            /* Only add rank info if we have more than one rank. */
            if (cr->nnodes > 1)
            {
#if GMX_MPI
#if GMX_THREAD_MPI
                sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid);
#else           /* GMX_LIB_MPI */
                sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
#endif
#endif          /* GMX_MPI */
            }

            if (nthread_local > 1)
            {
                sprintf(sbuf2, "for %d/%d thread%s ",
                        nthread_local - nth_affinity_set, nthread_local,
                        nthread_local > 1 ? "s" : "");
            }

            fprintf(stderr, "NOTE: %sAffinity setting %sfailed.\n", sbuf1, sbuf2);
        }
        if (invalidWithinSimulation(cr, !allAffinitiesSet))
        {
            md_print_warn(cr, fplog,
                          "NOTE: Thread affinity setting failed. This can cause performance degradation.\n"
                          "      If you think your settings are correct, ask on the gmx-users list.\n");
        }
    }
}
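
The per-thread success counting at the end of gmx_set_thread_affinity() relies on an OpenMP reduction over 0/1 values. A minimal standalone sketch of that pattern (try_pin_thread is a hypothetical stand-in for the real affinity call):

#include <cstdio>
#include <omp.h>

/* Stand-in for a per-thread operation that returns 0 on success,
 * like the affinity call above; illustrative only. */
static int try_pin_thread(int core)
{
    return (core >= 0) ? 0 : 1;
}

int main()
{
    int nthreads   = 4;
    int nSucceeded = 0;

#pragma omp parallel num_threads(nthreads) reduction(+:nSucceeded)
    {
        int core    = omp_get_thread_num();  /* illustrative core index */
        nSucceeded += (try_pin_thread(core) == 0);
    }

    if (nSucceeded == nthreads)
    {
        std::printf("All %d threads pinned\n", nthreads);
    }
    else
    {
        std::printf("Pinning failed for %d of %d threads\n",
                    nthreads - nSucceeded, nthreads);
    }
    return 0;
}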
Example #21
void wallcycle_print(FILE *fplog, int nnodes, int npme,
                     int nth_pp, int nth_pme, double realtime,
                     gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum,
                     struct gmx_wallclock_gpu_t *gpu_t)
{
    double      tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k;
    double      c2t, c2t_pp, c2t_pme = 0;
    int         i, j, npp, nth_tot;
    char        buf[STRLEN];
    const char *hline = "-----------------------------------------------------------------------------";

    if (wc == NULL)
    {
        return;
    }

    GMX_ASSERT(nth_pp > 0, "Number of particle-particle threads must be >0");
    GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
    GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
    GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
    npp     = nnodes - npme;
    /* npme is the number of PME-only ranks used, and we always do PP work */
    GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");

    nth_tot = npp*nth_pp + npme*nth_pme;

    /* When using PME-only nodes, the next line is valid for both
       PP-only and PME-only nodes because they started ewcRUN at the
       same time. */
    tot        = cyc_sum[ewcRUN];
    tot_for_pp = 0;

    if (tot <= 0.0)
    {
        /* TODO This is heavy handed, but until someone reworks the
           code so that it is provably robust with respect to
           non-positive values for all possible timer and cycle
           counters, there is less value gained from printing whatever
           timing data might still be sensible for some non-Jenkins
           run, than is lost from diagnosing Jenkins FP exceptions on
           runs about whose execution time we don't care. */
        md_print_warn(NULL, fplog, "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a time accounting\n", tot);
        return;
    }

    if (wc->haveInvalidCount)
    {
        md_print_warn(NULL, fplog, "%s\n",
                      "NOTE: Detected invalid cycle counts, probably because threads moved between CPU cores that do not have synchronized cycle counters. Will not print the cycle accounting.");
        return;
    }


    /* Conversion factor from cycles to seconds */
    c2t     = realtime/tot;
    c2t_pp  = c2t * nth_tot / static_cast<double>(npp*nth_pp);
    if (npme > 0)
    {
        c2t_pme = c2t * nth_tot / static_cast<double>(npme*nth_pme);
    }
    else
    {
        c2t_pme = 0;
    }

    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");

    print_header(fplog, npp, nth_pp, npme, nth_pme);

    fprintf(fplog, "%s\n", hline);
    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
    {
        if (is_pme_subcounter(i))
        {
            /* Do not count these at all */
        }
        else if (npme > 0 && is_pme_counter(i))
        {
            /* Print timing information for PME-only nodes, but add an
             * asterisk so the reader of the table can know that the
             * walltimes are not meant to add up. The asterisk still
             * fits in the required maximum of 19 characters. */
            char buffer[STRLEN];
            snprintf(buffer, STRLEN, "%s *", wcn[i]);
            print_cycles(fplog, c2t_pme, buffer,
                         npme, nth_pme,
                         wc->wcc[i].n, cyc_sum[i], tot);
        }
        else
        {
            /* Print timing information when it is for a PP or PP+PME
               node */
            print_cycles(fplog, c2t_pp, wcn[i],
                         npp, nth_pp,
                         wc->wcc[i].n, cyc_sum[i], tot);
            tot_for_pp += cyc_sum[i];
        }
    }
    if (wc->wcc_all != NULL)
    {
        for (i = 0; i < ewcNR; i++)
        {
            for (j = 0; j < ewcNR; j++)
            {
                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
                print_cycles(fplog, c2t_pp, buf,
                             npp, nth_pp,
                             wc->wcc_all[i*ewcNR+j].n,
                             wc->wcc_all[i*ewcNR+j].c,
                             tot);
            }
        }
    }
    tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
    print_cycles(fplog, c2t_pp, "Rest",
                 npp, nth_pp,
                 -1, tot_for_rest - tot_for_pp, tot);
    fprintf(fplog, "%s\n", hline);
    print_cycles(fplog, c2t, "Total",
                 npp, nth_pp,
                 -1, tot, tot);
    fprintf(fplog, "%s\n", hline);

    if (npme > 0)
    {
        fprintf(fplog,
                "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
                "    twice the total reported, but the cycle count total and %% are correct.\n"
                "%s\n", hline);
    }

    if (wc->wcc[ewcPMEMESH].n > 0)
    {
        fprintf(fplog, " Breakdown of PME mesh computation\n");
        fprintf(fplog, "%s\n", hline);
        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
        {
            if (is_pme_subcounter(i))
            {
                print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i],
                             npme > 0 ? npme : npp, nth_pme,
                             wc->wcc[i].n, cyc_sum[i], tot);
            }
        }
        fprintf(fplog, "%s\n", hline);
    }

    if (useCycleSubcounters && wc->wcsc)
    {
        fprintf(fplog, " Breakdown of PP computation\n");
        fprintf(fplog, "%s\n", hline);
        for (i = 0; i < ewcsNR; i++)
        {
            print_cycles(fplog, c2t_pp, wcsn[i],
                         npp, nth_pp,
                         wc->wcsc[i].n, cyc_sum[ewcNR+i], tot);
        }
        fprintf(fplog, "%s\n", hline);
    }

    /* print GPU timing summary */
    if (gpu_t)
    {
        const char *k_log_str[2][2] = {
            {"Nonbonded F kernel", "Nonbonded F+ene k."},
            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
        };

        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;

        /* add up the kernel timings */
        tot_k = 0.0;
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                tot_k += gpu_t->ktime[i][j].t;
            }
        }
        tot_gpu += tot_k;

        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
        }
        tot_cpu_overlap *= realtime*1000/tot; /* convert cycles to ms */

        fprintf(fplog, "\n GPU timings\n%s\n", hline);
        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Pair list H2D",
                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
        print_gputimes(fplog, "X / q H2D",
                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);

        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (gpu_t->ktime[i][j].c)
                {
                    print_gputimes(fplog, k_log_str[i][j],
                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
                }
            }
        }

        print_gputimes(fplog, "F D2H",  gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
        fprintf(fplog, "%s\n", hline);
        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
        fprintf(fplog, "%s\n", hline);

        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
        if (gpu_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
        {
            fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
                    tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
                    gpu_cpu_ratio);
        }

        /* only print notes related to CPU-GPU load balance with PME */
        if (wc->wcc[ewcPMEMESH].n > 0)
        {
            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");

            /* Print a note if the imbalance is high in the PME case, in which
             * CPU-GPU load balancing is possible */
            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
            {
                /* Only the sim master calls this function, so always print to stderr */
                if (gpu_cpu_ratio < 0.75)
                {
                    if (npp > 1)
                    {
                        /* The user could have used -notunepme,
                         * but we currently can't check that here.
                         */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
                    }
                    else
                    {
                        /* We should not end up here, unless the box is
                         * too small for increasing the cut-off for PME tuning.
                         */
                        md_print_warn(NULL, fplog,
                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
                                      "      performance loss.");
                    }
                }
                if (gpu_cpu_ratio > 1.2)
                {
                    md_print_warn(NULL, fplog,
                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
                }
            }
        }
    }

    if (wc->wc_barrier)
    {
        md_print_warn(NULL, fplog,
                      "MPI_Barrier was called before each cycle start/stop\n"
                      "call, so timings are not those of real runs.\n");
    }

    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
        (cyc_sum[ewcDOMDEC] > tot*0.1 ||
         cyc_sum[ewcNS] > tot*0.1))
    {
        /* Only the sim master calls this function, so always print to stderr */
        if (wc->wcc[ewcDOMDEC].n == 0)
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
        else
        {
            md_print_warn(NULL, fplog,
                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
                          "      %d %% of the run time was spent in pair search,\n"
                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
                          (int)(100*cyc_sum[ewcDOMDEC]/tot+0.5),
                          (int)(100*cyc_sum[ewcNS]/tot+0.5));
        }
    }

    if (cyc_sum[ewcMoveE] > tot*0.05)
    {
        /* Only the sim master calls this function, so always print to stderr */
        md_print_warn(NULL, fplog,
                      "NOTE: %d %% of the run time was spent communicating energies,\n"
                      "      you might want to use the -gcom option of mdrun\n",
                      (int)(100*cyc_sum[ewcMoveE]/tot+0.5));
    }
}
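/* Editor's note: a minimal standalone sketch (hypothetical names, not part of
 * GROMACS) of the cycles-to-seconds conversion used in wallcycle_print above.
 * c2t = realtime/tot turns one summed cycle into seconds, and the per-group
 * factors rescale by that group's share of the total thread count. */
#include <cstdio>

static double cyclesToSeconds(double totCycles, double realtime,
                              int nthTotal, int nRanks, int nthPerRank)
{
    double c2t = realtime / totCycles;
    /* Only nRanks*nthPerRank of the nthTotal threads contribute to this
     * counter group, so scale accordingly. */
    return c2t * nthTotal / static_cast<double>(nRanks * nthPerRank);
}

int main()
{
    /* Example: a 10 s run with 1e9 total cycles on 16 threads, of which
     * 12 belong to 3 PP ranks with 4 threads each. */
    double c2t_pp = cyclesToSeconds(1e9, 10.0, 16, 3, 4);
    std::printf("1e8 PP cycles correspond to about %.3f s\n", 1e8*c2t_pp);
    return 0;
}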
void pme_loadbal_do(pme_load_balancing_t *pme_lb,
                    t_commrec            *cr,
                    FILE                 *fp_err,
                    FILE                 *fp_log,
                    t_inputrec           *ir,
                    t_forcerec           *fr,
                    t_state              *state,
                    gmx_wallcycle_t       wcycle,
                    gmx_int64_t           step,
                    gmx_int64_t           step_rel,
                    gmx_bool             *bPrinting)
{
    int    n_prev;
    double cycles_prev;

    assert(pme_lb != NULL);

    if (!pme_lb->bActive)
    {
        return;
    }

    n_prev      = pme_lb->cycles_n;
    cycles_prev = pme_lb->cycles_c;
    wallcycle_get(wcycle, ewcSTEP, &pme_lb->cycles_n, &pme_lb->cycles_c);
    if (pme_lb->cycles_n == 0)
    {
        /* Before the first step we haven't done any steps yet */
        return;
    }
    /* Sanity check, we expect nstlist cycle counts */
    if (pme_lb->cycles_n - n_prev != ir->nstlist)
    {
        /* We could return here, but it's safer to issue an error and quit */
        gmx_incons("pme_loadbal_do called at an interval != nstlist");
    }

    /* PME grid + cut-off optimization with GPUs or PME ranks */
    if (!pme_lb->bBalance && pme_lb->bSepPMERanks)
    {
        if (pme_lb->bTriggerOnDLB)
        {
            pme_lb->bBalance = dd_dlb_is_on(cr->dd);
        }
        /* We should ignore the first timing to avoid timing allocation
         * overhead. And since the PME load balancing is called just
         * before DD repartitioning, the ratio returned by dd_pme_f_ratio
         * is not over the last nstlist steps, but the nstlist steps before
         * that. So the first useful ratio is available at step_rel=3*nstlist.
         */
        else if (step_rel >= 3*ir->nstlist)
        {
            if (DDMASTER(cr->dd))
            {
                /* If PME rank load is too high, start tuning */
                pme_lb->bBalance =
                    (dd_pme_f_ratio(cr->dd) >= loadBalanceTriggerFactor);
            }
            dd_bcast(cr->dd, sizeof(gmx_bool), &pme_lb->bBalance);
        }

        pme_lb->bActive = (pme_lb->bBalance ||
                           step_rel <= pme_lb->step_rel_stop);
    }

    /* The location in the code of this balancing termination is strange.
     * You would expect to have it after the call to pme_load_balance()
     * below, since there pme_lb->stage is updated.
     * But when terminating directly after deciding on and selecting the
     * optimal setup, DLB will turn on right away if it was locked before.
     * This might be due to PME reinitialization. So we check stage here
     * to allow for another nstlist steps with DLB locked to stabilize
     * the performance.
     */
    if (pme_lb->bBalance && pme_lb->stage == pme_lb->nstage)
    {
        pme_lb->bBalance = FALSE;

        if (DOMAINDECOMP(cr) && dd_dlb_is_locked(cr->dd))
        {
            /* Unlock the DLB=auto, DLB is allowed to activate */
            dd_dlb_unlock(cr->dd);
            md_print_warn(cr, fp_log, "NOTE: DLB can now turn on, when beneficial\n");

            /* We don't deactivate the tuning yet, since we will balance again
             * after DLB gets turned on, if it does within PMETunePeriod*nstlist steps.
             */
            continue_pme_loadbal(pme_lb, TRUE);
            pme_lb->bTriggerOnDLB = TRUE;
            pme_lb->step_rel_stop = step_rel + PMETunePeriod*ir->nstlist;
        }
        else
        {
            /* We're completely done with PME tuning */
            pme_lb->bActive = FALSE;
        }

        if (DOMAINDECOMP(cr))
        {
            /* Set the cut-off limit to the final selected cut-off,
             * so we don't have artificial DLB limits.
             * This also ensures that we won't disable the currently
             * optimal setting during a second round of PME balancing.
             */
            set_dd_dlb_max_cutoff(cr, fr->ic->rlistlong);
        }
    }

    if (pme_lb->bBalance)
    {
        /* We might not have collected nstlist steps in cycles yet,
         * since init_step might not be a multiple of nstlist,
         * but the first data collected is skipped anyhow.
         */
        pme_load_balance(pme_lb, cr,
                         fp_err, fp_log,
                         ir, state, pme_lb->cycles_c - cycles_prev,
                         fr->ic, fr->nbv, &fr->pmedata,
                         step);

        /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
        fr->ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
        fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
        fr->rlist         = fr->ic->rlist;
        fr->rlistlong     = fr->ic->rlistlong;
        fr->rcoulomb      = fr->ic->rcoulomb;
        fr->rvdw          = fr->ic->rvdw;

        if (ir->eDispCorr != edispcNO)
        {
            calc_enervirdiff(NULL, ir->eDispCorr, fr);
        }
    }

    if (!pme_lb->bBalance &&
        (!pme_lb->bSepPMERanks || step_rel > pme_lb->step_rel_stop))
    {
        /* We have just deactivated the balancing and we're not measuring PP/PME
         * imbalance during the first steps of the run: deactivate the tuning.
         */
        pme_lb->bActive = FALSE;
    }

    if (!(pme_lb->bActive) && DOMAINDECOMP(cr) && dd_dlb_is_locked(cr->dd))
    {
        /* Make sure DLB is allowed when we deactivate PME tuning */
        dd_dlb_unlock(cr->dd);
        md_print_warn(cr, fp_log, "NOTE: DLB can now turn on, when beneficial\n");
    }

    *bPrinting = pme_lb->bBalance;
}
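/* Editor's note: a hedged sketch (hypothetical helper, not GROMACS source) of
 * the trigger condition used in pme_loadbal_do above for starting PME load
 * balancing with separate PME ranks: either DLB just turned on, or from
 * step_rel = 3*nstlist onward the measured PME force-time ratio exceeds the
 * trigger factor.  Earlier ratios are not useful, because the value reported
 * just before DD repartitioning covers the nstlist steps before the last. */
#include <cstdint>

static bool shouldStartPmeBalancing(bool triggerOnDlb, bool dlbIsOn,
                                    std::int64_t stepRel, int nstlist,
                                    double pmeForceRatio, double triggerFactor)
{
    if (triggerOnDlb)
    {
        /* Balancing was deferred until DLB activates. */
        return dlbIsOn;
    }
    if (stepRel >= 3*static_cast<std::int64_t>(nstlist))
    {
        /* triggerFactor plays the role of loadBalanceTriggerFactor above. */
        return pmeForceRatio >= triggerFactor;
    }
    return false;
}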
Example #23
0
int check_nstglobalcomm(FILE *fplog, t_commrec *cr,
                        int nstglobalcomm, t_inputrec *ir)
{
    if (!EI_DYNAMICS(ir->eI))
    {
        nstglobalcomm = 1;
    }

    if (nstglobalcomm == -1)
    {
        if (!(ir->nstcalcenergy > 0 ||
              ir->nstlist > 0 ||
              ir->etc != etcNO ||
              ir->epc != epcNO))
        {
            nstglobalcomm = 10;
            if (ir->nstenergy > 0 && ir->nstenergy < nstglobalcomm)
            {
                nstglobalcomm = ir->nstenergy;
            }
        }
        else
        {
            /* Ensure that we do timely global communication for
             * (possibly) each of the four following options.
             */
            nstglobalcomm = lcd4(ir->nstcalcenergy,
                                 ir->nstlist,
                                 ir->etc != etcNO ? ir->nsttcouple : 0,
                                 ir->epc != epcNO ? ir->nstpcouple : 0);
        }
    }
    else
    {
        if (ir->nstlist > 0 &&
            nstglobalcomm > ir->nstlist && nstglobalcomm % ir->nstlist != 0)
        {
            nstglobalcomm = (nstglobalcomm / ir->nstlist)*ir->nstlist;
            md_print_warn(cr, fplog, "WARNING: nstglobalcomm is larger than nstlist, but not a multiple, setting it to %d\n", nstglobalcomm);
        }
        if (ir->nstcalcenergy > 0)
        {
            check_nst_param(fplog, cr, "-gcom", nstglobalcomm,
                            "nstcalcenergy", &ir->nstcalcenergy);
        }
        if (ir->etc != etcNO && ir->nsttcouple > 0)
        {
            check_nst_param(fplog, cr, "-gcom", nstglobalcomm,
                            "nsttcouple", &ir->nsttcouple);
        }
        if (ir->epc != epcNO && ir->nstpcouple > 0)
        {
            check_nst_param(fplog, cr, "-gcom", nstglobalcomm,
                            "nstpcouple", &ir->nstpcouple);
        }

        check_nst_param(fplog, cr, "-gcom", nstglobalcomm,
                        "nstenergy", &ir->nstenergy);

        check_nst_param(fplog, cr, "-gcom", nstglobalcomm,
                        "nstlog", &ir->nstlog);
    }

    if (ir->comm_mode != ecmNO && ir->nstcomm < nstglobalcomm)
    {
        md_print_warn(cr, fplog, "WARNING: Changing nstcomm from %d to %d\n",
                      ir->nstcomm, nstglobalcomm);
        ir->nstcomm = nstglobalcomm;
    }

    return nstglobalcomm;
}
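/* Editor's note: check_nstglobalcomm above relies on lcd4() to pick a global
 * communication interval that divides every active interval (nstcalcenergy,
 * nstlist, nsttcouple, nstpcouple), ignoring disabled (zero) ones.  A
 * plausible standalone reimplementation of that idea, not the GROMACS lcd4
 * itself, is sketched below (requires C++17 for std::gcd). */
#include <numeric>

static int lcd4_sketch(int i1, int i2, int i3, int i4)
{
    int result = 0;
    for (int i : {i1, i2, i3, i4})
    {
        if (i > 0)
        {
            result = (result == 0) ? i : std::gcd(result, i);
        }
    }
    return result; /* 0 if all intervals are disabled */
}

/* Example: nstcalcenergy=100, nstlist=10, nsttcouple=10, pressure coupling
 * off -> lcd4_sketch(100, 10, 10, 0) == 10, so communicating every 10 steps
 * is timely for all three active options. */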
Example #24
0
/* Check the process affinity mask and, if it is found to be non-default,
 * honor it and disable mdrun's internal affinity setting.
 * Note that this will only work on Linux as we use a GNU feature.
 */
void
gmx_check_thread_affinity_set(FILE            *fplog,
                              const t_commrec *cr,
                              gmx_hw_opt_t    *hw_opt,
                              int  gmx_unused  nthreads_hw_avail,
                              gmx_bool         bAfterOpenmpInit)
{
#ifdef HAVE_SCHED_AFFINITY
    cpu_set_t mask_current;
    int       i, ret, cpu_count, cpu_set;
    gmx_bool  bAllSet;
#endif
#ifdef GMX_LIB_MPI
    gmx_bool  bAllSet_All;
#endif

    assert(hw_opt);
    if (!bAfterOpenmpInit)
    {
        /* Check for externally set OpenMP affinity and turn off internal
         * pinning if any is found. We need to do this check early to tell
         * thread-MPI whether it should do pinning when spawning threads.
         * TODO: the above no longer holds, we should move these checks later
         */
        if (hw_opt->thread_affinity != threadaffOFF)
        {
            char *message;
            if (!gmx_omp_check_thread_affinity(&message))
            {
                /* TODO: with -pin auto we should only warn when using all cores */
                md_print_warn(cr, fplog, "%s", message);
                sfree(message);
                hw_opt->thread_affinity = threadaffOFF;
            }
        }

        /* With thread-MPI this is needed as pinning might get turned off,
         * which needs to be known before starting thread-MPI.
         * With thread-MPI hw_opt is processed here on the master rank
         * and passed to the other ranks later, so we only do this on master.
         */
        if (!SIMMASTER(cr))
        {
            return;
        }
#ifndef GMX_THREAD_MPI
        return;
#endif
    }

#ifdef HAVE_SCHED_AFFINITY
    if (hw_opt->thread_affinity == threadaffOFF)
    {
        /* internal affinity setting is off, don't bother checking process affinity */
        return;
    }

    CPU_ZERO(&mask_current);
    if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
    {
        /* failed to query affinity mask, will just return */
        if (debug)
        {
            fprintf(debug, "Failed to query affinity mask (error %d)", ret);
        }
        return;
    }

    /* Before proceeding with the actual check, make sure that the number of
     * detected CPUs is >= the CPUs in the current set.
     * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
#ifdef CPU_COUNT
    if (nthreads_hw_avail < CPU_COUNT(&mask_current))
    {
        if (debug)
        {
            fprintf(debug, "%d hardware threads detected, but %d was returned by CPU_COUNT",
                    nthreads_hw_avail, CPU_COUNT(&mask_current));
        }
        return;
    }
#endif /* CPU_COUNT */

    bAllSet = TRUE;
    for (i = 0; (i < nthreads_hw_avail && i < CPU_SETSIZE); i++)
    {
        bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
    }

#ifdef GMX_LIB_MPI
    MPI_Allreduce(&bAllSet, &bAllSet_All, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);
    bAllSet = bAllSet_All;
#endif

    if (!bAllSet)
    {
        if (hw_opt->thread_affinity == threadaffAUTO)
        {
            if (!bAfterOpenmpInit)
            {
                md_print_warn(cr, fplog,
                              "Non-default thread affinity set, disabling internal thread affinity");
            }
            else
            {
                md_print_warn(cr, fplog,
                              "Non-default thread affinity set probably by the OpenMP library,\n"
                              "disabling internal thread affinity");
            }
            hw_opt->thread_affinity = threadaffOFF;
        }
        else
        {
            /* Only warn once, at the last check (bAfterOpenmpInit==TRUE) */
            if (bAfterOpenmpInit)
            {
                md_print_warn(cr, fplog,
                              "Overriding thread affinity set outside %s\n",
                              ShortProgram());
            }
        }

        if (debug)
        {
            fprintf(debug, "Non-default affinity mask found\n");
        }
    }
    else
    {
        if (debug)
        {
            fprintf(debug, "Default affinity mask found\n");
        }
    }
#endif /* HAVE_SCHED_AFFINITY */
}
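/* Editor's note: a minimal Linux/glibc-only sketch (not GROMACS source) of
 * the affinity-mask query performed above: zero a cpu_set_t, ask the kernel
 * for the current mask with sched_getaffinity(), and report whether every
 * logical CPU up to nthreads_hw is included (the "default mask" case). */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sched.h>

static int allCpusInAffinityMask(int nthreads_hw)
{
    cpu_set_t mask;

    CPU_ZERO(&mask);
    if (sched_getaffinity(0, sizeof(cpu_set_t), &mask) != 0)
    {
        return -1; /* query failed, the caller should skip the check */
    }
    int allSet = 1;
    for (int i = 0; i < nthreads_hw && i < CPU_SETSIZE; i++)
    {
        allSet = allSet && CPU_ISSET(i, &mask);
    }
    return allSet;
}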
/* Get the number of MPI ranks to use for thread-MPI based on how many
 * were requested, which algorithms we're using,
 * and how many particles there are.
 * At this point we have already called check_and_update_hw_opt.
 * Thus all options should be internally consistent and consistent
 * with the hardware, except that ntmpi could be larger than #GPU.
 */
int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
                     gmx_hw_opt_t        *hw_opt,
                     const t_inputrec    *inputrec,
                     const gmx_mtop_t    *mtop,
                     const t_commrec     *cr,
                     FILE                *fplog)
{
    int      nthreads_hw, nthreads_tot_max, nrank, ngpu;
    int      min_atoms_per_mpi_rank;

    /* Check if an algorithm does not support parallel simulation.  */
    if (inputrec->eI == eiLBFGS ||
        inputrec->coulombtype == eelEWALD)
    {
        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI rank.\n");
        if (hw_opt->nthreads_tmpi > 1)
        {
            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI rank, but an algorithm doesn't support that");
        }

        return 1;
    }

    if (hw_opt->nthreads_tmpi > 0)
    {
        /* Trivial, return right away */
        return hw_opt->nthreads_tmpi;
    }

    nthreads_hw = hwinfo->nthreads_hw_avail;

    if (nthreads_hw <= 0)
    {
        /* This should normally not happen, but if it does, we handle it */
        gmx_fatal(FARGS, "The number of available hardware threads can not be detected, please specify the number of MPI ranks and the number of OpenMP threads (if supported) manually with options -ntmpi and -ntomp, respectively");
    }

    /* How many total (#tMPI*#OpenMP) threads can we start? */
    if (hw_opt->nthreads_tot > 0)
    {
        nthreads_tot_max = hw_opt->nthreads_tot;
    }
    else
    {
        nthreads_tot_max = nthreads_hw;
    }

    ngpu = getMaxGpuUsable(fplog, cr, hwinfo, inputrec->cutoff_scheme);

    if (inputrec->cutoff_scheme == ecutsGROUP)
    {
        /* We checked this before, but it doesn't hurt to do it once more */
        GMX_RELEASE_ASSERT(hw_opt->nthreads_omp == 1, "The group scheme only supports one OpenMP thread per rank");
    }

    nrank =
        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);

    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
    {
        /* Dims/steps are divided over the nodes instead of splitting the atoms.
         * With NM we can't have more ranks than #atoms*#dim. With TPI it's
         * unlikely we have fewer atoms than ranks; if we did, communication
         * would become a bottleneck, so we set the limit to 1 atom/rank.
         */
        min_atoms_per_mpi_rank = 1;
    }
    else
    {
        if (ngpu >= 1)
        {
            min_atoms_per_mpi_rank = min_atoms_per_gpu;
        }
        else
        {
            min_atoms_per_mpi_rank = min_atoms_per_mpi_thread;
        }
    }

    if (mtop->natoms/nrank < min_atoms_per_mpi_rank)
    {
        int nrank_new;

        /* the rank number was chosen automatically, but there are too few
           atoms per rank, so we need to reduce the rank count */
        nrank_new = std::max(1, mtop->natoms/min_atoms_per_mpi_rank);

        /* Avoid partial use of Hyper-Threading */
        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
            nrank_new > nthreads_hw/2 && nrank_new < nthreads_hw)
        {
            nrank_new = nthreads_hw/2;
        }

        /* If the user specified the total thread count, ensure this is
         * divisible by the number of ranks.
         * It is quite likely that we have too many total threads compared
         * to the size of the system, but if the user asked for this many
         * threads we should respect that.
         */
        while (hw_opt->nthreads_tot > 0 &&
               hw_opt->nthreads_tot % nrank_new != 0)
        {
            nrank_new--;
        }

        /* Avoid large prime numbers in the rank count */
        if (nrank_new >= 6)
        {
            /* Use only 6,8,10 with additional factors of 2 */
            int fac;

            fac = 2;
            while (3*fac*2 <= nrank_new)
            {
                fac *= 2;
            }

            nrank_new = (nrank_new/fac)*fac;
        }
        else
        {
            /* Avoid 5, since a small system won't fit 5 domains along
             * a dimension. This might lead to wasting some cores, but this
             * will have a small impact in this regime of very small systems.
             */
            if (nrank_new == 5)
            {
                nrank_new = 4;
            }
        }

        nrank = nrank_new;

        /* We reduced the number of tMPI ranks, which means we might violate
         * our own efficiency checks if we simply use all hardware threads.
         */
        if (bHasOmpSupport && hw_opt->nthreads_omp <= 0 && hw_opt->nthreads_tot <= 0)
        {
            /* The user set neither the total nor the OpenMP thread count,
             * we should use all hardware threads, unless we will violate
             * our own efficiency limitation on the thread count.
             */
            int  nt_omp_max;

            nt_omp_max = nthreads_omp_efficient_max(nrank, hwinfo->cpuid_info, ngpu >= 1);

            if (nrank*nt_omp_max < hwinfo->nthreads_hw_avail)
            {
                /* Limit the number of OpenMP threads to start */
                hw_opt->nthreads_omp = nt_omp_max;
            }
        }

        fprintf(stderr, "\n");
        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
        fprintf(stderr, "      only starting %d thread-MPI ranks.\n", nrank);
        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
    }

    return nrank;
}
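/* Editor's note: a standalone sketch (hypothetical helper, not GROMACS
 * source) of the rank-count rounding used in get_nthreads_mpi above.  For
 * nrank >= 6 only multiples of the largest power of two fac with
 * 3*fac*2 <= nrank are kept, giving counts like 6, 8, 10, 12, 16, 20, ...;
 * below 6 only 5 is avoided. */
static int roundDownRankCount(int nrank)
{
    if (nrank >= 6)
    {
        int fac = 2;
        while (3*fac*2 <= nrank)
        {
            fac *= 2;
        }
        return (nrank/fac)*fac;
    }
    return (nrank == 5) ? 4 : nrank;
}

/* Examples: roundDownRankCount(7)  == 6   (fac stays 2)
 *           roundDownRankCount(14) == 12  (fac becomes 4)
 *           roundDownRankCount(26) == 24  (fac becomes 8)
 *           roundDownRankCount(5)  == 4 */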
void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
                      t_commrec                 *cr,
                      FILE                      *fp_log,
                      const t_inputrec          *ir,
                      matrix                     box,
                      const interaction_const_t *ic,
                      struct gmx_pme_t          *pmedata,
                      gmx_bool                   bUseGPU,
                      gmx_bool                  *bPrinting)
{
    pme_load_balancing_t *pme_lb;
    real                  spm, sp;
    int                   d;

    snew(pme_lb, 1);

    pme_lb->bSepPMERanks  = !(cr->duty & DUTY_PME);

    /* Initially we turn balancing on directly, based on PP/PME imbalance */
    pme_lb->bTriggerOnDLB = FALSE;

    /* Any number of stages >= 2 is supported */
    pme_lb->nstage        = 2;

    pme_lb->cutoff_scheme = ir->cutoff_scheme;

    if (pme_lb->cutoff_scheme == ecutsVERLET)
    {
        pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb;
        pme_lb->rbuf_vdw     = pme_lb->rbuf_coulomb;
    }
    else
    {
        if (ic->rcoulomb > ic->rlist)
        {
            pme_lb->rbuf_coulomb = ic->rlistlong - ic->rcoulomb;
        }
        else
        {
            pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb;
        }
        if (ic->rvdw > ic->rlist)
        {
            pme_lb->rbuf_vdw = ic->rlistlong - ic->rvdw;
        }
        else
        {
            pme_lb->rbuf_vdw = ic->rlist - ic->rvdw;
        }
    }

    copy_mat(box, pme_lb->box_start);
    if (ir->ePBC == epbcXY && ir->nwall == 2)
    {
        svmul(ir->wall_ewald_zfac, pme_lb->box_start[ZZ], pme_lb->box_start[ZZ]);
    }

    pme_lb->n = 1;
    snew(pme_lb->setup, pme_lb->n);

    pme_lb->rcut_vdw                 = ic->rvdw;
    pme_lb->rcut_coulomb_start       = ir->rcoulomb;
    pme_lb->nstcalclr_start          = ir->nstcalclr;

    pme_lb->cur                      = 0;
    pme_lb->setup[0].rcut_coulomb    = ic->rcoulomb;
    pme_lb->setup[0].rlist           = ic->rlist;
    pme_lb->setup[0].rlistlong       = ic->rlistlong;
    pme_lb->setup[0].nstcalclr       = ir->nstcalclr;
    pme_lb->setup[0].grid[XX]        = ir->nkx;
    pme_lb->setup[0].grid[YY]        = ir->nky;
    pme_lb->setup[0].grid[ZZ]        = ir->nkz;
    pme_lb->setup[0].ewaldcoeff_q    = ic->ewaldcoeff_q;
    pme_lb->setup[0].ewaldcoeff_lj   = ic->ewaldcoeff_lj;

    pme_lb->setup[0].pmedata         = pmedata;

    spm = 0;
    for (d = 0; d < DIM; d++)
    {
        sp = norm(pme_lb->box_start[d])/pme_lb->setup[0].grid[d];
        if (sp > spm)
        {
            spm = sp;
        }
    }
    pme_lb->setup[0].spacing = spm;

    if (ir->fourier_spacing > 0)
    {
        pme_lb->cut_spacing = ir->rcoulomb/ir->fourier_spacing;
    }
    else
    {
        pme_lb->cut_spacing = ir->rcoulomb/pme_lb->setup[0].spacing;
    }

    pme_lb->stage = 0;

    pme_lb->fastest     = 0;
    pme_lb->lower_limit = 0;
    pme_lb->start       = 0;
    pme_lb->end         = 0;
    pme_lb->elimited    = epmelblimNO;

    pme_lb->cycles_n = 0;
    pme_lb->cycles_c = 0;

    /* Tune with GPUs and/or separate PME ranks.
     * When running only on a CPU without PME ranks, PME tuning will only help
     * with small numbers of atoms in the cut-off sphere.
     */
    pme_lb->bActive  = (wallcycle_have_counter() && (bUseGPU ||
                                                     pme_lb->bSepPMERanks));

    /* With GPUs and no separate PME ranks we can't measure the PP/PME
     * imbalance, so we start balancing right away.
     * Otherwise we only start balancing after we observe imbalance.
     */
    pme_lb->bBalance = (pme_lb->bActive && (bUseGPU && !pme_lb->bSepPMERanks));

    pme_lb->step_rel_stop = PMETunePeriod*ir->nstlist;

    /* Delay DD load balancing when GPUs are used */
    if (pme_lb->bActive && DOMAINDECOMP(cr) && cr->dd->nnodes > 1 && bUseGPU)
    {
        /* Lock DLB=auto to off (does nothing when DLB=yes/no).
         * With GPUs and separate PME nodes, we want to first
         * do PME tuning without DLB, since DLB might limit
         * the cut-off, which never improves performance.
         * We allow for DLB + PME tuning after a first round of tuning.
         */
        dd_dlb_lock(cr->dd);
        if (dd_dlb_is_locked(cr->dd))
        {
            md_print_warn(cr, fp_log, "NOTE: DLB will not turn on during the first phase of PME tuning\n");
        }
    }

    *pme_lb_p = pme_lb;

    *bPrinting = pme_lb->bBalance;
}
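/* Editor's note: a small sketch (hypothetical names, not GROMACS source) of
 * the starting-grid spacing computed in pme_loadbal_init above: the coarsest
 * spacing over the three box dimensions, |box vector d| / grid points along
 * d, which together with rcoulomb fixes the cut-off/spacing ratio used when
 * scaling the setup. */
#include <algorithm>

static double maxGridSpacing(const double boxLength[3], const int grid[3])
{
    double spm = 0;
    for (int d = 0; d < 3; d++)
    {
        spm = std::max(spm, boxLength[d]/grid[d]);
    }
    return spm;
}

/* Example: a 4 x 4 x 6 nm box with a 40 x 40 x 60 grid gives a spacing of
 * 0.1 nm; with rcoulomb = 1.0 nm the resulting cut-off/spacing ratio is 10. */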
Example #27
0
/*!
 * Thread affinity set by the OpenMP library can conflict with the GROMACS
 * internal affinity setting.
 *
 * While GNU OpenMP does not set affinity by default, the Intel OpenMP library
 * does. This conflicts with the internal affinity (especially thread-MPI)
 * setting, results in incorrectly locked threads, and causes dreadful performance.
 *
 * The KMP_AFFINITY environment variable is used by Intel, GOMP_CPU_AFFINITY
 * by the GNU compilers (Intel also honors it well). If any of the variables
 * is set, we honor it, disable the internal pinning, and warn the user.
 * When using Intel OpenMP, we will disable affinity if the user did not set it
 * manually through one of the aforementioned environment variables.
 *
 * Note that the Intel OpenMP affinity disabling will only take effect if this
 * function is called before the OpenMP library gets initialized which happens
 * when the first call is made into a compilation unit that contains OpenMP
 * pragmas.
 */
void gmx_omp_check_thread_affinity(FILE *fplog, const t_commrec *cr,
                                   gmx_hw_opt_t *hw_opt)
{
    gmx_bool bKmpAffinitySet, bGompCpuAffinitySet;
    char *kmp_env, *gomp_env;

    /* no need to worry if internal thread pinning is turned off */
    if (!hw_opt->bThreadPinning)
    {
        return;
    }

#if defined(GMX_OPENMP)

    /* We assume that the affinity setting is available on all platforms
     * gcc supports. Even if this is not the case (e.g. Mac OS) the user
     * will only get a warning. */
    bGompCpuAffinitySet = FALSE;
    gomp_env = NULL;
#if defined(__GNUC__)
    gomp_env = getenv("GOMP_CPU_AFFINITY");
    bGompCpuAffinitySet = (gomp_env != NULL);
#endif /* __GNUC__ */

    bKmpAffinitySet = FALSE;
#if defined(__INTEL_COMPILER)
    kmp_env = getenv("KMP_AFFINITY");
    bKmpAffinitySet = (kmp_env != NULL);

    /* disable Intel OpenMP affinity if neither KMP_AFFINITY nor
     * GOMP_CPU_AFFINITY is set (Intel uses the GNU env. var as well) */
    if (!bKmpAffinitySet && !bGompCpuAffinitySet)
    {
        int retval;

#ifdef _MSC_VER
        /* Windows not POSIX */
        retval = _putenv_s("KMP_AFFINITY", "disabled");
#else
        /* POSIX */
        retval = setenv("KMP_AFFINITY", "disabled", 0);
#endif /* _MSC_VER */

        if (debug)
        {
            fprintf(debug, "Disabling Intel OpenMP affinity by setting the KMP_AFFINITY=disabled env. var.\n");
        }

        if (retval != 0)
        {
            gmx_warning("Disabling Intel OpenMp affinity setting failed!");
        }
    }

    /* turn off internal pinning if KMP_AFFINITY != "disabled" */
    if (bKmpAffinitySet && (gmx_strncasecmp(kmp_env, "disabled", 8) != 0))
    {
        md_print_warn(cr, fplog, "WARNING: KMP_AFFINITY set, will turn off %s internal affinity\n"
                      "         setting as the two can conflict and cause performance degradation.\n"
                      "         To keep using the %s internal affinity setting, set the\n"
                      "         KMP_AFFINITY=disabled environment variable.",
                      ShortProgram(), ShortProgram());

        hw_opt->bThreadPinning = FALSE;
    }
#endif /* __INTEL_COMPILER */

#if defined(__INTEL_COMPILER) || defined(__GNUC__)
    /* turn off internal pinning if GOMP_CPU_AFFINITY is set & non-empty */
    if (bGompCpuAffinitySet && gomp_env != NULL && *gomp_env != '\0')
    {
        md_print_warn(cr, fplog,
                      "WARNING: GOMP_CPU_AFFINITY set, will turn off %s internal affinity\n"
                      "         setting as the two can conflict and cause performance degradation.\n"
                      "         To keep using the %s internal affinity setting, unset the\n"
                      "         GOMP_CPU_AFFINITY environment variable.",
                      ShortProgram(), ShortProgram());

        hw_opt->bThreadPinning = FALSE;
    }
#endif /* __INTEL_COMPILER || __GNUC__ */

#endif /* GMX_OPENMP */
}
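/* Editor's note: a POSIX sketch (not GROMACS source) of the environment
 * handling above.  getenv() returns NULL when a variable is unset, and the
 * third argument 0 to setenv() means "do not overwrite", so a KMP_AFFINITY
 * value chosen by the user is never clobbered. */
#include <stdio.h>
#include <stdlib.h>

int main()
{
    if (getenv("KMP_AFFINITY") == NULL && getenv("GOMP_CPU_AFFINITY") == NULL)
    {
        if (setenv("KMP_AFFINITY", "disabled", 0) == 0)
        {
            printf("Requested Intel OpenMP affinity to be disabled\n");
        }
    }
    return 0;
}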
Example #28
0
void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr,
                           int nthreads_hw_avail,
                           int omp_nthreads_req,
                           int omp_nthreads_pme_req,
                           gmx_bool gmx_unused bThisNodePMEOnly,
                           gmx_bool bFullOmpSupport)
{
    int      nth, nth_pmeonly, gmx_maxth, nppn;
    char    *env;
    gmx_bool bSepPME, bOMP;

#ifdef GMX_OPENMP
    bOMP = TRUE;
#else
    bOMP = FALSE;
#endif /* GMX_OPENMP */

    /* number of MPI processes/threads per physical node */
    nppn = cr->nrank_intranode;

    bSepPME = ( (cr->duty & DUTY_PP) && !(cr->duty & DUTY_PME)) ||
        (!(cr->duty & DUTY_PP) &&  (cr->duty & DUTY_PME));

#ifdef GMX_THREAD_MPI
    /* modth is shared among tMPI threads, so for thread safety the
     * detection is done on the master only. It is not thread-safe with
     * multiple simulations, but that's anyway not supported by tMPI. */
    if (SIMMASTER(cr))
#endif
    {
        /* just return if the initialization has already been done */
        if (modth.initialized)
        {
            return;
        }

        /* With full OpenMP support (verlet scheme) set the number of threads
         * per process, by default:
         * - 1 if not compiled with OpenMP or
         * - OMP_NUM_THREADS if the env. var is set, or
         * - omp_nthreads_req = #of threads requested by the user on the mdrun
         *   command line, otherwise
         * - take the max number of available threads and distribute them
         *   on the processes/tMPI threads.
         * ~ The GMX_*_NUM_THREADS env var overrides the number of threads of
         *   the respective module and it has to be used in conjunction with
         *   OMP_NUM_THREADS.
         *
         * With the group scheme OpenMP multithreading is only supported in PME,
         * for all other modules nthreads is set to 1.
         * The number of PME threads is equal to:
         * - 1 if not compiled with OpenMP or
         * - GMX_PME_NUM_THREADS if defined, otherwise
         * - OMP_NUM_THREADS if defined, otherwise
         * - 1
         */
        nth = 1;
        if ((env = getenv("OMP_NUM_THREADS")) != NULL)
        {
            if (!bOMP && (strncmp(env, "1", 1) != 0))
            {
                gmx_warning("OMP_NUM_THREADS is set, but %s was compiled without OpenMP support!",
                            ShortProgram());
            }
            else
            {
                nth = gmx_omp_get_max_threads();
            }
        }
        else if (omp_nthreads_req > 0)
        {
            nth = omp_nthreads_req;
        }
        else if (bFullOmpSupport && bOMP)
        {
            /* max available threads per node */
            nth = nthreads_hw_avail;

            /* divide the threads among the MPI processes/tMPI threads */
            if (nth >= nppn)
            {
                nth /= nppn;
            }
            else
            {
                nth = 1;
            }
        }

        /* now we have the global values, set them:
         * - 1 if not compiled with OpenMP and for the group scheme
         * - nth for the verlet scheme when compiled with OpenMP
         */
        if (bFullOmpSupport && bOMP)
        {
            modth.gnth = nth;
        }
        else
        {
            modth.gnth = 1;
        }

        if (bSepPME)
        {
            if (omp_nthreads_pme_req > 0)
            {
                modth.gnth_pme = omp_nthreads_pme_req;
            }
            else
            {
                modth.gnth_pme = nth;
            }
        }
        else
        {
            modth.gnth_pme = 0;
        }

        /* now set the per-module values */
        modth.nth[emntDefault] = modth.gnth;
        pick_module_nthreads(fplog, emntDomdec, SIMMASTER(cr), bFullOmpSupport, bSepPME);
        pick_module_nthreads(fplog, emntPairsearch, SIMMASTER(cr), bFullOmpSupport, bSepPME);
        pick_module_nthreads(fplog, emntNonbonded, SIMMASTER(cr), bFullOmpSupport, bSepPME);
        pick_module_nthreads(fplog, emntBonded, SIMMASTER(cr), bFullOmpSupport, bSepPME);
        pick_module_nthreads(fplog, emntPME, SIMMASTER(cr), bFullOmpSupport, bSepPME);
        pick_module_nthreads(fplog, emntUpdate, SIMMASTER(cr), bFullOmpSupport, bSepPME);
        pick_module_nthreads(fplog, emntVSITE, SIMMASTER(cr), bFullOmpSupport, bSepPME);
        pick_module_nthreads(fplog, emntLINCS, SIMMASTER(cr), bFullOmpSupport, bSepPME);
        pick_module_nthreads(fplog, emntSETTLE, SIMMASTER(cr), bFullOmpSupport, bSepPME);

        /* set the number of threads globally */
        if (bOMP)
        {
#ifndef GMX_THREAD_MPI
            if (bThisNodePMEOnly)
            {
                gmx_omp_set_num_threads(modth.gnth_pme);
            }
            else
#endif      /* GMX_THREAD_MPI */
            {
                if (bFullOmpSupport)
                {
                    gmx_omp_set_num_threads(nth);
                }
                else
                {
                    gmx_omp_set_num_threads(1);
                }
            }
        }

        modth.initialized = TRUE;
    }
#ifdef GMX_THREAD_MPI
    /* Non-master threads have to wait for the detection to be done. */
    if (PAR(cr))
    {
        MPI_Barrier(cr->mpi_comm_mysim);
    }
#endif

    /* inform the user about the settings */
    if (bOMP)
    {
#ifdef GMX_THREAD_MPI
        const char *mpi_str = "per tMPI thread";
#else
        const char *mpi_str = "per MPI process";
#endif

        /* for group scheme we print PME threads info only */
        if (bFullOmpSupport)
        {
            md_print_info(cr, fplog, "Using %d OpenMP thread%s %s\n",
                          modth.gnth, modth.gnth > 1 ? "s" : "",
                          cr->nnodes > 1 ? mpi_str : "");
        }
        if (bSepPME && modth.gnth_pme != modth.gnth)
        {
            md_print_info(cr, fplog, "Using %d OpenMP thread%s %s for PME\n",
                          modth.gnth_pme, modth.gnth_pme > 1 ? "s" : "",
                          cr->nnodes > 1 ? mpi_str : "");
        }
    }

    /* detect and warn about oversubscription
     * TODO: enable this for separate PME nodes as well! */
    if (!bSepPME && cr->rank_pp_intranode == 0)
    {
        char sbuf[STRLEN], sbuf1[STRLEN], sbuf2[STRLEN];

        if (modth.gnth*nppn > nthreads_hw_avail)
        {
            sprintf(sbuf, "threads");
            sbuf1[0] = '\0';
            sprintf(sbuf2, "O");
#ifdef GMX_MPI
            if (modth.gnth == 1)
            {
#ifdef GMX_THREAD_MPI
                sprintf(sbuf, "thread-MPI threads");
#else
                sprintf(sbuf, "MPI processes");
                sprintf(sbuf1, " per node");
                sprintf(sbuf2, "On node %d: o", cr->sim_nodeid);
#endif
            }
#endif
            md_print_warn(cr, fplog,
                          "WARNING: %sversubscribing the available %d logical CPU cores%s with %d %s.\n"
                          "         This will cause considerable performance loss!",
                          sbuf2, nthreads_hw_avail, sbuf1, nppn*modth.gnth, sbuf);
        }
    }
}
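/* Editor's note: a hedged sketch (hypothetical helper, not GROMACS source) of
 * the default thread-count resolution described in the long comment inside
 * gmx_omp_nthreads_init above.  Priority: the OMP_NUM_THREADS environment
 * variable, then the -ntomp request, then all hardware threads divided over
 * the ranks on the node; without OpenMP support the count is always 1. */
static int resolveOmpThreadsPerRank(bool haveOmpSupport,
                                    int  ompEnvThreads,       /* 0 if OMP_NUM_THREADS unset */
                                    int  ompThreadsRequested, /* 0 if -ntomp not given */
                                    int  hwThreadsOnNode,
                                    int  ranksOnNode)
{
    if (!haveOmpSupport)
    {
        return 1;
    }
    if (ompEnvThreads > 0)
    {
        return ompEnvThreads;
    }
    if (ompThreadsRequested > 0)
    {
        return ompThreadsRequested;
    }
    /* Distribute the available hardware threads over the ranks on this node. */
    return (hwThreadsOnNode >= ranksOnNode) ? hwThreadsOnNode/ranksOnNode : 1;
}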