Code example #1
/* Return the number of thread-MPI ranks to use.
 * This is chosen such that we can always obey our own efficiency checks.
 */
static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
                                        const gmx_hw_opt_t  *hw_opt,
                                        int                  nthreads_tot,
                                        int                  ngpu)
{
    int nrank;

    GMX_RELEASE_ASSERT(nthreads_tot > 0, "There must be at least one thread per rank");

    /* There are no separate PME nodes here, as we ensured in
     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
     * and a conditional ensures we would not have ended up here.
     * Note that separate PME nodes might be switched on later.
     */
    if (ngpu > 0)
    {
        nrank = ngpu;
        if (nthreads_tot < nrank)
        {
            /* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
            nrank = nthreads_tot;
        }
        else if (gmx_gpu_sharing_supported() &&
                 (nthreads_tot > nthreads_omp_faster(hwinfo->cpuid_info,
                                                     ngpu > 0) ||
                  (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max)))
        {
            /* The high OpenMP thread count will likely result in sub-optimal
             * performance. Increase the rank count to reduce the thread count
             * per rank. This will lead to GPU sharing by MPI ranks/threads.
             */
            int nshare;

            /* Increase the rank count as long as we have more than 6 OpenMP
             * threads per rank or the number of hardware threads is not
             * divisible by the rank count. Don't go below 2 OpenMP threads.
             */
            nshare = 1;
            do
            {
                nshare++;
                nrank = ngpu*nshare;
            }
            while (nthreads_tot/nrank > nthreads_omp_mpi_target_max ||
                   (nthreads_tot/(ngpu*(nshare + 1)) >= nthreads_omp_mpi_ok_min_gpu && nthreads_tot % nrank != 0));
        }
    }
    else if (hw_opt->nthreads_omp > 0)
    {
        /* Here we could oversubscribe; if we do, we issue a warning later */
        nrank = std::max(1, nthreads_tot/hw_opt->nthreads_omp);
    }
    else
    {
        if (nthreads_tot <= nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
        {
            /* Use pure OpenMP parallelization */
            nrank = 1;
        }
        else
        {
            /* Don't use OpenMP parallelization */
            nrank = nthreads_tot;
        }
    }

    return nrank;
}
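
To make the GPU-sharing loop above easier to trace by hand, here is a minimal standalone sketch of just that loop. The limits are taken from the comment in the code (a target maximum of 6 OpenMP threads per rank, and no fewer than 2 with GPUs) and are hard-coded here as assumptions; the actual constants nthreads_omp_mpi_target_max and nthreads_omp_mpi_ok_min_gpu are defined elsewhere in the file, and ranksForGpuSharing is an illustrative name, not a GROMACS function.

#include <cstdio>

static int ranksForGpuSharing(int nthreads_tot, int ngpu)
{
    const int target_max = 6; /* assumed value of nthreads_omp_mpi_target_max */
    const int ok_min_gpu = 2; /* assumed value of nthreads_omp_mpi_ok_min_gpu */

    int nshare = 1;
    int nrank;
    do
    {
        nshare++;
        nrank = ngpu*nshare;
    }
    while (nthreads_tot/nrank > target_max ||
           (nthreads_tot/(ngpu*(nshare + 1)) >= ok_min_gpu && nthreads_tot % nrank != 0));

    return nrank;
}

int main()
{
    /* 32 hardware threads and 2 GPUs: 16 threads per GPU rank is well above
     * the target of 6, so each GPU is shared by several ranks; the loop stops
     * at 8 ranks, i.e. 4 OpenMP threads per rank and 4 ranks per GPU.
     */
    printf("%d\n", ranksForGpuSharing(32, 2));
    return 0;
}

Note that in the function above this path is only taken when a GPU rank would otherwise get an inefficiently high OpenMP thread count.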
Code example #2
void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                        const gmx_hw_opt_t  *hw_opt,
                                        gmx_bool             bNtOmpOptionSet,
                                        t_commrec           *cr,
                                        FILE                *fplog)
{
#if defined GMX_OPENMP && defined GMX_MPI
    int         nth_omp_min, nth_omp_max, ngpu;
    char        buf[1000];
#ifdef GMX_THREAD_MPI
    const char *mpi_option = " (option -ntmpi)";
#else
    const char *mpi_option = "";
#endif

    /* This function should be called after thread-MPI (when configured) and
     * OpenMP have been initialized. Check that here.
     */
#ifdef GMX_THREAD_MPI
    GMX_RELEASE_ASSERT(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max, "Inconsistent OpenMP thread count default values");
    GMX_RELEASE_ASSERT(hw_opt->nthreads_tmpi >= 1, "Must have at least one thread-MPI rank");
#endif
    GMX_RELEASE_ASSERT(gmx_omp_nthreads_get(emntDefault) >= 1, "Must have at least one OpenMP thread");

    nth_omp_min = gmx_omp_nthreads_get(emntDefault);
    nth_omp_max = gmx_omp_nthreads_get(emntDefault);
    ngpu        = hw_opt->gpu_opt.n_dev_use;

    /* Thread-MPI seems to have a bug with reduce on 1 node, so use a conditional. */
    if (cr->nnodes + cr->npmenodes > 1)
    {
        int count[3], count_max[3];

        count[0] = -nth_omp_min;
        count[1] =  nth_omp_max;
        count[2] =  ngpu;

        MPI_Allreduce(count, count_max, 3, MPI_INT, MPI_MAX, cr->mpi_comm_mysim);

        /* In case of an inhomogeneous run setup we use the maximum counts */
        nth_omp_min = -count_max[0];
        nth_omp_max =  count_max[1];
        ngpu        =  count_max[2];
    }

    int nthreads_omp_mpi_ok_min;

    if (ngpu == 0)
    {
        nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_cpu;
    }
    else
    {
        /* With GPUs we set the minimum number of OpenMP threads to 2 to catch
         * cases where the user specifies #ranks == #cores.
         */
        nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_gpu;
    }

    if (DOMAINDECOMP(cr) && cr->nnodes > 1)
    {
        if (nth_omp_max < nthreads_omp_mpi_ok_min ||
            (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
             nth_omp_max > nthreads_omp_mpi_ok_max))
        {
            /* Note that we print target_max here, not ok_max */
            sprintf(buf, "Your choice of number of MPI ranks and amount of resources results in using %d OpenMP threads per rank, which is most likely inefficient. The optimum is usually between %d and %d threads per rank.",
                    nth_omp_max,
                    nthreads_omp_mpi_ok_min,
                    nthreads_omp_mpi_target_max);

            if (bNtOmpOptionSet)
            {
                md_print_warn(cr, fplog, "NOTE: %s\n", buf);
            }
            else
            {
                /* This fatal error, and the one below, is nasty, but it's
                 * probably the only way to ensure that all users don't waste
                 * a lot of resources, since many users don't read logs/stderr.
                 */
                gmx_fatal(FARGS, "%s If you want to run with this setup, specify the -ntomp option. But we suggest changing the number of MPI ranks%s.", buf, mpi_option);
            }
        }
    }
    else
    {
        /* No domain decomposition (or only one domain) */
        if (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
            nth_omp_max > nthreads_omp_faster(hwinfo->cpuid_info, ngpu > 0))
        {
            /* To arrive here, the user/system set #ranks and/or #OMPthreads */
            gmx_bool bEnvSet;
            char     buf2[256];

            bEnvSet = (getenv("OMP_NUM_THREADS") != NULL);

            if (bNtOmpOptionSet || bEnvSet)
            {
                sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
            }
            else
            {
                sprintf(buf2, "Your choice of %d MPI rank%s and the use of %d total threads %sleads to the use of %d OpenMP threads",
                        cr->nnodes + cr->npmenodes,
                        cr->nnodes + cr->npmenodes == 1 ? "" : "s",
                        hw_opt->nthreads_tot > 0 ? hw_opt->nthreads_tot : hwinfo->nthreads_hw_avail,
                        hwinfo->nphysicalnode > 1 ? "on a node " : "",
                        nth_omp_max);
            }
            sprintf(buf, "%s, whereas we expect the optimum to be with more MPI ranks with %d to %d OpenMP threads.",
                    buf2, nthreads_omp_mpi_ok_min, nthreads_omp_mpi_target_max);

            /* We cannot quit with a fatal error when OMP_NUM_THREADS is set
             * with different values per rank or node, since in that case
             * the user cannot set -ntomp to override the error.
             */
            if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
            {
                md_print_warn(cr, fplog, "NOTE: %s\n", buf);
            }
            else
            {
                gmx_fatal(FARGS, "%s If you want to run with this many OpenMP threads, specify the -ntomp option. But we suggest increasing the number of MPI ranks%s.", buf, mpi_option);
            }
        }
    }
#else /* GMX_OPENMP && GMX_MPI */
      /* No OpenMP and/or MPI: it doesn't make much sense to check */
    GMX_UNUSED_VALUE(hw_opt);
    GMX_UNUSED_VALUE(bNtOmpOptionSet);
    /* Check if we have more than 1 physical core, if detected,
     * or more than 1 hardware thread if physical cores were not detected.
     */
#if !(defined GMX_OPENMP) && !(defined GMX_MPI)
    if ((hwinfo->ncore > 1) ||
        (hwinfo->ncore == 0 && hwinfo->nthreads_hw_avail > 1))
    {
        md_print_warn(cr, fplog, "NOTE: GROMACS was compiled without OpenMP and (thread-)MPI support, can only use a single CPU core\n");
    }
#else
    GMX_UNUSED_VALUE(hwinfo);
    GMX_UNUSED_VALUE(cr);
    GMX_UNUSED_VALUE(fplog);
#endif

#endif /* GMX_OPENMP && GMX_MPI */
}
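
The single MPI_Allreduce above collects both the minimum and the maximum OpenMP thread count (plus the maximum GPU count) in one call: the value whose minimum is wanted is negated before the MPI_MAX reduction, and the result is negated back afterwards, since the maximum of the negated values is minus the minimum. A minimal sketch of that trick follows, with illustrative variable names and a made-up per-rank thread count; it assumes an MPI compiler wrapper such as mpicxx and is not taken from the GROMACS sources.

#include <cstdio>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int nth_omp = 2 + rank;  /* pretend each rank uses a different thread count */

    /* Pack the negated value (for the minimum) and the plain value (for the
     * maximum) into one buffer, so a single MPI_MAX reduction yields both.
     */
    int count[2]     = { -nth_omp, nth_omp };
    int count_max[2] = { 0, 0 };
    MPI_Allreduce(count, count_max, 2, MPI_INT, MPI_MAX, MPI_COMM_WORLD);

    int nth_omp_min = -count_max[0];  /* minus the maximum of the negated values */
    int nth_omp_max =  count_max[1];

    if (rank == 0)
    {
        printf("min %d max %d\n", nth_omp_min, nth_omp_max);
    }

    MPI_Finalize();
    return 0;
}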
Code example #3
/* Return the number of thread-MPI ranks to use.
 * This is chosen such that we can always obey our own efficiency checks.
 */
static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
                                        const gmx_hw_opt_t  *hw_opt,
                                        int                  nthreads_tot,
                                        int                  ngpu)
{
    int                 nrank;
    const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;

    GMX_RELEASE_ASSERT(nthreads_tot > 0, "There must be at least one thread per rank");

    /* There are no separate PME nodes here, as we ensured in
     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
     * and a conditional ensures we would not have ended up here.
     * Note that separate PME nodes might be switched on later.
     */
    if (ngpu > 0)
    {
        nrank = ngpu;

        /* When the user sets nthreads_omp, we can end up oversubscribing CPU cores
         * if we simply start as many ranks as GPUs. To avoid this, we start as few
         * tMPI ranks as necessary to avoid oversubscription and instead leave GPUs idle.
         * If the user does not set the number of OpenMP threads, nthreads_omp==0 and
         * this code has no effect.
         */
        GMX_RELEASE_ASSERT(hw_opt->nthreads_omp >= 0, "nthreads_omp is negative, but previous checks should have prevented this");
        while (nrank*hw_opt->nthreads_omp > hwinfo->nthreads_hw_avail && nrank > 1)
        {
            nrank--;
        }

        if (nthreads_tot < nrank)
        {
            /* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
            nrank = nthreads_tot;
        }
        else if (gmx_gpu_sharing_supported() &&
                 (nthreads_tot > nthreads_omp_faster(cpuInfo, ngpu > 0) ||
                  (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max)))
        {
            /* The high OpenMP thread count will likely result in sub-optimal
             * performance. Increase the rank count to reduce the thread count
             * per rank. This will lead to GPU sharing by MPI ranks/threads.
             */
            int nshare;

            /* Increase the rank count as long as we have more than 6 OpenMP
             * threads per rank or the number of hardware threads is not
             * divisible by the rank count. Don't go below 2 OpenMP threads.
             */
            nshare = 1;
            do
            {
                nshare++;
                nrank = ngpu*nshare;
            }
            while (nthreads_tot/nrank > nthreads_omp_mpi_target_max ||
                   (nthreads_tot/(ngpu*(nshare + 1)) >= nthreads_omp_mpi_ok_min_gpu && nthreads_tot % nrank != 0));
        }
    }
    else if (hw_opt->nthreads_omp > 0)
    {
        /* Here we could oversubscribe; if we do, we issue a warning later */
        nrank = std::max(1, nthreads_tot/hw_opt->nthreads_omp);
    }
    else
    {
        if (nthreads_tot <= nthreads_omp_faster(cpuInfo, ngpu > 0))
        {
            /* Use pure OpenMP parallelization */
            nrank = 1;
        }
        else
        {
            /* Don't use OpenMP parallelization */
            nrank = nthreads_tot;
        }
    }

    return nrank;
}
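
Compared with code example #1, the main change in example #3 (besides switching from hwinfo->cpuid_info to gmx::CpuInfo) is the loop that trims the rank count when a user-specified OpenMP thread count would oversubscribe the available hardware threads, at the cost of leaving some GPUs idle. Below is a minimal sketch of just that loop; the function and parameter names are illustrative stand-ins for the hw_opt and hwinfo fields used above, not GROMACS identifiers.

#include <cstdio>

static int trimRanksToAvoidOversubscription(int ngpu,
                                            int nthreads_omp,      /* user-set; 0 means "not set" */
                                            int nthreads_hw_avail)
{
    /* Start with one rank per GPU, then drop ranks until the total thread
     * count fits in the hardware, but never go below one rank.
     */
    int nrank = ngpu;
    while (nrank*nthreads_omp > nthreads_hw_avail && nrank > 1)
    {
        nrank--;
    }
    return nrank;
}

int main()
{
    /* 4 GPUs with 8 OpenMP threads requested per rank, but only 16 hardware
     * threads: 4*8 = 32 would oversubscribe, so only 2 ranks (2 GPUs) are used.
     */
    printf("%d\n", trimRanksToAvoidOversubscription(4, 8, 16));
    return 0;
}

When nthreads_omp is 0 (the user did not set it), the loop condition is never true and the rank count stays at one rank per GPU, matching the comment in the code above.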