void check_resource_division_efficiency(const gmx_hw_info_t *hwinfo,
                                        const gmx_hw_opt_t  *hw_opt,
                                        gmx_bool             bNtOmpOptionSet,
                                        t_commrec           *cr,
                                        FILE                *fplog)
{
#if defined GMX_OPENMP && defined GMX_MPI
    int         nth_omp_min, nth_omp_max, ngpu;
    char        buf[1000];
#ifdef GMX_THREAD_MPI
    const char *mpi_option = " (option -ntmpi)";
#else
    const char *mpi_option = "";
#endif

    /* This function should be called after thread-MPI (when configured) and
     * OpenMP have been initialized. Check that here.
     */
#ifdef GMX_THREAD_MPI
    GMX_RELEASE_ASSERT(nthreads_omp_faster_default >= nthreads_omp_mpi_ok_max, "Inconsistent OpenMP thread count default values");
    GMX_RELEASE_ASSERT(hw_opt->nthreads_tmpi >= 1, "Must have at least one thread-MPI rank");
#endif
    GMX_RELEASE_ASSERT(gmx_omp_nthreads_get(emntDefault) >= 1, "Must have at least one OpenMP thread");

    nth_omp_min = gmx_omp_nthreads_get(emntDefault);
    nth_omp_max = gmx_omp_nthreads_get(emntDefault);
    ngpu        = hw_opt->gpu_opt.n_dev_use;

    /* Thread-MPI seems to have a bug with reduce on 1 node, so use a conditional */
    if (cr->nnodes + cr->npmenodes > 1)
    {
        int count[3], count_max[3];

        count[0] = -nth_omp_min;
        count[1] =  nth_omp_max;
        count[2] =  ngpu;

        MPI_Allreduce(count, count_max, 3, MPI_INT, MPI_MAX, cr->mpi_comm_mysim);

        /* In case of an inhomogeneous run setup we use the maximum counts;
         * storing the negated minimum lets one MPI_MAX reduction return both
         * the minimum and the maximum.
         */
        nth_omp_min = -count_max[0];
        nth_omp_max =  count_max[1];
        ngpu        =  count_max[2];
    }

    int nthreads_omp_mpi_ok_min;

    if (ngpu == 0)
    {
        nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_cpu;
    }
    else
    {
        /* With GPUs we set the minimum number of OpenMP threads to 2 to catch
         * cases where the user specifies #ranks == #cores.
         */
        nthreads_omp_mpi_ok_min = nthreads_omp_mpi_ok_min_gpu;
    }

    if (DOMAINDECOMP(cr) && cr->nnodes > 1)
    {
        if (nth_omp_max < nthreads_omp_mpi_ok_min ||
            (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
             nth_omp_max > nthreads_omp_mpi_ok_max))
        {
            /* Note that we print target_max here, not ok_max */
            sprintf(buf, "Your choice of number of MPI ranks and amount of resources results in using %d OpenMP threads per rank, which is most likely inefficient. The optimum is usually between %d and %d threads per rank.",
                    nth_omp_max,
                    nthreads_omp_mpi_ok_min,
                    nthreads_omp_mpi_target_max);

            if (bNtOmpOptionSet)
            {
                md_print_warn(cr, fplog, "NOTE: %s\n", buf);
            }
            else
            {
                /* This fatal error, and the one below, are nasty, but they are
                 * probably the only way to ensure that all users don't waste
                 * a lot of resources, since many users don't read logs/stderr.
                 */
                gmx_fatal(FARGS, "%s If you want to run with this setup, specify the -ntomp option. But we suggest changing the number of MPI ranks%s.",
                          buf, mpi_option);
            }
        }
    }
    else
    {
        /* No domain decomposition (or only one domain) */
        if (!(ngpu > 0 && !gmx_gpu_sharing_supported()) &&
            nth_omp_max > nthreads_omp_faster(*hwinfo->cpuInfo, ngpu > 0))
        {
            /* To arrive here, the user/system set #ranks and/or #OMPthreads */
            gmx_bool bEnvSet;
            char     buf2[256];

            bEnvSet = (getenv("OMP_NUM_THREADS") != NULL);

            if (bNtOmpOptionSet || bEnvSet)
            {
                sprintf(buf2, "You requested %d OpenMP threads", nth_omp_max);
            }
            else
            {
                sprintf(buf2, "Your choice of %d MPI rank%s and the use of %d total threads %sleads to the use of %d OpenMP threads",
                        cr->nnodes + cr->npmenodes,
                        cr->nnodes + cr->npmenodes == 1 ? "" : "s",
                        hw_opt->nthreads_tot > 0 ? hw_opt->nthreads_tot : hwinfo->nthreads_hw_avail,
                        hwinfo->nphysicalnode > 1 ? "on a node " : "",
                        nth_omp_max);
            }
            sprintf(buf, "%s, whereas we expect the optimum to be with more MPI ranks with %d to %d OpenMP threads.",
                    buf2, nthreads_omp_mpi_ok_min, nthreads_omp_mpi_target_max);

            /* We cannot quit with a fatal error when OMP_NUM_THREADS is set
             * with different values per rank or node, since in that case
             * the user cannot set -ntomp to override the error.
             */
            if (bNtOmpOptionSet || (bEnvSet && nth_omp_min != nth_omp_max))
            {
                md_print_warn(cr, fplog, "NOTE: %s\n", buf);
            }
            else
            {
                gmx_fatal(FARGS, "%s If you want to run with this many OpenMP threads, specify the -ntomp option. But we suggest increasing the number of MPI ranks%s.",
                          buf, mpi_option);
            }
        }
    }
#else /* GMX_OPENMP && GMX_MPI */
    /* No OpenMP and/or MPI: it doesn't make much sense to check */
    GMX_UNUSED_VALUE(hw_opt);
    GMX_UNUSED_VALUE(bNtOmpOptionSet);
    /* Check if we have more than 1 physical core, if detected,
     * or more than 1 hardware thread if physical cores were not detected.
     */
#if !(defined GMX_OPENMP) && !(defined GMX_MPI)
    if ((hwinfo->ncore > 1) ||
        (hwinfo->ncore == 0 && hwinfo->nthreads_hw_avail > 1))
    {
        md_print_warn(cr, fplog, "NOTE: GROMACS was compiled without OpenMP and (thread-)MPI support, can only use a single CPU core\n");
    }
#else
    GMX_UNUSED_VALUE(hwinfo);
    GMX_UNUSED_VALUE(cr);
    GMX_UNUSED_VALUE(fplog);
#endif

#endif /* GMX_OPENMP && GMX_MPI */
}
/* Return the number of thread-MPI ranks to use.
 * This is chosen such that we can always obey our own efficiency checks.
 */
static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
                                        const gmx_hw_opt_t  *hw_opt,
                                        int                  nthreads_tot,
                                        int                  ngpu)
{
    int                 nrank;
    const gmx::CpuInfo &cpuInfo = *hwinfo->cpuInfo;

    GMX_RELEASE_ASSERT(nthreads_tot > 0, "There must be at least one thread per rank");

    /* There are no separate PME nodes here, as we ensured in
     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
     * and a conditional ensures we would not have ended up here.
     * Note that separate PME nodes might be switched on later.
     */
    if (ngpu > 0)
    {
        nrank = ngpu;

        /* When the user sets nthreads_omp, we can end up oversubscribing CPU cores
         * if we simply start as many ranks as GPUs. To avoid this, we start as few
         * tMPI ranks as necessary to avoid oversubscription and instead leave GPUs idle.
         * If the user does not set the number of OpenMP threads, nthreads_omp==0 and
         * this code has no effect.
         */
        GMX_RELEASE_ASSERT(hw_opt->nthreads_omp >= 0, "nthreads_omp is negative, but previous checks should have prevented this");
        while (nrank*hw_opt->nthreads_omp > hwinfo->nthreads_hw_avail && nrank > 1)
        {
            nrank--;
        }

        if (nthreads_tot < nrank)
        {
            /* #thread < #gpu is very unlikely, but if so: waste gpu(s) */
            nrank = nthreads_tot;
        }
        else if (gmx_gpu_sharing_supported() &&
                 (nthreads_tot > nthreads_omp_faster(cpuInfo, ngpu > 0) ||
                  (ngpu > 1 && nthreads_tot/ngpu > nthreads_omp_mpi_target_max)))
        {
            /* The high OpenMP thread count will likely result in sub-optimal
             * performance. Increase the rank count to reduce the thread count
             * per rank. This will lead to GPU sharing by MPI ranks/threads.
             */
            int nshare;

            /* Increase the rank count as long as we have more than 6 OpenMP
             * threads per rank or the number of hardware threads is not
             * divisible by the rank count. Don't go below 2 OpenMP threads.
             */
            nshare = 1;
            do
            {
                nshare++;
                nrank = ngpu*nshare;
            }
            while (nthreads_tot/nrank > nthreads_omp_mpi_target_max ||
                   (nthreads_tot/(ngpu*(nshare + 1)) >= nthreads_omp_mpi_ok_min_gpu && nthreads_tot % nrank != 0));
        }
    }
    else if (hw_opt->nthreads_omp > 0)
    {
        /* Here we could oversubscribe; when we do, we issue a warning later */
        nrank = std::max(1, nthreads_tot/hw_opt->nthreads_omp);
    }
    else
    {
        if (nthreads_tot <= nthreads_omp_faster(cpuInfo, ngpu > 0))
        {
            /* Use pure OpenMP parallelization */
            nrank = 1;
        }
        else
        {
            /* Don't use OpenMP parallelization */
            nrank = nthreads_tot;
        }
    }

    return nrank;
}
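/* The following is an illustrative, standalone sketch (not part of this file's
 * build) of the GPU-sharing loop above, with the thresholds inlined as plain
 * constants. The values 6 and 2 are assumptions matching the comments in
 * get_tmpi_omp_thread_division (nthreads_omp_mpi_target_max and
 * nthreads_omp_mpi_ok_min_gpu); the function name is made up for the example.
 */
#if 0
#include <cstdio>

static int ranksForGpuSharing(int nthreads_tot, int ngpu)
{
    const int target_max = 6; /* assumed value of nthreads_omp_mpi_target_max */
    const int ok_min_gpu = 2; /* assumed value of nthreads_omp_mpi_ok_min_gpu */
    int       nshare     = 1;
    int       nrank;

    do
    {
        nshare++;
        nrank = ngpu*nshare;
    }
    while (nthreads_tot/nrank > target_max ||
           (nthreads_tot/(ngpu*(nshare + 1)) >= ok_min_gpu && nthreads_tot % nrank != 0));

    return nrank;
}

int main()
{
    /* 32 hardware threads and 2 GPUs: 2 ranks would mean 16 OpenMP threads
     * per rank, so the loop settles on 8 ranks, i.e. 4 ranks sharing each
     * GPU with 4 OpenMP threads per rank.
     */
    printf("nrank = %d\n", ranksForGpuSharing(32, 2));

    return 0;
}
#endif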