/* Set CPU affinity. Can be important for performance.
   On some systems (e.g. Cray) CPU Affinity is set by default.
   But default assigning doesn't work (well) with only some ranks
   having threads. This causes very low performance.
   External tools have cumbersome syntax for setting affinity
   in the case that only some ranks have threads.
   Thus it is important that GROMACS sets the affinity internally
   if only PME is using threads.

   Overview of what this function does:
     1. Returns immediately when pinning is switched off or when
        tMPI reports that setting affinity is not supported.
     2. Determines the number of OpenMP threads of this rank and, in MPI
        builds with more than one rank, the index of this rank's first
        thread among all threads of the ranks in the same intra-node
        communicator, plus the total thread count of that communicator.
     3. In "auto" mode, returns without pinning unless that total equals
        hwinfo->nthreads_hw_avail.
     4. Asks get_thread_affinity_layout() for the stride and an optional
        locality-order table, then pins every OpenMP thread to
        offset + (node-wide thread index)*stride, mapped through the
        table when one was returned.
     5. Counts the threads for which pinning succeeded and reports failures.

   fplog   log file handle, forwarded to the md_print_* helpers and to
           get_thread_affinity_layout()
   cr      communication record; duty, rank_intranode, nodeid and nnodes
           are read here
   hw_opt  hardware options; thread_affinity, core_pinning_offset and
           core_pinning_stride are read, nothing is written
   hwinfo  detected hardware; nthreads_hw_avail is read and the struct is
           forwarded to get_thread_affinity_layout()
 */
void
gmx_set_thread_affinity(FILE                *fplog,
                        const t_commrec     *cr,
                        const gmx_hw_opt_t  *hw_opt,
                        const gmx_hw_info_t *hwinfo)
{
    int        nth_affinity_set, thread0_id_node,
               nthread_local, nthread_node;
    int        offset;
    int *      localityOrder = nullptr;
    int        rc;

    if (hw_opt->thread_affinity == threadaffOFF)
    {
        /* Nothing to do */
        return;
    }

    /* If the tMPI thread affinity setting is not supported encourage the user
     * to report it as it's either a bug or an exotic platform which we might
     * want to support. */
    if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
    {
        /* we know Mac OS & BlueGene do not support setting thread affinity, so there's
           no point in warning the user in that case. In any other case
           the user might be able to do something about it. */
#if !defined(__APPLE__) && !defined(__bg__)
        md_print_warn(cr, fplog,
                      "NOTE: Cannot set thread affinities on the current platform.\n");
#endif  /* !__APPLE__ && !__bg__ */
        return;
    }

    /* threads on this MPI process or TMPI thread:
     * ranks with the PP duty use the nonbonded thread count,
     * all other ranks use the PME thread count. */
    if (cr->duty & DUTY_PP)
    {
        nthread_local = gmx_omp_nthreads_get(emntNonbonded);
    }
    else
    {
        nthread_local = gmx_omp_nthreads_get(emntPME);
    }

    /* map the current process to cores */
    /* Defaults for the single-rank / non-MPI case: this rank's threads start
     * at index 0 and are the only threads counted. */
    thread0_id_node = 0;
    nthread_node    = nthread_local;
#if GMX_MPI
    if (PAR(cr) || MULTISIM(cr))
    {
        /* We need to determine a scan of the thread counts in this
         * compute node.
         */
        MPI_Comm comm_intra;

        /* Group the ranks by gmx_physicalnode_id_hash() and order them
         * within each group by cr->rank_intranode. */
        MPI_Comm_split(MPI_COMM_WORLD,
                       gmx_physicalnode_id_hash(), cr->rank_intranode,
                       &comm_intra);
        MPI_Scan(&nthread_local, &thread0_id_node, 1, MPI_INT, MPI_SUM, comm_intra);
        /* MPI_Scan is inclusive, but here we need exclusive */
        thread0_id_node -= nthread_local;
        /* Get the total number of threads on this physical node */
        MPI_Allreduce(&nthread_local, &nthread_node, 1, MPI_INT, MPI_SUM, comm_intra);
        MPI_Comm_free(&comm_intra);
    }
#endif

    /* In auto mode we only pin when the thread count matches
     * nthreads_hw_avail exactly. The note is printed only when more than one
     * but fewer than nthreads_hw_avail threads are used; in the other
     * mismatching cases we return silently. */
    if (hw_opt->thread_affinity == threadaffAUTO &&
        nthread_node != hwinfo->nthreads_hw_avail)
    {
        if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail)
        {
            md_print_warn(cr, fplog,
                          "NOTE: The number of threads is not equal to the number of (logical) cores\n"
                          "      and the -pin option is set to auto: will not pin thread to cores.\n"
                          "      This can lead to significant performance degradation.\n"
                          "      Consider using -pin on (and -pinoffset in case you run multiple jobs).\n");
        }

        return;
    }

    offset = 0;
    if (hw_opt->core_pinning_offset != 0)
    {
        offset = hw_opt->core_pinning_offset;
        md_print_info(cr, fplog, "Applying core pinning offset %d\n", offset);
    }

    /* hw_opt is const, so the stride is copied into a local whose address
     * can be handed to get_thread_affinity_layout(). */
    int core_pinning_stride = hw_opt->core_pinning_stride;
    rc = get_thread_affinity_layout(fplog, cr, hwinfo,
                                    nthread_node,
                                    offset, &core_pinning_stride,
                                    &localityOrder);
    /* NOTE(review): presumably releases localityOrder with sfree() on every
     * exit path below - confirm against the scoped_guard_sfree definition. */
    gmx::scoped_guard_sfree localityOrderGuard(localityOrder);

    if (rc != 0)
    {
        /* Incompatible layout, don't pin, warning was already issued */
        return;
    }

    /* Set the per-thread affinity. In order to be able to check the success
     * of affinity settings, we will set nth_affinity_set to 1 on threads
     * where the affinity setting succeeded and to 0 where it failed.
     * Reducing these 0/1 values over the threads will give the total number
     * of threads on which we succeeded.
     */

    // To avoid warnings from the static analyzer we initialize nth_affinity_set
    // to zero outside the OpenMP block, and then add to it inside the block.
    // The value will still always be 0 or 1 from each thread.
    nth_affinity_set = 0;
#pragma omp parallel num_threads(nthread_local) reduction(+:nth_affinity_set)
    {
        try
        {
            /* All per-thread state is declared inside the region so that
             * every thread works on its own copy. */
            int      thread_id, thread_id_node;
            int      index, core;
            gmx_bool setaffinity_ret;

            thread_id      = gmx_omp_get_thread_num();
            /* index of this thread among all threads counted in nthread_node */
            thread_id_node = thread0_id_node + thread_id;
            index          = offset + thread_id_node*core_pinning_stride;
            /* With a locality-order table the index is translated through
             * it; without one the index is used as the core directly. */
            if (localityOrder != nullptr)
            {
                core = localityOrder[index];
            }
            else
            {
                core = index;
            }

            setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);

            /* store the per-thread success-values of the setaffinity
             * (a return value of 0 is counted as success) */
            nth_affinity_set += (setaffinity_ret == 0);

            if (debug)
            {
                fprintf(debug, "On rank %2d, thread %2d, index %2d, core %2d the affinity setting returned %d\n",
                        cr->nodeid, gmx_omp_get_thread_num(), index, core, setaffinity_ret);
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    /* Each thread contributes at most 1, so a sum above nthread_local
     * indicates an internal inconsistency. */
    if (nth_affinity_set > nthread_local)
    {
        char msg[STRLEN];

        sprintf(msg, "Looks like we have set affinity for more threads than "
                "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
        gmx_incons(msg);
    }
    else
    {
        /* check & warn if some threads failed to set their affinities */
        const bool allAffinitiesSet = (nth_affinity_set == nthread_local);
        if (!allAffinitiesSet)
        {
            char sbuf1[STRLEN], sbuf2[STRLEN];

            /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
            sbuf1[0] = sbuf2[0] = '\0';
            /* Only add rank info if we have more than one rank. */
            if (cr->nnodes > 1)
            {
#if GMX_MPI
#if GMX_THREAD_MPI
                sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid);
#else           /* GMX_LIB_MPI */
                sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
#endif
#endif          /* GMX_MPI */
            }

            /* The plural selection below always yields "s" because this
             * branch is only taken for nthread_local > 1. */
            if (nthread_local > 1)
            {
                sprintf(sbuf2, "for %d/%d thread%s ",
                        nthread_local - nth_affinity_set, nthread_local,
                        nthread_local > 1 ? "s" : "");
            }

            /* This per-rank note goes to stderr, not to fplog. */
            fprintf(stderr, "NOTE: %sAffinity setting %sfailed.\n", sbuf1, sbuf2);
        }
        /* Called on this path whether or not this rank had a failure.
         * NOTE(review): presumably invalidWithinSimulation() combines the
         * per-rank failure flag over the ranks of the simulation - confirm. */
        if (invalidWithinSimulation(cr, !allAffinitiesSet))
        {
            md_print_warn(cr, fplog,
                          "NOTE: Thread affinity setting failed. This can cause performance degradation.\n"
                          "      If you think your settings are correct, ask on the gmx-users list.\n");
        }
    }
}
/* Example #2
   0 */
/* Set CPU affinity. Can be important for performance.
   On some systems (e.g. Cray) CPU Affinity is set by default.
   But default assigning doesn't work (well) with only some ranks
   having threads. This causes very low performance.
   External tools have cumbersome syntax for setting affinity
   in the case that only some ranks have threads.
   Thus it is important that GROMACS sets the affinity internally
   if only PME is using threads.

   The function returns without pinning when pinning is switched off,
   when tMPI reports that setting affinity is unsupported, when "auto"
   mode is selected and the thread count of the intra-node communicator
   differs from hwinfo->nthreads_hw_avail, or when
   get_thread_affinity_layout() returns non-zero. Otherwise every OpenMP
   thread of this rank is pinned to
   offset + (node-wide thread index)*stride, translated through the
   locality-order table when one was returned.

   fplog         log file handle, forwarded to the md_print_* helpers and
                 to get_thread_affinity_layout()
   cr            communication record; duty, rank_intranode, nodeid and
                 nnodes are read here
   hw_opt        hardware options; thread_affinity and core_pinning_offset
                 are read, core_pinning_stride is passed by address to
                 get_thread_affinity_layout() and read back afterwards
   nthreads_pme  not used by this function body
   hwinfo        detected hardware; nthreads_hw_avail is read and the
                 struct is forwarded to get_thread_affinity_layout()
   inputrec      not used by this function body
 */
void
gmx_set_thread_affinity(FILE                *fplog,
                        const t_commrec     *cr,
                        gmx_hw_opt_t        *hw_opt,
                        int                  nthreads_pme,
                        const gmx_hw_info_t *hwinfo,
                        const t_inputrec    *inputrec)
{
    int        nth_affinity_set, thread_id_node,
               nthread_local, nthread_node;
    int        offset;
    /* Start from NULL so that the identity mapping is used below should the
     * layout helper leave the pointer untouched. */
    const int *locality_order = NULL;
    int        rc;

    if (hw_opt->thread_affinity == threadaffOFF)
    {
        /* Nothing to do */
        return;
    }

    /* If the tMPI thread affinity setting is not supported encourage the user
     * to report it as it's either a bug or an exotic platform which we might
     * want to support. */
    if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
    {
        /* we know Mac OS doesn't support setting thread affinity, so there's
           no point in warning the user in that case. In any other case
           the user might be able to do something about it. */
#ifndef __APPLE__
        md_print_warn(NULL, fplog,
                      "Can not set thread affinities on the current platform. On NUMA systems this\n"
                      "can cause performance degradation. If you think your platform should support\n"
                      "setting affinities, contact the GROMACS developers.");
#endif  /* __APPLE__ */
        return;
    }

    /* threads on this MPI process or TMPI thread:
     * ranks with the PP duty use the nonbonded thread count,
     * all other ranks use the PME thread count. */
    if (cr->duty & DUTY_PP)
    {
        nthread_local = gmx_omp_nthreads_get(emntNonbonded);
    }
    else
    {
        nthread_local = gmx_omp_nthreads_get(emntPME);
    }

    /* map the current process to cores */
    /* Defaults for the single-rank / non-MPI case: this rank's threads start
     * at index 0 and are the only threads counted. */
    thread_id_node = 0;
    nthread_node   = nthread_local;
#ifdef GMX_MPI
    if (PAR(cr) || MULTISIM(cr))
    {
        /* We need to determine a scan of the thread counts in this
         * compute node.
         */
        MPI_Comm comm_intra;

        /* Group the ranks by gmx_hostname_num() and order them within each
         * group by cr->rank_intranode. */
        MPI_Comm_split(MPI_COMM_WORLD, gmx_hostname_num(), cr->rank_intranode,
                       &comm_intra);
        MPI_Scan(&nthread_local, &thread_id_node, 1, MPI_INT, MPI_SUM, comm_intra);
        /* MPI_Scan is inclusive, but here we need exclusive */
        thread_id_node -= nthread_local;
        /* Get the total number of threads on this physical node */
        MPI_Allreduce(&nthread_local, &nthread_node, 1, MPI_INT, MPI_SUM, comm_intra);
        MPI_Comm_free(&comm_intra);
    }
#endif

    /* In auto mode we only pin when the thread count matches
     * nthreads_hw_avail exactly. The note is printed only when more than one
     * but fewer than nthreads_hw_avail threads are used. */
    if (hw_opt->thread_affinity == threadaffAUTO &&
        nthread_node != hwinfo->nthreads_hw_avail)
    {
        if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail)
        {
            md_print_warn(cr, fplog,
                          "NOTE: The number of threads is not equal to the number of (logical) cores\n"
                          "      and the -pin option is set to auto: will not pin thread to cores.\n"
                          "      This can lead to significant performance degradation.\n"
                          "      Consider using -pin on (and -pinoffset in case you run multiple jobs).\n");
        }

        return;
    }

    offset = 0;
    if (hw_opt->core_pinning_offset != 0)
    {
        offset = hw_opt->core_pinning_offset;
        md_print_info(cr, fplog, "Applying core pinning offset %d\n", offset);
    }

    rc = get_thread_affinity_layout(fplog, cr, hwinfo,
                                    nthread_node,
                                    offset, &hw_opt->core_pinning_stride,
                                    &locality_order);

    if (rc != 0)
    {
        /* Incompatible layout, don't pin, warning was already issued */
        return;
    }

    /* Set the per-thread affinity. In order to be able to check the success
     * of affinity settings, every thread adds 1 to its private copy of
     * nth_affinity_set when the affinity setting succeeded and 0 when it
     * failed. Reducing these 0/1 values over the threads gives the total
     * number of threads on which we succeeded.
     */
    nth_affinity_set = 0;
#pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
    reduction(+:nth_affinity_set)
    {
        /* thread_id must be declared inside the parallel region: a variable
         * declared at function scope would be shared between the threads,
         * and one thread could then read the id written by another and pin
         * itself to the wrong core. */
        int      thread_id, index, core;
        gmx_bool setaffinity_ret;

        thread_id       = gmx_omp_get_thread_num();
        /* thread_id_node is firstprivate: each thread starts from the index
         * of this rank's first thread and adds its own id. */
        thread_id_node += thread_id;
        index           = offset + thread_id_node*hw_opt->core_pinning_stride;
        if (locality_order != NULL)
        {
            core = locality_order[index];
        }
        else
        {
            core = index;
        }

        setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);

        /* store the per-thread success-values of the setaffinity
         * (a return value of 0 is counted as success) */
        nth_affinity_set += (setaffinity_ret == 0);

        if (debug)
        {
            fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
                    cr->nodeid, thread_id, core, setaffinity_ret);
        }
    }

    /* Each thread contributes at most 1, so a sum above nthread_local
     * indicates an internal inconsistency. */
    if (nth_affinity_set > nthread_local)
    {
        char msg[STRLEN];

        sprintf(msg, "Looks like we have set affinity for more threads than "
                "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
        gmx_incons(msg);
    }
    else
    {
        /* check & warn if some threads failed to set their affinities */
        if (nth_affinity_set != nthread_local)
        {
            char sbuf1[STRLEN], sbuf2[STRLEN];

            /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
            sbuf1[0] = sbuf2[0] = '\0';
            /* Only add rank info if we have more than one rank. */
            if (cr->nnodes > 1)
            {
#ifdef GMX_MPI
#ifdef GMX_THREAD_MPI
                sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid);
#else           /* GMX_LIB_MPI */
                sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
#endif
#endif          /* GMX_MPI */
            }

            if (nthread_local > 1)
            {
                sprintf(sbuf2, "for %d/%d thread%s ",
                        nthread_local - nth_affinity_set, nthread_local,
                        nthread_local > 1 ? "s" : "");
            }

            md_print_warn(NULL, fplog,
                          "WARNING: %sAffinity setting %sfailed.\n"
                          "         This can cause performance degradation! If you think your setting are\n"
                          "         correct, contact the GROMACS developers.",
                          sbuf1, sbuf2);
        }
    }
}
/* Example #3
   0 */
/* Set up the global tMPI state and start the threads.
 *
 * Does nothing when N <= 0. Otherwise allocates the global data, the
 * per-thread records, TMPI_COMM_WORLD and TMPI_GROUP_EMPTY, fills in one
 * record per thread and creates threads 1..N-1 with tMPI_Thread_starter as
 * their entry point. The calling thread becomes thread 0.
 *
 * main_returns   when zero, the calling thread also runs tMPI_Thread_starter
 *                on record 0; otherwise it only runs tMPI_Thread_init on
 *                record 0
 * N              number of threads, including the calling thread
 * aff_strategy   with TMPI_AFFINITY_ALL_CORES, and only if
 *                tMPI_Thread_get_hw_number() is larger than 1 and equal to N,
 *                thread i is pinned to index i
 * argc, argv     when both are non-NULL, every thread record gets its own
 *                strdup'ed copy of the *argc strings in *argv; otherwise the
 *                records get argc = 0 and argv = NULL
 * start_fn       stored in every thread record
 * start_arg      stored in every thread record
 * start_fn_main  stored in every thread record
 */
void tMPI_Start_threads(tmpi_bool main_returns, int N,
                        tMPI_Affinity_strategy aff_strategy,
                        int *argc, char ***argv,
                        void (*start_fn)(void*), void *start_arg,
                        int (*start_fn_main)(int, char**))
{
#ifdef TMPI_TRACE
    /* One conversion per argument: three integers followed by five
     * pointers. The pointers are cast to void* as required for %p. */
    tMPI_Trace_print("tMPI_Start_threads(%d, %d, %d, %p, %p, %p, %p, %p)",
                     (int)main_returns, N, (int)aff_strategy,
                     (void*)argc, (void*)argv, (void*)start_fn,
                     start_arg, (void*)start_fn_main);
#endif
    if (N > 0)
    {
        int i;
        int set_affinity = FALSE;

        tmpi_finalized = FALSE;
        Nthreads       = N;

        /* allocate global data */
        tmpi_global = (struct tmpi_global*)
            tMPI_Malloc(sizeof(struct tmpi_global));
        tMPI_Global_init(tmpi_global, N);

        /* allocate world and thread data */
        threads          = (struct tmpi_thread*)tMPI_Malloc(sizeof(struct tmpi_thread)*N);
        TMPI_COMM_WORLD  = tMPI_Comm_alloc(NULL, N);
        TMPI_GROUP_EMPTY = tMPI_Group_alloc();

        if (tMPI_Thread_key_create(&id_key, NULL))
        {
            tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
        }
        for (i = 0; i < N; i++)
        {
            TMPI_COMM_WORLD->grp.peers[i] = &(threads[i]);

            /* copy argc, argv */
            if (argc && argv)
            {
                int j;
                threads[i].argc = *argc;
                threads[i].argv = (char**)tMPI_Malloc(threads[i].argc*
                                                      sizeof(char*));
                for (j = 0; j < threads[i].argc; j++)
                {
#if !(defined( _WIN32 ) || defined( _WIN64 ) )
                    threads[i].argv[j] = strdup( (*argv)[j] );
#else
                    threads[i].argv[j] = _strdup( (*argv)[j] );
#endif
                }
            }
            else
            {
                threads[i].argc = 0;
                threads[i].argv = NULL;
            }
            threads[i].start_fn      = start_fn;
            threads[i].start_fn_main = start_fn_main;
            threads[i].start_arg     = start_arg;
        }

        /* now check whether to set affinity: only when there is more than
           one hardware thread and exactly one tMPI thread per hardware
           thread */
        if (aff_strategy == TMPI_AFFINITY_ALL_CORES)
        {
            int nhw = tMPI_Thread_get_hw_number();
            if ((nhw > 1) && (nhw == N))
            {
                set_affinity = TRUE;
            }
        }

        /* set thread 0's properties */
        threads[0].thread_id = tMPI_Thread_self();
        if (set_affinity)
        {
            /* set the main thread's affinity */
            tMPI_Thread_setaffinity_single(threads[0].thread_id, 0);
        }

        for (i = 1; i < N; i++) /* zero is the main thread */
        {
            int ret;
            ret = tMPI_Thread_create(&(threads[i].thread_id),
                                     tMPI_Thread_starter,
                                     (void*)&(threads[i]) );

            /* Check the creation result first: after a failed create
               threads[i].thread_id does not refer to a running thread and
               must not be handed to the affinity call. */
            if (ret)
            {
                tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
            }
            else if (set_affinity)
            {
                tMPI_Thread_setaffinity_single(threads[i].thread_id, i);
            }
        }
        /* the main thread also runs start_fn if we don't want
           it to return */
        if (!main_returns)
        {
            tMPI_Thread_starter((void*)&(threads[0]));
        }
        else
        {
            tMPI_Thread_init(&(threads[0]));
        }
    }
}