/* Set CPU affinity. Can be important for performance.
   On some systems (e.g. Cray) CPU Affinity is set by default.
   But default assigning doesn't work (well) with only some ranks
   having threads. This causes very low performance.
   External tools have cumbersome syntax for setting affinity
   in the case that only some ranks have threads.
   Thus it is important that GROMACS sets the affinity internally
   if only PME is using threads.
*/
void gmx_set_thread_affinity(FILE *fplog, const t_commrec *cr, const gmx_hw_opt_t *hw_opt, const gmx_hw_info_t *hwinfo)
{
    int  nth_affinity_set, thread0_id_node, nthread_local, nthread_node;
    int  offset;
    int *localityOrder = nullptr;
    int  rc;

    if (hw_opt->thread_affinity == threadaffOFF)
    {
        /* Nothing to do */
        return;
    }

    /* If the tMPI thread affinity setting is not supported encourage the user
     * to report it as it's either a bug or an exotic platform which we might
     * want to support. */
    if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
    {
        /* we know Mac OS & BlueGene do not support setting thread affinity, so there's
           no point in warning the user in that case. In any other case
           the user might be able to do something about it. */
#if !defined(__APPLE__) && !defined(__bg__)
        md_print_warn(cr, fplog,
                      "NOTE: Cannot set thread affinities on the current platform.\n");
#endif  /* !__APPLE__ && !__bg__ */
        return;
    }

    /* threads on this MPI process or TMPI thread */
    if (cr->duty & DUTY_PP)
    {
        nthread_local = gmx_omp_nthreads_get(emntNonbonded);
    }
    else
    {
        nthread_local = gmx_omp_nthreads_get(emntPME);
    }

    /* map the current process to cores */
    thread0_id_node = 0;
    nthread_node    = nthread_local;
#if GMX_MPI
    if (PAR(cr) || MULTISIM(cr))
    {
        /* We need to determine a scan of the thread counts in this
         * compute node. */
        MPI_Comm comm_intra;

        /* Split off a communicator containing only the ranks that share
         * this physical node; rank_intranode preserves the intra-node order. */
        MPI_Comm_split(MPI_COMM_WORLD,
                       gmx_physicalnode_id_hash(), cr->rank_intranode,
                       &comm_intra);
        MPI_Scan(&nthread_local, &thread0_id_node, 1, MPI_INT, MPI_SUM, comm_intra);
        /* MPI_Scan is inclusive, but here we need exclusive */
        thread0_id_node -= nthread_local;
        /* Get the total number of threads on this physical node */
        MPI_Allreduce(&nthread_local, &nthread_node, 1, MPI_INT, MPI_SUM, comm_intra);
        MPI_Comm_free(&comm_intra);
    }
#endif

    /* With -pin auto we only pin when all logical cores on the node are
     * going to be used; otherwise pinning would likely hurt more than help. */
    if (hw_opt->thread_affinity == threadaffAUTO &&
        nthread_node != hwinfo->nthreads_hw_avail)
    {
        if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail)
        {
            md_print_warn(cr, fplog,
                          "NOTE: The number of threads is not equal to the number of (logical) cores\n"
                          " and the -pin option is set to auto: will not pin thread to cores.\n"
                          " This can lead to significant performance degradation.\n"
                          " Consider using -pin on (and -pinoffset in case you run multiple jobs).\n");
        }

        return;
    }

    offset = 0;
    if (hw_opt->core_pinning_offset != 0)
    {
        offset = hw_opt->core_pinning_offset;
        md_print_info(cr, fplog, "Applying core pinning offset %d\n", offset);
    }

    int core_pinning_stride = hw_opt->core_pinning_stride;
    rc = get_thread_affinity_layout(fplog, cr, hwinfo, nthread_node,
                                    offset, &core_pinning_stride,
                                    &localityOrder);
    /* localityOrder was allocated by get_thread_affinity_layout();
     * the guard frees it on every exit path from here on. */
    gmx::scoped_guard_sfree localityOrderGuard(localityOrder);

    if (rc != 0)
    {
        /* Incompatible layout, don't pin, warning was already issued */
        return;
    }

    /* Set the per-thread affinity. In order to be able to check the success
     * of affinity settings, we will set nth_affinity_set to 1 on threads
     * where the affinity setting succeded and to 0 where it failed.
     * Reducing these 0/1 values over the threads will give the total number
     * of threads on which we succeeded.
     */

    // To avoid warnings from the static analyzer we initialize nth_affinity_set
    // to zero outside the OpenMP block, and then add to it inside the block.
    // The value will still always be 0 or 1 from each thread.
    nth_affinity_set = 0;
#pragma omp parallel num_threads(nthread_local) reduction(+:nth_affinity_set)
    {
        /* Exceptions must not escape an OpenMP region, hence the try +
         * catch-all macro around the whole body. */
        try
        {
            int      thread_id, thread_id_node;
            int      index, core;
            gmx_bool setaffinity_ret;

            thread_id      = gmx_omp_get_thread_num();
            /* thread0_id_node is this rank's first thread slot on the node */
            thread_id_node = thread0_id_node + thread_id;
            index          = offset + thread_id_node*core_pinning_stride;
            if (localityOrder != nullptr)
            {
                core = localityOrder[index];
            }
            else
            {
                core = index;
            }

            setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);

            /* store the per-thread success-values of the setaffinity */
            nth_affinity_set += (setaffinity_ret == 0);

            if (debug)
            {
                fprintf(debug, "On rank %2d, thread %2d, index %2d, core %2d the affinity setting returned %d\n",
                        cr->nodeid, gmx_omp_get_thread_num(), index, core, setaffinity_ret);
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }

    if (nth_affinity_set > nthread_local)
    {
        char msg[STRLEN];

        sprintf(msg, "Looks like we have set affinity for more threads than "
                "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
        gmx_incons(msg);
    }
    else
    {
        /* check & warn if some threads failed to set their affinities */
        const bool allAffinitiesSet = (nth_affinity_set == nthread_local);
        if (!allAffinitiesSet)
        {
            char sbuf1[STRLEN], sbuf2[STRLEN];

            /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
            sbuf1[0] = sbuf2[0] = '\0';
            /* Only add rank info if we have more than one rank. */
            if (cr->nnodes > 1)
            {
#if GMX_MPI
#if GMX_THREAD_MPI
                sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid);
#else           /* GMX_LIB_MPI */
                sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
#endif
#endif          /* GMX_MPI */
            }

            if (nthread_local > 1)
            {
                sprintf(sbuf2, "for %d/%d thread%s ",
                        nthread_local - nth_affinity_set,
                        nthread_local, nthread_local > 1 ? "s" : "");
            }

            fprintf(stderr, "NOTE: %sAffinity setting %sfailed.\n", sbuf1, sbuf2);
        }
        if (invalidWithinSimulation(cr, !allAffinitiesSet))
        {
            md_print_warn(cr, fplog,
                          "NOTE: Thread affinity setting failed. This can cause performance degradation.\n"
                          " If you think your settings are correct, ask on the gmx-users list.\n");
        }
    }
}
/* Set CPU affinity. Can be important for performance. On some systems (e.g. Cray) CPU Affinity is set by default. But default assigning doesn't work (well) with only some ranks having threads. This causes very low performance. External tools have cumbersome syntax for setting affinity in the case that only some ranks have threads. Thus it is important that GROMACS sets the affinity internally if only PME is using threads. */ void gmx_set_thread_affinity(FILE *fplog, const t_commrec *cr, gmx_hw_opt_t *hw_opt, int nthreads_pme, const gmx_hw_info_t *hwinfo, const t_inputrec *inputrec) { int nth_affinity_set, thread_id_node, thread_id, nthread_local, nthread_node, nthread_hw_max, nphyscore; int offset; const int *locality_order; int rc; if (hw_opt->thread_affinity == threadaffOFF) { /* Nothing to do */ return; } /* If the tMPI thread affinity setting is not supported encourage the user * to report it as it's either a bug or an exotic platform which we might * want to support. */ if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES) { /* we know Mac OS doesn't support setting thread affinity, so there's no point in warning the user in that case. In any other case the user might be able to do something about it. */ #ifndef __APPLE__ md_print_warn(NULL, fplog, "Can not set thread affinities on the current platform. On NUMA systems this\n" "can cause performance degradation. If you think your platform should support\n" "setting affinities, contact the GROMACS developers."); #endif /* __APPLE__ */ return; } /* threads on this MPI process or TMPI thread */ if (cr->duty & DUTY_PP) { nthread_local = gmx_omp_nthreads_get(emntNonbonded); } else { nthread_local = gmx_omp_nthreads_get(emntPME); } /* map the current process to cores */ thread_id_node = 0; nthread_node = nthread_local; #ifdef GMX_MPI if (PAR(cr) || MULTISIM(cr)) { /* We need to determine a scan of the thread counts in this * compute node. 
*/ MPI_Comm comm_intra; MPI_Comm_split(MPI_COMM_WORLD, gmx_hostname_num(), cr->rank_intranode, &comm_intra); MPI_Scan(&nthread_local, &thread_id_node, 1, MPI_INT, MPI_SUM, comm_intra); /* MPI_Scan is inclusive, but here we need exclusive */ thread_id_node -= nthread_local; /* Get the total number of threads on this physical node */ MPI_Allreduce(&nthread_local, &nthread_node, 1, MPI_INT, MPI_SUM, comm_intra); MPI_Comm_free(&comm_intra); } #endif if (hw_opt->thread_affinity == threadaffAUTO && nthread_node != hwinfo->nthreads_hw_avail) { if (nthread_node > 1 && nthread_node < hwinfo->nthreads_hw_avail) { md_print_warn(cr, fplog, "NOTE: The number of threads is not equal to the number of (logical) cores\n" " and the -pin option is set to auto: will not pin thread to cores.\n" " This can lead to significant performance degradation.\n" " Consider using -pin on (and -pinoffset in case you run multiple jobs).\n"); } return; } offset = 0; if (hw_opt->core_pinning_offset != 0) { offset = hw_opt->core_pinning_offset; md_print_info(cr, fplog, "Applying core pinning offset %d\n", offset); } rc = get_thread_affinity_layout(fplog, cr, hwinfo, nthread_node, offset, &hw_opt->core_pinning_stride, &locality_order); if (rc != 0) { /* Incompatible layout, don't pin, warning was already issued */ return; } /* Set the per-thread affinity. In order to be able to check the success * of affinity settings, we will set nth_affinity_set to 1 on threads * where the affinity setting succeded and to 0 where it failed. * Reducing these 0/1 values over the threads will give the total number * of threads on which we succeeded. 
*/ nth_affinity_set = 0; #pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \ reduction(+:nth_affinity_set) { int index, core; gmx_bool setaffinity_ret; thread_id = gmx_omp_get_thread_num(); thread_id_node += thread_id; index = offset + thread_id_node*hw_opt->core_pinning_stride; if (locality_order != NULL) { core = locality_order[index]; } else { core = index; } setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core); /* store the per-thread success-values of the setaffinity */ nth_affinity_set = (setaffinity_ret == 0); if (debug) { fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n", cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret); } } if (nth_affinity_set > nthread_local) { char msg[STRLEN]; sprintf(msg, "Looks like we have set affinity for more threads than " "we have (%d > %d)!\n", nth_affinity_set, nthread_local); gmx_incons(msg); } else { /* check & warn if some threads failed to set their affinities */ if (nth_affinity_set != nthread_local) { char sbuf1[STRLEN], sbuf2[STRLEN]; /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */ sbuf1[0] = sbuf2[0] = '\0'; /* Only add rank info if we have more than one rank. */ if (cr->nnodes > 1) { #ifdef GMX_MPI #ifdef GMX_THREAD_MPI sprintf(sbuf1, "In tMPI thread #%d: ", cr->nodeid); #else /* GMX_LIB_MPI */ sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid); #endif #endif /* GMX_MPI */ } if (nthread_local > 1) { sprintf(sbuf2, "for %d/%d thread%s ", nthread_local - nth_affinity_set, nthread_local, nthread_local > 1 ? "s" : ""); } md_print_warn(NULL, fplog, "WARNING: %sAffinity setting %sfailed.\n" " This can cause performance degradation! If you think your setting are\n" " correct, contact the GROMACS developers.", sbuf1, sbuf2); } } return; }
/* Start the N threads of the thread-MPI environment.
 *
 * main_returns:  if true, the calling (main) thread returns to the caller
 *                instead of running start_fn itself.
 * N:             number of threads (including the main thread).
 * aff_strategy:  affinity policy; TMPI_AFFINITY_ALL_CORES pins thread i to
 *                core i when N equals the hardware thread count.
 * argc/argv:     optional program arguments; deep-copied per thread.
 * start_fn/start_arg, start_fn_main: per-thread entry points stored in the
 *                global 'threads' array and run by tMPI_Thread_starter.
 */
void tMPI_Start_threads(tmpi_bool main_returns, int N,
                        tMPI_Affinity_strategy aff_strategy,
                        int *argc, char ***argv,
                        void (*start_fn)(void*), void *start_arg,
                        int (*start_fn_main)(int, char**))
{
#ifdef TMPI_TRACE
    /* The format previously had nine conversions for seven arguments
       (undefined behavior) and printed pointers with %d; it now matches
       the argument list exactly and uses %p for pointers. */
    tMPI_Trace_print("tMPI_Start_threads(%d, %d, %d, %p, %p, %p, %p)",
                     main_returns, N, aff_strategy, (void*)argc, (void*)argv,
                     (void*)start_fn, start_arg);
#endif
    if (N > 0)
    {
        int i;
        int set_affinity = FALSE;

        tmpi_finalized = FALSE;
        Nthreads       = N;

        /* allocate global data */
        tmpi_global = (struct tmpi_global*)
            tMPI_Malloc(sizeof(struct tmpi_global));
        tMPI_Global_init(tmpi_global, N);

        /* allocate world and thread data */
        threads          = (struct tmpi_thread*)
            tMPI_Malloc(sizeof(struct tmpi_thread)*N);
        TMPI_COMM_WORLD  = tMPI_Comm_alloc(NULL, N);
        TMPI_GROUP_EMPTY = tMPI_Group_alloc();

        if (tMPI_Thread_key_create(&id_key, NULL))
        {
            tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
        }
        for (i = 0; i < N; i++)
        {
            TMPI_COMM_WORLD->grp.peers[i] = &(threads[i]);

            /* copy argc, argv so each thread owns its own argument vector */
            if (argc && argv)
            {
                int j;
                threads[i].argc = *argc;
                threads[i].argv = (char**)tMPI_Malloc(threads[i].argc*
                                                      sizeof(char*));
                for (j = 0; j < threads[i].argc; j++)
                {
#if !(defined( _WIN32 ) || defined( _WIN64 ) )
                    threads[i].argv[j] = strdup( (*argv)[j] );
#else
                    threads[i].argv[j] = _strdup( (*argv)[j] );
#endif
                }
            }
            else
            {
                threads[i].argc = 0;
                threads[i].argv = NULL;
            }
            threads[i].start_fn      = start_fn;
            threads[i].start_fn_main = start_fn_main;
            threads[i].start_arg     = start_arg;
        }

        /* now check whether to set affinity: only pin when there is more
           than one hardware thread and we start exactly one thread per
           hardware thread. */
        if (aff_strategy == TMPI_AFFINITY_ALL_CORES)
        {
            int nhw = tMPI_Thread_get_hw_number();
            if ((nhw > 1) && (nhw == N))
            {
                set_affinity = TRUE;
            }
        }

        /* set thread 0's properties */
        threads[0].thread_id = tMPI_Thread_self();
        if (set_affinity)
        {
            /* set the main thread's affinity */
            tMPI_Thread_setaffinity_single(threads[0].thread_id, 0);
        }

        for (i = 1; i < N; i++) /* zero is the main thread */
        {
            int ret;

            ret = tMPI_Thread_create(&(threads[i].thread_id),
                                     tMPI_Thread_starter,
                                     (void*)&(threads[i]) );
            if (ret)
            {
                tMPI_Error(TMPI_COMM_WORLD, TMPI_ERR_INIT);
            }
            else if (set_affinity)
            {
                /* Only pin a thread that was actually created: previously
                   affinity was set before checking tMPI_Thread_create()'s
                   return value, so an indeterminate thread id could be
                   used when creation failed. */
                tMPI_Thread_setaffinity_single(threads[i].thread_id, i);
            }
        }
        /* the main thread also runs start_fn if we don't want it to return */
        if (!main_returns)
        {
            tMPI_Thread_starter((void*)&(threads[0]));
        }
        else
        {
            tMPI_Thread_init(&(threads[0]));
        }
    }
}