void chpl_task_init(void) { int32_t numCoresPerLocale; int32_t numThreadsPerLocale; size_t callStackSize; pthread_t initer; char newenv_sheps[100] = { 0 }; char newenv_stack[100] = { 0 }; // Set up available hardware parallelism numThreadsPerLocale = chpl_task_getenvNumThreadsPerLocale(); numCoresPerLocale = chpl_numCoresOnThisLocale(); if (numThreadsPerLocale == 0) numThreadsPerLocale = chpl_comm_getMaxThreads(); if (0 < numThreadsPerLocale) { // We are assuming the user wants to constrain the hardware // resources used during this run of the application. // Unset relevant Qthreads environment variables qt_internal_unset_envstr("HWPAR"); qt_internal_unset_envstr("NUM_SHEPHERDS"); qt_internal_unset_envstr("NUM_WORKERS_PER_SHEPHERD"); if (numCoresPerLocale < numThreadsPerLocale) { if (2 == verbosity) { printf("QTHREADS: Ignored --numThreadsPerLocale=%d to prevent oversubsription of the system.\n", numThreadsPerLocale); } // Do not oversubscribe the system, use all available resources. numThreadsPerLocale = numCoresPerLocale; } // Set environment variable for Qthreads snprintf(newenv_sheps, 99, "%i", (int)numThreadsPerLocale); setenv("QT_HWPAR", newenv_sheps, 1); } else if (qt_internal_get_env_str("HWPAR", NULL) || qt_internal_get_env_str("NUM_SHEPHERDS", NULL) || qt_internal_get_env_str("NUM_WORKERS_PER_SHEPHERD", NULL)) { // Assume the user wants has manually set the desired Qthreads // environment variables. } else { // Default to using all hardware resources. numThreadsPerLocale = numCoresPerLocale; snprintf(newenv_sheps, 99, "%i", (int)numThreadsPerLocale); setenv("QT_HWPAR", newenv_sheps, 1); } // Precendence (high-to-low): // 1) Chapel minimum // 2) QTHREAD_STACK_SIZE // In practice we never get to #2, because the Chapel minimum is // always > 0, but we cover that case as a backstop. callStackSize = chpl_task_getMinCallStackSize(); if (callStackSize <= 0) callStackSize = 1024 * 1024 * sizeof(size_t); snprintf(newenv_stack, 99, "%zu", callStackSize); setenv("QT_STACK_SIZE", newenv_stack, 1); // Turn on informative Qthreads setting messages with Chapel's verbose flag if (verbosity == 2) { setenv("QT_INFO", "1", 1); } pthread_create(&initer, NULL, initializer, NULL); while (chpl_qthread_done_initializing == 0) SPINLOCK_BODY(); if (blockreport || taskreport) { if (signal(SIGINT, SIGINT_handler) == SIG_ERR) { perror("Could not register SIGINT handler"); } } }
void chpl_thread_init(void(*threadBeginFn)(void*), void(*threadEndFn)(void)) { // // This threading layer does not have any inherent limit on the number // of threads. Its limit is the lesser of any limits imposed by the // comm layer and the user. // { uint32_t lim; if ((lim = chpl_task_getenvNumThreadsPerLocale()) > 0) maxThreads = lim; else if ((lim = chpl_comm_getMaxThreads()) > 0) maxThreads = lim; } // // Count the main thread on locale 0 as already existing, since it // is (or soon will be) running the main program. // if (chpl_nodeID == 0) numThreads = 1; // // If a value was specified for the call stack size config const, use // that (rounded up to a whole number of pages) to set the system and // pthread stack limits. // if (pthread_attr_init(&thread_attributes) != 0) chpl_internal_error("pthread_attr_init() failed"); // // If a value was specified for the call stack size, use that (rounded // up to a whole number of pages) to set the system and pthread stack // limits. This will in turn limit the stack for any task hosted by // either the main process or a pthread. // { size_t css; size_t pagesize = (size_t) sysconf(_SC_PAGESIZE); struct rlimit rlim; if ((css = chpl_task_getEnvCallStackSize()) == 0) css = chpl_task_getDefaultCallStackSize(); assert(css > 0); css = (css + pagesize - 1) & ~(pagesize - 1); if (getrlimit(RLIMIT_STACK, &rlim) != 0) chpl_internal_error("getrlimit() failed"); if (rlim.rlim_max != RLIM_INFINITY && css > rlim.rlim_max) { char warning[128]; sprintf(warning, "call stack size capped at %lu\n", (unsigned long)rlim.rlim_max); chpl_warning(warning, 0, 0); css = rlim.rlim_max; } rlim.rlim_cur = css; #ifndef __CYGWIN__ // // Cygwin can't do setrlimit(RLIMIT_STACK). // if (setrlimit(RLIMIT_STACK, &rlim) != 0) chpl_internal_error("setrlimit() failed"); #endif if (pthread_attr_setstacksize(&thread_attributes, css) != 0) chpl_internal_error("pthread_attr_setstacksize() failed"); } if (pthread_attr_getstacksize(&thread_attributes, &threadCallStackSize) != 0) chpl_internal_error("pthread_attr_getstacksize() failed"); saved_threadBeginFn = threadBeginFn; saved_threadEndFn = threadEndFn; CHPL_TLS_INIT(chpl_thread_id); CHPL_TLS_SET(chpl_thread_id, (intptr_t) --curr_thread_id); CHPL_TLS_INIT(chpl_thread_data); pthread_mutex_init(&thread_info_lock, NULL); pthread_mutex_init(&numThreadsLock, NULL); // // This is something of a hack, but it makes us a bit more resilient // if we're out of memory or near to it at shutdown time. Launch, // cancel, and join with an initial pthread, forcing initialization // needed by any of those activities. (In particular we have found // that cancellation needs to dlopen(3) a shared object, which fails // if we are out of memory. Doing it now means that shared object is // already available when we need it later.) // { pthread_t initial_pthread; if (!pthread_create(&initial_pthread, NULL, initial_pthread_func, NULL)) { (void) pthread_cancel(initial_pthread); (void) pthread_join(initial_pthread, NULL); } } }
void chpl_task_init(void) { chpl_bool we_set_worker_unit = false; int32_t numThreadsPerLocale; int32_t commMaxThreads; int32_t hwpar; size_t callStackSize; pthread_t initer; char newenv_stack[100] = { 0 }; char *noWorkSteal; // Set up available hardware parallelism. // Experience has shown that we hardly ever win by using more than // one PU per core, so default to that. If this was explicitly // set by the user we won't override it, however. if (getenv("QTHREAD_WORKER_UNIT") == NULL) { we_set_worker_unit = (getenv("QT_WORKER_UNIT") == NULL); (void) setenv("QT_WORKER_UNIT", "core", 0); } // Determine the thread count. CHPL_RT_NUM_THREADS_PER_LOCALE has // the highest precedence but we limit it to the number of PUs. // QTHREAD_HWPAR has the next precedence. We don't impose the // same limit on it, so it can be used to overload the hardware. // In either case the number of threads can be no greater than any // maximum imposed by the comm layer. This limit is imposed // silently. numThreadsPerLocale = chpl_task_getenvNumThreadsPerLocale(); commMaxThreads = chpl_comm_getMaxThreads(); hwpar = 0; if (numThreadsPerLocale != 0) { int32_t numPUsPerLocale; hwpar = numThreadsPerLocale; numPUsPerLocale = chpl_numCoresOnThisLocale(); if (0 < numPUsPerLocale && numPUsPerLocale < hwpar) { if (2 == verbosity) { printf("QTHREADS: Reduced numThreadsPerLocale=%d to %d " "to prevent oversubscription of the system.\n", hwpar, numPUsPerLocale); } // Do not oversubscribe the system, use all available resources. hwpar = numPUsPerLocale; } if (0 < commMaxThreads && commMaxThreads < hwpar) { hwpar = commMaxThreads; } } else { if (0 < commMaxThreads) { hwpar = qt_internal_get_env_num("HWPAR", 0, 0); if (commMaxThreads < hwpar) { hwpar = commMaxThreads; } } } if (hwpar > 0) { char newenv[100]; char *sched; // Unset relevant Qthreads environment variables. Currently // QTHREAD_HWPAR has precedence over the QTHREAD_NUM_* ones, // but that isn't documented and may not be true forever, so // we unset them all. qt_internal_unset_envstr("HWPAR"); qt_internal_unset_envstr("NUM_SHEPHERDS"); qt_internal_unset_envstr("NUM_WORKERS_PER_SHEPHERD"); // The current check for scheduler and setting HWPAR or // NUM_SHEPHERDS/WORKERS_PER_SHEPHERD is just to experiment with // the performance of different schedulers. This is not production code // and if it's around after July 2014, yell at Elliot. sched = getenv("CHPL_QTHREAD_SCHEDULER"); if (sched != NULL && strncmp(sched, "nemesis", 7) == 0) { // Set environment variable for Qthreads snprintf(newenv, sizeof(newenv), "%i", (int)hwpar); setenv("QT_NUM_SHEPHERDS", newenv, 1); setenv("QT_NUM_WORKERS_PER_SHEPHERD", "1", 1); // Unset QT_WORKER_UNIT iff we set it. if (we_set_worker_unit) { (void) unsetenv("QT_WORKER_UNIT"); } } else { // Set environment variable for Qthreads snprintf(newenv, sizeof(newenv), "%i", (int)hwpar); setenv("QT_HWPAR", newenv, 1); } } // Precedence (high-to-low): // 1) Chapel minimum // 2) QTHREAD_STACK_SIZE // In practice we never get to #2, because the Chapel minimum is // always > 0, but we cover that case as a backstop. callStackSize = chpl_task_getMinCallStackSize(); if (callStackSize <= 0) callStackSize = 1024 * 1024 * sizeof(size_t); snprintf(newenv_stack, 99, "%zu", callStackSize); setenv("QT_STACK_SIZE", newenv_stack, 1); // Turn on informative Qthreads setting messages with Chapel's verbose flag if (verbosity == 2) { setenv("QT_INFO", "1", 1); } pthread_create(&initer, NULL, initializer, NULL); while (chpl_qthread_done_initializing == 0) SPINLOCK_BODY(); // Now that Qthreads is up and running, make sure that the number // of workers is less than any comm layer limit. This is mainly // checking that the default thread count without QTHREAD_HWPAR // being set is within any comm layer limit, because we can't // determine that default ahead of time. Secondarily, it's a // sanity check on the thread count versus comm limit logic // above. assert(0 == commMaxThreads || qthread_num_workers() < commMaxThreads); if (blockreport || taskreport) { if (signal(SIGINT, SIGINT_handler) == SIG_ERR) { perror("Could not register SIGINT handler"); } } // Turn off work stealing if it was configured to be off noWorkSteal = getenv("CHPL_QTHREAD_NO_WORK_STEALING"); if (noWorkSteal != NULL && strncmp(noWorkSteal, "yes", 3) == 0) { qthread_steal_disable(); } }