qlfqueue_t *qlfqueue_create(void) { /*{{{ */ qlfqueue_t *q; if (qlfqueue_node_pool == NULL) { switch ((uintptr_t)qthread_cas_ptr(&qlfqueue_node_pool, NULL, (void *)1)) { case 0: /* I won, I will allocate */ qlfqueue_node_pool = qpool_create_aligned(sizeof(qlfqueue_node_t), 0); break; case 1: while (qlfqueue_node_pool == (void *)1) { SPINLOCK_BODY(); } break; } } qassert_ret((qlfqueue_node_pool != NULL), NULL); q = MALLOC(sizeof(struct qlfqueue_s)); if (q != NULL) { q->head = (qlfqueue_node_t *)qpool_alloc(qlfqueue_node_pool); assert(q->head != NULL); if (q->head == NULL) { /* if we're not using asserts, fail nicely */ FREE(q, sizeof(struct qlfqueue_s)); return NULL; } q->tail = q->head; q->tail->next = NULL; } return q; } /*}}} */
/*
 * Remove and return the node at the front of a NEMESIS queue.
 *
 * The consumer works from q->shadow_head; when that is empty it takes over
 * whatever is in q->head (and clears q->head). Returns NULL when both are
 * empty. A shadow_head value of (void *)1 is returned unchanged without
 * being dereferenced.
 * NOTE(review): the meaning of the (void *)1 marker is defined outside this
 * function -- confirm against the enqueue side.
 */
static inline qt_threadqueue_node_t *qt_internal_NEMESIS_dequeue(NEMESIS_queue *q)
{   /*{{{ */
    if (q->shadow_head == NULL) {
        if (q->head == NULL) {
            return NULL;
        }
        /* adopt the published head as the private shadow head */
        q->shadow_head = q->head;
        q->head        = NULL;
    }

    qt_threadqueue_node_t *const node = (void *volatile)(q->shadow_head);

    if ((node == NULL) || (node == (void *)1)) {
        return node;
    }

    if (node->next == NULL) {
        /* node looks like the last element: try to swing tail from it to NULL */
        qt_threadqueue_node_t *prev_tail;

        q->shadow_head = NULL;
        prev_tail      = qthread_cas_ptr(&(q->tail), node, NULL);
        if (prev_tail != node) {
            /* tail was no longer this node (presumably an enqueue is in
             * flight), so wait until the successor link becomes visible */
            while (node->next == NULL) SPINLOCK_BODY();
            q->shadow_head = node->next;
            node->next     = NULL;
        }
    } else {
        /* a successor is already linked: advance past the node and detach it */
        q->shadow_head = node->next;
        node->next     = NULL;
    }
    return node;
}   /*}}} */
// Sync variables void chpl_sync_lock(chpl_sync_aux_t *s) { aligned_t l; PROFILE_INCR(profile_sync_lock, 1); l = qthread_incr(&s->lockers_in, 1); while (l != s->lockers_out) SPINLOCK_BODY(); }
/*
 * Record the current mach_absolute_time() in q->start.
 *
 * The first call also performs one-time setup of the file-level `conversion`
 * factor. `inited` tracks that setup: 0 = not started, 1 = in progress,
 * 2 = finished. A caller that loses the race for the 0 -> 1 transition waits
 * for the in-progress state to end.
 */
void qtimer_start(qtimer_t q)
{
    if (inited == 0) {
        if (qthread_cas(&inited, 0, 1) != 0) {
            /* another caller is doing the setup; wait until it is done */
            while (inited == 1) SPINLOCK_BODY();
        } else {
            mach_timebase_info_data_t timebase;
            kern_return_t             status = mach_timebase_info(&timebase);

            if (status == 0) {
                /* scale factor from timebase ticks to seconds */
                conversion = 1e-9 * (double)timebase.numer / (double)timebase.denom;
            }
            /* keep the store to `conversion` ahead of the "finished" flag */
            COMPILER_FENCE;
            inited = 2;
        }
    }

    q->start = mach_absolute_time();
}
/*
 * Take a bounce buffer from the NI's shared free list and attach it to buf.
 *
 * Spins until ll_dequeue_obj_alien() hands back a buffer, then records in
 * buf->transfer.noknem the buffer address, the buffer size, and the byte
 * offset of the buffer from the bounce-buffer head. The same offset is
 * copied into data->noknem.bounce_offset.
 *
 * @param[in,out] buf   buffer whose noknem transfer fields are filled in
 * @param[out]    data  receives the bounce offset
 */
static void attach_bounce_buffer(buf_t *buf, data_t *data)
{
    ni_t *ni = obj_to_ni(buf);
    void *bb;

    /* Busy-wait until a bounce buffer becomes available. */
    while ((bb = ll_dequeue_obj_alien(&ni->shmem.bounce_buf.head->free_list,
                                      ni->shmem.bounce_buf.head,
                                      ni->shmem.bounce_buf.head->head_index0)) == NULL)
        SPINLOCK_BODY();

    buf->transfer.noknem.data        = bb;
    buf->transfer.noknem.data_length = ni->shmem.bounce_buf.buf_size;

    /* Byte offset from the head. Subtracting void pointers is only a
     * compiler extension, so do the arithmetic on char pointers. */
    buf->transfer.noknem.bounce_offset =
        (char *)bb - (char *)ni->shmem.bounce_buf.head;

    data->noknem.bounce_offset = buf->transfer.noknem.bounce_offset;
}
void chpl_task_exit(void) { #ifdef CHAPEL_PROFILE profile_print(); #endif /* CHAPEL_PROFILE */ #ifdef QTHREAD_MULTINODE #else if (qthread_shep() == NO_SHEPHERD) { /* sometimes, tasking is told to shutdown even though it hasn't been * told to start yet */ if (chpl_qthread_done_initializing == 1) { qthread_syncvar_fill(&canexit); while (done_finalizing == 0) SPINLOCK_BODY(); } } else { qthread_syncvar_fill(&exit_ret); } #endif /* QTHREAD_MULTINODE */ }
static void net_cleanup(void) { qthread_debug(MULTINODE_FUNCTIONS, "[%d] begin net_cleanup\n", my_rank); if (my_rank == 0) { int i; for (i = 1; i < world_size; ++i) { struct die_msg_t msg; msg.my_rank = my_rank; qthread_debug(MULTINODE_DETAILS, "[%d] sending die message to %d\n", my_rank, i); qthread_internal_net_driver_send(i, DIE_MSG_TAG, &msg, sizeof(msg)); } while (num_ended != world_size - 1) SPINLOCK_BODY(); } qthread_internal_net_driver_finalize(); qt_hash_destroy(uid_to_ptr_hash); qt_hash_destroy(ptr_to_uid_hash); qthread_debug(MULTINODE_FUNCTIONS, "[%d] end net_cleanup\n", my_rank); }
// old public method static inline qt_hash qt_hash_create(qt_dict_key_equals_f eq, qt_dict_hash_f hash, qt_dict_cleanup_f cleanup) { qt_hash tmp; if (hash_entry_pool == NULL) { if (qthread_cas_ptr(&hash_entry_pool, NULL, (void *)1) == NULL) { hash_entry_pool = qpool_create(sizeof(hash_entry)); } else { while (hash_entry_pool == (void *)1) SPINLOCK_BODY(); } } tmp = MALLOC(sizeof(qt_dictionary)); assert(tmp); tmp->op_equals = eq; tmp->op_hash = hash; tmp->op_cleanup = cleanup; assert(tmp); if (hard_max_buckets == 0) { hard_max_buckets = pagesize / sizeof(marked_ptr_t); } tmp->B = calloc(hard_max_buckets, sizeof(marked_ptr_t)); assert(tmp->B); tmp->size = 2; tmp->count = 0; { hash_entry *dummy = qpool_alloc(hash_entry_pool); assert(dummy); memset(dummy, 0, sizeof(hash_entry)); tmp->B[0] = CONSTRUCT(0, dummy); } return tmp; }
void chpl_task_init(void) { int32_t numCoresPerLocale; int32_t numThreadsPerLocale; size_t callStackSize; pthread_t initer; char newenv_sheps[100] = { 0 }; char newenv_stack[100] = { 0 }; // Set up available hardware parallelism numThreadsPerLocale = chpl_task_getenvNumThreadsPerLocale(); numCoresPerLocale = chpl_numCoresOnThisLocale(); if (numThreadsPerLocale == 0) numThreadsPerLocale = chpl_comm_getMaxThreads(); if (0 < numThreadsPerLocale) { // We are assuming the user wants to constrain the hardware // resources used during this run of the application. // Unset relevant Qthreads environment variables qt_internal_unset_envstr("HWPAR"); qt_internal_unset_envstr("NUM_SHEPHERDS"); qt_internal_unset_envstr("NUM_WORKERS_PER_SHEPHERD"); if (numCoresPerLocale < numThreadsPerLocale) { if (2 == verbosity) { printf("QTHREADS: Ignored --numThreadsPerLocale=%d to prevent oversubsription of the system.\n", numThreadsPerLocale); } // Do not oversubscribe the system, use all available resources. numThreadsPerLocale = numCoresPerLocale; } // Set environment variable for Qthreads snprintf(newenv_sheps, 99, "%i", (int)numThreadsPerLocale); setenv("QT_HWPAR", newenv_sheps, 1); } else if (qt_internal_get_env_str("HWPAR", NULL) || qt_internal_get_env_str("NUM_SHEPHERDS", NULL) || qt_internal_get_env_str("NUM_WORKERS_PER_SHEPHERD", NULL)) { // Assume the user wants has manually set the desired Qthreads // environment variables. } else { // Default to using all hardware resources. numThreadsPerLocale = numCoresPerLocale; snprintf(newenv_sheps, 99, "%i", (int)numThreadsPerLocale); setenv("QT_HWPAR", newenv_sheps, 1); } // Precendence (high-to-low): // 1) Chapel minimum // 2) QTHREAD_STACK_SIZE // In practice we never get to #2, because the Chapel minimum is // always > 0, but we cover that case as a backstop. 
callStackSize = chpl_task_getMinCallStackSize(); if (callStackSize <= 0) callStackSize = 1024 * 1024 * sizeof(size_t); snprintf(newenv_stack, 99, "%zu", callStackSize); setenv("QT_STACK_SIZE", newenv_stack, 1); // Turn on informative Qthreads setting messages with Chapel's verbose flag if (verbosity == 2) { setenv("QT_INFO", "1", 1); } pthread_create(&initer, NULL, initializer, NULL); while (chpl_qthread_done_initializing == 0) SPINLOCK_BODY(); if (blockreport || taskreport) { if (signal(SIGINT, SIGINT_handler) == SIG_ERR) { perror("Could not register SIGINT handler"); } } }
void chpl_task_init(void) { chpl_bool we_set_worker_unit = false; int32_t numThreadsPerLocale; int32_t commMaxThreads; int32_t hwpar; size_t callStackSize; pthread_t initer; char newenv_stack[100] = { 0 }; char *noWorkSteal; // Set up available hardware parallelism. // Experience has shown that we hardly ever win by using more than // one PU per core, so default to that. If this was explicitly // set by the user we won't override it, however. if (getenv("QTHREAD_WORKER_UNIT") == NULL) { we_set_worker_unit = (getenv("QT_WORKER_UNIT") == NULL); (void) setenv("QT_WORKER_UNIT", "core", 0); } // Determine the thread count. CHPL_RT_NUM_THREADS_PER_LOCALE has // the highest precedence but we limit it to the number of PUs. // QTHREAD_HWPAR has the next precedence. We don't impose the // same limit on it, so it can be used to overload the hardware. // In either case the number of threads can be no greater than any // maximum imposed by the comm layer. This limit is imposed // silently. numThreadsPerLocale = chpl_task_getenvNumThreadsPerLocale(); commMaxThreads = chpl_comm_getMaxThreads(); hwpar = 0; if (numThreadsPerLocale != 0) { int32_t numPUsPerLocale; hwpar = numThreadsPerLocale; numPUsPerLocale = chpl_numCoresOnThisLocale(); if (0 < numPUsPerLocale && numPUsPerLocale < hwpar) { if (2 == verbosity) { printf("QTHREADS: Reduced numThreadsPerLocale=%d to %d " "to prevent oversubscription of the system.\n", hwpar, numPUsPerLocale); } // Do not oversubscribe the system, use all available resources. hwpar = numPUsPerLocale; } if (0 < commMaxThreads && commMaxThreads < hwpar) { hwpar = commMaxThreads; } } else { if (0 < commMaxThreads) { hwpar = qt_internal_get_env_num("HWPAR", 0, 0); if (commMaxThreads < hwpar) { hwpar = commMaxThreads; } } } if (hwpar > 0) { char newenv[100]; char *sched; // Unset relevant Qthreads environment variables. 
Currently // QTHREAD_HWPAR has precedence over the QTHREAD_NUM_* ones, // but that isn't documented and may not be true forever, so // we unset them all. qt_internal_unset_envstr("HWPAR"); qt_internal_unset_envstr("NUM_SHEPHERDS"); qt_internal_unset_envstr("NUM_WORKERS_PER_SHEPHERD"); // The current check for scheduler and setting HWPAR or // NUM_SHEPHERDS/WORKERS_PER_SHEPHERD is just to experiment with // the performance of different schedulers. This is not production code // and if it's around after July 2014, yell at Elliot. sched = getenv("CHPL_QTHREAD_SCHEDULER"); if (sched != NULL && strncmp(sched, "nemesis", 7) == 0) { // Set environment variable for Qthreads snprintf(newenv, sizeof(newenv), "%i", (int)hwpar); setenv("QT_NUM_SHEPHERDS", newenv, 1); setenv("QT_NUM_WORKERS_PER_SHEPHERD", "1", 1); // Unset QT_WORKER_UNIT iff we set it. if (we_set_worker_unit) { (void) unsetenv("QT_WORKER_UNIT"); } } else { // Set environment variable for Qthreads snprintf(newenv, sizeof(newenv), "%i", (int)hwpar); setenv("QT_HWPAR", newenv, 1); } } // Precedence (high-to-low): // 1) Chapel minimum // 2) QTHREAD_STACK_SIZE // In practice we never get to #2, because the Chapel minimum is // always > 0, but we cover that case as a backstop. callStackSize = chpl_task_getMinCallStackSize(); if (callStackSize <= 0) callStackSize = 1024 * 1024 * sizeof(size_t); snprintf(newenv_stack, 99, "%zu", callStackSize); setenv("QT_STACK_SIZE", newenv_stack, 1); // Turn on informative Qthreads setting messages with Chapel's verbose flag if (verbosity == 2) { setenv("QT_INFO", "1", 1); } pthread_create(&initer, NULL, initializer, NULL); while (chpl_qthread_done_initializing == 0) SPINLOCK_BODY(); // Now that Qthreads is up and running, make sure that the number // of workers is less than any comm layer limit. This is mainly // checking that the default thread count without QTHREAD_HWPAR // being set is within any comm layer limit, because we can't // determine that default ahead of time. 
Secondarily, it's a // sanity check on the thread count versus comm limit logic // above. assert(0 == commMaxThreads || qthread_num_workers() < commMaxThreads); if (blockreport || taskreport) { if (signal(SIGINT, SIGINT_handler) == SIG_ERR) { perror("Could not register SIGINT handler"); } } // Turn off work stealing if it was configured to be off noWorkSteal = getenv("CHPL_QTHREAD_NO_WORK_STEALING"); if (noWorkSteal != NULL && strncmp(noWorkSteal, "yes", 3) == 0) { qthread_steal_disable(); } }
/**
 * @brief Initialize shared memory resources.
 *
 * This function is called during NI creation if the NI is physical,
 * or after PtlSetMap if it is logical.
 *
 * The NI with mem.index 0 creates and sizes the shared memory file; every
 * other NI retries opening it (100 tries, 100ms apart) and then waits, with
 * the same retry budget, for it to reach the expected size. The file is
 * then mmapped, this NI's queue and sbuf pool are set up inside it and,
 * for logical NIs, the function waits until every sibling has marked
 * itself valid in the PID table before unlinking the file.
 *
 * @param[in] ni
 *
 * @return PTL_OK on success, PTL_FAIL on any failure (the shared memory
 *         resources are released through release_shmem_resources()).
 */
static int setup_commpad(ni_t *ni)
{
    int shm_fd = -1;
    char comm_pad_shm_name[200] = "";
    int err;
    int i;
    int pid_table_size;

    /*
     * Buffers in shared memory. The buffers will be allocated later,
     * but not by the pool management. We compute the sizes now.
     */
    /* Allocate a pool of buffers in the mmapped region. */
    ni->shmem.per_proc_comm_buf_numbers = get_param(PTL_NUM_SBUF);

    ni->sbuf_pool.setup = buf_setup;
    ni->sbuf_pool.init = buf_init;
    ni->sbuf_pool.fini = buf_fini;
    ni->sbuf_pool.cleanup = buf_cleanup;
    ni->sbuf_pool.use_pre_alloc_buffer = 1;
    ni->sbuf_pool.round_size = real_buf_t_size();
    ni->sbuf_pool.slab_size =
        ni->shmem.per_proc_comm_buf_numbers * ni->sbuf_pool.round_size;

    /* Open KNEM device */
    if (knem_init(ni)) {
        WARN();
        goto exit_fail;
    }

    if (ni->options & PTL_NI_PHYSICAL) {
        /* Create a unique name for the shared memory file. */
        snprintf(comm_pad_shm_name, sizeof(comm_pad_shm_name),
                 "/portals4-shmem-pid%d", ni->id.phys.pid);
    } else {
        /* Create a unique name for the shared memory file. Use the hash
         * created from the mapping. */
        snprintf(comm_pad_shm_name, sizeof(comm_pad_shm_name),
                 "/portals4-shmem-%x-%d", ni->mem.hash, ni->options);
    }
    ni->shmem.comm_pad_shm_name = strdup(comm_pad_shm_name);
    if (ni->shmem.comm_pad_shm_name == NULL) {
        /* The saved name is passed to shm_unlink() later on, so do not
         * carry on without it. */
        WARN();
        goto exit_fail;
    }

    /* Compute the layout of the comm pad: the page-rounded PID table
     * followed by one queue + buffer slab per process on the node. */
    ni->shmem.per_proc_comm_buf_size =
        sizeof(queue_t) + ni->sbuf_pool.slab_size;

    pid_table_size = ni->mem.node_size * sizeof(struct shmem_pid_table);
    pid_table_size = ROUND_UP(pid_table_size, pagesize);

    ni->shmem.comm_pad_size = pid_table_size;
    ni->shmem.comm_pad_size +=
        (ni->shmem.per_proc_comm_buf_size * ni->mem.node_size);

#if !USE_KNEM
    /* Without KNEM, append a page-rounded bounce-buffer head followed by
     * the bounce buffers themselves. */
    off_t bounce_buf_offset;
    off_t bounce_head_offset;

    bounce_head_offset = ni->shmem.comm_pad_size;
    ni->shmem.comm_pad_size +=
        ROUND_UP(sizeof(struct shmem_bounce_head), pagesize);

    ni->shmem.bounce_buf.buf_size = get_param(PTL_BOUNCE_BUF_SIZE);
    ni->shmem.bounce_buf.num_bufs = get_param(PTL_BOUNCE_NUM_BUFS);

    bounce_buf_offset = ni->shmem.comm_pad_size;
    ni->shmem.comm_pad_size +=
        ni->shmem.bounce_buf.buf_size * ni->shmem.bounce_buf.num_bufs;
#endif

    /* Open the communication pad. Let rank 0 create the shared memory. */
    assert(ni->shmem.comm_pad == MAP_FAILED);

    if (ni->mem.index == 0) {
        /* Just in case, remove that file if it already exists. */
        shm_unlink(comm_pad_shm_name);

        shm_fd = shm_open(comm_pad_shm_name,
                          O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
        assert(shm_fd >= 0);

        if (shm_fd < 0) {
            ptl_warn("shm_open of %s failed (errno=%d)",
                     comm_pad_shm_name, errno);
            goto exit_fail;
        }

        /* Enlarge the memory zone to the size we need. */
        if (ftruncate(shm_fd, ni->shmem.comm_pad_size) != 0) {
            ptl_warn("share memory ftruncate failed");
            shm_unlink(comm_pad_shm_name);
            goto exit_fail;
        }
    } else {
        int try_count;

        /* Try for 10 seconds. That should leave enough time for rank
         * 0 to create the file. */
        try_count = 100;
        do {
            shm_fd = shm_open(comm_pad_shm_name, O_RDWR, S_IRUSR | S_IWUSR);

            if (shm_fd != -1)
                break;

            usleep(100000);            /* 100ms */
            try_count--;
        } while (try_count);

        if (shm_fd == -1) {
            ptl_warn("Couldn't open the shared memory file %s\n",
                     comm_pad_shm_name);
            goto exit_fail;
        }

        /* Wait for the file to have the right size before mmaping
         * it. */
        try_count = 100;
        do {
            struct stat buf;

            if (fstat(shm_fd, &buf) == -1) {
                ptl_warn("Couldn't fstat the shared memory file\n");
                goto exit_fail;
            }

            if (buf.st_size >= ni->shmem.comm_pad_size)
                break;

            usleep(100000);            /* 100ms */
            try_count--;
        } while (try_count);

        /* The loop above leaves try_count at 0 only when every attempt
         * saw a file that was still too small (a successful attempt
         * breaks out before the decrement). */
        if (try_count == 0) {
            ptl_warn("Shared memory file has wrong size\n");
            goto exit_fail;
        }
    }

    /* Map the whole comm pad. */
    ni->shmem.comm_pad =
        (uint8_t *) mmap(NULL, ni->shmem.comm_pad_size,
                         PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
    if (ni->shmem.comm_pad == MAP_FAILED) {
        ptl_warn("mmap failed (%d)", errno);
        perror("");
        goto exit_fail;
    }

    /* The shared memory is mmapped, so we can close the file. */
    close(shm_fd);
    shm_fd = -1;

    /* Now we can create the buffer pool */
    ni->shmem.first_queue = ni->shmem.comm_pad + pid_table_size;
    ni->shmem.queue =
        (queue_t *)(ni->shmem.first_queue +
                    (ni->shmem.per_proc_comm_buf_size * ni->mem.index));
    queue_init(ni->shmem.queue);

    /* The buffer is right after the nemesis queue. */
    ni->sbuf_pool.pre_alloc_buffer = (void *)(ni->shmem.queue + 1);

    err = pool_init(ni->iface->gbl, &ni->sbuf_pool, "sbuf",
                    real_buf_t_size(), POOL_SBUF, (obj_t *)ni);
    if (err) {
        WARN();
        goto exit_fail;
    }

#if !USE_KNEM
    /* Initialize the bounce buffers and let index 0 link them
     * together. */
    ni->shmem.bounce_buf.head = ni->shmem.comm_pad + bounce_head_offset;
    ni->shmem.bounce_buf.bbs = ni->shmem.comm_pad + bounce_buf_offset;

    if (ni->mem.index == 0) {
        ni->shmem.bounce_buf.head->head_index0 = ni->shmem.bounce_buf.head;
        ll_init(&ni->shmem.bounce_buf.head->free_list);

        for (i = 0; i < ni->shmem.bounce_buf.num_bufs; i++) {
            void *bb = ni->shmem.bounce_buf.bbs +
                       i * ni->shmem.bounce_buf.buf_size;

            ll_enqueue_obj(&ni->shmem.bounce_buf.head->free_list, bb);
        }
    }
#endif

    if (ni->options & PTL_NI_LOGICAL) {
        /* Can now announce my presence. */

        /* The PID table is at the beginning of the comm pad. */
        struct shmem_pid_table *pid_table =
            (struct shmem_pid_table *)ni->shmem.comm_pad;

        pid_table[ni->mem.index].id = ni->id;
        __sync_synchronize();          /* ensure "valid" is not written before pid. */
        pid_table[ni->mem.index].valid = 1;

        /* Now, wait for my siblings to get here. */
        for (i = 0; i < ni->mem.node_size; ++i) {
            /* oddly enough, this should reduce cache traffic
             * for large numbers of siblings */
            while (pid_table[i].valid == 0)
                SPINLOCK_BODY();
        }

        /* All ranks have mmaped the memory. Get rid of the file. */
        shm_unlink(ni->shmem.comm_pad_shm_name);
        free(ni->shmem.comm_pad_shm_name);
        ni->shmem.comm_pad_shm_name = NULL;
    }

    return PTL_OK;

exit_fail:
    if (shm_fd != -1)
        close(shm_fd);

    release_shmem_resources(ni);

    return PTL_FAIL;
}