static void extend_global_pool(global_state_t *g)
{
    /* FIXME: memalign to a cache line? */
    struct pool_cons *c = (struct pool_cons *)__cilkrts_malloc(sizeof(*c));
    g->frame_malloc.pool_begin =
        (char *)__cilkrts_malloc((size_t)FRAME_MALLOC_CHUNK);
    g->frame_malloc.pool_end =
        g->frame_malloc.pool_begin + FRAME_MALLOC_CHUNK;
    g->frame_malloc.allocated_from_os += FRAME_MALLOC_CHUNK;
    c->p = g->frame_malloc.pool_begin;
    c->cdr = g->frame_malloc.pool_list;
    g->frame_malloc.pool_list = c;
}
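/*
 * Illustrative companion (a sketch, not runtime code): the cons list
 * built above makes a full teardown a simple walk. A hypothetical
 * release routine might look like this, assuming __cilkrts_free is
 * the counterpart of __cilkrts_malloc.
 */
static void release_global_pool_sketch(global_state_t *g)
{
    struct pool_cons *c = g->frame_malloc.pool_list;
    while (c) {
        struct pool_cons *next = c->cdr;
        __cilkrts_free(c->p);    /* the FRAME_MALLOC_CHUNK of memory */
        __cilkrts_free(c);       /* the cons cell itself */
        c = next;
    }
    g->frame_malloc.pool_list = NULL;
}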
/**
 * Allocate memory for the list of logged events.
 *
 * This function reads through the file and counts the number of records
 * so it can estimate how big a buffer to allocate for the array of replay
 * entries. It then rewinds the file to the beginning so it can be loaded
 * into memory.
 *
 * @param w The worker we're loading the file for.
 * @param f The file of replay data we're scanning.
 */
static void allocate_replay_list(__cilkrts_worker *w, FILE *f)
{
    // Count the number of entries - yeah, it's a hack, but it lets me
    // allocate the space all at once instead of in chunks
    char buf[1024];
    int entries = 1;    // Include "LAST" node

    while (! feof(f))
    {
        if (fgets(buf, 1024, f))
        {
            // Skip the Workers record - should only be in the file for
            // Worker 0
            if (0 != strncmp(PED_TYPE_STR_WORKERS, buf,
                             sizeof(PED_TYPE_STR_WORKERS)-1))
                entries++;
        }
    }

    w->l->replay_list_root =
        (replay_entry_t *)__cilkrts_malloc(entries * sizeof(replay_entry_t));
    w->l->replay_list_root[entries - 1].m_type = ped_type_last;

    // Reset the file to the beginning
    rewind(f);
}
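/*
 * Hedged usage sketch: how a caller might pair the sizing pass above
 * with a second pass that fills the array. The per-record parsing is
 * a placeholder for illustration only.
 */
static void load_replay_list_sketch(__cilkrts_worker *w, FILE *f)
{
    char buf[1024];

    allocate_replay_list(w, f);     /* count records, allocate, rewind */

    replay_entry_t *entry = w->l->replay_list_root;
    while (fgets(buf, sizeof(buf), f)) {
        /* parse buf into type/pedigree/value fields, then call
           entry->load(...) and advance entry (details elided) */
        (void)entry;
    }
}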
/*
 * Save TBB interop information from the cilk_fiber. It will get picked
 * up the next time the thread is bound to the runtime.
 */
void cilk_fiber_tbb_interop_save_info_from_stack(cilk_fiber *fiber)
{
    __cilk_tbb_stack_op_thunk *saved_thunk;
    cilk_fiber_data* fdata;

    if (NULL == fiber)
        return;

    fdata = cilk_fiber_get_data(fiber);

    // If there is no TBB interop data, just return
    if (NULL == fdata->stack_op_routine)
        return;

    saved_thunk = __cilkrts_get_tls_tbb_interop();

    // If there is not already space allocated, allocate some.
    if (NULL == saved_thunk)
    {
        saved_thunk = (__cilk_tbb_stack_op_thunk*)
            __cilkrts_malloc(sizeof(__cilk_tbb_stack_op_thunk));
        __cilkrts_set_tls_tbb_interop(saved_thunk);
    }

    saved_thunk->routine = fdata->stack_op_routine;
    saved_thunk->data = fdata->stack_op_data;
}
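/*
 * For reference, the approximate shape of the thunk being saved: a
 * routine/data pair, as evidenced by the assignments above. The exact
 * callback signature lives in the TBB interop header; the one below
 * is a simplified stand-in for illustration.
 */
typedef struct stack_op_thunk_sketch {
    void (*routine)(int op, void* data);  /* TBB's stack-op callback */
    void*  data;                          /* closure passed back to TBB */
} stack_op_thunk_sketch;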
/**
 * Load data read from the log into the entry
 */
bool load(const char *type, const char *pedigree_str,
          int32_t value1, int32_t value2)
{
    // Convert the type into an enum
    if (0 == strcmp(type, PED_TYPE_STR_STEAL))
    {
        m_type = ped_type_steal;
        m_value = (int16_t)value1;  // Victim
    }
    else
    {
        m_value = -1;   // Victim not valid
        if (0 == strcmp(type, PED_TYPE_STR_SYNC))
            m_type = ped_type_sync;
        else if (0 == strcmp(type, PED_TYPE_STR_ORPHANED))
            m_type = ped_type_orphaned;
        else
        {
            m_type = ped_type_unknown;
            return false;
        }
    }

    // Parse the pedigree
    m_pedigree_len = 0;

    const char *p = pedigree_str;
    char *end;
    uint64_t temp_pedigree[PEDIGREE_BUFF_SIZE/2];

    while (1)
    {
        temp_pedigree[m_pedigree_len++] = (uint64_t)strtol(p, &end, 10);
        if ('\0' == *end)
            break;
        p = end + 1;
    }

    // Allocate memory to hold the pedigree.
    // Copy the pedigree in reverse order since that's the order we'll
    // traverse it
    m_reverse_pedigree =
        (uint64_t *)__cilkrts_malloc(sizeof(uint64_t) * m_pedigree_len);
    for (int n = 0; n < m_pedigree_len; n++)
        m_reverse_pedigree[n] = temp_pedigree[(m_pedigree_len - 1) - n];

    return true;
}
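/*
 * Standalone sketch of the strtol scanning loop above, assuming a
 * single-character separator between decimal fields (e.g. '_').
 * Parsing "0_2_5" yields {0, 2, 5}; load() then stores the values
 * reversed as {5, 2, 0} so replay can walk the pedigree leaf-to-root.
 */
static void parse_pedigree_sketch(void)
{
    const char *p = "0_2_5";   /* example input, not a real log line */
    char *end;
    uint64_t vals[8];
    int n = 0;

    while (1) {
        vals[n++] = (uint64_t)strtol(p, &end, 10);
        if ('\0' == *end)
            break;
        p = end + 1;           /* skip the separator character */
    }
    /* vals now holds {0, 2, 5} with n == 3 */
}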
cilk_fiber* cilk_fiber::allocate_from_thread()
{
    void* retmem = __cilkrts_malloc(sizeof(cilk_fiber_sysdep));
    CILK_ASSERT(retmem);
    cilk_fiber_sysdep* ret = ::new(retmem) cilk_fiber_sysdep(from_thread);

    // A fiber allocated from a thread begins with a reference count
    // of 2. The first is for being created, and the second is for
    // being running.
    //
    // Suspending this fiber will decrement the count down to 1.
    ret->init_ref_count(2);

#if SUPPORT_GET_CURRENT_FIBER
    // We're creating the main fiber for this thread. Set this fiber as the
    // current fiber.
    cilkos_set_tls_cilk_fiber(ret);
#endif
    return ret;
}
cilk_fiber* cilk_fiber::allocate_from_heap(std::size_t stack_size)
{
    // Case 1: pool is NULL. Create a new fiber from the heap.
    // No need for locks here.
    cilk_fiber_sysdep* ret =
        (cilk_fiber_sysdep*)__cilkrts_malloc(sizeof(cilk_fiber_sysdep));

    // Error condition. If we failed to allocate a fiber from the
    // heap, we are in trouble though...
    if (!ret)
        return NULL;

    ::new(ret) cilk_fiber_sysdep(stack_size);

    CILK_ASSERT(0 == ret->m_flags);
    CILK_ASSERT(NULL == ret->m_pending_remove_ref);
    CILK_ASSERT(NULL == ret->m_pending_pool);
    ret->init_ref_count(1);
    return ret;
}
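/*
 * Generic sketch of the malloc-plus-placement-new idiom used by both
 * allocators above, with the matching teardown (explicit destructor
 * call, then free). The names are illustrative; the runtime's actual
 * deallocation path goes through its reference counting instead.
 */
#include <new>      // for placement new
#include <cstdlib>  // for std::malloc / std::free

struct widget_sketch
{
    explicit widget_sketch(int n) : m_n(n) {}
    int m_n;
};

widget_sketch* make_widget_sketch(int n)
{
    void* mem = std::malloc(sizeof(widget_sketch));
    if (!mem)
        return NULL;
    return ::new(mem) widget_sketch(n);   // construct in raw storage
}

void destroy_widget_sketch(widget_sketch* w)
{
    if (!w)
        return;
    w->~widget_sketch();   // run the destructor explicitly
    std::free(w);          // then release the raw storage
}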
/*
 * Save TBB interop information for an unbound thread. It will get picked
 * up when the thread is bound to the runtime.
 */
void cilk_fiber_tbb_interop_save_stack_op_info(__cilk_tbb_stack_op_thunk o)
{
    __cilk_tbb_stack_op_thunk *saved_thunk =
        __cilkrts_get_tls_tbb_interop();

    DBG_STACK_OPS("Calling save_stack_op; o.routine=%p, o.data=%p, saved_thunk=%p\n",
                  o.routine, o.data, saved_thunk);

    // If there is not already space allocated, allocate some.
    if (NULL == saved_thunk)
    {
        saved_thunk = (__cilk_tbb_stack_op_thunk*)
            __cilkrts_malloc(sizeof(__cilk_tbb_stack_op_thunk));
        __cilkrts_set_tls_tbb_interop(saved_thunk);
    }

    *saved_thunk = o;

    DBG_STACK_OPS("Unbound Thread %04x: tbb_interop_save_stack_op_info - saved info\n",
                  cilkos_get_current_thread_id());
}
void cilk_fiber_pool_init(cilk_fiber_pool* pool,
                          cilk_fiber_pool* parent,
                          size_t           stack_size,
                          unsigned         buffer_size,
                          int              alloc_max,
                          int              is_shared)
{
#if FIBER_DEBUG >= 1
    fprintf(stderr, "fiber_pool_init, pool=%p, parent=%p, alloc_max=%d\n",
            pool, parent, alloc_max);
#endif

    pool->lock       = (is_shared ? spin_mutex_create() : NULL);
    pool->parent     = parent;
    pool->stack_size = stack_size;
    pool->max_size   = buffer_size;
    pool->size       = 0;
    pool->total      = 0;
    pool->high_water = 0;
    pool->alloc_max  = alloc_max;
    pool->fibers     =
        (cilk_fiber**)__cilkrts_malloc(buffer_size * sizeof(cilk_fiber*));
    CILK_ASSERT(NULL != pool->fibers);

#ifdef __MIC__
#define PREALLOCATE_FIBERS
#endif

#ifdef PREALLOCATE_FIBERS
    // Pre-allocate 1/4 of fibers in the pools ahead of time. This
    // value is somewhat arbitrary. It was chosen to be less than the
    // threshold (of about 3/4) of fibers to keep in the pool when
    // transferring fibers to the parent.
    int pre_allocate_count = buffer_size/4;
    for (pool->size = 0; pool->size < pre_allocate_count; pool->size++)
    {
        pool->fibers[pool->size] =
            cilk_fiber::allocate_from_heap(pool->stack_size);
    }
#endif
}
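/*
 * Hedged sketch of how a LIFO buffer pool initialized as above would
 * typically serve a request: pop the most recently returned fiber if
 * one is cached, else fall back to a slower path. This illustrates
 * the data structure only; it is not the runtime's allocation routine.
 */
static cilk_fiber* try_pop_cached_sketch(cilk_fiber_pool* pool)
{
    cilk_fiber* f = NULL;
    if (pool->lock)
        spin_mutex_lock(pool->lock);
    if (pool->size > 0)
        f = pool->fibers[--pool->size];   // LIFO: reuse the hottest stack
    if (pool->lock)
        spin_mutex_unlock(pool->lock);
    return f;
}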
void *__cilkrts_frame_malloc(__cilkrts_worker *w, size_t size)
{
    int bucket;
    void *mem;

    /* if too large, or if no worker, fall back to __cilkrts_malloc() */
    if (!w || size > FRAME_MALLOC_MAX_SIZE) {
        NOTE_INTERVAL(w, INTERVAL_FRAME_ALLOC_LARGE);
        return __cilkrts_malloc(size);
    }

    START_INTERVAL(w, INTERVAL_FRAME_ALLOC); {
        bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);

        while (!(mem = pop(&w->l->free_list[bucket]))) {
            /* get a batch of frames from the global pool */
            START_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL) {
                allocate_batch(w, bucket, size);
            } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL);
        }
    } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC);

    return mem;
}
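/*
 * Hedged sketch of the matching fast free path: small frames go back
 * onto the per-worker free list via a push that mirrors the pop above,
 * while oversized frames came from __cilkrts_malloc and are returned
 * with __cilkrts_free. The push signature and free_list element type
 * are our assumptions; the real routine also maintains statistics.
 */
static void frame_free_sketch(__cilkrts_worker *w, void *p, size_t size)
{
    if (!w || size > FRAME_MALLOC_MAX_SIZE) {
        __cilkrts_free(p);
        return;
    }
    int bucket = bucket_of_size(size);
    push(&w->l->free_list[bucket], (struct free_list *)p);
}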
COMMON_SYSDEP
__cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new)
{
    __cilkrts_pedigree *pedigree_tls;

    if (__builtin_expect(cilk_keys_defined, 1)) {
        pedigree_tls =
            (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key);
    }
    else {
        return 0;
    }

    if (!pedigree_tls && create_new) {
        // This call creates two nodes, X and Y.
        // X == pedigree_tls[0] is the leaf node, which gets copied
        // in and out of a user worker w when w binds and unbinds.
        // Y == pedigree_tls[1] is the root node,
        // which is a constant node that represents the user worker
        // thread w.
        pedigree_tls = (__cilkrts_pedigree*)
            __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree));

        // This call sets the TLS pointer to the new node.
        __cilkrts_set_tls_pedigree_leaf(pedigree_tls);

        pedigree_tls[0].rank = 0;
        pedigree_tls[0].parent = &pedigree_tls[1];

        // Create Y, whose rank begins as the global counter value.
        pedigree_tls[1].rank =
            __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1);

        pedigree_tls[1].parent = NULL;
        CILK_ASSERT(pedigree_tls[1].rank != -1);
    }
    return pedigree_tls;
}
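/*
 * Illustrative walk over the parent links set up above; for a freshly
 * created TLS pair the chain is exactly {leaf, root}. This helper is
 * an example, not part of the runtime.
 */
static int pedigree_depth_sketch(const __cilkrts_pedigree *leaf)
{
    int depth = 0;
    const __cilkrts_pedigree *node;
    for (node = leaf; node != NULL; node = node->parent)
        depth++;
    return depth;   /* == 2 for the two-node chain created above */
}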
__CILKRTS_BEGIN_EXTERN_C

/**
 * @brief Returns the global state object. If called for the first time,
 * initializes the user-settable values in the global state, but does not
 * initialize the rest of the structure.
 */
global_state_t* cilkg_get_user_settable_values()
{
    // Environment variable value. More than big enough for a 64-bit signed
    // integer.
    char envstr[24];

    // Abbreviating &global_state_singleton as g is not only shorter, it also
    // facilitates grepping for the string "g->", which appears ubiquitously
    // in the runtime code.
    global_state_t* g = &global_state_singleton;

    // TBD: We need synchronization around this loop to prevent
    // multiple threads from initializing this data.
    if (! cilkg_user_settable_values_initialized)
    {
        size_t len;

        // Preserve stealing disabled since it may have been set by the
        // debugger
        int stealing_disabled = g->stealing_disabled;

        // All fields will be zero until set. In particular
        std::memset(g, 0, sizeof(global_state_t));

        // Fetch the number of cores. There must be at least 1, since we're
        // executing on *something*, aren't we!?
        int hardware_cpu_count = __cilkrts_hardware_cpu_count();
        CILK_ASSERT(hardware_cpu_count > 0);

        bool under_ptool = __cilkrts_running_under_sequential_ptool();
        if (under_ptool)
            hardware_cpu_count = 1;

        g->stealing_disabled      = stealing_disabled;
        g->under_ptool            = under_ptool;
        g->force_reduce           = 0;   // Default Off
        g->P                      = hardware_cpu_count; // Defaults to hardware CPU count
        g->max_user_workers       = 0;   // 0 unless set by user
        g->fiber_pool_size        = 7;   // Arbitrary default
        g->global_fiber_pool_size = 3 * 3 * g->P;  // Arbitrary default
        // 3*P was the default size of the worker array (including
        // space for extra user workers). This parameter was chosen
        // to match previous versions of the runtime.

        if (4 == sizeof(void *))
            g->max_stacks = 1200;   // Only 1 GB on 32-bit machines
        else
            g->max_stacks = 2400;   // 2 GB on 64-bit machines

        // If we have 2400 1 MB stacks, that is about 2 GB. If we reach
        // this limit on a single-socket machine, we may have other
        // problems. Is 2400 too small for large multicore machines?

        // TBD(jsukha, 11/27/2012): I set this limit on stacks to be a
        // value independent of P. When running on a Xeon Phi with
        // small values of P, I recall seeing a few microbenchmarks
        // (e.g., fib) where a limit of 10*P seemed to be
        // unnecessarily slowing things down.
        //
        // That being said, the code has changed sufficiently that
        // this observation may no longer be true.
        //
        // Note: in general, the worst-case number of stacks required
        // for a Cilk computation with spawn depth "d" on P workers is
        // O(Pd). Code with unbalanced recursion may run into issues
        // with this stack usage.

        g->max_steal_failures = 128;  // TBD: depend on max_workers?
        g->stack_size         = 0;    // 0 unless set by the user

        // Assume no record or replay log for now
        g->record_replay_file_name = NULL;
        g->record_or_replay        = RECORD_REPLAY_NONE;  // set by user

        if (always_force_reduce())
            g->force_reduce = true;
        else if (cilkos_getenv(envstr, sizeof(envstr), "CILK_FORCE_REDUCE"))
            store_bool(&g->force_reduce, envstr);

        if (under_ptool)
            g->P = 1;   // Ignore environment variable if under cilkscreen
        else if (cilkos_getenv(envstr, sizeof(envstr), "CILK_NWORKERS"))
            // Set P to the environment variable, but limit it to no less
            // than 1 and no more than 16 times the number of hardware
            // threads.
            store_int(&g->P, envstr, 1, 16 * hardware_cpu_count);

        if (cilkos_getenv(envstr, sizeof(envstr), "CILK_MAX_USER_WORKERS"))
            // Set max_user_workers to the environment variable, but limit
            // it to no less than 1 and no more than 16 times the number of
            // hardware threads. If not specified, it defaults (somewhat
            // arbitrarily) to the larger of 3 and twice the number of
            // hardware threads.
            store_int(&g->max_user_workers, envstr, 1, 16*hardware_cpu_count);

        if (cilkos_getenv(envstr, sizeof(envstr), "CILK_STEAL_FAILURES"))
            // Set the number of times a worker should fail to steal before
            // it looks to see whether it should suspend itself.
            store_int<unsigned>(&g->max_steal_failures, envstr, 1, INT_MAX);

        // Compute the total number of workers to allocate. Subtract one
        // from nworkers and user workers so that the first user worker
        // isn't factored in twice.
        //
        // total_workers must be computed now to support
        // __cilkrts_get_total_workers.
        g->total_workers = g->P + calc_max_user_workers(g) - 1;

#ifdef CILK_RECORD_REPLAY
        // RecordReplay: See if we've been asked to replay a log
        len = cilkos_getenv(envstr, 0, "CILK_REPLAY_LOG");
        if (len > 0)
        {
            len += 1;   // Allow for trailing NUL
            g->record_or_replay = REPLAY_LOG;
            g->record_replay_file_name = (char *)__cilkrts_malloc(len);
            cilkos_getenv(g->record_replay_file_name, len, "CILK_REPLAY_LOG");
        }

        // RecordReplay: See if we've been asked to record a log
        len = cilkos_getenv(envstr, 0, "CILK_RECORD_LOG");
        if (len > 0)
        {
            if (RECORD_REPLAY_NONE != g->record_or_replay)
                cilkos_warning("CILK_RECORD_LOG ignored since CILK_REPLAY_LOG is defined.\n");
            else
            {
                len += 1;   // Allow for trailing NUL
                g->record_or_replay = RECORD_LOG;
                g->record_replay_file_name = (char *)__cilkrts_malloc(len);
                cilkos_getenv(g->record_replay_file_name, len, "CILK_RECORD_LOG");
            }
        }
#endif

        cilkg_user_settable_values_initialized = true;
    }

    return g;
}
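/*
 * Minimal sketch of the clamping that store_int is expected to apply
 * to the environment overrides above: parse a decimal string and clamp
 * the result into [min, max]. The real helper's error handling may
 * differ; this version leaves the existing default in place on a
 * parse failure.
 */
template <typename T>
static void store_int_sketch(T* out, const char* s, T min, T max)
{
    char* end = 0;
    long v = strtol(s, &end, 10);
    if (end == s)
        return;              // not a number: keep the existing default
    if (v < (long)min) v = (long)min;
    if (v > (long)max) v = (long)max;
    *out = (T)v;
}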
spin_mutex* spin_mutex_create()
{
    spin_mutex* mutex = (spin_mutex*)__cilkrts_malloc(sizeof(spin_mutex));
    spin_mutex_init(mutex);
    return mutex;
}
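/*
 * The matching teardown, sketched for symmetry (the runtime's own
 * destroy routine should be equivalent): a spin lock holds no state
 * beyond its storage, so freeing the allocation suffices.
 */
void spin_mutex_destroy_sketch(spin_mutex* mutex)
{
    __cilkrts_free(mutex);
}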