void cilk_fiber::deallocate_self(cilk_fiber_pool* pool)
{
    this->set_resumable(false);

    CILK_ASSERT(NULL != pool);
    CILK_ASSERT(!this->is_allocated_from_thread());
    this->assert_ref_count_equals(0);

    // Cases:
    //
    // 1. Pool has space: add the fiber to this pool.
    // 2. Pool is full:   give some fibers to the parent, and then free
    //    enough fibers to make space for the one we are deallocating.
    //    Then put the fiber back into the pool.

    const bool need_lock = pool->lock;
    // Grab the lock for the remaining cases.
    if (need_lock) {
        spin_mutex_lock(pool->lock);
    }

    // Case 1: this pool has space.  Return the fiber.
    if (pool->size < pool->max_size) {
        // Add this fiber to the pool.
        pool->fibers[pool->size++] = this;
        if (need_lock) {
            spin_mutex_unlock(pool->lock);
        }
        return;
    }

    // Case 2: the pool is full.
    //
    // First free up some space by giving fibers to the parent.
    if (pool->parent) {
        // Pool is full.  Move all but "num_to_keep" fibers to the
        // parent, if we can.
        unsigned num_to_keep = pool->max_size/2 + pool->max_size/4;
        cilk_fiber_pool_move_fibers_to_parent_pool(pool, num_to_keep);
    }

    if (need_lock) {
        spin_mutex_unlock(pool->lock);
    }

    // Now free a fiber to make room for the one we need to put back,
    // and then put this fiber back.  This step may actually return
    // fibers to the heap.
    cilk_fiber_pool_free_fibers_from_pool(pool, pool->max_size - 1, this);
}
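// Worked example (illustrative): with pool->max_size == 8, a full pool
// computes num_to_keep == 8/2 + 8/4 == 6 and offers its remaining fibers
// to the parent pool.  The final call then frees local fibers until at
// most max_size - 1 == 7 remain, which guarantees a free slot for the
// fiber being deallocated before it is put back.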
void __cilkrts_obj_metadata_wakeup(
    __cilkrts_ready_list *rlist, __cilkrts_obj_metadata *meta)
{
    spin_mutex_lock( &meta->mutex );

    if( --meta->oldest_num_tasks > 0 ) {
        // The oldest generation still has outstanding tasks; nothing to
        // wake up yet.
        spin_mutex_unlock( &meta->mutex );
    } else if( meta->num_gens == 1 ) {
        // The oldest generation has drained and was the only one: the
        // object becomes idle.
        meta->num_gens = 0;
        meta->youngest_group = CILK_OBJ_GROUP_EMPTY;
        spin_mutex_unlock( &meta->mutex );
    } else
        // The oldest generation has drained and younger generations are
        // waiting; release the next one.  Unlocks meta->mutex.
        __cilkrts_obj_metadata_wakeup_hard( rlist, meta );
}
void cilk_fiber_pool_destroy(cilk_fiber_pool* pool)
{
    CILK_ASSERT(cilk_fiber_pool_sanity_check(pool, "pool_destroy"));

    // Lock my own pool, if I need to.
    if (pool->lock) {
        spin_mutex_lock(pool->lock);
    }

    // Give any remaining fibers to the parent pool.
    if (pool->parent) {
        cilk_fiber_pool_move_fibers_to_parent_pool(pool, 0);
    }

    // Unlock the pool.
    if (pool->lock) {
        spin_mutex_unlock(pool->lock);
    }

    // If I have any fibers left in my pool, just free them myself.
    // This method may acquire the pool lock.
    cilk_fiber_pool_free_fibers_from_pool(pool, 0, NULL);

    // Destroy the lock if there is one.
    if (pool->lock) {
        spin_mutex_destroy(pool->lock);
    }
    __cilkrts_free(pool->fibers);
}
void __cilkrts_obj_metadata_wakeup_hard(
    __cilkrts_ready_list *rlist, __cilkrts_obj_metadata *meta)
{
    struct __cilkrts_task_list_node * head = meta->tasks.head.it_next;
    struct __cilkrts_task_list_node * i_next = 0;
    unsigned new_tasks = 0;

    // Walk the next waiting generation: every node up to and including
    // the one whose "last in generation" bit (the low bit of
    // st_task_and_last) is set.
    for( struct __cilkrts_task_list_node * i=head; i; i=i_next ) {
        struct __cilkrts_pending_frame * t
            = (struct __cilkrts_pending_frame *)
              ((uintptr_t)i->st_task_and_last & ~(uintptr_t)1);
        ++new_tasks;
        i_next = i->it_next;

        // Drop one incoming dependence; if it was the last one, the
        // pending frame becomes ready and is appended to the ready list.
        if( __sync_fetch_and_add( &t->incoming_count, -1 ) == 1 ) {
            // TODO: push in front for convenience -- revise;
            // requires a per-worker ready list.
            t->next_ready_frame = 0;
            rlist->tail->next_ready_frame = t;
            rlist->tail = t;
        }

        // Stop at the last node in this generation.
        if( ((uintptr_t)i->st_task_and_last & (uintptr_t)1) != 0 )
            break;
    }

    // TODO: move num_tasks inc/dec outside the critical section and
    // change to neg_num_tasks.  There may be a race on the atomic
    // decrement of num_tasks at the beginning of wakeup (if moved outside
    // the critical section), because a task may spawn concurrently with
    // wakeup and complete while wakeup is still busy.
    meta->oldest_num_tasks = new_tasks;

    // Pop the oldest generation.  We are inside the critical section, so
    // a non-atomic decrement suffices.
    meta->num_gens--;

    if( i_next ) {
        // More generations remain; the next waiting generation starts at
        // i_next.
        meta->tasks.head.it_next = i_next;
    } else {
        // The task list has been depleted; reset it to the sentinel.
        meta->tasks.head.it_next = 0;
        meta->tasks.tail = &meta->tasks.head;
        if( meta->num_gens == 0 )
            meta->youngest_group = CILK_OBJ_GROUP_EMPTY;
    }

    spin_mutex_unlock(&meta->mutex);
}
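// Illustration (comments only) of the task-list encoding consumed above.
// Suppose the running oldest generation has just drained and the waiting
// list holds generation {A, B} followed by generation {C}:
//
//   tasks.head -> A -> B -> C        (tasks.tail points at C)
//
// Node B carries the low "last in generation" bit in st_task_and_last.
// wakeup_hard walks A and B, decrementing each pending frame's
// incoming_count (appending frames that reach zero to the ready list),
// stops after B, sets oldest_num_tasks = 2, pops one generation, and
// leaves C at the head as the next waiting generation.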
/**
 * Helper method: try to allocate a fiber from this pool or its
 * ancestors without going to the OS / heap.
 *
 * Returns the allocated fiber, or NULL if none is found.
 *
 * If the pool contains a suitable fiber, return it.  Otherwise, try to
 * recursively grab a fiber from the parent pool, if there is one.
 *
 * This method will not allocate a fiber from the heap.
 *
 * This method could be written either recursively or iteratively.
 * It probably does not matter which one we do.
 *
 * @note This method is compiled, but may not be used unless the
 * USE_FIBER_TRY_ALLOCATE_FROM_POOL switch is set.
 */
cilk_fiber* cilk_fiber::try_allocate_from_pool_recursive(cilk_fiber_pool* pool)
{
    cilk_fiber* ret = NULL;

    if (pool->size > 0) {
        // Try to get the lock.
        if (pool->lock) {
            // For some reason, it seems to be better to just block on the
            // parent pool lock, instead of using a try-lock?
#define USE_TRY_LOCK_IN_FAST_ALLOCATE 0
#if USE_TRY_LOCK_IN_FAST_ALLOCATE
            int got_lock = spin_mutex_trylock(pool->lock);
            if (!got_lock) {
                // If we fail, skip to the parent.
                if (pool->parent) {
                    return try_allocate_from_pool_recursive(pool->parent);
                }
            }
#else
            spin_mutex_lock(pool->lock);
#endif
        }

        // Check in the pool if we have the lock.
        if (pool->size > 0) {
            ret = pool->fibers[--pool->size];
        }

        // Release the lock once we are done updating pool fields.
        if (pool->lock) {
            spin_mutex_unlock(pool->lock);
        }
    }

    if ((!ret) && (pool->parent)) {
        return try_allocate_from_pool_recursive(pool->parent);
    }

    if (ret) {
        // When we pull a fiber out of the pool, set its reference
        // count before we return it.
        ret->init_ref_count(1);
    }
    return ret;
}
/**
 * @brief Transfer fibers from @c pool to @c pool->parent.
 *
 * @pre Must hold @c pool->lock if it exists.
 * @post After completion, some number of fibers
 *       have been moved from this pool to the parent.
 *       The lock @c pool->lock is still held.
 *
 * TBD: Do we wish to guarantee that the lock has never been
 * released?  It may depend on the implementation...
 */
static void cilk_fiber_pool_move_fibers_to_parent_pool(cilk_fiber_pool* pool,
                                                       unsigned num_to_keep)
{
    // ASSERT: We should hold the lock on pool (if it has one).
    CILK_ASSERT(pool->parent);
    cilk_fiber_pool* parent_pool = pool->parent;

    // Move fibers from our pool to the parent until we either run out
    // of space in the parent, or hit our threshold.
    //
    // This operation must be done while holding the parent lock.

    // If the parent pool appears to be full, just return early.
    if (parent_pool->size >= parent_pool->max_size)
        return;

    spin_mutex_lock(pool->parent->lock);
    while ((parent_pool->size < parent_pool->max_size) &&
           (pool->size > num_to_keep)) {
        parent_pool->fibers[parent_pool->size++] =
            pool->fibers[--pool->size];
    }

    // If the child pool has deallocated more fibers to the heap than it
    // has allocated, then transfer this "surplus" to the parent, so that
    // the parent is free to allocate more from the heap.
    //
    // This transfer means that the total in the parent can
    // temporarily go negative.
    if (pool->total < 0) {
        // Reduce the parent total by the surplus we have in the local
        // pool.
        parent_pool->total += pool->total;
        pool->total = 0;
    }
    spin_mutex_unlock(pool->parent->lock);
}
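// Worked example (illustrative): a pool's total is incremented per heap
// allocation and decremented per heap free, so a child pool that
// allocated 3 fibers from the heap but freed 5 ends with total == -2.
// The transfer above adds that -2 to the parent (e.g., a parent total of
// 1 becomes -1), crediting the parent with the extra frees so it may
// allocate correspondingly more fibers from the heap before reaching its
// alloc_max bound.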
void __cilkrts_obj_metadata_add_task(
    __cilkrts_pending_frame *t, __cilkrts_obj_metadata *meta,
    __cilkrts_task_list_node *tags, int g)
{
    // Set the pointer to the task in the argument's tags storage.
    tags->st_task_and_last = t;

    // Full mutual exclusion to avoid races.
    spin_mutex_lock( &meta->mutex );

    // Optimized version.  This path is taken only if the task is already
    // running, by stealing the parent of an un-issued ready task.  In
    // that case joins=1, pushg=0 and ready=1, so we only need to set
    // num_gens=1 and youngest_group=g and increment oldest_num_tasks.
    if( !t ) {
        meta->num_gens = 1;
        meta->oldest_num_tasks++;
        meta->youngest_group = g;
        spin_mutex_unlock( &meta->mutex );
        return;
    }

    // joins: the new task may execute concurrently with the youngest
    //        generation: that generation is empty, or it shares a
    //        non-write access mode (e.g., both read) with g.
    int joins = ( meta->youngest_group
                  & ((g | CILK_OBJ_GROUP_EMPTY) & CILK_OBJ_GROUP_NOT_WRITE) ) != 0;
    // pushg: g shares no non-write mode with the youngest generation, so
    //        a new generation must be pushed (also true for the first
    //        task on an empty object).
    int pushg = ( g & ( meta->youngest_group & CILK_OBJ_GROUP_NOT_WRITE ) ) == 0;
    // ready: the task joins the oldest (and only) generation, so it can
    //        execute immediately.
    int ready = joins & ( meta->num_gens <= 1 );

    // push_generation( pushg ); -- we are inside the critical section,
    // so a non-atomic increment suffices.
    meta->num_gens += (uint32_t)pushg;
    meta->oldest_num_tasks += ready;
    meta->youngest_group = g;

    if( !ready ) {
        // Count one more unsatisfied dependence for t.
        __sync_fetch_and_add( &t->incoming_count, 1 );

        // We avoid branches by using a sentinel node in tasks (the
        // pointer retrieved is always meaningful) and by unconditionally
        // storing a value.
        __cilkrts_task_list_node * old_tail = meta->tasks.tail;
        tags->it_next = 0;
        old_tail->it_next = tags;
        // old_tail->set_last_in_generation( pushg );
        // TODO: the bit should not be set, should it?  Remove the
        // "& ~1" part.
        old_tail->st_task_and_last = (__cilkrts_pending_frame *)
            ( ( (uintptr_t)old_tail->st_task_and_last & ~(uintptr_t)1 )
              | (uintptr_t)pushg );
        meta->tasks.tail = tags;
    }

    __CILKRTS_ASSERT( (meta->num_gens <= 1) == (meta->tasks.head.it_next == 0) );
    __CILKRTS_ASSERT( meta->num_gens > 0 );

    spin_mutex_unlock( &meta->mutex );
}
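// Worked example (illustrative; assumes read/write mode bits along the
// lines of CILK_OBJ_GROUP_READ and CILK_OBJ_GROUP_WRITE).  Issuing tasks
// R1, R2, W3 against an idle object, in that order, yields:
//
//   R1: joins=1 (group empty), pushg=1, ready=1 -> num_gens=1, runs at once
//   R2: joins=1 (read joins read), pushg=0, ready=1 -> joins the oldest
//       generation and also runs at once
//   W3: joins=0, pushg=1, ready=0 -> num_gens=2, queued on meta->tasks
//       with incoming_count raised; __cilkrts_obj_metadata_wakeup
//       releases it once both R1 and R2 have completed.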
/**
 * @brief Free fibers from this pool until we have at most @c
 * num_to_keep fibers remaining, and then put a fiber back.
 *
 * @pre  We do not hold @c pool->lock.
 * @post After completion, we do not hold @c pool->lock.
 */
static void cilk_fiber_pool_free_fibers_from_pool(cilk_fiber_pool* pool,
                                                  unsigned num_to_keep,
                                                  cilk_fiber* fiber_to_return)
{
    // Free our own fibers, until we fall below our desired threshold.
    // Each iteration of this loop proceeds in the following stages:
    //   1. Acquire the pool lock.
    //   2. Grab up to B fibers from the pool and store them in a buffer.
    //   3. Check whether the pool is empty enough.  If yes, put the last
    //      fiber back, and remember that we should quit.
    //   4. Release the pool lock, and actually free any buffered fibers.
    //   5. Check whether we are done and should exit the loop.
    //      Otherwise, try again.
    //
    const bool need_lock = pool->lock;
    bool last_fiber_returned = false;

    do {
        const int B = 10;   // Pull at most this many fibers from the
                            // pool per lock acquisition.  Make this
                            // value large enough to amortize against
                            // the cost of acquiring and releasing the
                            // lock.
        int num_to_free = 0;
        cilk_fiber* fibers_to_free[B];

        // Stage 1: Grab the lock.
        if (need_lock) {
            spin_mutex_lock(pool->lock);
        }

        // Stage 2: Grab up to B fibers to free.
        int fibers_freed = 0;
        while ((pool->size > num_to_keep) && (num_to_free < B)) {
            fibers_to_free[num_to_free++] = pool->fibers[--pool->size];
            fibers_freed++;
        }
        decrement_pool_total(pool, fibers_freed);

        // Stage 3: The pool is below the threshold.  Put the extra
        // fiber back.
        if (pool->size <= num_to_keep) {
            // Put the last fiber back into the pool.
            if (fiber_to_return) {
                CILK_ASSERT(pool->size < pool->max_size);
                pool->fibers[pool->size] = fiber_to_return;
                pool->size++;
            }
            last_fiber_returned = true;
        }

        // Stage 4: Release the lock, and actually free any buffered
        // fibers.
        if (need_lock) {
            spin_mutex_unlock(pool->lock);
        }
        for (int i = 0; i < num_to_free; ++i) {
            fibers_to_free[i]->deallocate_to_heap();
        }
    } while (!last_fiber_returned);
}
cilk_fiber* cilk_fiber::allocate(cilk_fiber_pool* pool)
{
    // The pool should not be NULL in this method.  But I'm not going to
    // actually assert it, because we are likely to seg fault anyway
    // if it is.
    // CILK_ASSERT(NULL != pool);
    cilk_fiber *ret = NULL;

#if USE_FIBER_TRY_ALLOCATE_FROM_POOL
    // "Fast" path, which doesn't go to the heap or OS until checking
    // the ancestors first.
    ret = try_allocate_from_pool_recursive(pool);
    if (ret)
        return ret;
#endif

    // If we don't get anything from the "fast path", then go through
    // a slower path to look for a fiber.
    //
    //   1. Lock the pool if it is shared.
    //   2. Look in our local pool.  If we find one, release the lock
    //      and quit searching.
    //   3. Otherwise, check whether we can allocate from the heap.
    //   4. Release the lock if it was acquired.
    //   5. Try to allocate from the heap, if step 3 said we could.
    //      If we find a fiber, then quit searching.
    //   6. If none of these steps work, just recursively try again
    //      from the parent.

    // 1. Lock the pool if it is shared.
    if (pool->lock) {
        spin_mutex_lock(pool->lock);
    }

    // 2. Look in the local pool.
    if (pool->size > 0) {
        ret = pool->fibers[--pool->size];
        if (ret) {
            // If we found one, release the lock once we are done
            // updating pool fields, and return.
            if (pool->lock) {
                spin_mutex_unlock(pool->lock);
            }

            // When we pull a fiber out of the pool, set its reference
            // count just in case.
            ret->init_ref_count(1);
            return ret;
        }
    }

    // 3. Check whether we can allocate from the heap.
    bool can_allocate_from_heap = false;
    if (pool->total < pool->alloc_max) {
        // Track that we are allocating a new fiber from the heap,
        // originating from this pool.  This increment may be undone if
        // we happen to fail to allocate from the heap.
        increment_pool_total(pool);
        can_allocate_from_heap = true;
    }

    // 4. Unlock the pool, and then allocate from the heap.
    if (pool->lock) {
        spin_mutex_unlock(pool->lock);
    }

    // 5. Actually try to allocate from the heap / OS.
    if (can_allocate_from_heap) {
        ret = allocate_from_heap(pool->stack_size);
        // If we got something from the heap, just return it.
        if (ret) {
            return ret;
        }

        // Otherwise, we failed in our attempt to allocate a fiber from
        // the heap.  Grab the lock and decrement the total again.
        if (pool->lock) {
            spin_mutex_lock(pool->lock);
        }
        decrement_pool_total(pool, 1);
        if (pool->lock) {
            spin_mutex_unlock(pool->lock);
        }
    }

    // 6. If we get here, then searching this pool failed.  Go search
    // the parent instead if we have one.
    if (pool->parent) {
        return allocate(pool->parent);
    }

    return ret;
}
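// Illustrative usage sketch (not part of the runtime; the worker-local
// pool name below is an assumption for illustration only): how a caller
// pairs cilk_fiber::allocate with deallocate_self.
//
//   cilk_fiber_pool* pool = &w->l->fiber_pool;  // hypothetical per-worker pool
//   cilk_fiber* fiber = cilk_fiber::allocate(pool);
//   if (fiber) {
//       // ... switch to the fiber and run scheduling code on it ...
//       // Once the fiber's reference count has dropped to 0:
//       fiber->deallocate_self(pool);   // case 1 or 2 in deallocate_self above
//   }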