/*
 * __cilkrts_cilk_for_64
 *
 * ABI entry point for a cilk_for loop whose trip count is 64 bits wide,
 * independent of the processor word size.  The iteration space is assumed
 * to be the range 0 .. count.
 *
 * body  - compiler-generated lambda implementing the loop body
 * data  - opaque data handed back to the lambda on every call
 * count - number of iterations to run
 * grain - requested grain size (0 asks the runtime to compute one)
 */
CILK_ABI_THROWS_VOID __cilkrts_cilk_for_64(__cilk_abi_f64_t body, void *data,
                                            cilk64_t count, int grain)
{
    // Keep this runtime frame out of the stack traces Cilkscreen reports.
    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);

    // Fast exit for an empty range: returning here avoids any
    // __cilkrts_stack_frame initialization that cilk_for_root would trigger.
    if (!(count > 0))
        return;

    cilk_for_root(body, data, count, grain);
}
inline static void call_cilk_for_loop_body(count_t low, count_t high, F body, void *data, __cilkrts_worker *w, __cilkrts_pedigree *loop_root_pedigree) { // Cilkscreen should not report this call in a stack trace NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); // The worker is only valid until the first spawn. Fetch the // __cilkrts_stack_frame out of the worker, since it will be stable across // steals. The sf pointer actually points to the *parent's* // __cilkrts_stack_frame, since this function is a non-spawning function // and therefore has no cilk stack frame of its own. __cilkrts_stack_frame *sf = w->current_stack_frame; // Save the pedigree node pointed to by the worker. We'll need to restore // that when we exit since the spawn helpers in the cilk_for call tree // will assume that it's valid const __cilkrts_pedigree *saved_next_pedigree_node = w->pedigree.parent; // Add the leaf pedigree node to the chain. The parent is the root node // to flatten the tree regardless of the DAG branches in the cilk_for // divide-and-conquer recursion. // // The rank is initialized to the low index. The user is // expected to call __cilkrts_bump_loop_rank at the end of the cilk_for // loop body. __cilkrts_pedigree loop_leaf_pedigree; loop_leaf_pedigree.rank = (uint64_t)low; loop_leaf_pedigree.parent = loop_root_pedigree; // The worker's pedigree always starts with a rank of 0 w->pedigree.rank = 0; w->pedigree.parent = &loop_leaf_pedigree; // Call the compiler generated cilk_for loop body lambda function body(data, low, high); // The loop body may have included spawns, so we must refetch the worker // from the __cilkrts_stack_frame, which is stable regardless of which // worker we're executing on. w = sf->worker; // Restore the pedigree chain. It must be valid because the spawn helpers // generated by the cilk_for implementation will access it. w->pedigree.parent = saved_next_pedigree_node; }
// cilk_for_recursive
//
// Divide-and-conquer driver for a cilk_for loop over the range [low, high).
// While the range holds more than 'grain' iterations, the lower half is
// spawned as a recursive call and the upper half is handled by looping back
// to the top of this function (manual tail recursion).  Once the range fits
// in one grain, the loop body is run through call_cilk_for_loop_body().
//
// low, high          - bounds of the range still to be processed
// body               - compiler-generated loop body lambda
// data               - opaque data forwarded to the body
// grain              - maximum number of iterations run as a single grain
// w                  - worker executing on entry; refreshed after each spawn
// loop_root_pedigree - root pedigree node, passed through unchanged
//
// NOTE(review): count_t and F are not declared in this block; presumably
// they are template parameters declared just above it - confirm.
static void cilk_for_recursive(count_t low, count_t high,
                               F body, void *data, int grain,
                               __cilkrts_worker *w,
                               __cilkrts_pedigree *loop_root_pedigree)
{
tail_recurse:
    // Cilkscreen should not report this call in a stack trace.
    // This needs to be done every time the worker resumes, which is why it
    // sits after the label rather than before it.
    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);

    count_t count = high - low;
    // Invariant: count > 0, grain >= 1
    if (count > grain)
    {
        // Invariant: count >= 2
        count_t mid = low + count / 2;
        // The worker is valid only until the first spawn and is expensive to
        // retrieve (using '__cilkrts_get_tls_worker') after the spawn.  The
        // '__cilkrts_stack_frame' is more stable, but isn't initialized until
        // the first spawn.  Thus, we want to grab the address of the
        // '__cilkrts_stack_frame' after it is initialized but before the
        // spawn detaches.  The only place we can do that is within the
        // argument list of the spawned function, hence the call to
        // capture_spawn_arg_stack_frame().
        __cilkrts_stack_frame *sf;
#if defined(__GNUC__) && ! defined(__INTEL_COMPILER) && ! defined(__clang__)
        // The current version of gcc initializes the sf structure eagerly.
        // We can take advantage of this fact to avoid calling
        // `capture_spawn_arg_stack_frame` when compiling with gcc, and read
        // the frame from the worker before the spawn instead.
        // Remove this if the "shrink-wrap" optimization is implemented.
        sf = w->current_stack_frame;
        _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, w,
                                       loop_root_pedigree);
#else
        // Other compilers: 'sf' is filled in as a side effect of evaluating
        // the worker argument of the spawned call.
        _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain,
                                       capture_spawn_arg_stack_frame(sf, w),
                                       loop_root_pedigree);
#endif
        // After the spawn 'w' may be stale; refetch it from the stable frame.
        w = sf->worker;
        // Continue with the upper half [mid, high) in this same invocation.
        low = mid;
        goto tail_recurse;
    }

    // Call the cilk_for loop body lambda function passed in by the compiler
    // to execute one grain.
    call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree);
}
static void cilk_for_recursive(count_t low, count_t high, F body, void *data, int grain, __cilkrts_worker *w, __cilkrts_pedigree *loop_root_pedigree) { tail_recurse: // Cilkscreen should not report this call in a stack trace // This needs to be done everytime the worker resumes NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); count_t count = high - low; // Invariant: count > 0, grain >= 1 if (count > grain) { // Invariant: count >= 2 count_t mid = low + count / 2; // The worker is valid only until the first spawn and is expensive to // retrieve (using '__cilkrts_get_tls_worker') after the spawn. The // '__cilkrts_stack_frame' is more stable, but isn't initialized until // the first spawn. Thus, we want to grab the address of the // '__cilkrts_stack_frame' after it is initialized but before the // spawn detaches. The only place we can do that is within the // argument list of the spawned function, hence the call to // capture_spawn_arg_stack_frame(). __cilkrts_stack_frame *sf; _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, capture_spawn_arg_stack_frame(sf, w), loop_root_pedigree); w = sf->worker; low = mid; goto tail_recurse; } // Call the cilk_for loop body lambda function passed in by the compiler to // execute one grain call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree); }
static void cilk_for_root(F body, void *data, count_t count, int grain) { // Cilkscreen should not report this call in a stack trace NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); // Pedigree computation: // // If the last pedigree node on entry to the _Cilk_for has value X, // then at the start of each iteration of the loop body, the value of // the last pedigree node should be 0, the value of the second-to-last // node should equal the loop counter, and the value of the // third-to-last node should be X. On return from the _Cilk_for, the // value of the last pedigree should be incremented to X+2. The // pedigree within the loop is thus flattened, such that the depth of // recursion does not affect the results either inside or outside of // the loop. Note that the pedigree after the loop exists is the same // as if a single spawn and sync were executed within this function. // TBD: Since the shrink-wrap optimization was turned on in the compiler, // it is not possible to get the current stack frame without actually // forcing a call to bind-thread. This spurious spawn is a temporary // stopgap until the correct intrinsics are added to give us total control // over frame initialization. _Cilk_spawn noop(); // Fetch the current worker. From that we can get the current stack frame // which will be constant even if we're stolen __cilkrts_worker *w = __cilkrts_get_tls_worker(); __cilkrts_stack_frame *sf = w->current_stack_frame; // Decrement the rank by one to undo the pedigree change from the // _Cilk_spawn --w->pedigree.rank; // Save the current worker pedigree into loop_root_pedigree, which will be // the root node for our flattened pedigree. __cilkrts_pedigree loop_root_pedigree = w->pedigree; // Don't splice the loop_root node in yet. It will be done when we // call the loop body lambda function // w->pedigree.rank = 0; // w->pedigree.next = &loop_root_pedigree; /* Spawn is necessary at top-level to force runtime to start up. 
* Runtime must be started in order to call the grainsize() function. */ int gs = grainsize(grain, count); cilk_for_recursive((count_t) 0, count, body, data, gs, w, &loop_root_pedigree); // Need to refetch the worker after calling a spawning function. w = sf->worker; // Restore the pedigree in the worker. w->pedigree = loop_root_pedigree; // Bump the worker pedigree. ++w->pedigree.rank; // Implicit sync will increment the pedigree leaf rank again, for a total // of two increments. If the noop spawn above is removed, then we'll need // to re-enable the following code: // // If this is an optimized build, then the compiler will have optimized // // out the increment of the worker's pedigree in the implied sync. We // // need to add one to make the pedigree_loop test work correctly. // #if CILKRTS_OPTIMIZED // ++sf->worker->pedigree.rank; // #endif }