コード例 #1
0
/*
 * __cilkrts_cilk_for_64
 *
 * Implementation of cilk_for for 64-bit trip counts (regardless of processor
 * word size).  Assumes that the range is 0 - count.
 *
 * body  - lambda function for the cilk_for loop body
 * data  - data used by the lambda function
 * count - trip count for loop
 * grain - grain size (0 if it should be computed)
 */
CILK_ABI_THROWS_VOID __cilkrts_cilk_for_64(__cilk_abi_f64_t body, void *data,
                                            cilk64_t count, int grain)
{
    // Cilkscreen should not report this call in a stack trace
    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);

    // Check for an empty range here as an optimization - don't need to do any
    // __cilkrts_stack_frame initialization
    if (count > 0)
        cilk_for_root(body, data, count, grain);
}
コード例 #2
0
inline static
void call_cilk_for_loop_body(count_t low, count_t high,
                             F body, void *data,
                             __cilkrts_worker *w,
                             __cilkrts_pedigree *loop_root_pedigree)
{
    // Cilkscreen should not report this call in a stack trace
    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);

    // The worker is only valid until the first spawn.  Fetch the
    // __cilkrts_stack_frame out of the worker, since it will be stable across
    // steals.  The sf pointer actually points to the *parent's*
    // __cilkrts_stack_frame, since this function is a non-spawning function
    // and therefore has no cilk stack frame of its own.
    __cilkrts_stack_frame *sf = w->current_stack_frame;

    // Save the pedigree node pointed to by the worker.  We'll need to restore
    // that when we exit since the spawn helpers in the cilk_for call tree
    // will assume that it's valid
    const __cilkrts_pedigree *saved_next_pedigree_node = w->pedigree.parent;

    // Add the leaf pedigree node to the chain. The parent is the root node
    // to flatten the tree regardless of the DAG branches in the cilk_for
    // divide-and-conquer recursion.
    //
    // The rank is initialized to the low index.  The user is
    // expected to call __cilkrts_bump_loop_rank at the end of the cilk_for
    // loop body.
    __cilkrts_pedigree loop_leaf_pedigree;

    loop_leaf_pedigree.rank = (uint64_t)low;
    loop_leaf_pedigree.parent = loop_root_pedigree;

    // The worker's pedigree always starts with a rank of 0
    w->pedigree.rank = 0;
    w->pedigree.parent = &loop_leaf_pedigree;

    // Call the compiler generated cilk_for loop body lambda function
    body(data, low, high);

    // The loop body may have included spawns, so we must refetch the worker
    // from the __cilkrts_stack_frame, which is stable regardless of which
    // worker we're executing on.
    w = sf->worker;

    // Restore the pedigree chain. It must be valid because the spawn helpers
    // generated by the cilk_for implementation will access it.
    w->pedigree.parent = saved_next_pedigree_node;
}
コード例 #3
0
static
void cilk_for_recursive(count_t low, count_t high,
                        F body, void *data, int grain,
                        __cilkrts_worker *w,
                        __cilkrts_pedigree *loop_root_pedigree)
{
tail_recurse:
    // Cilkscreen should not report this call in a stack trace
    // This needs to be done everytime the worker resumes
    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);

    count_t count = high - low;
    // Invariant: count > 0, grain >= 1
    if (count > grain)
    {
        // Invariant: count >= 2
        count_t mid = low + count / 2;
        // The worker is valid only until the first spawn and is expensive to
        // retrieve (using '__cilkrts_get_tls_worker') after the spawn.  The
        // '__cilkrts_stack_frame' is more stable, but isn't initialized until
        // the first spawn.  Thus, we want to grab the address of the
        // '__cilkrts_stack_frame' after it is initialized but before the
        // spawn detaches.  The only place we can do that is within the
        // argument list of the spawned function, hence the call to
        // capture_spawn_arg_stack_frame().
        __cilkrts_stack_frame *sf;
#if defined(__GNUC__) && ! defined(__INTEL_COMPILER) && ! defined(__clang__)
        // The current version of gcc initializes the sf structure eagerly.
        // We can take advantage of this fact to avoid calling
        // `capture_spawn_arg_stack_frame` when compiling with gcc.
        // Remove this if the "shrink-wrap" optimization is implemented.
        sf = w->current_stack_frame;
        _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, w,
                                       loop_root_pedigree);
#else        
        _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain,
                                       capture_spawn_arg_stack_frame(sf, w),
                                       loop_root_pedigree);
#endif
        w = sf->worker;
        low = mid;

        goto tail_recurse;
    }

    // Call the cilk_for loop body lambda function passed in by the compiler to
    // execute one grain
    call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree);
}
コード例 #4
0
static
void cilk_for_recursive(count_t low, count_t high,
                        F body, void *data, int grain,
                        __cilkrts_worker *w,
                        __cilkrts_pedigree *loop_root_pedigree)
{
tail_recurse:
    // Cilkscreen should not report this call in a stack trace
    // This needs to be done everytime the worker resumes
    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);

    count_t count = high - low;
    // Invariant: count > 0, grain >= 1
    if (count > grain)
    {
        // Invariant: count >= 2
        count_t mid = low + count / 2;
        // The worker is valid only until the first spawn and is expensive to
        // retrieve (using '__cilkrts_get_tls_worker') after the spawn.  The
        // '__cilkrts_stack_frame' is more stable, but isn't initialized until
        // the first spawn.  Thus, we want to grab the address of the
        // '__cilkrts_stack_frame' after it is initialized but before the
        // spawn detaches.  The only place we can do that is within the
        // argument list of the spawned function, hence the call to
        // capture_spawn_arg_stack_frame().
        __cilkrts_stack_frame *sf;
        _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain,
                                       capture_spawn_arg_stack_frame(sf, w),
                                       loop_root_pedigree);
        w = sf->worker;
        low = mid;

        goto tail_recurse;
    }

    // Call the cilk_for loop body lambda function passed in by the compiler to
    // execute one grain
    call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree);
}
コード例 #5
0
static void cilk_for_root(F body, void *data, count_t count, int grain)
{
    // Cilkscreen should not report this call in a stack trace
    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);

    // Pedigree computation:
    //
    //    If the last pedigree node on entry to the _Cilk_for has value X,
    //    then at the start of each iteration of the loop body, the value of
    //    the last pedigree node should be 0, the value of the second-to-last
    //    node should equal the loop counter, and the value of the
    //    third-to-last node should be X.  On return from the _Cilk_for, the
    //    value of the last pedigree should be incremented to X+2. The
    //    pedigree within the loop is thus flattened, such that the depth of
    //    recursion does not affect the results either inside or outside of
    //    the loop.  Note that the pedigree after the loop exists is the same
    //    as if a single spawn and sync were executed within this function.

    // TBD: Since the shrink-wrap optimization was turned on in the compiler,
    // it is not possible to get the current stack frame without actually
    // forcing a call to bind-thread.  This spurious spawn is a temporary
    // stopgap until the correct intrinsics are added to give us total control
    // over frame initialization.
    _Cilk_spawn noop();

    // Fetch the current worker.  From that we can get the current stack frame
    // which will be constant even if we're stolen
    __cilkrts_worker *w = __cilkrts_get_tls_worker();
    __cilkrts_stack_frame *sf = w->current_stack_frame;

    // Decrement the rank by one to undo the pedigree change from the
    // _Cilk_spawn
    --w->pedigree.rank;

    // Save the current worker pedigree into loop_root_pedigree, which will be
    // the root node for our flattened pedigree.
    __cilkrts_pedigree loop_root_pedigree = w->pedigree;

    // Don't splice the loop_root node in yet.  It will be done when we
    // call the loop body lambda function
//    w->pedigree.rank = 0;
//    w->pedigree.next = &loop_root_pedigree;

    /* Spawn is necessary at top-level to force runtime to start up.
     * Runtime must be started in order to call the grainsize() function.
     */
    int gs = grainsize(grain, count);
    cilk_for_recursive((count_t) 0, count, body, data, gs, w,
                       &loop_root_pedigree);

    // Need to refetch the worker after calling a spawning function.
    w = sf->worker;

    // Restore the pedigree in the worker.
    w->pedigree = loop_root_pedigree;

    // Bump the worker pedigree.
    ++w->pedigree.rank;

    // Implicit sync will increment the pedigree leaf rank again, for a total
    // of two increments.  If the noop spawn above is removed, then we'll need
    // to re-enable the following code:
//     // If this is an optimized build, then the compiler will have optimized
//     // out the increment of the worker's pedigree in the implied sync.  We
//     // need to add one to make the pedigree_loop test work correctly.
// #if CILKRTS_OPTIMIZED
//     ++sf->worker->pedigree.rank;
// #endif
}