// Divide-and-conquer driver for a cilk_for loop.
//
// Repeatedly splits the iteration range low..high (count = high - low) in
// half while it holds more than 'grain' iterations.  The lower half
// [low, mid) is handed to a spawned recursive call; the upper half
// [mid, high) is processed by looping back to the top of this same
// invocation (manual tail recursion via 'goto').  Once the remaining range
// holds at most 'grain' iterations it is passed to call_cilk_for_loop_body().
//
// Parameters:
//   low, high          - bounds of the iteration range; the code expects
//                        high - low > 0 on entry (see invariant below).
//   body, data         - forwarded unchanged to call_cilk_for_loop_body().
//   grain              - largest range size that is executed without
//                        splitting further; expected to be >= 1.
//   w                  - worker passed to the loop body; re-read from the
//                        stack frame after every spawn.
//   loop_root_pedigree - forwarded unchanged to the recursive calls and to
//                        call_cilk_for_loop_body().
//
// NOTE(review): a function with this same name and parameter list is also
// defined elsewhere in this file; confirm which definition is meant to be
// compiled.
static
void cilk_for_recursive(count_t low, count_t high,
                        F body, void *data, int grain,
                        __cilkrts_worker *w,
                        __cilkrts_pedigree *loop_root_pedigree)
{
tail_recurse:
    // Cilkscreen should not report this call in a stack trace.
    // This needs to be done every time the worker resumes, which is why the
    // notification sits after the 'tail_recurse' label rather than before it.
    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);

    count_t count = high - low;
    // Invariant: count > 0, grain >= 1
    if (count > grain)
    {
        // Invariant: count >= 2
        // Split point: [low, mid) is spawned, [mid, high) stays here.
        count_t mid = low + count / 2;
        // The worker is valid only until the first spawn and is expensive to
        // retrieve (using '__cilkrts_get_tls_worker') after the spawn.  The
        // '__cilkrts_stack_frame' is more stable, but isn't initialized until
        // the first spawn.  Thus, we want to grab the address of the
        // '__cilkrts_stack_frame' after it is initialized but before the
        // spawn detaches.  The only place we can do that is within the
        // argument list of the spawned function, hence the call to
        // capture_spawn_arg_stack_frame().
        __cilkrts_stack_frame *sf;
#if defined(__GNUC__) && ! defined(__INTEL_COMPILER) && ! defined(__clang__)
        // The current version of gcc initializes the sf structure eagerly.
        // We can take advantage of this fact to avoid calling
        // `capture_spawn_arg_stack_frame` when compiling with gcc.
        // Remove this if the "shrink-wrap" optimization is implemented.
        // Read the frame pointer from the worker *before* the spawn, while
        // 'w' is still known to be valid.
        sf = w->current_stack_frame;
        _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, w,
                                       loop_root_pedigree);
#else        
        // 'sf' is uninitialized here; capture_spawn_arg_stack_frame() is
        // presumably responsible for setting it and yielding the worker to
        // pass on -- confirm against its definition.
        _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain,
                                       capture_spawn_arg_stack_frame(sf, w),
                                       loop_root_pedigree);
#endif
        // 'w' may be stale after the spawn (see comment above), so refresh
        // it from the captured stack frame.
        w = sf->worker;
        // Continue with the upper half [mid, high) in this invocation.
        low = mid;

        goto tail_recurse;
    }

    // Call the cilk_for loop body lambda function passed in by the compiler to
    // execute one grain
    call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree);
}
// Divide-and-conquer driver for a cilk_for loop.
//
// Repeatedly splits the iteration range low..high (count = high - low) in
// half while it holds more than 'grain' iterations.  The lower half
// [low, mid) is handed to a spawned recursive call; the upper half
// [mid, high) is processed by looping back to the top of this same
// invocation (manual tail recursion via 'goto').  Once the remaining range
// holds at most 'grain' iterations it is passed to call_cilk_for_loop_body().
//
// Parameters:
//   low, high          - bounds of the iteration range; the code expects
//                        high - low > 0 on entry (see invariant below).
//   body, data         - forwarded unchanged to call_cilk_for_loop_body().
//   grain              - largest range size that is executed without
//                        splitting further; expected to be >= 1.
//   w                  - worker passed to the loop body; re-read from the
//                        stack frame after every spawn.
//   loop_root_pedigree - forwarded unchanged to the recursive calls and to
//                        call_cilk_for_loop_body().
//
// NOTE(review): a function with this same name and parameter list is also
// defined elsewhere in this file; confirm which definition is meant to be
// compiled.
static
void cilk_for_recursive(count_t low, count_t high,
                        F body, void *data, int grain,
                        __cilkrts_worker *w,
                        __cilkrts_pedigree *loop_root_pedigree)
{
tail_recurse:
    // Cilkscreen should not report this call in a stack trace.
    // This needs to be done every time the worker resumes, which is why the
    // notification sits after the 'tail_recurse' label rather than before it.
    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);

    count_t count = high - low;
    // Invariant: count > 0, grain >= 1
    if (count > grain)
    {
        // Invariant: count >= 2
        // Split point: [low, mid) is spawned, [mid, high) stays here.
        count_t mid = low + count / 2;
        // The worker is valid only until the first spawn and is expensive to
        // retrieve (using '__cilkrts_get_tls_worker') after the spawn.  The
        // '__cilkrts_stack_frame' is more stable, but isn't initialized until
        // the first spawn.  Thus, we want to grab the address of the
        // '__cilkrts_stack_frame' after it is initialized but before the
        // spawn detaches.  The only place we can do that is within the
        // argument list of the spawned function, hence the call to
        // capture_spawn_arg_stack_frame().
        // 'sf' is uninitialized here; capture_spawn_arg_stack_frame() is
        // presumably responsible for setting it and yielding the worker to
        // pass on -- confirm against its definition.
        __cilkrts_stack_frame *sf;
        _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain,
                                       capture_spawn_arg_stack_frame(sf, w),
                                       loop_root_pedigree);
        // 'w' may be stale after the spawn (see comment above), so refresh
        // it from the captured stack frame.
        w = sf->worker;
        // Continue with the upper half [mid, high) in this invocation.
        low = mid;

        goto tail_recurse;
    }

    // Call the cilk_for loop body lambda function passed in by the compiler to
    // execute one grain
    call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree);
}