aligned_t Task::qthread_func( void * arg )
{
  Task * const task = reinterpret_cast< Task * >(arg);

  // The first member of the team changes state to executing.
  // Use compare-exchange to avoid a race condition with a respawn.
  Kokkos::atomic_compare_exchange_strong( & task->m_state
                                        , int(Kokkos::Experimental::TASK_STATE_WAITING)
                                        , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
                                        );

  if ( task->m_apply_team && ! task->m_apply_single ) {
    Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;

    // Initialize team size and rank with shepherd info
    Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );

    (*task->m_apply_team)( task , member );

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(task)
       , member.team_rank()
       , member.team_size()
       );
fflush(stdout);
#endif

    // Wait for the whole team to finish the apply before rank 0 closes out the task.
    member.team_barrier();
    if ( member.team_rank() == 0 ) task->closeout();
    // Hold the team until rank 0 has finished closeout (and any respawn scheduling).
    member.team_barrier();
  }
  else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
    // m_apply_single == 1 is a sentinel: team interface, but the team is hard-wired to one member, so no cloning.
    Kokkos::Impl::QthreadTeamPolicyMember member ;
    (*task->m_apply_team)( task , member );
    task->closeout();
  }
  else {
    (*task->m_apply_single)( task );
    task->closeout();
  }

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx return\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(task)
       );
fflush(stdout);
#endif

  return 0 ;
}
void Task::closeout()
{
  enum { RESPAWN = int( Kokkos::Experimental::TASK_STATE_WAITING ) |
                   int( Kokkos::Experimental::TASK_STATE_EXECUTING ) };

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx %s\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(this)
       , ( m_state == RESPAWN ? "respawn" : "complete" )
       );
fflush(stdout);
#endif

  // Load the active count pointer before releasing dependent tasks:
  // once they run they may destroy this task, creating a race condition
  // between destroying this task and querying its active count pointer.
  int volatile * const active_count = m_active_count ;

  if ( m_state == RESPAWN ) {
    // Task requests respawn, set state to waiting and reschedule the task
    m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
    schedule();
  }
  else {

    // Task did not respawn, is complete
    m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;

    // Release dependences before allowing dependent tasks to run.
    // Otherwise there is a thread race condition for removing dependences.
    for ( int i = 0 ; i < m_dep_size ; ++i ) {
      assign( & m_dep[i] , 0 );
    }

    // Set qthread FEB to full so that dependent tasks are allowed to execute.
    // This 'task' may be deleted immediately following this function call.
    qthread_fill( & m_qfeb );

    // The dependent task could now complete and destroy 'this' task
    // before the call to 'qthread_fill' returns.  Therefore, for
    // thread safety assume that 'this' task has now been destroyed.
  }

  // Decrement active task count before returning.
  Kokkos::atomic_decrement( active_count );
}
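#if 0
// Illustrative sketch (disabled), not part of the original implementation:
// the Qthread full-empty-bit (FEB) handshake that 'closeout' relies on,
// reduced to a standalone producer/consumer.  The names 'example_gate',
// 'example_consumer', and 'example_feb_gate' are invented for this sketch.

static aligned_t example_gate ;                     // plays the role of m_qfeb

static aligned_t example_consumer( void * )         // plays the role of a dependent task
{
  aligned_t value = 0 ;
  // Block until the FEB is full, just as a dependent task's precondition
  // blocks until the parent task calls qthread_fill in closeout().
  qthread_readFF( & value , & example_gate );
  return 0 ;
}

static void example_feb_gate()
{
  qthread_empty( & example_gate );                  // FEB starts empty: dependents must wait
  qthread_fork( & example_consumer , NULL , NULL ); // spawn the dependent
  qthread_fill( & example_gate );                   // release it, as closeout() does
}
#endif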
QthreadExec::QthreadExec()
{
  const int shepherd_rank        = qthread_shep();
  const int shepherd_worker_rank = qthread_worker_local(NULL);
  const int worker_rank          = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank ;
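  // Example with illustrative values: 2 shepherds of 4 workers each gives
  // shepherd 1 / local worker 2 the worker_rank = 1 * 4 + 2 = 6.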

  m_worker_base          = s_exec ;
  // Shepherds are laid out in reverse rank order within the s_exec array.
  m_shepherd_base        = s_exec + s_number_workers_per_shepherd * ( s_number_shepherds - ( shepherd_rank + 1 ) );
  // Scratch allocations begin immediately after this object.
  m_scratch_alloc        = ( (unsigned char *) this ) + s_base_size ;
  m_reduce_end           = s_worker_reduce_end ;
  m_shepherd_rank        = shepherd_rank ;
  m_shepherd_size        = s_number_shepherds ;
  m_shepherd_worker_rank = shepherd_worker_rank ;
  m_shepherd_worker_size = s_number_workers_per_shepherd ;
  m_worker_rank          = worker_rank ;
  m_worker_size          = s_number_workers ;
  m_worker_state         = QthreadExec::Active ;
}
void Task::schedule()
{
  // The task is waiting for execution.

  // Increment active task count before spawning.
  Kokkos::atomic_increment( m_active_count );

  // Spawn in Qthread.  Must malloc the precondition array and give it to Qthread;
  // Qthread will eventually free this allocation so memory will not be leaked.

  // Concern: is malloc thread safe here, or does this call need to be guarded?
  aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );

  qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) ); // First entry encodes the dependence count

  for ( int i = 0 ; i < m_dep_size ; ++i ) {
    qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
  }

  if ( m_apply_team && ! m_apply_single ) {
    // If there is more than one shepherd, spawn on a shepherd other than this one.
    const int num_shepherd            = qthread_num_shepherds();
    const int num_worker_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
    const int this_shepherd           = qthread_shep();

    int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(this)
       , spawn_shepherd
       , num_worker_per_shepherd - 1
       );
fflush(stdout);
#endif

    qthread_spawn_cloneable
      ( & Task::qthread_func /* function */
      , this                 /* function argument */
      , 0
      , NULL
      , m_dep_size , qprecon /* dependences */
      , spawn_shepherd
      , unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY )
      , num_worker_per_shepherd - 1 /* number of clones: one per remaining worker of the shepherd */
      );
  }
  else {
    qthread_spawn( & Task::qthread_func /* function */
                 , this                 /* function argument */
                 , 0
                 , NULL
                 , m_dep_size , qprecon /* dependences */
                 , NO_SHEPHERD
                 , QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
                 );
  }
}
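#if 0
// Illustrative sketch (disabled), not part of the original implementation:
// the precondition-array convention used by 'schedule' above, reduced to a
// standalone spawn gated on two FEBs.  The array is malloc'ed and handed to
// Qthread, which frees it, exactly as in schedule().  The names 'example_func',
// 'example_feb_a', and 'example_feb_b' are invented for this sketch.

static aligned_t example_feb_a , example_feb_b ;

static aligned_t example_func( void * ) { return 0 ; }

static void example_spawn_with_preconditions()
{
  aligned_t ** precon = (aligned_t **) malloc( 3 * sizeof(aligned_t *) );

  precon[0] = reinterpret_cast<aligned_t *>( uintptr_t(2) ); // dependence count
  precon[1] = & example_feb_a ;                              // must be full ...
  precon[2] = & example_feb_b ;                              // ... before example_func may run

  qthread_spawn( & example_func , NULL , 0 , NULL
               , 2 , precon /* dependences */
               , NO_SHEPHERD
               , QTHREAD_SPAWN_SIMPLE );
}
#endif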