WorkGraphPolicy( const graph_type & arg_graph )
    : m_graph(arg_graph)
    , m_queue( view_alloc( "queue" , WithoutInitializing )
             , arg_graph.numRows() * 2 + 2 )
  {
    { // Initialize
      using policy_type = RangePolicy<std::int32_t, execution_space, TagInit>;
      using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
      const closure_type closure(*this, policy_type(0, m_queue.size()));
      closure.execute();
      execution_space::fence();
    }

    { // execute-after counts
      using policy_type = RangePolicy<std::int32_t, execution_space, TagCount>;
      using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
      const closure_type closure(*this,policy_type(0,m_graph.entries.size()));
      closure.execute();
      execution_space::fence();
    }

    { // Scheduling ready tasks
      using policy_type = RangePolicy<std::int32_t, execution_space, TagReady>;
      using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
      const closure_type closure(*this,policy_type(0,m_graph.numRows()));
      closure.execute();
      execution_space::fence();
    }
  }
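  /**\brief  Attempt to pop the work item at the head of the queue.
   *
   *  Starting at the begin hint, find entry 'i' such that
   *    ( m_queue[i] != BEGIN_TOKEN ) AND
   *    ( i == 0 OR m_queue[i-1] == BEGIN_TOKEN )
   *  if found and m_queue[i] holds a work index 'w' then
   *    atomic_compare_exchange( m_queue[i] , w , BEGIN_TOKEN )
   *    on success: increment begin hint and return w
   *    on failure (the entry changed underneath us): keep scanning
   *  else if found and m_queue[i] == END_TOKEN (i < total work)
   *    return END_TOKEN
   *  else
   *    return COMPLETED_TOKEN
   */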
  // Try to claim one ready work index from the ready queue.
  //
  // Returns:
  //   - a work index, when this caller's compare-exchange replaced it
  //     with BEGIN_TOKEN;
  //   - END_TOKEN, when the scan hits a slot that has not been filled yet;
  //   - COMPLETED_TOKEN, when the scan runs off the end of the ready
  //     queue (all N slots at or after the hint were BEGIN_TOKEN or were
  //     claimed by someone else).
  KOKKOS_INLINE_FUNCTION
  std::int32_t pop_work() const noexcept
    {
      const std::int32_t N = m_graph.numRows();

      // Ready queue occupies m_queue[0..N-1]; the begin hint is stored
      // at m_queue[2*N].  Both are accessed through volatile pointers.
      std::int32_t volatile * const ready_queue = & m_queue[0] ;
      std::int32_t volatile * const begin_hint  = & m_queue[2*N] ;

      // begin hint is guaranteed to be less than or equal to
      // actual begin location in the queue.

      for ( std::int32_t i = *begin_hint ; i < N ; ++i ) {

        const std::int32_t w = ready_queue[i] ;

        // Slot not yet filled by push_work: nothing is ready right now.
        if ( w == END_TOKEN ) { return END_TOKEN ; }

        // Only attempt the compare-exchange on a slot that was observed
        // to hold a work index; the claim succeeded if the value handed
        // back equals the value we observed.
        if ( ( w != BEGIN_TOKEN ) &&
             ( w == atomic_compare_exchange(ready_queue+i,w,(std::int32_t)BEGIN_TOKEN) ) ) {
          // Attempt to claim ready work index succeeded,
          // update the hint and return work index
          atomic_increment( begin_hint );
          return w ;
        }
        // arrive here when ready_queue[i] == BEGIN_TOKEN, either because
        // it was already claimed when read or because the
        // compare-exchange above did not succeed; move on to the next slot.
      }

      return COMPLETED_TOKEN ;
    }
  // TagReady pass: push work item 'w' onto the ready queue when its
  // execute-after count is zero.
  KOKKOS_INLINE_FUNCTION
  void operator()( const TagReady , int w ) const noexcept
    {
      // The count entries start at offset numRows() within m_queue.
      const std::int32_t * const waiting = & m_queue[ m_graph.numRows() ] ;

      // Still waiting on something: leave it for completed_work().
      if ( waiting[w] != 0 ) { return ; }

      push_work(w);
    }
  // TagCount pass: for graph entry 'i', atomically add one to the count
  // slot of the work item that entry names.
  KOKKOS_INLINE_FUNCTION
  void operator()( const TagCount , int i ) const noexcept
    {
      // The count entries start at offset numRows() within m_queue.
      std::int32_t volatile * const counts = & m_queue[ m_graph.numRows() ] ;

      // Several entries may name the same work item, hence the atomic.
      std::int32_t volatile * const slot = counts + m_graph.entries[i] ;

      atomic_increment( slot );
    }
  // Append work index 'w' to the ready queue.
  //
  // A slot is reserved by atomically advancing the end hint, then 'w'
  // is written into that slot.  Aborts if the reserved slot lies past
  // the end of the ready queue or did not contain END_TOKEN.
  KOKKOS_INLINE_FUNCTION
  void push_work( const std::int32_t w ) const noexcept
    {
      const std::int32_t N = m_graph.numRows();

      // Ready queue occupies m_queue[0..N-1]; the end hint is stored
      // at m_queue[2*N+1].
      std::int32_t volatile * const ready_queue = & m_queue[0] ;
      std::int32_t volatile * const end_hint    = & m_queue[2*N+1] ;

      // Push work to end of queue
      const std::int32_t j = atomic_fetch_add( end_hint , 1 );

      // The bounds check comes first so that the exchange is never
      // attempted on an index outside the ready queue.
      if ( ( N <= j ) ||
           ( END_TOKEN != atomic_exchange(ready_queue+j,w) ) ) {
        // ERROR: past the end of queue or did not replace END_TOKEN
        Kokkos::abort("WorkGraphPolicy push_work error");
      }

      // Fence issued after the slot has been written.
      memory_fence();
    }
  // Record that work item 'w' has finished.
  //
  // Walks the graph entries of row 'w' (row_map(w) up to, but not
  // including, row_map(w+1)), atomically decrements the count of each
  // work item named there, and pushes any item whose count was 1
  // before the decrement onto the ready queue.
  KOKKOS_INLINE_FUNCTION
  void completed_work( std::int32_t w ) const noexcept
    {
      // Make sure the completed work function's memory accesses are flushed.
      Kokkos::memory_fence();

      const std::int32_t N = m_graph.numRows();

      // The count entries start at offset N within m_queue.
      std::int32_t volatile * const count_queue = & m_queue[N] ;

      const std::int32_t B = m_graph.row_map(w);
      const std::int32_t E = m_graph.row_map(w+1);

      for ( std::int32_t i = B ; i < E ; ++i ) {
        const std::int32_t j = m_graph.entries(i);
        // atomic_fetch_add hands back the value held before the add, so
        // seeing 1 means this call took the count to zero and 'j' can
        // now be scheduled.
        if ( 1 == atomic_fetch_add(count_queue+j,-1) ) {
          push_work(j);
        }
      }
    }
 /**\brief  Initialize queue
  *
  *  m_queue[0..N-1] = END_TOKEN, the ready queue
  *  m_queue[N..2*N-1] = 0, the waiting count queue
  *  m_queue[2*N..2*N+1] = 0, begin/end hints for ready queue
  */
 KOKKOS_INLINE_FUNCTION
 void operator()( const TagInit , int i ) const noexcept
   {
     if ( i < m_graph.numRows() ) {
       // Ready-queue slot: mark as not yet filled.
       m_queue[i] = END_TOKEN ;
     }
     else {
       // Count slot or begin/end hint: start from zero.
       m_queue[i] = 0 ;
     }
   }