WorkGraphPolicy( const graph_type & arg_graph )
  : m_graph( arg_graph )
  , m_queue( view_alloc( "queue" , WithoutInitializing )
           , arg_graph.numRows() * 2 + 2 )
{
  { // Initialize
    using policy_type  = RangePolicy< std::int32_t , execution_space , TagInit >;
    using closure_type = Kokkos::Impl::ParallelFor< self_type , policy_type >;
    const closure_type closure( *this , policy_type( 0 , m_queue.size() ) );
    closure.execute();
    execution_space::fence();
  }

  { // execute-after counts
    using policy_type  = RangePolicy< std::int32_t , execution_space , TagCount >;
    using closure_type = Kokkos::Impl::ParallelFor< self_type , policy_type >;
    const closure_type closure( *this , policy_type( 0 , m_graph.entries.size() ) );
    closure.execute();
    execution_space::fence();
  }

  { // Schedule ready tasks
    using policy_type  = RangePolicy< std::int32_t , execution_space , TagReady >;
    using closure_type = Kokkos::Impl::ParallelFor< self_type , policy_type >;
    const closure_type closure( *this , policy_type( 0 , m_graph.numRows() ) );
    closure.execute();
    execution_space::fence();
  }
}
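/* Usage sketch (illustrative, not part of this header): the policy is handed
 * to Kokkos::parallel_for with a functor invoked once per work item, only
 * after all of that item's predecessors complete. The Kokkos::Crs and
 * WorkGraphPolicy template spellings below are assumptions matching this
 * file's types; the functor and graph contents are hypothetical.
 *
 *   #include <Kokkos_Core.hpp>
 *
 *   struct HypotheticalFunctor {
 *     KOKKOS_INLINE_FUNCTION
 *     void operator()( std::int32_t w ) const { / * process work item w * / }
 *   };
 *
 *   int main( int argc , char * argv[] ) {
 *     Kokkos::initialize( argc , argv );
 *     {
 *       using exec_t  = Kokkos::DefaultExecutionSpace;
 *       using graph_t = Kokkos::Crs< std::int32_t , exec_t , void , std::int32_t >;
 *       graph_t g ; // fill row_map/entries: entries(row_map(w)..row_map(w+1)-1)
 *                   // are the items that must wait for item 'w' to complete
 *       Kokkos::parallel_for( Kokkos::WorkGraphPolicy< exec_t , std::int32_t >( g )
 *                           , HypotheticalFunctor() );
 *     }
 *     Kokkos::finalize();
 *     return 0;
 *   }
 */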
/**\brief  Attempt to pop the work item at the head of the queue.
 *
 *  Find entry 'i' such that
 *    ( m_queue[i] != BEGIN_TOKEN ) AND
 *    ( i == 0 OR m_queue[i-1] == BEGIN_TOKEN )
 *  if found then
 *    increment begin hint
 *    return atomic_exchange( m_queue[i] , BEGIN_TOKEN )
 *  else if i < total work
 *    return END_TOKEN
 *  else
 *    return COMPLETED_TOKEN
 */
KOKKOS_INLINE_FUNCTION
std::int32_t pop_work() const noexcept
{
  const std::int32_t N = m_graph.numRows();

  std::int32_t volatile * const ready_queue = & m_queue[0] ;
  std::int32_t volatile * const begin_hint  = & m_queue[2*N] ;

  // The begin hint is guaranteed to be less than or equal to
  // the actual begin location in the queue.

  for ( std::int32_t i = *begin_hint ; i < N ; ++i ) {

    const std::int32_t w = ready_queue[i] ;

    if ( w == END_TOKEN ) { return END_TOKEN ; }

    if ( ( w != BEGIN_TOKEN ) &&
         ( w == atomic_compare_exchange( ready_queue + i , w
                                       , (std::int32_t) BEGIN_TOKEN ) ) ) {
      // Attempt to claim ready work index succeeded:
      // update the hint and return the work index.
      atomic_increment( begin_hint );
      return w ;
    }
    // Arrive here when ready_queue[i] == BEGIN_TOKEN
  }

  return COMPLETED_TOKEN ;
}
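/* The claim step above is an ordinary compare-and-swap loop. A freestanding
 * sketch of the same pattern with std::atomic (the tokens, queue, and hint
 * here are illustrative stand-ins for the Kokkos atomics used in this file):
 *
 *   #include <atomic>
 *   #include <cstdint>
 *
 *   enum : std::int32_t { BEGIN = -1 , END = -2 , COMPLETED = -3 };
 *
 *   std::int32_t pop_work( std::atomic<std::int32_t> * ready_queue
 *                        , std::atomic<std::int32_t> & begin_hint
 *                        , std::int32_t N )
 *   {
 *     for ( std::int32_t i = begin_hint.load() ; i < N ; ++i ) {
 *       std::int32_t w = ready_queue[i].load();
 *       if ( w == END ) return END ; // slot not yet filled; retry later
 *       if ( w != BEGIN &&
 *            ready_queue[i].compare_exchange_strong( w , BEGIN ) ) {
 *         ++begin_hint ; // hint may lag the true begin, never passes it
 *         return w ;     // successfully claimed work item 'w'
 *       }
 *       // otherwise another thread claimed slot 'i'; keep scanning
 *     }
 *     return COMPLETED ; // every slot claimed: no work will ever remain
 *   }
 */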
KOKKOS_INLINE_FUNCTION
void operator()( const TagReady , int w ) const noexcept
{
  std::int32_t const * const count_queue = & m_queue[ m_graph.numRows() ] ;

  // A work item with a zero execute-after count has no unfinished
  // predecessors and is immediately ready.
  if ( 0 == count_queue[w] ) push_work(w);
}
KOKKOS_INLINE_FUNCTION
void operator()( const TagCount , int i ) const noexcept
{
  std::int32_t volatile * const count_queue = & m_queue[ m_graph.numRows() ] ;

  // Each graph entry names a successor; accumulate, per work item,
  // the number of predecessors it must wait for.
  atomic_increment( count_queue + m_graph.entries[i] );
}
KOKKOS_INLINE_FUNCTION
void push_work( const std::int32_t w ) const noexcept
{
  const std::int32_t N = m_graph.numRows();

  std::int32_t volatile * const ready_queue = & m_queue[0] ;
  std::int32_t volatile * const end_hint    = & m_queue[2*N+1] ;

  // Push work to the end of the queue
  const std::int32_t j = atomic_fetch_add( end_hint , 1 );

  if ( ( N <= j ) ||
       ( END_TOKEN != atomic_exchange( ready_queue + j , w ) ) ) {
    // ERROR: past the end of the queue or did not replace END_TOKEN
    Kokkos::abort("WorkGraphPolicy push_work error");
  }

  memory_fence();
}
KOKKOS_INLINE_FUNCTION
void completed_work( std::int32_t w ) const noexcept
{
  // Make sure the completed work function's memory accesses are flushed.
  Kokkos::memory_fence();

  const std::int32_t N = m_graph.numRows();

  std::int32_t volatile * const count_queue = & m_queue[N] ;

  const std::int32_t B = m_graph.row_map(w);
  const std::int32_t E = m_graph.row_map(w+1);

  for ( std::int32_t i = B ; i < E ; ++i ) {
    const std::int32_t j = m_graph.entries(i);
    if ( 1 == atomic_fetch_add( count_queue + j , -1 ) ) {
      push_work(j);
    }
  }
}
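/* This is the standard atomic in-degree decrement from topological
 * scheduling: the one thread whose decrement takes a successor's count to
 * zero is the thread that schedules it, so each item is pushed exactly once.
 * A freestanding sketch (the container types and push_work callback are
 * illustrative stand-ins, not this class's members):
 *
 *   #include <atomic>
 *   #include <cstdint>
 *   #include <functional>
 *   #include <vector>
 *
 *   void completed_work( std::int32_t w
 *                      , std::vector<std::int32_t> const & row_map
 *                      , std::vector<std::int32_t> const & entries
 *                      , std::atomic<std::int32_t> * count
 *                      , std::function<void(std::int32_t)> const & push_work )
 *   {
 *     for ( std::int32_t i = row_map[w] ; i < row_map[w+1] ; ++i ) {
 *       const std::int32_t j = entries[i];
 *       // fetch_sub returns the prior value: 1 means 'j' just became ready
 *       if ( 1 == count[j].fetch_sub(1) ) push_work(j);
 *     }
 *   }
 */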
/**\brief  Initialize the queue:
 *
 *  m_queue[0..N-1]     = END_TOKEN, the ready queue
 *  m_queue[N..2*N-1]   = 0, the waiting-count queue
 *  m_queue[2*N..2*N+1] = 0, begin/end hints for the ready queue
 */
KOKKOS_INLINE_FUNCTION
void operator()( const TagInit , int i ) const noexcept
{
  m_queue[i] = i < m_graph.numRows() ? END_TOKEN : 0 ;
}
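/* Worked illustration (hypothetical 3-item chain 0 -> 1 -> 2, i.e. row 0
 * lists {1} and row 1 lists {2}): after TagInit,
 *   m_queue = { END, END, END, 0, 0, 0, 0, 0 }.
 * TagCount then raises the waiting counts of items 1 and 2, and TagReady
 * pushes the only zero-count item, 0, into the ready queue:
 *   m_queue = { 0, END, END, 0, 1, 1, 0, 1 }
 * (ready slots, waiting counts, begin hint = 0, end hint = 1).
 */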