// Qthread entry point for executing a Kokkos task.
// 'arg' is the Task pointer handed to qthread_spawn / qthread_spawn_cloneable
// in Task::schedule(); every cloned team worker enters through this function.
// Returns 0 (the aligned_t result required by the qthread function signature).
aligned_t Task::qthread_func( void * arg )
{
  Task * const task = reinterpret_cast< Task * >(arg);

  // First member of the team change state to executing.
  // Use compare-exchange to avoid race condition with a respawn:
  // only flips WAITING -> EXECUTING; a state already modified by a
  // concurrent respawn request is left untouched.
  Kokkos::atomic_compare_exchange_strong( & task->m_state
    , int(Kokkos::Experimental::TASK_STATE_WAITING)
    , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
    );

  if ( task->m_apply_team && ! task->m_apply_single ) {
    // Team task with a real (multi-worker) team.
    Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;

    // Initialize team size and rank with shephered info
    Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );

    (*task->m_apply_team)( task , member );

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(task)
       , member.team_rank()
       , member.team_size()
       );
fflush(stdout);
#endif

    // Barrier so all members finish the functor before closeout;
    // only team rank 0 performs closeout (exactly-once completion),
    // then a second barrier keeps members from leaving early.
    member.team_barrier();
    if ( member.team_rank() == 0 ) task->closeout();
    member.team_barrier();
  }
  else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
    // Team hard-wired to one, no cloning.
    // NOTE(review): m_apply_single == (function_single_type)1 is a sentinel
    // meaning "serial team of one", not a callable pointer — presumably set
    // by the task-creation path; confirm against the Task setup code.
    Kokkos::Impl::QthreadTeamPolicyMember member ;
    (*task->m_apply_team)( task , member );
    task->closeout();
  }
  else {
    // Plain single (non-team) task.
    (*task->m_apply_single)( task );
    task->closeout();
  }

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx return\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(task)
       );
fflush(stdout);
#endif

  return 0 ;
}
// Complete or respawn this task after its functor has run.
// Called exactly once per execution (by team rank 0 for team tasks).
// WARNING: in the completion branch 'this' may be destroyed by a dependent
// task as soon as qthread_fill() is called — nothing that touches members
// may follow it.
void Task::closeout()
{
  // A respawn request is observed as WAITING|EXECUTING: presumably the
  // respawn path ORs WAITING into m_state while the task is EXECUTING —
  // TODO confirm against the respawn implementation.
  enum { RESPAWN = int( Kokkos::Experimental::TASK_STATE_WAITING )
                 | int( Kokkos::Experimental::TASK_STATE_EXECUTING ) };

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx %s\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(this)
       , ( m_state == RESPAWN ? "respawn" : "complete" )
       );
fflush(stdout);
#endif

  // When dependent tasks run there would be a race
  // condition between destroying this task and
  // querying the active count pointer from this task.
  // Capture the pointer BEFORE any step that could let 'this' be destroyed.
  int volatile * const active_count = m_active_count ;

  if ( m_state == RESPAWN ) {
    // Task requests respawn, set state to waiting and reschedule the task
    m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
    schedule();
  }
  else {
    // Task did not respawn, is complete
    m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;

    // Release dependences before allowing dependent tasks to run.
    // Otherwise there is a thread race condition for removing dependences.
    for ( int i = 0 ; i < m_dep_size ; ++i ) {
      assign( & m_dep[i] , 0 );
    }

    // Set qthread FEB to full so that dependent tasks are allowed to execute.
    // This 'task' may be deleted immediately following this function call.
    qthread_fill( & m_qfeb );

    // The dependent task could now complete and destroy 'this' task
    // before the call to 'qthread_fill' returns. Therefore, for
    // thread safety assume that 'this' task has now been destroyed.
  }

  // Decrement active task count before returning.
  // Uses the locally captured pointer — 'this' must not be dereferenced here.
  Kokkos::atomic_decrement( active_count );
}
// Construct the per-worker execution record.
// Queries qthreads for this worker's shepherd / local-worker coordinates,
// derives its flat worker rank, and caches the static pool configuration
// into the member fields used during parallel execution.
QthreadExec::QthreadExec()
{
  const int shep        = qthread_shep();
  const int shep_worker = qthread_worker_local(NULL);
  const int rank        = shep * s_number_workers_per_shepherd + shep_worker ;

  // Flat worker rank / size within the whole pool.
  m_worker_rank = rank ;
  m_worker_size = s_number_workers ;

  // Rank / size within this shepherd and across shepherds.
  m_shepherd_rank        = shep ;
  m_shepherd_size        = s_number_shepherds ;
  m_shepherd_worker_rank = shep_worker ;
  m_shepherd_worker_size = s_number_workers_per_shepherd ;

  // Base of the full worker array, and base of this shepherd's slice
  // (shepherds are laid out in reverse order within s_exec).
  m_worker_base   = s_exec ;
  m_shepherd_base = s_exec + ( s_number_shepherds - 1 - shep ) * s_number_workers_per_shepherd ;

  // Scratch memory begins immediately after this object's base-sized header.
  m_scratch_alloc = reinterpret_cast< unsigned char * >( this ) + s_base_size ;
  m_reduce_end    = s_worker_reduce_end ;

  m_worker_state = QthreadExec::Active ;
}
// Submit this task to qthreads for execution once its dependences are full.
// Increments the scheduler's active-task count, builds the qthread
// precondition (FEB) array from m_dep, and spawns either a cloneable team
// task or a simple single task.
void Task::schedule()
{
  // Is waiting for execution

  // Increment active task count before spawning so the count cannot drop
  // to zero while this task is in flight (closeout() decrements it).
  Kokkos::atomic_increment( m_active_count );

  // spawn in qthread. must malloc the precondition array and give to qthread.
  // qthread will eventually free this allocation so memory will not be leaked.
  // concern with thread safety of malloc, does this need to be guarded?
  // NOTE(review): malloc return is not checked; a NULL here faults on the
  // next line. No recovery path exists at this layer — TODO consider abort
  // with a message.
  aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );

  // Slot [0] encodes the dependence count; slots [1..m_dep_size] are the
  // FEB addresses qthreads waits on before running the task.
  qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );

  for ( int i = 0 ; i < m_dep_size ; ++i ) {
    qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
  }

  if ( m_apply_team && ! m_apply_single ) {
    // Team task: spawn a cloneable task so one clone runs per team worker.
    // If more than one shepherd spawn on a shepherd other than this shepherd
    const int num_shepherd            = qthread_num_shepherds();
    const int num_worker_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
    const int this_shepherd           = qthread_shep();

    int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(this)
       , spawn_shepherd
       , num_worker_per_shepherd - 1
       );
fflush(stdout);
#endif

    // Final argument is the number of additional clones (workers - 1).
    qthread_spawn_cloneable
      ( & Task::qthread_func
      , this
      , 0
      , NULL
      , m_dep_size , qprecon /* dependences */
      , spawn_shepherd
      , unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY )
      , num_worker_per_shepherd - 1
      );
  }
  else {
    // Single task (or serial team-of-one): plain spawn on any shepherd.
    qthread_spawn( & Task::qthread_func /* function */
                 , this                 /* function argument */
                 , 0
                 , NULL
                 , m_dep_size , qprecon /* dependences */
                 , NO_SHEPHERD
                 , QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
                 );
  }
}