aligned_t Task::qthread_func( void * arg )
{
  Task * const task = reinterpret_cast< Task * >(arg);

  // First member of the team changes state to executing.
  // Use compare-exchange to avoid race condition with a respawn.
  Kokkos::atomic_compare_exchange_strong( & task->m_state
                                        , int(Kokkos::Experimental::TASK_STATE_WAITING)
                                        , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
                                        );

  if ( task->m_apply_team && ! task->m_apply_single ) {
    Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;

    // Initialize team size and rank with shepherd info
    Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );

    (*task->m_apply_team)( task , member );

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(task)
       , member.team_rank()
       , member.team_size()
       );
fflush(stdout);
#endif

    member.team_barrier();
    if ( member.team_rank() == 0 ) task->closeout();
    member.team_barrier();
  }
  else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
    // Team hard-wired to one, no cloning
    Kokkos::Impl::QthreadTeamPolicyMember member ;
    (*task->m_apply_team)( task , member );
    task->closeout();
  }
  else {
    (*task->m_apply_single)( task );
    task->closeout();
  }

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx return\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(task)
       );
fflush(stdout);
#endif

  return 0 ;
}
void chpl_task_addToTaskList(chpl_fn_int_t     fid,
                             void             *arg,
                             c_sublocid_t      subLoc,
                             chpl_task_list_p *task_list,
                             int32_t           task_list_locale,
                             chpl_bool         is_begin_stmt,
                             int               lineno,
                             chpl_string       filename)
{
    qthread_shepherd_id_t const here_shep_id = qthread_shep();
    chpl_task_private_data_t *parent_chpl_data = chpl_task_getPrivateData();
    chpl_bool serial_state = parent_chpl_data->serial_state;
    chapel_wrapper_args_t wrapper_args =
        {chpl_ftable[fid], arg, filename, lineno, *parent_chpl_data};

    PROFILE_INCR(profile_task_addToTaskList, 1);

    if (serial_state) {
        syncvar_t ret = SYNCVAR_STATIC_EMPTY_INITIALIZER;
        qthread_fork_syncvar_copyargs_to(chapel_wrapper, &wrapper_args,
                                         sizeof(chapel_wrapper_args_t), &ret,
                                         here_shep_id);
        qthread_syncvar_readFF(NULL, &ret);
    } else if (subLoc == c_sublocid_any) {
        qthread_fork_syncvar_copyargs(chapel_wrapper, &wrapper_args,
                                      sizeof(chapel_wrapper_args_t), NULL);
    } else {
        if (subLoc == c_sublocid_curr)
            subLoc = (c_sublocid_t) here_shep_id;
        qthread_fork_syncvar_copyargs_to(chapel_wrapper, &wrapper_args,
                                         sizeof(chapel_wrapper_args_t), NULL,
                                         (qthread_shepherd_id_t) subLoc);
    }
}
void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
{
  verify_is_process("QthreadExec::exec_all(...)",true);

  s_active_function     = func ;
  s_active_function_arg = arg ;

  // Need to query which shepherd this main 'process' is running...
  const int main_shep = qthread_shep();

  for ( int jshep = 0 , iwork = 0 ; jshep < s_number_shepherds ; ++jshep ) {
    for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i , ++iwork ) {
      // Unit tests hang with this call:
      //
      // qthread_fork_to_local_priority( driver_exec_all , NULL , NULL , jshep );
      //
      qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
    }
  }

  driver_exec_all( NULL );

  s_active_function     = 0 ;
  s_active_function_arg = 0 ;
}
void chpl_task_addToTaskList(chpl_fn_int_t     fid,
                             void             *arg,
                             c_sublocid_t      subloc,
                             chpl_task_list_p *task_list,
                             int32_t           task_list_locale,
                             chpl_bool         is_begin_stmt,
                             int               lineno,
                             chpl_string       filename)
{
    qthread_shepherd_id_t const here_shep_id = qthread_shep();
    chpl_bool serial_state = chpl_task_getSerial();
    chpl_qthread_wrapper_args_t wrapper_args =
        {chpl_ftable[fid], arg, filename, lineno, false, {subloc, serial_state}};

    assert(subloc != c_sublocid_none);

    PROFILE_INCR(profile_task_addToTaskList, 1);

    if (serial_state) {
        syncvar_t ret = SYNCVAR_STATIC_EMPTY_INITIALIZER;
        qthread_fork_syncvar_copyargs_to(chapel_wrapper, &wrapper_args,
                                         sizeof(chpl_qthread_wrapper_args_t),
                                         &ret, here_shep_id);
        qthread_syncvar_readFF(NULL, &ret);
    } else if (subloc == c_sublocid_any) {
        qthread_fork_copyargs(chapel_wrapper, &wrapper_args,
                              sizeof(chpl_qthread_wrapper_args_t), NULL);
    } else {
        qthread_fork_copyargs_to(chapel_wrapper, &wrapper_args,
                                 sizeof(chpl_qthread_wrapper_args_t), NULL,
                                 (qthread_shepherd_id_t) subloc);
    }
}
// Test that writeFF waits for an empty var to be filled, writes, and leaves it full.
// Requires that only one worker is running. Basically does:
// 1: empty var
// 1: fork(writeFF)
// 1: yields
// 2: starts running
// 2: hits writeFF, and yields since var is empty
// 1: writeEF
// 1: hits readFF on forked task and yields
// 2: running again, finishes writeFF, task returns
// 1: readFF completes, finishes
static void testWriteFFWaits(void)
{
    aligned_t ret;

    concurrent_t = 45;
    qthread_empty(&concurrent_t);
    assert(qthread_num_workers() == 1);

    iprintf("1: Forking writeFF wrapper\n");
    qthread_fork_to(writeFF_wrapper, NULL, &ret, qthread_shep());
    iprintf("1: Forked, now yielding to 2\n");
    qthread_yield();
    iprintf("1: Back from yield\n");

    // Verify that writeFF has not completed
    assert(qthread_feb_status(&concurrent_t) == 0);
    assert(concurrent_t != 55);

    iprintf("1: Writing EF\n");
    qthread_writeEF_const(&concurrent_t, 35);

    // Wait for writeFF wrapper to complete
    qthread_readFF(NULL, &ret);

    // Verify that writeFF completed and that the FEB is full
    iprintf("1: concurrent_t=%d\n", concurrent_t);
    assert(qthread_feb_status(&concurrent_t) == 1);
    assert(concurrent_t == 55);
}
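/*
 * A minimal sketch (an assumption, not the test's actual helper, which is
 * defined elsewhere) of what the forked writeFF_wrapper could look like,
 * assuming it simply performs the blocking writeFF of the value 55 that
 * testWriteFFWaits asserts on after completion.  qthread_writeFF_const blocks
 * until the FEB is full, writes the value, and leaves the FEB full.
 */
static aligned_t writeFF_wrapper(void *arg)
{
    iprintf("2: Calling writeFF\n");
    // Blocks on the empty 'concurrent_t' until the main task's writeEF fills
    // it, then overwrites it with 55 and leaves the FEB full.
    qthread_writeFF_const(&concurrent_t, 55);
    iprintf("2: writeFF returned\n");
    return 0;
}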
void chpl_task_yield(void)
{
    PROFILE_INCR(profile_task_yield, 1);
    if (qthread_shep() == NO_SHEPHERD) {
        sched_yield();
    } else {
        qthread_yield();
    }
}
static aligned_t checkres(void *arg)
{
    qthread_shepherd_id_t myshep = qthread_shep();

    assert(myshep == 1 || myshep == 0);

    iprintf("myshep = %u\n", (unsigned)myshep);
    iprintf("arg = %u\n", (unsigned)(uintptr_t)arg);
    assert(myshep == (qthread_shepherd_id_t)(intptr_t)arg);

    return 0;
}
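/*
 * Hypothetical driver (an assumption, not part of the original test): checkres
 * asserts that it runs on the shepherd id passed as its argument, so it is
 * presumably forked once per shepherd with qthread_fork_to.  A minimal sketch:
 */
static void spawn_checkres_on_both_sheps(void)
{
    aligned_t ret0, ret1;

    qthread_fork_to(checkres, (void *)(intptr_t)0, &ret0, 0);
    qthread_fork_to(checkres, (void *)(intptr_t)1, &ret1, 1);

    qthread_readFF(NULL, &ret0);  // wait for the task pinned to shepherd 0
    qthread_readFF(NULL, &ret1);  // wait for the task pinned to shepherd 1
}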
int accalt_get_thread_num()
{
#ifdef ARGOBOTS
    int rank;
    ABT_xstream_self_rank(&rank);
    return rank;
#endif
#ifdef MASSIVETHREADS
    return myth_get_worker_num();
#endif
#ifdef QTHREADS
    return qthread_shep();
#endif
}
void Task::closeout()
{
  enum { RESPAWN = int( Kokkos::Experimental::TASK_STATE_WAITING )
                 | int( Kokkos::Experimental::TASK_STATE_EXECUTING ) };

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx %s\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(this)
       , ( m_state == RESPAWN ? "respawn" : "complete" )
       );
fflush(stdout);
#endif

  // When dependent tasks run there would be a race
  // condition between destroying this task and
  // querying the active count pointer from this task.
  int volatile * const active_count = m_active_count ;

  if ( m_state == RESPAWN ) {
    // Task requests respawn, set state to waiting and reschedule the task
    m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
    schedule();
  }
  else {
    // Task did not respawn, is complete
    m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;

    // Release dependences before allowing dependent tasks to run.
    // Otherwise there is a thread race condition for removing dependences.
    for ( int i = 0 ; i < m_dep_size ; ++i ) {
      assign( & m_dep[i] , 0 );
    }

    // Set qthread FEB to full so that dependent tasks are allowed to execute.
    // This 'task' may be deleted immediately following this function call.
    qthread_fill( & m_qfeb );

    // The dependent task could now complete and destroy 'this' task
    // before the call to 'qthread_fill' returns.  Therefore, for
    // thread safety assume that 'this' task has now been destroyed.
  }

  // Decrement active task count before returning.
  Kokkos::atomic_decrement( active_count );
}
void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
{
  verify_is_process("QthreadExec::exec_all(...)",true);

  /* fprintf( stdout , "QthreadExec::exec_all\n"); fflush(stdout); */

  s_active_function     = func ;
  s_active_function_arg = arg ;

  // Need to query which shepherd this main 'process' is running...
  const int main_shep = qthread_shep();

#if 1
  for ( int jshep = 0 , iwork = 0 ; jshep < s_number_shepherds ; ++jshep ) {
    for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i , ++iwork ) {
      qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
    }
  }
#else
  // If this function is used before the 'qthread.task_policy' unit test
  // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
    const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd
                                             : s_number_workers_per_shepherd - 1 ;

    if ( num_clone ) {
      const int ret = qthread_fork_clones_to_local_priority
        ( driver_exec_all /* function */
        , NULL            /* function data block */
        , NULL            /* pointer to return value feb */
        , jshep           /* shepherd number */
        , num_clone - 1   /* number of instances - 1 */
        );

      assert(ret == QTHREAD_SUCCESS);
    }
  }
#endif

  driver_exec_all( NULL );

  s_active_function     = 0 ;
  s_active_function_arg = 0 ;
}
QthreadExec::QthreadExec()
{
  const int shepherd_rank        = qthread_shep();
  const int shepherd_worker_rank = qthread_worker_local(NULL);
  const int worker_rank          = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank ;

  m_worker_base          = s_exec ;
  m_shepherd_base        = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
  m_scratch_alloc        = ( (unsigned char *) this ) + s_base_size ;
  m_reduce_end           = s_worker_reduce_end ;
  m_shepherd_rank        = shepherd_rank ;
  m_shepherd_size        = s_number_shepherds ;
  m_shepherd_worker_rank = shepherd_worker_rank ;
  m_shepherd_worker_size = s_number_workers_per_shepherd ;
  m_worker_rank          = worker_rank ;
  m_worker_size          = s_number_workers ;
  m_worker_state         = QthreadExec::Active ;
}
void chpl_task_exit(void)
{
#ifdef CHAPEL_PROFILE
    profile_print();
#endif /* CHAPEL_PROFILE */

#ifdef QTHREAD_MULTINODE
#else
    if (qthread_shep() == NO_SHEPHERD) {
        /* sometimes, tasking is told to shutdown even though it hasn't been
         * told to start yet */
        if (chpl_qthread_done_initializing == 1) {
            qthread_syncvar_fill(&canexit);
            while (done_finalizing == 0) SPINLOCK_BODY();
        }
    } else {
        qthread_syncvar_fill(&exit_ret);
    }
#endif /* QTHREAD_MULTINODE */
}
void chpl_task_setSubLoc(c_sublocid_t subLoc)
{
    qthread_shepherd_id_t curr_shep;

    // Only change sublocales if the caller asked for a particular one,
    // which is not the current one, and we're a (movable) task.
    //
    // Note: It's likely that this won't work in all cases where we need
    //       it.  In particular, we envision needing to move execution
    //       from sublocale to sublocale while initializing the memory
    //       layer, in order to get the NUMA domain affinity right for
    //       the subparts of the heap.  But this will be happening well
    //       before tasking init and in any case would be done from the
    //       main thread of execution, which doesn't have a shepherd.
    //       The code below wouldn't work in that situation.
    if (subLoc != c_sublocid_any &&
        subLoc != c_sublocid_curr &&
        (curr_shep = qthread_shep()) != NO_SHEPHERD &&
        (qthread_shepherd_id_t) subLoc != curr_shep) {
        qthread_migrate_to((qthread_shepherd_id_t) subLoc);
    }
}
static aligned_t migrant(void *arg)
{
    int myshep = qthread_shep();

    assert(myshep == 1 || myshep == 0);

    iprintf("migrant running on shep %i\n", myshep);
    if (myshep == 1) {
        qthread_migrate_to(0);
        iprintf("migrant now running on shep %i\n", (int)qthread_shep());
        assert(qthread_shep() == 0);
    } else {
        qthread_migrate_to(1);
        if (qthread_shep() != 1) {
            fprintf(stderr, "Expected to be on shepherd 1, actually on shepherd %i\n",
                    qthread_shep());
            assert(qthread_shep() == 1);
            abort();
        }
    }
    return 0;
}
void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
{
  const int exec_all_reduce_alloc = align_alloc( reduce_size );
  const int shepherd_scan_alloc   = align_alloc( 8 );
  const int shepherd_shared_end   = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );

  if ( s_worker_reduce_end < exec_all_reduce_alloc ||
       s_worker_shared_end < shepherd_shared_end ) {

    /* fprintf( stdout , "QthreadExec::resize\n"); fflush(stdout); */

    // Clear current worker memory before allocating new worker memory
    clear_workers();

    // Increase the buffers to an aligned allocation
    s_worker_reduce_end   = exec_all_reduce_alloc ;
    s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
    s_worker_shared_end   = shepherd_shared_end ;

    // Need to query which shepherd this main 'process' is running...
    const int main_shep = qthread_shep();

    // Have each worker resize its memory for proper first-touch
#if 1
    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
      for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
        qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
      }
    }
#else
    // If this function is used before the 'qthread.task_policy' unit test
    // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
      const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd
                                               : s_number_workers_per_shepherd - 1 ;

      if ( num_clone ) {
        const int ret = qthread_fork_clones_to_local_priority
          ( driver_resize_worker_scratch /* function */
          , NULL                         /* function data block */
          , NULL                         /* pointer to return value feb */
          , jshep                        /* shepherd number */
          , num_clone - 1                /* number of instances - 1 */
          );

        assert(ret == QTHREAD_SUCCESS);
      }
    }
#endif

    driver_resize_worker_scratch( NULL );

    // Verify all workers allocated
    bool ok = true ;
    for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }

    if ( ! ok ) {
      std::ostringstream msg ;
      msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
      for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
        if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
      }
      msg << " }" ;
      Kokkos::Impl::throw_runtime_exception( msg.str() );
    }
  }
}
GLT_func_prefix int glt_get_thread_num()
{
    int a = qthread_shep();
    //printf("in glt_get_thread_num, returning %d\n", a);
    return a;
}
void Task::schedule()
{
  // Is waiting for execution

  // Increment active task count before spawning.
  Kokkos::atomic_increment( m_active_count );

  // Spawn in qthread.  Must malloc the precondition array and give it to qthread.
  // Qthread will eventually free this allocation so memory will not be leaked.
  // Concern with thread safety of malloc: does this need to be guarded?
  aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );

  qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );

  for ( int i = 0 ; i < m_dep_size ; ++i ) {
    qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
  }

  if ( m_apply_team && ! m_apply_single ) {
    // If there is more than one shepherd, spawn on a shepherd other than this one.
    const int num_shepherd            = qthread_num_shepherds();
    const int num_worker_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
    const int this_shepherd           = qthread_shep();

    int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;

#if 0
fprintf( stdout
       , "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n"
       , qthread_shep()
       , qthread_worker_local(NULL)
       , reinterpret_cast<unsigned long>(this)
       , spawn_shepherd
       , num_worker_per_shepherd - 1
       );
fflush(stdout);
#endif

    qthread_spawn_cloneable
      ( & Task::qthread_func
      , this
      , 0
      , NULL
      , m_dep_size , qprecon /* dependences */
      , spawn_shepherd
      , unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY )
      , num_worker_per_shepherd - 1
      );
  }
  else {
    qthread_spawn( & Task::qthread_func /* function */
                 , this                 /* function argument */
                 , 0
                 , NULL
                 , m_dep_size , qprecon /* dependences */
                 , NO_SHEPHERD
                 , QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
                 );
  }
}
c_sublocid_t chpl_task_getSubLoc(void)
{
    return (c_sublocid_t) qthread_shep();
}