// Records `primary` as the current primary address, recomputes the
// _is_primary flag, and adjusts the worker set when the flag flips:
//   - false -> true : every entry returned by _state->get_node_state() is
//                     handed to register_worker(first, second);
//   - true  -> false: clear_workers() is called;
//   - unchanged     : no worker bookkeeping is performed.
void meta_server_failure_detector::set_primary(rpc_address primary)
{
    // Snapshot of the flag taken before the guarded update below.
    const bool was_primary = _is_primary;

    {
        // Address and flag are updated together while holding
        // _primary_address_lock.
        utils::auto_lock<zlock> guard(_primary_address_lock);
        _primary_address = primary;
        _is_primary = (primary == primary_address());
    }

    if (!was_primary && _is_primary) {
        // Just became primary: register every node known to _state.
        node_states known_nodes;
        _state->get_node_state(known_nodes);
        for (auto& entry : known_nodes) {
            register_worker(entry.first, entry.second);
        }
    } else if (was_primary && !_is_primary) {
        // Just lost primary status: drop all registered workers.
        clear_workers();
    }
}
void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size ) { const int exec_all_reduce_alloc = align_alloc( reduce_size ); const int shepherd_scan_alloc = align_alloc( 8 ); const int shepherd_shared_end = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size ); if ( s_worker_reduce_end < exec_all_reduce_alloc || s_worker_shared_end < shepherd_shared_end ) { // Clear current worker memory before allocating new worker memory clear_workers(); // Increase the buffers to an aligned allocation s_worker_reduce_end = exec_all_reduce_alloc ; s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ; s_worker_shared_end = shepherd_shared_end ; // Need to query which shepherd this main 'process' is running... // Have each worker resize its memory for proper first-touch for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) { for ( int i = jshep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) { // Unit tests hang with this call: // // qthread_fork_to_local_priority( driver_resize_workers , NULL , NULL , jshep ); // qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep ); }} driver_resize_worker_scratch( NULL ); // Verify all workers allocated bool ok = true ; for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; } if ( ! ok ) { std::ostringstream msg ; msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ; for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) { if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); } } msg << " }" ; Kokkos::Impl::throw_runtime_exception( msg.str() ); } } }
// Records `primary` as the current primary address and recomputes the
// _is_primary flag. If the flag drops from true to false, clear_workers()
// is called.
//
// Workers are intentionally NOT registered here when the flag becomes true:
// only nodes synced from meta_state_service are useful, and that sync has
// not been performed yet at this point.
void meta_server_failure_detector::set_primary(rpc_address primary)
{
    // Snapshot of the flag taken before the guarded update below.
    const bool was_primary = _is_primary;

    {
        // Address and flag are updated together while holding
        // _primary_address_lock.
        utils::auto_lock<zlock> guard(_primary_address_lock);
        _primary_address = primary;
        _is_primary = (primary == primary_address());
    }

    const bool lost_primary = was_primary && !_is_primary;
    if (lost_primary) {
        clear_workers();
    }
}
bool meta_server_failure_detector::set_primary(bool is_primary /*= false*/) { bool bRet = true; if (is_primary && !_is_primary) { node_states ns; _state->get_node_state(ns); for (auto& pr : ns) { register_worker(pr.first, pr.second); } _is_primary = true; } if (!is_primary && _is_primary) { clear_workers(); _is_primary = false; } return bRet; }
void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size ) { const int exec_all_reduce_alloc = align_alloc( reduce_size ); const int shepherd_scan_alloc = align_alloc( 8 ); const int shepherd_shared_end = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size ); if ( s_worker_reduce_end < exec_all_reduce_alloc || s_worker_shared_end < shepherd_shared_end ) { /* fprintf( stdout , "QthreadExec::resize\n"); fflush(stdout); */ // Clear current worker memory before allocating new worker memory clear_workers(); // Increase the buffers to an aligned allocation s_worker_reduce_end = exec_all_reduce_alloc ; s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ; s_worker_shared_end = shepherd_shared_end ; // Need to query which shepherd this main 'process' is running... const int main_shep = qthread_shep(); // Have each worker resize its memory for proper first-touch #if 1 for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) { for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) { qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep ); }} #else // If this function is used before the 'qthread.task_policy' unit test // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so. for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) { const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ; if ( num_clone ) { const int ret = qthread_fork_clones_to_local_priority ( driver_resize_worker_scratch /* function */ , NULL /* function data block */ , NULL /* pointer to return value feb */ , jshep /* shepherd number */ , num_clone - 1 /* number of instances - 1 */ ); assert(ret == QTHREAD_SUCCESS); } } #endif driver_resize_worker_scratch( NULL ); // Verify all workers allocated bool ok = true ; for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; } if ( ! 
ok ) { std::ostringstream msg ; msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ; for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) { if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); } } msg << " }" ; Kokkos::Impl::throw_runtime_exception( msg.str() ); } } }