Exemplo n.º 1
0
void meta_server_failure_detector::set_primary(rpc_address primary)
{
    bool old = _is_primary;
    {
        utils::auto_lock<zlock> l(_primary_address_lock);
        _primary_address = primary;
        _is_primary = (primary == primary_address());
    }

    if (!old && _is_primary)
    {
        node_states ns;
        _state->get_node_state(ns);

        for (auto& pr : ns)
        {
            register_worker(pr.first, pr.second);
        }
    }

    if (old && !_is_primary)
    {
        clear_workers();
    }
}
Exemplo n.º 2
0
void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
{
  const int exec_all_reduce_alloc = align_alloc( reduce_size );
  const int shepherd_scan_alloc   = align_alloc( 8 );
  const int shepherd_shared_end   = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );

  if ( s_worker_reduce_end < exec_all_reduce_alloc ||
       s_worker_shared_end < shepherd_shared_end ) {

    // Clear current worker memory before allocating new worker memory
    clear_workers();

    // Increase the buffers to an aligned allocation
    s_worker_reduce_end   = exec_all_reduce_alloc ;
    s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
    s_worker_shared_end   = shepherd_shared_end ;

    // Need to query which shepherd this main 'process' is running...

    // Have each worker resize its memory for proper first-touch
    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
    for ( int i = jshep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {

      // Unit tests hang with this call:
      //
      // qthread_fork_to_local_priority( driver_resize_workers , NULL , NULL , jshep );
      //

      qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
    }}

    driver_resize_worker_scratch( NULL );

    // Verify all workers allocated

    bool ok = true ;
    for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }

    if ( ! ok ) {
      std::ostringstream msg ;
      msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
      for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
         if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
      }
      msg << " }" ;
      Kokkos::Impl::throw_runtime_exception( msg.str() );
    }
  }
}
Exemplo n.º 3
0
void meta_server_failure_detector::set_primary(rpc_address primary)
{
    /*
    * we don't do register worker things in set_primary
    * as only nodes sync from meta_state_service are useful, 
    * but currently, we haven't do sync yet
    */
    bool old = _is_primary;
    {
        utils::auto_lock<zlock> l(_primary_address_lock);
        _primary_address = primary;
        _is_primary = (primary == primary_address());
    }

    if (old && !_is_primary)
    {
        clear_workers();
    }
}
bool meta_server_failure_detector::set_primary(bool is_primary /*= false*/)
{
    bool bRet = true;
    if (is_primary && !_is_primary)
    {
        node_states ns;
        _state->get_node_state(ns);

        for (auto& pr : ns)
        {
            register_worker(pr.first, pr.second);
        }

        _is_primary = true;
    }

    if (!is_primary && _is_primary)
    {
        clear_workers();
        _is_primary = false;
    }

    return bRet;
}
void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
{
  const int exec_all_reduce_alloc = align_alloc( reduce_size );
  const int shepherd_scan_alloc   = align_alloc( 8 );
  const int shepherd_shared_end   = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );

  if ( s_worker_reduce_end < exec_all_reduce_alloc ||
       s_worker_shared_end < shepherd_shared_end ) {

/*
  fprintf( stdout , "QthreadExec::resize\n");
  fflush(stdout);
*/

    // Clear current worker memory before allocating new worker memory
    clear_workers();

    // Increase the buffers to an aligned allocation
    s_worker_reduce_end   = exec_all_reduce_alloc ;
    s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
    s_worker_shared_end   = shepherd_shared_end ;

    // Need to query which shepherd this main 'process' is running...
 
    const int main_shep = qthread_shep();

    // Have each worker resize its memory for proper first-touch
#if 1
    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
    for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
      qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
    }}
#else
    // If this function is used before the 'qthread.task_policy' unit test
    // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
      const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;

      if ( num_clone ) {
        const int ret = qthread_fork_clones_to_local_priority
          ( driver_resize_worker_scratch   /* function */
          , NULL                           /* function data block */
          , NULL                           /* pointer to return value feb */
          , jshep                          /* shepherd number */
          , num_clone - 1                  /* number of instances - 1 */
          );

        assert(ret == QTHREAD_SUCCESS);
      }
    }
#endif

    driver_resize_worker_scratch( NULL );

    // Verify all workers allocated

    bool ok = true ;
    for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }

    if ( ! ok ) {
      std::ostringstream msg ;
      msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
      for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
         if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
      }
      msg << " }" ;
      Kokkos::Impl::throw_runtime_exception( msg.str() );
    }
  }
}