Example #1
void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
{
  s_current_function = func ;
  s_current_function_arg = & s_threads_process ;

  // Make sure function and arguments are written before activating threads.
  memory_fence();

  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;

  for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
    ThreadsExec & th = * s_threads_exec[ --i ];

    th.m_pool_state = ThreadsExec::Active ;

    wait_yield( th.m_pool_state , ThreadsExec::Active );
  }

  if ( s_threads_process.m_pool_base ) {
    s_threads_process.m_pool_state = ThreadsExec::Active ;
    (*func)( s_threads_process , 0 );
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
  }

  s_current_function_arg = 0 ;
  s_current_function = 0 ;

  // Make sure function and arguments are cleared before proceeding.
  memory_fence();
}
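The pattern above is: publish the function pointer and its argument, fence so those writes are globally visible, then flip each worker's state to Active and wait for it to acknowledge. Below is a minimal standalone sketch of that publish-then-activate handshake using std::atomic and std::thread; every name in it is illustrative, none of it is Kokkos API.

// Simplified publish-then-activate handshake (illustrative, not Kokkos code).
#include <atomic>
#include <thread>

static int payload = 0;                        // stands in for s_current_function / _arg
static std::atomic<bool> active{false};        // stands in for m_pool_state

void worker()
{
  while ( ! active.load( std::memory_order_acquire ) ) { }  // spin: Inactive -> Active
  int local = payload;    // visible: the acquire load pairs with the release store below
  (void) local;
  active.store( false, std::memory_order_release );         // Active -> Inactive
}

int main()
{
  std::thread t( worker );
  payload = 42;                                      // write the "function and arguments"
  active.store( true, std::memory_order_release );   // fence + activation folded into one release store
  while ( active.load( std::memory_order_acquire ) ) { }    // wait for the worker, as wait_yield does
  t.join();
  return 0;
}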
Example #2
  inline
  void barrier( )
    {
      // Reverse the rank so that rev_rank == 0 is the highest ranking thread (the fan-in root):
      const int rev_rank = m_pool_size - ( m_pool_rank + 1 );

      memory_fence();

      // Fan-in reduction with highest ranking thread as the root
      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
        // Wait: Active -> Rendezvous
        Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
      }

      if ( rev_rank ) {
        m_pool_state = ThreadsExec::Rendezvous ;
        // Wait: Rendezvous -> Active
        Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
      }
      else {
        // Root thread does the reduction and broadcast

        memory_fence();

        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
          get_thread( rank )->m_pool_state = ThreadsExec::Active ;
        }
      }
    }
Example #3
  inline
  Type shepherd_scan( const int team_size
                    , const Type & value
                    ,       Type * const global_value = 0 ) const
    {
      *shepherd_team_scratch_value<Type>() = value ;

      memory_fence();

      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );

      int n , j ;

      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
      }

      if ( rev_rank ) {
        m_worker_state = QthreadExec::Inactive ;
        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
      }
      else {
        // Root thread scans across values before releasing threads
        // Worker data is in reverse order, so m_shepherd_base[0] is the
        // highest ranking thread.

        // Copy from lower ranking to higher ranking worker.

        Type accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
        for ( int i = 1 ; i < team_size ; ++i ) {
          const Type tmp = * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
          accum += tmp ;
          * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp ;
        }

        * m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
          global_value ? atomic_fetch_add( global_value , accum ) : 0 ;

        // Join from lower ranking to higher ranking worker.
        for ( int i = team_size ; --i ; ) {
          * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
        }

        memory_fence();
      }

      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
      }

      return *shepherd_team_scratch_value<Type>();
    }
Example #4
  // Get a work index. Claim from the owned range until it's exhausted, then steal from another thread
  inline long get_work_index (int team_size = 0) {
    long work_index = -1;
    if(!m_stealing) work_index = get_work_index_begin();

    if( work_index == -1) {
      memory_fence();
      m_stealing = true;
      work_index = steal_work_index(team_size);
    }
    m_team_work_index = work_index;
    memory_fence();
    return work_index;
  }
Example #5
inline typename size_to_int<size>::type atomic_read(const volatile void * ptr)
{
    typedef typename size_to_int<size>::type value_type;
    value_type result = *static_cast<const volatile value_type*>(ptr);
    memory_fence();
    return result;
}
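Here the fence follows the plain load, which gives the read acquire-like semantics: no later memory access can be hoisted above it. A standard-C++ sketch of the same shape using std::atomic_thread_fence (an illustrative analogue, not this library's implementation; atomic_read_acquire is a made-up name):

#include <atomic>
#include <cstdint>

// Acquire-style read: relaxed load first, fence second, mirroring atomic_read above.
inline std::int32_t atomic_read_acquire( const std::atomic<std::int32_t> & a )
{
  const std::int32_t result = a.load( std::memory_order_relaxed );
  std::atomic_thread_fence( std::memory_order_acquire );  // later accesses cannot move before this point
  return result;
}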
Example #6
T *FixedVector<T, Size>::pop_back(std::size_t *out_index) {
  int kRetryDelay = 1;
  while (true) {
    Word length = length_.nobarrier_load();
    if (length == 0) return reinterpret_cast<T *>(kOutOfRange);

    Word index = length - 1;
    T *value;

    // pop_back "primes" the value it is about to pop by setting a
    // bit.  It is illegal to pop "past" a primed element.
    if (unlikely(!buffer_[index].cas_prime(&value))) {
      Platform::Sleep(kRetryDelay);
      continue;
    }

    // We can't let this load be reordered to after the modification of
    // the length -- we might end up reading a completely different value.
    memory_fence();

    if (unlikely(!length_.boolean_cas(length, length - 1))) {
      // Something's changed, undo priming and retry.
      buffer_[index].nobarrier_store(value);
      continue;
    }

    if (out_index != NULL) *out_index = index;
    return value;
  }
}
Example #7
/** \brief  Begin execution of the asynchronous functor */
void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const void * arg )
{
  verify_is_process("ThreadsExec::start" , true );

  if ( s_current_function || s_current_function_arg ) {
    Kokkos::Impl::throw_runtime_exception( std::string( "ThreadsExec::start() FAILED : already executing" ) );
  }

  s_current_function     = func ;
  s_current_function_arg = arg ;

  // Make sure function and arguments are written before activating threads.
  memory_fence();

  // Activate threads:
  for ( int i = s_thread_pool_size[0] ; 0 < i-- ; ) {
    s_threads_exec[i]->m_pool_state = ThreadsExec::Active ;
  }

  if ( s_threads_process.m_pool_size ) {
    // Master process is the root thread, run it:
    (*func)( s_threads_process , arg );
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
  }
}
Example #8
  inline
  int all_reduce( const int value )
    {
      // Make sure there is enough scratch space:
      const int rev_rank = m_pool_size - ( m_pool_rank + 1 );

      *((volatile int*) reduce_memory()) = value ;

      memory_fence();

      // Fan-in reduction with highest ranking thread as the root
      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
        // Wait: Active -> Rendezvous
        Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
      }

      if ( rev_rank ) {
        m_pool_state = ThreadsExec::Rendezvous ;
        // Wait: Rendezvous -> Active
        Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
      }
      else {
        // Root thread does the reduction and broadcast

        int accum = 0 ;

        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
          accum += *((volatile int *) get_thread( rank )->reduce_memory());
        }

        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
          *((volatile int *) get_thread( rank )->reduce_memory()) = accum ;
        }

        memory_fence();

        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
          get_thread( rank )->m_pool_state = ThreadsExec::Active ;
        }
      }

      return *((volatile int*) reduce_memory());
    }
Example #9
  inline
  typename JoinOp::value_type
    shepherd_reduce( const int team_size
                   , const typename JoinOp::value_type & value
                   , const JoinOp & op ) const
    {
      typedef typename JoinOp::value_type Type ;

      *shepherd_team_scratch_value<Type>() = value ;

      memory_fence();

      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );

      int n , j ;

      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
      }

      if ( rev_rank ) {
        m_worker_state = QthreadExec::Inactive ;
        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
      }
      else {
        volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
        for ( int i = 1 ; i < team_size ; ++i ) {
          op.join( accum , * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
        }
        for ( int i = 1 ; i < team_size ; ++i ) {
          * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
        }

        memory_fence();
      }

      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
      }

      return *shepherd_team_scratch_value<Type>();
    }
Example #10
 inline
 void shepherd_broadcast( Type & value , const int team_size , const int team_rank ) const
   {
     if ( m_shepherd_base ) {
       Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
       if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value ; }
       memory_fence();
       shepherd_barrier( team_size );
       value = *shared_value ;
     }
   }
Example #11
// Wait for root thread to become inactive
void ThreadsExec::fence()
{
  if ( s_thread_pool_size[0] ) {
    // Wait for the root thread to complete:
    Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
  }

  s_current_function     = 0 ;
  s_current_function_arg = 0 ;

  // Make sure function and arguments are cleared before
  // potentially re-activating threads with a subsequent launch.
  memory_fence();
}
Example #12
  inline
  Type shepherd_reduce( const int team_size , const Type & value ) const
    {
      *shepherd_team_scratch_value<Type>() = value ;

      memory_fence();

      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );

      int n , j ;

      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
      }

      if ( rev_rank ) {
        m_worker_state = QthreadExec::Inactive ;
        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
      }
      else {
        Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
        for ( int i = 1 ; i < team_size ; ++i ) {
          accum += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
        }
        for ( int i = 1 ; i < team_size ; ++i ) {
          * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
        }

        memory_fence();
      }

      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
      }

      return *shepherd_team_scratch_value<Type>();
    }
Example #13
void Window::TextureDisplayer_SetTextureObject(TextureObject Texture){
    if(TextureDisplayer_Inited){
        //BorrowContext();
        ConsoleEcho("on rentre");
        Texture.Load(0);
        ConsoleEcho("Binding");
        Texture.Bind(1);
        ConsoleEcho("Binded");
        int dara=1;
        float dara2=1.0f;
        TextureDisplayer_Texture=Texture.GetTextureIndex();
        ConsoleEcho("Envoie de la Texture");
        SetUniform(GL_INT,1,&dara,TextureDisplayer_Adress,"Tex1");
        memory_fence();
        SetUniform(GL_FLOAT,1,&dara2,TextureDisplayer_Adress,"ColorTest");
    }
}
Example #14
  KOKKOS_INLINE_FUNCTION
  void team_broadcast(ValueType& value, const int& thread_id) const
  {
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    { }
#else
    // Make sure there is enough scratch space:
    typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
                         , ValueType , void >::type type ;

    type * const local_value = ((type*) m_exec.scratch_thread());
    if(team_rank() == thread_id)
      *local_value = value;
    memory_fence();
    team_barrier();
    value = *local_value;
#endif
  }
Example #15
  KOKKOS_INLINE_FUNCTION
  void push_work( const std::int32_t w ) const noexcept
    {
      const std::int32_t N = m_graph.numRows();

      std::int32_t volatile * const ready_queue = & m_queue[0] ;
      std::int32_t volatile * const end_hint    = & m_queue[2*N+1] ;

      // Push work to end of queue
      const std::int32_t j = atomic_fetch_add( end_hint , 1 );

      if ( ( N <= j ) ||
           ( END_TOKEN != atomic_exchange(ready_queue+j,w) ) ) {
        // ERROR: past the end of queue or did not replace END_TOKEN
        Kokkos::abort("WorkGraphPolicy push_work error");
      }

      memory_fence();
    }
Example #16
std::size_t FixedVector<T, Size>::push_back(T *value) {
  while (true) {
    Word index = length_.nobarrier_load();
    if (index >= Size) return -1;

    // We "make space" for the element we are going to insert by
    // incrementing the index.  We can't use an atomic add here since
    // we need to ensure we don't bump the index out of bounds and
    // make the container inconsistent.  There may be some clever way
    // around that, though; might be worth thinking about if atomic
    // adds are faster than atomic compare exchanges.
    if (!length_.boolean_cas(index, index + 1)) continue;

    // We can't let the actual store to the buffer be reordered ahead
    // of the length_ increment -- another thread might end up writing
    // to the same location.
    memory_fence();
    buffer_[index].nobarrier_store(value);
    return static_cast<std::size_t>(index);
  }
}
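The comment explains why the length is bumped with a compare-and-swap rather than an atomic add: the increment must never take the index past Size. A self-contained sketch of that bounded-claim step with std::atomic; claim_slot and Capacity are hypothetical names used only for illustration, not part of the FixedVector API.

#include <atomic>
#include <cstddef>

// Claim the next slot in [0, Capacity) or report that the container is full.
template < std::size_t Capacity >
bool claim_slot( std::atomic<std::size_t> & length , std::size_t & out_index )
{
  std::size_t index = length.load( std::memory_order_relaxed );
  while ( index < Capacity ) {
    // Only bump the length if nobody else did; an unconditional fetch_add
    // could push the index past Capacity and leave the container inconsistent.
    if ( length.compare_exchange_weak( index , index + 1 ,
                                       std::memory_order_acq_rel ,
                                       std::memory_order_relaxed ) ) {
      out_index = index ;
      return true ;
    }
    // On failure compare_exchange_weak reloads 'index'; re-check the bound and retry.
  }
  return false ;  // full
}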
Example #17
void ThreadsExec::initialize( unsigned thread_count ,
                              unsigned use_numa_count ,
                              unsigned use_cores_per_numa ,
                              bool allow_asynchronous_threadpool )
{
  static const Sentinel sentinel ;

  const bool is_initialized = 0 != s_thread_pool_size[0] ;

  unsigned thread_spawn_failed = 0 ;

  for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++)
    s_threads_exec[i] = NULL;

  if ( ! is_initialized ) {

    // If thread_count, use_numa_count, or use_cores_per_numa are zero
    // then they will be given default values based upon hwloc detection
    // and allowed asynchronous execution.

    const bool hwloc_avail = Kokkos::hwloc::available();
    const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads();

    if ( thread_count == 0 ) {
      thread_count = hwloc_avail
      ? Kokkos::hwloc::get_available_numa_count() *
        Kokkos::hwloc::get_available_cores_per_numa() *
        Kokkos::hwloc::get_available_threads_per_core()
      : 1 ;
    }

    const unsigned thread_spawn_begin =
      hwloc::thread_mapping( "Kokkos::Threads::initialize" ,
                             allow_asynchronous_threadpool ,
                             thread_count ,
                             use_numa_count ,
                             use_cores_per_numa ,
                             s_threads_coord );

    const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;

    if ( thread_spawn_begin ) {
      // Synchronous with s_threads_coord[0] as the process core
      // Claim entry #0 for binding the process core.
      s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
    }

    s_thread_pool_size[0] = thread_count ;
    s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
    s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;
    s_current_function = & execute_function_noop ; // Initialization work function

    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {

      s_threads_process.m_pool_state = ThreadsExec::Inactive ;

      // If hwloc is available then the spawned thread will
      // choose its own entry in 's_threads_coord';
      // otherwise the entry is specified here.
      s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_can_bind ? ~0u : ith );

      // Make sure all outstanding memory writes are complete
      // before spawning the new thread.
      memory_fence();

      // Spawn thread executing the 'driver()' function.
      // Wait until spawned thread has attempted to initialize.
      // If spawning and initialization are successful then
      // an entry in 's_threads_exec' will be assigned.
      if ( ThreadsExec::spawn() ) {
        wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
      }
      if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
    }

    // Wait for all spawned threads to deactivate before zeroing the function.

    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
      // Try to protect against cache coherency failure by casting to volatile.
      ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
      if ( th ) {
        wait_yield( th->m_pool_state , ThreadsExec::Active );
      }
      else {
        ++thread_spawn_failed ;
      }
    }

    s_current_function     = 0 ;
    s_current_function_arg = 0 ;
    s_threads_process.m_pool_state = ThreadsExec::Inactive ;

    memory_fence();

    if ( ! thread_spawn_failed ) {
      // Bind process to the core on which it was located before spawning occurred
      if (hwloc_can_bind) {
        Kokkos::hwloc::bind_this_thread( proc_coord );
      }

      if ( thread_spawn_begin ) { // Include process in pool.
        const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();

        s_threads_exec[0]                   = & s_threads_process ;
        s_threads_process.m_numa_rank       = coord.first ;
        s_threads_process.m_numa_core_rank  = coord.second ;
        s_threads_process.m_pool_base       = s_threads_exec ;
        s_threads_process.m_pool_rank       = thread_count - 1 ; // Reversed for scan-compatible reductions
        s_threads_process.m_pool_size       = thread_count ;
        s_threads_process.m_pool_fan_size   = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size );
        s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self();
      }
      else {
        s_threads_process.m_pool_base = 0 ;
        s_threads_process.m_pool_rank = 0 ;
        s_threads_process.m_pool_size = 0 ;
        s_threads_process.m_pool_fan_size = 0 ;
      }

      // Initial allocations:
      ThreadsExec::resize_scratch( 1024 , 1024 );
    }
    else {
      s_thread_pool_size[0] = 0 ;
      s_thread_pool_size[1] = 0 ;
      s_thread_pool_size[2] = 0 ;
    }
  }

  if ( is_initialized || thread_spawn_failed ) {

    std::ostringstream msg ;

    msg << "Kokkos::Threads::initialize ERROR" ;

    if ( is_initialized ) {
      msg << " : already initialized" ;
    }
    if ( thread_spawn_failed ) {
      msg << " : failed to spawn " << thread_spawn_failed << " threads" ;
    }

    Kokkos::Impl::throw_runtime_exception( msg.str() );
  }

  // Check for over-subscription
  if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
    std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
    std::cout << "                                    Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
    std::cout << "                                    Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
    std::cout << "                                    Requested: " << thread_count << " threads per process." << std::endl;
  }

  // Init the array used for arbitrarily sized atomics
  Impl::init_lock_array_host_space();

  #if (KOKKOS_ENABLE_PROFILING)
    Kokkos::Profiling::initialize();
  #endif
}
Example #18
void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
                                   , size_t team_reduce_bytes
                                   , size_t team_shared_bytes
                                   , size_t thread_local_bytes )
{
  const size_t member_bytes =
    sizeof(int64_t) *
    HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );

  HostThreadTeamData * root = m_pool[0] ;

  const size_t old_pool_reduce  = root ? root->pool_reduce_bytes() : 0 ;
  const size_t old_team_reduce  = root ? root->team_reduce_bytes() : 0 ;
  const size_t old_team_shared  = root ? root->team_shared_bytes() : 0 ;
  const size_t old_thread_local = root ? root->thread_local_bytes() : 0 ;
  const size_t old_alloc_bytes  = root ? ( member_bytes + root->scratch_bytes() ) : 0 ;

  // Allocate if any of the old allocations is too small:

  const bool allocate = ( old_pool_reduce  < pool_reduce_bytes ) ||
                        ( old_team_reduce  < team_reduce_bytes ) ||
                        ( old_team_shared  < team_shared_bytes ) ||
                        ( old_thread_local < thread_local_bytes );

  if ( allocate ) {

    if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
    if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
    if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
    if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }

    const size_t alloc_bytes =
      member_bytes +
      HostThreadTeamData::scratch_size( pool_reduce_bytes
                                      , team_reduce_bytes
                                      , team_shared_bytes
                                      , thread_local_bytes );

    OpenMP::memory_space space ;

    memory_fence();

    #pragma omp parallel num_threads(m_pool_size)
    {
      const int rank = omp_get_thread_num();

      if ( 0 != m_pool[rank] ) {

        m_pool[rank]->disband_pool();

        space.deallocate( m_pool[rank] , old_alloc_bytes );
      }

      void * const ptr = space.allocate( alloc_bytes );

      m_pool[ rank ] = new( ptr ) HostThreadTeamData();

      m_pool[ rank ]->
        scratch_assign( ((char *)ptr) + member_bytes
                      , alloc_bytes
                      , pool_reduce_bytes
                      , team_reduce_bytes
                      , team_shared_bytes
                      , thread_local_bytes
                      );

      memory_fence();
    }
/* END #pragma omp parallel */

    HostThreadTeamData::organize_pool( m_pool , m_pool_size );
  }
}
Example #19
inline void atomic_write(volatile void * ptr, typename size_to_int<size>::type value)
{
    memory_fence();
    *static_cast<volatile typename size_to_int<size>::type*>(ptr) = value;
}
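This is the mirror image of Example #5: the fence precedes the store, so every earlier write becomes visible before the new value does, which is the release half of a release/acquire publication pair. A standard-C++ sketch of the same shape (illustrative only; atomic_write_release is a made-up name):

#include <atomic>
#include <cstdint>

// Release-style write: fence first, relaxed store second, mirroring atomic_write above.
inline void atomic_write_release( std::atomic<std::int32_t> & a , std::int32_t value )
{
  std::atomic_thread_fence( std::memory_order_release );  // earlier writes cannot move past the store below
  a.store( value , std::memory_order_relaxed );
}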