void OpenMPExec::clear_thread_data() { const size_t member_bytes = sizeof(int64_t) * HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) ); const int old_alloc_bytes = m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ; OpenMP::memory_space space ; #pragma omp parallel num_threads( m_pool_size ) { const int rank = omp_get_thread_num(); if ( 0 != m_pool[rank] ) { m_pool[rank]->disband_pool(); space.deallocate( m_pool[rank] , old_alloc_bytes ); m_pool[rank] = 0 ; } } /* END #pragma omp parallel */ }
// Invoke f once per partition.
//
// When nested OpenMP is enabled, a parallel region of num_partitions threads
// is opened.  Each of those threads constructs its own OpenMPExec sized for
// partition_size, stores it in Impl::t_openmp_instance, calls
// f( thread number , team size ) and then destroys that instance again.
// The instance that was current on entry is put back afterwards.
//
// When nested OpenMP is not enabled, f(0,1) is called directly.
void OpenMP::partition_master( F const& f
                             , int num_partitions
                             , int partition_size
                             )
{
  if ( ! omp_get_nested() ) {
    // nested openmp not enabled
    f(0,1);
    return;
  }

  using Exec = Impl::OpenMPExec;

  // Remember the caller's instance; it is restored once the region is done.
  Exec * const saved_instance = Impl::t_openmp_instance;

  Exec::validate_partition( saved_instance->m_pool_size
                          , num_partitions
                          , partition_size );

  OpenMP::memory_space space;

  #pragma omp parallel num_threads(num_partitions)
  {
    // Construct this partition's executor in memory from the OpenMP
    // memory space.
    void * const storage = space.allocate( sizeof(Exec) );
    Exec * const partition_exec = new (storage) Exec( partition_size );

    Impl::t_openmp_instance = partition_exec;

    // Initial scratch sizes; pool and team reductions get the same amount.
    const size_t reduce_bytes = 32 * partition_size ;
    const size_t shared_bytes = 1024 * partition_size ;
    const size_t local_bytes  = 1024 ;

    partition_exec->resize_thread_data( reduce_bytes
                                      , reduce_bytes
                                      , shared_bytes
                                      , local_bytes );

    omp_set_num_threads(partition_size);

    f( omp_get_thread_num(), omp_get_num_threads() );

    // Tear down whatever instance is current after f has run.
    Exec * const finished = Impl::t_openmp_instance;
    finished->~Exec();
    space.deallocate( finished, sizeof(Exec) );
    Impl::t_openmp_instance = nullptr;
  }

  Impl::t_openmp_instance = saved_instance;
}
void OpenMP::impl_finalize() #endif { if ( omp_in_parallel() ) { std::string msg("Kokkos::OpenMP::finalize ERROR "); if( !Impl::t_openmp_instance ) msg.append(": not initialized"); if( omp_in_parallel() ) msg.append(": in parallel"); Kokkos::Impl::throw_runtime_exception(msg); } if ( Impl::t_openmp_instance ) { // Silence Cuda Warning const int nthreads = Impl::t_openmp_instance->m_pool_size <= Impl::g_openmp_hardware_max_threads ? Impl::g_openmp_hardware_max_threads : Impl::t_openmp_instance->m_pool_size; (void) nthreads; using Exec = Impl::OpenMPExec; Exec * instance = Impl::t_openmp_instance; instance->~Exec(); OpenMP::memory_space space; space.deallocate( instance, sizeof(Exec) ); #pragma omp parallel num_threads(nthreads) { Impl::t_openmp_hardware_id = 0; Impl::t_openmp_instance = nullptr; Impl::SharedAllocationRecord< void, void >::tracking_disable(); } // allow main thread to track Impl::SharedAllocationRecord< void, void >::tracking_enable(); Impl::g_openmp_hardware_max_threads = 1; } #if defined(KOKKOS_ENABLE_PROFILING) Kokkos::Profiling::finalize(); #endif }
void OpenMP::impl_initialize( int thread_count ) #endif { if ( omp_in_parallel() ) { std::string msg("Kokkos::OpenMP::initialize ERROR : in parallel"); Kokkos::Impl::throw_runtime_exception(msg); } if ( Impl::t_openmp_instance ) { finalize(); } { if ( Kokkos::show_warnings() && nullptr == std::getenv("OMP_PROC_BIND") ) { printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n"); printf(" In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n"); printf(" For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n"); printf(" For unit testing set OMP_PROC_BIND=false\n"); } OpenMP::memory_space space ; // Before any other call to OMP query the maximum number of threads // and save the value for re-initialization unit testing. #ifdef KOKKOS_ENABLE_DEPRECATED_CODE Impl::g_openmp_hardware_max_threads = get_current_max_threads(); #else Impl::g_openmp_hardware_max_threads = impl_get_current_max_threads(); #endif int process_num_threads = Impl::g_openmp_hardware_max_threads; if ( Kokkos::hwloc::available() ) { process_num_threads = Kokkos::hwloc::get_available_numa_count() * Kokkos::hwloc::get_available_cores_per_numa() * Kokkos::hwloc::get_available_threads_per_core(); } // if thread_count < 0, use g_openmp_hardware_max_threads; // if thread_count == 0, set g_openmp_hardware_max_threads to process_num_threads // if thread_count > 0, set g_openmp_hardware_max_threads to thread_count if (thread_count < 0 ) { thread_count = Impl::g_openmp_hardware_max_threads; } else if( thread_count == 0 && Impl::g_openmp_hardware_max_threads != process_num_threads ) { Impl::g_openmp_hardware_max_threads = process_num_threads; omp_set_num_threads(Impl::g_openmp_hardware_max_threads); } else { if( Kokkos::show_warnings() && thread_count > process_num_threads ) { printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n"); printf( " process threads available : %3d, 
requested thread : %3d\n", process_num_threads, thread_count ); } Impl::g_openmp_hardware_max_threads = thread_count; omp_set_num_threads(Impl::g_openmp_hardware_max_threads); } // setup thread local #pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads) { Impl::t_openmp_instance = nullptr; Impl::t_openmp_hardware_id = omp_get_thread_num(); Impl::SharedAllocationRecord< void, void >::tracking_enable(); } void * const ptr = space.allocate( sizeof(Impl::OpenMPExec) ); Impl::t_openmp_instance = new (ptr) Impl::OpenMPExec( Impl::g_openmp_hardware_max_threads ); // New, unified host thread team data: { size_t pool_reduce_bytes = 32 * thread_count ; size_t team_reduce_bytes = 32 * thread_count ; size_t team_shared_bytes = 1024 * thread_count ; size_t thread_local_bytes = 1024 ; Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes , team_reduce_bytes , team_shared_bytes , thread_local_bytes ); } } // Check for over-subscription if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) { std::cerr << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl; std::cerr << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl; std::cerr << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl; std::cerr << " Requested: " << thread_count << " threads per process." << std::endl; } // Init the array for used for arbitrarily sized atomics Impl::init_lock_array_host_space(); #if defined(KOKKOS_ENABLE_DEPRECATED_CODE) && defined(KOKKOS_ENABLE_PROFILING) Kokkos::Profiling::initialize(); #endif }
void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes , size_t team_reduce_bytes , size_t team_shared_bytes , size_t thread_local_bytes ) { const size_t member_bytes = sizeof(int64_t) * HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) ); HostThreadTeamData * root = m_pool[0] ; const size_t old_pool_reduce = root ? root->pool_reduce_bytes() : 0 ; const size_t old_team_reduce = root ? root->team_reduce_bytes() : 0 ; const size_t old_team_shared = root ? root->team_shared_bytes() : 0 ; const size_t old_thread_local = root ? root->thread_local_bytes() : 0 ; const size_t old_alloc_bytes = root ? ( member_bytes + root->scratch_bytes() ) : 0 ; // Allocate if any of the old allocation is tool small: const bool allocate = ( old_pool_reduce < pool_reduce_bytes ) || ( old_team_reduce < team_reduce_bytes ) || ( old_team_shared < team_shared_bytes ) || ( old_thread_local < thread_local_bytes ); if ( allocate ) { if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; } if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; } if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; } if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; } const size_t alloc_bytes = member_bytes + HostThreadTeamData::scratch_size( pool_reduce_bytes , team_reduce_bytes , team_shared_bytes , thread_local_bytes ); OpenMP::memory_space space ; memory_fence(); #pragma omp parallel num_threads(m_pool_size) { const int rank = omp_get_thread_num(); if ( 0 != m_pool[rank] ) { m_pool[rank]->disband_pool(); space.deallocate( m_pool[rank] , old_alloc_bytes ); } void * const ptr = space.allocate( alloc_bytes ); m_pool[ rank ] = new( ptr ) HostThreadTeamData(); m_pool[ rank ]-> scratch_assign( ((char *)ptr) + member_bytes , alloc_bytes , pool_reduce_bytes , team_reduce_bytes , team_shared_bytes , thread_local_bytes ); memory_fence(); } /* END #pragma omp parallel */ 
HostThreadTeamData::organize_pool( m_pool , m_pool_size ); } }