void OpenMPExec::clear_thread_data() { const size_t member_bytes = sizeof(int64_t) * HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) ); const int old_alloc_bytes = m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ; OpenMP::memory_space space ; #pragma omp parallel num_threads( m_pool_size ) { const int rank = omp_get_thread_num(); if ( 0 != m_pool[rank] ) { m_pool[rank]->disband_pool(); space.deallocate( m_pool[rank] , old_alloc_bytes ); m_pool[rank] = 0 ; } } /* END #pragma omp parallel */ }
// Run functor 'f' once per partition, each partition with its own
// OpenMPExec instance of 'partition_size' threads. Falls back to a
// single serial invocation f(0,1) when OpenMP nesting is disabled.
// NOTE(review): the 'template<class F>' header for this definition is
// presumably just above this chunk — confirm in the enclosing file.
void OpenMP::partition_master( F const& f , int num_partitions , int partition_size )
{
  if (omp_get_nested()) {
    using Exec = Impl::OpenMPExec;

    // Save the current thread-local instance so it can be restored
    // after the partitions are torn down.
    Exec * prev_instance = Impl::t_openmp_instance;

    Exec::validate_partition( prev_instance->m_pool_size, num_partitions, partition_size );

    OpenMP::memory_space space;

    // One outer OpenMP thread per partition; each builds its own
    // execution-space instance via placement new.
    #pragma omp parallel num_threads(num_partitions)
    {
      void * const ptr = space.allocate( sizeof(Exec) );

      Impl::t_openmp_instance = new (ptr) Exec( partition_size );

      // Per-partition scratch sizing, scaled by the partition's
      // thread count (thread_local is a fixed per-thread amount).
      size_t pool_reduce_bytes = 32 * partition_size ;
      size_t team_reduce_bytes = 32 * partition_size ;
      size_t team_shared_bytes = 1024 * partition_size ;
      size_t thread_local_bytes = 1024 ;

      Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes , team_reduce_bytes , team_shared_bytes , thread_local_bytes );

      // Nested parallel regions spawned inside 'f' use the partition's
      // thread count.
      omp_set_num_threads(partition_size);

      // Inside this outer region, omp_get_thread_num() is the
      // partition id and omp_get_num_threads() is num_partitions —
      // so f receives (partition_id, number_of_partitions).
      f( omp_get_thread_num(), omp_get_num_threads() );

      // Tear down this partition's instance: explicit destructor call
      // pairs with the placement new above, then free the storage.
      Impl::t_openmp_instance->~Exec();
      space.deallocate( Impl::t_openmp_instance, sizeof(Exec) );
      Impl::t_openmp_instance = nullptr;
    }

    // Restore the instance active before partitioning.
    Impl::t_openmp_instance = prev_instance;
  }
  else {
    // nested openmp not enabled
    f(0,1);
  }
}
// Finalize the OpenMP backend: destroy the thread-local execution
// instance, reset per-thread state, and re-enable allocation tracking
// on the main thread.
// NOTE(review): the '#endif' below closes a preprocessor conditional
// whose '#if' precedes this chunk (presumably selecting a deprecated
// vs. current signature) — confirm in the full file.
void OpenMP::impl_finalize()
#endif
{
  // Finalizing from inside a parallel region is an error; report the
  // most specific reason(s) available.
  if ( omp_in_parallel() )
  {
    std::string msg("Kokkos::OpenMP::finalize ERROR ");
    if( !Impl::t_openmp_instance ) msg.append(": not initialized");
    if( omp_in_parallel() ) msg.append(": in parallel");
    Kokkos::Impl::throw_runtime_exception(msg);
  }

  if ( Impl::t_openmp_instance ) {
    // Silence Cuda Warning
    // Use whichever is larger so the parallel region below reaches
    // every thread that may hold thread-local backend state.
    const int nthreads = Impl::t_openmp_instance->m_pool_size <= Impl::g_openmp_hardware_max_threads
      ? Impl::g_openmp_hardware_max_threads
      : Impl::t_openmp_instance->m_pool_size;
    (void) nthreads;

    using Exec = Impl::OpenMPExec;
    Exec * instance = Impl::t_openmp_instance;
    // Explicit destructor call pairs with the placement new performed
    // at initialization; then release the raw storage.
    instance->~Exec();

    OpenMP::memory_space space;
    space.deallocate( instance, sizeof(Exec) );

    // Clear each thread's thread-local id/instance pointers and
    // disable allocation tracking on worker threads.
    #pragma omp parallel num_threads(nthreads)
    {
      Impl::t_openmp_hardware_id = 0;
      Impl::t_openmp_instance = nullptr;
      Impl::SharedAllocationRecord< void, void >::tracking_disable();
    }

    // allow main thread to track
    Impl::SharedAllocationRecord< void, void >::tracking_enable();

    Impl::g_openmp_hardware_max_threads = 1;
  }

#if defined(KOKKOS_ENABLE_PROFILING)
  Kokkos::Profiling::finalize();
#endif
}
// Grow (never shrink) each pool member's scratch allocation so it can
// hold at least the requested pool-reduce, team-reduce, team-shared,
// and thread-local byte counts. If every requested size already fits,
// this is a no-op; otherwise every member is reallocated and the pool
// is re-organized.
void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
                                   , size_t team_reduce_bytes
                                   , size_t team_shared_bytes
                                   , size_t thread_local_bytes )
{
  // Each member is one chunk: [ header padded to int64_t | scratch ].
  const size_t member_bytes =
    sizeof(int64_t) *
    HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );

  HostThreadTeamData * root = m_pool[0] ;

  // Current sizes (zero if the pool has never been allocated).
  const size_t old_pool_reduce  = root ? root->pool_reduce_bytes() : 0 ;
  const size_t old_team_reduce  = root ? root->team_reduce_bytes() : 0 ;
  const size_t old_team_shared  = root ? root->team_shared_bytes() : 0 ;
  const size_t old_thread_local = root ? root->thread_local_bytes() : 0 ;
  const size_t old_alloc_bytes  = root ? ( member_bytes + root->scratch_bytes() ) : 0 ;

  // Allocate if any of the old allocation is too small:
  const bool allocate = ( old_pool_reduce  < pool_reduce_bytes ) ||
                        ( old_team_reduce  < team_reduce_bytes ) ||
                        ( old_team_shared  < team_shared_bytes ) ||
                        ( old_thread_local < thread_local_bytes );

  if ( allocate ) {

    // Grow-only: keep whichever of old/requested is larger for each
    // region so previously granted capacity is never lost.
    if ( pool_reduce_bytes  < old_pool_reduce )  { pool_reduce_bytes  = old_pool_reduce ; }
    if ( team_reduce_bytes  < old_team_reduce )  { team_reduce_bytes  = old_team_reduce ; }
    if ( team_shared_bytes  < old_team_shared )  { team_shared_bytes  = old_team_shared ; }
    if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }

    const size_t alloc_bytes =
      member_bytes +
      HostThreadTeamData::scratch_size( pool_reduce_bytes
                                      , team_reduce_bytes
                                      , team_shared_bytes
                                      , thread_local_bytes );

    OpenMP::memory_space space ;

    memory_fence();

    // NOTE(review): each rank frees and reallocates its own entry
    // inside the parallel region — presumably for first-touch
    // placement on that thread; confirm against clear_thread_data.
#pragma omp parallel num_threads(m_pool_size)
    {
      const int rank = omp_get_thread_num();

      if ( 0 != m_pool[rank] ) {
        // Detach from the old pool before releasing its memory.
        m_pool[rank]->disband_pool();

        space.deallocate( m_pool[rank] , old_alloc_bytes );
      }

      void * const ptr = space.allocate( alloc_bytes );

      m_pool[ rank ] = new( ptr ) HostThreadTeamData();

      // Scratch region begins immediately after the padded header.
      m_pool[ rank ]->
        scratch_assign( ((char *)ptr) + member_bytes
                      , alloc_bytes
                      , pool_reduce_bytes
                      , team_reduce_bytes
                      , team_shared_bytes
                      , thread_local_bytes );

      memory_fence();
    }
/* END #pragma omp parallel */

    // Re-link the freshly allocated members into a single pool.
    HostThreadTeamData::organize_pool( m_pool , m_pool_size );
  }
}