// Release every per-thread HostThreadTeamData allocation held in m_pool.
//
// A parallel region of m_pool_size threads is opened; each thread disbands
// and deallocates the pool entry at its own rank and then nulls that slot.
// The allocation size derived from entry 0 is used for every entry.
void OpenMPExec::clear_thread_data()
{
  // Bytes attributed to the HostThreadTeamData object itself; the scratch
  // bytes reported by the entry are added on top of this below.
  const size_t member_bytes =
    sizeof(int64_t) *
    HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );

  // Kept as size_t, exactly as resize_thread_data does for the same value:
  // the sum is a size_t expression and storing it in an int would narrow it
  // before it is handed to deallocate().
  const size_t old_alloc_bytes =
    m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ;

  OpenMP::memory_space space ;

  #pragma omp parallel num_threads( m_pool_size )
  {
    const int rank = omp_get_thread_num();

    // Slots that were never populated are skipped.
    if ( nullptr != m_pool[rank] ) {

      m_pool[rank]->disband_pool();

      space.deallocate( m_pool[rank] , old_alloc_bytes );

      m_pool[rank] = nullptr ;
    }
  }
/* END #pragma omp parallel */
}
// Invoke `f` once per partition, each partition running with its own
// thread-local OpenMPExec instance.
//
// If omp_get_nested() reports nesting is off, f(0,1) is called directly.
// Otherwise a parallel region of num_partitions threads is opened and every
// thread in it:
//   1. constructs an Exec from partition_size and sizes its thread data,
//   2. calls omp_set_num_threads(partition_size),
//   3. calls f( omp_get_thread_num(), omp_get_num_threads() ),
//   4. destroys and frees its Exec.
// The caller's thread-local instance pointer is put back afterwards.
void OpenMP::partition_master( F const& f
                             , int num_partitions
                             , int partition_size
                             )
{
  if ( ! omp_get_nested() ) {
    // nested openmp not enabled
    f(0,1);
    return;
  }

  using Exec = Impl::OpenMPExec;

  // Saved because the region below overwrites the thread-local pointer.
  Exec * const saved_instance = Impl::t_openmp_instance;

  Exec::validate_partition( saved_instance->m_pool_size, num_partitions, partition_size );

  OpenMP::memory_space space;

  #pragma omp parallel num_threads(num_partitions)
  {
    // Placement-construct this thread's private instance.
    void * const storage = space.allocate( sizeof(Exec) );
    Exec * const local_instance = new (storage) Exec( partition_size );
    Impl::t_openmp_instance = local_instance;

    const size_t pool_reduce_bytes  =   32 * partition_size ;
    const size_t team_reduce_bytes  =   32 * partition_size ;
    const size_t team_shared_bytes  = 1024 * partition_size ;
    const size_t thread_local_bytes = 1024 ;

    Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes
                                               , team_reduce_bytes
                                               , team_shared_bytes
                                               , thread_local_bytes
                                               );

    omp_set_num_threads(partition_size);

    const int partition_rank  = omp_get_thread_num();
    const int partition_count = omp_get_num_threads();
    f( partition_rank, partition_count );

    // Tear down whatever instance the thread-local pointer refers to now.
    Impl::t_openmp_instance->~Exec();
    space.deallocate( Impl::t_openmp_instance, sizeof(Exec) );
    Impl::t_openmp_instance = nullptr;
  }

  Impl::t_openmp_instance = saved_instance;
}
void OpenMP::impl_finalize()
#endif
{
  if ( omp_in_parallel() )
  {
    std::string msg("Kokkos::OpenMP::finalize ERROR ");
    if( !Impl::t_openmp_instance ) msg.append(": not initialized");
    if( omp_in_parallel() ) msg.append(": in parallel");
    Kokkos::Impl::throw_runtime_exception(msg);
  }

  if ( Impl::t_openmp_instance ) {
    // Silence Cuda Warning
    const int nthreads = Impl::t_openmp_instance->m_pool_size <= Impl::g_openmp_hardware_max_threads
                       ? Impl::g_openmp_hardware_max_threads
                       : Impl::t_openmp_instance->m_pool_size;
    (void) nthreads;

    using Exec = Impl::OpenMPExec;
    Exec * instance = Impl::t_openmp_instance;
    instance->~Exec();

    OpenMP::memory_space space;
    space.deallocate( instance, sizeof(Exec) );

    #pragma omp parallel num_threads(nthreads)
    {
      Impl::t_openmp_hardware_id = 0;
      Impl::t_openmp_instance    = nullptr;
      Impl::SharedAllocationRecord< void, void >::tracking_disable();
    }

    // allow main thread to track
    Impl::SharedAllocationRecord< void, void >::tracking_enable();

    Impl::g_openmp_hardware_max_threads = 1;
  }

  #if defined(KOKKOS_ENABLE_PROFILING)
    Kokkos::Profiling::finalize();
  #endif
}
void OpenMP::impl_initialize( int thread_count )
#endif
{
  // Initialize the OpenMP back end.
  //
  // thread_count selects the number of threads to use:
  //   < 0 : keep the maximum reported by the OpenMP runtime
  //  == 0 : use the hwloc-derived process thread count when hwloc is
  //         available, otherwise the runtime maximum
  //   > 0 : use exactly thread_count threads
  //
  // Calling this from inside a parallel region is reported through
  // Kokkos::Impl::throw_runtime_exception.  If an instance already exists
  // it is finalized first.

  if ( omp_in_parallel() ) {
    std::string msg("Kokkos::OpenMP::initialize ERROR : in parallel");
    Kokkos::Impl::throw_runtime_exception(msg);
  }

  if ( Impl::t_openmp_instance )
  {
    // NOTE(review): this file defines OpenMP::impl_finalize(); confirm that
    // the unqualified finalize() resolves to the OpenMP teardown in builds
    // without KOKKOS_ENABLE_DEPRECATED_CODE.
    finalize();
  }

  {
    if ( Kokkos::show_warnings() && nullptr == std::getenv("OMP_PROC_BIND") ) {
      printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n");
      printf("  In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n");
      printf("  For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n");
      printf("  For unit testing set OMP_PROC_BIND=false\n");
    }

    OpenMP::memory_space space ;

    // Before any other call to OMP query the maximum number of threads
    // and save the value for re-initialization unit testing.

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
    Impl::g_openmp_hardware_max_threads = get_current_max_threads();
#else
    Impl::g_openmp_hardware_max_threads = impl_get_current_max_threads();
#endif

    int process_num_threads = Impl::g_openmp_hardware_max_threads;

    if ( Kokkos::hwloc::available() ) {
      process_num_threads = Kokkos::hwloc::get_available_numa_count()
                          * Kokkos::hwloc::get_available_cores_per_numa()
                          * Kokkos::hwloc::get_available_threads_per_core();
    }

    // if thread_count  < 0, use g_openmp_hardware_max_threads;
    // if thread_count == 0, set g_openmp_hardware_max_threads to process_num_threads
    // if thread_count  > 0, set g_openmp_hardware_max_threads to thread_count
    if (thread_count < 0 ) {
      thread_count = Impl::g_openmp_hardware_max_threads;
    }
    else if( thread_count == 0 ) {
      // The zero request gets its own branch so it can never fall through to
      // the explicit-count case below.  Falling through would record 0 as the
      // thread maximum and pass 0 to omp_set_num_threads and to the
      // num_threads clause, both of which require a positive value.
      if( Impl::g_openmp_hardware_max_threads != process_num_threads ) {
        Impl::g_openmp_hardware_max_threads = process_num_threads;
        omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
      }
      // NOTE(review): thread_count itself stays 0 here, so the initial
      // scratch sizes and the over-subscription check below are computed
      // from 0 rather than from the thread count actually used.
    }
    else {
      if( Kokkos::show_warnings() && thread_count > process_num_threads ) {
        printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n");
        printf( "  process threads available : %3d,  requested thread : %3d\n", process_num_threads, thread_count );
      }
      Impl::g_openmp_hardware_max_threads = thread_count;
      omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
    }

    // setup thread local
    #pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads)
    {
      Impl::t_openmp_instance = nullptr;
      Impl::t_openmp_hardware_id = omp_get_thread_num();
      Impl::SharedAllocationRecord< void, void >::tracking_enable();
    }

    // The instance is placement-constructed in memory obtained from the
    // OpenMP memory space; impl_finalize destroys and frees it the same way.
    void * const ptr = space.allocate( sizeof(Impl::OpenMPExec) );

    Impl::t_openmp_instance = new (ptr) Impl::OpenMPExec( Impl::g_openmp_hardware_max_threads );

    // New, unified host thread team data:
    {
      size_t pool_reduce_bytes  =   32 * thread_count ;
      size_t team_reduce_bytes  =   32 * thread_count ;
      size_t team_shared_bytes  = 1024 * thread_count ;
      size_t thread_local_bytes = 1024 ;

      Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes
                                                 , team_reduce_bytes
                                                 , team_shared_bytes
                                                 , thread_local_bytes
                                                 );
    }
  }


  // Check for over-subscription
  if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) {
    std::cerr << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
    std::cerr << "                                    Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
    std::cerr << "                                    Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
    std::cerr << "                                    Requested: " << thread_count << " threads per process." << std::endl;
  }
  // Initialize the lock array used for arbitrarily sized atomics
  Impl::init_lock_array_host_space();

  #if defined(KOKKOS_ENABLE_DEPRECATED_CODE) && defined(KOKKOS_ENABLE_PROFILING)
  Kokkos::Profiling::initialize();
  #endif
}
void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
                                   , size_t team_reduce_bytes
                                   , size_t team_shared_bytes
                                   , size_t thread_local_bytes )
{
  const size_t member_bytes =
    sizeof(int64_t) *
    HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );

  HostThreadTeamData * root = m_pool[0] ;

  const size_t old_pool_reduce  = root ? root->pool_reduce_bytes() : 0 ;
  const size_t old_team_reduce  = root ? root->team_reduce_bytes() : 0 ;
  const size_t old_team_shared  = root ? root->team_shared_bytes() : 0 ;
  const size_t old_thread_local = root ? root->thread_local_bytes() : 0 ;
  const size_t old_alloc_bytes  = root ? ( member_bytes + root->scratch_bytes() ) : 0 ;

  // Allocate if any of the old allocation is tool small:

  const bool allocate = ( old_pool_reduce  < pool_reduce_bytes ) ||
                        ( old_team_reduce  < team_reduce_bytes ) ||
                        ( old_team_shared  < team_shared_bytes ) ||
                        ( old_thread_local < thread_local_bytes );

  if ( allocate ) {

    if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
    if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
    if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
    if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }

    const size_t alloc_bytes =
      member_bytes +
      HostThreadTeamData::scratch_size( pool_reduce_bytes
                                      , team_reduce_bytes
                                      , team_shared_bytes
                                      , thread_local_bytes );

    OpenMP::memory_space space ;

    memory_fence();

    #pragma omp parallel num_threads(m_pool_size)
    {
      const int rank = omp_get_thread_num();

      if ( 0 != m_pool[rank] ) {

        m_pool[rank]->disband_pool();

        space.deallocate( m_pool[rank] , old_alloc_bytes );
      }

      void * const ptr = space.allocate( alloc_bytes );

      m_pool[ rank ] = new( ptr ) HostThreadTeamData();

      m_pool[ rank ]->
        scratch_assign( ((char *)ptr) + member_bytes
                      , alloc_bytes
                      , pool_reduce_bytes
                      , team_reduce_bytes
                      , team_shared_bytes
                      , thread_local_bytes
                      );

      memory_fence();
    }
/* END #pragma omp parallel */

    HostThreadTeamData::organize_pool( m_pool , m_pool_size );
  }
}