/// Pick the power-of-two block size (32, 64, ...) that maximizes the number of
/// resident threads per multiprocessor (active blocks * block size) for the
/// kernel cuda_parallel_launch_local_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>.
///
/// \param f                  functor passed to FunctorTeamShmemSize to query its
///                           shared-memory requirement
/// \param vector_length      divisor applied to the block size; the quotient is
///                           what the shared-memory terms are scaled by
///                           (presumably the team size -- confirm with callers)
/// \param shmem_extra_block  additional dynamic shared memory requested per block
/// \param shmem_extra_thread additional dynamic shared memory requested per
///                           (blockSize / vector_length) unit
/// \return the best block size found, or -1 when no candidate yields at least
///         MinBlocksPerSM active blocks (also when vector_length is 0 or every
///         occupancy query fails)
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
    // blockSize/vector_length below would divide by zero; no candidate is valid.
    if(vector_length == 0)
      return -1;

    int maxOccupancy=0;
    int bestBlockSize=0;
    const int max_threads_per_block = std::min(MaxThreadsPerBlock,cuda_internal_maximum_warp_count()*CudaTraits::WarpSize);

    // Candidates start at 32 and double. The loop bound keeps every candidate
    // within max_threads_per_block (and therefore within MaxThreadsPerBlock),
    // even when that limit is not itself a power of two.
    for(int blockSize=32; blockSize <= max_threads_per_block; blockSize*=2) {
      const size_t team_size = blockSize/vector_length;

      // Kept as size_t: the occupancy API takes the dynamic shared memory size
      // as size_t, so narrowing to int first could only lose information.
      const size_t sharedmem = shmem_extra_block + shmem_extra_thread*team_size +
                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , team_size );

      // Initialized so a failed query can never leave an indeterminate value.
      int numBlocks = 0;
      const cudaError_t status = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
              &numBlocks,
              cuda_parallel_launch_local_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
              blockSize,
              sharedmem);

      // A candidate whose occupancy cannot be determined is not eligible.
      if(status != cudaSuccess)
        continue;

      if(numBlocks >= int(MinBlocksPerSM)) {
        const int occupancy = numBlocks*blockSize;
        // Strict comparison: on ties the smaller block size found first wins.
        if(maxOccupancy < occupancy) {
          maxOccupancy = occupancy;
          bestBlockSize = blockSize;
        }
      }
    }
    if(maxOccupancy > 0)
      return bestBlockSize;
    return -1;
  }
Example #2
0
  /// Store the functor and work count, then launch this object through
  /// CudaParallelLaunch on a one-dimensional grid with no dynamic shared memory.
  ///
  /// \param functor  functor copied into m_functor
  /// \param work     work count copied into m_work
  ParallelFor( const FunctorType  & functor ,
               const size_t         work )
    : m_functor( functor )
    , m_work(    work )
    {
      // Block width: warp size times the value of cuda_internal_maximum_warp_count().
      const unsigned int threads_per_block = CudaTraits::WarpSize * cuda_internal_maximum_warp_count();
      const dim3 launch_block( threads_per_block );

      // Ceiling division of the work count by the block width, capped at
      // cuda_internal_maximum_grid_count(); y and z extents default to 1.
      const dim3 launch_grid( std::min( ( m_work + threads_per_block - 1 ) / threads_per_block ,
                                        cuda_internal_maximum_grid_count() ) );

      CudaParallelLaunch< ParallelFor >( *this , launch_grid , launch_block , 0 );
    }
Example #3
0
  /// Store the functor, clamp the requested league and team sizes, record the
  /// functor's shared-memory size, then launch this object through
  /// CudaParallelLaunch with one block per league entry.
  ///
  /// \param functor  functor copied into m_functor and queried via FunctorShmemSize
  /// \param work     request whose league_size is capped at
  ///                 cuda_internal_maximum_grid_count() and whose team_size is
  ///                 capped at WarpSize * cuda_internal_maximum_warp_count()
  ParallelFor( const FunctorType         & functor ,
               const ParallelWorkRequest &  work )
    : m_functor( functor )
    , m_work( std::min( work.league_size ,
                        static_cast<size_t>( cuda_internal_maximum_grid_count() ) ) ,
              std::min( work.team_size ,
                        static_cast<size_t>( CudaTraits::WarpSize * cuda_internal_maximum_warp_count() ) ) )
    , m_shmem( FunctorShmemSize< FunctorType >::value( functor ) )
    {
      // Both launch dimensions are one-dimensional; dim3 defaults y and z to 1.
      const dim3 team_block( m_work.team_size );
      const dim3 league_grid( m_work.league_size );

      CudaParallelLaunch< ParallelFor >( *this , league_grid , team_block , m_shmem );
    }
  /// Compute the thread-block shape for a block of the given dimension.
  ///
  /// The x extent equals block.dimension(). The y extent is the smaller of
  /// (a) how many rows of that width fit in
  /// cuda_internal_maximum_warp_count() * CudaTraits::WarpSize threads and
  /// (b) (1 + dimension) / 2.
  ///
  /// \param block  object whose dimension() supplies the x extent
  /// \return dim3( dimension , y , 1 )
  /// \throws std::runtime_error when the dimension is not positive, or when it
  ///         exceeds the maximum thread count so that not even one row fits
  __host__
  static dim3 thread_block( const block_type & block )
  {
    const int d = block.dimension();

    // Guard the division below: zero would divide by zero and a negative
    // value would produce a meaningless launch shape.
    if ( d <= 0 ) {
      throw std::runtime_error( std::string("Kokkos::Impl::Multiply< SymmetricDiagonalSpec<Cuda> > ERROR: block dimension must be positive") );
    }

    const int y = ( cuda_internal_maximum_warp_count() * CudaTraits::WarpSize ) / d ;

    if ( 0 == y ) {
      throw std::runtime_error( std::string("Kokkos::Impl::Multiply< SymmetricDiagonalSpec<Cuda> > ERROR: block too large") );
    }

    // dimension X #diagonals to concurrently process
    return dim3( d , std::min( y , ( 1 + d ) / 2 ) , 1 );
  }