// Pick a power-of-two block size (32..1024) for DriverType's kernel launched
// via cuda_parallel_launch_local_memory<DriverType>, by doubling from one
// warp while the CUDA occupancy calculator still reports at least one
// resident block per multiprocessor.
//
// f                  - user functor; queried for its per-team shared memory need.
// vector_length      - vector lanes per thread; blockSize/vector_length is the team size.
// shmem_extra_block  - extra shared memory (bytes) required once per block.
// shmem_extra_thread - extra shared memory (bytes) required per team member.
//
// Returns the largest verified block size, or -1 when not even a single warp
// can be made resident. (Previously the initial-failure path fell through to
// an unverified blockSize/2 == 16 — below warp size and never occupancy-checked.)
static int get_block_size(const typename DriverType::functor_type & f,
                          const size_t vector_length,
                          const size_t shmem_extra_block,
                          const size_t shmem_extra_thread) {
  int numBlocks;
  int blockSize = 32;  // start at one warp
  // Shared memory demand depends on the team size, so recompute per candidate.
  int sharedmem = shmem_extra_block +
                  shmem_extra_thread * (blockSize / vector_length) +
                  FunctorTeamShmemSize< typename DriverType::functor_type >::value(
                      f, blockSize / vector_length);
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &numBlocks, cuda_parallel_launch_local_memory<DriverType>,
      blockSize, sharedmem);

  // Fix: report failure instead of returning an unverified sub-warp size
  // when even one warp cannot be launched.
  if (numBlocks <= 0) return -1;

  while (blockSize < 1024 && numBlocks > 0) {
    blockSize *= 2;
    sharedmem = shmem_extra_block +
                shmem_extra_thread * (blockSize / vector_length) +
                FunctorTeamShmemSize< typename DriverType::functor_type >::value(
                    f, blockSize / vector_length);
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocks, cuda_parallel_launch_local_memory<DriverType>,
        blockSize, sharedmem);
  }

  // Either 1024 still fits, or the last doubling failed and blockSize/2 was
  // verified in the preceding iteration.
  if (numBlocks > 0) return blockSize;
  return blockSize / 2;
}
// Scan the power-of-two block sizes 32..1024 and return the one with the
// highest estimated occupancy (resident blocks * threads per block) as
// reported by the CUDA occupancy calculator for the local-memory launcher.
// Ties keep the smaller block size. Returns 0 when no candidate yields any
// resident blocks.
static int get_block_size(const typename DriverType::functor_type & f,
                          const size_t vector_length,
                          const size_t shmem_extra_block,
                          const size_t shmem_extra_thread) {
  int bestBlockSize = 0;
  int maxOccupancy  = 0;

  for (int candidate = 32; candidate <= 1024; candidate *= 2) {
    // Shared memory demand depends on the team size (candidate/vector_length).
    const int sharedmem =
        shmem_extra_block + shmem_extra_thread * (candidate / vector_length) +
        FunctorTeamShmemSize< typename DriverType::functor_type >::value(
            f, candidate / vector_length);
    int numBlocks = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocks, cuda_parallel_launch_local_memory<DriverType>,
        candidate, sharedmem);
    // Occupancy proxy: total resident threads per multiprocessor.
    const int occupancy = numBlocks * candidate;
    if (occupancy > maxOccupancy) {
      maxOccupancy  = occupancy;
      bestBlockSize = candidate;
    }
  }
  return bestBlockSize;
}
// Pick the block size maximizing estimated occupancy for DriverType's kernel
// launched via cuda_parallel_launch_constant_memory<DriverType>. Every
// power-of-two candidate from 32 to 1024 is scored as
// (resident blocks per SM) * (threads per block); the best score wins, with
// ties resolved toward the smaller size. Returns 0 if no candidate can be
// made resident at all.
static int get_block_size(const typename DriverType::functor_type & f,
                          const size_t vector_length,
                          const size_t shmem_extra_block,
                          const size_t shmem_extra_thread) {
  int bestBlockSize = 0;
  int maxOccupancy  = 0;
  int candidate     = 32;

  while (candidate <= 1024) {
    // Per-candidate shared memory: fixed block overhead, per-thread overhead,
    // plus whatever the functor itself requests for a team of this size.
    const int sharedmem =
        shmem_extra_block + shmem_extra_thread * (candidate / vector_length) +
        FunctorTeamShmemSize< typename DriverType::functor_type >::value(
            f, candidate / vector_length);
    int numBlocks = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocks, cuda_parallel_launch_constant_memory<DriverType>,
        candidate, sharedmem);
    if (numBlocks * candidate > maxOccupancy) {
      maxOccupancy  = numBlocks * candidate;
      bestBlockSize = candidate;
    }
    candidate <<= 1;  // next power of two
  }
  return bestBlockSize;
}
// Among power-of-two block sizes, return the one with the highest estimated
// occupancy (resident blocks * threads per block) that also honors the
// kernel's launch bounds: at least MinBlocksPerSM resident blocks and at
// most MaxThreadsPerBlock threads. Candidates are additionally capped by the
// device's warp capacity. Returns -1 when no admissible candidate exists.
static int get_block_size(const typename DriverType::functor_type & f,
                          const size_t vector_length,
                          const size_t shmem_extra_block,
                          const size_t shmem_extra_thread) {
  // Hardware may support fewer threads per block than the launch bounds allow.
  const int max_threads_per_block =
      std::min(MaxThreadsPerBlock,
               cuda_internal_maximum_warp_count() * CudaTraits::WarpSize);

  int bestBlockSize = 0;
  int maxOccupancy  = 0;

  // Candidate c is considered exactly when c/2 (the previous candidate) is
  // still below the device limit — same coverage as doubling from 16.
  for (int blockSize = 32; blockSize / 2 < max_threads_per_block;
       blockSize *= 2) {
    const int sharedmem =
        shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
        FunctorTeamShmemSize< typename DriverType::functor_type >::value(
            f, blockSize / vector_length);
    int numBlocks = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocks,
        cuda_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
                                          MinBlocksPerSM>,
        blockSize, sharedmem);
    // Only sizes honoring the __launch_bounds__ constraints are admissible.
    const bool admissible = numBlocks >= int(MinBlocksPerSM) &&
                            blockSize <= int(MaxThreadsPerBlock);
    if (admissible && numBlocks * blockSize > maxOccupancy) {
      maxOccupancy  = numBlocks * blockSize;
      bestBlockSize = blockSize;
    }
  }
  return maxOccupancy > 0 ? bestBlockSize : -1;
}
// Choose a block size for DriverType's kernel launched via
// cuda_parallel_launch_constant_memory with explicit launch bounds
// __launch_bounds__(MaxThreadsPerBlock, MinBlocksPerSM).
//
// Strategy: try MaxThreadsPerBlock; halve until the occupancy calculator
// reports at least MinBlocksPerSM resident blocks per SM; then grow back in
// warp-size (32-thread) steps as long as the requirement still holds.
//
// Returns a verified block size, or -1 when no size meets MinBlocksPerSM.
//
// Fixes vs. the previous revision:
//  * every occupancy query now uses the SAME kernel instantiation
//    (cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
//    MinBlocksPerSM>); the loops previously queried the unbounded
//    <DriverType> instantiation, whose register usage — and therefore
//    occupancy — can differ from the kernel actually launched.
//  * a size whose occupancy is exactly MinBlocksPerSM is no longer rejected
//    with -1 (the old grow loop used a strict '>' test and then required
//    oldNumBlocks >= MinBlocksPerSM, which stayed 0 if the loop never ran).
//  * the last verified growth step is no longer discarded when the loop
//    stops at the upper bound.
static int get_block_size(const typename DriverType::functor_type & f,
                          const size_t vector_length,
                          const size_t shmem_extra_block,
                          const size_t shmem_extra_thread) {
  int numBlocks = 0;
  unsigned int blockSize = MaxThreadsPerBlock;
  unsigned int sharedmem =
      shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
      FunctorTeamShmemSize< typename DriverType::functor_type >::value(
          f, blockSize / vector_length);
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &numBlocks,
      cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
                                           MinBlocksPerSM>,
      blockSize, sharedmem);
  if (static_cast<unsigned int>(numBlocks) >= MinBlocksPerSM) return blockSize;

  // Halve until the minimum resident-block requirement is satisfied
  // (or we are down to a single warp).
  while (blockSize > 32 &&
         static_cast<unsigned int>(numBlocks) < MinBlocksPerSM) {
    blockSize /= 2;
    sharedmem =
        shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
        FunctorTeamShmemSize< typename DriverType::functor_type >::value(
            f, blockSize / vector_length);
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocks,
        cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
                                             MinBlocksPerSM>,
        blockSize, sharedmem);
  }
  // Not even a single warp satisfies MinBlocksPerSM: report failure.
  if (static_cast<unsigned int>(numBlocks) < MinBlocksPerSM) return -1;

  // Grow in warp-size steps while the NEXT size still satisfies the
  // requirement; blockSize always holds the last verified size.
  const unsigned int blockSizeUpperBound =
      (blockSize * 2 < MaxThreadsPerBlock ? blockSize * 2 : MaxThreadsPerBlock);
  while (blockSize + 32 <= blockSizeUpperBound) {
    const unsigned int nextBlockSize = blockSize + 32;
    sharedmem = shmem_extra_block +
                shmem_extra_thread * (nextBlockSize / vector_length) +
                FunctorTeamShmemSize< typename DriverType::functor_type >::value(
                    f, nextBlockSize / vector_length);
    int nextNumBlocks = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &nextNumBlocks,
        cuda_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
                                             MinBlocksPerSM>,
        nextBlockSize, sharedmem);
    if (static_cast<unsigned int>(nextNumBlocks) < MinBlocksPerSM) break;
    blockSize = nextBlockSize;  // verified; commit and try the next step
  }
  return blockSize;
}
// Choose a block size for DriverType's kernel launched via
// cuda_parallel_launch_local_memory<DriverType> (no launch bounds).
//
// Strategy: try 1024; halve until the occupancy calculator reports at least
// one resident block per SM; then grow back in warp-size (32-thread) steps
// as long as at least one block remains resident.
//
// Returns a verified block size, or -1 when not even a single warp can be
// made resident.
//
// Fixes vs. the previous revision:
//  * when even blockSize 32 yielded zero resident blocks, the grow loop was
//    skipped and the function returned 32 - 32 == 0 — a nonsense block size;
//    it now returns -1, matching the failure sentinel of the sibling
//    overloads.
//  * the last verified growth step is no longer discarded when the loop
//    stops at the upper bound.
static int get_block_size(const typename DriverType::functor_type & f,
                          const size_t vector_length,
                          const size_t shmem_extra_block,
                          const size_t shmem_extra_thread) {
  int numBlocks = 0;
  unsigned int blockSize = 1024;
  unsigned int sharedmem =
      shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
      FunctorTeamShmemSize< typename DriverType::functor_type >::value(
          f, blockSize / vector_length);
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &numBlocks, cuda_parallel_launch_local_memory<DriverType>,
      blockSize, sharedmem);
  if (numBlocks > 0) return blockSize;

  // Halve until at least one block fits (or we are down to a single warp).
  while (blockSize > 32 && numBlocks == 0) {
    blockSize /= 2;
    sharedmem =
        shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) +
        FunctorTeamShmemSize< typename DriverType::functor_type >::value(
            f, blockSize / vector_length);
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocks, cuda_parallel_launch_local_memory<DriverType>,
        blockSize, sharedmem);
  }
  // Even one warp cannot be made resident: report failure (was 0 before).
  if (numBlocks == 0) return -1;

  // Grow in warp-size steps while the NEXT size still fits; blockSize always
  // holds the last verified size.
  const unsigned int blockSizeUpperBound = blockSize * 2;
  while (blockSize + 32 <= blockSizeUpperBound) {
    const unsigned int nextBlockSize = blockSize + 32;
    sharedmem = shmem_extra_block +
                shmem_extra_thread * (nextBlockSize / vector_length) +
                FunctorTeamShmemSize< typename DriverType::functor_type >::value(
                    f, nextBlockSize / vector_length);
    int nextNumBlocks = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &nextNumBlocks, cuda_parallel_launch_local_memory<DriverType>,
        nextBlockSize, sharedmem);
    if (nextNumBlocks == 0) break;
    blockSize = nextBlockSize;  // verified; commit and try the next step
  }
  return blockSize;
}
// Computes the largest grid size (block count) at which the
// bulk_then_execute kernel instantiated for these template parameters can
// run with every block resident on `device` simultaneously:
// (max active blocks per multiprocessor) * (number of multiprocessors).
//
// Only `device` and `block_dim` feed into the computation below; `f`,
// `predecessor`, and the three factories are not otherwise read here —
// presumably they exist to fix the kernel's template instantiation via their
// types. Throws (via detail::throw_on_error) on any CUDA error.
__host__ __device__
int max_grid_size_of_bulk_then_execute_concurrent_kernel(
    const agency::cuda::device_id& device,
    const Function& f,
    Shape block_dim,
    const agency::cuda::async_future<T>& predecessor,
    const ResultFactory& result_factory,
    const OuterFactory& outer_factory,
    const InnerFactory& inner_factory)
{
  // Threads per block implied by the (possibly multidimensional) block shape.
  const size_t block_dimension = agency::detail::shape_size<Shape>::value;

  // The concrete __global__ function whose occupancy is queried.
  constexpr auto kernel = detail::bulk_then_execute_kernel<block_dimension,Function,T,ResultFactory,OuterFactory,InnerFactory>::value;

  int max_active_blocks_per_multiprocessor = 0;
  // NOTE(review): in the standard CUDA runtime API the 4th argument of
  // cudaOccupancyMaxActiveBlocksPerMultiprocessor is the dynamic
  // shared-memory size in bytes, and the API has no per-device parameter —
  // yet `device.native_handle()` (a device handle) is passed in that
  // position here. Confirm against the actual signature being resolved
  // (a detail:: wrapper may shadow the runtime API) before trusting this.
  detail::throw_on_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks_per_multiprocessor, kernel, agency::detail::shape_cast<int>(block_dim), device.native_handle()), "cuda::detail::max_grid_size_of_bulk_then_execute_concurrent_kernel(): CUDA error after cudaOccupancyMaxActiveBlocksPerMultiprocessor()");

  // Scale per-SM residency by the device's multiprocessor count.
  int num_multiprocessors = 0;
  detail::throw_on_error(cudaDeviceGetAttribute(&num_multiprocessors, cudaDevAttrMultiProcessorCount, device.native_handle()), "cuda::detail::max_grid_size_of_bulk_then_execute_concurrent_kernel(): CUDA error after cudaDeviceGetAttribute()");

  // One full "wave" of concurrently resident blocks.
  return max_active_blocks_per_multiprocessor * num_multiprocessors;
}