static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
    // Find the largest power-of-two block size (32..1024) for which the
    // local-memory launcher still has at least one active block per SM.
    // Returns -1 when not even a single 32-thread block can be made resident.
    int numBlocks;

    int blockSize=32;
    // Shared memory demand = per-block extra + per-team extra + functor
    // request, where a team is blockSize/vector_length threads.
    int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                    FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocks,
        cuda_parallel_launch_local_memory<DriverType>,
        blockSize,
        sharedmem);

    // Even the smallest candidate cannot run: signal failure instead of
    // falling through to return blockSize/2 == 16, a sub-warp size that was
    // never validated (sibling variants use -1 for this case).
    if(numBlocks<=0) return -1;

    while (blockSize<1024 && numBlocks>0) {
      blockSize*=2;
      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );

      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
          &numBlocks,
          cuda_parallel_launch_local_memory<DriverType>,
          blockSize,
          sharedmem);
    }
    // Either 1024 itself still fits, or the last doubling overshot and the
    // previous (half) size is the largest viable one.
    if(numBlocks>0) return blockSize;
    else return blockSize/2;
  }
  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
    // Scan the candidate block sizes 32, 64, ..., 1024 and keep the one that
    // maximizes resident threads per multiprocessor (numBlocks * blockSize)
    // for the local-memory launcher. Returns 0 if every candidate has zero
    // occupancy.
    int bestBlockSize = 0;
    int maxOccupancy  = 0;

    for (int blockSize = 32; blockSize <= 1024; blockSize *= 2) {
      // Shared memory demand = per-block extra + per-team extra + functor
      // request, where a team is blockSize/vector_length threads.
      const int sharedmem =
          shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
          FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );

      int numBlocks = 0;
      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
          &numBlocks,
          cuda_parallel_launch_local_memory<DriverType>,
          blockSize,
          sharedmem);

      const int occupancy = numBlocks * blockSize;
      if (occupancy > maxOccupancy) {
        maxOccupancy  = occupancy;
        bestBlockSize = blockSize;
      }
    }
    return bestBlockSize;
  }
  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
    // Pick the block size in {32, 64, ..., 1024} whose occupancy score
    // (active blocks per SM times threads per block) is highest for the
    // constant-memory launcher. Returns 0 if every candidate scores zero.
    int candidate   = 16;
    int winner      = 0;
    int winnerScore = 0;

    while (candidate < 1024) {
      candidate *= 2;

      // Shared memory demand for `teams` teams of vector_length threads each:
      // per-block extra + per-team extra + what the functor itself requests.
      const int teams = candidate / vector_length;
      const int shmem = shmem_extra_block + shmem_extra_thread*teams +
                        FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , teams );

      int active = 0;
      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
          &active,
          cuda_parallel_launch_constant_memory<DriverType>,
          candidate,
          shmem);

      if (winnerScore < active * candidate) {
        winnerScore = active * candidate;
        winner      = candidate;
      }
    }
    return winner;
  }
  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
    // Scan power-of-two block sizes for the launch-bounds-annotated
    // local-memory launcher and return the one that maximizes resident
    // threads per SM (numBlocks*blockSize), subject to achieving at least
    // MinBlocksPerSM active blocks. Returns -1 when no candidate satisfies
    // the MinBlocksPerSM constraint.
    int blockSize=16;
    int numBlocks;
    int sharedmem;
    int maxOccupancy=0;
    int bestBlockSize=0;
    // Cap candidates by both the kernel's launch bounds and the device's
    // warp capacity (warps per SM times warp size).
    int max_threads_per_block = std::min(MaxThreadsPerBlock,cuda_internal_maximum_warp_count()*CudaTraits::WarpSize);

    while(blockSize < max_threads_per_block ) {
      blockSize*=2;
      // Shared memory demand = per-block extra + per-team extra + functor
      // request, where a team is blockSize/vector_length threads.
      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );

      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
              &numBlocks,
              cuda_parallel_launch_local_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
              blockSize,
              sharedmem);
      // The doubling can overshoot max_threads_per_block when the cap is not
      // a power of two; the blockSize<=MaxThreadsPerBlock guard discards such
      // candidates, and numBlocks>=MinBlocksPerSM enforces the launch bounds'
      // minimum-occupancy promise.
      if(numBlocks >= int(MinBlocksPerSM) && blockSize<=int(MaxThreadsPerBlock)) {
        if(maxOccupancy < numBlocks*blockSize) {
          maxOccupancy = numBlocks*blockSize;
          bestBlockSize = blockSize;
        }
      }
    }
    if(maxOccupancy > 0)
      return bestBlockSize;
    return -1;
  }
  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
    // Largest block size (multiple of 32, <= MaxThreadsPerBlock) for which the
    // launch-bounds-annotated constant-memory launcher achieves at least
    // MinBlocksPerSM active blocks per multiprocessor; -1 if none does.
    int numBlocks = 0, oldNumBlocks = 0;
    unsigned int blockSize=MaxThreadsPerBlock;
    unsigned int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                    FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocks,
        cuda_parallel_launch_constant_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
        blockSize,
        sharedmem);

    if(static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) return blockSize;

    // Halve until the occupancy constraint is met (or we reach one warp).
    // Query the same launch-bounds instantiation as above: the previous code
    // queried cuda_parallel_launch_constant_memory<DriverType> here, i.e.
    // occupancy of a kernel that is never launched.
    while (blockSize>32 && static_cast<unsigned int>(numBlocks)<MinBlocksPerSM) {
      blockSize/=2;
      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );

      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
          &numBlocks,
          cuda_parallel_launch_constant_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
          blockSize,
          sharedmem);
    }
    // Fine-tune upward in warp-size steps while the constraint still holds.
    // The loop condition uses >= so that a size meeting MinBlocksPerSM exactly
    // is not discarded (the old strict > comparison could leave oldNumBlocks
    // at 0 and reject a valid blockSize).
    unsigned int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
    while (blockSize<blockSizeUpperBound && static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) {
      blockSize+=32;
      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
      oldNumBlocks = numBlocks;
      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &numBlocks,
            cuda_parallel_launch_constant_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
            blockSize,
            sharedmem);
    }
    // Loop stopped at the upper bound while the current size still satisfies
    // the constraint: keep it instead of backing off a warp.
    if(static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) return blockSize;
    // Otherwise the previous step was the last one that satisfied it.
    if(static_cast<unsigned int>(oldNumBlocks)>=MinBlocksPerSM) return blockSize - 32;
    return -1;
  }
  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
    // Largest block size (multiple of 32, <= 1024) for which the local-memory
    // launcher has at least one active block per multiprocessor; -1 when not
    // even a single 32-thread block can be made resident.
    int numBlocks;

    unsigned int blockSize=1024;
    // Shared memory demand = per-block extra + per-team extra + functor
    // request, where a team is blockSize/vector_length threads.
    unsigned int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                    FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocks,
        cuda_parallel_launch_local_memory<DriverType>,
        blockSize,
        sharedmem);

    if(numBlocks>0) return blockSize;

    // Halve until at least one block fits (or we reach one warp).
    while (blockSize>32 && numBlocks==0) {
      blockSize/=2;
      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );

      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
          &numBlocks,
          cuda_parallel_launch_local_memory<DriverType>,
          blockSize,
          sharedmem);
    }
    // Even one warp per block cannot be made resident: signal failure instead
    // of falling through to return 32-32 == 0, which is not a launchable
    // block size (sibling variants use -1 for this case).
    if(numBlocks==0) return -1;

    // Fine-tune upward in warp-size steps between the last viable halving and
    // the first size known not to fit.
    unsigned int blockSizeUpperBound = blockSize*2;
    while (blockSize<blockSizeUpperBound && numBlocks>0) {
      blockSize+=32;
      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );

      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &numBlocks,
            cuda_parallel_launch_local_memory<DriverType>,
            blockSize,
            sharedmem);
    }
    // Defensive: if the loop stopped at the upper bound while occupancy was
    // still positive, the current size is valid; otherwise back off one warp
    // to the last size that fit.
    if(numBlocks>0) return blockSize;
    return blockSize - 32;
  }
__host__ __device__
int max_grid_size_of_bulk_then_execute_concurrent_kernel(const agency::cuda::device_id& device, const Function& f, Shape block_dim, const agency::cuda::async_future<T>& predecessor, const ResultFactory& result_factory, const OuterFactory& outer_factory, const InnerFactory& inner_factory)
{
  // Upper bound on the number of blocks that may be concurrently resident on
  // `device` for the bulk_then_execute kernel:
  //   (max active blocks per multiprocessor) * (multiprocessor count).
  // Note: f, predecessor, and the factory parameters are not read here; the
  // kernel is selected purely from the template arguments — presumably the
  // parameters exist for interface uniformity with the caller.
  const size_t block_dimension = agency::detail::shape_size<Shape>::value;
  constexpr auto kernel = detail::bulk_then_execute_kernel<block_dimension,Function,T,ResultFactory,OuterFactory,InnerFactory>::value;

  int max_active_blocks_per_multiprocessor = 0;
  // NOTE(review): CUDA's raw cudaOccupancyMaxActiveBlocksPerMultiprocessor
  // takes (int* numBlocks, func, int blockSize, size_t dynamicSMemSize) — it
  // has no device parameter. Here the 3rd argument is a shape cast and the
  // 4th is device.native_handle(); presumably this resolves to a
  // project-provided overload that dispatches per device. Verify — if the raw
  // runtime API is selected, the device id would be read as a shared-memory
  // byte count and `device` would not influence the occupancy query at all.
  detail::throw_on_error(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks_per_multiprocessor, kernel, agency::detail::shape_cast<int>(block_dim), device.native_handle()), "cuda::detail::max_grid_size_of_bulk_then_execute_concurrent_kernel(): CUDA error after cudaOccupancyMaxActiveBlocksPerMultiprocessor()");

  int num_multiprocessors = 0;
  // SM count for the target device (queried explicitly, so this part does
  // honor `device` regardless of the note above).
  detail::throw_on_error(cudaDeviceGetAttribute(&num_multiprocessors, cudaDevAttrMultiProcessorCount, device.native_handle()), "cuda::detail::max_grid_size_of_bulk_then_execute_concurrent_kernel(): CUDA error after cudaDeviceGetAttribute()");

  return max_active_blocks_per_multiprocessor * num_multiprocessors;
}