__host__ __device__
int max_block_size_of_bulk_then_execute_concurrent_kernel(const agency::cuda::device_id& device, const Function&, const agency::cuda::async_future<T>&, const ResultFactory&, const OuterFactory&, const InnerFactory&)
{
  // temporarily switch the CUDA runtime's current device to the given device
  scoped_device scope(device);

  // get a pointer to the kernel which bulk_then_execute_concurrent_grid() would launch
  auto kernel = detail::bulk_then_execute_kernel<block_dimension,Function,T,ResultFactory,OuterFactory,InnerFactory>::value;

  // get the kernel's attributes
  cudaFuncAttributes attr;
  detail::throw_on_error(cudaFuncGetAttributes(&attr, kernel), "cuda::detail::max_block_size_of_bulk_then_execute_concurrent_kernel(): CUDA error after cudaFuncGetAttributes()");

  // return the attribute of interest
  return attr.maxThreadsPerBlock;
}
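
The same maxThreadsPerBlock query can be reduced to a minimal standalone sketch, assuming an ordinary __global__ kernel instead of the Agency-generated bulk_then_execute kernel (dummy_kernel and max_block_size_of below are hypothetical names, not part of the listing above):

#include <cstdio>
#include <cuda_runtime.h>

// hypothetical placeholder kernel
__global__ void dummy_kernel() {}

// query the per-kernel launch limit via cudaFuncGetAttributes
int max_block_size_of(const void* kernel)
{
  cudaFuncAttributes attr;
  if (cudaFuncGetAttributes(&attr, kernel) != cudaSuccess)
    return 0;
  return attr.maxThreadsPerBlock;
}

int main()
{
  std::printf("maxThreadsPerBlock = %d\n",
              max_block_size_of(reinterpret_cast<const void*>(dummy_kernel)));
  return 0;
}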
Example #2
    void printFuncAttrib(Func& func)
    {

        cudaFuncAttributes attrs;
        cudaFuncGetAttributes(&attrs, func);

        printf("=== Function stats ===\n");
        printf("Name: \n");
        printf("sharedSizeBytes    = %d\n", attrs.sharedSizeBytes);
        printf("constSizeBytes     = %d\n", attrs.constSizeBytes);
        printf("localSizeBytes     = %d\n", attrs.localSizeBytes);
        printf("maxThreadsPerBlock = %d\n", attrs.maxThreadsPerBlock);
        printf("numRegs            = %d\n", attrs.numRegs);
        printf("ptxVersion         = %d\n", attrs.ptxVersion);
        printf("binaryVersion      = %d\n", attrs.binaryVersion);
        printf("\n");
        fflush(stdout);
    }
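
A hypothetical call site for the helper above (assuming Func is a template parameter and saxpy_kernel is a __global__ function in the same translation unit), printing the register and memory footprint reported by the CUDA runtime:

__global__ void saxpy_kernel(float a, const float* x, float* y, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];
}

// ...
printFuncAttrib(saxpy_kernel);   // e.g. numRegs, sharedSizeBytes, maxThreadsPerBlock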
Example #3
void computeNumCTAs(KernelPointer kernel, int smemDynamicBytes, bool bManualCoalesce)
{
    cudaDeviceProp devprop;
    int deviceID = -1;
    cudaError_t err = cudaGetDevice(&deviceID);
    assert(err == cudaSuccess);

    cudaGetDeviceProperties(&devprop, deviceID);

    // Determine the maximum number of CTAs that can be run simultaneously for each kernel
    // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
    const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers
    const unsigned int warpAllocationMultiple = 2;
    const unsigned int smemAllocationUnit = 512;                                                 // in bytes
    const unsigned int maxThreadsPerSM = bManualCoalesce ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024
    const unsigned int maxBlocksPerSM = 8;

    cudaFuncAttributes attr;
    err = cudaFuncGetAttributes(&attr, (const char*)kernel);
    assert(err == cudaSuccess);


    // Number of warps (round up to nearest whole multiple of warp size)
    size_t numWarps = multiple(RadixSort::CTA_SIZE, devprop.warpSize);
    // Round up to warp allocation multiple
    numWarps = ceiling(numWarps, warpAllocationMultiple);

    // Number of regs is regs per thread times number of warps times warp size
    size_t regsPerCTA = attr.numRegs * devprop.warpSize * numWarps;
    // Round up to multiple of register allocation unit size
    regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);

    size_t smemBytes = attr.sharedSizeBytes + smemDynamicBytes;
    size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);

    size_t ctaLimitRegs    = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM;
    size_t ctaLimitSMem    = smemPerCTA > 0 ? devprop.sharedMemPerBlock      / smemPerCTA : maxBlocksPerSM;
    size_t ctaLimitThreads =                  maxThreadsPerSM                / RadixSort::CTA_SIZE;

    unsigned int numSMs = devprop.multiProcessorCount;
    int maxCTAs = numSMs * std::min<size_t>(ctaLimitRegs, std::min<size_t>(ctaLimitSMem, std::min<size_t>(ctaLimitThreads, maxBlocksPerSM)));
    setNumCTAs(kernel, maxCTAs);
}
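
This snippet (and Example #6 below) relies on rounding helpers multiple() and ceiling() that are not shown. Judging from the surrounding comments, a minimal sketch could look like the following; this is an assumption, not the original implementation:

// multiple(x, y): x divided by y, rounded up (e.g. threads -> whole warps)
inline size_t multiple(size_t x, size_t y) { return (x + y - 1) / y; }

// ceiling(x, y): x rounded up to the nearest whole multiple of y
inline size_t ceiling(size_t x, size_t y)  { return y * ((x + y - 1) / y); }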
Example #4
inline __host__
std::size_t maximum_residency(T t, size_t CTA_SIZE, size_t dynamic_smem_bytes)
{
  cudaError_t err;
  cudaFuncAttributes attributes;
  err = cudaFuncGetAttributes(&attributes, t);

  if (err != cudaSuccess)
    return 0;

  int device;
  err = cudaGetDevice(&device);

  if (err != cudaSuccess)
    return 0;

  cudaDeviceProp properties;
  err = cudaGetDeviceProperties(&properties, device);
  
  if (err != cudaSuccess)
    return 0;

  return maximum_residency(attributes, properties, CTA_SIZE, dynamic_smem_bytes);
}
Example #5
inline __host__
std::size_t block_size_with_maximum_potential_occupancy(T t)
{
  cudaError_t err;
  cudaFuncAttributes attributes;
  err = cudaFuncGetAttributes(&attributes, t);

  if (err != cudaSuccess)
    return 0;

  int device;
  err = cudaGetDevice(&device);

  if (err != cudaSuccess)
    return 0;

  cudaDeviceProp properties;
  err = cudaGetDeviceProperties(&properties, device);
  
  if (err != cudaSuccess)
    return 0;

  return block_size_with_maximum_potential_occupancy(attributes, properties);
}
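
Since CUDA 6.5, both quantities computed in Examples #4 and #5 can also be obtained directly from the runtime's occupancy API. A minimal sketch for comparison, assuming a hypothetical my_kernel:

__global__ void my_kernel() {}

void occupancy_via_runtime_api()
{
  // resident blocks per multiprocessor for a fixed block size
  // (cf. maximum_residency)
  int blocksPerSM = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, my_kernel,
                                                /*blockSize=*/256,
                                                /*dynamicSMemSize=*/0);

  // block size with maximum potential occupancy
  // (cf. block_size_with_maximum_potential_occupancy)
  int minGridSize = 0, blockSize = 0;
  cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, my_kernel,
                                     /*dynamicSMemSize=*/0);
}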
Example #6
void computeNumCTAs(KernelPointer kernel, int smemDynamicBytes, bool bManualCoalesce)
{
    cudaDeviceProp devprop;
    int deviceID = -1;
    cudaError_t err = cudaGetDevice(&deviceID);
    assert(err == cudaSuccess);

    cudaGetDeviceProperties(&devprop, deviceID);

    int smVersion = devprop.major * 10 + devprop.minor;
    // Determine the maximum number of CTAs that can be run simultaneously for each kernel
    // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
    
    const unsigned int warpAllocationMultiple = 2;
    const unsigned int maxBlocksPerSM = 8;
    unsigned int maxThreadsPerSM = 768;
    unsigned int regAllocationUnit = 256;  // in registers
    unsigned int smemAllocationUnit = 512; // in bytes
    bool blockRegisterAllocation = true;   // otherwise warp granularity (sm_20)

    if (smVersion >= 20)
    {
        maxThreadsPerSM = 1536;
        regAllocationUnit = 64;
        blockRegisterAllocation = false;
        smemAllocationUnit = 128;
    }
    else if (smVersion >= 12)
    {
        maxThreadsPerSM = 1024;
        regAllocationUnit = 512;
    }
   
    cudaFuncAttributes attr;
    err = cudaFuncGetAttributes(&attr, (const char*)kernel);
    assert(err == cudaSuccess);

    // Number of warps (round up to nearest whole multiple of warp size)
    size_t numWarps = multiple(RadixSort::CTA_SIZE, devprop.warpSize);
    // Round up to warp allocation multiple
    numWarps = ceiling(numWarps, warpAllocationMultiple);

    size_t regsPerCTA = 0;

    if (blockRegisterAllocation)
    {
        // Number of regs is regs per thread times number of warps times warp size
        // rounded up to multiple of register allocation unit size
        regsPerCTA = ceiling(attr.numRegs * devprop.warpSize * numWarps, regAllocationUnit);
    }
    else
    {
        // warp register allocation
        // Number of regs is regs per thread times warp size, rounded up to multiple of 
        // register allocation unit size, times number of warps.
        regsPerCTA = ceiling(attr.numRegs * devprop.warpSize, regAllocationUnit) * numWarps;
    }

    size_t smemBytes = attr.sharedSizeBytes + smemDynamicBytes;
    size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);

    size_t ctaLimitRegs    = regsPerCTA > 0 ? devprop.regsPerBlock           / regsPerCTA : maxBlocksPerSM;
    size_t ctaLimitSMem    = smemPerCTA > 0 ? devprop.sharedMemPerBlock      / smemPerCTA : maxBlocksPerSM;
    size_t ctaLimitThreads =                  maxThreadsPerSM                / RadixSort::CTA_SIZE;

    unsigned int numSMs = devprop.multiProcessorCount;
    int maxCTAs = numSMs * std::min<size_t>(ctaLimitRegs, std::min<size_t>(ctaLimitSMem, std::min<size_t>(ctaLimitThreads, maxBlocksPerSM)));
    setNumCTAs(kernel, maxCTAs);
}
Example #7
cudaError_t WINAPI wine_cudaFuncGetAttributes( struct cudaFuncAttributes *attr, const char *func ) {
    WINE_TRACE("\n");
    return cudaFuncGetAttributes( attr, func );
}