示例#1
0
// Estimates how many CTAs of `kernel` can be resident on the current device at
// the same time and stores the result through setNumCTAs().
//
// The estimate is the per-SM minimum of four limits (registers, shared memory,
// threads, hard block cap) multiplied by the number of multiprocessors.
//
// Parameters:
//   kernel           - kernel whose attributes are queried with
//                      cudaFuncGetAttributes() and for which the result is stored.
//   smemDynamicBytes - dynamic shared memory per CTA, added to the static
//                      shared size reported for the kernel.
//   bManualCoalesce  - selects the per-SM thread limit used here: 768 when true,
//                      1024 when false.
//
// If any CUDA runtime query fails the function asserts in debug builds and
// returns without calling setNumCTAs() in release builds, instead of computing
// a result from uninitialised device/function attributes.
void computeNumCTAs(KernelPointer kernel, int smemDynamicBytes, bool bManualCoalesce)
{
    int deviceID = -1;
    cudaError_t err = cudaGetDevice(&deviceID);
    assert(err == cudaSuccess);
    if (err != cudaSuccess)
        return; // asserts may be compiled out; never continue with deviceID == -1

    cudaDeviceProp devprop;
    err = cudaGetDeviceProperties(&devprop, deviceID);
    assert(err == cudaSuccess);
    if (err != cudaSuccess)
        return; // devprop is uninitialised on failure (warpSize could be 0)

    // Determine the maximum number of CTAs that can be run simultaneously for each kernel
    // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
    const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers
    const unsigned int warpAllocationMultiple = 2;
    const unsigned int smemAllocationUnit = 512;                                                 // in bytes
    const unsigned int maxThreadsPerSM = bManualCoalesce ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024
    const unsigned int maxBlocksPerSM = 8;

    cudaFuncAttributes attr;
    err = cudaFuncGetAttributes(&attr, (const char*)kernel);
    assert(err == cudaSuccess);
    if (err != cudaSuccess)
        return; // attr is uninitialised on failure

    // Number of warps per CTA. multiple() is defined elsewhere; it is presumably
    // a round-up division of the CTA size by the warp size -- verify against its definition.
    size_t numWarps = multiple(RadixSort::CTA_SIZE, devprop.warpSize);
    // Round up to warp allocation multiple
    numWarps = ceiling(numWarps, warpAllocationMultiple);

    // Number of regs is regs per thread times number of warps times warp size
    size_t regsPerCTA = attr.numRegs * devprop.warpSize * numWarps;
    // Round up to multiple of register allocation unit size
    regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);

    // Static shared memory reported for the kernel plus the caller-supplied dynamic amount,
    // rounded up to the shared memory allocation unit.
    size_t smemBytes = attr.sharedSizeBytes + smemDynamicBytes;
    size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);

    // A resource the kernel does not use imposes no limit beyond the hard block cap.
    size_t ctaLimitRegs    = regsPerCTA > 0 ? devprop.regsPerBlock      / regsPerCTA : maxBlocksPerSM;
    size_t ctaLimitSMem    = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
    size_t ctaLimitThreads =                  maxThreadsPerSM           / RadixSort::CTA_SIZE;

    // The tightest of the four limits decides how many CTAs fit on one SM.
    const size_t ctasPerSM =
        std::min<size_t>(ctaLimitRegs,
        std::min<size_t>(ctaLimitSMem,
        std::min<size_t>(ctaLimitThreads, maxBlocksPerSM)));

    unsigned int numSMs = devprop.multiProcessorCount;
    int maxCTAs = static_cast<int>(numSMs * ctasPerSM);
    setNumCTAs(kernel, maxCTAs);
}
示例#2
0
// Estimates how many CTAs of `kernel` can be resident on the current device at
// the same time and stores the result through setNumCTAs().
//
// The allocation parameters are chosen from the device's compute capability
// (major * 10 + minor):
//   >= 20 : 1536 threads/SM, register unit 64 allocated per warp, smem unit 128 bytes
//   >= 12 : 1024 threads/SM, register unit 512 allocated per block, smem unit 512 bytes
//   else  :  768 threads/SM, register unit 256 allocated per block, smem unit 512 bytes
// The result is the per-SM minimum of the register, shared memory, thread and
// hard block limits, multiplied by the number of multiprocessors.
//
// Parameters:
//   kernel           - kernel whose attributes are queried with
//                      cudaFuncGetAttributes() and for which the result is stored.
//   smemDynamicBytes - dynamic shared memory per CTA, added to the static
//                      shared size reported for the kernel.
//   bManualCoalesce  - not used by this implementation; kept so existing callers
//                      keep compiling.
//
// If any CUDA runtime query fails the function asserts in debug builds and
// returns without calling setNumCTAs() in release builds, instead of computing
// a result from uninitialised device/function attributes.
void computeNumCTAs(KernelPointer kernel, int smemDynamicBytes, bool bManualCoalesce)
{
    (void)bManualCoalesce; // limits are derived from the compute capability instead

    int deviceID = -1;
    cudaError_t err = cudaGetDevice(&deviceID);
    assert(err == cudaSuccess);
    if (err != cudaSuccess)
        return; // asserts may be compiled out; never continue with deviceID == -1

    cudaDeviceProp devprop;
    err = cudaGetDeviceProperties(&devprop, deviceID);
    assert(err == cudaSuccess);
    if (err != cudaSuccess)
        return; // devprop is uninitialised on failure (warpSize could be 0)

    int smVersion = devprop.major * 10 + devprop.minor;
    // Determine the maximum number of CTAs that can be run simultaneously for each kernel
    // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet

    const unsigned int warpAllocationMultiple = 2;
    const unsigned int maxBlocksPerSM = 8;
    unsigned int maxThreadsPerSM = 768;
    unsigned int regAllocationUnit = 256;  // in registers
    unsigned int smemAllocationUnit = 512; // in bytes
    bool blockRegisterAllocation = true;   // otherwise warp granularity (sm_20)

    if (smVersion >= 20)
    {
        maxThreadsPerSM = 1536;
        regAllocationUnit = 64;
        blockRegisterAllocation = false;
        smemAllocationUnit = 128;
    }
    else if (smVersion >= 12)
    {
        maxThreadsPerSM = 1024;
        regAllocationUnit = 512;
    }

    cudaFuncAttributes attr;
    err = cudaFuncGetAttributes(&attr, (const char*)kernel);
    assert(err == cudaSuccess);
    if (err != cudaSuccess)
        return; // attr is uninitialised on failure

    // Number of warps per CTA. multiple() is defined elsewhere; it is presumably
    // a round-up division of the CTA size by the warp size -- verify against its definition.
    size_t numWarps = multiple(RadixSort::CTA_SIZE, devprop.warpSize);
    // Round up to warp allocation multiple
    numWarps = ceiling(numWarps, warpAllocationMultiple);

    size_t regsPerCTA = 0;

    if (blockRegisterAllocation)
    {
        // Number of regs is regs per thread times number of warps times warp size
        // rounded up to multiple of register allocation unit size
        regsPerCTA = ceiling(attr.numRegs * devprop.warpSize * numWarps, regAllocationUnit);
    }
    else
    {
        // warp register allocation
        // Number of regs is regs per thread times warp size, rounded up to multiple of
        // register allocation unit size, times number of warps.
        regsPerCTA = ceiling(attr.numRegs * devprop.warpSize, regAllocationUnit) * numWarps;
    }

    // Static shared memory reported for the kernel plus the caller-supplied dynamic amount,
    // rounded up to the shared memory allocation unit.
    size_t smemBytes = attr.sharedSizeBytes + smemDynamicBytes;
    size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);

    // A resource the kernel does not use imposes no limit beyond the hard block cap.
    size_t ctaLimitRegs    = regsPerCTA > 0 ? devprop.regsPerBlock      / regsPerCTA : maxBlocksPerSM;
    size_t ctaLimitSMem    = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
    size_t ctaLimitThreads =                  maxThreadsPerSM           / RadixSort::CTA_SIZE;

    // The tightest of the four limits decides how many CTAs fit on one SM.
    const size_t ctasPerSM =
        std::min<size_t>(ctaLimitRegs,
        std::min<size_t>(ctaLimitSMem,
        std::min<size_t>(ctaLimitThreads, maxBlocksPerSM)));

    unsigned int numSMs = devprop.multiProcessorCount;
    int maxCTAs = static_cast<int>(numSMs * ctasPerSM);
    setNumCTAs(kernel, maxCTAs);
}