void computeNumCTAs(KernelPointer kernel, int smemDynamicBytes, bool bManualCoalesce) { cudaDeviceProp devprop; int deviceID = -1; cudaError_t err = cudaGetDevice(&deviceID); assert(err == cudaSuccess); cudaGetDeviceProperties(&devprop, deviceID); // Determine the maximum number of CTAs that can be run simultaneously for each kernel // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers const unsigned int warpAllocationMultiple = 2; const unsigned int smemAllocationUnit = 512; // in bytes const unsigned int maxThreadsPerSM = bManualCoalesce ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024 const unsigned int maxBlocksPerSM = 8; cudaFuncAttributes attr; err = cudaFuncGetAttributes(&attr, (const char*)kernel); assert(err == cudaSuccess); // Number of warps (round up to nearest whole multiple of warp size) size_t numWarps = multiple(RadixSort::CTA_SIZE, devprop.warpSize); // Round up to warp allocation multiple numWarps = ceiling(numWarps, warpAllocationMultiple); // Number of regs is regs per thread times number of warps times warp size size_t regsPerCTA = attr.numRegs * devprop.warpSize * numWarps; // Round up to multiple of register allocation unit size regsPerCTA = ceiling(regsPerCTA, regAllocationUnit); size_t smemBytes = attr.sharedSizeBytes + smemDynamicBytes; size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit); size_t ctaLimitRegs = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM; size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM; size_t ctaLimitThreads = maxThreadsPerSM / RadixSort::CTA_SIZE; unsigned int numSMs = devprop.multiProcessorCount; int maxCTAs = numSMs * std::min<size_t>(ctaLimitRegs, std::min<size_t>(ctaLimitSMem, std::min<size_t>(ctaLimitThreads, maxBlocksPerSM))); setNumCTAs(kernel, maxCTAs); }
void computeNumCTAs(KernelPointer kernel, int smemDynamicBytes, bool bManualCoalesce) { cudaDeviceProp devprop; int deviceID = -1; cudaError_t err = cudaGetDevice(&deviceID); assert(err == cudaSuccess); cudaGetDeviceProperties(&devprop, deviceID); int smVersion = devprop.major * 10 + devprop.minor; // Determine the maximum number of CTAs that can be run simultaneously for each kernel // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet const unsigned int warpAllocationMultiple = 2; const unsigned int maxBlocksPerSM = 8; unsigned int maxThreadsPerSM = 768; unsigned int regAllocationUnit = 256; // in registers unsigned int smemAllocationUnit = 512; // in bytes bool blockRegisterAllocation = true; // otherwise warp granularity (sm_20) if (smVersion >= 20) { maxThreadsPerSM = 1536; regAllocationUnit = 64; blockRegisterAllocation = false; smemAllocationUnit = 128; } else if (smVersion >= 12) { maxThreadsPerSM = 1024; regAllocationUnit = 512; } cudaFuncAttributes attr; err = cudaFuncGetAttributes(&attr, (const char*)kernel); assert(err == cudaSuccess); // Number of warps (round up to nearest whole multiple of warp size) size_t numWarps = multiple(RadixSort::CTA_SIZE, devprop.warpSize); // Round up to warp allocation multiple numWarps = ceiling(numWarps, warpAllocationMultiple); size_t regsPerCTA = 0; if (blockRegisterAllocation) { // Number of regs is regs per thread times number of warps times warp size // rounded up to multiple of register allocation unit size regsPerCTA = ceiling(attr.numRegs * devprop.warpSize * numWarps, regAllocationUnit); } else { // warp register allocation // Number of regs is regs per thread times warp size, rounded up to multiple of // register allocation unit size, times number of warps. regsPerCTA = ceiling(attr.numRegs * devprop.warpSize, regAllocationUnit) * numWarps; } size_t smemBytes = attr.sharedSizeBytes + smemDynamicBytes; size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit); size_t ctaLimitRegs = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM; size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM; size_t ctaLimitThreads = maxThreadsPerSM / RadixSort::CTA_SIZE; unsigned int numSMs = devprop.multiProcessorCount; int maxCTAs = numSMs * std::min<size_t>(ctaLimitRegs, std::min<size_t>(ctaLimitSMem, std::min<size_t>(ctaLimitThreads, maxBlocksPerSM))); setNumCTAs(kernel, maxCTAs); }