__host__ __device__ int max_block_size_of_bulk_then_execute_concurrent_kernel(const agency::cuda::device_id& device, const Function&, const agency::cuda::async_future<T>&, const ResultFactory&, const OuterFactory&, const InnerFactory&) { // temporarily switch the CUDA runtime's current device to the given device scoped_device scope(device); // get a pointer to the kernel which bulk_then_execute_concurent_grid() would launch auto kernel = detail::bulk_then_execute_kernel<block_dimension,Function,T,ResultFactory,OuterFactory,InnerFactory>::value; // get the kernel's attributes cudaFuncAttributes attr; detail::throw_on_error(cudaFuncGetAttributes(&attr, kernel), "cuda::detail::max_block_size_of_bulk_then_execute_concurrent_kernel(): CUDA error after cudaFuncGetAttributes()"); // return the attribute of interest return attr.maxThreadsPerBlock; }
void printFuncAttrib(Func& func) { cudaFuncAttributes attrs; cudaFuncGetAttributes(&attrs, func); printf("=== Function stats ===\n"); printf("Name: \n"); printf("sharedSizeBytes = %d\n", attrs.sharedSizeBytes); printf("constSizeBytes = %d\n", attrs.constSizeBytes); printf("localSizeBytes = %d\n", attrs.localSizeBytes); printf("maxThreadsPerBlock = %d\n", attrs.maxThreadsPerBlock); printf("numRegs = %d\n", attrs.numRegs); printf("ptxVersion = %d\n", attrs.ptxVersion); printf("binaryVersion = %d\n", attrs.binaryVersion); printf("\n"); fflush(stdout); }
void computeNumCTAs(KernelPointer kernel, int smemDynamicBytes, bool bManualCoalesce) { cudaDeviceProp devprop; int deviceID = -1; cudaError_t err = cudaGetDevice(&deviceID); assert(err == cudaSuccess); cudaGetDeviceProperties(&devprop, deviceID); // Determine the maximum number of CTAs that can be run simultaneously for each kernel // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers const unsigned int warpAllocationMultiple = 2; const unsigned int smemAllocationUnit = 512; // in bytes const unsigned int maxThreadsPerSM = bManualCoalesce ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024 const unsigned int maxBlocksPerSM = 8; cudaFuncAttributes attr; err = cudaFuncGetAttributes(&attr, (const char*)kernel); assert(err == cudaSuccess); // Number of warps (round up to nearest whole multiple of warp size) size_t numWarps = multiple(RadixSort::CTA_SIZE, devprop.warpSize); // Round up to warp allocation multiple numWarps = ceiling(numWarps, warpAllocationMultiple); // Number of regs is regs per thread times number of warps times warp size size_t regsPerCTA = attr.numRegs * devprop.warpSize * numWarps; // Round up to multiple of register allocation unit size regsPerCTA = ceiling(regsPerCTA, regAllocationUnit); size_t smemBytes = attr.sharedSizeBytes + smemDynamicBytes; size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit); size_t ctaLimitRegs = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM; size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM; size_t ctaLimitThreads = maxThreadsPerSM / RadixSort::CTA_SIZE; unsigned int numSMs = devprop.multiProcessorCount; int maxCTAs = numSMs * std::min<size_t>(ctaLimitRegs, std::min<size_t>(ctaLimitSMem, std::min<size_t>(ctaLimitThreads, maxBlocksPerSM))); setNumCTAs(kernel, maxCTAs); }
inline __host__ std::size_t maximum_residency(T t, size_t CTA_SIZE, size_t dynamic_smem_bytes) { cudaError_t err; cudaFuncAttributes attributes; err = cudaFuncGetAttributes(&attributes, t); if (err != cudaSuccess) return 0; int device; err = cudaGetDevice(&device); if (err != cudaSuccess) return 0; cudaDeviceProp properties; err = cudaGetDeviceProperties(&properties, device); if (err != cudaSuccess) return 0; return maximum_residency(attributes, properties, CTA_SIZE, dynamic_smem_bytes); }
inline __host__ std::size_t block_size_with_maximum_potential_occupancy(T t) { cudaError_t err; cudaFuncAttributes attributes; err = cudaFuncGetAttributes(&attributes, t); if (err != cudaSuccess) return 0; int device; err = cudaGetDevice(&device); if (err != cudaSuccess) return 0; cudaDeviceProp properties; err = cudaGetDeviceProperties(&properties, device); if (err != cudaSuccess) return 0; return block_size_with_maximum_potential_occupancy(attributes, properties); }
void computeNumCTAs(KernelPointer kernel, int smemDynamicBytes, bool bManualCoalesce) { cudaDeviceProp devprop; int deviceID = -1; cudaError_t err = cudaGetDevice(&deviceID); assert(err == cudaSuccess); cudaGetDeviceProperties(&devprop, deviceID); int smVersion = devprop.major * 10 + devprop.minor; // Determine the maximum number of CTAs that can be run simultaneously for each kernel // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet const unsigned int warpAllocationMultiple = 2; const unsigned int maxBlocksPerSM = 8; unsigned int maxThreadsPerSM = 768; unsigned int regAllocationUnit = 256; // in registers unsigned int smemAllocationUnit = 512; // in bytes bool blockRegisterAllocation = true; // otherwise warp granularity (sm_20) if (smVersion >= 20) { maxThreadsPerSM = 1536; regAllocationUnit = 64; blockRegisterAllocation = false; smemAllocationUnit = 128; } else if (smVersion >= 12) { maxThreadsPerSM = 1024; regAllocationUnit = 512; } cudaFuncAttributes attr; err = cudaFuncGetAttributes(&attr, (const char*)kernel); assert(err == cudaSuccess); // Number of warps (round up to nearest whole multiple of warp size) size_t numWarps = multiple(RadixSort::CTA_SIZE, devprop.warpSize); // Round up to warp allocation multiple numWarps = ceiling(numWarps, warpAllocationMultiple); size_t regsPerCTA = 0; if (blockRegisterAllocation) { // Number of regs is regs per thread times number of warps times warp size // rounded up to multiple of register allocation unit size regsPerCTA = ceiling(attr.numRegs * devprop.warpSize * numWarps, regAllocationUnit); } else { // warp register allocation // Number of regs is regs per thread times warp size, rounded up to multiple of // register allocation unit size, times number of warps. regsPerCTA = ceiling(attr.numRegs * devprop.warpSize, regAllocationUnit) * numWarps; } size_t smemBytes = attr.sharedSizeBytes + smemDynamicBytes; size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit); size_t ctaLimitRegs = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM; size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM; size_t ctaLimitThreads = maxThreadsPerSM / RadixSort::CTA_SIZE; unsigned int numSMs = devprop.multiProcessorCount; int maxCTAs = numSMs * std::min<size_t>(ctaLimitRegs, std::min<size_t>(ctaLimitSMem, std::min<size_t>(ctaLimitThreads, maxBlocksPerSM))); setNumCTAs(kernel, maxCTAs); }
cudaError_t WINAPI wine_cudaFuncGetAttributes( struct cudaFuncAttributes *attr, const char *func ) { WINE_TRACE("\n"); return cudaFuncGetAttributes( attr, func ); }