inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device) { CUresult error_result = cuDeviceGetAttribute( attribute, device_attribute, device ); if (error_result != CUDA_SUCCESS) { shrLog( "cuDeviceGetAttribute returned %d\n-> %s\n", (int)error_result, getCudaDrvErrorString(error_result) ); exit(0); } }
// These are the inline versions for all of the SDK helper functions inline void __checkCudaErrors(CUresult err, const char *file, const int line) { if (CUDA_SUCCESS != err) { fprintf(stderr, "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, line %i.\n", err, getCudaDrvErrorString(err), file, line); exit(EXIT_FAILURE); } }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { CUdevice dev; int major = 0, minor = 0; int deviceCount = 0; char deviceName[256]; shrQAStart(argc, argv); // note your project will need to link with cuda.lib files on windows printf("CUDA Device Query (Driver API) statically linked version \n"); CUresult error_id = cuInit(0); if (error_id != CUDA_SUCCESS) { printf("cuInit(0) returned %d\n-> %s\n", error_id, getCudaDrvErrorString(error_id)); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } error_id = cuDeviceGetCount(&deviceCount); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceGetCount returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) printf("There are no available device(s) that support CUDA\n"); else if (deviceCount == 1) printf("There is 1 device supporting CUDA\n"); else printf("There are %d devices supporting CUDA\n", deviceCount); for (dev = 0; dev < deviceCount; ++dev) { error_id = cuDeviceComputeCapability(&major, &minor, dev); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceComputeCapability returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } error_id = cuDeviceGetName(deviceName, 256, dev); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceGetName returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } printf("\nDevice %d: \"%s\"\n", dev, deviceName); #if CUDA_VERSION >= 2020 int driverVersion = 0; cuDriverGetVersion(&driverVersion); printf(" CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10); #endif shrLog(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor); size_t totalGlobalMem; error_id = cuDeviceTotalMem(&totalGlobalMem, dev); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceTotalMem returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } char msg[256]; sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)totalGlobalMem/1048576.0f, (unsigned long long) totalGlobalMem); shrLog(msg); #if CUDA_VERSION >= 2000 int multiProcessorCount; getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); shrLog(" (%2d) Multiprocessors x (%3d) CUDA Cores/MP: %d CUDA Cores\n", multiProcessorCount, ConvertSMVer2Cores(major, minor), ConvertSMVer2Cores(major, minor) * multiProcessorCount); #endif int clockRate; getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); printf(" GPU Clock rate: %.0f MHz (%0.2f GHz)\n", clockRate * 1e-3f, clockRate * 1e-6f); #if CUDA_VERSION >= 4000 int memoryClock; getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); shrLog(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f); int memBusWidth; getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev ); shrLog(" Memory Bus Width: %d-bit\n", memBusWidth); int L2CacheSize; getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev ); if (L2CacheSize) { shrLog(" L2 Cache Size: %d bytes\n", L2CacheSize); } int maxTex1D, maxTex2D[2], maxTex3D[3]; getCudaAttribute<int>( &maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev ); getCudaAttribute<int>( &maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev ); getCudaAttribute<int>( &maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev ); getCudaAttribute<int>( &maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev ); getCudaAttribute<int>( &maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev ); getCudaAttribute<int>( &maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev ); shrLog(" Max Texture Dimension Sizes 1D=(%d) 2D=(%d,%d) 3D=(%d,%d,%d)\n", maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1], maxTex3D[2]); int maxTex2DLayered[3]; getCudaAttribute<int>( &maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev ); getCudaAttribute<int>( &maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev ); getCudaAttribute<int>( &maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev ); shrLog(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", maxTex2DLayered[0], maxTex2DLayered[2], maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]); #endif int totalConstantMemory; getCudaAttribute<int>( &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev ); printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory); int sharedMemPerBlock; getCudaAttribute<int>( &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev ); printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock); int regsPerBlock; getCudaAttribute<int>( ®sPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev ); printf(" Total number of registers available per block: %d\n", regsPerBlock); int warpSize; getCudaAttribute<int>( &warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev ); printf(" Warp size: %d\n", warpSize); int maxThreadsPerMultiProcessor; getCudaAttribute<int>( &maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev ); printf(" Maximum number of threads per multiprocessor: %d\n", maxThreadsPerMultiProcessor); int maxThreadsPerBlock; getCudaAttribute<int>( &maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev ); printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock); int blockDim[3]; getCudaAttribute<int>( &blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev ); getCudaAttribute<int>( &blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev ); getCudaAttribute<int>( &blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev ); printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", blockDim[0], blockDim[1], blockDim[2]); int gridDim[3]; getCudaAttribute<int>( &gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev ); getCudaAttribute<int>( &gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev ); getCudaAttribute<int>( &gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev ); printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", gridDim[0], gridDim[1], gridDim[2]); int textureAlign; getCudaAttribute<int>( &textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev ); printf(" Texture alignment: %u bytes\n", textureAlign); int memPitch; getCudaAttribute<int>( &memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev ); printf(" Maximum memory pitch: %u bytes\n", memPitch); #if CUDA_VERSION >= 2000 int gpuOverlap; getCudaAttribute<int>( &gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev ); #endif #if CUDA_VERSION >= 4000 int asyncEngineCount; getCudaAttribute<int>( &asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev ); printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (gpuOverlap ? "Yes" : "No"), asyncEngineCount); #else printf(" Concurrent copy and execution: %s\n",gpuOverlap ? "Yes" : "No"); #endif #if CUDA_VERSION >= 2020 int kernelExecTimeoutEnabled; getCudaAttribute<int>( &kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev ); printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No"); int integrated; getCudaAttribute<int>( &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev ); printf(" Integrated GPU sharing Host Memory: %s\n", integrated ? "Yes" : "No"); int canMapHostMemory; getCudaAttribute<int>( &canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev ); printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No"); #endif #if CUDA_VERSION >= 3000 int concurrentKernels; getCudaAttribute<int>( &concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev ); printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No"); int surfaceAlignment; getCudaAttribute<int>( &surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev ); printf(" Alignment requirement for Surfaces: %s\n", surfaceAlignment ? "Yes" : "No"); int eccEnabled; getCudaAttribute<int>( &eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev ); printf(" Device has ECC support enabled: %s\n", eccEnabled ? "Yes" : "No"); #endif #if CUDA_VERSION >= 3020 int tccDriver ; getCudaAttribute<int>( &tccDriver , CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev ); printf(" Device is using TCC driver mode: %s\n", tccDriver ? "Yes" : "No"); #endif #if CUDA_VERSION >= 4000 int unifiedAddressing; getCudaAttribute<int>( &unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev ); printf(" Device supports Unified Addressing (UVA): %s\n", unifiedAddressing ? "Yes" : "No"); int pciBusID, pciDeviceID; getCudaAttribute<int>( &pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev ); getCudaAttribute<int>( &pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev ); printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", pciBusID, pciDeviceID ); const char *sComputeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", "Prohibited (no host thread can use ::cudaSetDevice() with this device)", "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", "Unknown", NULL }; int computeMode; getCudaAttribute<int>( &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev ); printf(" Compute Mode:\n"); printf(" < %s >\n", sComputeMode[computeMode]); #endif } shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
int VideoEncoder::DisplayGPUCaps(int deviceOrdinal, NVEncoderParams *pParams, bool bDisplay) { NVVE_GPUAttributes GPUAttributes = {0}; HRESULT hr = S_OK; int gpuPerformance; assert(pParams != NULL); GPUAttributes.iGpuOrdinal = deviceOrdinal; hr = GetParamValue(NVVE_GET_GPU_ATTRIBUTES, &GPUAttributes); if (hr!=S_OK) { printf(" >> NVVE_GET_GPU_ATTRIBUTES error! <<\n\n"); } gpuPerformance = GPUAttributes.iClockRate * GPUAttributes.iMultiProcessorCount; gpuPerformance = gpuPerformance * _ConvertSMVer2Cores(GPUAttributes.iMajor, GPUAttributes.iMinor); size_t totalGlobalMem; CUresult error_id = cuDeviceTotalMem(&totalGlobalMem, deviceOrdinal); if (error_id != CUDA_SUCCESS) { printf("cuDeviceTotalMem returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id)); return -1; } if (bDisplay) { printf(" GPU Device %d (SM %d.%d) : %s\n", GPUAttributes.iGpuOrdinal, GPUAttributes.iMajor, GPUAttributes.iMinor, GPUAttributes.cName); printf(" Total Memory = %4.0f MBytes\n" , ceil((float)totalGlobalMem/1048576.0f)); printf(" GPU Clock = %4.2f MHz\n" , (float)GPUAttributes.iClockRate/1000.f); printf(" MultiProcessors/Cores = %d MPs (%d Cores)\n", GPUAttributes.iMultiProcessorCount, GPUAttributes.iMultiProcessorCount*_ConvertSMVer2Cores(GPUAttributes.iMajor, GPUAttributes.iMinor)); printf(" Maximum Offload Mode = "); switch (GPUAttributes.MaxGpuOffloadLevel) { case NVVE_GPU_OFFLOAD_DEFAULT: printf("CPU: PEL Processing Only\n"); break; case NVVE_GPU_OFFLOAD_ESTIMATORS: printf("GPU: Motion Estimation & Intra Prediction\n"); break; case NVVE_GPU_OFFLOAD_ALL: printf("GPU: Full Offload\n"); break; } printf("\n"); } pParams->MaxOffloadLevel = GPUAttributes.MaxGpuOffloadLevel; return gpuPerformance; }