// This function returns the best GPU (with maximum GFLOPS)
int gpuGetMaxGflopsDeviceId()
{
    int current_device = 0, sm_per_multiproc = 0;
    int max_compute_perf = 0, max_perf_device = 0;
    int device_count = 0, best_SM_arch = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceCount(&device_count);

    // Find the best major SM Architecture GPU device
    while (current_device < device_count)
    {
        cudaGetDeviceProperties(&deviceProp, current_device);

        if (deviceProp.major > 0 && deviceProp.major < 9999)
        {
            best_SM_arch = MAX(best_SM_arch, deviceProp.major);
        }

        current_device++;
    }

    // Find the best CUDA capable GPU device
    current_device = 0;

    while (current_device < device_count)
    {
        cudaGetDeviceProperties(&deviceProp, current_device);

        if (deviceProp.major == 9999 && deviceProp.minor == 9999)
        {
            sm_per_multiproc = 1;
        }
        else
        {
            sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
        }

        int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;

        if (compute_perf > max_compute_perf)
        {
            // If we find a GPU with SM major > 2, search only these
            if (best_SM_arch > 2)
            {
                // If our device == best_SM_arch, choose this, or else pass
                if (deviceProp.major == best_SM_arch)
                {
                    max_compute_perf = compute_perf;
                    max_perf_device = current_device;
                }
            }
            else
            {
                max_compute_perf = compute_perf;
                max_perf_device = current_device;
            }
        }

        ++current_device;
    }

    return max_perf_device;
}
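// The snippets in this collection lean on _ConvertSMVer2Cores() from NVIDIA's
// helper_cuda.h to map a compute capability to CUDA cores per SM. A minimal
// sketch of that helper follows, assuming <cstdio> is included; the table only
// lists a few architectures for illustration and is not exhaustive.
inline int _ConvertSMVer2Cores(int major, int minor)
{
    // Pairs of (SM version encoded as "major << 4 + minor", cores per SM)
    typedef struct
    {
        int SM;
        int Cores;
    } sSMtoCores;

    sSMtoCores nGpuArchCoresPerSM[] =
    {
        { 0x20,  32 },  // Fermi   (SM 2.0) GF100 class
        { 0x21,  48 },  // Fermi   (SM 2.1) GF10x class
        { 0x30, 192 },  // Kepler  (SM 3.0) GK10x class
        { 0x35, 192 },  // Kepler  (SM 3.5) GK11x class
        { 0x50, 128 },  // Maxwell (SM 5.0) GM10x class
        {   -1,  -1 }
    };

    int index = 0;

    while (nGpuArchCoresPerSM[index].SM != -1)
    {
        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
        {
            return nGpuArchCoresPerSM[index].Cores;
        }

        index++;
    }

    // Unknown SM version: fall back to the last known entry
    printf("MapSMtoCores for SM %d.%d is undefined, defaulting to %d cores/SM\n",
           major, minor, nGpuArchCoresPerSM[index - 1].Cores);
    return nGpuArchCoresPerSM[index - 1].Cores;
}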
/// Utility function to tweak problem size for small GPUs
int adjustProblemSize(int GPU_N, int default_nOptions)
{
    int nOptions = default_nOptions;

    // select problem size
    for (int i = 0; i < GPU_N; i++)
    {
        cudaDeviceProp deviceProp;
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
        int cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)
                        * deviceProp.multiProcessorCount;

        if (cudaCores <= 32)
        {
            nOptions = (nOptions < cudaCores/2 ? nOptions : cudaCores/2);
        }
    }

    return nOptions;
}
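// A hedged usage sketch for adjustProblemSize(): query the GPU count first and
// cap the default option count before dividing work across devices. GPU_N and
// OPT_N are illustrative names, not part of the function above.
int GPU_N = 0;
checkCudaErrors(cudaGetDeviceCount(&GPU_N));
int OPT_N = adjustProblemSize(GPU_N, 8 * 1024 * 1024);   // default problem size is an example value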
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        exit(EXIT_FAILURE);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

    int dev, driverVersion = 0, runtimeVersion = 0;

    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
               driverVersion/1000, (driverVersion%100)/10,
               runtimeVersion/1000, (runtimeVersion%100)/10);
        printf(" CUDA Capability Major/Minor version number: %d.%d\n",
               deviceProp.major, deviceProp.minor);

        char msg[256];
        sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f,
                (unsigned long long) deviceProp.totalGlobalMem);
        printf("%s", msg);

        printf(" (%2d) Multiprocessors x (%3d) CUDA Cores/MP: %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf(" GPU Clock rate: %.0f MHz (%0.2f GHz)\n",
               deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);

#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf(" Memory Clock rate: %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
        printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);

        if (deviceProp.l2CacheSize)
        {
            printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
        }
#else
        // This is only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf(" Memory Clock rate: %.0f MHz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf(" Memory Bus Width: %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        if (L2CacheSize)
        {
            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
        }
#endif

        printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
               deviceProp.maxTexture1D,
               deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n",
               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1],
               deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1],
               deviceProp.maxTexture2DLayered[2]);

        printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
        printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
        printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf(" Warp size: %d\n", deviceProp.warpSize);
        printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n",
               deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n",
               deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
        printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
        printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n",
               (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
        printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
        printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");

#ifdef WIN32
        printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
               deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif

        printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf(" Device PCI Bus ID / PCI location ID: %d / %d\n",
               deviceProp.pciBusID, deviceProp.pciDeviceID);

        const char *sComputeMode[] =
        {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        printf(" Compute Mode:\n");
        printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#endif
    sProfileString += cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
    sProfileString += cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;

    // Print out all device names
    for (dev = 0; dev < deviceCount; ++dev)
    {
#ifdef _WIN32
        sprintf_s(cTemp, 13, ", Device%d = ", dev);
#else
        sprintf(cTemp, ", Device%d = ", dev);
#endif
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += cTemp;
        sProfileString += deviceProp.name;
    }

    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    // finish
    exit(EXIT_SUCCESS);
}
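// The pre-CUDA-5.0 branch above calls getCudaAttribute<T>(), a thin wrapper
// around the Driver API's cuDeviceGetAttribute(). A minimal sketch, assuming
// cuda.h is available and the wrapper is only ever instantiated with T = int
// (as in the calls above):
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);

    if (error != CUDA_SUCCESS)
    {
        printf("cuDeviceGetAttribute returned %d for attribute %d on device %d\n",
               (int)error, (int)device_attribute, device);
        exit(EXIT_FAILURE);
    }
}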
// This function returns the best GPU (with maximum GFLOPS)
inline int gpuGetMaxGflopsDeviceId()
{
    int current_device = 0, sm_per_multiproc = 0;
    int max_perf_device = 0;
    int device_count = 0, best_SM_arch = 0;
    int devices_prohibited = 0;

    unsigned long long max_compute_perf = 0;
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceCount(&device_count));

    if (device_count == 0)
    {
        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    // Find the best major SM Architecture GPU device
    while (current_device < device_count)
    {
        cudaGetDeviceProperties(&deviceProp, current_device);

        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
        if (deviceProp.computeMode != cudaComputeModeProhibited)
        {
            if (deviceProp.major > 0 && deviceProp.major < 9999)
            {
                best_SM_arch = MAX(best_SM_arch, deviceProp.major);
            }
        }
        else
        {
            devices_prohibited++;
        }

        current_device++;
    }

    if (devices_prohibited == device_count)
    {
        fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n");
        exit(EXIT_FAILURE);
    }

    // Find the best CUDA capable GPU device
    current_device = 0;

    while (current_device < device_count)
    {
        cudaGetDeviceProperties(&deviceProp, current_device);

        // If this GPU is not running on Compute Mode prohibited, then we can add it to the list
        if (deviceProp.computeMode != cudaComputeModeProhibited)
        {
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
            {
                sm_per_multiproc = 1;
            }
            else
            {
                sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
            }

            unsigned long long compute_perf = (unsigned long long) deviceProp.multiProcessorCount
                                              * sm_per_multiproc * deviceProp.clockRate;

            if (compute_perf > max_compute_perf)
            {
                // If we find a GPU with SM major > 2, search only these
                if (best_SM_arch > 2)
                {
                    // If our device == best_SM_arch, choose this, or else pass
                    if (deviceProp.major == best_SM_arch)
                    {
                        max_compute_perf = compute_perf;
                        max_perf_device = current_device;
                    }
                }
                else
                {
                    max_compute_perf = compute_perf;
                    max_perf_device = current_device;
                }
            }
        }

        ++current_device;
    }

    return max_perf_device;
}
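// gpuGetMaxGflopsDeviceId() and the other snippets rely on checkCudaErrors()
// from helper_cuda.h. A simplified sketch of the idea (abort on the first
// failing runtime call), not the full templated helper shipped with the CUDA
// samples:
#define checkCudaErrors(call)                                                 \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %d (%s) at %s:%d\n",                  \
                    (int)err_, cudaGetErrorString(err_), __FILE__, __LINE__); \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Typical use: make the highest-GFLOPS device current before launching work.
// checkCudaErrors(cudaSetDevice(gpuGetMaxGflopsDeviceId()));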
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        printf("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

    int dev, driverVersion = 0, runtimeVersion = 0;

    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
               driverVersion/1000, (driverVersion%100)/10,
               runtimeVersion/1000, (runtimeVersion%100)/10);
        printf(" CUDA Capability Major/Minor version number: %d.%d\n",
               deviceProp.major, deviceProp.minor);

        char msg[256];
        SPRINTF(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f,
                (unsigned long long) deviceProp.totalGlobalMem);
        printf("%s", msg);

        printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf(" GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n",
               deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);

#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf(" Memory Clock rate: %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
        printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);

        if (deviceProp.l2CacheSize)
        {
            printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
        }
#else
        // This is only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf(" Memory Clock rate: %.0f MHz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf(" Memory Bus Width: %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        if (L2CacheSize)
        {
            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
        }
#endif

        printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
               deviceProp.maxTexture1D,
               deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
        printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n",
               deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1],
               deviceProp.maxTexture2DLayered[2]);

        printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
        printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
        printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf(" Warp size: %d\n", deviceProp.warpSize);
        printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
        printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
        printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n",
               (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
        printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
        printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
               deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif

        printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
               deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

        const char *sComputeMode[] =
        {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        printf(" Compute Mode:\n");
        printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
    }

    // If there are 2 or more GPUs, query to determine whether RDMA is supported
    if (deviceCount >= 2)
    {
        cudaDeviceProp prop[64];
        int gpuid[64]; // we want to find the first two GPUs that can support P2P
        int gpu_p2p_count = 0;

        for (int i = 0; i < deviceCount; i++)
        {
            checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));

            // Only boards based on Fermi or later can support P2P
            if ((prop[i].major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
                // on Windows (64-bit), the Tesla Compute Cluster driver for Windows must be enabled to support this
                && prop[i].tccDriver
#endif
               )
            {
                // This is an array of P2P capable GPUs
                gpuid[gpu_p2p_count++] = i;
            }
        }

        // Show all the combinations of supported P2P GPUs
        int can_access_peer;

        if (gpu_p2p_count >= 2)
        {
            for (int i = 0; i < gpu_p2p_count; i++)
            {
                for (int j = 0; j < gpu_p2p_count; j++)
                {
                    if (gpuid[i] == gpuid[j])
                    {
                        continue;
                    }

                    checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
                           prop[gpuid[i]].name, gpuid[i],
                           prop[gpuid[j]].name, gpuid[j],
                           can_access_peer ? "Yes" : "No");
                }
            }
        }
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#endif
    sProfileString += cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
    sProfileString += cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;

    // Print out all device names
    for (dev = 0; dev < deviceCount; ++dev)
    {
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        sprintf_s(cTemp, 13, ", Device%d = ", dev);
#else
        sprintf(cTemp, ", Device%d = ", dev);
#endif
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += cTemp;
        sProfileString += deviceProp.name;
    }

    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    printf("Result = PASS\n");

    // finish
    exit(EXIT_SUCCESS);
}
int cuda_api::GetMaxGflopsGraphicsDeviceId()
{
    CUdevice current_device = 0, max_perf_device = 0;
    int device_count = 0, sm_per_multiproc = 0;
    int max_compute_perf = 0, best_SM_arch = 0;
    int major = 0, minor = 0, multiProcessorCount, clockRate;
    int bTCC = 0, version;
    char deviceName[256];

    cuDeviceGetCount(&device_count);

    if (device_count <= 0)
        return -1;

    cuDriverGetVersion(&version);

    // Find the best major SM Architecture GPU device that is a graphics device
    while (current_device < device_count)
    {
        cuDeviceGetName(deviceName, 256, current_device);
        cuDeviceComputeCapability(&major, &minor, current_device);

        if (version >= 3020)
        {
            cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device);
        }
        else
        {
            // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
            if (deviceName[0] == 'T')
                bTCC = 1;
        }

        if (!bTCC)
        {
            if (major > 0 && major < 9999)
            {
                best_SM_arch = std::max(best_SM_arch, major);
            }
        }

        current_device++;
    }

    // Find the best CUDA capable GPU device
    current_device = 0;

    while (current_device < device_count)
    {
        // Fetch the name first so the CUDA < 3.2 TCC fallback below checks this device, not a stale one
        cuDeviceGetName(deviceName, 256, current_device);
        cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device);
        cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device);
        cuDeviceComputeCapability(&major, &minor, current_device);

        if (version >= 3020)
        {
            cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device);
        }
        else
        {
            // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
            if (deviceName[0] == 'T')
                bTCC = 1;
        }

        if (major == 9999 && minor == 9999)
        {
            sm_per_multiproc = 1;
        }
        else
        {
            sm_per_multiproc = _ConvertSMVer2Cores(major, minor);
        }

        // If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contender
        if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this
        {
            int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
            printf("%s @%d compute_perf=%d max_compute_perf=%d\n", __FUNCTION__, __LINE__, compute_perf, max_compute_perf);

            if (compute_perf > max_compute_perf)
            {
                // If we find a GPU with SM major > 2, search only these
                if (best_SM_arch > 2)
                {
                    printf("%s @%d best_SM_arch=%d\n", __FUNCTION__, __LINE__, best_SM_arch);

                    // If our device == best_SM_arch, then we pick this one
                    if (major == best_SM_arch)
                    {
                        max_compute_perf = compute_perf;
                        max_perf_device = current_device;
                    }
                }
                else
                {
                    max_compute_perf = compute_perf;
                    max_perf_device = current_device;
                }
            }

            printf("CUDA Device: %s, Compute: %d.%d, CUDA Cores: %d, Clock: %d MHz\n",
                   deviceName, major, minor, multiProcessorCount * sm_per_multiproc, clockRate / 1000);
        }

        ++current_device;
    }

    return max_perf_device;
}
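// A hedged usage sketch for the Driver API variant above: cuInit(0) must
// succeed before any other Driver API call. The cuda_api instance, its
// construction, and the error handling shown here are illustrative only.
cuda_api api;

if (cuInit(0) != CUDA_SUCCESS)
{
    fprintf(stderr, "cuInit failed; no CUDA driver available\n");
}
else
{
    int dev = api.GetMaxGflopsGraphicsDeviceId();

    if (dev < 0)
    {
        fprintf(stderr, "No graphics-capable (non-TCC) CUDA device found\n");
    }
}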
int VideoEncoder::DisplayGPUCaps(int deviceOrdinal, NVEncoderParams *pParams, bool bDisplay)
{
    NVVE_GPUAttributes GPUAttributes = {0};
    HRESULT hr = S_OK;
    int gpuPerformance;

    assert(pParams != NULL);

    GPUAttributes.iGpuOrdinal = deviceOrdinal;
    hr = GetParamValue(NVVE_GET_GPU_ATTRIBUTES, &GPUAttributes);

    if (hr != S_OK)
    {
        printf(" >> NVVE_GET_GPU_ATTRIBUTES error! <<\n\n");
    }

    gpuPerformance = GPUAttributes.iClockRate * GPUAttributes.iMultiProcessorCount;
    gpuPerformance = gpuPerformance * _ConvertSMVer2Cores(GPUAttributes.iMajor, GPUAttributes.iMinor);

    size_t totalGlobalMem;
    CUresult error_id = cuDeviceTotalMem(&totalGlobalMem, deviceOrdinal);

    if (error_id != CUDA_SUCCESS)
    {
        printf("cuDeviceTotalMem returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
        return -1;
    }

    if (bDisplay)
    {
        printf(" GPU Device %d (SM %d.%d) : %s\n", GPUAttributes.iGpuOrdinal,
               GPUAttributes.iMajor, GPUAttributes.iMinor, GPUAttributes.cName);
        printf(" Total Memory = %4.0f MBytes\n", ceil((float)totalGlobalMem/1048576.0f));
        printf(" GPU Clock = %4.2f MHz\n", (float)GPUAttributes.iClockRate/1000.f);
        printf(" MultiProcessors/Cores = %d MPs (%d Cores)\n",
               GPUAttributes.iMultiProcessorCount,
               GPUAttributes.iMultiProcessorCount * _ConvertSMVer2Cores(GPUAttributes.iMajor, GPUAttributes.iMinor));
        printf(" Maximum Offload Mode = ");

        switch (GPUAttributes.MaxGpuOffloadLevel)
        {
            case NVVE_GPU_OFFLOAD_DEFAULT:
                printf("CPU: PEL Processing Only\n");
                break;

            case NVVE_GPU_OFFLOAD_ESTIMATORS:
                printf("GPU: Motion Estimation & Intra Prediction\n");
                break;

            case NVVE_GPU_OFFLOAD_ALL:
                printf("GPU: Full Offload\n");
                break;
        }

        printf("\n");
    }

    pParams->MaxOffloadLevel = GPUAttributes.MaxGpuOffloadLevel;

    return gpuPerformance;
}
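// A hedged usage sketch: DisplayGPUCaps() doubles as a crude performance
// score (clock * multiprocessors * cores/SM), so an encoder GPU can be chosen
// by taking the maximum over all ordinals. deviceCount, pEncoder, and pParams
// are assumed to exist in the surrounding setup code.
int bestGPU = 0, bestPerformance = 0;

for (int d = 0; d < deviceCount; d++)
{
    int perf = pEncoder->DisplayGPUCaps(d, pParams, false);

    if (perf > bestPerformance)
    {
        bestPerformance = perf;
        bestGPU = d;
    }
}

printf("Selected GPU %d for encoding (score %d)\n", bestGPU, bestPerformance);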
void DialogSelectHardware::ChangeText(int indexDevice)
{
    int driverVersion = 0, runtimeVersion = 0;

    cudaSetDevice(indexDevice);
    cudaGetDeviceProperties(deviceProp, indexDevice);
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);

    char msg[256];
    SPRINTF(msg, "%.0f MBytes (%llu bytes)\n",
            (float)deviceProp->totalGlobalMem/1048576.0f,
            (unsigned long long) deviceProp->totalGlobalMem);

    ui->tableWidget->clear();

    addItem(QString("Device " + QString::number(indexDevice).append(" : ") + deviceProp->name), 0, 0);
    addItem((selectDevice == indexDevice) ? "Selected Device " : " ", 0, 1);
    addItem("CUDA Driver Version / Runtime Version", 1, 0);
    addItem(QString("%1.%2 / %3.%4").arg(driverVersion/1000).arg((driverVersion%100)/10)
                                    .arg(runtimeVersion/1000).arg((runtimeVersion%100)/10), 1, 1);
    addItem("CUDA Capability Major/Minor version number: ", 2, 0);
    addItem(QString("%1.%2").arg(deviceProp->major).arg(deviceProp->minor), 2, 1);
    addItem("Total amount of global memory:", 3, 0);
    addItem(msg, 3, 1);
    addItem(QString("(%1) Multiprocessors, (%2) CUDA Cores/MP:%3 CUDA Cores")
                .arg(deviceProp->multiProcessorCount)
                .arg(_ConvertSMVer2Cores(deviceProp->major, deviceProp->minor))
                .arg(_ConvertSMVer2Cores(deviceProp->major, deviceProp->minor) * deviceProp->multiProcessorCount), 4, 0);
    addItem("Total amount of constant memory:", 5, 0);
    addItem(QString("%1 bytes").arg(deviceProp->totalConstMem), 5, 1);
    addItem("Total amount of shared memory per block:", 6, 0);
    addItem(QString("%1 bytes").arg(deviceProp->sharedMemPerBlock), 6, 1);
    addItem("Total number of registers available per block:", 7, 0);
    addItem(QString("%1").arg(deviceProp->regsPerBlock), 7, 1);
    addItem("Warp size:", 8, 0);
    addItem(QString("%1").arg(deviceProp->warpSize), 8, 1);
    addItem("Maximum number of threads per multiprocessor:", 9, 0);
    addItem(QString("%1").arg(deviceProp->maxThreadsPerMultiProcessor), 9, 1);
    addItem("Maximum number of threads per block:", 10, 0);
    addItem(QString("%1").arg(deviceProp->maxThreadsPerBlock), 10, 1);
    addItem("Max dimension size of a thread block (x,y,z):", 11, 0);
    addItem(QString("(%1, %2, %3)").arg(deviceProp->maxThreadsDim[0])
                                   .arg(deviceProp->maxThreadsDim[1])
                                   .arg(deviceProp->maxThreadsDim[2]), 11, 1);
    addItem("Max dimension size of a grid size (x,y,z):", 12, 0);
    addItem(QString("(%1, %2, %3)\n").arg(deviceProp->maxGridSize[0])
                                     .arg(deviceProp->maxGridSize[1])
                                     .arg(deviceProp->maxGridSize[2]), 12, 1);
    addItem("Run time limit on kernels: ", 13, 0);
    addItem(QString("%1\n").arg(deviceProp->kernelExecTimeoutEnabled ? "Yes" : "No"), 13, 1);
    addItem("Integrated GPU sharing Host Memory: ", 14, 0);
    addItem(QString("%1\n").arg(deviceProp->integrated ? "Yes" : "No"), 14, 1);

    ui->tableWidget->resizeColumnsToContents();
    ui->tableWidget->resizeRowsToContents();
}
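// addItem() above is a small private helper of DialogSelectHardware; a
// plausible sketch, assuming a QTableWidget named tableWidget in the ui and
// row/column counts configured elsewhere (e.g., in the dialog constructor):
void DialogSelectHardware::addItem(QString text, int row, int column)
{
    QTableWidgetItem *item = new QTableWidgetItem(text);
    item->setFlags(item->flags() & ~Qt::ItemIsEditable);   // read-only cell
    ui->tableWidget->setItem(row, column, item);
}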