void printCudaDeviceInfo(int deviceId) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, deviceId); printf("CUDA Device Information:\n"); printf("Device %d: \"%s\"\n", deviceId, deviceProp.name); printf(" Integrated: %d\n", deviceProp.integrated); printf(" Can map host mem: %d\n", deviceProp.canMapHostMemory); printf(" Number of cores: %d\n", ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); printf(" Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f); printf(" Performance Number: %d\n", ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount * (deviceProp.clockRate / 1000)); printf(" Note: Performance number is clock in mhz * core count, for comparing devices.\n"); }
std::vector<int> host::QueryDevices() { int device_count = 0; CUDA_SAFE_CALL(cudaGetDeviceCount(&device_count)); if (device_count < 1) { fprintf(stderr, "No suitable CUDA devices found!\n"); exit(EXIT_FAILURE); } std::vector<int> device_ids; for (int i = 0; i < device_count; i++) { cudaDeviceProp device_prop; CUDA_SAFE_CALL(cudaGetDeviceProperties(&device_prop, i)); int compute_cap_major = device_prop.major; int compute_cap_minor = device_prop.minor; int core_count = ConvertSMVer2Cores(compute_cap_major, compute_cap_minor) * device_prop.multiProcessorCount; float clock_speed = device_prop.clockRate * 1e-6f; float mem_size = device_prop.totalGlobalMem / 1024.0f / 1024.0f; if (compute_cap_major >= 2) { device_ids.push_back(i); printf("\t[%d] %s (%d.%d, %d cores, %.2f GHz, %.2f MB)\n", i, device_prop.name, compute_cap_major, compute_cap_minor, core_count, clock_speed, mem_size); } else { printf("\t[%d] %s (%d.%d not usable)\n", i, device_prop.name, compute_cap_major, compute_cap_minor); } } if (device_ids.size() == 0) { fprintf(stderr, "No suitable CUDA devices found!\n"); exit(EXIT_FAILURE); } return device_ids; }
int getCudaStreamProcessorCount(int deviceId) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, deviceId); return ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { CUdevice dev; int major = 0, minor = 0; int deviceCount = 0; char deviceName[256]; // note your project will need to link with cuda.lib files on windows printf("CUDA Device Query (Driver API) statically linked version \n"); CUresult err = cuInit(0); CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount)); // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) { printf("There is no device supporting CUDA\n"); } for (dev = 0; dev < deviceCount; ++dev) { CU_SAFE_CALL_NO_SYNC( cuDeviceComputeCapability(&major, &minor, dev) ); if (dev == 0) { // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present if (major == 9999 && minor == 9999) printf("There is no device supporting CUDA.\n"); else if (deviceCount == 1) printf("There is 1 device supporting CUDA\n"); else printf("There are %d devices supporting CUDA\n", deviceCount); } CU_SAFE_CALL_NO_SYNC( cuDeviceGetName(deviceName, 256, dev) ); printf("\nDevice %d: \"%s\"\n", dev, deviceName); #if CUDA_VERSION >= 2020 int driverVersion = 0; cuDriverGetVersion(&driverVersion); printf(" CUDA Driver Version: %d.%d\n", driverVersion/1000, driverVersion%100); #endif shrLog(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor); size_t totalGlobalMem; CU_SAFE_CALL_NO_SYNC( cuDeviceTotalMem(&totalGlobalMem, dev) ); printf(" Total amount of global memory: %llu bytes\n", (unsigned long long)totalGlobalMem); #if CUDA_VERSION >= 2000 int multiProcessorCount; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev ) ); shrLog(" Multiprocessors x Cores/MP = Cores: %d (MP) x %d (Cores/MP) = %d (Cores)\n", multiProcessorCount, ConvertSMVer2Cores(major, minor), ConvertSMVer2Cores(major, minor) * multiProcessorCount); #endif int totalConstantMemory; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev ) ); printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory); int sharedMemPerBlock; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev ) ); printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock); int regsPerBlock; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( ®sPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev ) ); printf(" Total number of registers available per block: %d\n", regsPerBlock); int warpSize; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev ) ); printf(" Warp size: %d\n", warpSize); int maxThreadsPerBlock; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev ) ); printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock); int blockDim[3]; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev ) ); CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev ) ); CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev ) ); printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", blockDim[0], blockDim[1], blockDim[2]); int gridDim[3]; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev ) ); CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev ) ); CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev ) ); printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", gridDim[0], gridDim[1], gridDim[2]); int memPitch; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev ) ); printf(" Maximum memory pitch: %u bytes\n", memPitch); int textureAlign; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev ) ); printf(" Texture alignment: %u bytes\n", textureAlign); int clockRate; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev ) ); printf(" Clock rate: %.2f GHz\n", clockRate * 1e-6f); #if CUDA_VERSION >= 2000 int gpuOverlap; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev ) ); printf(" Concurrent copy and execution: %s\n",gpuOverlap ? "Yes" : "No"); #endif #if CUDA_VERSION >= 2020 int kernelExecTimeoutEnabled; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev ) ); printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No"); int integrated; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev ) ); printf(" Integrated: %s\n", integrated ? "Yes" : "No"); int canMapHostMemory; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev ) ); printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No"); #endif #if CUDA_VERSION >= 3000 int concurrentKernels; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev ) ); printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No"); int eccEnabled; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev ) ); printf(" Device has ECC support enabled: %s\n", eccEnabled ? "Yes" : "No"); #endif #if CUDA_VERSION >= 3020 int tccDriver ; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &tccDriver , CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev ) ); printf(" Device is using TCC driver mode: %s\n", tccDriver ? "Yes" : "No"); #endif } printf("\nPASSED\n"); CUT_EXIT(argc, argv); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { pArgc = &argc; pArgv = argv; /* shrQAStart(argc, argv); shrSetLogFileName ("deviceQuery.txt"); */ shrLog("%s Starting...\n\n", argv[0]); shrLog(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n"); int deviceCount = 0; cudaError_t error_id = cudaGetDeviceCount(&deviceCount); if (error_id != cudaSuccess) { shrLog( "cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id) ); return -1; } // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) shrLog("There is no device supporting CUDA\n"); else shrLog("Found %d CUDA Capable device(s)\n", deviceCount); int dev, driverVersion = 0, runtimeVersion = 0; for (dev = 0; dev < deviceCount; ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); shrLog("\nDevice %d: \"%s\"\n", dev, deviceProp.name); #if CUDART_VERSION >= 2020 // Console log cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); shrLog(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10); #endif shrLog(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor); char msg[256]; sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem); shrLog(msg); #if CUDART_VERSION >= 2000 shrLog(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", deviceProp.multiProcessorCount, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); #endif shrLog(" GPU Clock Speed: %.2f GHz\n", deviceProp.clockRate * 1e-6f); #if CUDART_VERSION >= 4000 // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output int memoryClock; getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); shrLog(" Memory Clock rate: %.2f Mhz\n", memoryClock * 1e-3f); int memBusWidth; getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev ); shrLog(" Memory Bus Width: %d-bit\n", memBusWidth); int L2CacheSize; getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev ); if (L2CacheSize) { shrLog(" L2 Cache Size: %d bytes\n", L2CacheSize); } shrLog(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]); shrLog(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]); #endif shrLog(" Total amount of constant memory: %u bytes\n", deviceProp.totalConstMem); shrLog(" Total amount of shared memory per block: %u bytes\n", deviceProp.sharedMemPerBlock); shrLog(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock); shrLog(" Warp size: %d\n", deviceProp.warpSize); shrLog(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); shrLog(" Maximum sizes of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); shrLog(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); shrLog(" Maximum memory pitch: %u bytes\n", deviceProp.memPitch); shrLog(" Texture alignment: %u bytes\n", deviceProp.textureAlignment); #if CUDART_VERSION >= 4000 shrLog(" Concurrent copy and execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); #else shrLog(" Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No"); #endif #if CUDART_VERSION >= 2020 shrLog(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); shrLog(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No"); shrLog(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3000 shrLog(" Concurrent kernel execution: %s\n", deviceProp.concurrentKernels ? "Yes" : "No"); shrLog(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3010 shrLog(" Device has ECC support enabled: %s\n", deviceProp.ECCEnabled ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3020 shrLog(" Device is using TCC driver mode: %s\n", deviceProp.tccDriver ? "Yes" : "No"); #endif #if CUDART_VERSION >= 4000 shrLog(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No"); shrLog(" Device PCI Bus ID / PCI location ID: %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID ); #endif #if CUDART_VERSION >= 2020 const char *sComputeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", "Prohibited (no host thread can use ::cudaSetDevice() with this device)", "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", "Unknown", NULL }; shrLog(" Compute Mode:\n"); shrLog(" < %s >\n", sComputeMode[deviceProp.computeMode]); #endif } // csv masterlog info // ***************************** // exe and CUDA driver name shrLog("\n"); std::string sProfileString = "deviceQuery, CUDA Driver = CUDART"; char cTemp[10]; // driver version sProfileString += ", CUDA Driver Version = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #else sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #endif sProfileString += cTemp; // Runtime version sProfileString += ", CUDA Runtime Version = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #else sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #endif sProfileString += cTemp; // Device count sProfileString += ", NumDevs = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d", deviceCount); #else sprintf(cTemp, "%d", deviceCount); #endif sProfileString += cTemp; // First 2 device names, if any for (dev = 0; dev < ((deviceCount > 2) ? 2 : deviceCount); ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); sProfileString += ", Device = "; sProfileString += deviceProp.name; } sProfileString += "\n"; //shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str()); std::cout << sProfileString.c_str() << std::endl; std::cout << "Press <ENTER>" << std::endl; // getchar(); runtimeTest(); getchar(); // finish return 0; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { CUdevice dev; int major = 0, minor = 0; int deviceCount = 0; char deviceName[256]; shrQAStart(argc, argv); // note your project will need to link with cuda.lib files on windows printf("CUDA Device Query (Driver API) statically linked version \n"); CUresult error_id = cuInit(0); if (error_id != CUDA_SUCCESS) { printf("cuInit(0) returned %d\n-> %s\n", error_id, getCudaDrvErrorString(error_id)); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } error_id = cuDeviceGetCount(&deviceCount); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceGetCount returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) printf("There are no available device(s) that support CUDA\n"); else if (deviceCount == 1) printf("There is 1 device supporting CUDA\n"); else printf("There are %d devices supporting CUDA\n", deviceCount); for (dev = 0; dev < deviceCount; ++dev) { error_id = cuDeviceComputeCapability(&major, &minor, dev); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceComputeCapability returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } error_id = cuDeviceGetName(deviceName, 256, dev); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceGetName returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } printf("\nDevice %d: \"%s\"\n", dev, deviceName); #if CUDA_VERSION >= 2020 int driverVersion = 0; cuDriverGetVersion(&driverVersion); printf(" CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10); #endif shrLog(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor); size_t totalGlobalMem; error_id = cuDeviceTotalMem(&totalGlobalMem, dev); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceTotalMem returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } char msg[256]; sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)totalGlobalMem/1048576.0f, (unsigned long long) totalGlobalMem); shrLog(msg); #if CUDA_VERSION >= 2000 int multiProcessorCount; getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); shrLog(" (%2d) Multiprocessors x (%3d) CUDA Cores/MP: %d CUDA Cores\n", multiProcessorCount, ConvertSMVer2Cores(major, minor), ConvertSMVer2Cores(major, minor) * multiProcessorCount); #endif int clockRate; getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); printf(" GPU Clock rate: %.0f MHz (%0.2f GHz)\n", clockRate * 1e-3f, clockRate * 1e-6f); #if CUDA_VERSION >= 4000 int memoryClock; getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); shrLog(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f); int memBusWidth; getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev ); shrLog(" Memory Bus Width: %d-bit\n", memBusWidth); int L2CacheSize; getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev ); if (L2CacheSize) { shrLog(" L2 Cache Size: %d bytes\n", L2CacheSize); } int maxTex1D, maxTex2D[2], maxTex3D[3]; getCudaAttribute<int>( &maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev ); getCudaAttribute<int>( &maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev ); getCudaAttribute<int>( &maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev ); getCudaAttribute<int>( &maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev ); getCudaAttribute<int>( &maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev ); getCudaAttribute<int>( &maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev ); shrLog(" Max Texture Dimension Sizes 1D=(%d) 2D=(%d,%d) 3D=(%d,%d,%d)\n", maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1], maxTex3D[2]); int maxTex2DLayered[3]; getCudaAttribute<int>( &maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev ); getCudaAttribute<int>( &maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev ); getCudaAttribute<int>( &maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev ); shrLog(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", maxTex2DLayered[0], maxTex2DLayered[2], maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]); #endif int totalConstantMemory; getCudaAttribute<int>( &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev ); printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory); int sharedMemPerBlock; getCudaAttribute<int>( &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev ); printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock); int regsPerBlock; getCudaAttribute<int>( ®sPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev ); printf(" Total number of registers available per block: %d\n", regsPerBlock); int warpSize; getCudaAttribute<int>( &warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev ); printf(" Warp size: %d\n", warpSize); int maxThreadsPerMultiProcessor; getCudaAttribute<int>( &maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev ); printf(" Maximum number of threads per multiprocessor: %d\n", maxThreadsPerMultiProcessor); int maxThreadsPerBlock; getCudaAttribute<int>( &maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev ); printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock); int blockDim[3]; getCudaAttribute<int>( &blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev ); getCudaAttribute<int>( &blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev ); getCudaAttribute<int>( &blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev ); printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", blockDim[0], blockDim[1], blockDim[2]); int gridDim[3]; getCudaAttribute<int>( &gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev ); getCudaAttribute<int>( &gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev ); getCudaAttribute<int>( &gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev ); printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", gridDim[0], gridDim[1], gridDim[2]); int textureAlign; getCudaAttribute<int>( &textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev ); printf(" Texture alignment: %u bytes\n", textureAlign); int memPitch; getCudaAttribute<int>( &memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev ); printf(" Maximum memory pitch: %u bytes\n", memPitch); #if CUDA_VERSION >= 2000 int gpuOverlap; getCudaAttribute<int>( &gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev ); #endif #if CUDA_VERSION >= 4000 int asyncEngineCount; getCudaAttribute<int>( &asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev ); printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (gpuOverlap ? "Yes" : "No"), asyncEngineCount); #else printf(" Concurrent copy and execution: %s\n",gpuOverlap ? "Yes" : "No"); #endif #if CUDA_VERSION >= 2020 int kernelExecTimeoutEnabled; getCudaAttribute<int>( &kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev ); printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No"); int integrated; getCudaAttribute<int>( &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev ); printf(" Integrated GPU sharing Host Memory: %s\n", integrated ? "Yes" : "No"); int canMapHostMemory; getCudaAttribute<int>( &canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev ); printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No"); #endif #if CUDA_VERSION >= 3000 int concurrentKernels; getCudaAttribute<int>( &concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev ); printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No"); int surfaceAlignment; getCudaAttribute<int>( &surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev ); printf(" Alignment requirement for Surfaces: %s\n", surfaceAlignment ? "Yes" : "No"); int eccEnabled; getCudaAttribute<int>( &eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev ); printf(" Device has ECC support enabled: %s\n", eccEnabled ? "Yes" : "No"); #endif #if CUDA_VERSION >= 3020 int tccDriver ; getCudaAttribute<int>( &tccDriver , CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev ); printf(" Device is using TCC driver mode: %s\n", tccDriver ? "Yes" : "No"); #endif #if CUDA_VERSION >= 4000 int unifiedAddressing; getCudaAttribute<int>( &unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev ); printf(" Device supports Unified Addressing (UVA): %s\n", unifiedAddressing ? "Yes" : "No"); int pciBusID, pciDeviceID; getCudaAttribute<int>( &pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev ); getCudaAttribute<int>( &pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev ); printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", pciBusID, pciDeviceID ); const char *sComputeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", "Prohibited (no host thread can use ::cudaSetDevice() with this device)", "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", "Unknown", NULL }; int computeMode; getCudaAttribute<int>( &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev ); printf(" Compute Mode:\n"); printf(" < %s >\n", sComputeMode[computeMode]); #endif } shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
void CudaDeviceDialog::updateInfo(int index) { m_infoText = "<html><body>"; int deviceCount = 0; cudaGetDeviceCount(&deviceCount); cudaDeviceProp p; cudaGetDeviceProperties(&p, index); if (p.major == 9999 && p.minor == 9999) m_infoText += "<p>There is no device supporting CUDA</p>"; else if (deviceCount == 1) m_infoText += "<p>There is 1 device supporting CUDA</p>"; else m_infoText += QString("<p>There are %1 devices supporting CUDA</p>").arg(deviceCount); m_infoText += QString("<p>CUDA Driver/Runtime</p>"); m_infoText += "<table>"; int driverVersion = 0, runtimeVersion = 0; cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); QString error = "<span style='color:red;'>***ERROR*** >= 4.0 required</span>"; addItem(1, "CUDA Driver Version:", QString("%1.%2 %3").arg(driverVersion/1000).arg(driverVersion%100) .arg((driverVersion >= 4000)? "" : error)); addItem(1, "CUDA Runtime Version:", QString("%1.%2 %3").arg(runtimeVersion/1000).arg(runtimeVersion%100) .arg((driverVersion >= 4000)? "" : error)); m_infoText += "</table>"; if (index < deviceCount) { m_infoText += QString("<p>Device %1: "%2"</p>").arg(index).arg(p.name); m_infoText += "<table>"; addItem(1, "CUDA Capability Major/Minor version number:", QString("%1.%2").arg(p.major).arg(p.minor)); addItem(1, "Total amount of global memory:", QString("%1 MB").arg(p.totalGlobalMem / 1024 / 1024)); addItem(1, QString("%1 Multiprocessors x %2 CUDA Cores/MP:").arg(p.multiProcessorCount).arg(ConvertSMVer2Cores(p.major, p.minor)), QString("%1 CUDA Cores").arg(ConvertSMVer2Cores(p.major, p.minor) * p.multiProcessorCount)); addItem(1, "Total amount of constant memory:", QString("%1 bytes").arg(p.totalConstMem)); addItem(1, "Total amount of shared memory per block:", QString("%1 bytes").arg(p.sharedMemPerBlock)); addItem(1, "Total number of registers available per block:", QString("%1").arg(p.regsPerBlock)); addItem(1, "Warp size:", QString("%1").arg(p.warpSize)); addItem(1, "Maximum number of threads per block:", QString("%1").arg(p.maxThreadsPerBlock)); addItem(1, "Maximum sizes of each dimension of a block:", QString("%1 x %2 x %3") .arg(p.maxThreadsDim[0]) .arg(p.maxThreadsDim[1]) .arg(p.maxThreadsDim[2])); addItem(1, "Maximum sizes of each dimension of a grid:", QString("%1 x %2 x %3") .arg(p.maxGridSize[0]) .arg(p.maxGridSize[1]) .arg(p.maxGridSize[2])); addItem(1, "Maximum memory pitch:", QString("%1 bytes").arg(p.memPitch)); addItem(1, "Texture alignment:", QString("%1 bytes").arg(p.textureAlignment)); addItem(1, "Clock rate:", QString("%1 GHz").arg(p.clockRate * 1e-6f)); addItem(1, "Concurrent copy and execution:", p.deviceOverlap ? "yes" : "no"); addItem(1, "# of Asynchronous Copy Engines:", QString("%1").arg(p.asyncEngineCount)); addItem(1, "Run time limit on kernels:", p.kernelExecTimeoutEnabled ? "yes" : "no"); addItem(1, "Integrated:", p.integrated ? "yes" : "no"); addItem(1, "Support host page-locked memory mapping:", p.canMapHostMemory ? "yes" : "no"); addItem(1, "Compute mode:", p.computeMode == cudaComputeModeDefault ? "Default (multiple host threads can use this device simultaneously)" : p.computeMode == cudaComputeModeExclusive ? "Exclusive (only one host thread at a time can use this device)" : p.computeMode == cudaComputeModeProhibited ? "Prohibited (no host thread can use this device)" : "Unknown"); addItem(1, "Concurrent kernel execution:", p.concurrentKernels ? "yes" : "no"); addItem(1, "Device has ECC support enabled:", p.ECCEnabled ? "yes" : "no"); addItem(1, "Device is using TCC driver mode:", p.tccDriver ? "yes" : "no"); m_infoText += "</table>"; } m_infoText += "</body></html>"; m->info->setHtml(m_infoText); m->buttonBox->button(QDialogButtonBox::Ok)->setEnabled((driverVersion >= 4000) && (runtimeVersion >= 4000)); }
////////////////////////////////////////////////////////////////////////////// //! Print info about the device //! //! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE //! @param device OpenCL id of the device ////////////////////////////////////////////////////////////////////////////// void oclPrintDevInfo(int iLogMode, cl_device_id device) { char device_string[1024]; bool nv_device_attibute_query = false; // CL_DEVICE_NAME clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_NAME: \t\t\t%s\n", device_string); // CL_DEVICE_VENDOR clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(device_string), &device_string, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_VENDOR: \t\t\t%s\n", device_string); // CL_DRIVER_VERSION clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(device_string), &device_string, NULL); shrLogEx(iLogMode, 0, " CL_DRIVER_VERSION: \t\t\t%s\n", device_string); // CL_DEVICE_VERSION clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(device_string), &device_string, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_VERSION: \t\t\t%s\n", device_string); #if !defined(__APPLE__) && !defined(__MACOSX) // CL_DEVICE_OPENCL_C_VERSION (if CL_DEVICE_VERSION version > 1.0) if(strncmp("OpenCL 1.0", device_string, 10) != 0) { // This code is unused for devices reporting OpenCL 1.0, but a def is needed anyway to allow compilation using v 1.0 headers // This constant isn't #defined in 1.0 #ifndef CL_DEVICE_OPENCL_C_VERSION #define CL_DEVICE_OPENCL_C_VERSION 0x103D #endif clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(device_string), &device_string, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_OPENCL_C_VERSION: \t\t%s\n", device_string); } #endif // CL_DEVICE_TYPE cl_device_type type; clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL); if( type & CL_DEVICE_TYPE_CPU ) shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU"); if( type & CL_DEVICE_TYPE_GPU ) shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU"); if( type & CL_DEVICE_TYPE_ACCELERATOR ) shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR"); if( type & CL_DEVICE_TYPE_DEFAULT ) shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT"); // CL_DEVICE_MAX_COMPUTE_UNITS cl_uint compute_units; clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", compute_units); // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS size_t workitem_dims; clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims); // CL_DEVICE_MAX_WORK_ITEM_SIZES size_t workitem_size[3]; clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]); // CL_DEVICE_MAX_WORK_GROUP_SIZE size_t workgroup_size; clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size); // CL_DEVICE_MAX_CLOCK_FREQUENCY cl_uint clock_frequency; clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency); // CL_DEVICE_ADDRESS_BITS cl_uint addr_bits; clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_ADDRESS_BITS:\t\t%u\n", addr_bits); // CL_DEVICE_MAX_MEM_ALLOC_SIZE cl_ulong max_mem_alloc_size; clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_alloc_size), &max_mem_alloc_size, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(max_mem_alloc_size / (1024 * 1024))); // CL_DEVICE_GLOBAL_MEM_SIZE cl_ulong mem_size; clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(mem_size / (1024 * 1024))); // CL_DEVICE_ERROR_CORRECTION_SUPPORT cl_bool error_correction_support; clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(error_correction_support), &error_correction_support, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", error_correction_support == CL_TRUE ? "yes" : "no"); // CL_DEVICE_LOCAL_MEM_TYPE cl_device_local_mem_type local_mem_type; clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(local_mem_type), &local_mem_type, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", local_mem_type == 1 ? "local" : "global"); // CL_DEVICE_LOCAL_MEM_SIZE clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(mem_size / 1024)); // CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(mem_size), &mem_size, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(mem_size / 1024)); // CL_DEVICE_QUEUE_PROPERTIES cl_command_queue_properties queue_properties; clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(queue_properties), &queue_properties, NULL); if( queue_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE ) shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE"); if( queue_properties & CL_QUEUE_PROFILING_ENABLE ) shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE"); // CL_DEVICE_IMAGE_SUPPORT cl_bool image_support; clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", image_support); // CL_DEVICE_MAX_READ_IMAGE_ARGS cl_uint max_read_image_args; clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(max_read_image_args), &max_read_image_args, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", max_read_image_args); // CL_DEVICE_MAX_WRITE_IMAGE_ARGS cl_uint max_write_image_args; clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(max_write_image_args), &max_write_image_args, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", max_write_image_args); // CL_DEVICE_SINGLE_FP_CONFIG cl_device_fp_config fp_config; clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_SINGLE_FP_CONFIG:\t\t%s%s%s%s%s%s\n", fp_config & CL_FP_DENORM ? "denorms " : "", fp_config & CL_FP_INF_NAN ? "INF-quietNaNs " : "", fp_config & CL_FP_ROUND_TO_NEAREST ? "round-to-nearest " : "", fp_config & CL_FP_ROUND_TO_ZERO ? "round-to-zero " : "", fp_config & CL_FP_ROUND_TO_INF ? "round-to-inf " : "", fp_config & CL_FP_FMA ? "fma " : ""); // CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH size_t szMaxDims[5]; shrLogEx(iLogMode, 0, "\n CL_DEVICE_IMAGE <dim>"); clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &szMaxDims[0], NULL); shrLogEx(iLogMode, 0, "\t\t\t2D_MAX_WIDTH\t %u\n", szMaxDims[0]); clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[1], NULL); shrLogEx(iLogMode, 0, "\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", szMaxDims[1]); clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &szMaxDims[2], NULL); shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_WIDTH\t %u\n", szMaxDims[2]); clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[3], NULL); shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", szMaxDims[3]); clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &szMaxDims[4], NULL); shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_DEPTH\t %u\n", szMaxDims[4]); // CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(device_string), &device_string, NULL); if (device_string != 0) { shrLogEx(iLogMode, 0, "\n CL_DEVICE_EXTENSIONS:"); std::string stdDevString; stdDevString = std::string(device_string); size_t szOldPos = 0; size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited while (szSpacePos != stdDevString.npos) { if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 ) nv_device_attibute_query = true; if (szOldPos > 0) { shrLogEx(iLogMode, 0, "\t\t"); } shrLogEx(iLogMode, 0, "\t\t\t%s\n", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()); do { szOldPos = szSpacePos + 1; szSpacePos = stdDevString.find(' ', szOldPos); } while (szSpacePos == szOldPos); } shrLogEx(iLogMode, 0, "\n"); } else { shrLogEx(iLogMode, 0, " CL_DEVICE_EXTENSIONS: None\n"); } if(nv_device_attibute_query) { cl_uint compute_capability_major, compute_capability_minor; clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &compute_capability_major, NULL); clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &compute_capability_minor, NULL); shrLogEx(iLogMode, 0, "\n CL_DEVICE_COMPUTE_CAPABILITY_NV:\t%u.%u\n", compute_capability_major, compute_capability_minor); shrLogEx(iLogMode, 0, " NUMBER OF MULTIPROCESSORS:\t\t%u\n", compute_units); // this is the same value reported by CL_DEVICE_MAX_COMPUTE_UNITS shrLogEx(iLogMode, 0, " NUMBER OF CUDA CORES:\t\t\t%u\n", ConvertSMVer2Cores(compute_capability_major, compute_capability_minor) * compute_units); cl_uint regs_per_block; clGetDeviceInfo(device, CL_DEVICE_REGISTERS_PER_BLOCK_NV, sizeof(cl_uint), ®s_per_block, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_REGISTERS_PER_BLOCK_NV:\t%u\n", regs_per_block); cl_uint warp_size; clGetDeviceInfo(device, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), &warp_size, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_WARP_SIZE_NV:\t\t%u\n", warp_size); cl_bool gpu_overlap; clGetDeviceInfo(device, CL_DEVICE_GPU_OVERLAP_NV, sizeof(cl_bool), &gpu_overlap, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_GPU_OVERLAP_NV:\t\t%s\n", gpu_overlap == CL_TRUE ? "CL_TRUE" : "CL_FALSE"); cl_bool exec_timeout; clGetDeviceInfo(device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof(cl_bool), &exec_timeout, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:\t%s\n", exec_timeout == CL_TRUE ? "CL_TRUE" : "CL_FALSE"); cl_bool integrated_memory; clGetDeviceInfo(device, CL_DEVICE_INTEGRATED_MEMORY_NV, sizeof(cl_bool), &integrated_memory, NULL); shrLogEx(iLogMode, 0, " CL_DEVICE_INTEGRATED_MEMORY_NV:\t%s\n", integrated_memory == CL_TRUE ? "CL_TRUE" : "CL_FALSE"); } // CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type> shrLogEx(iLogMode, 0, " CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t"); cl_uint vec_width [6]; clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &vec_width[0], NULL); clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &vec_width[1], NULL); clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &vec_width[2], NULL); clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &vec_width[3], NULL); clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &vec_width[4], NULL); clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &vec_width[5], NULL); shrLogEx(iLogMode, 0, "CHAR %u, SHORT %u, INT %u, LONG %u, FLOAT %u, DOUBLE %u\n\n\n", vec_width[0], vec_width[1], vec_width[2], vec_width[3], vec_width[4], vec_width[5]); }