void pcl::gpu::printShortCudaDeviceInfo(int device) { int count = getCudaEnabledDeviceCount(); bool valid = (device >= 0) && (device < count); int beg = valid ? device : 0; int end = valid ? device+1 : count; int driverVersion = 0, runtimeVersion = 0; cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); for(int dev = beg; dev < end; ++dev) { cudaDeviceProp prop; cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); const char *arch_str = prop.major < 2 ? " (pre-Fermi)" : ""; printf("[pcl::gpu::printShortCudaDeviceInfo] : Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); printf(", sm_%d%d%s, %d cores", prop.major, prop.minor, arch_str, convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount); printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); } fflush(stdout); }
// Snapshot build/runtime environment properties at construction time:
// process init time, main thread id, Caffe version, and (unless CPU_ONLY)
// per-GPU compute capabilities plus cuDNN/cuBLAS/CUDA runtime/driver versions.
Caffe::Properties::Properties() :
    init_time_(std::time(nullptr)),
    main_thread_id_(std::this_thread::get_id()),
    caffe_version_(AS_STRING(CAFFE_VERSION)) {
#ifndef CPU_ONLY
  int count = 0;
  CUDA_CHECK(cudaGetDeviceCount(&count));
  compute_capabilities_.resize(count);
  cudaDeviceProp device_prop;
  for (int gpu = 0; gpu < compute_capabilities_.size(); ++gpu) {
    CUDA_CHECK(cudaGetDeviceProperties(&device_prop, gpu));
    // Encode capability as major*100 + minor (e.g. 6.1 -> 601).
    compute_capabilities_[gpu] = device_prop.major * 100 + device_prop.minor;
    DLOG(INFO) << "GPU " << gpu << " '" << device_prop.name
               << "' has compute capability " << device_prop.major << "."
               << device_prop.minor;
  }
#ifdef USE_CUDNN
  // cuDNN version is taken from compile-time macros, not a runtime query.
  cudnn_version_ = AS_STRING(CUDNN_MAJOR) "." AS_STRING(CUDNN_MINOR) "."
      AS_STRING(CUDNN_PATCHLEVEL);
#else
  cudnn_version_ = "USE_CUDNN is not defined";
#endif
  int cublas_version = 0;
  CUBLAS_CHECK(cublasGetVersion(Caffe::cublas_handle(), &cublas_version));
  cublas_version_ = std::to_string(cublas_version);
  // Runtime and driver versions are the raw encoded ints (1000*major+10*minor).
  int cuda_version = 0;
  CUDA_CHECK(cudaRuntimeGetVersion(&cuda_version));
  cuda_version_ = std::to_string(cuda_version);
  int cuda_driver_version = 0;
  CUDA_CHECK(cudaDriverGetVersion(&cuda_driver_version));
  cuda_driver_version_ = std::to_string(cuda_driver_version);
#endif
}
// CUDAが使えるかチェック Waifu2x::eWaifu2xCudaError Waifu2x::can_use_CUDA() { static eWaifu2xCudaError CudaFlag = eWaifu2xCudaError_NotFind; std::call_once(waifu2x_cuda_once_flag, [&]() { int driverVersion = 0; if (cudaDriverGetVersion(&driverVersion) == cudaSuccess) { if (driverVersion > 0) { int runtimeVersion; if (cudaRuntimeGetVersion(&runtimeVersion) == cudaSuccess) { if (runtimeVersion >= MinCudaDriverVersion && driverVersion >= runtimeVersion) CudaFlag = eWaifu2xCudaError_OK; else CudaFlag = eWaifu2xCudaError_OldVersion; } else CudaFlag = eWaifu2xCudaError_NotFind; } else CudaFlag = eWaifu2xCudaError_NotFind; } else CudaFlag = eWaifu2xCudaError_NotFind; }); return CudaFlag; }
/* Allocate and populate an oskar_CudaInfo structure describing the CUDA
 * driver/runtime versions and every device in the system.  On failure to
 * enumerate devices, *status holds the CUDA error and the (partially
 * filled) structure is still returned. */
oskar_CudaInfo* oskar_cuda_info_create(int* status)
{
    oskar_CudaInfo* info;
    int i;

    /* Allocate index. */
    /* NOTE(review): calloc result is not checked; a failed allocation would
     * crash on the first info-> dereference below. */
    info = (oskar_CudaInfo*) calloc(1, sizeof(oskar_CudaInfo));

    /* Get the runtime version and the driver version. */
    cudaDriverGetVersion(&info->driver_version);
    cudaRuntimeGetVersion(&info->runtime_version);

    /* Query the number of devices in the system. */
    *status = cudaGetDeviceCount(&info->num_devices);
    if (*status != cudaSuccess || info->num_devices == 0)
    {
        fprintf(stderr, "Unable to determine number of CUDA devices: %s\n",
                cudaGetErrorString((cudaError_t)(*status)));
        return info;
    }

    /* Allocate array big enough. */
    /* NOTE(review): this calloc is also unchecked before the loop below. */
    info->device = (oskar_CudaDeviceInfo*) calloc(info->num_devices,
            sizeof(oskar_CudaDeviceInfo));

    /* Populate device array. */
    for (i = 0; i < info->num_devices; ++i)
    {
        oskar_cuda_device_info_scan(&(info->device[i]), i);
    }
    return info;
}
// Exercise cudaDeviceEnablePeerAccess / cudaDeviceDisablePeerAccess for every
// ordered device pair (including self pairs), checking the error expected for
// unsupported combinations, then disable access for the pairs that succeeded.
TEST(PeerAccess, EnableDisable) {
    cudaError_t ret;
    int devices;

    ret = cudaGetDeviceCount(&devices);
    ASSERT_EQ(cudaSuccess, ret);

    // Peer access requires at least two devices; nothing to test otherwise.
    if (devices <= 1) {
        return;
    }

    int version;
    ret = cudaRuntimeGetVersion(&version);
    ASSERT_EQ(cudaSuccess, ret);

    // Record the (src, dst) pairs we successfully enabled so we can clean up.
    typedef std::pair<int, int> peer_t;
    std::vector<peer_t> peers;

    for (int i = 0; i < devices; i++) {
        ret = cudaSetDevice(i);
        ASSERT_EQ(cudaSuccess, ret);

        for (int j = 0; j < devices; j++) {
            int peer;
            ret = cudaDeviceCanAccessPeer(&peer, i, j);
            ASSERT_EQ(cudaSuccess, ret);

            cudaError_t expected;
            if (peer) {
                expected = cudaSuccess;
                peers.push_back(peer_t(i, j));
#if CUDA_VERSION >= 5000
            // Runtimes >= 5.0 report a dedicated error for unsupported pairs;
            // older runtimes returned cudaErrorInvalidDevice instead.
            } else if (version >= 5000 /* 5.0 */) {
                expected = cudaErrorPeerAccessUnsupported;
#endif
            } else {
                expected = cudaErrorInvalidDevice;
            }

            ret = cudaDeviceEnablePeerAccess(j, 0);
            EXPECT_EQ(expected, ret);
        }
    }

    /* Cleanup. */
    const size_t n_peers = peers.size();
    for (size_t i = 0; i < n_peers; i++) {
        // Disabling must be done from the source device of each pair.
        ret = cudaSetDevice(peers[i].first);
        ASSERT_EQ(cudaSuccess, ret);

        ret = cudaDeviceDisablePeerAccess(peers[i].second);
        EXPECT_EQ(cudaSuccess, ret);
    }
}
// Print the NPP library version and the CUDA driver/runtime versions, then
// report whether the device meets the requested minimum compute capability
// (cudaVerMajor.cudaVerMinor).  argc/argv are unused but kept to match the
// CUDA-sample helper signature.
bool printfNPPinfo(int argc, char *argv[], int cudaVerMajor, int cudaVerMinor)
{
    const NppLibraryVersion *libVer = nppGetLibVersion();

    printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);

    // Initialize to 0 so we never print indeterminate values if a version
    // query fails (the return codes are intentionally not treated as fatal).
    int driverVersion = 0, runtimeVersion = 0;
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);

    // Versions are encoded as 1000*major + 10*minor.
    printf(" CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
    printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion/1000, (runtimeVersion%100)/10);

    bool bVal = checkCudaCapabilities(cudaVerMajor, cudaVerMinor);
    return bVal;
}
// Repopulate the table widget with the CUDA properties of the device at
// indexDevice, after making it the current CUDA device.  Rows are fixed:
// 0 = name/selection, 1 = versions, 2 = capability, 3 = global memory,
// 4 = multiprocessors/cores, 5-10 = per-block limits, 11-12 = dimension
// limits, 13-14 = kernel timeout / integrated GPU flags.
void DialogSelectHardware::ChangeText(int indexDevice)
{
    int driverVersion = 0, runtimeVersion = 0;
    cudaSetDevice(indexDevice);
    // deviceProp is a member pointer; assumed allocated elsewhere — TODO confirm.
    cudaGetDeviceProperties(deviceProp, indexDevice);
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);

    // Global memory formatted as MBytes plus the raw byte count.
    char msg[256];
    SPRINTF(msg,"%.0f MBytes (%llu bytes)\n",
            (float)deviceProp->totalGlobalMem/1048576.0f, (unsigned long long) deviceProp->totalGlobalMem);

    ui->tableWidget->clear();
    addItem(QString ("Device "+QString::number(indexDevice).append(" : ")+ deviceProp->name),0,0);
    // Mark the row when this device is the one currently selected.
    addItem((selectDevice == indexDevice) ? "Dispositivo Seleccionado " : " ",0,1);
    addItem("CUDA Driver Version / Runtime Version",1,0);
    // Versions are encoded as 1000*major + 10*minor.
    addItem(QString ("%1.%2  /  %3.%4").arg(driverVersion/1000).arg((driverVersion%100)/10).arg(
                runtimeVersion/1000).arg((runtimeVersion%100)/10),1,1);
    addItem("CUDA Capability Major/Minor version number: ",2,0);
    addItem(QString ("%1.%2").arg(deviceProp->major).arg(deviceProp->minor),2,1);
    addItem("Total amount of global memory:",3,0);
    addItem(msg,3,1);
    addItem(QString ("(%1) Multiprocessors, (%2) CUDA Cores/MP:%3 CUDA Cores").arg(
                deviceProp->multiProcessorCount).arg(
                _ConvertSMVer2Cores(deviceProp->major, deviceProp->minor)).arg(
                _ConvertSMVer2Cores(deviceProp->major, deviceProp->minor) * deviceProp->multiProcessorCount),4,0);
    addItem("Total amount of constant memory:",5,0);
    addItem(QString ("%1 bytes").arg(deviceProp->totalConstMem),5,1);
    addItem("Total amount of shared memory per block:",6,0);
    addItem(QString ("%1 bytes").arg(deviceProp->sharedMemPerBlock),6,1);
    addItem("Total number of registers available per block:",7,0);
    addItem(QString ("%1").arg(deviceProp->regsPerBlock),7,1);
    addItem("Warp size:",8,0);
    addItem(QString ("%1").arg(deviceProp->warpSize),8,1);
    addItem("Maximum number of threads per multiprocessor:",9,0);
    addItem(QString ("%1").arg(deviceProp->maxThreadsPerMultiProcessor),9,1);
    addItem("Maximum number of threads per block:",10,0);
    addItem(QString ("%1").arg(deviceProp->maxThreadsPerBlock),10,1);
    addItem("Max dimension size of a thread block (x,y,z):",11,0);
    addItem(QString ("(%1, %2, %3)").arg(deviceProp->maxThreadsDim[0]).arg(
                deviceProp->maxThreadsDim[1]).arg(
                deviceProp->maxThreadsDim[2]),11,1);
    addItem("Max dimension size of a grid size (x,y,z):",12,0);
    addItem(QString ("(%1, %2, %3)\n").arg(deviceProp->maxGridSize[0]).arg(deviceProp->maxGridSize[1]).arg(deviceProp->maxGridSize[2]),12,1);
    addItem("Run time limit on kernels: ",13,0);
    addItem(QString ("%1\n").arg(deviceProp->kernelExecTimeoutEnabled ? "Yes" : "No"),13,1);
    addItem("Integrated GPU sharing Host Memory: ",14,0);
    addItem( QString ("%1\n").arg(deviceProp->integrated ? "Yes" : "No"),14,1);

    // Fit cells to the freshly inserted contents.
    ui->tableWidget->resizeColumnsToContents();
    ui->tableWidget->resizeRowsToContents();
}
// Query the CUDA driver/runtime versions and the properties of the device
// selected by device_id, copy them into the member fields, then make that
// device current and create the cuBLAS/cuSPARSE handles on it.
// Throws neural_network_exception when no device exists or device_id is
// out of range.
void cuda_running_configuration::update_parameters()
{
	cuda_safe_call(cudaDriverGetVersion(&driver_version));
	cuda_safe_call(cudaRuntimeGetVersion(&runtime_version));

	int device_count;
	cuda_safe_call(cudaGetDeviceCount(&device_count));
	if (device_count <= 0)
		throw neural_network_exception("No CUDA capable devices are found");

	if (device_id >= device_count)
		throw neural_network_exception((boost::format("Device ID %1% specified while %2% devices are available") % device_id % device_count).str());

	cudaDeviceProp device_prop;
	cuda_safe_call(cudaGetDeviceProperties(&device_prop, device_id));

	// Mirror the relevant cudaDeviceProp fields into our own members.
	device_name = device_prop.name;
	compute_capability_major = device_prop.major;
	compute_capability_minor = device_prop.minor;
	clock_rate = device_prop.clockRate;
	memory_clock_rate = device_prop.memoryClockRate;
	memory_bus_width = device_prop.memoryBusWidth;
	global_memory_size = device_prop.totalGlobalMem;
	ecc_enabled = (device_prop.ECCEnabled != 0);
	l2_cache_size = device_prop.l2CacheSize;
	multiprocessor_count = device_prop.multiProcessorCount;
	smem_per_block = device_prop.sharedMemPerBlock;
	max_threads_per_multiprocessor = device_prop.maxThreadsPerMultiProcessor;
	max_threads_per_block = device_prop.maxThreadsPerBlock;
	for(int i = 0; i < sizeof(max_threads_dim) / sizeof(max_threads_dim[0]); ++i)
		max_threads_dim[i] = device_prop.maxThreadsDim[i];
	for(int i = 0; i < sizeof(max_grid_size) / sizeof(max_grid_size[0]); ++i)
		max_grid_size[i] = device_prop.maxGridSize[i];
	max_texture_1d_linear = device_prop.maxTexture1DLinear;
	texture_alignment = device_prop.textureAlignment;
	pci_bus_id = device_prop.pciBusID;
	pci_device_id = device_prop.pciDeviceID;
#ifdef _WIN32
	// TCC driver mode is only meaningful on Windows.
	tcc_mode = (device_prop.tccDriver != 0);
#endif

	// Bind the device and create the library handles on it.
	cuda_safe_call(cudaSetDevice(device_id));

	cublas_safe_call(cublasCreate(&cublas_handle));

	cusparse_safe_call(cusparseCreate(&cusparse_handle));
}
// Print the runtime version and SM capability of device `dev`, and return
// true when the environment meets the profile: CUDA runtime >= 3.1 and
// compute capability >= 2.0.
bool checkCUDAProfile(int dev)
{
    int runtimeVersion = 0;

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);

    fprintf(stderr,"\nDevice %d: \"%s\"\n", dev, deviceProp.name);
    cudaRuntimeGetVersion(&runtimeVersion);
    fprintf(stderr,"  CUDA Runtime Version:\t%d.%d\n", runtimeVersion/1000, (runtimeVersion%100)/10);
    fprintf(stderr,"  CUDA SM Capability  :\t%d.%d\n", deviceProp.major, deviceProp.minor);

    // Versions are encoded as 1000*major + 10*minor, so "runtime >= 3.1" is a
    // single integer comparison against 3010.  The previous test
    // (v/1000 >= 3 && v%100 >= 1) wrongly rejected any x.0 runtime (4.0, 5.0,
    // ...) because the minor part is 0.
    if (runtimeVersion >= 3010 && deviceProp.major >= 2)
    {
        return true;
    }
    else
    {
        return false;
    }
}
// Report device `dev`'s runtime version and compute capability on stderr and
// decide whether it satisfies the requested minimums.  min_runtime uses the
// encoded CUDA form (1000*major + 10*minor); min_compute packs the capability
// as (major << 4) + minor.
bool checkCUDAProfile(int dev, int min_runtime, int min_compute)
{
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);

    fprintf(stderr,"\nDevice %d: \"%s\"\n", dev, deviceProp.name);

    int runtimeVersion = 0;
    cudaRuntimeGetVersion(&runtimeVersion);
    fprintf(stderr,"  CUDA Runtime Version :\t%d.%d\n", runtimeVersion/1000, (runtimeVersion%100)/10);
    fprintf(stderr,"  CUDA Compute Capability :\t%d.%d\n", deviceProp.major, deviceProp.minor);

    // Pack the capability the same way min_compute is packed, then compare.
    const int compute = (deviceProp.major << 4) + deviceProp.minor;
    return (runtimeVersion >= min_runtime) && (compute >= min_compute);
}
char CudaBase::CheckCUDevice() { int deviceCount = 0; if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) { std::cout << "Cannot find CUDA device!"; return 0; } if(deviceCount>0) { std::cout << "Found " << deviceCount << " device(s)\n"; int driverVersion = 0, runtimeVersion = 0; cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, 0); cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); std::cout << " Device name: " << deviceProp.name<<"\n"; std::cout << " Diver Version: " << driverVersion<<"\n"; std::cout << " Runtime Version: " << runtimeVersion<<"\n"; std::cout << " Capability Major/Minor version number: "<<deviceProp.major<<"."<<deviceProp.minor<<"\n"; std::cout << " Total amount of global memory: "<<(unsigned long long)deviceProp.totalGlobalMem<<" bytes\n"; std::cout << " Total amount of constant memory: "<<deviceProp.totalConstMem<<"bytes\n"; std::cout << " Total amount of shared memory per block: "<<deviceProp.sharedMemPerBlock<<" bytes\n"; std::cout << " Total number of registers available per block: "<<deviceProp.regsPerBlock<<"\n"; std::cout << " Warp size: "<<deviceProp.warpSize<<"\n"; std::stringstream sst; sst<<" Maximum sizes of each dimension of a grid: "<<deviceProp.maxGridSize[0]<<" x "<<deviceProp.maxGridSize[1]<<" x "<<deviceProp.maxGridSize[2]; std::cout<<sst.str()<<"\n"; sst.str(""); sst<<" Maximum sizes of each dimension of a block: "<<deviceProp.maxThreadsDim[0]<<" x "<<deviceProp.maxThreadsDim[1]<<" x "<<deviceProp.maxThreadsDim[2]; std::cout<<sst.str()<<"\n"; std::cout << " Maximum number of threads per block: " << deviceProp.maxThreadsPerBlock<<"\n"; MaxThreadPerBlock = deviceProp.maxThreadsPerBlock; MaxRegisterPerBlock = deviceProp.regsPerBlock; MaxSharedMemoryPerBlock = deviceProp.sharedMemPerBlock; WarpSize = deviceProp.warpSize; RuntimeVersion = runtimeVersion; return 1; } return 0; }
/* Populate an oskar_Device record with the CUDA driver/runtime versions and
 * the properties of the device at device->index, then derive the core count
 * and mark the record initialised. */
void oskar_device_get_info_cuda(oskar_Device* device)
{
#ifdef OSKAR_HAVE_CUDA
    struct cudaDeviceProp prop;
    cudaDriverGetVersion(&device->cuda_driver_version);
    cudaRuntimeGetVersion(&device->cuda_runtime_version);
    cudaGetDeviceProperties(&prop, device->index);
    /* Resize the name/vendor buffers before copying into them. */
    device->name = (char*) realloc(device->name, 1 + strlen(prop.name));
    device->vendor = (char*) realloc(device->vendor, 1 + strlen("NVIDIA"));
    strcpy(device->name, prop.name);
    strcpy(device->vendor, "NVIDIA");
    device->is_nv = 1;
    device->platform_type = 'C';
    device->device_type = 'G';
    device->compute_capability[0] = prop.major;
    device->compute_capability[1] = prop.minor;
    /* Double precision test: major >= 2 always qualifies; "minor >= 3" is
     * presumably meant to catch capability 1.3 — NOTE(review): it would also
     * match hypothetical x.3+ parts, verify this is the intended rule. */
    device->supports_double = 0;
    if (prop.major >= 2 || prop.minor >= 3)
        device->supports_double = 1;
    device->supports_atomic32 = 1;
    device->supports_atomic64 = 1;
    device->global_mem_cache_size = (size_t) prop.l2CacheSize;
    device->local_mem_size = prop.sharedMemPerBlock;
    device->max_work_group_size = (size_t) prop.maxThreadsPerBlock;
    device->max_local_size[0] = prop.maxThreadsDim[0];
    device->max_local_size[1] = prop.maxThreadsDim[1];
    device->max_local_size[2] = prop.maxThreadsDim[2];
    device->max_compute_units = prop.multiProcessorCount;
    device->max_clock_freq_kHz = prop.clockRate;
    device->memory_clock_freq_kHz = prop.memoryClockRate;
    device->memory_bus_width = prop.memoryBusWidth;
    device->num_registers = (unsigned int) prop.regsPerBlock;
    device->warp_size = prop.warpSize;
    cudaMemGetInfo(&device->global_mem_free_size, &device->global_mem_size);
#endif
    /* NOTE(review): executed even without OSKAR_HAVE_CUDA, in which case
     * max_compute_units / compute_capability keep whatever values the record
     * already held. */
    device->num_cores = device->max_compute_units * oskar_get_num_cuda_cores(
            device->compute_capability[0], device->compute_capability[1]);
    device->init = 1;
}
// CUDAが使えるかチェック Waifu2x::eWaifu2xCudaError Waifu2x::can_use_CUDA() { static eWaifu2xCudaError CudaFlag = eWaifu2xCudaError_NotFind; std::call_once(waifu2x_cuda_once_flag, [&]() { int driverVersion = 0; if (cudaDriverGetVersion(&driverVersion) == cudaSuccess) { if (driverVersion > 0) { int runtimeVersion; if (cudaRuntimeGetVersion(&runtimeVersion) == cudaSuccess) { if (runtimeVersion >= MinCudaDriverVersion && driverVersion >= runtimeVersion) { cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); if (prop.major >= 2) CudaFlag = eWaifu2xCudaError_OK; else CudaFlag = eWaifu2xCudaError_OldDevice; } else CudaFlag = eWaifu2xCudaError_OldVersion; } else CudaFlag = eWaifu2xCudaError_NotFind; } else CudaFlag = eWaifu2xCudaError_NotFind; } else CudaFlag = eWaifu2xCudaError_NotFind; }); return CudaFlag; }
// Log a device-query report (CUDART-era deviceQuery sample adapted to
// log_printf).  Iterates over every CUDA device and prints its properties;
// sections are gated on the CUDART_VERSION the code was compiled against.
void CUT_DEVICE_QUERY()
{
    log_printf(INFO,"CUDA Device Query (Runtime API) version (CUDART static linking)\n");

    int deviceCount;
    cudaGetDeviceCount(&deviceCount);

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
        log_printf(INFO,"There is no device supporting CUDA\n");

    int dev;
    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        if (dev == 0)
        {
            // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
                log_printf(INFO,"There is no device supporting CUDA.\n");
            else if (deviceCount == 1)
                log_printf(INFO,"There is 1 device supporting CUDA\n");
            else
                log_printf(INFO,"There are %d devices supporting CUDA\n", deviceCount);
        }

        log_printf(INFO,"\n");
        log_printf(INFO,"Device %d: \"%s\"\n", dev, deviceProp.name);

#if CUDART_VERSION >= 2020
        // NOTE(review): minor version printed as v%100 (e.g. 9.2 -> "9.20");
        // other code in this project uses (v%100)/10 — confirm which is wanted.
        int driverVersion = 0, runtimeVersion = 0;
        cudaDriverGetVersion(&driverVersion);
        log_printf(INFO,"  CUDA Driver Version: %d.%d\n", driverVersion/1000, driverVersion%100);
        cudaRuntimeGetVersion(&runtimeVersion);
        log_printf(INFO,"  CUDA Runtime Version: %d.%d\n", runtimeVersion/1000, runtimeVersion%100);
#endif

        log_printf(INFO,"  CUDA Capability Major revision number: %d\n", deviceProp.major);
        log_printf(INFO,"  CUDA Capability Minor revision number: %d\n", deviceProp.minor);
        // NOTE(review): %u with size_t fields (totalGlobalMem etc.) truncates
        // on LP64 platforms — legacy sample formatting kept as-is.
        log_printf(INFO,"  Total amount of global memory: %u bytes\n", deviceProp.totalGlobalMem);
#if CUDART_VERSION >= 2000
        // The fixed "8 cores per multiprocessor" only holds for G80-class
        // (compute 1.x) hardware.
        log_printf(INFO,"  Number of multiprocessors: %d\n", deviceProp.multiProcessorCount);
        log_printf(INFO,"  Number of cores: %d\n", 8 * deviceProp.multiProcessorCount);
#endif
        log_printf(INFO,"  Total amount of constant memory: %u bytes\n", deviceProp.totalConstMem);
        log_printf(INFO,"  Total amount of shared memory per block: %u bytes\n", deviceProp.sharedMemPerBlock);
        log_printf(INFO,"  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        log_printf(INFO,"  Warp size: %d\n", deviceProp.warpSize);
        log_printf(INFO,"  Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        log_printf(INFO,"  Maximum sizes of each dimension of a block: %d x %d x %d\n",
                   deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        log_printf(INFO,"  Maximum sizes of each dimension of a grid: %d x %d x %d\n",
                   deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        log_printf(INFO,"  Maximum memory pitch: %u bytes\n", deviceProp.memPitch);
        log_printf(INFO,"  Texture alignment: %u bytes\n", deviceProp.textureAlignment);
        log_printf(INFO,"  Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 2000
        log_printf(INFO,"  Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 2020
        log_printf(INFO,"  Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        log_printf(INFO,"  Integrated: %s\n", deviceProp.integrated ? "Yes" : "No");
        log_printf(INFO,"  Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        log_printf(INFO,"  Compute mode: %s\n", deviceProp.computeMode == cudaComputeModeDefault ?
                   "Default (multiple host threads can use this device simultaneously)" :
                   deviceProp.computeMode == cudaComputeModeExclusive ?
                   "Exclusive (only one host thread at a time can use this device)" :
                   deviceProp.computeMode == cudaComputeModeProhibited ?
                   "Prohibited (no host thread can use this device)" :
                   "Unknown");
#endif
    }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// deviceQuery-style entry point: enumerate CUDA devices, print a detailed
// property report for each, probe peer-to-peer (P2P) support between device
// pairs, and finish with a one-line CSV summary.  Exits with EXIT_FAILURE if
// device enumeration fails, EXIT_SUCCESS otherwise.
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        printf("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

    // driverVersion/runtimeVersion are reused by the CSV summary after the loop.
    int dev, driverVersion = 0, runtimeVersion = 0;

    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
        printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
        SPRINTF(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        printf("%s", msg);

        printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf(" GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);

#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
        printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);

        if (deviceProp.l2CacheSize)
        {
            printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
        }
#else
        // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf(" Memory Bus Width: %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        if (L2CacheSize)
        {
            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
        }
#endif

        printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
               deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
        printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d layers\n",
               deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);

        printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
        printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
        printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf(" Warp size: %d\n", deviceProp.warpSize);
        printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
        printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
        printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
        printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
        printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
        printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

        const char *sComputeMode[] =
        {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        printf(" Compute Mode:\n");
        printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
    }

    // If there are 2 or more GPUs, query to determine whether RDMA is supported
    if (deviceCount >= 2)
    {
        cudaDeviceProp prop[64];
        int gpuid[64]; // we want to find the first two GPUs that can support P2P
        int gpu_p2p_count = 0;

        for (int i=0; i < deviceCount; i++)
        {
            checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));

            // Only boards based on Fermi or later can support P2P
            if ((prop[i].major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
                // on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to support this
                && prop[i].tccDriver
#endif
               )
            {
                // This is an array of P2P capable GPUs
                gpuid[gpu_p2p_count++] = i;
            }
        }

        // Show all the combinations of support P2P GPUs
        int can_access_peer;

        if (gpu_p2p_count >= 2)
        {
            for (int i = 0; i < gpu_p2p_count; i++)
            {
                for (int j = 0; j < gpu_p2p_count; j++)
                {
                    if (gpuid[i] == gpuid[j])
                    {
                        continue;
                    }
                    checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j] , can_access_peer ? "Yes" : "No");
                }
            }
        }
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    // NOTE(review): cTemp is 16 bytes; ", Device%d = " needs 13+ bytes and
    // would overflow for three-digit device indices — confirm acceptable.
    char cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#endif
    sProfileString += cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
    sProfileString += cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;

    // Print Out all device Names
    for (dev = 0; dev < deviceCount; ++dev)
    {
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        sprintf_s(cTemp, 13, ", Device%d = ", dev);
#else
        sprintf(cTemp, ", Device%d = ", dev);
#endif
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += cTemp;
        sProfileString += deviceProp.name;
    }

    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    printf("Result = PASS\n");

    // finish
    exit(EXIT_SUCCESS);
}
// Build the "Hardware Selection" group box: report CUDA driver/runtime
// versions, enumerate the available devices into a table view, and offer an
// "Optimal Device" button when more than one device is present.
QHardwareWidget::QHardwareWidget(QWidget* pParent) :
	QGroupBox(pParent),
	m_MainLayout(),
	m_Devices(),
	m_OptimalDevice()
{
	setTitle("Hardware Selection");
	setStatusTip("Hardware Selection");
	setToolTip("Hardware Selection");

	m_MainLayout.setColumnMinimumWidth(0, 75);
	setLayout(&m_MainLayout);

	int DriverVersion = 0, RuntimeVersion = 0;

	cudaDriverGetVersion(&DriverVersion);
	cudaRuntimeGetVersion(&RuntimeVersion);

	// NOTE(review): minor version shown as V%100 (e.g. 10.2 -> "10.20");
	// CUDA encodes versions as 1000*major + 10*minor — confirm whether
	// (V%100)/10 was intended.
	QString DriverVersionString = QString::number(DriverVersion / 1000) + "." + QString::number(DriverVersion % 100);
	QString RuntimeVersionString = QString::number(RuntimeVersion / 1000) + "." + QString::number(RuntimeVersion % 100);

	gStatus.SetStatisticChanged("Graphics Card", "CUDA Driver Version", DriverVersionString);
	gStatus.SetStatisticChanged("Graphics Card", "CUDA Runtime Version", RuntimeVersionString);

	QString VersionInfo;

	VersionInfo += "CUDA Driver Version: " + DriverVersionString;
	VersionInfo += ", CUDA Runtime Version: " + RuntimeVersionString;

	m_MainLayout.addWidget(new QLabel(VersionInfo), 0, 0, 1, 2);
	m_MainLayout.addWidget(&m_Devices, 1, 0, 1, 2);

	// Fill the model with the CUDA devices found on this machine.
	m_Model.EnumerateDevices();

	// Configure the device table: single-row selection, compact rows,
	// columns sized to contents with the last one stretched.
	m_Devices.horizontalHeader()->setResizeMode(QHeaderView::ResizeToContents);
	m_Devices.horizontalHeader()->setStretchLastSection(true);
	m_Devices.horizontalHeader()->setDefaultSectionSize(1);
	m_Devices.horizontalHeader()->setDefaultAlignment(Qt::AlignLeft);
	m_Devices.horizontalHeader()->setHighlightSections(false);
	m_Devices.verticalHeader()->setVisible(false);
	m_Devices.verticalHeader()->setDefaultSectionSize(20);
	m_Devices.setSelectionMode(QAbstractItemView::SingleSelection);
	m_Devices.setSelectionBehavior(QAbstractItemView::SelectRows);
	m_Devices.setFixedHeight(75);
	m_Devices.setModel(&m_Model);

	m_OptimalDevice.setText("Optimal Device");
	m_OptimalDevice.setToolTip("Optimal Device");
	m_OptimalDevice.setStatusTip("Choose the most optimal device for rendering");
	m_OptimalDevice.setFixedWidth(90);
	// Only show the button when there is actually a choice to make.
	m_OptimalDevice.setVisible(m_Model.rowCount(QModelIndex()) > 1);

	m_MainLayout.addWidget(&m_OptimalDevice);

	QObject::connect(&m_OptimalDevice, SIGNAL(clicked()), this, SLOT(OnOptimalDevice()));
	QObject::connect(&m_Devices, SIGNAL(clicked(const QModelIndex&)), this, SLOT(OnSelection(const QModelIndex&)));

	// Start with the optimal device selected.
	OnOptimalDevice();
}
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { pArgc = &argc; pArgv = argv; /* shrQAStart(argc, argv); shrSetLogFileName ("deviceQuery.txt"); */ shrLog("%s Starting...\n\n", argv[0]); shrLog(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n"); int deviceCount = 0; cudaError_t error_id = cudaGetDeviceCount(&deviceCount); if (error_id != cudaSuccess) { shrLog( "cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id) ); return -1; } // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) shrLog("There is no device supporting CUDA\n"); else shrLog("Found %d CUDA Capable device(s)\n", deviceCount); int dev, driverVersion = 0, runtimeVersion = 0; for (dev = 0; dev < deviceCount; ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); shrLog("\nDevice %d: \"%s\"\n", dev, deviceProp.name); #if CUDART_VERSION >= 2020 // Console log cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); shrLog(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10); #endif shrLog(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor); char msg[256]; sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem); shrLog(msg); #if CUDART_VERSION >= 2000 shrLog(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", deviceProp.multiProcessorCount, ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount); #endif shrLog(" GPU Clock Speed: %.2f GHz\n", deviceProp.clockRate * 
1e-6f); #if CUDART_VERSION >= 4000 // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output int memoryClock; getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); shrLog(" Memory Clock rate: %.2f Mhz\n", memoryClock * 1e-3f); int memBusWidth; getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev ); shrLog(" Memory Bus Width: %d-bit\n", memBusWidth); int L2CacheSize; getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev ); if (L2CacheSize) { shrLog(" L2 Cache Size: %d bytes\n", L2CacheSize); } shrLog(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]); shrLog(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]); #endif shrLog(" Total amount of constant memory: %u bytes\n", deviceProp.totalConstMem); shrLog(" Total amount of shared memory per block: %u bytes\n", deviceProp.sharedMemPerBlock); shrLog(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock); shrLog(" Warp size: %d\n", deviceProp.warpSize); shrLog(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); shrLog(" Maximum sizes of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); shrLog(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); shrLog(" Maximum memory pitch: %u bytes\n", deviceProp.memPitch); shrLog(" Texture alignment: %u bytes\n", 
deviceProp.textureAlignment); #if CUDART_VERSION >= 4000 shrLog(" Concurrent copy and execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount); #else shrLog(" Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No"); #endif #if CUDART_VERSION >= 2020 shrLog(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); shrLog(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No"); shrLog(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3000 shrLog(" Concurrent kernel execution: %s\n", deviceProp.concurrentKernels ? "Yes" : "No"); shrLog(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3010 shrLog(" Device has ECC support enabled: %s\n", deviceProp.ECCEnabled ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3020 shrLog(" Device is using TCC driver mode: %s\n", deviceProp.tccDriver ? "Yes" : "No"); #endif #if CUDART_VERSION >= 4000 shrLog(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? 
"Yes" : "No"); shrLog(" Device PCI Bus ID / PCI location ID: %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID ); #endif #if CUDART_VERSION >= 2020 const char *sComputeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", "Prohibited (no host thread can use ::cudaSetDevice() with this device)", "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", "Unknown", NULL }; shrLog(" Compute Mode:\n"); shrLog(" < %s >\n", sComputeMode[deviceProp.computeMode]); #endif } // csv masterlog info // ***************************** // exe and CUDA driver name shrLog("\n"); std::string sProfileString = "deviceQuery, CUDA Driver = CUDART"; char cTemp[10]; // driver version sProfileString += ", CUDA Driver Version = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #else sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10); #endif sProfileString += cTemp; // Runtime version sProfileString += ", CUDA Runtime Version = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #else sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10); #endif sProfileString += cTemp; // Device count sProfileString += ", NumDevs = "; #ifdef WIN32 sprintf_s(cTemp, 10, "%d", deviceCount); #else sprintf(cTemp, "%d", deviceCount); #endif sProfileString += cTemp; // First 2 device names, if any for (dev = 0; dev < ((deviceCount > 2) ? 
2 : deviceCount); ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); sProfileString += ", Device = "; sProfileString += deviceProp.name; } sProfileString += "\n"; //shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str()); std::cout << sProfileString.c_str() << std::endl; std::cout << "Press <ENTER>" << std::endl; // getchar(); runtimeTest(); getchar(); // finish return 0; }
///////////////////////////////////////////////////////////////////////////// // Device info dump (this code is taken from the SDK's deviceQuery example) ///////////////////////////////////////////////////////////////////////////// static int dump() { int deviceCount = 0; // This function call returns 0 if there are no CUDA capable devices. cudaGetDeviceCount(&deviceCount); if (deviceCount == 0) { printf("There is no device supporting CUDA\n"); return (1); } int dev = 0; for (dev = 0; dev < deviceCount; ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); if (dev == 0) { // This function call returns 9999 for both major & minor fields, // if no CUDA capable devices are present if (deviceProp.major == 9999 && deviceProp.minor == 9999) { printf("There is no device supporting CUDA.\n"); return (1); } else if (deviceCount == 1) printf("There is 1 device supporting CUDA:\n"); else printf("There are %d devices supporting CUDA:\n", deviceCount); } printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); int driverVersion = 0, runtimeVersion = 0; cudaDriverGetVersion(&driverVersion); printf(" CUDA Driver Version: %d.%d\n", driverVersion/1000, driverVersion%100); cudaRuntimeGetVersion(&runtimeVersion); printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion/1000, runtimeVersion%100); printf(" CUDA Capability Major revision number: %d\n", deviceProp.major); printf(" CUDA Capability Minor revision number: %d\n", deviceProp.minor); printf(" Total amount of global memory: %u bytes\n", (unsigned int)deviceProp.totalGlobalMem); printf(" Number of multiprocessors: %d\n", deviceProp.multiProcessorCount); printf(" Number of cores: %d\n", nGpuArchCoresPerSM[deviceProp.major] * deviceProp.multiProcessorCount); printf(" Total amount of constant memory: %u bytes\n", (unsigned int)deviceProp.totalConstMem); printf(" Total amount of shared memory per block: %u bytes\n", (unsigned int)deviceProp.sharedMemPerBlock); printf(" Total number of registers available per 
block: %d\n", deviceProp.regsPerBlock); printf(" Warp size: %d\n", deviceProp.warpSize); printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); printf(" Maximum memory pitch: %u bytes\n", (unsigned int)deviceProp.memPitch); printf(" Texture alignment: %u bytes\n", (unsigned int)deviceProp.textureAlignment); printf(" Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f); printf(" Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No"); printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); printf(" Integrated: %s\n", deviceProp.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); printf(" Compute mode: %s\n", deviceProp.computeMode == cudaComputeModeDefault ? "Default (multiple host threads can use this device " "simultaneously)" : deviceProp.computeMode == cudaComputeModeExclusive ? "Exclusive (only one host thread at a time can use this device)" : deviceProp.computeMode == cudaComputeModeProhibited ? "Prohibited (no host thread can use this device)" : "Unknown"); } return (0); }
void CudaDeviceDialog::updateInfo(int index) { m_infoText = "<html><body>"; int deviceCount = 0; cudaGetDeviceCount(&deviceCount); cudaDeviceProp p; cudaGetDeviceProperties(&p, index); if (p.major == 9999 && p.minor == 9999) m_infoText += "<p>There is no device supporting CUDA</p>"; else if (deviceCount == 1) m_infoText += "<p>There is 1 device supporting CUDA</p>"; else m_infoText += QString("<p>There are %1 devices supporting CUDA</p>").arg(deviceCount); m_infoText += QString("<p>CUDA Driver/Runtime</p>"); m_infoText += "<table>"; int driverVersion = 0, runtimeVersion = 0; cudaDriverGetVersion(&driverVersion); cudaRuntimeGetVersion(&runtimeVersion); QString error = "<span style='color:red;'>***ERROR*** >= 4.0 required</span>"; addItem(1, "CUDA Driver Version:", QString("%1.%2 %3").arg(driverVersion/1000).arg(driverVersion%100) .arg((driverVersion >= 4000)? "" : error)); addItem(1, "CUDA Runtime Version:", QString("%1.%2 %3").arg(runtimeVersion/1000).arg(runtimeVersion%100) .arg((driverVersion >= 4000)? 
"" : error)); m_infoText += "</table>"; if (index < deviceCount) { m_infoText += QString("<p>Device %1: "%2"</p>").arg(index).arg(p.name); m_infoText += "<table>"; addItem(1, "CUDA Capability Major/Minor version number:", QString("%1.%2").arg(p.major).arg(p.minor)); addItem(1, "Total amount of global memory:", QString("%1 MB").arg(p.totalGlobalMem / 1024 / 1024)); addItem(1, QString("%1 Multiprocessors x %2 CUDA Cores/MP:").arg(p.multiProcessorCount).arg(ConvertSMVer2Cores(p.major, p.minor)), QString("%1 CUDA Cores").arg(ConvertSMVer2Cores(p.major, p.minor) * p.multiProcessorCount)); addItem(1, "Total amount of constant memory:", QString("%1 bytes").arg(p.totalConstMem)); addItem(1, "Total amount of shared memory per block:", QString("%1 bytes").arg(p.sharedMemPerBlock)); addItem(1, "Total number of registers available per block:", QString("%1").arg(p.regsPerBlock)); addItem(1, "Warp size:", QString("%1").arg(p.warpSize)); addItem(1, "Maximum number of threads per block:", QString("%1").arg(p.maxThreadsPerBlock)); addItem(1, "Maximum sizes of each dimension of a block:", QString("%1 x %2 x %3") .arg(p.maxThreadsDim[0]) .arg(p.maxThreadsDim[1]) .arg(p.maxThreadsDim[2])); addItem(1, "Maximum sizes of each dimension of a grid:", QString("%1 x %2 x %3") .arg(p.maxGridSize[0]) .arg(p.maxGridSize[1]) .arg(p.maxGridSize[2])); addItem(1, "Maximum memory pitch:", QString("%1 bytes").arg(p.memPitch)); addItem(1, "Texture alignment:", QString("%1 bytes").arg(p.textureAlignment)); addItem(1, "Clock rate:", QString("%1 GHz").arg(p.clockRate * 1e-6f)); addItem(1, "Concurrent copy and execution:", p.deviceOverlap ? "yes" : "no"); addItem(1, "# of Asynchronous Copy Engines:", QString("%1").arg(p.asyncEngineCount)); addItem(1, "Run time limit on kernels:", p.kernelExecTimeoutEnabled ? "yes" : "no"); addItem(1, "Integrated:", p.integrated ? "yes" : "no"); addItem(1, "Support host page-locked memory mapping:", p.canMapHostMemory ? 
"yes" : "no"); addItem(1, "Compute mode:", p.computeMode == cudaComputeModeDefault ? "Default (multiple host threads can use this device simultaneously)" : p.computeMode == cudaComputeModeExclusive ? "Exclusive (only one host thread at a time can use this device)" : p.computeMode == cudaComputeModeProhibited ? "Prohibited (no host thread can use this device)" : "Unknown"); addItem(1, "Concurrent kernel execution:", p.concurrentKernels ? "yes" : "no"); addItem(1, "Device has ECC support enabled:", p.ECCEnabled ? "yes" : "no"); addItem(1, "Device is using TCC driver mode:", p.tccDriver ? "yes" : "no"); m_infoText += "</table>"; } m_infoText += "</body></html>"; m->info->setHtml(m_infoText); m->buttonBox->button(QDialogButtonBox::Ok)->setEnabled((driverVersion >= 4000) && (runtimeVersion >= 4000)); }
/** Documented at declaration */ int gpujpeg_init_device(int device_id, int flags) { int dev_count; cudaGetDeviceCount(&dev_count); if ( dev_count == 0 ) { fprintf(stderr, "[GPUJPEG] [Error] No CUDA enabled device\n"); return -1; } if ( device_id < 0 || device_id >= dev_count ) { fprintf(stderr, "[GPUJPEG] [Error] Selected device %d is out of bound. Devices on your system are in range %d - %d\n", device_id, 0, dev_count - 1); return -1; } struct cudaDeviceProp devProp; if ( cudaSuccess != cudaGetDeviceProperties(&devProp, device_id) ) { fprintf(stderr, "[GPUJPEG] [Error] Can't get CUDA device properties!\n" "[GPUJPEG] [Error] Do you have proper driver for CUDA installed?\n" ); return -1; } if ( devProp.major < 1 ) { fprintf(stderr, "[GPUJPEG] [Error] Device %d does not support CUDA\n", device_id); return -1; } if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY ) { cudaGLSetGLDevice(device_id); gpujpeg_cuda_check_error("Enabling OpenGL interoperability"); } if ( flags & GPUJPEG_VERBOSE ) { int cuda_driver_version = 0; cudaDriverGetVersion(&cuda_driver_version); printf("CUDA driver version: %d.%d\n", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10); int cuda_runtime_version = 0; cudaRuntimeGetVersion(&cuda_runtime_version); printf("CUDA runtime version: %d.%d\n", cuda_runtime_version / 1000, (cuda_runtime_version % 100) / 10); printf("Using Device #%d: %s (c.c. 
%d.%d)\n", device_id, devProp.name, devProp.major, devProp.minor); } cudaSetDevice(device_id); gpujpeg_cuda_check_error("Set CUDA device"); // Test by simple copying that the device is ready uint8_t data[] = {8}; uint8_t* d_data = NULL; cudaMalloc((void**)&d_data, 1); cudaMemcpy(d_data, data, 1, cudaMemcpyHostToDevice); cudaFree(d_data); cudaError_t error = cudaGetLastError(); if ( cudaSuccess != error ) { fprintf(stderr, "[GPUJPEG] [Error] Failed to initialize CUDA device.\n"); if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY ) fprintf(stderr, "[GPUJPEG] [Info] OpenGL interoperability is used, is OpenGL context available?\n"); return -1; } return 0; }
/*
 * Wine thunk for cudaRuntimeGetVersion(): emits a trace entry, then forwards
 * the call unchanged to the native CUDA runtime and returns its status.
 */
cudaError_t WINAPI wine_cudaRuntimeGetVersion( int *runtimeVersion )
{
    WINE_TRACE("\n");

    const cudaError_t status = cudaRuntimeGetVersion( runtimeVersion );
    return status;
}
// Dumps the properties of all CUDA devices to stdout and builds a CSV-style
// summary string (deviceQuery-derived code).
// NOTE(review): the `deviceSelected` parameter is never read in this body —
// confirm whether callers expect it to filter the output.
// NOTE(review): the function always returns cudaSuccess regardless of what
// the enumeration reported; errors are only printed.
int SimCudaHelper::PrintDevices(int deviceSelected)
{
    int deviceCount = 0;
    // Failure here leaves deviceCount at 0, so the loop below is skipped
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
        printf("cudaGetDeviceCount FAILED CUDA Driver and Runtime version may be mismatched.\n");
        printf("\nFAILED\n");
    }
    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
        printf("There is no device supporting CUDA\n");

    int dev;
    int driverVersion = 0, runtimeVersion = 0;
    for (dev = 0; dev < deviceCount; ++dev) {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        if (dev == 0) {
            // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
                printf("There is no device supporting CUDA.\n");
            else if (deviceCount == 1)
                printf("There is 1 device supporting CUDA\n");
            else
                printf("There are %d devices supporting CUDA\n", deviceCount);
        }
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
#if CUDART_VERSION >= 2020
        // Console log. Versions are encoded as 1000*major + 10*minor;
        // NOTE(review): "% 100" prints the minor*10 form (e.g. 4020 -> "4.20")
        cudaDriverGetVersion(&driverVersion);
        printf(" CUDA Driver Version: %d.%d\n", driverVersion / 1000, driverVersion % 100);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000, runtimeVersion % 100);
#endif
        printf(" CUDA Capability Major revision number: %d\n", deviceProp.major);
        printf(" CUDA Capability Minor revision number: %d\n", deviceProp.minor);
        printf(" Total amount of global memory: %zu bytes\n", deviceProp.totalGlobalMem);
#if CUDART_VERSION >= 2000
        printf(" Number of multiprocessors: %d\n", deviceProp.multiProcessorCount);
        //printf(" Number of cores: %d\n", nGpuArchCoresPerSM[deviceProp.major] * deviceProp.multiProcessorCount);
#endif
        printf(" Total amount of constant memory: %zu bytes\n", deviceProp.totalConstMem);
        printf(" Total amount of shared memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf(" Warp size: %d\n", deviceProp.warpSize);
        printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf(" Maximum memory pitch: %zu bytes\n", deviceProp.memPitch);
        printf(" Texture alignment: %zu bytes\n", deviceProp.textureAlignment);
        printf(" Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 2000
        printf(" Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 2020
        printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf(" Integrated: %s\n", deviceProp.integrated ? "Yes" : "No");
        printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf(" Compute mode: %s\n", deviceProp.computeMode == cudaComputeModeDefault ?
            "Default (multiple host threads can use this device simultaneously)" :
            deviceProp.computeMode == cudaComputeModeExclusive ?
            "Exclusive (only one host thread at a time can use this device)" :
            deviceProp.computeMode == cudaComputeModeProhibited ?
            "Prohibited (no host thread can use this device)" :
            "Unknown");
#endif
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[10];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, driverVersion % 100);
#else
    sprintf(cTemp, "%d.%d", driverVersion / 1000, driverVersion % 100);
#endif
    sProfileString += cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, runtimeVersion % 100);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion / 1000, runtimeVersion % 100);
#endif
    sProfileString += cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;

    // First 2 device names, if any
    for (dev = 0; dev < ((deviceCount > 2) ? 2 : deviceCount); ++dev) {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += ", Device = ";
        sProfileString += deviceProp.name;
    }
    sProfileString += "\n";
    // shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str());

    // finish
    //printf("\n\nPASSED\n");
    //Log* pLog = Ogre::LogManager::getSingleton().getDefaultLog();
    cudaError_t err = cudaSuccess;
    return err;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
//
// Newer SDK deviceQuery: enumerates all CUDA devices via the runtime API,
// prints their properties, then emits a CSV-style summary line. Exits with
// EXIT_FAILURE when cudaGetDeviceCount() fails, EXIT_SUCCESS otherwise.
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess) {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        exit(EXIT_FAILURE);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0) {
        printf("There are no available device(s) that support CUDA\n");
    } else {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

    int dev, driverVersion = 0, runtimeVersion = 0;
    for (dev = 0; dev < deviceCount; ++dev) {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        // Console log; versions are encoded as 1000*major + 10*minor
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
        printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
        sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        printf("%s", msg);

        printf(" (%2d) Multiprocessors x (%3d) CUDA Cores/MP: %d CUDA Cores\n", deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor), _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf(" GPU Clock rate: %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);

#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
        printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
        if (deviceProp.l2CacheSize) {
            printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
        }
#else
        // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf(" Memory Bus Width: %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
        if (L2CacheSize) {
            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
        }
#endif

        printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
        // NOTE(review): %lu for these size_t fields is fine on LP64 but
        // mismatched on LLP64 (64-bit Windows) — confirm target platforms.
        printf(" Total amount of constant memory: %lu bytes\n", deviceProp.totalConstMem);
        printf(" Total amount of shared memory per block: %lu bytes\n", deviceProp.sharedMemPerBlock);
        printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf(" Warp size: %d\n", deviceProp.warpSize);
        printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf(" Maximum memory pitch: %lu bytes\n", deviceProp.memPitch);
        printf(" Texture alignment: %lu bytes\n", deviceProp.textureAlignment);
        printf(" Concurrent copy and kernel execution: %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
        printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
        printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#ifdef WIN32
        printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
        printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID);

        // NOTE(review): assumes computeMode never exceeds the table bounds
        const char *sComputeMode[] = {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        printf(" Compute Mode:\n");
        printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#endif
    sProfileString += cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
    sProfileString += cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;

    // Print Out all device Names
    for (dev = 0; dev < deviceCount; ++dev) {
#ifdef _WIN32
        sprintf_s(cTemp, 13, ", Device%d = ", dev);
#else
        sprintf(cTemp, ", Device%d = ", dev);
#endif
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += cTemp;
        sProfileString += deviceProp.name;
    }
    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    // finish
    exit(EXIT_SUCCESS);
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { //if (nrhs != 1) // mexErrMsgTxt("Wrong number of arguments"); //int dev = (int) mxGetScalar(prhs[0]); int deviceCount; cudaGetDeviceCount(&deviceCount); // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) mexErrMsgTxt("There is no device supporting CUDA\n"); int dev; int devStart = 0; int devStop = deviceCount; if (nrhs==1) { devStart = (int) mxGetScalar(prhs[0]); devStop = devStart+1; if (devStart >= deviceCount) mexErrMsgTxt("Please specify a valid GPU device.\n"); } for (dev = devStart; dev < devStop; ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); if (dev == devStart) { // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present if (deviceProp.major == 9999 && deviceProp.minor == 9999) mexErrMsgTxt("There is no device supporting CUDA.\n"); else if (deviceCount == 1) printf("There is 1 device supporting CUDA\n"); else printf("There are %d devices supporting CUDA\n", deviceCount); #if CUDART_VERSION >= 2020 int driverVersion = 0, runtimeVersion = 0; cudaDriverGetVersion(&driverVersion); printf("CUDA Driver Version: %d.%d\n", driverVersion/1000, driverVersion%100); cudaRuntimeGetVersion(&runtimeVersion); printf("CUDA Runtime Version: %d.%d\n", runtimeVersion/1000, runtimeVersion%100); #endif } printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); printf(" CUDA Capability Major revision number: %d\n", deviceProp.major); printf(" CUDA Capability Minor revision number: %d\n", deviceProp.minor); printf(" Total amount of global memory: %u bytes\n", deviceProp.totalGlobalMem); //#if CUDART_VERSION >= 2000 // printf(" Number of multiprocessors: %d\n", // deviceProp.multiProcessorCount); // printf(" Number of cores: %d\n", 8 // * deviceProp.multiProcessorCount); //#endif printf(" Total amount of constant memory: %u bytes\n", deviceProp.totalConstMem); printf(" Total amount of 
shared memory per block: %u bytes\n", deviceProp.sharedMemPerBlock); printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock); printf(" Warp size: %d\n", deviceProp.warpSize); printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); printf(" Maximum memory pitch: %u bytes\n", deviceProp.memPitch); printf(" Texture alignment: %u bytes\n", deviceProp.textureAlignment); printf(" Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f); #if CUDART_VERSION >= 2000 printf(" Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No"); #endif #if CUDART_VERSION >= 2020 printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); printf(" Integrated: %s\n", deviceProp.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); printf(" Compute mode: %s\n", deviceProp.computeMode == cudaComputeModeDefault ? "Default (multiple host threads can use this device simultaneously)" : deviceProp.computeMode == cudaComputeModeExclusive ? "Exclusive (only one host thread at a time can use this device)" : deviceProp.computeMode == cudaComputeModeProhibited ? "Prohibited (no host thread can use this device)" : "Unknown"); #endif } }
void pcl::gpu::printCudaDeviceInfo(int device) { int count = getCudaEnabledDeviceCount(); bool valid = (device >= 0) && (device < count); int beg = valid ? device : 0; int end = valid ? device+1 : count; printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); printf("Device count: %d\n", count); int driverVersion = 0, runtimeVersion = 0; cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); const char *computeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", "Prohibited (no host thread can use ::cudaSetDevice() with this device)", "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", "Unknown", NULL }; for(int dev = beg; dev < end; ++dev) { cudaDeviceProp prop; cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); int sm_cores = convertSMVer2Cores(prop.major, prop.minor); printf("\nDevice %d: \"%s\"\n", dev, prop.name); printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, sm_cores, sm_cores * prop.multiProcessorCount); printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); #if (CUDART_VERSION >= 4000) // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output int memoryClock, memBusWidth, L2CacheSize; getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); 
getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev ); getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev ); printf(" Memory Clock rate: %.2f Mhz\n", memoryClock * 1e-3f); printf(" Memory Bus Width: %d-bit\n", memBusWidth); if (L2CacheSize) printf(" L2 Cache Size: %d bytes\n", L2CacheSize); printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); #endif printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); printf(" Warp size: %d\n", prop.warpSize); printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); #if CUDART_VERSION >= 4000 printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); #else printf(" Concurrent copy and execution: %s\n", prop.deviceOverlap ? "Yes" : "No"); #endif printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? 
"Yes" : "No"); printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? "Yes" : "No"); printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); #if CUDART_VERSION >= 4000 printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); #endif printf(" Compute Mode:\n"); printf(" %s \n", computeMode[prop.computeMode]); } printf("\n"); printf("deviceQuery, CUDA Driver = CUDART"); printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); printf(", NumDevs = %d\n\n", count); fflush(stdout); }
// Check if there is a device supporting CUDA void GetCUDADeviceFlags() { int deviceCount; bool archi_13 = false; cudaGetDeviceCount(&deviceCount); // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) printf("There is no device supporting CUDA\n"); int dev = 0; for (dev = 0; dev < 1/*deviceCount/*HACK : we want the first one only for now*/; ++dev) { printf("\nFirst device is selected by default.\n\n"); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); if (dev == 0) { // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present if (deviceProp.major == 9999 && deviceProp.minor == 9999) printf("There is no device supporting CUDA.\n"); else if (deviceCount == 1) printf("There is 1 device supporting CUDA\n"); else printf("There are %d devices supporting CUDA\n", deviceCount); if ((deviceProp.major * 10 + deviceProp.minor) >= 13 ) { // OK on peut faire du double et rajouter des optimizations archi_13 = true; } #if CUDART_VERSION >= 2000 if (archi_13){ fb.add_def("MAX_THREADS", deviceProp.multiProcessorCount*1024); } else { fb.add_def("MAX_THREADS", deviceProp.multiProcessorCount*768); } #endif fb.add_def("WARP_SIZE", deviceProp.warpSize); fb.add_flag("USE_CRS_SHARED"); fb.add_flag("USE_TEXTURE"); } printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); #if CUDART_VERSION >= 2020 int driverVersion = 0, runtimeVersion = 0; cudaDriverGetVersion(&driverVersion); printf(" CUDA Driver Version: %d.%d\n", driverVersion/1000, driverVersion%100); cudaRuntimeGetVersion(&runtimeVersion); printf(" CUDA Runtime Version: %d.%d\n", runtimeVersion/1000, runtimeVersion%100); #endif printf(" CUDA Capability Major revision number: %d\n", deviceProp.major); printf(" CUDA Capability Minor revision number: %d\n", deviceProp.minor); printf(" Total amount of global memory: %u bytes\n", deviceProp.totalGlobalMem); #if CUDART_VERSION >= 2000 printf(" Number of multiprocessors: %d\n", 
deviceProp.multiProcessorCount); printf(" Number of cores: %d\n", 8 * deviceProp.multiProcessorCount); #endif printf(" Total amount of constant memory: %u bytes\n", deviceProp.totalConstMem); printf(" Total amount of shared memory per block: %u bytes\n", deviceProp.sharedMemPerBlock); printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock); printf(" Warp size: %d\n", deviceProp.warpSize); printf(" Warp size: %d\n", deviceProp.warpSize); printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); printf(" Maximum memory pitch: %u bytes\n", deviceProp.memPitch); printf(" Texture alignment: %u bytes\n", deviceProp.textureAlignment); printf(" Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f); #if CUDART_VERSION >= 2000 printf(" Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No"); #endif #if CUDART_VERSION >= 2020 printf(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); printf(" Integrated: %s\n", deviceProp.integrated ? "Yes" : "No"); printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); printf(" Compute mode: %s\n", deviceProp.computeMode == cudaComputeModeDefault ? "Default (multiple host threads can use this device simultaneously)" : deviceProp.computeMode == cudaComputeModeExclusive ? "Exclusive (only one host thread at a time can use this device)" : deviceProp.computeMode == cudaComputeModeProhibited ? "Prohibited (no host thread can use this device)" : "Unknown"); #endif } }
int cudaDeviceInfo(void) { int deviceCount = 0; if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) { Scierror(999, "\ncudaGetDeviceCount FAILED CUDA Driver and Runtime version may be mismatched.\n\n"); return 1; } sciprint("Starting...\n\n"); sciprint(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n"); // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) { sciprint("There is no device supporting CUDA\n"); } int dev = 0; int driverVersion = 0, runtimeVersion = 0; for (dev = 0; dev < deviceCount; ++dev) { cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, dev); if (dev == 0) { // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present if (deviceProp.major == 9999 && deviceProp.minor == 9999) { sciprint("There is no device supporting CUDA.\n"); } else if (deviceCount == 1) { sciprint("There is 1 device supporting CUDA\n"); } else { sciprint("There are %d devices supporting CUDA\n", deviceCount); } } sciprint("\nDevice %d: \"%s\"\n", dev, deviceProp.name); #if CUDART_VERSION >= 2020 // Console log cudaDriverGetVersion(&driverVersion); sciprint(" CUDA Driver Version: %d.%d\n", driverVersion / 1000, driverVersion % 100); cudaRuntimeGetVersion(&runtimeVersion); sciprint(" CUDA Runtime Version: %d.%d\n", runtimeVersion / 1000, runtimeVersion % 100); #endif sciprint(" CUDA Capability Major revision number: %d\n", deviceProp.major); sciprint(" CUDA Capability Minor revision number: %d\n", deviceProp.minor); sciprint(" Total amount of global memory: %u bytes\n", deviceProp.totalGlobalMem); #if CUDART_VERSION >= 2000 sciprint(" Number of multiprocessors: %d\n", deviceProp.multiProcessorCount); // sciprint(" Number of cores: %d\n", nGpuArchCoresPerSM[deviceProp.major] * deviceProp.multiProcessorCount); #endif sciprint(" Total amount of constant memory: %u bytes\n", deviceProp.totalConstMem); sciprint(" Total amount of shared memory per block: %u bytes\n", 
deviceProp.sharedMemPerBlock); sciprint(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock); sciprint(" Warp size: %d\n", deviceProp.warpSize); sciprint(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); sciprint(" Maximum sizes of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]); sciprint(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]); sciprint(" Maximum memory pitch: %u bytes\n", deviceProp.memPitch); sciprint(" Texture alignment: %u bytes\n", deviceProp.textureAlignment); sciprint(" Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f); #if CUDART_VERSION >= 2000 sciprint(" Concurrent copy and execution: %s\n", deviceProp.deviceOverlap ? "Yes" : "No"); #endif #if CUDART_VERSION >= 2020 sciprint(" Run time limit on kernels: %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No"); sciprint(" Integrated: %s\n", deviceProp.integrated ? "Yes" : "No"); sciprint(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No"); sciprint(" Compute mode: %s\n", deviceProp.computeMode == cudaComputeModeDefault ? "Default (multiple host threads can use this device simultaneously)" : deviceProp.computeMode == cudaComputeModeExclusive ? "Exclusive (only one host thread at a time can use this device)" : deviceProp.computeMode == cudaComputeModeProhibited ? "Prohibited (no host thread can use this device)" : "Unknown"); #endif #if CUDART_VERSION >= 3000 sciprint(" Concurrent kernel execution: %s\n", deviceProp.concurrentKernels ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3010 sciprint(" Device has ECC support enabled: %s\n", deviceProp.ECCEnabled ? "Yes" : "No"); #endif } // finish sciprint("\n\nPASSED\n"); return 0; }
static void device_query() { int deviceCount = 0, device; cudaError_t status = (cudaError_t)0; struct cudaDeviceProp deviceProperties; int driverVersion = 0, runtimeVersion = 0; if ((status = cudaGetDeviceCount(&deviceCount)) != cudaSuccess) { fprintf(stderr, "cudaGetDeviceCount() FAILED, status = %d (%s)\n", status, cudaGetErrorString(status)); exit(1); } if (deviceCount == 0) { printf("There are no hardware devices which support CUDA\n"); } else { printf("There %s %d CUDA capable hardware device%s\n", deviceCount == 1 ? "is" : "are", deviceCount, deviceCount > 1 ? "s" : ""); } if ((status = cudaDriverGetVersion(&driverVersion)) != cudaSuccess) { fprintf(stderr, "cudaDriverGetVersion() FAILED, status = %d (%s)\n", status, cudaGetErrorString(status)); exit(1); } else { printf("CUDA driver version: %d.%d\n", driverVersion / 1000, driverVersion % 100); } if ((status = cudaRuntimeGetVersion(&runtimeVersion)) != cudaSuccess) { fprintf(stderr, "cudaRuntimeGetVersion() FAILED, status = %d (%s)\n", status, cudaGetErrorString(status)); exit(1); } else { printf("CUDA runtime version: %d.%d\n", runtimeVersion / 1000, runtimeVersion % 100); } for (device = 0; device < deviceCount; ++device) { if ((status = cudaGetDeviceProperties(&deviceProperties, device)) != cudaSuccess) { fprintf(stderr, "cudaGetDeviceProperties() FAILED, status = %d (%s)\n", status, cudaGetErrorString(status)); exit(1); } printf("Device %d:\n", device); printf("\tname = %s\n", deviceProperties.name); printf("\tCUDA capability major.minor version = %d.%d\n", deviceProperties.major, deviceProperties.minor); printf("\tmultiProcessorCount = %d\n", deviceProperties.multiProcessorCount); printf("\ttotalGlobalMem = %ld bytes\n", (long)deviceProperties.totalGlobalMem); printf("\tsharedMemPerBlock = %d bytes\n", (int)deviceProperties.sharedMemPerBlock); printf("\tregsPerBlock = %d\n", deviceProperties.regsPerBlock); printf("\twarpSize = %d\n", deviceProperties.warpSize); printf("\tmemPitch = %d bytes\n", 
(int)deviceProperties.memPitch); printf("\tmaxThreadsPerBlock = %d\n", deviceProperties.maxThreadsPerBlock); printf("\tmaxThreadsDim = %d x %d x %d\n", deviceProperties.maxThreadsDim[0], deviceProperties.maxThreadsDim[1], deviceProperties.maxThreadsDim[2]); printf("\tmaxGridSize = %d x %d x %d\n", deviceProperties.maxGridSize[0], deviceProperties.maxGridSize[1], deviceProperties.maxGridSize[2]); printf("\n"); printf("\tmemPitch = %ld bytes\n", (long)deviceProperties.memPitch); printf("\ttextureAlignment = %ld bytes\n", (long)deviceProperties.textureAlignment); printf("\tclockRate = %.2f GHz\n", deviceProperties.clockRate * 1e-6f); #if CUDART_VERSION >= 2000 printf("\tdeviceOverlap = %s\n", deviceProperties.deviceOverlap ? "Yes" : "No"); #endif #if CUDART_VERSION >= 2020 printf("\tkernelExecTimeoutEnabled = %s\n", deviceProperties.kernelExecTimeoutEnabled ? "Yes" : "No"); printf("\tintegrated = %s\n", deviceProperties.integrated ? "Yes" : "No"); printf("\tcanMapHostMemory = %s\n", deviceProperties.canMapHostMemory ? "Yes" : "No"); printf("\tcomputeMode = %s\n", deviceProperties.computeMode == cudaComputeModeDefault ? "Default (multiple host threads can use this device simultaneously)" : deviceProperties.computeMode == cudaComputeModeExclusive ? "Exclusive (only one host thread at a time can use this device)" : deviceProperties.computeMode == cudaComputeModeProhibited ? "Prohibited (no host thread can use this device)" : "Unknown"); #endif #if CUDART_VERSION >= 3000 printf("\tconcurrentKernels = %s\n", deviceProperties.concurrentKernels ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3010 printf("\tECCEnabled = %s\n", deviceProperties.ECCEnabled ? "Yes" : "No"); #endif #if CUDART_VERSION >= 3020 printf("\ttccDriver = %s\n", deviceProperties.tccDriver ? "Yes" : "No"); #endif printf("\n"); } }