Ejemplo n.º 1
1
void pcl::gpu::printShortCudaDeviceInfo(int device)
{
    int count = getCudaEnabledDeviceCount();
    bool valid = (device >= 0) && (device < count);

    int beg = valid ? device   : 0;
    int end = valid ? device+1 : count;

    int driverVersion = 0, runtimeVersion = 0;
    cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
    cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );

    for(int dev = beg; dev < end; ++dev)
    {                
        cudaDeviceProp prop;
        cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );

        const char *arch_str = prop.major < 2 ? " (pre-Fermi)" : "";
        printf("[pcl::gpu::printShortCudaDeviceInfo] : Device %d:  \"%s\"  %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);
        printf(", sm_%d%d%s, %d cores", prop.major, prop.minor, arch_str, convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount);                
        printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
    }
    fflush(stdout);
}
Ejemplo n.º 2
0
// CUDA Devices on the System
int cuda_num_devices()
{
	int version;
	cudaError_t err = cudaDriverGetVersion(&version);
	if (err != cudaSuccess)
	{
		applog(LOG_ERR, "Unable to query CUDA driver version! Is an nVidia driver installed?");
		exit(1);
	}

	int maj = version / 1000, min = version % 100; // same as in deviceQuery sample
	if (maj < 5 || (maj == 5 && min < 5))
	{
		applog(LOG_ERR, "Driver does not support CUDA %d.%d API! Update your nVidia driver!", 5, 5);
		exit(1);
	}

	int GPU_N;
	err = cudaGetDeviceCount(&GPU_N);
	if (err != cudaSuccess)
	{
		applog(LOG_ERR, "Unable to query number of CUDA devices! Is an nVidia driver installed?");
		exit(1);
	}
	return GPU_N;
}
Ejemplo n.º 3
0
TEST(StreamQuery, InvalidStream) {
    ::testing::FLAGS_gtest_death_test_style = "threadsafe";

    cudaError_t ret;
    cudaStream_t stream;

    /* The CUDA 5.0 driver no longer segfaults. */
    int driver;
    ret = cudaDriverGetVersion(&driver);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);

    if (driver >= 5000) {
        ret = cudaStreamQuery(stream);
        EXPECT_EQ(cudaErrorUnknown, ret);
    } else {
        EXPECT_EXIT({
            cudaStreamQuery(stream); },
            ::testing::KilledBySignal(SIGSEGV), "");
    }
Ejemplo n.º 4
0
oskar_CudaInfo* oskar_cuda_info_create(int* status)
{
    oskar_CudaInfo* info;
    int i;

    /* Allocate index. */
    info = (oskar_CudaInfo*) calloc(1, sizeof(oskar_CudaInfo));

    /* Get the runtime version and the driver version. */
    cudaDriverGetVersion(&info->driver_version);
    cudaRuntimeGetVersion(&info->runtime_version);

    /* Query the number of devices in the system. */
    *status = cudaGetDeviceCount(&info->num_devices);
    if (*status != cudaSuccess || info->num_devices == 0)
    {
        fprintf(stderr, "Unable to determine number of CUDA devices: %s\n",
                cudaGetErrorString((cudaError_t)(*status)));
        return info;
    }

    /* Allocate array big enough. */
    info->device = (oskar_CudaDeviceInfo*) calloc(info->num_devices,
                   sizeof(oskar_CudaDeviceInfo));

    /* Populate device array. */
    for (i = 0; i < info->num_devices; ++i)
    {
        oskar_cuda_device_info_scan(&(info->device[i]), i);
    }
    return info;
}
Ejemplo n.º 5
0
// CUDAが使えるかチェック
Waifu2x::eWaifu2xCudaError Waifu2x::can_use_CUDA()
{
	static eWaifu2xCudaError CudaFlag = eWaifu2xCudaError_NotFind;
	std::call_once(waifu2x_cuda_once_flag, [&]()
	{
		int driverVersion = 0;
		if (cudaDriverGetVersion(&driverVersion) == cudaSuccess)
		{
			if (driverVersion > 0)
			{
				int runtimeVersion;
				if (cudaRuntimeGetVersion(&runtimeVersion) == cudaSuccess)
				{
					if (runtimeVersion >= MinCudaDriverVersion && driverVersion >= runtimeVersion)
						CudaFlag = eWaifu2xCudaError_OK;
					else
						CudaFlag = eWaifu2xCudaError_OldVersion;
				}
				else
					CudaFlag = eWaifu2xCudaError_NotFind;
			}
			else
				CudaFlag = eWaifu2xCudaError_NotFind;
		}
		else
			CudaFlag = eWaifu2xCudaError_NotFind;
	});

	return CudaFlag;
}
Ejemplo n.º 6
0
Caffe::Properties::Properties() :
      init_time_(std::time(nullptr)),
      main_thread_id_(std::this_thread::get_id()),
      caffe_version_(AS_STRING(CAFFE_VERSION)) {
#ifndef CPU_ONLY
  int count = 0;
  CUDA_CHECK(cudaGetDeviceCount(&count));
  compute_capabilities_.resize(count);
  cudaDeviceProp device_prop;
  for (int gpu = 0; gpu < compute_capabilities_.size(); ++gpu) {
    CUDA_CHECK(cudaGetDeviceProperties(&device_prop, gpu));
    compute_capabilities_[gpu] = device_prop.major * 100 + device_prop.minor;
    DLOG(INFO) << "GPU " << gpu << " '" << device_prop.name << "' has compute capability "
        << device_prop.major << "." << device_prop.minor;
  }
#ifdef USE_CUDNN
  cudnn_version_ =
      AS_STRING(CUDNN_MAJOR) "." AS_STRING(CUDNN_MINOR) "." AS_STRING(CUDNN_PATCHLEVEL);
#else
  cudnn_version_ = "USE_CUDNN is not defined";
#endif
  int cublas_version = 0;
  CUBLAS_CHECK(cublasGetVersion(Caffe::cublas_handle(), &cublas_version));
  cublas_version_ = std::to_string(cublas_version);

  int cuda_version = 0;
  CUDA_CHECK(cudaRuntimeGetVersion(&cuda_version));
  cuda_version_ = std::to_string(cuda_version);

  int cuda_driver_version = 0;
  CUDA_CHECK(cudaDriverGetVersion(&cuda_driver_version));
  cuda_driver_version_ = std::to_string(cuda_driver_version);
#endif
}
Ejemplo n.º 7
0
int
main(int argc, char *argv[])
{
 int ver, major;
 cudaDriverGetVersion(&ver);
 major = ver/1000;
 return(major < 5);
}
Ejemplo n.º 8
0
void print_info()
{    
    printf("\n");
#if defined _WIN32
#   if defined _WIN64
        puts("OS: Windows 64");
#   else
        puts("OS: Windows 32");
#   endif
#elif defined linux
#   if defined _LP64
        puts("OS: Linux 64");
#   else
        puts("OS: Linux 32");
#   endif
#elif defined __APPLE__
#   if defined _LP64
        puts("OS: Apple 64");
#   else
        puts("OS: Apple 32");
#   endif
#endif

    int deviceCount = cv::gpu::getCudaEnabledDeviceCount();
    int driver;
    cudaDriverGetVersion(&driver);

    printf("CUDA Driver  version: %d\n", driver);        
    printf("CUDA Runtime version: %d\n", CUDART_VERSION);    
    printf("CUDA device count: %d\n\n", deviceCount);
    

    for (int i = 0; i < deviceCount; ++i)
    {
        cv::gpu::DeviceInfo info(i);
        printf("Device %d:\n", i);
        printf("    Name: %s\n", info.name().c_str());
        printf("    Compute capability version: %d.%d\n", info.majorVersion(), info.minorVersion());
        printf("    Total memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0));
        printf("    Free  memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0));
        if (info.isCompatible())
            puts("    This device is compatible with current GPU module build\n");
        else
            puts("    This device is NOT compatible with current GPU module build\n");
    }
    
    puts("GPU module was compiled for the following GPU archs:");
    printf("    BIN: %s\n", CUDA_ARCH_BIN);
    printf("    PTX: %s\n\n", CUDA_ARCH_PTX);
}
Ejemplo n.º 9
0
bool printfNPPinfo(int argc, char *argv[], int cudaVerMajor, int cudaVerMinor)
{
    const NppLibraryVersion *libVer   = nppGetLibVersion();

    printf("NPP Library Version %d.%d.%d\n", libVer->major, libVer->minor, libVer->build);

	int driverVersion, runtimeVersion;
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);

	printf("  CUDA Driver  Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
	printf("  CUDA Runtime Version: %d.%d\n", runtimeVersion/1000, (runtimeVersion%100)/10);

	bool bVal = checkCudaCapabilities(cudaVerMajor, cudaVerMinor);
	return bVal;
}
void DialogSelectHardware::ChangeText(int indexDevice)
{
    int  driverVersion = 0, runtimeVersion = 0;
    cudaSetDevice(indexDevice);
    cudaGetDeviceProperties(deviceProp, indexDevice);
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);

    char msg[256];
    SPRINTF(msg,"%.0f MBytes (%llu bytes)\n",
            (float)deviceProp->totalGlobalMem/1048576.0f, (unsigned long long) deviceProp->totalGlobalMem);

    ui->tableWidget->clear();
    addItem(QString ("Device "+QString::number(indexDevice).append(" : ")+ deviceProp->name),0,0);
    addItem((selectDevice == indexDevice) ? "Dispositivo Seleccionado " : " ",0,1);
    addItem("CUDA Driver Version / Runtime Version",1,0);
    addItem(QString ("%1.%2  /  %3.%4").arg(driverVersion/1000).arg((driverVersion%100)/10).arg( runtimeVersion/1000).arg((runtimeVersion%100)/10),1,1);
    addItem("CUDA Capability Major/Minor version number: ",2,0);
    addItem(QString ("%1.%2").arg(deviceProp->major).arg(deviceProp->minor),2,1);
    addItem("Total amount of global memory:",3,0);
    addItem(msg,3,1);
    addItem(QString ("(%1) Multiprocessors, (%2) CUDA Cores/MP:%3 CUDA Cores").arg( deviceProp->multiProcessorCount).arg( _ConvertSMVer2Cores(deviceProp->major, deviceProp->minor)).arg( _ConvertSMVer2Cores(deviceProp->major, deviceProp->minor) * deviceProp->multiProcessorCount),4,0);
    addItem("Total amount of constant memory:",5,0);
    addItem(QString ("%1 bytes").arg(deviceProp->totalConstMem),5,1);
    addItem("Total amount of shared memory per block:",6,0);
    addItem(QString ("%1 bytes").arg(deviceProp->sharedMemPerBlock),6,1);
    addItem("Total number of registers available per block:",7,0);
    addItem(QString ("%1").arg(deviceProp->regsPerBlock),7,1);
    addItem("Warp size:",8,0);
    addItem(QString ("%1").arg(deviceProp->warpSize),8,1);
    addItem("Maximum number of threads per multiprocessor:",9,0);
    addItem(QString ("%1").arg(deviceProp->maxThreadsPerMultiProcessor),9,1);
    addItem("Maximum number of threads per block:",10,0);
    addItem(QString ("%1").arg(deviceProp->maxThreadsPerBlock),10,1);
    addItem("Max dimension size of a thread block (x,y,z):",11,0);
    addItem(QString ("(%1, %2, %3)").arg(deviceProp->maxThreadsDim[0]).arg(  deviceProp->maxThreadsDim[1]).arg(  deviceProp->maxThreadsDim[2]),11,1);
    addItem("Max dimension size of a grid size    (x,y,z):",12,0);
    addItem(QString ("(%1, %2, %3)\n").arg(deviceProp->maxGridSize[0]).arg(deviceProp->maxGridSize[1]).arg(deviceProp->maxGridSize[2]),12,1);
    addItem("Run time limit on kernels: ",13,0);
    addItem(QString ("%1\n").arg(deviceProp->kernelExecTimeoutEnabled ? "Yes" : "No"),13,1);
    addItem("Integrated GPU sharing Host Memory: ",14,0);
    addItem( QString ("%1\n").arg(deviceProp->integrated ? "Yes" : "No"),14,1);

    ui->tableWidget->resizeColumnsToContents();
    ui->tableWidget->resizeRowsToContents();
}
Ejemplo n.º 11
0
		void cuda_running_configuration::update_parameters()
		{
	        cuda_safe_call(cudaDriverGetVersion(&driver_version));
	        cuda_safe_call(cudaRuntimeGetVersion(&runtime_version));

			int device_count;
		    cuda_safe_call(cudaGetDeviceCount(&device_count));
			if (device_count <= 0)
				throw neural_network_exception("No CUDA capable devices are found");

			if (device_id >= device_count)
				throw neural_network_exception((boost::format("Device ID %1% specified while %2% devices are available") % device_id % device_count).str());

			cudaDeviceProp device_prop;
			cuda_safe_call(cudaGetDeviceProperties(&device_prop, device_id));
			device_name = device_prop.name;
			compute_capability_major = device_prop.major;
			compute_capability_minor = device_prop.minor;
			clock_rate = device_prop.clockRate;
			memory_clock_rate = device_prop.memoryClockRate;
			memory_bus_width = device_prop.memoryBusWidth;
			global_memory_size = device_prop.totalGlobalMem;
			ecc_enabled = (device_prop.ECCEnabled != 0);
			l2_cache_size = device_prop.l2CacheSize;
			multiprocessor_count = device_prop.multiProcessorCount;
			smem_per_block = device_prop.sharedMemPerBlock;
			max_threads_per_multiprocessor = device_prop.maxThreadsPerMultiProcessor;
			max_threads_per_block = device_prop.maxThreadsPerBlock;
			for(int i = 0; i < sizeof(max_threads_dim) / sizeof(max_threads_dim[0]); ++i)
				max_threads_dim[i] = device_prop.maxThreadsDim[i];
			for(int i = 0; i < sizeof(max_grid_size) / sizeof(max_grid_size[0]); ++i)
				max_grid_size[i] = device_prop.maxGridSize[i];
			max_texture_1d_linear = device_prop.maxTexture1DLinear;
			texture_alignment = device_prop.textureAlignment;
			pci_bus_id = device_prop.pciBusID;
			pci_device_id = device_prop.pciDeviceID;
		#ifdef _WIN32
			tcc_mode = (device_prop.tccDriver != 0);
		#endif

			cuda_safe_call(cudaSetDevice(device_id));

			cublas_safe_call(cublasCreate(&cublas_handle));

			cusparse_safe_call(cusparseCreate(&cusparse_handle));
		}
Ejemplo n.º 12
0
static void printCudaInfo()
{
    printOsInfo();
#ifndef HAVE_CUDA
    printf("[----------]\n[ GPU INFO ] \tOpenCV was built without CUDA support.\n[----------]\n"), fflush(stdout);
#else
    int driver;
    cudaDriverGetVersion(&driver);

    printf("[----------]\n"), fflush(stdout);
    printf("[ GPU INFO ] \tCUDA Driver  version: %d.\n", driver), fflush(stdout);
    printf("[ GPU INFO ] \tCUDA Runtime version: %d.\n", CUDART_VERSION), fflush(stdout);
    printf("[----------]\n"), fflush(stdout);

    printf("[----------]\n"), fflush(stdout);
    printf("[ GPU INFO ] \tGPU module was compiled for the following GPU archs.\n"), fflush(stdout);
    printf("[      BIN ] \t%s.\n", CUDA_ARCH_BIN), fflush(stdout);
    printf("[      PTX ] \t%s.\n", CUDA_ARCH_PTX), fflush(stdout);
    printf("[----------]\n"), fflush(stdout);

    printf("[----------]\n"), fflush(stdout);
    int deviceCount = cv::gpu::getCudaEnabledDeviceCount();
    printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount), fflush(stdout);
    printf("[----------]\n"), fflush(stdout);

    for (int i = 0; i < deviceCount; ++i)
    {
        cv::gpu::DeviceInfo info(i);

        printf("[----------]\n"), fflush(stdout);
        printf("[ DEVICE   ] \t# %d %s.\n", i, info.name().c_str()), fflush(stdout);
        printf("[          ] \tCompute capability: %d.%d\n", (int)info.majorVersion(), (int)info.minorVersion()), fflush(stdout);
        printf("[          ] \tMulti Processor Count:  %d\n", info.multiProcessorCount()), fflush(stdout);
        printf("[          ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)), fflush(stdout);
        printf("[          ] \tFree  memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory()  / 1024.0) / 1024.0)), fflush(stdout);
        if (!info.isCompatible())
            printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n");
        printf("[----------]\n"), fflush(stdout);
    }

#endif
}
Ejemplo n.º 13
0
char CudaBase::CheckCUDevice()
{
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
            std::cout << "Cannot find CUDA device!";
        return 0;
    }
    
    if(deviceCount>0) {
            std::cout << "Found " << deviceCount << " device(s)\n";
            int driverVersion = 0, runtimeVersion = 0;
            cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
            cudaDriverGetVersion(&driverVersion);
            cudaRuntimeGetVersion(&runtimeVersion);
            std::cout << "  Device name: " << deviceProp.name<<"\n";
            std::cout << "  Diver Version: " << driverVersion<<"\n";
            std::cout << "  Runtime Version: " << runtimeVersion<<"\n";
            std::cout << "  Capability Major/Minor version number: "<<deviceProp.major<<"."<<deviceProp.minor<<"\n";
            std::cout << "  Total amount of global memory: "<<(unsigned long long)deviceProp.totalGlobalMem<<" bytes\n";
            std::cout << "  Total amount of constant memory: "<<deviceProp.totalConstMem<<"bytes\n"; 
            std::cout << "  Total amount of shared memory per block: "<<deviceProp.sharedMemPerBlock<<" bytes\n";
            std::cout << "  Total number of registers available per block: "<<deviceProp.regsPerBlock<<"\n";
			std::cout << "  Warp size: "<<deviceProp.warpSize<<"\n";
        
            std::stringstream sst;
            sst<<"  Maximum sizes of each dimension of a grid: "<<deviceProp.maxGridSize[0]<<" x "<<deviceProp.maxGridSize[1]<<" x "<<deviceProp.maxGridSize[2];
            std::cout<<sst.str()<<"\n";
            sst.str("");
            sst<<"  Maximum sizes of each dimension of a block: "<<deviceProp.maxThreadsDim[0]<<" x "<<deviceProp.maxThreadsDim[1]<<" x "<<deviceProp.maxThreadsDim[2];
            std::cout<<sst.str()<<"\n";
            std::cout << "  Maximum number of threads per block: " << deviceProp.maxThreadsPerBlock<<"\n";
            
            MaxThreadPerBlock = deviceProp.maxThreadsPerBlock;
            MaxRegisterPerBlock = deviceProp.regsPerBlock;
            MaxSharedMemoryPerBlock = deviceProp.sharedMemPerBlock;
			WarpSize = deviceProp.warpSize;
			RuntimeVersion = runtimeVersion;
        return 1;               
    }
    return 0;
}
Ejemplo n.º 14
0
void oskar_device_get_info_cuda(oskar_Device* device)
{
#ifdef OSKAR_HAVE_CUDA
    struct cudaDeviceProp prop;
    cudaDriverGetVersion(&device->cuda_driver_version);
    cudaRuntimeGetVersion(&device->cuda_runtime_version);
    cudaGetDeviceProperties(&prop, device->index);
    device->name = (char*) realloc(device->name, 1 + strlen(prop.name));
    device->vendor = (char*) realloc(device->vendor, 1 + strlen("NVIDIA"));
    strcpy(device->name, prop.name);
    strcpy(device->vendor, "NVIDIA");
    device->is_nv = 1;
    device->platform_type = 'C';
    device->device_type = 'G';
    device->compute_capability[0] = prop.major;
    device->compute_capability[1] = prop.minor;
    device->supports_double = 0;
    if (prop.major >= 2 || prop.minor >= 3)
        device->supports_double = 1;
    device->supports_atomic32 = 1;
    device->supports_atomic64 = 1;
    device->global_mem_cache_size = (size_t) prop.l2CacheSize;
    device->local_mem_size = prop.sharedMemPerBlock;
    device->max_work_group_size = (size_t) prop.maxThreadsPerBlock;
    device->max_local_size[0] = prop.maxThreadsDim[0];
    device->max_local_size[1] = prop.maxThreadsDim[1];
    device->max_local_size[2] = prop.maxThreadsDim[2];
    device->max_compute_units = prop.multiProcessorCount;
    device->max_clock_freq_kHz = prop.clockRate;
    device->memory_clock_freq_kHz = prop.memoryClockRate;
    device->memory_bus_width = prop.memoryBusWidth;
    device->num_registers = (unsigned int) prop.regsPerBlock;
    device->warp_size = prop.warpSize;
    cudaMemGetInfo(&device->global_mem_free_size, &device->global_mem_size);
#endif
    device->num_cores = device->max_compute_units * oskar_get_num_cuda_cores(
            device->compute_capability[0], device->compute_capability[1]);
    device->init = 1;
}
// CUDAが使えるかチェック
Waifu2x::eWaifu2xCudaError Waifu2x::can_use_CUDA()
{
	static eWaifu2xCudaError CudaFlag = eWaifu2xCudaError_NotFind;
	std::call_once(waifu2x_cuda_once_flag, [&]()
	{
		int driverVersion = 0;
		if (cudaDriverGetVersion(&driverVersion) == cudaSuccess)
		{
			if (driverVersion > 0)
			{
				int runtimeVersion;
				if (cudaRuntimeGetVersion(&runtimeVersion) == cudaSuccess)
				{
					if (runtimeVersion >= MinCudaDriverVersion && driverVersion >= runtimeVersion)
					{
						cudaDeviceProp prop;
						cudaGetDeviceProperties(&prop, 0);
						if (prop.major >= 2)
							CudaFlag = eWaifu2xCudaError_OK;
						else
							CudaFlag = eWaifu2xCudaError_OldDevice;
					}
					else
						CudaFlag = eWaifu2xCudaError_OldVersion;
				}
				else
					CudaFlag = eWaifu2xCudaError_NotFind;
			}
			else
				CudaFlag = eWaifu2xCudaError_NotFind;
		}
		else
			CudaFlag = eWaifu2xCudaError_NotFind;
	});

	return CudaFlag;
}
Ejemplo n.º 16
0
TEST(StreamSynchronize, InvalidStream) {
    ::testing::FLAGS_gtest_death_test_style = "threadsafe";

    cudaStream_t stream;
    cudaError_t ret;

    /* The CUDA 5.0 driver no longer segfaults. */
    int driver;
    ret = cudaDriverGetVersion(&driver);

    if (driver >= 5000) {
        ret = cudaStreamCreate(&stream);
        ASSERT_EQ(cudaSuccess, ret);

        ret = cudaStreamDestroy(stream);
        ASSERT_EQ(cudaSuccess, ret);

        ret = cudaStreamSynchronize(stream);
        EXPECT_EQ(cudaErrorUnknown, ret);
    } else {
        /**
         * Without this compound statement, EXPECT_EXIT fails.
         * Without the EXPECT_EXIT wrapper, SIGSEGV happens.
         *
         * From appearances, it seems that gtest does not properly execute both
         * cudaStreamCreate and cudaStreamDestroy on stream when entering the
         * death test for this lone statement as to properly "initialize"
         * stream.
         */
        EXPECT_EXIT(
            {
                cudaStreamCreate(&stream);
                cudaStreamDestroy(stream);
                cudaStreamSynchronize(stream);
            },
            ::testing::KilledBySignal(SIGSEGV), "");
    }
Ejemplo n.º 17
0
int cudaDeviceInfo(void)
{
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess)
    {
        Scierror(999, "\ncudaGetDeviceCount FAILED CUDA Driver and Runtime version may be mismatched.\n\n");
        return 1;
    }

    sciprint("Starting...\n\n");
    sciprint(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        sciprint("There is no device supporting CUDA\n");
    }

    int dev = 0;
    int driverVersion = 0, runtimeVersion = 0;
    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        if (dev == 0)
        {
            // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
            {
                sciprint("There is no device supporting CUDA.\n");
            }
            else if (deviceCount == 1)
            {
                sciprint("There is 1 device supporting CUDA\n");
            }
            else
            {
                sciprint("There are %d devices supporting CUDA\n", deviceCount);
            }
        }
        sciprint("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

#if CUDART_VERSION >= 2020
        // Console log
        cudaDriverGetVersion(&driverVersion);
        sciprint("  CUDA Driver Version:                           %d.%d\n", driverVersion / 1000, driverVersion % 100);
        cudaRuntimeGetVersion(&runtimeVersion);
        sciprint("  CUDA Runtime Version:                          %d.%d\n", runtimeVersion / 1000, runtimeVersion % 100);
#endif
        sciprint("  CUDA Capability Major revision number:         %d\n", deviceProp.major);
        sciprint("  CUDA Capability Minor revision number:         %d\n", deviceProp.minor);

        sciprint("  Total amount of global memory:                 %u bytes\n", deviceProp.totalGlobalMem);
#if CUDART_VERSION >= 2000
        sciprint("  Number of multiprocessors:                     %d\n", deviceProp.multiProcessorCount);
        //    sciprint("  Number of cores:                               %d\n", nGpuArchCoresPerSM[deviceProp.major] * deviceProp.multiProcessorCount);
#endif
        sciprint("  Total amount of constant memory:               %u bytes\n", deviceProp.totalConstMem);
        sciprint("  Total amount of shared memory per block:       %u bytes\n", deviceProp.sharedMemPerBlock);
        sciprint("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        sciprint("  Warp size:                                     %d\n", deviceProp.warpSize);
        sciprint("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        sciprint("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
                 deviceProp.maxThreadsDim[0],
                 deviceProp.maxThreadsDim[1],
                 deviceProp.maxThreadsDim[2]);
        sciprint("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
                 deviceProp.maxGridSize[0],
                 deviceProp.maxGridSize[1],
                 deviceProp.maxGridSize[2]);
        sciprint("  Maximum memory pitch:                          %u bytes\n", deviceProp.memPitch);
        sciprint("  Texture alignment:                             %u bytes\n", deviceProp.textureAlignment);
        sciprint("  Clock rate:                                    %.2f GHz\n", deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 2000
        sciprint("  Concurrent copy and execution:                 %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 2020
        sciprint("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        sciprint("  Integrated:                                    %s\n", deviceProp.integrated ? "Yes" : "No");
        sciprint("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        sciprint("  Compute mode:                                  %s\n", deviceProp.computeMode == cudaComputeModeDefault ?
                 "Default (multiple host threads can use this device simultaneously)" :
                 deviceProp.computeMode == cudaComputeModeExclusive ?
                 "Exclusive (only one host thread at a time can use this device)" :
                 deviceProp.computeMode == cudaComputeModeProhibited ?
                 "Prohibited (no host thread can use this device)" :
                 "Unknown");
#endif
#if CUDART_VERSION >= 3000
        sciprint("  Concurrent kernel execution:                   %s\n", deviceProp.concurrentKernels ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 3010
        sciprint("  Device has ECC support enabled:                %s\n", deviceProp.ECCEnabled ? "Yes" : "No");
#endif
    }
    // finish
    sciprint("\n\nPASSED\n");
    return 0;
}
Ejemplo n.º 18
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        exit(EXIT_FAILURE);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

    int dev, driverVersion = 0, runtimeVersion = 0;

    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
        sprintf(msg, "  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        printf("%s", msg);

        printf("  (%2d) Multiprocessors x (%3d) CUDA Cores/MP:    %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf("  GPU Clock rate:                                %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);


#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf("  Memory Clock rate:                             %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
        printf("  Memory Bus Width:                              %d-bit\n",   deviceProp.memoryBusWidth);

        if (deviceProp.l2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
        }
#else
        // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        if (L2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }
#endif

        printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
               deviceProp.maxTexture1D   , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1],
               deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);

        printf("  Total amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %lu bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %lu bytes\n", deviceProp.textureAlignment);
        printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
        printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#ifdef WIN32
        printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
        printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf("  Device PCI Bus ID / PCI location ID:           %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID);

        const char *sComputeMode[] =
        {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        printf("  Compute Mode:\n");
        printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#endif
    sProfileString +=  cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
    sProfileString +=  cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;

    // Print Out all device Names
    for (dev = 0; dev < deviceCount; ++dev)
    {
#ifdef _WIN32
    sprintf_s(cTemp, 13, ", Device%d = ", dev);
#else
    sprintf(cTemp, ", Device%d = ", dev);
#endif
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += cTemp;
        sProfileString += deviceProp.name;
    }

    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    // finish
    exit(EXIT_SUCCESS);
}
Ejemplo n.º 19
0
/** Documented at declaration */
int
gpujpeg_init_device(int device_id, int flags)
{
    int dev_count;
    cudaGetDeviceCount(&dev_count);
    if ( dev_count == 0 ) {
        fprintf(stderr, "[GPUJPEG] [Error] No CUDA enabled device\n");
        return -1;
    }

    if ( device_id < 0 || device_id >= dev_count ) {
        fprintf(stderr, "[GPUJPEG] [Error] Selected device %d is out of bound. Devices on your system are in range %d - %d\n",
                device_id, 0, dev_count - 1);
        return -1;
    }

    struct cudaDeviceProp devProp;
    if ( cudaSuccess != cudaGetDeviceProperties(&devProp, device_id) ) {
        fprintf(stderr,
                "[GPUJPEG] [Error] Can't get CUDA device properties!\n"
                "[GPUJPEG] [Error] Do you have proper driver for CUDA installed?\n"
               );
        return -1;
    }

    if ( devProp.major < 1 ) {
        fprintf(stderr, "[GPUJPEG] [Error] Device %d does not support CUDA\n", device_id);
        return -1;
    }

    if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY ) {
        cudaGLSetGLDevice(device_id);
        gpujpeg_cuda_check_error("Enabling OpenGL interoperability");
    }

    if ( flags & GPUJPEG_VERBOSE ) {
        int cuda_driver_version = 0;
        cudaDriverGetVersion(&cuda_driver_version);
        printf("CUDA driver version:   %d.%d\n", cuda_driver_version / 1000, (cuda_driver_version % 100) / 10);

        int cuda_runtime_version = 0;
        cudaRuntimeGetVersion(&cuda_runtime_version);
        printf("CUDA runtime version:  %d.%d\n", cuda_runtime_version / 1000, (cuda_runtime_version % 100) / 10);

        printf("Using Device #%d:       %s (c.c. %d.%d)\n", device_id, devProp.name, devProp.major, devProp.minor);
    }

    cudaSetDevice(device_id);
    gpujpeg_cuda_check_error("Set CUDA device");

    // Test by simple copying that the device is ready
    uint8_t data[] = {8};
    uint8_t* d_data = NULL;
    cudaMalloc((void**)&d_data, 1);
    cudaMemcpy(d_data, data, 1, cudaMemcpyHostToDevice);
    cudaFree(d_data);
    cudaError_t error = cudaGetLastError();
    if ( cudaSuccess != error ) {
        fprintf(stderr, "[GPUJPEG] [Error] Failed to initialize CUDA device.\n");
        if ( flags & GPUJPEG_OPENGL_INTEROPERABILITY )
            fprintf(stderr, "[GPUJPEG] [Info]  OpenGL interoperability is used, is OpenGL context available?\n");
        return -1;
    }

    return 0;
}
QHardwareWidget::QHardwareWidget(QWidget* pParent) :
	QGroupBox(pParent),
	m_MainLayout(),
	m_Devices(),
	m_OptimalDevice()
{
	setTitle("Hardware Selection");
	setStatusTip("Hardware Selection");
	setToolTip("Hardware Selection");

	m_MainLayout.setColumnMinimumWidth(0, 75);
	setLayout(&m_MainLayout);

	int DriverVersion = 0, RuntimeVersion = 0; 

	cudaDriverGetVersion(&DriverVersion);
	cudaRuntimeGetVersion(&RuntimeVersion);

	QString DriverVersionString		= QString::number(DriverVersion / 1000) + "." + QString::number(DriverVersion % 100);
	QString RuntimeVersionString	= QString::number(RuntimeVersion / 1000) + "." + QString::number(RuntimeVersion % 100);

	gStatus.SetStatisticChanged("Graphics Card", "CUDA Driver Version", DriverVersionString);
	gStatus.SetStatisticChanged("Graphics Card", "CUDA Runtime Version", RuntimeVersionString);

	QString VersionInfo;

	VersionInfo += "CUDA Driver Version: " + DriverVersionString;
	VersionInfo += ", CUDA Runtime Version: " + RuntimeVersionString;

	m_MainLayout.addWidget(new QLabel(VersionInfo), 0, 0, 1, 2);
	
	m_MainLayout.addWidget(&m_Devices, 1, 0, 1, 2);

	m_Model.EnumerateDevices();

	m_Devices.horizontalHeader()->setResizeMode(QHeaderView::ResizeToContents);
	m_Devices.horizontalHeader()->setStretchLastSection(true); 
	m_Devices.horizontalHeader()->setDefaultSectionSize(1);
	m_Devices.horizontalHeader()->setDefaultAlignment(Qt::AlignLeft);
	m_Devices.horizontalHeader()->setHighlightSections(false);
	m_Devices.verticalHeader()->setVisible(false);
	m_Devices.verticalHeader()->setDefaultSectionSize(20);
	m_Devices.setSelectionMode(QAbstractItemView::SingleSelection);
	m_Devices.setSelectionBehavior(QAbstractItemView::SelectRows);
	m_Devices.setFixedHeight(75);

	m_Devices.setModel(&m_Model);

	m_OptimalDevice.setText("Optimal Device");
	m_OptimalDevice.setToolTip("Optimal Device");
	m_OptimalDevice.setStatusTip("Choose the most optimal device for rendering");
	m_OptimalDevice.setFixedWidth(90);
	m_OptimalDevice.setVisible(m_Model.rowCount(QModelIndex()) > 1);

	m_MainLayout.addWidget(&m_OptimalDevice);
	
	QObject::connect(&m_OptimalDevice, SIGNAL(clicked()), this, SLOT(OnOptimalDevice()));
	QObject::connect(&m_Devices, SIGNAL(clicked(const QModelIndex&)), this, SLOT(OnSelection(const QModelIndex&)));

	OnOptimalDevice();
}
Ejemplo n.º 21
0
/////////////////////////////////////////////////////////////////////////////
// Device info dump (this code is taken from the SDK's deviceQuery example)
/////////////////////////////////////////////////////////////////////////////
static int dump()
{
    int deviceCount = 0;

    // This function call returns 0 if there are no CUDA capable devices.
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0)
    {
        printf("There is no device supporting CUDA\n");
        return (1);
    }

    int dev = 0;
    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        if (dev == 0)
        {
            // This function call returns 9999 for both major & minor fields,
            // if no CUDA capable devices are present
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
            {
                printf("There is no device supporting CUDA.\n");
                return (1);
            }
            else if (deviceCount == 1)
                printf("There is 1 device supporting CUDA:\n");
            else
                printf("There are %d devices supporting CUDA:\n", deviceCount);
        }
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        int driverVersion = 0, runtimeVersion = 0;
        cudaDriverGetVersion(&driverVersion);
        printf("  CUDA Driver Version:                           %d.%d\n",
               driverVersion/1000, driverVersion%100);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf("  CUDA Runtime Version:                          %d.%d\n",
               runtimeVersion/1000, runtimeVersion%100);

        printf("  CUDA Capability Major revision number:         %d\n",
               deviceProp.major);
        printf("  CUDA Capability Minor revision number:         %d\n",
               deviceProp.minor);

        printf("  Total amount of global memory:                 %u bytes\n",
               (unsigned int)deviceProp.totalGlobalMem);

        printf("  Number of multiprocessors:                     %d\n",
               deviceProp.multiProcessorCount);
        printf("  Number of cores:                               %d\n",
               nGpuArchCoresPerSM[deviceProp.major] *
               deviceProp.multiProcessorCount);

        printf("  Total amount of constant memory:               %u bytes\n",
               (unsigned int)deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %u bytes\n",
               (unsigned int)deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n",
               deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n",
               deviceProp.warpSize);
        printf("  Maximum number of threads per block:           %d\n",
               deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %u bytes\n",
               (unsigned int)deviceProp.memPitch);
        printf("  Texture alignment:                             %u bytes\n",
               (unsigned int)deviceProp.textureAlignment);
        printf("  Clock rate:                                    %.2f GHz\n",
               deviceProp.clockRate * 1e-6f);

        printf("  Concurrent copy and execution:                 %s\n",
               deviceProp.deviceOverlap ? "Yes" : "No");

        printf("  Run time limit on kernels:                     %s\n",
               deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated:                                    %s\n",
               deviceProp.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n",
               deviceProp.canMapHostMemory ? "Yes" : "No");
        printf("  Compute mode:                                  %s\n",
               deviceProp.computeMode == cudaComputeModeDefault ?
               "Default (multiple host threads can use this device "
               "simultaneously)" :
               deviceProp.computeMode == cudaComputeModeExclusive ?
               "Exclusive (only one host thread at a time can use this device)" :
               deviceProp.computeMode == cudaComputeModeProhibited ?
               "Prohibited (no host thread can use this device)" :
               "Unknown");
    }

    return (0);
}
Ejemplo n.º 22
0
cudaError_t WINAPI wine_cudaDriverGetVersion( int *driverVersion ) {
    WINE_TRACE("\n");
    return cudaDriverGetVersion( driverVersion );
}
Ejemplo n.º 23
0
int SimCudaHelper::PrintDevices(int deviceSelected)
{
    int deviceCount = 0;
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess)
    {
        printf("cudaGetDeviceCount FAILED CUDA Driver and Runtime version may be mismatched.\n");
        printf("\nFAILED\n");
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
        printf("There is no device supporting CUDA\n");

    int dev;
    int driverVersion = 0, runtimeVersion = 0;
    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        if (dev == 0)
        {
            // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)
                printf("There is no device supporting CUDA.\n");
            else if (deviceCount == 1)
                printf("There is 1 device supporting CUDA\n");
            else
                printf("There are %d devices supporting CUDA\n", deviceCount);
        }
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

#if CUDART_VERSION >= 2020
        // Console log
        cudaDriverGetVersion(&driverVersion);
        printf("  CUDA Driver Version:                           %d.%d\n", driverVersion / 1000, driverVersion % 100);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf("  CUDA Runtime Version:                          %d.%d\n", runtimeVersion / 1000, runtimeVersion % 100);
#endif
        printf("  CUDA Capability Major revision number:         %d\n", deviceProp.major);
        printf("  CUDA Capability Minor revision number:         %d\n", deviceProp.minor);

        printf("  Total amount of global memory:                 %zu bytes\n", deviceProp.totalGlobalMem);
#if CUDART_VERSION >= 2000
        printf("  Number of multiprocessors:                     %d\n", deviceProp.multiProcessorCount);
//printf("  Number of cores:                               %d\n", nGpuArchCoresPerSM[deviceProp.major] * deviceProp.multiProcessorCount);
#endif
        printf("  Total amount of constant memory:               %zu bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %zu bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %zu bytes\n", deviceProp.textureAlignment);
        printf("  Clock rate:                                    %.2f GHz\n", deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 2000
        printf("  Concurrent copy and execution:                 %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 2020
        printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated:                                    %s\n", deviceProp.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf("  Compute mode:                                  %s\n", deviceProp.computeMode == cudaComputeModeDefault ? "Default (multiple host threads can use this device simultaneously)" : deviceProp.computeMode == cudaComputeModeExclusive ? "Exclusive (only one host thread at a time can use this device)" : deviceProp.computeMode == cudaComputeModeProhibited ? "Prohibited (no host thread can use this device)" : "Unknown");
#endif
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[10];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, driverVersion % 100);
#else
    sprintf(cTemp, "%d.%d", driverVersion / 1000, driverVersion % 100);
#endif
    sProfileString += cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, runtimeVersion % 100);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion / 1000, runtimeVersion % 100);
#endif
    sProfileString += cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;

    // First 2 device names, if any
    for (dev = 0; dev < ((deviceCount > 2) ? 2 : deviceCount); ++dev)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += ", Device = ";
        sProfileString += deviceProp.name;
    }
    sProfileString += "\n";
    //		shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str());

    // finish
    //printf("\n\nPASSED\n");

    //Log* pLog = Ogre::LogManager::getSingleton().getDefaultLog();
    cudaError_t err = cudaSuccess;
    return err;
}
Ejemplo n.º 24
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        printf("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

    int dev, driverVersion = 0, runtimeVersion = 0;

    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
        SPRINTF(msg, "  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        printf("%s", msg);

        printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);


#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf("  Memory Clock rate:                             %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
        printf("  Memory Bus Width:                              %d-bit\n",   deviceProp.memoryBusWidth);

        if (deviceProp.l2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
        }

#else
        // This only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        if (L2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }

#endif

        printf("  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
               deviceProp.maxTexture1D   , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
        printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
               deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);


        printf("  Total amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("  Max dimension size of a grid size    (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %lu bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %lu bytes\n", deviceProp.textureAlignment);
        printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
        printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf("  Device has ECC support:                        %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
        printf("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

        const char *sComputeMode[] =
        {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        printf("  Compute Mode:\n");
        printf("     < %s >\n", sComputeMode[deviceProp.computeMode]);
    }

    // If there are 2 or more GPUs, query to determine whether RDMA is supported
    if (deviceCount >= 2)
    {
        cudaDeviceProp prop[64];
        int gpuid[64]; // we want to find the first two GPUs that can support P2P
        int gpu_p2p_count = 0;

        for (int i=0; i < deviceCount; i++)
        {
            checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));

            // Only boards based on Fermi or later can support P2P
            if ((prop[i].major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
                // on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to support this
                && prop[i].tccDriver
#endif
               )
            {
                // This is an array of P2P capable GPUs
                gpuid[gpu_p2p_count++] = i;
            }
        }

        // Show all the combinations of support P2P GPUs
        int can_access_peer;

        if (gpu_p2p_count >= 2)
        {
            for (int i = 0; i < gpu_p2p_count; i++)
            {
                for (int j = 0; j < gpu_p2p_count; j++)
                {
                    if (gpuid[i] == gpuid[j])
                    {
                        continue;
                    }
                    checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                        printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],
                           prop[gpuid[j]].name, gpuid[j] ,
                           can_access_peer ? "Yes" : "No");
                }
            }
        }
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];

    // driver version
    sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#endif
    sProfileString +=  cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
    sProfileString +=  cTemp;

    // Device count
    sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;

    // Print Out all device Names
    for (dev = 0; dev < deviceCount; ++dev)
    {
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        sprintf_s(cTemp, 13, ", Device%d = ", dev);
#else
        sprintf(cTemp, ", Device%d = ", dev);
#endif
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += cTemp;
        sProfileString += deviceProp.name;
    }

    sProfileString += "\n";
    printf("%s", sProfileString.c_str());

    printf("Result = PASS\n");

    // finish
    exit(EXIT_SUCCESS);
}
Ejemplo n.º 25
0
void CUT_DEVICE_QUERY()
{
	log_printf(INFO,"CUDA Device Query (Runtime API) version (CUDART static linking)\n");

	int deviceCount;

	cudaGetDeviceCount(&deviceCount);

	// This function call returns 0 if there are no CUDA capable devices.
	if (deviceCount == 0)
		log_printf(INFO,"There is no device supporting CUDA\n");
	int dev;
	for (dev = 0; dev < deviceCount; ++dev) {
		cudaDeviceProp deviceProp;
		cudaGetDeviceProperties(&deviceProp, dev);

		if (dev == 0) {
			// This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
			if (deviceProp.major == 9999 && deviceProp.minor == 9999)
				log_printf(INFO,"There is no device supporting CUDA.\n");
			else if (deviceCount == 1)
				log_printf(INFO,"There is 1 device supporting CUDA\n");
			else
				log_printf(INFO,"There are %d devices supporting CUDA\n", deviceCount);
		}
		log_printf(INFO,"\n");
		log_printf(INFO,"Device %d: \"%s\"\n", dev, deviceProp.name);
#if CUDART_VERSION >= 2020
		int driverVersion = 0, runtimeVersion = 0;
		cudaDriverGetVersion(&driverVersion);
		log_printf(INFO,"  CUDA Driver Version:                           %d.%d\n", driverVersion/1000, driverVersion%100);
		cudaRuntimeGetVersion(&runtimeVersion);
		log_printf(INFO,"  CUDA Runtime Version:                          %d.%d\n", runtimeVersion/1000, runtimeVersion%100);
#endif

		log_printf(INFO,"  CUDA Capability Major revision number:         %d\n", deviceProp.major);
		log_printf(INFO,"  CUDA Capability Minor revision number:         %d\n", deviceProp.minor);

		log_printf(INFO,"  Total amount of global memory:                 %u bytes\n", deviceProp.totalGlobalMem);
#if CUDART_VERSION >= 2000
		log_printf(INFO,"  Number of multiprocessors:                     %d\n", deviceProp.multiProcessorCount);
		log_printf(INFO,"  Number of cores:                               %d\n", 8 * deviceProp.multiProcessorCount);
#endif
		log_printf(INFO,"  Total amount of constant memory:               %u bytes\n", deviceProp.totalConstMem); 
		log_printf(INFO,"  Total amount of shared memory per block:       %u bytes\n", deviceProp.sharedMemPerBlock);
		log_printf(INFO,"  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
		log_printf(INFO,"  Warp size:                                     %d\n", deviceProp.warpSize);
		log_printf(INFO,"  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
		log_printf(INFO,"  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
				deviceProp.maxThreadsDim[0],
				deviceProp.maxThreadsDim[1],
				deviceProp.maxThreadsDim[2]);
		log_printf(INFO,"  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
				deviceProp.maxGridSize[0],
				deviceProp.maxGridSize[1],
				deviceProp.maxGridSize[2]);
		log_printf(INFO,"  Maximum memory pitch:                          %u bytes\n", deviceProp.memPitch);
		log_printf(INFO,"  Texture alignment:                             %u bytes\n", deviceProp.textureAlignment);
		log_printf(INFO,"  Clock rate:                                    %.2f GHz\n", deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 2000
		log_printf(INFO,"  Concurrent copy and execution:                 %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 2020
		log_printf(INFO,"  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
		log_printf(INFO,"  Integrated:                                    %s\n", deviceProp.integrated ? "Yes" : "No");
		log_printf(INFO,"  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
		log_printf(INFO,"  Compute mode:                                  %s\n", deviceProp.computeMode == cudaComputeModeDefault ?
				"Default (multiple host threads can use this device simultaneously)" :
				deviceProp.computeMode == cudaComputeModeExclusive ?
				"Exclusive (only one host thread at a time can use this device)" :
				deviceProp.computeMode == cudaComputeModeProhibited ?
				"Prohibited (no host thread can use this device)" :
				"Unknown");
#endif
	}
}
Ejemplo n.º 26
0
void pcl::gpu::printCudaDeviceInfo(int device)
{
    int count = getCudaEnabledDeviceCount();
    bool valid = (device >= 0) && (device < count);

    int beg = valid ? device   : 0;
    int end = valid ? device+1 : count;

    printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n");
    printf("Device count: %d\n", count);

    int driverVersion = 0, runtimeVersion = 0;
    cudaSafeCall( cudaDriverGetVersion(&driverVersion) );
    cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) );

    const char *computeMode[] = {
        "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
        "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
        "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
        "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
        "Unknown",
        NULL
    };

    for(int dev = beg; dev < end; ++dev)
    {                
        cudaDeviceProp prop;
        cudaSafeCall( cudaGetDeviceProperties(&prop, dev) );

        int sm_cores = convertSMVer2Cores(prop.major, prop.minor);

        printf("\nDevice %d: \"%s\"\n", dev, prop.name);        
        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", prop.major, prop.minor);        
        printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem);            
        printf("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n", prop.multiProcessorCount, sm_cores, sm_cores * prop.multiProcessorCount);
        printf("  GPU Clock Speed:                               %.2f GHz\n", prop.clockRate * 1e-6f);

#if (CUDART_VERSION >= 4000)
        // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output
        int memoryClock, memBusWidth, L2CacheSize;
        getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );        
        getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev );                
        getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev );

        printf("  Memory Clock rate:                             %.2f Mhz\n", memoryClock * 1e-3f);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        if (L2CacheSize)
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        
        printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
            prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1],
            prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
        printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
            prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1],
            prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
#endif
        printf("  Total amount of constant memory:               %u bytes\n", (int)prop.totalConstMem);
        printf("  Total amount of shared memory per block:       %u bytes\n", (int)prop.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", prop.regsPerBlock);
        printf("  Warp size:                                     %d\n", prop.warpSize);
        printf("  Maximum number of threads per block:           %d\n", prop.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1],  prop.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %u bytes\n", (int)prop.memPitch);
        printf("  Texture alignment:                             %u bytes\n", (int)prop.textureAlignment);

#if CUDART_VERSION >= 4000
        printf("  Concurrent copy and execution:                 %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
#else
        printf("  Concurrent copy and execution:                 %s\n", prop.deviceOverlap ? "Yes" : "No");
#endif
        printf("  Run time limit on kernels:                     %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated GPU sharing Host Memory:            %s\n", prop.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n", prop.canMapHostMemory ? "Yes" : "No");

        printf("  Concurrent kernel execution:                   %s\n", prop.concurrentKernels ? "Yes" : "No");
        printf("  Alignment requirement for Surfaces:            %s\n", prop.surfaceAlignment ? "Yes" : "No");
        printf("  Device has ECC support enabled:                %s\n", prop.ECCEnabled ? "Yes" : "No");
        printf("  Device is using TCC driver mode:               %s\n", prop.tccDriver ? "Yes" : "No");
#if CUDART_VERSION >= 4000
        printf("  Device supports Unified Addressing (UVA):      %s\n", prop.unifiedAddressing ? "Yes" : "No");
        printf("  Device PCI Bus ID / PCI location ID:           %d / %d\n", prop.pciBusID, prop.pciDeviceID );
#endif
        printf("  Compute Mode:\n");
        printf("      %s \n", computeMode[prop.computeMode]);
    }
    
    printf("\n");    
    printf("deviceQuery, CUDA Driver = CUDART");
    printf(", CUDA Driver Version  = %d.%d", driverVersion / 1000, driverVersion % 100);
    printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100);
    printf(", NumDevs = %d\n\n", count);                
    fflush(stdout);
}
// Check if there is a device supporting CUDA
void GetCUDADeviceFlags()
{
    int deviceCount;
    bool archi_13 = false; 
    cudaGetDeviceCount(&deviceCount);

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
        printf("There is no device supporting CUDA\n");
    int dev = 0;
    for (dev = 0; dev < 1/*deviceCount/*HACK : we want the first one only for now*/; ++dev) {
    printf("\nFirst device is selected by default.\n\n");
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        if (dev == 0) {
			// This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
            if (deviceProp.major == 9999 && deviceProp.minor == 9999)

            printf("There is no device supporting CUDA.\n");
            else if (deviceCount == 1)
                printf("There is 1 device supporting CUDA\n");
            else
                printf("There are %d devices supporting CUDA\n", deviceCount);
           if ((deviceProp.major * 10 + deviceProp.minor) >= 13 ) {
               // OK on peut faire du double et rajouter des optimizations
	       archi_13 = true; 
	   } 
        #if CUDART_VERSION >= 2000
        if (archi_13){
            fb.add_def("MAX_THREADS", deviceProp.multiProcessorCount*1024);
	    } else {
            fb.add_def("MAX_THREADS", deviceProp.multiProcessorCount*768);
        }
        #endif
        fb.add_def("WARP_SIZE", deviceProp.warpSize);
	fb.add_flag("USE_CRS_SHARED");
	fb.add_flag("USE_TEXTURE");
	}
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
    #if CUDART_VERSION >= 2020
		int driverVersion = 0, runtimeVersion = 0;
		cudaDriverGetVersion(&driverVersion);
		printf("  CUDA Driver Version:                           %d.%d\n", driverVersion/1000, driverVersion%100);
		cudaRuntimeGetVersion(&runtimeVersion);
		printf("  CUDA Runtime Version:                          %d.%d\n", runtimeVersion/1000, runtimeVersion%100);
    #endif

        printf("  CUDA Capability Major revision number:         %d\n", deviceProp.major);
        printf("  CUDA Capability Minor revision number:         %d\n", deviceProp.minor);
        
		printf("  Total amount of global memory:                 %u bytes\n", deviceProp.totalGlobalMem);
    #if CUDART_VERSION >= 2000
        printf("  Number of multiprocessors:                     %d\n", deviceProp.multiProcessorCount);
        printf("  Number of cores:                               %d\n", 8 * deviceProp.multiProcessorCount);
    #endif
        printf("  Total amount of constant memory:               %u bytes\n", deviceProp.totalConstMem); 
        printf("  Total amount of shared memory per block:       %u bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %u bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %u bytes\n", deviceProp.textureAlignment);
        printf("  Clock rate:                                    %.2f GHz\n", deviceProp.clockRate * 1e-6f);
    #if CUDART_VERSION >= 2000
        printf("  Concurrent copy and execution:                 %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
    #endif
    #if CUDART_VERSION >= 2020
        printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated:                                    %s\n", deviceProp.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf("  Compute mode:                                  %s\n", deviceProp.computeMode == cudaComputeModeDefault ?
			                                                            "Default (multiple host threads can use this device simultaneously)" :
		                                                                deviceProp.computeMode == cudaComputeModeExclusive ?
																		"Exclusive (only one host thread at a time can use this device)" :
		                                                                deviceProp.computeMode == cudaComputeModeProhibited ?
																		"Prohibited (no host thread can use this device)" :
																		"Unknown");
    #endif
    }
}
Ejemplo n.º 28
0
void CudaDeviceDialog::updateInfo(int index) {
    m_infoText = "<html><body>";

    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    cudaDeviceProp p;
    cudaGetDeviceProperties(&p, index);
    if (p.major == 9999 && p.minor == 9999)
        m_infoText += "<p>There is no device supporting CUDA</p>";
    else if (deviceCount == 1)
        m_infoText += "<p>There is 1 device supporting CUDA</p>";
    else
        m_infoText += QString("<p>There are %1 devices supporting CUDA</p>").arg(deviceCount);

    m_infoText += QString("<p>CUDA Driver/Runtime</p>");
    m_infoText += "<table>";
    int driverVersion = 0, runtimeVersion = 0;
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);
    QString error = "<span style='color:red;'>***ERROR*** >= 4.0 required</span>";
    addItem(1, "CUDA Driver Version:", 
               QString("%1.%2 %3").arg(driverVersion/1000).arg(driverVersion%100)
                    .arg((driverVersion >= 4000)? "" : error));
    addItem(1, "CUDA Runtime Version:", 
               QString("%1.%2 %3").arg(runtimeVersion/1000).arg(runtimeVersion%100)
                    .arg((driverVersion >= 4000)? "" : error));
    m_infoText += "</table>";

    if (index < deviceCount) {
        m_infoText += QString("<p>Device %1: &quot;%2&quot;</p>").arg(index).arg(p.name);
        m_infoText += "<table>";
        addItem(1, "CUDA Capability Major/Minor version number:", 
                   QString("%1.%2").arg(p.major).arg(p.minor));

        addItem(1, "Total amount of global memory:", QString("%1 MB").arg(p.totalGlobalMem / 1024 / 1024));

        addItem(1, QString("%1 Multiprocessors x %2 CUDA Cores/MP:").arg(p.multiProcessorCount).arg(ConvertSMVer2Cores(p.major, p.minor)),
                   QString("%1 CUDA Cores").arg(ConvertSMVer2Cores(p.major, p.minor) * p.multiProcessorCount));

        addItem(1, "Total amount of constant memory:", QString("%1 bytes").arg(p.totalConstMem));
        addItem(1, "Total amount of shared memory per block:", QString("%1 bytes").arg(p.sharedMemPerBlock));
        addItem(1, "Total number of registers available per block:", QString("%1").arg(p.regsPerBlock));
        addItem(1, "Warp size:", QString("%1").arg(p.warpSize));
        addItem(1, "Maximum number of threads per block:", QString("%1").arg(p.maxThreadsPerBlock));
        addItem(1, "Maximum sizes of each dimension of a block:", QString("%1 x %2 x %3")
                        .arg(p.maxThreadsDim[0])
                        .arg(p.maxThreadsDim[1])
                        .arg(p.maxThreadsDim[2]));
        addItem(1, "Maximum sizes of each dimension of a grid:", QString("%1 x %2 x %3")
                        .arg(p.maxGridSize[0])
                        .arg(p.maxGridSize[1])
                        .arg(p.maxGridSize[2]));
        addItem(1, "Maximum memory pitch:", QString("%1 bytes").arg(p.memPitch));
        addItem(1, "Texture alignment:", QString("%1 bytes").arg(p.textureAlignment));
        addItem(1, "Clock rate:", QString("%1 GHz").arg(p.clockRate * 1e-6f));
        
        addItem(1, "Concurrent copy and execution:", p.deviceOverlap ? "yes" : "no");
        addItem(1, "# of Asynchronous Copy Engines:", QString("%1").arg(p.asyncEngineCount));
        addItem(1, "Run time limit on kernels:", p.kernelExecTimeoutEnabled ? "yes" : "no");
        addItem(1, "Integrated:", p.integrated ? "yes" : "no");
        addItem(1, "Support host page-locked memory mapping:", p.canMapHostMemory ? "yes" : "no");

        addItem(1, "Compute mode:", p.computeMode == cudaComputeModeDefault ?
                                        "Default (multiple host threads can use this device simultaneously)" :
                                    p.computeMode == cudaComputeModeExclusive ?
                                        "Exclusive (only one host thread at a time can use this device)" :
                                    p.computeMode == cudaComputeModeProhibited ?
                                        "Prohibited (no host thread can use this device)" :
                                        "Unknown");
        addItem(1, "Concurrent kernel execution:", p.concurrentKernels ? "yes" : "no");
        addItem(1, "Device has ECC support enabled:", p.ECCEnabled ? "yes" : "no");
        addItem(1, "Device is using TCC driver mode:", p.tccDriver ? "yes" : "no");

        m_infoText += "</table>";
    }
    m_infoText += "</body></html>";
    m->info->setHtml(m_infoText);

    m->buttonBox->button(QDialogButtonBox::Ok)->setEnabled((driverVersion >= 4000) && (runtimeVersion >= 4000));
}
Ejemplo n.º 29
0
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    pArgc = &argc;
    pArgv = argv;

    /*   shrQAStart(argc, argv);

    shrSetLogFileName ("deviceQuery.txt");


 */
    shrLog("%s Starting...\n\n", argv[0]);
    shrLog(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
    if (error_id != cudaSuccess) {
        shrLog( "cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id) );
        return -1;
    }
    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
        shrLog("There is no device supporting CUDA\n");
    else
        shrLog("Found %d CUDA Capable device(s)\n", deviceCount);

    int dev, driverVersion = 0, runtimeVersion = 0;
    for (dev = 0; dev < deviceCount; ++dev) {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);

        shrLog("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

#if CUDART_VERSION >= 2020
        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        shrLog("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
        shrLog("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
        sprintf(msg, "  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        shrLog(msg);
#if CUDART_VERSION >= 2000
        shrLog("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
#endif
        shrLog("  GPU Clock Speed:                               %.2f GHz\n", deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 4000
        // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output
        int memoryClock;
        getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );
        shrLog("  Memory Clock rate:                             %.2f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev );
        shrLog("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev );
        if (L2CacheSize) {
            shrLog("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }

        shrLog("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
               deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
                deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        shrLog("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1],
                deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
#endif
        shrLog("  Total amount of constant memory:               %u bytes\n", deviceProp.totalConstMem);
        shrLog("  Total amount of shared memory per block:       %u bytes\n", deviceProp.sharedMemPerBlock);
        shrLog("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        shrLog("  Warp size:                                     %d\n", deviceProp.warpSize);
        shrLog("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        shrLog("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
               deviceProp.maxThreadsDim[0],
                deviceProp.maxThreadsDim[1],
                deviceProp.maxThreadsDim[2]);
        shrLog("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
               deviceProp.maxGridSize[0],
                deviceProp.maxGridSize[1],
                deviceProp.maxGridSize[2]);
        shrLog("  Maximum memory pitch:                          %u bytes\n", deviceProp.memPitch);
        shrLog("  Texture alignment:                             %u bytes\n", deviceProp.textureAlignment);

#if CUDART_VERSION >= 4000
        shrLog("  Concurrent copy and execution:                 %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
#else
        shrLog("  Concurrent copy and execution:                 %s\n", deviceProp.deviceOverlap ? "Yes" : "No");
#endif

#if CUDART_VERSION >= 2020
        shrLog("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        shrLog("  Integrated GPU sharing Host Memory:            %s\n", deviceProp.integrated ? "Yes" : "No");
        shrLog("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 3000
        shrLog("  Concurrent kernel execution:                   %s\n", deviceProp.concurrentKernels ? "Yes" : "No");
        shrLog("  Alignment requirement for Surfaces:            %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 3010
        shrLog("  Device has ECC support enabled:                %s\n", deviceProp.ECCEnabled ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 3020
        shrLog("  Device is using TCC driver mode:               %s\n", deviceProp.tccDriver ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 4000
        shrLog("  Device supports Unified Addressing (UVA):      %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        shrLog("  Device PCI Bus ID / PCI location ID:           %d / %d\n", deviceProp.pciBusID, deviceProp.pciDeviceID );
#endif

#if CUDART_VERSION >= 2020
        const char *sComputeMode[] = {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        shrLog("  Compute Mode:\n");
        shrLog("     < %s >\n", sComputeMode[deviceProp.computeMode]);
#endif
    }

    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    shrLog("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[10];
    
    // driver version
    sProfileString += ", CUDA Driver Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#endif
    sProfileString +=  cTemp;
    
    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
    sProfileString +=  cTemp;
    
    // Device count
    sProfileString += ", NumDevs = ";
#ifdef WIN32
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;
    
    // First 2 device names, if any
    for (dev = 0; dev < ((deviceCount > 2) ? 2 : deviceCount); ++dev)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        sProfileString += ", Device = ";
        sProfileString += deviceProp.name;
    }
    sProfileString += "\n";
    //shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str());

    std::cout << sProfileString.c_str() << std::endl;
    std::cout << "Press <ENTER>" << std::endl;

    //
    getchar();

    runtimeTest();

    getchar();
    // finish
    return 0;
}
Ejemplo n.º 30
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

  //if (nrhs != 1)
  //  mexErrMsgTxt("Wrong number of arguments");

  //int dev = (int) mxGetScalar(prhs[0]);

  int deviceCount;

  cudaGetDeviceCount(&deviceCount);

  // This function call returns 0 if there are no CUDA capable devices.
  if (deviceCount == 0)
    mexErrMsgTxt("There is no device supporting CUDA\n");

  int dev;

  int devStart = 0;
  int devStop = deviceCount;

  if (nrhs==1) {
    devStart = (int) mxGetScalar(prhs[0]);
    devStop = devStart+1;
    if (devStart >= deviceCount)
      mexErrMsgTxt("Please specify a valid GPU device.\n");
  }

  for (dev = devStart; dev < devStop; ++dev) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);

    if (dev == devStart) {
      // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
      if (deviceProp.major == 9999 && deviceProp.minor == 9999)
        mexErrMsgTxt("There is no device supporting CUDA.\n");
      else if (deviceCount == 1)
        printf("There is 1 device supporting CUDA\n");
      else
        printf("There are %d devices supporting CUDA\n", deviceCount);
#if CUDART_VERSION >= 2020
      int driverVersion = 0, runtimeVersion = 0;
      cudaDriverGetVersion(&driverVersion);
      printf("CUDA Driver Version:                           %d.%d\n", driverVersion/1000, driverVersion%100);
      cudaRuntimeGetVersion(&runtimeVersion);
      printf("CUDA Runtime Version:                          %d.%d\n", runtimeVersion/1000, runtimeVersion%100);
#endif

    }
    printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

    printf("  CUDA Capability Major revision number:         %d\n",
      deviceProp.major);
    printf("  CUDA Capability Minor revision number:         %d\n",
      deviceProp.minor);

    printf("  Total amount of global memory:                 %u bytes\n",
      deviceProp.totalGlobalMem);
    //#if CUDART_VERSION >= 2000
    //    printf("  Number of multiprocessors:                     %d\n",
    //        deviceProp.multiProcessorCount);
    //    printf("  Number of cores:                               %d\n", 8
    //        * deviceProp.multiProcessorCount);
    //#endif
    printf("  Total amount of constant memory:               %u bytes\n",
      deviceProp.totalConstMem);
    printf("  Total amount of shared memory per block:       %u bytes\n",
      deviceProp.sharedMemPerBlock);
    printf("  Total number of registers available per block: %d\n",
      deviceProp.regsPerBlock);
    printf("  Warp size:                                     %d\n",
      deviceProp.warpSize);
    printf("  Maximum number of threads per block:           %d\n",
      deviceProp.maxThreadsPerBlock);
    printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
      deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
      deviceProp.maxThreadsDim[2]);
    printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
      deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
      deviceProp.maxGridSize[2]);
    printf("  Maximum memory pitch:                          %u bytes\n",
      deviceProp.memPitch);
    printf("  Texture alignment:                             %u bytes\n",
      deviceProp.textureAlignment);
    printf("  Clock rate:                                    %.2f GHz\n",
      deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 2000
    printf("  Concurrent copy and execution:                 %s\n",
      deviceProp.deviceOverlap ? "Yes" : "No");
#endif
#if CUDART_VERSION >= 2020
    printf("  Run time limit on kernels:                     %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
    printf("  Integrated:                                    %s\n", deviceProp.integrated ? "Yes" : "No");
    printf("  Support host page-locked memory mapping:       %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
    printf("  Compute mode:                                  %s\n", deviceProp.computeMode == cudaComputeModeDefault ?
      "Default (multiple host threads can use this device simultaneously)" :
    deviceProp.computeMode == cudaComputeModeExclusive ?
      "Exclusive (only one host thread at a time can use this device)" :
    deviceProp.computeMode == cudaComputeModeProhibited ?
      "Prohibited (no host thread can use this device)" :
    "Unknown");
#endif
  }


}