Example #1
/**
 * Provides basically the same functionality as cudaGetDeviceProperties looped
 * over cudaGetDeviceCount. Not sure why the CUDA driver API is used directly
 * instead of the runtime API -- maybe it is faster? Roughly 95% of this is
 * boilerplate code.
 *
 * Uses JNI to create an ArrayList<GpuDevice> object.
 */
JNIEXPORT jobject JNICALL
Java_org_trifort_rootbeer_runtime_CUDARuntime_loadGpuDevices
(
    JNIEnv * env,
    jobject this_ref
)
{
    jclass    gpu_device_class;
    jmethodID gpu_device_init ;
    jobject   gpu_device      ;

    jclass    array_list_class = (*env)->FindClass  ( env, "java/util/ArrayList" );
    jmethodID array_list_init  = (*env)->GetMethodID( env, array_list_class, "<init>", "()V" );
    jmethodID array_list_add   = (*env)->GetMethodID( env, array_list_class, "add", "(Ljava/lang/Object;)Z" );
    jobject   ret              = (*env)->NewObject  ( env, array_list_class, array_list_init );

    gpu_device_class = (*env)->FindClass(env, "org/trifort/rootbeer/runtime/GpuDevice");
    gpu_device_init  = (*env)->GetStaticMethodID(env, gpu_device_class, "newCudaDevice",
      "(IIILjava/lang/String;JJIIIIIIIIZIIIIIIII)Lorg/trifort/rootbeer/runtime/GpuDevice;");
                          /* ^ function signature for constructor arguments */

    int status = cuInit(0);
    if ( status != CUDA_SUCCESS )
        return ret;

    int nDevices = 0;
    cuDeviceGetCount( &nDevices );

    int iDevice = 0;
    for ( iDevice = 0; iDevice < nDevices; ++iDevice )
    {
        CUdevice device;
        status = cuDeviceGet( &device, iDevice );
        if ( status != CUDA_SUCCESS )
            continue;

        int    major_version    ;
        int    minor_version    ;
        char   device_name[4096];
        size_t total_mem        ;

        CE( cuDeviceComputeCapability( &major_version, &minor_version, device ) );
        CE( cuDeviceGetName( device_name, 4096, device ) );
        CE( cuDeviceTotalMem( &total_mem, device ) );
        /* cuCtxCreate and Destroy are VERY expensive (~0.5s) and would only be necessary for free mem */
        // CE( cuCtxCreate ( &context, CU_CTX_MAP_HOST, device ) );
        // CE( cuMemGetInfo( &free_mem, &total_mem ) );
        // CE( cuCtxDestroy( context ) );

        /* Uses GCC statement expressions (https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html)
         * so that the variable declaration, the call to cuDeviceGetAttribute and
         * the use of the result in the constructor call can be written in one go
         * via a macro. This also seems to work with the -std=c99 switch:
         *   f(
         *       ({ int j = 5; j+1; })
         *   );
         * (A standalone sketch of this trick follows after this function.)
         */
        #define CUATTR( NAME )                                      \
        ( {                                                         \
            int NAME;                                               \
            CE( cuDeviceGetAttribute( &NAME,                        \
                                      CU_DEVICE_ATTRIBUTE_##NAME,   \
                                      device ) )                    \
            NAME;                                                   \
        } )

        /* @see GpuDevice.java */
        gpu_device = (*env)->CallStaticObjectMethod  /* newCudaDevice is a static factory, so CallStatic* is required */
        (
            env                                     ,
            gpu_device_class                        ,
            gpu_device_init                         ,
            iDevice                                 ,  // device_id
            major_version                           ,  // major_version
            minor_version                           ,  // minor_version
            (*env)->NewStringUTF(env, device_name)  ,  // device_name
            (jlong) -1                              ,  // free_global_mem_size (signature expects a jlong)
            (jlong) total_mem                       ,  // total_global_mem_size
            CUATTR( MAX_REGISTERS_PER_BLOCK        ),  // max_registers_per_block
            CUATTR( WARP_SIZE                      ),  // warp_size
            CUATTR( MAX_PITCH                      ),  // max_pitch
            CUATTR( MAX_THREADS_PER_BLOCK          ),  // max_threads_per_block
            CUATTR( MAX_SHARED_MEMORY_PER_BLOCK    ),  // max_shared_memory_per_block
            CUATTR( CLOCK_RATE                     ),  // clock_rate
            CUATTR( MEMORY_CLOCK_RATE              ),  // memory_clock_rate
            CUATTR( TOTAL_CONSTANT_MEMORY          ),  // constant_mem_size
            CUATTR( INTEGRATED                     ),  // integrated
            CUATTR( MAX_THREADS_PER_MULTIPROCESSOR ),  // max_threads_per_multiprocessor
            CUATTR( MULTIPROCESSOR_COUNT           ),  // multiprocessor_count
            CUATTR( MAX_BLOCK_DIM_X                ),  // max_block_dim_x
            CUATTR( MAX_BLOCK_DIM_Y                ),  // max_block_dim_y
            CUATTR( MAX_BLOCK_DIM_Z                ),  // max_block_dim_z
            CUATTR( MAX_GRID_DIM_X                 ),  // max_grid_dim_x
            CUATTR( MAX_GRID_DIM_Y                 ),  // max_grid_dim_y
            CUATTR( MAX_GRID_DIM_Z                 )   // max_grid_dim_z
        );

        #undef CUATTR

        (*env)->CallBooleanMethod( env, ret, array_list_add, gpu_device );
    }

    return ret;
}
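For reference, the GCC statement-expression trick used by the CUATTR macro above also works outside JNI; here is a minimal standalone sketch (the macro name QUERY_ATTR and the helper printWarpSize are made up for illustration, everything else is plain driver API):

#include <stdio.h>
#include <cuda.h>

/* A GNU statement expression ({ ... }) evaluates to its last expression, so the
 * variable declaration, the attribute query and the use of the result can all
 * be folded into a single function argument. */
#define QUERY_ATTR( NAME, DEV )                                        \
( {                                                                    \
    int NAME = 0;                                                      \
    cuDeviceGetAttribute( &NAME, CU_DEVICE_ATTRIBUTE_##NAME, (DEV) );  \
    NAME;                                                              \
} )

static void printWarpSize( CUdevice device )
{
    printf( "warp size: %d\n", QUERY_ATTR( WARP_SIZE, device ) );
}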
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
	CUdevice dev;
	int major = 0, minor = 0;
	int deviceCount = 0;
	char deviceName[256];

	// Note: on Windows the project needs to link against cuda.lib
	printf("CUDA Device Query (Driver API) statically linked version \n");

	CUresult err = cuInit(0);
    CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount));
	// This function call returns 0 if there are no CUDA capable devices.
	if (deviceCount == 0) {
        printf("There is no device supporting CUDA\n");
	}
    for (dev = 0; dev < deviceCount; ++dev) {
		CU_SAFE_CALL_NO_SYNC( cuDeviceComputeCapability(&major, &minor, dev) );

        if (dev == 0) {
			// This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
            if (major == 9999 && minor == 9999)
                printf("There is no device supporting CUDA.\n");
            else if (deviceCount == 1)
                printf("There is 1 device supporting CUDA\n");
            else
                printf("There are %d devices supporting CUDA\n", deviceCount);
        }
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetName(deviceName, 256, dev) );
        printf("\nDevice %d: \"%s\"\n", dev, deviceName);

    #if CUDA_VERSION >= 2020
		int driverVersion = 0;
		cuDriverGetVersion(&driverVersion);
		printf("  CUDA Driver Version:                           %d.%d\n", driverVersion/1000, driverVersion%100);
    #endif
        shrLog("  CUDA Capability Major/Minor version number:    %d.%d\n", major, minor);

		size_t totalGlobalMem;
		CU_SAFE_CALL_NO_SYNC( cuDeviceTotalMem(&totalGlobalMem, dev) );
        printf("  Total amount of global memory:                 %llu bytes\n", (unsigned long long)totalGlobalMem);

    #if CUDA_VERSION >= 2000
	    int multiProcessorCount;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev ) );
        shrLog("  Multiprocessors x Cores/MP = Cores:            %d (MP) x %d (Cores/MP) = %d (Cores)\n", 
			     multiProcessorCount, ConvertSMVer2Cores(major, minor), 
				 ConvertSMVer2Cores(major, minor) * multiProcessorCount);
	#endif

 	    int totalConstantMemory;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev ) );
        printf("  Total amount of constant memory:               %u bytes\n", totalConstantMemory);
 	    int sharedMemPerBlock;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev ) );
        printf("  Total amount of shared memory per block:       %u bytes\n", sharedMemPerBlock);
 	    int regsPerBlock;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev ) );
        printf("  Total number of registers available per block: %d\n", regsPerBlock);
 	    int warpSize;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev ) );
        printf("  Warp size:                                     %d\n",	warpSize);
 	    int maxThreadsPerBlock;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev ) );
		printf("  Maximum number of threads per block:           %d\n",	maxThreadsPerBlock);
 	    int blockDim[3];
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev ) );
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev ) );
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev ) );
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", blockDim[0], blockDim[1], blockDim[2]);
 	    int gridDim[3];
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev ) );
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev ) );
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev ) );
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", gridDim[0], gridDim[1], gridDim[2]);
  	    int memPitch;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev ) );
        printf("  Maximum memory pitch:                          %u bytes\n", memPitch);
  	    int textureAlign;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev ) );
        printf("  Texture alignment:                             %u bytes\n", textureAlign);
  	    int clockRate;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev ) );
        printf("  Clock rate:                                    %.2f GHz\n", clockRate * 1e-6f);
    #if CUDA_VERSION >= 2000
	    int gpuOverlap;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev ) );
        printf("  Concurrent copy and execution:                 %s\n",gpuOverlap ? "Yes" : "No");
    #endif

    #if CUDA_VERSION >= 2020
	    int kernelExecTimeoutEnabled;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev ) );
        printf("  Run time limit on kernels:                     %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
	    int integrated;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev ) );
        printf("  Integrated:                                    %s\n", integrated ? "Yes" : "No");
	    int canMapHostMemory;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev ) );
        printf("  Support host page-locked memory mapping:       %s\n", canMapHostMemory ? "Yes" : "No");
    #endif

    #if CUDA_VERSION >= 3000
	    int concurrentKernels;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev ) );
        printf("  Concurrent kernel execution:                   %s\n", concurrentKernels ? "Yes" : "No");
	    int eccEnabled;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &eccEnabled,  CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev ) );
        printf("  Device has ECC support enabled:                %s\n", eccEnabled ? "Yes" : "No");
    #endif

    #if CUDA_VERSION >= 3020
	    int tccDriver ;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &tccDriver ,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev ) );
		printf("  Device is using TCC driver mode:               %s\n", tccDriver ? "Yes" : "No");
    #endif
	}
    printf("\nPASSED\n");
    CUT_EXIT(argc, argv);
}
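CU_SAFE_CALL_NO_SYNC and CUT_EXIT come from the old cutil helpers shipped with the SDK samples and are not shown here; a minimal stand-in for the error-check macro, assuming nothing beyond the driver API, could look like this:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

/* Check the CUresult of a driver API call and abort with file/line info on failure. */
#define CU_SAFE_CALL_NO_SYNC( call )                                   \
do {                                                                   \
    CUresult err = (call);                                             \
    if ( err != CUDA_SUCCESS ) {                                       \
        fprintf( stderr, "CUDA driver API error %d at %s:%d\n",        \
                 (int)err, __FILE__, __LINE__ );                       \
        exit( EXIT_FAILURE );                                          \
    }                                                                  \
} while (0)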
Example #3
static void print_cuda_devices(int num_devices)
{
	CUdevice	dev;
	CUresult	rc;
	char		dev_name[512];
	size_t		dev_total_mem;
	int			dev_mem_clk;
	int			dev_mem_width;
	int			dev_mpu_nums;
	int			dev_mpu_clk;
	int			dev_nregs_mpu;
	int			dev_nregs_blk;
	int			dev_l2_sz;
	int			dev_cap_major;
	int			dev_cap_minor;
	int			i, j;
	struct {
		int		attr;
		int	   *dptr;
	} catalog[] = {
		{ CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,        &dev_mem_clk },
		{ CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,  &dev_mem_width },
		{ CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,     &dev_mpu_nums },
		{ CU_DEVICE_ATTRIBUTE_CLOCK_RATE,               &dev_mpu_clk },
		{ CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,&dev_nregs_mpu},
		{ CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,  &dev_nregs_blk },
		{ CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,            &dev_l2_sz },
		{ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, &dev_cap_major },
		{ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, &dev_cap_minor },
	};

	for (i=0; i < num_devices; i++)
	{
		int		cores_per_mpu = -1;

		rc = cuDeviceGet(&dev, i);
		if (rc != CUDA_SUCCESS)
			cuda_error(rc, "cuDeviceGet");

		rc = cuDeviceGetName(dev_name, sizeof(dev_name), dev);
		if (rc != CUDA_SUCCESS)
			cuda_error(rc, "cuDeviceGetName");

		rc = cuDeviceTotalMem(&dev_total_mem, dev);
		if (rc != CUDA_SUCCESS)
			cuda_error(rc, "cuDeviceTotalMem");

		for (j=0; j < lengthof(catalog); j++)
		{
			rc = cuDeviceGetAttribute(catalog[j].dptr, catalog[j].attr, dev);
			if (rc != CUDA_SUCCESS)
				cuda_error(rc, "cuDeviceGetAttribute");
		}

		/* Number of CUDA cores */
		if (dev_cap_major == 1)
			cores_per_mpu = 8;
		else if (dev_cap_major == 2)
		{
			if (dev_cap_minor == 0)
				cores_per_mpu = 32;
			else if (dev_cap_minor == 1)
				cores_per_mpu = 48;
			else
				cores_per_mpu = -1;
		}
		else if (dev_cap_major == 3)
			cores_per_mpu = 192;
		else if (dev_cap_major == 5)
			cores_per_mpu = 128;

		printf("GPU%d - %s (capability: %d.%d), %d %s %s,"
			   " L2 %s, RAM %s (%dbits, %s), Regs=%d/%d\n",
			   i, dev_name, dev_cap_major, dev_cap_minor,
			   (cores_per_mpu > 0 ? cores_per_mpu : 1) * dev_mpu_nums,
			   (cores_per_mpu > 0 ? "CUDA cores" : "SMs"),
			   format_clock(dev_mpu_clk),
			   format_bytesz(dev_l2_sz),
			   format_bytesz(dev_total_mem),
			   dev_mem_width,
			   format_clock((size_t)dev_mem_clk * 1000),
			   dev_nregs_blk,
			   dev_nregs_mpu);
	}
}
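The helpers lengthof() and cuda_error() are not part of the listing (nor are the format_bytesz()/format_clock() pretty-printers); plausible definitions for the first two, assumed here, would be:

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

/* Element count of a fixed-size array, as the catalog loop above assumes. */
#define lengthof(array)    (sizeof(array) / sizeof((array)[0]))

/* Report a failed driver API call and bail out. */
static void cuda_error(CUresult rc, const char *apiname)
{
    fprintf(stderr, "%s failed: errcode=%d\n", apiname, (int)rc);
    exit(1);
}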
    void test_driver_api()
    {
        CUdevice cuDevice;
        CUcontext cuContext;
        CUmodule cuModule;
        size_t totalGlobalMem;
        CUfunction matrixMult = 0;
        // CUDA driver API initialization
        {
            int major = 0, minor = 0;
            char deviceName[100];

            cuda::Check::CUDAError(cuInit(0), "Error initializing CUDA");
            int deviceCount;
            cuda::Check::CUDAError(cuDeviceGetCount(&deviceCount), "Error getting the number of devices");
            if (deviceCount <= 0)
            {
                std::cerr << "No devices found" << std::endl;
                return;
            }

            cuDeviceGet(&cuDevice, 0);

            // get compute capabilities and the devicename
            cuda::Check::CUDAError(cuDeviceComputeCapability(&major, &minor, cuDevice), "Error getting Device compute capability");
            cuda::Check::CUDAError(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice), "Error getting device name");
            std::cout << "> GPU Device has SM " << major << "." << minor << " compute capability" << std::endl;

            cuda::Check::CUDAError(cuDeviceTotalMem(&totalGlobalMem, cuDevice), "Error getting total global memory");
            std::cout << "  Total amount of global memory:     " << (unsigned long long)totalGlobalMem << " bytes" << std::endl;
            std::string tmp = (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L) ? "YES" : "NO";
            std::cout << "  64-bit Memory Address:             " << tmp << std::endl;

            cuda::Check::CUDAError(cuCtxCreate(&cuContext, 0, cuDevice), "Error creating the context");
        }
        // Compile and get the function
        {
            std::string module_path = "MatrixMult.cubin";
            std::cout << "> initCUDA loading module: " << module_path << std::endl;

            cuda::Check::CUDAError(cuModuleLoad(&cuModule, module_path.c_str()), "Error loading module");

            cuda::Check::CUDAError(cuModuleGetFunction(&matrixMult, cuModule, "MatrixMultKernelSimpleDriverAPI"), "Error retrieving the function");
        }
        // Call the kernel
        {
            int WIDTH = BLOCK_SIZE;
            int HEIGHT = BLOCK_SIZE;
            std::stringstream text;
            text << "CUDA Matrix Multiplication (" << WIDTH << "x" << WIDTH << ") Simple method Multiplication time";
            HostMatrix<float> M(WIDTH, HEIGHT); M.fillWithRandomData(); //M.print(std::cout); 
            HostMatrix<float> N(WIDTH, HEIGHT); N.fill_diagonal(2); //N.print(std::cout); 
            HostMatrix<float> C(WIDTH, HEIGHT);
            {
                ScopedTimer t(text.str());

                // allocate device memory
                CUdeviceptr d_M;
                cuda::Check::CUDAError(cuMemAlloc(&d_M, M.sizeInBytes()), "Error allocating memory");
                CUdeviceptr d_N;
                cuda::Check::CUDAError(cuMemAlloc(&d_N, N.sizeInBytes()), "Error allocating memory");

                // copy host memory to device
                cuda::Check::CUDAError(cuMemcpyHtoD(d_M, M, M.sizeInBytes()), "Error uploading memory to device");
                cuda::Check::CUDAError(cuMemcpyHtoD(d_N, N, N.sizeInBytes()), "Error uploading memory to device");

                // allocate device memory for result
                CUdeviceptr d_C;
                cuda::Check::CUDAError(cuMemAlloc(&d_C, C.sizeInBytes()), "Error allocating memory");


                dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1);
                dim3 grid(C.width_ / BLOCK_SIZE, C.height_ / BLOCK_SIZE, 1);
                void *args[6] = { &d_M, &d_N, &d_C, &WIDTH, &WIDTH, &WIDTH};

                // new CUDA 4.0 Driver API Kernel launch call
                cuda::Check::CUDAError(cuLaunchKernel(
                    matrixMult,                                     // Selected kernel function
                    grid.x, grid.y, grid.z,                         // grid config 
                    block.x, block.y, block.z,                      // block config
                    2 * BLOCK_SIZE*BLOCK_SIZE*sizeof(float),        
                    NULL, args, NULL), "Error executing Kernel");

                cuda::Check::CUDAError(cuMemcpyDtoH((void *)C, d_C, C.sizeInBytes()),"Error downloading memory to host");
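                // Free the device buffers; they are no longer needed once the
                // result has been copied back to the host.
                cuda::Check::CUDAError(cuMemFree(d_M), "Error freeing memory");
                cuda::Check::CUDAError(cuMemFree(d_N), "Error freeing memory");
                cuda::Check::CUDAError(cuMemFree(d_C), "Error freeing memory");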
            }
            C.print(std::cout);
        }

        cuCtxDestroy(cuContext);
    }
void CudaModule::printDeviceInfo(CUdevice device)
{
    static const struct
    {
        CUdevice_attribute  attrib;
        const char*         name;
    } attribs[] =
    {
#define A21(ENUM, NAME) { CU_DEVICE_ATTRIBUTE_ ## ENUM, NAME },
#if (CUDA_VERSION >= 4000)
#   define A40(ENUM, NAME) A21(ENUM, NAME)
#else
#   define A40(ENUM, NAME) // TODO: Some of these may exist in earlier versions, too.
#endif

        A21(CLOCK_RATE,                         "Clock rate")
        A40(MEMORY_CLOCK_RATE,                  "Memory clock rate")
        A21(MULTIPROCESSOR_COUNT,               "Number of SMs")
//      A40(GLOBAL_MEMORY_BUS_WIDTH,            "DRAM bus width")
//      A40(L2_CACHE_SIZE,                      "L2 cache size")

        A21(MAX_THREADS_PER_BLOCK,              "Max threads per block")
        A40(MAX_THREADS_PER_MULTIPROCESSOR,     "Max threads per SM")
        A21(REGISTERS_PER_BLOCK,                "Registers per block")
//      A40(MAX_REGISTERS_PER_BLOCK,            "Max registers per block")
        A21(SHARED_MEMORY_PER_BLOCK,            "Shared mem per block")
//      A40(MAX_SHARED_MEMORY_PER_BLOCK,        "Max shared mem per block")
        A21(TOTAL_CONSTANT_MEMORY,              "Constant memory")
//      A21(WARP_SIZE,                          "Warp size")

        A21(MAX_BLOCK_DIM_X,                    "Max blockDim.x")
//      A21(MAX_BLOCK_DIM_Y,                    "Max blockDim.y")
//      A21(MAX_BLOCK_DIM_Z,                    "Max blockDim.z")
        A21(MAX_GRID_DIM_X,                     "Max gridDim.x")
//      A21(MAX_GRID_DIM_Y,                     "Max gridDim.y")
//      A21(MAX_GRID_DIM_Z,                     "Max gridDim.z")
//      A40(MAXIMUM_TEXTURE1D_WIDTH,            "Max tex1D.x")
//      A40(MAXIMUM_TEXTURE2D_WIDTH,            "Max tex2D.x")
//      A40(MAXIMUM_TEXTURE2D_HEIGHT,           "Max tex2D.y")
//      A40(MAXIMUM_TEXTURE3D_WIDTH,            "Max tex3D.x")
//      A40(MAXIMUM_TEXTURE3D_HEIGHT,           "Max tex3D.y")
//      A40(MAXIMUM_TEXTURE3D_DEPTH,            "Max tex3D.z")
//      A40(MAXIMUM_TEXTURE1D_LAYERED_WIDTH,    "Max layerTex1D.x")
//      A40(MAXIMUM_TEXTURE1D_LAYERED_LAYERS,   "Max layerTex1D.y")
//      A40(MAXIMUM_TEXTURE2D_LAYERED_WIDTH,    "Max layerTex2D.x")
//      A40(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT,   "Max layerTex2D.y")
//      A40(MAXIMUM_TEXTURE2D_LAYERED_LAYERS,   "Max layerTex2D.z")
//      A40(MAXIMUM_TEXTURE2D_ARRAY_WIDTH,      "Max array.x")
//      A40(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT,     "Max array.y")
//      A40(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES,  "Max array.z")

//      A21(MAX_PITCH,                          "Max memcopy pitch")
//      A21(TEXTURE_ALIGNMENT,                  "Texture alignment")
//      A40(SURFACE_ALIGNMENT,                  "Surface alignment")

        A40(CONCURRENT_KERNELS,                 "Concurrent launches supported")
        A21(GPU_OVERLAP,                        "Concurrent memcopy supported")
        A40(ASYNC_ENGINE_COUNT,                 "Max concurrent memcopies")
//      A40(KERNEL_EXEC_TIMEOUT,                "Kernel launch time limited")
//      A40(INTEGRATED,                         "Integrated with host memory")
        A40(UNIFIED_ADDRESSING,                 "Unified addressing supported")
        A40(CAN_MAP_HOST_MEMORY,                "Can map host memory")
        A40(ECC_ENABLED,                        "ECC enabled")

//      A40(TCC_DRIVER,                         "Driver is TCC")
//      A40(COMPUTE_MODE,                       "Compute exclusivity mode")

//      A40(PCI_BUS_ID,                         "PCI bus ID")
//      A40(PCI_DEVICE_ID,                      "PCI device ID")
//      A40(PCI_DOMAIN_ID,                      "PCI domain ID")

#undef A21
#undef A40
    };

    char name[256];
    int major;
    int minor;
    size_t memory;

    checkError("cuDeviceGetName", cuDeviceGetName(name, FW_ARRAY_SIZE(name) - 1, device));
    checkError("cuDeviceComputeCapability", cuDeviceComputeCapability(&major, &minor, device));
    checkError("cuDeviceTotalMem", cuDeviceTotalMem(&memory, device));
    name[FW_ARRAY_SIZE(name) - 1] = '\0';

    printf("\n");
    char deviceIdStr[16];
    sprintf( deviceIdStr, "CUDA device %d", device);
    printf("%-32s%s\n",deviceIdStr, name);
        
    printf("%-32s%s\n", "---", "---");
    
    int version = getDriverVersion();
    printf("%-32s%d.%d\n", "CUDA driver API version", version/10, version%10);
    printf("%-32s%d.%d\n", "Compute capability", major, minor);
    printf("%-32s%.0f megs\n", "Total memory", (F32)memory * exp2(-20));

    for (int i = 0; i < (int)FW_ARRAY_SIZE(attribs); i++)
    {
        int value;
        if (cuDeviceGetAttribute(&value, attribs[i].attrib, device) == CUDA_SUCCESS)
            printf("%-32s%d\n", attribs[i].name, value);
    }
    printf("\n");
}
Example #6
void CuDevice::GetAttributes() {

	cuDeviceComputeCapability(&_capability.first, &_capability.second, _h);

	char name[256];
	cuDeviceGetName(name, 255, _h);
	_deviceName = name;

	cuDeviceTotalMem(&_totalMem, _h);

	CuDeviceAttr& a = _attributes;
	cuDeviceGetAttribute(&a.threadsPerBlock, 
		CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, _h);
	cuDeviceGetAttribute(&a.blockDim.x, 
		CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, _h);
	cuDeviceGetAttribute(&a.blockDim.y, 
		CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, _h);
	cuDeviceGetAttribute(&a.blockDim.z, 
		CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, _h);
	cuDeviceGetAttribute(&a.gridDim.x, 
		CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, _h);
	cuDeviceGetAttribute(&a.gridDim.y, 
		CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, _h);
	cuDeviceGetAttribute(&a.gridDim.z, 
		CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, _h);
	cuDeviceGetAttribute(&a.sharedMemPerBlock, 
		CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, _h);
	cuDeviceGetAttribute(&a.totalConstantMem, 
		CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, _h);
	cuDeviceGetAttribute(&a.warpSize, 
		CU_DEVICE_ATTRIBUTE_WARP_SIZE, _h);
	cuDeviceGetAttribute(&a.maxPitch, 
		CU_DEVICE_ATTRIBUTE_MAX_PITCH, _h);
	cuDeviceGetAttribute(&a.regPerBlock, 
		CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, _h);
	cuDeviceGetAttribute(&a.clockRate, 
		CU_DEVICE_ATTRIBUTE_CLOCK_RATE, _h);
	cuDeviceGetAttribute(&a.textureAlignment, 
		CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, _h);
	cuDeviceGetAttribute(&a.gpuOverlap, 
		CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, _h);
	cuDeviceGetAttribute(&a.multiprocessorCount, 
		CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, _h);
	cuDeviceGetAttribute(&a.kernelExecTimeout, 
		CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, _h);
	cuDeviceGetAttribute(&a.integrated, 
		CU_DEVICE_ATTRIBUTE_INTEGRATED, _h);
	cuDeviceGetAttribute(&a.canMapHostMem, 
		CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, _h);
	cuDeviceGetAttribute((int*)&a.computeMode, 
		CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, _h);
	cuDeviceGetAttribute(&a.tex1DSize, 
		CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, _h);
	cuDeviceGetAttribute(&a.tex2DSize.x,
		CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, _h);
	cuDeviceGetAttribute(&a.tex2DSize.y, 
		CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, _h);
	cuDeviceGetAttribute(&a.tex3DSize.x, 
		CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, _h);
	cuDeviceGetAttribute(&a.tex3DSize.y, 
		CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, _h);
	cuDeviceGetAttribute(&a.tex3DSize.z, 
		CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, _h);
	cuDeviceGetAttribute(&a.tex2DArraySize.x, 
		CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH, _h);
	cuDeviceGetAttribute(&a.tex2DArraySize.y, 
		CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT, _h);
	cuDeviceGetAttribute(&a.tex2DArraySize.z, 
		CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES, _h);
	cuDeviceGetAttribute(&a.surfaceAlignment, 
		CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, _h);
	cuDeviceGetAttribute(&a.concurrentKernels, 
		CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, _h);
	cuDeviceGetAttribute(&a.eccEnabled, 
		CU_DEVICE_ATTRIBUTE_ECC_ENABLED, _h);
	cuDeviceGetAttribute(&a.pciBusID, 
		CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, _h);
	cuDeviceGetAttribute(&a.pciDeviceID, 
		CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, _h);
	cuDeviceGetAttribute(&a.tccDriver, 
		CU_DEVICE_ATTRIBUTE_TCC_DRIVER, _h);
	cuDeviceGetAttribute(&a.memoryClockRate, 
		CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, _h);
	cuDeviceGetAttribute(&a.globalMemoryBusWidth, 
		CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, _h);
	cuDeviceGetAttribute(&a.l2CacheSize, 
		CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, _h);
	cuDeviceGetAttribute(&a.maxThreadsPerSM, 
		CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, _h);
}
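The CuDeviceAttr type written to above is not shown; reconstructed from the members that GetAttributes() fills in (the int3-style helper type is an assumption), it would look roughly like this:

#include <cuda.h>

struct Int3 { int x, y, z; };

struct CuDeviceAttr
{
    int  threadsPerBlock;
    Int3 blockDim, gridDim;
    int  sharedMemPerBlock, totalConstantMem, warpSize, maxPitch;
    int  regPerBlock, clockRate, textureAlignment, gpuOverlap;
    int  multiprocessorCount, kernelExecTimeout, integrated, canMapHostMem;
    CUcomputemode computeMode;           // queried through an (int*) cast above
    int  tex1DSize;
    Int3 tex2DSize;                      // only .x and .y are used
    Int3 tex3DSize, tex2DArraySize;
    int  surfaceAlignment, concurrentKernels, eccEnabled;
    int  pciBusID, pciDeviceID, tccDriver;
    int  memoryClockRate, globalMemoryBusWidth, l2CacheSize, maxThreadsPerSM;
};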
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
    CUdevice dev;
    int major = 0, minor = 0;
    int deviceCount = 0;
    char deviceName[256];

    shrQAStart(argc, argv);

    // Note: on Windows the project needs to link against cuda.lib
    printf("CUDA Device Query (Driver API) statically linked version \n");

    CUresult error_id = cuInit(0);
    if (error_id != CUDA_SUCCESS) {
        printf("cuInit(0) returned %d\n-> %s\n", error_id, getCudaDrvErrorString(error_id));
        shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
    }

    error_id = cuDeviceGetCount(&deviceCount);
    if (error_id != CUDA_SUCCESS) {
        shrLog( "cuDeviceGetCount returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) );
        shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
    }

    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
        printf("There are no available device(s) that support CUDA\n");
    else if (deviceCount == 1)
        printf("There is 1 device supporting CUDA\n");
    else
        printf("There are %d devices supporting CUDA\n", deviceCount);

    for (dev = 0; dev < deviceCount; ++dev) {
        error_id =  cuDeviceComputeCapability(&major, &minor, dev);
		if (error_id != CUDA_SUCCESS) {
			shrLog( "cuDeviceComputeCapability returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) );
			shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
		}

        error_id = cuDeviceGetName(deviceName, 256, dev);
		if (error_id != CUDA_SUCCESS) {
			shrLog( "cuDeviceGetName returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) );
			shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
		}

		printf("\nDevice %d: \"%s\"\n", dev, deviceName);

    #if CUDA_VERSION >= 2020
        int driverVersion = 0;
        cuDriverGetVersion(&driverVersion);
        printf("  CUDA Driver Version:                           %d.%d\n", driverVersion/1000, (driverVersion%100)/10);
    #endif
        shrLog("  CUDA Capability Major/Minor version number:    %d.%d\n", major, minor);

        size_t totalGlobalMem;
        error_id = cuDeviceTotalMem(&totalGlobalMem, dev);
		if (error_id != CUDA_SUCCESS) {
			shrLog( "cuDeviceTotalMem returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) );
			shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
		}

        char msg[256];
        sprintf(msg, "  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
                      (float)totalGlobalMem/1048576.0f, (unsigned long long) totalGlobalMem);
        shrLog(msg);

    #if CUDA_VERSION >= 2000
        int multiProcessorCount;
        getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
        
        shrLog("  (%2d) Multiprocessors x (%3d) CUDA Cores/MP:   %d CUDA Cores\n",
                        multiProcessorCount, ConvertSMVer2Cores(major, minor),
                        ConvertSMVer2Cores(major, minor) * multiProcessorCount);
    #endif

        int clockRate;
        getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
        printf("  GPU Clock rate:                                %.0f MHz (%0.2f GHz)\n", clockRate * 1e-3f, clockRate * 1e-6f);
    #if CUDA_VERSION >= 4000
        int memoryClock;
        getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );
        shrLog("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
		getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev );
        shrLog("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev );
        if (L2CacheSize) {
            shrLog("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }

        int maxTex1D, maxTex2D[2], maxTex3D[3];
		getCudaAttribute<int>( &maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev );
		getCudaAttribute<int>( &maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev );
		getCudaAttribute<int>( &maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev );
		getCudaAttribute<int>( &maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev );
		getCudaAttribute<int>( &maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev );
		getCudaAttribute<int>( &maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev );
        shrLog("  Max Texture Dimension Sizes                    1D=(%d) 2D=(%d,%d) 3D=(%d,%d,%d)\n",
                                                        maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1], maxTex3D[2]);

        int  maxTex2DLayered[3];
        getCudaAttribute<int>( &maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev );
        getCudaAttribute<int>( &maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev );
        getCudaAttribute<int>( &maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev );

        shrLog("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, 2D=(%d,%d) x %d\n",
                                                        maxTex2DLayered[0], maxTex2DLayered[2],
                                                        maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]);
#endif

        int totalConstantMemory;
		getCudaAttribute<int>( &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev );
        printf("  Total amount of constant memory:               %u bytes\n", totalConstantMemory);
 	    int sharedMemPerBlock;
		getCudaAttribute<int>( &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev );
        printf("  Total amount of shared memory per block:       %u bytes\n", sharedMemPerBlock);
 	    int regsPerBlock;
		getCudaAttribute<int>( &regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev );
        printf("  Total number of registers available per block: %d\n", regsPerBlock);
 	    int warpSize;
		getCudaAttribute<int>( &warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev );
        printf("  Warp size:                                     %d\n",	warpSize);
 	    int maxThreadsPerMultiProcessor;
		getCudaAttribute<int>( &maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev );
		printf("  Maximum number of threads per multiprocessor:  %d\n",	maxThreadsPerMultiProcessor);
 	    int maxThreadsPerBlock;
		getCudaAttribute<int>( &maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev );
		printf("  Maximum number of threads per block:           %d\n",	maxThreadsPerBlock);

        int blockDim[3];
		getCudaAttribute<int>( &blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev );
		getCudaAttribute<int>( &blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev );
		getCudaAttribute<int>( &blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev );
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", blockDim[0], blockDim[1], blockDim[2]);
 	    int gridDim[3];
		getCudaAttribute<int>( &gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev );
		getCudaAttribute<int>( &gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev );
		getCudaAttribute<int>( &gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev );
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", gridDim[0], gridDim[1], gridDim[2]);

        int textureAlign;
        getCudaAttribute<int>( &textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev );
        printf("  Texture alignment:                             %u bytes\n", textureAlign);

        int memPitch;
		getCudaAttribute<int>( &memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev );
        printf("  Maximum memory pitch:                          %u bytes\n", memPitch);

    #if CUDA_VERSION >= 2000
        int gpuOverlap;
        getCudaAttribute<int>( &gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev );
    #endif
    #if CUDA_VERSION >= 4000
        int asyncEngineCount;
        getCudaAttribute<int>( &asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev );
        printf("  Concurrent copy and execution:                 %s with %d copy engine(s)\n", (gpuOverlap ? "Yes" : "No"), asyncEngineCount);
    #else
        printf("  Concurrent copy and execution:                 %s\n",gpuOverlap ? "Yes" : "No");
    #endif

    #if CUDA_VERSION >= 2020
        int kernelExecTimeoutEnabled;
        getCudaAttribute<int>( &kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev );
        printf("  Run time limit on kernels:                     %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
        int integrated;
        getCudaAttribute<int>( &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev );
        printf("  Integrated GPU sharing Host Memory:            %s\n", integrated ? "Yes" : "No");
        int canMapHostMemory;
        getCudaAttribute<int>( &canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev );
        printf("  Support host page-locked memory mapping:       %s\n", canMapHostMemory ? "Yes" : "No");
    #endif

    #if CUDA_VERSION >= 3000
        int concurrentKernels;
        getCudaAttribute<int>( &concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev );
        printf("  Concurrent kernel execution:                   %s\n", concurrentKernels ? "Yes" : "No");

        int surfaceAlignment;
        getCudaAttribute<int>( &surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev );
        printf("  Alignment requirement for Surfaces:            %s\n", surfaceAlignment ? "Yes" : "No");

        int eccEnabled;
        getCudaAttribute<int>( &eccEnabled,  CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev );
        printf("  Device has ECC support enabled:                %s\n", eccEnabled ? "Yes" : "No");
    #endif

    #if CUDA_VERSION >= 3020
        int tccDriver ;
        getCudaAttribute<int>( &tccDriver ,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev );
        printf("  Device is using TCC driver mode:               %s\n", tccDriver ? "Yes" : "No");
    #endif

    #if CUDA_VERSION >= 4000
        int unifiedAddressing;
        getCudaAttribute<int>( &unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev );
        printf("  Device supports Unified Addressing (UVA):      %s\n", unifiedAddressing ? "Yes" : "No");

        int pciBusID, pciDeviceID;
        getCudaAttribute<int>( &pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev );
        getCudaAttribute<int>( &pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev );
        printf("  Device PCI Bus ID / PCI location ID:           %d / %d\n", pciBusID, pciDeviceID );

        const char *sComputeMode[] = {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };

        int computeMode;
        getCudaAttribute<int>( &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev );
        printf("  Compute Mode:\n");
        printf("     < %s >\n", sComputeMode[computeMode]);
    #endif

    }
	shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
}
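getCudaAttribute<T> is a small template helper used by the deviceQueryDrv-style listing above rather than a driver API call; a sketch of what it presumably does (with error handling reduced to a message) is:

#include <stdio.h>
#include <cuda.h>

// cuDeviceGetAttribute always reports an int; read into an int and convert
// to the requested type.
template <class T>
void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    int value = 0;
    CUresult error = cuDeviceGetAttribute(&value, device_attribute, device);
    if (error != CUDA_SUCCESS)
        fprintf(stderr, "cuDeviceGetAttribute returned error %d\n", (int)error);
    *attribute = static_cast<T>(value);
}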
int
VideoEncoder::DisplayGPUCaps(int deviceOrdinal, NVEncoderParams *pParams, bool bDisplay)
{
    NVVE_GPUAttributes GPUAttributes = {0};
    HRESULT hr = S_OK;
    int gpuPerformance;

    assert(pParams != NULL);

    GPUAttributes.iGpuOrdinal = deviceOrdinal;
    hr = GetParamValue(NVVE_GET_GPU_ATTRIBUTES,  &GPUAttributes);

    if (hr!=S_OK)
    {
        printf("  >> NVVE_GET_GPU_ATTRIBUTES error! <<\n\n");
    }

    gpuPerformance = GPUAttributes.iClockRate * GPUAttributes.iMultiProcessorCount;
    gpuPerformance = gpuPerformance * _ConvertSMVer2Cores(GPUAttributes.iMajor, GPUAttributes.iMinor);

    size_t totalGlobalMem;
    CUresult error_id = cuDeviceTotalMem(&totalGlobalMem, deviceOrdinal);

    if (error_id != CUDA_SUCCESS)
    {
        printf("cuDeviceTotalMem returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
        return -1;
    }


    if (bDisplay)
    {
        printf("  GPU Device %d (SM %d.%d) : %s\n", GPUAttributes.iGpuOrdinal,
               GPUAttributes.iMajor, GPUAttributes.iMinor,
               GPUAttributes.cName);
        printf("  Total Memory          = %4.0f MBytes\n" , ceil((float)totalGlobalMem/1048576.0f));
        printf("  GPU Clock             = %4.2f MHz\n"    , (float)GPUAttributes.iClockRate/1000.f);
        printf("  MultiProcessors/Cores = %d MPs (%d Cores)\n", GPUAttributes.iMultiProcessorCount,
               GPUAttributes.iMultiProcessorCount*_ConvertSMVer2Cores(GPUAttributes.iMajor, GPUAttributes.iMinor));
        printf("  Maximum Offload Mode  = ");

        switch (GPUAttributes.MaxGpuOffloadLevel)
        {
            case NVVE_GPU_OFFLOAD_DEFAULT:
                printf("CPU: PEL Processing Only\n");
                break;

            case NVVE_GPU_OFFLOAD_ESTIMATORS:
                printf("GPU: Motion Estimation & Intra Prediction\n");
                break;

            case NVVE_GPU_OFFLOAD_ALL:
                printf("GPU: Full Offload\n");
                break;
        }

        printf("\n");
    }

    pParams->MaxOffloadLevel = GPUAttributes.MaxGpuOffloadLevel;

    return gpuPerformance;
}
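ConvertSMVer2Cores / _ConvertSMVer2Cores, used in several listings above, maps a compute capability to CUDA cores per multiprocessor; a reduced sketch following the same table as the hand-rolled mapping in print_cuda_devices would be:

// Cores per SM for a given compute capability; values mirror the table in
// print_cuda_devices above, with -1 meaning "unknown architecture".
static int _ConvertSMVer2Cores(int major, int minor)
{
    if (major == 1)                return 8;
    if (major == 2 && minor == 0)  return 32;
    if (major == 2 && minor == 1)  return 48;
    if (major == 3)                return 192;
    if (major == 5)                return 128;
    return -1;
}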
Example #9
value spoc_getCudaDevice(value i)
{
	CAMLparam1(i);
	CAMLlocal4(general_info, cuda_info, specific_info, gc_info);
	CAMLlocal3(device,  maxT, maxG);
	int nb_devices;
	CUdevprop dev_infos;
	CUdevice dev;
	CUcontext ctx;
	CUstream queue[2];
	spoc_cu_context *spoc_ctx;
	//CUcontext gl_ctx;
	char infoStr[1024];
	int infoInt;
	size_t infoUInt;
	int major, minor;
	enum cudaError_enum cuda_error; 


	cuDeviceGetCount (&nb_devices);

	if ((Int_val(i)) >= nb_devices)  /* valid ordinals are 0 .. nb_devices-1 */
		raise_constant(*caml_named_value("no_cuda_device")) ;


	CUDA_CHECK_CALL(cuDeviceGet(&dev, Int_val(i)));
	CUDA_CHECK_CALL(cuDeviceGetProperties(&dev_infos, dev));

	general_info = caml_alloc (9, 0);
	CUDA_CHECK_CALL(cuDeviceGetName(infoStr, sizeof(infoStr), dev));

	Store_field(general_info,0, copy_string(infoStr));//
	CUDA_CHECK_CALL(cuDeviceTotalMem(&infoUInt, dev));

	Store_field(general_info,1, Val_int(infoUInt));//
	Store_field(general_info,2, Val_int(dev_infos.sharedMemPerBlock));//
	Store_field(general_info,3, Val_int(dev_infos.clockRate));//
	Store_field(general_info,4, Val_int(dev_infos.totalConstantMemory));//
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev));
	Store_field(general_info,5, Val_int(infoInt));//
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
	Store_field(general_info,6, Val_bool(infoInt));//
	Store_field(general_info,7, i);
	CUDA_CHECK_CALL(cuCtxCreate	(&ctx,
			CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST,
			dev));
	spoc_ctx = malloc(sizeof(spoc_cu_context));
	spoc_ctx->ctx = ctx;
	CUDA_CHECK_CALL(cuStreamCreate(&queue[0], 0));
	CUDA_CHECK_CALL(cuStreamCreate(&queue[1], 0));
	spoc_ctx->queue[0] = queue[0];
	spoc_ctx->queue[1] = queue[1];
	Store_field(general_info,8, (value)spoc_ctx);
	CUDA_CHECK_CALL(cuCtxSetCurrent(ctx));


	cuda_info = caml_alloc(1, 0); //0 -> Cuda
	specific_info = caml_alloc(18, 0);

	cuDeviceComputeCapability(&major, &minor, dev);
	Store_field(specific_info,0, Val_int(major));//
	Store_field(specific_info,1, Val_int(minor));//
	Store_field(specific_info,2, Val_int(dev_infos.regsPerBlock));//
	Store_field(specific_info,3, Val_int(dev_infos.SIMDWidth));//
	Store_field(specific_info,4, Val_int(dev_infos.memPitch));//
	Store_field(specific_info,5, Val_int(dev_infos.maxThreadsPerBlock));//

	maxT = caml_alloc(3, 0);
	Store_field(maxT,0, Val_int(dev_infos.maxThreadsDim[0]));//
	Store_field(maxT,1, Val_int(dev_infos.maxThreadsDim[1]));//
	Store_field(maxT,2, Val_int(dev_infos.maxThreadsDim[2]));//
	Store_field(specific_info,6, maxT);

	maxG = caml_alloc(3, 0);
	Store_field(maxG,0, Val_int(dev_infos.maxGridSize[0]));//
	Store_field(maxG,1, Val_int(dev_infos.maxGridSize[1]));//
	Store_field(maxG,2, Val_int(dev_infos.maxGridSize[2]));//
	Store_field(specific_info,7, maxG);

	Store_field(specific_info,8, Val_int(dev_infos.textureAlign));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
	Store_field(specific_info,9, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
	Store_field(specific_info,10, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
	Store_field(specific_info,11, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
	Store_field(specific_info,12, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
	Store_field(specific_info,13, Val_int(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
	Store_field(specific_info,14, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
	Store_field(specific_info,15, Val_int(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
	Store_field(specific_info,16, Val_int(infoInt));
	cuDriverGetVersion(&infoInt);
	Store_field(specific_info, 17, Val_int(infoInt));

	Store_field(cuda_info, 0, specific_info);
	device = caml_alloc(4, 0);
	Store_field(device, 0, general_info);
	Store_field(device, 1, cuda_info);

	{spoc_cuda_gc_info* gcInfo = (spoc_cuda_gc_info*)malloc(sizeof(spoc_cuda_gc_info));
	CUDA_CHECK_CALL(cuMemGetInfo(&infoUInt, NULL));
	infoUInt -= (32*1024*1024);

	Store_field(device, 2, (value)gcInfo);


	{cuda_event_list* events = NULL;
	Store_field(device, 3, (value)events);



	CAMLreturn(device);}}
}
Example #10
HRESULT initCudaResources(int argc, char **argv, int bUseInterop, int bTCC)
{
    HRESULT hr = S_OK;

    CUdevice cuda_device;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        cuda_device = getCmdLineArgumentInt(argc, (const char **) argv, "device");

        // If interop is enabled (and we are not on a TCC device) we need a
        // GL-capable device; otherwise we create a CUDA context w/o the GL context
        if (bUseInterop && !bTCC)
        {
            cuda_device = findCudaGLDeviceDRV(argc, (const char **)argv);
        }
        else
        {
            cuda_device = findCudaDeviceDRV(argc, (const char **)argv);
        }

        if (cuda_device < 0)
        {
            printf("No CUDA Capable devices found, exiting...\n");
            exit(EXIT_SUCCESS);
        }

        checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device));
    }
    else
    {
        // If we want to use Graphics Interop, then choose the GPU that is capable
        if (bUseInterop)
        {
            cuda_device = gpuGetMaxGflopsGLDeviceIdDRV();
            checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device));
        }
        else
        {
            cuda_device = gpuGetMaxGflopsDeviceIdDRV();
            checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device));
        }
    }

    // get compute capabilities and the devicename
    int major, minor;
    size_t totalGlobalMem;
    char deviceName[256];
    checkCudaErrors(cuDeviceComputeCapability(&major, &minor, g_oDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, 256, g_oDevice));
    printf("> Using GPU Device %d: %s has SM %d.%d compute capability\n", cuda_device, deviceName, major, minor);

    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, g_oDevice));
    printf("  Total amount of global memory:     %4.4f MB\n", (float)totalGlobalMem/(1024*1024));

    // Create CUDA Device w/ D3D9 interop (if WDDM), otherwise CUDA w/o interop (if TCC)
    // (use CU_CTX_BLOCKING_SYNC for better CPU synchronization)
    if (bUseInterop)
    {
        checkCudaErrors(cuD3D9CtxCreate(&g_oContext, &g_oDevice, CU_CTX_BLOCKING_SYNC, g_pD3DDevice));
    }
    else
    {
        checkCudaErrors(cuCtxCreate(&g_oContext, CU_CTX_BLOCKING_SYNC, g_oDevice));
    }

    try
    {
        // Initialize the CUDA-related driver API module (32-bit or 64-bit PTX), depending on the platform we are running on
        if (sizeof(void *) == 4)
        {
            g_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_Win32.ptx", exec_path, 2, 2, 2);
        }
        else
        {
            g_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_x64.ptx", exec_path, 2, 2, 2);
        }
    }
    catch (char const *p_file)
    {
        // If the CUmoduleManager constructor fails to load the PTX file, it will throw an exception
        printf("\n>> CUmoduleManager::Exception!  %s not found!\n", p_file);
        printf(">> Please rebuild NV12ToARGB_drvapi.cu or re-install this sample.\n");
        return E_FAIL;
    }

    g_pCudaModule->GetCudaFunction("NV12ToARGB_drvapi",   &gfpNV12toARGB);
    g_pCudaModule->GetCudaFunction("Passthru_drvapi",     &gfpPassthru);

    /////////////////Change///////////////////////////
    // Now we create the CUDA resources and the CUDA decoder context
    initCudaVideo();

    if (bUseInterop)
    {
        initD3D9Surface(g_pVideoDecoder->targetWidth(),
                        g_pVideoDecoder->targetHeight());
    }
    else
    {
        checkCudaErrors(cuMemAlloc(&g_pInteropFrame[0], g_pVideoDecoder->targetWidth() * g_pVideoDecoder->targetHeight() * 2));
        checkCudaErrors(cuMemAlloc(&g_pInteropFrame[1], g_pVideoDecoder->targetWidth() * g_pVideoDecoder->targetHeight() * 2));
    }

    CUcontext cuCurrent = NULL;
    CUresult result = cuCtxPopCurrent(&cuCurrent);

    if (result != CUDA_SUCCESS)
    {
        printf("cuCtxPopCurrent: %d\n", result);
        assert(0);
    }

    /////////////////////////////////////////
    return ((g_pCudaModule && g_pVideoDecoder && (g_pImageDX || g_pInteropFrame[0])) ? S_OK : E_FAIL);
}
	HRESULT CudaVideoRender::initCudaResources(int bUseInterop, int bTCC)
	{
		HRESULT hr = S_OK;

		CUdevice cuda_device;
		{
			// If we want to use Graphics Interop, then choose the GPU that is capable
			if (bUseInterop) {
				cuda_device = cutilDrvGetMaxGflopsGraphicsDeviceId();
				cutilDrvSafeCallNoSync(cuDeviceGet(&m_cuDevice, cuda_device ));
			} else {
				cuda_device = cutilDrvGetMaxGflopsDeviceId();
				cutilDrvSafeCallNoSync(cuDeviceGet(&m_cuDevice, cuda_device ));
			}
		}

		// get compute capabilities and the devicename
		int major, minor;
		size_t totalGlobalMem;
		char deviceName[256];
		cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, m_cuDevice) );
		cutilDrvSafeCallNoSync( cuDeviceGetName(deviceName, 256, m_cuDevice) );
		printf("> Using GPU Device %d: %s has SM %d.%d compute capability\n", cuda_device, deviceName, major, minor);

		cutilDrvSafeCallNoSync( cuDeviceTotalMem(&totalGlobalMem, m_cuDevice) );
		printf("  Total amount of global memory:     %4.4f MB\n", (float)totalGlobalMem/(1024*1024) );

		// Create CUDA Device w/ D3D9 interop (if WDDM), otherwise CUDA w/o interop (if TCC)
		// (use CU_CTX_BLOCKING_SYNC for better CPU synchronization)
		if (bUseInterop) {
			cutilDrvSafeCallNoSync( cuD3D9CtxCreate(&m_cuContext, &m_cuDevice, CU_CTX_BLOCKING_SYNC, m_pRenderer9->getDevice()) );
		} else {
			cutilDrvSafeCallNoSync( cuCtxCreate(&m_cuContext, CU_CTX_BLOCKING_SYNC, m_cuDevice) );
		}

		// Initialize the CUDA-related driver API module (32-bit or 64-bit PTX), depending on the platform we are running on
		if (sizeof(void *) == 4) {
			m_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_Win32.ptx", "./", 2, 2, 2);
		} else {
			m_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_x64.ptx", "./", 2, 2, 2);
		}

		m_pCudaModule->GetCudaFunction("NV12ToARGB_drvapi",   &m_fpNV12toARGB);
		m_pCudaModule->GetCudaFunction("Passthru_drvapi",     &m_fpPassthru);

		/////////////////Change///////////////////////////
		// Now we create the CUDA resources and the CUDA decoder context
		initCudaVideo();

		if (bUseInterop) {
			//initD3D9Surface   ( m_pVideoDecoder->targetWidth(), 
			//					m_pVideoDecoder->targetHeight() );
		} else {
			cutilDrvSafeCallNoSync( cuMemAlloc(&m_pInteropFrame[0], m_pVideoDecoder->targetWidth() * m_pVideoDecoder->targetHeight() * 2) );
			cutilDrvSafeCallNoSync( cuMemAlloc(&m_pInteropFrame[1], m_pVideoDecoder->targetWidth() * m_pVideoDecoder->targetHeight() * 2) );
		}

		CUcontext cuCurrent = NULL;
		CUresult result = cuCtxPopCurrent(&cuCurrent);
		if (result != CUDA_SUCCESS) {
			printf("cuCtxPopCurrent: %d\n", result);
			assert(0);
		}

		/////////////////////////////////////////
		return ((m_pCudaModule && m_pVideoDecoder) ? S_OK : E_FAIL);
	}
Example #12
int main() {
	int i, devCount;
	CUdevice dev;
	CUdevprop prop;
	CUresult e;

	cuInit(0);
	cuDeviceGetCount(&devCount);
	for(i = 0; i < devCount; i++) {
		e = cuDeviceGet(&dev, i);
		if(e != CUDA_SUCCESS) {
			printf("cuDeviceGet(%d) failed\n", i);
			continue;
		}
		e = cuDeviceGetProperties(&prop, dev);
		if(e != CUDA_SUCCESS) {
			printf("Could not get device properties");
			continue;
		}

		printf("Card #%02d:\n", i);

		printf("\tName: ");
		{
			char buf[1024];
			e = cuDeviceGetName(buf, 1024, dev);
			checkFail(e) ||
			printf("%s", buf);
			printf("\n");
		}

		printf("\tCompute capability: ");
		{
			int major, minor;
			e = cuDeviceComputeCapability(&major, &minor, dev);
			checkFail(e) ||
			printf("%d.%d", major, minor);
			printf("\n");
		}

		printf("\tTotal memory: ");
		{
			size_t mem;
			e = cuDeviceTotalMem(&mem, dev);
			checkFail(e) ||
			printf("%lu bytes", mem);
			printf("\n");
		}

		printf("\tClock rate: ");
		{
			printf("%d kHz", prop.clockRate);
			printf("\n");
		}

		printf("\tGrid dimensions: ");
		{
			printf("%d x %d x %d", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
			printf("\n");
		}

		printf("\tThread dimensions: ");
		{
			printf("%d x %d x %d", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
			printf("\n");
		}

		printf("\tThreads per block: ");
		{
			printf("%d", prop.maxThreadsPerBlock);
			printf("\n");
		}

		printf("\tShared memory per block: ");
		{
			printf("%d bytes", prop.sharedMemPerBlock);
			printf("\n");
		}

		printf("\tConstant memory: ");
		{
			printf("%d bytes", prop.totalConstantMemory);
			printf("\n");
		}

		printf("\tWarp size: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
			checkFail(e) ||
			printf("%d", attr);
			printf("\n");
		}

		printf("\tNumber of multiprocessors: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
			checkFail(e) ||
			printf("%d", attr);
			printf("\n");
		}

		printf("\tIs integrated: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
			checkFail(e) ||
			printf("%s", attr!=0?"yes":"no");
			printf("\n");
		}

		printf("\tCan map host memory: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
			checkFail(e) ||
			printf("%s", attr!=0?"yes":"no");
			printf("\n");
		}

		printf("\tCan execute multiple kernels: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
			checkFail(e) ||
			printf("%s", attr!=0?"yes":"no");
			printf("\n");
		}

		printf("\tThreads per multiprocessor: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
			checkFail(e) ||
			printf("%d", attr);
			printf("\n");
		}

		printf("\tAsynchronous engines: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
			checkFail(e) ||
			printf("%d", attr);
			printf("\n");
		}

		printf("\tShares address space with host: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
			checkFail(e) ||
			printf("%s", attr!=0?"yes":"no");
			printf("\n");
		}

		printf("\tL2 cache: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
			checkFail(e) ||
			printf("%d bytes", attr);
			printf("\n");
		}
	}
}
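checkFail() in the last listing is also undefined; from the checkFail(e) || printf(...) idiom it must return non-zero on failure so that the printf is skipped. A minimal version might be:

#include <stdio.h>
#include <cuda.h>

// Returns non-zero on failure so that checkFail(e) || printf(...) only
// prints the queried value when the driver API call succeeded.
static int checkFail(CUresult e)
{
    if (e == CUDA_SUCCESS)
        return 0;
    printf("(error %d)", (int)e);
    return 1;
}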