/** * Basically the same functionality like cudaGetDeviceProperties looped over * cudaGetDeviceCount. Not sure why CUDA runtime is used directly. Faster? * Basically 95% boiler plate code * * Uses JNI to create an ArrayList[GpuDevice] object */ JNIEXPORT jobject JNICALL Java_org_trifort_rootbeer_runtime_CUDARuntime_loadGpuDevices ( JNIEnv * env, jobject this_ref ) { jclass gpu_device_class; jmethodID gpu_device_init ; jobject gpu_device ; jclass array_list_class = (*env)->FindClass ( env, "java/util/ArrayList" ); jmethodID array_list_init = (*env)->GetMethodID( env, array_list_class, "<init>", "()V" ); jmethodID array_list_add = (*env)->GetMethodID( env, array_list_class, "add", "(Ljava/lang/Object;)Z" ); jobject ret = (*env)->NewObject ( env, array_list_class, array_list_init ); gpu_device_class = (*env)->FindClass(env, "org/trifort/rootbeer/runtime/GpuDevice"); gpu_device_init = (*env)->GetStaticMethodID(env, gpu_device_class, "newCudaDevice", "(IIILjava/lang/String;JJIIIIIIIIZIIIIIIII)Lorg/trifort/rootbeer/runtime/GpuDevice;"); /* ^ function signature for constructor arguments */ int status = cuInit(0); if ( status != CUDA_SUCCESS ) return ret; int nDevices = 0; cuDeviceGetCount( &nDevices ); int iDevice = 0; for ( iDevice = 0; iDevice < nDevices; ++iDevice ) { CUdevice device; status = cuDeviceGet( &device, iDevice ); if ( status != CUDA_SUCCESS ) continue; int major_version ; int minor_version ; char device_name[4096]; size_t total_mem ; CE( cuDeviceComputeCapability( &major_version, &minor_version, device ) ); CE( cuDeviceGetName( device_name, 4096, device ) ); CE( cuDeviceTotalMem( &total_mem, device ) ); /* cuCtxCreate and Destroy are VERY expensive (~0.5s) and would only be necessary for free mem */ // CE( cuCtxCreate ( &context, CU_CTX_MAP_HOST, device ) ); // CE( cuMemGetInfo( &free_mem, &total_mem ) ); // CE( cuCtxDestroy( context ) ); /* makes use of https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html * to be able to write the constructor call, variable declaration and * call to cuDeviceGetAttribute in one go using a macro. * Also seems to work with -std=c99 switch: * f( * ({ int j = 5; j+1; }) * ); */ #define CUATTR( NAME ) \ ( { \ int NAME; \ CE( cuDeviceGetAttribute( &NAME, \ CU_DEVICE_ATTRIBUTE_##NAME, \ device ) ) \ NAME; \ } ) /* @see GpuDevice.java */ gpu_device = (*env)->CallObjectMethod ( env , gpu_device_class , gpu_device_init , iDevice , // device_id major_version , // major_version minor_version , // minor_version (*env)->NewStringUTF(env, device_name) , // device_name -1 , // free_global_mem_size (jlong) total_mem , // total_global_mem_size CUATTR( MAX_REGISTERS_PER_BLOCK ), // max_registers_per_block CUATTR( WARP_SIZE ), // warp_size CUATTR( MAX_PITCH ), // max_pitch CUATTR( MAX_THREADS_PER_BLOCK ), // max_threads_per_block CUATTR( MAX_SHARED_MEMORY_PER_BLOCK ), // max_shared_memory_per_block CUATTR( CLOCK_RATE ), // clock_rate CUATTR( MEMORY_CLOCK_RATE ), // memory_clock_rate CUATTR( TOTAL_CONSTANT_MEMORY ), // constant_mem_size CUATTR( INTEGRATED ), // integrated CUATTR( MAX_THREADS_PER_MULTIPROCESSOR ), // max_threads_per_multiprocessor CUATTR( MULTIPROCESSOR_COUNT ), // multiprocessor_count CUATTR( MAX_BLOCK_DIM_X ), // max_block_dim_x CUATTR( MAX_BLOCK_DIM_Y ), // max_block_dim_y CUATTR( MAX_BLOCK_DIM_Z ), // max_block_dim_z CUATTR( MAX_GRID_DIM_X ), // max_grid_dim_x CUATTR( MAX_GRID_DIM_Y ), // max_grid_dim_y CUATTR( MAX_GRID_DIM_Z ) // max_grid_dim_z ); #undef CUATTR (*env)->CallBooleanMethod( env, ret, array_list_add, gpu_device ); } return ret; }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { CUdevice dev; int major = 0, minor = 0; int deviceCount = 0; char deviceName[256]; // note your project will need to link with cuda.lib files on windows printf("CUDA Device Query (Driver API) statically linked version \n"); CUresult err = cuInit(0); CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount)); // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) { printf("There is no device supporting CUDA\n"); } for (dev = 0; dev < deviceCount; ++dev) { CU_SAFE_CALL_NO_SYNC( cuDeviceComputeCapability(&major, &minor, dev) ); if (dev == 0) { // This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present if (major == 9999 && minor == 9999) printf("There is no device supporting CUDA.\n"); else if (deviceCount == 1) printf("There is 1 device supporting CUDA\n"); else printf("There are %d devices supporting CUDA\n", deviceCount); } CU_SAFE_CALL_NO_SYNC( cuDeviceGetName(deviceName, 256, dev) ); printf("\nDevice %d: \"%s\"\n", dev, deviceName); #if CUDA_VERSION >= 2020 int driverVersion = 0; cuDriverGetVersion(&driverVersion); printf(" CUDA Driver Version: %d.%d\n", driverVersion/1000, driverVersion%100); #endif shrLog(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor); size_t totalGlobalMem; CU_SAFE_CALL_NO_SYNC( cuDeviceTotalMem(&totalGlobalMem, dev) ); printf(" Total amount of global memory: %llu bytes\n", (unsigned long long)totalGlobalMem); #if CUDA_VERSION >= 2000 int multiProcessorCount; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev ) ); shrLog(" Multiprocessors x Cores/MP = Cores: %d (MP) x %d (Cores/MP) = %d (Cores)\n", multiProcessorCount, ConvertSMVer2Cores(major, minor), ConvertSMVer2Cores(major, minor) * multiProcessorCount); #endif int totalConstantMemory; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev ) ); printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory); int sharedMemPerBlock; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev ) ); printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock); int regsPerBlock; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( ®sPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev ) ); printf(" Total number of registers available per block: %d\n", regsPerBlock); int warpSize; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev ) ); printf(" Warp size: %d\n", warpSize); int maxThreadsPerBlock; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev ) ); printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock); int blockDim[3]; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev ) ); CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev ) ); CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev ) ); printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", blockDim[0], blockDim[1], blockDim[2]); int gridDim[3]; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev ) ); CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev ) ); CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev ) ); printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", gridDim[0], gridDim[1], gridDim[2]); int memPitch; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev ) ); printf(" Maximum memory pitch: %u bytes\n", memPitch); int textureAlign; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev ) ); printf(" Texture alignment: %u bytes\n", textureAlign); int clockRate; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev ) ); printf(" Clock rate: %.2f GHz\n", clockRate * 1e-6f); #if CUDA_VERSION >= 2000 int gpuOverlap; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev ) ); printf(" Concurrent copy and execution: %s\n",gpuOverlap ? "Yes" : "No"); #endif #if CUDA_VERSION >= 2020 int kernelExecTimeoutEnabled; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev ) ); printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No"); int integrated; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev ) ); printf(" Integrated: %s\n", integrated ? "Yes" : "No"); int canMapHostMemory; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev ) ); printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No"); #endif #if CUDA_VERSION >= 3000 int concurrentKernels; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev ) ); printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No"); int eccEnabled; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev ) ); printf(" Device has ECC support enabled: %s\n", eccEnabled ? "Yes" : "No"); #endif #if CUDA_VERSION >= 3020 int tccDriver ; CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &tccDriver , CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev ) ); printf(" Device is using TCC driver mode: %s\n", tccDriver ? "Yes" : "No"); #endif } printf("\nPASSED\n"); CUT_EXIT(argc, argv); }
static void print_cuda_devices(int num_devices) { CUdevice dev; CUresult rc; char dev_name[512]; size_t dev_total_mem; int dev_mem_clk; int dev_mem_width; int dev_mpu_nums; int dev_mpu_clk; int dev_nregs_mpu; int dev_nregs_blk; int dev_l2_sz; int dev_cap_major; int dev_cap_minor; int i, j; struct { int attr; int *dptr; } catalog[] = { { CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, &dev_mem_clk }, { CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, &dev_mem_width }, { CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, &dev_mpu_nums }, { CU_DEVICE_ATTRIBUTE_CLOCK_RATE, &dev_mpu_clk }, { CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,&dev_nregs_mpu}, { CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, &dev_nregs_blk }, { CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, &dev_l2_sz }, { CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, &dev_cap_major }, { CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, &dev_cap_minor }, }; for (i=0; i < num_devices; i++) { int cores_per_mpu = -1; rc = cuDeviceGet(&dev, i); if (rc != CUDA_SUCCESS) cuda_error(rc, "cuDeviceGet"); rc = cuDeviceGetName(dev_name, sizeof(dev_name), dev); if (rc != CUDA_SUCCESS) cuda_error(rc, "cuDeviceGetName"); rc = cuDeviceTotalMem(&dev_total_mem, dev); if (rc != CUDA_SUCCESS) cuda_error(rc, "cuDeviceTotalMem"); for (j=0; j < lengthof(catalog); j++) { rc = cuDeviceGetAttribute(catalog[j].dptr, catalog[j].attr, dev); if (rc != CUDA_SUCCESS) cuda_error(rc, "cuDeviceGetAttribute"); } /* Number of CUDA cores */ if (dev_cap_major == 1) cores_per_mpu = 8; else if (dev_cap_major == 2) { if (dev_cap_minor == 0) cores_per_mpu = 32; else if (dev_cap_minor == 1) cores_per_mpu = 48; else cores_per_mpu = -1; } else if (dev_cap_major == 3) cores_per_mpu = 192; else if (dev_cap_major == 5) cores_per_mpu = 128; printf("GPU%d - %s (capability: %d.%d), %d %s %s," " L2 %s, RAM %s (%dbits, %s), Regs=%d/%d\n", i, dev_name, dev_cap_major, dev_cap_minor, (cores_per_mpu > 0 ? cores_per_mpu : 1) * dev_mpu_nums, (cores_per_mpu > 0 ? "CUDA cores" : "SMs"), format_clock(dev_mpu_clk), format_bytesz(dev_l2_sz), format_bytesz(dev_total_mem), dev_mem_width, format_clock((size_t)dev_mem_clk * 1000), dev_nregs_blk, dev_nregs_mpu); } }
void test_driver_api() { CUdevice cuDevice; CUcontext cuContext; CUmodule cuModule; size_t totalGlobalMem; CUfunction matrixMult = 0; // cuda driver api intialization { int major = 0, minor = 0; char deviceName[100]; cuda::Check::CUDAError(cuInit(0), "Error intializing cuda"); int deviceCount; cuda::Check::CUDAError(cuDeviceGetCount(&deviceCount), "Error getting the number of devices"); if (deviceCount <= 0) { std::cerr << "No devices found" << std::endl; return; } cuDeviceGet(&cuDevice, 0); // get compute capabilities and the devicename cuda::Check::CUDAError(cuDeviceComputeCapability(&major, &minor, cuDevice), "Error getting Device compute capability"); cuda::Check::CUDAError(cuDeviceGetName(deviceName, 256, cuDevice), "Error getting device name"); std::cout << "> GPU Device has SM " << major << "." << minor << " compute capability" << std::endl; cuda::Check::CUDAError(cuDeviceTotalMem(&totalGlobalMem, cuDevice), "Error getting totat global memory"); std::cout << " Total amount of global memory: " << (unsigned long long)totalGlobalMem << " bytes" << std::endl; std::string tmp = (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L) ? "YES" : "NO"; std::cout << " 64-bit Memory Address: " << tmp << std::endl; cuda::Check::CUDAError(cuCtxCreate(&cuContext, 0, cuDevice), "Error creating the context"); } // Compile and get the function { std::string module_path = "MatrixMult.cubin"; std::cout << "> initCUDA loading module: " << module_path << std::endl; cuda::Check::CUDAError(cuModuleLoad(&cuModule, module_path.c_str()), "Error loading module"); cuda::Check::CUDAError(cuModuleGetFunction(&matrixMult, cuModule, "MatrixMultKernelSimpleDriverAPI"), "Error retrieving the function"); } // Call the kernel { int WIDTH = BLOCK_SIZE; int HEIGHT = BLOCK_SIZE; std::stringstream text; text << "CUDA Matrix Multiplication (" << WIDTH << "x" << WIDTH << ") Simple method Multiplication time"; HostMatrix<float> M(WIDTH, HEIGHT); M.fillWithRandomData(); //M.print(std::cout); HostMatrix<float> N(WIDTH, HEIGHT); N.fill_diagonal(2); //N.print(std::cout); HostMatrix<float> C(WIDTH, HEIGHT); { ScopedTimer t(text.str()); // allocate device memory CUdeviceptr d_M; cuda::Check::CUDAError(cuMemAlloc(&d_M, M.sizeInBytes()), "Error allocating memory"); CUdeviceptr d_N; cuda::Check::CUDAError(cuMemAlloc(&d_N, N.sizeInBytes()), "Error allocating memory"); // copy host memory to device cuda::Check::CUDAError(cuMemcpyHtoD(d_M, M, M.sizeInBytes()), "Error uploading memory to device"); cuda::Check::CUDAError(cuMemcpyHtoD(d_N, N, N.sizeInBytes()), "Error uploading memory to device"); // allocate device memory for result CUdeviceptr d_C; cuda::Check::CUDAError(cuMemAlloc(&d_C, C.sizeInBytes()), "Error allocating memory"); dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1); dim3 grid(C.width_ / BLOCK_SIZE, C.height_ / BLOCK_SIZE, 1); void *args[6] = { &d_M, &d_N, &d_C, &WIDTH, &WIDTH, &WIDTH}; // new CUDA 4.0 Driver API Kernel launch call cuda::Check::CUDAError(cuLaunchKernel( matrixMult, // Selected kernel function grid.x, grid.y, grid.z, // grid config block.x, block.y, block.z, // block config 2 * BLOCK_SIZE*BLOCK_SIZE*sizeof(float), NULL, args, NULL), "Error executing Kernel"); cuda::Check::CUDAError(cuMemcpyDtoH((void *)C, d_C, C.sizeInBytes()),"Error downloading memory to host"); } C.print(std::cout); } cuCtxDestroy(cuContext); }
void CudaModule::printDeviceInfo(CUdevice device) { static const struct { CUdevice_attribute attrib; const char* name; } attribs[] = { #define A21(ENUM, NAME) { CU_DEVICE_ATTRIBUTE_ ## ENUM, NAME }, #if (CUDA_VERSION >= 4000) # define A40(ENUM, NAME) A21(ENUM, NAME) #else # define A40(ENUM, NAME) // TODO: Some of these may exist in earlier versions, too. #endif A21(CLOCK_RATE, "Clock rate") A40(MEMORY_CLOCK_RATE, "Memory clock rate") A21(MULTIPROCESSOR_COUNT, "Number of SMs") // A40(GLOBAL_MEMORY_BUS_WIDTH, "DRAM bus width") // A40(L2_CACHE_SIZE, "L2 cache size") A21(MAX_THREADS_PER_BLOCK, "Max threads per block") A40(MAX_THREADS_PER_MULTIPROCESSOR, "Max threads per SM") A21(REGISTERS_PER_BLOCK, "Registers per block") // A40(MAX_REGISTERS_PER_BLOCK, "Max registers per block") A21(SHARED_MEMORY_PER_BLOCK, "Shared mem per block") // A40(MAX_SHARED_MEMORY_PER_BLOCK, "Max shared mem per block") A21(TOTAL_CONSTANT_MEMORY, "Constant memory") // A21(WARP_SIZE, "Warp size") A21(MAX_BLOCK_DIM_X, "Max blockDim.x") // A21(MAX_BLOCK_DIM_Y, "Max blockDim.y") // A21(MAX_BLOCK_DIM_Z, "Max blockDim.z") A21(MAX_GRID_DIM_X, "Max gridDim.x") // A21(MAX_GRID_DIM_Y, "Max gridDim.y") // A21(MAX_GRID_DIM_Z, "Max gridDim.z") // A40(MAXIMUM_TEXTURE1D_WIDTH, "Max tex1D.x") // A40(MAXIMUM_TEXTURE2D_WIDTH, "Max tex2D.x") // A40(MAXIMUM_TEXTURE2D_HEIGHT, "Max tex2D.y") // A40(MAXIMUM_TEXTURE3D_WIDTH, "Max tex3D.x") // A40(MAXIMUM_TEXTURE3D_HEIGHT, "Max tex3D.y") // A40(MAXIMUM_TEXTURE3D_DEPTH, "Max tex3D.z") // A40(MAXIMUM_TEXTURE1D_LAYERED_WIDTH, "Max layerTex1D.x") // A40(MAXIMUM_TEXTURE1D_LAYERED_LAYERS, "Max layerTex1D.y") // A40(MAXIMUM_TEXTURE2D_LAYERED_WIDTH, "Max layerTex2D.x") // A40(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, "Max layerTex2D.y") // A40(MAXIMUM_TEXTURE2D_LAYERED_LAYERS, "Max layerTex2D.z") // A40(MAXIMUM_TEXTURE2D_ARRAY_WIDTH, "Max array.x") // A40(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT, "Max array.y") // A40(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES, "Max array.z") // A21(MAX_PITCH, "Max memcopy pitch") // A21(TEXTURE_ALIGNMENT, "Texture alignment") // A40(SURFACE_ALIGNMENT, "Surface alignment") A40(CONCURRENT_KERNELS, "Concurrent launches supported") A21(GPU_OVERLAP, "Concurrent memcopy supported") A40(ASYNC_ENGINE_COUNT, "Max concurrent memcopies") // A40(KERNEL_EXEC_TIMEOUT, "Kernel launch time limited") // A40(INTEGRATED, "Integrated with host memory") A40(UNIFIED_ADDRESSING, "Unified addressing supported") A40(CAN_MAP_HOST_MEMORY, "Can map host memory") A40(ECC_ENABLED, "ECC enabled") // A40(TCC_DRIVER, "Driver is TCC") // A40(COMPUTE_MODE, "Compute exclusivity mode") // A40(PCI_BUS_ID, "PCI bus ID") // A40(PCI_DEVICE_ID, "PCI device ID") // A40(PCI_DOMAIN_ID, "PCI domain ID") #undef A21 #undef A40 }; char name[256]; int major; int minor; size_t memory; checkError("cuDeviceGetName", cuDeviceGetName(name, FW_ARRAY_SIZE(name) - 1, device)); checkError("cuDeviceComputeCapability", cuDeviceComputeCapability(&major, &minor, device)); checkError("cuDeviceTotalMem", cuDeviceTotalMem(&memory, device)); name[FW_ARRAY_SIZE(name) - 1] = '\0'; printf("\n"); char deviceIdStr[16]; sprintf( deviceIdStr, "CUDA device %d", device); printf("%-32s%s\n",deviceIdStr, name); printf("%-32s%s\n", "---", "---"); int version = getDriverVersion(); printf("%-32s%d.%d\n", "CUDA driver API version", version/10, version%10); printf("%-32s%d.%d\n", "Compute capability", major, minor); printf("%-32s%.0f megs\n", "Total memory", (F32)memory * exp2(-20)); for (int i = 0; i < (int)FW_ARRAY_SIZE(attribs); i++) { int value; if (cuDeviceGetAttribute(&value, attribs[i].attrib, device) == CUDA_SUCCESS) printf("%-32s%d\n", attribs[i].name, value); } printf("\n"); }
void CuDevice::GetAttributes() { cuDeviceComputeCapability(&_capability.first, &_capability.second, _h); char name[256]; cuDeviceGetName(name, 255, _h); _deviceName = name; cuDeviceTotalMem(&_totalMem, _h); CuDeviceAttr& a = _attributes; cuDeviceGetAttribute(&a.threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, _h); cuDeviceGetAttribute(&a.blockDim.x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, _h); cuDeviceGetAttribute(&a.blockDim.y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, _h); cuDeviceGetAttribute(&a.blockDim.z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, _h); cuDeviceGetAttribute(&a.gridDim.x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, _h); cuDeviceGetAttribute(&a.gridDim.y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, _h); cuDeviceGetAttribute(&a.gridDim.z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, _h); cuDeviceGetAttribute(&a.sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, _h); cuDeviceGetAttribute(&a.totalConstantMem, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, _h); cuDeviceGetAttribute(&a.warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, _h); cuDeviceGetAttribute(&a.maxPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, _h); cuDeviceGetAttribute(&a.regPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, _h); cuDeviceGetAttribute(&a.clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, _h); cuDeviceGetAttribute(&a.textureAlignment, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, _h); cuDeviceGetAttribute(&a.gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, _h); cuDeviceGetAttribute(&a.multiprocessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, _h); cuDeviceGetAttribute(&a.kernelExecTimeout, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, _h); cuDeviceGetAttribute(&a.integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, _h); cuDeviceGetAttribute(&a.canMapHostMem, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, _h); cuDeviceGetAttribute((int*)&a.computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, _h); cuDeviceGetAttribute(&a.tex1DSize, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, _h); cuDeviceGetAttribute(&a.tex2DSize.x, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, _h); cuDeviceGetAttribute(&a.tex2DSize.y, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, _h); cuDeviceGetAttribute(&a.tex3DSize.x, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, _h); cuDeviceGetAttribute(&a.tex3DSize.y, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, _h); cuDeviceGetAttribute(&a.tex3DSize.z, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, _h); cuDeviceGetAttribute(&a.tex2DArraySize.x, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH, _h); cuDeviceGetAttribute(&a.tex2DArraySize.y, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT, _h); cuDeviceGetAttribute(&a.tex2DArraySize.z, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES, _h); cuDeviceGetAttribute(&a.surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, _h); cuDeviceGetAttribute(&a.concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, _h); cuDeviceGetAttribute(&a.eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, _h); cuDeviceGetAttribute(&a.pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, _h); cuDeviceGetAttribute(&a.pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, _h); cuDeviceGetAttribute(&a.tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, _h); cuDeviceGetAttribute(&a.memoryClockRate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, _h); cuDeviceGetAttribute(&a.globalMemoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, _h); cuDeviceGetAttribute(&a.l2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, _h); cuDeviceGetAttribute(&a.maxThreadsPerSM, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, _h); }
//////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { CUdevice dev; int major = 0, minor = 0; int deviceCount = 0; char deviceName[256]; shrQAStart(argc, argv); // note your project will need to link with cuda.lib files on windows printf("CUDA Device Query (Driver API) statically linked version \n"); CUresult error_id = cuInit(0); if (error_id != CUDA_SUCCESS) { printf("cuInit(0) returned %d\n-> %s\n", error_id, getCudaDrvErrorString(error_id)); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } error_id = cuDeviceGetCount(&deviceCount); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceGetCount returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) printf("There are no available device(s) that support CUDA\n"); else if (deviceCount == 1) printf("There is 1 device supporting CUDA\n"); else printf("There are %d devices supporting CUDA\n", deviceCount); for (dev = 0; dev < deviceCount; ++dev) { error_id = cuDeviceComputeCapability(&major, &minor, dev); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceComputeCapability returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } error_id = cuDeviceGetName(deviceName, 256, dev); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceGetName returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } printf("\nDevice %d: \"%s\"\n", dev, deviceName); #if CUDA_VERSION >= 2020 int driverVersion = 0; cuDriverGetVersion(&driverVersion); printf(" CUDA Driver Version: %d.%d\n", driverVersion/1000, (driverVersion%100)/10); #endif shrLog(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor); size_t totalGlobalMem; error_id = cuDeviceTotalMem(&totalGlobalMem, dev); if (error_id != CUDA_SUCCESS) { shrLog( "cuDeviceTotalMem returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id) ); shrQAFinishExit(argc, (const char **)argv, QA_FAILED); } char msg[256]; sprintf(msg, " Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)totalGlobalMem/1048576.0f, (unsigned long long) totalGlobalMem); shrLog(msg); #if CUDA_VERSION >= 2000 int multiProcessorCount; getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); shrLog(" (%2d) Multiprocessors x (%3d) CUDA Cores/MP: %d CUDA Cores\n", multiProcessorCount, ConvertSMVer2Cores(major, minor), ConvertSMVer2Cores(major, minor) * multiProcessorCount); #endif int clockRate; getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); printf(" GPU Clock rate: %.0f MHz (%0.2f GHz)\n", clockRate * 1e-3f, clockRate * 1e-6f); #if CUDA_VERSION >= 4000 int memoryClock; getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); shrLog(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f); int memBusWidth; getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev ); shrLog(" Memory Bus Width: %d-bit\n", memBusWidth); int L2CacheSize; getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev ); if (L2CacheSize) { shrLog(" L2 Cache Size: %d bytes\n", L2CacheSize); } int maxTex1D, maxTex2D[2], maxTex3D[3]; getCudaAttribute<int>( &maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev ); getCudaAttribute<int>( &maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev ); getCudaAttribute<int>( &maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev ); getCudaAttribute<int>( &maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev ); getCudaAttribute<int>( &maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev ); getCudaAttribute<int>( &maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev ); shrLog(" Max Texture Dimension Sizes 1D=(%d) 2D=(%d,%d) 3D=(%d,%d,%d)\n", maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1], maxTex3D[2]); int maxTex2DLayered[3]; getCudaAttribute<int>( &maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev ); getCudaAttribute<int>( &maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev ); getCudaAttribute<int>( &maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev ); shrLog(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", maxTex2DLayered[0], maxTex2DLayered[2], maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]); #endif int totalConstantMemory; getCudaAttribute<int>( &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev ); printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory); int sharedMemPerBlock; getCudaAttribute<int>( &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev ); printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock); int regsPerBlock; getCudaAttribute<int>( ®sPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev ); printf(" Total number of registers available per block: %d\n", regsPerBlock); int warpSize; getCudaAttribute<int>( &warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev ); printf(" Warp size: %d\n", warpSize); int maxThreadsPerMultiProcessor; getCudaAttribute<int>( &maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev ); printf(" Maximum number of threads per multiprocessor: %d\n", maxThreadsPerMultiProcessor); int maxThreadsPerBlock; getCudaAttribute<int>( &maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev ); printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock); int blockDim[3]; getCudaAttribute<int>( &blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev ); getCudaAttribute<int>( &blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev ); getCudaAttribute<int>( &blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev ); printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", blockDim[0], blockDim[1], blockDim[2]); int gridDim[3]; getCudaAttribute<int>( &gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev ); getCudaAttribute<int>( &gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev ); getCudaAttribute<int>( &gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev ); printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", gridDim[0], gridDim[1], gridDim[2]); int textureAlign; getCudaAttribute<int>( &textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev ); printf(" Texture alignment: %u bytes\n", textureAlign); int memPitch; getCudaAttribute<int>( &memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev ); printf(" Maximum memory pitch: %u bytes\n", memPitch); #if CUDA_VERSION >= 2000 int gpuOverlap; getCudaAttribute<int>( &gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev ); #endif #if CUDA_VERSION >= 4000 int asyncEngineCount; getCudaAttribute<int>( &asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev ); printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (gpuOverlap ? "Yes" : "No"), asyncEngineCount); #else printf(" Concurrent copy and execution: %s\n",gpuOverlap ? "Yes" : "No"); #endif #if CUDA_VERSION >= 2020 int kernelExecTimeoutEnabled; getCudaAttribute<int>( &kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev ); printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No"); int integrated; getCudaAttribute<int>( &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev ); printf(" Integrated GPU sharing Host Memory: %s\n", integrated ? "Yes" : "No"); int canMapHostMemory; getCudaAttribute<int>( &canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev ); printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No"); #endif #if CUDA_VERSION >= 3000 int concurrentKernels; getCudaAttribute<int>( &concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev ); printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No"); int surfaceAlignment; getCudaAttribute<int>( &surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev ); printf(" Alignment requirement for Surfaces: %s\n", surfaceAlignment ? "Yes" : "No"); int eccEnabled; getCudaAttribute<int>( &eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev ); printf(" Device has ECC support enabled: %s\n", eccEnabled ? "Yes" : "No"); #endif #if CUDA_VERSION >= 3020 int tccDriver ; getCudaAttribute<int>( &tccDriver , CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev ); printf(" Device is using TCC driver mode: %s\n", tccDriver ? "Yes" : "No"); #endif #if CUDA_VERSION >= 4000 int unifiedAddressing; getCudaAttribute<int>( &unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev ); printf(" Device supports Unified Addressing (UVA): %s\n", unifiedAddressing ? "Yes" : "No"); int pciBusID, pciDeviceID; getCudaAttribute<int>( &pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev ); getCudaAttribute<int>( &pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev ); printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", pciBusID, pciDeviceID ); const char *sComputeMode[] = { "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", "Prohibited (no host thread can use ::cudaSetDevice() with this device)", "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", "Unknown", NULL }; int computeMode; getCudaAttribute<int>( &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev ); printf(" Compute Mode:\n"); printf(" < %s >\n", sComputeMode[computeMode]); #endif } shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
int VideoEncoder::DisplayGPUCaps(int deviceOrdinal, NVEncoderParams *pParams, bool bDisplay) { NVVE_GPUAttributes GPUAttributes = {0}; HRESULT hr = S_OK; int gpuPerformance; assert(pParams != NULL); GPUAttributes.iGpuOrdinal = deviceOrdinal; hr = GetParamValue(NVVE_GET_GPU_ATTRIBUTES, &GPUAttributes); if (hr!=S_OK) { printf(" >> NVVE_GET_GPU_ATTRIBUTES error! <<\n\n"); } gpuPerformance = GPUAttributes.iClockRate * GPUAttributes.iMultiProcessorCount; gpuPerformance = gpuPerformance * _ConvertSMVer2Cores(GPUAttributes.iMajor, GPUAttributes.iMinor); size_t totalGlobalMem; CUresult error_id = cuDeviceTotalMem(&totalGlobalMem, deviceOrdinal); if (error_id != CUDA_SUCCESS) { printf("cuDeviceTotalMem returned %d\n-> %s\n", (int)error_id, getCudaDrvErrorString(error_id)); return -1; } if (bDisplay) { printf(" GPU Device %d (SM %d.%d) : %s\n", GPUAttributes.iGpuOrdinal, GPUAttributes.iMajor, GPUAttributes.iMinor, GPUAttributes.cName); printf(" Total Memory = %4.0f MBytes\n" , ceil((float)totalGlobalMem/1048576.0f)); printf(" GPU Clock = %4.2f MHz\n" , (float)GPUAttributes.iClockRate/1000.f); printf(" MultiProcessors/Cores = %d MPs (%d Cores)\n", GPUAttributes.iMultiProcessorCount, GPUAttributes.iMultiProcessorCount*_ConvertSMVer2Cores(GPUAttributes.iMajor, GPUAttributes.iMinor)); printf(" Maximum Offload Mode = "); switch (GPUAttributes.MaxGpuOffloadLevel) { case NVVE_GPU_OFFLOAD_DEFAULT: printf("CPU: PEL Processing Only\n"); break; case NVVE_GPU_OFFLOAD_ESTIMATORS: printf("GPU: Motion Estimation & Intra Prediction\n"); break; case NVVE_GPU_OFFLOAD_ALL: printf("GPU: Full Offload\n"); break; } printf("\n"); } pParams->MaxOffloadLevel = GPUAttributes.MaxGpuOffloadLevel; return gpuPerformance; }
value spoc_getCudaDevice(value i) { CAMLparam1(i); CAMLlocal4(general_info, cuda_info, specific_info, gc_info); CAMLlocal3(device, maxT, maxG); int nb_devices; CUdevprop dev_infos; CUdevice dev; CUcontext ctx; CUstream queue[2]; spoc_cu_context *spoc_ctx; //CUcontext gl_ctx; char infoStr[1024]; int infoInt; size_t infoUInt; int major, minor; enum cudaError_enum cuda_error; cuDeviceGetCount (&nb_devices); if ((Int_val(i)) > nb_devices) raise_constant(*caml_named_value("no_cuda_device")) ; CUDA_CHECK_CALL(cuDeviceGet(&dev, Int_val(i))); CUDA_CHECK_CALL(cuDeviceGetProperties(&dev_infos, dev)); general_info = caml_alloc (9, 0); CUDA_CHECK_CALL(cuDeviceGetName(infoStr, sizeof(infoStr), dev)); Store_field(general_info,0, copy_string(infoStr));// CUDA_CHECK_CALL(cuDeviceTotalMem(&infoUInt, dev)); Store_field(general_info,1, Val_int(infoUInt));// Store_field(general_info,2, Val_int(dev_infos.sharedMemPerBlock));// Store_field(general_info,3, Val_int(dev_infos.clockRate));// Store_field(general_info,4, Val_int(dev_infos.totalConstantMemory));// CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)); Store_field(general_info,5, Val_int(infoInt));// CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev)); Store_field(general_info,6, Val_bool(infoInt));// Store_field(general_info,7, i); CUDA_CHECK_CALL(cuCtxCreate (&ctx, CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST, dev)); spoc_ctx = malloc(sizeof(spoc_cl_context)); spoc_ctx->ctx = ctx; CUDA_CHECK_CALL(cuStreamCreate(&queue[0], 0)); CUDA_CHECK_CALL(cuStreamCreate(&queue[1], 0)); spoc_ctx->queue[0] = queue[0]; spoc_ctx->queue[1] = queue[1]; Store_field(general_info,8, (value)spoc_ctx); CUDA_CHECK_CALL(cuCtxSetCurrent(ctx)); cuda_info = caml_alloc(1, 0); //0 -> Cuda specific_info = caml_alloc(18, 0); cuDeviceComputeCapability(&major, &minor, dev); Store_field(specific_info,0, Val_int(major));// Store_field(specific_info,1, Val_int(minor));// Store_field(specific_info,2, Val_int(dev_infos.regsPerBlock));// Store_field(specific_info,3, Val_int(dev_infos.SIMDWidth));// Store_field(specific_info,4, Val_int(dev_infos.memPitch));// Store_field(specific_info,5, Val_int(dev_infos.maxThreadsPerBlock));// maxT = caml_alloc(3, 0); Store_field(maxT,0, Val_int(dev_infos.maxThreadsDim[0]));// Store_field(maxT,1, Val_int(dev_infos.maxThreadsDim[1]));// Store_field(maxT,2, Val_int(dev_infos.maxThreadsDim[2]));// Store_field(specific_info,6, maxT); maxG = caml_alloc(3, 0); Store_field(maxG,0, Val_int(dev_infos.maxGridSize[0]));// Store_field(maxG,1, Val_int(dev_infos.maxGridSize[1]));// Store_field(maxG,2, Val_int(dev_infos.maxGridSize[2]));// Store_field(specific_info,7, maxG); Store_field(specific_info,8, Val_int(dev_infos.textureAlign));// cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev); Store_field(specific_info,9, Val_bool(infoInt));// cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev); Store_field(specific_info,10, Val_bool(infoInt));// cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); Store_field(specific_info,11, Val_bool(infoInt));// cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev); Store_field(specific_info,12, Val_bool(infoInt));// cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); Store_field(specific_info,13, Val_int(infoInt));// cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev); Store_field(specific_info,14, Val_bool(infoInt));// cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev); Store_field(specific_info,15, Val_int(infoInt)); cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev); Store_field(specific_info,16, Val_int(infoInt)); cuDriverGetVersion(&infoInt); Store_field(specific_info, 17, Val_int(infoInt)); Store_field(cuda_info, 0, specific_info); device = caml_alloc(4, 0); Store_field(device, 0, general_info); Store_field(device, 1, cuda_info); {spoc_cuda_gc_info* gcInfo = (spoc_cuda_gc_info*)malloc(sizeof(spoc_cuda_gc_info)); CUDA_CHECK_CALL(cuMemGetInfo(&infoUInt, NULL)); infoUInt -= (32*1024*1024); Store_field(device, 2, (value)gcInfo); {cuda_event_list* events = NULL; Store_field(device, 3, (value)events); CAMLreturn(device);}} }
HRESULT initCudaResources(int argc, char **argv, int bUseInterop, int bTCC) { HRESULT hr = S_OK; CUdevice cuda_device; if (checkCmdLineFlag(argc, (const char **)argv, "device")) { cuda_device = getCmdLineArgumentInt(argc, (const char **) argv, "device"); // If interop is disabled, then we need to create a CUDA context w/o the GL context if (bUseInterop && !bTCC) { cuda_device = findCudaDeviceDRV(argc, (const char **)argv); } else { cuda_device = findCudaGLDeviceDRV(argc, (const char **)argv); } if (cuda_device < 0) { printf("No CUDA Capable devices found, exiting...\n"); exit(EXIT_SUCCESS); } checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device)); } else { // If we want to use Graphics Interop, then choose the GPU that is capable if (bUseInterop) { cuda_device = gpuGetMaxGflopsGLDeviceIdDRV(); checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device)); } else { cuda_device = gpuGetMaxGflopsDeviceIdDRV(); checkCudaErrors(cuDeviceGet(&g_oDevice, cuda_device)); } } // get compute capabilities and the devicename int major, minor; size_t totalGlobalMem; char deviceName[256]; checkCudaErrors(cuDeviceComputeCapability(&major, &minor, g_oDevice)); checkCudaErrors(cuDeviceGetName(deviceName, 256, g_oDevice)); printf("> Using GPU Device %d: %s has SM %d.%d compute capability\n", cuda_device, deviceName, major, minor); checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, g_oDevice)); printf(" Total amount of global memory: %4.4f MB\n", (float)totalGlobalMem/(1024*1024)); // Create CUDA Device w/ D3D9 interop (if WDDM), otherwise CUDA w/o interop (if TCC) // (use CU_CTX_BLOCKING_SYNC for better CPU synchronization) if (bUseInterop) { checkCudaErrors(cuD3D9CtxCreate(&g_oContext, &g_oDevice, CU_CTX_BLOCKING_SYNC, g_pD3DDevice)); } else { checkCudaErrors(cuCtxCreate(&g_oContext, CU_CTX_BLOCKING_SYNC, g_oDevice)); } try { // Initialize CUDA releated Driver API (32-bit or 64-bit), depending the platform running if (sizeof(void *) == 4) { g_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_Win32.ptx", exec_path, 2, 2, 2); } else { g_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_x64.ptx", exec_path, 2, 2, 2); } } catch (char const *p_file) { // If the CUmoduleManager constructor fails to load the PTX file, it will throw an exception printf("\n>> CUmoduleManager::Exception! %s not found!\n", p_file); printf(">> Please rebuild NV12ToARGB_drvapi.cu or re-install this sample.\n"); return E_FAIL; } g_pCudaModule->GetCudaFunction("NV12ToARGB_drvapi", &gfpNV12toARGB); g_pCudaModule->GetCudaFunction("Passthru_drvapi", &gfpPassthru); /////////////////Change/////////////////////////// // Now we create the CUDA resources and the CUDA decoder context initCudaVideo(); if (bUseInterop) { initD3D9Surface(g_pVideoDecoder->targetWidth(), g_pVideoDecoder->targetHeight()); } else { checkCudaErrors(cuMemAlloc(&g_pInteropFrame[0], g_pVideoDecoder->targetWidth() * g_pVideoDecoder->targetHeight() * 2)); checkCudaErrors(cuMemAlloc(&g_pInteropFrame[1], g_pVideoDecoder->targetWidth() * g_pVideoDecoder->targetHeight() * 2)); } CUcontext cuCurrent = NULL; CUresult result = cuCtxPopCurrent(&cuCurrent); if (result != CUDA_SUCCESS) { printf("cuCtxPopCurrent: %d\n", result); assert(0); } ///////////////////////////////////////// return ((g_pCudaModule && g_pVideoDecoder && (g_pImageDX || g_pInteropFrame[0])) ? S_OK : E_FAIL); }
HRESULT CudaVideoRender::initCudaResources(int bUseInterop, int bTCC) { HRESULT hr = S_OK; CUdevice cuda_device; { // If we want to use Graphics Interop, then choose the GPU that is capable if (bUseInterop) { cuda_device = cutilDrvGetMaxGflopsGraphicsDeviceId(); cutilDrvSafeCallNoSync(cuDeviceGet(&m_cuDevice, cuda_device )); } else { cuda_device = cutilDrvGetMaxGflopsDeviceId(); cutilDrvSafeCallNoSync(cuDeviceGet(&m_cuDevice, cuda_device )); } } // get compute capabilities and the devicename int major, minor; size_t totalGlobalMem; char deviceName[256]; cutilDrvSafeCallNoSync( cuDeviceComputeCapability(&major, &minor, m_cuDevice) ); cutilDrvSafeCallNoSync( cuDeviceGetName(deviceName, 256, m_cuDevice) ); printf("> Using GPU Device %d: %s has SM %d.%d compute capability\n", cuda_device, deviceName, major, minor); cutilDrvSafeCallNoSync( cuDeviceTotalMem(&totalGlobalMem, m_cuDevice) ); printf(" Total amount of global memory: %4.4f MB\n", (float)totalGlobalMem/(1024*1024) ); // Create CUDA Device w/ D3D9 interop (if WDDM), otherwise CUDA w/o interop (if TCC) // (use CU_CTX_BLOCKING_SYNC for better CPU synchronization) if (bUseInterop) { cutilDrvSafeCallNoSync( cuD3D9CtxCreate(&m_cuContext, &m_cuDevice, CU_CTX_BLOCKING_SYNC, m_pRenderer9->getDevice()) ); } else { cutilDrvSafeCallNoSync( cuCtxCreate(&m_cuContext, CU_CTX_BLOCKING_SYNC, m_cuDevice) ); } // Initialize CUDA releated Driver API (32-bit or 64-bit), depending the platform running if (sizeof(void *) == 4) { m_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_Win32.ptx", "./", 2, 2, 2); } else { m_pCudaModule = new CUmoduleManager("NV12ToARGB_drvapi_x64.ptx", "./", 2, 2, 2); } m_pCudaModule->GetCudaFunction("NV12ToARGB_drvapi", &m_fpNV12toARGB); m_pCudaModule->GetCudaFunction("Passthru_drvapi", &m_fpPassthru); /////////////////Change/////////////////////////// // Now we create the CUDA resources and the CUDA decoder context initCudaVideo(); if (bUseInterop) { //initD3D9Surface ( m_pVideoDecoder->targetWidth(), // m_pVideoDecoder->targetHeight() ); } else { cutilDrvSafeCallNoSync( cuMemAlloc(&m_pInteropFrame[0], m_pVideoDecoder->targetWidth() * m_pVideoDecoder->targetHeight() * 2) ); cutilDrvSafeCallNoSync( cuMemAlloc(&m_pInteropFrame[1], m_pVideoDecoder->targetWidth() * m_pVideoDecoder->targetHeight() * 2) ); } CUcontext cuCurrent = NULL; CUresult result = cuCtxPopCurrent(&cuCurrent); if (result != CUDA_SUCCESS) { printf("cuCtxPopCurrent: %d\n", result); assert(0); } ///////////////////////////////////////// return ((m_pCudaModule && m_pVideoDecoder) ? S_OK : E_FAIL); }
int main() { int i, devCount; CUdevice dev; CUdevprop prop; CUresult e; cuInit(0); cuDeviceGetCount(&devCount); for(i = 0; i < devCount; i++) { e = cuDeviceGet(&dev, i); if(e != CUDA_SUCCESS) { printf("cuDeviceGet(%d) failed\n", i); continue; } e = cuDeviceGetProperties(&prop, dev); if(e != CUDA_SUCCESS) { printf("Could not get device properties"); continue; } printf("Card #%02d:\n", i); printf("\tName: "); { char buf[1024]; e = cuDeviceGetName(buf, 1024, dev); checkFail(e) || printf("%s", buf); printf("\n"); } printf("\tCompute capability: "); { int major, minor; e = cuDeviceComputeCapability(&major, &minor, dev); checkFail(e) || printf("%d.%d", major, minor); printf("\n"); } printf("\tTotal memory: "); { size_t mem; e = cuDeviceTotalMem(&mem, dev); checkFail(e) || printf("%lu bytes", mem); printf("\n"); } printf("\tClock rate: "); { printf("%d kHz", prop.clockRate); printf("\n"); } printf("\tGrid dimensions: "); { printf("%d x %d x %d", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); printf("\n"); } printf("\tThread dimensions: "); { printf("%d x %d x %d", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); printf("\n"); } printf("\tThreads per block: "); { printf("%d", prop.maxThreadsPerBlock); printf("\n"); } printf("\tShared memory per block: "); { printf("%d bytes", prop.sharedMemPerBlock); printf("\n"); } printf("\tConstant memory: "); { printf("%d bytes", prop.totalConstantMemory); printf("\n"); } printf("\tWarp size: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev); checkFail(e) || printf("%d", attr); printf("\n"); } printf("\tNumber of multiprocessors: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); checkFail(e) || printf("%d", attr); printf("\n"); } printf("\tIs integrated: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); checkFail(e) || printf("%s", attr!=0?"yes":"no"); printf("\n"); } printf("\tCan map host memory: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev); checkFail(e) || printf("%s", attr!=0?"yes":"no"); printf("\n"); } printf("\tCan execute multiple kernels: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev); checkFail(e) || printf("%s", attr!=0?"yes":"no"); printf("\n"); } printf("\tThreads per multiprocessor: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); checkFail(e) || printf("%d", attr); printf("\n"); } printf("\tAsynchronous engines: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); checkFail(e) || printf("%d", attr); printf("\n"); } printf("\tShares address space with host: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev); checkFail(e) || printf("%s", attr!=0?"yes":"no"); printf("\n"); } printf("\tL2 cache: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev); checkFail(e) || printf("%d bytes", attr); printf("\n"); } } }