/*===========================================================================*/
bool Device::create( const int ordinal )
{
    // Look up the driver-API device handle for the requested ordinal and
    // keep it in m_handler. Any error reported by the driver wrapper after
    // this call aborts the creation.
    KVS_CU_CALL( cuDeviceGet( &m_handler, ordinal ) );
    const bool handle_failed = kvs::cuda::DriverAPI::HasError();
    if ( handle_failed ) { return false; }

    // With a valid handle, fetch the device properties into m_property.
    // The result of this second query decides the return value:
    // true when no error was reported, false otherwise.
    KVS_CU_CALL( cuDeviceGetProperties( &m_property, m_handler ) );
    const bool properties_failed = kvs::cuda::DriverAPI::HasError();
    return !properties_failed;
}
/*
 * Prints a description of one CUDA driver-API device to stdout:
 * the device count, name, compute capability, total memory, the legacy
 * CUdevprop fields, and a fixed list of cuDeviceGetAttribute values.
 *
 * cuDevice: device handle to describe.
 *
 * Every driver call is checked. When a call fails, a "... failed" line with
 * the numeric CUresult is printed in place of the value, so no uninitialised
 * data is ever formatted. On success the output text matches the previous
 * implementation, except that total memory is now printed with a size_t
 * conversion instead of "%d".
 */
void print_GetProperties(CUdevice cuDevice)
{
    /* Attributes queried individually, paired with the label that is printed. */
    static const struct {
        CUdevice_attribute id;
        const char *label;
    } attributes[] = {
        { CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,   "CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK" },
        { CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,         "CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X" },
        { CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,         "CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y" },
        { CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,         "CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z" },
        { CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,          "CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X" },
        { CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y,          "CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y" },
        { CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z,          "CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z" },
        { CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK, "CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK" },
        { CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,   "CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY" },
        { CU_DEVICE_ATTRIBUTE_WARP_SIZE,               "CU_DEVICE_ATTRIBUTE_WARP_SIZE" },
        { CU_DEVICE_ATTRIBUTE_MAX_PITCH,               "CU_DEVICE_ATTRIBUTE_MAX_PITCH" },
        { CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK,     "CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK" },
        { CU_DEVICE_ATTRIBUTE_CLOCK_RATE,              "CU_DEVICE_ATTRIBUTE_CLOCK_RATE" },
        { CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,       "CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT" }
    };
    const size_t attribute_count = sizeof(attributes) / sizeof(attributes[0]);

    CUresult status;
    int count = 0;
    char dev_name[1024] = "";   /* stack buffer: no allocation that could fail or leak */
    int mj_v = 0, mn_v = 0;
    size_t byt_mem = 0;
    CUdevprop cp;
    size_t k;

    status = cuDeviceGetCount(&count);
    if (status == CUDA_SUCCESS)
        printf("cuDevice(%d)GetCount = %d\n", cuDevice, count);
    else
        printf("cuDeviceGetCount failed (CUresult %d)\n", (int)status);

    status = cuDeviceGetName(dev_name, (int)sizeof(dev_name), cuDevice);
    if (status == CUDA_SUCCESS)
        printf("cuda-devicename = %s\n", dev_name);
    else
        printf("cuDeviceGetName failed (CUresult %d)\n", (int)status);

    status = cuDeviceComputeCapability(&mj_v, &mn_v, cuDevice);
    if (status == CUDA_SUCCESS)
        printf("compute capability = mj:%d, mn:%d\n", mj_v, mn_v);
    else
        printf("cuDeviceComputeCapability failed (CUresult %d)\n", (int)status);

    status = cuDeviceTotalMem(&byt_mem, cuDevice);
    if (status == CUDA_SUCCESS)
        printf("total mem = %zu\n", byt_mem);   /* size_t needs %zu; %d was undefined behaviour */
    else
        printf("cuDeviceTotalMem failed (CUresult %d)\n", (int)status);

    status = cuDeviceGetProperties(&cp, cuDevice);
    if (status == CUDA_SUCCESS) {
        printf("Thd/blk = %d, thrdDim xyz = (%d, %d, %d:threads), GridSz xyz = (%d, %d, %d:blocks), shrdmem/blk = %d, constmem = %d bytes, simdwidth = %d, mempitch = %d, regsPerBlock = %d, clockRate = %d, textureAlign = %d \n",
               cp.maxThreadsPerBlock,
               cp.maxThreadsDim[0], cp.maxThreadsDim[1], cp.maxThreadsDim[2],
               cp.maxGridSize[0], cp.maxGridSize[1], cp.maxGridSize[2],
               cp.sharedMemPerBlock, cp.totalConstantMemory, cp.SIMDWidth,
               cp.memPitch, cp.regsPerBlock, cp.clockRate, cp.textureAlign);
    } else {
        printf("cuDeviceGetProperties failed (CUresult %d)\n", (int)status);
    }

    /* One line per attribute, in the same order as the table above. */
    for (k = 0; k < attribute_count; ++k) {
        int ip = 0;
        status = cuDeviceGetAttribute(&ip, attributes[k].id, cuDevice);
        if (status == CUDA_SUCCESS)
            printf("Attrib - %s = %d\n", attributes[k].label, ip);
        else
            printf("Attrib - %s failed (CUresult %d)\n", attributes[k].label, (int)status);
    }
}
/*
 * OCaml stub: builds the description block for CUDA device number `i`.
 *
 * i: OCaml integer, zero-based device ordinal.
 *
 * Returns a 4-field block:
 *   field 0  general_info (9 fields): name, total memory, shared memory per
 *            block, clock rate, constant memory, multiprocessor count,
 *            ECC flag, the ordinal `i`, and a malloc'ed spoc_cu_context
 *            pointer holding the new context and its two streams;
 *   field 1  cuda_info (1 field) wrapping specific_info (18 fields):
 *            compute capability, CUdevprop limits, assorted device
 *            attributes and the driver version;
 *   field 2  a malloc'ed spoc_cuda_gc_info pointer (contents are not
 *            initialised here);
 *   field 3  a NULL cuda_event_list pointer.
 *
 * Raises the OCaml exception registered as "no_cuda_device" when `i` is not
 * a valid ordinal (negative, or >= the number of devices; a failing
 * cuDeviceGetCount leaves the count at 0 and therefore also ends up here).
 * Raises Out_of_memory when a C allocation fails. Calls wrapped in
 * CUDA_CHECK_CALL are checked by that macro (defined outside this block).
 *
 * Side effects: creates a CUDA context on the device, creates two streams
 * in it and makes the context current.
 */
value spoc_getCudaDevice(value i)
{
	CAMLparam1(i);
	CAMLlocal4(general_info, cuda_info, specific_info, gc_info);
	CAMLlocal3(device, maxT, maxG);
	int nb_devices = 0;
	int index;
	CUdevprop dev_infos;
	CUdevice dev;
	CUcontext ctx;
	CUstream queue[2];
	spoc_cu_context *spoc_ctx;
	spoc_cuda_gc_info *gcInfo;
	cuda_event_list *events = NULL;
	char infoStr[1024];
	int infoInt = 0;
	size_t infoUInt = 0;
	int major = 0, minor = 0;
	/* NOTE(review): not referenced directly below; presumably assigned by
	 * the CUDA_CHECK_CALL macro - confirm before removing. */
	enum cudaError_enum cuda_error;

	/* Valid ordinals are 0 .. nb_devices-1. The previous test used '>' and
	 * let index == nb_devices (and negative indices) through. */
	cuDeviceGetCount(&nb_devices);
	index = Int_val(i);
	if (index < 0 || index >= nb_devices)
		raise_constant(*caml_named_value("no_cuda_device"));

	CUDA_CHECK_CALL(cuDeviceGet(&dev, index));
	CUDA_CHECK_CALL(cuDeviceGetProperties(&dev_infos, dev));

	/* ---- general_info ---------------------------------------------- */
	general_info = caml_alloc(9, 0);
	CUDA_CHECK_CALL(cuDeviceGetName(infoStr, sizeof(infoStr), dev));
	Store_field(general_info, 0, copy_string(infoStr));
	CUDA_CHECK_CALL(cuDeviceTotalMem(&infoUInt, dev));
	/* NOTE(review): Val_int narrows the size_t byte count to an OCaml int. */
	Store_field(general_info, 1, Val_int(infoUInt));
	Store_field(general_info, 2, Val_int(dev_infos.sharedMemPerBlock));
	Store_field(general_info, 3, Val_int(dev_infos.clockRate));
	Store_field(general_info, 4, Val_int(dev_infos.totalConstantMemory));
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev));
	Store_field(general_info, 5, Val_int(infoInt));
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
	Store_field(general_info, 6, Val_bool(infoInt));
	Store_field(general_info, 7, i);

	/* Context plus two streams, handed to OCaml as a raw C pointer. */
	CUDA_CHECK_CALL(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST, dev));
	/* Size the allocation from the pointee type; the previous code used
	 * sizeof(spoc_cl_context), a different type than the one stored. */
	spoc_ctx = malloc(sizeof(*spoc_ctx));
	if (spoc_ctx == NULL)
		caml_raise_out_of_memory();
	spoc_ctx->ctx = ctx;
	CUDA_CHECK_CALL(cuStreamCreate(&queue[0], 0));
	CUDA_CHECK_CALL(cuStreamCreate(&queue[1], 0));
	spoc_ctx->queue[0] = queue[0];
	spoc_ctx->queue[1] = queue[1];
	Store_field(general_info, 8, (value)spoc_ctx);
	CUDA_CHECK_CALL(cuCtxSetCurrent(ctx));

	/* ---- cuda_info / specific_info --------------------------------- */
	cuda_info = caml_alloc(1, 0); /* 0 -> Cuda */
	specific_info = caml_alloc(18, 0);
	cuDeviceComputeCapability(&major, &minor, dev);
	Store_field(specific_info, 0, Val_int(major));
	Store_field(specific_info, 1, Val_int(minor));
	Store_field(specific_info, 2, Val_int(dev_infos.regsPerBlock));
	Store_field(specific_info, 3, Val_int(dev_infos.SIMDWidth));
	Store_field(specific_info, 4, Val_int(dev_infos.memPitch));
	Store_field(specific_info, 5, Val_int(dev_infos.maxThreadsPerBlock));

	maxT = caml_alloc(3, 0);
	Store_field(maxT, 0, Val_int(dev_infos.maxThreadsDim[0]));
	Store_field(maxT, 1, Val_int(dev_infos.maxThreadsDim[1]));
	Store_field(maxT, 2, Val_int(dev_infos.maxThreadsDim[2]));
	Store_field(specific_info, 6, maxT);

	maxG = caml_alloc(3, 0);
	Store_field(maxG, 0, Val_int(dev_infos.maxGridSize[0]));
	Store_field(maxG, 1, Val_int(dev_infos.maxGridSize[1]));
	Store_field(maxG, 2, Val_int(dev_infos.maxGridSize[2]));
	Store_field(specific_info, 7, maxG);

	Store_field(specific_info, 8, Val_int(dev_infos.textureAlign));
	/* These attribute queries are unchecked, as before: on failure infoInt
	 * keeps the value left by the previous query. */
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
	Store_field(specific_info, 9, Val_bool(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
	Store_field(specific_info, 10, Val_bool(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
	Store_field(specific_info, 11, Val_bool(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
	Store_field(specific_info, 12, Val_bool(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
	Store_field(specific_info, 13, Val_int(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
	Store_field(specific_info, 14, Val_bool(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
	Store_field(specific_info, 15, Val_int(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
	Store_field(specific_info, 16, Val_int(infoInt));
	cuDriverGetVersion(&infoInt);
	Store_field(specific_info, 17, Val_int(infoInt));
	Store_field(cuda_info, 0, specific_info);

	/* ---- top-level device block ------------------------------------ */
	device = caml_alloc(4, 0);
	Store_field(device, 0, general_info);
	Store_field(device, 1, cuda_info);

	gcInfo = (spoc_cuda_gc_info*)malloc(sizeof(spoc_cuda_gc_info));
	if (gcInfo == NULL)
		caml_raise_out_of_memory();
	/* The free-memory query is kept for its error check. The previous code
	 * also subtracted 32 MiB from the result but never used the value, so
	 * that dead store is dropped.
	 * NOTE(review): gcInfo is stored without any field being set here. */
	CUDA_CHECK_CALL(cuMemGetInfo(&infoUInt, NULL));
	Store_field(device, 2, (value)gcInfo);
	Store_field(device, 3, (value)events);

	CAMLreturn(device);
}
int main() { int i, devCount; CUdevice dev; CUdevprop prop; CUresult e; cuInit(0); cuDeviceGetCount(&devCount); for(i = 0; i < devCount; i++) { e = cuDeviceGet(&dev, i); if(e != CUDA_SUCCESS) { printf("cuDeviceGet(%d) failed\n", i); continue; } e = cuDeviceGetProperties(&prop, dev); if(e != CUDA_SUCCESS) { printf("Could not get device properties"); continue; } printf("Card #%02d:\n", i); printf("\tName: "); { char buf[1024]; e = cuDeviceGetName(buf, 1024, dev); checkFail(e) || printf("%s", buf); printf("\n"); } printf("\tCompute capability: "); { int major, minor; e = cuDeviceComputeCapability(&major, &minor, dev); checkFail(e) || printf("%d.%d", major, minor); printf("\n"); } printf("\tTotal memory: "); { size_t mem; e = cuDeviceTotalMem(&mem, dev); checkFail(e) || printf("%lu bytes", mem); printf("\n"); } printf("\tClock rate: "); { printf("%d kHz", prop.clockRate); printf("\n"); } printf("\tGrid dimensions: "); { printf("%d x %d x %d", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); printf("\n"); } printf("\tThread dimensions: "); { printf("%d x %d x %d", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); printf("\n"); } printf("\tThreads per block: "); { printf("%d", prop.maxThreadsPerBlock); printf("\n"); } printf("\tShared memory per block: "); { printf("%d bytes", prop.sharedMemPerBlock); printf("\n"); } printf("\tConstant memory: "); { printf("%d bytes", prop.totalConstantMemory); printf("\n"); } printf("\tWarp size: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev); checkFail(e) || printf("%d", attr); printf("\n"); } printf("\tNumber of multiprocessors: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); checkFail(e) || printf("%d", attr); printf("\n"); } printf("\tIs integrated: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); checkFail(e) || printf("%s", attr!=0?"yes":"no"); printf("\n"); } printf("\tCan 
map host memory: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev); checkFail(e) || printf("%s", attr!=0?"yes":"no"); printf("\n"); } printf("\tCan execute multiple kernels: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev); checkFail(e) || printf("%s", attr!=0?"yes":"no"); printf("\n"); } printf("\tThreads per multiprocessor: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); checkFail(e) || printf("%d", attr); printf("\n"); } printf("\tAsynchronous engines: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); checkFail(e) || printf("%d", attr); printf("\n"); } printf("\tShares address space with host: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev); checkFail(e) || printf("%s", attr!=0?"yes":"no"); printf("\n"); } printf("\tL2 cache: "); { int attr; e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev); checkFail(e) || printf("%d bytes", attr); printf("\n"); } } }