Пример #1
0
/*===========================================================================*/
/**
 * Binds this Device to the CUDA device identified by an ordinal.
 *
 * The driver handle is written to m_handler and the device properties are
 * then read into m_property. Each driver call goes through KVS_CU_CALL and
 * its outcome is read back via kvs::cuda::DriverAPI::HasError()
 * (presumably the macro records the driver status there -- confirm against
 * the macro definition).
 *
 * @param ordinal  device ordinal passed to cuDeviceGet
 * @return true when both driver calls completed without a reported error,
 *         false as soon as one of them reports an error
 */
bool Device::create( const int ordinal )
{
    // Step 1: resolve the ordinal to a driver handle; nothing else can be
    // queried without it, so bail out immediately on failure.
    KVS_CU_CALL( cuDeviceGet( &m_handler, ordinal ) );
    if ( kvs::cuda::DriverAPI::HasError() )
    {
        return false;
    }

    // Step 2: cache the device properties; the result of this last call
    // decides the overall outcome.
    KVS_CU_CALL( cuDeviceGetProperties( &m_property, m_handler ) );
    return !kvs::cuda::DriverAPI::HasError();
}
void print_GetProperties(CUdevice cuDevice)
{
    int count = 0;
    cuDeviceGetCount(&count);
    printf ("cuDevice(%d)GetCount = %d\n", cuDevice, count);

    int len = 1024;
    char* dev_name = (char*)malloc(sizeof(char) * len);
    cuDeviceGetName(dev_name, len, cuDevice);
    printf("cuda-devicename = %s\n", dev_name);
    free(dev_name);

    int mj_v = 0, mn_v = 0;
    cuDeviceComputeCapability(&mj_v, &mn_v, cuDevice);
    printf("compute capability = mj:%d, mn:%d\n", mj_v, mn_v);

    size_t byt_mem = 0;
    cuDeviceTotalMem(&byt_mem, cuDevice);
    printf("total mem = %d\n", byt_mem);
    CUdevprop cp;
    cuDeviceGetProperties(&cp, cuDevice);
    printf("Thd/blk = %d, thrdDim xyz = (%d, %d, %d:threads), GridSz xyz = (%d, %d, %d:blocks), shrdmem/blk = %d, constmem = %d bytes, simdwidth = %d, mempitch = %d, regsPerBlock = %d, clockRate = %d, textureAlign = %d \n",
        cp.maxThreadsPerBlock, cp.maxThreadsDim[0], cp.maxThreadsDim[1], cp.maxThreadsDim[2], cp.maxGridSize[0], cp.maxGridSize[1], cp.maxGridSize[2], cp.sharedMemPerBlock, cp.totalConstantMemory, cp.SIMDWidth, cp.memPitch, cp.regsPerBlock, cp.clockRate, cp.textureAlign);

    int ip;
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_WARP_SIZE = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_PITCH, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_PITCH = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_CLOCK_RATE = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = %d\n", ip);
}
Пример #3
0
value spoc_getCudaDevice(value i)
{
	CAMLparam1(i);
	CAMLlocal4(general_info, cuda_info, specific_info, gc_info);
	CAMLlocal3(device,  maxT, maxG);
	int nb_devices;
	CUdevprop dev_infos;
	CUdevice dev;
	CUcontext ctx;
	CUstream queue[2];
	spoc_cu_context *spoc_ctx;
	//CUcontext gl_ctx;
	char infoStr[1024];
	int infoInt;
	size_t infoUInt;
	int major, minor;
	enum cudaError_enum cuda_error; 


	cuDeviceGetCount (&nb_devices);

	if ((Int_val(i)) > nb_devices)
		raise_constant(*caml_named_value("no_cuda_device")) ;


	CUDA_CHECK_CALL(cuDeviceGet(&dev, Int_val(i)));
	CUDA_CHECK_CALL(cuDeviceGetProperties(&dev_infos, dev));

	general_info = caml_alloc (9, 0);
	CUDA_CHECK_CALL(cuDeviceGetName(infoStr, sizeof(infoStr), dev));

	Store_field(general_info,0, copy_string(infoStr));//
	CUDA_CHECK_CALL(cuDeviceTotalMem(&infoUInt, dev));

	Store_field(general_info,1, Val_int(infoUInt));//
	Store_field(general_info,2, Val_int(dev_infos.sharedMemPerBlock));//
	Store_field(general_info,3, Val_int(dev_infos.clockRate));//
	Store_field(general_info,4, Val_int(dev_infos.totalConstantMemory));//
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev));
	Store_field(general_info,5, Val_int(infoInt));//
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
	Store_field(general_info,6, Val_bool(infoInt));//
	Store_field(general_info,7, i);
	CUDA_CHECK_CALL(cuCtxCreate	(&ctx,
			CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST,
			dev));
	spoc_ctx = malloc(sizeof(spoc_cl_context));
	spoc_ctx->ctx = ctx;
	CUDA_CHECK_CALL(cuStreamCreate(&queue[0], 0));
	CUDA_CHECK_CALL(cuStreamCreate(&queue[1], 0));
	spoc_ctx->queue[0] = queue[0];
	spoc_ctx->queue[1] = queue[1];
	Store_field(general_info,8, (value)spoc_ctx);
	CUDA_CHECK_CALL(cuCtxSetCurrent(ctx));


	cuda_info = caml_alloc(1, 0); //0 -> Cuda
	specific_info = caml_alloc(18, 0);

	cuDeviceComputeCapability(&major, &minor, dev);
	Store_field(specific_info,0, Val_int(major));//
	Store_field(specific_info,1, Val_int(minor));//
	Store_field(specific_info,2, Val_int(dev_infos.regsPerBlock));//
	Store_field(specific_info,3, Val_int(dev_infos.SIMDWidth));//
	Store_field(specific_info,4, Val_int(dev_infos.memPitch));//
	Store_field(specific_info,5, Val_int(dev_infos.maxThreadsPerBlock));//

	maxT = caml_alloc(3, 0);
	Store_field(maxT,0, Val_int(dev_infos.maxThreadsDim[0]));//
	Store_field(maxT,1, Val_int(dev_infos.maxThreadsDim[1]));//
	Store_field(maxT,2, Val_int(dev_infos.maxThreadsDim[2]));//
	Store_field(specific_info,6, maxT);

	maxG = caml_alloc(3, 0);
	Store_field(maxG,0, Val_int(dev_infos.maxGridSize[0]));//
	Store_field(maxG,1, Val_int(dev_infos.maxGridSize[1]));//
	Store_field(maxG,2, Val_int(dev_infos.maxGridSize[2]));//
	Store_field(specific_info,7, maxG);

	Store_field(specific_info,8, Val_int(dev_infos.textureAlign));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
	Store_field(specific_info,9, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
	Store_field(specific_info,10, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
	Store_field(specific_info,11, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
	Store_field(specific_info,12, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
	Store_field(specific_info,13, Val_int(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
	Store_field(specific_info,14, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
	Store_field(specific_info,15, Val_int(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
	Store_field(specific_info,16, Val_int(infoInt));
	cuDriverGetVersion(&infoInt);
	Store_field(specific_info, 17, Val_int(infoInt));

	Store_field(cuda_info, 0, specific_info);
	device = caml_alloc(4, 0);
	Store_field(device, 0, general_info);
	Store_field(device, 1, cuda_info);

	{spoc_cuda_gc_info* gcInfo = (spoc_cuda_gc_info*)malloc(sizeof(spoc_cuda_gc_info));
	CUDA_CHECK_CALL(cuMemGetInfo(&infoUInt, NULL));
	infoUInt -= (32*1024*1024);

	Store_field(device, 2, (value)gcInfo);


	{cuda_event_list* events = NULL;
	Store_field(device, 3, (value)events);



	CAMLreturn(device);}}
}
Пример #4
0
int main() {
	int i, devCount;
	CUdevice dev;
	CUdevprop prop;
	CUresult e;

	cuInit(0);
	cuDeviceGetCount(&devCount);
	for(i = 0; i < devCount; i++) {
		e = cuDeviceGet(&dev, i);
		if(e != CUDA_SUCCESS) {
			printf("cuDeviceGet(%d) failed\n", i);
			continue;
		}
		e = cuDeviceGetProperties(&prop, dev);
		if(e != CUDA_SUCCESS) {
			printf("Could not get device properties");
			continue;
		}

		printf("Card #%02d:\n", i);

		printf("\tName: ");
		{
			char buf[1024];
			e = cuDeviceGetName(buf, 1024, dev);
			checkFail(e) ||
			printf("%s", buf);
			printf("\n");
		}

		printf("\tCompute capability: ");
		{
			int major, minor;
			e = cuDeviceComputeCapability(&major, &minor, dev);
			checkFail(e) ||
			printf("%d.%d", major, minor);
			printf("\n");
		}

		printf("\tTotal memory: ");
		{
			size_t mem;
			e = cuDeviceTotalMem(&mem, dev);
			checkFail(e) ||
			printf("%lu bytes", mem);
			printf("\n");
		}

		printf("\tClock rate: ");
		{
			printf("%d kHz", prop.clockRate);
			printf("\n");
		}

		printf("\tGrid dimensions: ");
		{
			printf("%d x %d x %d", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
			printf("\n");
		}

		printf("\tThread dimensions: ");
		{
			printf("%d x %d x %d", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
			printf("\n");
		}

		printf("\tThreads per block: ");
		{
			printf("%d", prop.maxThreadsPerBlock);
			printf("\n");
		}

		printf("\tShared memory per block: ");
		{
			printf("%d bytes", prop.sharedMemPerBlock);
			printf("\n");
		}

		printf("\tConstant memory: ");
		{
			printf("%d bytes", prop.totalConstantMemory);
			printf("\n");
		}

		printf("\tWarp size: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
			checkFail(e) ||
			printf("%d", attr);
			printf("\n");
		}

		printf("\tNumber of multiprocessors: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
			checkFail(e) ||
			printf("%d", attr);
			printf("\n");
		}

		printf("\tIs integrated: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
			checkFail(e) ||
			printf("%s", attr!=0?"yes":"no");
			printf("\n");
		}

		printf("\tCan map host memory: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
			checkFail(e) ||
			printf("%s", attr!=0?"yes":"no");
			printf("\n");
		}

		printf("\tCan execute multiple kernels: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
			checkFail(e) ||
			printf("%s", attr!=0?"yes":"no");
			printf("\n");
		}

		printf("\tThreads per multiprocessor: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
			checkFail(e) ||
			printf("%d", attr);
			printf("\n");
		}

		printf("\tAsynchronous engines: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
			checkFail(e) ||
			printf("%d", attr);
			printf("\n");
		}

		printf("\tShares address space with host: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
			checkFail(e) ||
			printf("%s", attr!=0?"yes":"no");
			printf("\n");
		}

		printf("\tL2 cache: ");
		{
			int attr;
			e = cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
			checkFail(e) ||
			printf("%d bytes", attr);
			printf("\n");
		}
	}
}