Example #1
0
CAMLprim value spoc_cuda_compile(value moduleSrc, value function_name, value gi){
	CAMLparam3(moduleSrc, function_name, gi);
	CUmodule module;
	CUfunction *kernel;
	char* functionN;
	char *ptx_source;
	const unsigned int jitNumOptions = 4;

	CUjit_option jitOptions[4];
	void *jitOptVals[4];
	int jitLogBufferSize;
	char *jitLogBuffer;
	int jitRegCount = 32;

	CUDA_GET_CONTEXT;

	kernel = malloc(sizeof(CUfunction));
	functionN = String_val(function_name);
	ptx_source = String_val(moduleSrc);

	// set up size of compilation log buffer
	jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
	jitLogBufferSize = 1024;
	jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

	// set up pointer to the compilation log buffer
	jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
	jitLogBuffer = malloc(sizeof(char)*jitLogBufferSize);
	jitOptVals[1] = jitLogBuffer;

	// set up pointer to set the Maximum # of registers for a particular kernel
	jitOptions[2] = CU_JIT_MAX_REGISTERS;
	jitOptVals[2] = (void *)(size_t)jitRegCount;

	// set up pointer to set the Maximum # of registers for a particular kernel
	jitOptions[3] = CU_JIT_TARGET_FROM_CUCONTEXT;
	//CU_JIT_TARGET;
//	jitOptVals[3] =  (void*)(uintptr_t)CU_TARGET_COMPUTE_11;

	CUDA_CHECK_CALL(cuModuleLoadDataEx(&module, ptx_source, jitNumOptions, jitOptions, (void **)jitOptVals));
	CUDA_CHECK_CALL(cuModuleGetFunction(kernel, module, functionN));
	free(jitLogBuffer);
	CUDA_RESTORE_CONTEXT;
	//caml_leave_blocking_section();
	CAMLreturn((value) kernel);
}
Example #2
0
CAMLprim value spoc_cuda_set_block_shape(value ker, value block, value gi){
	CAMLparam3(ker, block, gi);
	CUfunction *kernel;

	CUDA_GET_CONTEXT;

	kernel = (CUfunction*) ker;
	CUDA_CHECK_CALL(cuFuncSetBlockShape(*kernel, Int_val(Field(block,0)),Int_val(Field(block,1)),Int_val(Field(block,2))));

	CUDA_RESTORE_CONTEXT;
	CAMLreturn(Val_unit);
}
Example #3
0
CAMLprim value spoc_cuda_launch_grid(value off, value ker, value grid, value block, value ex, value gi, value queue_id){
  CAMLparam5(ker, grid, ex, block, gi);
  CAMLxparam2(off, queue_id);
  CUfunction *kernel;
  int gridX, gridY, gridZ, blockX, blockY, blockZ;
  int offset;
  char* extra;
  void* extra2[5];
  offset = Int_val(Field(off, 0));

  gridX = Int_val(Field(grid,0));
  gridY = Int_val(Field(grid,1));
  gridZ = Int_val(Field(grid,2));
  blockX = Int_val(Field(block,0));
  blockY = Int_val(Field(block,1));
  blockZ = Int_val(Field(block,2));
  
  CUDA_GET_CONTEXT;
  
  kernel = (CUfunction*) ker;  
  extra = (char*)ex;
  
  extra2[0] = CU_LAUNCH_PARAM_BUFFER_POINTER;
  extra2[1] = extra;
  extra2[2] = CU_LAUNCH_PARAM_BUFFER_SIZE;
  extra2[3] = &offset;
  extra2[4] = CU_LAUNCH_PARAM_END;
  
  
  CUDA_CHECK_CALL(cuLaunchKernel(*kernel, 
				 gridX, gridY, gridZ,
				 blockX, blockY, blockZ, 
				 0, queue[Int_val(queue_id)], 
				 NULL, extra2));

  Store_field(off, 0, Val_int(offset));
  free(extra);
  CUDA_RESTORE_CONTEXT;
  CAMLreturn(Val_unit);
}
Example #4
0
value spoc_getCudaDevice(value i)
{
	CAMLparam1(i);
	CAMLlocal4(general_info, cuda_info, specific_info, gc_info);
	CAMLlocal3(device,  maxT, maxG);
	int nb_devices;
	CUdevprop dev_infos;
	CUdevice dev;
	CUcontext ctx;
	CUstream queue[2];
	spoc_cu_context *spoc_ctx;
	//CUcontext gl_ctx;
	char infoStr[1024];
	int infoInt;
	size_t infoUInt;
	int major, minor;
	enum cudaError_enum cuda_error; 


	cuDeviceGetCount (&nb_devices);

	if ((Int_val(i)) > nb_devices)
		raise_constant(*caml_named_value("no_cuda_device")) ;


	CUDA_CHECK_CALL(cuDeviceGet(&dev, Int_val(i)));
	CUDA_CHECK_CALL(cuDeviceGetProperties(&dev_infos, dev));

	general_info = caml_alloc (9, 0);
	CUDA_CHECK_CALL(cuDeviceGetName(infoStr, sizeof(infoStr), dev));

	Store_field(general_info,0, copy_string(infoStr));//
	CUDA_CHECK_CALL(cuDeviceTotalMem(&infoUInt, dev));

	Store_field(general_info,1, Val_int(infoUInt));//
	Store_field(general_info,2, Val_int(dev_infos.sharedMemPerBlock));//
	Store_field(general_info,3, Val_int(dev_infos.clockRate));//
	Store_field(general_info,4, Val_int(dev_infos.totalConstantMemory));//
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev));
	Store_field(general_info,5, Val_int(infoInt));//
	CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
	Store_field(general_info,6, Val_bool(infoInt));//
	Store_field(general_info,7, i);
	CUDA_CHECK_CALL(cuCtxCreate	(&ctx,
			CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST,
			dev));
	spoc_ctx = malloc(sizeof(spoc_cl_context));
	spoc_ctx->ctx = ctx;
	CUDA_CHECK_CALL(cuStreamCreate(&queue[0], 0));
	CUDA_CHECK_CALL(cuStreamCreate(&queue[1], 0));
	spoc_ctx->queue[0] = queue[0];
	spoc_ctx->queue[1] = queue[1];
	Store_field(general_info,8, (value)spoc_ctx);
	CUDA_CHECK_CALL(cuCtxSetCurrent(ctx));


	cuda_info = caml_alloc(1, 0); //0 -> Cuda
	specific_info = caml_alloc(18, 0);

	cuDeviceComputeCapability(&major, &minor, dev);
	Store_field(specific_info,0, Val_int(major));//
	Store_field(specific_info,1, Val_int(minor));//
	Store_field(specific_info,2, Val_int(dev_infos.regsPerBlock));//
	Store_field(specific_info,3, Val_int(dev_infos.SIMDWidth));//
	Store_field(specific_info,4, Val_int(dev_infos.memPitch));//
	Store_field(specific_info,5, Val_int(dev_infos.maxThreadsPerBlock));//

	maxT = caml_alloc(3, 0);
	Store_field(maxT,0, Val_int(dev_infos.maxThreadsDim[0]));//
	Store_field(maxT,1, Val_int(dev_infos.maxThreadsDim[1]));//
	Store_field(maxT,2, Val_int(dev_infos.maxThreadsDim[2]));//
	Store_field(specific_info,6, maxT);

	maxG = caml_alloc(3, 0);
	Store_field(maxG,0, Val_int(dev_infos.maxGridSize[0]));//
	Store_field(maxG,1, Val_int(dev_infos.maxGridSize[1]));//
	Store_field(maxG,2, Val_int(dev_infos.maxGridSize[2]));//
	Store_field(specific_info,7, maxG);

	Store_field(specific_info,8, Val_int(dev_infos.textureAlign));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
	Store_field(specific_info,9, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
	Store_field(specific_info,10, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
	Store_field(specific_info,11, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
	Store_field(specific_info,12, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
	Store_field(specific_info,13, Val_int(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
	Store_field(specific_info,14, Val_bool(infoInt));//
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
	Store_field(specific_info,15, Val_int(infoInt));
	cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
	Store_field(specific_info,16, Val_int(infoInt));
	cuDriverGetVersion(&infoInt);
	Store_field(specific_info, 17, Val_int(infoInt));

	Store_field(cuda_info, 0, specific_info);
	device = caml_alloc(4, 0);
	Store_field(device, 0, general_info);
	Store_field(device, 1, cuda_info);

	{spoc_cuda_gc_info* gcInfo = (spoc_cuda_gc_info*)malloc(sizeof(spoc_cuda_gc_info));
	CUDA_CHECK_CALL(cuMemGetInfo(&infoUInt, NULL));
	infoUInt -= (32*1024*1024);

	Store_field(device, 2, (value)gcInfo);


	{cuda_event_list* events = NULL;
	Store_field(device, 3, (value)events);



	CAMLreturn(device);}}
}