/*
 * spoc_cuda_compile: JIT-compiles PTX source into a CUDA module and returns a
 * heap-allocated CUfunction handle for the named kernel, passed back to OCaml
 * as a naked pointer value.
 *
 * moduleSrc     - OCaml string holding the PTX source text.
 * function_name - OCaml string naming the kernel to extract from the module.
 * gi            - general-info block used by CUDA_GET_CONTEXT (project macro).
 *
 * NOTE(review): if CUDA_CHECK_CALL raises on failure, `kernel` and
 * `jitLogBuffer` leak on the error path — confirm the macro's behavior.
 * The loaded CUmodule is intentionally never unloaded here.
 */
CAMLprim value spoc_cuda_compile(value moduleSrc, value function_name, value gi) {
  CAMLparam3(moduleSrc, function_name, gi);
  CUmodule module;
  CUfunction *kernel;
  char *functionN;
  char *ptx_source;
  const unsigned int jitNumOptions = 4;
  CUjit_option jitOptions[4];
  void *jitOptVals[4];
  int jitLogBufferSize;
  char *jitLogBuffer;
  int jitRegCount = 32;

  CUDA_GET_CONTEXT;

  kernel = malloc(sizeof(CUfunction));
  functionN = String_val(function_name);
  ptx_source = String_val(moduleSrc);

  /* Option 0: size of the JIT info-log buffer. */
  jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  jitLogBufferSize = 1024;
  jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

  /* Option 1: the info-log buffer itself. */
  jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
  jitLogBuffer = malloc(sizeof(char) * jitLogBufferSize);
  jitOptVals[1] = jitLogBuffer;

  /* Option 2: cap the per-thread register count for the JIT. */
  jitOptions[2] = CU_JIT_MAX_REGISTERS;
  jitOptVals[2] = (void *)(size_t)jitRegCount;

  /* Option 3: derive the target architecture from the current context.
   * This option takes no input value, but cuModuleLoadDataEx is still handed
   * a 4-entry values array; the original left jitOptVals[3] uninitialized
   * (the CU_TARGET_COMPUTE_11 assignment was commented out), so set it to a
   * well-defined NULL instead of stack garbage. */
  jitOptions[3] = CU_JIT_TARGET_FROM_CUCONTEXT;
  jitOptVals[3] = NULL;

  CUDA_CHECK_CALL(cuModuleLoadDataEx(&module, ptx_source, jitNumOptions,
                                     jitOptions, (void **)jitOptVals));
  CUDA_CHECK_CALL(cuModuleGetFunction(kernel, module, functionN));
  free(jitLogBuffer);

  CUDA_RESTORE_CONTEXT;
  /* Returned as an abstract pointer value; the OCaml side must treat it as
   * opaque and is responsible for its lifetime. */
  CAMLreturn((value)kernel);
}
/*
 * spoc_cuda_set_block_shape: records the default block dimensions for a
 * compiled kernel via cuFuncSetBlockShape.
 *
 * ker   - naked CUfunction* produced by spoc_cuda_compile.
 * block - OCaml record/tuple whose first three fields are the X, Y, Z dims.
 * gi    - general-info block used by CUDA_GET_CONTEXT (project macro).
 *
 * NOTE(review): cuFuncSetBlockShape is deprecated in the driver API; the
 * dimensions can instead be supplied directly to cuLaunchKernel.
 */
CAMLprim value spoc_cuda_set_block_shape(value ker, value block, value gi) {
  CAMLparam3(ker, block, gi);
  CUfunction *kern;
  int dimX, dimY, dimZ;

  /* Read the OCaml fields before entering the CUDA context (immediate ints,
   * so no GC interaction). */
  dimX = Int_val(Field(block, 0));
  dimY = Int_val(Field(block, 1));
  dimZ = Int_val(Field(block, 2));

  CUDA_GET_CONTEXT;
  kern = (CUfunction *)ker;
  CUDA_CHECK_CALL(cuFuncSetBlockShape(*kern, dimX, dimY, dimZ));
  CUDA_RESTORE_CONTEXT;

  CAMLreturn(Val_unit);
}
/*
 * spoc_cuda_launch_grid: launches a compiled kernel with its arguments packed
 * in a raw byte buffer, using the cuLaunchKernel "extra" parameter-buffer
 * mechanism on one of the two per-device streams.
 *
 * off      - OCaml ref whose field 0 holds the packed argument-buffer size.
 * ker      - naked CUfunction* produced by spoc_cuda_compile.
 * grid     - grid dimensions (fields 0..2 = X, Y, Z).
 * block    - block dimensions (fields 0..2 = X, Y, Z).
 * ex       - naked pointer to the packed argument buffer (freed here).
 * gi       - general-info block used by CUDA_GET_CONTEXT (project macro).
 * queue_id - index (0 or 1) into the stream table `queue`.
 */
CAMLprim value spoc_cuda_launch_grid(value off, value ker, value grid, value block,
                                     value ex, value gi, value queue_id) {
  CAMLparam5(ker, grid, ex, block, gi);
  CAMLxparam2(off, queue_id);
  CUfunction *kernel;
  int gridX, gridY, gridZ, blockX, blockY, blockZ;
  int offset;
  /* BUG FIX: CU_LAUNCH_PARAM_BUFFER_SIZE requires a pointer to size_t. The
   * original passed &offset (an int*): on LP64 the driver would read 8 bytes
   * from a 4-byte object, picking up stack garbage in the upper half. */
  size_t arg_buffer_size;
  char *extra;
  void *extra2[5];

  offset = Int_val(Field(off, 0));
  gridX = Int_val(Field(grid, 0));
  gridY = Int_val(Field(grid, 1));
  gridZ = Int_val(Field(grid, 2));
  blockX = Int_val(Field(block, 0));
  blockY = Int_val(Field(block, 1));
  blockZ = Int_val(Field(block, 2));

  CUDA_GET_CONTEXT;

  kernel = (CUfunction *)ker;
  extra = (char *)ex;
  arg_buffer_size = (size_t)offset;

  /* Argument buffer handed to the driver as an opaque blob. */
  extra2[0] = CU_LAUNCH_PARAM_BUFFER_POINTER;
  extra2[1] = extra;
  extra2[2] = CU_LAUNCH_PARAM_BUFFER_SIZE;
  extra2[3] = &arg_buffer_size;
  extra2[4] = CU_LAUNCH_PARAM_END;

  CUDA_CHECK_CALL(cuLaunchKernel(*kernel,
                                 gridX, gridY, gridZ,
                                 blockX, blockY, blockZ,
                                 0,                          /* no dynamic shared mem */
                                 queue[Int_val(queue_id)],
                                 NULL,                       /* kernelParams unused */
                                 extra2));
  /* Write the (unchanged) offset back, preserving the original contract. */
  Store_field(off, 0, Val_int(offset));
  free(extra);

  CUDA_RESTORE_CONTEXT;
  CAMLreturn(Val_unit);
}
/*
 * spoc_getCudaDevice: builds the OCaml device record for CUDA device ordinal
 * `i`: general info (name, memory, clocks, context + two streams), CUDA
 * capability/attribute info, a GC-info slot, and an (initially empty) event
 * list.
 *
 * Raises the registered "no_cuda_device" exception when `i` is out of range.
 */
value spoc_getCudaDevice(value i) {
  CAMLparam1(i);
  CAMLlocal4(general_info, cuda_info, specific_info, gc_info);
  CAMLlocal3(device, maxT, maxG);
  int nb_devices;
  CUdevprop dev_infos;
  CUdevice dev;
  CUcontext ctx;
  CUstream queue[2];
  spoc_cu_context *spoc_ctx;
  //CUcontext gl_ctx;
  char infoStr[1024];
  int infoInt;
  size_t infoUInt;
  size_t totalMem;
  int major, minor;
  enum cudaError_enum cuda_error; /* used by the CUDA_CHECK_CALL macro */

  cuDeviceGetCount(&nb_devices);
  /* BUG FIX: valid ordinals are 0 .. nb_devices-1; the original used '>',
   * which let i == nb_devices through to cuDeviceGet. */
  if ((Int_val(i)) >= nb_devices)
    raise_constant(*caml_named_value("no_cuda_device"));
  CUDA_CHECK_CALL(cuDeviceGet(&dev, Int_val(i)));
  /* NOTE(review): cuDeviceGetProperties / cuDeviceComputeCapability are
   * deprecated driver entry points; kept for compatibility. */
  CUDA_CHECK_CALL(cuDeviceGetProperties(&dev_infos, dev));

  general_info = caml_alloc(9, 0);
  CUDA_CHECK_CALL(cuDeviceGetName(infoStr, sizeof(infoStr), dev));
  Store_field(general_info, 0, copy_string(infoStr));             /* name */
  CUDA_CHECK_CALL(cuDeviceTotalMem(&infoUInt, dev));
  Store_field(general_info, 1, Val_int(infoUInt));                /* total mem */
  Store_field(general_info, 2, Val_int(dev_infos.sharedMemPerBlock));
  Store_field(general_info, 3, Val_int(dev_infos.clockRate));
  Store_field(general_info, 4, Val_int(dev_infos.totalConstantMemory));
  CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev));
  Store_field(general_info, 5, Val_int(infoInt));
  CUDA_CHECK_CALL(cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev));
  Store_field(general_info, 6, Val_bool(infoInt));
  Store_field(general_info, 7, i);                                /* ordinal */

  CUDA_CHECK_CALL(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST, dev));
  /* BUG FIX: the original allocated sizeof(spoc_cl_context) (the OpenCL
   * struct) for a spoc_cu_context pointer — wrong type in sizeof. */
  spoc_ctx = malloc(sizeof(spoc_cu_context));
  spoc_ctx->ctx = ctx;
  CUDA_CHECK_CALL(cuStreamCreate(&queue[0], 0));
  CUDA_CHECK_CALL(cuStreamCreate(&queue[1], 0));
  spoc_ctx->queue[0] = queue[0];
  spoc_ctx->queue[1] = queue[1];
  Store_field(general_info, 8, (value)spoc_ctx);                  /* naked ptr */
  CUDA_CHECK_CALL(cuCtxSetCurrent(ctx));

  cuda_info = caml_alloc(1, 0); /* tag 0 -> Cuda variant */
  specific_info = caml_alloc(18, 0);
  cuDeviceComputeCapability(&major, &minor, dev);
  Store_field(specific_info, 0, Val_int(major));
  Store_field(specific_info, 1, Val_int(minor));
  Store_field(specific_info, 2, Val_int(dev_infos.regsPerBlock));
  Store_field(specific_info, 3, Val_int(dev_infos.SIMDWidth));
  Store_field(specific_info, 4, Val_int(dev_infos.memPitch));
  Store_field(specific_info, 5, Val_int(dev_infos.maxThreadsPerBlock));
  maxT = caml_alloc(3, 0);
  Store_field(maxT, 0, Val_int(dev_infos.maxThreadsDim[0]));
  Store_field(maxT, 1, Val_int(dev_infos.maxThreadsDim[1]));
  Store_field(maxT, 2, Val_int(dev_infos.maxThreadsDim[2]));
  Store_field(specific_info, 6, maxT);
  maxG = caml_alloc(3, 0);
  Store_field(maxG, 0, Val_int(dev_infos.maxGridSize[0]));
  Store_field(maxG, 1, Val_int(dev_infos.maxGridSize[1]));
  Store_field(maxG, 2, Val_int(dev_infos.maxGridSize[2]));
  Store_field(specific_info, 7, maxG);
  Store_field(specific_info, 8, Val_int(dev_infos.textureAlign));
  cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  Store_field(specific_info, 9, Val_bool(infoInt));
  cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
  Store_field(specific_info, 10, Val_bool(infoInt));
  cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  Store_field(specific_info, 11, Val_bool(infoInt));
  cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  Store_field(specific_info, 12, Val_bool(infoInt));
  cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  Store_field(specific_info, 13, Val_int(infoInt));
  cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  Store_field(specific_info, 14, Val_bool(infoInt));
  cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
  Store_field(specific_info, 15, Val_int(infoInt));
  cuDeviceGetAttribute(&infoInt, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
  Store_field(specific_info, 16, Val_int(infoInt));
  cuDriverGetVersion(&infoInt);
  Store_field(specific_info, 17, Val_int(infoInt));
  Store_field(cuda_info, 0, specific_info);

  device = caml_alloc(4, 0);
  Store_field(device, 0, general_info);
  Store_field(device, 1, cuda_info);
  {
    spoc_cuda_gc_info *gcInfo = (spoc_cuda_gc_info *)malloc(sizeof(spoc_cuda_gc_info));
    /* BUG FIX: the original passed NULL for cuMemGetInfo's `total` output;
     * the API does not document NULL as valid — use a real size_t. */
    CUDA_CHECK_CALL(cuMemGetInfo(&infoUInt, &totalMem));
    /* Reserve 32 MB of headroom off the free-memory figure.
     * NOTE(review): this result is computed but never stored — presumably it
     * was meant to initialize a field of gcInfo; confirm against the
     * spoc_cuda_gc_info definition. gcInfo fields are left uninitialized. */
    infoUInt -= (32 * 1024 * 1024);
    Store_field(device, 2, (value)gcInfo);
    {
      cuda_event_list *events = NULL; /* no pending events yet */
      Store_field(device, 3, (value)events);
      CAMLreturn(device);
    }
  }
}