/* Copy a buffer's host data to its device allocation.
 *
 * mem: buffer descriptor. mem.data_pointer is used as the host source,
 *      mem.device_pointer as the device destination, and
 *      mem.memory_size() as the number of bytes passed to cuMemcpyHtoD.
 *
 * When mem.device_pointer is null no copy is issued; the context push/pop
 * pair still runs. A failing copy is handled by cuda_assert. */
void mem_copy_to(device_memory& mem)
{
    cuda_push_context();

    if(mem.device_pointer) {
        /* cuda_assert is used as a statement macro without a trailing
         * semicolon, as elsewhere in this file. */
        cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()))
    }

    cuda_pop_context();
}
/* Stop the task pool and release the CUDA context owned by this device.
 *
 * cuContext is 0 when CUDA initialization did not succeed (load_kernels()
 * checks for exactly that state), so the detach is skipped in that case
 * instead of handing a null context to the driver.
 *
 * There is intentionally no cuda_pop_context() after the detach: the
 * context is being released, so there is nothing left to pop.
 *
 * NOTE(review): cuCtxDetach is marked deprecated in current CUDA driver
 * documentation in favor of cuCtxDestroy - confirm against the toolkit
 * version this file targets before switching. */
~CUDADevice()
{
    task_pool.stop();

    if(cuContext != 0) {
        cuda_push_context();
        cuda_assert(cuCtxDetach(cuContext))
    }
}
bool load_kernels(bool experimental) { /* check if cuda init succeeded */ if(cuContext == 0) return false; /* check if GPU is supported with current feature set */ if(!support_device(experimental)) return false; /* get kernel */ string cubin = compile_kernel(); if(cubin == "") return false; /* open module */ cuda_push_context(); CUresult result = cuModuleLoad(&cuModule, cubin.c_str()); if(cuda_error_(result, "cuModuleLoad")) cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str())); cuda_pop_context(); return (result == CUDA_SUCCESS); }
/* Zero a buffer on both the host and the device.
 *
 * mem: buffer descriptor. mem.memory_size() bytes are cleared at
 *      mem.data_pointer on the host and, when a device allocation
 *      exists, at mem.device_pointer on the device.
 *
 * The device memset is skipped when mem.device_pointer is null, matching
 * the guard used by mem_copy_to() and mem_free(); previously a null
 * device address was passed straight to cuMemsetD8. */
void mem_zero(device_memory& mem)
{
    /* host copy */
    memset((void*)mem.data_pointer, 0, mem.memory_size());

    /* device copy */
    cuda_push_context();

    if(mem.device_pointer) {
        cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()))
    }

    cuda_pop_context();
}
/* Allocate device memory for a buffer.
 *
 * mem:  buffer descriptor; mem.memory_size() bytes are requested and the
 *       resulting address is stored in mem.device_pointer.
 * type: not used by this implementation.
 *
 * The local device address starts at 0 so that, if cuMemAlloc fails and
 * cuda_assert lets execution continue, mem.device_pointer ends up null
 * rather than holding an indeterminate value that mem_free() or
 * mem_copy_to() would later treat as a valid allocation. */
void mem_alloc(device_memory& mem, MemoryType type)
{
    cuda_push_context();

    CUdeviceptr device_pointer = 0;
    cuda_assert(cuMemAlloc(&device_pointer, mem.memory_size()))
    mem.device_pointer = (device_ptr)device_pointer;

    cuda_pop_context();
}
/* Release a buffer's device allocation, if it has one.
 *
 * mem: buffer descriptor; mem.device_pointer is freed with cuMemFree and
 *      then reset to 0 so a repeated call does nothing.
 *
 * Does nothing (no context push/pop either) when mem.device_pointer is
 * already null. */
void mem_free(device_memory& mem)
{
    /* nothing allocated on the device */
    if(!mem.device_pointer)
        return;

    cuda_push_context();
    cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)))
    cuda_pop_context();

    mem.device_pointer = 0;
}
/* Copy a sub-range of a buffer from the device back to the host.
 *
 * mem:  buffer descriptor; the same byte offset is applied to both
 *       mem.data_pointer (host destination) and mem.device_pointer
 *       (device source).
 * y, w, h, elem: the copy starts elem*y*w bytes into the buffer and
 *       transfers elem*w*h bytes.
 *       NOTE(review): presumably y is the first row, w the row width,
 *       h the row count and elem the bytes per item - confirm against
 *       callers.
 *
 * The products are evaluated in size_t. The original multiplied three
 * ints first and only then widened the result, which overflows int for
 * large buffers. */
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
{
    /* widen the first operand so the whole product is computed in size_t */
    size_t offset = (size_t)elem*y*w;
    size_t size = (size_t)elem*w*h;

    cuda_push_context();
    cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset, (CUdeviceptr)((uchar*)mem.device_pointer + offset), size))
    cuda_pop_context();
}
void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic) { /* determine format */ CUarray_format_enum format; size_t dsize = datatype_size(mem.data_type); size_t size = mem.memory_size(); switch(mem.data_type) { case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; default: assert(0); return; } CUtexref texref = NULL; cuda_push_context(); cuda_assert(cuModuleGetTexRef(&texref, cuModule, name)) if(!texref) { cuda_pop_context(); return; } if(interpolation) { CUarray handle = NULL; CUDA_ARRAY_DESCRIPTOR desc; desc.Width = mem.data_width; desc.Height = mem.data_height; desc.Format = format; desc.NumChannels = mem.data_elements; cuda_assert(cuArrayCreate(&handle, &desc)) if(!handle) { cuda_pop_context(); return; } if(mem.data_height > 1) { CUDA_MEMCPY2D param; memset(¶m, 0, sizeof(param)); param.dstMemoryType = CU_MEMORYTYPE_ARRAY; param.dstArray = handle; param.srcMemoryType = CU_MEMORYTYPE_HOST; param.srcHost = (void*)mem.data_pointer; param.srcPitch = mem.data_width*dsize*mem.data_elements; param.WidthInBytes = param.srcPitch; param.Height = mem.data_height; cuda_assert(cuMemcpy2D(¶m)) } else
void const_copy_to(const char *name, void *host, size_t size) { CUdeviceptr mem; size_t bytes; cuda_push_context(); cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name)) //assert(bytes == size); cuda_assert(cuMemcpyHtoD(mem, host, size)) cuda_pop_context(); }
/* Release the CUDA context owned by this device.
 *
 * cuContext is 0 when CUDA initialization did not succeed (load_kernels()
 * checks for exactly that state), so the detach is skipped in that case
 * instead of handing a null context to the driver.
 *
 * There is intentionally no cuda_pop_context() after the detach: the
 * context is being released, so there is nothing left to pop.
 *
 * NOTE(review): cuCtxDetach is marked deprecated in current CUDA driver
 * documentation in favor of cuCtxDestroy - confirm against the toolkit
 * version this file targets before switching. */
~CUDADevice()
{
    if(cuContext != 0) {
        cuda_push_context();
        cuda_assert(cuCtxDetach(cuContext))
    }
}