// Called at the beginning of a code block generated by Halide. This function
// is responsible for setting up the OpenGL environment and compiling the GLSL
// code into a fragment shader.
EXPORT void *halide_opengl_init_kernels(void *user_context, void *state_ptr,
                                        const char *src, int size) {
    // TODO: handle error
    if (int error = halide_opengl_init(user_context)) {
        return NULL;
    }

    // Use '/// KERNEL' comments to split 'src' into discrete blocks, one for
    // each kernel contained in it.
    const char *begin = strstr(src, kernel_marker);
    const char *end = NULL;
    for (; begin && begin[0]; begin = end) {
        end = strstr(begin + sizeof(kernel_marker) - 1, kernel_marker);
        if (!end) {
            end = begin + strlen(begin);
        }
        HalideOpenGLKernel *kernel = create_kernel(user_context, begin, end - begin);
        if (!kernel) {
            // Simply skip invalid kernels
            continue;
        }
#ifdef DEBUG
        halide_printf(user_context, "Defining kernel '%s'\n", kernel->name);
#endif
        // Compile shader
        kernel->shader_id = halide_opengl_make_shader(user_context, GL_FRAGMENT_SHADER,
                                                      kernel->source, NULL);

        // Link GLSL program
        GLuint program = ST.CreateProgram();
        ST.AttachShader(program, ST.vertex_shader_id);
        ST.AttachShader(program, kernel->shader_id);
        ST.LinkProgram(program);
        GLint status;
        ST.GetProgramiv(program, GL_LINK_STATUS, &status);
        if (!status) {
            halide_printf(user_context, "Could not link GLSL program:\n");
            GLint log_len;
            ST.GetProgramiv(program, GL_INFO_LOG_LENGTH, &log_len);
            char *log = (char *)malloc(log_len);
            ST.GetProgramInfoLog(program, log_len, NULL, log);
            halide_printf(user_context, "%s", log);
            free(log);
            ST.DeleteProgram(program);
            program = 0;
        }
        kernel->program_id = program;

        if (halide_opengl_find_kernel(kernel->name)) {
            halide_printf(user_context, "Duplicate kernel name '%s'\n", kernel->name);
            halide_opengl_delete_kernel(user_context, kernel);
        } else {
            kernel->next = ST.kernels;
            ST.kernels = kernel;
        }
    }
    return NULL;
}
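// A hedged sketch of the blob this function parses: each kernel in 'src'
// begins at a '/// KERNEL' marker line (the exact marker string is defined
// elsewhere in this runtime as kernel_marker). The kernel names and shader
// bodies below are illustrative only.
static const char *example_src =
    "/// KERNEL blur_x\n"
    "uniform sampler2D input;\n"
    "void main() { gl_FragColor = texture2D(input, vec2(0.5)); }\n"
    "/// KERNEL blur_y\n"
    "uniform sampler2D input;\n"
    "void main() { gl_FragColor = texture2D(input, vec2(0.5)); }\n";
// halide_opengl_init_kernels(user_context, NULL, example_src, strlen(example_src));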
static cl_mem __dev_malloc(void *user_context, size_t bytes) {
    cl_mem p;
#ifdef DEBUG
    halide_printf(user_context, "dev_malloc (%lld bytes)\n", (long long)bytes);
#endif
    cl_int err;
    p = clCreateBuffer(*cl_ctx, CL_MEM_READ_WRITE, bytes, NULL, &err);
#ifdef DEBUG
    halide_printf(user_context, "    returned: %p (err: %d)\n", (void *)p, err);
#endif
    halide_assert(user_context, p);
    return p;
}
static int run_test(void *uc, int channels, Implementation imp, Layout layout) {
    std::string name = "Example_";
    name += std::to_string(channels);
    name += (imp == kGLSL) ? "_GLSL" : "_CPU";
    name += (layout == kChunky) ? "_Chunky" : "_Planar";
    halide_printf(uc, "\n---------------------------\n%s\n", name.c_str());

    Image<uint8_t> input(kWidth, kHeight, channels, 0, (layout == kChunky));
    Image<uint8_t> output(kWidth, kHeight, channels, 0, (layout == kChunky));
    (void) halide_smooth_buffer_host<uint8_t>(uc, kSeed, input);

    if (imp == kGLSL) {
        // Call once to ensure OpenGL is initialized (we want to time the
        // cost of copy-to-device alone)
        halide_copy_to_device(uc, input, halide_opengl_device_interface());
        // Mark as dirty so the next call won't be a no-op
        input.set_host_dirty();
        {
            ScopedTimer timer(uc, name + " halide_copy_to_device input");
            halide_copy_to_device(uc, input, halide_opengl_device_interface());
        }
        {
            ScopedTimer timer(uc, name + " halide_copy_to_device output");
            halide_copy_to_device(uc, output, halide_opengl_device_interface());
        }
    }

    // Call once to compile the shader, warm up, etc.
    ExampleFunc example = exampleFuncs[channels - 1][imp];
    (void) example(input, output);

    {
        ScopedTimer timer(uc, name, kIter);
        for (int i = 0; i < kIter; ++i) {
            (void) example(input, output);
        }
    }

    if (imp == kGLSL) {
        ScopedTimer timer(uc, name + " halide_copy_to_host");
        halide_copy_to_host(uc, output);
    }

    // halide_buffer_display(input);
    // halide_buffer_print(input);
    // halide_buffer_display(output);
    // halide_buffer_print(output);

    int errors = check<uint8_t>(input, output);
    if (errors) {
        halide_errorf(uc, "Test %s had %d errors!\n\n", name.c_str(), errors);
    } else {
        halide_printf(uc, "Test %s had no errors.\n\n", name.c_str());
    }
    return errors;
}
WEAK void halide_dev_malloc(void *user_context, buffer_t *buf) {
    if (buf->dev) {
        // This buffer already has a device allocation
        return;
    }

    size_t size = __buf_size(user_context, buf);

#ifdef DEBUG
    halide_printf(user_context,
                  "dev_malloc allocating buffer of %lld bytes, "
                  "extents: %dx%dx%dx%d strides: %dx%dx%dx%d (%d bytes per element)\n",
                  (long long)size,
                  buf->extent[0], buf->extent[1], buf->extent[2], buf->extent[3],
                  buf->stride[0], buf->stride[1], buf->stride[2], buf->stride[3],
                  buf->elem_size);
#endif

    CUdeviceptr p;
    TIME_CALL( cuMemAlloc(&p, size), "dev_malloc" );

    buf->dev = (uint64_t)p;
    halide_assert(user_context, buf->dev);

#ifdef DEBUG
    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
#endif
}
// Used to generate correct timings when tracing
WEAK int halide_dev_sync(void *user_context) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_sync (user_context: %p)\n", user_context );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

#ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    CUresult err = cuCtxSynchronize();
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)",
                             _get_error_name(err));
        return err;
    }

#ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
#endif

    return 0;
}
WEAK int halide_dev_run(void *user_context,
                        void *state_ptr,
                        const char *entry_name,
                        int blocksX, int blocksY, int blocksZ,
                        int threadsX, int threadsY, int threadsZ,
                        int shared_mem_bytes,
                        size_t arg_sizes[],
                        void *args[]) {
    DEBUG_PRINTF( user_context,
                  "CUDA: halide_dev_run (user_context: %p, entry: %s, "
                  "blocks: %dx%dx%d, threads: %dx%dx%d, shmem: %d)\n",
                  user_context, entry_name,
                  blocksX, blocksY, blocksZ,
                  threadsX, threadsY, threadsZ,
                  shared_mem_bytes );

    CUresult err;
    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

#ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    halide_assert(user_context, state_ptr);
    CUmodule mod = ((module_state *)state_ptr)->module;
    halide_assert(user_context, mod);
    CUfunction f;
    err = cuModuleGetFunction(&f, mod, entry_name);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuModuleGetFunction failed (%s)",
                             _get_error_name(err));
        return err;
    }

    err = cuLaunchKernel(f,
                         blocksX, blocksY, blocksZ,
                         threadsX, threadsY, threadsZ,
                         shared_mem_bytes,
                         NULL, // stream
                         args,
                         NULL);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuLaunchKernel failed (%s)",
                             _get_error_name(err));
        return err;
    }

#ifdef DEBUG
    err = cuCtxSynchronize();
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuCtxSynchronize failed (%s)\n",
                             _get_error_name(err));
        return err;
    }
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
#endif

    return 0;
}
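// A hedged usage sketch for halide_dev_run (the kernel name, grid dimensions,
// and helper function are hypothetical). 'args' holds a pointer to each
// argument value; 'arg_sizes' carries the matching sizes and is 0-terminated,
// the convention the OpenCL variant later in this file relies on.
static int example_launch(void *user_context, void *state_ptr, buffer_t *buf) {
    uint64_t dev_ptr = buf->dev;  // device buffer argument
    int width = 1024;             // scalar argument (illustrative)
    size_t arg_sizes[] = { sizeof(dev_ptr), sizeof(width), 0 };
    void *args[]       = { &dev_ptr, &width, NULL };
    return halide_dev_run(user_context, state_ptr, "my_kernel",
                          8, 8, 1,    // blocks
                          16, 16, 1,  // threads
                          0,          // shared memory bytes
                          arg_sizes, args);
}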
WEAK bool halide_validate_dev_pointer(buffer_t *buf, size_t size = 0) {
    if (buf->dev == 0) {
        return true;
    }

    size_t real_size;
    cl_int result = clGetMemObjectInfo((cl_mem)buf->dev, CL_MEM_SIZE,
                                       sizeof(size_t), &real_size, NULL);
    if (result) {
        halide_printf("Bad device pointer %p: clGetMemObjectInfo returned %d\n",
                      (void *)buf->dev, result);
        return false;
    }

#ifdef DEBUG
    halide_printf("validate %p: asked for %lld, actual allocated %lld\n",
                  (void *)buf->dev, (long long)size, (long long)real_size);
#endif

    if (size) {
        halide_assert(real_size >= size && "Validating pointer with insufficient size");
    }
    return true;
}
WEAK int halide_dev_free(void *user_context, buffer_t *buf) {
    DEBUG_PRINTF( user_context, "CL: halide_dev_free (user_context: %p, buf: %p)\n",
                  user_context, buf );

    ClContext ctx(user_context);

    // halide_dev_free, at present, can be exposed to clients and they
    // should be allowed to call halide_dev_free on any buffer_t
    // including ones that have never been used with a GPU.
    if (buf->dev == 0) {
        return 0;
    }

#ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

    DEBUG_PRINTF( user_context, "    clReleaseMemObject %p\n", (cl_mem)buf->dev );
    cl_int result = clReleaseMemObject((cl_mem)buf->dev);
    // If clReleaseMemObject fails, it is unlikely to succeed in a later call, so
    // we just end our reference to it regardless.
    buf->dev = 0;
    if (result != CL_SUCCESS) {
        halide_error_varargs(user_context, "CL: clReleaseMemObject failed (%d)", result);
        return result;
    }

#ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
#endif

    return 0;
}
WEAK void halide_error(void *user_context, const char *msg) {
    if (halide_error_handler) {
        (*halide_error_handler)(user_context, msg);
    } else {
        halide_printf(user_context, "Error: %s\n", msg);
        exit(1);
    }
}
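// A hedged usage sketch: install a custom handler so halide_error reports
// through the application instead of printing and exiting. 'my_error_handler'
// is illustrative; we assume the halide_error_handler pointer used above can
// be assigned directly (or via a setter such as halide_set_error_handler, if
// this runtime provides one).
static void my_error_handler(void *user_context, const char *msg) {
    halide_printf(user_context, "halide error (logged, not fatal): %s\n", msg);
}
// ... at startup: halide_error_handler = my_error_handler;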
WEAK void halide_dev_run(void *user_context,
                         const char *entry_name,
                         int blocksX, int blocksY, int blocksZ,
                         int threadsX, int threadsY, int threadsZ,
                         int shared_mem_bytes,
                         size_t arg_sizes[],
                         void *args[]) {
    cl_kernel f = __get_kernel(user_context, entry_name);
#ifdef DEBUG
    halide_printf(user_context,
                  "dev_run %s with (%dx%dx%d) blks, (%dx%dx%d) threads, %d shmem\n",
                  entry_name,
                  blocksX, blocksY, blocksZ,
                  threadsX, threadsY, threadsZ,
                  shared_mem_bytes);
#endif
    // Pack dims
    size_t global_dim[3] = {blocksX * threadsX, blocksY * threadsY, blocksZ * threadsZ};
    size_t local_dim[3] = {threadsX, threadsY, threadsZ};

    // Set args
    int i = 0;
    while (arg_sizes[i] != 0) {
#ifdef DEBUG
        halide_printf(user_context, "clSetKernelArg %d %lld [0x%x ...]\n",
                      i, (long long)arg_sizes[i], *(int *)args[i]);
#endif
        CHECK_CALL( clSetKernelArg(f, i, arg_sizes[i], args[i]), "clSetKernelArg" );
        i++;
    }
    // Set the shared mem buffer last
    // Always set at least 1 byte of shmem, to keep the launch happy
    CHECK_CALL( clSetKernelArg(f, i, (shared_mem_bytes > 0) ? shared_mem_bytes : 1, NULL),
                "clSetKernelArg" );

    // Launch kernel
    cl_int err = clEnqueueNDRangeKernel( *cl_q, f,
                                         3, NULL, global_dim, local_dim,
                                         0, NULL, NULL );
    CHECK_ERR(err, "clEnqueueNDRangeKernel");
}
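// For example (illustrative numbers): a 1024x512 image tiled into 16x16
// thread blocks arrives here as blocksX=64, blocksY=32, blocksZ=1 and
// threadsX=threadsY=16, threadsZ=1, so the launch above uses
// global_dim = {1024, 512, 1} with local_dim = {16, 16, 1} -- the OpenCL
// NDRange equivalent of a CUDA grid of 64x32x1 blocks of 16x16x1 threads.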
WEAK int halide_copy_to_host(void *user_context, buffer_t *buf) {
    if (!buf->dev_dirty) {
        return 0;
    }

    DEBUG_PRINTF( user_context, "CUDA: halide_copy_to_host (user_context: %p, buf: %p)\n",
                  user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    // Need to check dev_dirty again, in case another thread did the
    // copy_to_host before the serialization point above.
    if (buf->dev_dirty) {
#ifdef DEBUG
        uint64_t t_before = halide_current_time_ns(user_context);
#endif

        halide_assert(user_context, buf->host && buf->dev);
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));

        _dev_copy c = _make_dev_to_host_copy(buf);

        for (int w = 0; w < c.extent[3]; w++) {
            for (int z = 0; z < c.extent[2]; z++) {
                for (int y = 0; y < c.extent[1]; y++) {
                    for (int x = 0; x < c.extent[0]; x++) {
                        uint64_t off = (x * c.stride_bytes[0] +
                                        y * c.stride_bytes[1] +
                                        z * c.stride_bytes[2] +
                                        w * c.stride_bytes[3]);
                        CUdeviceptr src = (CUdeviceptr)(c.src + off);
                        void *dst = (void *)(c.dst + off);
                        uint64_t size = c.chunk_size;
                        DEBUG_PRINTF( user_context,
                                      "    cuMemcpyDtoH (%d, %d, %d, %d), %p -> %p, %lld bytes\n",
                                      x, y, z, w, (void *)src, dst, (long long)size );
                        CUresult err = cuMemcpyDtoH(dst, src, size);
                        if (err != CUDA_SUCCESS) {
                            halide_error_varargs(user_context,
                                                 "CUDA: cuMemcpyDtoH failed (%s)",
                                                 _get_error_name(err));
                            return err;
                        }
                    }
                }
            }
        }

#ifdef DEBUG
        uint64_t t_after = halide_current_time_ns(user_context);
        halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
#endif
    }
    buf->dev_dirty = false;
    return 0;
}
WEAK void halide_do_par_for(void (*f)(int, uint8_t *), int min, int size, uint8_t *closure) {
    if (halide_custom_do_par_for) {
        (*halide_custom_do_par_for)(f, min, size, closure);
        return;
    }
    if (!halide_thread_pool_initialized) {
        halide_work_queue.shutdown = false;
        pthread_mutex_init(&halide_work_queue.mutex, NULL);
        pthread_cond_init(&halide_work_queue.not_empty, NULL);
        halide_work_queue.head = halide_work_queue.tail = 0;
        halide_work_queue.ids = 1;
        char *threadStr = getenv("HL_NUMTHREADS");
#ifdef _LP64
        // On 64-bit systems we use 8 threads by default
        halide_threads = 8;
#else
        // On 32-bit systems we use 2 threads by default
        halide_threads = 2;
#endif
        if (threadStr) {
            halide_threads = atoi(threadStr);
        } else {
            halide_printf("HL_NUMTHREADS not defined. Defaulting to %d threads.\n",
                          halide_threads);
        }
        if (halide_threads > MAX_THREADS) {
            halide_threads = MAX_THREADS;
        }
        for (int i = 0; i < halide_threads - 1; i++) {
            //fprintf(stderr, "Creating thread %d\n", i);
            pthread_create(halide_work_queue.threads + i, NULL, halide_worker_thread, NULL);
        }

        halide_thread_pool_initialized = true;
    }

    // Enqueue the job
    pthread_mutex_lock(&halide_work_queue.mutex);
    //fprintf(stderr, "Enqueuing some work\n");
    work job = {f, min, min + size, closure, halide_work_queue.ids++, 0};
    if (job.id == 0) job.id = halide_work_queue.ids++; // disallow zero, as it flags a completed job
    halide_work_queue.jobs[halide_work_queue.tail] = job;
    work *jobPtr = halide_work_queue.jobs + halide_work_queue.tail;
    worker_arg arg = {job.id, jobPtr};
    int new_tail = (halide_work_queue.tail + 1) % MAX_JOBS;
    //assert(new_tail != halide_work_queue.head);
    halide_work_queue.tail = new_tail;
    // TODO: check to make sure the work queue doesn't overflow
    pthread_mutex_unlock(&halide_work_queue.mutex);

    //fprintf(stderr, "Waking up workers\n");
    // Wake up everyone
    pthread_cond_broadcast(&halide_work_queue.not_empty);

    // Do some work myself
    //fprintf(stderr, "Doing some work on job %d\n", arg.id);
    halide_worker_thread((void *)(&arg));
    //fprintf(stderr, "Parallel for done\n");
}
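// A minimal usage sketch for halide_do_par_for. The task function receives a
// loop index and the opaque closure pointer; the struct and names here are
// hypothetical.
struct ScaleClosure {
    const int *src;
    int *dst;
};
static void scale_task(int i, uint8_t *closure) {
    ScaleClosure *c = (ScaleClosure *)closure;
    c->dst[i] = 2 * c->src[i];  // independent per-index work, no shared writes
}
// ... later: halide_do_par_for(scale_task, /*min*/ 0, /*size*/ 256, (uint8_t *)&my_closure);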
WEAK bool halide_validate_dev_pointer(buffer_t *buf) {
    CUcontext ctx;
    CUresult result = cuPointerGetAttribute(&ctx, CU_POINTER_ATTRIBUTE_CONTEXT, buf->dev);
    if (result) {
        halide_printf("Bad device pointer %p: cuPointerGetAttribute returned %d\n",
                      (void *)buf->dev, result);
        return false;
    }
    return true;
}
WEAK void halide_dev_free(buffer_t *buf) {
    // Guard against buffers that were never used with a GPU, matching the
    // other dev_free variants; cuMemFree on a null pointer is an error.
    if (buf->dev == 0) return;
#ifdef DEBUG
    halide_printf("In dev_free of %p - dev: 0x%p\n", buf, (void *)buf->dev);
    halide_assert(halide_validate_dev_pointer(buf));
#endif
    CHECK_CALL( cuMemFree(buf->dev), "cuMemFree" );
    buf->dev = 0;
}
static cl_kernel __get_kernel(void *user_context, const char *entry_name) {
    cl_kernel f;
#ifdef DEBUG
    halide_printf(user_context, "get_kernel %s\n", entry_name);
#endif
    // Get kernel function ptr
    cl_int err;
    f = clCreateKernel(__mod, entry_name, &err);
    CHECK_ERR(err, "clCreateKernel");
    return f;
}
WEAK GLuint halide_opengl_make_shader(void *user_context, GLenum type,
                                      const char *source, GLint *length) {
    GLuint shader = ST.CreateShader(type);
    ST.ShaderSource(shader, 1, (const GLchar **)&source, length);
    ST.CompileShader(shader);

    GLint shader_ok = 0;
    ST.GetShaderiv(shader, GL_COMPILE_STATUS, &shader_ok);
    if (!shader_ok) {
        halide_printf(user_context, "Could not compile shader:\n");
        GLint log_len;
        ST.GetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_len);
        char *log = (char *)malloc(log_len);
        ST.GetShaderInfoLog(shader, log_len, NULL, log);
        halide_printf(user_context, "%s", log);
        free(log);
        ST.DeleteShader(shader);
        return 0;
    }
    return shader;
}
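// A brief usage sketch, mirroring the call in halide_opengl_init_kernels
// above; 'frag_src' is a placeholder for a GLSL fragment shader string.
// GLuint fs = halide_opengl_make_shader(user_context, GL_FRAGMENT_SHADER, frag_src, NULL);
// if (fs == 0) { /* compilation failed; the info log was already printed */ }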
WEAK int halide_dev_malloc(void *user_context, buffer_t *buf) {
    DEBUG_PRINTF( user_context, "CUDA: halide_dev_malloc (user_context: %p, buf: %p)\n",
                  user_context, buf );

    CudaContext ctx(user_context);
    if (ctx.error != CUDA_SUCCESS) {
        return ctx.error;
    }

    size_t size = _buf_size(user_context, buf);
    if (buf->dev) {
        // This buffer already has a device allocation
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));
        return 0;
    }

    halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 &&
                                buf->stride[2] >= 0 && buf->stride[3] >= 0);

    DEBUG_PRINTF(user_context,
                 "    allocating buffer of %lld bytes, "
                 "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld "
                 "(%d bytes per element)\n",
                 (long long)size,
                 (long long)buf->extent[0], (long long)buf->extent[1],
                 (long long)buf->extent[2], (long long)buf->extent[3],
                 (long long)buf->stride[0], (long long)buf->stride[1],
                 (long long)buf->stride[2], (long long)buf->stride[3],
                 buf->elem_size);

#ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    CUdeviceptr p;
    DEBUG_PRINTF( user_context, "    cuMemAlloc %lld -> ", (long long)size );
    CUresult err = cuMemAlloc(&p, size);
    if (err != CUDA_SUCCESS) {
        DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err) );
        halide_error_varargs(user_context, "CUDA: cuMemAlloc failed (%s)",
                             _get_error_name(err));
        return err;
    } else {
        DEBUG_PRINTF( user_context, "%p\n", (void *)p );
    }
    halide_assert(user_context, p);
    buf->dev = (uint64_t)p;

#ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
#endif

    return 0;
}
WEAK void halide_copy_to_dev(void *user_context, buffer_t *buf) {
    if (buf->host_dirty) {
        halide_assert(user_context, buf->host && buf->dev);
        size_t size = __buf_size(user_context, buf);
#ifdef DEBUG
        halide_printf(user_context, "copy_to_dev (%lld bytes) %p -> %p\n",
                      (long long)size, buf->host, (void *)buf->dev);
#endif
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
        cl_int err = clEnqueueWriteBuffer( *cl_q, (cl_mem)((void *)buf->dev),
                                           CL_TRUE, 0, size, buf->host,
                                           0, NULL, NULL );
        CHECK_ERR( err, "clEnqueueWriteBuffer" );
    }
    buf->host_dirty = false;
}
// Initialize the runtime, in particular all fields in halide_opengl_state.
EXPORT int halide_opengl_init(void *user_context) {
    if (ST.initialized) return 0;

    // Make a context if there isn't one
    if (halide_opengl_create_context(user_context)) {
        halide_printf(user_context, "Failed to make OpenGL context\n");
        return 1;
    }

    // Initialize pointers to OpenGL functions.
#define GLFUNC(TYPE, VAR)                                                   \
    ST.VAR = (TYPE)halide_opengl_get_proc_address(user_context, "gl" #VAR); \
    if (!ST.VAR) {                                                          \
        halide_printf(user_context, "Could not load function pointer for %s\n", "gl" #VAR); \
        return 1;                                                           \
    }
    USED_GL_FUNCTIONS;
#undef GLFUNC

    ST.kernels = NULL;
    ST.textures = NULL;

    // Initialize all OpenGL objects that are shared between kernels.
    ST.GenFramebuffers(1, &ST.framebuffer_id);
    CHECK_GLERROR(1);

    ST.vertex_shader_id = halide_opengl_make_shader(user_context, GL_VERTEX_SHADER,
                                                    vertex_shader_src, NULL);
    if (ST.vertex_shader_id == 0) {
        halide_error(user_context, "Failed to create vertex shader");
        return 1;
    }

    GLuint buf;
    ST.GenBuffers(1, &buf);
    ST.BindBuffer(GL_ARRAY_BUFFER, buf);
    ST.BufferData(GL_ARRAY_BUFFER, sizeof(square_vertices), square_vertices, GL_STATIC_DRAW);
    CHECK_GLERROR(1);
    ST.vertex_buffer = buf;

    ST.GenBuffers(1, &buf);
    ST.BindBuffer(GL_ELEMENT_ARRAY_BUFFER, buf);
    ST.BufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(square_indices), square_indices, GL_STATIC_DRAW);
    CHECK_GLERROR(1);
    ST.element_buffer = buf;

    ST.initialized = true;
    return 0;
}
WEAK int halide_dev_malloc(void *user_context, buffer_t *buf) {
    DEBUG_PRINTF( user_context, "CL: halide_dev_malloc (user_context: %p, buf: %p)\n",
                  user_context, buf );

    ClContext ctx(user_context);
    if (ctx.error != CL_SUCCESS) {
        return ctx.error;
    }

    size_t size = _buf_size(user_context, buf);
    if (buf->dev) {
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));
        return 0;
    }

    halide_assert(user_context, buf->stride[0] >= 0 && buf->stride[1] >= 0 &&
                                buf->stride[2] >= 0 && buf->stride[3] >= 0);

    DEBUG_PRINTF(user_context,
                 "    Allocating buffer of %lld bytes, "
                 "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld "
                 "(%d bytes per element)\n",
                 (long long)size,
                 (long long)buf->extent[0], (long long)buf->extent[1],
                 (long long)buf->extent[2], (long long)buf->extent[3],
                 (long long)buf->stride[0], (long long)buf->stride[1],
                 (long long)buf->stride[2], (long long)buf->stride[3],
                 buf->elem_size);

#ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    cl_int err;
    DEBUG_PRINTF( user_context, "    clCreateBuffer %lld -> ", (long long)size );
    buf->dev = (uint64_t)clCreateBuffer(ctx.context, CL_MEM_READ_WRITE, size, NULL, &err);
    if (err != CL_SUCCESS || buf->dev == 0) {
        DEBUG_PRINTF( user_context, "%d\n", err );
        halide_error_varargs(user_context, "CL: clCreateBuffer failed (%d)\n", err);
        return err;
    } else {
        DEBUG_PRINTF( user_context, "%p\n", (cl_mem)buf->dev );
    }

    DEBUG_PRINTF(user_context, "    Allocated device buffer %p for buffer %p\n",
                 (void *)buf->dev, buf);

#ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
#endif

    return CL_SUCCESS;
}
static void print_buffer(void *user_context, buffer_t *buf) {
    halide_printf(user_context, "  dev: %llu\n", (unsigned long long)buf->dev);
    halide_printf(user_context, "  host: %p\n", buf->host);
    halide_printf(user_context, "  extent: %d %d %d %d\n",
                  buf->extent[0], buf->extent[1], buf->extent[2], buf->extent[3]);
    halide_printf(user_context, "  stride: %d %d %d %d\n",
                  buf->stride[0], buf->stride[1], buf->stride[2], buf->stride[3]);
    halide_printf(user_context, "  min: %d %d %d %d\n",
                  buf->min[0], buf->min[1], buf->min[2], buf->min[3]);
    halide_printf(user_context, "  elem_size: %d\n", buf->elem_size);
    halide_printf(user_context, "  host_dirty: %d, dev_dirty: %d\n",
                  buf->host_dirty, buf->dev_dirty);
}
WEAK void halide_release(void *user_context) {
    // TODO: this is for timing; bad for release-mode performance
#ifdef DEBUG
    halide_printf(user_context, "dev_sync on exit\n");
#endif
    halide_dev_sync(user_context);

    // Unload the module
    if (__mod) {
#ifdef DEBUG
        halide_printf(user_context, "clReleaseProgram %p\n", __mod);
#endif
        CHECK_CALL( clReleaseProgram(__mod), "clReleaseProgram" );
        __mod = 0;
    }

    // TODO: This is not a good solution to deal with this problem (finding out if the
    // cl_ctx/cl_q are going to be freed). I think a larger redesign of the global
    // context scheme might be necessary.
    cl_uint refs = 0;
    clGetContextInfo(*cl_ctx, CL_CONTEXT_REFERENCE_COUNT, sizeof(refs), &refs, NULL);

    // Unload context (ref counted).
    CHECK_CALL( clReleaseCommandQueue(*cl_q), "clReleaseCommandQueue" );
#ifdef DEBUG
    halide_printf(user_context, "clReleaseContext %p\n", *cl_ctx);
#endif
    CHECK_CALL( clReleaseContext(*cl_ctx), "clReleaseContext" );

    // See TODO above...
    if (--refs == 0) {
        *cl_ctx = NULL;
        *cl_q = NULL;
    }
}
WEAK void halide_dev_free(buffer_t *buf) {
    // halide_dev_free, at present, can be exposed to clients and they
    // should be allowed to call halide_dev_free on any buffer_t
    // including ones that have never been used with a GPU.
    if (buf->dev == 0) return;

#ifdef DEBUG
    halide_printf("In dev_free of %p - dev: 0x%p\n", buf, (void *)buf->dev);
#endif

    halide_assert(halide_validate_dev_pointer(buf));
    CHECK_CALL( clReleaseMemObject((cl_mem)buf->dev), "clReleaseMemObject" );
    buf->dev = 0;
}
WEAK void halide_dev_malloc(void *user_context, buffer_t *buf) {
    if (buf->dev) {
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
        return;
    }

    size_t size = __buf_size(user_context, buf);

#ifdef DEBUG
    halide_printf(user_context,
                  "dev_malloc allocating buffer of %lld bytes, "
                  "extents: %lldx%lldx%lldx%lld strides: %lldx%lldx%lldx%lld "
                  "(%d bytes per element)\n",
                  (long long)size,
                  (long long)buf->extent[0], (long long)buf->extent[1],
                  (long long)buf->extent[2], (long long)buf->extent[3],
                  (long long)buf->stride[0], (long long)buf->stride[1],
                  (long long)buf->stride[2], (long long)buf->stride[3],
                  buf->elem_size);
#endif

    buf->dev = (uint64_t)__dev_malloc(user_context, size);

#ifdef DEBUG
    halide_printf(user_context, "dev_malloc allocated buffer %p with buf->dev of %p\n",
                  buf, (void *)buf->dev);
#endif

    halide_assert(user_context, buf->dev);
}
WEAK void halide_copy_to_host(void *user_context, buffer_t *buf) {
    if (buf->dev_dirty) {
        clFinish(*cl_q); // block on completion before read back
        halide_assert(user_context, buf->host && buf->dev);
        size_t size = __buf_size(user_context, buf);
#ifdef DEBUG
        halide_printf(user_context, "copy_to_host buf %p (%lld bytes) %p -> %p\n",
                      buf, (long long)size, (void *)buf->dev, buf->host);
#endif
        halide_assert(user_context, halide_validate_dev_pointer(user_context, buf, size));
        cl_int err = clEnqueueReadBuffer( *cl_q, (cl_mem)((void *)buf->dev),
                                          CL_TRUE, 0, size, buf->host,
                                          0, NULL, NULL );
        CHECK_ERR( err, "clEnqueueReadBuffer" );
    }
    buf->dev_dirty = false;
}
WEAK void halide_dev_free(void *user_context, buffer_t *buf) {
    // halide_dev_free, at present, can be exposed to clients and they
    // should be allowed to call halide_dev_free on any buffer_t
    // including ones that have never been used with a GPU.
    if (buf->dev == 0) return;

#ifdef DEBUG
    halide_printf(user_context, "In dev_free of %p - dev: 0x%p\n", buf, (void *)buf->dev);
    halide_assert(user_context, halide_validate_dev_pointer(user_context, buf));
#endif

    CHECK_CALL( cuMemFree(buf->dev), "cuMemFree" );
    buf->dev = 0;
}
WEAK int halide_init_kernels(void *user_context, void **state_ptr,
                             const char *ptx_src, int size) {
    DEBUG_PRINTF( user_context,
                  "CUDA: halide_init_kernels (user_context: %p, state_ptr: %p, "
                  "ptx_src: %p, %i)\n",
                  user_context, state_ptr, ptx_src, size );

    CudaContext ctx(user_context);
    if (ctx.error != 0) {
        return ctx.error;
    }

#ifdef DEBUG
    uint64_t t_before = halide_current_time_ns(user_context);
#endif

    // Create the state object if necessary. This only happens once, regardless
    // of how many times halide_init_kernels/halide_release is called.
    // halide_release traverses this list and releases the module objects, but
    // it does not modify the list nodes created/inserted here.
    module_state **state = (module_state **)state_ptr;
    if (!(*state)) {
        *state = (module_state *)malloc(sizeof(module_state));
        (*state)->module = NULL;
        (*state)->next = state_list;
        state_list = *state;
    }

    // Create the module itself if necessary.
    if (!(*state)->module) {
        DEBUG_PRINTF( user_context, "    cuModuleLoadData %p, %i -> ", ptx_src, size );
        CUresult err = cuModuleLoadData(&(*state)->module, ptx_src);
        if (err != CUDA_SUCCESS) {
            DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err) );
            halide_error_varargs(user_context, "CUDA: cuModuleLoadData failed (%s)",
                                 _get_error_name(err));
            return err;
        } else {
            DEBUG_PRINTF( user_context, "%p\n", (*state)->module );
        }
    }

#ifdef DEBUG
    uint64_t t_after = halide_current_time_ns(user_context);
    halide_printf(user_context, "    Time: %f ms\n", (t_after - t_before) / 1.0e6);
#endif

    return 0;
}
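// A hedged sketch of the caller's side: generated code is expected to keep
// one static state pointer per compiled pipeline, so the PTX module above is
// loaded only on the first call. 'my_ptx' and its length are placeholders.
// static void *pipeline_state = NULL;
// ...
// halide_init_kernels(user_context, &pipeline_state, my_ptx, my_ptx_size);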
WEAK void halide_dev_malloc(buffer_t *buf) {
    if (buf->dev) {
        halide_assert(halide_validate_dev_pointer(buf));
        return;
    }

    size_t size = __buf_size(buf);

#ifdef DEBUG
    halide_printf("dev_malloc allocating buffer of %lld bytes, "
                  "extents: %dx%dx%dx%d strides: %dx%dx%dx%d (%d bytes per element)\n",
                  (long long)size,
                  buf->extent[0], buf->extent[1], buf->extent[2], buf->extent[3],
                  buf->stride[0], buf->stride[1], buf->stride[2], buf->stride[3],
                  buf->elem_size);
#endif

    buf->dev = (uint64_t)__dev_malloc(size);
    halide_assert(buf->dev);
}
WEAK void halide_release() {
    // TODO: this is for timing; bad for release-mode performance
#ifdef DEBUG
    halide_printf("dev_sync on exit\n");
#endif
    halide_dev_sync();

    // Unload the module
    if (__mod) {
        CHECK_CALL( clReleaseProgram(__mod), "clReleaseProgram" );
        __mod = 0;
    }

    // Unload context (ref counted).
    CHECK_CALL( clReleaseCommandQueue(cl_q), "clReleaseCommandQueue" );
    CHECK_CALL( clReleaseContext(cl_ctx), "clReleaseContext" );
}
// Release all data allocated by the runtime.
//
// The OpenGL context itself is generally managed by the host application, so
// we leave it untouched.
EXPORT void halide_opengl_release(void *user_context) {
    CHECK_INITIALIZED();
    ST.DeleteShader(ST.vertex_shader_id);
    ST.DeleteFramebuffers(1, &ST.framebuffer_id);

    HalideOpenGLKernel *cur = ST.kernels;
    while (cur) {
        HalideOpenGLKernel *next = cur->next;
        halide_opengl_delete_kernel(user_context, cur);
        cur = next;
    }

    // Delete all textures that were allocated by us.
    HalideOpenGLTexture *tex = ST.textures;
    int freed_textures = 0;
    while (tex) {
        HalideOpenGLTexture *next = tex->next;
        if (tex->halide_allocated) {
            ST.DeleteTextures(1, &tex->id);
            CHECK_GLERROR();
            freed_textures++;
        }
        free(tex);
        tex = next;
    }
#ifdef DEBUG
    if (freed_textures > 0) {
        halide_printf(user_context,
                      "halide_opengl_release: deleted %d dangling texture(s).\n",
                      freed_textures);
    }
#endif

    ST.DeleteBuffers(1, &ST.vertex_buffer);
    ST.DeleteBuffers(1, &ST.element_buffer);

    ST.vertex_shader_id = 0;
    ST.framebuffer_id = 0;
    ST.vertex_buffer = 0;
    ST.element_buffer = 0;
    ST.kernels = NULL;
    ST.textures = NULL;
    ST.initialized = false;
}