/**
 * Get the platform wrapper for the given device wrapper.
 *
 * @public @memberof ccl_platform
 *
 * @param[in] dev The device wrapper from where to get a platform wrapper.
 * @param[out] err Return location for a ::CCLErr object, or `NULL` if error
 * reporting is to be ignored.
 * @return The platform wrapper for the given device wrapper or `NULL` in
 * case an error occurs.
 * */
CCL_EXPORT
CCLPlatform * ccl_platform_new_from_device(CCLDevice * dev, CCLErr ** err) {

    /* Make sure dev is not NULL. */
    g_return_val_if_fail(dev != NULL, NULL);
    /* Make sure err is NULL or it is not set. */
    g_return_val_if_fail(err == NULL || *err == NULL, NULL);

    /* The OpenCL platform_id object. */
    cl_platform_id platform_id;
    /* The platform wrapper to return. */
    CCLPlatform * platf = NULL;
    /* Internal error object. */
    CCLErr * err_internal = NULL;

    /* Get OpenCL platform_id object from device. */
    platform_id = ccl_device_get_info_scalar(
        dev, CL_DEVICE_PLATFORM, cl_platform_id, &err_internal);
    /* Use the library's own error-propagation macro, consistent with the
     * rest of the codebase (the previous `g_if_err_propagate_goto` is not
     * a GLib or cf4ocl macro). */
    ccl_if_err_propagate_goto(err, err_internal, error_handler);

    /* Create/get the platform wrapper. */
    platf = ccl_platform_new_wrap(platform_id);

    /* If we got here, everything is OK. */
    g_assert(err == NULL || *err == NULL);
    goto finish;

error_handler:
    /* If we got here there was an error, verify that it is so. */
    g_assert(err == NULL || *err != NULL);

finish:
    /* Return the device platform wrapper. */
    return platf;
}
/**
 * Suggest appropriate local (and optionally global) work sizes for the
 * given real work size, based on device and kernel characteristics.
 *
 * If the `gws` parameter is not `NULL`, it will be populated with a
 * global worksize which may be larger than the real work size
 * in order to better fit the kernel preferred multiple work size. As
 * such, kernels enqueued with global work sizes suggested by this
 * function should check if their global ID is within `real_worksize`.
 *
 * @public @memberof ccl_kernel
 *
 * @param[in] krnl Kernel wrapper object. If `NULL`, use only device
 * information for determining global and local worksizes.
 * @param[in] dev Device wrapper object.
 * @param[in] dims The number of dimensions used to specify the global
 * work-items and work-items in the work-group.
 * @param[in] real_worksize The real worksize.
 * @param[out] gws Location where to place a "nice" global worksize for
 * the given kernel and device, which must be equal or larger than the
 * `real_worksize` and a multiple of `lws`. This memory location should
 * be pre-allocated with space for `dims` values of size `size_t`. If
 * `NULL` it is assumed that the global worksize must be equal to
 * `real_worksize`.
 * @param[in,out] lws This memory location, of size
 * `dims * sizeof(size_t)`, serves a dual purpose: 1) as an input,
 * containing the maximum allowed local work size for each dimension, or
 * zeros if these maximums are to be fetched from the given device
 * `CL_DEVICE_MAX_WORK_ITEM_SIZES` information (if the specified values
 * are larger than the device limits, the device limits are used
 * instead); 2) as an output, where to place a "nice" local worksize,
 * which is based and respects the limits of the given kernel and device
 * (and of the non-zero values given as input).
 * @param[out] err Return location for a ::CCLErr object, or `NULL` if error
 * reporting is to be ignored.
 * @return `CL_TRUE` if function returns successfully, `CL_FALSE`
 * otherwise.
 * */
CCL_EXPORT
cl_bool ccl_kernel_suggest_worksizes(CCLKernel* krnl, CCLDevice* dev,
    cl_uint dims, const size_t* real_worksize, size_t* gws, size_t* lws,
    CCLErr** err) {

    /* Make sure dev is not NULL. */
    g_return_val_if_fail(dev != NULL, CL_FALSE);
    /* Make sure dims not zero. */
    g_return_val_if_fail(dims > 0, CL_FALSE);
    /* Make sure real_worksize is not NULL. */
    g_return_val_if_fail(real_worksize != NULL, CL_FALSE);
    /* Make sure lws is not NULL. */
    g_return_val_if_fail(lws != NULL, CL_FALSE);
    /* Make sure err is NULL or it is not set. */
    g_return_val_if_fail(err == NULL || *err == NULL, CL_FALSE);

    /* The preferred workgroup size multiple and the maximum workgroup
     * size (kernel- or device-reported). */
    size_t wg_size_mult = 0;
    size_t wg_size_max = 0;
    /* Current total workgroup size, and a copy of it used to detect
     * lack of progress in the shrinking loop below. */
    size_t wg_size = 1, wg_size_aux;
    /* Per-dimension maximum work-item sizes, fetched from the device. */
    size_t* max_wi_sizes;
    /* Number of dimensions supported by the device. */
    cl_uint dev_dims;
    /* Function return status. */
    cl_bool ret_status;
    /* Total real worksize (product over all dimensions).
     * NOTE(review): accumulated below but never read afterwards —
     * looks like a leftover; confirm before removing. */
    size_t real_ws = 1;
    /* Error handling object. */
    CCLErr* err_internal = NULL;

    /* Check if device supports the requested dims. */
    dev_dims = ccl_device_get_info_scalar(
        dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint, &err_internal);
    ccl_if_err_propagate_goto(err, err_internal, error_handler);
    /* NOTE(review): the two ccl_if_err_create_goto invocations below
     * pass `*err`, which dereferences the err parameter; this appears
     * to require err != NULL — confirm against the macro's definition. */
    ccl_if_err_create_goto(*err, CCL_ERROR, dims > dev_dims,
        CCL_ERROR_UNSUPPORTED_OCL, error_handler,
        "%s: device only supports a maximum of %d dimension(s), "
        "but %d were requested.", CCL_STRD, dev_dims, dims);

    /* Get max. work item sizes for device. */
    max_wi_sizes = ccl_device_get_info_array(
        dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, size_t*, &err_internal);
    ccl_if_err_propagate_goto(err, err_internal, error_handler);

    /* For each dimension, if the user specified a maximum local work
     * size, the effective maximum local work size will be the minimum
     * between the user value and the device value. */
    for (cl_uint i = 0; i < dims; ++i) {
        if (lws[i] != 0)
            max_wi_sizes[i] = MIN(max_wi_sizes[i], lws[i]);
    }

    /* If kernel is not NULL, query it about workgroup size preferences
     * and capabilities. */
    if (krnl != NULL) {

        /* Determine maximum workgroup size for this kernel on this
         * device. An "info unavailable" condition is tolerated here:
         * wg_size_max stays 0 and device-level info is used below. */
        wg_size_max = ccl_kernel_get_workgroup_info_scalar(krnl, dev,
            CL_KERNEL_WORK_GROUP_SIZE, size_t, &err_internal);
        ccl_if_err_not_info_unavailable_propagate_goto(
            err, err_internal, error_handler);

#ifdef CL_VERSION_1_1

        /* Determine preferred workgroup size multiple (OpenCL >= 1.1). */

        /* Get OpenCL version of the underlying platform. */
        cl_uint ocl_ver = ccl_kernel_get_opencl_version(krnl, &err_internal);
        ccl_if_err_propagate_goto(err, err_internal, error_handler);

        /* If OpenCL version of the underlying platform is >= 1.1 ... */
        if (ocl_ver >= 110) {
            /* ...use CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE... */
            wg_size_mult = ccl_kernel_get_workgroup_info_scalar(
                krnl, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                size_t, &err_internal);
            ccl_if_err_not_info_unavailable_propagate_goto(
                err, err_internal, error_handler);
        } else {
            /* ...otherwise just use CL_KERNEL_WORK_GROUP_SIZE. */
            wg_size_mult = wg_size_max;
        }
#else
        /* Headers predate OpenCL 1.1: fall back to the maximum. */
        wg_size_mult = wg_size_max;
#endif

    }

    /* If it was not possible to obtain wg_size_mult and wg_size_max, either
     * because kernel is NULL or the information was unavailable, use values
     * obtained from device. */
    if ((wg_size_max == 0) && (wg_size_mult == 0)) {
        wg_size_max = ccl_device_get_info_scalar(
            dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t, &err_internal);
        ccl_if_err_propagate_goto(err, err_internal, error_handler);
        wg_size_mult = wg_size_max;
    }

    /* Try to find an appropriate local worksize. */
    for (cl_uint i = 0; i < dims; ++i) {
        /* Each lws component is at most the preferred workgroup
         * multiple or the maximum size of that component in device. */
        lws[i] = MIN(wg_size_mult, max_wi_sizes[i]);
        /* Update total workgroup size. */
        wg_size *= lws[i];
        /* Update total real worksize. */
        real_ws *= real_worksize[i];
    }

    /* Don't let each component of the local worksize to be
     * higher than the respective component of the real
     * worksize. */
    for (cl_uint i = 0; i < dims; ++i) {
        while (lws[i] > real_worksize[i]) {
            lws[i] /= 2;
            wg_size /= 2;
        }
    }

    /* The total workgroup size can't be higher than the maximum
     * supported by the device: halve components (last dimension first)
     * until the total fits. */
    while (wg_size > wg_size_max) {
        wg_size_aux = wg_size;
        /* Signed index on purpose: the loop must be able to go below 0
         * to terminate. */
        for (int i = dims - 1; i >= 0; --i) {
            if (lws[i] > 1) {
                /* Local work size can't be smaller than 1. */
                lws[i] /= 2;
                wg_size /= 2;
            }
            if (wg_size <= wg_size_max) break;
        }
        /* Avoid infinite loops and throw error if wg_size didn't
         * change. */
        ccl_if_err_create_goto(*err, CCL_ERROR, wg_size == wg_size_aux,
            CCL_ERROR_OTHER, error_handler,
            "%s: Unable to determine a work size within the device limit (%d).",
            CCL_STRD, (int) wg_size_max);
    }

    /* If output variable gws is not NULL... */
    if (gws != NULL) {

        /* ...find a global worksize which is a multiple of the local
         * worksize and is big enough to handle the real worksize. */
        for (cl_uint i = 0; i < dims; ++i) {
            gws[i] = ((real_worksize[i] / lws[i])
                + (((real_worksize[i] % lws[i]) > 0) ? 1 : 0))
                * lws[i];
        }

    } else {

        /* ...otherwise check if found local worksizes are divisors of
         * the respective real_worksize. If so keep them, otherwise find
         * local worksizes which respect the maximum sizes allowed by
         * the kernel and the device, and is a dimension-wise divisor of
         * the real_worksize. */
        cl_bool lws_are_divisors = CL_TRUE;
        for (cl_uint i = 0; i < dims; ++i) {
            /* Check if lws[i] is divisor of real_worksize[i]. */
            if (real_worksize[i] % lws[i] != 0) {
                /* Ops... lws[i] is not divisor of real_worksize[i], so
                 * we'll have to try and find new lws ahead. */
                lws_are_divisors = CL_FALSE;
                break;
            }
        }

        /* Is lws divisor of real_worksize, dimension-wise? */
        if (!lws_are_divisors) {

            /* No, so we'll have to find new lws. */
            wg_size = 1;
            for (cl_uint i = 0; i < dims; ++i) {

                /* For each dimension, try to use the previously
                 * found lws[i]. */
                if ((real_worksize[i] % lws[i] != 0)
                    || (lws[i] * wg_size > wg_size_max)) {

                    /* Previously found lws[i] not usable, find a new
                     * one. Must be a divisor of real_worksize[i]
                     * and respect the kernel and device maximum lws. */
                    cl_uint best_lws_i = 1;
                    for (cl_uint j = 2; j <= real_worksize[i] / 2; ++j) {
                        /* If current value is higher than the kernel
                         * and device limits, stop searching and use
                         * the best one so far. */
                        if ((wg_size * j > wg_size_max)
                            || (j > max_wi_sizes[i]))
                            break;
                        /* Otherwise check if current value is divisor
                         * of real_worksize[i]. If so, keep it as the
                         * best so far. */
                        if (real_worksize[i] % j == 0)
                            best_lws_i = j;
                    }
                    /* Keep the best divisor for current dimension. */
                    lws[i] = best_lws_i;
                }
                /* Update absolute workgroup size (all dimensions). */
                wg_size *= lws[i];
            }
        }
    }

    /* If we got here, everything is OK. */
    g_assert(err == NULL || *err == NULL);
    ret_status = CL_TRUE;
    goto finish;

error_handler:
    /* If we got here there was an error, verify that it is so. */
    g_assert(err == NULL || *err != NULL);
    ret_status = CL_FALSE;

finish:
    /* Return status. */
    return ret_status;
}
/**
 * Cellular automata sample main function.
 *
 * Runs CA_ITERS iterations of a cellular automaton on an OpenCL device,
 * double-buffering between two 2D images, with kernel execution and
 * host-device transfers driven by two worker threads. Each frame is
 * saved to a PNG file and profiling information is printed/exported.
 *
 * NOTE(review): this function references several file-scope objects not
 * visible in this block (queue_exec, queue_comm, comm_thread_queue,
 * exec_thread_queue, host_thread_queue, go_msg, stop_msg, origin,
 * region, real_ws, exec_func, comm_func, plus the CA_* and
 * IMAGE_FILE_* constants) — presumably defined elsewhere in this file;
 * confirm.
 *
 * @param[in] argc Number of command line arguments.
 * @param[in] argv Command line arguments: argv[1] (optional) is the
 * device index, argv[2] (optional) is the RNG seed.
 * @return Always 0 (errors terminate the process via HANDLE_ERROR /
 * ERROR_MSG_AND_EXIT).
 * */
int main(int argc, char* argv[]) {

    /* Wrappers for OpenCL objects. */
    CCLContext* ctx;
    CCLDevice* dev;
    CCLImage* img1;
    CCLImage* img2;
    CCLProgram* prg;
    CCLKernel* krnl;
    CCLEvent* evt1;
    CCLEvent* evt2;
    /* Event wait list, used to synchronize each iteration. */
    CCLEventWaitList ewl = NULL;
    /* Profiler object. */
    CCLProf* prof;
    /* Output images filename. */
    char* filename;
    /* Selected device, may be given in command line. */
    int dev_idx = -1;
    /* Error handling object (must be NULL).
     * NOTE(review): declared as GError* while other functions in the
     * codebase use CCLErr* — presumably an alias; confirm and make
     * consistent. */
    GError* err = NULL;
    /* Does selected device support images? */
    cl_bool image_ok;
    /* Initial sim state. */
    cl_uchar4* input_image;
    /* Simulation states (CA_ITERS + 1 frames). */
    cl_uchar4** output_images;
    /* RNG seed, may be given in command line. */
    unsigned int seed;
    /* Image file write status. */
    int file_write_status;
    /* Image format: RGBA, 8 bits per channel. */
    cl_image_format image_format = { CL_RGBA, CL_UNSIGNED_INT8 };
    /* Thread data. */
    struct thread_data td;
    /* Global and local worksizes. */
    size_t gws[2];
    size_t lws[2];
    /* Threads. */
    GThread* comm_thread;
    GThread* exec_thread;

    /* Check arguments. */
    if (argc >= 2) {
        /* Check if a device was specified in the command line. */
        dev_idx = atoi(argv[1]);
    }
    if (argc >= 3) {
        /* Check if a RNG seed was specified. */
        seed = atoi(argv[2]);
    } else {
        seed = (unsigned int) time(NULL);
    }

    /* Initialize RNG. */
    srand(seed);

    /* Create random initial state: each cell is set (0xFF) when
     * rand() & 0x3 is non-zero, with opaque alpha.
     * NOTE(review): malloc results below are not checked for NULL. */
    input_image = (cl_uchar4*)
        malloc(CA_WIDTH * CA_HEIGHT * sizeof(cl_uchar4));
    for (cl_uint i = 0; i < CA_WIDTH * CA_HEIGHT; ++i) {
        cl_uchar state = (rand() & 0x3) ? 0xFF : 0x00;
        input_image[i] = (cl_uchar4) {{ state, state, state, 0xFF }};
    }

    /* Allocate space for simulation results. */
    output_images = (cl_uchar4**)
        malloc((CA_ITERS + 1) * sizeof(cl_uchar4*));
    for (cl_uint i = 0; i < CA_ITERS + 1; ++i)
        output_images[i] = (cl_uchar4*)
            malloc(CA_WIDTH * CA_HEIGHT * sizeof(cl_uchar4));

    /* Create context using device selected from menu. */
    ctx = ccl_context_new_from_menu_full(&dev_idx, &err);
    HANDLE_ERROR(err);

    /* Get first device in context. */
    dev = ccl_context_get_device(ctx, 0, &err);
    HANDLE_ERROR(err);

    /* Ask device if it supports images. */
    image_ok = ccl_device_get_info_scalar(
        dev, CL_DEVICE_IMAGE_SUPPORT, cl_bool, &err);
    HANDLE_ERROR(err);
    if (!image_ok)
        ERROR_MSG_AND_EXIT("Selected device doesn't support images.");

    /* Create command queues (profiling enabled on both). */
    queue_exec = ccl_queue_new(ctx, dev, CL_QUEUE_PROFILING_ENABLE, &err);
    HANDLE_ERROR(err);
    queue_comm = ccl_queue_new(ctx, dev, CL_QUEUE_PROFILING_ENABLE, &err);
    HANDLE_ERROR(err);

    /* Create 2D image for initial state. */
    img1 = ccl_image_new(ctx, CL_MEM_READ_WRITE, &image_format, NULL, &err,
        "image_type", (cl_mem_object_type) CL_MEM_OBJECT_IMAGE2D,
        "image_width", (size_t) CA_WIDTH,
        "image_height", (size_t) CA_HEIGHT,
        NULL);
    HANDLE_ERROR(err);

    /* Create another 2D image for double buffering. */
    img2 = ccl_image_new(ctx, CL_MEM_READ_WRITE, &image_format, NULL, &err,
        "image_type", (cl_mem_object_type) CL_MEM_OBJECT_IMAGE2D,
        "image_width", (size_t) CA_WIDTH,
        "image_height", (size_t) CA_HEIGHT,
        NULL);
    HANDLE_ERROR(err);

    /* Create program from kernel source and compile it. */
    prg = ccl_program_new_from_source(ctx, CA_KERNEL, &err);
    HANDLE_ERROR(err);
    ccl_program_build(prg, NULL, &err);
    HANDLE_ERROR(err);

    /* Get kernel wrapper. */
    krnl = ccl_program_get_kernel(prg, "ca", &err);
    HANDLE_ERROR(err);

    /* Determine nice local and global worksizes. */
    ccl_kernel_suggest_worksizes(krnl, dev, 2, real_ws, gws, lws, &err);
    HANDLE_ERROR(err);
    printf("\n * Global work-size: (%d, %d)\n", (int) gws[0], (int) gws[1]);
    printf(" * Local work-size: (%d, %d)\n", (int) lws[0], (int) lws[1]);

    /* Create thread communication queues. */
    comm_thread_queue = g_async_queue_new();
    exec_thread_queue = g_async_queue_new();
    host_thread_queue = g_async_queue_new();

    /* Setup thread data (shared with both worker threads). */
    td.krnl = krnl;
    td.img1 = img1;
    td.img2 = img2;
    td.gws = gws;
    td.lws = lws;
    td.output_images = output_images;

    /* Create threads. */
    exec_thread = g_thread_new("exec_thread", exec_func, &td);
    comm_thread = g_thread_new("comm_thread", comm_func, &td);

    /* Start profiling. */
    prof = ccl_prof_new();
    ccl_prof_start(prof);

    /* Write initial state (blocking write on the comms queue). */
    ccl_image_enqueue_write(img1, queue_comm, CL_TRUE, origin, region,
        0, 0, input_image, NULL, &err);
    HANDLE_ERROR(err);

    /* Run CA_ITERS iterations of the CA. */
    for (cl_uint i = 0; i < CA_ITERS; ++i) {

        /* Send message to comms thread. */
        g_async_queue_push(comm_thread_queue, &go_msg);

        /* Send message to exec thread. */
        g_async_queue_push(exec_thread_queue, &go_msg);

        /* Get event wrappers from both threads. */
        evt1 = (CCLEvent*) g_async_queue_pop(host_thread_queue);
        evt2 = (CCLEvent*) g_async_queue_pop(host_thread_queue);

        /* Can't continue until this iteration is over. */
        ccl_event_wait_list_add(&ewl, evt1, evt2, NULL);

        /* Wait for events. */
        ccl_event_wait(&ewl, &err);
        HANDLE_ERROR(err);
    }

    /* Send message to comms thread to read last result. */
    g_async_queue_push(comm_thread_queue, &go_msg);

    /* Send stop messages to both threads. */
    g_async_queue_push(comm_thread_queue, &stop_msg);
    g_async_queue_push(exec_thread_queue, &stop_msg);

    /* Get event wrapper from comms thread. */
    evt1 = (CCLEvent*) g_async_queue_pop(host_thread_queue);

    /* Can't continue until final read is over. */
    ccl_event_wait_list_add(&ewl, evt1, NULL);
    ccl_event_wait(&ewl, &err);
    HANDLE_ERROR(err);

    /* Make sure both queues are finished. */
    ccl_queue_finish(queue_comm, &err);
    HANDLE_ERROR(err);
    ccl_queue_finish(queue_exec, &err);
    HANDLE_ERROR(err);

    /* Stop profiling timer and add queues for analysis. */
    ccl_prof_stop(prof);
    ccl_prof_add_queue(prof, "Comms", queue_comm);
    ccl_prof_add_queue(prof, "Exec", queue_exec);

    /* Allocate space for base filename
     * (prefix + digits + ".png" + terminating NUL). */
    filename = (char*) malloc(
        (strlen(IMAGE_FILE_PREFIX ".png") + IMAGE_FILE_NUM_DIGITS + 1)
        * sizeof(char));

    /* Write results to image files.
     * NOTE(review): only CA_ITERS of the CA_ITERS + 1 stored frames are
     * written here — confirm whether the final frame should be saved. */
    for (cl_uint i = 0; i < CA_ITERS; ++i) {

        /* Determine next filename. */
        sprintf(filename, "%s%0" G_STRINGIFY(IMAGE_FILE_NUM_DIGITS) "d.png",
            IMAGE_FILE_PREFIX, i);

        /* Save next image. */
        file_write_status = stbi_write_png(filename, CA_WIDTH, CA_HEIGHT, 4,
            output_images[i], CA_WIDTH * sizeof(cl_uchar4));

        /* Give feedback if unable to save image. */
        if (!file_write_status) {
            ERROR_MSG_AND_EXIT("Unable to save image in file.");
        }
    }

    /* Process profiling info. */
    ccl_prof_calc(prof, &err);
    HANDLE_ERROR(err);

    /* Print profiling info. */
    ccl_prof_print_summary(prof);

    /* Save profiling info. */
    ccl_prof_export_info_file(prof, "prof.tsv", &err);
    HANDLE_ERROR(err);

    /* Destroy threads. */
    g_thread_join(exec_thread);
    g_thread_join(comm_thread);

    /* Destroy thread communication queues. */
    g_async_queue_unref(comm_thread_queue);
    g_async_queue_unref(exec_thread_queue);
    g_async_queue_unref(host_thread_queue);

    /* Release host buffers. */
    free(filename);
    free(input_image);
    for (cl_uint i = 0; i < CA_ITERS + 1; ++i)
        free(output_images[i]);
    free(output_images);

    /* Release wrappers. */
    ccl_image_destroy(img1);
    ccl_image_destroy(img2);
    ccl_program_destroy(prg);
    ccl_queue_destroy(queue_comm);
    ccl_queue_destroy(queue_exec);
    ccl_context_destroy(ctx);

    /* Destroy profiler. */
    ccl_prof_destroy(prof);

    /* Check all wrappers have been destroyed. */
    g_assert(ccl_wrapper_memcheck());

    /* Terminate. */
    return 0;
}
/** * @internal * * @brief Tests the ccl_buffer_new_from_region() function. * */ static void create_from_region_test() { /* Test variables. */ CCLContext * ctx = NULL; CCLDevice * dev = NULL; CCLQueue * cq = NULL; CCLBuffer * buf = NULL; CCLBuffer * subbuf = NULL; CCLEvent * evt = NULL; CCLEventWaitList ewl = NULL; CCLErr * err = NULL; cl_ulong * hbuf; cl_ulong * hsubbuf; cl_uint min_align; size_t siz_buf; size_t siz_subbuf; /* Get the test context with the pre-defined device. */ ctx = ccl_test_context_new(&err); g_assert_no_error(err); /* Get first device in context. */ dev = ccl_context_get_device(ctx, 0, &err); g_assert_no_error(err); /* Get minimum alignment for sub-buffer in bits. */ min_align = ccl_device_get_info_scalar( dev, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint, &err); g_assert_no_error(err); /* Determine buffer and sub-buffer sizes (divide by 64 because its * the number of bits in cl_ulong). */ siz_subbuf = sizeof(cl_ulong) * min_align / 64; siz_buf = 4 * siz_subbuf; /* Allocate memory for host buffer and host sub-buffer. */ hbuf = g_slice_alloc(siz_buf); hsubbuf = g_slice_alloc(siz_subbuf); /* Initialize initial host buffer. */ for (cl_uint i = 0; i < siz_buf / sizeof(cl_ulong); ++i) hbuf[i] = g_test_rand_int(); /* Create a command queue. */ cq = ccl_queue_new(ctx, dev, 0, &err); g_assert_no_error(err); /* Create a regular buffer, put some data in it. */ buf = ccl_buffer_new( ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, siz_buf, hbuf, &err); g_assert_no_error(err); /* Create sub-buffer from indexes 16 to 31 (16 positions) of * original buffer. */ subbuf = ccl_buffer_new_from_region( buf, 0, siz_subbuf, siz_subbuf, &err); g_assert_no_error(err); /* Get data in sub-buffer to a new host buffer. */ evt = ccl_buffer_enqueue_read( subbuf, cq, CL_FALSE, 0, siz_subbuf, hsubbuf, NULL, &err); g_assert_no_error(err); /* Wait for read to be complete. 
*/ ccl_event_wait(ccl_ewl(&ewl, evt, NULL), &err); g_assert_no_error(err); /* Check that expected values were successfully read. */ for (cl_uint i = 0; i < siz_subbuf / sizeof(cl_ulong); ++i) g_assert_cmpuint( hsubbuf[i], ==, hbuf[i + siz_subbuf / sizeof(cl_ulong)]); /* Destroy stuff. */ ccl_buffer_destroy(buf); ccl_buffer_destroy(subbuf); ccl_queue_destroy(cq); ccl_context_destroy(ctx); g_slice_free1(siz_buf, hbuf); g_slice_free1(siz_subbuf, hsubbuf); /* Confirm that memory allocated by wrappers has been properly * freed. */ g_assert(ccl_wrapper_memcheck()); }
/**
 * Image filter main function.
 *
 * Loads an image from file, applies an OpenCL filter kernel to it
 * (using a nearest-neighbor, clamp-to-edge sampler) and saves the
 * result to a PNG file.
 *
 * NOTE(review): relies on file-scope definitions not visible in this
 * block (FILTER_KERNEL, IMAGE_FILE, HANDLE_ERROR, ERROR_MSG_AND_EXIT,
 * and the stb image helpers) — presumably defined elsewhere in this
 * file; confirm.
 *
 * @param[in] argc Number of command line arguments.
 * @param[in] argv Command line arguments: argv[1] is the input image
 * file, argv[2] (optional) is the device index.
 * @return EXIT_SUCCESS on success (errors terminate the process via
 * HANDLE_ERROR / ERROR_MSG_AND_EXIT).
 * */
int main(int argc, char * argv[]) {

    /* Wrappers for OpenCL objects. */
    CCLContext * ctx;
    CCLDevice * dev;
    CCLImage * img_in;
    CCLImage * img_out;
    CCLQueue * queue;
    CCLProgram * prg;
    CCLKernel * krnl;
    CCLSampler * smplr;
    /* Device selected specified in the command line. */
    int dev_idx = -1;
    /* Error handling object (must be initialized to NULL). */
    CCLErr * err = NULL;
    /* Does selected device support images? */
    cl_bool image_ok;
    /* Image data in host. */
    unsigned char * input_image;
    unsigned char * output_image;
    /* Image properties. */
    int width, height, n_channels;
    /* Image file write status. */
    int file_write_status;
    /* Image parameters: RGBA, 8 bits per channel. */
    cl_image_format image_format = { CL_RGBA, CL_UNSIGNED_INT8 };
    /* Origin and region of complete image. */
    size_t origin[3] = { 0, 0, 0 };
    size_t region[3];
    /* Real worksize. */
    size_t real_ws[2];
    /* Global and local worksizes. */
    size_t gws[2];
    size_t lws[2];

    /* Check arguments. */
    if (argc < 2) {
        ERROR_MSG_AND_EXIT("Usage: image_filter <image_file> [device_index]");
    } else if (argc >= 3) {
        /* Check if a device was specified in the command line. */
        dev_idx = atoi(argv[2]);
    }

    /* Load image, forcing 4 channels (RGBA). */
    input_image = stbi_load(argv[1], &width, &height, &n_channels, 4);
    if (!input_image) ERROR_MSG_AND_EXIT(stbi_failure_reason());

    /* Real work size. */
    real_ws[0] = width;
    real_ws[1] = height;

    /* Set image region. */
    region[0] = width;
    region[1] = height;
    region[2] = 1;

    /* Create context using device selected from menu. */
    ctx = ccl_context_new_from_menu_full(&dev_idx, &err);
    HANDLE_ERROR(err);

    /* Get first device in context. */
    dev = ccl_context_get_device(ctx, 0, &err);
    HANDLE_ERROR(err);

    /* Ask device if it supports images. */
    image_ok = ccl_device_get_info_scalar(
        dev, CL_DEVICE_IMAGE_SUPPORT, cl_bool, &err);
    HANDLE_ERROR(err);
    if (!image_ok)
        ERROR_MSG_AND_EXIT("Selected device doesn't support images.");

    /* Create a command queue. */
    queue = ccl_queue_new(ctx, dev, 0, &err);
    HANDLE_ERROR(err);

    /* Create 2D input image using loaded image data. */
    img_in = ccl_image_new(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        &image_format, input_image, &err,
        "image_type", (cl_mem_object_type) CL_MEM_OBJECT_IMAGE2D,
        "image_width", (size_t) width,
        "image_height", (size_t) height,
        NULL);
    HANDLE_ERROR(err);

    /* Create 2D output image. */
    img_out = ccl_image_new(ctx, CL_MEM_WRITE_ONLY,
        &image_format, NULL, &err,
        "image_type", (cl_mem_object_type) CL_MEM_OBJECT_IMAGE2D,
        "image_width", (size_t) width,
        "image_height", (size_t) height,
        NULL);
    HANDLE_ERROR(err);

    /* Create program from kernel source and compile it. */
    prg = ccl_program_new_from_source(ctx, FILTER_KERNEL, &err);
    HANDLE_ERROR(err);
    ccl_program_build(prg, NULL, &err);
    HANDLE_ERROR(err);

    /* Get kernel wrapper. */
    krnl = ccl_program_get_kernel(prg, "do_filter", &err);
    HANDLE_ERROR(err);

    /* Determine nice local and global worksizes. */
    ccl_kernel_suggest_worksizes(krnl, dev, 2, real_ws, gws, lws, &err);
    HANDLE_ERROR(err);

    /* Show information to user. */
    printf("\n * Image size: %d x %d, %d channels\n",
        width, height, n_channels);
    printf(" * Global work-size: (%d, %d)\n", (int) gws[0], (int) gws[1]);
    printf(" * Local work-size: (%d, %d)\n", (int) lws[0], (int) lws[1]);

    /* Create sampler (this could also be created in-kernel). */
    smplr = ccl_sampler_new(ctx, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE,
        CL_FILTER_NEAREST, &err);
    HANDLE_ERROR(err);

    /* Apply filter. */
    ccl_kernel_set_args_and_enqueue_ndrange(
        krnl, queue, 2, NULL, gws, lws, NULL, &err,
        img_in, img_out, smplr, NULL);
    HANDLE_ERROR(err);

    /* Allocate space for output image (4 bytes per pixel).
     * NOTE(review): malloc result is not checked for NULL. */
    output_image = (unsigned char *)
        malloc(width * height * 4 * sizeof(unsigned char));

    /* Read image data back to host (blocking read). */
    ccl_image_enqueue_read(img_out, queue, CL_TRUE, origin, region,
        0, 0, output_image, NULL, &err);
    HANDLE_ERROR(err);

    /* Write image to file. */
    file_write_status = stbi_write_png(IMAGE_FILE, width, height, 4,
        output_image, width * 4);

    /* Give feedback. */
    if (file_write_status) {
        fprintf(stdout, "\nImage saved in file '" IMAGE_FILE "'\n");
    } else {
        ERROR_MSG_AND_EXIT("Unable to save image in file.");
    }

    /* Release host images. */
    free(output_image);
    stbi_image_free(input_image);

    /* Release wrappers. */
    ccl_image_destroy(img_in);
    ccl_image_destroy(img_out);
    ccl_sampler_destroy(smplr);
    ccl_program_destroy(prg);
    ccl_queue_destroy(queue);
    ccl_context_destroy(ctx);

    /* Check all wrappers have been destroyed. */
    assert(ccl_wrapper_memcheck());

    /* Terminate. */
    return EXIT_SUCCESS;
}