/** * @internal * Perform sort using device data. * */ static CCLEvent* clo_sort_gselect_sort_with_device_data( CloSort* sorter, CCLQueue* cq_exec, CCLQueue* cq_comm, CCLBuffer* data_in, CCLBuffer* data_out, size_t numel, size_t lws_max, GError** err) { /* Make sure err is NULL or it is not set. */ g_return_val_if_fail(err == NULL || *err == NULL, NULL); /* Make sure cq_exec is not NULL. */ g_return_val_if_fail(cq_exec != NULL, NULL); /* Worksizes. */ size_t lws, gws; /* OpenCL object wrappers. */ CCLContext* ctx = NULL; CCLDevice* dev = NULL; CCLKernel* krnl = NULL; CCLEvent* evt = NULL; /* Event wait list. */ CCLEventWaitList ewl = NULL; /* Internal error reporting object. */ GError* err_internal = NULL; /* Flag indicating if sorted data is to be copied back to original * buffer, simulating an in-place sort. */ cl_bool copy_back = CL_FALSE; /* If data transfer queue is NULL, use exec queue for data * transfers. */ if (cq_comm == NULL) cq_comm = cq_exec; /* Get device where sort will occurr. */ dev = ccl_queue_get_device(cq_exec, &err_internal); g_if_err_propagate_goto(err, err_internal, error_handler); /* Get the kernel wrapper. */ krnl = ccl_program_get_kernel(clo_sort_get_program(sorter), "gselect", &err_internal); g_if_err_propagate_goto(err, err_internal, error_handler); /* Determine worksizes. */ gws = numel; lws = lws_max; ccl_kernel_suggest_worksizes( krnl, dev, 1, &gws, NULL, &lws, &err_internal); g_if_err_propagate_goto(err, err_internal, error_handler); /* Check if data_out is set. */ if (data_out == NULL) { /* If not create it and set the copy back flag to TRUE. */ /* Get context. */ ctx = ccl_queue_get_context(cq_comm, &err_internal); g_if_err_propagate_goto(err, err_internal, error_handler); /* Set copy-back flag to true. */ copy_back = CL_TRUE; /* Create output buffer. */ data_out = ccl_buffer_new(ctx, CL_MEM_WRITE_ONLY, numel * clo_sort_get_element_size(sorter), NULL, &err_internal); g_if_err_propagate_goto(err, err_internal, error_handler); } else { /* Set copy back flag to FALSE. */ copy_back = CL_FALSE; } /* Set kernel arguments. */ cl_ulong numel_l = numel; ccl_kernel_set_args( krnl, data_in, data_out, ccl_arg_priv(numel_l, cl_ulong), NULL); /* Perform global memory selection sort. */ evt = ccl_kernel_enqueue_ndrange( krnl, cq_exec, 1, NULL, &gws, &lws, NULL, &err_internal); g_if_err_propagate_goto(err, err_internal, error_handler); ccl_event_set_name(evt, "gselect_ndrange"); /* If copy-back flag is set, copy sorted data back to original * buffer. */ if (copy_back) { ccl_event_wait_list_add(&ewl, evt, NULL); evt = ccl_buffer_enqueue_copy(data_out, data_in, cq_comm, 0, 0, numel * clo_sort_get_element_size(sorter), &ewl, &err_internal); g_if_err_propagate_goto(err, err_internal, error_handler); ccl_event_set_name(evt, "gselect_copy"); } /* If we got here, everything is OK. */ g_assert(err == NULL || *err == NULL); goto finish; error_handler: /* If we got here there was an error, verify that it is so. */ g_assert(err == NULL || *err != NULL); evt = NULL; finish: /* Free data out buffer if copy-back flag is set. */ if ((copy_back) && (data_out != NULL)) ccl_buffer_destroy(data_out); /* Return event wait list. */ return evt; }
/** * Canonical example main function. * */ int main(int argc, char** argv) { /* Number of elements in buffer. */ size_t buf_n = DEF_BUF_N; /* Device selected specified in the command line. */ int dev_idx = -1; /* Program return value. */ int ret_val; /* Check if a device was specified in the command line. */ if (argc >= 2) { dev_idx = atoi(argv[1]); } /* Check if a new buffer size was specified in the command line. */ if (argc >= 3) { buf_n = atoi(argv[2]); } /* Wrappers. */ CCLContext* ctx = NULL; CCLProgram* prg = NULL; CCLDevice* dev = NULL; CCLQueue* queue = NULL; CCLKernel* krnl = NULL; CCLBuffer* a_dev; CCLBuffer* b_dev; CCLBuffer* c_dev; CCLEvent* evt_write1; CCLEvent* evt_write2; CCLEvent* evt_exec; CCLEventWaitList ewl = NULL; /* Profiler. */ CCLProf* prof; /* Global and local worksizes. */ size_t gws = 0; size_t lws = 0; /* Host buffers. */ cl_uint* a_host = NULL; cl_uint* b_host = NULL; cl_uint* c_host = NULL; cl_uint d_host; /* Error reporting object. */ CCLErr* err = NULL; /* Check results flag. */ cl_bool check_result; /* Create a context with device selected from menu. */ ctx = ccl_context_new_from_menu_full(&dev_idx, &err); HANDLE_ERROR(err); /* Get the selected device. */ dev = ccl_context_get_device(ctx, 0, &err); HANDLE_ERROR(err); /* Create a new program from kernel source. */ prg = ccl_program_new_from_source(ctx, KERNEL_SRC, &err); HANDLE_ERROR(err); /* Build program. */ ccl_program_build(prg, NULL, &err); HANDLE_ERROR(err); /* Create a command queue. */ queue = ccl_queue_new(ctx, dev, CL_QUEUE_PROFILING_ENABLE, &err); HANDLE_ERROR(err); /* Get kernel object. */ krnl = ccl_program_get_kernel(prg, KERNEL_NAME, &err); HANDLE_ERROR(err); /* Get worksizes. */ lws = ccl_kernel_suggest_worksizes(krnl, dev, 1, &buf_n, &gws, &lws, &err); HANDLE_ERROR(err); /* Show worksizes. */ printf("\n"); printf(" * Global worksize: %d\n", (int) gws); printf(" * Local worksize : %d\n", (int) lws); /* Initialize host buffers. */ a_host = (cl_uint*) malloc(sizeof(cl_uint) * buf_n); b_host = (cl_uint*) malloc(sizeof(cl_uint) * buf_n); c_host = (cl_uint*) malloc(sizeof(cl_uint) * buf_n); /* Fill host buffers. */ for (cl_uint i = 0; i < buf_n; ++i) { a_host[i] = i; b_host[i] = buf_n - i; } d_host = buf_n / 4; /* Create device buffers. */ a_dev = ccl_buffer_new(ctx, CL_MEM_READ_ONLY, buf_n * sizeof(cl_uint), NULL, &err); HANDLE_ERROR(err); b_dev = ccl_buffer_new(ctx, CL_MEM_READ_ONLY, buf_n * sizeof(cl_uint), NULL, &err); HANDLE_ERROR(err); c_dev = ccl_buffer_new(ctx, CL_MEM_WRITE_ONLY, buf_n * sizeof(cl_uint), NULL, &err); HANDLE_ERROR(err); /* Copy host data to device buffers without waiting for transfer * to terminate before continuing host program. */ evt_write1 = ccl_buffer_enqueue_write(a_dev, queue, CL_FALSE, 0, buf_n * sizeof(cl_uint), a_host, NULL, &err); HANDLE_ERROR(err); evt_write2 = ccl_buffer_enqueue_write(b_dev, queue, CL_FALSE, 0, buf_n * sizeof(cl_uint), b_host, NULL, &err); HANDLE_ERROR(err); /* Initialize event wait list and add the two transfer events. */ ccl_event_wait_list_add(&ewl, evt_write1, evt_write2, NULL); /* Execute program kernel, waiting for the two transfer events * to terminate (this will empty the event wait list). */ evt_exec = ccl_program_enqueue_kernel(prg, KERNEL_NAME, queue, 1, NULL, &gws, &lws, &ewl, &err, /* Kernel arguments. */ a_dev, b_dev, c_dev, ccl_arg_priv(d_host, cl_uint), ccl_arg_priv(buf_n, cl_uint), NULL); HANDLE_ERROR(err); /* Add the kernel termination event to the wait list. */ ccl_event_wait_list_add(&ewl, evt_exec, NULL); /* Sync. queue for events in wait list (just the execute event in * this case) to terminate before going forward... */ ccl_enqueue_barrier(queue, &ewl, &err); HANDLE_ERROR(err); /* Read back results from host waiting for transfer to terminate * before continuing host program. */ ccl_buffer_enqueue_read(c_dev, queue, CL_TRUE, 0, buf_n * sizeof(cl_uint), c_host, NULL, &err); HANDLE_ERROR(err); /* Check results are as expected (not available with OpenCL stub). */ check_result = CL_TRUE; for (cl_uint i = 0; i < buf_n; ++i) { if(c_host[i] != a_host[i] + b_host[i] + d_host) { check_result = CL_FALSE; break; } } if (check_result) { fprintf(stdout, " * Kernel execution produced the expected results.\n"); ret_val = EXIT_SUCCESS; } else { fprintf(stderr, " * Kernel execution failed to produce the expected results.\n"); ret_val = EXIT_FAILURE; } /* Perform profiling. */ prof = ccl_prof_new(); ccl_prof_add_queue(prof, "queue1", queue); ccl_prof_calc(prof, &err); HANDLE_ERROR(err); /* Show profiling info. */ ccl_prof_print_summary(prof); /* Export profiling info. */ ccl_prof_export_info_file(prof, "out.tsv", &err); HANDLE_ERROR(err); /* Destroy profiler object. */ ccl_prof_destroy(prof); /* Destroy host buffers. */ free(a_host); free(b_host); free(c_host); /* Destroy wrappers. */ ccl_buffer_destroy(a_dev); ccl_buffer_destroy(b_dev); ccl_buffer_destroy(c_dev); ccl_queue_destroy(queue); ccl_program_destroy(prg); ccl_context_destroy(ctx); /* Confirm that memory allocated by wrappers has been properly freed. */ assert(ccl_wrapper_memcheck()); /* Bye. */ return ret_val; }