/** * Tests creation, getting info from and destruction of * kernel wrapper objects. * */ static void create_info_destroy_test() { /* Test variables. */ CCLContext* ctx = NULL; cl_context context = NULL; CCLProgram* prg = NULL; cl_program program = NULL; CCLKernel* krnl = NULL; cl_kernel kernel = NULL; CCLDevice* d = NULL; CCLQueue* cq = NULL; size_t gws; size_t lws; cl_uint host_buf[CCL_TEST_KERNEL_BUF_SIZE]; cl_uint host_buf_aux[CCL_TEST_KERNEL_BUF_SIZE]; CCLBuffer* buf; GError* err = NULL; CCLEvent* evt = NULL; CCLEventWaitList ewl = NULL; const char* krnl_name; void* args[] = { NULL, NULL }; cl_bool release_krnl; cl_int ocl_status; /* Create a context with devices from first available platform. */ ctx = ccl_test_context_new(&err); g_assert_no_error(err); /* Create a new program from source and build it. */ prg = ccl_program_new_from_source( ctx, CCL_TEST_KERNEL_CONTENT, &err); g_assert_no_error(err); ccl_program_build(prg, NULL, &err); g_assert_no_error(err); /* Create a command queue. */ cq = ccl_queue_new(ctx, d, CL_QUEUE_PROFILING_ENABLE, &err); g_assert_no_error(err); /* Test three ways to create a kernel wrapper. */ for (cl_uint i = 0; i < 3; ++i) { /* Create kernel wrapper. */ switch (i) { case 0: /* Instantiate kernel directly. */ krnl = ccl_kernel_new(prg, CCL_TEST_KERNEL_NAME, &err); g_assert_no_error(err); release_krnl = CL_TRUE; break; case 1: /* Using the program utility function. No need to free * kernel in this case, because it will be freed when * program is destroyed. */ krnl = ccl_program_get_kernel( prg, CCL_TEST_KERNEL_NAME, &err); g_assert_no_error(err); release_krnl = CL_FALSE; break; case 2: /* Using the "wrap" constructor. 
*/ kernel = clCreateKernel(ccl_program_unwrap(prg), CCL_TEST_KERNEL_NAME, &ocl_status); g_assert_cmpint(ocl_status, ==, CL_SUCCESS); krnl = ccl_kernel_new_wrap(kernel); g_assert_cmphex(GPOINTER_TO_UINT(kernel), ==, GPOINTER_TO_UINT(ccl_kernel_unwrap(krnl))); release_krnl = CL_TRUE; break; } /* Get some kernel info, compare it with expected info. */ /* Get kernel function name from kernel info, compare it with the * expected value. */ krnl_name = ccl_kernel_get_info_array( krnl, CL_KERNEL_FUNCTION_NAME, char*, &err); g_assert_no_error(err); g_assert_cmpstr(krnl_name, ==, CCL_TEST_KERNEL_NAME); /* Check if the kernel context is the same as the initial context * and the program context. */ context = ccl_kernel_get_info_scalar( krnl, CL_KERNEL_CONTEXT, cl_context, &err); g_assert_no_error(err); g_assert(context == ccl_context_unwrap(ctx)); program = ccl_kernel_get_info_scalar( krnl, CL_KERNEL_PROGRAM, cl_program, &err); g_assert_no_error(err); g_assert(program == ccl_program_unwrap(prg)); #ifndef OPENCL_STUB cl_uint ocl_ver; /* Get OpenCL version of kernel's underlying platform. */ ocl_ver = ccl_kernel_get_opencl_version(krnl, &err); g_assert_no_error(err); (void)ocl_ver; #ifdef CL_VERSION_1_1 size_t kwgz; size_t* kcwgs; CCLDevice* dev = NULL; /* If platform supports kernel work group queries, get kernel * work group information and compare it with expected info. 
*/ if (ocl_ver >= 110) { dev = ccl_context_get_device(ctx, 0, &err); g_assert_no_error(err); kwgz = ccl_kernel_get_workgroup_info_scalar( krnl, dev, CL_KERNEL_WORK_GROUP_SIZE, size_t, &err); g_assert_no_error(err); (void)kwgz; kcwgs = ccl_kernel_get_workgroup_info_array(krnl, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, size_t*, &err); g_assert_no_error(err); (void)kcwgs; } #endif /* ifdef CL_VERSION_1_1 */ #ifdef CL_VERSION_1_2 cl_kernel_arg_address_qualifier kaaq; char* kernel_arg_type_name; char* kernel_arg_name; /* If platform supports kernel argument queries, get kernel argument * information and compare it with expected info. */ if (ocl_ver >= 120) { kaaq = ccl_kernel_get_arg_info_scalar(krnl, 0, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier, &err); g_assert((err == NULL) || (err->code == CCL_ERROR_INFO_UNAVAILABLE_OCL)); if (err == NULL) { g_assert_cmphex(kaaq, ==, CL_KERNEL_ARG_ADDRESS_GLOBAL); } else {
/**
 * Suggest appropriate local (and optionally global) work sizes for the
 * given real work size, based on device and kernel characteristics.
 *
 * If the `gws` parameter is not `NULL`, it will be populated with a
 * global worksize which may be larger than the real work size
 * in order to better fit the kernel preferred multiple work size. As
 * such, kernels enqueued with global work sizes suggested by this
 * function should check if their global ID is within `real_worksize`.
 *
 * @public @memberof ccl_kernel
 *
 * @param[in] krnl Kernel wrapper object. If `NULL`, use only device
 * information for determining global and local worksizes.
 * @param[in] dev Device wrapper object.
 * @param[in] dims The number of dimensions used to specify the global
 * work-items and work-items in the work-group.
 * @param[in] real_worksize The real worksize. Must hold `dims` values,
 * all of them greater than zero; a zero component is reported as a
 * ::CCL_ERROR_ARGS error.
 * @param[out] gws Location where to place a "nice" global worksize for
 * the given kernel and device, which must be equal or larger than the
 * `real_worksize` and a multiple of `lws`. This memory location should
 * be pre-allocated with space for `dims` values of size `size_t`. If
 * `NULL` it is assumed that the global worksize must be equal to
 * `real_worksize`.
 * @param[in,out] lws This memory location, of size
 * `dims * sizeof(size_t)`, serves a dual purpose: 1) as an input,
 * containing the maximum allowed local work size for each dimension, or
 * zeros if these maximums are to be fetched from the given device
 * `CL_DEVICE_MAX_WORK_ITEM_SIZES` information (if the specified values
 * are larger than the device limits, the device limits are used
 * instead); 2) as an output, where to place a "nice" local worksize,
 * which is based and respects the limits of the given kernel and device
 * (and of the non-zero values given as input).
 * @param[out] err Return location for a ::CCLErr object, or `NULL` if error
 * reporting is to be ignored.
 * @return `CL_TRUE` if function returns successfully, `CL_FALSE`
 * otherwise.
 * */
CCL_EXPORT
cl_bool ccl_kernel_suggest_worksizes(CCLKernel* krnl, CCLDevice* dev,
	cl_uint dims, const size_t* real_worksize, size_t* gws, size_t* lws,
	CCLErr** err) {

	/* Make sure dev is not NULL. */
	g_return_val_if_fail(dev != NULL, CL_FALSE);
	/* Make sure dims not zero. */
	g_return_val_if_fail(dims > 0, CL_FALSE);
	/* Make sure real_worksize is not NULL. */
	g_return_val_if_fail(real_worksize != NULL, CL_FALSE);
	/* Make sure lws is not NULL. */
	g_return_val_if_fail(lws != NULL, CL_FALSE);
	/* Make sure err is NULL or it is not set. */
	g_return_val_if_fail(err == NULL || *err == NULL, CL_FALSE);

	/* Preferred workgroup size multiple and maximum workgroup size. A
	 * value of zero means "not yet known". */
	size_t wg_size_mult = 0;
	size_t wg_size_max = 0;
	/* Product of all lws components, kept exact at all times. */
	size_t wg_size = 1, wg_size_aux;
	/* Per-dimension limits as reported by the device. */
	const size_t* dev_max_wi_sizes;
	/* Private copy of the effective per-dimension limits (device limits
	 * capped by the non-zero limits given by the caller in lws). */
	size_t* max_wi_sizes = NULL;
	cl_uint dev_dims;
	cl_bool ret_status;

	/* Error handling object. */
	CCLErr* err_internal = NULL;

	/* Check if device supports the requested dims. */
	dev_dims = ccl_device_get_info_scalar(
		dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint, &err_internal);
	ccl_if_err_propagate_goto(err, err_internal, error_handler);
	ccl_if_err_create_goto(*err, CCL_ERROR, dims > dev_dims,
		CCL_ERROR_UNSUPPORTED_OCL, error_handler,
		"%s: device only supports a maximum of %d dimension(s), "
		"but %d were requested.",
		CCL_STRD, dev_dims, dims);

	/* A zero-sized dimension would shrink the respective local work
	 * size down to zero, which is later used as a divisor. Reject it
	 * explicitly instead. */
	for (cl_uint i = 0; i < dims; ++i) {
		ccl_if_err_create_goto(*err, CCL_ERROR, real_worksize[i] == 0,
			CCL_ERROR_ARGS, error_handler,
			"%s: the real worksize of dimension %u is zero.",
			CCL_STRD, (unsigned int) i);
	}

	/* Get max. work item sizes for device. */
	dev_max_wi_sizes = ccl_device_get_info_array(
		dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, size_t*, &err_internal);
	ccl_if_err_propagate_goto(err, err_internal, error_handler);

	/* For each dimension, if the user specified a maximum local work
	 * size, the effective maximum local work size will be the minimum
	 * between the user value and the device value.
	 * NOTE(review): the array returned by the device query is not freed
	 * by this function, so it presumably belongs to the device wrapper;
	 * the effective limits are therefore kept in a private copy instead
	 * of being written into that array. */
	max_wi_sizes = g_malloc(dims * sizeof *max_wi_sizes);
	for (cl_uint i = 0; i < dims; ++i) {
		max_wi_sizes[i] = (lws[i] != 0)
			? MIN(dev_max_wi_sizes[i], lws[i])
			: dev_max_wi_sizes[i];
	}

	/* If kernel is not NULL, query it about workgroup size preferences
	 * and capabilities. */
	if (krnl != NULL) {

		/* Determine maximum workgroup size. */
		wg_size_max = ccl_kernel_get_workgroup_info_scalar(krnl, dev,
			CL_KERNEL_WORK_GROUP_SIZE, size_t, &err_internal);
		ccl_if_err_not_info_unavailable_propagate_goto(
			err, err_internal, error_handler);

#ifdef CL_VERSION_1_1

		/* Determine preferred workgroup size multiple (OpenCL >= 1.1). */

		/* Get OpenCL version of the underlying platform. */
		cl_uint ocl_ver = ccl_kernel_get_opencl_version(krnl, &err_internal);
		ccl_if_err_propagate_goto(err, err_internal, error_handler);

		/* If OpenCL version of the underlying platform is >= 1.1 ... */
		if (ocl_ver >= 110) {

			/* ...use CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE... */
			wg_size_mult = ccl_kernel_get_workgroup_info_scalar(
				krnl, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
				size_t, &err_internal);
			ccl_if_err_not_info_unavailable_propagate_goto(
				err, err_internal, error_handler);

		} else {

			/* ...otherwise just use CL_KERNEL_WORK_GROUP_SIZE. */
			wg_size_mult = wg_size_max;

		}

#else

		wg_size_mult = wg_size_max;

#endif

	}

	/* Fill in whatever could not be obtained from the kernel, either
	 * because kernel is NULL or the information was unavailable. Each
	 * value is handled on its own, so that a kernel which reports only
	 * one of them never leaves the other at zero. */
	if (wg_size_max == 0) {
		wg_size_max = ccl_device_get_info_scalar(
			dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t, &err_internal);
		ccl_if_err_propagate_goto(err, err_internal, error_handler);
	}
	if (wg_size_mult == 0) {
		wg_size_mult = wg_size_max;
	}

	/* Try to find an appropriate local worksize. */
	for (cl_uint i = 0; i < dims; ++i) {

		/* Each lws component is at most the preferred workgroup
		 * multiple or the maximum size of that component in device. */
		lws[i] = MIN(wg_size_mult, max_wi_sizes[i]);

		/* Don't let each component of the local worksize to be
		 * higher than the respective component of the real
		 * worksize (which is known to be at least 1 here). */
		while (lws[i] > real_worksize[i]) {
			lws[i] /= 2;
		}

		/* Update total workgroup size. */
		wg_size *= lws[i];

	}

	/* The total workgroup size can't be higher than the maximum
	 * supported by the device. */
	while (wg_size > wg_size_max) {
		wg_size_aux = wg_size;
		for (cl_uint i = dims; i-- > 0; ) {
			if (lws[i] > 1) {
				/* Local work size can't be smaller than 1. Replace
				 * the old factor by the halved one so that wg_size
				 * remains the exact product of all components. */
				wg_size /= lws[i];
				lws[i] /= 2;
				wg_size *= lws[i];
			}
			if (wg_size <= wg_size_max) break;
		}
		/* Avoid infinite loops and throw error if wg_size didn't
		 * change. */
		ccl_if_err_create_goto(*err, CCL_ERROR, wg_size == wg_size_aux,
			CCL_ERROR_OTHER, error_handler,
			"%s: Unable to determine a work size within the device limit (%d).",
			CCL_STRD, (int) wg_size_max);
	}

	/* If output variable gws is not NULL... */
	if (gws != NULL) {

		/* ...find a global worksize which is a multiple of the local
		 * worksize and is big enough to handle the real worksize. */
		for (cl_uint i = 0; i < dims; ++i) {
			gws[i] = ((real_worksize[i] / lws[i])
				+ (((real_worksize[i] % lws[i]) > 0) ? 1 : 0))
				* lws[i];
		}

	} else {

		/* ...otherwise check if found local worksizes are divisors of
		 * the respective real_worksize. If so keep them, otherwise find
		 * local worksizes which respect the maximum sizes allowed by
		 * the kernel and the device, and is a dimension-wise divisor of
		 * the real_worksize. */
		cl_bool lws_are_divisors = CL_TRUE;
		for (cl_uint i = 0; i < dims; ++i) {
			/* Check if lws[i] is divisor of real_worksize[i]. */
			if (real_worksize[i] % lws[i] != 0) {
				/* lws[i] is not a divisor of real_worksize[i], so
				 * new local worksizes have to be searched for. */
				lws_are_divisors = CL_FALSE;
				break;
			}
		}

		/* Is lws divisor of real_worksize, dimension-wise? */
		if (!lws_are_divisors) {

			/* No, so we'll have to find new lws. */
			wg_size = 1;
			for (cl_uint i = 0; i < dims; ++i) {

				/* For each dimension, try to use the previously
				 * found lws[i]. */
				if ((real_worksize[i] % lws[i] != 0)
					|| (lws[i] * wg_size > wg_size_max)) {

					/* Previously found lws[i] not usable, find
					 * new one. Must be a divisor of real_worksize[i]
					 * and respect the kernel and device maximum lws. */
					size_t best_lws_i = 1;
					for (size_t j = 2; j <= real_worksize[i] / 2; ++j) {
						/* If current value is higher than the kernel
						 * and device limits, stop searching and use
						 * the best one so far. */
						if ((wg_size * j > wg_size_max)
							|| (j > max_wi_sizes[i])) break;
						/* Otherwise check if current value is divisor
						 * of real_worksize[i]. If so, keep it as the
						 * best so far. */
						if (real_worksize[i] % j == 0)
							best_lws_i = j;
					}
					/* Keep the best divisor for current dimension. */
					lws[i] = best_lws_i;
				}
				/* Update absolute workgroup size (all dimensions). */
				wg_size *= lws[i];
			}
		}
	}

	/* If we got here, everything is OK. */
	g_assert(err == NULL || *err == NULL);
	ret_status = CL_TRUE;
	goto finish;

error_handler:

	/* If we got here there was an error, verify that it is so. */
	g_assert(err == NULL || *err != NULL);
	ret_status = CL_FALSE;

finish:

	/* Release the private copy of the limits (no-op if it was never
	 * allocated). */
	g_free(max_wi_sizes);

	/* Return status. */
	return ret_status;

}
/** * Kernel info main function. * * @param[in] argc Number of command line arguments. * @param[in] argv Command line arguments. * @return ::CCL_SUCCESS if program returns with no error, or another * ::CCLErrorCode value otherwise. * */ int main(int argc, char *argv[]) { /* ***************** */ /* Program variables */ /* ***************** */ /* Function and program return status. */ int status; /* Error management. */ GError *err = NULL; /* Context wrapper. */ CCLContext* ctx = NULL; /* Program wrapper. */ CCLProgram* prg = NULL; /* Kernel wrapper. */ CCLKernel* krnl = NULL; /* Device wrapper. */ CCLDevice* dev = NULL; /* Device filters. */ CCLDevSelFilters filters = NULL; /* Default device index. */ cl_int dev_idx = -1; /* OpenCL version. */ double ocl_ver; /* Kernel workgroup info variables. */ size_t k_wg_size; size_t k_pref_wg_size_mult; size_t* k_compile_wg_size; cl_ulong k_loc_mem_size; cl_ulong k_priv_mem_size; /* ************************** */ /* Parse command line options */ /* ************************** */ /* If version was requested, output version and exit. */ if ((argc == 2) && (g_strcmp0("--version", argv[1]) == 0)) { ccl_common_version_print("ccl_kerninfo"); exit(0); } ccl_if_err_create_goto(err, CCL_ERROR, (argc < 3) || (argc > 4), CCL_ERROR_ARGS, error_handler, "Usage: %s <program_file> <kernel_name> [device_index]\n", argv[0]); if (argc == 4) dev_idx = atoi(argv[3]); /* ********************************************* */ /* Initialize OpenCL variables and build program */ /* ********************************************* */ /* Select a context/device. */ ccl_devsel_add_dep_filter( &filters, ccl_devsel_dep_menu, (dev_idx == -1) ? NULL : (void*) &dev_idx); ctx = ccl_context_new_from_filters(&filters, &err); ccl_if_err_goto(err, error_handler); /* Get program which contains kernel. */ prg = ccl_program_new_from_source_file(ctx, argv[1], &err); ccl_if_err_goto(err, error_handler); /* Build program. 
*/ ccl_program_build(prg, NULL, &err); ccl_if_err_goto(err, error_handler); /* Get kernel */ krnl = ccl_program_get_kernel(prg, argv[2], &err); ccl_if_err_goto(err, error_handler); /* Get the device. */ dev = ccl_context_get_device(ctx, 0, &err); ccl_if_err_goto(err, error_handler); /* Check platform OpenCL version. */ ocl_ver = ccl_kernel_get_opencl_version(krnl, &err); ccl_if_err_goto(err, error_handler); /* *************************** */ /* Get and print kernel info */ /* *************************** */ g_printf("\n ======================== Static Kernel Information =======================\n\n"); k_wg_size = ccl_kernel_get_workgroup_info_scalar( krnl, dev, CL_KERNEL_WORK_GROUP_SIZE, size_t, &err); ccl_if_err_goto(err, error_handler); g_printf(" Maximum workgroup size : %lu\n", (unsigned long) k_wg_size); /* Only show info about CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE * if OpenCL version of the underlying platform is >= 1.1. */ if (ocl_ver >= 110) { k_pref_wg_size_mult = ccl_kernel_get_workgroup_info_scalar(krnl, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_t, &err); ccl_if_err_goto(err, error_handler); g_printf(" Preferred multiple of workgroup size : %lu\n", (unsigned long) k_pref_wg_size_mult); } k_compile_wg_size = ccl_kernel_get_workgroup_info_array(krnl, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, size_t*, &err); ccl_if_err_goto(err, error_handler); g_printf(" WG size in __attribute__ qualifier : (%lu, %lu, %lu)\n", (unsigned long) k_compile_wg_size[0], (unsigned long) k_compile_wg_size[1], (unsigned long) k_compile_wg_size[2]); k_loc_mem_size = ccl_kernel_get_workgroup_info_scalar(krnl, dev, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong, &err); ccl_if_err_goto(err, error_handler); g_printf(" Local memory used by kernel : %lu bytes\n", (unsigned long) k_loc_mem_size); k_priv_mem_size = ccl_kernel_get_workgroup_info_scalar(krnl, dev, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong, &err); ccl_if_err_goto(err, error_handler); g_printf(" Min. private mem. 
used by each workitem : %lu bytes\n", (unsigned long) k_priv_mem_size); g_printf("\n"); /* ************** */ /* Error handling */ /* ************** */ /* If we get here, no need for error checking, jump to cleanup. */ g_assert(err == NULL); status = CCL_SUCCESS; goto cleanup; error_handler: /* If we got here there was an error, verify that it is so. */ g_assert(err != NULL); g_fprintf(stderr, "%s\n", err->message); status = (err->domain == CCL_ERROR) ? err->code : CCL_ERROR_OTHER; g_error_free(err); cleanup: /* *********** */ /* Free stuff! */ /* *********** */ if (prg != NULL) ccl_program_destroy(prg); if (ctx != NULL) ccl_context_destroy(ctx); /* Confirm that memory allocated by wrappers has been properly * freed. */ g_return_val_if_fail(ccl_wrapper_memcheck(), CCL_ERROR_OTHER); /* Return status. */ return status; }