static vx_size vxArrayItemSize(vx_context context, vx_enum item_type) { vx_size res = vxSizeOfType(item_type); vx_uint32 i = 0; if (res == 0ul) { for (i = 0; i < VX_INT_MAX_USER_STRUCTS; ++i) { if (context->user_structs[i].type == item_type) { res = context->user_structs[i].size; break; } } } return res; }
/*! \brief Calls an OpenCL kernel from OpenVX Graph. * Steps: * \arg Find the target * \arg Get the vxcl context * \arg Find the kernel (to get cl kernel information) * \arg for each input parameter that is an object, enqueue write * \arg wait for finish * \arg for each parameter, SetKernelArg * \arg call kernel * \arg wait for finish * \arg for each output parameter that is an object, enqueue read * \arg wait for finish * \note This implementation will attempt to use the External API as much as possible, * but will cast to internal representation when needed (due to lack of API or * need for secret information). This is not an optimal OpenCL invocation. */ vx_status vxclCallOpenCLKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) { vx_status status = VX_FAILURE; vx_context context = node->base.context; vx_target target = (vx_target_t *)&node->base.context->targets[node->affinity]; vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); vx_uint32 pidx, pln, didx, plidx, argidx; cl_int err = 0; size_t off_dim[3] = {0,0,0}; size_t work_dim[3]; //size_t local_dim[3]; cl_event writeEvents[VX_INT_MAX_PARAMS]; cl_event readEvents[VX_INT_MAX_PARAMS]; cl_int we = 0, re = 0; vxSemWait(&target->base.lock); // determine which platform to use plidx = 0; // determine which device to use didx = 0; /* for each input/bi data object, enqueue it and set the kernel parameters */ for (argidx = 0, pidx = 0; pidx < num; pidx++) { vx_reference ref = node->parameters[pidx]; vx_enum dir = node->kernel->signature.directions[pidx]; vx_enum type = node->kernel->signature.types[pidx]; vx_memory_t *memory = NULL; switch (type) { case VX_TYPE_ARRAY: memory = &((vx_array)ref)->memory; break; case VX_TYPE_CONVOLUTION: memory = &((vx_convolution)ref)->base.memory; break; case VX_TYPE_DISTRIBUTION: memory = &((vx_distribution)ref)->memory; break; case VX_TYPE_IMAGE: memory = &((vx_image)ref)->memory; break; case VX_TYPE_LUT: memory = &((vx_lut_t*)ref)->memory; break; case VX_TYPE_MATRIX: memory = &((vx_matrix)ref)->memory; break; //case VX_TYPE_PYRAMID: // break; case VX_TYPE_REMAP: memory = &((vx_remap)ref)->memory; break; //case VX_TYPE_SCALAR: //case VX_TYPE_THRESHOLD: // break; } if (memory) { for (pln = 0; pln < memory->nptrs; pln++) { if (memory->cl_type == CL_MEM_OBJECT_BUFFER) { if (type == VX_TYPE_IMAGE) { /* set the work dimensions */ work_dim[0] = memory->dims[pln][VX_DIM_X]; work_dim[1] = memory->dims[pln][VX_DIM_Y]; // width, height, stride_x, stride_y err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_X]); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_Y]); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 4 parameters\n"); } else if (type == VX_TYPE_ARRAY || type == VX_TYPE_LUT) { vx_array arr = (vx_array)ref; // sizeof item, active count, capacity err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->item_size); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->num_items); // this is output? err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->capacity); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &arr->memory.strides[VX_DIM_X]); VX_PRINT(VX_ZONE_INFO, "Setting vx_buffer as Buffer with 4 parameters\n"); } else if (type == VX_TYPE_MATRIX) { vx_matrix mat = (vx_matrix)ref; // columns, rows err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->columns); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->rows); VX_PRINT(VX_ZONE_INFO, "Setting vx_matrix as Buffer with 2 parameters\n"); } else if (type == VX_TYPE_DISTRIBUTION) { vx_distribution dist = (vx_distribution)ref; // num, range, offset, winsize vx_uint32 range = dist->memory.dims[0][VX_DIM_X] * dist->window_x; err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->memory.dims[VX_DIM_X]); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&range); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->offset_x); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->window_x); } else if (type == VX_TYPE_CONVOLUTION) { vx_convolution conv = (vx_convolution)ref; // columns, rows, scale err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.columns); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.rows); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->scale); } err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]); CL_ERROR_MSG(err, "clSetKernelArg"); if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) { err = clEnqueueWriteBuffer(context->queues[plidx][didx], memory->hdls[pln], CL_TRUE, 0, vxComputeMemorySize(memory, pln), memory->ptrs[pln], 0, NULL, &ref->event); } } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) { vx_rectangle_t rect = {0}; vx_image image = (vx_image)ref; vxGetValidRegionImage(image, &rect); size_t origin[3] = {rect.start_x, rect.start_y, 0}; size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1}; /* set the work dimensions */ work_dim[0] = rect.end_x-rect.start_x; work_dim[1] = rect.end_y-rect.start_y; VX_PRINT(VX_ZONE_INFO, "Setting vx_image as image2d_t wd={%zu,%zu} arg:%u\n",work_dim[0], work_dim[1], argidx); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]); CL_ERROR_MSG(err, "clSetKernelArg"); if (err != CL_SUCCESS) { VX_PRINT(VX_ZONE_ERROR, "Error Calling Kernel %s, parameter %u\n", node->kernel->name, pidx); } if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) { err = clEnqueueWriteImage(context->queues[plidx][didx], memory->hdls[pln], CL_TRUE, origin, region, memory->strides[pln][VX_DIM_Y], 0, memory->ptrs[pln], 0, NULL, &ref->event); CL_ERROR_MSG(err, "clEnqueueWriteImage"); } } } } else { if (type == VX_TYPE_SCALAR) { vx_value_t value; // largest platform atomic vx_size size = 0ul; vx_scalar sc = (vx_scalar)ref; vx_enum stype = VX_TYPE_INVALID; vxReadScalarValue(sc, &value); vxQueryScalar(sc, VX_SCALAR_ATTRIBUTE_TYPE, &stype, sizeof(stype)); size = vxSizeOfType(stype); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, size, &value); } else if (type == VX_TYPE_THRESHOLD) { vx_enum ttype = 0; vx_threshold th = (vx_threshold)ref; vxQueryThreshold(th, VX_THRESHOLD_ATTRIBUTE_TYPE, &ttype, sizeof(ttype)); if (ttype == VX_THRESHOLD_TYPE_BINARY) { err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->value); } else if (ttype == VX_THRESHOLD_TYPE_RANGE) { err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->lower); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->upper); } } } } we = 0; for (pidx = 0; pidx < num; pidx++) { vx_reference ref = node->parameters[pidx]; vx_enum dir = node->kernel->signature.directions[pidx]; if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) { memcpy(&writeEvents[we++],&ref->event, sizeof(cl_event)); } } //local_dim[0] = 1; //local_dim[1] = 1; err = clEnqueueNDRangeKernel(context->queues[plidx][didx], vxclk->kernels[plidx], 2, off_dim, work_dim, NULL, we, writeEvents, &node->base.event); CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); /* enqueue a read on all output data */ for (pidx = 0; pidx < num; pidx++) { vx_reference ref = node->parameters[pidx]; vx_enum dir = node->kernel->signature.directions[pidx]; vx_enum type = node->kernel->signature.types[pidx]; if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) { vx_memory_t *memory = NULL; switch (type) { case VX_TYPE_ARRAY: memory = &((vx_array)ref)->memory; break; case VX_TYPE_CONVOLUTION: memory = &((vx_convolution)ref)->base.memory; break; case VX_TYPE_DISTRIBUTION: memory = &((vx_distribution)ref)->memory; break; case VX_TYPE_IMAGE: memory = &((vx_image)ref)->memory; break; case VX_TYPE_LUT: memory = &((vx_lut_t*)ref)->memory; break; case VX_TYPE_MATRIX: memory = &((vx_matrix)ref)->memory; break; //case VX_TYPE_PYRAMID: // break; case VX_TYPE_REMAP: memory = &((vx_remap)ref)->memory; break; //case VX_TYPE_SCALAR: //case VX_TYPE_THRESHOLD: // break; } if (memory) { for (pln = 0; pln < memory->nptrs; pln++) { if (memory->cl_type == CL_MEM_OBJECT_BUFFER) { err = clEnqueueReadBuffer(context->queues[plidx][didx], memory->hdls[pln], CL_TRUE, 0, vxComputeMemorySize(memory, pln), memory->ptrs[pln], 1, &node->base.event, &ref->event); CL_ERROR_MSG(err, "clEnqueueReadBuffer"); } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) { vx_rectangle_t rect = {0}; vx_image image = (vx_image)ref; vxGetValidRegionImage(image, &rect); size_t origin[3] = {rect.start_x, rect.start_y, 0}; size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1}; /* set the work dimensions */ work_dim[0] = rect.end_x-rect.start_x; work_dim[1] = rect.end_y-rect.start_y; err = clEnqueueReadImage(context->queues[plidx][didx], memory->hdls[pln], CL_TRUE, origin, region, memory->strides[pln][VX_DIM_Y], 0, memory->ptrs[pln], 1, &node->base.event, &ref->event); CL_ERROR_MSG(err, "clEnqueueReadImage"); VX_PRINT(VX_ZONE_INFO, "Reading Image wd={%zu,%zu}\n", work_dim[0], work_dim[1]); } } } } } re = 0; for (pidx = 0; pidx < num; pidx++) { vx_reference ref = node->parameters[pidx]; vx_enum dir = node->kernel->signature.directions[pidx]; if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) { memcpy(&readEvents[re++],&ref->event, sizeof(cl_event)); } } err = clFlush(context->queues[plidx][didx]); CL_ERROR_MSG(err, "Flush"); VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); clWaitForEvents(re, readEvents); if (err == CL_SUCCESS) status = VX_SUCCESS; //exit: VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); vxSemPost(&target->base.lock); return status; }