static vx_status VX_CALLBACK validateSoftmaxLayer(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[]) { // check tensor dimensions vx_enum type; vx_size num_dims; vx_size input_dims[4], output_dims[4]; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type))); if(num_dims != 4) return VX_ERROR_INVALID_DIMENSION; if(type != VX_TYPE_FLOAT32) return VX_ERROR_INVALID_TYPE; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type))); if(num_dims != 4) return VX_ERROR_INVALID_DIMENSION; if(type != VX_TYPE_FLOAT32) return VX_ERROR_INVALID_TYPE; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims))); if(output_dims[3] != input_dims[3]) return VX_ERROR_INVALID_DIMENSION; if(output_dims[2] != input_dims[2]) return VX_ERROR_INVALID_DIMENSION; if(output_dims[1] != input_dims[1]) return VX_ERROR_INVALID_DIMENSION; if(output_dims[0] != input_dims[0]) return VX_ERROR_INVALID_DIMENSION; // output tensor configuration type = VX_TYPE_FLOAT32; num_dims = 4; ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type))); ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims))); ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims))); return VX_SUCCESS; }
static vx_status VX_CALLBACK validateTensorToImageKernel(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[]) { // check input configuration vx_enum type; vx_size num_dims, input_dims[4] = { 1, 1, 1, 1 }; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims))); if (num_dims != 4) return VX_ERROR_INVALID_DIMENSION; if (type != VX_TYPE_FLOAT32) return VX_ERROR_INVALID_TYPE; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims[0])*num_dims)); if ((input_dims[2] != 3 && input_dims[2] != 1) || ((input_dims[0] & 3) != 0)) return VX_ERROR_INVALID_DIMENSION; vx_enum scalar_type; ERROR_CHECK_STATUS(vxQueryScalar((vx_scalar)parameters[2], VX_SCALAR_TYPE, &scalar_type, sizeof(scalar_type))); if(scalar_type != VX_TYPE_FLOAT32) return VX_ERROR_INVALID_TYPE; ERROR_CHECK_STATUS(vxQueryScalar((vx_scalar)parameters[3], VX_SCALAR_TYPE, &scalar_type, sizeof(scalar_type))); if(scalar_type != VX_TYPE_FLOAT32) return VX_ERROR_INVALID_TYPE; ERROR_CHECK_STATUS(vxQueryScalar((vx_scalar)parameters[4], VX_SCALAR_TYPE, &scalar_type, sizeof(scalar_type))); if(scalar_type != VX_TYPE_BOOL) return VX_ERROR_INVALID_TYPE; // set output image configuration vx_uint32 width = (vx_uint32)input_dims[0]; vx_uint32 height = (vx_uint32)(input_dims[1]*input_dims[3]); vx_df_image format = (input_dims[2] == 3) ? VX_DF_IMAGE_RGB : VX_DF_IMAGE_U8; ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_WIDTH, &width, sizeof(width))); ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_HEIGHT, &height, sizeof(height))); ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_FORMAT, &format, sizeof(format))); return VX_SUCCESS; }
static vx_status VX_CALLBACK uninitializeSoftmaxLayer(vx_node node, const vx_reference *parameters, vx_uint32 num) { SoftmaxLayerLocalData * data = NULL; ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_LOCAL_DATA_PTR, &data, sizeof(data))); ERROR_CHECK_MIOPEN_STATUS(miopenDestroyTensorDescriptor(data->input_desc)); ERROR_CHECK_MIOPEN_STATUS(miopenDestroyTensorDescriptor(data->output_desc)); if (data) { ERROR_CHECK_STATUS(releaseGraphHandle(node, data->handle)); delete data; } return VX_SUCCESS; }
static vx_status VX_CALLBACK processSoftmaxLayer(vx_node node, const vx_reference * parameters, vx_uint32 num) { SoftmaxLayerLocalData * data = NULL; ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_LOCAL_DATA_PTR, &data, sizeof(data))); miopenHandle_t miopenHandle = data->handle->miopen_handle; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_OPENCL, &data->input_mem, sizeof(data->input_mem))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_OPENCL, &data->output_mem, sizeof(data->output_mem))); ERROR_CHECK_STATUS(miopenSoftmaxForward(miopenHandle, &data->alpha, data->input_desc, data->input_mem, &data->beta, data->output_desc, data->output_mem)); return VX_SUCCESS; }
//////// // The user kernel validator callback should check to make sure that all the input // parameters have correct data types and set meta format for the output parameters. // The input parameters to be validated are: // parameter #0 -- input tensor of format VX_TYPE_INT16 // The output parameters that requires setting meta format is: // parameter #1 -- output tebsor of format VX_TYPE_INT16 with the same dimensions as input // TODO:******** // 1. Query the input tensor for the dimensions and format. // 2. Check to make sure that the input tensor format is VX_TYPE_INT16. // 3. Set the required output tensor meta data as following: // - output tensor dimensions should be same as input tensor // - output tensor format should be VX_TYPE_INT16 // - output tensor fixed-point position can be whatever the user requested // * query the output tensor for the fixed-point position value // * set the same value in output tensor meta data vx_status VX_CALLBACK tensor_cos_validator( vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[] ) { // parameter #0 -- query dimensions and format vx_size num_of_dims; ERROR_CHECK_STATUS( vxQueryTensor( ( vx_tensor )parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof( num_of_dims ) ) ); if( num_of_dims > 4 ) // sanity check to avoid stack corruption with querying VX_TENSOR_DIMS below { return VX_ERROR_INVALID_DIMENSION; } vx_size dims[4]; ERROR_CHECK_STATUS( vxQueryTensor( ( vx_tensor )parameters[0], VX_TENSOR_DIMS, &dims, num_of_dims * sizeof(vx_size) ) ); vx_enum data_type; ERROR_CHECK_STATUS( vxQueryTensor( ( vx_tensor )parameters[0], VX_TENSOR_DATA_TYPE, &data_type, sizeof( data_type ) ) ); // parameter #0 -- check input tensor format to be VX_TYPE_INF16 if( data_type != VX_TYPE_INT16 ) { return VX_ERROR_INVALID_FORMAT; } // parameter #1 -- query fixed-point position vx_uint8 fixed_point_pos; ERROR_CHECK_STATUS( vxQueryTensor( ( vx_tensor )parameters[1], VX_TENSOR_FIXED_POINT_POSITION, &fixed_point_pos, sizeof( fixed_point_pos ) ) ); // parameter #1 -- set required output tensor meta data ERROR_CHECK_STATUS( vxSetMetaFormatAttribute( metas[1], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof( num_of_dims ) ) ); ERROR_CHECK_STATUS( vxSetMetaFormatAttribute( metas[1], VX_TENSOR_DIMS, &dims, sizeof( dims ) ) ); ERROR_CHECK_STATUS( vxSetMetaFormatAttribute( metas[1], VX_TENSOR_DATA_TYPE, &data_type, sizeof( data_type ) ) ); ERROR_CHECK_STATUS( vxSetMetaFormatAttribute( metas[1], VX_TENSOR_FIXED_POINT_POSITION, &fixed_point_pos, sizeof( fixed_point_pos ) ) ); return VX_SUCCESS; }
static vx_status VX_CALLBACK processTensorAddition(vx_node node, const vx_reference * parameters, vx_uint32 num) { TensorAddLocalData * data = NULL; ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_LOCAL_DATA_PTR, &data, sizeof(data))); miopenHandle_t miopenHandle = data->handle->miopen_handle; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_OPENCL, &data->input1_mem, sizeof(data->input1_mem))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_OPENCL, &data->input2_mem, sizeof(data->input2_mem))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[3], VX_TENSOR_BUFFER_OPENCL, &data->output_mem, sizeof(data->output_mem))); //miopen elementwise addition call. ERROR_CHECK_MIOPEN_STATUS(miopenOpTensor(miopenHandle, data->operation, &data->alpha1, data->input1, data->input1_mem, &data->alpha2, data->input2, data->input2_mem, &data->beta, data->output, data->output_mem)); return VX_SUCCESS; }
//////// // The node creation interface for the "app.userkernels.tensor_cos" kernel. // This user kernel example expects parameters in the following order: // parameter #0 -- input tensor of format VX_TYPE_INT16 // parameter #1 -- output tensor of format VX_TYPE_INT16 // // TODO:******** // 1. Use vxGetKernelByEnum API to get a kernel object from USER_KERNEL_TENSOR_COS. // Note that you need to use vxGetContext API to get the context from a graph object. // 2. Use vxCreateGenericNode API to create a node from the kernel object. // 3. Use vxSetParameterByIndex API to set node arguments. // 4. Release the kernel object that are not needed any more. // 5. Use ERROR_CHECK_OBJECT and ERROR_CHECK_STATUS macros for error detection. vx_node userTensorCosNode( vx_graph graph, vx_tensor input, vx_tensor output ) { vx_context context = vxGetContext( ( vx_reference ) graph ); vx_kernel kernel = vxGetKernelByEnum( context, USER_KERNEL_TENSOR_COS ); ERROR_CHECK_OBJECT( kernel ); vx_node node = vxCreateGenericNode( graph, kernel ); ERROR_CHECK_OBJECT( node ); ERROR_CHECK_STATUS( vxSetParameterByIndex( node, 0, ( vx_reference ) input ) ); ERROR_CHECK_STATUS( vxSetParameterByIndex( node, 1, ( vx_reference ) output ) ); ERROR_CHECK_STATUS( vxReleaseKernel( &kernel ) ); return node; }
vx_status publishSoftmaxLayer(vx_context context) { // add kernel to the context with callbacks vx_kernel kernel = vxAddUserKernel(context, "org.khronos.nn_extension.softmax_layer", VX_KERNEL_SOFTMAX_LAYER, processSoftmaxLayer, 2, validateSoftmaxLayer, initializeSoftmaxLayer, uninitializeSoftmaxLayer); ERROR_CHECK_OBJECT(kernel); // enable OpenCL buffer access since the kernel_f callback uses OpenCL buffers instead of host accessible buffers vx_bool enableBufferAccess = vx_true_e; ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_BUFFER_ACCESS_ENABLE, &enableBufferAccess, sizeof(enableBufferAccess))); // set kernel parameters ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); // finalize and release kernel object ERROR_CHECK_STATUS(vxFinalizeKernel(kernel)); ERROR_CHECK_STATUS(vxReleaseKernel(&kernel)); return VX_SUCCESS; }
//////// // User kernels needs to be registered with every OpenVX context before use in a graph. // // TODO:******** // 1. Use vxAddUserKernel API to register "app.userkernels.tensor_cos" with // kernel enumeration = USER_KERNEL_TENSOR_COS, numParams = 2, and // all of the user kernel callback functions you implemented above. // 2. Use vxAddParameterToKernel API to specify direction, data_type, and // state of all 2 parameters to the kernel. Look into the comments of // userTensorCosNode function (above) to details about the order of // kernel parameters and their types. // 3. Use vxFinalizeKernel API to make the kernel ready to use in a graph. // Note that the kernel object is still valid after this call. // So you need to call vxReleaseKernel before returning from this function. vx_status registerUserKernel( vx_context context ) { vx_kernel kernel = vxAddUserKernel( context, "app.userkernels.tensor_cos", USER_KERNEL_TENSOR_COS, tensor_cos_host_side_function, 2, // numParams tensor_cos_validator, NULL, NULL ); ERROR_CHECK_OBJECT( kernel ); ERROR_CHECK_STATUS( vxAddParameterToKernel( kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED ) ); // input ERROR_CHECK_STATUS( vxAddParameterToKernel( kernel, 1, VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED ) ); // output ERROR_CHECK_STATUS( vxFinalizeKernel( kernel ) ); ERROR_CHECK_STATUS( vxReleaseKernel( &kernel ) ); vxAddLogEntry( ( vx_reference ) context, VX_SUCCESS, "OK: registered user kernel app.userkernels.tensor_cos\n" ); return VX_SUCCESS; }
static vx_status VX_CALLBACK initializeSoftmaxLayer(vx_node node, const vx_reference *parameters, vx_uint32 num) { SoftmaxLayerLocalData * data = new SoftmaxLayerLocalData; memset(data, 0, sizeof(*data)); ERROR_CHECK_STATUS(createGraphHandle(node, &data->handle)); //Parameters input and output. vx_size input_dims[4], output_dims[4]; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims))); ERROR_CHECK_MIOPEN_STATUS(miopenCreateTensorDescriptor(&data->input_desc)); ERROR_CHECK_MIOPEN_STATUS(miopenCreateTensorDescriptor(&data->output_desc)); ERROR_CHECK_MIOPEN_STATUS(miopenSet4dTensorDescriptor(data->input_desc, miopenFloat, input_dims[3], input_dims[2], input_dims[1], input_dims[0])); ERROR_CHECK_MIOPEN_STATUS(miopenSet4dTensorDescriptor(data->output_desc, miopenFloat, output_dims[3], output_dims[2], output_dims[1], output_dims[0])); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_OPENCL, &data->input_mem, sizeof(data->input_mem))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_OPENCL, &data->output_mem, sizeof(data->output_mem))); data->alpha = 1; data->beta = 0; #if ENABLE_DEBUG_PRINT_DIMS std::cout << "softmax input " << input_dims[3] << " " << input_dims[2] << " " << input_dims[1] << " " << input_dims[0] << " "; std::cout << "output " << output_dims[3] << " " << output_dims[2] << " " << output_dims[1] << " " << output_dims[0] << std::endl; #endif ERROR_CHECK_STATUS(vxSetNodeAttribute(node, VX_NODE_LOCAL_DATA_PTR, &data, sizeof(data))); return VX_SUCCESS; }
//! \brief The kernel publisher. vx_status publishImageToTensorConvert(vx_context context) { vx_kernel kernel = vxAddUserKernel(context, "com.amd.nn_extension.convert_image_to_tensor", VX_KERNEL_CONVERT_IMAGE_TO_TENSOR_AMD, host_kernel, 5, validateImageToTensorKernel, nullptr, nullptr); ERROR_CHECK_OBJECT(kernel); amd_kernel_query_target_support_f query_target_support_f = query_target_support; amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen; ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT, &query_target_support_f, sizeof(query_target_support_f))); ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK, &opencl_codegen_callback_f, sizeof(opencl_codegen_callback_f))); // set kernel parameters. ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED)); ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 2, VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED)); ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 3, VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED)); ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 4, VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED)); // finalize and release kernel object. ERROR_CHECK_STATUS(vxFinalizeKernel(kernel)); ERROR_CHECK_STATUS(vxReleaseKernel(&kernel)); return VX_SUCCESS; }
static vx_status VX_CALLBACK initializeTensorAddition(vx_node node, const vx_reference *parameters, vx_uint32 num) { TensorAddLocalData * data = new TensorAddLocalData; memset(data, 0, sizeof(*data)); ERROR_CHECK_STATUS(createGraphHandle(node, &data->handle)); //initialize input and output tensor descriptors. vx_size input1_dims[4], input2_dims[4], output_dims[4]; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input1_dims, sizeof(input1_dims))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, input2_dims, sizeof(input2_dims))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[3], VX_TENSOR_DIMS, output_dims, sizeof(output_dims))); ERROR_CHECK_MIOPEN_STATUS(miopenCreateTensorDescriptor(&data->input1)); ERROR_CHECK_MIOPEN_STATUS(miopenCreateTensorDescriptor(&data->input2)); ERROR_CHECK_MIOPEN_STATUS(miopenCreateTensorDescriptor(&data->output)); ERROR_CHECK_MIOPEN_STATUS(miopenSet4dTensorDescriptor(data->input1, miopenFloat, input1_dims[3], input1_dims[2], input1_dims[1], input1_dims[0])); ERROR_CHECK_MIOPEN_STATUS(miopenSet4dTensorDescriptor(data->input2, miopenFloat, input2_dims[3], input2_dims[2], input2_dims[1], input2_dims[0])); ERROR_CHECK_MIOPEN_STATUS(miopenSet4dTensorDescriptor(data->output, miopenFloat, output_dims[3], output_dims[2], output_dims[1], output_dims[0])); //scaling parameters. data->alpha1 = 1; data->alpha2 = 1; data->beta = 0; data->operation = miopenTensorOpAdd; //input and output memory. ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_OPENCL, &data->input1_mem, sizeof(data->input1_mem))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_OPENCL, &data->input2_mem, sizeof(data->input2_mem))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[3], VX_TENSOR_BUFFER_OPENCL, &data->output_mem, sizeof(data->output_mem))); #if ENABLE_DEBUG_PRINT_DIMS std::cout << "tensor_add input1 " << input1_dims[3] << " " << input1_dims[2] << " " << input1_dims[1] << " " << input1_dims[0] << " "; std::cout << "tensor_add input2 " << input2_dims[3] << " " << input2_dims[2] << " " << input2_dims[1] << " " << input2_dims[0] << " "; std::cout << "tensor_add output " << output_dims[3] << " " << output_dims[2] << " " << output_dims[1] << " " << output_dims[0] << std::endl; #endif ERROR_CHECK_STATUS(vxSetNodeAttribute(node, VX_NODE_LOCAL_DATA_PTR, &data, sizeof(data))); return VX_SUCCESS; }
vx_status publishROIPoolingLayer(vx_context context) { // add kernel to the context with callbacks vx_kernel kernel = vxAddUserKernel(context, "org.khronos.nn_extension.roi_pooling_layer", VX_KERNEL_ROI_POOLING_LAYER, processROIPoolingLayer, 4, validateROIPoolingLayer, initializeROIPoolingLayer, uninitializeROIPoolingLayer); ERROR_CHECK_OBJECT(kernel); // set kernel parameters ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 2, VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED)); ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 3, VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); // finalize and release kernel object ERROR_CHECK_STATUS(vxFinalizeKernel(kernel)); ERROR_CHECK_STATUS(vxReleaseKernel(&kernel)); return VX_SUCCESS; }
vx_status publishTensorAdd(vx_context context) { // add kernel to the context with callbacks vx_kernel kernel = vxAddUserKernel(context, "org.khronos.openvx.tensor_add", VX_KERNEL_TENSOR_ADD, processTensorAddition, 4, validateTensorAddition, initializeTensorAddition, uninitializeTensorAddition); ERROR_CHECK_OBJECT(kernel); // enable OpenCL buffer access since the kernel_f callback uses OpenCL buffers instead of host accessible buffers vx_bool enableBufferAccess = vx_true_e; ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_BUFFER_ACCESS_ENABLE, &enableBufferAccess, sizeof(enableBufferAccess))); // set kernel parameters ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 2, VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED)); ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 3, VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED)); // finalize and release kernel object ERROR_CHECK_STATUS(vxFinalizeKernel(kernel)); ERROR_CHECK_STATUS(vxReleaseKernel(&kernel)); return VX_SUCCESS; }
//////// // main() has all the OpenVX application code for this exercise. // Command-line usage: // % exercise3 [<video-sequence>|<camera-device-number>] // When neither video sequence nor camera device number is specified, // it defaults to the video sequence in "PETS09-S1-L1-View001.avi". int main( int argc, char * argv[] ) { // Get default video sequence when nothing is specified on command-line and // instantiate OpenCV GUI module for reading input RGB images and displaying // the image with OpenVX results const char * video_sequence = argv[1]; CGuiModule gui( video_sequence ); // Try grab first video frame from the sequence using cv::VideoCapture // and check if video frame is available if( !gui.Grab() ) { printf( "ERROR: input has no video\n" ); return 1; } //////// // Set the application configuration parameters. Note that input video // sequence is an 8-bit RGB image with dimensions given by gui.GetWidth() // and gui.GetHeight(). The parameters for the tensors are: // tensor_dims - 3 dimensions of tensor [3 x <width> x <height>] // tensor_input_fixed_point_pos - fixed-point position for input tensor // tensor_output_fixed_point_pos - fixed-point position for output tensor vx_uint32 width = gui.GetWidth(); vx_uint32 height = gui.GetHeight(); vx_size tensor_dims[3] = { width, height, 3 }; // 3 channels (RGB) vx_uint8 tensor_input_fixed_point_pos = 5; // Q10.5: input[-128..127] will be mapped to -4..3.96875 vx_uint8 tensor_output_fixed_point_pos = 7; // Q8.7: output[-1..1] will be mapped to -128 to 128 //////// // Create the OpenVX context and make sure returned context is valid and // register the log_callback to receive messages from OpenVX framework. vx_context context = vxCreateContext(); ERROR_CHECK_OBJECT( context ); vxRegisterLogCallback( context, log_callback, vx_false_e ); //////// // Register user kernels with the context. // // TODO STEP 05:******** // 1. Register user kernel with context by calling your implementation of "registerUserKernel()". // ERROR_CHECK_STATUS( registerUserKernel( context ) ); //////// // Create OpenVX tensor objects for input and output // // TODO STEP 06:******** // 1. Create tensor objects using tensor_dims, tensor_input_fixed_point_pos, and // tensor_output_fixed_point_pos // vx_tensor input_tensor = vxCreateTensor( context, 3, tensor_dims, VX_TYPE_INT16, tensor_input_fixed_point_pos ); // vx_tensor output_tensor = vxCreateTensor( context, /* Fill in parameters */ ); // ERROR_CHECK_OBJECT( input_tensor ); // ERROR_CHECK_OBJECT( output_tensor ); //////// // Create, build, and verify the graph with user kernel node. // // TODO STEP 07:******** // 1. Build a graph with just one node created using userTensorCosNode() // vx_graph graph = vxCreateGraph( context ); // ERROR_CHECK_OBJECT( graph ); // vx_node cos_node = userTensorCosNode( graph, /* Fill in parameters */ ); // ERROR_CHECK_OBJECT( cos_node ); // ERROR_CHECK_STATUS( vxReleaseNode( &cos_node ) ); // ERROR_CHECK_STATUS( vxVerifyGraph( graph ) ); //////// // Process the video sequence frame by frame until the end of sequence or aborted. cv::Mat bgrMatForOutputDisplay( height, width, CV_8UC3 ); for( int frame_index = 0; !gui.AbortRequested(); frame_index++ ) { //////// // Copy input RGB frame from OpenCV into input_tensor with UINT8 to Q10.5 (INT16) conversion. // In order to do this, vxMapTensorPatch API (see "vx_ext_amd.h"). // // TODO STEP 08:******** // 1. Use vxMapTensorPatch API for access to input tensor object for writing // 2. Copy UINT8 data from OpenCV RGB image to tensor object // 3. Use vxUnmapTensorPatch API to return control of buffer back to framework vx_uint8 * cv_rgb_image_buffer = gui.GetBuffer(); vx_size rgb_stride = gui.GetStride(); // vx_size zeros[3] = { 0 }; // vx_size tensor_stride[3]; // vx_map_id map_id; // vx_uint8 * buf; // ERROR_CHECK_STATUS( vxMapTensorPatch( input_tensor, // 3, /* Fill in parameters */ // &map_id, tensor_stride, // (void **)&buf, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST, 0 ) ); // for( vx_size c = 0; c < 3; c++ ) // { // for( vx_size y = 0; y < height; y++ ) // { // const vx_uint8 * img = cv_rgb_image_buffer + y * rgb_stride + c; // vx_int16 * inp = (vx_int16 *)(buf + y * tensor_stride[1] + c * tensor_stride[2]); // for( vx_size x = 0; x < width; x++ ) // { // // convert 0..255 to Q10.5 [-4..3.96875 range] fixed-point format // inp[x] = (vx_int16)img[x * 3] - 128; // } // } // } // ERROR_CHECK_STATUS( vxUnmapTensorPatch( input_tensor, map_id ) ); //////// // Now that input tensor is ready, just run the graph. // // TODO STEP 09:******** // 1. Call vxProcessGraph to execute the tensor_cos kernel in graph // ERROR_CHECK_STATUS( vxProcessGraph( graph ) ); //////// // Display the output tensor object as RGB image // // TODO STEP 10:******** // 1. Use vxMapTensorPatch API for access to output tensor object for reading // 2. Copy tensor object data into OpenCV RGB image // 3. Use vxUnmapTensorPatch API to return control of buffer back to framework // ERROR_CHECK_STATUS( vxMapTensorPatch( output_tensor, // 3, zeros, tensor_dims, // &map_id, tensor_stride, // (void **)&buf, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST, 0 ) ); // vx_uint8 * cv_bgr_image_buffer = bgrMatForOutputDisplay.data; // vx_size bgr_stride = bgrMatForOutputDisplay.step; // for( vx_size c = 0; c < 3; c++ ) // { // for( vx_size y = 0; y < height; y++ ) // { // const vx_int16 * out = (const vx_int16 *)(buf + y * tensor_stride[1] + c * tensor_stride[2]); // vx_uint8 * img = cv_bgr_image_buffer + y * bgr_stride + (2 - c); // (2 - c) for RGB to BGR conversion // for( vx_size x = 0; x < width; x++ ) // { // // scale convert Q8.7 [-1..1 range] fixed-point format to 0..255 with saturation // vx_int16 value = out[x] + 128; // value = value > 255 ? 255 : value; // saturation needed // img[x * 3] = (vx_uint8)value; // } // } // } //#if ENABLE_DISPLAY // cv::imshow( "Cosine", bgrMatForOutputDisplay ); //#endif // ERROR_CHECK_STATUS( vxUnmapTensorPatch( output_tensor, map_id ) ); //////// // Display the results and grab the next input RGB frame for the next iteration. char text[128]; sprintf( text, "Keyboard ESC/Q-Quit SPACE-Pause [FRAME %d] [fixed_point_pos input:%d output:%d]", frame_index, tensor_input_fixed_point_pos, tensor_output_fixed_point_pos ); gui.DrawText( 0, 16, text ); gui.Show(); if( !gui.Grab() ) { // Terminate the processing loop if the end of sequence is detected. gui.WaitForKey(); break; } } ////////******** // To release an OpenVX object, you need to call vxRelease<Object> API which takes a pointer to the object. // If the release operation is successful, the OpenVX framework will reset the object to NULL. // // TODO STEP 11:**** // 1. Release graph and tensor objects // ERROR_CHECK_STATUS( vxReleaseGraph( &graph ) ); // ERROR_CHECK_STATUS( vxReleaseTensor( &input_tensor ) ); // ERROR_CHECK_STATUS( vxReleaseTensor( &output_tensor ) ); ERROR_CHECK_STATUS( vxReleaseContext( &context ) ); return 0; }
//////// // main() has all the OpenVX application code for this exercise. // Command-line usage: // % solution_exercise2 [<video-sequence>|<camera-device-number>] // When neither video sequence nor camera device number is specified, // it defaults to the video sequence in "PETS09-S1-L1-View001.avi". int main( int argc, char * argv[] ) { // Get default video sequence when nothing is specified on command-line and // instantiate OpenCV GUI module for reading input RGB images and displaying // the image with OpenVX results. const char * video_sequence = argv[1]; CGuiModule gui( video_sequence ); // Try to grab the first video frame from the sequence using cv::VideoCapture // and check if a video frame is available. if( !gui.Grab() ) { printf( "ERROR: input has no video\n" ); return 1; } //////// // Set the application configuration parameters. Note that input video // sequence is an 8-bit RGB image with dimensions given by gui.GetWidth() // and gui.GetHeight(). The parameters for the Harris corners algorithm are: // max_keypoint_count - maximum number of keypoints to track // harris_strength_thresh - minimum threshold score to keep a corner // (computed using the normalized Sobel kernel) // harris_min_distance - radial L2 distance for non-max suppression // harris_k_sensitivity - sensitivity threshold k from the Harris-Stephens // harris_gradient_size - window size for gradient computation // harris_block_size - block window size used to compute the // Harris corner score // lk_pyramid_levels - number of pyramid levels for LK optical flow // lk_termination - can be VX_TERM_CRITERIA_ITERATIONS or // VX_TERM_CRITERIA_EPSILON or // VX_TERM_CRITERIA_BOTH // lk_epsilon - error for terminating the algorithm // lk_num_iterations - number of iterations // lk_use_initial_estimate - turn on/off use of initial estimates // lk_window_dimension - size of window on which to perform the algorithm vx_uint32 width = gui.GetWidth(); vx_uint32 height = gui.GetHeight(); vx_size max_keypoint_count = 10000; vx_float32 harris_strength_thresh = 0.0005f; vx_float32 harris_min_distance = 5.0f; vx_float32 harris_k_sensitivity = 0.04f; vx_int32 harris_gradient_size = 3; vx_int32 harris_block_size = 3; vx_uint32 lk_pyramid_levels = 6; vx_float32 lk_pyramid_scale = VX_SCALE_PYRAMID_HALF; vx_enum lk_termination = VX_TERM_CRITERIA_BOTH; vx_float32 lk_epsilon = 0.01f; vx_uint32 lk_num_iterations = 5; vx_bool lk_use_initial_estimate = vx_false_e; vx_uint32 lk_window_dimension = 6; //////// // Create the OpenVX context and make sure the returned context is valid and // register the log_callback to receive messages from OpenVX framework. vx_context context = vxCreateContext(); ERROR_CHECK_OBJECT( context ); vxRegisterLogCallback( context, log_callback, vx_false_e ); //////// // Create OpenVX image object for input RGB image. vx_image input_rgb_image = vxCreateImage( context, width, height, VX_DF_IMAGE_RGB ); ERROR_CHECK_OBJECT( input_rgb_image ); ////////******** // OpenVX optical flow functionality requires pyramids of the current input // image and the previous image. It also requires keypoints that correspond // to the previous pyramid and will output updated keypoints into // another keypoint array. To be able to toggle between the current and // the previous buffers, you need to use OpenVX delay objects and vxAgeDelay(). // Create OpenVX pyramid and array object exemplars and create OpenVX delay // objects for both to hold two of each. Note that the exemplar objects are not // needed once the delay objects are created. // // TODO STEP 01:******** // 1. Use vxCreatePyramid API to create a pyramid exemplar with the // same dimensions as the input image, VX_DF_IMAGE_U8 as image format, // lk_pyramid_levels as levels, and lk_pyramid_scale as scale. // We gave code for this in comments. // 2. Use vxCreateArray API to create an array exemplar with // keypoint data type with num_keypoint_count as capacity. // You need to add missing parameters to code in comments. // 3. Use vxCreateDelay API to create delay objects for pyramid and // keypoint array using the exemplars created using the two steps above. // Use 2 delay slots for both of the delay objects. // We gave code for one in comments; do similar for the other. // 4. Release the pyramid and keypoint array exemplar objects. // We gave code for one in comments; do similar for the other. // 5. Use ERROR_CHECK_OBJECT/STATUS macros for proper error checking. // We gave few error checks; do similar for the others. // vx_pyramid pyramidExemplar = vxCreatePyramid( context, lk_pyramid_levels, // lk_pyramid_scale, width, height, VX_DF_IMAGE_U8 ); // ERROR_CHECK_OBJECT( pyramidExemplar ); // vx_delay pyramidDelay = vxCreateDelay( context, ( vx_reference )pyramidExemplar, 2 ); // ERROR_CHECK_OBJECT( pyramidDelay ); // ERROR_CHECK_STATUS( vxReleasePyramid( &pyramidExemplar ) ); // vx_array keypointsExemplar = vxCreateArray( /* Fill in parameters */ ); // vx_delay keypointsDelay = vxCreateDelay( /* Fill in parameters */ ); ////////******** // An object from a delay slot can be accessed using vxGetReferenceFromDelay API. // You need to use index = 0 for the current object and index = -1 for the previous object. // // TODO STEP 02:******** // 1. Use vxGetReferenceFromDelay API to get the current and previous // pyramid objects from pyramid delay object. Note that you need // to typecast the vx_reference object to vx_pyramid. // We gave code for one in comments; do similar for the other. // 2. Similarly, get the current and previous keypoint array objects from // the keypoint delay object. // We gave code for one in comments; do similar for the other. // 3. Use ERROR_CHECK_OBJECT for proper error checking. // We gave one error check; do similar for the others. // vx_pyramid currentPyramid = ( vx_pyramid ) vxGetReferenceFromDelay( pyramidDelay, 0 ); // vx_pyramid previousPyramid = ( vx_pyramid ) vxGetReferenceFromDelay( /* Fill in parameters */ ); // vx_array currentKeypoints = ( vx_array ) vxGetReferenceFromDelay( /* Fill in parameters */ ); // vx_array previousKeypoints = ( vx_array ) vxGetReferenceFromDelay( keypointsDelay, -1 ); // ERROR_CHECK_OBJECT( currentPyramid ); ////////******** // Harris and optical flow algorithms require their own graph objects. // The Harris graph needs to extract gray scale image out of input RGB, // compute an initial set of keypoints, and compute an initial pyramid for use // by the optical flow graph. // // TODO STEP 03:******** // 1. Create two graph objects: one for the Harris corner detector and // the other for feature tracking using optical flow using the // vxCreateGraph API. // We gave code for one graph; do similar for the other. // 2. Use ERROR_CHECK_OBJECT to check the objects. // We gave one error check; do similar for the other. // vx_graph graphHarris = vxCreateGraph( context ); // vx_graph graphTrack = /* Fill in here */; // ERROR_CHECK_OBJECT( graphHarris ); ////////******** // Harris and pyramid computation expect input to be an 8-bit image. // Given that input is an RGB image, it is best to extract a gray image // from RGB image, which requires two steps: // - perform RGB to IYUV color conversion // - extract Y channel from IYUV image // This requires two intermediate OpenVX image objects. Since you don't // need to access these objects from the application, they can be virtual // objects that can be created using the vxCreateVirtualImage API. // // TODO STEP 04:******** // 1. Create an IYUV image and a U8 image (for Y channel) with the same // dimensions as the input RGB image. Note that the image formats for // IYUV and U8 images are VX_DF_IMAGE_IYUV and VX_DF_IMAGE_U8. // Note that virtual objects are specific to a graph, so you // need to create two sets, one for each graph. // We gave one fully in comments and you need to fill in missing // parameters for the others. // 2. Use ERROR_CHECK_OBJECT to check the objects. // We gave one error check in comments; do similar for others. // vx_image harris_yuv_image = vxCreateVirtualImage( graphHarris, width, height, VX_DF_IMAGE_IYUV ); // vx_image harris_luma_image = vxCreateVirtualImage( graphHarris, /* Fill in parameters */ ); // vx_image opticalflow_yuv_image = vxCreateVirtualImage( graphTrack, /* Fill in parameters */ ); // vx_image opticalflow_luma_image = vxCreateVirtualImage( /* Fill in parameters */ ); // ERROR_CHECK_OBJECT( harris_yuv_image ); ////////******** // The Harris corner detector and optical flow nodes (see "VX/vx_nodes.h") // take strength_thresh, min_distance, sensitivity, epsilon, // num_iterations, and use_initial_estimate parameters as scalar // data objects. So, you need to create scalar objects with the corresponding // configuration parameters. // // TODO STEP 05:******** // 1. Create scalar data objects of VX_TYPE_FLOAT32 for strength_thresh, // min_distance, sensitivity, and epsilon. Set their // initial values to harris_strength_thresh, harris_min_distance, // harris_k_sensitivity, and lk_epsilon. // We gave code full code for one scalar in comments; fill in // missing arguments for other ones. // 2. Similarly, create scalar objects for num_iterations and // use_initial_estimate with initial values: lk_num_iterations and // lk_use_initial_estimate. Make sure to use proper data types for // these parameters. // We gave code full code for one scalar in comments; fill in // missing arguments for the other. // 3. Use ERROR_CHECK_OBJECT to check proper creation of objects. // We gave the error check for one scalar; do similar for other 5 scalars. // vx_scalar strength_thresh = NULL; // vxCreateScalar( context, VX_TYPE_FLOAT32, &harris_strength_thresh ); // vx_scalar min_distance = NULL; // vxCreateScalar( context, /* Fill in parameters */ ); // vx_scalar sensitivity = NULL; // vxCreateScalar( /* Fill in parameters */ ); // vx_scalar epsilon = NULL; // vxCreateScalar( /* Fill in parameters */ ); // vx_scalar num_iterations = NULL; // vxCreateScalar( context, VX_TYPE_UINT32, /* Fill in parameter */ ); // vx_scalar use_initial_estimate = NULL; // vxCreateScalar( context, VX_TYPE_BOOL, &lk_use_initial_estimate ); // ERROR_CHECK_OBJECT( strength_thresh ); ////////******** // Now all the objects have been created for building the graphs. // First, build a graph that performs Harris corner detection and initial pyramid computation. // See "VX/vx_nodes.h" for APIs how to add nodes into a graph. // // TODO STEP 06:******** // 1. Use vxColorConvertNode and vxChannelExtractNode APIs to get gray // scale image for Harris and Pyramid computation from the input // RGB image. Add these nodes into Harris graph. // We gave code in comments with a missing parameter for you to fill in. // 2. Use vxGaussianPyramidNode API to add pyramid computation node. // You need to use the current pyramid from the pyramid delay object. // We gave code in comments with a missing parameter for you to fill in. // 3. Use vxHarrisCornersNode API to add a Harris corners node. // You need to use the current keypoints from keypoints delay object. // We gave code in comments with few missing parameters for you to fill in. // 4. Use ERROR_CHECK_OBJECT to check proper creation of objects. // 5. Release node and virtual objects immediately since the graph // retains references to them. // 6. Call vxVerifyGraph to check for any errors in the graph. // Fill in missing parameter in commented code. // vx_node nodesHarris[] = // { // vxColorConvertNode( graphHarris, input_rgb_image, harris_yuv_image ), // vxChannelExtractNode( graphHarris, /* Fill in parameter */, VX_CHANNEL_Y, harris_luma_image ), // vxGaussianPyramidNode( graphHarris, /* Fill in parameter */, currentPyramid ), // vxHarrisCornersNode( graphHarris, /* Fill in missing parameters */, currentKeypoints, NULL ) // }; // for( vx_size i = 0; i < sizeof( nodesHarris ) / sizeof( nodesHarris[0] ); i++ ) // { // ERROR_CHECK_OBJECT( nodesHarris[i] ); // ERROR_CHECK_STATUS( vxReleaseNode( &nodesHarris[i] ) ); // } // ERROR_CHECK_STATUS( vxReleaseImage( &harris_yuv_image ) ); // ERROR_CHECK_STATUS( vxReleaseImage( &harris_luma_image ) ); // ERROR_CHECK_STATUS( vxVerifyGraph( /* Fill in parameter */ ) ); ////////******** // Now, build a graph that performs pyramid computation and feature // tracking using optical flow. // // TODO STEP 07:******** // 1. Use vxColorConvertNode and vxChannelExtractNode APIs to get a gray // scale image for Harris and Pyramid computation from the input // RGB image. Add these nodes into Harris graph. // We gave the code in comments for color convert node; do similar // one for the channel extract node. // 2. Use vxGaussianPyramidNode API to add pyramid computation node. // You need to use the current pyramid from the pyramid delay object. // Most of the code is given in the comments; fill in the missing parameter. // 3. Use vxOpticalFlowPyrLKNode API to add an optical flow node. You need to // use the current and previous keypoints from the keypoints delay object. // Fill in the missing parameters in commented code. // 4. Use ERROR_CHECK_OBJECT to check proper creation of objects. // 5. Release node and virtual objects immediately since the graph // retains references to them. // 6. Call vxVerifyGraph to check for any errors in the graph. // Fill in the missing parameter in commented code. // vx_node nodesTrack[] = // { // vxColorConvertNode( graphTrack, input_rgb_image, opticalflow_yuv_image ), // vxChannelExtractNode( graphTrack, /* Fill in parameters */ ), // vxGaussianPyramidNode( graphTrack, /* Fill in parameter */, currentPyramid ), // vxOpticalFlowPyrLKNode( graphTrack, /* Fill in parameters */ ) // }; // for( vx_size i = 0; i < sizeof( nodesTrack ) / sizeof( nodesTrack[0] ); i++ ) // { // ERROR_CHECK_OBJECT( nodesTrack[i] ); // ERROR_CHECK_STATUS( vxReleaseNode( &nodesTrack[i] ) ); // } // ERROR_CHECK_STATUS( vxReleaseImage( &opticalflow_yuv_image ) ); // ERROR_CHECK_STATUS( vxReleaseImage( &opticalflow_luma_image ) ); // ERROR_CHECK_STATUS( vxVerifyGraph( /* Fill in parameter */ ) ); //////// // Process the video sequence frame by frame until the end of sequence or aborted. for( int frame_index = 0; !gui.AbortRequested(); frame_index++ ) { //////// // Copy the input RGB frame from OpenCV to OpenVX. // In order to do this, you need to use vxAccessImagePatch and vxCommitImagePatch APIs. // See "VX/vx_api.h" for the description of these APIs. vx_rectangle_t cv_rgb_image_region; cv_rgb_image_region.start_x = 0; cv_rgb_image_region.start_y = 0; cv_rgb_image_region.end_x = width; cv_rgb_image_region.end_y = height; vx_imagepatch_addressing_t cv_rgb_image_layout; cv_rgb_image_layout.stride_x = 3; cv_rgb_image_layout.stride_y = gui.GetStride(); vx_uint8 * cv_rgb_image_buffer = gui.GetBuffer(); ERROR_CHECK_STATUS( vxAccessImagePatch( input_rgb_image, &cv_rgb_image_region, 0, &cv_rgb_image_layout, ( void ** )&cv_rgb_image_buffer, VX_WRITE_ONLY ) ); ERROR_CHECK_STATUS( vxCommitImagePatch( input_rgb_image, &cv_rgb_image_region, 0, &cv_rgb_image_layout, cv_rgb_image_buffer ) ); ////////******** // Now that input RGB image is ready, just run a graph. // Run Harris at the beginning to initialize the previous keypoints. // // TODO STEP 08:******** // 1. Run a graph using vxProcessGraph API. Select Harris graph // if the frame_index == 0 (i.e., the first frame of the video // sequence), otherwise, select the feature tracking graph. // 2. Use ERROR_CHECK_STATUS for error checking. ////////******** // To mark the keypoints in display, you need to access the output // keypoint array and draw each item on the output window using gui.DrawArrow(). // // TODO STEP 09:******** // 1. Use vxGetReferenceFromDelay API to get the current and previous // keypoints array objects from the keypoints delay object. // Make sure to typecast the vx_reference object to vx_array. // We gave one for the previous previous keypoint array in comments; // do a similar one for the current keypoint array. // 2. OpenVX array object has an attribute that keeps the current // number of items in the array. The name of the attribute is // VX_ARRAY_ATTRIBUTE_NUMITEMS and its value is of type vx_size. // Use vxQueryArray API to get number of keypoints in the // current keypoint array data object, representing number of // corners detected in the input RGB image. // IMPORTANT: Read number of items into "num_corners" // because this variable is displayed by code segment below. // We gave most part of this statement in comment; just fill in the // missing parameter. // 3. The data items in output keypoint array are of type // vx_keypoint_t (see "VX/vx_types.h"). To access the array // buffer, use vxAccessArrayRange with start index = 0, // end index = number of items in the array, and usage mode = // VX_READ_ONLY. Note that the stride returned by this access // call is not guaranteed to be sizeof(vx_keypoint_t). // Also make sure that num_corners is > 0, because // vxAccessArrayRange expects end index > 0. // We gave the code for previous keypoint array in comment; // do similar one for the current keypoint array. // 4. For each item in the keypoint buffer, use vxArrayItem to // access an individual keypoint and draw a marker at (x,y) // using gui.DrawArrow() if tracking_status field of keypoint // is non-zero. Also count number of keypoints with // non-zero tracking_status into "num_tracking" variable. // We gave most of the code; fill in the missing parameters and uncomment. // 5. Hand the control of output keypoint buffer over back to // OpenVX framework by calling vxCommitArrayRange API. // We gave the code for previous keypoint array in comment; // do similar one for the current keypoint array. // 6. Use ERROR_CHECK_STATUS for error checking. vx_size num_corners = 0, num_tracking = 0; // previousKeypoints = ( vx_array )vxGetReferenceFromDelay( keypointsDelay, -1 ); // currentKeypoints = ( vx_array )vxGetReferenceFromDelay( /* Fill in parameters */ ); // ERROR_CHECK_OBJECT( currentKeypoints ); // ERROR_CHECK_OBJECT( previousKeypoints ); // ERROR_CHECK_STATUS( vxQueryArray( previousKeypoints, /* Fill in parameter */, &num_corners, sizeof( num_corners ) ) ); if( num_corners > 0 ) { vx_size kp_old_stride, kp_new_stride; vx_keypoint_t * kp_old_buf = NULL, * kp_new_buf = NULL; // ERROR_CHECK_STATUS( vxAccessArrayRange( previousKeypoints, 0, num_corners, // &kp_old_stride, ( void ** ) &kp_old_buf, VX_READ_ONLY ) ); // ERROR_CHECK_STATUS( vxAccessArrayRange( /* Fill in parameters */ ); for( vx_size i = 0; i < num_corners; i++ ) { // vx_keypoint_t * kp_old = &vxArrayItem( vx_keypoint_t, kp_old_buf, i, kp_old_stride ); // vx_keypoint_t * kp_new = &vxArrayItem( /* Fill in parameters */ ); // if( kp_new->tracking_status ) // { // num_tracking++; // gui.DrawArrow( kp_old->x, kp_old->y, kp_new->x, kp_new->y ); // } } // ERROR_CHECK_STATUS( vxCommitArrayRange( previousKeypoints, 0, num_corners, kp_old_buf ) ); // ERROR_CHECK_STATUS( vxCommitArrayRange( /* Fill in parameters */ ) ); } ////////******** // Flip the current and previous pyramid and keypoints in the delay objects. // // TODO STEP 10:******** // 1. Use vxAgeDelay API to flip the current and previous buffers in delay objects. // You need to call vxAgeDelay for both two delay objects. // 2. Use ERROR_CHECK_STATUS for error checking. // ERROR_CHECK_STATUS( vxAgeDelay( /* Fill in parameter */ ) ); // ERROR_CHECK_STATUS( vxAgeDelay( /* Fill in parameter */ ) ); //////// // Display the results and grab the next input RGB frame for the next iteration. char text[128]; sprintf( text, "Keyboard ESC/Q-Quit SPACE-Pause [FRAME %d]", frame_index ); gui.DrawText( 0, 16, text ); sprintf( text, "Number of Corners: %d [tracking %d]", ( int )num_corners, ( int )num_tracking ); gui.DrawText( 0, 36, text ); gui.Show(); if( !gui.Grab() ) { // Terminate the processing loop if the end of sequence is detected. gui.WaitForKey(); break; } } ////////******** // Query graph performance using VX_GRAPH_ATTRIBUTE_PERFORMANCE and print timing // in milliseconds. Note that time units of vx_perf_t fields are nanoseconds. // // TODO STEP 11:******** // 1. Use vxQueryGraph API with VX_GRAPH_ATTRIBUTE_PERFORMANCE to query graph performance. // We gave the attribute query for one graph in comments. Do the same for the second graph. // 2. Print the average and min execution times in milliseconds. Use the printf in comments. // vx_perf_t perfHarris = { 0 }, perfTrack = { 0 }; // ERROR_CHECK_STATUS( vxQueryGraph( graphHarris, VX_GRAPH_ATTRIBUTE_PERFORMANCE, &perfHarris, sizeof( perfHarris ) ) ); // ERROR_CHECK_STATUS( vxQueryGraph( /* Fill in parameters here for get performance of the other graph */ ); // printf( "GraphName NumFrames Avg(ms) Min(ms)\n" // "Harris %9d %7.3f %7.3f\n" // "Track %9d %7.3f %7.3f\n", // ( int )perfHarris.num, ( float )perfHarris.avg * 1e-6f, ( float )perfHarris.min * 1e-6f, // ( int )perfTrack.num, ( float )perfTrack.avg * 1e-6f, ( float )perfTrack.min * 1e-6f ); ////////******** // Release all the OpenVX objects created in this exercise, and make the context as the last one to release. // To release an OpenVX object, you need to call vxRelease<Object> API which takes a pointer to the object. // If the release operation is successful, the OpenVX framework will reset the object to NULL. // // TODO STEP 12:******** // 1. For releasing all other objects use vxRelease<Object> APIs. // You have to release 2 graph objects, 1 image object, 2 delay objects, // 6 scalar objects, and 1 context object. // 2. Use ERROR_CHECK_STATUS for error checking. // ERROR_CHECK_STATUS( vxReleaseContext( &context ) ); return 0; }
static vx_status VX_CALLBACK validateKernel(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[]) { // check input configuration vx_enum type, format; vx_size num_dims, input_dims[4] = { 1, 1, 1, 1 }; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims[0])*num_dims)); if ((num_dims != 4 && num_dims != 3) || ((input_dims[0] & 3) != 0)) return VX_ERROR_INVALID_DIMENSION; if (type != VX_TYPE_FLOAT32) return VX_ERROR_INVALID_TYPE; // check output object type and set configuration ERROR_CHECK_STATUS(vxQueryReference(parameters[1], VX_REFERENCE_TYPE, &type, sizeof(type))); if (type == VX_TYPE_IMAGE) { vx_df_image format; ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[1], VX_IMAGE_FORMAT, &format, sizeof(format))); if(format == VX_DF_IMAGE_U8 && input_dims[2] > 255) return VX_ERROR_INVALID_FORMAT; if(format == VX_DF_IMAGE_VIRT) format = (input_dims[2] < 256) ? VX_DF_IMAGE_U8 : VX_DF_IMAGE_U16; vx_uint32 width = (vx_uint32)input_dims[0]; vx_uint32 height = (vx_uint32)(input_dims[1]*input_dims[3]); ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_WIDTH, &width, sizeof(width))); ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_HEIGHT, &height, sizeof(height))); ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_FORMAT, &format, sizeof(format))); } else if (type == VX_TYPE_TENSOR) { vx_size output_num_dims, output_dims[4] = { 1, 1, 1, 1 }; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &output_num_dims, sizeof(output_num_dims))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims[0])*output_num_dims)); if (output_dims[2] != 1 && output_dims[2] != 2) // top_k must be 1 or 2 return VX_ERROR_INVALID_DIMENSION; if(type == VX_TYPE_UINT8 && input_dims[2] > 255) return VX_ERROR_INVALID_FORMAT; if(type != VX_TYPE_UINT8 && type != VX_TYPE_UINT16 && type != VX_TYPE_INT16) return VX_ERROR_INVALID_FORMAT; ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type))); ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_NUMBER_OF_DIMS, &output_num_dims, sizeof(output_num_dims))); ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims[0])*output_num_dims)); } else return VX_ERROR_INVALID_PARAMETERS; return VX_SUCCESS; }
//! \brief The OpenCL code generator callback. static vx_status VX_CALLBACK opencl_codegen( vx_node node, // [input] node const vx_reference parameters[], // [input] parameters vx_uint32 num, // [input] number of parameters bool opencl_load_function, // [input] false: normal OpenCL kernel; true: reserved char opencl_kernel_function_name[64], // [output] kernel_name for clCreateKernel() std::string& opencl_kernel_code, // [output] string for clCreateProgramWithSource() std::string& opencl_build_options, // [output] options for clBuildProgram() vx_uint32& opencl_work_dim, // [output] work_dim for clEnqueueNDRangeKernel() vx_size opencl_global_work[], // [output] global_work[] for clEnqueueNDRangeKernel() vx_size opencl_local_work[], // [output] local_work[] for clEnqueueNDRangeKernel() vx_uint32& opencl_local_buffer_usage_mask, // [output] reserved: must be ZERO vx_uint32& opencl_local_buffer_size_in_bytes // [output] reserved: must be ZERO ) { // get configuration vx_size num_dims, input_dims[4] = { 1, 1, 1, 1 }, top_k = 1; vx_enum output_obj_type, output_data_type = VX_TYPE_UINT16; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims[0])*num_dims)); ERROR_CHECK_STATUS(vxQueryReference(parameters[1], VX_REFERENCE_TYPE, &output_obj_type, sizeof(output_obj_type))); if(output_obj_type == VX_TYPE_IMAGE) { vx_df_image format; ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[1], VX_IMAGE_FORMAT, &format, sizeof(format))); if(format == VX_DF_IMAGE_U8) output_data_type = VX_TYPE_UINT8; else if(format == VX_DF_IMAGE_U16) output_data_type = VX_TYPE_UINT16; } else { vx_size num_dims_output, output_dims[4] = { 1, 1, 1, 1 }; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims_output, sizeof(num_dims_output))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims[0])*num_dims_output)); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &output_data_type, sizeof(output_data_type))); top_k = output_dims[2]; } size_t N = input_dims[3]; // compute global work opencl_work_dim = 3; opencl_local_work[0] = 8; opencl_local_work[1] = 8; opencl_local_work[2] = 1; opencl_global_work[0] = ((input_dims[0] + 3) / 4 + opencl_local_work[0] - 1) & ~(opencl_local_work[0] - 1); opencl_global_work[1] = (input_dims[1] + opencl_local_work[1] - 1) & ~(opencl_local_work[1] - 1); opencl_global_work[2] = N; // generate OpenCL C code strcpy(opencl_kernel_function_name, "argmax"); char item[8192]; sprintf(item, "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" "__kernel __attribute__((reqd_work_group_size(%ld, %ld, 1)))\n" // opencl_local_work[0] opencl_local_work[1] "void %s(__global uchar * i0_buf, uint i0_offset, uint4 i0_stride, %s)\n" "{\n" " uint x = get_global_id(0) * 4;\n" " uint y = get_global_id(1);\n" " uint z = get_global_id(2);\n" " if(x < %ld && y < %ld) {\n" " i0_buf += i0_offset + z * i0_stride.s3 + y * i0_stride.s1 + x * i0_stride.s0;\n" " uint4 cmax;\n" , opencl_local_work[0], opencl_local_work[1], opencl_kernel_function_name , (output_obj_type == VX_TYPE_IMAGE) ? "uint o0_width, uint o0_height, __global uchar * o0_buf, uint o0_stride, uint o0_offset" : "__global uchar * o0_buf, uint o0_offset, uint4 o0_stride" , input_dims[0], input_dims[1]); opencl_kernel_code = item; if(top_k == 2) { sprintf(item, " uint4 cmax1;\n" " float4 f, fmax, fmax1;\n" " fmax = *(__global float4 *)i0_buf;\n" " i0_buf += i0_stride.s2; f = *(__global float4 *)i0_buf;\n" " cmax1.s0 = (f.s0 > fmax.s0) ? 0 : 1;\n" " cmax.s0 = (f.s0 > fmax.s0) ? 1 : 0;\n" " fmax1.s0 = (f.s0 > fmax.s0) ? fmax.s0 : f.s0;\n" " fmax.s0 = (f.s0 > fmax.s0) ? f.s0 : fmax.s0;\n" " cmax1.s1 = (f.s1 > fmax.s1) ? 0 : 1;\n" " cmax.s1 = (f.s1 > fmax.s1) ? 1 : 0;\n" " fmax1.s1 = (f.s1 > fmax.s1) ? fmax.s1 : f.s1;\n" " fmax.s1 = (f.s1 > fmax.s1) ? f.s1 : fmax.s1;\n" " cmax1.s2 = (f.s2 > fmax.s2) ? 0 : 1;\n" " cmax.s2 = (f.s2 > fmax.s2) ? 1 : 0;\n" " fmax1.s2 = (f.s2 > fmax.s2) ? fmax.s2 : f.s2;\n" " fmax.s2 = (f.s2 > fmax.s2) ? f.s2 : fmax.s2;\n" " cmax1.s3 = (f.s3 > fmax.s3) ? 0 : 1;\n" " cmax.s3 = (f.s3 > fmax.s3) ? 1 : 0;\n" " fmax1.s3 = (f.s3 > fmax.s3) ? fmax.s3 : f.s3;\n" " fmax.s3 = (f.s3 > fmax.s3) ? f.s3 : fmax.s3;\n" " for(uint c = 2; c < %ld; c++) {\n" " i0_buf += i0_stride.s2; f = *(__global float4 *)i0_buf;\n" " cmax1.s0 = (f.s0 > fmax.s0) ? cmax.s0 : ((f.s0 > fmax1.s0) ? c : cmax1.s0);\n" " fmax1.s0 = (f.s0 > fmax.s0) ? fmax.s0 : ((f.s0 > fmax1.s0) ? f.s0 : fmax1.s0);\n" " cmax.s0 = (f.s0 > fmax.s0) ? c : cmax.s0;\n" " fmax.s0 = (f.s0 > fmax.s0) ? f.s0 : fmax.s0;\n" " cmax1.s1 = (f.s1 > fmax.s1) ? cmax.s1 : ((f.s1 > fmax1.s1) ? c : cmax1.s1);\n" " fmax1.s1 = (f.s1 > fmax.s1) ? fmax.s1 : ((f.s1 > fmax1.s1) ? f.s1 : fmax1.s1);\n" " cmax.s1 = (f.s1 > fmax.s1) ? c : cmax.s1;\n" " fmax.s1 = (f.s1 > fmax.s1) ? f.s1 : fmax.s1;\n" " cmax1.s2 = (f.s2 > fmax.s2) ? cmax.s2 : ((f.s2 > fmax1.s2) ? c : cmax1.s2);\n" " fmax1.s2 = (f.s2 > fmax.s2) ? fmax.s2 : ((f.s2 > fmax1.s2) ? f.s2 : fmax1.s2);\n" " cmax.s2 = (f.s2 > fmax.s2) ? c : cmax.s2;\n" " fmax.s2 = (f.s2 > fmax.s2) ? f.s2 : fmax.s2;\n" " cmax1.s3 = (f.s3 > fmax.s3) ? cmax.s3 : ((f.s3 > fmax1.s3) ? c : cmax1.s3);\n" " fmax1.s3 = (f.s3 > fmax.s3) ? fmax.s3 : ((f.s3 > fmax1.s3) ? f.s3 : fmax1.s3);\n" " cmax.s3 = (f.s3 > fmax.s3) ? c : cmax.s3;\n" " fmax.s3 = (f.s3 > fmax.s3) ? f.s3 : fmax.s3;\n" " }\n" , input_dims[2]); opencl_kernel_code += item; } else if (top_k == 1) { sprintf(item, " cmax = (uint4)0;\n" " float4 fmax = *(__global float4 *)i0_buf;\n" " for(uint c = 1; c < %ld; c++) {\n" " i0_buf += i0_stride.s2;\n" " float4 f = *(__global float4 *)i0_buf;\n" " cmax.s0 = (f.s0 > fmax.s0) ? c : cmax.s0;\n" " fmax.s0 = (f.s0 > fmax.s0) ? f.s0 : fmax.s0;\n" " cmax.s1 = (f.s1 > fmax.s1) ? c : cmax.s1;\n" " fmax.s1 = (f.s1 > fmax.s1) ? f.s1 : fmax.s1;\n" " cmax.s2 = (f.s2 > fmax.s2) ? c : cmax.s2;\n" " fmax.s2 = (f.s2 > fmax.s2) ? f.s2 : fmax.s2;\n" " cmax.s3 = (f.s3 > fmax.s3) ? c : cmax.s3;\n" " fmax.s3 = (f.s3 > fmax.s3) ? f.s3 : fmax.s3;\n" " }\n" , input_dims[2]); opencl_kernel_code += item; } if(output_data_type == VX_TYPE_UINT8) { if(output_obj_type == VX_TYPE_IMAGE) { sprintf(item, " o0_buf += o0_offset + (z * %ld + y) * o0_stride + x;\n" , input_dims[1]); opencl_kernel_code += item; } else { opencl_kernel_code += " o0_buf += o0_offset + z * o0_stride.s3 + y * o0_stride.s1 + x * o0_stride.s0;\n"; } opencl_kernel_code += " uint imax = cmax.s0 + (cmax.s1 << 8) + (cmax.s2 << 16) + (cmax.s3 << 24);\n" " *(__global uint *)o0_buf = imax;\n"; if(top_k == 2) { opencl_kernel_code += " uint imax1 = cmax1.s0 + (cmax1.s1 << 8) + (cmax1.s2 << 16) + (cmax1.s3 << 24);\n" " *(__global uint *)&o0_buf[o0_stride.s2] = imax1;\n"; } } else if(output_data_type == VX_TYPE_UINT16) { if(output_obj_type == VX_TYPE_IMAGE) { sprintf(item, " o0_buf += o0_offset + (z * %ld + y) * o0_stride + x * 2;\n" , input_dims[1]); opencl_kernel_code += item; } else { opencl_kernel_code += " o0_buf += o0_offset + z * o0_stride.s3 + y * o0_stride.s1 + x * o0_stride.s0;\n"; } opencl_kernel_code += " uint2 imax;\n" " imax.s0 = cmax.s0 + (cmax.s1 << 16);\n" " imax.s1 = cmax.s2 + (cmax.s3 << 16);\n" " *(__global uint2 *)o0_buf = imax;\n"; if(top_k == 2) { opencl_kernel_code += " uint2 imax1;\n" " imax1.s0 = cmax1.s0 + (cmax1.s1 << 16);\n" " imax1.s1 = cmax1.s2 + (cmax1.s3 << 16);\n" " *(__global uint2 *)&o0_buf[o0_stride.s2] = imax1;\n"; } } opencl_kernel_code += " }\n" "}\n"; #if ENABLE_DEBUG_PRINT_DIMS std::cout << "KERNEL argmax_layer output " << input_dims[0] << "x" << input_dims[1] << " " << std::endl; #endif return VX_SUCCESS; }
//////// // User kernel host side function gets called to execute the user kernel node. // Perform element-wise consine function on input tensor to produce output tensor. // // TODO:******** // 1. Get fixed-point position and dimensions of input and output tensors. // Note that both input and output tensors have same dimensions. // 2. Access input and output tensor object data using vxMapTensorPatch API. // 3. Perform element-wise cosine function using fixed-point position. // 4. Use vxUnmapTensorPatch API to give the data buffers control back to OpenVX framework. vx_status VX_CALLBACK tensor_cos_host_side_function( vx_node node, const vx_reference * refs, vx_uint32 num ) { // Get fixed-point position and dimensions of input and output tensors. // Note that both input and output tensors have same dimensions. vx_tensor input = ( vx_tensor ) refs[0]; vx_tensor output = ( vx_tensor ) refs[1]; vx_size num_of_dims; vx_size dims[4] = { 1, 1, 1, 1 }; vx_uint8 input_fixed_point_pos; vx_uint8 output_fixed_point_pos; ERROR_CHECK_STATUS( vxQueryTensor( input, VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof( num_of_dims ) ) ); ERROR_CHECK_STATUS( vxQueryTensor( input, VX_TENSOR_DIMS, &dims, num_of_dims * sizeof(vx_size) ) ); ERROR_CHECK_STATUS( vxQueryTensor( input, VX_TENSOR_FIXED_POINT_POSITION, &input_fixed_point_pos, sizeof( input_fixed_point_pos ) ) ); ERROR_CHECK_STATUS( vxQueryTensor( output, VX_TENSOR_FIXED_POINT_POSITION, &output_fixed_point_pos, sizeof( output_fixed_point_pos ) ) ); // Access input and output tensor object data using vxMapTensorPatch API. vx_size zeros[4] = { 0 }; vx_map_id map_input, map_output; vx_uint8 * buf_input, * buf_output; vx_size stride_input[4] = { 0 }; vx_size stride_output[4] = { 0 }; ERROR_CHECK_STATUS( vxMapTensorPatch( input, num_of_dims, zeros, dims, &map_input, stride_input, (void **)&buf_input, VX_READ_ONLY, VX_MEMORY_TYPE_HOST, 0 ) ); ERROR_CHECK_STATUS( vxMapTensorPatch( output, num_of_dims, zeros, dims, &map_output, stride_output, (void **)&buf_output, VX_READ_ONLY, VX_MEMORY_TYPE_HOST, 0 ) ); // Perform element-wise cosine function using fixed-point position. vx_float32 input_to_float_multiplier = 1.0f / (vx_float32)(1 << input_fixed_point_pos); vx_float32 output_to_int16_multiplier = (vx_float32)(1 << output_fixed_point_pos); for( vx_size dim3 = 0; dim3 < dims[3]; dim3++) { for( vx_size dim2 = 0; dim2 < dims[2]; dim2++) { for( vx_size dim1 = 0; dim1 < dims[1]; dim1++) { const vx_int16 * ibuf = (const vx_int16 *) (buf_input + dim3 * stride_input[3] + dim2 * stride_input[2] + dim1 * stride_input[1] ); vx_int16 * obuf = (vx_int16 *) (buf_output + dim3 * stride_output[3] + dim2 * stride_output[2] + dim1 * stride_output[1] ); for( vx_size dim0 = 0; dim0 < dims[0]; dim0++) { // no saturation done here vx_int16 ivalue = ibuf[dim0]; vx_int16 ovalue = (vx_int16)(cosf((vx_float32)ivalue * input_to_float_multiplier) * output_to_int16_multiplier + 0.5f); obuf[dim0] = ovalue; } } } } // Use vxUnmapTensorPatch API to give the data buffers control back to OpenVX framework. ERROR_CHECK_STATUS( vxUnmapTensorPatch( input, map_input ) ); ERROR_CHECK_STATUS( vxUnmapTensorPatch( output, map_output ) ); return VX_SUCCESS; }
static vx_status VX_CALLBACK validateImageToTensorKernel(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[]) { // check input configuration vx_uint32 width, height; vx_df_image format; ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[0], VX_IMAGE_WIDTH, &width, sizeof(width))); ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[0], VX_IMAGE_HEIGHT, &height, sizeof(height))); ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[0], VX_IMAGE_FORMAT, &format, sizeof(format))); if(format != VX_DF_IMAGE_RGB && format != VX_DF_IMAGE_U8) return VX_ERROR_INVALID_FORMAT; vx_enum scalar_type; ERROR_CHECK_STATUS(vxQueryScalar((vx_scalar)parameters[2], VX_SCALAR_TYPE, &scalar_type, sizeof(scalar_type))); if(scalar_type != VX_TYPE_FLOAT32) return VX_ERROR_INVALID_TYPE; ERROR_CHECK_STATUS(vxQueryScalar((vx_scalar)parameters[3], VX_SCALAR_TYPE, &scalar_type, sizeof(scalar_type))); if(scalar_type != VX_TYPE_FLOAT32) return VX_ERROR_INVALID_TYPE; ERROR_CHECK_STATUS(vxQueryScalar((vx_scalar)parameters[4], VX_SCALAR_TYPE, &scalar_type, sizeof(scalar_type))); if(scalar_type != VX_TYPE_BOOL) return VX_ERROR_INVALID_TYPE; // check output dimensions vx_enum type; vx_size num_dims, output_dims[4] = { 1, 1, 1, 1 }; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims))); if (type != VX_TYPE_FLOAT32) return VX_ERROR_INVALID_TYPE; if (num_dims != 4) return VX_ERROR_INVALID_DIMENSION; ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims[0])*num_dims)); if ((output_dims[2] != 3 && output_dims[2] != 1) || output_dims[0] != (size_t)width || (output_dims[1] * output_dims[3]) != (size_t)height) return VX_ERROR_INVALID_DIMENSION; // set output tensor configuration ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type))); ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims))); ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims))); return VX_SUCCESS; }
//! \brief The OpenCL code generator callback. static vx_status VX_CALLBACK opencl_codegen( vx_node node, // [input] node const vx_reference parameters[], // [input] parameters vx_uint32 num, // [input] number of parameters bool opencl_load_function, // [input] false: normal OpenCL kernel; true: reserved char opencl_kernel_function_name[64], // [output] kernel_name for clCreateKernel() std::string& opencl_kernel_code, // [output] string for clCreateProgramWithSource() std::string& opencl_build_options, // [output] options for clBuildProgram() vx_uint32& opencl_work_dim, // [output] work_dim for clEnqueueNDRangeKernel() vx_size opencl_global_work[], // [output] global_work[] for clEnqueueNDRangeKernel() vx_size opencl_local_work[], // [output] local_work[] for clEnqueueNDRangeKernel() vx_uint32& opencl_local_buffer_usage_mask, // [output] reserved: must be ZERO vx_uint32& opencl_local_buffer_size_in_bytes // [output] reserved: must be ZERO ) { // get configuration vx_df_image format; vx_size num_dims, input_dims[4] = { 1, 1, 1, 1 }; ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[1], VX_IMAGE_FORMAT, &format, sizeof(format))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims[0])*num_dims)); vx_uint32 width = (vx_uint32)input_dims[0]; vx_uint32 height = (vx_uint32)input_dims[1]; vx_uint32 N = (vx_uint32)input_dims[3]; // compute global work vx_uint32 width_div_4 = (width + 3) / 4; opencl_work_dim = 3; opencl_local_work[0] = 8; opencl_local_work[1] = 8; opencl_local_work[2] = 1; opencl_global_work[0] = (width_div_4 + opencl_local_work[0] - 1) & ~(opencl_local_work[0] - 1); opencl_global_work[1] = (height + opencl_local_work[1] - 1) & ~(opencl_local_work[1] - 1); opencl_global_work[2] = N; // generate OpenCL C code strcpy(opencl_kernel_function_name, "tensor_to_image"); if(format == VX_DF_IMAGE_RGB) { char item[8192]; sprintf(item, "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" "__kernel __attribute__((reqd_work_group_size(%ld, %ld, 1)))\n" // opencl_local_work[0] opencl_local_work[1] "void %s(__global uchar * i0_buf, uint i0_offset, uint4 i0_stride, uint o0_width, uint o0_height, __global uchar * o0_buf, uint o0_stride, uint o0_offset, float ka, float kb, uint reverse_channel_order)\n" "{\n" " uint x = get_global_id(0) * 4;\n" " uint y = get_global_id(1);\n" " uint n = get_global_id(2);\n" " if(x < %d && y < %d) {\n" " i0_buf += i0_offset + n * i0_stride.s3 + y * i0_stride.s1 + x * i0_stride.s0;\n" " float4 r = *(__global float4 *)&i0_buf[reverse_channel_order ? 2 * i0_stride.s2 : 0];\n" " float4 g = *(__global float4 *)&i0_buf[ i0_stride.s2 ];\n" " float4 b = *(__global float4 *)&i0_buf[reverse_channel_order ? 0 : 2 * i0_stride.s2];\n" " r = r * (float4)ka + (float4)kb;\n" " g = g * (float4)ka + (float4)kb;\n" " b = b * (float4)ka + (float4)kb;\n" " uint3 u3;\n" " u3.s0 = amd_pack((float4)(r.s0, g.s0, b.s0, r.s1));\n" " u3.s1 = amd_pack((float4)(g.s1, b.s1, r.s2, g.s2));\n" " u3.s2 = amd_pack((float4)(b.s2, r.s3, g.s3, b.s3));\n" " vstore3(u3, 0, (__global uint *)&o0_buf[o0_offset + (y + n * %d) * o0_stride + x * 3]);\n" " }\n" "}\n" , opencl_local_work[0], opencl_local_work[1], opencl_kernel_function_name, width, height, height); opencl_kernel_code = item; } else { char item[8192]; sprintf(item, "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" "__kernel __attribute__((reqd_work_group_size(%ld, %ld, 1)))\n" // opencl_local_work[0] opencl_local_work[1] "void %s(__global uchar * i0_buf, uint i0_offset, uint4 i0_stride, uint o0_width, uint o0_height, __global uchar * o0_buf, uint o0_stride, uint o0_offset, float ka, float kb, uint reverse_channel_order)\n" "{\n" " uint x = get_global_id(0) * 4;\n" " uint y = get_global_id(1);\n" " uint n = get_global_id(2);\n" " if(x < %d && y < %d) {\n" " i0_buf += i0_offset + n * i0_stride.s3 + y * i0_stride.s1 + x * i0_stride.s0;\n" " float4 i = *(__global float4 *)i0_buf;\n" " i = i * (float4)ka + (float4)kb;\n" " *(__global uint *)&o0_buf[o0_offset + (y + n * %d) * o0_stride + x] = amd_pack((float4)(i.s0, i.s1, i.s2, i.s3));\n" " }\n" "}\n" , opencl_local_work[0], opencl_local_work[1], opencl_kernel_function_name, width, height, height); opencl_kernel_code = item; } #if ENABLE_DEBUG_PRINT_DIMS std::cout << "KERNEL tensor_to_image output " << width << "x" << height << " " << N << std::endl; #endif return VX_SUCCESS; }
//! \brief The OpenCL code generator callback. static vx_status VX_CALLBACK opencl_codegen( vx_node node, // [input] node const vx_reference parameters[], // [input] parameters vx_uint32 num, // [input] number of parameters bool opencl_load_function, // [input] false: normal OpenCL kernel; true: reserved char opencl_kernel_function_name[64], // [output] kernel_name for clCreateKernel() std::string& opencl_kernel_code, // [output] string for clCreateProgramWithSource() std::string& opencl_build_options, // [output] options for clBuildProgram() vx_uint32& opencl_work_dim, // [output] work_dim for clEnqueueNDRangeKernel() vx_size opencl_global_work[], // [output] global_work[] for clEnqueueNDRangeKernel() vx_size opencl_local_work[], // [output] local_work[] for clEnqueueNDRangeKernel() vx_uint32& opencl_local_buffer_usage_mask, // [output] reserved: must be ZERO vx_uint32& opencl_local_buffer_size_in_bytes // [output] reserved: must be ZERO ) { // get configuration vx_uint32 width, height, N; vx_df_image format; vx_size num_dims, output_dims[4] = { 1, 1, 1, 1 }; ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[0], VX_IMAGE_FORMAT, &format, sizeof(format))); ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[0], VX_IMAGE_WIDTH, &width, sizeof(width))); ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[0], VX_IMAGE_HEIGHT, &height, sizeof(height))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims))); ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims[0])*num_dims)); height = (vx_uint32)output_dims[1]; N = (vx_uint32)output_dims[3]; // generate OpenCL C code and compute global work strcpy(opencl_kernel_function_name, "image_to_tensor"); if(format == VX_DF_IMAGE_RGB) { opencl_work_dim = 3; opencl_local_work[0] = 8; opencl_local_work[1] = 8; opencl_local_work[2] = 1; opencl_global_work[0] = (width + opencl_local_work[0] - 1) & ~(opencl_local_work[0] - 1); opencl_global_work[1] = (height + opencl_local_work[1] - 1) & ~(opencl_local_work[1] - 1); opencl_global_work[2] = N; char item[8192]; sprintf(item, "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" "__kernel __attribute__((reqd_work_group_size(%ld, %ld, 1)))\n" // opencl_local_work[0] opencl_local_work[1] "void %s(uint i0_width, uint i0_height, __global uchar * i0_buf, uint i0_stride, uint i0_offset, __global uchar * o0_buf, uint o0_offset, uint4 o0_stride, float ka, float kb, uint reverse_channel_order)\n" "{\n" " uint x = get_global_id(0);\n" " uint y = get_global_id(1);\n" " uint n = get_global_id(2);\n" " if(x < %d && y < %d) {\n" " uint ioffset = i0_offset + (y + n * %d) * i0_stride + x * 3;\n" " uint2 rgb2 = vload2(0, (__global uint *)&i0_buf[ioffset & ~3]);\n" " uint rgb = amd_bytealign(rgb2.s1, rgb2.s0, ioffset & 3);\n" " float r = ka * amd_unpack0(rgb) + kb;\n" " float g = ka * amd_unpack1(rgb) + kb;\n" " float b = ka * amd_unpack2(rgb) + kb;\n" " o0_buf += o0_offset + n * o0_stride.s3 + y * o0_stride.s1 + x * o0_stride.s0;\n" " *(__global float *)&o0_buf[ 0] = reverse_channel_order ? b : r;\n" " *(__global float *)&o0_buf[ o0_stride.s2] = g;\n" " *(__global float *)&o0_buf[2 * o0_stride.s2] = reverse_channel_order ? r : b;\n" " }\n" "}\n" , opencl_local_work[0], opencl_local_work[1], opencl_kernel_function_name, width, height, height); opencl_kernel_code = item; } else if(format == VX_DF_IMAGE_U8) { opencl_work_dim = 3; opencl_local_work[0] = 8; opencl_local_work[1] = 8; opencl_local_work[2] = 1; opencl_global_work[0] = ((width+3)/4 + opencl_local_work[0] - 1) & ~(opencl_local_work[0] - 1); opencl_global_work[1] = (height + opencl_local_work[1] - 1) & ~(opencl_local_work[1] - 1); opencl_global_work[2] = N; char item[8192]; sprintf(item, "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n" "__kernel __attribute__((reqd_work_group_size(%ld, %ld, 1)))\n" // opencl_local_work[0] opencl_local_work[1] "void %s(uint i0_width, uint i0_height, __global uchar * i0_buf, uint i0_stride, uint i0_offset, __global uchar * o0_buf, uint o0_offset, uint4 o0_stride, float a, float b, uint reverse_channel_order)\n" "{\n" " uint x = get_global_id(0) * 4;\n" " uint y = get_global_id(1);\n" " uint n = get_global_id(2);\n" " if(x < %d && y < %d) {\n" " uint u4 = *(__global uint *)&i0_buf[i0_offset + (y + n * %d) * i0_stride + x];\n" " float p0 = a * amd_unpack0(u4) + b;\n" " float p1 = a * amd_unpack1(u4) + b;\n" " float p2 = a * amd_unpack2(u4) + b;\n" " float p3 = a * amd_unpack3(u4) + b;\n" " *(__global float4 *)&o0_buf[o0_offset + n * o0_stride.s3 + y * o0_stride.s1 + x * o0_stride.s0] = (float4)(p0 , p1, p2, p3);\n" " }\n" "}\n" , opencl_local_work[0], opencl_local_work[1], opencl_kernel_function_name, width, height, height); opencl_kernel_code = item; } #if ENABLE_DEBUG_PRINT_DIMS std::cout << "KERNEL image_to_tensor output " << width << " " << height << " " << N << std::endl; #endif return VX_SUCCESS; }