static vx_status VX_CALLBACK validateSoftmaxLayer(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[])
{
    // check tensor dimensions
    vx_enum type;
    vx_size num_dims;
    vx_size input_dims[4], output_dims[4];
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
    if(num_dims != 4) return VX_ERROR_INVALID_DIMENSION;
    if(type != VX_TYPE_FLOAT32) return VX_ERROR_INVALID_TYPE;
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
    if(num_dims != 4) return VX_ERROR_INVALID_DIMENSION;
    if(type != VX_TYPE_FLOAT32) return VX_ERROR_INVALID_TYPE;
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims)));
    if(output_dims[3] != input_dims[3]) return VX_ERROR_INVALID_DIMENSION;
    if(output_dims[2] != input_dims[2]) return VX_ERROR_INVALID_DIMENSION;
    if(output_dims[1] != input_dims[1]) return VX_ERROR_INVALID_DIMENSION;
    if(output_dims[0] != input_dims[0]) return VX_ERROR_INVALID_DIMENSION;

    // output tensor configuration
    type = VX_TYPE_FLOAT32;
    num_dims = 4;
    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims)));
    return VX_SUCCESS;
}
static vx_status VX_CALLBACK validateTensorToImageKernel(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[])
{
    // check input configuration
    vx_enum type;
    vx_size num_dims, input_dims[4] = { 1, 1, 1, 1 };
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
    if (num_dims != 4)
        return VX_ERROR_INVALID_DIMENSION;
    if (type != VX_TYPE_FLOAT32)
        return VX_ERROR_INVALID_TYPE;
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims[0])*num_dims));
    if ((input_dims[2] != 3 && input_dims[2] != 1) || ((input_dims[0] & 3) != 0))
        return VX_ERROR_INVALID_DIMENSION;
    vx_enum scalar_type;
    ERROR_CHECK_STATUS(vxQueryScalar((vx_scalar)parameters[2], VX_SCALAR_TYPE, &scalar_type, sizeof(scalar_type)));
    if(scalar_type != VX_TYPE_FLOAT32)
        return VX_ERROR_INVALID_TYPE;
    ERROR_CHECK_STATUS(vxQueryScalar((vx_scalar)parameters[3], VX_SCALAR_TYPE, &scalar_type, sizeof(scalar_type)));
    if(scalar_type != VX_TYPE_FLOAT32)
        return VX_ERROR_INVALID_TYPE;
    ERROR_CHECK_STATUS(vxQueryScalar((vx_scalar)parameters[4], VX_SCALAR_TYPE, &scalar_type, sizeof(scalar_type)));
    if(scalar_type != VX_TYPE_BOOL)
        return VX_ERROR_INVALID_TYPE;

    // set output image configuration
    vx_uint32 width = (vx_uint32)input_dims[0];
    vx_uint32 height = (vx_uint32)(input_dims[1]*input_dims[3]);
    vx_df_image format = (input_dims[2] == 3) ? VX_DF_IMAGE_RGB : VX_DF_IMAGE_U8;
    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_WIDTH, &width, sizeof(width)));
    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_HEIGHT, &height, sizeof(height)));
    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_FORMAT, &format, sizeof(format)));

    return VX_SUCCESS;
}
static vx_status VX_CALLBACK uninitializeSoftmaxLayer(vx_node node, const vx_reference *parameters, vx_uint32 num)
{
    SoftmaxLayerLocalData * data = NULL;
    ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_LOCAL_DATA_PTR, &data, sizeof(data)));
    ERROR_CHECK_MIOPEN_STATUS(miopenDestroyTensorDescriptor(data->input_desc));
    ERROR_CHECK_MIOPEN_STATUS(miopenDestroyTensorDescriptor(data->output_desc));
    if (data) {
        ERROR_CHECK_STATUS(releaseGraphHandle(node, data->handle));
        delete data;
    }
    return VX_SUCCESS;
}
static vx_status VX_CALLBACK processSoftmaxLayer(vx_node node, const vx_reference * parameters, vx_uint32 num)
{
    SoftmaxLayerLocalData * data = NULL;
    ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_LOCAL_DATA_PTR, &data, sizeof(data)));
    miopenHandle_t miopenHandle = data->handle->miopen_handle;

    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_OPENCL, &data->input_mem, sizeof(data->input_mem)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_OPENCL, &data->output_mem, sizeof(data->output_mem)));

    ERROR_CHECK_STATUS(miopenSoftmaxForward(miopenHandle, &data->alpha, data->input_desc, data->input_mem, &data->beta, data->output_desc, data->output_mem));

    return VX_SUCCESS;
}
////////
// The user kernel validator callback should check to make sure that all the input
// parameters have correct data types and set meta format for the output parameters.
// The input parameters to be validated are:
//   parameter #0  --  input tensor of format VX_TYPE_INT16
// The output parameters that requires setting meta format is:
//   parameter #1  --  output tebsor of format VX_TYPE_INT16 with the same dimensions as input
// TODO:********
//   1. Query the input tensor for the dimensions and format.
//   2. Check to make sure that the input tensor format is VX_TYPE_INT16.
//   3. Set the required output tensor meta data as following:
//      - output tensor dimensions should be same as input tensor
//      - output tensor format should be VX_TYPE_INT16
//      - output tensor fixed-point position can be whatever the user requested
//          * query the output tensor for the fixed-point position value
//          * set the same value in output tensor meta data
vx_status VX_CALLBACK tensor_cos_validator( vx_node node,
                                             const vx_reference parameters[], vx_uint32 num,
                                             vx_meta_format metas[] )
{
    // parameter #0 -- query dimensions and format
    vx_size num_of_dims;
    ERROR_CHECK_STATUS( vxQueryTensor( ( vx_tensor )parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof( num_of_dims ) ) );
    if( num_of_dims > 4 ) // sanity check to avoid stack corruption with querying VX_TENSOR_DIMS below
    {
        return VX_ERROR_INVALID_DIMENSION;
    }
    vx_size dims[4];
    ERROR_CHECK_STATUS( vxQueryTensor( ( vx_tensor )parameters[0], VX_TENSOR_DIMS, &dims, num_of_dims * sizeof(vx_size) ) );
    vx_enum data_type;
    ERROR_CHECK_STATUS( vxQueryTensor( ( vx_tensor )parameters[0], VX_TENSOR_DATA_TYPE, &data_type, sizeof( data_type ) ) );

    // parameter #0 -- check input tensor format to be VX_TYPE_INF16
    if( data_type != VX_TYPE_INT16 )
    {
        return VX_ERROR_INVALID_FORMAT;
    }

    // parameter #1 -- query fixed-point position
    vx_uint8 fixed_point_pos;
    ERROR_CHECK_STATUS( vxQueryTensor( ( vx_tensor )parameters[1], VX_TENSOR_FIXED_POINT_POSITION, &fixed_point_pos, sizeof( fixed_point_pos ) ) );

    // parameter #1 -- set required output tensor meta data
    ERROR_CHECK_STATUS( vxSetMetaFormatAttribute( metas[1], VX_TENSOR_NUMBER_OF_DIMS,  &num_of_dims,  sizeof( num_of_dims ) ) );
    ERROR_CHECK_STATUS( vxSetMetaFormatAttribute( metas[1], VX_TENSOR_DIMS, &dims, sizeof( dims ) ) );
    ERROR_CHECK_STATUS( vxSetMetaFormatAttribute( metas[1], VX_TENSOR_DATA_TYPE, &data_type, sizeof( data_type ) ) );
    ERROR_CHECK_STATUS( vxSetMetaFormatAttribute( metas[1], VX_TENSOR_FIXED_POINT_POSITION, &fixed_point_pos, sizeof( fixed_point_pos ) ) );

    return VX_SUCCESS;
}
static vx_status VX_CALLBACK processTensorAddition(vx_node node, const vx_reference * parameters, vx_uint32 num)
{
    TensorAddLocalData * data = NULL;
    ERROR_CHECK_STATUS(vxQueryNode(node, VX_NODE_LOCAL_DATA_PTR, &data, sizeof(data)));
    miopenHandle_t miopenHandle = data->handle->miopen_handle;

    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_OPENCL, &data->input1_mem, sizeof(data->input1_mem)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_OPENCL, &data->input2_mem, sizeof(data->input2_mem)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[3], VX_TENSOR_BUFFER_OPENCL, &data->output_mem, sizeof(data->output_mem)));

    //miopen elementwise addition call.
    ERROR_CHECK_MIOPEN_STATUS(miopenOpTensor(miopenHandle, data->operation, &data->alpha1, data->input1, data->input1_mem, &data->alpha2, data->input2, data->input2_mem, &data->beta, data->output, data->output_mem));

    return VX_SUCCESS;
}
////////
// The node creation interface for the "app.userkernels.tensor_cos" kernel.
// This user kernel example expects parameters in the following order:
//   parameter #0  --  input tensor  of format VX_TYPE_INT16
//   parameter #1  --  output tensor of format VX_TYPE_INT16
//
// TODO:********
//   1. Use vxGetKernelByEnum API to get a kernel object from USER_KERNEL_TENSOR_COS.
//      Note that you need to use vxGetContext API to get the context from a graph object.
//   2. Use vxCreateGenericNode API to create a node from the kernel object.
//   3. Use vxSetParameterByIndex API to set node arguments.
//   4. Release the kernel object that are not needed any more.
//   5. Use ERROR_CHECK_OBJECT and ERROR_CHECK_STATUS macros for error detection.
vx_node userTensorCosNode( vx_graph graph,
                           vx_tensor input,
                           vx_tensor output )
{
    vx_context context = vxGetContext( ( vx_reference ) graph );
    vx_kernel kernel   = vxGetKernelByEnum( context, USER_KERNEL_TENSOR_COS );
    ERROR_CHECK_OBJECT( kernel );
    vx_node node       = vxCreateGenericNode( graph, kernel );
    ERROR_CHECK_OBJECT( node );

    ERROR_CHECK_STATUS( vxSetParameterByIndex( node, 0, ( vx_reference ) input ) );
    ERROR_CHECK_STATUS( vxSetParameterByIndex( node, 1, ( vx_reference ) output ) );

    ERROR_CHECK_STATUS( vxReleaseKernel( &kernel ) );

    return node;
}
vx_status publishSoftmaxLayer(vx_context context)
{
    // add kernel to the context with callbacks
    vx_kernel kernel = vxAddUserKernel(context, "org.khronos.nn_extension.softmax_layer", VX_KERNEL_SOFTMAX_LAYER, processSoftmaxLayer, 2, validateSoftmaxLayer, initializeSoftmaxLayer, uninitializeSoftmaxLayer);
    ERROR_CHECK_OBJECT(kernel);

    // enable OpenCL buffer access since the kernel_f callback uses OpenCL buffers instead of host accessible buffers
    vx_bool enableBufferAccess = vx_true_e;
    ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_BUFFER_ACCESS_ENABLE, &enableBufferAccess, sizeof(enableBufferAccess)));

    // set kernel parameters
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));

    // finalize and release kernel object
    ERROR_CHECK_STATUS(vxFinalizeKernel(kernel));
    ERROR_CHECK_STATUS(vxReleaseKernel(&kernel));

    return VX_SUCCESS;
}
////////
// User kernels needs to be registered with every OpenVX context before use in a graph.
//
// TODO:********
//   1. Use vxAddUserKernel API to register "app.userkernels.tensor_cos" with
//      kernel enumeration = USER_KERNEL_TENSOR_COS, numParams = 2, and
//      all of the user kernel callback functions you implemented above.
//   2. Use vxAddParameterToKernel API to specify direction, data_type, and
//      state of all 2 parameters to the kernel. Look into the comments of
//      userTensorCosNode function (above) to details about the order of
//      kernel parameters and their types.
//   3. Use vxFinalizeKernel API to make the kernel ready to use in a graph.
//      Note that the kernel object is still valid after this call.
//      So you need to call vxReleaseKernel before returning from this function.
vx_status registerUserKernel( vx_context context )
{
    vx_kernel kernel = vxAddUserKernel( context,
                                    "app.userkernels.tensor_cos",
                                    USER_KERNEL_TENSOR_COS,
                                    tensor_cos_host_side_function,
                                    2,   // numParams
                                    tensor_cos_validator,
                                    NULL,
                                    NULL );
    ERROR_CHECK_OBJECT( kernel );

    ERROR_CHECK_STATUS( vxAddParameterToKernel( kernel, 0, VX_INPUT,  VX_TYPE_TENSOR,  VX_PARAMETER_STATE_REQUIRED ) ); // input
    ERROR_CHECK_STATUS( vxAddParameterToKernel( kernel, 1, VX_OUTPUT, VX_TYPE_TENSOR,  VX_PARAMETER_STATE_REQUIRED ) ); // output
    ERROR_CHECK_STATUS( vxFinalizeKernel( kernel ) );
    ERROR_CHECK_STATUS( vxReleaseKernel( &kernel ) );

    vxAddLogEntry( ( vx_reference ) context, VX_SUCCESS, "OK: registered user kernel app.userkernels.tensor_cos\n" );
    return VX_SUCCESS;
}
static vx_status VX_CALLBACK initializeSoftmaxLayer(vx_node node, const vx_reference *parameters, vx_uint32 num)
{
    SoftmaxLayerLocalData * data = new SoftmaxLayerLocalData;
    memset(data, 0, sizeof(*data));
    ERROR_CHECK_STATUS(createGraphHandle(node, &data->handle));

    //Parameters input and output.
    vx_size input_dims[4], output_dims[4];
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims)));

    ERROR_CHECK_MIOPEN_STATUS(miopenCreateTensorDescriptor(&data->input_desc));
    ERROR_CHECK_MIOPEN_STATUS(miopenCreateTensorDescriptor(&data->output_desc));
    ERROR_CHECK_MIOPEN_STATUS(miopenSet4dTensorDescriptor(data->input_desc, miopenFloat, input_dims[3], input_dims[2], input_dims[1], input_dims[0]));
    ERROR_CHECK_MIOPEN_STATUS(miopenSet4dTensorDescriptor(data->output_desc, miopenFloat, output_dims[3], output_dims[2], output_dims[1], output_dims[0]));

    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_OPENCL, &data->input_mem, sizeof(data->input_mem)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_OPENCL, &data->output_mem, sizeof(data->output_mem)));
    
    data->alpha = 1;
    data->beta = 0;

#if ENABLE_DEBUG_PRINT_DIMS
    std::cout << "softmax input " << input_dims[3] << " " << input_dims[2] << " " << input_dims[1] << " " << input_dims[0] << " ";
    std::cout << "output " << output_dims[3] << " " << output_dims[2] << " " << output_dims[1] << " " << output_dims[0] << std::endl;
#endif

    ERROR_CHECK_STATUS(vxSetNodeAttribute(node, VX_NODE_LOCAL_DATA_PTR, &data, sizeof(data)));
    return VX_SUCCESS;
}
//! \brief The kernel publisher.
vx_status publishImageToTensorConvert(vx_context context)
{
    vx_kernel kernel = vxAddUserKernel(context, "com.amd.nn_extension.convert_image_to_tensor", VX_KERNEL_CONVERT_IMAGE_TO_TENSOR_AMD, host_kernel, 5, validateImageToTensorKernel, nullptr, nullptr);
    ERROR_CHECK_OBJECT(kernel);

    amd_kernel_query_target_support_f query_target_support_f = query_target_support;
    amd_kernel_opencl_codegen_callback_f opencl_codegen_callback_f = opencl_codegen;
    ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_QUERY_TARGET_SUPPORT, &query_target_support_f, sizeof(query_target_support_f)));
    ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_CODEGEN_CALLBACK, &opencl_codegen_callback_f, sizeof(opencl_codegen_callback_f)));

    // set kernel parameters.
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED));
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 2, VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED));
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 3, VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED));
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 4, VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED));

    // finalize and release kernel object.
    ERROR_CHECK_STATUS(vxFinalizeKernel(kernel));
    ERROR_CHECK_STATUS(vxReleaseKernel(&kernel));

    return VX_SUCCESS;
}
static vx_status VX_CALLBACK initializeTensorAddition(vx_node node, const vx_reference *parameters, vx_uint32 num)
{
    TensorAddLocalData * data = new TensorAddLocalData;
    memset(data, 0, sizeof(*data));
    ERROR_CHECK_STATUS(createGraphHandle(node, &data->handle));

    //initialize input and output tensor descriptors.
    vx_size input1_dims[4], input2_dims[4], output_dims[4];
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input1_dims, sizeof(input1_dims)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, input2_dims, sizeof(input2_dims)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[3], VX_TENSOR_DIMS, output_dims, sizeof(output_dims)));
    ERROR_CHECK_MIOPEN_STATUS(miopenCreateTensorDescriptor(&data->input1));
    ERROR_CHECK_MIOPEN_STATUS(miopenCreateTensorDescriptor(&data->input2));
    ERROR_CHECK_MIOPEN_STATUS(miopenCreateTensorDescriptor(&data->output));
    ERROR_CHECK_MIOPEN_STATUS(miopenSet4dTensorDescriptor(data->input1, miopenFloat, input1_dims[3], input1_dims[2], input1_dims[1], input1_dims[0]));
    ERROR_CHECK_MIOPEN_STATUS(miopenSet4dTensorDescriptor(data->input2, miopenFloat, input2_dims[3], input2_dims[2], input2_dims[1], input2_dims[0]));
    ERROR_CHECK_MIOPEN_STATUS(miopenSet4dTensorDescriptor(data->output, miopenFloat, output_dims[3], output_dims[2], output_dims[1], output_dims[0]));

    //scaling parameters.
    data->alpha1 = 1;
    data->alpha2 = 1;
    data->beta = 0;
    data->operation = miopenTensorOpAdd;

    //input and output memory.
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_BUFFER_OPENCL, &data->input1_mem, sizeof(data->input1_mem)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_BUFFER_OPENCL, &data->input2_mem, sizeof(data->input2_mem)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[3], VX_TENSOR_BUFFER_OPENCL, &data->output_mem, sizeof(data->output_mem)));

#if ENABLE_DEBUG_PRINT_DIMS
    std::cout << "tensor_add input1 " << input1_dims[3] << " " << input1_dims[2] << " " << input1_dims[1] << " " << input1_dims[0] << " ";
    std::cout << "tensor_add input2 " << input2_dims[3] << " " << input2_dims[2] << " " << input2_dims[1] << " " << input2_dims[0] << " ";
    std::cout << "tensor_add output " << output_dims[3] << " " << output_dims[2] << " " << output_dims[1] << " " << output_dims[0] << std::endl;
#endif

    ERROR_CHECK_STATUS(vxSetNodeAttribute(node, VX_NODE_LOCAL_DATA_PTR, &data, sizeof(data)));
    return VX_SUCCESS;
}
vx_status publishROIPoolingLayer(vx_context context)
{
    // add kernel to the context with callbacks
    vx_kernel kernel = vxAddUserKernel(context, "org.khronos.nn_extension.roi_pooling_layer", VX_KERNEL_ROI_POOLING_LAYER, processROIPoolingLayer, 4, validateROIPoolingLayer, initializeROIPoolingLayer, uninitializeROIPoolingLayer);
    ERROR_CHECK_OBJECT(kernel);

    // set kernel parameters
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 2, VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED));
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 3, VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));

    // finalize and release kernel object
    ERROR_CHECK_STATUS(vxFinalizeKernel(kernel));
    ERROR_CHECK_STATUS(vxReleaseKernel(&kernel));

    return VX_SUCCESS;
}
vx_status publishTensorAdd(vx_context context)
{
    // add kernel to the context with callbacks
    vx_kernel kernel = vxAddUserKernel(context, "org.khronos.openvx.tensor_add", VX_KERNEL_TENSOR_ADD, processTensorAddition, 4, validateTensorAddition, initializeTensorAddition, uninitializeTensorAddition);
    ERROR_CHECK_OBJECT(kernel);

    // enable OpenCL buffer access since the kernel_f callback uses OpenCL buffers instead of host accessible buffers
    vx_bool enableBufferAccess = vx_true_e;
    ERROR_CHECK_STATUS(vxSetKernelAttribute(kernel, VX_KERNEL_ATTRIBUTE_AMD_OPENCL_BUFFER_ACCESS_ENABLE, &enableBufferAccess, sizeof(enableBufferAccess)));

    // set kernel parameters
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 0, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 1, VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 2, VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED));
    ERROR_CHECK_STATUS(vxAddParameterToKernel(kernel, 3, VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED));

    // finalize and release kernel object
    ERROR_CHECK_STATUS(vxFinalizeKernel(kernel));
    ERROR_CHECK_STATUS(vxReleaseKernel(&kernel));

    return VX_SUCCESS;
}
Ejemplo n.º 15
0
////////
// main() has all the OpenVX application code for this exercise.
// Command-line usage:
//   % exercise3 [<video-sequence>|<camera-device-number>]
// When neither video sequence nor camera device number is specified,
// it defaults to the video sequence in "PETS09-S1-L1-View001.avi".
int main( int argc, char * argv[] )
{
    // Get default video sequence when nothing is specified on command-line and
    // instantiate OpenCV GUI module for reading input RGB images and displaying
    // the image with OpenVX results
    const char * video_sequence = argv[1];
    CGuiModule gui( video_sequence );

    // Try grab first video frame from the sequence using cv::VideoCapture
    // and check if video frame is available
    if( !gui.Grab() )
    {
        printf( "ERROR: input has no video\n" );
        return 1;
    }

    ////////
    // Set the application configuration parameters. Note that input video
    // sequence is an 8-bit RGB image with dimensions given by gui.GetWidth()
    // and gui.GetHeight(). The parameters for the tensors are:
    //   tensor_dims                    - 3 dimensions of tensor [3 x <width> x <height>]
    //   tensor_input_fixed_point_pos   - fixed-point position for input tensor
    //   tensor_output_fixed_point_pos  - fixed-point position for output tensor
    vx_uint32  width                         = gui.GetWidth();
    vx_uint32  height                        = gui.GetHeight();
    vx_size    tensor_dims[3]                = { width, height, 3 }; // 3 channels (RGB)
    vx_uint8   tensor_input_fixed_point_pos  = 5; // Q10.5: input[-128..127] will be mapped to -4..3.96875
    vx_uint8   tensor_output_fixed_point_pos = 7; // Q8.7: output[-1..1] will be mapped to -128 to 128

    ////////
    // Create the OpenVX context and make sure returned context is valid and
    // register the log_callback to receive messages from OpenVX framework.
    vx_context context = vxCreateContext();
    ERROR_CHECK_OBJECT( context );
    vxRegisterLogCallback( context, log_callback, vx_false_e );

    ////////
    // Register user kernels with the context.
    //
    // TODO STEP 05:********
    //   1. Register user kernel with context by calling your implementation of "registerUserKernel()".
//    ERROR_CHECK_STATUS( registerUserKernel( context ) );

    ////////
    // Create OpenVX tensor objects for input and output
    //
    // TODO STEP 06:********
    //   1. Create tensor objects using tensor_dims, tensor_input_fixed_point_pos, and
    //      tensor_output_fixed_point_pos
//    vx_tensor input_tensor   = vxCreateTensor( context, 3, tensor_dims, VX_TYPE_INT16, tensor_input_fixed_point_pos );
//    vx_tensor output_tensor  = vxCreateTensor( context, /* Fill in parameters */ );
//    ERROR_CHECK_OBJECT( input_tensor );
//    ERROR_CHECK_OBJECT( output_tensor );

    ////////
    // Create, build, and verify the graph with user kernel node.
    //
    // TODO STEP 07:********
    //   1. Build a graph with just one node created using userTensorCosNode()
//    vx_graph graph = vxCreateGraph( context );
//    ERROR_CHECK_OBJECT( graph );
//    vx_node cos_node = userTensorCosNode( graph, /* Fill in parameters */ );
//    ERROR_CHECK_OBJECT( cos_node );
//    ERROR_CHECK_STATUS( vxReleaseNode( &cos_node ) );
//    ERROR_CHECK_STATUS( vxVerifyGraph( graph ) );

    ////////
    // Process the video sequence frame by frame until the end of sequence or aborted.
    cv::Mat bgrMatForOutputDisplay( height, width, CV_8UC3 );
    for( int frame_index = 0; !gui.AbortRequested(); frame_index++ )
    {
        ////////
        // Copy input RGB frame from OpenCV into input_tensor with UINT8 to Q10.5 (INT16) conversion.
        // In order to do this, vxMapTensorPatch API (see "vx_ext_amd.h").
        //
        // TODO STEP 08:********
        //   1. Use vxMapTensorPatch API for access to input tensor object for writing
        //   2. Copy UINT8 data from OpenCV RGB image to tensor object
        //   3. Use vxUnmapTensorPatch API to return control of buffer back to framework
        vx_uint8 * cv_rgb_image_buffer = gui.GetBuffer();
        vx_size rgb_stride             = gui.GetStride();
//        vx_size zeros[3]               = { 0 };
//        vx_size tensor_stride[3];
//        vx_map_id map_id;
//        vx_uint8 * buf;
//        ERROR_CHECK_STATUS( vxMapTensorPatch( input_tensor,
//                                              3, /* Fill in parameters */
//                                              &map_id, tensor_stride,
//                                              (void **)&buf, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST, 0 ) );
//        for( vx_size c = 0; c < 3; c++ )
//        {
//            for( vx_size y = 0; y < height; y++ )
//            {
//                const vx_uint8 * img = cv_rgb_image_buffer + y * rgb_stride + c;
//                vx_int16 * inp = (vx_int16 *)(buf + y * tensor_stride[1] + c * tensor_stride[2]);
//                for( vx_size x = 0; x < width; x++ )
//                {
//                    // convert 0..255 to Q10.5 [-4..3.96875 range] fixed-point format
//                    inp[x] = (vx_int16)img[x * 3] - 128;
//                }
//            }
//        }
//        ERROR_CHECK_STATUS( vxUnmapTensorPatch( input_tensor, map_id ) );


        ////////
        // Now that input tensor is ready, just run the graph.
        //
        // TODO STEP 09:********
        //   1. Call vxProcessGraph to execute the tensor_cos kernel in graph
//        ERROR_CHECK_STATUS( vxProcessGraph( graph ) );

        ////////
        // Display the output tensor object as RGB image
        //
        // TODO STEP 10:********
        //   1. Use vxMapTensorPatch API for access to output tensor object for reading
        //   2. Copy tensor object data into OpenCV RGB image
        //   3. Use vxUnmapTensorPatch API to return control of buffer back to framework
//        ERROR_CHECK_STATUS( vxMapTensorPatch( output_tensor,
//                                              3, zeros, tensor_dims,
//                                              &map_id, tensor_stride,
//                                              (void **)&buf, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST, 0 ) );
//        vx_uint8 * cv_bgr_image_buffer = bgrMatForOutputDisplay.data;
//        vx_size bgr_stride             = bgrMatForOutputDisplay.step;
//        for( vx_size c = 0; c < 3; c++ )
//        {
//            for( vx_size y = 0; y < height; y++ )
//            {
//                const vx_int16 * out = (const vx_int16 *)(buf + y * tensor_stride[1] + c * tensor_stride[2]);
//                vx_uint8 * img = cv_bgr_image_buffer + y * bgr_stride + (2 - c); // (2 - c) for RGB to BGR conversion
//                for( vx_size x = 0; x < width; x++ )
//                {
//                    // scale convert Q8.7 [-1..1 range] fixed-point format to 0..255 with saturation
//                    vx_int16 value = out[x] + 128;
//                    value = value > 255 ? 255 : value; // saturation needed
//                    img[x * 3] = (vx_uint8)value;
//                }
//            }
//        }
//#if ENABLE_DISPLAY
//        cv::imshow( "Cosine", bgrMatForOutputDisplay );
//#endif
//        ERROR_CHECK_STATUS( vxUnmapTensorPatch( output_tensor, map_id ) );

        ////////
        // Display the results and grab the next input RGB frame for the next iteration.
        char text[128];
        sprintf( text, "Keyboard ESC/Q-Quit SPACE-Pause [FRAME %d] [fixed_point_pos input:%d output:%d]", frame_index, tensor_input_fixed_point_pos, tensor_output_fixed_point_pos );
        gui.DrawText( 0, 16, text );
        gui.Show();
        if( !gui.Grab() )
        {
            // Terminate the processing loop if the end of sequence is detected.
            gui.WaitForKey();
            break;
        }
    }

    ////////********
    // To release an OpenVX object, you need to call vxRelease<Object> API which takes a pointer to the object.
    // If the release operation is successful, the OpenVX framework will reset the object to NULL.
    //
    // TODO STEP 11:****
    //   1. Release graph and tensor objects
//    ERROR_CHECK_STATUS( vxReleaseGraph( &graph ) );
//    ERROR_CHECK_STATUS( vxReleaseTensor( &input_tensor ) );
//    ERROR_CHECK_STATUS( vxReleaseTensor( &output_tensor ) );
    ERROR_CHECK_STATUS( vxReleaseContext( &context ) );

    return 0;
}
Ejemplo n.º 16
0
////////
// main() has all the OpenVX application code for this exercise.
// Command-line usage:
//   % solution_exercise2 [<video-sequence>|<camera-device-number>]
// When neither video sequence nor camera device number is specified,
// it defaults to the video sequence in "PETS09-S1-L1-View001.avi".
int main( int argc, char * argv[] )
{
    // Get default video sequence when nothing is specified on command-line and
    // instantiate OpenCV GUI module for reading input RGB images and displaying
    // the image with OpenVX results.
    const char * video_sequence = argv[1];
    CGuiModule gui( video_sequence );

    // Try to grab the first video frame from the sequence using cv::VideoCapture
    // and check if a video frame is available.
    if( !gui.Grab() )
    {
        printf( "ERROR: input has no video\n" );
        return 1;
    }

    ////////
    // Set the application configuration parameters. Note that input video
    // sequence is an 8-bit RGB image with dimensions given by gui.GetWidth()
    // and gui.GetHeight(). The parameters for the Harris corners algorithm are:
    //   max_keypoint_count      - maximum number of keypoints to track
    //   harris_strength_thresh  - minimum threshold score to keep a corner
    //                             (computed using the normalized Sobel kernel)
    //   harris_min_distance     - radial L2 distance for non-max suppression
    //   harris_k_sensitivity    - sensitivity threshold k from the Harris-Stephens
    //   harris_gradient_size    - window size for gradient computation
    //   harris_block_size       - block window size used to compute the
    //                             Harris corner score
    //   lk_pyramid_levels       - number of pyramid levels for LK optical flow
    //   lk_termination          - can be VX_TERM_CRITERIA_ITERATIONS or
    //                               VX_TERM_CRITERIA_EPSILON or
    //                               VX_TERM_CRITERIA_BOTH
    //   lk_epsilon              - error for terminating the algorithm
    //   lk_num_iterations       - number of iterations
    //   lk_use_initial_estimate - turn on/off use of initial estimates
    //   lk_window_dimension     - size of window on which to perform the algorithm
    vx_uint32  width                   = gui.GetWidth();
    vx_uint32  height                  = gui.GetHeight();
    vx_size    max_keypoint_count      = 10000;
    vx_float32 harris_strength_thresh  = 0.0005f;
    vx_float32 harris_min_distance     = 5.0f;
    vx_float32 harris_k_sensitivity    = 0.04f;
    vx_int32   harris_gradient_size    = 3;
    vx_int32   harris_block_size       = 3;
    vx_uint32  lk_pyramid_levels       = 6;
    vx_float32 lk_pyramid_scale        = VX_SCALE_PYRAMID_HALF;
    vx_enum    lk_termination          = VX_TERM_CRITERIA_BOTH;
    vx_float32 lk_epsilon              = 0.01f;
    vx_uint32  lk_num_iterations       = 5;
    vx_bool    lk_use_initial_estimate = vx_false_e;
    vx_uint32  lk_window_dimension     = 6;

    ////////
    // Create the OpenVX context and make sure the returned context is valid and
    // register the log_callback to receive messages from OpenVX framework.
    vx_context context = vxCreateContext();
    ERROR_CHECK_OBJECT( context );
    vxRegisterLogCallback( context, log_callback, vx_false_e );

    ////////
    // Create OpenVX image object for input RGB image.
    vx_image input_rgb_image = vxCreateImage( context, width, height, VX_DF_IMAGE_RGB );
    ERROR_CHECK_OBJECT( input_rgb_image );

    ////////********
    // OpenVX optical flow functionality requires pyramids of the current input
    // image and the previous image. It also requires keypoints that correspond
    // to the previous pyramid and will output updated keypoints into
    // another keypoint array. To be able to toggle between the current and
    // the previous buffers, you need to use OpenVX delay objects and vxAgeDelay().
    // Create OpenVX pyramid and array object exemplars and create OpenVX delay
    // objects for both to hold two of each. Note that the exemplar objects are not
    // needed once the delay objects are created.
    //
    // TODO STEP 01:********
    //   1. Use vxCreatePyramid API to create a pyramid exemplar with the
    //      same dimensions as the input image, VX_DF_IMAGE_U8 as image format,
    //      lk_pyramid_levels as levels, and lk_pyramid_scale as scale.
    //      We gave code for this in comments.
    //   2. Use vxCreateArray API to create an array exemplar with
    //      keypoint data type with num_keypoint_count as capacity.
    //      You need to add missing parameters to code in comments.
    //   3. Use vxCreateDelay API to create delay objects for pyramid and
    //      keypoint array using the exemplars created using the two steps above.
    //      Use 2 delay slots for both of the delay objects.
    //      We gave code for one in comments; do similar for the other.
    //   4. Release the pyramid and keypoint array exemplar objects.
    //      We gave code for one in comments; do similar for the other.
    //   5. Use ERROR_CHECK_OBJECT/STATUS macros for proper error checking.
    //      We gave few error checks; do similar for the others.
//    vx_pyramid pyramidExemplar = vxCreatePyramid( context, lk_pyramid_levels,
//                                                  lk_pyramid_scale, width, height, VX_DF_IMAGE_U8 );
//    ERROR_CHECK_OBJECT( pyramidExemplar );
//    vx_delay pyramidDelay   = vxCreateDelay( context, ( vx_reference )pyramidExemplar, 2 );
//    ERROR_CHECK_OBJECT( pyramidDelay );
//    ERROR_CHECK_STATUS( vxReleasePyramid( &pyramidExemplar ) );
//    vx_array keypointsExemplar = vxCreateArray( /* Fill in parameters */ );
//    vx_delay keypointsDelay = vxCreateDelay( /* Fill in parameters */ );


    ////////********
    // An object from a delay slot can be accessed using vxGetReferenceFromDelay API.
    // You need to use index = 0 for the current object and index = -1 for the previous object.
    //
    // TODO STEP 02:********
    //   1. Use vxGetReferenceFromDelay API to get the current and previous
    //      pyramid objects from pyramid delay object. Note that you need
    //      to typecast the vx_reference object to vx_pyramid.
    //      We gave code for one in comments; do similar for the other.
    //   2. Similarly, get the current and previous keypoint array objects from
    //      the keypoint delay object.
    //      We gave code for one in comments; do similar for the other.
    //   3. Use ERROR_CHECK_OBJECT for proper error checking.
    //      We gave one error check; do similar for the others.
//    vx_pyramid currentPyramid  = ( vx_pyramid ) vxGetReferenceFromDelay( pyramidDelay, 0 );
//    vx_pyramid previousPyramid = ( vx_pyramid ) vxGetReferenceFromDelay( /* Fill in parameters */ );
//    vx_array currentKeypoints  = ( vx_array )   vxGetReferenceFromDelay( /* Fill in parameters */ );
//    vx_array previousKeypoints = ( vx_array )   vxGetReferenceFromDelay( keypointsDelay, -1 );
//    ERROR_CHECK_OBJECT( currentPyramid );


    ////////********
    // Harris and optical flow algorithms require their own graph objects.
    // The Harris graph needs to extract gray scale image out of input RGB,
    // compute an initial set of keypoints, and compute an initial pyramid for use
    // by the optical flow graph.
    //
    // TODO STEP 03:********
    //   1. Create two graph objects: one for the Harris corner detector and
    //      the other for feature tracking using optical flow using the
    //      vxCreateGraph API.
    //      We gave code for one graph; do similar for the other.
    //   2. Use ERROR_CHECK_OBJECT to check the objects.
    //      We gave one error check; do similar for the other.
//    vx_graph graphHarris = vxCreateGraph( context );
//    vx_graph graphTrack  = /* Fill in here */;
//    ERROR_CHECK_OBJECT( graphHarris );


    ////////********
    // Harris and pyramid computation expect input to be an 8-bit image.
    // Given that input is an RGB image, it is best to extract a gray image
    // from RGB image, which requires two steps:
    //   - perform RGB to IYUV color conversion
    //   - extract Y channel from IYUV image
    // This requires two intermediate OpenVX image objects. Since you don't
    // need to access these objects from the application, they can be virtual
    // objects that can be created using the vxCreateVirtualImage API.
    //
    // TODO STEP 04:********
    //   1. Create an IYUV image and a U8 image (for Y channel) with the same
    //      dimensions as the input RGB image. Note that the image formats for
    //      IYUV and U8 images are VX_DF_IMAGE_IYUV and VX_DF_IMAGE_U8.
    //      Note that virtual objects are specific to a graph, so you
    //      need to create two sets, one for each graph.
    //      We gave one fully in comments and you need to fill in missing
    //      parameters for the others.
    //   2. Use ERROR_CHECK_OBJECT to check the objects.
    //      We gave one error check in comments; do similar for others.
//    vx_image harris_yuv_image       = vxCreateVirtualImage( graphHarris, width, height, VX_DF_IMAGE_IYUV );
//    vx_image harris_luma_image      = vxCreateVirtualImage( graphHarris, /* Fill in parameters */ );
//    vx_image opticalflow_yuv_image  = vxCreateVirtualImage( graphTrack,  /* Fill in parameters */ );
//    vx_image opticalflow_luma_image = vxCreateVirtualImage( /* Fill in parameters */ );
//    ERROR_CHECK_OBJECT( harris_yuv_image );


    ////////********
    // The Harris corner detector and optical flow nodes (see "VX/vx_nodes.h")
    // take strength_thresh, min_distance, sensitivity, epsilon,
    // num_iterations, and use_initial_estimate parameters as scalar
    // data objects. So, you need to create scalar objects with the corresponding
    // configuration parameters.
    //
    // TODO STEP 05:********
    //   1. Create scalar data objects of VX_TYPE_FLOAT32 for strength_thresh,
    //      min_distance, sensitivity, and epsilon. Set their
    //      initial values to harris_strength_thresh, harris_min_distance,
    //      harris_k_sensitivity, and lk_epsilon.
    //      We gave code full code for one scalar in comments; fill in
    //      missing arguments for other ones.
    //   2. Similarly, create scalar objects for num_iterations and
    //      use_initial_estimate with initial values: lk_num_iterations and
    //      lk_use_initial_estimate. Make sure to use proper data types for
    //      these parameters.
    //      We gave code full code for one scalar in comments; fill in
    //      missing arguments for the other.
    //   3. Use ERROR_CHECK_OBJECT to check proper creation of objects.
    //      We gave the error check for one scalar; do similar for other 5 scalars.
//    vx_scalar strength_thresh      = NULL; // vxCreateScalar( context, VX_TYPE_FLOAT32, &harris_strength_thresh );
//    vx_scalar min_distance         = NULL; // vxCreateScalar( context, /* Fill in parameters */ );
//    vx_scalar sensitivity          = NULL; // vxCreateScalar( /* Fill in parameters */ );
//    vx_scalar epsilon              = NULL; // vxCreateScalar( /* Fill in parameters */ );
//    vx_scalar num_iterations       = NULL; // vxCreateScalar( context, VX_TYPE_UINT32,  /* Fill in parameter */ );
//    vx_scalar use_initial_estimate = NULL; // vxCreateScalar( context, VX_TYPE_BOOL,    &lk_use_initial_estimate );
//    ERROR_CHECK_OBJECT( strength_thresh );


    ////////********
    // Now all the objects have been created for building the graphs.
    // First, build a graph that performs Harris corner detection and initial pyramid computation.
    // See "VX/vx_nodes.h" for APIs how to add nodes into a graph.
    //
    // TODO STEP 06:********
    //   1. Use vxColorConvertNode and vxChannelExtractNode APIs to get gray
    //      scale image for Harris and Pyramid computation from the input
    //      RGB image. Add these nodes into Harris graph.
    //      We gave code in comments with a missing parameter for you to fill in.
    //   2. Use vxGaussianPyramidNode API to add pyramid computation node.
    //      You need to use the current pyramid from the pyramid delay object.
    //      We gave code in comments with a missing parameter for you to fill in.
    //   3. Use vxHarrisCornersNode API to add a Harris corners node.
    //      You need to use the current keypoints from keypoints delay object.
    //      We gave code in comments with few missing parameters for you to fill in.
    //   4. Use ERROR_CHECK_OBJECT to check proper creation of objects.
    //   5. Release node and virtual objects immediately since the graph
    //      retains references to them.
    //   6. Call vxVerifyGraph to check for any errors in the graph.
    //      Fill in missing parameter in commented code.
//    vx_node nodesHarris[] =
//    {
//        vxColorConvertNode( graphHarris, input_rgb_image, harris_yuv_image ),
//        vxChannelExtractNode( graphHarris, /* Fill in parameter */, VX_CHANNEL_Y, harris_luma_image ),
//        vxGaussianPyramidNode( graphHarris, /* Fill in parameter */, currentPyramid ),
//        vxHarrisCornersNode( graphHarris, /* Fill in missing parameters */, currentKeypoints, NULL )
//    };
//    for( vx_size i = 0; i < sizeof( nodesHarris ) / sizeof( nodesHarris[0] ); i++ )
//    {
//        ERROR_CHECK_OBJECT( nodesHarris[i] );
//        ERROR_CHECK_STATUS( vxReleaseNode( &nodesHarris[i] ) );
//    }
//    ERROR_CHECK_STATUS( vxReleaseImage( &harris_yuv_image ) );
//    ERROR_CHECK_STATUS( vxReleaseImage( &harris_luma_image ) );
//    ERROR_CHECK_STATUS( vxVerifyGraph( /* Fill in parameter */ ) );


    ////////********
    // Now, build a graph that performs pyramid computation and feature
    // tracking using optical flow.
    //
    // TODO STEP 07:********
    //   1. Use vxColorConvertNode and vxChannelExtractNode APIs to get a gray
    //      scale image for Harris and Pyramid computation from the input
    //      RGB image. Add these nodes into Harris graph.
    //      We gave the code in comments for color convert node; do similar
    //      one for the channel extract node.
    //   2. Use vxGaussianPyramidNode API to add pyramid computation node.
    //      You need to use the current pyramid from the pyramid delay object.
    //      Most of the code is given in the comments; fill in the missing parameter.
    //   3. Use vxOpticalFlowPyrLKNode API to add an optical flow node. You need to
    //      use the current and previous keypoints from the keypoints delay object.
    //      Fill in the missing parameters in commented code.
    //   4. Use ERROR_CHECK_OBJECT to check proper creation of objects.
    //   5. Release node and virtual objects immediately since the graph
    //      retains references to them.
    //   6. Call vxVerifyGraph to check for any errors in the graph.
    //      Fill in the missing parameter in commented code.
//    vx_node nodesTrack[] =
//    {
//        vxColorConvertNode( graphTrack, input_rgb_image, opticalflow_yuv_image ),
//        vxChannelExtractNode( graphTrack, /* Fill in parameters */ ),
//        vxGaussianPyramidNode( graphTrack, /* Fill in parameter */, currentPyramid ),
//        vxOpticalFlowPyrLKNode( graphTrack, /* Fill in parameters */ )
//    };
//    for( vx_size i = 0; i < sizeof( nodesTrack ) / sizeof( nodesTrack[0] ); i++ )
//    {
//        ERROR_CHECK_OBJECT( nodesTrack[i] );
//        ERROR_CHECK_STATUS( vxReleaseNode( &nodesTrack[i] ) );
//    }
//    ERROR_CHECK_STATUS( vxReleaseImage( &opticalflow_yuv_image ) );
//    ERROR_CHECK_STATUS( vxReleaseImage( &opticalflow_luma_image ) );
//    ERROR_CHECK_STATUS( vxVerifyGraph( /* Fill in parameter */ ) );


    ////////
    // Process the video sequence frame by frame until the end of sequence or aborted.
    for( int frame_index = 0; !gui.AbortRequested(); frame_index++ )
    {
        ////////
        // Copy the input RGB frame from OpenCV to OpenVX.
        // In order to do this, you need to use vxAccessImagePatch and vxCommitImagePatch APIs.
        // See "VX/vx_api.h" for the description of these APIs.
        vx_rectangle_t cv_rgb_image_region;
        cv_rgb_image_region.start_x    = 0;
        cv_rgb_image_region.start_y    = 0;
        cv_rgb_image_region.end_x      = width;
        cv_rgb_image_region.end_y      = height;
        vx_imagepatch_addressing_t cv_rgb_image_layout;
        cv_rgb_image_layout.stride_x   = 3;
        cv_rgb_image_layout.stride_y   = gui.GetStride();
        vx_uint8 * cv_rgb_image_buffer = gui.GetBuffer();
        ERROR_CHECK_STATUS( vxAccessImagePatch( input_rgb_image, &cv_rgb_image_region, 0,
                                                &cv_rgb_image_layout, ( void ** )&cv_rgb_image_buffer, VX_WRITE_ONLY ) );
        ERROR_CHECK_STATUS( vxCommitImagePatch( input_rgb_image, &cv_rgb_image_region, 0,
                                                &cv_rgb_image_layout, cv_rgb_image_buffer ) );

        ////////********
        // Now that input RGB image is ready, just run a graph.
        // Run Harris at the beginning to initialize the previous keypoints.
        //
        // TODO STEP 08:********
        //   1. Run a graph using vxProcessGraph API. Select Harris graph
        //      if the frame_index == 0 (i.e., the first frame of the video
        //      sequence), otherwise, select the feature tracking graph.
        //   2. Use ERROR_CHECK_STATUS for error checking.



        ////////********
        // To mark the keypoints in display, you need to access the output
        // keypoint array and draw each item on the output window using gui.DrawArrow().
        //
        // TODO STEP 09:********
        //   1. Use vxGetReferenceFromDelay API to get the current and previous
        //      keypoints array objects from the keypoints delay object.
        //      Make sure to typecast the vx_reference object to vx_array.
        //      We gave one for the previous previous keypoint array in comments;
        //      do a similar one for the current keypoint array.
        //   2. OpenVX array object has an attribute that keeps the current
        //      number of items in the array. The name of the attribute is
        //      VX_ARRAY_ATTRIBUTE_NUMITEMS and its value is of type vx_size.
        //      Use vxQueryArray API to get number of keypoints in the
        //      current keypoint array data object, representing number of
        //      corners detected in the input RGB image.
        //      IMPORTANT: Read number of items into "num_corners"
        //      because this variable is displayed by code segment below.
        //      We gave most part of this statement in comment; just fill in the
        //      missing parameter.
        //   3. The data items in output keypoint array are of type
        //      vx_keypoint_t (see "VX/vx_types.h"). To access the array
        //      buffer, use vxAccessArrayRange with start index = 0,
        //      end index = number of items in the array, and usage mode =
        //      VX_READ_ONLY. Note that the stride returned by this access
        //      call is not guaranteed to be sizeof(vx_keypoint_t).
        //      Also make sure that num_corners is > 0, because
        //      vxAccessArrayRange expects end index > 0.
        //      We gave the code for previous keypoint array in comment;
        //      do similar one for the current keypoint array.
        //   4. For each item in the keypoint buffer, use vxArrayItem to
        //      access an individual keypoint and draw a marker at (x,y)
        //      using gui.DrawArrow() if tracking_status field of keypoint
        //      is non-zero. Also count number of keypoints with
        //      non-zero tracking_status into "num_tracking" variable.
        //      We gave most of the code; fill in the missing parameters and uncomment.
        //   5. Hand the control of output keypoint buffer over back to
        //      OpenVX framework by calling vxCommitArrayRange API.
        //      We gave the code for previous keypoint array in comment;
        //      do similar one for the current keypoint array.
        //   6. Use ERROR_CHECK_STATUS for error checking.
        vx_size num_corners = 0, num_tracking = 0;
//        previousKeypoints = ( vx_array )vxGetReferenceFromDelay( keypointsDelay, -1 );
//        currentKeypoints  = ( vx_array )vxGetReferenceFromDelay( /* Fill in parameters */ );
//        ERROR_CHECK_OBJECT( currentKeypoints );
//        ERROR_CHECK_OBJECT( previousKeypoints );
//        ERROR_CHECK_STATUS( vxQueryArray( previousKeypoints, /* Fill in parameter */, &num_corners, sizeof( num_corners ) ) );
        if( num_corners > 0 )
        {
            vx_size kp_old_stride, kp_new_stride;
            vx_keypoint_t * kp_old_buf = NULL, * kp_new_buf = NULL;
//            ERROR_CHECK_STATUS( vxAccessArrayRange( previousKeypoints, 0, num_corners,
//                                                    &kp_old_stride, ( void ** ) &kp_old_buf, VX_READ_ONLY ) );
//            ERROR_CHECK_STATUS( vxAccessArrayRange( /* Fill in parameters */ );
            for( vx_size i = 0; i < num_corners; i++ )
            {
//                vx_keypoint_t * kp_old = &vxArrayItem( vx_keypoint_t, kp_old_buf, i, kp_old_stride );
//                vx_keypoint_t * kp_new = &vxArrayItem( /* Fill in parameters */ );
//                if( kp_new->tracking_status )
//                {
//                    num_tracking++;
//                    gui.DrawArrow( kp_old->x, kp_old->y, kp_new->x, kp_new->y );
//                }
            }
//            ERROR_CHECK_STATUS( vxCommitArrayRange( previousKeypoints, 0, num_corners, kp_old_buf ) );
//            ERROR_CHECK_STATUS( vxCommitArrayRange( /* Fill in parameters */ ) );
        }


        ////////********
        // Flip the current and previous pyramid and keypoints in the delay objects.
        //
        // TODO STEP 10:********
        //   1. Use vxAgeDelay API to flip the current and previous buffers in delay objects.
        //      You need to call vxAgeDelay for both two delay objects.
        //   2. Use ERROR_CHECK_STATUS for error checking.
//        ERROR_CHECK_STATUS( vxAgeDelay( /* Fill in parameter */ ) );
//        ERROR_CHECK_STATUS( vxAgeDelay( /* Fill in parameter */ ) );


        ////////
        // Display the results and grab the next input RGB frame for the next iteration.
        char text[128];
        sprintf( text, "Keyboard ESC/Q-Quit SPACE-Pause [FRAME %d]", frame_index );
        gui.DrawText( 0, 16, text );
        sprintf( text, "Number of Corners: %d [tracking %d]", ( int )num_corners, ( int )num_tracking );
        gui.DrawText( 0, 36, text );
        gui.Show();
        if( !gui.Grab() )
        {
            // Terminate the processing loop if the end of sequence is detected.
            gui.WaitForKey();
            break;
        }
    }

    ////////********
    // Query graph performance using VX_GRAPH_ATTRIBUTE_PERFORMANCE and print timing
    // in milliseconds. Note that time units of vx_perf_t fields are nanoseconds.
    //
    // TODO STEP 11:********
    //   1. Use vxQueryGraph API with VX_GRAPH_ATTRIBUTE_PERFORMANCE to query graph performance.
    //      We gave the attribute query for one graph in comments. Do the same for the second graph.
    //   2. Print the average and min execution times in milliseconds. Use the printf in comments.
//    vx_perf_t perfHarris = { 0 }, perfTrack = { 0 };
//    ERROR_CHECK_STATUS( vxQueryGraph( graphHarris, VX_GRAPH_ATTRIBUTE_PERFORMANCE, &perfHarris, sizeof( perfHarris ) ) );
//    ERROR_CHECK_STATUS( vxQueryGraph( /* Fill in parameters here for get performance of the other graph */ );
//    printf( "GraphName NumFrames Avg(ms) Min(ms)\n"
//            "Harris    %9d %7.3f %7.3f\n"
//            "Track     %9d %7.3f %7.3f\n",
//            ( int )perfHarris.num, ( float )perfHarris.avg * 1e-6f, ( float )perfHarris.min * 1e-6f,
//            ( int )perfTrack.num,  ( float )perfTrack.avg  * 1e-6f, ( float )perfTrack.min  * 1e-6f );


    ////////********
    // Release all the OpenVX objects created in this exercise, and make the context as the last one to release.
    // To release an OpenVX object, you need to call vxRelease<Object> API which takes a pointer to the object.
    // If the release operation is successful, the OpenVX framework will reset the object to NULL.
    //
    // TODO STEP 12:********
    //   1. For releasing all other objects use vxRelease<Object> APIs.
    //      You have to release 2 graph objects, 1 image object, 2 delay objects,
    //      6 scalar objects, and 1 context object.
    //   2. Use ERROR_CHECK_STATUS for error checking.
//    ERROR_CHECK_STATUS( vxReleaseContext( &context ) );


    return 0;
}
static vx_status VX_CALLBACK validateKernel(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[])
{
    // check input configuration
    vx_enum type, format;
    vx_size num_dims, input_dims[4] = { 1, 1, 1, 1 };
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims[0])*num_dims));
    if ((num_dims != 4 && num_dims != 3) || ((input_dims[0] & 3) != 0))
        return VX_ERROR_INVALID_DIMENSION;
    if (type != VX_TYPE_FLOAT32)
        return VX_ERROR_INVALID_TYPE;

    // check output object type and set configuration
    ERROR_CHECK_STATUS(vxQueryReference(parameters[1], VX_REFERENCE_TYPE, &type, sizeof(type)));
    if (type == VX_TYPE_IMAGE) {
        vx_df_image format;
        ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[1], VX_IMAGE_FORMAT, &format, sizeof(format)));
        if(format == VX_DF_IMAGE_U8 && input_dims[2] > 255)
            return VX_ERROR_INVALID_FORMAT;
        if(format == VX_DF_IMAGE_VIRT)
            format = (input_dims[2] < 256) ? VX_DF_IMAGE_U8 : VX_DF_IMAGE_U16;
        vx_uint32 width = (vx_uint32)input_dims[0];
        vx_uint32 height = (vx_uint32)(input_dims[1]*input_dims[3]);
        ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_WIDTH, &width, sizeof(width)));
        ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_HEIGHT, &height, sizeof(height)));
        ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_FORMAT, &format, sizeof(format)));
    }
    else if (type == VX_TYPE_TENSOR) {
        vx_size output_num_dims, output_dims[4] = { 1, 1, 1, 1 };
        ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
        ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &output_num_dims, sizeof(output_num_dims)));
        ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims[0])*output_num_dims));
        if (output_dims[2] != 1 && output_dims[2] != 2) // top_k must be 1 or 2
            return VX_ERROR_INVALID_DIMENSION;
        if(type == VX_TYPE_UINT8 && input_dims[2] > 255)
            return VX_ERROR_INVALID_FORMAT;
        if(type != VX_TYPE_UINT8 && type != VX_TYPE_UINT16 && type != VX_TYPE_INT16)
            return VX_ERROR_INVALID_FORMAT;
        ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
        ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_NUMBER_OF_DIMS, &output_num_dims, sizeof(output_num_dims)));
        ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims[0])*output_num_dims));
    }
    else
        return VX_ERROR_INVALID_PARAMETERS;

    return VX_SUCCESS;
}
//! \brief The OpenCL code generator callback.
static vx_status VX_CALLBACK opencl_codegen(
    vx_node node,                                  // [input] node
    const vx_reference parameters[],               // [input] parameters
    vx_uint32 num,                                 // [input] number of parameters
    bool opencl_load_function,                     // [input]  false: normal OpenCL kernel; true: reserved
    char opencl_kernel_function_name[64],          // [output] kernel_name for clCreateKernel()
    std::string& opencl_kernel_code,               // [output] string for clCreateProgramWithSource()
    std::string& opencl_build_options,             // [output] options for clBuildProgram()
    vx_uint32& opencl_work_dim,                    // [output] work_dim for clEnqueueNDRangeKernel()
    vx_size opencl_global_work[],                  // [output] global_work[] for clEnqueueNDRangeKernel()
    vx_size opencl_local_work[],                   // [output] local_work[] for clEnqueueNDRangeKernel()
    vx_uint32& opencl_local_buffer_usage_mask,     // [output] reserved: must be ZERO
    vx_uint32& opencl_local_buffer_size_in_bytes   // [output] reserved: must be ZERO
    )
{
    // get configuration
    vx_size num_dims, input_dims[4] = { 1, 1, 1, 1 }, top_k = 1;
    vx_enum output_obj_type, output_data_type = VX_TYPE_UINT16;
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims[0])*num_dims));
    ERROR_CHECK_STATUS(vxQueryReference(parameters[1], VX_REFERENCE_TYPE, &output_obj_type, sizeof(output_obj_type)));
    if(output_obj_type == VX_TYPE_IMAGE) {
        vx_df_image format;
        ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[1], VX_IMAGE_FORMAT, &format, sizeof(format)));
        if(format == VX_DF_IMAGE_U8)
            output_data_type = VX_TYPE_UINT8;
        else if(format == VX_DF_IMAGE_U16)
            output_data_type = VX_TYPE_UINT16;
    }
    else {
        vx_size num_dims_output, output_dims[4] = { 1, 1, 1, 1 };
        ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims_output, sizeof(num_dims_output)));
        ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims[0])*num_dims_output));
        ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &output_data_type, sizeof(output_data_type)));
        top_k = output_dims[2];
    }
    size_t N = input_dims[3];

    // compute global work
    opencl_work_dim = 3;
    opencl_local_work[0] = 8;
    opencl_local_work[1] = 8;
    opencl_local_work[2] = 1;
    opencl_global_work[0] = ((input_dims[0] + 3) / 4  + opencl_local_work[0] - 1) & ~(opencl_local_work[0] - 1);
    opencl_global_work[1] = (input_dims[1] + opencl_local_work[1] - 1) & ~(opencl_local_work[1] - 1);
    opencl_global_work[2] = N;

    // generate OpenCL C code
    strcpy(opencl_kernel_function_name, "argmax");
    char item[8192];
    sprintf(item,
        "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n"
        "__kernel __attribute__((reqd_work_group_size(%ld, %ld, 1)))\n" // opencl_local_work[0] opencl_local_work[1]
        "void %s(__global uchar * i0_buf, uint i0_offset, uint4 i0_stride, %s)\n"
        "{\n"
        "    uint x = get_global_id(0) * 4;\n"
        "    uint y = get_global_id(1);\n"
        "    uint z = get_global_id(2);\n"
        "    if(x < %ld && y < %ld) {\n"
        "        i0_buf += i0_offset + z * i0_stride.s3 + y * i0_stride.s1 + x * i0_stride.s0;\n"
        "        uint4 cmax;\n"
        , opencl_local_work[0], opencl_local_work[1], opencl_kernel_function_name
        , (output_obj_type == VX_TYPE_IMAGE) ?
            "uint o0_width, uint o0_height, __global uchar * o0_buf, uint o0_stride, uint o0_offset" :
            "__global uchar * o0_buf, uint o0_offset, uint4 o0_stride"
        , input_dims[0], input_dims[1]);
    opencl_kernel_code = item;
    if(top_k == 2) {
        sprintf(item,
            "        uint4 cmax1;\n"
            "        float4 f, fmax, fmax1;\n"
            "        fmax = *(__global float4 *)i0_buf;\n"
            "        i0_buf += i0_stride.s2; f = *(__global float4 *)i0_buf;\n"
            "        cmax1.s0 = (f.s0 > fmax.s0) ? 0 : 1;\n"
            "         cmax.s0 = (f.s0 > fmax.s0) ? 1 : 0;\n"
            "        fmax1.s0 = (f.s0 > fmax.s0) ? fmax.s0 :    f.s0;\n"
            "         fmax.s0 = (f.s0 > fmax.s0) ?    f.s0 : fmax.s0;\n"
            "        cmax1.s1 = (f.s1 > fmax.s1) ? 0 : 1;\n"
            "         cmax.s1 = (f.s1 > fmax.s1) ? 1 : 0;\n"
            "        fmax1.s1 = (f.s1 > fmax.s1) ? fmax.s1 :    f.s1;\n"
            "         fmax.s1 = (f.s1 > fmax.s1) ?    f.s1 : fmax.s1;\n"
            "        cmax1.s2 = (f.s2 > fmax.s2) ? 0 : 1;\n"
            "         cmax.s2 = (f.s2 > fmax.s2) ? 1 : 0;\n"
            "        fmax1.s2 = (f.s2 > fmax.s2) ? fmax.s2 :    f.s2;\n"
            "         fmax.s2 = (f.s2 > fmax.s2) ?    f.s2 : fmax.s2;\n"
            "        cmax1.s3 = (f.s3 > fmax.s3) ? 0 : 1;\n"
            "         cmax.s3 = (f.s3 > fmax.s3) ? 1 : 0;\n"
            "        fmax1.s3 = (f.s3 > fmax.s3) ? fmax.s3 :    f.s3;\n"
            "         fmax.s3 = (f.s3 > fmax.s3) ?    f.s3 : fmax.s3;\n"
            "        for(uint c = 2; c < %ld; c++) {\n"
            "            i0_buf += i0_stride.s2; f = *(__global float4 *)i0_buf;\n"
            "            cmax1.s0 = (f.s0 > fmax.s0) ? cmax.s0 : ((f.s0 > fmax1.s0) ? c    : cmax1.s0);\n"
            "            fmax1.s0 = (f.s0 > fmax.s0) ? fmax.s0 : ((f.s0 > fmax1.s0) ? f.s0 : fmax1.s0);\n"
            "            cmax.s0  = (f.s0 > fmax.s0) ? c    : cmax.s0;\n"
            "            fmax.s0  = (f.s0 > fmax.s0) ? f.s0 : fmax.s0;\n"
            "            cmax1.s1 = (f.s1 > fmax.s1) ? cmax.s1 : ((f.s1 > fmax1.s1) ? c    : cmax1.s1);\n"
            "            fmax1.s1 = (f.s1 > fmax.s1) ? fmax.s1 : ((f.s1 > fmax1.s1) ? f.s1 : fmax1.s1);\n"
            "            cmax.s1  = (f.s1 > fmax.s1) ? c    : cmax.s1;\n"
            "            fmax.s1  = (f.s1 > fmax.s1) ? f.s1 : fmax.s1;\n"
            "            cmax1.s2 = (f.s2 > fmax.s2) ? cmax.s2 : ((f.s2 > fmax1.s2) ? c    : cmax1.s2);\n"
            "            fmax1.s2 = (f.s2 > fmax.s2) ? fmax.s2 : ((f.s2 > fmax1.s2) ? f.s2 : fmax1.s2);\n"
            "            cmax.s2  = (f.s2 > fmax.s2) ? c    : cmax.s2;\n"
            "            fmax.s2  = (f.s2 > fmax.s2) ? f.s2 : fmax.s2;\n"
            "            cmax1.s3 = (f.s3 > fmax.s3) ? cmax.s3 : ((f.s3 > fmax1.s3) ? c    : cmax1.s3);\n"
            "            fmax1.s3 = (f.s3 > fmax.s3) ? fmax.s3 : ((f.s3 > fmax1.s3) ? f.s3 : fmax1.s3);\n"
            "            cmax.s3  = (f.s3 > fmax.s3) ? c    : cmax.s3;\n"
            "            fmax.s3  = (f.s3 > fmax.s3) ? f.s3 : fmax.s3;\n"
            "        }\n"
            , input_dims[2]);
        opencl_kernel_code += item;
    }
    else if (top_k == 1) {
        sprintf(item,
            "        cmax = (uint4)0;\n"
            "        float4 fmax = *(__global float4 *)i0_buf;\n"
            "        for(uint c = 1; c < %ld; c++) {\n"
            "            i0_buf += i0_stride.s2;\n"
            "            float4 f = *(__global float4 *)i0_buf;\n"
            "            cmax.s0 = (f.s0 > fmax.s0) ? c    : cmax.s0;\n"
            "            fmax.s0 = (f.s0 > fmax.s0) ? f.s0 : fmax.s0;\n"
            "            cmax.s1 = (f.s1 > fmax.s1) ? c    : cmax.s1;\n"
            "            fmax.s1 = (f.s1 > fmax.s1) ? f.s1 : fmax.s1;\n"
            "            cmax.s2 = (f.s2 > fmax.s2) ? c    : cmax.s2;\n"
            "            fmax.s2 = (f.s2 > fmax.s2) ? f.s2 : fmax.s2;\n"
            "            cmax.s3 = (f.s3 > fmax.s3) ? c    : cmax.s3;\n"
            "            fmax.s3 = (f.s3 > fmax.s3) ? f.s3 : fmax.s3;\n"
            "        }\n"
            , input_dims[2]);
        opencl_kernel_code += item;
    }
    if(output_data_type == VX_TYPE_UINT8) {
        if(output_obj_type == VX_TYPE_IMAGE) {
            sprintf(item, "        o0_buf += o0_offset + (z * %ld + y) * o0_stride + x;\n" , input_dims[1]);
            opencl_kernel_code += item;
        }
        else {
            opencl_kernel_code +=
                "        o0_buf += o0_offset + z * o0_stride.s3 + y * o0_stride.s1 + x * o0_stride.s0;\n";
        }
        opencl_kernel_code +=
            "        uint imax = cmax.s0 + (cmax.s1 << 8) + (cmax.s2 << 16) + (cmax.s3 << 24);\n"
            "        *(__global uint *)o0_buf = imax;\n";
        if(top_k == 2) {
            opencl_kernel_code +=
                "        uint imax1 = cmax1.s0 + (cmax1.s1 << 8) + (cmax1.s2 << 16) + (cmax1.s3 << 24);\n"
                "        *(__global uint *)&o0_buf[o0_stride.s2] = imax1;\n";
        }
    }
    else if(output_data_type == VX_TYPE_UINT16) {
        if(output_obj_type == VX_TYPE_IMAGE) {
            sprintf(item, "        o0_buf += o0_offset + (z * %ld + y) * o0_stride + x * 2;\n" , input_dims[1]);
            opencl_kernel_code += item;
        }
        else {
            opencl_kernel_code +=
                "        o0_buf += o0_offset + z * o0_stride.s3 + y * o0_stride.s1 + x * o0_stride.s0;\n";
        }
        opencl_kernel_code +=
            "        uint2 imax;\n"
            "        imax.s0 = cmax.s0 + (cmax.s1 << 16);\n"
            "        imax.s1 = cmax.s2 + (cmax.s3 << 16);\n"
            "        *(__global uint2 *)o0_buf = imax;\n";
        if(top_k == 2) {
            opencl_kernel_code +=
                "        uint2 imax1;\n"
                "        imax1.s0 = cmax1.s0 + (cmax1.s1 << 16);\n"
                "        imax1.s1 = cmax1.s2 + (cmax1.s3 << 16);\n"
                "        *(__global uint2 *)&o0_buf[o0_stride.s2] = imax1;\n";
        }
    }
    opencl_kernel_code +=
        "    }\n"
        "}\n";

#if ENABLE_DEBUG_PRINT_DIMS
    std::cout << "KERNEL argmax_layer output " << input_dims[0] << "x" << input_dims[1] << " " << std::endl;
#endif

    return VX_SUCCESS;
}
////////
// User kernel host side function gets called to execute the user kernel node.
// Perform element-wise consine function on input tensor to produce output tensor.
//
// TODO:********
//   1. Get fixed-point position and dimensions of input and output tensors.
//      Note that both input and output tensors have same dimensions.
//   2. Access input and output tensor object data using vxMapTensorPatch API.
//   3. Perform element-wise cosine function using fixed-point position.
//   4. Use vxUnmapTensorPatch API to give the data buffers control back to OpenVX framework.
vx_status VX_CALLBACK tensor_cos_host_side_function( vx_node node, const vx_reference * refs, vx_uint32 num )
{
    // Get fixed-point position and dimensions of input and output tensors.
    // Note that both input and output tensors have same dimensions.
    vx_tensor input   = ( vx_tensor ) refs[0];
    vx_tensor output  = ( vx_tensor ) refs[1];
    vx_size num_of_dims;
    vx_size dims[4] = { 1, 1, 1, 1 };
    vx_uint8 input_fixed_point_pos;
    vx_uint8 output_fixed_point_pos;
    ERROR_CHECK_STATUS( vxQueryTensor( input,  VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof( num_of_dims ) ) );
    ERROR_CHECK_STATUS( vxQueryTensor( input,  VX_TENSOR_DIMS, &dims, num_of_dims * sizeof(vx_size) ) );
    ERROR_CHECK_STATUS( vxQueryTensor( input,  VX_TENSOR_FIXED_POINT_POSITION, &input_fixed_point_pos, sizeof( input_fixed_point_pos ) ) );
    ERROR_CHECK_STATUS( vxQueryTensor( output, VX_TENSOR_FIXED_POINT_POSITION, &output_fixed_point_pos, sizeof( output_fixed_point_pos ) ) );

    // Access input and output tensor object data using vxMapTensorPatch API.
    vx_size zeros[4] = { 0 };
    vx_map_id map_input, map_output;
    vx_uint8 * buf_input, * buf_output;
    vx_size stride_input[4] = { 0 };
    vx_size stride_output[4] = { 0 };
    ERROR_CHECK_STATUS( vxMapTensorPatch( input,
                                          num_of_dims, zeros, dims,
                                          &map_input, stride_input,
                                          (void **)&buf_input, VX_READ_ONLY, VX_MEMORY_TYPE_HOST, 0 ) );
    ERROR_CHECK_STATUS( vxMapTensorPatch( output,
                                          num_of_dims, zeros, dims,
                                          &map_output, stride_output,
                                          (void **)&buf_output, VX_READ_ONLY, VX_MEMORY_TYPE_HOST, 0 ) );

    // Perform element-wise cosine function using fixed-point position.
    vx_float32 input_to_float_multiplier = 1.0f / (vx_float32)(1 << input_fixed_point_pos);
    vx_float32 output_to_int16_multiplier = (vx_float32)(1 << output_fixed_point_pos);
    for( vx_size dim3 = 0; dim3 < dims[3]; dim3++)
    {
        for( vx_size dim2 = 0; dim2 < dims[2]; dim2++)
        {
            for( vx_size dim1 = 0; dim1 < dims[1]; dim1++)
            {
                const vx_int16 * ibuf = (const vx_int16 *) (buf_input +
                                                            dim3 * stride_input[3] +
                                                            dim2 * stride_input[2] +
                                                            dim1 * stride_input[1] );
                vx_int16 * obuf = (vx_int16 *) (buf_output +
                                                dim3 * stride_output[3] +
                                                dim2 * stride_output[2] +
                                                dim1 * stride_output[1] );
                for( vx_size dim0 = 0; dim0 < dims[0]; dim0++)
                {
                    // no saturation done here
                    vx_int16 ivalue = ibuf[dim0];
                    vx_int16 ovalue = (vx_int16)(cosf((vx_float32)ivalue * input_to_float_multiplier) * output_to_int16_multiplier + 0.5f);
                    obuf[dim0] = ovalue;
                }
            }
        }
    }

    // Use vxUnmapTensorPatch API to give the data buffers control back to OpenVX framework.
    ERROR_CHECK_STATUS( vxUnmapTensorPatch( input,  map_input ) );
    ERROR_CHECK_STATUS( vxUnmapTensorPatch( output, map_output ) );

    return VX_SUCCESS;
}
static vx_status VX_CALLBACK validateImageToTensorKernel(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[])
{
    // check input configuration
    vx_uint32 width, height;
    vx_df_image format;
    ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[0], VX_IMAGE_WIDTH, &width, sizeof(width)));
    ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[0], VX_IMAGE_HEIGHT, &height, sizeof(height)));
    ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[0], VX_IMAGE_FORMAT, &format, sizeof(format)));
    if(format != VX_DF_IMAGE_RGB && format != VX_DF_IMAGE_U8)
        return VX_ERROR_INVALID_FORMAT;
    vx_enum scalar_type;
    ERROR_CHECK_STATUS(vxQueryScalar((vx_scalar)parameters[2], VX_SCALAR_TYPE, &scalar_type, sizeof(scalar_type)));
    if(scalar_type != VX_TYPE_FLOAT32)
        return VX_ERROR_INVALID_TYPE;
    ERROR_CHECK_STATUS(vxQueryScalar((vx_scalar)parameters[3], VX_SCALAR_TYPE, &scalar_type, sizeof(scalar_type)));
    if(scalar_type != VX_TYPE_FLOAT32)
        return VX_ERROR_INVALID_TYPE;
    ERROR_CHECK_STATUS(vxQueryScalar((vx_scalar)parameters[4], VX_SCALAR_TYPE, &scalar_type, sizeof(scalar_type)));
    if(scalar_type != VX_TYPE_BOOL)
        return VX_ERROR_INVALID_TYPE;

    // check output dimensions
    vx_enum type;
    vx_size num_dims, output_dims[4] = { 1, 1, 1, 1 };
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
    if (type != VX_TYPE_FLOAT32)
        return VX_ERROR_INVALID_TYPE;
    if (num_dims != 4)
        return VX_ERROR_INVALID_DIMENSION;
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims[0])*num_dims));
    if ((output_dims[2] != 3 && output_dims[2] != 1) || output_dims[0] != (size_t)width || (output_dims[1] * output_dims[3]) != (size_t)height)
        return VX_ERROR_INVALID_DIMENSION;

    // set output tensor configuration
    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
    ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims)));

    return VX_SUCCESS;
}
//! \brief The OpenCL code generator callback.
static vx_status VX_CALLBACK opencl_codegen(
    vx_node node,                                  // [input] node
    const vx_reference parameters[],               // [input] parameters
    vx_uint32 num,                                 // [input] number of parameters
    bool opencl_load_function,                     // [input]  false: normal OpenCL kernel; true: reserved
    char opencl_kernel_function_name[64],          // [output] kernel_name for clCreateKernel()
    std::string& opencl_kernel_code,               // [output] string for clCreateProgramWithSource()
    std::string& opencl_build_options,             // [output] options for clBuildProgram()
    vx_uint32& opencl_work_dim,                    // [output] work_dim for clEnqueueNDRangeKernel()
    vx_size opencl_global_work[],                  // [output] global_work[] for clEnqueueNDRangeKernel()
    vx_size opencl_local_work[],                   // [output] local_work[] for clEnqueueNDRangeKernel()
    vx_uint32& opencl_local_buffer_usage_mask,     // [output] reserved: must be ZERO
    vx_uint32& opencl_local_buffer_size_in_bytes   // [output] reserved: must be ZERO
    )
{
    // get configuration
    vx_df_image format;
    vx_size num_dims, input_dims[4] = { 1, 1, 1, 1 };
    ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[1], VX_IMAGE_FORMAT, &format, sizeof(format)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims[0])*num_dims));
    vx_uint32 width = (vx_uint32)input_dims[0];
    vx_uint32 height = (vx_uint32)input_dims[1];
    vx_uint32 N = (vx_uint32)input_dims[3];

    // compute global work
    vx_uint32 width_div_4 = (width + 3) / 4;
    opencl_work_dim = 3;
    opencl_local_work[0] = 8;
    opencl_local_work[1] = 8;
    opencl_local_work[2] = 1;
    opencl_global_work[0] = (width_div_4  + opencl_local_work[0] - 1) & ~(opencl_local_work[0] - 1);
    opencl_global_work[1] = (height + opencl_local_work[1] - 1) & ~(opencl_local_work[1] - 1);
    opencl_global_work[2] = N;

    // generate OpenCL C code
    strcpy(opencl_kernel_function_name, "tensor_to_image");
    if(format == VX_DF_IMAGE_RGB) {
        char item[8192];
        sprintf(item,
            "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n"
            "__kernel __attribute__((reqd_work_group_size(%ld, %ld, 1)))\n" // opencl_local_work[0] opencl_local_work[1]
            "void %s(__global uchar * i0_buf, uint i0_offset, uint4 i0_stride, uint o0_width, uint o0_height, __global uchar * o0_buf, uint o0_stride, uint o0_offset, float ka, float kb, uint reverse_channel_order)\n"
            "{\n"
            "    uint x = get_global_id(0) * 4;\n"
            "    uint y = get_global_id(1);\n"
            "    uint n = get_global_id(2);\n"
            "    if(x < %d && y < %d) {\n"
            "        i0_buf += i0_offset + n * i0_stride.s3 + y * i0_stride.s1 + x * i0_stride.s0;\n"
            "        float4 r = *(__global float4 *)&i0_buf[reverse_channel_order ? 2 * i0_stride.s2 : 0];\n"
            "        float4 g = *(__global float4 *)&i0_buf[                              i0_stride.s2  ];\n"
            "        float4 b = *(__global float4 *)&i0_buf[reverse_channel_order ? 0 : 2 * i0_stride.s2];\n"
            "        r = r * (float4)ka + (float4)kb;\n"
            "        g = g * (float4)ka + (float4)kb;\n"
            "        b = b * (float4)ka + (float4)kb;\n"
            "        uint3 u3;\n"
            "        u3.s0 = amd_pack((float4)(r.s0, g.s0, b.s0, r.s1));\n"
            "        u3.s1 = amd_pack((float4)(g.s1, b.s1, r.s2, g.s2));\n"
            "        u3.s2 = amd_pack((float4)(b.s2, r.s3, g.s3, b.s3));\n"
            "        vstore3(u3, 0, (__global uint *)&o0_buf[o0_offset + (y + n * %d) * o0_stride + x * 3]);\n"
            "    }\n"
            "}\n"
            , opencl_local_work[0], opencl_local_work[1], opencl_kernel_function_name, width, height, height);
        opencl_kernel_code = item;
    }
    else {
        char item[8192];
        sprintf(item,
            "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n"
            "__kernel __attribute__((reqd_work_group_size(%ld, %ld, 1)))\n" // opencl_local_work[0] opencl_local_work[1]
            "void %s(__global uchar * i0_buf, uint i0_offset, uint4 i0_stride, uint o0_width, uint o0_height, __global uchar * o0_buf, uint o0_stride, uint o0_offset, float ka, float kb, uint reverse_channel_order)\n"
            "{\n"
            "    uint x = get_global_id(0) * 4;\n"
            "    uint y = get_global_id(1);\n"
            "    uint n = get_global_id(2);\n"
            "    if(x < %d && y < %d) {\n"
            "        i0_buf += i0_offset + n * i0_stride.s3 + y * i0_stride.s1 + x * i0_stride.s0;\n"
            "        float4 i = *(__global float4 *)i0_buf;\n"
            "        i = i * (float4)ka + (float4)kb;\n"
            "        *(__global uint *)&o0_buf[o0_offset + (y + n * %d) * o0_stride + x] = amd_pack((float4)(i.s0, i.s1, i.s2, i.s3));\n"
            "    }\n"
            "}\n"
            , opencl_local_work[0], opencl_local_work[1], opencl_kernel_function_name, width, height, height);
        opencl_kernel_code = item;
    }

#if ENABLE_DEBUG_PRINT_DIMS
    std::cout << "KERNEL tensor_to_image output " << width << "x" << height << " " << N << std::endl;
#endif

    return VX_SUCCESS;
}
//! \brief The OpenCL code generator callback.
static vx_status VX_CALLBACK opencl_codegen(
    vx_node node,                                  // [input] node
    const vx_reference parameters[],               // [input] parameters
    vx_uint32 num,                                 // [input] number of parameters
    bool opencl_load_function,                     // [input]  false: normal OpenCL kernel; true: reserved
    char opencl_kernel_function_name[64],          // [output] kernel_name for clCreateKernel()
    std::string& opencl_kernel_code,               // [output] string for clCreateProgramWithSource()
    std::string& opencl_build_options,             // [output] options for clBuildProgram()
    vx_uint32& opencl_work_dim,                    // [output] work_dim for clEnqueueNDRangeKernel()
    vx_size opencl_global_work[],                  // [output] global_work[] for clEnqueueNDRangeKernel()
    vx_size opencl_local_work[],                   // [output] local_work[] for clEnqueueNDRangeKernel()
    vx_uint32& opencl_local_buffer_usage_mask,     // [output] reserved: must be ZERO
    vx_uint32& opencl_local_buffer_size_in_bytes   // [output] reserved: must be ZERO
    )
{
    // get configuration
    vx_uint32 width, height, N;
    vx_df_image format;
    vx_size num_dims, output_dims[4] = { 1, 1, 1, 1 };
    ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[0], VX_IMAGE_FORMAT, &format, sizeof(format)));
    ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[0], VX_IMAGE_WIDTH, &width, sizeof(width)));
    ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[0], VX_IMAGE_HEIGHT, &height, sizeof(height)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims[0])*num_dims));
    height = (vx_uint32)output_dims[1];
    N = (vx_uint32)output_dims[3];

    // generate OpenCL C code and compute global work
    strcpy(opencl_kernel_function_name, "image_to_tensor");
    if(format == VX_DF_IMAGE_RGB) {
        opencl_work_dim = 3;
        opencl_local_work[0] = 8;
        opencl_local_work[1] = 8;
        opencl_local_work[2] = 1;
        opencl_global_work[0] = (width  + opencl_local_work[0] - 1) & ~(opencl_local_work[0] - 1);
        opencl_global_work[1] = (height + opencl_local_work[1] - 1) & ~(opencl_local_work[1] - 1);
        opencl_global_work[2] = N;

        char item[8192];
        sprintf(item,
            "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n"
            "__kernel __attribute__((reqd_work_group_size(%ld, %ld, 1)))\n" // opencl_local_work[0] opencl_local_work[1]
            "void %s(uint i0_width, uint i0_height, __global uchar * i0_buf, uint i0_stride, uint i0_offset, __global uchar * o0_buf, uint o0_offset, uint4 o0_stride, float ka, float kb, uint reverse_channel_order)\n"
            "{\n"
            "    uint x = get_global_id(0);\n"
            "    uint y = get_global_id(1);\n"
            "    uint n = get_global_id(2);\n"
            "    if(x < %d && y < %d) {\n"
            "        uint ioffset = i0_offset + (y + n * %d) * i0_stride + x * 3;\n"
            "        uint2 rgb2 = vload2(0, (__global uint *)&i0_buf[ioffset & ~3]);\n"
            "        uint rgb = amd_bytealign(rgb2.s1, rgb2.s0, ioffset & 3);\n"
            "        float r = ka * amd_unpack0(rgb) + kb;\n"
            "        float g = ka * amd_unpack1(rgb) + kb;\n"
            "        float b = ka * amd_unpack2(rgb) + kb;\n"
            "        o0_buf += o0_offset + n * o0_stride.s3 + y * o0_stride.s1 + x * o0_stride.s0;\n"
            "        *(__global float *)&o0_buf[               0] = reverse_channel_order ? b : r;\n"
            "        *(__global float *)&o0_buf[    o0_stride.s2] =                             g;\n"
            "        *(__global float *)&o0_buf[2 * o0_stride.s2] = reverse_channel_order ? r : b;\n"
            "    }\n"
            "}\n"
            , opencl_local_work[0], opencl_local_work[1], opencl_kernel_function_name, width, height, height);
        opencl_kernel_code = item;
    }
    else if(format == VX_DF_IMAGE_U8) {
        opencl_work_dim = 3;
        opencl_local_work[0] = 8;
        opencl_local_work[1] = 8;
        opencl_local_work[2] = 1;
        opencl_global_work[0] = ((width+3)/4 + opencl_local_work[0] - 1) & ~(opencl_local_work[0] - 1);
        opencl_global_work[1] = (height + opencl_local_work[1] - 1) & ~(opencl_local_work[1] - 1);
        opencl_global_work[2] = N;

        char item[8192];
        sprintf(item,
                "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n"
                "__kernel __attribute__((reqd_work_group_size(%ld, %ld, 1)))\n" // opencl_local_work[0] opencl_local_work[1]
                "void %s(uint i0_width, uint i0_height, __global uchar * i0_buf, uint i0_stride, uint i0_offset, __global uchar * o0_buf, uint o0_offset, uint4 o0_stride, float a, float b, uint reverse_channel_order)\n"
                "{\n"
                "    uint x = get_global_id(0) * 4;\n"
                "    uint y = get_global_id(1);\n"
                "    uint n = get_global_id(2);\n"
                "    if(x < %d && y < %d) {\n"
                "        uint u4 = *(__global uint *)&i0_buf[i0_offset + (y + n * %d) * i0_stride + x];\n"
                "        float p0 = a * amd_unpack0(u4) + b;\n"
                "        float p1 = a * amd_unpack1(u4) + b;\n"
                "        float p2 = a * amd_unpack2(u4) + b;\n"
                "        float p3 = a * amd_unpack3(u4) + b;\n"
                "        *(__global float4 *)&o0_buf[o0_offset + n * o0_stride.s3 + y * o0_stride.s1 + x * o0_stride.s0] = (float4)(p0 , p1, p2, p3);\n"
                "    }\n"
                "}\n"
            , opencl_local_work[0], opencl_local_work[1], opencl_kernel_function_name, width, height, height);
        opencl_kernel_code = item;
    }

#if ENABLE_DEBUG_PRINT_DIMS
    std::cout << "KERNEL image_to_tensor output " << width << " " << height << " " << N << std::endl;
#endif

    return VX_SUCCESS;
}