////////
// User kernel host side function gets called to execute the user kernel node.
// Perform element-wise cosine function on input tensor to produce output tensor.
//
// TODO:********
//   1. Get fixed-point position and dimensions of input and output tensors.
//      Note that both input and output tensors have same dimensions.
//   2. Access input and output tensor object data using vxMapTensorPatch API.
//   3. Perform element-wise cosine function using fixed-point position.
//   4. Use vxUnmapTensorPatch API to give the data buffers control back to OpenVX framework.
vx_status VX_CALLBACK tensor_cos_host_side_function( vx_node node, const vx_reference * refs, vx_uint32 num )
{
    // Host-side implementation of the tensor cosine user kernel.
    //   refs[0] - input tensor  (read as INT16 fixed-point values)
    //   refs[1] - output tensor (written as INT16 fixed-point values)
    // 'node' and 'num' are not used by this function.
    // Returns VX_SUCCESS when execution reaches the end; every OpenVX call is
    // wrapped in ERROR_CHECK_STATUS, which is defined outside this function.

    // Get fixed-point position and dimensions of input and output tensors.
    // Note that both input and output tensors have same dimensions.
    // dims[] is pre-filled with 1 so that the four nested loops below still
    // visit every element when the tensor has fewer than 4 dimensions.
    vx_tensor input   = ( vx_tensor ) refs[0];
    vx_tensor output  = ( vx_tensor ) refs[1];
    vx_size num_of_dims;
    vx_size dims[4] = { 1, 1, 1, 1 };
    vx_uint8 input_fixed_point_pos;
    vx_uint8 output_fixed_point_pos;
    ERROR_CHECK_STATUS( vxQueryTensor( input,  VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof( num_of_dims ) ) );
    ERROR_CHECK_STATUS( vxQueryTensor( input,  VX_TENSOR_DIMS, &dims, num_of_dims * sizeof(vx_size) ) );
    ERROR_CHECK_STATUS( vxQueryTensor( input,  VX_TENSOR_FIXED_POINT_POSITION, &input_fixed_point_pos, sizeof( input_fixed_point_pos ) ) );
    ERROR_CHECK_STATUS( vxQueryTensor( output, VX_TENSOR_FIXED_POINT_POSITION, &output_fixed_point_pos, sizeof( output_fixed_point_pos ) ) );

    // Access input and output tensor object data using vxMapTensorPatch API.
    // The input is only read, so it is mapped VX_READ_ONLY; the output is
    // fully overwritten below, so it must be mapped VX_WRITE_ONLY for the
    // results to be committed back to the tensor on unmap.
    vx_size zeros[4] = { 0 };
    vx_map_id map_input, map_output;
    vx_uint8 * buf_input, * buf_output;
    vx_size stride_input[4] = { 0 };
    vx_size stride_output[4] = { 0 };
    ERROR_CHECK_STATUS( vxMapTensorPatch( input,
                                          num_of_dims, zeros, dims,
                                          &map_input, stride_input,
                                          (void **)&buf_input, VX_READ_ONLY, VX_MEMORY_TYPE_HOST, 0 ) );
    ERROR_CHECK_STATUS( vxMapTensorPatch( output,
                                          num_of_dims, zeros, dims,
                                          &map_output, stride_output,
                                          (void **)&buf_output, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST, 0 ) );

    // Perform element-wise cosine function using fixed-point position.
    // Input integers are scaled down by 2^input_fixed_point_pos to get the
    // real-valued angle; the cosine is scaled up by 2^output_fixed_point_pos.
    vx_float32 input_to_float_multiplier = 1.0f / (vx_float32)(1 << input_fixed_point_pos);
    vx_float32 output_to_int16_multiplier = (vx_float32)(1 << output_fixed_point_pos);
    for( vx_size dim3 = 0; dim3 < dims[3]; dim3++)
    {
        for( vx_size dim2 = 0; dim2 < dims[2]; dim2++)
        {
            for( vx_size dim1 = 0; dim1 < dims[1]; dim1++)
            {
                // Strides are in bytes, so row addresses are computed on the
                // byte pointers before casting to INT16 element pointers.
                const vx_int16 * ibuf = (const vx_int16 *) (buf_input +
                                                            dim3 * stride_input[3] +
                                                            dim2 * stride_input[2] +
                                                            dim1 * stride_input[1] );
                vx_int16 * obuf = (vx_int16 *) (buf_output +
                                                dim3 * stride_output[3] +
                                                dim2 * stride_output[2] +
                                                dim1 * stride_output[1] );
                for( vx_size dim0 = 0; dim0 < dims[0]; dim0++)
                {
                    // no saturation done here
                    vx_int16 ivalue = ibuf[dim0];
                    vx_int16 ovalue = (vx_int16)(cosf((vx_float32)ivalue * input_to_float_multiplier) * output_to_int16_multiplier + 0.5f);
                    obuf[dim0] = ovalue;
                }
            }
        }
    }

    // Use vxUnmapTensorPatch API to give the data buffers control back to OpenVX framework.
    ERROR_CHECK_STATUS( vxUnmapTensorPatch( input,  map_input ) );
    ERROR_CHECK_STATUS( vxUnmapTensorPatch( output, map_output ) );

    return VX_SUCCESS;
}
////////
// main() has all the OpenVX application code for this exercise.
// Command-line usage:
//   % solution_exercise3 [<video-sequence>|<camera-device-number>]
// When neither video sequence nor camera device number is specified,
// it defaults to the video sequence in "PETS09-S1-L1-View001.avi".
int main( int argc, char * argv[] )
{
    // Get default video sequence when nothing is specified on command-line and
    // instantiate OpenCV GUI module for reading input RGB images and displaying
    // the image with OpenVX results.
    // Note: argv[argc] is guaranteed to be a null pointer, so argv[1] is NULL
    // (not out of bounds) when no argument is given.
    const char * video_sequence = argv[1];
    CGuiModule gui( video_sequence );

    // Try grab first video frame from the sequence using cv::VideoCapture
    // and check if video frame is available
    if( !gui.Grab() )
    {
        printf( "ERROR: input has no video\n" );
        return 1;
    }

    ////////
    // Set the application configuration parameters. Note that input video
    // sequence is an 8-bit RGB image with dimensions given by gui.GetWidth()
    // and gui.GetHeight(). The parameters for the tensors are:
    //   tensor_dims                    - 3 dimensions of tensor [<width> x <height> x 3]
    //   tensor_input_fixed_point_pos   - fixed-point position for input tensor
    //   tensor_output_fixed_point_pos  - fixed-point position for output tensor
    vx_uint32  width                         = gui.GetWidth();
    vx_uint32  height                        = gui.GetHeight();
    vx_size    tensor_dims[3]                = { width, height, 3 }; // 3 channels (RGB)
    vx_uint8   tensor_input_fixed_point_pos  = 5; // input[-128..127] will be mapped to -4..3.96875
    vx_uint8   tensor_output_fixed_point_pos = 7; // output[-1..1] will be mapped to -128 to 128

    ////////
    // Create the OpenVX context and make sure returned context is valid and
    // register the log_callback to receive messages from OpenVX framework.
    vx_context context = vxCreateContext();
    ERROR_CHECK_OBJECT( context );
    vxRegisterLogCallback( context, log_callback, vx_false_e );

    ////////
    // Register user kernels with the context.
    //
    // TODO:********
    //   1. Register user kernel with context by calling your implementation of "registerUserKernel()".
    ERROR_CHECK_STATUS( registerUserKernel( context ) );

    ////////
    // Create OpenVX tensor objects for input and output
    //
    // TODO:********
    //   1. Create tensor objects using tensor_dims, tensor_input_fixed_point_pos, and
    //      tensor_output_fixed_point_pos
    vx_tensor input_tensor   = vxCreateTensor( context, 3, tensor_dims, VX_TYPE_INT16, tensor_input_fixed_point_pos );
    vx_tensor output_tensor  = vxCreateTensor( context, 3, tensor_dims, VX_TYPE_INT16, tensor_output_fixed_point_pos );
    ERROR_CHECK_OBJECT( input_tensor );
    ERROR_CHECK_OBJECT( output_tensor );

    ////////
    // Create, build, and verify the graph with user kernel node.
    //
    // TODO:********
    //   1. Build a graph with just one node created using userTensorCosNode()
    vx_graph graph = vxCreateGraph( context );
    ERROR_CHECK_OBJECT( graph );
    vx_node cos_node = userTensorCosNode( graph, input_tensor, output_tensor );
    ERROR_CHECK_OBJECT( cos_node );
    ERROR_CHECK_STATUS( vxReleaseNode( &cos_node ) );
    ERROR_CHECK_STATUS( vxVerifyGraph( graph ) );

    ////////
    // Process the video sequence frame by frame until the end of sequence or aborted.
    cv::Mat bgrMatForOutputDisplay( height, width, CV_8UC3 );
    for( int frame_index = 0; !gui.AbortRequested(); frame_index++ )
    {
        ////////
        // Copy input RGB frame from OpenCV into input_tensor with UINT8 to Q10.5 (INT16) conversion.
        // In order to do this, use the vxMapTensorPatch API (see "vx_ext_amd.h").
        //
        // TODO:********
        //   1. Use vxMapTensorPatch API for access to input tensor object for writing
        //   2. Copy UINT8 data from OpenCV RGB image to tensor object
        //   3. Use vxUnmapTensorPatch API to return control of buffer back to framework
        vx_uint8 * cv_rgb_image_buffer = gui.GetBuffer();
        vx_size rgb_stride             = gui.GetStride();
        vx_size zeros[3]               = { 0 };
        vx_size tensor_stride[3];
        vx_map_id map_id;
        vx_uint8 * buf;
        ERROR_CHECK_STATUS( vxMapTensorPatch( input_tensor,
                                              3, zeros, tensor_dims,
                                              &map_id, tensor_stride,
                                              (void **)&buf, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST, 0 ) );
        for( vx_size c = 0; c < 3; c++ )
        {
            for( vx_size y = 0; y < height; y++ )
            {
                // The OpenCV image is pixel-interleaved (3 bytes per pixel), while the
                // tensor stores each channel as a separate plane (c * tensor_stride[2]).
                const vx_uint8 * img = cv_rgb_image_buffer + y * rgb_stride + c;
                vx_int16 * inp = (vx_int16 *)(buf + y * tensor_stride[1] + c * tensor_stride[2]);
                for( vx_size x = 0; x < width; x++ )
                {
                    // convert 0..255 to Q10.5 [-4..3.96875 range] fixed-point format
                    inp[x] = (vx_int16)img[x * 3] - 128;
                }
            }
        }
        ERROR_CHECK_STATUS( vxUnmapTensorPatch( input_tensor, map_id ) );


        ////////
        // Now that input tensor is ready, just run the graph.
        //
        // TODO:********
        //   1. Call vxProcessGraph to execute the tensor_cos kernel in graph
        ERROR_CHECK_STATUS( vxProcessGraph( graph ) );

        ////////
        // Display the output tensor object as RGB image
        //
        // TODO:********
        //   1. Use vxMapTensorPatch API for access to output tensor object for reading
        //   2. Copy tensor object data into OpenCV RGB image
        //   3. Use vxUnmapTensorPatch API to return control of buffer back to framework
        //
        // The output tensor is only read here, so it is mapped VX_READ_ONLY; a
        // write-only mapping is not required to expose the graph's results.
        ERROR_CHECK_STATUS( vxMapTensorPatch( output_tensor,
                                              3, zeros, tensor_dims,
                                              &map_id, tensor_stride,
                                              (void **)&buf, VX_READ_ONLY, VX_MEMORY_TYPE_HOST, 0 ) );
        vx_uint8 * cv_bgr_image_buffer = bgrMatForOutputDisplay.data;
        vx_size bgr_stride             = bgrMatForOutputDisplay.step;
        for( vx_size c = 0; c < 3; c++ )
        {
            for( vx_size y = 0; y < height; y++ )
            {
                const vx_int16 * out = (const vx_int16 *)(buf + y * tensor_stride[1] + c * tensor_stride[2]);
                vx_uint8 * img = cv_bgr_image_buffer + y * bgr_stride + (2 - c); // (2 - c) for RGB to BGR conversion
                for( vx_size x = 0; x < width; x++ )
                {
                    // scale convert Q8.7 [-1..1 range] fixed-point format to 0..255 with saturation
                    vx_int16 value = out[x] + 128;
                    value = value > 255 ? 255 : value; // saturation needed
                    img[x * 3] = (vx_uint8)value;
                }
            }
        }
#if ENABLE_DISPLAY
        cv::imshow( "Cosine", bgrMatForOutputDisplay );
#endif
        ERROR_CHECK_STATUS( vxUnmapTensorPatch( output_tensor, map_id ) );

        ////////
        // Display the results and grab the next input RGB frame for the next iteration.
        char text[128];
        sprintf( text, "Keyboard ESC/Q-Quit SPACE-Pause [FRAME %d] [fixed_point_pos input:%d output:%d]", frame_index, tensor_input_fixed_point_pos, tensor_output_fixed_point_pos );
        gui.DrawText( 0, 16, text );
        gui.Show();
        if( !gui.Grab() )
        {
            // Terminate the processing loop if the end of sequence is detected.
            gui.WaitForKey();
            break;
        }
    }

    ////////
    // To release an OpenVX object, you need to call vxRelease<Object> API which takes a pointer to the object.
    // If the release operation is successful, the OpenVX framework will reset the object to NULL.
    //
    // TODO:****
    //   1. Release graph and tensor objects
    ERROR_CHECK_STATUS( vxReleaseGraph( &graph ) );
    ERROR_CHECK_STATUS( vxReleaseTensor( &input_tensor ) );
    ERROR_CHECK_STATUS( vxReleaseTensor( &output_tensor ) );
    ERROR_CHECK_STATUS( vxReleaseContext( &context ) );

    return 0;
}
// Compare the current contents of m_tensor against a reference file.
//
// The reference file name is produced by formatting m_fileNameCompare with
// frameNumber. The file contents are read into m_data and compared with the
// mapped tensor data:
//   - VX_TYPE_INT16 / VX_TYPE_FLOAT32 / VX_TYPE_FLOAT16: the maximum absolute
//     difference and the mean of the squared differences are checked against
//     m_maxErrorLimit and m_avgErrorLimit respectively.
//   - any other data type: rows are compared byte-for-byte.
//
// Returns 0 when no comparison was requested, when the data matched, or when a
// mismatch occurred but m_discardCompareErrors is set; returns -1 on a
// mismatch otherwise. Updates m_compareCountMatches/m_compareCountMismatches.
int CVxParamTensor::CompareFrame(int frameNumber)
{
	// check if there is no user request to compare
	if (m_fileNameCompare.length() < 1) return 0;

	// reading data from reference file
	char fileName[MAX_FILE_NAME_LENGTH]; sprintf(fileName, m_fileNameCompare.c_str(), frameNumber);
	// only look at the extension when the name is long enough to have one,
	// otherwise the pointer arithmetic would step before the start of the buffer
	size_t fileNameLength = strlen(fileName);
	if (fileNameLength >= 4 && !_stricmp(fileName + fileNameLength - 4, ".dat")) {
		ReportError("ERROR: read from .dat files not supported: %s\n", fileName);
	}
	FILE * fp = fopen(fileName, m_compareFileIsBinary ? "rb" : "r");
	if (!fp) {
		ReportError("ERROR: Unable to open: %s\n", fileName);
	}
	// close the file before reporting a short read so the handle is not left
	// open on the error path
	size_t bytesRead = fread(m_data, 1, m_size, fp);
	fclose(fp);
	if (bytesRead != m_size)
		ReportError("ERROR: not enough data (%d bytes) in %s\n", (vx_uint32)m_size, fileName);

	// compare
	vx_map_id map_id;
	vx_size stride[MAX_TENSOR_DIMENSIONS];
	vx_uint8 * ptr;
	vx_status status = vxMapTensorPatch(m_tensor, m_num_of_dims, nullptr, nullptr, &map_id, stride, (void **)&ptr, VX_READ_ONLY, VX_MEMORY_TYPE_HOST, 0);
	if (status != VX_SUCCESS)
		ReportError("ERROR: vxMapTensorPatch: read failed (%d)\n", status);

	// In every branch below, 'roffset' addresses the reference data in m_data
	// (laid out with m_stride) and 'doffset' addresses the mapped tensor data
	// (laid out with the strides returned by vxMapTensorPatch).
	bool mismatchDetected = false;
	if (m_data_type == VX_TYPE_INT16) {
		vx_int32 maxError = 0;
		vx_int64 sumError = 0;
		for (vx_size d3 = 0; d3 < m_dims[3]; d3++) {
			for (vx_size d2 = 0; d2 < m_dims[2]; d2++) {
				for (vx_size d1 = 0; d1 < m_dims[1]; d1++) {
					vx_size roffset = m_stride[3] * d3 + m_stride[2] * d2 + m_stride[1] * d1;
					vx_size doffset = stride[3] * d3 + stride[2] * d2 + stride[1] * d1;
					const vx_int16 * buf1 = (const vx_int16 *)(((vx_uint8 *)ptr) + doffset);
					const vx_int16 * buf2 = (const vx_int16 *)(m_data + roffset);
					for (vx_size d0 = 0; d0 < m_dims[0]; d0++) {
						vx_int32 v1 = buf1[d0];
						vx_int32 v2 = buf2[d0];
						vx_int32 d = v1 - v2;
						d = (d < 0) ? -d : d;
						maxError = (d > maxError) ? d : maxError;
						// d can be as large as 65535, whose square does not fit in
						// 32 bits; widen before multiplying
						sumError += (vx_int64)d * d;
					}
				}
			}
		}
		vx_size count = m_dims[0] * m_dims[1] * m_dims[2] * m_dims[3];
		// mean of the squared differences
		float avgError = (float)sumError / (float)count;
		mismatchDetected = true;
		if (((float)maxError <= m_maxErrorLimit) && ((float)avgError <= m_avgErrorLimit))
			mismatchDetected = false;
		if (mismatchDetected)
			printf("ERROR: tensor COMPARE MISMATCHED [max-err: %d] [avg-err: %.6f] for %s with frame#%d of %s\n", maxError, avgError, GetVxObjectName(), frameNumber, fileName);
		else if (m_verbose)
			printf("OK: tensor COMPARE MATCHED [max-err: %d] [avg-err: %.6f] for %s with frame#%d of %s\n", maxError, avgError, GetVxObjectName(), frameNumber, fileName);
	}
	else if (m_data_type == VX_TYPE_FLOAT32) {
		vx_float32 maxError = 0;
		vx_float64 sumError = 0;
		for (vx_size d3 = 0; d3 < m_dims[3]; d3++) {
			for (vx_size d2 = 0; d2 < m_dims[2]; d2++) {
				for (vx_size d1 = 0; d1 < m_dims[1]; d1++) {
					vx_size roffset = m_stride[3] * d3 + m_stride[2] * d2 + m_stride[1] * d1;
					vx_size doffset = stride[3] * d3 + stride[2] * d2 + stride[1] * d1;
					const vx_float32 * buf1 = (const vx_float32 *)(((vx_uint8 *)ptr) + doffset);
					const vx_float32 * buf2 = (const vx_float32 *)(m_data + roffset);
					for (vx_size d0 = 0; d0 < m_dims[0]; d0++) {
						vx_float32 v1 = buf1[d0];
						vx_float32 v2 = buf2[d0];
						vx_float32 d = v1 - v2;
						d = (d < 0) ? -d : d;
						maxError = (d > maxError) ? d : maxError;
						sumError += d * d;
					}
				}
			}
		}
		vx_size count = m_dims[0] * m_dims[1] * m_dims[2] * m_dims[3];
		// mean of the squared differences
		float avgError = (float)sumError / (float)count;
		mismatchDetected = true;
		if ((maxError <= m_maxErrorLimit) && (avgError <= m_avgErrorLimit))
			mismatchDetected = false;
		if (mismatchDetected)
			printf("ERROR: tensor COMPARE MISMATCHED [max-err: %.6f] [avg-err: %.6f] for %s with frame#%d of %s\n", maxError, avgError, GetVxObjectName(), frameNumber, fileName);
		else if (m_verbose)
			printf("OK: tensor COMPARE MATCHED [max-err: %.6f] [avg-err: %.6f] for %s with frame#%d of %s\n", maxError, avgError, GetVxObjectName(), frameNumber, fileName);
	}
	else if (m_data_type == VX_TYPE_FLOAT16) {
		vx_float32 maxError = 0;
		vx_float64 sumError = 0;
		for (vx_size d3 = 0; d3 < m_dims[3]; d3++) {
			for (vx_size d2 = 0; d2 < m_dims[2]; d2++) {
				for (vx_size d1 = 0; d1 < m_dims[1]; d1++) {
					vx_size roffset = m_stride[3] * d3 + m_stride[2] * d2 + m_stride[1] * d1;
					vx_size doffset = stride[3] * d3 + stride[2] * d2 + stride[1] * d1;
					const vx_uint16 * buf1 = (const vx_uint16 *)(((vx_uint8 *)ptr) + doffset);
					const vx_uint16 * buf2 = (const vx_uint16 *)(m_data + roffset);
					for (vx_size d0 = 0; d0 < m_dims[0]; d0++) {
						vx_uint32 h1 = buf1[d0];
						vx_uint32 h2 = buf2[d0];
						// Expand the 16-bit half-float bit pattern to a 32-bit float bit
						// pattern: move the sign to bit 31, re-bias the exponent
						// (0x1c000 == (127 - 15) << 10) and shift exponent and mantissa
						// into place. Zero, denormal, infinity and NaN encodings are
						// not special-cased.
						vx_uint32 bits1 = ((h1 & 0x8000) << 16) | (((h1 & 0x7c00) + 0x1c000) << 13) | ((h1 & 0x03ff) << 13);
						vx_uint32 bits2 = ((h2 & 0x8000) << 16) | (((h2 & 0x7c00) + 0x1c000) << 13) | ((h2 & 0x03ff) << 13);
						// reinterpret the bits through memcpy rather than a pointer cast
						// to stay clear of strict-aliasing violations
						vx_float32 v1, v2;
						memcpy(&v1, &bits1, sizeof(v1));
						memcpy(&v2, &bits2, sizeof(v2));
						vx_float32 d = v1 - v2;
						d = (d < 0) ? -d : d;
						maxError = (d > maxError) ? d : maxError;
						sumError += d * d;
					}
				}
			}
		}
		vx_size count = m_dims[0] * m_dims[1] * m_dims[2] * m_dims[3];
		// mean of the squared differences
		float avgError = (float)sumError / (float)count;
		mismatchDetected = true;
		if ((maxError <= m_maxErrorLimit) && (avgError <= m_avgErrorLimit))
			mismatchDetected = false;
		if (mismatchDetected)
			printf("ERROR: tensor COMPARE MISMATCHED [max-err: %.6f] [avg-err: %.6f] for %s with frame#%d of %s\n", maxError, avgError, GetVxObjectName(), frameNumber, fileName);
		else if (m_verbose)
			printf("OK: tensor COMPARE MATCHED [max-err: %.6f] [avg-err: %.6f] for %s with frame#%d of %s\n", maxError, avgError, GetVxObjectName(), frameNumber, fileName);
	}
	else {
		// other data types: exact byte-wise comparison, one innermost row at a time
		for (vx_size d3 = 0; d3 < m_dims[3]; d3++) {
			for (vx_size d2 = 0; d2 < m_dims[2]; d2++) {
				for (vx_size d1 = 0; d1 < m_dims[1]; d1++) {
					vx_size roffset = m_stride[3] * d3 + m_stride[2] * d2 + m_stride[1] * d1;
					vx_size doffset = stride[3] * d3 + stride[2] * d2 + stride[1] * d1;
					// memcmp (not memcpy): the tensor is mapped read-only and must
					// only be compared against the reference, never overwritten
					if (memcmp(((vx_uint8 *)ptr) + doffset, m_data + roffset, stride[0] * m_dims[0])) {
						mismatchDetected = true;
						break;
					}
				}
				if (mismatchDetected)
					break;
			}
			if (mismatchDetected)
				break;
		}
		if (mismatchDetected)
			printf("ERROR: tensor COMPARE MISMATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, fileName);
		else if (m_verbose) 
			printf("OK: tensor COMPARE MATCHED for %s with frame#%d of %s\n", GetVxObjectName(), frameNumber, fileName);
	}

	status = vxUnmapTensorPatch(m_tensor, map_id);
	if (status != VX_SUCCESS)
		ReportError("ERROR: vxUnmapTensorPatch: read failed (%d)\n", status);

	// report error if mismatched
	if (mismatchDetected) {
		m_compareCountMismatches++;
		if (!m_discardCompareErrors) return -1;
	}
	else {
		m_compareCountMatches++;
	}

	return 0;
}