Ejemplo n.º 1
0
static VALUE Reference_count(VALUE self)
{
    vx_uint32 t;
    vx_reference ref = (vx_reference)DATA_PTR(self);
    vxQueryReference(ref, VX_QUERY_REF_COUNT, &t, sizeof(t));
    return INT2FIX(t);
}
Ejemplo n.º 2
0
static VALUE Reference_type(VALUE self)
{
    vx_enum t;
    vx_reference ref = (vx_reference)DATA_PTR(self);
    vxQueryReference(ref, VX_QUERY_REF_TYPE, &t, sizeof(t));
    return INT2FIX(t);
}
static vx_status VX_CALLBACK validateKernel(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[])
{
    // check input configuration
    vx_enum type, format;
    vx_size num_dims, input_dims[4] = { 1, 1, 1, 1 };
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims[0])*num_dims));
    if ((num_dims != 4 && num_dims != 3) || ((input_dims[0] & 3) != 0))
        return VX_ERROR_INVALID_DIMENSION;
    if (type != VX_TYPE_FLOAT32)
        return VX_ERROR_INVALID_TYPE;

    // check output object type and set configuration
    ERROR_CHECK_STATUS(vxQueryReference(parameters[1], VX_REFERENCE_TYPE, &type, sizeof(type)));
    if (type == VX_TYPE_IMAGE) {
        vx_df_image format;
        ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[1], VX_IMAGE_FORMAT, &format, sizeof(format)));
        if(format == VX_DF_IMAGE_U8 && input_dims[2] > 255)
            return VX_ERROR_INVALID_FORMAT;
        if(format == VX_DF_IMAGE_VIRT)
            format = (input_dims[2] < 256) ? VX_DF_IMAGE_U8 : VX_DF_IMAGE_U16;
        vx_uint32 width = (vx_uint32)input_dims[0];
        vx_uint32 height = (vx_uint32)(input_dims[1]*input_dims[3]);
        ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_WIDTH, &width, sizeof(width)));
        ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_HEIGHT, &height, sizeof(height)));
        ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_IMAGE_FORMAT, &format, sizeof(format)));
    }
    else if (type == VX_TYPE_TENSOR) {
        vx_size output_num_dims, output_dims[4] = { 1, 1, 1, 1 };
        ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
        ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &output_num_dims, sizeof(output_num_dims)));
        ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims[0])*output_num_dims));
        if (output_dims[2] != 1 && output_dims[2] != 2) // top_k must be 1 or 2
            return VX_ERROR_INVALID_DIMENSION;
        if(type == VX_TYPE_UINT8 && input_dims[2] > 255)
            return VX_ERROR_INVALID_FORMAT;
        if(type != VX_TYPE_UINT8 && type != VX_TYPE_UINT16 && type != VX_TYPE_INT16)
            return VX_ERROR_INVALID_FORMAT;
        ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DATA_TYPE, &type, sizeof(type)));
        ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_NUMBER_OF_DIMS, &output_num_dims, sizeof(output_num_dims)));
        ERROR_CHECK_STATUS(vxSetMetaFormatAttribute(metas[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims[0])*output_num_dims));
    }
    else
        return VX_ERROR_INVALID_PARAMETERS;

    return VX_SUCCESS;
}
Ejemplo n.º 4
0
VX_API_ENTRY vx_status VX_API_CALL vxSetParameterByIndex(vx_node node, vx_uint32 index, vx_reference value)
{
    vx_status status = VX_SUCCESS;
    vx_enum type = 0;
    vx_enum data_type = 0;

    if (vxIsValidSpecificReference(&node->base, VX_TYPE_NODE) == vx_false_e)
    {
        VX_PRINT(VX_ZONE_ERROR, "Supplied node was not actually a node\n");
        status = VX_ERROR_INVALID_REFERENCE;
        goto exit;
    }

    VX_PRINT(VX_ZONE_PARAMETER, "Attempting to set parameter[%u] on %s (enum:%d) to "VX_FMT_REF"\n",
                    index,
                    node->kernel->name,
                    node->kernel->enumeration,
                    value);

    /* is the index out of bounds? */
    if ((index >= node->kernel->signature.num_parameters) || (index >= VX_INT_MAX_PARAMS))
    {
        VX_PRINT(VX_ZONE_ERROR, "Invalid index %u\n", index);
        status = VX_ERROR_INVALID_VALUE;
        goto exit;
    }

    /* if it's an optional parameter, it's ok to be NULL */
    if ((value == 0) && (node->kernel->signature.states[index] == VX_PARAMETER_STATE_OPTIONAL))
    {
        status = VX_SUCCESS;
        goto exit;
    }

    /* if it's required, it's got to exist */
    if (vxIsValidReference((vx_reference_t *)value) == vx_false_e)
    {
        VX_PRINT(VX_ZONE_ERROR, "Supplied value was not actually a reference\n");
        status = VX_ERROR_INVALID_REFERENCE;
        goto exit;
    }

    /* if it was a valid reference then get the type from it */
    vxQueryReference(value, VX_REF_ATTRIBUTE_TYPE, &type, sizeof(type));
    VX_PRINT(VX_ZONE_PARAMETER, "Query returned type %08x for ref "VX_FMT_REF"\n", type, value);
    /* Check that signature type matches reference type*/
    if (node->kernel->signature.types[index] != type)
    {
        /* Check special case where signature is a specific scalar type.
           This can happen if the vxAddParameterToKernel() passes one of the scalar
           vx_type_e types instead of the more generic VX_TYPE_SCALAR since the spec
           doesn't specify that only VX_TYPE_SCALAR should be used for scalar types in
           this function. */
        if((type == VX_TYPE_SCALAR) &&
           (vxQueryScalar((vx_scalar)value, VX_SCALAR_ATTRIBUTE_TYPE, &data_type, sizeof(data_type)) == VX_SUCCESS))
        {
            if(data_type != node->kernel->signature.types[index])
            {
                VX_PRINT(VX_ZONE_ERROR, "Invalid scalar type 0x%08x!\n", data_type);
                status = VX_ERROR_INVALID_TYPE;
                goto exit;
            }
        }
        else
        {
            VX_PRINT(VX_ZONE_ERROR, "Invalid type 0x%08x!\n", type);
            status = VX_ERROR_INVALID_TYPE;
            goto exit;
        }
    }

    if (node->parameters[index])
    {
        if (node->parameters[index]->delay!=NULL) {
            // we already have a delay element here */
            vx_bool res = vxRemoveAssociationToDelay(node->parameters[index], node, index);
            if (res == vx_false_e) {
                VX_PRINT(VX_ZONE_ERROR, "Internal error removing delay association\n");
                status = VX_ERROR_INVALID_REFERENCE;
                goto exit;
            }
        }
    }

    if (value->delay!=NULL) {
        /* the new parameter is a delay element */
        vx_bool res = vxAddAssociationToDelay(value, node, index);
        if (res == vx_false_e) {
            VX_PRINT(VX_ZONE_ERROR, "Internal error adding delay association\n");
            status = VX_ERROR_INVALID_REFERENCE;
            goto exit;
        }
    }

    /* actual change of the node parameter */
    vxNodeSetParameter(node, index, value);

    /* if the node has a child graph, find out which parameter is this */
    if (node->child)
    {
        vx_uint32 p = 0;
        for (p = 0; p < node->child->numParams; p++)
        {
            if ((node->child->parameters[p].node == node) &&
                (node->child->parameters[p].index == index))
            {
                status = vxSetGraphParameterByIndex((vx_graph)node->child, p, value);
                break;
            }
        }
    }

exit:
    if (status == VX_SUCCESS)
    {
        VX_PRINT(VX_ZONE_PARAMETER, "Assigned Node[%u] %p type:%08x ref="VX_FMT_REF"\n",
                 index, node, type, value);
    }
    else
    {
        VX_PRINT(VX_ZONE_ERROR, "Specified: parameter[%u] type:%08x => "VX_FMT_REF"\n",
                        index, type, value);
        VX_PRINT(VX_ZONE_ERROR, "Required: parameter[%u] dir:%d type:%08x\n",
            index,
            node->kernel->signature.directions[index],
            node->kernel->signature.types[index]);
    }
    return status;
}
Ejemplo n.º 5
0
static VALUE Node_init(int argc, VALUE *args, VALUE self)
{
    vx_graph graph = 0;
    vx_kernel kernel = 0;
    VALUE w,h,f;
    Check_Type(self, T_DATA);

    if (argc <= 1)
        rb_raise(rb_eArgError, "Not enough arguments");

    graph = (vx_graph)DATA_PTR(args[0]);

    if (argc == 2) // Kernel
    {
        Check_Type(args[1], T_DATA);
        kernel = (vx_kernel)DATA_PTR(args[1]);
        DATA_PTR(self) = (void *)vxCreateNode(graph, kernel);
    }
    else if (argc == 3) // graph, [string|enum], array of hashes
    {
        vx_node node = 0;
        vx_uint32 p = 0;
        VALUE kern = args[1];
        VALUE array = args[2];
        long param = 0;

        if (TYPE(kern) == T_STRING)
            kernel = vxGetKernelByName(context, RSTRING(kern)->ptr);
        else if (TYPE(kern) == T_FIXNUM)
            kernel = vxGetKernelByEnum(context, FIX2INT(kern));
        else if (TYPE(kern) == T_DATA) // a OpenVX::Kernel
            kernel = (vx_kernel)DATA_PTR(kern);
        else
            rb_raise(rb_eTypeError, "kernel must be a string, fixnum, or OpenVX::Kernel");

        if (kernel == 0)
            rb_raise(rb_eNameError, "kernel could not be found in OpenVX");

        Check_Type(array, T_ARRAY);

        node = vxCreateNode(graph, kernel);
        if (node == 0)
            rb_raise(rb_eTypeError, "node could not be created!");

        REXT_PRINT("Array of parameters has len = %ld\n", RARRAY(array)->len);
        for (param = 0; param < RARRAY(array)->len ; param++)
        {
            VALUE dir,ref,hash;
            vx_reference ref2 = 0;
            vx_status status = 0;
            vx_enum type = VX_TYPE_INVALID;
            const char *name;

            hash = rb_ary_entry(array, param);
            Check_Type(hash, T_HASH);
            dir = rb_hash_aref(hash, ID2SYM(rb_intern("dir")));
            ref = rb_hash_aref(hash, ID2SYM(rb_intern("ref")));
            name = rb_obj_classname(ref);

            REXT_PRINT("rb_type(dir)=0x%x\n", TYPE(dir));
            REXT_PRINT("ref class = %s\n", name);

            Check_Type(dir, T_FIXNUM);
            Check_Type(ref, T_DATA);

            REXT_PRINT("dir=%ld\n", FIX2UINT(dir));

            ref2 = (vx_reference)DATA_PTR(ref);
            if (strcmp("OpenVX::Image", name) == 0)
                type = VX_TYPE_IMAGE;
            else if (strcmp("OpenVX::Buffer", name) == 0)
                type = VX_TYPE_BUFFER;
            else if (strcmp("OpenVX::Scalar", name) == 0)
                type = VX_TYPE_MAX;

            REXT_PRINT("vx type = %d (0x%08x)\n", type, type);
            if (type == VX_TYPE_IMAGE) // replace with format
                status = vxQueryImage(ref2, VX_QUERY_IMAGE_FORMAT, &type, sizeof(vx_fourcc));
            else if (type == VX_TYPE_MAX)
                status = vxQueryReference(ref2, VX_QUERY_REF_TYPE, &type, sizeof(type));
            REXT_PRINT("status = %d vx type = %d (0x%08x)\n", status, type, type);

            status = vxSetParameterByIndex(node, param, FIX2UINT(dir), type, ref2);
            REXT_PRINT("status = %d\n", status);

        }
        DATA_PTR(self) = (void *)node;
    }
    else
    {
        rb_raise(rb_eArgError, "incorrect number of arguments");
    }
    return Qnil;
}
Ejemplo n.º 6
0
CVxParameter * CreateDataObject(vx_context context, vx_graph graph, vx_reference ref, const char * params, vx_uint32 captureFrameStart)
{
	// create the object based on the ref
	vx_enum type;
	vx_status status = vxQueryReference(ref, VX_REF_ATTRIBUTE_TYPE, &type, sizeof(type));
	if (status) {
		printf("ERROR: CreateDataObject: vxQueryReference(*,VX_REF_ATTRIBUTE_TYPE,...) failed(%d)\n", status);
		throw -1;
	}
	if (type == VX_TYPE_IMAGE) {
		CVxParamImage *this_image = new CVxParamImage();
		this_image->SetCaptureFrameStart(captureFrameStart);
		if (this_image->InitializeIO(context, graph, ref, params))
			return NULL;
		return this_image;
	}
	else if (type == VX_TYPE_ARRAY) {
		CVxParamArray *this_array = new CVxParamArray();
		this_array->SetCaptureFrameStart(captureFrameStart);
		if (this_array->InitializeIO(context, graph, ref, params))
			return NULL;
		return this_array;
	}
	else if (type == VX_TYPE_PYRAMID) {
		CVxParamPyramid *this_pyramid = new CVxParamPyramid();
		this_pyramid->SetCaptureFrameStart(captureFrameStart);
		if (this_pyramid->InitializeIO(context, graph, ref, params))
			return NULL;
		return this_pyramid;
	}
	else if (type == VX_TYPE_DISTRIBUTION) {
		CVxParamDistribution *this_distribution = new CVxParamDistribution();
		this_distribution->SetCaptureFrameStart(captureFrameStart);
		if (this_distribution->InitializeIO(context, graph, ref, params))
			return NULL;
		return this_distribution;
	}
	else if (type == VX_TYPE_CONVOLUTION) {
		CVxParamConvolution *this_convolution = new CVxParamConvolution();
		this_convolution->SetCaptureFrameStart(captureFrameStart);
		if (this_convolution->InitializeIO(context, graph, ref, params))
			return NULL;
		return this_convolution;
	}
	else if (type == VX_TYPE_LUT) {
		CVxParamLUT *this_LUT = new CVxParamLUT();
		this_LUT->SetCaptureFrameStart(captureFrameStart);
		if (this_LUT->InitializeIO(context, graph, ref, params))
			return NULL;
		return this_LUT;
	}
	else if (type == VX_TYPE_MATRIX) {
		CVxParamMatrix *this_matrix = new CVxParamMatrix();
		this_matrix->SetCaptureFrameStart(captureFrameStart);
		if (this_matrix->InitializeIO(context, graph, ref, params))
			return NULL;
		return this_matrix;
	}
	else if (type == VX_TYPE_REMAP) {
		CVxParamRemap *this_remap = new CVxParamRemap();
		this_remap->SetCaptureFrameStart(captureFrameStart);
		if (this_remap->InitializeIO(context, graph, ref, params))
			return NULL;
		return this_remap;
	}
	else if (type == VX_TYPE_SCALAR) {
		CVxParamScalar *this_scalar = new CVxParamScalar();
		this_scalar->SetCaptureFrameStart(captureFrameStart);
		if (this_scalar->InitializeIO(context, graph, ref, params))
			return NULL;
		return this_scalar;
	}
	else if (type == VX_TYPE_THRESHOLD) {
		CVxParamThreshold *this_threshold = new CVxParamThreshold();
		this_threshold->SetCaptureFrameStart(captureFrameStart);
		if (this_threshold->InitializeIO(context, graph, ref, params))
			return NULL;
		return this_threshold;
	}
	else return nullptr;
}
//! \brief The OpenCL code generator callback.
static vx_status VX_CALLBACK opencl_codegen(
    vx_node node,                                  // [input] node
    const vx_reference parameters[],               // [input] parameters
    vx_uint32 num,                                 // [input] number of parameters
    bool opencl_load_function,                     // [input]  false: normal OpenCL kernel; true: reserved
    char opencl_kernel_function_name[64],          // [output] kernel_name for clCreateKernel()
    std::string& opencl_kernel_code,               // [output] string for clCreateProgramWithSource()
    std::string& opencl_build_options,             // [output] options for clBuildProgram()
    vx_uint32& opencl_work_dim,                    // [output] work_dim for clEnqueueNDRangeKernel()
    vx_size opencl_global_work[],                  // [output] global_work[] for clEnqueueNDRangeKernel()
    vx_size opencl_local_work[],                   // [output] local_work[] for clEnqueueNDRangeKernel()
    vx_uint32& opencl_local_buffer_usage_mask,     // [output] reserved: must be ZERO
    vx_uint32& opencl_local_buffer_size_in_bytes   // [output] reserved: must be ZERO
    )
{
    // get configuration
    vx_size num_dims, input_dims[4] = { 1, 1, 1, 1 }, top_k = 1;
    vx_enum output_obj_type, output_data_type = VX_TYPE_UINT16;
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_NUMBER_OF_DIMS, &num_dims, sizeof(num_dims)));
    ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[0], VX_TENSOR_DIMS, input_dims, sizeof(input_dims[0])*num_dims));
    ERROR_CHECK_STATUS(vxQueryReference(parameters[1], VX_REFERENCE_TYPE, &output_obj_type, sizeof(output_obj_type)));
    if(output_obj_type == VX_TYPE_IMAGE) {
        vx_df_image format;
        ERROR_CHECK_STATUS(vxQueryImage((vx_image)parameters[1], VX_IMAGE_FORMAT, &format, sizeof(format)));
        if(format == VX_DF_IMAGE_U8)
            output_data_type = VX_TYPE_UINT8;
        else if(format == VX_DF_IMAGE_U16)
            output_data_type = VX_TYPE_UINT16;
    }
    else {
        vx_size num_dims_output, output_dims[4] = { 1, 1, 1, 1 };
        ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_NUMBER_OF_DIMS, &num_dims_output, sizeof(num_dims_output)));
        ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DIMS, output_dims, sizeof(output_dims[0])*num_dims_output));
        ERROR_CHECK_STATUS(vxQueryTensor((vx_tensor)parameters[1], VX_TENSOR_DATA_TYPE, &output_data_type, sizeof(output_data_type)));
        top_k = output_dims[2];
    }
    size_t N = input_dims[3];

    // compute global work
    opencl_work_dim = 3;
    opencl_local_work[0] = 8;
    opencl_local_work[1] = 8;
    opencl_local_work[2] = 1;
    opencl_global_work[0] = ((input_dims[0] + 3) / 4  + opencl_local_work[0] - 1) & ~(opencl_local_work[0] - 1);
    opencl_global_work[1] = (input_dims[1] + opencl_local_work[1] - 1) & ~(opencl_local_work[1] - 1);
    opencl_global_work[2] = N;

    // generate OpenCL C code
    strcpy(opencl_kernel_function_name, "argmax");
    char item[8192];
    sprintf(item,
        "#pragma OPENCL EXTENSION cl_amd_media_ops : enable\n"
        "__kernel __attribute__((reqd_work_group_size(%ld, %ld, 1)))\n" // opencl_local_work[0] opencl_local_work[1]
        "void %s(__global uchar * i0_buf, uint i0_offset, uint4 i0_stride, %s)\n"
        "{\n"
        "    uint x = get_global_id(0) * 4;\n"
        "    uint y = get_global_id(1);\n"
        "    uint z = get_global_id(2);\n"
        "    if(x < %ld && y < %ld) {\n"
        "        i0_buf += i0_offset + z * i0_stride.s3 + y * i0_stride.s1 + x * i0_stride.s0;\n"
        "        uint4 cmax;\n"
        , opencl_local_work[0], opencl_local_work[1], opencl_kernel_function_name
        , (output_obj_type == VX_TYPE_IMAGE) ?
            "uint o0_width, uint o0_height, __global uchar * o0_buf, uint o0_stride, uint o0_offset" :
            "__global uchar * o0_buf, uint o0_offset, uint4 o0_stride"
        , input_dims[0], input_dims[1]);
    opencl_kernel_code = item;
    if(top_k == 2) {
        sprintf(item,
            "        uint4 cmax1;\n"
            "        float4 f, fmax, fmax1;\n"
            "        fmax = *(__global float4 *)i0_buf;\n"
            "        i0_buf += i0_stride.s2; f = *(__global float4 *)i0_buf;\n"
            "        cmax1.s0 = (f.s0 > fmax.s0) ? 0 : 1;\n"
            "         cmax.s0 = (f.s0 > fmax.s0) ? 1 : 0;\n"
            "        fmax1.s0 = (f.s0 > fmax.s0) ? fmax.s0 :    f.s0;\n"
            "         fmax.s0 = (f.s0 > fmax.s0) ?    f.s0 : fmax.s0;\n"
            "        cmax1.s1 = (f.s1 > fmax.s1) ? 0 : 1;\n"
            "         cmax.s1 = (f.s1 > fmax.s1) ? 1 : 0;\n"
            "        fmax1.s1 = (f.s1 > fmax.s1) ? fmax.s1 :    f.s1;\n"
            "         fmax.s1 = (f.s1 > fmax.s1) ?    f.s1 : fmax.s1;\n"
            "        cmax1.s2 = (f.s2 > fmax.s2) ? 0 : 1;\n"
            "         cmax.s2 = (f.s2 > fmax.s2) ? 1 : 0;\n"
            "        fmax1.s2 = (f.s2 > fmax.s2) ? fmax.s2 :    f.s2;\n"
            "         fmax.s2 = (f.s2 > fmax.s2) ?    f.s2 : fmax.s2;\n"
            "        cmax1.s3 = (f.s3 > fmax.s3) ? 0 : 1;\n"
            "         cmax.s3 = (f.s3 > fmax.s3) ? 1 : 0;\n"
            "        fmax1.s3 = (f.s3 > fmax.s3) ? fmax.s3 :    f.s3;\n"
            "         fmax.s3 = (f.s3 > fmax.s3) ?    f.s3 : fmax.s3;\n"
            "        for(uint c = 2; c < %ld; c++) {\n"
            "            i0_buf += i0_stride.s2; f = *(__global float4 *)i0_buf;\n"
            "            cmax1.s0 = (f.s0 > fmax.s0) ? cmax.s0 : ((f.s0 > fmax1.s0) ? c    : cmax1.s0);\n"
            "            fmax1.s0 = (f.s0 > fmax.s0) ? fmax.s0 : ((f.s0 > fmax1.s0) ? f.s0 : fmax1.s0);\n"
            "            cmax.s0  = (f.s0 > fmax.s0) ? c    : cmax.s0;\n"
            "            fmax.s0  = (f.s0 > fmax.s0) ? f.s0 : fmax.s0;\n"
            "            cmax1.s1 = (f.s1 > fmax.s1) ? cmax.s1 : ((f.s1 > fmax1.s1) ? c    : cmax1.s1);\n"
            "            fmax1.s1 = (f.s1 > fmax.s1) ? fmax.s1 : ((f.s1 > fmax1.s1) ? f.s1 : fmax1.s1);\n"
            "            cmax.s1  = (f.s1 > fmax.s1) ? c    : cmax.s1;\n"
            "            fmax.s1  = (f.s1 > fmax.s1) ? f.s1 : fmax.s1;\n"
            "            cmax1.s2 = (f.s2 > fmax.s2) ? cmax.s2 : ((f.s2 > fmax1.s2) ? c    : cmax1.s2);\n"
            "            fmax1.s2 = (f.s2 > fmax.s2) ? fmax.s2 : ((f.s2 > fmax1.s2) ? f.s2 : fmax1.s2);\n"
            "            cmax.s2  = (f.s2 > fmax.s2) ? c    : cmax.s2;\n"
            "            fmax.s2  = (f.s2 > fmax.s2) ? f.s2 : fmax.s2;\n"
            "            cmax1.s3 = (f.s3 > fmax.s3) ? cmax.s3 : ((f.s3 > fmax1.s3) ? c    : cmax1.s3);\n"
            "            fmax1.s3 = (f.s3 > fmax.s3) ? fmax.s3 : ((f.s3 > fmax1.s3) ? f.s3 : fmax1.s3);\n"
            "            cmax.s3  = (f.s3 > fmax.s3) ? c    : cmax.s3;\n"
            "            fmax.s3  = (f.s3 > fmax.s3) ? f.s3 : fmax.s3;\n"
            "        }\n"
            , input_dims[2]);
        opencl_kernel_code += item;
    }
    else if (top_k == 1) {
        sprintf(item,
            "        cmax = (uint4)0;\n"
            "        float4 fmax = *(__global float4 *)i0_buf;\n"
            "        for(uint c = 1; c < %ld; c++) {\n"
            "            i0_buf += i0_stride.s2;\n"
            "            float4 f = *(__global float4 *)i0_buf;\n"
            "            cmax.s0 = (f.s0 > fmax.s0) ? c    : cmax.s0;\n"
            "            fmax.s0 = (f.s0 > fmax.s0) ? f.s0 : fmax.s0;\n"
            "            cmax.s1 = (f.s1 > fmax.s1) ? c    : cmax.s1;\n"
            "            fmax.s1 = (f.s1 > fmax.s1) ? f.s1 : fmax.s1;\n"
            "            cmax.s2 = (f.s2 > fmax.s2) ? c    : cmax.s2;\n"
            "            fmax.s2 = (f.s2 > fmax.s2) ? f.s2 : fmax.s2;\n"
            "            cmax.s3 = (f.s3 > fmax.s3) ? c    : cmax.s3;\n"
            "            fmax.s3 = (f.s3 > fmax.s3) ? f.s3 : fmax.s3;\n"
            "        }\n"
            , input_dims[2]);
        opencl_kernel_code += item;
    }
    if(output_data_type == VX_TYPE_UINT8) {
        if(output_obj_type == VX_TYPE_IMAGE) {
            sprintf(item, "        o0_buf += o0_offset + (z * %ld + y) * o0_stride + x;\n" , input_dims[1]);
            opencl_kernel_code += item;
        }
        else {
            opencl_kernel_code +=
                "        o0_buf += o0_offset + z * o0_stride.s3 + y * o0_stride.s1 + x * o0_stride.s0;\n";
        }
        opencl_kernel_code +=
            "        uint imax = cmax.s0 + (cmax.s1 << 8) + (cmax.s2 << 16) + (cmax.s3 << 24);\n"
            "        *(__global uint *)o0_buf = imax;\n";
        if(top_k == 2) {
            opencl_kernel_code +=
                "        uint imax1 = cmax1.s0 + (cmax1.s1 << 8) + (cmax1.s2 << 16) + (cmax1.s3 << 24);\n"
                "        *(__global uint *)&o0_buf[o0_stride.s2] = imax1;\n";
        }
    }
    else if(output_data_type == VX_TYPE_UINT16) {
        if(output_obj_type == VX_TYPE_IMAGE) {
            sprintf(item, "        o0_buf += o0_offset + (z * %ld + y) * o0_stride + x * 2;\n" , input_dims[1]);
            opencl_kernel_code += item;
        }
        else {
            opencl_kernel_code +=
                "        o0_buf += o0_offset + z * o0_stride.s3 + y * o0_stride.s1 + x * o0_stride.s0;\n";
        }
        opencl_kernel_code +=
            "        uint2 imax;\n"
            "        imax.s0 = cmax.s0 + (cmax.s1 << 16);\n"
            "        imax.s1 = cmax.s2 + (cmax.s3 << 16);\n"
            "        *(__global uint2 *)o0_buf = imax;\n";
        if(top_k == 2) {
            opencl_kernel_code +=
                "        uint2 imax1;\n"
                "        imax1.s0 = cmax1.s0 + (cmax1.s1 << 16);\n"
                "        imax1.s1 = cmax1.s2 + (cmax1.s3 << 16);\n"
                "        *(__global uint2 *)&o0_buf[o0_stride.s2] = imax1;\n";
        }
    }
    opencl_kernel_code +=
        "    }\n"
        "}\n";

#if ENABLE_DEBUG_PRINT_DIMS
    std::cout << "KERNEL argmax_layer output " << input_dims[0] << "x" << input_dims[1] << " " << std::endl;
#endif

    return VX_SUCCESS;
}