void cleanup() {
        // Refuse to tear down a fixture that never initialised correctly.
        if( !is_valid() )
            throw std::runtime_error( error_ );

        // Workflow items must be released before the workflow that owns them.
        for( auto item : workflow_layer )
            di->workflow_item_delete_function( item );

        di->workflow_delete_function( workflow );
    }
    void cleanup() {
        // Precondition: the fixture must have been constructed successfully.
        if(!is_valid()) throw std::runtime_error(error_);

        // Release every workflow item before the workflow that references them.
        for(auto wl : workflow_layer)
            di->workflow_item_delete_function(wl);

        di->workflow_delete_function(workflow);

        // Free the factor buffers. `delete` on a null pointer is a well-defined
        // no-op, so the previous `!= nullptr` guard was redundant.
        for(auto wb : workflow_layer_factor)
            delete wb;
    }
    void cleanup(){
        if(!is_valid()) throw std::runtime_error(error_);

        /* ****************************************************************************************** */
        /* Cleanup in memory                                                                          */
        /* ****************************************************************************************** */
        std::cout
            << "Cleanup in memory"
            << std::endl
            << "========================================================"
            << std::endl;

        di->workflow_item_delete_function(wrkflwi_input);
        di->workflow_item_delete_function(wrkflwi_stage_1_conv);
        di->workflow_item_delete_function(wrkflwi_stage_1_pool);
        di->workflow_item_delete_function(wrkflwi_stage_1_subv);
        di->workflow_item_delete_function(wrkflwi_stage_2_conv);
        di->workflow_item_delete_function(wrkflwi_stage_2_pool);
        di->workflow_item_delete_function(wrkflwi_stage_3_fc);
        di->workflow_item_delete_function(wrkflwi_stage_4_fc);
        di->workflow_item_delete_function(wrkflwi_softmax);
        di->workflow_item_delete_function(wrkflwi_output);

        di->workflow_delete_function(workflow);

        delete nnwrkld_conv1_weights;
        delete nnwrkld_conv1_biases;
        delete nnwrkld_conv2_weights;
        delete nnwrkld_conv2_biases;
        delete nnwrkld_fc1_weights;
        delete nnwrkld_fc1_biases;
        delete nnwrkld_fc2_weights;
        delete nnwrkld_fc2_biases;
        delete di;
    }
    // Builds a minimal 3-stage workflow (input -> relu -> output) on the given
    // device interface and returns the created workflow. Throws if the fixture
    // is not valid.
    virtual nn_workflow_t *init_test_workflow( nn_device_interface_0_t *_di ) {

        if(!is_valid()) throw std::runtime_error( error_ );

        // BUGFIX: iterate by reference — the previous `for(auto wi : ...)`
        // assigned nullptr to a copy of each element, leaving the array unchanged.
        for(auto &wi : workflow_layer) wi = nullptr;

        this->di = _di;

        di->workflow_create_function( &workflow, 1, 1 );

        // STAGE 0 (input): 1D buffer of relu_length elements.
        {
            di->workflow_item_create_function( &workflow_layer[input], 0, nullptr, 1 );

            workflow_layer[input]->type = NN_WORK_ITEM_TYPE_INPUT;
            workflow_layer[input]->arguments.input.index = 0;
            workflow_layer[input]->output_format[0].format = NN_DATA_FORMAT_1D;
            workflow_layer[input]->output_format[0].format_1d = { { relu_length } };
        }

        // STAGE 1 relu: element-wise ReLU, same 1D shape as the input.
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[input], 0 };
            di->workflow_item_create_function( &workflow_layer[relu], 1, &inputs_descriptor, 1 );

            workflow_layer[relu]->type = NN_WORK_ITEM_TYPE_RELU;

            workflow_layer[relu]->output_format[0].format = NN_DATA_FORMAT_1D;
            workflow_layer[relu]->output_format[0].format_1d = { { relu_length } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 2 output
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[relu], 0 };
            di->workflow_item_create_function( &workflow_layer[output], 1, &inputs_descriptor, 1 );

            workflow_layer[output]->type = NN_WORK_ITEM_TYPE_OUTPUT;

            workflow_layer[output]->output_format[0].format = NN_DATA_FORMAT_1D;
            // BUGFIX: the format is 1D, so initialise the format_1d union member
            // (the original wrote format_3d, mismatching the declared format).
            workflow_layer[output]->output_format[0].format_1d = { { relu_length } };
        }

        // -------------------------------------------------------------------------------------------
        // END of workflow stages definition — wire up workflow endpoints.
        workflow->input[0] = workflow_layer[input];
        workflow->output[0] = workflow_layer[output];
        // -------------------------------------------------------------------------------------------

        return workflow;
    }
    void cleanup() {

        // Precondition: the fixture must have been constructed successfully.
        if(!is_valid()) throw std::runtime_error(error_);

        // Release workflow items before the workflow that references them.
        for(auto wl : workflow_layer)
            di->workflow_item_delete_function(wl);

        di->workflow_delete_function(workflow);

        // Free all weight/bias buffers. `delete` on a null pointer is a
        // well-defined no-op, so the previous `!= nullptr` guards were redundant.
        for(auto wlwf : workflow_layer_weights_float)
            delete wlwf;

        for(auto wlwi : workflow_layer_weights_int16)
            delete wlwi;

        for(auto wlbi : workflow_layer_biases_int32)
            delete wlbi;

        for(auto wlbf : workflow_layer_biases_float)
            delete wlbf;

        delete mean_factor;
    }
    // Builds the LeNet workflow (conv/pool x2, two FC layers, softmax) on the
    // given device interface, loading weights and biases from weights_lenet/.
    // Returns the created workflow; on a failed weight load it returns the
    // current (unbuilt) workflow member instead. Throws std::runtime_error if
    // the fixture is not valid.
    virtual nn_workflow_t *init_workflow(nn_device_interface_0_t *di){

        if(!is_valid()) throw std::runtime_error(error_);

        this->di = di;

        std::cout
            << "--------------------------------------------------------"
            << std::endl
            << "Loading weights and biases"
            << std::endl << std::endl;

        // Load weights and biases
        auto load_biases_or_weights = [](std::string wb_file_name) {
            nn::data<float> *wb_pointer = nn_data_load_from_file_time_measure(wb_file_name);
            if(wb_pointer == nullptr) {
                std::cerr << "Can't load " << wb_file_name << std::endl;
                // BUGFIX: a bare `throw;` with no active exception calls
                // std::terminate(), so the catch(...) below could never run.
                // Throw a real exception object instead.
                throw std::runtime_error("failed to load weights/biases file: " + wb_file_name);
            }
            return wb_pointer;
        };

        try {
            nnwrkld_conv1_weights = load_biases_or_weights("weights_lenet/conv1.nn");
            nnwrkld_conv1_biases = load_biases_or_weights("weights_lenet/conv1_bias.nn");
            nnwrkld_conv2_weights = load_biases_or_weights("weights_lenet/conv2.nn");
            nnwrkld_conv2_biases = load_biases_or_weights("weights_lenet/conv2_bias.nn");
            nnwrkld_fc1_weights = load_biases_or_weights("weights_lenet/ip1.nn");
            nnwrkld_fc1_biases = load_biases_or_weights("weights_lenet/ip1_bias.nn");
            nnwrkld_fc2_weights = load_biases_or_weights("weights_lenet/ip2.nn");
            nnwrkld_fc2_biases = load_biases_or_weights("weights_lenet/ip2_bias.nn");
        }
        catch(...) {
            // Bail out without building the workflow; caller sees it unbuilt.
            return workflow;
        }

        std::cout
            << "--------------------------------------------------------" << std::endl
            << "Build of workflow" << std::endl;

        di->workflow_create_function(&workflow, 1, 1);

        // ------------------------------------------------------------------------------------------
        // STAGE 0 (input)
        //         output: 28x28
        {
            di->workflow_item_create_function(&wrkflwi_input, 0, nullptr, 1);

            wrkflwi_input->type = NN_WORK_ITEM_TYPE_INPUT;
            wrkflwi_input->arguments.input.index = 0;
            wrkflwi_input->output_format[0].format = NN_DATA_FORMAT_2D;
            // BUGFIX: the format is 2D, so initialise the format_2d union member
            // (the original wrote format_3d, mismatching the declared format).
            wrkflwi_input->output_format[0].format_2d ={ { img_size, img_size } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 01
        //           convo: 5x5 stride 1x1; no-activation; output: 24x24x20
        //         maxpool: 2x2 stride 2x2;
        //          output: 12x12x20
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { wrkflwi_input, 0 };
            di->workflow_item_create_function(&wrkflwi_stage_1_conv, 1, &inputs_descriptor, 1);

            wrkflwi_stage_1_conv->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
            wrkflwi_stage_1_conv->name = "c1";

            wrkflwi_stage_1_conv->arguments.forward_convolution.padding = NN_PADDING_MODE_DATA_OR_ZERO;
            wrkflwi_stage_1_conv->arguments.forward_convolution.activation.function = NN_ACTIVATION_FUNCTION_NONE;

            // We have weights, biases for 20 filters , but we want to have for four more filters so lets add padding
            wrkflwi_stage_1_conv->arguments.forward_convolution.weights = nn_data_extend_weights_by_padding(nnwrkld_conv1_weights,1,24);
            wrkflwi_stage_1_conv->arguments.forward_convolution.biases = nn_data_extend_biases_by_padding(nnwrkld_conv1_biases,24);

            wrkflwi_stage_1_conv->arguments.forward_convolution.center_offset[0] = 0;
            wrkflwi_stage_1_conv->arguments.forward_convolution.center_offset[1] = 0;

            wrkflwi_stage_1_conv->arguments.forward_convolution.stride[0] = 1;
            wrkflwi_stage_1_conv->arguments.forward_convolution.stride[1] = 1;

            wrkflwi_stage_1_conv->output_format[0].format = NN_DATA_FORMAT_3D;
            // It should be 20 output FM , but we do support only case when output FM number is divisble by 8
            wrkflwi_stage_1_conv->output_format[0].format_3d ={ { 24, 24, 24 } };
        }

        // maxpool: 2x2 stride 2x2
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { wrkflwi_stage_1_conv, 0 };
            di->workflow_item_create_function(&wrkflwi_stage_1_pool, 1, &inputs_descriptor, 1);

            wrkflwi_stage_1_pool->type = NN_WORK_ITEM_TYPE_POOLING;
            wrkflwi_stage_1_pool->name = "p1";

            wrkflwi_stage_1_pool->arguments.forward_pooling.mode = NN_POOLING_MODE_MAX;
            wrkflwi_stage_1_pool->arguments.forward_pooling.size[0] = 2;
            wrkflwi_stage_1_pool->arguments.forward_pooling.size[1] = 2;
            wrkflwi_stage_1_pool->arguments.forward_pooling.stride[0] = 2;
            wrkflwi_stage_1_pool->arguments.forward_pooling.stride[1] = 2;

            wrkflwi_stage_1_pool->output_format[0].format = NN_DATA_FORMAT_3D;
            wrkflwi_stage_1_pool->output_format[0].format_3d ={ { 12, 12, 24 } };
        }

        // view: crop the padded 24 feature maps back down to the real 20
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { wrkflwi_stage_1_pool, 0 };
            di->workflow_item_create_function(&wrkflwi_stage_1_subv, 1, &inputs_descriptor, 1); // view

            wrkflwi_stage_1_subv->type = NN_WORK_ITEM_TYPE_VIEW;
            wrkflwi_stage_1_subv->arguments.view.origin[0] = 0;
            wrkflwi_stage_1_subv->arguments.view.origin[1] = 0;
            wrkflwi_stage_1_subv->arguments.view.origin[2] = 0;

            wrkflwi_stage_1_subv->output_format[0].format = NN_DATA_FORMAT_3D;
            wrkflwi_stage_1_subv->output_format[0].format_3d ={ { 12, 12, 20 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 02
        //           convo: 5x5 stride 1x1; no-activation; output: 8x8x50
        //         maxpool: 2x2 stride 2x2;
        //          output: 4x4x50

        // convolution 2
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { wrkflwi_stage_1_subv, 0 };
            di->workflow_item_create_function(&wrkflwi_stage_2_conv, 1, &inputs_descriptor, 1);

            wrkflwi_stage_2_conv->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
            wrkflwi_stage_2_conv->name = "c2";

            wrkflwi_stage_2_conv->arguments.forward_convolution.activation.function = NN_ACTIVATION_FUNCTION_NONE;
            wrkflwi_stage_2_conv->arguments.forward_convolution.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            // Pad 50 filters (depth 20) up to 56 filters (depth 20) to keep FM count divisible by 8.
            wrkflwi_stage_2_conv->arguments.forward_convolution.weights = nn_data_extend_weights_by_padding(nnwrkld_conv2_weights,20,56);
            wrkflwi_stage_2_conv->arguments.forward_convolution.biases = nn_data_extend_biases_by_padding(nnwrkld_conv2_biases,56);

            wrkflwi_stage_2_conv->arguments.forward_convolution.center_offset[0] = 0;
            wrkflwi_stage_2_conv->arguments.forward_convolution.center_offset[1] = 0;

            wrkflwi_stage_2_conv->arguments.forward_convolution.stride[0] = 1;
            wrkflwi_stage_2_conv->arguments.forward_convolution.stride[1] = 1;

            wrkflwi_stage_2_conv->output_format[0].format = NN_DATA_FORMAT_3D;
            // It should be 50 output FM , but we do support only case when output FM number is divisble by 8
            wrkflwi_stage_2_conv->output_format[0].format_3d ={ { 8, 8, 56 } };
        }

        // maxpool: 2x2 stride 2x2;
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { wrkflwi_stage_2_conv, 0 };
            di->workflow_item_create_function(&wrkflwi_stage_2_pool, 1, &inputs_descriptor, 1); // pooling

            wrkflwi_stage_2_pool->type = NN_WORK_ITEM_TYPE_POOLING;
            wrkflwi_stage_2_pool->name = "p2";

            wrkflwi_stage_2_pool->arguments.forward_pooling.mode = NN_POOLING_MODE_MAX;

            wrkflwi_stage_2_pool->arguments.forward_pooling.size[0] = 2;
            wrkflwi_stage_2_pool->arguments.forward_pooling.size[1] = 2;

            wrkflwi_stage_2_pool->arguments.forward_pooling.stride[0] = 2;
            wrkflwi_stage_2_pool->arguments.forward_pooling.stride[1] = 2;

            wrkflwi_stage_2_pool->output_format[0].format = NN_DATA_FORMAT_3D;
            wrkflwi_stage_2_pool->output_format[0].format_3d ={ { 4, 4, 56 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 03
        //            full: ReLU
        //          output: 500
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { wrkflwi_stage_2_pool, 0 };
            di->workflow_item_create_function(&wrkflwi_stage_3_fc, 1, &inputs_descriptor, 1);

            wrkflwi_stage_3_fc->type = NN_WORK_ITEM_TYPE_FULLY_CONNECTED;
            wrkflwi_stage_3_fc->name = "fc1";

            wrkflwi_stage_3_fc->arguments.forward_fully_connected.activation.function = NN_ACTIVATION_FUNCTION_RELU;

            // Generated weights if taken from caffe , are in 2D format while we need them in 4d format
            nn::data<float>* nnwrkld_fc1_converted_weights = nn_data_convert_weights_2D_to_4D(nnwrkld_fc1_weights,
                                                                                              4,
                                                                                              4,
                                                                                              50,
                                                                                              nnwrkld_fc1_weights->size[1]);
            // release original weights
            delete nnwrkld_fc1_weights;
            // Extend weights' depth of FC layer to match extended weights input
            nnwrkld_fc1_weights = nn_data_extend_weights_by_padding(nnwrkld_fc1_converted_weights,56,nnwrkld_fc1_converted_weights->size[3]);
            delete nnwrkld_fc1_converted_weights;
            nnwrkld_fc1_converted_weights = nullptr;

            wrkflwi_stage_3_fc->arguments.forward_fully_connected.weights = nnwrkld_fc1_weights;
            wrkflwi_stage_3_fc->arguments.forward_fully_connected.biases = nnwrkld_fc1_biases;

            wrkflwi_stage_3_fc->output_format[0].format = NN_DATA_FORMAT_1D;
            wrkflwi_stage_3_fc->output_format[0].format_1d ={ { 500 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 04
        //            full: no activation
        //          output: 10
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { wrkflwi_stage_3_fc, 0 };
            di->workflow_item_create_function(&wrkflwi_stage_4_fc, 1, &inputs_descriptor, 1);

            wrkflwi_stage_4_fc->type = NN_WORK_ITEM_TYPE_FULLY_CONNECTED;
            wrkflwi_stage_4_fc->name = "fc2";

            wrkflwi_stage_4_fc->arguments.forward_fully_connected.activation.function = NN_ACTIVATION_FUNCTION_NONE;

            wrkflwi_stage_4_fc->arguments.forward_fully_connected.weights = nnwrkld_fc2_weights;
            wrkflwi_stage_4_fc->arguments.forward_fully_connected.biases = nnwrkld_fc2_biases;

            wrkflwi_stage_4_fc->output_format[0].format = NN_DATA_FORMAT_1D;
            wrkflwi_stage_4_fc->output_format[0].format_1d ={ { 10 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 05 (softmax)
        //          output: 10
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { wrkflwi_stage_4_fc, 0 };
            di->workflow_item_create_function(&wrkflwi_softmax, 1, &inputs_descriptor, 1);

            wrkflwi_softmax->type = NN_WORK_ITEM_TYPE_SOFTMAX;

            wrkflwi_softmax->output_format[0].format = NN_DATA_FORMAT_1D;
            wrkflwi_softmax->output_format[0].format_1d ={ { 10 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 6 (output)
        //          output: 10
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { wrkflwi_softmax, 0 };
            di->workflow_item_create_function(&wrkflwi_output, 1, &inputs_descriptor, 1);

            wrkflwi_output->type = NN_WORK_ITEM_TYPE_OUTPUT;

            wrkflwi_output->output_format[0].format = NN_DATA_FORMAT_1D;
            wrkflwi_output->output_format[0].format_1d ={ { 10 } };
        }

        // -------------------------------------------------------------------------------------------
        // END of workflow stages definition — wire up workflow endpoints.
        // -------------------------------------------------------------------------------------------
        workflow->input[0] = wrkflwi_input;
        workflow->output[0] = wrkflwi_output;
        // -------------------------------------------------------------------------------------------

        return workflow;
    }
    virtual nn_workflow_t *init_test_workflow(nn_device_interface_0_t *_di) {

        if(!is_valid()) throw std::runtime_error(error_);

        this->di = _di;

            // load nn:data factors (weights and biases) for successive layers
            mean_factor = nn_data_load_from_file("weights_caffenet/imagenet_mean.nnd");
            workflow_layer_weights_float[conv1_factor] = nn_data_load_from_file("weights_caffenet/conv1.nnd");
            workflow_layer_biases_float[conv1_factor] = nn_data_load_from_file("weights_caffenet/conv1_bias.nnd");
            workflow_layer_weights_float[conv2_1_factor] = nn_data_load_from_file("weights_caffenet/conv2_g1.nnd");
            workflow_layer_biases_float[conv2_1_factor] = nn_data_load_from_file("weights_caffenet/conv2_bias_g1.nnd");
            workflow_layer_weights_float[conv2_2_factor] = nn_data_load_from_file("weights_caffenet/conv2_g2.nnd");
            workflow_layer_biases_float[conv2_2_factor] = nn_data_load_from_file("weights_caffenet/conv2_bias_g2.nnd");
            workflow_layer_weights_float[conv3_factor] = nn_data_load_from_file("weights_caffenet/conv3.nnd");
            workflow_layer_biases_float[conv3_factor] = nn_data_load_from_file("weights_caffenet/conv3_bias.nnd");
            workflow_layer_weights_float[conv4_1_factor] = nn_data_load_from_file("weights_caffenet/conv4_g1.nnd");
            workflow_layer_biases_float[conv4_1_factor] = nn_data_load_from_file("weights_caffenet/conv4_bias_g1.nnd");
            workflow_layer_weights_float[conv4_2_factor] = nn_data_load_from_file("weights_caffenet/conv4_g2.nnd");
            workflow_layer_biases_float[conv4_2_factor] = nn_data_load_from_file("weights_caffenet/conv4_bias_g2.nnd");
            workflow_layer_weights_float[conv5_1_factor] = nn_data_load_from_file("weights_caffenet/conv5_g1.nnd");
            workflow_layer_biases_float[conv5_1_factor] = nn_data_load_from_file("weights_caffenet/conv5_bias_g1.nnd");
            workflow_layer_weights_float[conv5_2_factor] = nn_data_load_from_file("weights_caffenet/conv5_g2.nnd");
            workflow_layer_biases_float[conv5_2_factor] = nn_data_load_from_file("weights_caffenet/conv5_bias_g2.nnd");
            workflow_layer_weights_float[fc6_factor] = nn_data_load_from_file("weights_caffenet/fc6.nnd");
            workflow_layer_biases_float[fc6_factor] = nn_data_load_from_file("weights_caffenet/fc6_bias.nnd");
            workflow_layer_weights_float[fc7_factor] = nn_data_load_from_file("weights_caffenet/fc7.nnd");
            workflow_layer_biases_float[fc7_factor] = nn_data_load_from_file("weights_caffenet/fc7_bias.nnd");
            workflow_layer_weights_float[fc8_factor] = nn_data_load_from_file("weights_caffenet/fc8.nnd");
            workflow_layer_biases_float[fc8_factor] = nn_data_load_from_file("weights_caffenet/fc8_bias.nnd");

            for (auto wlwf : workflow_layer_weights_float)
               if (wlwf == nullptr)
                  throw  std::runtime_error("error: one or more of file with weights was not loaded");
            for (auto wlbf : workflow_layer_biases_float)
               if (wlbf == nullptr)
                  throw  std::runtime_error("error: one or more of file with biases was not loaded");

        di->workflow_create_function(&workflow,1,1);

        //                                                            { c1    c2_1  c2_2  c3    c4_1  c4_2  c5_1  c5_2  fc6   fc7   fc8   }
        const size_t nnwrkld_accumulator_fraction[last_factor+1]    = { 16,   19,   17,   22,   22,   22,   23,   22,   24,   26,   24    };
        const size_t nnwrkld_output_fraction[last_factor+1]         = { 3,    7,    7,    6,    7,    7,    8,    8,    10,   12,   26    };
        const size_t nnwrkld_weights_float_fraction[last_factor+1]  = { 16,   16,   14,   15,   16,   16,   16,   15,   16,   16,   12    };
        const size_t nnwrkld_biases_float_fraction[last_factor+1]   = {nnwrkld_accumulator_fraction[conv1_factor],
                                                                       nnwrkld_accumulator_fraction[conv2_1_factor],
                                                                       nnwrkld_accumulator_fraction[conv2_2_factor],
                                                                       nnwrkld_accumulator_fraction[conv3_factor],
                                                                       nnwrkld_accumulator_fraction[conv4_1_factor],
                                                                       nnwrkld_accumulator_fraction[conv4_2_factor],
                                                                       nnwrkld_accumulator_fraction[conv5_1_factor],
                                                                       nnwrkld_accumulator_fraction[conv5_2_factor],
                                                                       nnwrkld_accumulator_fraction[fc6_factor],
                                                                       nnwrkld_accumulator_fraction[fc7_factor],
                                                                       nnwrkld_accumulator_fraction[fc8_factor]
                                                                      };
        for(auto i = 0; i<=last_factor;++i) {
            workflow_layer_weights_int16[i] = new nn::data<int16_t>(static_cast<const size_t*>(workflow_layer_weights_float[i]->size),workflow_layer_weights_float[i]->dimension);
            workflow_layer_biases_int32[i] = new nn::data<int32_t>(static_cast<const size_t*>(workflow_layer_biases_float[i]->size),workflow_layer_biases_float[i]->dimension);
            nn_data_convert_float_to_int16_fixedpoint(workflow_layer_weights_float[i],workflow_layer_weights_int16[i],1 << nnwrkld_weights_float_fraction[i]);
            nn_data_convert_float_to_int32_fixedpoint(workflow_layer_biases_float[i],workflow_layer_biases_int32[i],1 << nnwrkld_biases_float_fraction[i]);
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 0 (input)
        //         output: 227x227x3
        {
            di->workflow_item_create_function(&workflow_layer[input],0,nullptr,1);

            workflow_layer[input]->type = NN_WORK_ITEM_TYPE_INPUT;
            workflow_layer[input]->arguments.input.index = 0;
            workflow_layer[input]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[input]->output_format[0].format_3d ={{img_size,img_size,3}};
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 0 (imagenet_mean_subtract)
        //         output: 227x227x3
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[input],0};
            di->workflow_item_create_function(&workflow_layer[mean_substract],1,&inputs_descriptor,1);

            workflow_layer[mean_substract]->type = NN_WORK_ITEM_TYPE_ARITHMETIC;
            workflow_layer[mean_substract]->arguments.forward_arithmetic.factor = mean_factor;
            workflow_layer[mean_substract]->arguments.forward_arithmetic.arithmetic_function = NN_ARITHMETIC_FUNCTION_SUBTRACTION;

            workflow_layer[mean_substract]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[mean_substract]->output_format[0].format_3d ={{img_size,img_size,3}};
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 0 Convert float to int16
        //
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[mean_substract], 0 };
            di->workflow_item_create_function(&workflow_layer[convert], 1, &inputs_descriptor, 1);

            workflow_layer[convert]->type = NN_WORK_ITEM_TYPE_CONVERT_FLOAT_TO_INT16_FIXEDPOINT;
            workflow_layer[convert]->arguments.forward_convert_float_to_int16_fixedpoint.output_fraction = 0;

            workflow_layer[convert]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[convert]->output_format[0].format_3d = nn_output_format_3d{ { img_size, img_size, 4 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 01
        //           convo: 11x11 stride 4x4; ReLU; output: 55x55x96
        //         maxpool: 3x3 stride 2x2;
        //            norm: RESPONSE_ACROSS_MAPS
        //          output: 27x27x96
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[convert], 0 };
            di->workflow_item_create_function(&workflow_layer[conv1], 1, &inputs_descriptor, 1);

            workflow_layer[conv1]->type = NN_WORK_ITEM_TYPE_CONVOLUTION_INT16_FIXEDPOINT;
            workflow_layer[conv1]->name = "c1";

            workflow_layer[conv1]->arguments.forward_convolution_int16_fixedpoint.padding = NN_PADDING_MODE_DATA_OR_ZERO;
            workflow_layer[conv1]->arguments.forward_convolution_int16_fixedpoint.activation.basic_arguments.function = NN_ACTIVATION_FUNCTION_RELU;

            workflow_layer[conv1]->arguments.forward_convolution_int16_fixedpoint.weights = workflow_layer_weights_int16[conv1_factor];
            workflow_layer[conv1]->arguments.forward_convolution_int16_fixedpoint.biases = workflow_layer_biases_int32[conv1_factor];

            workflow_layer[conv1]->arguments.forward_convolution_int16_fixedpoint.center_offset[0] = 0;
            workflow_layer[conv1]->arguments.forward_convolution_int16_fixedpoint.center_offset[1] = 0;

            workflow_layer[conv1]->arguments.forward_convolution_int16_fixedpoint.stride[0] = 4;
            workflow_layer[conv1]->arguments.forward_convolution_int16_fixedpoint.stride[1] = 4;

            workflow_layer[conv1]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.accumulator = nnwrkld_accumulator_fraction[conv1_factor];
            workflow_layer[conv1]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.output = nnwrkld_output_fraction[conv1_factor];

            workflow_layer[conv1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv1]->output_format[0].format_3d = { { 55, 55, 96 } };
        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[conv1], 0 };
            di->workflow_item_create_function(&workflow_layer[pool1], 1, &inputs_descriptor, 1);

            workflow_layer[pool1]->type = NN_WORK_ITEM_TYPE_MAX_POOLING_INT16_FIXEDPOINT;
            workflow_layer[pool1]->name = "p1";

            workflow_layer[pool1]->arguments.forward_pooling_fixedpoint.pool_size[0] = 3;
            workflow_layer[pool1]->arguments.forward_pooling_fixedpoint.pool_size[1] = 3;
            workflow_layer[pool1]->arguments.forward_pooling_fixedpoint.pool_stride[0] = 2;
            workflow_layer[pool1]->arguments.forward_pooling_fixedpoint.pool_stride[1] = 2;

            workflow_layer[pool1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[pool1]->output_format[0].format_3d = { { 27, 27, 96 } };
        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[pool1], 0 };
            di->workflow_item_create_function(&workflow_layer[norm1], 1, &inputs_descriptor, 1);

            workflow_layer[norm1]->type = NN_WORK_ITEM_TYPE_NORMALIZATION_RESPONSE_ACROSS_MAPS_FORWARD_I16QN;
            workflow_layer[norm1]->name = "lrn1";

            workflow_layer[norm1]->arguments.normalization_response_across_maps_forward_i16qn.k = 1;
            workflow_layer[norm1]->arguments.normalization_response_across_maps_forward_i16qn.n = 5;
            workflow_layer[norm1]->arguments.normalization_response_across_maps_forward_i16qn.alpha = 0.00002f;
            workflow_layer[norm1]->arguments.normalization_response_across_maps_forward_i16qn.beta = 0.75f;
            workflow_layer[norm1]->arguments.normalization_response_across_maps_forward_i16qn.fractions.input = nnwrkld_output_fraction[conv1_factor];
            workflow_layer[norm1]->arguments.normalization_response_across_maps_forward_i16qn.fractions.output = nnwrkld_output_fraction[conv1_factor];

            workflow_layer[norm1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[norm1]->output_format[0].format_3d = { { 27, 27, 96 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 02
        //           split: 2 (z-axis 96/2); output 27x27x(2*96/2)
        //           convo: 5x5 stride 1x1; ReLU; 0-padded output: 27x27x(2*256/2)
        //           merge: (z-axis)
        //         maxpool: 3x3 stride 2x2;
        //            norm: RESPONSE_ACROSS_MAPS
        //          output: 13x13x256
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[norm1], 0 };
            di->workflow_item_create_function(&workflow_layer[subv1_1], 1, &inputs_descriptor, 1); // view g1

            workflow_layer[subv1_1]->type = NN_WORK_ITEM_TYPE_VIEW;
            workflow_layer[subv1_1]->arguments.view.origin[0] = 0;
            workflow_layer[subv1_1]->arguments.view.origin[1] = 0;
            workflow_layer[subv1_1]->arguments.view.origin[2] = 0;

            workflow_layer[subv1_1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[subv1_1]->output_format[0].format_3d = { { 27, 27, 96 / 2 } };

        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[norm1], 0 };
            di->workflow_item_create_function(&workflow_layer[subv1_2], 1, &inputs_descriptor, 1);   // view g2

            workflow_layer[subv1_2]->type = NN_WORK_ITEM_TYPE_VIEW;
            workflow_layer[subv1_2]->arguments.view.origin[0] = 0;
            workflow_layer[subv1_2]->arguments.view.origin[1] = 0;
            workflow_layer[subv1_2]->arguments.view.origin[2] = (96 / 2);

            workflow_layer[subv1_2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[subv1_2]->output_format[0].format_3d = { { 27, 27, 96 / 2 } };
        }

        // convolution 2, g1: 5x5 stride 1x1; ReLU; 0-padded output: 13x13x(2*96/2)
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[subv1_1], 0 };
            di->workflow_item_create_function(&workflow_layer[conv2_1], 1, &inputs_descriptor, 1);

            workflow_layer[conv2_1]->type = NN_WORK_ITEM_TYPE_CONVOLUTION_INT16_FIXEDPOINT;
            workflow_layer[conv2_1]->name = "c2g1";

            workflow_layer[conv2_1]->arguments.forward_convolution_int16_fixedpoint.activation.basic_arguments.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv2_1]->arguments.forward_convolution_int16_fixedpoint.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv2_1]->arguments.forward_convolution_int16_fixedpoint.weights = workflow_layer_weights_int16[conv2_1_factor];
            workflow_layer[conv2_1]->arguments.forward_convolution_int16_fixedpoint.biases = workflow_layer_biases_int32[conv2_1_factor];

            workflow_layer[conv2_1]->arguments.forward_convolution_int16_fixedpoint.center_offset[0] = 2;
            workflow_layer[conv2_1]->arguments.forward_convolution_int16_fixedpoint.center_offset[1] = 2;

            workflow_layer[conv2_1]->arguments.forward_convolution_int16_fixedpoint.stride[0] = 1;
            workflow_layer[conv2_1]->arguments.forward_convolution_int16_fixedpoint.stride[1] = 1;

            workflow_layer[conv2_1]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.accumulator = nnwrkld_accumulator_fraction[conv2_1_factor];
            workflow_layer[conv2_1]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.output = nnwrkld_output_fraction[conv2_1_factor];

            workflow_layer[conv2_1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv2_1]->output_format[0].format_3d = { { 27, 27, 256 / 2 } };
        }

        // convolution 2, g2: 5x5 stride 1x1; ReLU; 0-padded output: 27x27x(256/2)
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[subv1_2], 0 };
            di->workflow_item_create_function(&workflow_layer[conv2_2], 1, &inputs_descriptor, 1);

            workflow_layer[conv2_2]->type = NN_WORK_ITEM_TYPE_CONVOLUTION_INT16_FIXEDPOINT;
            workflow_layer[conv2_2]->name = "c2g2";

            workflow_layer[conv2_2]->arguments.forward_convolution_int16_fixedpoint.activation.basic_arguments.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv2_2]->arguments.forward_convolution_int16_fixedpoint.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv2_2]->arguments.forward_convolution_int16_fixedpoint.weights = workflow_layer_weights_int16[conv2_2_factor];
            workflow_layer[conv2_2]->arguments.forward_convolution_int16_fixedpoint.biases = workflow_layer_biases_int32[conv2_2_factor];

            workflow_layer[conv2_2]->arguments.forward_convolution_int16_fixedpoint.center_offset[0] = 2;
            workflow_layer[conv2_2]->arguments.forward_convolution_int16_fixedpoint.center_offset[1] = 2;

            workflow_layer[conv2_2]->arguments.forward_convolution_int16_fixedpoint.stride[0] = 1;
            workflow_layer[conv2_2]->arguments.forward_convolution_int16_fixedpoint.stride[1] = 1;

            workflow_layer[conv2_2]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.accumulator = nnwrkld_accumulator_fraction[conv2_2_factor];
            workflow_layer[conv2_2]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.output = nnwrkld_output_fraction[conv2_2_factor];

            workflow_layer[conv2_2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv2_2]->output_format[0].format_3d = { { 27, 27, 256 / 2 } };
        }

        // merge g1 and g2
        {
            nn_workflow_use_descriptor_t inputs_descriptor[] = { { workflow_layer[conv2_1], 0 }, { workflow_layer[conv2_2], 0 } };
            di->workflow_item_create_function(&workflow_layer[merge2], 2, inputs_descriptor, 1);

            workflow_layer[merge2]->type = NN_WORK_ITEM_TYPE_MERGE;
            workflow_layer[merge2]->arguments.forward_merge.axis = 2; // value 2 for z-axis

            workflow_layer[merge2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[merge2]->output_format[0].format_3d = { { 27, 27, 256 } };
        }

        // maxpool: 3x3 stride 2x2;
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[merge2], 0 };
            di->workflow_item_create_function(&workflow_layer[pool2], 1, &inputs_descriptor, 1); // pooling

            workflow_layer[pool2]->type = NN_WORK_ITEM_TYPE_MAX_POOLING_INT16_FIXEDPOINT;
            workflow_layer[pool2]->name = "p2";

            workflow_layer[pool2]->arguments.forward_pooling_fixedpoint.pool_size[0] = 3;
            workflow_layer[pool2]->arguments.forward_pooling_fixedpoint.pool_size[1] = 3;

            workflow_layer[pool2]->arguments.forward_pooling_fixedpoint.pool_stride[0] = 2;
            workflow_layer[pool2]->arguments.forward_pooling_fixedpoint.pool_stride[1] = 2;

            workflow_layer[pool2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[pool2]->output_format[0].format_3d = { { 13, 13, 256 } };
        }

        //norm: RESPONSE_ACROSS_MAPS; output: 13x13x256
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[pool2], 0 };
            di->workflow_item_create_function(&workflow_layer[norm2], 1, &inputs_descriptor, 1);

            workflow_layer[norm2]->type = NN_WORK_ITEM_TYPE_NORMALIZATION_RESPONSE_ACROSS_MAPS_FORWARD_I16QN;
            workflow_layer[norm2]->name = "lrn2";

            workflow_layer[norm2]->arguments.normalization_response_across_maps_forward_i16qn.k = 1;
            workflow_layer[norm2]->arguments.normalization_response_across_maps_forward_i16qn.n = 5;
            workflow_layer[norm2]->arguments.normalization_response_across_maps_forward_i16qn.alpha = 0.00002f;
            workflow_layer[norm2]->arguments.normalization_response_across_maps_forward_i16qn.beta = 0.75f;
            workflow_layer[norm2]->arguments.normalization_response_across_maps_forward_i16qn.fractions.input = nnwrkld_output_fraction[conv2_2_factor];
            workflow_layer[norm2]->arguments.normalization_response_across_maps_forward_i16qn.fractions.output = nnwrkld_output_fraction[conv2_2_factor];

            workflow_layer[norm2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[norm2]->output_format[0].format_3d = { { 13, 13, 256 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 03
        //           convo: 3x3 stride 1x1; ReLU; 0-padded
        //          output: 13x13x384
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[norm2], 0 };
            di->workflow_item_create_function(&workflow_layer[conv3], 1, &inputs_descriptor, 1);

            workflow_layer[conv3]->type = NN_WORK_ITEM_TYPE_CONVOLUTION_INT16_FIXEDPOINT;
            workflow_layer[conv3]->name = "c3";

            workflow_layer[conv3]->arguments.forward_convolution_int16_fixedpoint.activation.basic_arguments.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv3]->arguments.forward_convolution_int16_fixedpoint.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv3]->arguments.forward_convolution_int16_fixedpoint.weights = workflow_layer_weights_int16[conv3_factor];
            workflow_layer[conv3]->arguments.forward_convolution_int16_fixedpoint.biases = workflow_layer_biases_int32[conv3_factor];

            workflow_layer[conv3]->arguments.forward_convolution_int16_fixedpoint.center_offset[0] = 1;
            workflow_layer[conv3]->arguments.forward_convolution_int16_fixedpoint.center_offset[1] = 1;

            workflow_layer[conv3]->arguments.forward_convolution_int16_fixedpoint.stride[0] = 1;
            workflow_layer[conv3]->arguments.forward_convolution_int16_fixedpoint.stride[1] = 1;

            workflow_layer[conv3]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.accumulator = nnwrkld_accumulator_fraction[conv3_factor];
            workflow_layer[conv3]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.output = nnwrkld_output_fraction[conv3_factor];

            workflow_layer[conv3]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv3]->output_format[0].format_3d = { { 13, 13, 384 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 04
        //           split: 2 (z-axis 384/2)
        //           convo: 3x3 stride 1x1; ReLU; 0-padded
        //          output: 13x13x(2*384/2) (continue split to next stage)
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[conv3], 0 };
            di->workflow_item_create_function(&workflow_layer[subv3_1], 1, &inputs_descriptor, 1); // view g1

            workflow_layer[subv3_1]->type = NN_WORK_ITEM_TYPE_VIEW;
            workflow_layer[subv3_1]->arguments.view.origin[0] = 0;
            workflow_layer[subv3_1]->arguments.view.origin[1] = 0;
            workflow_layer[subv3_1]->arguments.view.origin[2] = 0;

            workflow_layer[subv3_1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[subv3_1]->output_format[0].format_3d = { { 13, 13, 384 / 2 } };
        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[conv3], 0 };
            di->workflow_item_create_function(&workflow_layer[subv3_2], 1, &inputs_descriptor, 1); // view g2

            workflow_layer[subv3_2]->type = NN_WORK_ITEM_TYPE_VIEW;
            workflow_layer[subv3_2]->arguments.view.origin[0] = 0;
            workflow_layer[subv3_2]->arguments.view.origin[1] = 0;
            workflow_layer[subv3_2]->arguments.view.origin[2] = 384 / 2;

            workflow_layer[subv3_2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[subv3_2]->output_format[0].format_3d = { { 13, 13, 384 / 2 } };

        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[subv3_1], 0 };
            di->workflow_item_create_function(&workflow_layer[conv4_1], 1, &inputs_descriptor, 1); // conv g1

            workflow_layer[conv4_1]->type = NN_WORK_ITEM_TYPE_CONVOLUTION_INT16_FIXEDPOINT;
            workflow_layer[conv4_1]->name = "c4g1";

            workflow_layer[conv4_1]->arguments.forward_convolution_int16_fixedpoint.activation.basic_arguments.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv4_1]->arguments.forward_convolution_int16_fixedpoint.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv4_1]->arguments.forward_convolution_int16_fixedpoint.weights = workflow_layer_weights_int16[conv4_1_factor];
            workflow_layer[conv4_1]->arguments.forward_convolution_int16_fixedpoint.biases = workflow_layer_biases_int32[conv4_1_factor];

            workflow_layer[conv4_1]->arguments.forward_convolution_int16_fixedpoint.center_offset[0] = 1;
            workflow_layer[conv4_1]->arguments.forward_convolution_int16_fixedpoint.center_offset[1] = 1;

            workflow_layer[conv4_1]->arguments.forward_convolution_int16_fixedpoint.stride[0] = 1;
            workflow_layer[conv4_1]->arguments.forward_convolution_int16_fixedpoint.stride[1] = 1;

            workflow_layer[conv4_1]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.accumulator = nnwrkld_accumulator_fraction[conv4_1_factor];
            workflow_layer[conv4_1]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.output = nnwrkld_output_fraction[conv4_1_factor];

            workflow_layer[conv4_1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv4_1]->output_format[0].format_3d = { { 13, 13, 384 / 2 } };
        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[subv3_2], 0 };
            di->workflow_item_create_function(&workflow_layer[conv4_2], 1, &inputs_descriptor, 1); // conv g2

            workflow_layer[conv4_2]->type = NN_WORK_ITEM_TYPE_CONVOLUTION_INT16_FIXEDPOINT;
            workflow_layer[conv4_2]->name = "c4g2";

            workflow_layer[conv4_2]->arguments.forward_convolution_int16_fixedpoint.activation.basic_arguments.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv4_2]->arguments.forward_convolution_int16_fixedpoint.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv4_2]->arguments.forward_convolution_int16_fixedpoint.weights = workflow_layer_weights_int16[conv4_2_factor];
            workflow_layer[conv4_2]->arguments.forward_convolution_int16_fixedpoint.biases = workflow_layer_biases_int32[conv4_2_factor];

            workflow_layer[conv4_2]->arguments.forward_convolution_int16_fixedpoint.center_offset[0] = 1;
            workflow_layer[conv4_2]->arguments.forward_convolution_int16_fixedpoint.center_offset[1] = 1;

            workflow_layer[conv4_2]->arguments.forward_convolution_int16_fixedpoint.stride[0] = 1;
            workflow_layer[conv4_2]->arguments.forward_convolution_int16_fixedpoint.stride[1] = 1;

            workflow_layer[conv4_2]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.accumulator = nnwrkld_accumulator_fraction[conv4_2_factor];
            workflow_layer[conv4_2]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.output = nnwrkld_output_fraction[conv4_2_factor];

            workflow_layer[conv4_2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv4_2]->output_format[0].format_3d = { { 13, 13, 384 / 2 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 05
        //           convo: 3x3 stride 1x1; ReLU; 0-padded; output: 13x13x(2*256/2)
        //           merge: (z-axis)
        //         maxpool: 3x3 stride 2x2;
        //          output: 6x6x256
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[conv4_1], 0 };
            di->workflow_item_create_function(&workflow_layer[conv5_1], 1, &inputs_descriptor, 1); // conv g1

            workflow_layer[conv5_1]->type = NN_WORK_ITEM_TYPE_CONVOLUTION_INT16_FIXEDPOINT;
            workflow_layer[conv5_1]->name = "c5g1";

            workflow_layer[conv5_1]->arguments.forward_convolution_int16_fixedpoint.activation.basic_arguments.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv5_1]->arguments.forward_convolution_int16_fixedpoint.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv5_1]->arguments.forward_convolution_int16_fixedpoint.weights = workflow_layer_weights_int16[conv5_1_factor];
            workflow_layer[conv5_1]->arguments.forward_convolution_int16_fixedpoint.biases = workflow_layer_biases_int32[conv5_1_factor];

            workflow_layer[conv5_1]->arguments.forward_convolution_int16_fixedpoint.center_offset[0] = 1;
            workflow_layer[conv5_1]->arguments.forward_convolution_int16_fixedpoint.center_offset[1] = 1;

            workflow_layer[conv5_1]->arguments.forward_convolution_int16_fixedpoint.stride[0] = 1;
            workflow_layer[conv5_1]->arguments.forward_convolution_int16_fixedpoint.stride[1] = 1;

            workflow_layer[conv5_1]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.accumulator = nnwrkld_accumulator_fraction[conv5_1_factor];
            workflow_layer[conv5_1]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.output = nnwrkld_output_fraction[conv5_1_factor];

            workflow_layer[conv5_1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv5_1]->output_format[0].format_3d = { { 13, 13, 256 / 2 } };
        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[conv4_2], 0 };
            di->workflow_item_create_function(&workflow_layer[conv5_2], 1, &inputs_descriptor, 1); // conv g2

            workflow_layer[conv5_2]->type = NN_WORK_ITEM_TYPE_CONVOLUTION_INT16_FIXEDPOINT;
            workflow_layer[conv5_2]->name = "c5g2";

            workflow_layer[conv5_2]->arguments.forward_convolution_int16_fixedpoint.activation.basic_arguments.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv5_2]->arguments.forward_convolution_int16_fixedpoint.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv5_2]->arguments.forward_convolution_int16_fixedpoint.weights = workflow_layer_weights_int16[conv5_2_factor];
            workflow_layer[conv5_2]->arguments.forward_convolution_int16_fixedpoint.biases = workflow_layer_biases_int32[conv5_2_factor];

            workflow_layer[conv5_2]->arguments.forward_convolution_int16_fixedpoint.center_offset[0] = 1;
            workflow_layer[conv5_2]->arguments.forward_convolution_int16_fixedpoint.center_offset[1] = 1;

            workflow_layer[conv5_2]->arguments.forward_convolution_int16_fixedpoint.stride[0] = 1;
            workflow_layer[conv5_2]->arguments.forward_convolution_int16_fixedpoint.stride[1] = 1;

            workflow_layer[conv5_2]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.accumulator = nnwrkld_accumulator_fraction[conv5_2_factor];
            workflow_layer[conv5_2]->arguments.forward_convolution_int16_fixedpoint.activation.fractions.output = nnwrkld_output_fraction[conv5_2_factor];

            workflow_layer[conv5_2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv5_2]->output_format[0].format_3d = { { 13, 13, 256 / 2 } };
        }

        // merge g1 and g2
        {
            nn_workflow_use_descriptor_t inputs_descriptor[] = {{workflow_layer[conv5_1],0},{workflow_layer[conv5_2],0}};
            di->workflow_item_create_function(&workflow_layer[merge5], 2, inputs_descriptor, 1);

            workflow_layer[merge5]->type = NN_WORK_ITEM_TYPE_MERGE;
            workflow_layer[merge5]->arguments.forward_merge.axis = 2; // value 2 for z-axis

            workflow_layer[merge5]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[merge5]->output_format[0].format_3d = { { 13, 13, 256 } };
        }

        // maxpool: 3x3 stride 2x2;
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[merge5], 0 };
            di->workflow_item_create_function(&workflow_layer[pool5], 1, &inputs_descriptor, 1); // pooling

            workflow_layer[pool5]->type = NN_WORK_ITEM_TYPE_MAX_POOLING_INT16_FIXEDPOINT;
            workflow_layer[pool5]->name = "p5";

            workflow_layer[pool5]->arguments.forward_pooling_fixedpoint.pool_size[0] = 3;
            workflow_layer[pool5]->arguments.forward_pooling_fixedpoint.pool_size[1] = 3;

            workflow_layer[pool5]->arguments.forward_pooling_fixedpoint.pool_stride[0] = 2;
            workflow_layer[pool5]->arguments.forward_pooling_fixedpoint.pool_stride[1] = 2;

            workflow_layer[pool5]->arguments.fully_connected_forward_i16qn_i32qn.activation.fractions.accumulator = 16;
            workflow_layer[pool5]->arguments.fully_connected_forward_i16qn_i32qn.activation.fractions.output = 8;

            workflow_layer[pool5]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[pool5]->output_format[0].format_3d = { { 6, 6, 256 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 06
        //            full: ReLU
        //          output: 4096
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[pool5], 0 };
            di->workflow_item_create_function(&workflow_layer[fc6], 1, &inputs_descriptor, 1);

            workflow_layer[fc6]->type = NN_WORK_ITEM_TYPE_FULLY_CONNECTED_FORWARD_I16QN_I16QN;
            workflow_layer[fc6]->name = "fc6";

            workflow_layer[fc6]->arguments.fully_connected_forward_i16qn_i16qn.activation.basic_arguments.function = NN_ACTIVATION_FUNCTION_RELU;

            workflow_layer[fc6]->arguments.fully_connected_forward_i16qn_i16qn.weights = workflow_layer_weights_int16[fc6_factor];
            workflow_layer[fc6]->arguments.fully_connected_forward_i16qn_i16qn.biases = workflow_layer_biases_int32[fc6_factor];

            workflow_layer[fc6]->arguments.fully_connected_forward_i16qn_i32qn.activation.fractions.accumulator = nnwrkld_accumulator_fraction[fc6_factor];
            workflow_layer[fc6]->arguments.fully_connected_forward_i16qn_i32qn.activation.fractions.output = nnwrkld_output_fraction[fc6_factor];

            workflow_layer[fc6]->output_format[0].format = NN_DATA_FORMAT_1D;
            workflow_layer[fc6]->output_format[0].format_1d = { { 4096 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 07
        //            full: ReLU
        //          output: 4096
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[fc6], 0 };
            di->workflow_item_create_function(&workflow_layer[fc7], 1, &inputs_descriptor, 1);

            workflow_layer[fc7]->type = NN_WORK_ITEM_TYPE_FULLY_CONNECTED_FORWARD_I16QN_I16QN;
            workflow_layer[fc7]->name = "fc7";

            workflow_layer[fc7]->arguments.fully_connected_forward_i16qn_i16qn.activation.basic_arguments.function = NN_ACTIVATION_FUNCTION_RELU;

            workflow_layer[fc7]->arguments.fully_connected_forward_i16qn_i16qn.weights = workflow_layer_weights_int16[fc7_factor];
            workflow_layer[fc7]->arguments.fully_connected_forward_i16qn_i16qn.biases = workflow_layer_biases_int32[fc7_factor];

            workflow_layer[fc7]->arguments.fully_connected_forward_i16qn_i32qn.activation.fractions.accumulator = nnwrkld_accumulator_fraction[fc7_factor];
            workflow_layer[fc7]->arguments.fully_connected_forward_i16qn_i32qn.activation.fractions.output = nnwrkld_output_fraction[fc7_factor];

            workflow_layer[fc7]->output_format[0].format = NN_DATA_FORMAT_1D;
            workflow_layer[fc7]->output_format[0].format_1d = { { 4096 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 08
        //            full: ;
        //          output: 1000
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[fc7], 0 };
            di->workflow_item_create_function(&workflow_layer[fc8], 1, &inputs_descriptor, 1);

            workflow_layer[fc8]->type = NN_WORK_ITEM_TYPE_FULLY_CONNECTED_FORWARD_I16QN_I32QN;
            workflow_layer[fc8]->name = "fc8";

            workflow_layer[fc8]->arguments.fully_connected_forward_i16qn_i32qn.activation.basic_arguments.function = NN_ACTIVATION_FUNCTION_NONE;

            workflow_layer[fc8]->arguments.fully_connected_forward_i16qn_i32qn.weights = workflow_layer_weights_int16[fc8_factor];
            workflow_layer[fc8]->arguments.fully_connected_forward_i16qn_i32qn.biases = workflow_layer_biases_int32[fc8_factor];

            workflow_layer[fc8]->arguments.fully_connected_forward_i16qn_i32qn.activation.fractions.accumulator = nnwrkld_accumulator_fraction[fc8_factor];
            workflow_layer[fc8]->arguments.fully_connected_forward_i16qn_i32qn.activation.fractions.output = nnwrkld_output_fraction[fc8_factor];

            workflow_layer[fc8]->output_format[0].format = NN_DATA_FORMAT_1D;
            workflow_layer[fc8]->output_format[0].format_1d = { { 1000 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 09 (softmax)
        //          output: 1000
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[fc8], 0 };
            di->workflow_item_create_function(&workflow_layer[softmax], 1, &inputs_descriptor, 1);

            workflow_layer[softmax]->type = NN_WORK_ITEM_TYPE_SOFTMAX_FIXEDPOINT;

            workflow_layer[softmax]->arguments.forward_softmax_fixedpoint.input_fraction = nnwrkld_output_fraction[fc8_factor];

            workflow_layer[softmax]->output_format[0].format = NN_DATA_FORMAT_1D;
            workflow_layer[softmax]->output_format[0].format_1d = { { 1000 } };
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 10 (output)
        //          output: 1000
        {
            nn_workflow_use_descriptor_t inputs_descriptor = { workflow_layer[softmax], 0 };
            di->workflow_item_create_function(&workflow_layer[output], 1, &inputs_descriptor, 1);

            workflow_layer[output]->type = NN_WORK_ITEM_TYPE_OUTPUT;

            workflow_layer[output]->output_format[0].format = NN_DATA_FORMAT_1D;
            workflow_layer[output]->output_format[0].format_1d = { { 1000 } };

        }

        // -------------------------------------------------------------------------------------------
        // END of workflow stages definition
        // -------------------------------------------------------------------------------------------
        workflow->input[0] = workflow_layer[input];
        workflow->output[0] = workflow_layer[output];
        // -------------------------------------------------------------------------------------------

        return workflow;

    }
bool test_google_float_workload_cpu_images_classification::run()
{
    bool  run_ok = true;
    test_measurement_result   run_result;
    run_result.description = "RUN SUMMARY: " + test_description;

    C_time_control  run_timer;

    std::cout << "-> Testing: " << test_description << std::endl;

    try {
        if(!init()) throw std::runtime_error("init() returns false so can't run test");
        run_timer.tick();   //start time measurement
        run_result << std::string("run test with " + current_tested_device->get_device_description());

        for(uint32_t batch :{1,8,48}) {

            C_time_control  loop_timer;

            // compiling workload
            nn_workload_t             *workload = nullptr;
            NN_WORKLOAD_DATA_TYPE  input_format = NN_WORKLOAD_DATA_TYPE_F32_ZXY_BATCH;
            NN_WORKLOAD_DATA_TYPE output_format = NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH;

            auto status = di->workflow_compile_function(&workload, di->device, workflow, &input_format, &output_format, batch);
            if(!workload) throw std::runtime_error("workload compilation failed for batch = " + std::to_string(batch)
                                                   + " status: " + std::to_string(status));

            test_measurement_result local_result;
            local_result.description = "RUN PART: (batch " + std::to_string(batch)+") execution of " + test_description;
            bool local_ok = true;

            auto images_list_iterator = images_list.begin();
            auto images_list_end      = images_list.end();

            while(images_list_iterator != images_list_end)
            {
                auto diff_itr = images_list_end - images_list_iterator < batch
                                ? images_list_end - images_list_iterator
                                : batch;

                std::vector< std::string >   batch_images(images_list_iterator,images_list_iterator + diff_itr);

                images_list_iterator += diff_itr;

                nn::data< float,4 > *images = nullptr;
                images = nn_data_load_from_image_list(&batch_images, img_size, image_process, batch, RGB_order);

                if(images) {
                    nn_data_t *input_array[1] ={images};
                    nn::data<float, 2> *workload_output = new nn::data<float, 2>(1000, batch);
                if(workload_output == nullptr)   throw std::runtime_error("unable to create workload_output for batch = " +std::to_string(batch));

                    nn::data<float> *output_array_cmpl[1] ={nn::data_cast<float,0>(workload_output)};

                    di->workload_execute_function(workload,reinterpret_cast<void**>(input_array),reinterpret_cast<void**>(output_array_cmpl),&status);

                    float *value_cmpl = reinterpret_cast<float *>(workload_output->buffer);

                    for(auto &image_filename : batch_images) {
                        std::ifstream reference_output_file(image_filename + ".txt", std::ifstream::in);
                        // Comparison with the reference output workload
                        float  difference = 0;
                        for(int index = 0; index < 1000; ++index) {

                            std::string reference_value_str;
                            std::getline(reference_output_file,reference_value_str);
                            float reference_value = std::stof(reference_value_str);
                            float delta = value_cmpl[index]-reference_value;

                             difference += abs(delta);

                        }
                        if(difference < threshold_to_pass_test)
                            local_result << std::string("note: " + image_filename + " difference = " + std::to_string(difference));
                        else {
                            local_result << std::string("error: image file: "
                                                        + image_filename
                                                        +" the difference exceeded the allowable threshold for compliance: "
                                                        + std::to_string(difference)
                                                        + " > "
                                                        + std::to_string(threshold_to_pass_test));
                            local_ok = false;
                            run_ok = false;
                        }

                        reference_output_file.close();
                        value_cmpl += 1000;
                    }

                    batch_images.clear();
                    if(images != nullptr) delete images;
                    if(workload_output != nullptr) delete workload_output;

                }

            }

            // batch loop summary:
            local_result.passed = local_ok;

            loop_timer.tock();
            local_result.time_consumed = loop_timer.get_time_diff();
            local_result.clocks_consumed = loop_timer.get_clocks_diff();
            tests_results << local_result;
            if(workload != nullptr) di->workload_delete_function(workload);
        } // END: for(uint32_t batch :{1,8,48})
    }
    catch(std::runtime_error &error) {
        run_result << "error: " + std::string(error.what());
        run_ok = false;
    }
    catch(...) {
        run_result << "error: unknown";
        run_ok = false;
    }

    run_timer.tock();
    run_result.time_consumed = run_timer.get_time_diff();
    run_result.clocks_consumed = run_timer.get_clocks_diff();

    run_result.passed = run_ok;
    tests_results << run_result;
    if (!done()) run_ok=false;
    std::cout << "<- Test " << (run_ok ? "passed" : "failed") << std::endl;;
    return run_ok;
}
bool test_convolution_float_cpu_random::run() {
    bool run_ok = true;
    test_measurement_result run_result;
    run_result.description = "RUN SUMMARY: " + test_description;

    std::cout << "-> Testing: " << test_description << std::endl;

    try {
        if(!init()) throw std::runtime_error( "init() returns false so can't run test" );

        NN_WORKLOAD_DATA_TYPE input_format  = NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH;
        NN_WORKLOAD_DATA_TYPE output_format = NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH;

        for(uint32_t batch : { 1, 8, 48 }) {
            bool local_ok = true;
            test_measurement_result local_result;
            local_result.description = "RUN PART: (batch " + std::to_string( batch ) + ") execution of " + test_description;
            C_time_control  local_timer;

            // begin local test
            uint32_t z = 2,
                img_size = 227,
                num_features_map = 8;

            nn::data<float, 4> *images = new nn::data<float, 4>( img_size, img_size, z, batch );
            if(images == nullptr) throw std::runtime_error("Cant't create images nn::data");

            nn_data_populate( nn::data_cast<float, 0>(images),
                0.0f,
                255.0f );

            nn::data<float, 4> *images_with_padding = new nn::data<float, 4>( img_size + 2, img_size + 2, z, batch );
            if(images_with_padding == nullptr) {
                delete images;
                throw std::runtime_error("Cant't create images_with_padding nn::data");
            }
            { // padding for input for naive method
                nn_data_populate( nn::data_cast<float, 0>(images_with_padding),
                    0.0f );
                for(uint32_t tmp_batch = 0; tmp_batch < batch; ++tmp_batch)
                    for(uint32_t tmp_z = 0; tmp_z < z; ++tmp_z)
                        for(uint32_t y = 0; y < img_size; ++y)
                            for(uint32_t x = 0; x < img_size; ++x)
                                images_with_padding->at( x, y, tmp_z, tmp_batch ) = images->at( x, y, tmp_z, tmp_batch );

            }

            nn_workload_t *workload = nullptr;
            nn_data_t *input_array[1] = { images };
            auto workload_output = new nn::data<float, 4>( img_size, img_size, num_features_map, batch );
            if(workload_output==nullptr) {
                delete images;
                delete images_with_padding;
                throw std::runtime_error("unable to create workload_output for batch = " +std::to_string(batch));
            }

            nn::data<float> *output_array_cmpl[1] = { nn::data_cast<float, 0>(workload_output) };

            auto naive_output = new nn::data<float, 4>( img_size, img_size, num_features_map, batch );
            if(naive_output==nullptr) {
                delete images;
                delete images_with_padding;
                delete workload_output;
                throw std::runtime_error("unable to create naive_output for batch = " +std::to_string(batch));
            }

            auto status = di->workflow_compile_function( &workload, di->device, workflow, &input_format, &output_format, batch );
            if(!workload) throw std::runtime_error( "workload compilation failed for batch = " + std::to_string( batch )
                + " status: " + std::to_string( status ) );

            test_measurement_result run_result;
            run_result.description = "RUN PART: (batch " + std::to_string( batch ) + ") execution of " + test_description;

            // changing order needed
            //di->workload_execute_function( workload, reinterpret_cast<void**>(input_array), reinterpret_cast<void**>(output_array_cmpl), &status );

            float* biases = nullptr;
            float* weights = nullptr;

            { // read biases and weights
                if(NN_WORK_ITEM_TYPE_CONVOLUTION == workflow->input[0]->use[0].item->type) {
                    auto tmp = reinterpret_cast<nn_arguments_forward_convolution_t*>(&workflow->input[0]->use[0].item->arguments);
                    biases = reinterpret_cast<float*>(tmp->biases->buffer);
                    weights = reinterpret_cast<float*>(tmp->weights->buffer);
                }
            }

            if(nullptr == biases || nullptr == weights)
                throw std::runtime_error( "reading weight or biases for naive version failed for batch = " + std::to_string( batch ) );

            naive_convolv_float_implementation(
                reinterpret_cast<float*>(images_with_padding->buffer),
                reinterpret_cast<float*>(naive_output->buffer),
                biases,
                weights,
                batch,
                num_features_map,
                z,
                img_size,
                img_size,
                img_size + 2,
                img_size + 2,
                3,
                3,
                1,
                1,
                NN_ACTIVATION_FUNCTION_RELU );

            //local_ok = compare_4d_data( workload_output, naive_output );
            local_ok = true; // BLIND TEST

            // end of local test
            // summary:
            local_timer.tock();
            local_result.time_consumed   = local_timer.get_time_diff();
            local_result.clocks_consumed = local_timer.get_clocks_diff();
            local_result.passed = local_ok;
            tests_results << local_result;

            run_ok = run_ok && local_ok;

            if(workload_output)      delete workload_output;
            if(naive_output)         delete naive_output;
            if(images)               delete images;
            if(images_with_padding)  delete images_with_padding;
        }
    } catch(std::runtime_error &error) {
        tests_results << run_result;
        std::cout << "error: " << error.what() << std::endl;
    } catch(std::exception &error) {
        tests_results << run_result;
        std::cout << "error: " << error.what() << std::endl;
    } catch(...) {
        tests_results << run_result;
        std::cout << "error: unknown" << std::endl;
    }
    if(!done()) run_ok = false;
    std::cout << "<- Test " << (run_ok ? "passed" : "failed") << std::endl;;
    return run_ok;
}
    virtual nn_workflow_t *init_test_workflow(nn_device_interface_0_t *_di) {

        if(!is_valid()) throw std::runtime_error(error_);

        for(auto wi : workflow_layer) wi = nullptr;
        for(auto wb : workflow_layer_factor) wb = nullptr;

        this->di = _di;




        // create and populate nn:data factors (weights and biases) for successive layers

        workflow_layer_factor[mean_factor] = new nn::data<float>(img_size,img_size,3);
        nn_data_populate(workflow_layer_factor[mean_factor],104.007f,122.679f);

        workflow_layer_factor[conv1_weights] = new nn::data<float>(11,11,3,96);
        nn_data_populate(workflow_layer_factor[conv1_weights],-0.374f,0.403f);

        workflow_layer_factor[conv1_biases] = new nn::data<float>(96);
        nn_data_populate(workflow_layer_factor[conv1_biases],-0.854f,0.232f);

        workflow_layer_factor[conv2_1_weights] = new nn::data<float>(5,5,48,128);
        nn_data_populate(workflow_layer_factor[conv2_1_weights],-0.285f,0.379f);

        workflow_layer_factor[conv2_1_biases] = new nn::data<float>(128);
        nn_data_populate(workflow_layer_factor[conv2_1_biases],0.974f,1.034f);

        workflow_layer_factor[conv2_2_weights] = new nn::data<float>(5,5,48,128);
        nn_data_populate(workflow_layer_factor[conv2_2_weights],-0.269f,0.416f);

        workflow_layer_factor[conv2_2_biases] = new nn::data<float>(128);
        nn_data_populate(workflow_layer_factor[conv2_2_biases],0.958f,1.027f);

        workflow_layer_factor[conv3_weights] = new nn::data<float>(3,3,256,384);
        nn_data_populate(workflow_layer_factor[conv3_weights],-0.185f,0.512f);

        workflow_layer_factor[conv3_biases] = new nn::data<float>(384);
        nn_data_populate(workflow_layer_factor[conv3_biases],-0.104f,0.093f);

        workflow_layer_factor[conv4_1_weights] = new nn::data<float>(3,3,192,192);
        nn_data_populate(workflow_layer_factor[conv4_1_weights],-0.103f,0.322f);

        workflow_layer_factor[conv4_1_biases] = new nn::data<float>(192);
        nn_data_populate(workflow_layer_factor[conv4_1_biases],0.844f,1.142f);

        workflow_layer_factor[conv4_2_weights] = new nn::data<float>(3,3,192,192);
        nn_data_populate(workflow_layer_factor[conv4_2_weights],-0.142f,0.353f);

        workflow_layer_factor[conv4_2_biases] = new nn::data<float>(192);
        nn_data_populate(workflow_layer_factor[conv4_2_biases],0.77f,1.219f);

        workflow_layer_factor[conv5_1_weights] = new nn::data<float>(3,3,192,128);
        nn_data_populate(workflow_layer_factor[conv5_1_weights],-0.092f,0.254f);

        workflow_layer_factor[conv5_1_biases] = new nn::data<float>(128);
        nn_data_populate(workflow_layer_factor[conv5_1_biases],0.723f,1.50f);

        workflow_layer_factor[conv5_2_weights] = new nn::data<float>(3,3,192,128);
        nn_data_populate(workflow_layer_factor[conv5_2_weights],-0.133f,0.315f);

        workflow_layer_factor[conv5_2_biases] = new nn::data<float>(128);
        nn_data_populate(workflow_layer_factor[conv5_2_biases],0.623f,1.742f);

        workflow_layer_factor[fc6_weights] = new nn::data<float>(6,6,256,4096);
        nn_data_populate(workflow_layer_factor[fc6_weights],-0.035f,0.048f);

        workflow_layer_factor[fc6_biases] = new nn::data<float>(4096);
        nn_data_populate(workflow_layer_factor[fc6_biases],0.92f,1.057f);

        workflow_layer_factor[fc7_weights] = new nn::data<float>(4096,4096);
        nn_data_populate(workflow_layer_factor[fc7_weights],-0.032f,0.052f);

        workflow_layer_factor[fc7_biases] = new nn::data<float>(4096);
        nn_data_populate(workflow_layer_factor[fc7_biases],0.741f,1.26f);

        workflow_layer_factor[fc8_weights] = new nn::data<float>(4096,1000);
        nn_data_populate(workflow_layer_factor[fc8_weights],-0.045f,0.067f);

        workflow_layer_factor[fc8_biases] = new nn::data<float>(1000);
        nn_data_populate(workflow_layer_factor[fc8_biases],-0.351f,0.425f);

        di->workflow_create_function(&workflow,1,1);
        // ------------------------------------------------------------------------------------------
        // STAGE 0 (input)
        //         output: 227x227x3
        {
            di->workflow_item_create_function(&workflow_layer[input],0,nullptr,1);

            workflow_layer[input]->type = NN_WORK_ITEM_TYPE_INPUT;
            workflow_layer[input]->arguments.input.index = 0;
            workflow_layer[input]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[input]->output_format[0].format_3d ={{img_size,img_size,3}};
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 0 (imagenet_mean_subtract)
        //         output: 227x227x3
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[input],0};
            di->workflow_item_create_function(&workflow_layer[mean_substract],1,&inputs_descriptor,1);

            workflow_layer[mean_substract]->type = NN_WORK_ITEM_TYPE_ARITHMETIC;
            workflow_layer[mean_substract]->arguments.forward_arithmetic.factor = workflow_layer_factor[mean_factor];
            workflow_layer[mean_substract]->arguments.forward_arithmetic.arithmetic_function = NN_ARITHMETIC_FUNCTION_SUBTRACTION;

            workflow_layer[mean_substract]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[mean_substract]->output_format[0].format_3d ={{img_size,img_size,3}};
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 01
        //           convo: 11x11 stride 4x4; ReLU; output: 55x55x96
        //         maxpool: 3x3 stride 2x2;
        //            norm: RESPONSE_ACROSS_MAPS
        //          output: 27x27x96
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[mean_substract],0};
            di->workflow_item_create_function(&workflow_layer[conv1],1,&inputs_descriptor,1);

            workflow_layer[conv1]->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
            workflow_layer[conv1]->name = "c1";

            workflow_layer[conv1]->arguments.forward_convolution.padding = NN_PADDING_MODE_DATA_OR_ZERO;
            workflow_layer[conv1]->arguments.forward_convolution.activation.function = NN_ACTIVATION_FUNCTION_RELU;

            workflow_layer[conv1]->arguments.forward_convolution.weights = workflow_layer_factor[conv1_weights];
            workflow_layer[conv1]->arguments.forward_convolution.biases = workflow_layer_factor[conv1_biases];

            workflow_layer[conv1]->arguments.forward_convolution.center_offset[0] = 0;
            workflow_layer[conv1]->arguments.forward_convolution.center_offset[1] = 0;

            workflow_layer[conv1]->arguments.forward_convolution.stride[0] = 4;
            workflow_layer[conv1]->arguments.forward_convolution.stride[1] = 4;

            workflow_layer[conv1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv1]->output_format[0].format_3d ={{55,55,96}};
        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[conv1],0};
            di->workflow_item_create_function(&workflow_layer[pool1],1,&inputs_descriptor,1);

            workflow_layer[pool1]->type = NN_WORK_ITEM_TYPE_POOLING;
            workflow_layer[pool1]->name = "p1";

            workflow_layer[pool1]->arguments.forward_pooling.mode = NN_POOLING_MODE_MAX;
            workflow_layer[pool1]->arguments.forward_pooling.size[0] = 3;
            workflow_layer[pool1]->arguments.forward_pooling.size[1] = 3;
            workflow_layer[pool1]->arguments.forward_pooling.stride[0] = 2;
            workflow_layer[pool1]->arguments.forward_pooling.stride[1] = 2;

            workflow_layer[pool1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[pool1]->output_format[0].format_3d ={{27,27,96}};
        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[pool1],0};
            di->workflow_item_create_function(&workflow_layer[norm1],1,&inputs_descriptor,1);

            workflow_layer[norm1]->type = NN_WORK_ITEM_TYPE_NORMALIZATION;
            workflow_layer[norm1]->name = "lrn1";

            workflow_layer[norm1]->arguments.forward_normalization.normalization.mode = NN_NORMALIZATION_MODE_RESPONSE_ACROSS_MAPS;
            workflow_layer[norm1]->arguments.forward_normalization.normalization.k = 1; // in Krishevsky's article is 2
            workflow_layer[norm1]->arguments.forward_normalization.normalization.n = 5;
            workflow_layer[norm1]->arguments.forward_normalization.normalization.alpha = 0.0001f/5; // in Krishevsky's paper is 1e-4,
            // but didn't write that sum of the squares
            // is divided by number of elements (n)
            workflow_layer[norm1]->arguments.forward_normalization.normalization.beta = 0.75f;

            workflow_layer[norm1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[norm1]->output_format[0].format_3d ={{27,27,96}};
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 02
        //           split: 2 (z-axis 96/2); output 27x27x(2*96/2)
        //           convo: 5x5 stride 1x1; ReLU; 0-padded output: 27x27x(2*256/2)
        //           merge: (z-axis)
        //         maxpool: 3x3 stride 2x2;
        //            norm: RESPONSE_ACROSS_MAPS
        //          output: 13x13x256
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[norm1],0};
            di->workflow_item_create_function(&workflow_layer[subv1_1],1,&inputs_descriptor,1); // view g1

            workflow_layer[subv1_1]->type = NN_WORK_ITEM_TYPE_VIEW;
            workflow_layer[subv1_1]->arguments.view.origin[0] = 0;
            workflow_layer[subv1_1]->arguments.view.origin[1] = 0;
            workflow_layer[subv1_1]->arguments.view.origin[2] = 0;

            workflow_layer[subv1_1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[subv1_1]->output_format[0].format_3d ={{27,27,96/2}};

        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[norm1],0};
            di->workflow_item_create_function(&workflow_layer[subv1_2],1,&inputs_descriptor,1);   // view g2

            workflow_layer[subv1_2]->type = NN_WORK_ITEM_TYPE_VIEW;
            workflow_layer[subv1_2]->arguments.view.origin[0] = 0;
            workflow_layer[subv1_2]->arguments.view.origin[1] = 0;
            workflow_layer[subv1_2]->arguments.view.origin[2] = (96/2);

            workflow_layer[subv1_2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[subv1_2]->output_format[0].format_3d ={{27,27,96/2}};
        }

        // convolution 2, g1: 5x5 stride 1x1; ReLU; 0-padded output: 13x13x(2*96/2)
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[subv1_1],0};
            di->workflow_item_create_function(&workflow_layer[conv2_1],1,&inputs_descriptor,1);

            workflow_layer[conv2_1]->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
            workflow_layer[conv2_1]->name = "c2g1";

            workflow_layer[conv2_1]->arguments.forward_convolution.activation.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv2_1]->arguments.forward_convolution.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv2_1]->arguments.forward_convolution.weights = workflow_layer_factor[conv2_1_weights];
            workflow_layer[conv2_1]->arguments.forward_convolution.biases = workflow_layer_factor[conv2_1_biases];

            workflow_layer[conv2_1]->arguments.forward_convolution.center_offset[0] = 2;
            workflow_layer[conv2_1]->arguments.forward_convolution.center_offset[1] = 2;

            workflow_layer[conv2_1]->arguments.forward_convolution.stride[0] = 1;
            workflow_layer[conv2_1]->arguments.forward_convolution.stride[1] = 1;

            workflow_layer[conv2_1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv2_1]->output_format[0].format_3d ={{27,27,256/2}};
        }

        // convolution 2, g2: 5x5 stride 1x1; ReLU; 0-padded output: 13x13x(2*96/2)
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[subv1_2],0};
            di->workflow_item_create_function(&workflow_layer[conv2_2],1,&inputs_descriptor,1);

            workflow_layer[conv2_2]->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
            workflow_layer[conv2_2]->name = "c2g2";

            workflow_layer[conv2_2]->arguments.forward_convolution.activation.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv2_2]->arguments.forward_convolution.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv2_2]->arguments.forward_convolution.weights = workflow_layer_factor[conv2_2_weights];
            workflow_layer[conv2_2]->arguments.forward_convolution.biases = workflow_layer_factor[conv2_2_biases];

            workflow_layer[conv2_2]->arguments.forward_convolution.center_offset[0] = 2;
            workflow_layer[conv2_2]->arguments.forward_convolution.center_offset[1] = 2;

            workflow_layer[conv2_2]->arguments.forward_convolution.stride[0] = 1;
            workflow_layer[conv2_2]->arguments.forward_convolution.stride[1] = 1;

            workflow_layer[conv2_2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv2_2]->output_format[0].format_3d ={{27,27,256/2}};
        }

        // merge g1 and g2
        {
            nn_workflow_use_descriptor_t inputs_descriptor[] ={{workflow_layer[conv2_1],0},{workflow_layer[conv2_2],0}};
            di->workflow_item_create_function(&workflow_layer[merge2],2,inputs_descriptor,1);

            workflow_layer[merge2]->type = NN_WORK_ITEM_TYPE_MERGE;
            workflow_layer[merge2]->arguments.forward_merge.axis = 2; // value 2 for z-axis

            workflow_layer[merge2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[merge2]->output_format[0].format_3d ={{27,27,256}};

        }

        // maxpool: 3x3 stride 2x2;
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[merge2],0};
            di->workflow_item_create_function(&workflow_layer[pool2],1,&inputs_descriptor,1); // pooling

            workflow_layer[pool2]->type = NN_WORK_ITEM_TYPE_POOLING;
            workflow_layer[pool2]->name = "p2";

            workflow_layer[pool2]->arguments.forward_pooling.mode = NN_POOLING_MODE_MAX;

            workflow_layer[pool2]->arguments.forward_pooling.size[0] = 3;
            workflow_layer[pool2]->arguments.forward_pooling.size[1] = 3;

            workflow_layer[pool2]->arguments.forward_pooling.stride[0] = 2;
            workflow_layer[pool2]->arguments.forward_pooling.stride[1] = 2;

            workflow_layer[pool2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[pool2]->output_format[0].format_3d ={{13,13,256}};
        }

        //norm: RESPONSE_ACROSS_MAPS; output: 13x13x256
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[pool2],0};
            di->workflow_item_create_function(&workflow_layer[norm2],1,&inputs_descriptor,1);

            workflow_layer[norm2]->type = NN_WORK_ITEM_TYPE_NORMALIZATION;
            workflow_layer[norm2]->name = "lrn2";

            workflow_layer[norm2]->arguments.forward_normalization.normalization.mode = NN_NORMALIZATION_MODE_RESPONSE_ACROSS_MAPS;
            workflow_layer[norm2]->arguments.forward_normalization.normalization.k = 1;              // |
            workflow_layer[norm2]->arguments.forward_normalization.normalization.n = 5;              // |
            workflow_layer[norm2]->arguments.forward_normalization.normalization.alpha = 0.0001f/5;  // > see coment at wrkflwi_stage_1_norm
            workflow_layer[norm2]->arguments.forward_normalization.normalization.beta = 0.75f;       // |

            workflow_layer[norm2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[norm2]->output_format[0].format_3d ={{13,13,256}};
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 03
        //           convo: 3x3 stride 1x1; ReLU; 0-padded
        //          output: 13x13x384
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[norm2],0};
            di->workflow_item_create_function(&workflow_layer[conv3],1,&inputs_descriptor,1);

            workflow_layer[conv3]->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
            workflow_layer[conv3]->name = "c3";
            workflow_layer[conv3]->arguments.forward_convolution.activation.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv3]->arguments.forward_convolution.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv3]->arguments.forward_convolution.weights = workflow_layer_factor[conv3_weights];
            workflow_layer[conv3]->arguments.forward_convolution.biases = workflow_layer_factor[conv3_biases];

            workflow_layer[conv3]->arguments.forward_convolution.center_offset[0] = 1;
            workflow_layer[conv3]->arguments.forward_convolution.center_offset[1] = 1;

            workflow_layer[conv3]->arguments.forward_convolution.stride[0] = 1;
            workflow_layer[conv3]->arguments.forward_convolution.stride[1] = 1;

            workflow_layer[conv3]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv3]->output_format[0].format_3d ={{13,13,384}};
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 04
        //           split: 2 (z-axis 384/2)
        //           convo: 3x3 stride 1x1; ReLU; 0-padded
        //          output: 13x13x(2*384/2) (continue split to next stage)
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[conv3],0};
            di->workflow_item_create_function(&workflow_layer[subv3_1],1,&inputs_descriptor,1); // view g1

            workflow_layer[subv3_1]->type = NN_WORK_ITEM_TYPE_VIEW;
            workflow_layer[subv3_1]->arguments.view.origin[0] = 0;
            workflow_layer[subv3_1]->arguments.view.origin[1] = 0;
            workflow_layer[subv3_1]->arguments.view.origin[2] = 0;

            workflow_layer[subv3_1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[subv3_1]->output_format[0].format_3d ={{13,13,384/2}};
        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[conv3],0};
            di->workflow_item_create_function(&workflow_layer[subv3_2],1,&inputs_descriptor,1); // view g2

            workflow_layer[subv3_2]->type = NN_WORK_ITEM_TYPE_VIEW;
            workflow_layer[subv3_2]->arguments.view.origin[0] = 0;
            workflow_layer[subv3_2]->arguments.view.origin[1] = 0;
            workflow_layer[subv3_2]->arguments.view.origin[2] = 384/2;

            workflow_layer[subv3_2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[subv3_2]->output_format[0].format_3d ={{13,13,384/2}};

        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[subv3_1],0};
            di->workflow_item_create_function(&workflow_layer[conv4_1],1,&inputs_descriptor,1); // conv g1

            workflow_layer[conv4_1]->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
            workflow_layer[conv4_1]->name = "c4g1";

            workflow_layer[conv4_1]->arguments.forward_convolution.activation.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv4_1]->arguments.forward_convolution.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv4_1]->arguments.forward_convolution.weights = workflow_layer_factor[conv4_1_weights];
            workflow_layer[conv4_1]->arguments.forward_convolution.biases = workflow_layer_factor[conv4_1_biases];

            workflow_layer[conv4_1]->arguments.forward_convolution.center_offset[0] = 1;
            workflow_layer[conv4_1]->arguments.forward_convolution.center_offset[1] = 1;

            workflow_layer[conv4_1]->arguments.forward_convolution.stride[0] = 1;
            workflow_layer[conv4_1]->arguments.forward_convolution.stride[1] = 1;

            workflow_layer[conv4_1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv4_1]->output_format[0].format_3d ={{13,13,384/2}};
        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[subv3_2],0};
            di->workflow_item_create_function(&workflow_layer[conv4_2],1,&inputs_descriptor,1); // conv g2

            workflow_layer[conv4_2]->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
            workflow_layer[conv4_2]->name = "c4g2";

            workflow_layer[conv4_2]->arguments.forward_convolution.activation.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv4_2]->arguments.forward_convolution.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv4_2]->arguments.forward_convolution.weights = workflow_layer_factor[conv4_1_weights];
            workflow_layer[conv4_2]->arguments.forward_convolution.biases = workflow_layer_factor[conv4_2_biases];

            workflow_layer[conv4_2]->arguments.forward_convolution.center_offset[0] = 1;
            workflow_layer[conv4_2]->arguments.forward_convolution.center_offset[1] = 1;

            workflow_layer[conv4_2]->arguments.forward_convolution.stride[0] = 1;
            workflow_layer[conv4_2]->arguments.forward_convolution.stride[1] = 1;

            workflow_layer[conv4_2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv4_2]->output_format[0].format_3d ={{13,13,384/2}};
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 05
        //           convo: 3x3 stride 1x1; ReLU; 0-padded; output: 13x13x(2*256/2)
        //           merge: (z-axis)
        //         maxpool: 3x3 stride 2x2;
        //          output: 13x13x256
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[conv4_1],0};
            di->workflow_item_create_function(&workflow_layer[conv5_1],1,&inputs_descriptor,1); // conv g1

            workflow_layer[conv5_1]->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
            workflow_layer[conv5_1]->name = "c5g1";

            workflow_layer[conv5_1]->arguments.forward_convolution.activation.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv5_1]->arguments.forward_convolution.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv5_1]->arguments.forward_convolution.weights = workflow_layer_factor[conv5_1_weights];
            workflow_layer[conv5_1]->arguments.forward_convolution.biases = workflow_layer_factor[conv5_1_biases];

            workflow_layer[conv5_1]->arguments.forward_convolution.center_offset[0] = 1;
            workflow_layer[conv5_1]->arguments.forward_convolution.center_offset[1] = 1;

            workflow_layer[conv5_1]->arguments.forward_convolution.stride[0] = 1;
            workflow_layer[conv5_1]->arguments.forward_convolution.stride[1] = 1;

            workflow_layer[conv5_1]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv5_1]->output_format[0].format_3d ={{13,13,256/2}};
        }

        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[conv4_2],0};
            di->workflow_item_create_function(&workflow_layer[conv5_2],1,&inputs_descriptor,1); // conv g2

            workflow_layer[conv5_2]->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
            workflow_layer[conv5_2]->name = "c5g2";

            workflow_layer[conv5_2]->arguments.forward_convolution.activation.function = NN_ACTIVATION_FUNCTION_RELU;
            workflow_layer[conv5_2]->arguments.forward_convolution.padding = NN_PADDING_MODE_DATA_OR_ZERO;

            workflow_layer[conv5_2]->arguments.forward_convolution.weights = workflow_layer_factor[conv5_2_weights];
            workflow_layer[conv5_2]->arguments.forward_convolution.biases = workflow_layer_factor[conv5_2_biases];

            workflow_layer[conv5_2]->arguments.forward_convolution.center_offset[0] = 1;
            workflow_layer[conv5_2]->arguments.forward_convolution.center_offset[1] = 1;

            workflow_layer[conv5_2]->arguments.forward_convolution.stride[0] = 1;
            workflow_layer[conv5_2]->arguments.forward_convolution.stride[1] = 1;

            workflow_layer[conv5_2]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[conv5_2]->output_format[0].format_3d ={{13,13,256/2}};
        }

        // merge g1 and g2
        {
            nn_workflow_use_descriptor_t inputs_descriptor[] ={{workflow_layer[conv5_1],0},{workflow_layer[conv5_2],0}};
            di->workflow_item_create_function(&workflow_layer[merge5],2,inputs_descriptor,1);

            workflow_layer[merge5]->type = NN_WORK_ITEM_TYPE_MERGE;
            workflow_layer[merge5]->arguments.forward_merge.axis = 2; // value 2 for z-axis

            workflow_layer[merge5]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[merge5]->output_format[0].format_3d ={{13,13,256}};
        }

        // maxpool: 3x3 stride 2x2;
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[merge5],0};
            di->workflow_item_create_function(&workflow_layer[pool5],1,&inputs_descriptor,1); // pooling

            workflow_layer[pool5]->type = NN_WORK_ITEM_TYPE_POOLING;
            workflow_layer[pool5]->name = "p5";

            workflow_layer[pool5]->arguments.forward_pooling.mode = NN_POOLING_MODE_MAX;

            workflow_layer[pool5]->arguments.forward_pooling.size[0] = 3;
            workflow_layer[pool5]->arguments.forward_pooling.size[1] = 3;

            workflow_layer[pool5]->arguments.forward_pooling.stride[0] = 2;
            workflow_layer[pool5]->arguments.forward_pooling.stride[1] = 2;

            workflow_layer[pool5]->output_format[0].format = NN_DATA_FORMAT_3D;
            workflow_layer[pool5]->output_format[0].format_3d ={{6,6,256}};
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 06
        //            full: ReLU
        //          output: 4096
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[pool5],0};
            di->workflow_item_create_function(&workflow_layer[fc6],1,&inputs_descriptor,1);

            workflow_layer[fc6]->type = NN_WORK_ITEM_TYPE_FULLY_CONNECTED;
            workflow_layer[fc6]->name = "fc6";

            workflow_layer[fc6]->arguments.forward_fully_connected.activation.function = NN_ACTIVATION_FUNCTION_RELU;

            workflow_layer[fc6]->arguments.forward_fully_connected.weights = workflow_layer_factor[fc6_weights];
            workflow_layer[fc6]->arguments.forward_fully_connected.biases = workflow_layer_factor[fc6_biases];

            workflow_layer[fc6]->output_format[0].format = NN_DATA_FORMAT_1D;
            workflow_layer[fc6]->output_format[0].format_1d ={{4096}};
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 07
        //            full: ReLU
        //          output: 4096
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[fc6],0};
            di->workflow_item_create_function(&workflow_layer[fc7],1,&inputs_descriptor,1);

            workflow_layer[fc7]->type = NN_WORK_ITEM_TYPE_FULLY_CONNECTED;
            workflow_layer[fc7]->name = "fc7";
            workflow_layer[fc7]->arguments.forward_fully_connected.activation.function = NN_ACTIVATION_FUNCTION_RELU;

            workflow_layer[fc7]->arguments.forward_fully_connected.weights = workflow_layer_factor[fc7_weights];
            workflow_layer[fc7]->arguments.forward_fully_connected.biases = workflow_layer_factor[fc7_biases];

            workflow_layer[fc7]->output_format[0].format = NN_DATA_FORMAT_1D;
            workflow_layer[fc7]->output_format[0].format_1d ={{4096}};
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 08
        //            full: ;
        //          output: 1000
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[fc7],0};
            di->workflow_item_create_function(&workflow_layer[fc8],1,&inputs_descriptor,1);

            workflow_layer[fc8]->type = NN_WORK_ITEM_TYPE_FULLY_CONNECTED;
            workflow_layer[fc8]->name = "fc8";

            workflow_layer[fc8]->arguments.forward_fully_connected.activation.function = NN_ACTIVATION_FUNCTION_NONE;

            workflow_layer[fc8]->arguments.forward_fully_connected.weights = workflow_layer_factor[fc8_weights];
            workflow_layer[fc8]->arguments.forward_fully_connected.biases = workflow_layer_factor[fc8_biases];

            workflow_layer[fc8]->output_format[0].format = NN_DATA_FORMAT_1D;
            workflow_layer[fc8]->output_format[0].format_1d ={{1000}};
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 09 (softmax)
        //          output: 1000
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[fc8],0};
            di->workflow_item_create_function(&workflow_layer[softmax],1,&inputs_descriptor,1);

            workflow_layer[softmax]->type = NN_WORK_ITEM_TYPE_SOFTMAX;

            workflow_layer[softmax]->output_format[0].format = NN_DATA_FORMAT_1D;
            workflow_layer[softmax]->output_format[0].format_1d ={{1000}};
        }

        // ------------------------------------------------------------------------------------------
        // STAGE 10 (output)
        //          output: 1000
        {
            nn_workflow_use_descriptor_t inputs_descriptor ={workflow_layer[softmax],0};
            di->workflow_item_create_function(&workflow_layer[output],1,&inputs_descriptor,1);

            workflow_layer[output]->type = NN_WORK_ITEM_TYPE_OUTPUT;

            workflow_layer[output]->output_format[0].format = NN_DATA_FORMAT_1D;
            workflow_layer[output]->output_format[0].format_1d ={{1000}};

        }

        // -------------------------------------------------------------------------------------------
        // END of workflow stages definition
        // -------------------------------------------------------------------------------------------
        workflow->input[0] = workflow_layer[input];
        workflow->output[0] = workflow_layer[output];
        // -------------------------------------------------------------------------------------------

        return workflow;
    }
// Builds a minimal GPU workload (input item -> convolution item -> output item),
// executes it through the device interface `di`, and compares the device result
// against a CPU reference produced by convolve_ref().
//
// Parameters describe the convolution geometry: feature-map counts/sizes, kernel
// size and strides, batch count, and the activation applied after convolution.
// Verification is reported via EXPECT_EQ (gtest-style); the function itself
// always returns true.
//
// NOTE(review): ownership of input_workload_item / convolution_workload_item /
// output_workload_item (and of the weights/biases nn_workload_data buffers
// attached to the convolution item) appears to transfer to gpu_workload and be
// released by workload_delete_function — confirm against the helpers' contracts.
bool run_convolve_test(
    const nn_device_interface_0_t &di,
    uint_least32_t                num_output_feature_maps,
    uint_least32_t                num_input_feature_maps,
    uint_least32_t                input_feature_map_width,
    uint_least32_t                input_feature_map_height,
    uint_least32_t                kernel_width,
    uint_least32_t                kernel_height,
    uint_least32_t                kernel_stride_x,
    uint_least32_t                kernel_stride_y,
    uint_least32_t                num_batches,
    NN_ACTIVATION_FUNCTION        activation_function  )
{
    // Input generation (helper allocates the buffer; freed at the bottom of this test)
    float *input = nullptr;
    generate_input_data( input, input_feature_map_width, input_feature_map_height, num_input_feature_maps,
                         num_batches );

    // Generate Filter Data (helper allocates the buffer)
    float *filters = nullptr;
    generate_filter_data( filters,
                          kernel_width,
                          kernel_height,
                          num_input_feature_maps,
                          num_output_feature_maps );

    // Output dimensions of a "valid" (unpadded) convolution with the given strides.
    uint_least32_t output_width  = ( ( input_feature_map_width - kernel_width ) / kernel_stride_x + 1 );
    uint_least32_t output_height = ( ( input_feature_map_height - kernel_height ) / kernel_stride_y + 1 );
    uint_least32_t output_depth  = num_output_feature_maps;

    // cpu_outputs and gpu_outputs are filled in with biases
    // so as such biases do not exist as separate entity
    float init_output_val = 0.0;        //No biases in output then output is initialized with zeros
    float *biases         = nullptr;
    float *cpu_outputs = nullptr;
    float *gpu_outputs = nullptr;

    // Biases exists as separate entity (each neuron got it own bias value);
    // both output buffers start zeroed so CPU and GPU accumulate from the same state.
    init_data( biases, output_width * output_height * output_depth, 1.0f );
    init_data( gpu_outputs, output_width * output_height * output_depth * num_batches, 0.0f );
    init_data( cpu_outputs, output_width * output_height * output_depth * num_batches, 0.0f );

    // Map the activation enum onto the scalar reference function used by convolve_ref.
    fp_func_activ activ_func = nullptr;
    switch( activation_function )
    {
    case NN_ACTIVATION_FUNCTION_NONE:
        activ_func = none;
        break;
    case NN_ACTIVATION_FUNCTION_TANH:
        activ_func = mytanh;
        break;
    case NN_ACTIVATION_FUNCTION_RELU:
        activ_func = relu;
        break;
    case NN_ACTIVATION_FUNCTION_SOFTPLUS:
        activ_func = softplus;
        break;
    default:
        printf( "Error: Not supported activation function chosen: %d\n", activation_function );
        assert( 0 );
        break;
    }

    // Full views over input and output (inclusive end coordinates, hence the "- 1").
    nn_workload_data_coords_t conv_input_view_begin( 0, 0, 0, 0, 0, 0 );
    nn_workload_data_coords_t conv_input_view_end( num_batches - 1, input_feature_map_width - 1, input_feature_map_height - 1, num_input_feature_maps - 1, 0, 0 );
    nn_workload_data_coords_t conv_output_view_begin( 0, 0, 0, 0, 0, 0 );
    nn_workload_data_coords_t conv_output_view_end( num_batches - 1, output_width - 1, output_height - 1, output_depth - 1, 0, 0 );

    // Run reference convolving (needed for comparison)
    convolve_ref( activ_func,
                  cpu_outputs,
                  input,
                  filters,
                  biases,
                  conv_output_view_begin,
                  conv_output_view_end,
                  conv_input_view_begin,
                  conv_input_view_end,
                  output_width,
                  output_height,
                  output_depth,
                  input_feature_map_width,
                  input_feature_map_height,
                  num_input_feature_maps,
                  kernel_width,
                  kernel_height,
                  num_input_feature_maps,
                  kernel_stride_x,
                  kernel_stride_y,
                  0,        // center offset x
                  0,        // center offset y
                  num_batches );



    // First workload item is input one (entity producing input data)
    nn_gpu_workload_item *input_workload_item = nullptr;
    initialize_input_workload_item( input_workload_item);

    // Specify layout: no tiling, no extra alignment, plain x/y/z/p/n/q ordering, float data.
    nn_workload_data_layout_t input_output_weights_layout = {
        { 0, 0, 0, 0, 0, 0 }, // tile in log2(size)
        { 0, 0, 0, 0, 0, 0 }, // alignment
        { NN_DATA_COORD_x, NN_DATA_COORD_y, NN_DATA_COORD_z, NN_DATA_COORD_p, NN_DATA_COORD_n, NN_DATA_COORD_q }, // ordering
        NN_DATATYPE_FLOAT
    };

    // specify dimensions of input, output and weights
    nn_workload_data_coords_t input_coords =
    {
        num_batches,
        input_feature_map_width,
        input_feature_map_height,
        num_input_feature_maps,
        1,
        1
    };

    nn_workload_data_coords_t output_coords =
    {
        num_batches,
        output_width,
        output_height,
        num_output_feature_maps,
        1,
        1
    };

    nn_workload_data_coords_t weight_coords =
    {
        1,
        kernel_width,
        kernel_height,
        num_input_feature_maps,
        num_output_feature_maps,
        1
    };

    // Now create convolution workload_item giving as input input_workload_item.
    // Geometry must match convolve_ref above: no padding, same strides, zero center offset.
    nn_gpu_workload_item *convolution_workload_item = nullptr;
    initialize_layer_workload_item( convolution_workload_item, input_workload_item, input_output_weights_layout, output_coords);
    convolution_workload_item->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
    convolution_workload_item->arguments.forward_convolution.padding = NN_PADDING_MODE_NONE;
    convolution_workload_item->arguments.forward_convolution.stride[0] = kernel_stride_x;
    convolution_workload_item->arguments.forward_convolution.stride[1] = kernel_stride_y;
    convolution_workload_item->arguments.forward_convolution.center_offset[0] = 0;
    convolution_workload_item->arguments.forward_convolution.center_offset[1] = 0;
    convolution_workload_item->arguments.forward_convolution.activation.function = activation_function;

    // Wrap the raw filter buffer, copy it into a workload-owned buffer, then drop the wrapper.
    nn::nn_workload_data_t< float > *weight_data = new nn::nn_workload_data_t< float >( filters, weight_coords, input_output_weights_layout );

    convolution_workload_item->arguments.forward_convolution.weights = new nn::nn_workload_data_t< float >( weight_coords, input_output_weights_layout );
    nn_workload_data_copy( weight_data, convolution_workload_item->arguments.forward_convolution.weights );
    delete weight_data; //release temporary buffers

    // Biases are 1-D along the 'n' (output-feature-map) coordinate.
    nn_workload_data_coords_t bias_coords =
    {
        1,
        1,
        1,
        1,
        num_output_feature_maps,
        1
    };

    // Same wrap-copy-release dance for biases as for weights above.
    nn::nn_workload_data_t< float > *bias_data = new nn::nn_workload_data_t< float >(biases, bias_coords, input_output_weights_layout);
    convolution_workload_item->arguments.forward_convolution.biases = new nn::nn_workload_data_t< float >( bias_coords, input_output_weights_layout );
    nn_workload_data_copy( bias_data, convolution_workload_item->arguments.forward_convolution.biases );
    delete bias_data;   //release temporary buffers

    // Now create output workload_item giving convolution workload item as precedessor
    nn_gpu_workload_item *output_workload_item = nullptr;
    initialize_output_workload_item( output_workload_item, convolution_workload_item );

    // Make a workload using the three above created workload_items
    nn_gpu_workload *gpu_workload = nullptr;
    create_workload_using_workload_items( di, gpu_workload, num_batches, NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH, NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH, input_workload_item, convolution_workload_item, output_workload_item );

    using io_data = std::unique_ptr<nn::data<float, 0>>;
    io_data execute_inputs[1];
    io_data execute_outputs[1];

    // specify dimensions of input, output and weights
    size_t execution_input_size[4] = {input_feature_map_width, input_feature_map_height, num_input_feature_maps, num_batches};
    size_t execution_output_size[4] = {output_width, output_height, num_output_feature_maps, num_batches};

    execute_inputs[0]  = io_data(new nn::data<float, 0>(input, execution_input_size, 4));
    execute_outputs[0] = io_data(new nn::data<float, 0>(gpu_outputs, execution_output_size, 4));

    EXPECT_EQ( NN_API_STATUS_OK, di.workload_execute_function( ( nn_workload * )gpu_workload,
                                             ( void ** )execute_inputs,
                                             ( void ** )execute_outputs, nullptr ) );

    // Compare GPU output against the CPU reference computed earlier.
    EXPECT_EQ( true, verify_output( execute_outputs[0], cpu_outputs ) );


    EXPECT_EQ( NN_API_STATUS_OK, di.workload_delete_function(( nn_workload * )gpu_workload));

    // The generator/init helpers presumably allocate with malloc on Linux and
    // _aligned_malloc on Windows — the matching free variant is used per platform.
#ifdef __linux__
    free( cpu_outputs );
    cpu_outputs = nullptr;
    free( gpu_outputs );
    gpu_outputs = nullptr;
    free( filters );
    filters = nullptr;
    free( biases );
    biases = nullptr;
    free( input );
    input = nullptr;
#else
    _aligned_free( cpu_outputs );
    cpu_outputs = nullptr;
    _aligned_free( gpu_outputs );
    gpu_outputs = nullptr;
    _aligned_free( filters );
    filters = nullptr;
    _aligned_free( biases );
    biases = nullptr;
    _aligned_free( input );
    input = nullptr;
#endif //__linux__

    return true;
}
bool run_softmax_test( const nn_device_interface_0_t &di,
                       uint_least32_t                num_samples,
                       uint_least32_t                num_batches) // length of input to be  processed (softmax normalize)
{
    // Input generation (input feature maps to have pooling run on it)
    float *input = nullptr;
    generate_input_data( input, num_samples, 1, 1, num_batches );

    // length of output is the same as input

    float *cpu_outputs;
    init_data( cpu_outputs, num_samples * num_batches, 0.0f );

    float *gpu_outputs;
    init_data( gpu_outputs, num_samples * num_batches, 0.0f );

    softmax_ref( cpu_outputs, input, num_samples, num_batches );

    // First workload item is input one (entity producing input data)
    nn_gpu_workload_item *input_workload_item = nullptr;
    initialize_input_workload_item( input_workload_item);

    // Specify layout of softmax workload
    nn_workload_data_layout_t workload_layout = {
        { 0, 0, 0, 0, 0, 0 }, // tile in log2(size)
        { 0, 0, 0, 0, 0, 0 }, // alignment
        { NN_DATA_COORD_x, NN_DATA_COORD_y, NN_DATA_COORD_z, NN_DATA_COORD_p, NN_DATA_COORD_n, NN_DATA_COORD_q },
        NN_DATATYPE_FLOAT
    };

    // specify dimensions of input, output
    nn_workload_data_coords_t workload_coords =
    {
        num_batches,
        num_samples,
        1,
        1,
        1,
        1
    };

    size_t output_coords[2] = {num_samples, num_batches};

    // Now create softmax workload_item giving as input input_workload_item
    nn_gpu_workload_item *softmax_workload_item = nullptr;
    initialize_layer_workload_item( softmax_workload_item, input_workload_item, workload_layout, workload_coords );
    softmax_workload_item->type        = NN_WORK_ITEM_TYPE_SOFTMAX;

    // Now create output workload_item giving softmax workload item as precedessor
    nn_gpu_workload_item *output_workload_item = nullptr;
    initialize_output_workload_item( output_workload_item, softmax_workload_item );

    // Make a workload using two above created workload_items
    nn_gpu_workload *gpu_workload = nullptr;
    create_workload_using_workload_items( di, gpu_workload, num_batches, NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH, NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH, input_workload_item, softmax_workload_item, output_workload_item );

    using io_data = std::unique_ptr<nn::data<float, 0>>;
    io_data execute_inputs[1];
    io_data execute_outputs[1];

    execute_inputs[0]  = io_data(new nn::data<float, 0>(input, output_coords, 2));
    execute_outputs[0] = io_data(new nn::data<float, 0>(gpu_outputs, output_coords, 2));

    EXPECT_EQ( NN_API_STATUS_OK, di.workload_execute_function( ( nn_workload * )gpu_workload,
                                             ( void ** )execute_inputs,
                                             ( void ** )execute_outputs, nullptr ) );

    nn_workload_data_coords_t output_view_begin(0, 0, 0, 0, 0, 0);
    nn_workload_data_coords_t output_view_end(num_batches - 1, num_samples - 1, 0, 0, 0, 0);

    // Compare CPU(reference) output with the one returned by GPU
    EXPECT_EQ( true, verify_output( execute_outputs[0], cpu_outputs ) );

    EXPECT_EQ( NN_API_STATUS_OK, di.workload_delete_function(( nn_workload * )gpu_workload));

#ifdef __linux__
    free( cpu_outputs );
    cpu_outputs = nullptr;
    free( gpu_outputs );
    gpu_outputs = nullptr;
    free( input );
    input = nullptr;
#else
    _aligned_free( cpu_outputs );
    cpu_outputs = nullptr;
    _aligned_free( gpu_outputs );
    gpu_outputs = nullptr;
    _aligned_free( input );
    input = nullptr;
#endif //__linux__

    return true;

}
bool test_caffe_float_workload_cpu_time::run()
{
    bool  run_ok = true;
    test_measurement_result   run_result;
    run_result.description = "RUN SUMMARY: " + test_description;

    C_time_control  run_timer;

    std::cout << "-> Testing: " << test_description << std::endl;

    try {
        if(!init()) throw std::runtime_error("error: init() returns false so can't run test");
        run_timer.tick();   //start time measurement
        run_result << std::string("run test with " + current_tested_device->get_device_description());
        // ---------------------------------------------------------------------------------------------------------
        // TODO: here test code
        //{   // BKM pattern of test with time measuring:
        //    bool local_ok=true;
        //    test_measurement_result local_result;
        //    local_result.description = "RUN PART: (name part) of " + test_description;
        //    C_time_control  local_timer;
        //    // begin local test

        //    // end of local test
        //    // summary:
        //    local_timer.tock();
        //    local_result.time_consumed = local_timer.time_diff_string();
        //    local_result.clocks_consumed = local_timer.get_clocks_diff();
        //    tests_results << local_result;
        //} // The pattern, of complex instruction above, can be multiplied
        for(uint16_t batch :{1,8,48})
        {

            std::vector<uint64_t>     time_diffs;
            std::vector<uint64_t>     clock_diffs;

            nn::data<float,4>        *images = new nn::data<float,4>(img_size,img_size,3,batch);
            nn_data_populate(nn::data_cast<float,0>(images),0.0f,255.0f);
            nn_data_t *input_array[1] ={images};

            auto workload_output = new nn::data<float, 2>(1000, batch);
            nn::data<float> *output_array_cmpl[1] ={ nn::data_cast<float, 0>(workload_output) };

            nn_workload_t             *workload = nullptr;

            // compiling workload
            NN_WORKLOAD_DATA_TYPE input_format = NN_WORKLOAD_DATA_TYPE_F32_ZXY_BATCH;
            NN_WORKLOAD_DATA_TYPE output_format = NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH;


            auto status = di->workflow_compile_function(&workload,di->device,workflow,&input_format,&output_format,batch);
            if(!workload) throw std::runtime_error("workload compilation failed for batch = " + std::to_string(batch)
                                                   + " status: " + std::to_string(status));

            test_measurement_result local_result;
            local_result.description = "RUN PART: (batch " + std::to_string(batch)+") execution of " + test_description;
            local_result.loops = loops;

            // begin local test
            for(auto i = 0; i< loops; ++i)
            {
                NN_API_STATUS   status;
                C_time_control  loop_timer;
                di->workload_execute_function(workload,reinterpret_cast<void**>(input_array),reinterpret_cast<void**>(output_array_cmpl),&status);
                loop_timer.tock();
                time_diffs.push_back(loop_timer.get_time_diff()/batch);
                clock_diffs.push_back(loop_timer.get_clocks_diff()/batch);
            }

            // end of local test
            // summary:
            uint64_t  min_value = *std::min_element(time_diffs.begin(),time_diffs.end());
            local_result.time_consumed = std::accumulate(time_diffs.begin(),time_diffs.end(),0.0)/time_diffs.size();
            local_result.time_consumed_min = min_value;
            local_result.time_consumed_max = *std::max_element(time_diffs.begin(),time_diffs.end());

            local_result << std::string("note: The shortest time for one image obtained from the chrono: "
                                        + C_time_control::time_diff_string(min_value));
            local_result << std::string("note: Values of time's and clock's were divided by current value of batch: "+std::to_string(batch));

            local_result.clocks_consumed = std::accumulate(clock_diffs.begin(),clock_diffs.end(),0.0)/clock_diffs.size();
            local_result.clocks_consumed_min = *std::min_element(clock_diffs.begin(),clock_diffs.end());
            local_result.clocks_consumed_max = *std::max_element(clock_diffs.begin(),clock_diffs.end());

            tests_results << local_result;
            if(images != nullptr) delete images;
            if(workload_output != nullptr) delete workload_output;
            if(workload != nullptr) di->workload_delete_function(workload);
        }
        // ---------------------------------------------------------------------------------------------------------
        run_ok = true;
    }
    catch(std::runtime_error &error) {
        run_result << "error: " + std::string(error.what());
        run_ok = false;
    }
    catch(...) {
        run_result << "error: unknown";
        run_ok = false;
    }

    run_timer.tock();
    run_result.time_consumed = run_timer.get_time_diff();
    run_result.clocks_consumed = run_timer.get_clocks_diff();

    run_result.passed = run_ok;
    tests_results << run_result;
    if (!done()) run_ok=false;
    std::cout << "<- Test " << (run_ok ? "passed" : "failed") << std::endl;;
    return run_ok;
}
bool test_softmax_float_cpu_random::run() {
    bool  run_ok = true;
    test_measurement_result   run_result;
    run_result.description = "RUN SUMMARY: " + test_description;

    C_time_control  run_timer;

    std::cout << "-> Testing: " << test_description << std::endl;

    try {
        if( !init() ) throw std::runtime_error( "init() returns false so can't run test" );
        run_timer.tick();   //start time measurement
        run_result << std::string( "run test with " + current_tested_device->get_device_description() );

        NN_WORKLOAD_DATA_TYPE input_format = NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH;
        NN_WORKLOAD_DATA_TYPE output_format = NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH;

        const int softmax_size = 1000;
        for( auto batch : { 1, 8, 48 } ) {
            // ---------------------------------------------------------------------------------------------------------
            {   // simple sample pattern of test with time measuring:
                bool local_ok = true;
                test_measurement_result local_result;
                local_result.description = "RUN PART: (batch " + std::to_string( batch ) + ") execution of " + test_description;
                C_time_control  local_timer;
                // begin local test

                auto input = new nn::data<float>( softmax_size, batch );
                if(input == nullptr)   throw std::runtime_error("unable to create input for batch = " +std::to_string(batch));

                auto workload_output = new nn::data<float>( softmax_size, batch );
                if(workload_output == nullptr)   throw std::runtime_error("unable to create workload_output for batch = " +std::to_string(batch));

                nn_data_populate( workload_output, 0.0f );

                nn_data_populate( input, 0.0f, 20.0f );

                nn_workload_t *workload = nullptr;
                nn_data_t *input_array[1] = { input };
                nn::data<float> *output_array_cmpl[1] = { nn::data_cast<float, 0>(workload_output) };

                auto status = di->workflow_compile_function( &workload, di->device, workflow, &input_format, &output_format, batch );
                if( !workload ) throw std::runtime_error( "workload compilation failed for batch = " + std::to_string( batch )
                                                          + " status: " + std::to_string( status ) );

                di->workload_execute_function( workload, reinterpret_cast<void**>(input_array), reinterpret_cast<void**>(output_array_cmpl), &status );

                auto naive_output = cpu_layer_softmax( input );

                local_ok = compare_data(workload_output, naive_output);

                // end of local test
                // summary:
                local_timer.tock();
                local_result.time_consumed = local_timer.get_time_diff();
                local_result.clocks_consumed = local_timer.get_clocks_diff();
                local_result.passed = local_ok;
                tests_results << local_result;

                run_ok = run_ok && local_ok;

                if( input )           delete input;
                if( workload_output ) delete workload_output;
                if( naive_output )    delete naive_output;
                if( workload )        delete workload;

            } // The pattern, of complex instruction above, can be multiplied
            // END of run tests
            // ---------------------------------------------------------------------------------------------------------
        }
    } catch( std::runtime_error &error ) {
        run_result << "error: " + std::string( error.what() );
        run_ok = false;
    } catch( std::exception &error ) {
        run_result << "error: " + std::string( error.what() );
        run_ok = false;
    } catch( ... ) {
        run_result << "unknown error";
        run_ok = false;
    }

    run_timer.tock();
    run_result.time_consumed = run_timer.get_time_diff();
    run_result.clocks_consumed = run_timer.get_clocks_diff();

    run_result.passed = run_ok;
    tests_results << run_result;
    if( !done() ) run_ok = false;
    std::cout << "<- Test " << (run_ok ? "passed" : "failed") << std::endl;;
    return run_ok;
}