Example #1
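// Runs a 1D convolution of two host vectors on the OpenCL device via
// viennacl::linalg::convolve, computes a host-side reference result with
// convolve_ref, and returns diff_max(res, ref), i.e. the deviation between the two.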
ScalarType opencl_convolve(std::vector<ScalarType>& in1,
                           std::vector<ScalarType>& in2,
                           unsigned int /*row*/, unsigned int /*col*/, unsigned int /*batch_size*/)
{
    //if(in1.size() > 2048) return -1;
    viennacl::vector<ScalarType> input1(in1.size());
    viennacl::vector<ScalarType> input2(in2.size());
    viennacl::vector<ScalarType> output(in1.size());

    viennacl::fast_copy(in1, input1);
    viennacl::fast_copy(in2, input2);

    viennacl::linalg::convolve(input1, input2, output);

    viennacl::backend::finish();
    std::vector<ScalarType> res(in1.size());
    viennacl::fast_copy(output, res);

    std::vector<ScalarType> ref(in1.size());
    convolve_ref(in1, in2, ref);

    return diff_max(res, ref);
}
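
// The helpers convolve_ref and diff_max used above are not part of this listing.
// The sketch below is an assumption for illustration only: convolve_ref is taken
// here to compute a cyclic (circular) convolution of two equally sized vectors
// (the behaviour assumed for viennacl::linalg::convolve), and diff_max to return
// the largest relative deviation between the device result and the reference.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

typedef float ScalarType;   // assumption: the test instantiates ScalarType as float

void convolve_ref(std::vector<ScalarType> const & in1,
                  std::vector<ScalarType> const & in2,
                  std::vector<ScalarType>       & out)
{
    std::size_t n = in1.size();
    for (std::size_t i = 0; i < n; ++i)
    {
        ScalarType sum = 0;
        for (std::size_t k = 0; k < n; ++k)
            sum += in1[k] * in2[(i + n - k) % n];   // wrap the index cyclically
        out[i] = sum;
    }
}

ScalarType diff_max(std::vector<ScalarType> const & res,
                    std::vector<ScalarType> const & ref)
{
    ScalarType max_diff = 0;
    for (std::size_t i = 0; i < res.size(); ++i)
    {
        ScalarType denom = std::max(std::fabs(ref[i]), ScalarType(1)); // guard against division by near-zero
        max_diff = std::max(max_diff, std::fabs(res[i] - ref[i]) / denom);
    }
    return max_diff;
}
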
bool run_convolve_test(
    const nn_device_interface_0_t &di,
    uint_least32_t                num_output_feature_maps,
    uint_least32_t                num_input_feature_maps,
    uint_least32_t                input_feature_map_width,
    uint_least32_t                input_feature_map_height,
    uint_least32_t                kernel_width,
    uint_least32_t                kernel_height,
    uint_least32_t                kernel_stride_x,
    uint_least32_t                kernel_stride_y,
    uint_least32_t                num_batches,
    NN_ACTIVATION_FUNCTION        activation_function  )
{
    // Input generation
    float *input = nullptr;
    generate_input_data( input, input_feature_map_width, input_feature_map_height, num_input_feature_maps,
                         num_batches );

    // Generate Filter Data
    float *filters = nullptr;
    generate_filter_data( filters,
                          kernel_width,
                          kernel_height,
                          num_input_feature_maps,
                          num_output_feature_maps );

    uint_least32_t output_width  = ( ( input_feature_map_width - kernel_width ) / kernel_stride_x + 1 );
    uint_least32_t output_height = ( ( input_feature_map_height - kernel_height ) / kernel_stride_y + 1 );
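    // e.g. a 16x16 input map with a 3x3 kernel and stride 1 gives a ((16 - 3) / 1 + 1) = 14x14 output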
    uint_least32_t output_depth  = num_output_feature_maps;

    // If cpu_outputs and gpu_outputs were pre-filled with the bias values, biases would
    // not exist as a separate entity. Here the biases are kept separate, so both output
    // buffers are initialized with zeros instead.
    float *biases         = nullptr;
    float *cpu_outputs = nullptr;
    float *gpu_outputs = nullptr;

    // Biases exist as a separate entity (each neuron has its own bias value)
    init_data( biases, output_width * output_height * output_depth, 1.0f );
    init_data( gpu_outputs, output_width * output_height * output_depth * num_batches, 0.0f );
    init_data( cpu_outputs, output_width * output_height * output_depth * num_batches, 0.0f );

    // Activation function
    fp_func_activ activ_func = nullptr;
    switch( activation_function )
    {
    case NN_ACTIVATION_FUNCTION_NONE:
        activ_func = none;
        break;
    case NN_ACTIVATION_FUNCTION_TANH:
        activ_func = mytanh;
        break;
    case NN_ACTIVATION_FUNCTION_RELU:
        activ_func = relu;
        break;
    case NN_ACTIVATION_FUNCTION_SOFTPLUS:
        activ_func = softplus;
        break;
    default:
        printf( "Error: Not supported activation function chosen: %d\n", activation_function );
        assert( 0 );
        break;
    }
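    // activ_func is the host-side activation applied by the reference convolution below;
    // the GPU convolution workload item receives activation_function directly.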

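    // The view coordinates below are inclusive ranges, hence the "- 1" on each upper bound.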
    nn_workload_data_coords_t conv_input_view_begin( 0, 0, 0, 0, 0, 0 );
    nn_workload_data_coords_t conv_input_view_end( num_batches - 1, input_feature_map_width - 1, input_feature_map_height - 1, num_input_feature_maps - 1, 0, 0 );
    nn_workload_data_coords_t conv_output_view_begin( 0, 0, 0, 0, 0, 0 );
    nn_workload_data_coords_t conv_output_view_end( num_batches - 1, output_width - 1, output_height - 1, output_depth - 1, 0, 0 );

    // Run the reference convolution (needed for comparison)
    convolve_ref( activ_func,
                  cpu_outputs,
                  input,
                  filters,
                  biases,
                  conv_output_view_begin,
                  conv_output_view_end,
                  conv_input_view_begin,
                  conv_input_view_end,
                  output_width,
                  output_height,
                  output_depth,
                  input_feature_map_width,
                  input_feature_map_height,
                  num_input_feature_maps,
                  kernel_width,
                  kernel_height,
                  num_input_feature_maps,
                  kernel_stride_x,
                  kernel_stride_y,
                  0,        // center offset x
                  0,        // center offset y
                  num_batches );



    // The first workload item is the input one (the entity producing the input data)
    nn_gpu_workload_item *input_workload_item = nullptr;
    initialize_input_workload_item( input_workload_item);

    // Specify layout
    nn_workload_data_layout_t input_output_weights_layout = {
        { 0, 0, 0, 0, 0, 0 }, // tile in log2(size)
        { 0, 0, 0, 0, 0, 0 }, // alignment
        { NN_DATA_COORD_x, NN_DATA_COORD_y, NN_DATA_COORD_z, NN_DATA_COORD_p, NN_DATA_COORD_n, NN_DATA_COORD_q }, // ordering
        NN_DATATYPE_FLOAT
    };

    // Specify dimensions of input, output and weights
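    // The coordinate order is assumed to be (n, x, y, z, p, q); the p dimension is used
    // below for the output-feature-map count of the weights and biases.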
    nn_workload_data_coords_t input_coords =
    {
        num_batches,
        input_feature_map_width,
        input_feature_map_height,
        num_input_feature_maps,
        1,
        1
    };

    nn_workload_data_coords_t output_coords =
    {
        num_batches,
        output_width,
        output_height,
        num_output_feature_maps,
        1,
        1
    };

    nn_workload_data_coords_t weight_coords =
    {
        1,
        kernel_width,
        kernel_height,
        num_input_feature_maps,
        num_output_feature_maps,
        1
    };

    // Now create the convolution workload_item, using input_workload_item as its input
    nn_gpu_workload_item *convolution_workload_item = nullptr;
    initialize_layer_workload_item( convolution_workload_item, input_workload_item, input_output_weights_layout, output_coords);
    convolution_workload_item->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
    convolution_workload_item->arguments.forward_convolution.padding = NN_PADDING_MODE_NONE;
    convolution_workload_item->arguments.forward_convolution.stride[0] = kernel_stride_x;
    convolution_workload_item->arguments.forward_convolution.stride[1] = kernel_stride_y;
    convolution_workload_item->arguments.forward_convolution.center_offset[0] = 0;
    convolution_workload_item->arguments.forward_convolution.center_offset[1] = 0;
    convolution_workload_item->arguments.forward_convolution.activation.function = activation_function; 

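    // Wrap the raw filter buffer in a temporary nn_workload_data_t, use it to populate the
    // convolution item's own weight buffer via nn_workload_data_copy, then release the
    // temporary wrapper. The same pattern is repeated for the biases below.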
    nn::nn_workload_data_t< float > *weight_data = new nn::nn_workload_data_t< float >( filters, weight_coords, input_output_weights_layout );

    convolution_workload_item->arguments.forward_convolution.weights = new nn::nn_workload_data_t< float >( weight_coords, input_output_weights_layout );
    nn_workload_data_copy( weight_data, convolution_workload_item->arguments.forward_convolution.weights );
    delete weight_data; //release temporary buffers

    nn_workload_data_coords_t bias_coords =
    {
        1,
        1,
        1,
        1,
        num_output_feature_maps,
        1
    };

    nn::nn_workload_data_t< float > *bias_data = new nn::nn_workload_data_t< float >(biases, bias_coords, input_output_weights_layout);
    convolution_workload_item->arguments.forward_convolution.biases = new nn::nn_workload_data_t< float >( bias_coords, input_output_weights_layout );
    nn_workload_data_copy( bias_data, convolution_workload_item->arguments.forward_convolution.biases );
    delete bias_data;   //release temporary buffers

    // Now create the output workload_item, with the convolution workload item as its predecessor
    nn_gpu_workload_item *output_workload_item = nullptr;
    initialize_output_workload_item( output_workload_item, convolution_workload_item );

    // Make a workload using the three workload_items created above
    nn_gpu_workload *gpu_workload = nullptr;
    create_workload_using_workload_items( di, gpu_workload, num_batches, NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH, NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH, input_workload_item, convolution_workload_item, output_workload_item );

    using io_data = std::unique_ptr<nn::data<float, 0>>;
    io_data execute_inputs[1];
    io_data execute_outputs[1];

    // Specify dimensions of the execution input and output
    size_t execution_input_size[4] = {input_feature_map_width, input_feature_map_height, num_input_feature_maps, num_batches};
    size_t execution_output_size[4] = {output_width, output_height, num_output_feature_maps, num_batches};

    execute_inputs[0]  = io_data(new nn::data<float, 0>(input, execution_input_size, 4));
    execute_outputs[0] = io_data(new nn::data<float, 0>(gpu_outputs, execution_output_size, 4));

    EXPECT_EQ( NN_API_STATUS_OK, di.workload_execute_function( ( nn_workload * )gpu_workload,
                                             ( void ** )execute_inputs,
                                             ( void ** )execute_outputs, nullptr ) );

    EXPECT_EQ( true, verify_output( execute_outputs[0], cpu_outputs ) );


    EXPECT_EQ( NN_API_STATUS_OK, di.workload_delete_function(( nn_workload * )gpu_workload));

#ifdef __linux__
    free( cpu_outputs );
    cpu_outputs = nullptr;
    free( gpu_outputs );
    gpu_outputs = nullptr;
    free( filters );
    filters = nullptr;
    free( biases );
    biases = nullptr;
    free( input );
    input = nullptr;
#else
    _aligned_free( cpu_outputs );
    cpu_outputs = nullptr;
    _aligned_free( gpu_outputs );
    gpu_outputs = nullptr;
    _aligned_free( filters );
    filters = nullptr;
    _aligned_free( biases );
    biases = nullptr;
    _aligned_free( input );
    input = nullptr;
#endif //__linux__

    return true;
}
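
// A hypothetical driver for the test above, for illustration only. How the
// nn_device_interface_0_t is obtained depends on the surrounding test fixture and is
// simply passed in here; the parameter values are arbitrary: 3 input maps of 16x16,
// 8 output maps, 3x3 kernels, stride 1x1, a batch of 2, and ReLU activation.
bool run_small_convolve_example( const nn_device_interface_0_t &di )
{
    return run_convolve_test( di,
                              8,    // num_output_feature_maps
                              3,    // num_input_feature_maps
                              16,   // input_feature_map_width
                              16,   // input_feature_map_height
                              3,    // kernel_width
                              3,    // kernel_height
                              1,    // kernel_stride_x
                              1,    // kernel_stride_y
                              2,    // num_batches
                              NN_ACTIVATION_FUNCTION_RELU );
}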