예제 #1
0
    void run_average_delta(nn_workload_item *const item)
    {
        assert(item->input.size() == item->output.size());

        for (uint32_t parameter = 0; parameter < item->input.size(); ++parameter)
        {
            auto input = nn::workload_data_cast<nn::layout_f32>(item->input[parameter].get_data_view());
            auto output = nn::workload_data_cast<nn::layout_f32>(item->output[parameter]);

            auto input_batch = input->parent->lengths.t[NN_DATA_COORD_n];
            auto output_batch = output->parent->lengths.t[NN_DATA_COORD_n];

            if (input_batch == output_batch)
                nn_workload_data_copy(output, input);
            else
            {
                for (uint32_t x = 0; x < output->parent->lengths.t[NN_DATA_COORD_x]; ++x)
                    for (uint32_t y = 0; y < output->parent->lengths.t[NN_DATA_COORD_y]; ++y)
                        for (uint32_t z = 0; z < output->parent->lengths.t[NN_DATA_COORD_z]; ++z)
                            for (uint32_t p = 0; p < output->parent->lengths.t[NN_DATA_COORD_p]; ++p)
                                for (uint32_t q = 0; q < output->parent->lengths.t[NN_DATA_COORD_q]; ++q)
                                {
                                    float acc = 0.0f;
                                    for (uint32_t n = 0; n < input_batch; ++n)
                                        acc += input->at(n, x, y, z, p, q);

                                    acc /= static_cast<float>(input_batch);

                                    (*output)(0, x, y, z, p, q) = acc;
                                }
            }
        }
    }
예제 #2
0
    template <bool backward> void run_dropout(
        float drop_rate,
        const nn_workload_data_t *input_data,
        const nn_workload_data_t *input_seed,
        const nn_workload_data_t *input_if_train,
        nn_workload_data_t *output)
    {
        // Third input - indicates if it's test or training phase.
        auto is_training = nn_workload_data_get<int32_t>(input_if_train, 0, 0, 0, 0, 0, 0) != 0;

        if(!is_training)
        {
            if(backward)
                nn_workload_delta_copy(output, input_data);
            else
                nn_workload_data_copy(output, input_data);
         
            return;
        }

        // Second input - seed.
        auto seed = static_cast<uint32_t>(nn_workload_data_get<int32_t>(input_seed, 0, 0, 0, 0, 0, 0));

        std::mt19937 gen(seed);
        std::bernoulli_distribution dis(drop_rate);
        float scale = 1.0f / (1.0f - drop_rate);

        for (uint32_t n = 0; n < output->parent->lengths.t[NN_DATA_COORD_n]; ++n)
            for (uint32_t x = 0; x < output->parent->lengths.t[NN_DATA_COORD_x]; ++x)
                for (uint32_t y = 0; y < output->parent->lengths.t[NN_DATA_COORD_y]; ++y)
                    for (uint32_t z = 0; z < output->parent->lengths.t[NN_DATA_COORD_z]; ++z)
                        for (uint32_t p = 0; p < output->parent->lengths.t[NN_DATA_COORD_p]; ++p)
                            for (uint32_t q = 0; q < output->parent->lengths.t[NN_DATA_COORD_q]; ++q)
                            {
                                if(backward)
                                    nn_workload_data_get_delta<float>(output, n, x, y, z, p, q) = dis(gen) ? 0.0f : nn_workload_data_get_delta<float>(input_data, n, x, y, z, p, q) * scale;
                                else
                                    nn_workload_data_get<float>(output, n, x, y, z, p, q) = dis(gen) ? 0.0f : nn_workload_data_get<float>(input_data, n, x, y, z, p, q) * scale;
                            }
                                
    }
bool run_convolve_test(
    const nn_device_interface_0_t &di,
    uint_least32_t                num_output_feature_maps,
    uint_least32_t                num_input_feature_maps,
    uint_least32_t                input_feature_map_width,
    uint_least32_t                input_feature_map_height,
    uint_least32_t                kernel_width,
    uint_least32_t                kernel_height,
    uint_least32_t                kernel_stride_x,
    uint_least32_t                kernel_stride_y,
    uint_least32_t                num_batches,
    NN_ACTIVATION_FUNCTION        activation_function  )
{
    // Input generation
    float *input = nullptr;
    generate_input_data( input, input_feature_map_width, input_feature_map_height, num_input_feature_maps,
                         num_batches );

    // Generate Filter Data
    float *filters = nullptr;
    generate_filter_data( filters,
                          kernel_width,
                          kernel_height,
                          num_input_feature_maps,
                          num_output_feature_maps );

    uint_least32_t output_width  = ( ( input_feature_map_width - kernel_width ) / kernel_stride_x + 1 );
    uint_least32_t output_height = ( ( input_feature_map_height - kernel_height ) / kernel_stride_y + 1 );
    uint_least32_t output_depth  = num_output_feature_maps;

    // cpu_outputs and gpu_outputs are filled in with biases
    // so as such biases do not exist as separate entity
    float init_output_val = 0.0;        //No biases in output then output is initialized with zeros
    float *biases         = nullptr;
    float *cpu_outputs = nullptr;
    float *gpu_outputs = nullptr;

    // Biases exists as separate entity (each neuron got it own bias value)
    init_data( biases, output_width * output_height * output_depth, 1.0f );
    init_data( gpu_outputs, output_width * output_height * output_depth * num_batches, 0.0f );
    init_data( cpu_outputs, output_width * output_height * output_depth * num_batches, 0.0f );

    // Activation function
    fp_func_activ activ_func = nullptr;
    switch( activation_function )
    {
    case NN_ACTIVATION_FUNCTION_NONE:
        activ_func = none;
        break;
    case NN_ACTIVATION_FUNCTION_TANH:
        activ_func = mytanh;
        break;
    case NN_ACTIVATION_FUNCTION_RELU:
        activ_func = relu;
        break;
    case NN_ACTIVATION_FUNCTION_SOFTPLUS:
        activ_func = softplus;
        break;
    default:
        printf( "Error: Not supported activation function chosen: %d\n", activation_function );
        assert( 0 );
        break;
    }

    nn_workload_data_coords_t conv_input_view_begin( 0, 0, 0, 0, 0, 0 );
    nn_workload_data_coords_t conv_input_view_end( num_batches - 1, input_feature_map_width - 1, input_feature_map_height - 1, num_input_feature_maps - 1, 0, 0 );
    nn_workload_data_coords_t conv_output_view_begin( 0, 0, 0, 0, 0, 0 );
    nn_workload_data_coords_t conv_output_view_end( num_batches - 1, output_width - 1, output_height - 1, output_depth - 1, 0, 0 );

    // Run reference convolving (needed for comparison)
    convolve_ref( activ_func,
                  cpu_outputs,
                  input,
                  filters,
                  biases,
                  conv_output_view_begin,
                  conv_output_view_end,
                  conv_input_view_begin,
                  conv_input_view_end,
                  output_width,
                  output_height,
                  output_depth,
                  input_feature_map_width,
                  input_feature_map_height,
                  num_input_feature_maps,
                  kernel_width,
                  kernel_height,
                  num_input_feature_maps,
                  kernel_stride_x,
                  kernel_stride_y,
                  0,        // center offset x
                  0,        // center offset y
                  num_batches );



    // First workload item is input one (entity producing input data)
    nn_gpu_workload_item *input_workload_item = nullptr;
    initialize_input_workload_item( input_workload_item);

    // Specify layout
    nn_workload_data_layout_t input_output_weights_layout = {
        { 0, 0, 0, 0, 0, 0 }, // tile in log2(size)
        { 0, 0, 0, 0, 0, 0 }, // alignment
        { NN_DATA_COORD_x, NN_DATA_COORD_y, NN_DATA_COORD_z, NN_DATA_COORD_p, NN_DATA_COORD_n, NN_DATA_COORD_q }, // ordering
        NN_DATATYPE_FLOAT
    };

    // specify dimensions of input, output and weights
    nn_workload_data_coords_t input_coords =
    {
        num_batches,
        input_feature_map_width,
        input_feature_map_height,
        num_input_feature_maps,
        1,
        1
    };

    nn_workload_data_coords_t output_coords =
    {
        num_batches,
        output_width,
        output_height,
        num_output_feature_maps,
        1,
        1
    };

    nn_workload_data_coords_t weight_coords =
    {
        1,
        kernel_width,
        kernel_height,
        num_input_feature_maps,
        num_output_feature_maps,
        1
    };

    // Now create convolution workload_item giving as input input_workload_item
    nn_gpu_workload_item *convolution_workload_item = nullptr;
    initialize_layer_workload_item( convolution_workload_item, input_workload_item, input_output_weights_layout, output_coords);
    convolution_workload_item->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
    convolution_workload_item->arguments.forward_convolution.padding = NN_PADDING_MODE_NONE;
    convolution_workload_item->arguments.forward_convolution.stride[0] = kernel_stride_x;
    convolution_workload_item->arguments.forward_convolution.stride[1] = kernel_stride_y;
    convolution_workload_item->arguments.forward_convolution.center_offset[0] = 0;
    convolution_workload_item->arguments.forward_convolution.center_offset[1] = 0;
    convolution_workload_item->arguments.forward_convolution.activation.function = activation_function; 

    nn::nn_workload_data_t< float > *weight_data = new nn::nn_workload_data_t< float >( filters, weight_coords, input_output_weights_layout );

    convolution_workload_item->arguments.forward_convolution.weights = new nn::nn_workload_data_t< float >( weight_coords, input_output_weights_layout );
    nn_workload_data_copy( weight_data, convolution_workload_item->arguments.forward_convolution.weights );
    delete weight_data; //release temporary buffers

    nn_workload_data_coords_t bias_coords =
    {
        1,
        1,
        1,
        1,
        num_output_feature_maps,
        1
    };

    nn::nn_workload_data_t< float > *bias_data = new nn::nn_workload_data_t< float >(biases, bias_coords, input_output_weights_layout);
    convolution_workload_item->arguments.forward_convolution.biases = new nn::nn_workload_data_t< float >( bias_coords, input_output_weights_layout );
    nn_workload_data_copy( bias_data, convolution_workload_item->arguments.forward_convolution.biases );
    delete bias_data;   //release temporary buffers

    // Now create output workload_item giving softmax workload item as precedessor
    nn_gpu_workload_item *output_workload_item = nullptr;
    initialize_output_workload_item( output_workload_item, convolution_workload_item );

    // Make a workload using two above created workload_items
    nn_gpu_workload *gpu_workload = nullptr;
    create_workload_using_workload_items( di, gpu_workload, num_batches, NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH, NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH, input_workload_item, convolution_workload_item, output_workload_item );

    using io_data = std::unique_ptr<nn::data<float, 0>>;
    io_data execute_inputs[1];
    io_data execute_outputs[1];

    // specify dimensions of input, output and weights
    size_t execution_input_size[4] = {input_feature_map_width, input_feature_map_height, num_input_feature_maps, num_batches};
    size_t execution_output_size[4] = {output_width, output_height, num_output_feature_maps, num_batches};

    execute_inputs[0]  = io_data(new nn::data<float, 0>(input, execution_input_size, 4));
    execute_outputs[0] = io_data(new nn::data<float, 0>(gpu_outputs, execution_output_size, 4));

    EXPECT_EQ( NN_API_STATUS_OK, di.workload_execute_function( ( nn_workload * )gpu_workload,
                                             ( void ** )execute_inputs,
                                             ( void ** )execute_outputs, nullptr ) );

    EXPECT_EQ( true, verify_output( execute_outputs[0], cpu_outputs ) );


    EXPECT_EQ( NN_API_STATUS_OK, di.workload_delete_function(( nn_workload * )gpu_workload));

#ifdef __linux__
    free( cpu_outputs );
    cpu_outputs = nullptr;
    free( gpu_outputs );
    gpu_outputs = nullptr;
    free( filters );
    filters = nullptr;
    free( biases );
    biases = nullptr;
    free( input );
    input = nullptr;
#else
    _aligned_free( cpu_outputs );
    cpu_outputs = nullptr;
    _aligned_free( gpu_outputs );
    gpu_outputs = nullptr;
    _aligned_free( filters );
    filters = nullptr;
    _aligned_free( biases );
    biases = nullptr;
    _aligned_free( input );
    input = nullptr;
#endif //__linux__

    return true;
}