/* Averages per-batch deltas for every input/output pair attached to the
   work item: when batch sizes match this is a straight copy, otherwise the
   input's batch dimension is reduced by arithmetic mean into batch 0 of
   the output. */
void run_average_delta(nn_workload_item *const item)
{
    assert(item->input.size() == item->output.size());

    for (uint32_t param_idx = 0; param_idx < item->input.size(); ++param_idx)
    {
        auto src = nn::workload_data_cast<nn::layout_f32>(item->input[param_idx].get_data_view());
        auto dst = nn::workload_data_cast<nn::layout_f32>(item->output[param_idx]);

        const auto src_batch = src->parent->lengths.t[NN_DATA_COORD_n];
        const auto dst_batch = dst->parent->lengths.t[NN_DATA_COORD_n];

        if (src_batch == dst_batch)
        {
            // Same batch size on both sides - a plain copy suffices.
            nn_workload_data_copy(dst, src);
            continue;
        }

        // Batch sizes differ: reduce the source batch dimension by mean.
        // NOTE(review): the result is written only to batch index 0 of the
        // output - presumably dst_batch == 1 in this branch; confirm with callers.
        const auto &len = dst->parent->lengths;
        for (uint32_t x = 0; x < len.t[NN_DATA_COORD_x]; ++x)
            for (uint32_t y = 0; y < len.t[NN_DATA_COORD_y]; ++y)
                for (uint32_t z = 0; z < len.t[NN_DATA_COORD_z]; ++z)
                    for (uint32_t p = 0; p < len.t[NN_DATA_COORD_p]; ++p)
                        for (uint32_t q = 0; q < len.t[NN_DATA_COORD_q]; ++q)
                        {
                            float sum = 0.0f;
                            for (uint32_t n = 0; n < src_batch; ++n)
                                sum += src->at(n, x, y, z, p, q);
                            sum /= static_cast<float>(src_batch);
                            (*dst)(0, x, y, z, p, q) = sum;
                        }
    }
}
template <bool backward> void run_dropout( float drop_rate, const nn_workload_data_t *input_data, const nn_workload_data_t *input_seed, const nn_workload_data_t *input_if_train, nn_workload_data_t *output) { // Third input - indicates if it's test or training phase. auto is_training = nn_workload_data_get<int32_t>(input_if_train, 0, 0, 0, 0, 0, 0) != 0; if(!is_training) { if(backward) nn_workload_delta_copy(output, input_data); else nn_workload_data_copy(output, input_data); return; } // Second input - seed. auto seed = static_cast<uint32_t>(nn_workload_data_get<int32_t>(input_seed, 0, 0, 0, 0, 0, 0)); std::mt19937 gen(seed); std::bernoulli_distribution dis(drop_rate); float scale = 1.0f / (1.0f - drop_rate); for (uint32_t n = 0; n < output->parent->lengths.t[NN_DATA_COORD_n]; ++n) for (uint32_t x = 0; x < output->parent->lengths.t[NN_DATA_COORD_x]; ++x) for (uint32_t y = 0; y < output->parent->lengths.t[NN_DATA_COORD_y]; ++y) for (uint32_t z = 0; z < output->parent->lengths.t[NN_DATA_COORD_z]; ++z) for (uint32_t p = 0; p < output->parent->lengths.t[NN_DATA_COORD_p]; ++p) for (uint32_t q = 0; q < output->parent->lengths.t[NN_DATA_COORD_q]; ++q) { if(backward) nn_workload_data_get_delta<float>(output, n, x, y, z, p, q) = dis(gen) ? 0.0f : nn_workload_data_get_delta<float>(input_data, n, x, y, z, p, q) * scale; else nn_workload_data_get<float>(output, n, x, y, z, p, q) = dis(gen) ? 0.0f : nn_workload_data_get<float>(input_data, n, x, y, z, p, q) * scale; } }
/* End-to-end GPU convolution test: runs a reference CPU convolution and the
   GPU workload implementation on identical random data, then compares the
   two outputs via verify_output / gtest EXPECT macros.
   Returns true (failures are reported through EXPECT_EQ, not the return). */
bool run_convolve_test(
    const nn_device_interface_0_t &di,
    uint_least32_t num_output_feature_maps,
    uint_least32_t num_input_feature_maps,
    uint_least32_t input_feature_map_width,
    uint_least32_t input_feature_map_height,
    uint_least32_t kernel_width,
    uint_least32_t kernel_height,
    uint_least32_t kernel_stride_x,
    uint_least32_t kernel_stride_y,
    uint_least32_t num_batches,
    NN_ACTIVATION_FUNCTION activation_function )
{
    // Input generation (buffer is allocated by the helper).
    float *input = nullptr;
    generate_input_data( input, input_feature_map_width, input_feature_map_height, num_input_feature_maps,
                         num_batches );

    // Generate Filter Data (buffer is allocated by the helper).
    float *filters = nullptr;
    generate_filter_data( filters,
                          kernel_width,
                          kernel_height,
                          num_input_feature_maps,
                          num_output_feature_maps );

    // Output dimensions for a VALID (no padding) convolution.
    uint_least32_t output_width  = ( ( input_feature_map_width - kernel_width ) / kernel_stride_x + 1 );
    uint_least32_t output_height = ( ( input_feature_map_height - kernel_height ) / kernel_stride_y + 1 );
    uint_least32_t output_depth  = num_output_feature_maps;

    // cpu_outputs and gpu_outputs are filled in with biases
    // so as such biases do not exist as separate entity
    float init_output_val = 0.0;        //No biases in output then output is initialized with zeros

    float *biases      = nullptr;
    float *cpu_outputs = nullptr;
    float *gpu_outputs = nullptr;

    // Biases exists as separate entity (each neuron got it own bias value)
    init_data( biases, output_width * output_height * output_depth, 1.0f );
    init_data( gpu_outputs, output_width * output_height * output_depth * num_batches, 0.0f );
    init_data( cpu_outputs, output_width * output_height * output_depth * num_batches, 0.0f );

    // Activation function used by the reference (CPU) path.
    fp_func_activ activ_func = nullptr;
    switch( activation_function )
    {
    case NN_ACTIVATION_FUNCTION_NONE:
        activ_func = none;
        break;
    case NN_ACTIVATION_FUNCTION_TANH:
        activ_func = mytanh;
        break;
    case NN_ACTIVATION_FUNCTION_RELU:
        activ_func = relu;
        break;
    case NN_ACTIVATION_FUNCTION_SOFTPLUS:
        activ_func = softplus;
        break;
    default:
        // NOTE(review): in release builds (NDEBUG) assert is a no-op and
        // execution would continue with activ_func == nullptr.
        printf( "Error: Not supported activation function chosen: %d\n", activation_function );
        assert( 0 );
        break;
    }

    // Views cover the full input/output volumes (coords are inclusive ends).
    nn_workload_data_coords_t conv_input_view_begin( 0, 0, 0, 0, 0, 0 );
    nn_workload_data_coords_t conv_input_view_end( num_batches - 1, input_feature_map_width - 1, input_feature_map_height - 1, num_input_feature_maps - 1, 0, 0 );
    nn_workload_data_coords_t conv_output_view_begin( 0, 0, 0, 0, 0, 0 );
    nn_workload_data_coords_t conv_output_view_end( num_batches - 1, output_width - 1, output_height - 1, output_depth - 1, 0, 0 );

    // Run reference convolving (needed for comparison)
    convolve_ref( activ_func,
                  cpu_outputs,
                  input,
                  filters,
                  biases,
                  conv_output_view_begin,
                  conv_output_view_end,
                  conv_input_view_begin,
                  conv_input_view_end,
                  output_width,
                  output_height,
                  output_depth,
                  input_feature_map_width,
                  input_feature_map_height,
                  num_input_feature_maps,
                  kernel_width,
                  kernel_height,
                  num_input_feature_maps,
                  kernel_stride_x,
                  kernel_stride_y,
                  0,        // center offset x
                  0,        // center offset y
                  num_batches );

    // First workload item is input one (entity producing input data)
    nn_gpu_workload_item *input_workload_item = nullptr;
    initialize_input_workload_item( input_workload_item);

    // Specify layout shared by input, output and weight buffers.
    nn_workload_data_layout_t input_output_weights_layout = {
        { 0, 0, 0, 0, 0, 0 }, // tile in log2(size)
        { 0, 0, 0, 0, 0, 0 }, // alignment
        { NN_DATA_COORD_x, NN_DATA_COORD_y, NN_DATA_COORD_z, NN_DATA_COORD_p, NN_DATA_COORD_n, NN_DATA_COORD_q }, // ordering
        NN_DATATYPE_FLOAT
    };

    // specify dimensions of input, output and weights
    nn_workload_data_coords_t input_coords =
    {
        num_batches,
        input_feature_map_width,
        input_feature_map_height,
        num_input_feature_maps,
        1,
        1
    };

    nn_workload_data_coords_t output_coords =
    {
        num_batches,
        output_width,
        output_height,
        num_output_feature_maps,
        1,
        1
    };

    nn_workload_data_coords_t weight_coords =
    {
        1,
        kernel_width,
        kernel_height,
        num_input_feature_maps,
        num_output_feature_maps,
        1
    };

    // Now create convolution workload_item giving as input input_workload_item
    nn_gpu_workload_item *convolution_workload_item = nullptr;
    initialize_layer_workload_item( convolution_workload_item, input_workload_item, input_output_weights_layout, output_coords);
    convolution_workload_item->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
    convolution_workload_item->arguments.forward_convolution.padding = NN_PADDING_MODE_NONE;
    convolution_workload_item->arguments.forward_convolution.stride[0] = kernel_stride_x;
    convolution_workload_item->arguments.forward_convolution.stride[1] = kernel_stride_y;
    convolution_workload_item->arguments.forward_convolution.center_offset[0] = 0;
    convolution_workload_item->arguments.forward_convolution.center_offset[1] = 0;
    convolution_workload_item->arguments.forward_convolution.activation.function = activation_function;

    // Wrap filter data and copy it into the workload item's weight buffer.
    // NOTE(review): run_average_delta elsewhere in this file calls
    // nn_workload_data_copy(destination, source); here the freshly allocated
    // 'weights' buffer appears as the SECOND argument - verify the argument
    // order of the overload used on this (GPU) path.
    nn::nn_workload_data_t< float > *weight_data = new nn::nn_workload_data_t< float >( filters, weight_coords, input_output_weights_layout );

    convolution_workload_item->arguments.forward_convolution.weights = new nn::nn_workload_data_t< float >( weight_coords, input_output_weights_layout );
    nn_workload_data_copy( weight_data, convolution_workload_item->arguments.forward_convolution.weights );
    delete weight_data; //release temporary buffers

    // Same wrap-and-copy for biases (one bias per output feature map).
    nn_workload_data_coords_t bias_coords =
    {
        1,
        1,
        1,
        1,
        num_output_feature_maps,
        1
    };

    nn::nn_workload_data_t< float > *bias_data = new nn::nn_workload_data_t< float >(biases, bias_coords, input_output_weights_layout);

    convolution_workload_item->arguments.forward_convolution.biases = new nn::nn_workload_data_t< float >( bias_coords, input_output_weights_layout );
    nn_workload_data_copy( bias_data, convolution_workload_item->arguments.forward_convolution.biases );
    delete bias_data; //release temporary buffers

    // Now create output workload_item giving convolution workload item as precedessor
    nn_gpu_workload_item *output_workload_item = nullptr;
    initialize_output_workload_item( output_workload_item, convolution_workload_item );

    // Make a workload using two above created workload_items
    nn_gpu_workload *gpu_workload = nullptr;
    create_workload_using_workload_items( di, gpu_workload, num_batches, NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH, NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH, input_workload_item, convolution_workload_item, output_workload_item );

    using io_data = std::unique_ptr<nn::data<float, 0>>;
    io_data execute_inputs[1];
    io_data execute_outputs[1];

    // specify dimensions of input, output and weights
    size_t execution_input_size[4] = {input_feature_map_width, input_feature_map_height, num_input_feature_maps, num_batches};
    size_t execution_output_size[4] = {output_width, output_height, num_output_feature_maps, num_batches};

    execute_inputs[0]  = io_data(new nn::data<float, 0>(input, execution_input_size, 4));
    execute_outputs[0] = io_data(new nn::data<float, 0>(gpu_outputs, execution_output_size, 4));

    // Execute the workload on the device and compare against the CPU reference.
    EXPECT_EQ( NN_API_STATUS_OK, di.workload_execute_function( ( nn_workload * )gpu_workload,
                                                               ( void ** )execute_inputs,
                                                               ( void ** )execute_outputs, nullptr ) );

    EXPECT_EQ( true, verify_output( execute_outputs[0], cpu_outputs ) );

    EXPECT_EQ( NN_API_STATUS_OK, di.workload_delete_function(( nn_workload * )gpu_workload));

    // Buffers were allocated with platform-specific aligned allocators,
    // so the matching platform-specific free must be used.
#ifdef __linux__
    free( cpu_outputs );
    cpu_outputs = nullptr;
    free( gpu_outputs );
    gpu_outputs = nullptr;
    free( filters );
    filters = nullptr;
    free( biases );
    biases = nullptr;
    free( input );
    input = nullptr;
#else
    _aligned_free( cpu_outputs );
    cpu_outputs = nullptr;
    _aligned_free( gpu_outputs );
    gpu_outputs = nullptr;
    _aligned_free( filters );
    filters = nullptr;
    _aligned_free( biases );
    biases = nullptr;
    _aligned_free( input );
    input = nullptr;
#endif //__linux__

    return true;
}