// Runs a 1-D convolution of in1 with in2 on the OpenCL device via ViennaCL,
// recomputes the same convolution on the host with convolve_ref, and returns
// the maximum elementwise difference between the two results (via diff_max).
// The row/col/batch_size parameters are accepted for interface compatibility
// but intentionally unused.
ScalarType opencl_convolve(std::vector<ScalarType>& in1, std::vector<ScalarType>& in2, unsigned int /*row*/, unsigned int /*col*/, unsigned int /*batch_size*/)
{
  // NOTE: an earlier size guard (in1.size() > 2048 -> -1) was disabled here.
  const std::size_t signal_len = in1.size();

  // Stage host data into device vectors.
  viennacl::vector<ScalarType> dev_signal(signal_len);
  viennacl::vector<ScalarType> dev_kernel(in2.size());
  viennacl::vector<ScalarType> dev_result(signal_len);
  viennacl::fast_copy(in1, dev_signal);
  viennacl::fast_copy(in2, dev_kernel);

  // Device convolution; finish() blocks until the OpenCL queue drains so the
  // read-back below sees completed results.
  viennacl::linalg::convolve(dev_signal, dev_kernel, dev_result);
  viennacl::backend::finish();

  // Read the device result back to the host.
  std::vector<ScalarType> device_out(signal_len);
  viennacl::fast_copy(dev_result, device_out);

  // Host reference implementation for comparison.
  std::vector<ScalarType> reference_out(signal_len);
  convolve_ref(in1, in2, reference_out);

  // Largest absolute deviation between device and reference outputs.
  return diff_max(device_out, reference_out);
}
// Builds a single-convolution GPU workload (input -> convolution -> output),
// executes it through the device interface, and compares the GPU result
// against a CPU reference computed by convolve_ref. Returns true; actual
// pass/fail is reported through the EXPECT_EQ checks (gtest-style macros).
//
// Parameters describe the convolution geometry: feature-map counts and
// dimensions, kernel size, strides, batch count, and the activation applied
// after the convolution.
bool run_convolve_test( const nn_device_interface_0_t &di, uint_least32_t num_output_feature_maps, uint_least32_t num_input_feature_maps, uint_least32_t input_feature_map_width, uint_least32_t input_feature_map_height, uint_least32_t kernel_width, uint_least32_t kernel_height, uint_least32_t kernel_stride_x, uint_least32_t kernel_stride_y, uint_least32_t num_batches, NN_ACTIVATION_FUNCTION activation_function )
{
    // Input generation. generate_input_data allocates and fills the buffer
    // through the raw-pointer out-param (ownership transfers to this function;
    // freed at the bottom with free/_aligned_free).
    float *input = nullptr;
    generate_input_data( input, input_feature_map_width, input_feature_map_height, num_input_feature_maps, num_batches );

    // Generate Filter Data (same allocate-through-out-param pattern).
    float *filters = nullptr;
    generate_filter_data( filters, kernel_width, kernel_height, num_input_feature_maps, num_output_feature_maps );

    // Valid (no-padding) convolution output geometry.
    uint_least32_t output_width  = ( ( input_feature_map_width - kernel_width ) / kernel_stride_x + 1 );
    uint_least32_t output_height = ( ( input_feature_map_height - kernel_height ) / kernel_stride_y + 1 );
    uint_least32_t output_depth  = num_output_feature_maps;

    // cpu_outputs and gpu_outputs are filled in with biases
    // so as such biases do not exist as separate entity
    float init_output_val = 0.0; //No biases in output then output is initialized with zeros
    // NOTE(review): init_output_val is never used below — the init_data calls
    // pass literal 0.0f instead. Looks like leftover from an older scheme.
    float *biases = nullptr;
    float *cpu_outputs = nullptr;
    float *gpu_outputs = nullptr;
    // Biases exists as separate entity (each neuron got it own bias value)
    init_data( biases, output_width * output_height * output_depth, 1.0f );
    init_data( gpu_outputs, output_width * output_height * output_depth * num_batches, 0.0f );
    init_data( cpu_outputs, output_width * output_height * output_depth * num_batches, 0.0f );

    // Activation function: map the enum onto the host-side function pointer
    // used by the CPU reference path.
    fp_func_activ activ_func = nullptr;
    switch( activation_function )
    {
    case NN_ACTIVATION_FUNCTION_NONE:
        activ_func = none;
        break;
    case NN_ACTIVATION_FUNCTION_TANH:
        activ_func = mytanh;
        break;
    case NN_ACTIVATION_FUNCTION_RELU:
        activ_func = relu;
        break;
    case NN_ACTIVATION_FUNCTION_SOFTPLUS:
        activ_func = softplus;
        break;
    default:
        printf( "Error: Not supported activation function chosen: %d\n", activation_function );
        assert( 0 );
        break;
    }

    // Full-extent views over input and output (coords appear to be ordered
    // n, x, y, z, p, q — inferred from the constructor arguments; confirm
    // against nn_workload_data_coords_t's declaration).
    nn_workload_data_coords_t conv_input_view_begin( 0, 0, 0, 0, 0, 0 );
    nn_workload_data_coords_t conv_input_view_end( num_batches - 1, input_feature_map_width - 1, input_feature_map_height - 1, num_input_feature_maps - 1, 0, 0 );
    nn_workload_data_coords_t conv_output_view_begin( 0, 0, 0, 0, 0, 0 );
    nn_workload_data_coords_t conv_output_view_end( num_batches - 1, output_width - 1, output_height - 1, output_depth - 1, 0, 0 );

    // Run reference convolving (needed for comparison)
    convolve_ref( activ_func,
                  cpu_outputs,
                  input,
                  filters,
                  biases,
                  conv_output_view_begin,
                  conv_output_view_end,
                  conv_input_view_begin,
                  conv_input_view_end,
                  output_width,
                  output_height,
                  output_depth,
                  input_feature_map_width,
                  input_feature_map_height,
                  num_input_feature_maps,
                  kernel_width,
                  kernel_height,
                  num_input_feature_maps,
                  kernel_stride_x,
                  kernel_stride_y,
                  0, // center offset x
                  0, // center offset y
                  num_batches );

    // First workload item is input one (entity producing input data)
    nn_gpu_workload_item *input_workload_item = nullptr;
    initialize_input_workload_item( input_workload_item);

    // Specify layout shared by input, output and weight buffers.
    nn_workload_data_layout_t input_output_weights_layout = {
        { 0, 0, 0, 0, 0, 0 }, // tile in log2(size)
        { 0, 0, 0, 0, 0, 0 }, // alignment
        { NN_DATA_COORD_x, NN_DATA_COORD_y, NN_DATA_COORD_z, NN_DATA_COORD_p, NN_DATA_COORD_n, NN_DATA_COORD_q }, // ordering
        NN_DATATYPE_FLOAT
    };

    // specify dimensions of input, output and weights
    nn_workload_data_coords_t input_coords = { num_batches, input_feature_map_width, input_feature_map_height, num_input_feature_maps, 1, 1 };
    nn_workload_data_coords_t output_coords = { num_batches, output_width, output_height, num_output_feature_maps, 1, 1 };
    nn_workload_data_coords_t weight_coords = { 1, kernel_width, kernel_height, num_input_feature_maps, num_output_feature_maps, 1 };

    // Now create convolution workload_item giving as input input_workload_item
    nn_gpu_workload_item *convolution_workload_item = nullptr;
    initialize_layer_workload_item( convolution_workload_item, input_workload_item, input_output_weights_layout, output_coords);
    convolution_workload_item->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
    convolution_workload_item->arguments.forward_convolution.padding = NN_PADDING_MODE_NONE;
    convolution_workload_item->arguments.forward_convolution.stride[0] = kernel_stride_x;
    convolution_workload_item->arguments.forward_convolution.stride[1] = kernel_stride_y;
    convolution_workload_item->arguments.forward_convolution.center_offset[0] = 0;
    convolution_workload_item->arguments.forward_convolution.center_offset[1] = 0;
    convolution_workload_item->arguments.forward_convolution.activation.function = activation_function;

    // Weights: wrap the raw filter buffer in a temporary workload_data view,
    // copy it into a freshly allocated buffer owned by the workload item,
    // then drop the temporary view. The copy-then-delete dance presumably
    // gives the item its own storage independent of `filters`.
    nn::nn_workload_data_t< float > *weight_data = new nn::nn_workload_data_t< float >( filters, weight_coords, input_output_weights_layout );
    convolution_workload_item->arguments.forward_convolution.weights = new nn::nn_workload_data_t< float >( weight_coords, input_output_weights_layout );
    nn_workload_data_copy( weight_data, convolution_workload_item->arguments.forward_convolution.weights );
    delete weight_data; //release temporary buffers

    // Biases: same wrap/copy/delete pattern as the weights above.
    nn_workload_data_coords_t bias_coords = { 1, 1, 1, 1, num_output_feature_maps, 1 };
    nn::nn_workload_data_t< float > *bias_data = new nn::nn_workload_data_t< float >(biases, bias_coords, input_output_weights_layout);
    convolution_workload_item->arguments.forward_convolution.biases = new nn::nn_workload_data_t< float >( bias_coords, input_output_weights_layout );
    nn_workload_data_copy( bias_data, convolution_workload_item->arguments.forward_convolution.biases );
    delete bias_data; //release temporary buffers

    // Now create output workload_item giving softmax workload item as precedessor
    // NOTE(review): the predecessor here is the convolution item, not softmax —
    // the comment above looks copy-pasted from a softmax test.
    nn_gpu_workload_item *output_workload_item = nullptr;
    initialize_output_workload_item( output_workload_item, convolution_workload_item );

    // Make a workload using two above created workload_items
    nn_gpu_workload *gpu_workload = nullptr;
    create_workload_using_workload_items( di, gpu_workload, num_batches, NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH, NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH, input_workload_item, convolution_workload_item, output_workload_item );

    using io_data = std::unique_ptr<nn::data<float, 0>>;
    io_data execute_inputs[1];
    io_data execute_outputs[1];

    // specify dimensions of input, output and weights
    size_t execution_input_size[4] = {input_feature_map_width, input_feature_map_height, num_input_feature_maps, num_batches};
    size_t execution_output_size[4] = {output_width, output_height, num_output_feature_maps, num_batches};
    execute_inputs[0] = io_data(new nn::data<float, 0>(input, execution_input_size, 4));
    execute_outputs[0] = io_data(new nn::data<float, 0>(gpu_outputs, execution_output_size, 4));

    // Execute on device and verify against the CPU reference.
    // NOTE(review): casting an array of unique_ptr to (void **) relies on
    // unique_ptr having pointer layout — it works with common ABIs but is
    // formally unspecified; the device API apparently expects raw pointers.
    EXPECT_EQ( NN_API_STATUS_OK, di.workload_execute_function( ( nn_workload * )gpu_workload, ( void ** )execute_inputs, ( void ** )execute_outputs, nullptr ) );
    EXPECT_EQ( true, verify_output( execute_outputs[0], cpu_outputs ) );
    EXPECT_EQ( NN_API_STATUS_OK, di.workload_delete_function(( nn_workload * )gpu_workload));

    // Release host buffers with the allocator matching the platform used by
    // the generate_/init_ helpers (aligned allocation on Windows).
#ifdef __linux__
    free( cpu_outputs );
    cpu_outputs = nullptr;
    free( gpu_outputs );
    gpu_outputs = nullptr;
    free( filters );
    filters = nullptr;
    free( biases );
    biases = nullptr;
    free( input );
    input = nullptr;
#else
    _aligned_free( cpu_outputs );
    cpu_outputs = nullptr;
    _aligned_free( gpu_outputs );
    gpu_outputs = nullptr;
    _aligned_free( filters );
    filters = nullptr;
    _aligned_free( biases );
    biases = nullptr;
    _aligned_free( input );
    input = nullptr;
#endif //__linux__

    return true;
}