bool test_google_float_workload_cpu_images_classification::run() { bool run_ok = true; test_measurement_result run_result; run_result.description = "RUN SUMMARY: " + test_description; C_time_control run_timer; std::cout << "-> Testing: " << test_description << std::endl; try { if(!init()) throw std::runtime_error("init() returns false so can't run test"); run_timer.tick(); //start time measurement run_result << std::string("run test with " + current_tested_device->get_device_description()); for(uint32_t batch :{1,8,48}) { C_time_control loop_timer; // compiling workload nn_workload_t *workload = nullptr; NN_WORKLOAD_DATA_TYPE input_format = NN_WORKLOAD_DATA_TYPE_F32_ZXY_BATCH; NN_WORKLOAD_DATA_TYPE output_format = NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH; auto status = di->workflow_compile_function(&workload, di->device, workflow, &input_format, &output_format, batch); if(!workload) throw std::runtime_error("workload compilation failed for batch = " + std::to_string(batch) + " status: " + std::to_string(status)); test_measurement_result local_result; local_result.description = "RUN PART: (batch " + std::to_string(batch)+") execution of " + test_description; bool local_ok = true; auto images_list_iterator = images_list.begin(); auto images_list_end = images_list.end(); while(images_list_iterator != images_list_end) { auto diff_itr = images_list_end - images_list_iterator < batch ? 
images_list_end - images_list_iterator : batch; std::vector< std::string > batch_images(images_list_iterator,images_list_iterator + diff_itr); images_list_iterator += diff_itr; nn::data< float,4 > *images = nullptr; images = nn_data_load_from_image_list(&batch_images, img_size, image_process, batch, RGB_order); if(images) { nn_data_t *input_array[1] ={images}; nn::data<float, 2> *workload_output = new nn::data<float, 2>(1000, batch); if(workload_output == nullptr) throw std::runtime_error("unable to create workload_output for batch = " +std::to_string(batch)); nn::data<float> *output_array_cmpl[1] ={nn::data_cast<float,0>(workload_output)}; di->workload_execute_function(workload,reinterpret_cast<void**>(input_array),reinterpret_cast<void**>(output_array_cmpl),&status); float *value_cmpl = reinterpret_cast<float *>(workload_output->buffer); for(auto &image_filename : batch_images) { std::ifstream reference_output_file(image_filename + ".txt", std::ifstream::in); // Comparison with the reference output workload float difference = 0; for(int index = 0; index < 1000; ++index) { std::string reference_value_str; std::getline(reference_output_file,reference_value_str); float reference_value = std::stof(reference_value_str); float delta = value_cmpl[index]-reference_value; difference += abs(delta); } if(difference < threshold_to_pass_test) local_result << std::string("note: " + image_filename + " difference = " + std::to_string(difference)); else { local_result << std::string("error: image file: " + image_filename +" the difference exceeded the allowable threshold for compliance: " + std::to_string(difference) + " > " + std::to_string(threshold_to_pass_test)); local_ok = false; run_ok = false; } reference_output_file.close(); value_cmpl += 1000; } batch_images.clear(); if(images != nullptr) delete images; if(workload_output != nullptr) delete workload_output; } } // batch loop summary: local_result.passed = local_ok; loop_timer.tock(); local_result.time_consumed = 
loop_timer.get_time_diff(); local_result.clocks_consumed = loop_timer.get_clocks_diff(); tests_results << local_result; if(workload != nullptr) di->workload_delete_function(workload); } // END: for(uint32_t batch :{1,8,48}) } catch(std::runtime_error &error) { run_result << "error: " + std::string(error.what()); run_ok = false; } catch(...) { run_result << "error: unknown"; run_ok = false; } run_timer.tock(); run_result.time_consumed = run_timer.get_time_diff(); run_result.clocks_consumed = run_timer.get_clocks_diff(); run_result.passed = run_ok; tests_results << run_result; if (!done()) run_ok=false; std::cout << "<- Test " << (run_ok ? "passed" : "failed") << std::endl;; return run_ok; }
bool run_softmax_test( const nn_device_interface_0_t &di, uint_least32_t num_samples, uint_least32_t num_batches) // length of input to be processed (softmax normalize) { // Input generation (input feature maps to have pooling run on it) float *input = nullptr; generate_input_data( input, num_samples, 1, 1, num_batches ); // length of output is the same as input float *cpu_outputs; init_data( cpu_outputs, num_samples * num_batches, 0.0f ); float *gpu_outputs; init_data( gpu_outputs, num_samples * num_batches, 0.0f ); softmax_ref( cpu_outputs, input, num_samples, num_batches ); // First workload item is input one (entity producing input data) nn_gpu_workload_item *input_workload_item = nullptr; initialize_input_workload_item( input_workload_item); // Specify layout of softmax workload nn_workload_data_layout_t workload_layout = { { 0, 0, 0, 0, 0, 0 }, // tile in log2(size) { 0, 0, 0, 0, 0, 0 }, // alignment { NN_DATA_COORD_x, NN_DATA_COORD_y, NN_DATA_COORD_z, NN_DATA_COORD_p, NN_DATA_COORD_n, NN_DATA_COORD_q }, NN_DATATYPE_FLOAT }; // specify dimensions of input, output nn_workload_data_coords_t workload_coords = { num_batches, num_samples, 1, 1, 1, 1 }; size_t output_coords[2] = {num_samples, num_batches}; // Now create softmax workload_item giving as input input_workload_item nn_gpu_workload_item *softmax_workload_item = nullptr; initialize_layer_workload_item( softmax_workload_item, input_workload_item, workload_layout, workload_coords ); softmax_workload_item->type = NN_WORK_ITEM_TYPE_SOFTMAX; // Now create output workload_item giving softmax workload item as precedessor nn_gpu_workload_item *output_workload_item = nullptr; initialize_output_workload_item( output_workload_item, softmax_workload_item ); // Make a workload using two above created workload_items nn_gpu_workload *gpu_workload = nullptr; create_workload_using_workload_items( di, gpu_workload, num_batches, NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH, NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH, input_workload_item, 
softmax_workload_item, output_workload_item ); using io_data = std::unique_ptr<nn::data<float, 0>>; io_data execute_inputs[1]; io_data execute_outputs[1]; execute_inputs[0] = io_data(new nn::data<float, 0>(input, output_coords, 2)); execute_outputs[0] = io_data(new nn::data<float, 0>(gpu_outputs, output_coords, 2)); EXPECT_EQ( NN_API_STATUS_OK, di.workload_execute_function( ( nn_workload * )gpu_workload, ( void ** )execute_inputs, ( void ** )execute_outputs, nullptr ) ); nn_workload_data_coords_t output_view_begin(0, 0, 0, 0, 0, 0); nn_workload_data_coords_t output_view_end(num_batches - 1, num_samples - 1, 0, 0, 0, 0); // Compare CPU(reference) output with the one returned by GPU EXPECT_EQ( true, verify_output( execute_outputs[0], cpu_outputs ) ); EXPECT_EQ( NN_API_STATUS_OK, di.workload_delete_function(( nn_workload * )gpu_workload)); #ifdef __linux__ free( cpu_outputs ); cpu_outputs = nullptr; free( gpu_outputs ); gpu_outputs = nullptr; free( input ); input = nullptr; #else _aligned_free( cpu_outputs ); cpu_outputs = nullptr; _aligned_free( gpu_outputs ); gpu_outputs = nullptr; _aligned_free( input ); input = nullptr; #endif //__linux__ return true; }
// Test harness for the GPU convolution primitive: generates random input and
// filter data, computes a reference result with convolve_ref() on the CPU,
// builds an equivalent input -> convolution -> output GPU workload, executes
// it, and compares both outputs via verify_output(). Failures surface through
// the EXPECT_EQ macros; the function itself always returns true.
bool run_convolve_test( const nn_device_interface_0_t &di,
                        uint_least32_t num_output_feature_maps,
                        uint_least32_t num_input_feature_maps,
                        uint_least32_t input_feature_map_width,
                        uint_least32_t input_feature_map_height,
                        uint_least32_t kernel_width,
                        uint_least32_t kernel_height,
                        uint_least32_t kernel_stride_x,
                        uint_least32_t kernel_stride_y,
                        uint_least32_t num_batches,
                        NN_ACTIVATION_FUNCTION activation_function )
{
    // Input generation
    float *input = nullptr;
    generate_input_data( input, input_feature_map_width, input_feature_map_height, num_input_feature_maps, num_batches );

    // Generate Filter Data
    float *filters = nullptr;
    generate_filter_data( filters, kernel_width, kernel_height, num_input_feature_maps, num_output_feature_maps );

    // Output dimensions of a convolution with no padding (see NN_PADDING_MODE_NONE below).
    uint_least32_t output_width = ( ( input_feature_map_width - kernel_width ) / kernel_stride_x + 1 );
    uint_least32_t output_height = ( ( input_feature_map_height - kernel_height ) / kernel_stride_y + 1 );
    uint_least32_t output_depth = num_output_feature_maps;

    // cpu_outputs and gpu_outputs are filled in with biases
    // so as such biases do not exist as separate entity
    float init_output_val = 0.0; //No biases in output then output is initialized with zeros
    // NOTE(review): init_output_val is never read below — confirm before removing.

    float *biases = nullptr;
    float *cpu_outputs = nullptr;
    float *gpu_outputs = nullptr;

    // Biases exists as separate entity (each neuron got it own bias value)
    // NOTE(review): the bias buffer is sized output_width*output_height*output_depth
    // here, while bias_coords below maps only num_output_feature_maps values —
    // verify the over-allocation is intentional.
    init_data( biases, output_width * output_height * output_depth, 1.0f );
    init_data( gpu_outputs, output_width * output_height * output_depth * num_batches, 0.0f );
    init_data( cpu_outputs, output_width * output_height * output_depth * num_batches, 0.0f );

    // Map the requested activation to the reference (CPU) activation callback.
    fp_func_activ activ_func = nullptr;
    switch( activation_function )
    {
    case NN_ACTIVATION_FUNCTION_NONE: activ_func = none; break;
    case NN_ACTIVATION_FUNCTION_TANH: activ_func = mytanh; break;
    case NN_ACTIVATION_FUNCTION_RELU: activ_func = relu; break;
    case NN_ACTIVATION_FUNCTION_SOFTPLUS: activ_func = softplus; break;
    default:
        printf( "Error: Not supported activation function chosen: %d\n", activation_function );
        assert( 0 ); // unsupported activation is a programmer error in the test setup
        break;
    }

    // Full-range views for the reference convolution (begin/end are inclusive coordinates).
    nn_workload_data_coords_t conv_input_view_begin( 0, 0, 0, 0, 0, 0 );
    nn_workload_data_coords_t conv_input_view_end( num_batches - 1, input_feature_map_width - 1, input_feature_map_height - 1, num_input_feature_maps - 1, 0, 0 );
    nn_workload_data_coords_t conv_output_view_begin( 0, 0, 0, 0, 0, 0 );
    nn_workload_data_coords_t conv_output_view_end( num_batches - 1, output_width - 1, output_height - 1, output_depth - 1, 0, 0 );

    // Run reference convolving (needed for comparison)
    convolve_ref( activ_func,
                  cpu_outputs,
                  input,
                  filters,
                  biases,
                  conv_output_view_begin,
                  conv_output_view_end,
                  conv_input_view_begin,
                  conv_input_view_end,
                  output_width,
                  output_height,
                  output_depth,
                  input_feature_map_width,
                  input_feature_map_height,
                  num_input_feature_maps,
                  kernel_width,
                  kernel_height,
                  num_input_feature_maps,
                  kernel_stride_x,
                  kernel_stride_y,
                  0, // center offset x
                  0, // center offset y
                  num_batches );

    // First workload item is input one (entity producing input data)
    nn_gpu_workload_item *input_workload_item = nullptr;
    initialize_input_workload_item( input_workload_item);

    // Specify layout shared by input, output and weights
    nn_workload_data_layout_t input_output_weights_layout = {
        { 0, 0, 0, 0, 0, 0 }, // tile in log2(size)
        { 0, 0, 0, 0, 0, 0 }, // alignment
        { NN_DATA_COORD_x, NN_DATA_COORD_y, NN_DATA_COORD_z, NN_DATA_COORD_p, NN_DATA_COORD_n, NN_DATA_COORD_q }, // ordering
        NN_DATATYPE_FLOAT
    };

    // specify dimensions of input, output and weights
    nn_workload_data_coords_t input_coords = { num_batches, input_feature_map_width, input_feature_map_height, num_input_feature_maps, 1, 1 };
    // NOTE(review): input_coords is not referenced below — confirm before removing.
    nn_workload_data_coords_t output_coords = { num_batches, output_width, output_height, num_output_feature_maps, 1, 1 };
    nn_workload_data_coords_t weight_coords = { 1, kernel_width, kernel_height, num_input_feature_maps, num_output_feature_maps, 1 };

    // Now create convolution workload_item giving as input input_workload_item
    nn_gpu_workload_item *convolution_workload_item = nullptr;
    initialize_layer_workload_item( convolution_workload_item, input_workload_item, input_output_weights_layout, output_coords);
    convolution_workload_item->type = NN_WORK_ITEM_TYPE_CONVOLUTION;
    convolution_workload_item->arguments.forward_convolution.padding = NN_PADDING_MODE_NONE;
    convolution_workload_item->arguments.forward_convolution.stride[0] = kernel_stride_x;
    convolution_workload_item->arguments.forward_convolution.stride[1] = kernel_stride_y;
    convolution_workload_item->arguments.forward_convolution.center_offset[0] = 0;
    convolution_workload_item->arguments.forward_convolution.center_offset[1] = 0;
    convolution_workload_item->arguments.forward_convolution.activation.function = activation_function;

    // Copy the generated filter data into the workload item's weight buffer via
    // a temporary wrapper around the raw 'filters' buffer.
    nn::nn_workload_data_t< float > *weight_data = new nn::nn_workload_data_t< float >( filters, weight_coords, input_output_weights_layout );
    convolution_workload_item->arguments.forward_convolution.weights = new nn::nn_workload_data_t< float >( weight_coords, input_output_weights_layout );
    nn_workload_data_copy( weight_data, convolution_workload_item->arguments.forward_convolution.weights );
    delete weight_data; //release temporary buffers

    // Same pattern for biases: one bias value per output feature map.
    nn_workload_data_coords_t bias_coords = { 1, 1, 1, 1, num_output_feature_maps, 1 };
    nn::nn_workload_data_t< float > *bias_data = new nn::nn_workload_data_t< float >(biases, bias_coords, input_output_weights_layout);
    convolution_workload_item->arguments.forward_convolution.biases = new nn::nn_workload_data_t< float >( bias_coords, input_output_weights_layout );
    nn_workload_data_copy( bias_data, convolution_workload_item->arguments.forward_convolution.biases );
    delete bias_data; //release temporary buffers

    // Now create output workload_item giving convolution workload item as predecessor
    nn_gpu_workload_item *output_workload_item = nullptr;
    initialize_output_workload_item( output_workload_item, convolution_workload_item );

    // Make a workload using the workload_items created above
    nn_gpu_workload *gpu_workload = nullptr;
    create_workload_using_workload_items( di, gpu_workload, num_batches, NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH, NN_WORKLOAD_DATA_TYPE_F32_3D_BATCH, input_workload_item, convolution_workload_item, output_workload_item );

    using io_data = std::unique_ptr<nn::data<float, 0>>;
    io_data execute_inputs[1];
    io_data execute_outputs[1];

    // specify dimensions of input, output and weights
    size_t execution_input_size[4] = {input_feature_map_width, input_feature_map_height, num_input_feature_maps, num_batches};
    size_t execution_output_size[4] = {output_width, output_height, num_output_feature_maps, num_batches};

    execute_inputs[0] = io_data(new nn::data<float, 0>(input, execution_input_size, 4));
    execute_outputs[0] = io_data(new nn::data<float, 0>(gpu_outputs, execution_output_size, 4));

    EXPECT_EQ( NN_API_STATUS_OK, di.workload_execute_function( ( nn_workload * )gpu_workload, ( void ** )execute_inputs, ( void ** )execute_outputs, nullptr ) );

    // Compare CPU(reference) output with the one returned by GPU
    EXPECT_EQ( true, verify_output( execute_outputs[0], cpu_outputs ) );

    EXPECT_EQ( NN_API_STATUS_OK, di.workload_delete_function(( nn_workload * )gpu_workload));

    // Buffers came from generate_*/init_data with the platform allocator,
    // so release them with the matching free function.
#ifdef __linux__
    free( cpu_outputs ); cpu_outputs = nullptr;
    free( gpu_outputs ); gpu_outputs = nullptr;
    free( filters ); filters = nullptr;
    free( biases ); biases = nullptr;
    free( input ); input = nullptr;
#else
    _aligned_free( cpu_outputs ); cpu_outputs = nullptr;
    _aligned_free( gpu_outputs ); gpu_outputs = nullptr;
    _aligned_free( filters ); filters = nullptr;
    _aligned_free( biases ); biases = nullptr;
    _aligned_free( input ); input = nullptr;
#endif //__linux__
    return true;
}
bool test_caffe_float_workload_cpu_time::run() { bool run_ok = true; test_measurement_result run_result; run_result.description = "RUN SUMMARY: " + test_description; C_time_control run_timer; std::cout << "-> Testing: " << test_description << std::endl; try { if(!init()) throw std::runtime_error("error: init() returns false so can't run test"); run_timer.tick(); //start time measurement run_result << std::string("run test with " + current_tested_device->get_device_description()); // --------------------------------------------------------------------------------------------------------- // TODO: here test code //{ // BKM pattern of test with time measuring: // bool local_ok=true; // test_measurement_result local_result; // local_result.description = "RUN PART: (name part) of " + test_description; // C_time_control local_timer; // // begin local test // // end of local test // // summary: // local_timer.tock(); // local_result.time_consumed = local_timer.time_diff_string(); // local_result.clocks_consumed = local_timer.get_clocks_diff(); // tests_results << local_result; //} // The pattern, of complex instruction above, can be multiplied for(uint16_t batch :{1,8,48}) { std::vector<uint64_t> time_diffs; std::vector<uint64_t> clock_diffs; nn::data<float,4> *images = new nn::data<float,4>(img_size,img_size,3,batch); nn_data_populate(nn::data_cast<float,0>(images),0.0f,255.0f); nn_data_t *input_array[1] ={images}; auto workload_output = new nn::data<float, 2>(1000, batch); nn::data<float> *output_array_cmpl[1] ={ nn::data_cast<float, 0>(workload_output) }; nn_workload_t *workload = nullptr; // compiling workload NN_WORKLOAD_DATA_TYPE input_format = NN_WORKLOAD_DATA_TYPE_F32_ZXY_BATCH; NN_WORKLOAD_DATA_TYPE output_format = NN_WORKLOAD_DATA_TYPE_F32_1D_BATCH; auto status = di->workflow_compile_function(&workload,di->device,workflow,&input_format,&output_format,batch); if(!workload) throw std::runtime_error("workload compilation failed for batch = " + std::to_string(batch) 
+ " status: " + std::to_string(status)); test_measurement_result local_result; local_result.description = "RUN PART: (batch " + std::to_string(batch)+") execution of " + test_description; local_result.loops = loops; // begin local test for(auto i = 0; i< loops; ++i) { NN_API_STATUS status; C_time_control loop_timer; di->workload_execute_function(workload,reinterpret_cast<void**>(input_array),reinterpret_cast<void**>(output_array_cmpl),&status); loop_timer.tock(); time_diffs.push_back(loop_timer.get_time_diff()/batch); clock_diffs.push_back(loop_timer.get_clocks_diff()/batch); } // end of local test // summary: uint64_t min_value = *std::min_element(time_diffs.begin(),time_diffs.end()); local_result.time_consumed = std::accumulate(time_diffs.begin(),time_diffs.end(),0.0)/time_diffs.size(); local_result.time_consumed_min = min_value; local_result.time_consumed_max = *std::max_element(time_diffs.begin(),time_diffs.end()); local_result << std::string("note: The shortest time for one image obtained from the chrono: " + C_time_control::time_diff_string(min_value)); local_result << std::string("note: Values of time's and clock's were divided by current value of batch: "+std::to_string(batch)); local_result.clocks_consumed = std::accumulate(clock_diffs.begin(),clock_diffs.end(),0.0)/clock_diffs.size(); local_result.clocks_consumed_min = *std::min_element(clock_diffs.begin(),clock_diffs.end()); local_result.clocks_consumed_max = *std::max_element(clock_diffs.begin(),clock_diffs.end()); tests_results << local_result; if(images != nullptr) delete images; if(workload_output != nullptr) delete workload_output; if(workload != nullptr) di->workload_delete_function(workload); } // --------------------------------------------------------------------------------------------------------- run_ok = true; } catch(std::runtime_error &error) { run_result << "error: " + std::string(error.what()); run_ok = false; } catch(...) 
{ run_result << "error: unknown"; run_ok = false; } run_timer.tock(); run_result.time_consumed = run_timer.get_time_diff(); run_result.clocks_consumed = run_timer.get_clocks_diff(); run_result.passed = run_ok; tests_results << run_result; if (!done()) run_ok=false; std::cout << "<- Test " << (run_ok ? "passed" : "failed") << std::endl;; return run_ok; }