void average_subsampling_layer_tester_plain::run_forward_propagation( plain_buffer::ptr output_buffer, const std::vector<plain_buffer::const_ptr>& input_buffers, plain_buffer::ptr temporary_working_fixed_buffer, plain_buffer::ptr temporary_working_per_entry_buffer, plain_running_configuration::const_ptr plain_config, layer::const_ptr layer_schema, layer_data::const_ptr data, layer_data_custom::const_ptr data_custom, const std::vector<layer_configuration_specific>& input_configuration_specific_list, const layer_configuration_specific& output_configuration_specific, unsigned int entry_count) const { const float * const in_it_global = *input_buffers[0]; float * const out_it_global = *output_buffer; const unsigned int input_neuron_count = input_configuration_specific_list[0].get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific_list[0].get_neuron_count_per_feature_map(); const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); nnforge_shared_ptr<const average_subsampling_layer> layer_derived = nnforge_dynamic_pointer_cast<const average_subsampling_layer>(layer_schema); const std::vector<unsigned int>& subsampling_sizes = layer_derived->subsampling_sizes; const unsigned int dimension_count = static_cast<unsigned int>(layer_derived->subsampling_sizes.size()); std::vector<unsigned int> input_slices(input_configuration_specific_list[0].dimension_sizes.size()); input_slices[0] = 1; for(unsigned int i = 0; i < dimension_count - 1; ++i) input_slices[i + 1] = input_slices[i] * input_configuration_specific_list[0].dimension_sizes[i]; unsigned int subsampling_elem_count = 1; for(unsigned int i = 0; i < dimension_count; ++i) subsampling_elem_count *= subsampling_sizes[i]; const unsigned int const_subsampling_elem_count = subsampling_elem_count; const float mult = 1.0F / static_cast<float>(subsampling_elem_count); const unsigned int feature_map_count = output_configuration_specific.feature_map_count; std::vector<unsigned int> current_local_input_position(dimension_count, 0); std::vector<unsigned int> offset_list(subsampling_elem_count); for(unsigned int i = 1; i < subsampling_elem_count; ++i) { int offset = 0; for(unsigned int j = 0; j < dimension_count; ++j) { offset += static_cast<int>(input_slices[j]); if ((++current_local_input_position[j]) < subsampling_sizes[j]) { offset_list[i] = offset_list[i-1] + offset; break; } current_local_input_position[j] = 0; offset -= static_cast<int>(subsampling_sizes[j] * input_slices[j]); } } const int total_workload = entry_count * output_configuration_specific.feature_map_count; const std::vector<unsigned int>::const_iterator dimension_sizes_it = output_configuration_specific.dimension_sizes.begin(); const std::vector<unsigned int>::const_iterator subsampling_sizes_it = subsampling_sizes.begin(); const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin(); const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin(); #pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) { nnforge_array<unsigned int, max_dimension_count> current_output_position; #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int entry_id = workload_id / feature_map_count; int feature_map_id = workload_id - (entry_id * feature_map_count); const float * in_it_base = in_it_global + (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map); float * out_it_base = out_it_global + (entry_id * output_neuron_count) + (feature_map_id * output_neuron_count_per_feature_map); std::fill_n(current_output_position.begin(), dimension_count, 0); for(float * out_it = out_it_base; out_it != out_it_base + output_neuron_count_per_feature_map; ++out_it) { // Define the starting position of the first input elem int in_it_offset = 0; for(unsigned int i = 0; i < dimension_count; ++i) in_it_offset += current_output_position[i] * (*(subsampling_sizes_it + i)) * (*(input_slices_it + i)); float sum = 0.0F; for(unsigned int i = 0; i < const_subsampling_elem_count; ++i) { sum += *(in_it_base + (in_it_offset + (*(offset_list_it + i)))); } *out_it = sum * mult; // Go to the next output element for(unsigned int i = 0; i < dimension_count; ++i) { if ((++current_output_position[i]) < *( dimension_sizes_it + i)) break; current_output_position[i] = 0; } } } } }
void sparse_convolution_layer_updater_plain::test( const_additional_buffer_smart_ptr input_buffer, additional_buffer_smart_ptr output_buffer, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const_layer_data_smart_ptr data, const_layer_data_custom_smart_ptr data_custom, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int updater_count, unsigned int offset_input_entry_id) const { const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map(); const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); const std::vector<float>::const_iterator in_it_global = input_buffer->begin() + input_neuron_count * offset_input_entry_id; const std::vector<float>::iterator out_it_global = output_buffer->begin(); nnforge_shared_ptr<const sparse_convolution_layer> layer_derived = nnforge_dynamic_pointer_cast<const sparse_convolution_layer>(layer_schema); const std::vector<unsigned int>& window_sizes = layer_derived->window_sizes; const unsigned int dimension_count = static_cast<unsigned int>(window_sizes.size()); std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size()); input_slices[0] = 1; for(unsigned int i = 0; i < dimension_count - 1; ++i) input_slices[i + 1] = input_slices[i] * input_configuration_specific.dimension_sizes[i]; unsigned int window_elem_count = 1; for(unsigned int i = 0; i < dimension_count; ++i) window_elem_count *= window_sizes[i]; const unsigned int const_window_elem_count = window_elem_count; const std::vector<float>::const_iterator weights = (*data)[0].begin(); const std::vector<float>::const_iterator biases = (*data)[1].begin(); const std::vector<int>::const_iterator column_indices = (*data_custom)[0].begin(); const std::vector<int>::const_iterator row_indices = (*data_custom)[1].begin(); std::vector<unsigned int> current_local_input_position(dimension_count, 0); std::vector<unsigned int> offset_list(window_elem_count); for(unsigned int i = 1; i < window_elem_count; ++i) { int offset = 0; for(unsigned int j = 0; j < dimension_count; ++j) { offset += static_cast<int>(input_slices[j]); if ((++current_local_input_position[j]) < window_sizes[j]) { offset_list[i] = offset_list[i-1] + offset; break; } current_local_input_position[j] = 0; offset -= static_cast<int>(window_sizes[j] * input_slices[j]); } } const unsigned int output_feature_map_count = output_configuration_specific.feature_map_count; const unsigned int input_feature_map_count = input_configuration_specific.feature_map_count; const int total_workload = updater_count * output_feature_map_count; const std::vector<unsigned int>::const_iterator output_dimension_sizes_it = output_configuration_specific.dimension_sizes.begin(); const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin(); const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin(); #pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) { nnforge_array<unsigned int, max_dimension_count> current_output_position; #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int entry_id = workload_id / output_feature_map_count; int output_feature_map_id = workload_id - (entry_id * output_feature_map_count); std::vector<float>::iterator out_it_base = out_it_global + (entry_id * output_neuron_count) + (output_feature_map_id * output_neuron_count_per_feature_map); std::vector<float>::const_iterator in_it_base = in_it_global + entry_id * input_neuron_count; const int start_column_index = row_indices[output_feature_map_id]; const int end_column_index = row_indices[output_feature_map_id + 1]; std::fill_n(current_output_position.begin(), dimension_count, 0); for(std::vector<float>::iterator out_it = out_it_base; out_it != out_it_base + output_neuron_count_per_feature_map; ++out_it) { float sum = *(biases + output_feature_map_id); std::vector<float>::const_iterator weights_it = weights + start_column_index * const_window_elem_count; std::vector<float>::const_iterator in_it_base2 = in_it_base; for(unsigned int i = 0; i < dimension_count; ++i) in_it_base2 += current_output_position[i] * (*(input_slices_it + i)); for(int column_index = start_column_index; column_index < end_column_index; ++column_index) { int input_feature_map_id = column_indices[column_index]; // Define the starting position of the first input elem std::vector<float>::const_iterator in_it = in_it_base2 + (input_feature_map_id * input_neuron_count_per_feature_map); for(unsigned int i = 0; i < const_window_elem_count; ++i) { sum += (*(in_it + *(offset_list_it + i))) * (*weights_it); ++weights_it; } } *out_it = sum; // Go to the next output element for(unsigned int i = 0; i < dimension_count; ++i) { if ((++current_output_position[i]) < *(output_dimension_sizes_it + i)) break; current_output_position[i] = 0; } } } } }
void max_subsampling_layer_tester_plain::test_tiling( plain_buffer::ptr output_buffer, plain_buffer::const_ptr input_buffer, plain_running_configuration::const_ptr plain_config, layer::const_ptr layer_schema, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int entry_count) const { nnforge_shared_ptr<const max_subsampling_layer> layer_derived = nnforge_dynamic_pointer_cast<const max_subsampling_layer>(layer_schema); const float * const in_it_global = *input_buffer; float * const out_it_global = *output_buffer; const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map(); const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); const std::vector<unsigned int>& subsampling_sizes = layer_derived->subsampling_sizes; const unsigned int dimension_count = static_cast<unsigned int>(layer_derived->subsampling_sizes.size()); std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size()); input_slices[0] = 1; for(unsigned int i = 0; i < dimension_count - 1; ++i) input_slices[i + 1] = input_slices[i] * input_configuration_specific.dimension_sizes[i]; unsigned int subsampling_elem_count = 1; for(unsigned int i = 0; i < dimension_count; ++i) subsampling_elem_count *= subsampling_sizes[i]; const unsigned int const_subsampling_elem_count = subsampling_elem_count; const unsigned int feature_map_count = output_configuration_specific.feature_map_count; std::vector<unsigned int> current_local_input_position(dimension_count, 0); std::vector<unsigned int> offset_list(subsampling_elem_count); for(unsigned int i = 1; i < subsampling_elem_count; ++i) { int offset = 0; for(unsigned int j = 0; j < dimension_count; ++j) { offset += static_cast<int>(input_slices[j]); if ((++current_local_input_position[j]) < subsampling_sizes[j]) { offset_list[i] = offset_list[i-1] + offset; break; } current_local_input_position[j] = 0; offset -= static_cast<int>(subsampling_sizes[j] * input_slices[j]); } } const int total_workload = entry_count * output_configuration_specific.feature_map_count; const std::vector<unsigned int>::const_iterator dimension_sizes_it = output_configuration_specific.dimension_sizes.begin(); const std::vector<unsigned int>::const_iterator subsampling_sizes_it = subsampling_sizes.begin(); const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin(); const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin(); #pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) { nnforge_array<unsigned int, max_dimension_count> current_output_position; #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int input_entry_id = workload_id / feature_map_count; int feature_map_id = workload_id - (input_entry_id * feature_map_count); int base_output_entry_id = input_entry_id * const_subsampling_elem_count; const float * in_it_base = in_it_global + (input_entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map); float * out_it_base = out_it_global + (base_output_entry_id * output_neuron_count) + (feature_map_id * output_neuron_count_per_feature_map); std::fill_n(current_output_position.begin(), dimension_count, 0); for(float * out_it = out_it_base; out_it != out_it_base + output_neuron_count_per_feature_map; ++out_it) { // Define the starting position of the first input elem const float * in_it = in_it_base; for(unsigned int i = 0; i < dimension_count; ++i) in_it += current_output_position[i] * (*(subsampling_sizes_it + i)) * (*(input_slices_it + i)); for(unsigned int j = 0; j < const_subsampling_elem_count; ++j) { float current_max = -1.0e38F; const float * in_it2 = in_it + *(offset_list_it + j); for(unsigned int i = 0; i < const_subsampling_elem_count; ++i) { float new_val = *(in_it2 + (*(offset_list_it + i))); current_max = std::max<float>(current_max, new_val); } *(out_it + j * output_neuron_count) = current_max; } // Go to the next output element for(unsigned int i = 0; i < dimension_count; ++i) { if ((++current_output_position[i]) < *( dimension_sizes_it + i)) break; current_output_position[i] = 0; } } } } }
void sparse_convolution_layer_updater_plain::backprop( additional_buffer_smart_ptr input_errors, const_additional_buffer_smart_ptr input_neurons, const_additional_buffer_smart_ptr output_errors, const_additional_buffer_smart_ptr output_neurons, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const_layer_data_smart_ptr data, const_layer_data_custom_smart_ptr data_custom, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int updater_count) const { const std::vector<float>::iterator in_err_it_global = input_errors->begin(); const std::vector<float>::const_iterator out_err_it_global = output_errors->begin(); const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map(); const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); nnforge_shared_ptr<const sparse_convolution_layer> layer_derived = nnforge_dynamic_pointer_cast<const sparse_convolution_layer>(layer_schema); const std::vector<unsigned int>& window_sizes = layer_derived->window_sizes; const unsigned int dimension_count = static_cast<unsigned int>(window_sizes.size()); std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size()); input_slices[0] = 1; for(unsigned int i = 0; i < dimension_count - 1; ++i) input_slices[i + 1] = input_slices[i] * input_configuration_specific.dimension_sizes[i]; unsigned int window_elem_count = 1; for(unsigned int i = 0; i < dimension_count; ++i) window_elem_count *= window_sizes[i]; const unsigned int const_window_elem_count = window_elem_count; const std::vector<float>::const_iterator weights = (*data)[0].begin(); const std::vector<int>::const_iterator column_indices = (*data_custom)[0].begin(); const std::vector<int>::const_iterator row_indices = (*data_custom)[1].begin(); std::vector<std::vector<std::pair<int, int> > > in_fm_out_fm_weight_pos_list_list(input_configuration_specific.feature_map_count); for(int output_feature_map_id = 0; output_feature_map_id < output_configuration_specific.feature_map_count; ++output_feature_map_id) { const int start_column_index = row_indices[output_feature_map_id]; const int end_column_index = row_indices[output_feature_map_id + 1]; for(int column_index = start_column_index; column_index < end_column_index; ++column_index) { int input_feature_map_id = column_indices[column_index]; in_fm_out_fm_weight_pos_list_list[input_feature_map_id].push_back(std::make_pair(output_feature_map_id, column_index)); } } std::vector<unsigned int> current_local_input_position(dimension_count, 0); std::vector<unsigned int> offset_list(window_elem_count); for(unsigned int i = 1; i < window_elem_count; ++i) { int offset = 0; for(unsigned int j = 0; j < dimension_count; ++j) { offset += static_cast<int>(input_slices[j]); if ((++current_local_input_position[j]) < window_sizes[j]) { offset_list[i] = offset_list[i-1] + offset; break; } current_local_input_position[j] = 0; offset -= static_cast<int>(window_sizes[j] * input_slices[j]); } } const unsigned int output_feature_map_count = output_configuration_specific.feature_map_count; const unsigned int input_feature_map_count = input_configuration_specific.feature_map_count; const int total_workload = updater_count * input_feature_map_count; const std::vector<unsigned int>::const_iterator output_dimension_sizes_it = output_configuration_specific.dimension_sizes.begin(); const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin(); const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin(); const std::vector<std::vector<std::pair<int, int> > >::const_iterator in_fm_out_fm_weight_pos_it = in_fm_out_fm_weight_pos_list_list.begin(); #pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) { nnforge_array<unsigned int, max_dimension_count> current_output_position; #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int entry_id = workload_id / input_feature_map_count; int input_feature_map_id = workload_id - (entry_id * input_feature_map_count); std::vector<float>::const_iterator out_err_it_base = out_err_it_global + (entry_id * output_neuron_count); std::vector<float>::iterator in_err_it_base = in_err_it_global + (entry_id * input_neuron_count) + (input_feature_map_id * input_neuron_count_per_feature_map); const std::vector<std::pair<int, int> >& out_fm_weight_pos_list = in_fm_out_fm_weight_pos_it[input_feature_map_id]; std::fill_n(in_err_it_base, input_neuron_count_per_feature_map, 0.0F); std::fill_n(current_output_position.begin(), dimension_count, 0); for(std::vector<float>::const_iterator out_err_it_base2 = out_err_it_base; out_err_it_base2 != out_err_it_base + output_neuron_count_per_feature_map; ++out_err_it_base2) { std::vector<float>::iterator in_err_it = in_err_it_base; for(unsigned int i = 0; i < dimension_count; ++i) in_err_it += current_output_position[i] * (*(input_slices_it + i)); for(std::vector<std::pair<int, int> >::const_iterator it = out_fm_weight_pos_list.begin(); it != out_fm_weight_pos_list.end(); ++it) { int output_feature_map_id = it->first; int weight_block_id = it->second; std::vector<float>::const_iterator out_err_it = out_err_it_base2 + (output_feature_map_id * output_neuron_count_per_feature_map); std::vector<float>::const_iterator weights_it = weights + weight_block_id * const_window_elem_count; float current_err = *out_err_it; for(unsigned int i = 0; i < const_window_elem_count; ++i) { float w = *weights_it; *(in_err_it + *(offset_list_it + i)) += (w * current_err); ++weights_it; } } // Go to the next output element for(unsigned int i = 0; i < dimension_count; ++i) { if ((++current_output_position[i]) < *(output_dimension_sizes_it + i)) break; current_output_position[i] = 0; } } } } }
void sparse_convolution_layer_updater_plain::update_weights( const_additional_buffer_smart_ptr input_neurons, const_additional_buffer_smart_ptr output_errors, std::vector<additional_buffer_smart_ptr>& additional_buffers, layer_data_smart_ptr gradient, const_layer_data_custom_smart_ptr data_custom, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int updater_count, unsigned int offset_input_entry_id) const { const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map(); const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); const std::vector<float>::const_iterator in_it_global = input_neurons->begin() + input_neuron_count * offset_input_entry_id; const std::vector<float>::const_iterator out_err_it_global = output_errors->begin(); nnforge_shared_ptr<const sparse_convolution_layer> layer_derived = nnforge_dynamic_pointer_cast<const sparse_convolution_layer>(layer_schema); const std::vector<unsigned int>& window_sizes = layer_derived->window_sizes; unsigned int feature_map_connection_count = layer_derived->feature_map_connection_count; const unsigned int dimension_count = static_cast<unsigned int>(window_sizes.size()); std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size()); input_slices[0] = 1; for(unsigned int i = 0; i < dimension_count - 1; ++i) input_slices[i + 1] = input_slices[i] * input_configuration_specific.dimension_sizes[i]; unsigned int window_elem_count = 1; for(unsigned int i = 0; i < dimension_count; ++i) window_elem_count *= window_sizes[i]; const unsigned int const_window_elem_count = window_elem_count; const std::vector<float>::iterator gradient_weights = (*gradient)[0].begin(); const std::vector<float>::iterator gradient_biases = (*gradient)[1].begin(); const std::vector<int>::const_iterator column_indices = (*data_custom)[0].begin(); const std::vector<int>::const_iterator row_indices = (*data_custom)[1].begin(); std::vector<std::pair<int, int> > out_fm_in_fm_list(feature_map_connection_count); int i = 0; for(int output_feature_map_id = 0; output_feature_map_id < output_configuration_specific.feature_map_count; ++output_feature_map_id) { const int start_column_index = row_indices[output_feature_map_id]; const int end_column_index = row_indices[output_feature_map_id + 1]; for(int column_index = start_column_index; column_index < end_column_index; ++column_index) { int input_feature_map_id = column_indices[column_index]; out_fm_in_fm_list[i].first = output_feature_map_id; out_fm_in_fm_list[i].second = input_feature_map_id; ++i; } } std::vector<unsigned int> current_local_input_position(dimension_count, 0); std::vector<unsigned int> offset_list(window_elem_count); for(unsigned int i = 1; i < window_elem_count; ++i) { int offset = 0; for(unsigned int j = 0; j < dimension_count; ++j) { offset += static_cast<int>(input_slices[j]); if ((++current_local_input_position[j]) < window_sizes[j]) { offset_list[i] = offset_list[i-1] + offset; break; } current_local_input_position[j] = 0; offset -= static_cast<int>(window_sizes[j] * input_slices[j]); } } const unsigned int output_feature_map_count = output_configuration_specific.feature_map_count; const unsigned int input_feature_map_count = input_configuration_specific.feature_map_count; const int total_workload = feature_map_connection_count; const unsigned int const_entry_count = updater_count; const std::vector<unsigned int>::const_iterator output_dimension_sizes_it = output_configuration_specific.dimension_sizes.begin(); const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin(); const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin(); const std::vector<std::pair<int, int> >::const_iterator out_fm_in_fm_it = out_fm_in_fm_list.begin(); const int const_updater_count = updater_count; #pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) { nnforge_array<unsigned int, max_dimension_count> current_output_position; std::vector<float> weights_local(const_window_elem_count, 0.0F); #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int weight_block_id = workload_id; int output_feature_map_id = out_fm_in_fm_it[weight_block_id].first; int input_feature_map_id = out_fm_in_fm_it[weight_block_id].second; std::fill_n(weights_local.begin(), const_window_elem_count, 0.0F); for(int entry_id = 0; entry_id < const_updater_count; ++entry_id) { std::vector<float>::const_iterator in_it_base = in_it_global + (entry_id * input_neuron_count) + (input_feature_map_id * input_neuron_count_per_feature_map); std::vector<float>::const_iterator out_err_it_base = out_err_it_global + (entry_id * output_neuron_count) + (output_feature_map_id * output_neuron_count_per_feature_map); std::fill_n(current_output_position.begin(), dimension_count, 0); for(std::vector<float>::const_iterator out_err_it = out_err_it_base; out_err_it != out_err_it_base + output_neuron_count_per_feature_map; ++out_err_it) { std::vector<float>::const_iterator in_it = in_it_base; for(unsigned int i = 0; i < dimension_count; ++i) in_it += current_output_position[i] * (*(input_slices_it + i)); float current_err = *out_err_it; for(unsigned int i = 0; i < const_window_elem_count; ++i) { float in_neuron = *(in_it + *(offset_list_it + i)); weights_local[i] += (in_neuron * current_err); } // Go to the next output element for(unsigned int i = 0; i < dimension_count; ++i) { if ((++current_output_position[i]) < *(output_dimension_sizes_it + i)) break; current_output_position[i] = 0; } } } std::vector<float>::iterator gradient_weights_it_base = gradient_weights + weight_block_id * const_window_elem_count; std::vector<float>::iterator weights_local_it = weights_local.begin(); for(std::vector<float>::iterator it = gradient_weights_it_base; it != gradient_weights_it_base + const_window_elem_count; ++it, ++weights_local_it) *it += *weights_local_it; } } const int total_workload_bias = output_feature_map_count; #pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count) for(int workload_id = 0; workload_id < total_workload_bias; ++workload_id) { int output_feature_map_id = workload_id; float sum = 0.0F; for(int entry_id = 0; entry_id < const_updater_count; ++entry_id) { std::vector<float>::const_iterator out_err_it_base = out_err_it_global + (entry_id * output_neuron_count) + (output_feature_map_id * output_neuron_count_per_feature_map); for(std::vector<float>::const_iterator out_err_it = out_err_it_base; out_err_it != out_err_it_base + output_neuron_count_per_feature_map; ++out_err_it) sum += *out_err_it; } *(gradient_biases + output_feature_map_id) += sum; } }
void local_contrast_subtractive_layer_updater_plain::test( const_additional_buffer_smart_ptr input_buffer, additional_buffer_smart_ptr output_buffer, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const_layer_data_smart_ptr data, const_layer_data_custom_smart_ptr data_custom, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int updater_count, unsigned int offset_input_entry_id) const { if (offset_input_entry_id > 0) throw neural_network_exception("local_contrast_subtractive_layer_updater_plain is not able to run using offset"); const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map(); const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); nnforge_shared_ptr<const local_contrast_subtractive_layer> layer_derived = nnforge_dynamic_pointer_cast<const local_contrast_subtractive_layer>(layer_schema); const std::vector<std::vector<float> >& window_weights_list = layer_derived->window_weights_list; const std::vector<unsigned int>& feature_maps_affected = layer_derived->feature_maps_affected; const std::vector<unsigned int>& feature_maps_unaffected = layer_derived->feature_maps_unaffected; const unsigned int dimension_count = static_cast<unsigned int>(window_weights_list.size()); std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size()); input_slices[0] = 1; for(unsigned int i = 0; i < dimension_count - 1; ++i) input_slices[i + 1] = input_slices[i] * input_configuration_specific.dimension_sizes[i]; const std::vector<unsigned int>::const_iterator dimension_sizes_it = output_configuration_specific.dimension_sizes.begin(); const unsigned int feature_maps_affected_count = static_cast<unsigned int>(feature_maps_affected.size()); const unsigned int feature_maps_unaffected_count = static_cast<unsigned int>(feature_maps_affected.size()); const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin(); const std::vector<unsigned int>::const_iterator feature_maps_affected_it = feature_maps_affected.begin(); const std::vector<float>::const_iterator input_buffer_it = input_buffer->begin(); const std::vector<float>::iterator output_buffer_it = output_buffer->begin(); const std::vector<std::vector<float> >::const_iterator window_weights_list_it = window_weights_list.begin(); const int total_workload = updater_count * feature_maps_affected_count; const int openmp_thread_count = plain_config->openmp_thread_count; #pragma omp parallel default(none) shared(additional_buffers) num_threads(openmp_thread_count) { std::vector<additional_buffer_smart_ptr> local_additional_buffers; int thread_id = 0; #ifdef _OPENMP thread_id = omp_get_thread_num(); #endif local_additional_buffers.push_back(additional_buffers[thread_id]); if (dimension_count > 1) local_additional_buffers.push_back(additional_buffers[openmp_thread_count + thread_id]); #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int entry_id = workload_id / feature_maps_affected_count; int affected_feature_map_id = workload_id - (entry_id * feature_maps_affected_count); unsigned int current_output_buffer_index = 0; unsigned int feature_map_id = *(feature_maps_affected_it + affected_feature_map_id); for(unsigned int dimension_id = 0; dimension_id < dimension_count; ++dimension_id) { std::vector<float>::iterator out_it_base = local_additional_buffers[current_output_buffer_index]->begin(); std::vector<float>::const_iterator in_it; if (dimension_id > 0) in_it = local_additional_buffers[1 - current_output_buffer_index]->begin(); else in_it = input_buffer_it + (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map); int max_output_size = *(dimension_sizes_it + dimension_id); int input_slice_size = *(input_slices_it + dimension_id); std::vector<unsigned int> current_output_position(dimension_count, 0); for(std::vector<float>::iterator out_it = out_it_base; out_it != out_it_base + output_neuron_count_per_feature_map; ++out_it, ++in_it) { const std::vector<float>& current_window_weights_list = *(window_weights_list_it + dimension_id); float sum = *in_it * current_window_weights_list[0]; int current_position = static_cast<int>(current_output_position[dimension_id]); int dest_forward = current_position; int dest_backward = dest_forward; for (std::vector<float>::const_iterator it = current_window_weights_list.begin() + 1; it != current_window_weights_list.end(); ++it) { dest_forward++; dest_backward--; int dest_forward_actual = (dest_forward < max_output_size) ? dest_forward : (((max_output_size << 1) - 1) - dest_forward); int dest_backward_actual = (dest_backward >= 0) ? dest_backward : (-1 - dest_backward); int offset_forward = ((dest_forward_actual - current_position) * input_slice_size); int offset_backward = ((dest_backward_actual - current_position) * input_slice_size); sum += (*(in_it + offset_forward) + *(in_it + offset_backward)) * (*it); } *out_it = sum; // Go to the next output element for(unsigned int i = 0; i < dimension_count; ++i) { if ((++current_output_position[i]) < *(dimension_sizes_it + i)) break; current_output_position[i] = 0; } } current_output_buffer_index = 1 - current_output_buffer_index; } // for(unsigned int dimension_id // Subtract the gaussian blur { std::vector<float>::const_iterator original_in_it = input_buffer_it + (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map); std::vector<float>::iterator out_it = output_buffer_it + (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map); std::vector<float>::const_iterator in_it = local_additional_buffers[1 - current_output_buffer_index]->begin(); for(int i = 0; i < static_cast<int>(input_neuron_count_per_feature_map); ++i) *(out_it + i) = *(original_in_it + i) - *(in_it + i); } } } // #pragma parallel if (feature_maps_unaffected_count > 0) { for(unsigned int entry_id = 0; entry_id < updater_count; ++entry_id) { for(std::vector<unsigned int>::const_iterator it = feature_maps_unaffected.begin(); it != feature_maps_unaffected.end(); ++it) { unsigned int feature_map_id = *it; std::vector<float>::const_iterator original_in_it = input_buffer_it + (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map); std::vector<float>::iterator out_it = output_buffer_it + (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map); std::copy(original_in_it, original_in_it + input_neuron_count_per_feature_map, out_it); } } } }
void max_subsampling_layer_tester_plain::test_non_tiling( plain_buffer::ptr output_buffer, plain_buffer::const_ptr input_buffer, plain_running_configuration::const_ptr plain_config, layer::const_ptr layer_schema, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int entry_count) const { std::vector<unsigned int> input_dimension_sizes = input_configuration_specific.dimension_sizes; if (input_dimension_sizes.empty()) input_dimension_sizes.push_back(1); std::vector<unsigned int> output_dimension_sizes = output_configuration_specific.dimension_sizes; if (output_dimension_sizes.empty()) output_dimension_sizes.push_back(1); std::shared_ptr<const max_subsampling_layer> layer_derived = std::dynamic_pointer_cast<const max_subsampling_layer>(layer_schema); for(std::vector<bool>::const_iterator it = layer_derived->round_ups.begin(); it != layer_derived->round_ups.end(); ++it) if (*it) throw neural_network_exception("round up is not implemented for max_subsampling_layer_tester_plain"); const float * const in_it_global = *input_buffer; float * const out_it_global = *output_buffer; const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map(); const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); std::vector<unsigned int> strides = layer_derived->strides; if (strides.empty()) strides.push_back(1); std::vector<unsigned int> subsampling_sizes = layer_derived->subsampling_sizes; if (subsampling_sizes.empty()) subsampling_sizes.push_back(1); const unsigned int feature_map_subsampling_size = layer_derived->feature_map_subsampling_size; subsampling_sizes.push_back(feature_map_subsampling_size); const unsigned int entry_subsampling_size = layer_derived->entry_subsampling_size; subsampling_sizes.push_back(entry_subsampling_size); const unsigned int subsampling_dimension_count = static_cast<unsigned int>(subsampling_sizes.size()); const unsigned int spatial_dimension_count = static_cast<unsigned int>(output_dimension_sizes.size()); std::vector<unsigned int> input_slices(subsampling_sizes.size()); input_slices[0] = 1; for(unsigned int i = 0; i < subsampling_dimension_count - 1; ++i) { int dimension_size = (i < spatial_dimension_count) ? input_dimension_sizes[i] : input_configuration_specific.feature_map_count; input_slices[i + 1] = input_slices[i] * dimension_size; } unsigned int subsampling_elem_count = 1; for(unsigned int i = 0; i < subsampling_dimension_count; ++i) subsampling_elem_count *= subsampling_sizes[i]; const unsigned int const_subsampling_elem_count = subsampling_elem_count; const float mult = 1.0F / static_cast<float>(subsampling_elem_count); const unsigned int output_feature_map_count = output_configuration_specific.feature_map_count; const bool is_min = layer_derived->is_min; std::vector<unsigned int> current_local_input_position(subsampling_dimension_count, 0); std::vector<unsigned int> offset_list(subsampling_elem_count); for(unsigned int i = 1; i < subsampling_elem_count; ++i) { int offset = 0; for(unsigned int j = 0; j < subsampling_dimension_count; ++j) { offset += static_cast<int>(input_slices[j]); if ((++current_local_input_position[j]) < subsampling_sizes[j]) { offset_list[i] = offset_list[i-1] + offset; break; } current_local_input_position[j] = 0; offset -= static_cast<int>(subsampling_sizes[j] * input_slices[j]); } } const int total_workload = entry_count * output_configuration_specific.feature_map_count; const std::vector<unsigned int>::const_iterator dimension_sizes_it = output_dimension_sizes.begin(); const std::vector<unsigned int>::const_iterator strides_it = strides.begin(); const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin(); const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin(); #pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) { std::array<unsigned int, max_dimension_count> current_output_position; #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int output_entry_id = workload_id / output_feature_map_count; int output_feature_map_id = workload_id - (output_entry_id * output_feature_map_count); const float * in_it_base = in_it_global + (output_entry_id * entry_subsampling_size * input_neuron_count) + (output_feature_map_id * feature_map_subsampling_size * input_neuron_count_per_feature_map); float * out_it_base = out_it_global + (output_entry_id * output_neuron_count) + (output_feature_map_id * output_neuron_count_per_feature_map); std::fill_n(current_output_position.begin(), spatial_dimension_count, 0); for(float * out_it = out_it_base; out_it != out_it_base + output_neuron_count_per_feature_map; ++out_it) { // Define the starting position of the first input elem const float * in_it = in_it_base; for(unsigned int i = 0; i < spatial_dimension_count; ++i) in_it += current_output_position[i] * (*(strides_it + i)) * (*(input_slices_it + i)); float current_max = is_min ? 1.0e37F : -1.0e37F; for(unsigned int i = 0; i < const_subsampling_elem_count; ++i) { float new_val = *(in_it + (*(offset_list_it + i))); current_max = is_min ? std::min<float>(current_max, new_val) : std::max<float>(current_max, new_val); } *out_it = current_max; // Go to the next output element for(unsigned int i = 0; i < spatial_dimension_count; ++i) { if ((++current_output_position[i]) < *( dimension_sizes_it + i)) break; current_output_position[i] = 0; } } } } }
void sparse_convolution_layer_tester_plain::run_forward_propagation( plain_buffer::ptr output_buffer, const std::vector<plain_buffer::const_ptr>& input_buffers, plain_buffer::ptr temporary_working_fixed_buffer, plain_buffer::ptr temporary_working_per_entry_buffer, plain_running_configuration::const_ptr plain_config, layer::const_ptr layer_schema, layer_data::const_ptr data, layer_data_custom::const_ptr data_custom, const std::vector<layer_configuration_specific>& input_configuration_specific_list, const layer_configuration_specific& output_configuration_specific, unsigned int entry_count) const { const unsigned int input_neuron_count = input_configuration_specific_list[0].get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific_list[0].get_neuron_count_per_feature_map(); const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); const float * const in_it_global = *input_buffers[0]; float * const out_it_global = *output_buffer; nnforge_shared_ptr<const sparse_convolution_layer> layer_derived = nnforge_dynamic_pointer_cast<const sparse_convolution_layer>(layer_schema); const bool bias = layer_derived->bias; std::vector<unsigned int> window_sizes_extended = layer_derived->window_sizes; window_sizes_extended.resize(max_dimension_count, 1); const std::vector<unsigned int>& window_sizes = window_sizes_extended; std::vector<unsigned int> strides_extended = layer_derived->strides; strides_extended.resize(max_dimension_count, 1); const std::vector<unsigned int>& strides = strides_extended; std::vector<unsigned int> left_zero_padding_extended = layer_derived->left_zero_padding; left_zero_padding_extended.resize(max_dimension_count, 0); const std::vector<unsigned int>& left_zero_padding = left_zero_padding_extended; std::vector<unsigned int> right_zero_padding_extended = layer_derived->right_zero_padding; right_zero_padding_extended.resize(max_dimension_count, 0); const std::vector<unsigned int>& right_zero_padding = right_zero_padding_extended; std::vector<unsigned int> input_dimension_sizes_extended = input_configuration_specific_list[0].dimension_sizes; input_dimension_sizes_extended .resize(max_dimension_count, 1); const std::vector<unsigned int>& input_dimension_sizes = input_dimension_sizes_extended ; const unsigned int dimension_count = static_cast<unsigned int>(layer_derived->window_sizes.size()); std::vector<unsigned int> input_slices(input_configuration_specific_list[0].dimension_sizes.size()); input_slices[0] = 1; for(unsigned int i = 0; i < dimension_count - 1; ++i) input_slices[i + 1] = input_slices[i] * input_configuration_specific_list[0].dimension_sizes[i]; unsigned int window_elem_count = 1; for(unsigned int i = 0; i < dimension_count; ++i) window_elem_count *= window_sizes[i]; const unsigned int const_window_elem_count = window_elem_count; const std::vector<float>::const_iterator weights = (*data)[0].begin(); const float * const biases = bias ? &(*data)[1][0] : 0; const std::vector<int>::const_iterator column_indices = (*data_custom)[0].begin(); const std::vector<int>::const_iterator row_indices = (*data_custom)[1].begin(); std::vector<unsigned int> current_local_input_position(dimension_count, 0); std::vector<unsigned int> offset_list(window_elem_count); for(unsigned int i = 1; i < window_elem_count; ++i) { int offset = 0; for(unsigned int j = 0; j < dimension_count; ++j) { offset += static_cast<int>(input_slices[j]); if ((++current_local_input_position[j]) < window_sizes[j]) { offset_list[i] = offset_list[i-1] + offset; break; } current_local_input_position[j] = 0; offset -= static_cast<int>(window_sizes[j] * input_slices[j]); } } const unsigned int output_feature_map_count = output_configuration_specific.feature_map_count; const unsigned int input_feature_map_count = input_configuration_specific_list[0].feature_map_count; const int total_workload = entry_count * output_feature_map_count; const std::vector<unsigned int>::const_iterator output_dimension_sizes_it = output_configuration_specific.dimension_sizes.begin(); const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin(); const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin(); const std::vector<unsigned int>::const_iterator strides_it = strides.begin(); #pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) shared(window_sizes,left_zero_padding,right_zero_padding,input_dimension_sizes) { nnforge_array<unsigned int, max_dimension_count> current_output_position; nnforge_array<int, max_dimension_count> current_input_position; #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int entry_id = workload_id / output_feature_map_count; int output_feature_map_id = workload_id - (entry_id * output_feature_map_count); float * out_it_base = out_it_global + (entry_id * output_neuron_count) + (output_feature_map_id * output_neuron_count_per_feature_map); const float * in_it_base = in_it_global + entry_id * input_neuron_count; const int start_column_index = row_indices[output_feature_map_id]; const int end_column_index = row_indices[output_feature_map_id + 1]; std::fill_n(current_input_position.begin(), max_dimension_count, 0); std::fill_n(current_output_position.begin(), max_dimension_count, 0); for(float * out_it = out_it_base; out_it != out_it_base + output_neuron_count_per_feature_map; ++out_it) { float sum = bias ? *(biases + output_feature_map_id) : 0.0F; std::vector<float>::const_iterator weights_it = weights + start_column_index * const_window_elem_count; int in_it_offset2 = 0; for(unsigned int i = 0; i < dimension_count; ++i) current_input_position[i] = static_cast<int>(current_output_position[i] * strides_it[i]) - static_cast<int>(left_zero_padding[i]); for(unsigned int i = 0; i < dimension_count; ++i) in_it_offset2 += current_input_position[i] * (*(input_slices_it + i)); for(int column_index = start_column_index; column_index < end_column_index; ++column_index) { int input_feature_map_id = column_indices[column_index]; // Define the starting position of the first input elem int in_it_offset = in_it_offset2 + (input_feature_map_id * input_neuron_count_per_feature_map); int ind = 0; for(int w = current_input_position[3]; w < current_input_position[3] + static_cast<int>(window_sizes[3]); ++w) { bool fit3 = ((unsigned int)w < (unsigned int)input_dimension_sizes[3]); for(int z = current_input_position[2]; z < current_input_position[2] + static_cast<int>(window_sizes[2]); ++z) { bool fit2 = fit3 && ((unsigned int)z < (unsigned int)input_dimension_sizes[2]); for(int y = current_input_position[1]; y < current_input_position[1] + static_cast<int>(window_sizes[1]); ++y) { bool fit1 = fit2 && ((unsigned int)y < (unsigned int)input_dimension_sizes[1]); for(int x = current_input_position[0]; x < current_input_position[0] + static_cast<int>(window_sizes[0]); ++x) { bool fit0 = fit1 && ((unsigned int)x < (unsigned int)input_dimension_sizes[0]); if (fit0) sum += (*(in_it_base + (in_it_offset + *(offset_list_it + ind)))) * (*weights_it); ++ind; ++weights_it; } } } } } *out_it = sum; // Go to the next output element for(unsigned int i = 0; i < dimension_count; ++i) { if ((++current_output_position[i]) < *(output_dimension_sizes_it + i)) break; current_output_position[i] = 0; } } } } }
void convolution_layer_hessian_plain::update_hessian( const_additional_buffer_smart_ptr input_neurons, const_additional_buffer_smart_ptr output_errors, std::vector<additional_buffer_smart_ptr>& additional_buffers, layer_data_smart_ptr hessian_data, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int entry_count) const { const std::vector<float>::const_iterator in_it_global = input_neurons->begin(); const std::vector<float>::const_iterator out_err_it_global = output_errors->begin(); const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map(); const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); std::tr1::shared_ptr<const convolution_layer> layer_derived = std::tr1::dynamic_pointer_cast<const convolution_layer>(layer_schema); const std::vector<unsigned int>& window_sizes = layer_derived->window_sizes; const unsigned int dimension_count = static_cast<unsigned int>(window_sizes.size()); std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size()); input_slices[0] = 1; for(unsigned int i = 0; i < dimension_count - 1; ++i) input_slices[i + 1] = input_slices[i] * input_configuration_specific.dimension_sizes[i]; unsigned int window_elem_count = 1; for(unsigned int i = 0; i < dimension_count; ++i) window_elem_count *= window_sizes[i]; const unsigned int const_window_elem_count = window_elem_count; const std::vector<float>::iterator weights = (*hessian_data)[0].begin(); const std::vector<float>::iterator biases = (*hessian_data)[1].begin(); std::vector<unsigned int> current_local_input_position(dimension_count, 0); std::vector<unsigned int> offset_list(window_elem_count); for(unsigned int i = 1; i < window_elem_count; ++i) { int offset = 0; for(unsigned int j = 0; j < dimension_count; ++j) { offset += static_cast<int>(input_slices[j]); if ((++current_local_input_position[j]) < window_sizes[j]) { offset_list[i] = offset_list[i-1] + offset; break; } current_local_input_position[j] = 0; offset -= static_cast<int>(window_sizes[j] * input_slices[j]); } } const unsigned int output_feature_map_count = output_configuration_specific.feature_map_count; const unsigned int input_feature_map_count = input_configuration_specific.feature_map_count; const int total_workload = output_feature_map_count * input_feature_map_count; const unsigned int const_entry_count = entry_count; const std::vector<unsigned int>::const_iterator output_dimension_sizes_it = output_configuration_specific.dimension_sizes.begin(); const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin(); const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin(); #pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) { std::tr1::array<unsigned int, max_dimension_count> current_output_position; std::vector<float> weights_global(const_window_elem_count, 0.0F); std::vector<float> weights_local(const_window_elem_count, 0.0F); #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int output_feature_map_id = workload_id / input_feature_map_count; int input_feature_map_id = workload_id - (output_feature_map_id * input_feature_map_count); std::vector<float>::const_iterator in_it_base = in_it_global + (input_feature_map_id * input_neuron_count_per_feature_map); std::vector<float>::const_iterator out_err_it_base = out_err_it_global + (output_feature_map_id * output_neuron_count_per_feature_map); std::vector<float>::iterator weights_it_base = weights + (output_feature_map_id * (const_window_elem_count * input_feature_map_count)) + (const_window_elem_count * input_feature_map_id); std::fill_n(weights_global.begin(), const_window_elem_count, 0.0F); for(unsigned int entry_id = 0; entry_id < const_entry_count; ++entry_id) { std::vector<float>::const_iterator in_it_base2 = in_it_base + (entry_id * input_neuron_count); std::vector<float>::const_iterator out_err_it_base2 = out_err_it_base + (entry_id * output_neuron_count); std::fill_n(current_output_position.begin(), dimension_count, 0); std::fill_n(weights_local.begin(), const_window_elem_count, 0.0F); for(std::vector<float>::const_iterator out_err_it = out_err_it_base2; out_err_it != out_err_it_base2 + output_neuron_count_per_feature_map; ++out_err_it) { std::vector<float>::const_iterator in_it = in_it_base2; for(unsigned int i = 0; i < dimension_count; ++i) in_it += current_output_position[i] * (*(input_slices_it + i)); float current_err = *out_err_it; for(unsigned int i = 0; i < const_window_elem_count; ++i) { float in_neuron = *(in_it + *(offset_list_it + i)); weights_local[i] += (in_neuron * in_neuron * current_err); } // Go to the next output element for(unsigned int i = 0; i < dimension_count; ++i) { if ((++current_output_position[i]) < *(output_dimension_sizes_it + i)) break; current_output_position[i] = 0; } } std::vector<float>::iterator weights_local_it = weights_local.begin(); for(std::vector<float>::iterator it = weights_global.begin(); it != weights_global.end(); ++it, ++weights_local_it) *it += *weights_local_it; } std::vector<float>::iterator weights_global_it = weights_global.begin(); for(std::vector<float>::iterator it = weights_it_base; it != weights_it_base + const_window_elem_count; ++it, ++weights_global_it) *it += *weights_global_it; } } const int total_workload_bias = output_feature_map_count; #pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count) for(int workload_id = 0; workload_id < total_workload_bias; ++workload_id) { unsigned int output_feature_map_id = workload_id; std::vector<float>::const_iterator out_err_it_base = out_err_it_global + (output_feature_map_id * output_neuron_count_per_feature_map); float sum = 0.0F; for(unsigned int entry_id = 0; entry_id < const_entry_count; ++entry_id) { std::vector<float>::const_iterator out_err_it_base2 = out_err_it_base + (entry_id * output_neuron_count); float sum_local = 0.0F; for(std::vector<float>::const_iterator out_err_it = out_err_it_base2; out_err_it != out_err_it_base2 + output_neuron_count_per_feature_map; ++out_err_it) sum_local += *out_err_it; sum += sum_local; } *(biases + output_feature_map_id) += sum; } }
void average_subsampling_layer_hessian_plain::backprop( additional_buffer_smart_ptr input_errors, const_additional_buffer_smart_ptr output_errors, const_additional_buffer_smart_ptr output_neurons, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const_layer_data_smart_ptr data, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int entry_count) const { const std::vector<float>::iterator in_err_it_global = input_errors->begin(); const std::vector<float>::const_iterator out_err_it_global = output_errors->begin(); const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map(); const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); std::tr1::shared_ptr<const average_subsampling_layer> layer_derived = std::tr1::dynamic_pointer_cast<const average_subsampling_layer>(layer_schema); const std::vector<unsigned int>& subsampling_sizes = layer_derived->subsampling_sizes; const unsigned int dimension_count = static_cast<unsigned int>(layer_derived->subsampling_sizes.size()); std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size()); input_slices[0] = 1; for(unsigned int i = 0; i < dimension_count - 1; ++i) input_slices[i + 1] = input_slices[i] * input_configuration_specific.dimension_sizes[i]; unsigned int subsampling_elem_count = 1; for(unsigned int i = 0; i < dimension_count; ++i) subsampling_elem_count *= subsampling_sizes[i]; const unsigned int const_subsampling_elem_count = subsampling_elem_count; const float mult = 1.0F / static_cast<float>(subsampling_elem_count * subsampling_elem_count); const unsigned int feature_map_count = output_configuration_specific.feature_map_count; std::vector<unsigned int> current_local_input_position(dimension_count, 0); std::vector<unsigned int> offset_list(subsampling_elem_count); for(unsigned int i = 1; i < subsampling_elem_count; ++i) { int offset = 0; for(unsigned int j = 0; j < dimension_count; ++j) { offset += static_cast<int>(input_slices[j]); if ((++current_local_input_position[j]) < subsampling_sizes[j]) { offset_list[i] = offset_list[i-1] + offset; break; } current_local_input_position[j] = 0; offset -= static_cast<int>(subsampling_sizes[j] * input_slices[j]); } } const int total_workload = entry_count * output_configuration_specific.feature_map_count; const std::vector<unsigned int>::const_iterator dimension_sizes_it = output_configuration_specific.dimension_sizes.begin(); const std::vector<unsigned int>::const_iterator subsampling_sizes_it = subsampling_sizes.begin(); const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin(); const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin(); #pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) { std::tr1::array<unsigned int, max_dimension_count> current_output_position; #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int entry_id = workload_id / feature_map_count; int feature_map_id = workload_id - (entry_id * feature_map_count); std::vector<float>::iterator in_err_it_base = in_err_it_global + (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map); std::vector<float>::const_iterator out_err_it_base = out_err_it_global + (entry_id * output_neuron_count) + (feature_map_id * output_neuron_count_per_feature_map); std::fill_n(current_output_position.begin(), dimension_count, 0); for(std::vector<float>::const_iterator out_it = out_err_it_base; out_it != out_err_it_base + output_neuron_count_per_feature_map; ++out_it) { // Define the starting position of the first input elem std::vector<float>::iterator in_it = in_err_it_base; for(unsigned int i = 0; i < dimension_count; ++i) in_it += current_output_position[i] * (*(subsampling_sizes_it + i)) * (*(input_slices_it + i)); float err = *out_it * mult; for(unsigned int i = 0; i < const_subsampling_elem_count; ++i) { *(in_it + (*(offset_list_it + i))) = err; } // Go to the next output element for(unsigned int i = 0; i < dimension_count; ++i) { if ((++current_output_position[i]) < *( dimension_sizes_it + i)) break; current_output_position[i] = 0; } } } } }
void local_contrast_subtractive_layer_updater_plain::run_backward_data_propagation( unsigned int input_index, plain_buffer::ptr input_errors_buffer, plain_buffer::const_ptr output_errors_buffer, const std::vector<plain_buffer::const_ptr>& input_neurons_buffers, plain_buffer::const_ptr output_neurons_buffer, plain_buffer::ptr temporary_working_fixed_buffer, plain_buffer::ptr temporary_working_per_entry_buffer, plain_buffer::ptr temporary_per_entry_buffer, plain_running_configuration::const_ptr plain_config, layer::const_ptr layer_schema, layer_data::const_ptr data, layer_data_custom::const_ptr data_custom, const std::vector<layer_configuration_specific>& input_configuration_specific_list, const layer_configuration_specific& output_configuration_specific, const bool add_update_to_destination, const std::set<layer_action>& actions, unsigned int entry_count) const { const unsigned int neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); nnforge_shared_ptr<const local_contrast_subtractive_layer> layer_derived = nnforge_dynamic_pointer_cast<const local_contrast_subtractive_layer>(layer_schema); const std::vector<std::vector<float> >& window_weights_list = layer_derived->window_weights_list; const std::vector<unsigned int>& feature_maps_affected = layer_derived->feature_maps_affected; const std::vector<unsigned int>& feature_maps_unaffected = layer_derived->feature_maps_unaffected; const unsigned int dimension_count = static_cast<unsigned int>(window_weights_list.size()); std::vector<unsigned int> input_slices(input_configuration_specific_list[0].dimension_sizes.size()); input_slices[0] = 1; for(unsigned int i = 0; i < dimension_count - 1; ++i) input_slices[i + 1] = input_slices[i] * input_configuration_specific_list[0].dimension_sizes[i]; const std::vector<unsigned int>::const_iterator dimension_sizes_it = output_configuration_specific.dimension_sizes.begin(); const unsigned int feature_maps_affected_count = static_cast<unsigned int>(feature_maps_affected.size()); const unsigned int feature_maps_unaffected_count = static_cast<unsigned int>(feature_maps_affected.size()); const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin(); const std::vector<unsigned int>::const_iterator feature_maps_affected_it = feature_maps_affected.begin(); float * const input_errors_it = *input_errors_buffer; const float * const output_errors_it = *output_errors_buffer; const std::vector<std::vector<float> >::const_iterator window_weights_list_it = window_weights_list.begin(); float * const working_buffer_it = *temporary_working_fixed_buffer; const int total_workload = entry_count * feature_maps_affected_count; const int openmp_thread_count = plain_config->openmp_thread_count; #pragma omp parallel default(none) num_threads(openmp_thread_count) { std::vector<float *> local_additional_buffers; int thread_id = 0; #ifdef _OPENMP thread_id = omp_get_thread_num(); #endif local_additional_buffers.push_back(working_buffer_it + thread_id * neuron_count_per_feature_map); if (dimension_count > 1) local_additional_buffers.push_back(working_buffer_it + (openmp_thread_count + thread_id) * neuron_count_per_feature_map); #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int entry_id = workload_id / feature_maps_affected_count; int affected_feature_map_id = workload_id - (entry_id * feature_maps_affected_count); unsigned int current_output_buffer_index = 0; unsigned int feature_map_id = *(feature_maps_affected_it + affected_feature_map_id); for(unsigned int dimension_id = 0; dimension_id < dimension_count; ++dimension_id) { float * out_it_base = local_additional_buffers[current_output_buffer_index]; const float * in_it; if (dimension_id > 0) in_it = local_additional_buffers[1 - current_output_buffer_index]; else in_it = output_errors_it + (entry_id * neuron_count) + (feature_map_id * neuron_count_per_feature_map); int max_output_size = *(dimension_sizes_it + dimension_id); int input_slice_size = *(input_slices_it + dimension_id); std::vector<unsigned int> current_output_position(dimension_count, 0); for(float * out_it = out_it_base; out_it != out_it_base + neuron_count_per_feature_map; ++out_it, ++in_it) { const std::vector<float>& current_window_weights_list = *(window_weights_list_it + dimension_id); float sum = *in_it * current_window_weights_list[0]; int current_position = static_cast<int>(current_output_position[dimension_id]); int dest_forward = current_position; int dest_backward = dest_forward; for (std::vector<float>::const_iterator it = current_window_weights_list.begin() + 1; it != current_window_weights_list.end(); ++it) { dest_forward++; dest_backward--; int dest_forward_actual = (dest_forward < max_output_size) ? dest_forward : (((max_output_size << 1) - 1) - dest_forward); int dest_backward_actual = (dest_backward >= 0) ? dest_backward : (-1 - dest_backward); int offset_forward = ((dest_forward_actual - current_position) * input_slice_size); int offset_backward = ((dest_backward_actual - current_position) * input_slice_size); sum += (*(in_it + offset_forward) + *(in_it + offset_backward)) * (*it); } *out_it = sum; // Go to the next output element for(unsigned int i = 0; i < dimension_count; ++i) { if ((++current_output_position[i]) < *(dimension_sizes_it + i)) break; current_output_position[i] = 0; } } current_output_buffer_index = 1 - current_output_buffer_index; } // for(unsigned int dimension_id { float * out_it = input_errors_it + (entry_id * neuron_count) + (feature_map_id * neuron_count_per_feature_map); const float * orig_it = output_errors_it + (entry_id * neuron_count) + (feature_map_id * neuron_count_per_feature_map); const float * in_it = local_additional_buffers[1 - current_output_buffer_index]; if (add_update_to_destination) { for(int i = 0; i < static_cast<int>(neuron_count_per_feature_map); ++i) *(out_it + i) += *(orig_it + i) - *(in_it + i); } else { for(int i = 0; i < static_cast<int>(neuron_count_per_feature_map); ++i) *(out_it + i) = *(orig_it + i) - *(in_it + i); } } } } // #pragma parallel if ((!add_update_to_destination) && (feature_maps_unaffected_count > 0) && (input_errors_it != output_errors_it)) { for(unsigned int entry_id = 0; entry_id < entry_count; ++entry_id) { for(std::vector<unsigned int>::const_iterator it = feature_maps_unaffected.begin(); it != feature_maps_unaffected.end(); ++it) { unsigned int feature_map_id = *it; float * out_it = input_errors_it + (entry_id * neuron_count) + (feature_map_id * neuron_count_per_feature_map); for(unsigned int i = 0; i < neuron_count_per_feature_map; ++i) *(out_it + i) = 0.0F; } } } }
void max_subsampling_layer_updater_plain::test( const_additional_buffer_smart_ptr input_buffer, additional_buffer_smart_ptr output_buffer, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const_layer_data_smart_ptr data, const_layer_data_custom_smart_ptr data_custom, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int updater_count, unsigned int offset_input_entry_id, bool force_deterministic) const { nnforge_shared_ptr<const max_subsampling_layer> layer_derived = nnforge_dynamic_pointer_cast<const max_subsampling_layer>(layer_schema); if (layer_derived->tiling) throw neural_network_exception("max_subsampling_layer_updater_plain is not able to run for max subsampling layer with tiling"); if (offset_input_entry_id > 0) throw neural_network_exception("max_subsampling_layer_updater_plain is not able to run using offset"); const std::vector<float>::const_iterator in_it_global = input_buffer->begin(); const std::vector<float>::iterator out_it_global = output_buffer->begin(); const std::vector<float>::iterator max_indexes_it_global = additional_buffers[0]->begin(); const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map(); const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); const std::vector<unsigned int>& subsampling_sizes = layer_derived->subsampling_sizes; const unsigned int dimension_count = static_cast<unsigned int>(layer_derived->subsampling_sizes.size()); std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size()); input_slices[0] = 1; for(unsigned int i = 0; i < dimension_count - 1; ++i) input_slices[i + 1] = input_slices[i] * input_configuration_specific.dimension_sizes[i]; unsigned int subsampling_elem_count = 1; for(unsigned int i = 0; i < dimension_count; ++i) subsampling_elem_count *= subsampling_sizes[i]; const unsigned int const_subsampling_elem_count = subsampling_elem_count; const unsigned int feature_map_count = output_configuration_specific.feature_map_count; std::vector<unsigned int> current_local_input_position(dimension_count, 0); std::vector<unsigned int> offset_list(subsampling_elem_count); for(unsigned int i = 1; i < subsampling_elem_count; ++i) { int offset = 0; for(unsigned int j = 0; j < dimension_count; ++j) { offset += static_cast<int>(input_slices[j]); if ((++current_local_input_position[j]) < subsampling_sizes[j]) { offset_list[i] = offset_list[i-1] + offset; break; } current_local_input_position[j] = 0; offset -= static_cast<int>(subsampling_sizes[j] * input_slices[j]); } } const int total_workload = updater_count * output_configuration_specific.feature_map_count; const std::vector<unsigned int>::const_iterator dimension_sizes_it = output_configuration_specific.dimension_sizes.begin(); const std::vector<unsigned int>::const_iterator subsampling_sizes_it = subsampling_sizes.begin(); const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin(); const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin(); #pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) { nnforge_array<unsigned int, max_dimension_count> current_output_position; #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int entry_id = workload_id / feature_map_count; int feature_map_id = workload_id - (entry_id * feature_map_count); const int in_base_offset = (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map); std::vector<float>::iterator out_it_base = out_it_global + (entry_id * output_neuron_count) + (feature_map_id * output_neuron_count_per_feature_map); std::vector<float>::iterator max_indexes_it_base = max_indexes_it_global + (entry_id * output_neuron_count) + (feature_map_id * output_neuron_count_per_feature_map); std::fill_n(current_output_position.begin(), dimension_count, 0); std::vector<float>::iterator max_indexes_it = max_indexes_it_base; for(std::vector<float>::iterator out_it = out_it_base; out_it != out_it_base + output_neuron_count_per_feature_map; ++out_it, ++max_indexes_it) { // Define the starting position of the first input elem int in_offset = in_base_offset; for(unsigned int i = 0; i < dimension_count; ++i) in_offset += current_output_position[i] * (*(subsampling_sizes_it + i)) * (*(input_slices_it + i)); unsigned int max_index = 0; float best_val = -1.0e38F; for(unsigned int i = 0; i < const_subsampling_elem_count; ++i) { int current_offset = in_offset + *(offset_list_it + i); float new_val = *(in_it_global + current_offset); if ((i == 0) || (new_val > best_val)) { best_val = new_val; max_index = current_offset; } } *out_it = best_val; *((unsigned int *)(&(*max_indexes_it))) = max_index; // Go to the next output element for(unsigned int i = 0; i < dimension_count; ++i) { if ((++current_output_position[i]) < *( dimension_sizes_it + i)) break; current_output_position[i] = 0; } } } } }