void layer_updater_cuda::configure( const std::vector<layer_configuration_specific>& input_configuration_specific_list, const layer_configuration_specific& output_configuration_specific, layer::const_ptr layer_schema, cuda_running_configuration::const_ptr cuda_config, const std::set<layer_action>& actions) { this->layer_schema = layer_schema; this->input_configuration_specific_list = input_configuration_specific_list; this->output_configuration_specific = output_configuration_specific; this->cuda_config = cuda_config; this->actions = actions; input_elem_count_per_entry_list.resize(input_configuration_specific_list.size()); input_elem_count_per_feature_map_list.resize(input_configuration_specific_list.size()); for(int i = 0; i < input_configuration_specific_list.size(); ++i) { input_elem_count_per_entry_list[i] = input_configuration_specific_list[i].get_neuron_count(); input_elem_count_per_feature_map_list[i] = input_configuration_specific_list[i].get_neuron_count_per_feature_map(); } output_elem_count_per_entry = output_configuration_specific.get_neuron_count(); output_elem_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); updater_configured(); }
void rgb_to_yuv_convert_layer_tester_plain::run_forward_propagation( plain_buffer::ptr output_buffer, const std::vector<plain_buffer::const_ptr>& input_buffers, plain_buffer::ptr temporary_working_fixed_buffer, plain_buffer::ptr temporary_working_per_entry_buffer, plain_running_configuration::const_ptr plain_config, layer::const_ptr layer_schema, layer_data::const_ptr data, layer_data_custom::const_ptr data_custom, const std::vector<layer_configuration_specific>& input_configuration_specific_list, const layer_configuration_specific& output_configuration_specific, unsigned int entry_count) const { const float * const in_it = *input_buffers[0]; float * const out_it = *output_buffer; nnforge_shared_ptr<const rgb_to_yuv_convert_layer> layer_derived = nnforge_dynamic_pointer_cast<const rgb_to_yuv_convert_layer>(layer_schema); const unsigned int color_feature_map_config_count = static_cast<unsigned int>(layer_derived->color_feature_map_config_list.size()); if ((out_it != in_it) && ((color_feature_map_config_count * 3) != output_configuration_specific.feature_map_count)) memcpy(out_it, in_it, output_configuration_specific.get_neuron_count() * entry_count * sizeof(float)); const int total_workload = static_cast<int>(entry_count * color_feature_map_config_count); const unsigned int input_neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); const std::vector<color_feature_map_config>::const_iterator cfm_it = layer_derived->color_feature_map_config_list.begin(); #pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int entry_id = workload_id / color_feature_map_config_count; int color_feature_map_config_id = workload_id - entry_id * color_feature_map_config_count; const color_feature_map_config& cfm = *(cfm_it + 
color_feature_map_config_id); const float * in_it_red_and_y = in_it + (entry_id * input_neuron_count) + (cfm.red_and_y_feature_map_id * input_neuron_count_per_feature_map); const float * in_it_green_and_u = in_it + (entry_id * input_neuron_count) + (cfm.green_and_u_feature_map_id * input_neuron_count_per_feature_map); const float * in_it_blue_and_v = in_it + (entry_id * input_neuron_count) + (cfm.blue_and_v_feature_map_id * input_neuron_count_per_feature_map); float * out_it_red_and_y = out_it + (entry_id * input_neuron_count) + (cfm.red_and_y_feature_map_id * input_neuron_count_per_feature_map); float * out_it_green_and_u = out_it + (entry_id * input_neuron_count) + (cfm.green_and_u_feature_map_id * input_neuron_count_per_feature_map); float * out_it_blue_and_v = out_it + (entry_id * input_neuron_count) + (cfm.blue_and_v_feature_map_id * input_neuron_count_per_feature_map); for(unsigned int i = 0; i < input_neuron_count_per_feature_map; ++i) { float red = in_it_red_and_y[i]; float green = in_it_green_and_u[i]; float blue = in_it_blue_and_v[i]; float y = w_r * red + w_g * green + w_b * blue; float u = u_mult * (blue - y); float v = v_mult * (red - y); out_it_red_and_y[i] = y; out_it_green_and_u[i] = u; out_it_blue_and_v[i] = v; } } }
void softmax_layer_hessian_plain::test( const_additional_buffer_smart_ptr input_buffer, additional_buffer_smart_ptr output_buffer, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const_layer_data_smart_ptr data, const_layer_data_custom_smart_ptr data_custom, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int entry_count) const { const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map(); const unsigned int feature_map_count = static_cast<unsigned int>(input_configuration_specific.feature_map_count); const std::vector<float>::const_iterator input_buffer_it = input_buffer->begin(); const std::vector<float>::iterator output_buffer_it = output_buffer->begin(); const int total_workload = entry_count * input_neuron_count_per_feature_map; const int openmp_thread_count = plain_config->openmp_thread_count; #pragma omp parallel default(none) shared(additional_buffers) num_threads(openmp_thread_count) { int thread_id = 0; #ifdef _OPENMP thread_id = omp_get_thread_num(); #endif std::vector<float>& local_additional_buffer = *(additional_buffers[thread_id]); #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int entry_id = workload_id / input_neuron_count_per_feature_map; int neuron_id = workload_id - (entry_id * input_neuron_count_per_feature_map); const std::vector<float>::const_iterator in_it = input_buffer_it + (entry_id * input_neuron_count) + neuron_id; const std::vector<float>::iterator out_it = output_buffer_it + (entry_id * input_neuron_count) + neuron_id; float max_val = -1.0e+37F; for(unsigned int feature_map_id = 0; feature_map_id < feature_map_count; 
++feature_map_id) { float val = *(in_it + (feature_map_id * input_neuron_count_per_feature_map)); max_val = std::max(max_val, val); } float sum = 0.0F; for(unsigned int feature_map_id = 0; feature_map_id < feature_map_count; ++feature_map_id) { float val = expf((*(in_it + (feature_map_id * input_neuron_count_per_feature_map))) - max_val); sum += val; local_additional_buffer[feature_map_id] = val; } float mult = 1.0F / sum; for(unsigned int feature_map_id = 0; feature_map_id < feature_map_count; ++feature_map_id) *(out_it + (feature_map_id * input_neuron_count_per_feature_map)) = local_additional_buffer[feature_map_id] * mult; } // for(int workload_id } // #pragma parallel }
void softmax_layer_hessian_plain::backprop( additional_buffer_smart_ptr input_errors, const_additional_buffer_smart_ptr output_errors, const_additional_buffer_smart_ptr output_neurons, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const_layer_data_smart_ptr data, const_layer_data_custom_smart_ptr data_custom, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int entry_count) const { const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map(); const unsigned int feature_map_count = static_cast<unsigned int>(input_configuration_specific.feature_map_count); const std::vector<float>::iterator input_errors_it = input_errors->begin(); const std::vector<float>::const_iterator output_errors_it = output_errors->begin(); const std::vector<float>::const_iterator output_neurons_it = output_neurons->begin(); const int total_workload = entry_count * input_neuron_count_per_feature_map; const int openmp_thread_count = plain_config->openmp_thread_count; #pragma omp parallel default(none) shared(additional_buffers) num_threads(openmp_thread_count) { int thread_id = 0; #ifdef _OPENMP thread_id = omp_get_thread_num(); #endif #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int entry_id = workload_id / input_neuron_count_per_feature_map; int neuron_id = workload_id - (entry_id * input_neuron_count_per_feature_map); const std::vector<float>::iterator in_errors_it = input_errors_it + (entry_id * input_neuron_count) + neuron_id; const std::vector<float>::const_iterator out_errors_it = output_errors_it + (entry_id * input_neuron_count) + neuron_id; const std::vector<float>::const_iterator 
out_neurons_it = output_neurons_it + (entry_id * input_neuron_count) + neuron_id; float sum = 0.0F; for(unsigned int feature_map_id = 0; feature_map_id < feature_map_count; ++feature_map_id) { unsigned int offset = feature_map_id * input_neuron_count_per_feature_map; float val = (*(out_neurons_it + offset)); sum += val * val * (*(out_errors_it + offset)); } for(unsigned int feature_map_id = 0; feature_map_id < feature_map_count; ++feature_map_id) { unsigned int offset = feature_map_id * input_neuron_count_per_feature_map; float y = *(out_neurons_it + offset); float y2 = y * y; *(in_errors_it + offset) = y2 * ((*(out_errors_it + offset)) * (2.0F * (y2 - y) + 1.0F) - sum); } } // for(int workload_id } // #pragma parallel }
// Copies one sample unchanged from data to data_transformed.
//
// The neuron count of original_config must equal that of this transformer's
// own config; otherwise a neural_network_exception is thrown. sample_id is
// not used.
void reshape_data_transformer::transform(
	const float * data,
	float * data_transformed,
	const layer_configuration_specific& original_config,
	unsigned int sample_id)
{
	const bool counts_match = (original_config.get_neuron_count() == config.get_neuron_count());
	if (!counts_match)
	{
		throw neural_network_exception(
			(boost::format("Neuron counts for reshape_data_transformer don't match: %1% and %2%")
				% original_config.get_neuron_count()
				% config.get_neuron_count()).str());
	}

	// Same number of elements in both layouts, so a flat copy is enough.
	memcpy(data_transformed, data, original_config.get_neuron_count() * sizeof(float));
}
void dropout_layer_updater_plain::test( const_additional_buffer_smart_ptr input_buffer, additional_buffer_smart_ptr output_buffer, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const_layer_data_smart_ptr data, const_layer_data_custom_smart_ptr data_custom, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int updater_count, unsigned int offset_input_entry_id, bool force_deterministic) const { if (offset_input_entry_id > 0) throw neural_network_exception("dropout_layer_updater_plain is not able to run using offset"); if (force_deterministic) { memcpy(&(output_buffer->at(0)), &(input_buffer->at(0)), input_configuration_specific.get_neuron_count() * updater_count * sizeof(float)); } else { const std::vector<float>::const_iterator in_it_global = input_buffer->begin(); const std::vector<float>::iterator out_it_global = output_buffer->begin(); unsigned char * keep_elem_ptr = reinterpret_cast<unsigned char *>(&(additional_buffers[0]->at(0))); nnforge_shared_ptr<const dropout_layer> layer_derived = nnforge_dynamic_pointer_cast<const dropout_layer>(layer_schema); const float dropout_rate = layer_derived->dropout_rate; const float keep_rate = 1.0F - dropout_rate; const float mult = 1.0F / keep_rate; const int total_workload = input_configuration_specific.get_neuron_count() * updater_count; nnforge_uniform_real_distribution<float> dist(0.0F, 1.0F); for(int i = 0; i < total_workload; ++i) keep_elem_ptr[i] = (dist(gen) <= keep_rate ? 
(unsigned char)1 : (unsigned char)0); #pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) shared(keep_elem_ptr) { #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int elem_id = workload_id; *(out_it_global + elem_id) = *(in_it_global + elem_id) * (keep_elem_ptr[elem_id] ? mult : 0.0F); } } } }
void maxout_layer_tester_plain::run_forward_propagation( plain_buffer::ptr output_buffer, const std::vector<plain_buffer::const_ptr>& input_buffers, plain_buffer::ptr temporary_working_fixed_buffer, plain_buffer::ptr temporary_working_per_entry_buffer, plain_running_configuration::const_ptr plain_config, layer::const_ptr layer_schema, layer_data::const_ptr data, layer_data_custom::const_ptr data_custom, const std::vector<layer_configuration_specific>& input_configuration_specific_list, const layer_configuration_specific& output_configuration_specific, unsigned int entry_count) const { const float * const in_it_global = *input_buffers[0]; float * const out_it_global = *output_buffer; const unsigned int input_neuron_count = input_configuration_specific_list[0].get_neuron_count(); const unsigned int input_neuron_count_per_feature_map = input_configuration_specific_list[0].get_neuron_count_per_feature_map(); const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count(); const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); nnforge_shared_ptr<const maxout_layer> layer_derived = nnforge_dynamic_pointer_cast<const maxout_layer>(layer_schema); const unsigned int feature_map_subsampling_size = layer_derived->feature_map_subsampling_size; const int output_feature_map_count = output_configuration_specific.feature_map_count; const int total_workload = entry_count * output_feature_map_count; #pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) { #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int entry_id = workload_id / output_feature_map_count; int output_feature_map_id = workload_id - (entry_id * output_feature_map_count); const float * in_it_base = in_it_global + (entry_id * input_neuron_count) + (output_feature_map_id * input_neuron_count_per_feature_map); float * out_it_base = out_it_global + 
(entry_id * output_neuron_count) + (output_feature_map_id * output_neuron_count_per_feature_map); for(float * out_it = out_it_base; out_it != out_it_base + output_neuron_count_per_feature_map; ++out_it, ++in_it_base) { const float * in_it = in_it_base; float current_max = *in_it; for(unsigned int i = 1; i < feature_map_subsampling_size; ++i) { in_it += output_feature_map_count * output_neuron_count_per_feature_map; float new_val = *in_it; current_max = std::max(new_val, current_max); } *out_it = current_max; } } } }
// Picks the CUDA tester implementation for a convolution layer.
//
// - Output with exactly one neuron per feature map: fully connected tester.
// - Otherwise 1D or 2D output: the "kepler" tester when the compute capability
//   reported by cuda_config is at least 300, the "fermi" tester below that.
// - Any other dimension count: neural_network_exception.
//
// input_configuration_specific is not consulted.
layer_tester_cuda_smart_ptr convolution_layer_testing_schema::create_tester_specific(
	const layer_configuration_specific& input_configuration_specific,
	const layer_configuration_specific& output_configuration_specific) const
{
	if (output_configuration_specific.get_neuron_count() == output_configuration_specific.feature_map_count)
		return layer_tester_cuda_smart_ptr(new fully_connected_layer_tester_cuda());

	if (output_configuration_specific.dimension_sizes.size() == 1)
	{
		if (cuda_config->get_compute_capability() >= 300)
			return layer_tester_cuda_smart_ptr(new convolution_1d_layer_tester_cuda_kepler());
		return layer_tester_cuda_smart_ptr(new convolution_1d_layer_tester_cuda_fermi());
	}

	if (output_configuration_specific.dimension_sizes.size() == 2)
	{
		if (cuda_config->get_compute_capability() >= 300)
			return layer_tester_cuda_smart_ptr(new convolution_2d_layer_tester_cuda_kepler());
		return layer_tester_cuda_smart_ptr(new convolution_2d_layer_tester_cuda_fermi());
	}

	throw neural_network_exception(
		(boost::format("No CUDA tester for the convolution layer of %1% dimensions")
			% output_configuration_specific.dimension_sizes.size()).str());
}
void concat_layer_updater_plain::run_forward_propagation( plain_buffer::ptr output_buffer, const std::vector<plain_buffer::const_ptr>& input_buffers, plain_buffer::ptr temporary_working_fixed_buffer, plain_buffer::ptr temporary_working_per_entry_buffer, plain_buffer::ptr temporary_per_entry_buffer, plain_running_configuration::const_ptr plain_config, layer::const_ptr layer_schema, layer_data::const_ptr data, layer_data_custom::const_ptr data_custom, const std::vector<layer_configuration_specific>& input_configuration_specific_list, const layer_configuration_specific& output_configuration_specific, const std::set<layer_action>& actions, unsigned int entry_count) const { for(unsigned int entry_id = 0; entry_id < entry_count; ++entry_id) { float *dst = (float *)*output_buffer + entry_id * output_configuration_specific.get_neuron_count(); for(unsigned int i = 0; i < static_cast<unsigned int>(input_configuration_specific_list.size()); ++i) { unsigned int input_neuron_count = input_configuration_specific_list[i].get_neuron_count(); memcpy( dst, (const float *)(*input_buffers[i]) + entry_id * input_neuron_count, input_neuron_count * sizeof(float)); dst += input_neuron_count; } } }
// Derives the output configuration of the layer from its input configuration.
//
// The feature map count is carried over unchanged. Per dimension, with
// in = input size and s = subsampling size:
//   tiling off: output size is in / s; in must be at least s
//   tiling on:  output size is (in - (s - 1)) / s; in must be at least 2*s - 1
// Divisions are integer divisions.
//
// Throws neural_network_exception when the dimension counts of the layer and
// the input differ, or when a dimension is smaller than the limits above.
layer_configuration_specific max_subsampling_layer::get_output_layer_configuration_specific(const layer_configuration_specific& input_configuration_specific) const
{
	if (input_configuration_specific.get_dimension_count() != subsampling_sizes.size())
	{
		throw neural_network_exception(
			(boost::format("Dimension count in layer (%1%) and input configuration (%2%) don't match")
				% subsampling_sizes.size()
				% input_configuration_specific.get_dimension_count()).str());
	}

	layer_configuration_specific output_config(input_configuration_specific.feature_map_count);

	for(unsigned int dim = 0; dim < subsampling_sizes.size(); ++dim)
	{
		if (tiling)
		{
			if (input_configuration_specific.dimension_sizes[dim] < (subsampling_sizes[dim] * 2 - 1))
			{
				throw neural_network_exception(
					(boost::format("Input configuration size (%1%) of dimension (%2%) is smaller than subsampling size (%3%) * 2 - 1")
						% input_configuration_specific.dimension_sizes[dim]
						% dim
						% subsampling_sizes[dim]).str());
			}

			output_config.dimension_sizes.push_back(
				(input_configuration_specific.dimension_sizes[dim] - (subsampling_sizes[dim] - 1)) / subsampling_sizes[dim]);
		}
		else
		{
			if (input_configuration_specific.dimension_sizes[dim] < subsampling_sizes[dim])
			{
				throw neural_network_exception(
					(boost::format("Input configuration size (%1%) of dimension (%2%) is smaller than subsampling size (%3%)")
						% input_configuration_specific.dimension_sizes[dim]
						% dim
						% subsampling_sizes[dim]).str());
			}

			output_config.dimension_sizes.push_back(
				input_configuration_specific.dimension_sizes[dim] / subsampling_sizes[dim]);
		}
	}

	return output_config;
}
void hyperbolic_tangent_layer_tester_plain::run_forward_propagation( plain_buffer::ptr output_buffer, const std::vector<plain_buffer::const_ptr>& input_buffers, plain_buffer::ptr temporary_working_fixed_buffer, plain_buffer::ptr temporary_working_per_entry_buffer, plain_running_configuration::const_ptr plain_config, layer::const_ptr layer_schema, layer_data::const_ptr data, layer_data_custom::const_ptr data_custom, const std::vector<layer_configuration_specific>& input_configuration_specific_list, const layer_configuration_specific& output_configuration_specific, unsigned int entry_count) const { const int elem_count = static_cast<int>(entry_count * output_configuration_specific.get_neuron_count()); float * const out_it = *output_buffer; const float * const in_it = *input_buffers[0]; std::shared_ptr<const hyperbolic_tangent_layer> layer_derived = std::dynamic_pointer_cast<const hyperbolic_tangent_layer>(layer_schema); const float hyperbolic_tangent_steepness2 = layer_derived->steepness * 2.0F; const float hyperbolic_tangent_major_multiplier = layer_derived->scale; #pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count) for(int i = 0; i < elem_count; ++i) { float inp = *(in_it + i); float inp2 = expf(inp * hyperbolic_tangent_steepness2); float res = (inp2 - 1.0F) / (inp2 + 1.0F) * hyperbolic_tangent_major_multiplier; *(out_it + i) = res; } }
// Creates an in-memory reader over input samples stored as floats.
//
// input_configuration  - configuration of one input sample
// output_configuration - configuration of one output sample
// input_data_list      - input samples, one vector of floats per entry
// output_data_list     - output samples, one vector of floats per entry
//
// The entry count is taken from input_data_list; reading starts at entry 0.
// The inputs are floats kept in input_data_list_float, so the reader is
// tagged with the float type code (it was tagged type_byte before, which does
// not match the stored element type).
// NOTE(review): neuron_data_type::type_float is assumed to be the enumerator
// that pairs with type_byte - confirm against neuron_data_type.
supervised_data_mem_reader::supervised_data_mem_reader(
	const layer_configuration_specific& input_configuration,
	const layer_configuration_specific& output_configuration,
	const std::vector<std::tr1::shared_ptr<const std::vector<float> > >& input_data_list,
	const std::vector<std::tr1::shared_ptr<const std::vector<float> > >& output_data_list)
	: input_configuration(input_configuration)
	, output_configuration(output_configuration)
	, input_data_list_float(input_data_list)
	, output_data_list(output_data_list)
	, entry_read_count(0)
	, type_code(neuron_data_type::type_float)
	, entry_count(static_cast<unsigned int>(input_data_list.size()))
	, input_neuron_count(input_configuration.get_neuron_count())
	, output_neuron_count(output_configuration.get_neuron_count())
{
}
// Opens a structured data stream for writing and emits its header.
//
// The header consists of, in this order: the structured data stream GUID, the
// serialized layer configuration, and the entry count (zero at this point).
// The stream position of the entry count is remembered in entry_count_pos.
//
// The stream is switched to throw on failbit and badbit before anything is
// written.
structured_data_stream_writer::structured_data_stream_writer(
	nnforge_shared_ptr<std::ostream> output_stream,
	const layer_configuration_specific& config)
	: out_stream(output_stream), entry_count(0)
{
	std::ostream& out = *out_stream;
	out.exceptions(std::ostream::failbit | std::ostream::badbit);

	neuron_count = config.get_neuron_count();

	const char * const guid_bytes = reinterpret_cast<const char*>(structured_data_stream_schema::structured_data_stream_guid.data);
	out.write(guid_bytes, sizeof(structured_data_stream_schema::structured_data_stream_guid.data));

	config.write(out);

	// Remember where the entry count lives before writing its initial value.
	entry_count_pos = out.tellp();
	out.write(reinterpret_cast<const char*>(&entry_count), sizeof(entry_count));
}
void absolute_layer_updater_plain::test( const_additional_buffer_smart_ptr input_buffer, additional_buffer_smart_ptr output_buffer, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const_layer_data_smart_ptr data, const_layer_data_custom_smart_ptr data_custom, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int updater_count, unsigned int offset_input_entry_id, bool force_deterministic) const { if (offset_input_entry_id > 0) throw neural_network_exception("absolute_layer_updater_plain is not able to run using offset"); const int elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count()); const std::vector<float>::const_iterator in_it = input_buffer->begin(); const std::vector<float>::iterator out_it = output_buffer->begin(); #pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count) for(int i = 0; i < elem_count; ++i) *(out_it + i) = fabs(*(in_it + i)); }
void hyperbolic_tangent_layer_updater_plain::test( const_additional_buffer_smart_ptr input_buffer, additional_buffer_smart_ptr output_buffer, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const_layer_data_smart_ptr data, const_layer_data_custom_smart_ptr data_custom, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int updater_count, unsigned int offset_input_entry_id) const { if (offset_input_entry_id > 0) throw neural_network_exception("hyperbolic_tangent_layer_updater_plain is not able to run using offset"); const int elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count()); const std::vector<float>::const_iterator in_it = input_buffer->begin(); const std::vector<float>::iterator out_it = output_buffer->begin(); nnforge_shared_ptr<const hyperbolic_tangent_layer> layer_derived = nnforge_dynamic_pointer_cast<const hyperbolic_tangent_layer>(layer_schema); const float hyperbolic_tangent_steepness2 = layer_derived->steepness * 2.0F; const float hyperbolic_tangent_major_multiplier = layer_derived->major_multiplier; #pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count) for(int i = 0; i < elem_count; ++i) { float inp = *(in_it + i); float inp2 = expf(inp * hyperbolic_tangent_steepness2); float res = (inp2 - 1.0F) / (inp2 + 1.0F) * hyperbolic_tangent_major_multiplier; *(out_it + i) = res; } }
void absolute_layer_updater_plain::backprop( additional_buffer_smart_ptr input_errors, const_additional_buffer_smart_ptr input_neurons, const_additional_buffer_smart_ptr output_errors, const_additional_buffer_smart_ptr output_neurons, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const_layer_data_smart_ptr data, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int updater_count) const { const int elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count()); const std::vector<float>::const_iterator in_it = input_neurons->begin(); const std::vector<float>::iterator in_err_it = input_errors->begin(); #pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count) for(int i = 0; i < elem_count; ++i) { float val = *(in_it + i); if (val < 0.0F) { *(in_err_it + i) = - *(in_err_it + i); } } }
void rectified_linear_layer_updater_plain::run_forward_propagation( plain_buffer::ptr output_buffer, const std::vector<plain_buffer::const_ptr>& input_buffers, plain_buffer::ptr temporary_working_fixed_buffer, plain_buffer::ptr temporary_working_per_entry_buffer, plain_buffer::ptr temporary_per_entry_buffer, plain_running_configuration::const_ptr plain_config, layer::const_ptr layer_schema, layer_data::const_ptr data, layer_data_custom::const_ptr data_custom, const std::vector<layer_configuration_specific>& input_configuration_specific_list, const layer_configuration_specific& output_configuration_specific, const std::set<layer_action>& actions, unsigned int entry_count) const { std::shared_ptr<const rectified_linear_layer> layer_derived = std::dynamic_pointer_cast<const rectified_linear_layer>(layer_schema); const int elem_count = static_cast<int>(entry_count * output_configuration_specific.get_neuron_count()); float * const out_it = *output_buffer; const float * const in_it = *input_buffers[0]; const float negative_slope = layer_derived->negative_slope; #pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count) for(int i = 0; i < elem_count; ++i) { float input_val = *(in_it + i); *(out_it + i) = input_val >= 0.0F ? input_val : input_val * negative_slope; } }
void convert_to_polar_data_transformer::transform( const void * data, void * data_transformed, neuron_data_type::input_type type, const layer_configuration_specific& original_config, unsigned int sample_id) { if (type != neuron_data_type::type_byte) throw neural_network_exception("convert_to_polar_data_transformer is implemented for data stored as bytes only"); if (original_config.dimension_sizes.size() != 2) throw neural_network_exception((boost::format("convert_to_polar_data_transformer is processing 2D data only, data is passed with number of dimensions %1%") % original_config.dimension_sizes.size()).str()); if (original_config.dimension_sizes != input_window_sizes) throw neural_network_exception("convert_to_polar_data_transformer: input window size mismatch between creation and actual transform"); unsigned int original_neuron_count_per_feature_map = original_config.get_neuron_count_per_feature_map(); unsigned int transformed_neuron_count_per_feature_map = get_transformed_configuration(original_config).get_neuron_count_per_feature_map(); for(unsigned int feature_map_id = 0; feature_map_id < original_config.feature_map_count; ++feature_map_id) { cv::Mat1b original_image(static_cast<int>(original_config.dimension_sizes[1]), static_cast<int>(original_config.dimension_sizes[0]), const_cast<unsigned char *>(static_cast<const unsigned char *>(data)) + (original_neuron_count_per_feature_map * feature_map_id)); cv::Mat1b dest_image(static_cast<int>(output_window_sizes[1]), static_cast<int>(output_window_sizes[0]), static_cast<unsigned char *>(data_transformed) + (transformed_neuron_count_per_feature_map * feature_map_id)); // Should try INTER_CUBIC and INTER_LANCZOS4 as well cv::remap(original_image, dest_image, map_x, map_y, cv::INTER_LINEAR, cv::BORDER_CONSTANT, border_value); } }
void hyperbolic_tangent_layer_updater_plain::backprop( additional_buffer_smart_ptr input_errors, const_additional_buffer_smart_ptr input_neurons, const_additional_buffer_smart_ptr output_errors, const_additional_buffer_smart_ptr output_neurons, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const_layer_data_smart_ptr data, const_layer_data_custom_smart_ptr data_custom, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int updater_count) const { const int elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count()); const std::vector<float>::iterator in_err_it = input_errors->begin(); const std::vector<float>::const_iterator out_it = output_neurons->begin(); nnforge_shared_ptr<const hyperbolic_tangent_layer> layer_derived = nnforge_dynamic_pointer_cast<const hyperbolic_tangent_layer>(layer_schema); const float hyperbolic_tangent_major_multiplier_reverse = 1.0F / layer_derived->major_multiplier; const float hyperbolic_tangent_steepness3 = layer_derived->steepness * layer_derived->major_multiplier; #pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count) for(int i = 0; i < elem_count; ++i) { float out_neuron = *(out_it + i); float normalized_value = out_neuron * hyperbolic_tangent_major_multiplier_reverse; float der1st = hyperbolic_tangent_steepness3 * (1.0F - (normalized_value * normalized_value)); *(in_err_it + i) *= der1st; } }
void flip_2d_data_sampler_transformer::transform( const void * data, void * data_transformed, neuron_data_type::input_type type, const layer_configuration_specific& original_config, unsigned int sample_id) { if (type != neuron_data_type::type_byte) throw neural_network_exception("flip_2d_data_sampler_transformer is implemented for data stored as bytes only"); if (original_config.dimension_sizes.size() != 2) throw neural_network_exception((boost::format("flip_2d_data_sampler_transformer is processing 2d data only, data is passed with number of dimensions %1%") % original_config.dimension_sizes.size()).str()); unsigned int neuron_count_per_feature_map = original_config.get_neuron_count_per_feature_map(); for(unsigned int feature_map_id = 0; feature_map_id < original_config.feature_map_count; ++feature_map_id) { cv::Mat1b src_image(static_cast<int>(original_config.dimension_sizes[1]), static_cast<int>(original_config.dimension_sizes[0]), const_cast<unsigned char *>(static_cast<const unsigned char *>(data)) + (neuron_count_per_feature_map * feature_map_id)); cv::Mat1b image(static_cast<int>(original_config.dimension_sizes[1]), static_cast<int>(original_config.dimension_sizes[0]), static_cast<unsigned char *>(data_transformed) + (neuron_count_per_feature_map * feature_map_id)); memcpy( ((unsigned char *)data_transformed) + neuron_count_per_feature_map * feature_map_id, ((unsigned char *)data) + neuron_count_per_feature_map * feature_map_id, neuron_count_per_feature_map * neuron_data_type::get_input_size(type)); if (sample_id == 1) { data_transformer_util::flip( image, (flip_around_dimension_id == 0), (flip_around_dimension_id == 1)); } } }
// Picks the CUDA updater implementation for a sparse convolution layer.
//
// When the layer has no zero padding on either side and the output has exactly
// one neuron per feature map:
//   - the 1x1 fully connected updater if the first input and the output have
//     identical dimension sizes,
//   - the general sparse fully connected updater otherwise.
// In every other case the choice is delegated to
// sparse_convolution_layer_updater_schema_helper_cuda_kepler.
layer_updater_cuda::ptr sparse_convolution_layer_updater_schema::create_updater_specific(
	const std::vector<layer_configuration_specific>& input_configuration_specific_list,
	const layer_configuration_specific& output_configuration_specific) const
{
	nnforge_shared_ptr<const sparse_convolution_layer> layer_derived = nnforge_dynamic_pointer_cast<const sparse_convolution_layer>(layer_schema);

	// "No padding" means every entry of both padding vectors is zero.
	const std::vector<unsigned int> all_zero_left(layer_derived->left_zero_padding.size(), 0);
	const std::vector<unsigned int> all_zero_right(layer_derived->right_zero_padding.size(), 0);
	const bool padding_free = (layer_derived->left_zero_padding == all_zero_left)
		&& (layer_derived->right_zero_padding == all_zero_right);

	const bool one_neuron_per_feature_map =
		(output_configuration_specific.get_neuron_count() == output_configuration_specific.feature_map_count);

	if (!padding_free || !one_neuron_per_feature_map)
		return sparse_convolution_layer_updater_schema_helper_cuda_kepler::create_updater_specific(input_configuration_specific_list, output_configuration_specific);

	if (input_configuration_specific_list[0].dimension_sizes == output_configuration_specific.dimension_sizes)
		return layer_updater_cuda::ptr(new sparse_fully_connected_1x1_layer_updater_cuda());

	return layer_updater_cuda::ptr(new sparse_fully_connected_layer_updater_cuda());
}
void intensity_2d_data_transformer::transform( const void * data, void * data_transformed, neuron_data_type::input_type type, const layer_configuration_specific& original_config) { if (type != neuron_data_type::type_byte) throw neural_network_exception("intensity_2d_data_transformer is implemented for data stored as bytes only"); if (original_config.dimension_sizes.size() != 2) throw neural_network_exception((boost::format("intensity_2d_data_transformer is processing 2d data only, data is passed with number of dimensions %1%") % original_config.dimension_sizes.size()).str()); float contrast = contrast_distribution(generator); float brightness_shift = brightness_shift_distribution(generator) * 255.0F; unsigned int neuron_count_per_feature_map = original_config.get_neuron_count_per_feature_map(); for(unsigned int feature_map_id = 0; feature_map_id < original_config.feature_map_count; ++feature_map_id) { cv::Mat1b image(static_cast<int>(original_config.dimension_sizes[1]), static_cast<int>(original_config.dimension_sizes[0]), static_cast<unsigned char *>(data_transformed) + (neuron_count_per_feature_map * feature_map_id)); data_transformer_util::change_brightness_and_contrast( image, contrast, brightness_shift); } }
void uniform_intensity_data_transformer::transform( const float * data, float * data_transformed, const layer_configuration_specific& original_config, unsigned int sample_id) { if (original_config.feature_map_count != shift_distribution_list.size()) throw neural_network_exception((boost::format("uniform_intensity_data_transformer was initialized with %1% distributions and data provided has %2% feature maps") % shift_distribution_list.size() % original_config.feature_map_count).str()); std::vector<float> shift_list(original_config.feature_map_count); { boost::lock_guard<boost::mutex> lock(gen_stream_mutex); for(unsigned int feature_map_id = 0; feature_map_id < original_config.feature_map_count; ++feature_map_id) { nnforge_uniform_real_distribution<float>& dist = shift_distribution_list[feature_map_id]; float shift = dist.min(); if (dist.max() > dist.min()) shift = dist(generator); shift_list[feature_map_id] = shift; } } unsigned int neuron_count_per_feature_map = original_config.get_neuron_count_per_feature_map(); for(unsigned int feature_map_id = 0; feature_map_id < original_config.feature_map_count; ++feature_map_id) { float shift = shift_list[feature_map_id]; const float * src_data = data + feature_map_id * neuron_count_per_feature_map; float * dest_data = data_transformed + feature_map_id * neuron_count_per_feature_map; for(unsigned int i = 0; i < neuron_count_per_feature_map; ++i) dest_data[i] = src_data[i] + shift; } }
void soft_rectified_linear_layer_updater_plain::backprop( additional_buffer_smart_ptr input_errors, const_additional_buffer_smart_ptr input_neurons, const_additional_buffer_smart_ptr output_errors, const_additional_buffer_smart_ptr output_neurons, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const_layer_data_smart_ptr data, const_layer_data_custom_smart_ptr data_custom, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int updater_count) const { const int elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count()); const std::vector<float>::iterator in_err_it = input_errors->begin(); const std::vector<float>::const_iterator out_it = output_neurons->begin(); #pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count) for(int i = 0; i < elem_count; ++i) { float out_neuron = *(out_it + i); float val = expf(out_neuron); float der1st = (val - 1.0F) / val; *(in_err_it + i) *= der1st; } }
// Picks the CUDA updater implementation for a sparse convolution layer.
//
// Selection rules, evaluated in this order:
//   1. zero padding + one output neuron per feature map + unit strides:
//        1x1 window  -> sparse_fully_connected_1x1_layer_updater_cuda
//        otherwise   -> sparse_fully_connected_layer_updater_cuda
//   2. zero padding + 1x1 window (any stride) -> sparse_1x1_layer_updater_cuda
//   3. unit strides -> whatever the generic helper creates for the first input configuration
//   4. anything else -> neural_network_exception
//
// cuda_config is not read by this implementation.
layer_updater_cuda::ptr sparse_convolution_layer_updater_schema::create_updater_specific(
	const std::vector<layer_configuration_specific>& input_configuration_specific_list,
	const layer_configuration_specific& output_configuration_specific,
	const cuda_running_configuration& cuda_config) const
{
	std::shared_ptr<const sparse_convolution_layer> layer_derived = std::dynamic_pointer_cast<const sparse_convolution_layer>(layer_schema);

	const bool zero_padding = (layer_derived->left_zero_padding == std::vector<unsigned int>(layer_derived->left_zero_padding.size(), 0))
		&& (layer_derived->right_zero_padding == std::vector<unsigned int>(layer_derived->right_zero_padding.size(), 0));
	const bool unit_stride = (layer_derived->strides == std::vector<unsigned int>(layer_derived->strides.size(), 1));
	const bool single_output = (output_configuration_specific.get_neuron_count() == output_configuration_specific.feature_map_count);
	// Logical AND: both operands are plain booleans, not bit masks.
	const bool fully_connected = single_output && unit_stride;
	const bool window1x1 = (layer_derived->window_sizes == std::vector<unsigned int>(layer_derived->window_sizes.size(), 1));

	if (zero_padding)
	{
		if (fully_connected)
		{
			if (window1x1)
				return layer_updater_cuda::ptr(new sparse_fully_connected_1x1_layer_updater_cuda());
			else
				return layer_updater_cuda::ptr(new sparse_fully_connected_layer_updater_cuda());
		}
		else
		{
			if (window1x1)
				return layer_updater_cuda::ptr(new sparse_1x1_layer_updater_cuda());
		}
	}

	// Padded layers and non-1x1 windows are only supported by the generic helper, which needs unit strides.
	if (unit_stride)
		return sparse_convolution_layer_updater_schema_helper_cuda::create_updater_specific(input_configuration_specific_list[0], output_configuration_specific);

	// This schema builds updaters, so the message names the updater (it previously said "tester").
	throw neural_network_exception("There is no sparse_convolution_layer updater implemented for non-unit stride and non-unit window");
}
void sigmoid_layer_updater_plain::test( const_additional_buffer_smart_ptr input_buffer, additional_buffer_smart_ptr output_buffer, std::vector<additional_buffer_smart_ptr>& additional_buffers, plain_running_configuration_const_smart_ptr plain_config, const_layer_smart_ptr layer_schema, const layer_data_list& data, const layer_configuration_specific& input_configuration_specific, const layer_configuration_specific& output_configuration_specific, unsigned int updater_count, int offset_input_entry_id) const { if (offset_input_entry_id >= 0) throw neural_network_exception("sigmoid_layer_updater_plain is not able to run using the same input"); const int elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count()); const std::vector<float>::const_iterator in_it = input_buffer->begin(); const std::vector<float>::iterator out_it = output_buffer->begin(); #pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count) for(int i = 0; i < elem_count; ++i) { float inp = *(in_it + i); float res = 1.0F / (expf(-inp) + 1.0F); *(out_it + i) = res; } }
void add_layer_tester_plain::run_forward_propagation( plain_buffer::ptr output_buffer, const std::vector<plain_buffer::const_ptr>& input_buffers, plain_buffer::ptr temporary_working_fixed_buffer, plain_buffer::ptr temporary_working_per_entry_buffer, plain_running_configuration::const_ptr plain_config, layer::const_ptr layer_schema, layer_data::const_ptr data, layer_data_custom::const_ptr data_custom, const std::vector<layer_configuration_specific>& input_configuration_specific_list, const layer_configuration_specific& output_configuration_specific, unsigned int entry_count) const { float * const out = *output_buffer; std::vector<const float *> in_list; for(std::vector<plain_buffer::const_ptr>::const_iterator it = input_buffers.begin(); it != input_buffers.end(); ++it) in_list.push_back(**it); const float ** const in_ptr_list = &in_list[0]; std::shared_ptr<const add_layer> layer_derived = std::dynamic_pointer_cast<const add_layer>(layer_schema); const float alpha = layer_derived->alpha; const int src_ptr_count = static_cast<int>(in_list.size()); const int elem_count = static_cast<int>(entry_count * output_configuration_specific.get_neuron_count()); #pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count) for(int i = 0; i < elem_count; ++i) { float sum = 0.0F; for(int j = 0; j < src_ptr_count; ++j) sum += in_ptr_list[j][i]; out[i] = sum * alpha; } }
// Returns the configuration this transformer reshapes data into.
// Throws neural_network_exception when original_config and the target configuration
// report different neuron counts.
layer_configuration_specific reshape_data_transformer::get_transformed_configuration(const layer_configuration_specific& original_config) const
{
	// Happy path first: matching neuron counts mean the target configuration can be handed back as is.
	if (original_config.get_neuron_count() == config.get_neuron_count())
		return config;

	throw neural_network_exception(
		(boost::format("Neuron counts for reshape_data_transformer don't match: %1% and %2%")
			% original_config.get_neuron_count()
			% config.get_neuron_count()).str());
}
void prefix_sum_layer_tester_plain::run_forward_propagation( plain_buffer::ptr output_buffer, const std::vector<plain_buffer::const_ptr>& input_buffers, plain_buffer::ptr temporary_working_fixed_buffer, plain_buffer::ptr temporary_working_per_entry_buffer, plain_running_configuration::const_ptr plain_config, layer::const_ptr layer_schema, layer_data::const_ptr data, layer_data_custom::const_ptr data_custom, const std::vector<layer_configuration_specific>& input_configuration_specific_list, const layer_configuration_specific& output_configuration_specific, unsigned int entry_count) const { const float * const in_it_global = *input_buffers[0]; float * const out_it_global = *output_buffer; const unsigned int neuron_count = output_configuration_specific.get_neuron_count(); std::shared_ptr<const prefix_sum_layer> layer_derived = std::dynamic_pointer_cast<const prefix_sum_layer>(layer_schema); const unsigned int feature_map_segment_length = layer_derived->feature_map_segment_length; const unsigned int feature_map_segment_count = output_configuration_specific.feature_map_count / feature_map_segment_length; const unsigned int neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map(); const float clamp_min = layer_derived->clamp_min; const float clamp_max = layer_derived->clamp_max; const int total_workload = entry_count * feature_map_segment_count * neuron_count_per_feature_map; #pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) { #pragma omp for schedule(guided) for(int workload_id = 0; workload_id < total_workload; ++workload_id) { int entry_id = workload_id / (feature_map_segment_count * neuron_count_per_feature_map); int tt = workload_id - entry_id * feature_map_segment_count * neuron_count_per_feature_map; int feature_map_segment_id = tt / neuron_count_per_feature_map; int neuron_id = tt - feature_map_segment_id * neuron_count_per_feature_map; int offset = entry_id * neuron_count + 
feature_map_segment_id * feature_map_segment_length * neuron_count_per_feature_map + neuron_id; float running_sum = 0.0F; for(unsigned int i = 0; i < feature_map_segment_length; ++i, offset += neuron_count_per_feature_map) { running_sum += in_it_global[offset]; out_it_global[offset] = std::min(std::max(running_sum, clamp_min), clamp_max); } } } }
// Reports the size in bytes of the temporary per-entry buffer: one unsigned int for
// every output neuron. Only output_configuration_specific is read; the remaining
// parameters are not used by this implementation.
size_t max_subsampling_layer_updater_plain::get_temporary_per_entry_buffer_size(
	const std::set<layer_action>& actions,
	plain_running_configuration::const_ptr plain_config,
	layer::const_ptr layer_schema,
	const std::vector<layer_configuration_specific>& input_configuration_specific_list,
	const layer_configuration_specific& output_configuration_specific) const
{
	const size_t slot_count = output_configuration_specific.get_neuron_count();
	return slot_count * sizeof(unsigned int);
}