Example #1
0
		void layer_updater_cuda::configure(
			const std::vector<layer_configuration_specific>& input_configuration_specific_list,
			const layer_configuration_specific& output_configuration_specific,
			layer::const_ptr layer_schema,
			cuda_running_configuration::const_ptr cuda_config,
			const std::set<layer_action>& actions)
		{
			this->layer_schema = layer_schema;
			this->input_configuration_specific_list = input_configuration_specific_list;
			this->output_configuration_specific = output_configuration_specific;
			this->cuda_config = cuda_config;
			this->actions = actions;

			input_elem_count_per_entry_list.resize(input_configuration_specific_list.size());
			input_elem_count_per_feature_map_list.resize(input_configuration_specific_list.size());
			for(int i = 0; i < input_configuration_specific_list.size(); ++i)
			{
				input_elem_count_per_entry_list[i] = input_configuration_specific_list[i].get_neuron_count();
				input_elem_count_per_feature_map_list[i] = input_configuration_specific_list[i].get_neuron_count_per_feature_map();
			}

			output_elem_count_per_entry = output_configuration_specific.get_neuron_count();
			output_elem_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map();

			updater_configured();
		}
		// Converts the configured feature map triples from RGB to YUV for every entry.
		//
		// For each color_feature_map_config of the layer the red/green/blue planes are read from
		// input_buffers[0] and the y/u/v planes are written to the same feature map positions of
		// output_buffer:
		//   y = w_r * red + w_g * green + w_b * blue
		//   u = u_mult * (blue - y)
		//   v = v_mult * (red - y)
		// When the output is a different buffer and the triples do not account for every output
		// feature map, the whole input is copied to the output first.
		void rgb_to_yuv_convert_layer_tester_plain::run_forward_propagation(
			plain_buffer::ptr output_buffer,
			const std::vector<plain_buffer::const_ptr>& input_buffers,
			plain_buffer::ptr temporary_working_fixed_buffer,
			plain_buffer::ptr temporary_working_per_entry_buffer,
			plain_running_configuration::const_ptr plain_config,
			layer::const_ptr layer_schema,
			layer_data::const_ptr data,
			layer_data_custom::const_ptr data_custom,
			const std::vector<layer_configuration_specific>& input_configuration_specific_list,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int entry_count) const
		{
			nnforge_shared_ptr<const rgb_to_yuv_convert_layer> layer_derived = nnforge_dynamic_pointer_cast<const rgb_to_yuv_convert_layer>(layer_schema);
			const unsigned int color_feature_map_config_count = static_cast<unsigned int>(layer_derived->color_feature_map_config_list.size());
			const std::vector<color_feature_map_config>::const_iterator cfm_it = layer_derived->color_feature_map_config_list.begin();

			const float * const in_it = *input_buffers[0];
			float * const out_it = *output_buffer;

			const bool separate_buffers = (out_it != in_it);
			const bool all_feature_maps_converted = ((color_feature_map_config_count * 3) == output_configuration_specific.feature_map_count);
			if (separate_buffers && !all_feature_maps_converted)
				memcpy(out_it, in_it, output_configuration_specific.get_neuron_count() * entry_count * sizeof(float));

			// The strides for the input are taken from the output configuration.
			const unsigned int input_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map();
			const unsigned int input_neuron_count = output_configuration_specific.get_neuron_count();

			// One work item per (entry, color feature map config) pair.
			const int total_workload = static_cast<int>(entry_count * color_feature_map_config_count);

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int workload_id = 0; workload_id < total_workload; ++workload_id)
			{
				const int entry_id = workload_id / color_feature_map_config_count;
				const int color_feature_map_config_id = workload_id - entry_id * color_feature_map_config_count;
				const color_feature_map_config& cfm = *(cfm_it + color_feature_map_config_id);

				const float * const entry_in = in_it + (entry_id * input_neuron_count);
				float * const entry_out = out_it + (entry_id * input_neuron_count);

				const float * const red_plane = entry_in + (cfm.red_and_y_feature_map_id * input_neuron_count_per_feature_map);
				const float * const green_plane = entry_in + (cfm.green_and_u_feature_map_id * input_neuron_count_per_feature_map);
				const float * const blue_plane = entry_in + (cfm.blue_and_v_feature_map_id * input_neuron_count_per_feature_map);

				float * const y_plane = entry_out + (cfm.red_and_y_feature_map_id * input_neuron_count_per_feature_map);
				float * const u_plane = entry_out + (cfm.green_and_u_feature_map_id * input_neuron_count_per_feature_map);
				float * const v_plane = entry_out + (cfm.blue_and_v_feature_map_id * input_neuron_count_per_feature_map);

				for(unsigned int neuron_id = 0; neuron_id < input_neuron_count_per_feature_map; ++neuron_id)
				{
					// All three inputs are read before any output is written: input and output may be the same buffer.
					const float red = red_plane[neuron_id];
					const float green = green_plane[neuron_id];
					const float blue = blue_plane[neuron_id];

					const float y = w_r * red + w_g * green + w_b * blue;
					const float u = u_mult * (blue - y);
					const float v = v_mult * (red - y);

					y_plane[neuron_id] = y;
					u_plane[neuron_id] = u;
					v_plane[neuron_id] = v;
				}
			}
		}
		// Applies softmax across the feature maps of every (entry, neuron position) pair.
		//
		// For each position the maximum input over all feature maps is subtracted before
		// exponentiation, the exponentials are stored in a per-thread buffer taken from
		// additional_buffers, and the output is each exponential divided by their sum.
		void softmax_layer_hessian_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int entry_count) const
		{
			const unsigned int feature_map_count = static_cast<unsigned int>(input_configuration_specific.feature_map_count);
			const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map();
			const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count();

			const std::vector<float>::const_iterator input_buffer_it = input_buffer->begin();
			const std::vector<float>::iterator output_buffer_it = output_buffer->begin();

			const int openmp_thread_count = plain_config->openmp_thread_count;
			// One work item per (entry, neuron position within a feature map).
			const int total_workload = entry_count * input_neuron_count_per_feature_map;

			#pragma omp parallel default(none) shared(additional_buffers) num_threads(openmp_thread_count)
			{
				#ifdef _OPENMP
				const int thread_id = omp_get_thread_num();
				#else
				const int thread_id = 0;
				#endif

				// Scratch storage for the exponentials, indexed by the current thread id.
				std::vector<float>& exp_values = *(additional_buffers[thread_id]);

				#pragma omp for schedule(guided)
				for(int workload_id = 0; workload_id < total_workload; ++workload_id)
				{
					const int entry_id = workload_id / input_neuron_count_per_feature_map;
					const int neuron_id = workload_id - (entry_id * input_neuron_count_per_feature_map);

					const std::vector<float>::const_iterator in_it = input_buffer_it + (entry_id * input_neuron_count) + neuron_id;
					const std::vector<float>::iterator out_it = output_buffer_it + (entry_id * input_neuron_count) + neuron_id;

					// Pass 1: maximum over the feature maps.
					float max_val = -1.0e+37F;
					for(unsigned int feature_map_id = 0; feature_map_id < feature_map_count; ++feature_map_id)
						max_val = std::max(max_val, *(in_it + (feature_map_id * input_neuron_count_per_feature_map)));

					// Pass 2: shifted exponentials and their sum.
					float sum = 0.0F;
					for(unsigned int feature_map_id = 0; feature_map_id < feature_map_count; ++feature_map_id)
					{
						const float exp_val = expf((*(in_it + (feature_map_id * input_neuron_count_per_feature_map))) - max_val);
						sum += exp_val;
						exp_values[feature_map_id] = exp_val;
					}

					// Pass 3: normalize.
					const float mult = 1.0F / sum;
					for(unsigned int feature_map_id = 0; feature_map_id < feature_map_count; ++feature_map_id)
						*(out_it + (feature_map_id * input_neuron_count_per_feature_map)) = exp_values[feature_map_id] * mult;
				} // for(int workload_id
			} // #pragma parallel
		}
		// Computes the input errors of the softmax layer for every (entry, neuron position) pair.
		//
		// With y the output neuron and e the output error of a feature map at that position:
		//   sum      = sum over feature maps of (y * y * e)
		//   in_error = y * y * (e * (2 * (y * y - y) + 1) - sum)
		// The result is written to input_errors; output_errors and output_neurons are only read.
		// additional_buffers is not used by this function.
		void softmax_layer_hessian_plain::backprop(
			additional_buffer_smart_ptr input_errors,
			const_additional_buffer_smart_ptr output_errors,
			const_additional_buffer_smart_ptr output_neurons,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int entry_count) const
		{
			const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count();
			const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map();
			const unsigned int feature_map_count = static_cast<unsigned int>(input_configuration_specific.feature_map_count);

			const std::vector<float>::iterator input_errors_it = input_errors->begin();
			const std::vector<float>::const_iterator output_errors_it = output_errors->begin();
			const std::vector<float>::const_iterator output_neurons_it = output_neurons->begin();

			// One work item per (entry, neuron position within a feature map).
			const int total_workload = entry_count * input_neuron_count_per_feature_map;
			const int openmp_thread_count = plain_config->openmp_thread_count;

			// No per-thread scratch buffer is needed here, so the thread id is not queried.
			#pragma omp parallel default(none) num_threads(openmp_thread_count)
			{
				#pragma omp for schedule(guided)
				for(int workload_id = 0; workload_id < total_workload; ++workload_id)
				{
					int entry_id = workload_id / input_neuron_count_per_feature_map;
					int neuron_id = workload_id - (entry_id * input_neuron_count_per_feature_map);

					const std::vector<float>::iterator in_errors_it = input_errors_it + (entry_id * input_neuron_count) + neuron_id;
					const std::vector<float>::const_iterator out_errors_it = output_errors_it + (entry_id * input_neuron_count) + neuron_id;
					const std::vector<float>::const_iterator out_neurons_it = output_neurons_it + (entry_id * input_neuron_count) + neuron_id;

					// First pass: accumulate y^2 * e over all feature maps.
					float sum = 0.0F;
					for(unsigned int feature_map_id = 0; feature_map_id < feature_map_count; ++feature_map_id)
					{
						unsigned int offset = feature_map_id * input_neuron_count_per_feature_map;
						float val = (*(out_neurons_it + offset));
						sum += val * val * (*(out_errors_it + offset));
					}

					// Second pass: combine each feature map's own term with the shared sum.
					for(unsigned int feature_map_id = 0; feature_map_id < feature_map_count; ++feature_map_id)
					{
						unsigned int offset = feature_map_id * input_neuron_count_per_feature_map;
						float y = *(out_neurons_it + offset);
						float y2 = y * y;
						*(in_errors_it + offset) = y2 * ((*(out_errors_it + offset)) * (2.0F * (y2 - y) + 1.0F) - sum);
					}
				} // for(int workload_id
			} // #pragma parallel
		}
	// Copies the sample unchanged from data to data_transformed.
	// Throws neural_network_exception when the neuron count of original_config differs from
	// the neuron count of this transformer's config.
	void reshape_data_transformer::transform(
		const float * data,
		float * data_transformed,
		const layer_configuration_specific& original_config,
		unsigned int sample_id)
	{
		const bool neuron_counts_match = (original_config.get_neuron_count() == config.get_neuron_count());
		if (!neuron_counts_match)
		{
			throw neural_network_exception(
				(boost::format("Neuron counts for reshape_data_transformer don't match: %1% and %2%")
					% original_config.get_neuron_count()
					% config.get_neuron_count()).str());
		}

		// Only the configuration differs; the element values and their order are kept as is.
		memcpy(data_transformed, data, original_config.get_neuron_count() * sizeof(float));
	}
		// Forward pass of the dropout layer.
		//
		// With force_deterministic set the input is copied to the output unchanged. Otherwise a
		// keep/drop flag is drawn for every element and stored byte-wise in additional_buffers[0];
		// kept elements are multiplied by 1 / (1 - dropout_rate), dropped ones by 0.
		// Throws neural_network_exception when offset_input_entry_id is non-zero.
		void dropout_layer_updater_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count,
			unsigned int offset_input_entry_id,
			bool force_deterministic) const
		{
			if (offset_input_entry_id > 0)
				throw neural_network_exception("dropout_layer_updater_plain is not able to run using offset");

			if (force_deterministic)
			{
				// Plain pass-through, no random numbers are drawn.
				memcpy(&(output_buffer->at(0)), &(input_buffer->at(0)), input_configuration_specific.get_neuron_count() * updater_count * sizeof(float));
				return;
			}

			nnforge_shared_ptr<const dropout_layer> layer_derived = nnforge_dynamic_pointer_cast<const dropout_layer>(layer_schema);
			const float keep_rate = 1.0F - layer_derived->dropout_rate;
			const float mult = 1.0F / keep_rate;

			const std::vector<float>::const_iterator in_it_global = input_buffer->begin();
			const std::vector<float>::iterator out_it_global = output_buffer->begin();
			// The float storage of the first additional buffer is reused as an array of byte flags.
			unsigned char * keep_elem_ptr = reinterpret_cast<unsigned char *>(&(additional_buffers[0]->at(0)));

			const int total_workload = input_configuration_specific.get_neuron_count() * updater_count;

			// The flags are drawn sequentially, outside of the parallel region.
			nnforge_uniform_real_distribution<float> dist(0.0F, 1.0F);
			for(int elem_id = 0; elem_id < total_workload; ++elem_id)
			{
				if (dist(gen) <= keep_rate)
					keep_elem_ptr[elem_id] = (unsigned char)1;
				else
					keep_elem_ptr[elem_id] = (unsigned char)0;
			}

			#pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count) shared(keep_elem_ptr)
			{
				#pragma omp for schedule(guided)
				for(int workload_id = 0; workload_id < total_workload; ++workload_id)
				{
					const float elem_mult = keep_elem_ptr[workload_id] ? mult : 0.0F;
					*(out_it_global + workload_id) = *(in_it_global + workload_id) * elem_mult;
				}
			}
		}
		// Forward pass of the maxout layer.
		//
		// Every output value is the maximum of feature_map_subsampling_size input values. The
		// first one sits at the output's own feature map and neuron position in the input, and
		// each following one is (output feature map count * output neurons per feature map)
		// elements further into the input.
		void maxout_layer_tester_plain::run_forward_propagation(
			plain_buffer::ptr output_buffer,
			const std::vector<plain_buffer::const_ptr>& input_buffers,
			plain_buffer::ptr temporary_working_fixed_buffer,
			plain_buffer::ptr temporary_working_per_entry_buffer,
			plain_running_configuration::const_ptr plain_config,
			layer::const_ptr layer_schema,
			layer_data::const_ptr data,
			layer_data_custom::const_ptr data_custom,
			const std::vector<layer_configuration_specific>& input_configuration_specific_list,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int entry_count) const
		{
			nnforge_shared_ptr<const maxout_layer> layer_derived = nnforge_dynamic_pointer_cast<const maxout_layer>(layer_schema);
			const unsigned int feature_map_subsampling_size = layer_derived->feature_map_subsampling_size;

			const float * const in_it_global = *input_buffers[0];
			float * const out_it_global = *output_buffer;

			const unsigned int input_neuron_count = input_configuration_specific_list[0].get_neuron_count();
			const unsigned int input_neuron_count_per_feature_map = input_configuration_specific_list[0].get_neuron_count_per_feature_map();
			const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count();
			const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map();

			const int output_feature_map_count = output_configuration_specific.feature_map_count;
			// One work item per (entry, output feature map).
			const int total_workload = entry_count * output_feature_map_count;

			#pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count)
			{
				#pragma omp for schedule(guided)
				for(int workload_id = 0; workload_id < total_workload; ++workload_id)
				{
					const int entry_id = workload_id / output_feature_map_count;
					const int output_feature_map_id = workload_id - (entry_id * output_feature_map_count);

					const float * in_it_first = in_it_global + (entry_id * input_neuron_count) + (output_feature_map_id * input_neuron_count_per_feature_map);
					float * const out_it_begin = out_it_global + (entry_id * output_neuron_count) + (output_feature_map_id * output_neuron_count_per_feature_map);
					float * const out_it_end = out_it_begin + output_neuron_count_per_feature_map;

					float * out_it = out_it_begin;
					while (out_it != out_it_end)
					{
						// Walk the candidates of this output neuron, one stride apart.
						const float * candidate_it = in_it_first;
						float current_max = *candidate_it;
						for(unsigned int i = 1; i < feature_map_subsampling_size; ++i)
						{
							candidate_it += output_feature_map_count * output_neuron_count_per_feature_map;
							current_max = std::max(*candidate_it, current_max);
						}
						*out_it = current_max;

						++out_it;
						++in_it_first;
					}
				}
			}
		}
		// Creates the CUDA tester for a convolution layer.
		//
		// When the output has exactly one neuron per feature map the fully connected tester is
		// returned. Otherwise a 1D or 2D convolution tester is chosen by the number of output
		// dimensions; the *_kepler variant is used for compute capability >= 300 and the
		// *_fermi variant below that.
		// Throws neural_network_exception for any other number of dimensions.
		layer_tester_cuda_smart_ptr convolution_layer_testing_schema::create_tester_specific(
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific) const
		{
			if (output_configuration_specific.get_neuron_count() == output_configuration_specific.feature_map_count)
				return layer_tester_cuda_smart_ptr(new fully_connected_layer_tester_cuda());

			const bool is_1d = (output_configuration_specific.dimension_sizes.size() == 1);
			const bool is_2d = (output_configuration_specific.dimension_sizes.size() == 2);
			if (!is_1d && !is_2d)
				throw neural_network_exception((boost::format("No CUDA tester for the convolution layer of %1% dimensions") % output_configuration_specific.dimension_sizes.size()).str());

			const bool use_kepler_tester = (cuda_config->get_compute_capability() >= 300);

			layer_tester_cuda_smart_ptr res;
			if (is_1d)
			{
				if (use_kepler_tester)
					res = layer_tester_cuda_smart_ptr(new convolution_1d_layer_tester_cuda_kepler());
				else
					res = layer_tester_cuda_smart_ptr(new convolution_1d_layer_tester_cuda_fermi());
			}
			else
			{
				if (use_kepler_tester)
					res = layer_tester_cuda_smart_ptr(new convolution_2d_layer_tester_cuda_kepler());
				else
					res = layer_tester_cuda_smart_ptr(new convolution_2d_layer_tester_cuda_fermi());
			}

			return res;
		}
		// Forward pass of the concat layer: for every entry the neurons of all inputs are copied
		// one after another, in input order, into that entry's slice of the output buffer.
		void concat_layer_updater_plain::run_forward_propagation(
			plain_buffer::ptr output_buffer,
			const std::vector<plain_buffer::const_ptr>& input_buffers,
			plain_buffer::ptr temporary_working_fixed_buffer,
			plain_buffer::ptr temporary_working_per_entry_buffer,
			plain_buffer::ptr temporary_per_entry_buffer,
			plain_running_configuration::const_ptr plain_config,
			layer::const_ptr layer_schema,
			layer_data::const_ptr data,
			layer_data_custom::const_ptr data_custom,
			const std::vector<layer_configuration_specific>& input_configuration_specific_list,
			const layer_configuration_specific& output_configuration_specific,
			const std::set<layer_action>& actions,
			unsigned int entry_count) const
		{
			const unsigned int input_count = static_cast<unsigned int>(input_configuration_specific_list.size());

			for(unsigned int entry_id = 0; entry_id < entry_count; ++entry_id)
			{
				float * const output_base = *output_buffer;
				// Write cursor inside the current entry of the output.
				float * write_pos = output_base + entry_id * output_configuration_specific.get_neuron_count();

				for(unsigned int input_index = 0; input_index < input_count; ++input_index)
				{
					const unsigned int input_neuron_count = input_configuration_specific_list[input_index].get_neuron_count();
					const float * const input_base = *input_buffers[input_index];

					memcpy(write_pos, input_base + entry_id * input_neuron_count, input_neuron_count * sizeof(float));
					write_pos += input_neuron_count;
				}
			}
		}
	// Derives the output configuration of the max subsampling layer from the input one.
	//
	// The feature map count is carried over. Each output dimension is
	//   tiling:     (input size - (subsampling size - 1)) / subsampling size
	//   otherwise:  input size / subsampling size
	// Throws neural_network_exception when the dimension counts differ or when an input
	// dimension is smaller than the minimum required by the selected mode.
	layer_configuration_specific max_subsampling_layer::get_output_layer_configuration_specific(const layer_configuration_specific& input_configuration_specific) const
	{
		if (input_configuration_specific.get_dimension_count() != subsampling_sizes.size())
			throw neural_network_exception((boost::format("Dimension count in layer (%1%) and input configuration (%2%) don't match") % subsampling_sizes.size() % input_configuration_specific.get_dimension_count()).str());

		layer_configuration_specific res(input_configuration_specific.feature_map_count);

		for(unsigned int dimension_id = 0; dimension_id < subsampling_sizes.size(); ++dimension_id)
		{
			if (tiling)
			{
				// Tiling needs at least (subsampling size * 2 - 1) input elements per dimension.
				if (input_configuration_specific.dimension_sizes[dimension_id] < (subsampling_sizes[dimension_id] * 2 - 1))
					throw neural_network_exception((boost::format("Input configuration size (%1%) of dimension (%2%) is smaller than subsampling size (%3%) * 2 - 1") % input_configuration_specific.dimension_sizes[dimension_id] % dimension_id % subsampling_sizes[dimension_id]).str());

				res.dimension_sizes.push_back((input_configuration_specific.dimension_sizes[dimension_id] - (subsampling_sizes[dimension_id] - 1))/ subsampling_sizes[dimension_id]);
			}
			else
			{
				if (input_configuration_specific.dimension_sizes[dimension_id] < subsampling_sizes[dimension_id])
					throw neural_network_exception((boost::format("Input configuration size (%1%) of dimension (%2%) is smaller than subsampling size (%3%)") % input_configuration_specific.dimension_sizes[dimension_id] % dimension_id % subsampling_sizes[dimension_id]).str());

				res.dimension_sizes.push_back(input_configuration_specific.dimension_sizes[dimension_id] / subsampling_sizes[dimension_id]);
			}
		}

		return res;
	}
		// Forward pass of the hyperbolic tangent layer:
		//   output = scale * tanh(steepness * input)
		// applied element-wise to entry_count * (output neuron count) values of input_buffers[0].
		//
		// tanhf is used instead of the algebraically equal (e^(2sx) - 1) / (e^(2sx) + 1):
		// for large positive inputs e^(2sx) overflows to +inf and the quotient becomes NaN,
		// whereas tanhf saturates at 1.
		void hyperbolic_tangent_layer_tester_plain::run_forward_propagation(
			plain_buffer::ptr output_buffer,
			const std::vector<plain_buffer::const_ptr>& input_buffers,
			plain_buffer::ptr temporary_working_fixed_buffer,
			plain_buffer::ptr temporary_working_per_entry_buffer,
			plain_running_configuration::const_ptr plain_config,
			layer::const_ptr layer_schema,
			layer_data::const_ptr data,
			layer_data_custom::const_ptr data_custom,
			const std::vector<layer_configuration_specific>& input_configuration_specific_list,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int entry_count) const
		{
			const int elem_count = static_cast<int>(entry_count * output_configuration_specific.get_neuron_count());
			float * const out_it = *output_buffer;
			const float * const in_it = *input_buffers[0];

			std::shared_ptr<const hyperbolic_tangent_layer> layer_derived = std::dynamic_pointer_cast<const hyperbolic_tangent_layer>(layer_schema);
			const float hyperbolic_tangent_steepness = layer_derived->steepness;
			const float hyperbolic_tangent_major_multiplier = layer_derived->scale;

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int i = 0; i < elem_count; ++i)
			{
				float inp = *(in_it + i);
				float res = tanhf(inp * hyperbolic_tangent_steepness) * hyperbolic_tangent_major_multiplier;
				*(out_it + i) = res;
			}
		}
	// Creates an in-memory reader over float input samples and float output samples.
	//
	// input_data_list / output_data_list hold one shared vector per entry; the entry count is
	// taken from input_data_list. The neuron counts are cached from the two configurations.
	// The input type code is type_float because the input samples are stored as floats
	// (it used to be type_byte, which did not match input_data_list_float).
	supervised_data_mem_reader::supervised_data_mem_reader(
		const layer_configuration_specific& input_configuration,
		const layer_configuration_specific& output_configuration,
		const std::vector<std::tr1::shared_ptr<const std::vector<float> > >& input_data_list,
		const std::vector<std::tr1::shared_ptr<const std::vector<float> > >& output_data_list)
		: input_configuration(input_configuration)
		, output_configuration(output_configuration)
		, input_data_list_float(input_data_list)
		, output_data_list(output_data_list)
		, entry_read_count(0)
		, type_code(neuron_data_type::type_float)
		, entry_count(static_cast<unsigned int>(input_data_list.size()))
		, input_neuron_count(input_configuration.get_neuron_count())
		, output_neuron_count(output_configuration.get_neuron_count())
	{
	}
	// Opens a structured data stream for writing.
	//
	// Enables exceptions on failbit/badbit for the stream, then writes the stream GUID, the
	// configuration and the current entry count (0). The stream position of the entry count
	// is remembered in entry_count_pos - presumably so it can be overwritten once the final
	// count is known; that code is not part of this constructor.
	structured_data_stream_writer::structured_data_stream_writer(
		nnforge_shared_ptr<std::ostream> output_stream,
		const layer_configuration_specific& config)
		: out_stream(output_stream), entry_count(0)
	{
		std::ostream& stream = *out_stream;
		stream.exceptions(std::ostream::failbit | std::ostream::badbit);

		neuron_count = config.get_neuron_count();

		// Header: GUID first, then the layer configuration.
		const char * const guid_bytes = reinterpret_cast<const char*>(structured_data_stream_schema::structured_data_stream_guid.data);
		stream.write(guid_bytes, sizeof(structured_data_stream_schema::structured_data_stream_guid.data));
		config.write(stream);

		// Entry count field, written with its initial value.
		entry_count_pos = stream.tellp();
		stream.write(reinterpret_cast<const char*>(&entry_count), sizeof(entry_count));
	}
		// Forward pass of the absolute layer: output = |input|, element-wise over
		// updater_count * (input neuron count) values.
		// Throws neural_network_exception when offset_input_entry_id is non-zero.
		void absolute_layer_updater_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count,
			unsigned int offset_input_entry_id,
			bool force_deterministic) const
		{
			if (offset_input_entry_id > 0)
				throw neural_network_exception("absolute_layer_updater_plain is not able to run using offset");

			const std::vector<float>::const_iterator in_it = input_buffer->begin();
			const std::vector<float>::iterator out_it = output_buffer->begin();
			const int elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count());

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int elem_id = 0; elem_id < elem_count; ++elem_id)
			{
				const float input_val = *(in_it + elem_id);
				*(out_it + elem_id) = fabs(input_val);
			}
		}
		// Forward pass of the hyperbolic tangent layer:
		//   output = major_multiplier * tanh(steepness * input)
		// applied element-wise to updater_count * (input neuron count) values.
		// Throws neural_network_exception when offset_input_entry_id is non-zero.
		//
		// tanhf is used instead of the algebraically equal (e^(2sx) - 1) / (e^(2sx) + 1):
		// for large positive inputs e^(2sx) overflows to +inf and the quotient becomes NaN,
		// whereas tanhf saturates at 1.
		void hyperbolic_tangent_layer_updater_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count,
			unsigned int offset_input_entry_id) const
		{
			if (offset_input_entry_id > 0)
				throw neural_network_exception("hyperbolic_tangent_layer_updater_plain is not able to run using offset");

			const int elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count());
			const std::vector<float>::const_iterator in_it = input_buffer->begin();
			const std::vector<float>::iterator out_it = output_buffer->begin();

			nnforge_shared_ptr<const hyperbolic_tangent_layer> layer_derived = nnforge_dynamic_pointer_cast<const hyperbolic_tangent_layer>(layer_schema);
			const float hyperbolic_tangent_steepness = layer_derived->steepness;
			const float hyperbolic_tangent_major_multiplier = layer_derived->major_multiplier;

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int i = 0; i < elem_count; ++i)
			{
				float inp = *(in_it + i);
				float res = tanhf(inp * hyperbolic_tangent_steepness) * hyperbolic_tangent_major_multiplier;
				*(out_it + i) = res;
			}
		}
		// Backward pass of the absolute layer: flips the sign of every value in input_errors
		// whose corresponding input neuron is negative; all other values are left untouched.
		// output_errors is not read here, so input_errors is modified in place.
		void absolute_layer_updater_plain::backprop(
			additional_buffer_smart_ptr input_errors,
			const_additional_buffer_smart_ptr input_neurons,
			const_additional_buffer_smart_ptr output_errors,
			const_additional_buffer_smart_ptr output_neurons,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count) const
		{
			const std::vector<float>::iterator in_err_it = input_errors->begin();
			const std::vector<float>::const_iterator in_it = input_neurons->begin();
			const int elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count());

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int elem_id = 0; elem_id < elem_count; ++elem_id)
			{
				const bool input_is_negative = (*(in_it + elem_id) < 0.0F);
				if (input_is_negative)
					*(in_err_it + elem_id) = - *(in_err_it + elem_id);
			}
		}
		// Forward pass of the rectified linear layer, element-wise over
		// entry_count * (output neuron count) values of input_buffers[0]:
		//   output = input                    when input >= 0
		//   output = input * negative_slope   otherwise
		void rectified_linear_layer_updater_plain::run_forward_propagation(
			plain_buffer::ptr output_buffer,
			const std::vector<plain_buffer::const_ptr>& input_buffers,
			plain_buffer::ptr temporary_working_fixed_buffer,
			plain_buffer::ptr temporary_working_per_entry_buffer,
			plain_buffer::ptr temporary_per_entry_buffer,
			plain_running_configuration::const_ptr plain_config,
			layer::const_ptr layer_schema,
			layer_data::const_ptr data,
			layer_data_custom::const_ptr data_custom,
			const std::vector<layer_configuration_specific>& input_configuration_specific_list,
			const layer_configuration_specific& output_configuration_specific,
			const std::set<layer_action>& actions,
			unsigned int entry_count) const
		{
			std::shared_ptr<const rectified_linear_layer> layer_derived = std::dynamic_pointer_cast<const rectified_linear_layer>(layer_schema);
			const float negative_slope = layer_derived->negative_slope;

			const float * const in_it = *input_buffers[0];
			float * const out_it = *output_buffer;
			const int elem_count = static_cast<int>(entry_count * output_configuration_specific.get_neuron_count());

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int elem_id = 0; elem_id < elem_count; ++elem_id)
			{
				const float input_val = in_it[elem_id];
				if (input_val >= 0.0F)
					out_it[elem_id] = input_val;
				else
					out_it[elem_id] = input_val * negative_slope;
			}
		}
	void convert_to_polar_data_transformer::transform(
		const void * data,
		void * data_transformed,
		neuron_data_type::input_type type,
		const layer_configuration_specific& original_config,
		unsigned int sample_id)
	{
		if (type != neuron_data_type::type_byte)
			throw neural_network_exception("convert_to_polar_data_transformer is implemented for data stored as bytes only");

		if (original_config.dimension_sizes.size() != 2)
			throw neural_network_exception((boost::format("convert_to_polar_data_transformer is processing 2D data only, data is passed with number of dimensions %1%") % original_config.dimension_sizes.size()).str());

		if (original_config.dimension_sizes != input_window_sizes)
			throw neural_network_exception("convert_to_polar_data_transformer: input window size mismatch between creation and actual transform");

		unsigned int original_neuron_count_per_feature_map = original_config.get_neuron_count_per_feature_map();
		unsigned int transformed_neuron_count_per_feature_map = get_transformed_configuration(original_config).get_neuron_count_per_feature_map();
		for(unsigned int feature_map_id = 0; feature_map_id < original_config.feature_map_count; ++feature_map_id)
		{
			cv::Mat1b original_image(static_cast<int>(original_config.dimension_sizes[1]), static_cast<int>(original_config.dimension_sizes[0]), const_cast<unsigned char *>(static_cast<const unsigned char *>(data)) + (original_neuron_count_per_feature_map * feature_map_id));
			cv::Mat1b dest_image(static_cast<int>(output_window_sizes[1]), static_cast<int>(output_window_sizes[0]), static_cast<unsigned char *>(data_transformed) + (transformed_neuron_count_per_feature_map * feature_map_id));

			// Should try INTER_CUBIC and INTER_LANCZOS4 as well
			cv::remap(original_image, dest_image, map_x, map_y, cv::INTER_LINEAR, cv::BORDER_CONSTANT, border_value);
		}
	}
		// Backward pass of the hyperbolic tangent layer. Every value of input_errors is
		// multiplied in place by the derivative computed from the corresponding output neuron y:
		//   derivative = steepness * major_multiplier * (1 - (y / major_multiplier)^2)
		// output_errors is not read here.
		void hyperbolic_tangent_layer_updater_plain::backprop(
			additional_buffer_smart_ptr input_errors,
			const_additional_buffer_smart_ptr input_neurons,
			const_additional_buffer_smart_ptr output_errors,
			const_additional_buffer_smart_ptr output_neurons,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count) const
		{
			nnforge_shared_ptr<const hyperbolic_tangent_layer> layer_derived = nnforge_dynamic_pointer_cast<const hyperbolic_tangent_layer>(layer_schema);
			const float hyperbolic_tangent_steepness3 = layer_derived->steepness * layer_derived->major_multiplier;
			const float hyperbolic_tangent_major_multiplier_reverse = 1.0F / layer_derived->major_multiplier;

			const std::vector<float>::const_iterator out_it = output_neurons->begin();
			const std::vector<float>::iterator in_err_it = input_errors->begin();
			const int elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count());

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int elem_id = 0; elem_id < elem_count; ++elem_id)
			{
				// Scale the output back by 1 / major_multiplier before squaring it.
				const float normalized_value = *(out_it + elem_id) * hyperbolic_tangent_major_multiplier_reverse;
				const float der1st = hyperbolic_tangent_steepness3 * (1.0F - (normalized_value * normalized_value));
				*(in_err_it + elem_id) *= der1st;
			}
		}
	// Copies each feature map of a 2D byte sample from data into data_transformed and,
	// when sample_id == 1, flips the copied feature map via data_transformer_util::flip.
	// The two flags handed to flip are (flip_around_dimension_id == 0) and (flip_around_dimension_id == 1).
	// For any other sample_id the output is a plain copy of the input.
	// Throws neural_network_exception if type is not type_byte or the data is not 2-dimensional.
	void flip_2d_data_sampler_transformer::transform(
		const void * data,
		void * data_transformed,
		neuron_data_type::input_type type,
		const layer_configuration_specific& original_config,
		unsigned int sample_id)
	{
		if (type != neuron_data_type::type_byte)
			throw neural_network_exception("flip_2d_data_sampler_transformer is implemented for data stored as bytes only");

		if (original_config.dimension_sizes.size() != 2)
			throw neural_network_exception((boost::format("flip_2d_data_sampler_transformer is processing 2d data only, data is passed with number of dimensions %1%") % original_config.dimension_sizes.size()).str());

		const unsigned char * const src_bytes = static_cast<const unsigned char *>(data);
		unsigned char * const dst_bytes = static_cast<unsigned char *>(data_transformed);

		// cv::Mat1b takes (rows, cols, data); dimension_sizes[1] is passed as rows and dimension_sizes[0] as cols
		const int image_rows = static_cast<int>(original_config.dimension_sizes[1]);
		const int image_cols = static_cast<int>(original_config.dimension_sizes[0]);

		const unsigned int neuron_count_per_feature_map = original_config.get_neuron_count_per_feature_map();
		for(unsigned int feature_map_id = 0; feature_map_id < original_config.feature_map_count; ++feature_map_id)
		{
			const unsigned int feature_map_offset = neuron_count_per_feature_map * feature_map_id;

			memcpy(
				dst_bytes + feature_map_offset,
				src_bytes + feature_map_offset,
				neuron_count_per_feature_map * neuron_data_type::get_input_size(type));

			if (sample_id == 1)
			{
				// The Mat only wraps the destination bytes, so the flip is applied directly to data_transformed
				cv::Mat1b image(image_rows, image_cols, dst_bytes + feature_map_offset);
				data_transformer_util::flip(
					image,
					(flip_around_dimension_id == 0),
					(flip_around_dimension_id == 1));
			}
		}
	}
		// Selects the CUDA updater implementation for a sparse convolution layer.
		// If every left and right zero padding equals 0 and the output neuron count equals its feature map count,
		// a sparse fully connected updater is created: the 1x1 variant when the first input's dimension sizes
		// equal the output's dimension sizes, the generic variant otherwise.
		// All remaining cases are delegated to sparse_convolution_layer_updater_schema_helper_cuda_kepler.
		layer_updater_cuda::ptr sparse_convolution_layer_updater_schema::create_updater_specific(
			const std::vector<layer_configuration_specific>& input_configuration_specific_list,
			const layer_configuration_specific& output_configuration_specific) const
		{
			nnforge_shared_ptr<const sparse_convolution_layer> sparse_layer = nnforge_dynamic_pointer_cast<const sparse_convolution_layer>(layer_schema);

			const std::vector<unsigned int> all_zero_left(sparse_layer->left_zero_padding.size(), 0);
			const std::vector<unsigned int> all_zero_right(sparse_layer->right_zero_padding.size(), 0);
			const bool padding_is_all_zero = (sparse_layer->left_zero_padding == all_zero_left)
				&& (sparse_layer->right_zero_padding == all_zero_right);
			const bool one_neuron_per_feature_map = (output_configuration_specific.get_neuron_count() == output_configuration_specific.feature_map_count);

			// General case first: anything that is not an unpadded, single-neuron-per-feature-map layer
			if (!(padding_is_all_zero && one_neuron_per_feature_map))
				return sparse_convolution_layer_updater_schema_helper_cuda_kepler::create_updater_specific(input_configuration_specific_list, output_configuration_specific);

			if (input_configuration_specific_list[0].dimension_sizes == output_configuration_specific.dimension_sizes)
				return layer_updater_cuda::ptr(new sparse_fully_connected_1x1_layer_updater_cuda());

			return layer_updater_cuda::ptr(new sparse_fully_connected_layer_updater_cuda());
		}
	// Applies one randomly drawn contrast / brightness change to every feature map of a 2D byte sample.
	// The contrast is drawn first, then the brightness shift (scaled by 255); both are drawn once per call
	// and shared by all feature maps.
	// Only data_transformed is accessed; this function never reads data.
	// NOTE(review): presumably data_transformed already holds the sample when this is called - confirm with the caller.
	// Throws neural_network_exception if type is not type_byte or the data is not 2-dimensional.
	void intensity_2d_data_transformer::transform(
		const void * data,
		void * data_transformed,
		neuron_data_type::input_type type,
		const layer_configuration_specific& original_config)
	{
		if (type != neuron_data_type::type_byte)
			throw neural_network_exception("intensity_2d_data_transformer is implemented for data stored as bytes only");

		if (original_config.dimension_sizes.size() != 2)
			throw neural_network_exception((boost::format("intensity_2d_data_transformer is processing 2d data only, data is passed with number of dimensions %1%") % original_config.dimension_sizes.size()).str());

		// Keep this draw order: both distributions consume values from the same generator
		float contrast_value = contrast_distribution(generator);
		float brightness_delta = brightness_shift_distribution(generator) * 255.0F;

		unsigned char * const transformed_bytes = static_cast<unsigned char *>(data_transformed);
		const int image_rows = static_cast<int>(original_config.dimension_sizes[1]);
		const int image_cols = static_cast<int>(original_config.dimension_sizes[0]);
		const unsigned int feature_map_size = original_config.get_neuron_count_per_feature_map();

		for(unsigned int feature_map_id = 0; feature_map_id < original_config.feature_map_count; ++feature_map_id)
		{
			unsigned char * const feature_map_start = transformed_bytes + (feature_map_size * feature_map_id);
			cv::Mat1b image(image_rows, image_cols, feature_map_start);

			data_transformer_util::change_brightness_and_contrast(image, contrast_value, brightness_delta);
		}
	}
	void uniform_intensity_data_transformer::transform(
		const float * data,
		float * data_transformed,
		const layer_configuration_specific& original_config,
		unsigned int sample_id)
	{
		if (original_config.feature_map_count != shift_distribution_list.size())
			throw neural_network_exception((boost::format("uniform_intensity_data_transformer was initialized with %1% distributions and data provided has %2% feature maps") % shift_distribution_list.size() % original_config.feature_map_count).str());

		std::vector<float> shift_list(original_config.feature_map_count);
		{
			boost::lock_guard<boost::mutex> lock(gen_stream_mutex);

			for(unsigned int feature_map_id = 0; feature_map_id < original_config.feature_map_count; ++feature_map_id)
			{
				nnforge_uniform_real_distribution<float>& dist = shift_distribution_list[feature_map_id];
				float shift = dist.min();
				if (dist.max() > dist.min())
					shift = dist(generator);
				shift_list[feature_map_id] = shift;
			}
		}

		unsigned int neuron_count_per_feature_map = original_config.get_neuron_count_per_feature_map();
		for(unsigned int feature_map_id = 0; feature_map_id < original_config.feature_map_count; ++feature_map_id)
		{
			float shift = shift_list[feature_map_id];
			const float * src_data = data + feature_map_id * neuron_count_per_feature_map;
			float * dest_data = data_transformed + feature_map_id * neuron_count_per_feature_map;
			for(unsigned int i = 0; i < neuron_count_per_feature_map; ++i)
				dest_data[i] = src_data[i] + shift;
		}
	}
		// Backward pass of the soft rectified linear layer (plain implementation).
		// Every element of input_errors is multiplied in place by 1 - exp(-out), where out is the matching
		// element of output_neurons. This equals (exp(out) - 1) / exp(out).
		// NOTE(review): presumably out = ln(1 + exp(x)), which makes this factor d(out)/dx - confirm against the forward pass.
		// updater_count * (neuron count of input_configuration_specific) elements are processed.
		void soft_rectified_linear_layer_updater_plain::backprop(
			additional_buffer_smart_ptr input_errors,
			const_additional_buffer_smart_ptr input_neurons,
			const_additional_buffer_smart_ptr output_errors,
			const_additional_buffer_smart_ptr output_neurons,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count) const
		{
			const int elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count());
			const std::vector<float>::iterator in_err_it = input_errors->begin();
			const std::vector<float>::const_iterator out_it = output_neurons->begin();

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int i = 0; i < elem_count; ++i)
			{
				const float out_neuron = *(out_it + i);
				// Written as 1 - e^(-out) instead of (e^out - 1) / e^out: e^out overflows to +inf
				// for out above ~88.7 in single precision, and inf / inf would yield NaN.
				const float der1st = 1.0F - expf(-out_neuron);
				*(in_err_it + i) *= der1st;
			}
		}
		// Selects the CUDA updater implementation for a sparse convolution layer.
		// Selection order:
		//   1. all paddings zero, one output neuron per feature map, unit strides:
		//      sparse fully connected updater (1x1 variant when all window sizes are 1);
		//   2. all paddings zero, all window sizes 1: sparse 1x1 updater;
		//   3. unit strides: generic updater from sparse_convolution_layer_updater_schema_helper_cuda,
		//      built from the first input configuration;
		//   4. otherwise neural_network_exception is thrown.
		// cuda_config is not used by this function.
		layer_updater_cuda::ptr sparse_convolution_layer_updater_schema::create_updater_specific(
			const std::vector<layer_configuration_specific>& input_configuration_specific_list,
			const layer_configuration_specific& output_configuration_specific,
			const cuda_running_configuration& cuda_config) const
		{
			std::shared_ptr<const sparse_convolution_layer> layer_derived = std::dynamic_pointer_cast<const sparse_convolution_layer>(layer_schema);

			const bool zero_padding = (layer_derived->left_zero_padding == std::vector<unsigned int>(layer_derived->left_zero_padding.size(), 0))
				&& (layer_derived->right_zero_padding == std::vector<unsigned int>(layer_derived->right_zero_padding.size(), 0));
			const bool unit_stride = (layer_derived->strides == std::vector<unsigned int>(layer_derived->strides.size(), 1));
			const bool single_output = (output_configuration_specific.get_neuron_count() == output_configuration_specific.feature_map_count);
			// Logical (not bitwise) conjunction of the two flags
			const bool fully_connected = single_output && unit_stride;
			const bool window1x1 = (layer_derived->window_sizes == std::vector<unsigned int>(layer_derived->window_sizes.size(), 1));

			if (zero_padding)
			{
				if (fully_connected)
				{
					if (window1x1)
						return layer_updater_cuda::ptr(new sparse_fully_connected_1x1_layer_updater_cuda());

					return layer_updater_cuda::ptr(new sparse_fully_connected_layer_updater_cuda());
				}

				if (window1x1)
					return layer_updater_cuda::ptr(new sparse_1x1_layer_updater_cuda());
			}

			if (unit_stride)
				return sparse_convolution_layer_updater_schema_helper_cuda::create_updater_specific(input_configuration_specific_list[0], output_configuration_specific);

			// This schema creates updaters, so the message names the updater
			throw neural_network_exception("There is no sparse_convolution_layer updater implemented for non-unit stride and non-unit window");
		}
		// Forward pass of the sigmoid layer (plain implementation).
		// Writes 1 / (exp(-x) + 1) to output_buffer for each of the
		// updater_count * (neuron count of input_configuration_specific) leading elements x of input_buffer.
		// A non-negative offset_input_entry_id is rejected with neural_network_exception.
		void sigmoid_layer_updater_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const layer_data_list& data,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count,
			int offset_input_entry_id) const
		{
			const bool shared_input_requested = (offset_input_entry_id >= 0);
			if (shared_input_requested)
				throw neural_network_exception("sigmoid_layer_updater_plain is not able to run using the same input");

			const std::vector<float>::iterator outputs = output_buffer->begin();
			const std::vector<float>::const_iterator inputs = input_buffer->begin();
			const int total_elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count());

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int elem_id = 0; elem_id < total_elem_count; ++elem_id)
			{
				const float input_value = inputs[elem_id];
				const float activation = 1.0F / (expf(-input_value) + 1.0F);
				outputs[elem_id] = activation;
			}
		}
		// Forward pass of the add layer (plain implementation).
		// For each of the entry_count * (output neuron count) elements i:
		//   output[i] = alpha * (sum over all input buffers of input[i])
		// where alpha comes from the add_layer schema.
		void add_layer_tester_plain::run_forward_propagation(
			plain_buffer::ptr output_buffer,
			const std::vector<plain_buffer::const_ptr>& input_buffers,
			plain_buffer::ptr temporary_working_fixed_buffer,
			plain_buffer::ptr temporary_working_per_entry_buffer,
			plain_running_configuration::const_ptr plain_config,
			layer::const_ptr layer_schema,
			layer_data::const_ptr data,
			layer_data_custom::const_ptr data_custom,
			const std::vector<layer_configuration_specific>& input_configuration_specific_list,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int entry_count) const
		{
			float * const out = *output_buffer;

			// Collect the raw data pointer of every input buffer
			std::vector<const float *> in_list;
			in_list.reserve(input_buffers.size());
			for(const plain_buffer::const_ptr& input_buffer : input_buffers)
				in_list.push_back(*input_buffer);
			// data() stays well-defined for an empty list, whereas &in_list[0] would not be
			const float ** const in_ptr_list = in_list.data();

			std::shared_ptr<const add_layer> layer_derived = std::dynamic_pointer_cast<const add_layer>(layer_schema);
			const float alpha = layer_derived->alpha;
			const int src_ptr_count = static_cast<int>(in_list.size());
			const int elem_count = static_cast<int>(entry_count * output_configuration_specific.get_neuron_count());
			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int i = 0; i < elem_count; ++i)
			{
				float sum = 0.0F;
				for(int j = 0; j < src_ptr_count; ++j)
					sum += in_ptr_list[j][i];
				out[i] = sum * alpha;
			}
		}
	// Returns the transformer's own configuration (config) as the configuration of the transformed data.
	// original_config must have the same neuron count as config; otherwise neural_network_exception is thrown.
	layer_configuration_specific reshape_data_transformer::get_transformed_configuration(const layer_configuration_specific& original_config) const
	{
		const bool neuron_counts_match = (original_config.get_neuron_count() == config.get_neuron_count());
		if (neuron_counts_match)
			return config;

		throw neural_network_exception((boost::format("Neuron counts for reshape_data_transformer don't match: %1% and %2%") % original_config.get_neuron_count() % config.get_neuron_count()).str());
	}
		// Forward pass of the prefix sum layer (plain implementation).
		// The work is split into entry_count * segment_count * neuron_count_per_feature_map independent items,
		// where segment_count = feature_map_count / feature_map_segment_length. Each item starts at
		//   entry_id * neuron_count + segment_id * feature_map_segment_length * neuron_count_per_feature_map + neuron_id
		// and takes feature_map_segment_length steps of stride neuron_count_per_feature_map, accumulating the
		// input values. The value written at each step is the running sum limited to [clamp_min, clamp_max];
		// the running sum itself is carried forward unclamped.
		void prefix_sum_layer_tester_plain::run_forward_propagation(
			plain_buffer::ptr output_buffer,
			const std::vector<plain_buffer::const_ptr>& input_buffers,
			plain_buffer::ptr temporary_working_fixed_buffer,
			plain_buffer::ptr temporary_working_per_entry_buffer,
			plain_running_configuration::const_ptr plain_config,
			layer::const_ptr layer_schema,
			layer_data::const_ptr data,
			layer_data_custom::const_ptr data_custom,
			const std::vector<layer_configuration_specific>& input_configuration_specific_list,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int entry_count) const
		{
			std::shared_ptr<const prefix_sum_layer> prefix_layer = std::dynamic_pointer_cast<const prefix_sum_layer>(layer_schema);
			const unsigned int segment_length = prefix_layer->feature_map_segment_length;
			const float clamp_lo = prefix_layer->clamp_min;
			const float clamp_hi = prefix_layer->clamp_max;

			const float * const src = *input_buffers[0];
			float * const dst = *output_buffer;

			const unsigned int neurons_per_entry = output_configuration_specific.get_neuron_count();
			const unsigned int neurons_per_map = output_configuration_specific.get_neuron_count_per_feature_map();
			const unsigned int segment_count = output_configuration_specific.feature_map_count / segment_length;
			const unsigned int items_per_entry = segment_count * neurons_per_map;
			const int item_count = entry_count * items_per_entry;

			#pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count)
			{
				#pragma omp for schedule(guided)
				for(int item_id = 0; item_id < item_count; ++item_id)
				{
					// Decompose the flat item id into (entry, segment, neuron)
					const int entry_id = item_id / items_per_entry;
					const int item_in_entry = item_id - entry_id * items_per_entry;
					const int segment_id = item_in_entry / neurons_per_map;
					const int neuron_id = item_in_entry - segment_id * neurons_per_map;

					int pos = entry_id * neurons_per_entry + segment_id * segment_length * neurons_per_map + neuron_id;

					float accumulated = 0.0F;
					for(unsigned int step = 0; step < segment_length; ++step)
					{
						accumulated += src[pos];
						dst[pos] = std::min(std::max(accumulated, clamp_lo), clamp_hi);
						pos += neurons_per_map;
					}
				}
			}
		}
		// Size in bytes of the per-entry temporary buffer: sizeof(unsigned int) for every output neuron.
		// actions, plain_config, layer_schema and the input configurations do not affect the result.
		size_t max_subsampling_layer_updater_plain::get_temporary_per_entry_buffer_size(
			const std::set<layer_action>& actions,
			plain_running_configuration::const_ptr plain_config,
			layer::const_ptr layer_schema,
			const std::vector<layer_configuration_specific>& input_configuration_specific_list,
			const layer_configuration_specific& output_configuration_specific) const
		{
			const size_t bytes_per_neuron = sizeof(unsigned int);
			return output_configuration_specific.get_neuron_count() * bytes_per_neuron;
		}