// Back-propagates second-order (Hessian) errors through the softmax layer.
//
// For every (entry, neuron position) pair the values found at that position in all
// feature maps are processed together. With y = output_neurons and e = output_errors:
//   sum          = SUM over feature maps of (y * y * e)
//   input_errors = y * y * (e * (2 * (y * y - y) + 1) - sum)
//
// input_errors is fully overwritten for entry_count entries; output_errors and
// output_neurons are only read. additional_buffers, layer_schema, data, data_custom and
// output_configuration_specific are not used by this implementation.
//
// NOTE(review): the sign of the "sum" term is kept exactly as it was; it is worth
// re-checking against the derivation of the squared softmax Jacobian.
void softmax_layer_hessian_plain::backprop(
			additional_buffer_smart_ptr input_errors,
			const_additional_buffer_smart_ptr output_errors,
			const_additional_buffer_smart_ptr output_neurons,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int entry_count) const
		{
			const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count();
			const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map();
			const unsigned int feature_map_count = static_cast<unsigned int>(input_configuration_specific.feature_map_count);

			const std::vector<float>::iterator input_errors_it = input_errors->begin();
			const std::vector<float>::const_iterator output_errors_it = output_errors->begin();
			const std::vector<float>::const_iterator output_neurons_it = output_neurons->begin();

			// One work item per (entry, neuron position inside a feature map)
			const int total_workload = entry_count * input_neuron_count_per_feature_map;
			const int openmp_thread_count = plain_config->openmp_thread_count;

			// No per-thread scratch storage is needed here, so neither the thread id nor
			// additional_buffers is referenced inside the parallel region.
			#pragma omp parallel for default(none) schedule(guided) num_threads(openmp_thread_count)
			for(int workload_id = 0; workload_id < total_workload; ++workload_id)
			{
				int entry_id = workload_id / input_neuron_count_per_feature_map;
				int neuron_id = workload_id - (entry_id * input_neuron_count_per_feature_map);

				const std::vector<float>::iterator in_errors_it = input_errors_it + (entry_id * input_neuron_count) + neuron_id;
				const std::vector<float>::const_iterator out_errors_it = output_errors_it + (entry_id * input_neuron_count) + neuron_id;
				const std::vector<float>::const_iterator out_neurons_it = output_neurons_it + (entry_id * input_neuron_count) + neuron_id;

				// First pass: accumulate y^2 * e over all feature maps at this position
				float sum = 0.0F;
				for(unsigned int feature_map_id = 0; feature_map_id < feature_map_count; ++feature_map_id)
				{
					unsigned int offset = feature_map_id * input_neuron_count_per_feature_map;
					float val = (*(out_neurons_it + offset));
					sum += val * val * (*(out_errors_it + offset));
				}

				// Second pass: write the propagated error for each feature map
				for(unsigned int feature_map_id = 0; feature_map_id < feature_map_count; ++feature_map_id)
				{
					unsigned int offset = feature_map_id * input_neuron_count_per_feature_map;
					float y = *(out_neurons_it + offset);
					float y2 = y * y;
					*(in_errors_it + offset) = y2 * ((*(out_errors_it + offset)) * (2.0F * (y2 - y) + 1.0F) - sum);
				}
			} // for(int workload_id
		}
		// Forward pass of the dropout layer.
		//
		// With force_deterministic set, the input is copied to the output unchanged.
		// Otherwise a keep/drop flag is drawn for every element (stored as one byte per
		// element in additional_buffers[0]); kept elements are multiplied by
		// 1 / (1 - dropout_rate) and dropped elements are multiplied by 0.
		//
		// Throws neural_network_exception when offset_input_entry_id is non-zero.
		void dropout_layer_updater_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count,
			unsigned int offset_input_entry_id,
			bool force_deterministic) const
		{
			if (offset_input_entry_id > 0)
				throw neural_network_exception("dropout_layer_updater_plain is not able to run using offset");

			if (force_deterministic)
			{
				// Plain pass-through, no random mask involved
				memcpy(&(output_buffer->at(0)), &(input_buffer->at(0)), input_configuration_specific.get_neuron_count() * updater_count * sizeof(float));
				return;
			}

			const std::vector<float>::const_iterator src_it = input_buffer->begin();
			const std::vector<float>::iterator dst_it = output_buffer->begin();
			// The float scratch buffer is reused as raw byte storage for the keep flags
			unsigned char * keep_flags = reinterpret_cast<unsigned char *>(&(additional_buffers[0]->at(0)));

			nnforge_shared_ptr<const dropout_layer> layer_derived = nnforge_dynamic_pointer_cast<const dropout_layer>(layer_schema);
			const float keep_rate = 1.0F - layer_derived->dropout_rate;
			const float scale = 1.0F / keep_rate;

			const int total_workload = input_configuration_specific.get_neuron_count() * updater_count;

			// The flags are drawn sequentially, before the parallel loop, from the generator "gen"
			nnforge_uniform_real_distribution<float> dist(0.0F, 1.0F);
			for(int elem_id = 0; elem_id < total_workload; ++elem_id)
				keep_flags[elem_id] = static_cast<unsigned char>((dist(gen) <= keep_rate) ? 1 : 0);

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count) shared(keep_flags)
			for(int elem_id = 0; elem_id < total_workload; ++elem_id)
			{
				const float factor = keep_flags[elem_id] ? scale : 0.0F;
				dst_it[elem_id] = src_it[elem_id] * factor;
			}
		}
		// Forward pass of the scaled hyperbolic tangent layer:
		//   output = major_multiplier * tanh(steepness * input)
		// applied element-wise to updater_count * neuron_count values.
		//
		// The previous formulation, (exp(2sx) - 1) / (exp(2sx) + 1), evaluates to
		// inf / inf = NaN once expf overflows for large positive inputs. tanhf saturates
		// to +/-1 instead, so the output stays finite for any finite input.
		//
		// Throws neural_network_exception when offset_input_entry_id is non-zero.
		void hyperbolic_tangent_layer_updater_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count,
			unsigned int offset_input_entry_id) const
		{
			if (offset_input_entry_id > 0)
				throw neural_network_exception("hyperbolic_tangent_layer_updater_plain is not able to run using offset");

			const int elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count());
			const std::vector<float>::const_iterator in_it = input_buffer->begin();
			const std::vector<float>::iterator out_it = output_buffer->begin();

			nnforge_shared_ptr<const hyperbolic_tangent_layer> layer_derived = nnforge_dynamic_pointer_cast<const hyperbolic_tangent_layer>(layer_schema);
			const float hyperbolic_tangent_steepness = layer_derived->steepness;
			const float hyperbolic_tangent_major_multiplier = layer_derived->major_multiplier;

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int i = 0; i < elem_count; ++i)
			{
				float inp = *(in_it + i);
				// Overflow-safe: tanhf never produces inf / inf
				float res = tanhf(inp * hyperbolic_tangent_steepness) * hyperbolic_tangent_major_multiplier;
				*(out_it + i) = res;
			}
		}
		// Backward pass of the scaled hyperbolic tangent layer.
		//
		// input_errors is updated in place: each element is multiplied by
		//   steepness * major_multiplier * (1 - (output / major_multiplier)^2)
		// where output is the corresponding value of output_neurons.
		// input_neurons and output_errors are not read by this implementation.
		void hyperbolic_tangent_layer_updater_plain::backprop(
			additional_buffer_smart_ptr input_errors,
			const_additional_buffer_smart_ptr input_neurons,
			const_additional_buffer_smart_ptr output_errors,
			const_additional_buffer_smart_ptr output_neurons,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count) const
		{
			const int total_elems = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count());
			const std::vector<float>::iterator errors_it = input_errors->begin();
			const std::vector<float>::const_iterator outputs_it = output_neurons->begin();

			nnforge_shared_ptr<const hyperbolic_tangent_layer> layer_derived = nnforge_dynamic_pointer_cast<const hyperbolic_tangent_layer>(layer_schema);
			// Reciprocal used to map outputs back to the unscaled tanh range
			const float inverse_major_multiplier = 1.0F / layer_derived->major_multiplier;
			// Constant factor of the derivative
			const float derivative_scale = layer_derived->steepness * layer_derived->major_multiplier;

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int elem_id = 0; elem_id < total_elems; ++elem_id)
			{
				const float unscaled = outputs_it[elem_id] * inverse_major_multiplier;
				const float slope = derivative_scale * (1.0F - (unscaled * unscaled));
				errors_it[elem_id] *= slope;
			}
		}
		// Forward pass of the sigmoid layer: output = 1 / (1 + exp(-input)), element-wise
		// over updater_count * neuron_count values.
		//
		// Throws neural_network_exception when offset_input_entry_id is zero or positive.
		void sigmoid_layer_updater_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const layer_data_list& data,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count,
			int offset_input_entry_id) const
		{
			if (offset_input_entry_id >= 0)
				throw neural_network_exception("sigmoid_layer_updater_plain is not able to run using the same input");

			const int total_elems = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count());
			const std::vector<float>::const_iterator src_it = input_buffer->begin();
			const std::vector<float>::iterator dst_it = output_buffer->begin();

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int elem_id = 0; elem_id < total_elems; ++elem_id)
			{
				const float x = src_it[elem_id];
				dst_it[elem_id] = 1.0F / (expf(-x) + 1.0F);
			}
		}
		// Forward pass of the softmax layer.
		//
		// For every (entry, neuron position) pair the values at that position across all
		// feature maps are normalized: the maximum is subtracted before exponentiation,
		// and each exponent is divided by the sum of all exponents.
		// additional_buffers[thread_id] holds the exponents between the two passes and is
		// indexed by feature map id.
		void softmax_layer_hessian_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int entry_count) const
		{
			const unsigned int neurons_per_entry = input_configuration_specific.get_neuron_count();
			const unsigned int feature_map_stride = input_configuration_specific.get_neuron_count_per_feature_map();
			const unsigned int feature_map_count = static_cast<unsigned int>(input_configuration_specific.feature_map_count);

			const std::vector<float>::const_iterator src_begin = input_buffer->begin();
			const std::vector<float>::iterator dst_begin = output_buffer->begin();

			// One work item per (entry, neuron position inside a feature map)
			const int total_workload = entry_count * feature_map_stride;
			const int openmp_thread_count = plain_config->openmp_thread_count;

			#pragma omp parallel default(none) shared(additional_buffers) num_threads(openmp_thread_count)
			{
				#ifdef _OPENMP
				const int thread_id = omp_get_thread_num();
				#else
				const int thread_id = 0;
				#endif

				// Scratch storage owned by this thread for the duration of the region
				std::vector<float>& exp_values = *(additional_buffers[thread_id]);

				#pragma omp for schedule(guided)
				for(int workload_id = 0; workload_id < total_workload; ++workload_id)
				{
					const int entry_id = workload_id / feature_map_stride;
					const int neuron_id = workload_id - (entry_id * feature_map_stride);

					const std::vector<float>::const_iterator src_it = src_begin + (entry_id * neurons_per_entry) + neuron_id;
					const std::vector<float>::iterator dst_it = dst_begin + (entry_id * neurons_per_entry) + neuron_id;

					// Pass 1: largest input across feature maps
					float max_val = -1.0e+37F;
					for(unsigned int fm = 0; fm < feature_map_count; ++fm)
						max_val = std::max(max_val, src_it[fm * feature_map_stride]);

					// Pass 2: shifted exponents and their sum
					float exp_sum = 0.0F;
					for(unsigned int fm = 0; fm < feature_map_count; ++fm)
					{
						const float shifted_exp = expf(src_it[fm * feature_map_stride] - max_val);
						exp_sum += shifted_exp;
						exp_values[fm] = shifted_exp;
					}

					// Pass 3: normalize
					const float normalizer = 1.0F / exp_sum;
					for(unsigned int fm = 0; fm < feature_map_count; ++fm)
						dst_it[fm * feature_map_stride] = exp_values[fm] * normalizer;
				} // for(int workload_id
			} // #pragma parallel
		}
		// Backward pass of the absolute value layer.
		//
		// input_errors is updated in place: the sign of an error is flipped wherever the
		// corresponding input neuron is negative; all other errors are left untouched.
		// output_errors and output_neurons are not read by this implementation.
		void absolute_layer_updater_plain::backprop(
			additional_buffer_smart_ptr input_errors,
			const_additional_buffer_smart_ptr input_neurons,
			const_additional_buffer_smart_ptr output_errors,
			const_additional_buffer_smart_ptr output_neurons,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count) const
		{
			const int total_elems = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count());
			const std::vector<float>::const_iterator inputs_it = input_neurons->begin();
			const std::vector<float>::iterator errors_it = input_errors->begin();

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int elem_id = 0; elem_id < total_elems; ++elem_id)
			{
				if (inputs_it[elem_id] < 0.0F)
					errors_it[elem_id] = - errors_it[elem_id];
			}
		}
		// Forward pass of the absolute value layer: output = |input|, element-wise over
		// updater_count * neuron_count values.
		//
		// Throws neural_network_exception when offset_input_entry_id is non-zero.
		// force_deterministic is accepted but not used.
		void absolute_layer_updater_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count,
			unsigned int offset_input_entry_id,
			bool force_deterministic) const
		{
			if (offset_input_entry_id > 0)
				throw neural_network_exception("absolute_layer_updater_plain is not able to run using offset");

			const int total_elems = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count());
			const std::vector<float>::const_iterator src_it = input_buffer->begin();
			const std::vector<float>::iterator dst_it = output_buffer->begin();

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int elem_id = 0; elem_id < total_elems; ++elem_id)
			{
				dst_it[elem_id] = fabs(src_it[elem_id]);
			}
		}
		// Backward pass of the soft rectified linear layer.
		//
		// input_errors is updated in place: each element is multiplied by the first
		// derivative expressed through the layer output,
		//   (exp(out) - 1) / exp(out)  ==  1 - exp(-out)
		// (presumably the forward pass is out = log(1 + exp(in)) - confirm against test()).
		//
		// The right-hand form is used because the left-hand one evaluates to
		// inf / inf = NaN once expf(out) overflows for large outputs, whereas
		// 1 - expf(-out) smoothly approaches 1.
		// input_neurons and output_errors are not read by this implementation.
		void soft_rectified_linear_layer_updater_plain::backprop(
			additional_buffer_smart_ptr input_errors,
			const_additional_buffer_smart_ptr input_neurons,
			const_additional_buffer_smart_ptr output_errors,
			const_additional_buffer_smart_ptr output_neurons,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count) const
		{
			const int elem_count = static_cast<int>(updater_count * input_configuration_specific.get_neuron_count());
			const std::vector<float>::iterator in_err_it = input_errors->begin();
			const std::vector<float>::const_iterator out_it = output_neurons->begin();

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int i = 0; i < elem_count; ++i)
			{
				float out_neuron = *(out_it + i);
				// Overflow-safe equivalent of (expf(out) - 1) / expf(out)
				float der1st = 1.0F - expf(-out_neuron);
				*(in_err_it + i) *= der1st;
			}
		}
		// Forward pass of the sigmoid layer (Hessian pipeline):
		// output = 1 / (1 + exp(-input)), element-wise over entry_count * neuron_count values.
		void sigmoid_layer_hessian_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int entry_count) const
		{
			const int total_elems = static_cast<int>(entry_count * input_configuration_specific.get_neuron_count());
			const std::vector<float>::const_iterator src_it = input_buffer->begin();
			const std::vector<float>::iterator dst_it = output_buffer->begin();

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int elem_id = 0; elem_id < total_elems; ++elem_id)
			{
				const float x = src_it[elem_id];
				dst_it[elem_id] = 1.0F / (expf(-x) + 1.0F);
			}
		}
		// Backward pass of the max subsampling layer.
		//
		// Step 1: every element of input_errors (updater_count * input neuron count) is
		//         set to zero.
		// Step 2: for every output element, its error is written to the input position
		//         whose index is stored, as an unsigned int, at the same ordinal in
		//         additional_buffers[0]. The value is assigned, not accumulated.
		// input_neurons, output_neurons and force_deterministic are not used here.
		void max_subsampling_layer_updater_plain::backprop(
			additional_buffer_smart_ptr input_errors,
			const_additional_buffer_smart_ptr input_neurons,
			const_additional_buffer_smart_ptr output_errors,
			const_additional_buffer_smart_ptr output_neurons,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count,
			bool force_deterministic) const
		{
			const std::vector<float>::iterator input_err_begin = input_errors->begin();
			const std::vector<float>::const_iterator output_err_begin = output_errors->begin();
			const std::vector<float>::const_iterator max_index_storage = additional_buffers[0]->begin();

			// Step 1: clear all input errors
			const int input_elem_count = updater_count * input_configuration_specific.get_neuron_count();

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int elem_id = 0; elem_id < input_elem_count; ++elem_id)
				input_err_begin[elem_id] = 0.0F;

			// Step 2: route each output error to the recorded input position
			const int output_elem_count = updater_count * output_configuration_specific.get_neuron_count();

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int elem_id = 0; elem_id < output_elem_count; ++elem_id)
			{
				// The float buffer is reinterpreted as an array of unsigned int indexes
				const unsigned int * const stored_indexes = reinterpret_cast<const unsigned int *>(&(*max_index_storage));
				const unsigned int target_index = stored_indexes[elem_id];
				input_err_begin[target_index] = output_err_begin[elem_id];
			}
		}
		// Backward pass of the sigmoid layer for second-order (Hessian) errors.
		//
		// input_errors is updated in place: each element is multiplied by the square of
		// out * (1 - out), where out is the corresponding value of output_neurons.
		// output_errors is not read by this implementation.
		void sigmoid_layer_hessian_plain::backprop(
			additional_buffer_smart_ptr input_errors,
			const_additional_buffer_smart_ptr output_errors,
			const_additional_buffer_smart_ptr output_neurons,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int entry_count) const
		{
			const int total_elems = static_cast<int>(entry_count * input_configuration_specific.get_neuron_count());
			const std::vector<float>::iterator errors_it = input_errors->begin();
			const std::vector<float>::const_iterator outputs_it = output_neurons->begin();

			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int elem_id = 0; elem_id < total_elems; ++elem_id)
			{
				const float y = outputs_it[elem_id];
				const float slope = y * (1.0F - y);
				errors_it[elem_id] *= (slope * slope);
			}
		}
		// Accumulates the Hessian estimate of a convolution layer into hessian_data.
		//
		// (*hessian_data)[0] holds the per-weight values, (*hessian_data)[1] the per-bias
		// values; both are added to (+=), never reset, by this function.
		//   - weights: for every (output feature map, input feature map) pair and every
		//     window element, the sum over all entries and all output positions of
		//     in_neuron * in_neuron * output_error.
		//   - biases: for every output feature map, the sum of output_errors over all
		//     entries and all output positions.
		// input_neurons and output_errors are only read. additional_buffers is not used.
		void convolution_layer_hessian_plain::update_hessian(
			const_additional_buffer_smart_ptr input_neurons,
			const_additional_buffer_smart_ptr output_errors,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			layer_data_smart_ptr hessian_data,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int entry_count) const
		{
			const std::vector<float>::const_iterator in_it_global = input_neurons->begin();
			const std::vector<float>::const_iterator out_err_it_global = output_errors->begin();
			const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count();
			const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map();
			const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count();
			const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map();
			std::tr1::shared_ptr<const convolution_layer> layer_derived = std::tr1::dynamic_pointer_cast<const convolution_layer>(layer_schema);
			const std::vector<unsigned int>& window_sizes = layer_derived->window_sizes;
			const unsigned int dimension_count = static_cast<unsigned int>(window_sizes.size());
			// input_slices[d] is the distance, in elements, between two input neurons that
			// are adjacent along dimension d inside one feature map
			std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size());
			input_slices[0] = 1;
			for(unsigned int i = 0; i < dimension_count - 1; ++i)
				input_slices[i + 1] = input_slices[i] * input_configuration_specific.dimension_sizes[i];
			// Number of elements in one convolution window
			unsigned int window_elem_count = 1;
			for(unsigned int i = 0; i < dimension_count; ++i)
				window_elem_count *= window_sizes[i];
			const unsigned int const_window_elem_count = window_elem_count;
			const std::vector<float>::iterator weights = (*hessian_data)[0].begin();
			const std::vector<float>::iterator biases = (*hessian_data)[1].begin();

			// offset_list[k] is the input offset of the k-th window element relative to the
			// window origin (offset_list[0] stays 0). It is built incrementally by stepping
			// a multi-dimensional counter through the window, lowest dimension first.
			std::vector<unsigned int> current_local_input_position(dimension_count, 0);
			std::vector<unsigned int> offset_list(window_elem_count);
			for(unsigned int i = 1; i < window_elem_count; ++i)
			{
				int offset = 0;
				for(unsigned int j = 0; j < dimension_count; ++j)
				{
					offset += static_cast<int>(input_slices[j]);
					if ((++current_local_input_position[j]) < window_sizes[j])
					{
						offset_list[i] = offset_list[i-1] + offset;
						break;
					}
					// Dimension j wrapped around: rewind it and carry into the next dimension
					current_local_input_position[j] = 0;
					offset -= static_cast<int>(window_sizes[j] * input_slices[j]);
				}
			}

			const unsigned int output_feature_map_count = output_configuration_specific.feature_map_count;
			const unsigned int input_feature_map_count = input_configuration_specific.feature_map_count;
			// One work item per (output feature map, input feature map) pair
			const int total_workload = output_feature_map_count * input_feature_map_count;
			const unsigned int const_entry_count = entry_count;
			const std::vector<unsigned int>::const_iterator output_dimension_sizes_it = output_configuration_specific.dimension_sizes.begin();
			const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin();
			const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin();

			#pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count)
			{
				// Per-thread state: position counter in the output feature map and two
				// accumulators (weights_local per entry, weights_global across entries)
				std::tr1::array<unsigned int, max_dimension_count> current_output_position;
				std::vector<float> weights_global(const_window_elem_count, 0.0F);
				std::vector<float> weights_local(const_window_elem_count, 0.0F);

				#pragma omp for schedule(guided)
				for(int workload_id = 0; workload_id < total_workload; ++workload_id)
				{
					int output_feature_map_id = workload_id / input_feature_map_count;
					int input_feature_map_id = workload_id - (output_feature_map_id * input_feature_map_count);

					std::vector<float>::const_iterator in_it_base = in_it_global + (input_feature_map_id * input_neuron_count_per_feature_map);
					std::vector<float>::const_iterator out_err_it_base = out_err_it_global + (output_feature_map_id * output_neuron_count_per_feature_map);
					// Weights of this feature map pair: window_elem_count consecutive values
					std::vector<float>::iterator weights_it_base = weights + (output_feature_map_id * (const_window_elem_count * input_feature_map_count)) + (const_window_elem_count * input_feature_map_id);

					std::fill_n(weights_global.begin(), const_window_elem_count, 0.0F);

					for(unsigned int entry_id = 0; entry_id < const_entry_count; ++entry_id)
					{
						std::vector<float>::const_iterator in_it_base2 = in_it_base + (entry_id * input_neuron_count);
						std::vector<float>::const_iterator out_err_it_base2 = out_err_it_base + (entry_id * output_neuron_count);

						std::fill_n(current_output_position.begin(), dimension_count, 0);
						std::fill_n(weights_local.begin(), const_window_elem_count, 0.0F);
						for(std::vector<float>::const_iterator out_err_it = out_err_it_base2; out_err_it != out_err_it_base2 + output_neuron_count_per_feature_map; ++out_err_it)
						{
							// Locate the window origin in the input for the current output position
							std::vector<float>::const_iterator in_it = in_it_base2;
							for(unsigned int i = 0; i < dimension_count; ++i)
								in_it += current_output_position[i] * (*(input_slices_it + i));

							float current_err = *out_err_it;
							for(unsigned int i = 0; i < const_window_elem_count; ++i)
							{
								float in_neuron = *(in_it + *(offset_list_it + i));
								weights_local[i] += (in_neuron * in_neuron * current_err);
							}

							// Go to the next output element
							for(unsigned int i = 0; i < dimension_count; ++i)
							{
								if ((++current_output_position[i]) < *(output_dimension_sizes_it + i))
									break;
								current_output_position[i] = 0;
							}
						}

						// Fold this entry's contribution into the cross-entry accumulator
						std::vector<float>::iterator weights_local_it = weights_local.begin();
						for(std::vector<float>::iterator it = weights_global.begin(); it != weights_global.end(); ++it, ++weights_local_it)
							*it += *weights_local_it;
					}

					// Add the accumulated values to the stored hessian weights; each work item
					// addresses its own window_elem_count-long range
					std::vector<float>::iterator weights_global_it = weights_global.begin();
					for(std::vector<float>::iterator it = weights_it_base; it != weights_it_base + const_window_elem_count; ++it, ++weights_global_it)
						*it += *weights_global_it;
				}
			}

			// Biases: one work item per output feature map
			const int total_workload_bias = output_feature_map_count;
			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int workload_id = 0; workload_id < total_workload_bias; ++workload_id)
			{
				unsigned int output_feature_map_id = workload_id;
				std::vector<float>::const_iterator out_err_it_base = out_err_it_global + (output_feature_map_id * output_neuron_count_per_feature_map);
				float sum = 0.0F;
				for(unsigned int entry_id = 0; entry_id < const_entry_count; ++entry_id)
				{
					std::vector<float>::const_iterator out_err_it_base2 = out_err_it_base + (entry_id * output_neuron_count);
					// Errors of one entry are summed separately before being added to the total
					float sum_local = 0.0F;
					for(std::vector<float>::const_iterator out_err_it = out_err_it_base2; out_err_it != out_err_it_base2 + output_neuron_count_per_feature_map; ++out_err_it)
						sum_local += *out_err_it;

					sum += sum_local;
				}

				*(biases + output_feature_map_id) += sum;
			}
		}
		// Forward pass of the sparse convolution layer.
		//
		// Every output neuron is computed as the bias of its output feature map plus the
		// weighted sum of the input windows of only those input feature maps that are
		// listed for that output feature map in data_custom:
		//   - (*data_custom)[1] (row_indices): entries [o] and [o + 1] delimit the range of
		//     positions in column_indices that belong to output feature map o;
		//   - (*data_custom)[0] (column_indices): each position holds an input feature map id.
		// (*data)[0] holds the weights, window_elem_count consecutive values per position
		// of column_indices; (*data)[1] holds one bias per output feature map.
		// Reading of input_buffer starts offset_input_entry_id entries in; output_buffer
		// is written from its beginning. additional_buffers is not used.
		void sparse_convolution_layer_updater_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count,
			unsigned int offset_input_entry_id) const
		{
			const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count();
			const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map();
			const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count();
			const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map();
			// Skip the first offset_input_entry_id entries of the input
			const std::vector<float>::const_iterator in_it_global = input_buffer->begin() + input_neuron_count * offset_input_entry_id;
			const std::vector<float>::iterator out_it_global = output_buffer->begin();
			nnforge_shared_ptr<const sparse_convolution_layer> layer_derived = nnforge_dynamic_pointer_cast<const sparse_convolution_layer>(layer_schema);
			const std::vector<unsigned int>& window_sizes = layer_derived->window_sizes;
			const unsigned int dimension_count = static_cast<unsigned int>(window_sizes.size());
			// input_slices[d] is the distance, in elements, between two input neurons that
			// are adjacent along dimension d inside one feature map
			std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size());
			input_slices[0] = 1;
			for(unsigned int i = 0; i < dimension_count - 1; ++i)
				input_slices[i + 1] = input_slices[i] * input_configuration_specific.dimension_sizes[i];
			// Number of elements in one convolution window
			unsigned int window_elem_count = 1;
			for(unsigned int i = 0; i < dimension_count; ++i)
				window_elem_count *= window_sizes[i];
			const unsigned int const_window_elem_count = window_elem_count;

			const std::vector<float>::const_iterator weights = (*data)[0].begin();
			const std::vector<float>::const_iterator biases = (*data)[1].begin();

			const std::vector<int>::const_iterator column_indices = (*data_custom)[0].begin();
			const std::vector<int>::const_iterator row_indices = (*data_custom)[1].begin();

			// offset_list[k] is the input offset of the k-th window element relative to the
			// window origin (offset_list[0] stays 0). It is built incrementally by stepping
			// a multi-dimensional counter through the window, lowest dimension first.
			std::vector<unsigned int> current_local_input_position(dimension_count, 0);
			std::vector<unsigned int> offset_list(window_elem_count);
			for(unsigned int i = 1; i < window_elem_count; ++i)
			{
				int offset = 0;
				for(unsigned int j = 0; j < dimension_count; ++j)
				{
					offset += static_cast<int>(input_slices[j]);
					if ((++current_local_input_position[j]) < window_sizes[j])
					{
						offset_list[i] = offset_list[i-1] + offset;
						break;
					}
					// Dimension j wrapped around: rewind it and carry into the next dimension
					current_local_input_position[j] = 0;
					offset -= static_cast<int>(window_sizes[j] * input_slices[j]);
				}
			}

			const unsigned int output_feature_map_count = output_configuration_specific.feature_map_count;
			const unsigned int input_feature_map_count = input_configuration_specific.feature_map_count;
			// One work item per (entry, output feature map) pair
			const int total_workload = updater_count * output_feature_map_count;
			const std::vector<unsigned int>::const_iterator output_dimension_sizes_it = output_configuration_specific.dimension_sizes.begin();
			const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin();
			const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin();

			#pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count)
			{
				// Per-thread position counter inside the output feature map
				nnforge_array<unsigned int, max_dimension_count> current_output_position;

				#pragma omp for schedule(guided)
				for(int workload_id = 0; workload_id < total_workload; ++workload_id)
				{
					int entry_id = workload_id / output_feature_map_count;
					int output_feature_map_id = workload_id - (entry_id * output_feature_map_count);

					std::vector<float>::iterator out_it_base = out_it_global + (entry_id * output_neuron_count) + (output_feature_map_id * output_neuron_count_per_feature_map);
					std::vector<float>::const_iterator in_it_base = in_it_global + entry_id * input_neuron_count;

					// Range of column_indices positions connected to this output feature map
					const int start_column_index = row_indices[output_feature_map_id];
					const int end_column_index = row_indices[output_feature_map_id + 1];

					std::fill_n(current_output_position.begin(), dimension_count, 0);
					for(std::vector<float>::iterator out_it = out_it_base; out_it != out_it_base + output_neuron_count_per_feature_map; ++out_it)
					{
						float sum = *(biases + output_feature_map_id);
						// Weights are consumed sequentially across all connected input feature maps
						std::vector<float>::const_iterator weights_it = weights + start_column_index * const_window_elem_count;
						// Window origin for the current output position (feature map offset added below)
						std::vector<float>::const_iterator in_it_base2 = in_it_base;
						for(unsigned int i = 0; i < dimension_count; ++i)
							in_it_base2 += current_output_position[i] * (*(input_slices_it + i));

						for(int column_index = start_column_index; column_index < end_column_index; ++column_index)
						{
							int input_feature_map_id = column_indices[column_index];

							// Define the starting position of the first input elem
							std::vector<float>::const_iterator in_it = in_it_base2 + (input_feature_map_id * input_neuron_count_per_feature_map);

							for(unsigned int i = 0; i < const_window_elem_count; ++i)
							{
								sum += (*(in_it + *(offset_list_it + i))) * (*weights_it);
								++weights_it;
							}
						}
						*out_it = sum;

						// Go to the next output element
						for(unsigned int i = 0; i < dimension_count; ++i)
						{
							if ((++current_output_position[i]) < *(output_dimension_sizes_it + i))
								break;
							current_output_position[i] = 0;
						}
					}
				}
			}
		}
		// Adds the weight and bias gradients of a sparse convolution layer to `gradient`.
		//
		// Connectivity comes from data_custom: (*data_custom)[1] gives, per output feature map, the
		// [start, end) range of entries in (*data_custom)[0]; each of those entries names a connected
		// input feature map. The k-th connection enumerated that way owns the k-th block of
		// window-element-count values in (*gradient)[0]. (*gradient)[1] holds one value per output
		// feature map. Both are accumulated into (+=), not overwritten.
		//
		// input_neurons is read starting at entry offset_input_entry_id, output_errors from its beginning;
		// updater_count entries are processed. additional_buffers is not used here.
		void sparse_convolution_layer_updater_plain::update_weights(
			const_additional_buffer_smart_ptr input_neurons,
			const_additional_buffer_smart_ptr output_errors,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			layer_data_smart_ptr gradient,
			const_layer_data_custom_smart_ptr data_custom,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count,
			unsigned int offset_input_entry_id) const
		{
			const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count();
			const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map();
			const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count();
			const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map();

			const std::vector<float>::const_iterator in_it_global = input_neurons->begin() + input_neuron_count * offset_input_entry_id;
			const std::vector<float>::const_iterator out_err_it_global = output_errors->begin();

			nnforge_shared_ptr<const sparse_convolution_layer> sparse_layer = nnforge_dynamic_pointer_cast<const sparse_convolution_layer>(layer_schema);
			const std::vector<unsigned int>& window_sizes = sparse_layer->window_sizes;
			const unsigned int connection_count = sparse_layer->feature_map_connection_count;
			const unsigned int dimension_count = static_cast<unsigned int>(window_sizes.size());

			// input_slices[d]: distance in elements between neighbours along dimension d of one input feature map
			std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size());
			input_slices[0] = 1;
			for(unsigned int d = 0; d < dimension_count - 1; ++d)
				input_slices[d + 1] = input_slices[d] * input_configuration_specific.dimension_sizes[d];

			unsigned int window_elem_count = 1;
			for(std::vector<unsigned int>::const_iterator size_it = window_sizes.begin(); size_it != window_sizes.end(); ++size_it)
				window_elem_count *= *size_it;
			const unsigned int const_window_elem_count = window_elem_count;

			const std::vector<float>::iterator gradient_weights = (*gradient)[0].begin();
			const std::vector<float>::iterator gradient_biases = (*gradient)[1].begin();

			const std::vector<int>::const_iterator column_indices = (*data_custom)[0].begin();
			const std::vector<int>::const_iterator row_indices = (*data_custom)[1].begin();

			// Flatten the connectivity into a list of (output feature map, input feature map) pairs
			std::vector<std::pair<int, int> > out_fm_in_fm_list(connection_count);
			int connection_id = 0;
			for(int output_feature_map_id = 0; output_feature_map_id < output_configuration_specific.feature_map_count; ++output_feature_map_id)
			{
				const int column_end = row_indices[output_feature_map_id + 1];
				for(int column_index = row_indices[output_feature_map_id]; column_index < column_end; ++column_index, ++connection_id)
				{
					std::pair<int, int>& connection = out_fm_in_fm_list[connection_id];
					connection.first = output_feature_map_id;
					connection.second = column_indices[column_index];
				}
			}

			// offset_list[k]: input offset of the k-th window element relative to the window origin,
			// enumerating window positions with dimension 0 varying fastest
			std::vector<unsigned int> window_position(dimension_count, 0);
			std::vector<unsigned int> offset_list(window_elem_count);
			for(unsigned int elem_id = 1; elem_id < window_elem_count; ++elem_id)
			{
				// Advance to the next position inside the window
				for(unsigned int d = 0; d < dimension_count; ++d)
				{
					if ((++window_position[d]) < window_sizes[d])
						break;
					window_position[d] = 0;
				}

				unsigned int offset = 0;
				for(unsigned int d = 0; d < dimension_count; ++d)
					offset += window_position[d] * input_slices[d];
				offset_list[elem_id] = offset;
			}

			const unsigned int output_feature_map_count = output_configuration_specific.feature_map_count;
			const int total_workload = connection_count;
			const std::vector<unsigned int>::const_iterator output_dimension_sizes_it = output_configuration_specific.dimension_sizes.begin();
			const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin();
			const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin();
			const std::vector<std::pair<int, int> >::const_iterator out_fm_in_fm_it = out_fm_in_fm_list.begin();
			const int const_updater_count = updater_count;

			// Weight gradients: one work item per connection, each writing only its own gradient block
			#pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count)
			{
				nnforge_array<unsigned int, max_dimension_count> output_position;
				std::vector<float> block_gradient(const_window_elem_count, 0.0F);

				#pragma omp for schedule(guided)
				for(int workload_id = 0; workload_id < total_workload; ++workload_id)
				{
					const int output_feature_map_id = out_fm_in_fm_it[workload_id].first;
					const int input_feature_map_id = out_fm_in_fm_it[workload_id].second;

					// The per-thread accumulator is reused across work items, so reset it first
					std::fill_n(block_gradient.begin(), const_window_elem_count, 0.0F);

					for(int entry_id = 0; entry_id < const_updater_count; ++entry_id)
					{
						const std::vector<float>::const_iterator in_it_base = in_it_global + (entry_id * input_neuron_count) + (input_feature_map_id * input_neuron_count_per_feature_map);
						const std::vector<float>::const_iterator out_err_begin = out_err_it_global + (entry_id * output_neuron_count) + (output_feature_map_id * output_neuron_count_per_feature_map);
						const std::vector<float>::const_iterator out_err_end = out_err_begin + output_neuron_count_per_feature_map;

						std::fill_n(output_position.begin(), dimension_count, 0);
						for(std::vector<float>::const_iterator out_err_it = out_err_begin; out_err_it != out_err_end; ++out_err_it)
						{
							// Input element under the window origin for the current output position
							std::vector<float>::const_iterator window_origin = in_it_base;
							for(unsigned int d = 0; d < dimension_count; ++d)
								window_origin += output_position[d] * input_slices_it[d];

							const float current_err = *out_err_it;
							for(unsigned int k = 0; k < const_window_elem_count; ++k)
								block_gradient[k] += (window_origin[offset_list_it[k]] * current_err);

							// Go to the next output element
							for(unsigned int d = 0; d < dimension_count; ++d)
							{
								if ((++output_position[d]) < output_dimension_sizes_it[d])
									break;
								output_position[d] = 0;
							}
						}
					}

					// Add the accumulated values to this connection's block of the weight gradient
					const std::vector<float>::iterator gradient_block = gradient_weights + workload_id * const_window_elem_count;
					for(unsigned int k = 0; k < const_window_elem_count; ++k)
						gradient_block[k] += block_gradient[k];
				}
			}

			// Bias gradients: one work item per output feature map, summing all of its output errors
			const int total_workload_bias = output_feature_map_count;
			#pragma omp parallel for default(none) schedule(guided) num_threads(plain_config->openmp_thread_count)
			for(int workload_id = 0; workload_id < total_workload_bias; ++workload_id)
			{
				float error_sum = 0.0F;
				for(int entry_id = 0; entry_id < const_updater_count; ++entry_id)
				{
					const std::vector<float>::const_iterator out_err_begin = out_err_it_global + (entry_id * output_neuron_count) + (workload_id * output_neuron_count_per_feature_map);
					const std::vector<float>::const_iterator out_err_end = out_err_begin + output_neuron_count_per_feature_map;
					for(std::vector<float>::const_iterator out_err_it = out_err_begin; out_err_it != out_err_end; ++out_err_it)
						error_sum += *out_err_it;
				}

				gradient_biases[workload_id] += error_sum;
			}
		}
		// Computes the input errors of a sparse convolution layer from its output errors.
		//
		// For every entry and input feature map, the matching slice of input_errors is zeroed and then,
		// for each output position and each output feature map connected to that input feature map,
		// receives the output error multiplied by the connection's window weights from (*data)[0].
		// Connectivity is read from (*data_custom)[0] (input feature map per connection) and
		// (*data_custom)[1] (per output feature map range of connections).
		// input_neurons, output_neurons and additional_buffers are not used here.
		void sparse_convolution_layer_updater_plain::backprop(
			additional_buffer_smart_ptr input_errors,
			const_additional_buffer_smart_ptr input_neurons,
			const_additional_buffer_smart_ptr output_errors,
			const_additional_buffer_smart_ptr output_neurons,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count) const
		{
			const std::vector<float>::iterator in_err_it_global = input_errors->begin();
			const std::vector<float>::const_iterator out_err_it_global = output_errors->begin();
			const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count();
			const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map();
			const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count();
			const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map();

			nnforge_shared_ptr<const sparse_convolution_layer> sparse_layer = nnforge_dynamic_pointer_cast<const sparse_convolution_layer>(layer_schema);
			const std::vector<unsigned int>& window_sizes = sparse_layer->window_sizes;
			const unsigned int dimension_count = static_cast<unsigned int>(window_sizes.size());

			// input_slices[d]: distance in elements between neighbours along dimension d of one input feature map
			std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size());
			input_slices[0] = 1;
			for(unsigned int d = 0; d < dimension_count - 1; ++d)
				input_slices[d + 1] = input_slices[d] * input_configuration_specific.dimension_sizes[d];

			unsigned int window_elem_count = 1;
			for(std::vector<unsigned int>::const_iterator size_it = window_sizes.begin(); size_it != window_sizes.end(); ++size_it)
				window_elem_count *= *size_it;
			const unsigned int const_window_elem_count = window_elem_count;

			const std::vector<float>::const_iterator weights = (*data)[0].begin();

			const std::vector<int>::const_iterator column_indices = (*data_custom)[0].begin();
			const std::vector<int>::const_iterator row_indices = (*data_custom)[1].begin();

			// Invert the connectivity: for each input feature map, list the
			// (output feature map, weight block index) pairs that read from it
			std::vector<std::vector<std::pair<int, int> > > in_fm_out_fm_weight_pos_list_list(input_configuration_specific.feature_map_count);
			for(int output_feature_map_id = 0; output_feature_map_id < output_configuration_specific.feature_map_count; ++output_feature_map_id)
			{
				const int column_end = row_indices[output_feature_map_id + 1];
				for(int column_index = row_indices[output_feature_map_id]; column_index < column_end; ++column_index)
				{
					std::vector<std::pair<int, int> >& readers = in_fm_out_fm_weight_pos_list_list[column_indices[column_index]];
					readers.push_back(std::make_pair(output_feature_map_id, column_index));
				}
			}

			// offset_list[k]: offset of the k-th window element relative to the window origin,
			// enumerating window positions with dimension 0 varying fastest
			std::vector<unsigned int> window_position(dimension_count, 0);
			std::vector<unsigned int> offset_list(window_elem_count);
			for(unsigned int elem_id = 1; elem_id < window_elem_count; ++elem_id)
			{
				// Advance to the next position inside the window
				for(unsigned int d = 0; d < dimension_count; ++d)
				{
					if ((++window_position[d]) < window_sizes[d])
						break;
					window_position[d] = 0;
				}

				unsigned int offset = 0;
				for(unsigned int d = 0; d < dimension_count; ++d)
					offset += window_position[d] * input_slices[d];
				offset_list[elem_id] = offset;
			}

			const unsigned int input_feature_map_count = input_configuration_specific.feature_map_count;
			const int total_workload = updater_count * input_feature_map_count;
			const std::vector<unsigned int>::const_iterator output_dimension_sizes_it = output_configuration_specific.dimension_sizes.begin();
			const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin();
			const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin();
			const std::vector<std::vector<std::pair<int, int> > >::const_iterator in_fm_out_fm_weight_pos_it = in_fm_out_fm_weight_pos_list_list.begin();

			// One work item per (entry, input feature map); each writes only its own slice of input_errors
			#pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count)
			{
				nnforge_array<unsigned int, max_dimension_count> output_position;

				#pragma omp for schedule(guided)
				for(int workload_id = 0; workload_id < total_workload; ++workload_id)
				{
					const int entry_id = workload_id / input_feature_map_count;
					const int input_feature_map_id = workload_id % input_feature_map_count;

					const std::vector<float>::const_iterator out_err_begin = out_err_it_global + (entry_id * output_neuron_count);
					const std::vector<float>::const_iterator out_err_end = out_err_begin + output_neuron_count_per_feature_map;
					const std::vector<float>::iterator in_err_block = in_err_it_global + (entry_id * input_neuron_count) + (input_feature_map_id * input_neuron_count_per_feature_map);
					const std::vector<std::pair<int, int> >& readers = in_fm_out_fm_weight_pos_it[input_feature_map_id];

					std::fill_n(in_err_block, input_neuron_count_per_feature_map, 0.0F);
					std::fill_n(output_position.begin(), dimension_count, 0);

					// out_err_pos walks the output positions using the first output feature map's slice;
					// the slice of a specific output feature map is reached by adding its offset below
					for(std::vector<float>::const_iterator out_err_pos = out_err_begin; out_err_pos != out_err_end; ++out_err_pos)
					{
						// Input error element under the window origin for the current output position
						std::vector<float>::iterator window_origin = in_err_block;
						for(unsigned int d = 0; d < dimension_count; ++d)
							window_origin += output_position[d] * input_slices_it[d];

						for(std::vector<std::pair<int, int> >::const_iterator reader_it = readers.begin(); reader_it != readers.end(); ++reader_it)
						{
							const int output_feature_map_id = reader_it->first;
							const int weight_block_id = reader_it->second;

							const float current_err = *(out_err_pos + (output_feature_map_id * output_neuron_count_per_feature_map));
							const std::vector<float>::const_iterator block_weights = weights + weight_block_id * const_window_elem_count;
							for(unsigned int k = 0; k < const_window_elem_count; ++k)
								window_origin[offset_list_it[k]] += (block_weights[k] * current_err);
						}

						// Go to the next output element
						for(unsigned int d = 0; d < dimension_count; ++d)
						{
							if ((++output_position[d]) < output_dimension_sizes_it[d])
								break;
							output_position[d] = 0;
						}
					}
				}
			}
		}
		// Forward pass of the subtractive local contrast layer.
		//
		// For every feature map listed in the layer's feature_maps_affected, a separable weighted window
		// (one weight list per dimension, from window_weights_list) is applied dimension by dimension,
		// mirroring positions that fall outside the feature map, and the result is subtracted from the
		// input. Feature maps listed in feature_maps_unaffected are copied to the output unchanged.
		//
		// additional_buffers supplies per-thread scratch space: buffer [thread_id] and, when there is more
		// than one dimension, buffer [openmp_thread_count + thread_id].
		// Throws neural_network_exception when offset_input_entry_id is non-zero.
		// data and data_custom are not used here.
		void local_contrast_subtractive_layer_updater_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count,
			unsigned int offset_input_entry_id) const
		{
			if (offset_input_entry_id > 0)
				throw neural_network_exception("local_contrast_subtractive_layer_updater_plain is not able to run using offset");

			const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count();
			const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map();
			const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map();
			nnforge_shared_ptr<const local_contrast_subtractive_layer> layer_derived = nnforge_dynamic_pointer_cast<const local_contrast_subtractive_layer>(layer_schema);
			const std::vector<std::vector<float> >& window_weights_list = layer_derived->window_weights_list;
			const std::vector<unsigned int>& feature_maps_affected = layer_derived->feature_maps_affected;
			const std::vector<unsigned int>& feature_maps_unaffected = layer_derived->feature_maps_unaffected;
			const unsigned int dimension_count = static_cast<unsigned int>(window_weights_list.size());

			// input_slices[d]: distance in elements between neighbours along dimension d of one feature map
			std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size());
			input_slices[0] = 1;
			for(unsigned int i = 0; i < dimension_count - 1; ++i)
				input_slices[i + 1] = input_slices[i] * input_configuration_specific.dimension_sizes[i];

			const std::vector<unsigned int>::const_iterator dimension_sizes_it = output_configuration_specific.dimension_sizes.begin();
			const unsigned int feature_maps_affected_count = static_cast<unsigned int>(feature_maps_affected.size());
			// Must come from the unaffected list: it guards the pass-through copy at the end, which has to
			// run even when no feature map is affected
			const unsigned int feature_maps_unaffected_count = static_cast<unsigned int>(feature_maps_unaffected.size());
			const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin();
			const std::vector<unsigned int>::const_iterator feature_maps_affected_it = feature_maps_affected.begin();
			const std::vector<float>::const_iterator input_buffer_it = input_buffer->begin();
			const std::vector<float>::iterator output_buffer_it = output_buffer->begin();
			const std::vector<std::vector<float> >::const_iterator window_weights_list_it = window_weights_list.begin();

			// One work item per (entry, affected feature map)
			const int total_workload = updater_count * feature_maps_affected_count;
			const int openmp_thread_count = plain_config->openmp_thread_count;
			
			#pragma omp parallel default(none) shared(additional_buffers) num_threads(openmp_thread_count)
			{
				// Scratch buffers owned by this thread; the per-dimension passes alternate between them
				std::vector<additional_buffer_smart_ptr> local_additional_buffers;
				int thread_id = 0;
				#ifdef _OPENMP
				thread_id = omp_get_thread_num();
				#endif

				local_additional_buffers.push_back(additional_buffers[thread_id]);
				if (dimension_count > 1)
					local_additional_buffers.push_back(additional_buffers[openmp_thread_count + thread_id]);

				#pragma omp for schedule(guided)
				for(int workload_id = 0; workload_id < total_workload; ++workload_id)
				{
					int entry_id = workload_id / feature_maps_affected_count;
					int affected_feature_map_id = workload_id - (entry_id * feature_maps_affected_count);

					unsigned int current_output_buffer_index = 0;
					unsigned int feature_map_id = *(feature_maps_affected_it + affected_feature_map_id);
					for(unsigned int dimension_id = 0; dimension_id < dimension_count; ++dimension_id)
					{
						// The first pass reads the input feature map; later passes read the previous pass's output
						std::vector<float>::iterator out_it_base = local_additional_buffers[current_output_buffer_index]->begin();
						std::vector<float>::const_iterator in_it;
						if (dimension_id > 0)
							in_it = local_additional_buffers[1 - current_output_buffer_index]->begin();
						else
							in_it = input_buffer_it + (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map);
						int max_output_size = *(dimension_sizes_it + dimension_id);
						int input_slice_size = *(input_slices_it + dimension_id);

						std::vector<unsigned int> current_output_position(dimension_count, 0);
						for(std::vector<float>::iterator out_it = out_it_base; out_it != out_it_base + output_neuron_count_per_feature_map; ++out_it, ++in_it)
						{
							// Weight 0 applies to the centre element; weight k applies to both neighbours at distance k
							const std::vector<float>& current_window_weights_list = *(window_weights_list_it + dimension_id);
							float sum = *in_it * current_window_weights_list[0];

							int current_position = static_cast<int>(current_output_position[dimension_id]);
							int dest_forward = current_position;
							int dest_backward = dest_forward;
							for (std::vector<float>::const_iterator it = current_window_weights_list.begin() + 1; it != current_window_weights_list.end(); ++it)
							{
								dest_forward++;
								dest_backward--;
								// Positions past either border are mirrored back into the feature map
								int dest_forward_actual = (dest_forward < max_output_size) ? dest_forward : (((max_output_size << 1) - 1) - dest_forward);
								int dest_backward_actual = (dest_backward >= 0) ? dest_backward : (-1 - dest_backward);
								int offset_forward = ((dest_forward_actual - current_position) * input_slice_size);
								int offset_backward = ((dest_backward_actual - current_position) * input_slice_size);
								sum += (*(in_it + offset_forward) + *(in_it + offset_backward)) * (*it);
							}

							*out_it = sum;

							// Go to the next output element
							for(unsigned int i = 0; i < dimension_count; ++i)
							{
								if ((++current_output_position[i]) < *(dimension_sizes_it + i))
									break;
								current_output_position[i] = 0;
							}
						}

						current_output_buffer_index = 1 - current_output_buffer_index;
					} // for(unsigned int dimension_id

					// Subtract the gaussian blur
					{
						std::vector<float>::const_iterator original_in_it = input_buffer_it + (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map);
						std::vector<float>::iterator out_it = output_buffer_it + (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map);
						// The index was flipped after the last pass, so this selects the buffer written last
						std::vector<float>::const_iterator in_it = local_additional_buffers[1 - current_output_buffer_index]->begin();
						for(int i = 0; i < static_cast<int>(input_neuron_count_per_feature_map); ++i)
							*(out_it + i) = *(original_in_it + i) - *(in_it + i);
					}
				}
			} // #pragma parallel

			// Feature maps that are not affected are passed through unchanged
			if (feature_maps_unaffected_count > 0)
			{
				for(unsigned int entry_id = 0; entry_id < updater_count; ++entry_id)
				{
					for(std::vector<unsigned int>::const_iterator it = feature_maps_unaffected.begin(); it != feature_maps_unaffected.end(); ++it)
					{
						unsigned int feature_map_id = *it;
						std::vector<float>::const_iterator original_in_it = input_buffer_it + (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map);
						std::vector<float>::iterator out_it = output_buffer_it + (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map);
						std::copy(original_in_it, original_in_it + input_neuron_count_per_feature_map, out_it);
					}
				}
			}
		}
		// Forward pass of the average subsampling layer.
		//
		// Each output element is the mean of the input elements in its subsampling window; the window for
		// an output position starts at that position multiplied by the subsampling size in each dimension.
		// Processes entry_count entries. additional_buffers and data are not used here.
		void average_subsampling_layer_hessian_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int entry_count) const
		{
			const std::vector<float>::const_iterator in_it_global = input_buffer->begin();
			const std::vector<float>::iterator out_it_global = output_buffer->begin();
			const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count();
			const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map();
			const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count();
			const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map();

			std::tr1::shared_ptr<const average_subsampling_layer> subsampling_layer = std::tr1::dynamic_pointer_cast<const average_subsampling_layer>(layer_schema);
			const std::vector<unsigned int>& subsampling_sizes = subsampling_layer->subsampling_sizes;
			const unsigned int dimension_count = static_cast<unsigned int>(subsampling_layer->subsampling_sizes.size());

			// input_slices[d]: distance in elements between neighbours along dimension d of one input feature map
			std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size());
			input_slices[0] = 1;
			for(unsigned int d = 0; d < dimension_count - 1; ++d)
				input_slices[d + 1] = input_slices[d] * input_configuration_specific.dimension_sizes[d];

			unsigned int subsampling_elem_count = 1;
			for(std::vector<unsigned int>::const_iterator size_it = subsampling_sizes.begin(); size_it != subsampling_sizes.end(); ++size_it)
				subsampling_elem_count *= *size_it;
			const unsigned int const_subsampling_elem_count = subsampling_elem_count;
			const float mult = 1.0F / static_cast<float>(subsampling_elem_count);
			const unsigned int feature_map_count = output_configuration_specific.feature_map_count;

			// offset_list[k]: input offset of the k-th window element relative to the window origin,
			// enumerating window positions with dimension 0 varying fastest
			std::vector<unsigned int> window_position(dimension_count, 0);
			std::vector<unsigned int> offset_list(subsampling_elem_count);
			for(unsigned int elem_id = 1; elem_id < subsampling_elem_count; ++elem_id)
			{
				// Advance to the next position inside the window
				for(unsigned int d = 0; d < dimension_count; ++d)
				{
					if ((++window_position[d]) < subsampling_sizes[d])
						break;
					window_position[d] = 0;
				}

				unsigned int offset = 0;
				for(unsigned int d = 0; d < dimension_count; ++d)
					offset += window_position[d] * input_slices[d];
				offset_list[elem_id] = offset;
			}

			// One work item per (entry, feature map)
			const int total_workload = entry_count * output_configuration_specific.feature_map_count;
			const std::vector<unsigned int>::const_iterator dimension_sizes_it = output_configuration_specific.dimension_sizes.begin();
			const std::vector<unsigned int>::const_iterator subsampling_sizes_it = subsampling_sizes.begin();
			const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin();
			const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin();

			#pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count)
			{
				std::tr1::array<unsigned int, max_dimension_count> output_position;

				#pragma omp for schedule(guided)
				for(int workload_id = 0; workload_id < total_workload; ++workload_id)
				{
					const int entry_id = workload_id / feature_map_count;
					const int feature_map_id = workload_id % feature_map_count;

					const std::vector<float>::const_iterator in_it_base = in_it_global + (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map);
					const std::vector<float>::iterator out_begin = out_it_global + (entry_id * output_neuron_count) + (feature_map_id * output_neuron_count_per_feature_map);
					const std::vector<float>::iterator out_end = out_begin + output_neuron_count_per_feature_map;

					std::fill_n(output_position.begin(), dimension_count, 0);
					for(std::vector<float>::iterator out_it = out_begin; out_it != out_end; ++out_it)
					{
						// Define the starting position of the first input elem
						std::vector<float>::const_iterator window_origin = in_it_base;
						for(unsigned int d = 0; d < dimension_count; ++d)
							window_origin += output_position[d] * subsampling_sizes_it[d] * input_slices_it[d];

						float window_sum = 0.0F;
						for(unsigned int k = 0; k < const_subsampling_elem_count; ++k)
							window_sum += window_origin[offset_list_it[k]];
						*out_it = window_sum * mult;

						// Go to the next output element
						for(unsigned int d = 0; d < dimension_count; ++d)
						{
							if ((++output_position[d]) < dimension_sizes_it[d])
								break;
							output_position[d] = 0;
						}
					}
				}
			}
		}
		// Forward pass of the max subsampling layer.
		//
		// Each output element is the maximum of the input elements in its subsampling window. For every
		// output element the position of the winning input element, as an offset from the beginning of
		// input_buffer, is stored in the matching slot of additional_buffers[0]; the unsigned int value is
		// written into the float slot's storage, not converted to float.
		//
		// Throws neural_network_exception when the layer has tiling enabled or when
		// offset_input_entry_id is non-zero. data, data_custom and force_deterministic are not used here.
		void max_subsampling_layer_updater_plain::test(
			const_additional_buffer_smart_ptr input_buffer,
			additional_buffer_smart_ptr output_buffer,
			std::vector<additional_buffer_smart_ptr>& additional_buffers,
			plain_running_configuration_const_smart_ptr plain_config,
			const_layer_smart_ptr layer_schema,
			const_layer_data_smart_ptr data,
			const_layer_data_custom_smart_ptr data_custom,
			const layer_configuration_specific& input_configuration_specific,
			const layer_configuration_specific& output_configuration_specific,
			unsigned int updater_count,
			unsigned int offset_input_entry_id,
			bool force_deterministic) const
		{
			nnforge_shared_ptr<const max_subsampling_layer> subsampling_layer = nnforge_dynamic_pointer_cast<const max_subsampling_layer>(layer_schema);

			if (subsampling_layer->tiling)
				throw neural_network_exception("max_subsampling_layer_updater_plain is not able to run for max subsampling layer with tiling");

			if (offset_input_entry_id > 0)
				throw neural_network_exception("max_subsampling_layer_updater_plain is not able to run using offset");

			const std::vector<float>::const_iterator in_it_global = input_buffer->begin();
			const std::vector<float>::iterator out_it_global = output_buffer->begin();
			const std::vector<float>::iterator max_indexes_it_global = additional_buffers[0]->begin();
			const unsigned int input_neuron_count = input_configuration_specific.get_neuron_count();
			const unsigned int input_neuron_count_per_feature_map = input_configuration_specific.get_neuron_count_per_feature_map();
			const unsigned int output_neuron_count = output_configuration_specific.get_neuron_count();
			const unsigned int output_neuron_count_per_feature_map = output_configuration_specific.get_neuron_count_per_feature_map();
			const std::vector<unsigned int>& subsampling_sizes = subsampling_layer->subsampling_sizes;
			const unsigned int dimension_count = static_cast<unsigned int>(subsampling_layer->subsampling_sizes.size());

			// input_slices[d]: distance in elements between neighbours along dimension d of one input feature map
			std::vector<unsigned int> input_slices(input_configuration_specific.dimension_sizes.size());
			input_slices[0] = 1;
			for(unsigned int d = 0; d < dimension_count - 1; ++d)
				input_slices[d + 1] = input_slices[d] * input_configuration_specific.dimension_sizes[d];

			unsigned int subsampling_elem_count = 1;
			for(std::vector<unsigned int>::const_iterator size_it = subsampling_sizes.begin(); size_it != subsampling_sizes.end(); ++size_it)
				subsampling_elem_count *= *size_it;
			const unsigned int const_subsampling_elem_count = subsampling_elem_count;
			const unsigned int feature_map_count = output_configuration_specific.feature_map_count;

			// offset_list[k]: input offset of the k-th window element relative to the window origin,
			// enumerating window positions with dimension 0 varying fastest
			std::vector<unsigned int> window_position(dimension_count, 0);
			std::vector<unsigned int> offset_list(subsampling_elem_count);
			for(unsigned int elem_id = 1; elem_id < subsampling_elem_count; ++elem_id)
			{
				// Advance to the next position inside the window
				for(unsigned int d = 0; d < dimension_count; ++d)
				{
					if ((++window_position[d]) < subsampling_sizes[d])
						break;
					window_position[d] = 0;
				}

				unsigned int offset = 0;
				for(unsigned int d = 0; d < dimension_count; ++d)
					offset += window_position[d] * input_slices[d];
				offset_list[elem_id] = offset;
			}

			// One work item per (entry, feature map)
			const int total_workload = updater_count * output_configuration_specific.feature_map_count;
			const std::vector<unsigned int>::const_iterator dimension_sizes_it = output_configuration_specific.dimension_sizes.begin();
			const std::vector<unsigned int>::const_iterator subsampling_sizes_it = subsampling_sizes.begin();
			const std::vector<unsigned int>::const_iterator input_slices_it = input_slices.begin();
			const std::vector<unsigned int>::const_iterator offset_list_it = offset_list.begin();

			#pragma omp parallel default(none) num_threads(plain_config->openmp_thread_count)
			{
				nnforge_array<unsigned int, max_dimension_count> output_position;

				#pragma omp for schedule(guided)
				for(int workload_id = 0; workload_id < total_workload; ++workload_id)
				{
					const int entry_id = workload_id / feature_map_count;
					const int feature_map_id = workload_id % feature_map_count;

					const int in_base_offset = (entry_id * input_neuron_count) + (feature_map_id * input_neuron_count_per_feature_map);
					const std::vector<float>::iterator out_it_base = out_it_global + (entry_id * output_neuron_count) + (feature_map_id * output_neuron_count_per_feature_map);
					const std::vector<float>::iterator max_indexes_it_base = max_indexes_it_global + (entry_id * output_neuron_count) + (feature_map_id * output_neuron_count_per_feature_map);

					std::fill_n(output_position.begin(), dimension_count, 0);
					for(unsigned int output_neuron_id = 0; output_neuron_id < output_neuron_count_per_feature_map; ++output_neuron_id)
					{
						// Define the starting position of the first input elem
						int window_origin_offset = in_base_offset;
						for(unsigned int d = 0; d < dimension_count; ++d)
							window_origin_offset += output_position[d] * subsampling_sizes_it[d] * input_slices_it[d];

						// The first window element is always taken; later ones only when strictly greater
						unsigned int max_index = 0;
						float best_val = -1.0e38F;
						for(unsigned int k = 0; k < const_subsampling_elem_count; ++k)
						{
							const int candidate_offset = window_origin_offset + offset_list_it[k];
							const float candidate_val = in_it_global[candidate_offset];
							if ((k > 0) && !(candidate_val > best_val))
								continue;
							best_val = candidate_val;
							max_index = candidate_offset;
						}
						out_it_base[output_neuron_id] = best_val;
						// Store the index's bits in the float slot
						*((unsigned int *)(&(max_indexes_it_base[output_neuron_id]))) = max_index;

						// Go to the next output element
						for(unsigned int d = 0; d < dimension_count; ++d)
						{
							if ((++output_position[d]) < dimension_sizes_it[d])
								break;
							output_position[d] = 0;
						}
					}
				}
			}
		}