// Standard library and Boost.Compute headers used below. The neu library's own
// headers are not shown here; their include paths depend on how neu is installed.
#include <iostream>
#include <random>
#include <tuple>
#include <vector>

#include <boost/compute.hpp>

int main(int argc, char** argv) {
	std::cout << "hello world" << std::endl;

	std::random_device rand{};

	auto input_dim = 2u;
	auto output_dim = 1u;
	auto batch_size = 4u;

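	// XOR training set: four 2-d input patterns and their 1-d targets,
	// built on the host first.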
	std::vector<neu::cpu_vector> cpu_input = {
		{0.f, 0.f}, {1.f, 0.f}, {0.f, 1.f}, {1.f, 1.f}
	};
	std::vector<neu::cpu_vector> cpu_teach = {
		{0.f}, {1.f}, {1.f}, {0.f}
	};

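	// Flatten the host-side samples into contiguous GPU vectors (one batch of 4).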
	neu::gpu_vector input;
	for(auto const& cpui : cpu_input) {
		input.insert(input.end(), cpui.begin(), cpui.end());
	}
	neu::gpu_vector teach;
	for(auto const& cput : cpu_teach) {
		teach.insert(teach.end(), cput.begin(), cput.end());
	}

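	// Two fully connected layers: input_dim -> 3 with a rectifier activation,
	// then 3 -> output_dim with a sigmoid output.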
	auto layers = std::make_tuple(
		neu::make_full_connected_layer(input_dim, 3, batch_size, neu::rectifier()),
		neu::make_full_connected_layer(3, output_dim, batch_size, neu::sigmoid())
	);

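	// Initialize every layer's weights with uniform random values in [-1, 1).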
	std::uniform_real_distribution<> bin{-1,1};
	neu::tuple_foreach(layers, [&rand, &bin](auto& l){
		l.init_weight_randomly([&rand, &bin]() { return bin(rand); }); });
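
	// Training loop: forward pass, output error, backpropagation of deltas,
	// and weight update; the squared-error sum is printed every 100 iterations.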
	for(auto i = 0u; i < 1000u; ++i) {
		neu::feedforward_x(layers, input);

		const auto y = neu::layer_get_y(neu::tuple_back(layers));
		neu::gpu_vector errors(y.size());
		boost::compute::transform(y.begin(), y.end(), teach.begin(), errors.begin(),
			boost::compute::minus<neu::scalar>());
		neu::feedback_delta(layers, errors);

		neu::update_delta_weight(layers, input);
		neu::update_weight(layers);

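		// Monitoring only: square each output error on the GPU and sum them.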
		neu::gpu_vector squared_errors(errors.size());
		boost::compute::transform(errors.begin(), errors.end(),
			errors.begin(), squared_errors.begin(),
			boost::compute::multiplies<neu::scalar>());
		auto error_sum = boost::compute::accumulate(
			squared_errors.begin(), squared_errors.end(), 0.0f);
		if(i%100 == 0) {
			std::cout << i << ":" << error_sum << std::endl;
		}
	}
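	// Print the targets followed by the trained network's outputs for comparison.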
	neu::print(teach);
	std::cout << "\n";
	auto y = neu::layer_get_y(neu::tuple_back(layers));
	neu::print(y);
	std::cout << std::endl;
}
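
/// Returns a vector containing the squared error of each training instance,
/// i.e. the sum of squared differences between the multilayer perceptron's
/// outputs and the corresponding targets.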
Vector<double> NormalizedSquaredError::calculate_squared_errors(void) const
{
   // Control check (debug builds only)

   #ifdef __OPENNN_DEBUG__ 

   check();

   #endif

   // Neural network stuff

   const MultilayerPerceptron* multilayer_perceptron_pointer = neural_network_pointer->get_multilayer_perceptron_pointer();

   const size_t inputs_number = multilayer_perceptron_pointer->get_inputs_number();
   const size_t outputs_number = multilayer_perceptron_pointer->get_outputs_number();

   // Data set stuff

   const Instances& instances = data_set_pointer->get_instances();

   const size_t training_instances_number = instances.count_training_instances_number();

   const Vector<size_t> training_indices = instances.arrange_training_indices();

   size_t training_index;

   const Variables& variables = data_set_pointer->get_variables();

   const Vector<size_t> inputs_indices = variables.arrange_inputs_indices();
   const Vector<size_t> targets_indices = variables.arrange_targets_indices();

   // Calculate

   Vector<double> squared_errors(training_instances_number);

   Vector<double> inputs(inputs_number);
   Vector<double> outputs(outputs_number);
   Vector<double> targets(outputs_number);

   // Main loop

   int i = 0;

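   // Each instance's error is computed independently, so the loop is parallelized
   // with OpenMP; the index and work vectors are private to each thread.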
   #pragma omp parallel for private(i, training_index, inputs, outputs, targets)

   for(i = 0; i < (int)training_instances_number; i++)
   {
      training_index = training_indices[i];

       // Input vector

      inputs = data_set_pointer->get_instance(training_index, inputs_indices);

      // Output vector

      outputs = multilayer_perceptron_pointer->calculate_outputs(inputs);

      // Target vector

      targets = data_set_pointer->get_instance(training_index, targets_indices);

      // Error

      squared_errors[i] = outputs.calculate_sum_squared_error(targets);
   }

   return(squared_errors);
}