void RunIrisSupervisedLearning()
{
    // Load training/test dataset
	Table data = ParseCSVFile("data/iris.data");
	// Print out the data to ensure we've loaded it correctly
	std::cout << "Loaded Data:" << std::endl;
    PrintTable(data);


	// Extract feature std::vectors and classes from the loaded data
	std::vector<SampleType> allSamples = GetFeatureVectors(data);
	std::vector<std::string> classes = GetClasses(data);


	// Construct labels compatible with SVMs using the class data
	// Each class is made an integer. The integers grow one number
	// apart, so three classes will be assigned the labels 1, 2 and
	// 3 respectively.
	std::vector<LabelType> allLabels = ConstructLabels(classes);


    // Randomise the samples and labels to ensure the normalisation process
    // does affect the performance of cross validation
    randomize_samples(allSamples, allLabels);


	// Split dataset in half - one half being the training set
	// and one half being the test set.
	// Done AFTER randomising so half of the data set isn't one class
	// and half is the other - would result in a very incorrect classifier!
	unsigned int numTraining = round(allSamples.size() / 2);
	unsigned int numTest = allSamples.size() - numTraining;

	std::vector<SampleType> trainingSamples;
	std::vector<LabelType> trainingLabels;
	trainingSamples.reserve(numTraining);
	trainingLabels.reserve(numTraining);
	std::vector<SampleType> testSamples;
	std::vector<LabelType> testLabels;
	testSamples.reserve(numTest);
	testLabels.reserve(numTest);

	for (unsigned int i = 0; (i < numTraining); ++i)
	{
	    trainingSamples.push_back(allSamples[i]);
	    trainingLabels.push_back(allLabels[i]);
	}
	for (unsigned int i = numTraining; (i < allSamples.size()); ++i)
	{
        testSamples.push_back(allSamples[i]);
        testLabels.push_back(allLabels[i]);
	}


    // Construct a trainer for the problem
    dlib::krr_trainer<KernelType> trainer;
    double bestGamma = FindBestGamma(trainer, trainingSamples, trainingLabels);
    trainer.set_kernel(KernelType(bestGamma));


    // Actually TRAIN the classifier using the data, LEARNING the function
    FunctionType learnedFunction;
    learnedFunction = trainer.train(trainingSamples, trainingLabels);


    // NOTE: This should just print out 1 for our training method
    std::cout << "The number of support vectors in our learned function is "
        << learnedFunction.basis_vectors.nr() << std::endl;

    double accuracy = CalculateAccuracy(learnedFunction, testSamples, testLabels);
    std::cout << "The accuracy of this classifier is: "
        << (accuracy * 100) << "%." << std::endl;
}
    // Trains a binary relation detector from the positive/negative example
    // sentences accumulated in this trainer.  Converts each example into a
    // sparse feature vector, cross-validates to pick the SVM C parameters
    // (when enough data is available), then trains a final linear SVM.
    // Throws via DLIB_CASSERT if either example set is empty.
    binary_relation_detector binary_relation_detector_trainer::
    train (
    ) const
    {
        DLIB_CASSERT(num_positive_examples() > 0, "Not enough training data given.");
        DLIB_CASSERT(num_negative_examples() > 0, "Not enough training data given.");

        std::vector<sparse_vector_type> samples;
        std::vector<double> labels;

        // Positive examples get label +1, negatives -1.  Each sample is the
        // feature vector extracted from the sentence and its two argument
        // spans using the total word feature extractor (tfe).
        for (unsigned long i = 0; i < pos_sentences.size(); ++i)
        {
            samples.push_back(extract_binary_relation(pos_sentences[i], pos_arg1s[i], pos_arg2s[i], tfe).feats);
            labels.push_back(+1);
        }
        for (unsigned long i = 0; i < neg_sentences.size(); ++i)
        {
            samples.push_back(extract_binary_relation(neg_sentences[i], neg_arg1s[i], neg_arg2s[i], tfe).feats);
            labels.push_back(-1);
        }

        // Shuffle so the cross-validation folds below are not one class each.
        randomize_samples(samples, labels);

        const int cv_folds = 6;
        // Objective functor: cross-validated F-beta score as a function of the
        // (log-space) C1/C2 SVM parameters, evaluated with num_threads threads.
        brdt_cv_objective obj(num_threads, cv_folds, beta, samples, labels);

        // params holds (C1, C2).  NOTE: the comma expression is dlib's
        // matrix comma-initializer, not the C comma operator.
        matrix<double,2,1> params;
        params = 5000.0/samples.size(), 5000.0/samples.size();
        // We do the parameter search in log space.
        params = log(params);
        // can't do the parameter search if we don't have enough data.   So if we don't
        // have much data then just use the default parameters.
        if (pos_sentences.size() > (unsigned)cv_folds)
        {
            matrix<double,2,1> lower_params, upper_params;
            lower_params = 1.0/samples.size(), 1.0/samples.size();
            upper_params = 100000.0/samples.size(), 100000.0/samples.size();
            lower_params = log(lower_params);
            upper_params = log(upper_params);
            // BOBYQA trust-region radii: start at 15% of the search box,
            // stop when steps shrink to roughly a factor of 1.2 in C.
            const double rho_begin = min(upper_params-lower_params)*0.15;
            const double rho_end = log(1.2/samples.size()) - log(1.0/samples.size());
            // Derivative-free maximization of the CV score, capped at 200
            // objective evaluations; params is updated in place.
            find_max_bobyqa(obj, params, params.size()*2+1, lower_params, upper_params, rho_begin, rho_end, 200);
        }


        // Note that we rescale the parameters to account for the fact that the cross
        // validation was done on a dataset slightly smaller than the one we ultimately train
        // on and the C parameters of this trainer are not normalized by the number of training
        // samples.
        params = exp(params) * (cv_folds-1.0)/cv_folds;
        svm_c_linear_dcd_trainer<sparse_linear_kernel<sparse_vector_type> > trainer;
        trainer.set_c_class1(params(0));
        trainer.set_c_class2(params(1));
        cout << "using parameters of: " << trans(params);
        cout << "now doing training..." << endl;
        // Train the final detector on ALL the data and stamp it with the
        // relation name and feature-extractor fingerprint so it can be
        // validated against the extractor at detection time.
        binary_relation_detector bd;
        bd.df = trainer.train(samples, labels);
        bd.relation_type = relation_name;
        bd.total_word_feature_extractor_fingerprint = tfe.get_fingerprint();

        // Sanity check: accuracy on the training set itself.
        cout << "test on train: " << test_binary_decision_function(bd.df, samples, labels) << endl;
        return bd;
    }