int main (int argc, char **argv) { Onyx::LinearLaRank::Classifier *classifier; Onyx::Example::Dataset datasetTrain; Onyx::Example::Dataset datasetTest; std::chrono::time_point<std::chrono::system_clock> start, end; // Timings std::string filenameTrainingData; std::string filenameTrainingLabels; std::string filenameTestData; std::string filenameTestLabels; std::string saveClassifier; std::string loadClassifier; bool doTraining = true; // Enable all by default bool doTesting = true; // Enable by default bool doOnlineTesting = false; // Disable by default unsigned int numEpochs = 10; double C = 1.0; double tau = 0.0001; // Random number generator (with random seed) std::mt19937 random_number_generator(std::random_device{}()); // *** Print banner *** std::cout << "Onyx v.1.0, (C) 2015 Rok Mandeljc <*****@*****.**>" << std::endl; std::cout << std::endl; // *** Command-line parser *** boost::program_options::options_description commandLineArguments("Onyx - Online Classifier Application"); boost::program_options::variables_map optionsMap; boost::program_options::positional_options_description positionalArguments; boost::program_options::options_description argArguments("Arguments"); argArguments.add_options() ("help", "produce help message") ("training-data", boost::program_options::value<std::string>(&filenameTrainingData), "name of training .data file") ("training-labels", boost::program_options::value<std::string>(&filenameTrainingLabels), "name of training .labels file") ("test-data", boost::program_options::value<std::string>(&filenameTestData), "name of test .data file") ("test-labels", boost::program_options::value<std::string>(&filenameTestLabels), "name of test .labels file") ("save-classifier", boost::program_options::value<std::string>(&saveClassifier), "optional filename to store classifier") ("load-classifier", boost::program_options::value<std::string>(&loadClassifier), "optional filename to load classifier") ("epochs", boost::program_options::value<unsigned 
int>(&numEpochs)->default_value(numEpochs), "number of epochs for (re)training") ("training", boost::program_options::value<bool>(&doTraining)->default_value(doTraining), "enable training (if training data is available)") ("test", boost::program_options::value<bool>(&doTesting)->default_value(doTesting), "enable testing (if testing data is available)") ("online-test", boost::program_options::value<bool>(&doOnlineTesting)->default_value(doOnlineTesting), "enable on-line testing (if testing data is available)") ; commandLineArguments.add(argArguments); boost::program_options::options_description argParameters("Algorithm parameters"); argParameters.add_options() ("C", boost::program_options::value<double>(&C)->default_value(C), "SVM regularization parameter") ("tau", boost::program_options::value<double>(&tau)->default_value(tau), "tolerance for choosing new support vectors") ; commandLineArguments.add(argParameters); positionalArguments.add("training-data", 1); positionalArguments.add("training-labels", 1); positionalArguments.add("test-data", 1); positionalArguments.add("test-labels", 1); // Parse command-line try { boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(commandLineArguments).positional(positionalArguments).run(), optionsMap); } catch (std::exception &error) { std::cout << commandLineArguments << std::endl << std::endl; std::cout << "Command-line error: " << error.what() << std::endl; return -1; } // Display help? 
if (optionsMap.count("help")) { std::cout << commandLineArguments << std::endl; return 1; } // Validate try { boost::program_options::notify(optionsMap); } catch (std::exception &error) { std::cout << commandLineArguments << std::endl << std::endl; std::cout << "Argument error: " << error.what() << std::endl; return -1; } bool trainingDataAvailable = !filenameTrainingData.empty() && !filenameTrainingLabels.empty(); bool testingDataAvailable = !filenameTestData.empty() && !filenameTestLabels.empty(); doTraining = doTraining && trainingDataAvailable; doTesting = doTesting && testingDataAvailable; doOnlineTesting = doOnlineTesting && testingDataAvailable; if (!doTraining && !doTesting && !doOnlineTesting) { std::cout << "Doing neither training nor testing; nothing to do!" << std::endl; return 1; } if (!doTraining && loadClassifier.empty()) { std::cout << "Neither training dataset nor pre-trained classifier provided!" << std::endl; return -1; } // *** Load datasets *** if (doTraining) { std::cout << "Loading training dataset..." << std::endl; try { datasetTrain.load(filenameTrainingData, filenameTrainingLabels); } catch (std::exception &error) { std::cout << "Failed to load training dataset: " << error.what() << std::endl; return -2; } std::cout << "Loaded training set:" << std::endl; std::cout << " data file: " << filenameTrainingData << std::endl; std::cout << " labels file: " << filenameTrainingLabels << std::endl; std::cout << " samples: " << datasetTrain.numSamples << std::endl; std::cout << " features: " << datasetTrain.numFeatures << std::endl; std::cout << " classes: " << datasetTrain.numClasses << std::endl; std::cout << std::endl; } if (doTesting || doOnlineTesting) { std::cout << "Loading testing dataset..." 
<< std::endl; try { datasetTest.load(filenameTestData, filenameTestLabels); } catch (std::exception &error) { std::cout << "Failed to load testing dataset: " << error.what() << std::endl; return -2; } std::cout << "Loaded testing set:" << std::endl; std::cout << " data file: " << filenameTestData << std::endl; std::cout << " labels file: " << filenameTestLabels << std::endl; std::cout << " samples: " << datasetTest.numSamples << std::endl; std::cout << " features: " << datasetTest.numFeatures << std::endl; std::cout << " classes: " << datasetTest.numClasses << std::endl; std::cout << std::endl; } // *** Classifier *** classifier = Onyx::LinearLaRank::create_classifier(); if (!loadClassifier.empty()) { // Load from file std::cout << "Loading classifier from file: " << loadClassifier << std::endl; std::cout << std::endl; std::ifstream stream(loadClassifier, std::ios::binary); classifier->loadFromStream(stream); } else { // Create new classifier std::cout << "Creating new classifier..." << std::endl; std::cout << std::endl; classifier->setC(C); classifier->setTau(tau); } // *** Training (with optional testing) *** if (doTraining) { start = std::chrono::system_clock::now(); int sampleRatio = datasetTrain.numSamples / 10; std::vector<float> trainError(numEpochs, 0.0); std::vector<float> testError(numEpochs, 0.0); std::vector<int> sampleIndices(datasetTrain.numSamples); std::iota(sampleIndices.begin(), sampleIndices.end(), 0); for (unsigned int epoch = 0; epoch < numEpochs; epoch++) { std::cout << "Epoch " << epoch << std::endl; // *** Train *** // Randomly permute the sample indices std::vector<std::vector<int>::iterator> shuffledSampleIndices(sampleIndices.size()); std::iota(shuffledSampleIndices.begin(), shuffledSampleIndices.end(), sampleIndices.begin()); std::shuffle(shuffledSampleIndices.begin(), shuffledSampleIndices.end(), random_number_generator); for (unsigned int s = 0; s < shuffledSampleIndices.size(); s++) { int idx = *shuffledSampleIndices[s]; const 
Eigen::VectorXf &sampleFeature = datasetTrain.features[idx]; int sampleLabel = datasetTrain.labels[idx]; // Estimate training error int label = classifier->predict(sampleFeature); if (label != sampleLabel) { trainError[epoch]++; } // Update classifier->update(sampleFeature, sampleLabel, 1.0f); // Print progress if (s && sampleRatio && s % sampleRatio == 0) { std::cout << "Epoch: " << epoch << ": "; std::cout << (10 * s) / sampleRatio << "%"; std::cout << " -> training error: " << trainError[epoch]; std::cout << "/" << s; std::cout << " = " << trainError[epoch]/s*100 << "%"; std::cout << std::endl; } } if (doTesting) { // *** Test *** for (unsigned int s = 0; s < datasetTest.numSamples; s++) { const Eigen::VectorXf &sampleFeature = datasetTest.features[s]; int sampleLabel = datasetTest.labels[s]; if (classifier->predict(sampleFeature) != sampleLabel) { testError[epoch]++; } } std::cout << "Test error: " << testError[epoch] << "/" << datasetTest.numSamples << " = " << testError[epoch]/datasetTest.numSamples*100 << "%" << std::endl; std::cout << std::endl; } } end = std::chrono::system_clock::now(); std::cout << "Elapsed time: " << std::chrono::duration<float>(end - start).count() << " seconds." << std::endl; } // *** Save classifier *** if (!saveClassifier.empty()) { std::cout << "Saving classifier to file: " << saveClassifier << std::endl; std::ofstream stream(saveClassifier, std::ios::binary); classifier->saveToStream(stream); std::cout << "Done!" << std::endl; } // *** Test - only if we didn't do the training *** if (!doTraining && doTesting) { float testError = 0.0f; std::cout << "Performing off-line test..." 
<< std::endl; // Experiment start = std::chrono::system_clock::now(); for (unsigned int s = 0; s < datasetTest.numSamples; s++) { const Eigen::VectorXf &sampleFeature = datasetTest.features[s]; int sampleLabel = datasetTest.labels[s]; if (classifier->predict(sampleFeature) != sampleLabel) { testError++; } } end = std::chrono::system_clock::now(); std::cout << "Test error: " << testError << "/" << datasetTest.numSamples << " = " << testError/datasetTest.numSamples*100 << "%" << std::endl; std::cout << "Elapsed time: " << std::chrono::duration<float>(end - start).count() << " seconds." << std::endl; std::cout << std::endl; } // *** Online test *** if (doOnlineTesting) { float testError = 0.0f; std::cout << "Performing on-line test..." << std::endl; // Randomly permute the sample indices std::vector<int> sampleIndices(datasetTest.numSamples); std::iota(sampleIndices.begin(), sampleIndices.end(), 0); std::vector<std::vector<int>::iterator> shuffledSampleIndices(sampleIndices.size()); std::iota(shuffledSampleIndices.begin(), shuffledSampleIndices.end(), sampleIndices.begin()); std::shuffle(shuffledSampleIndices.begin(), shuffledSampleIndices.end(), random_number_generator); // Experiment start = std::chrono::system_clock::now(); for (unsigned int s = 0; s < shuffledSampleIndices.size(); s++) { int idx = *shuffledSampleIndices[s]; const Eigen::VectorXf &sampleFeature = datasetTest.features[idx]; int sampleLabel = datasetTest.labels[idx]; // Predict if (classifier->predict(sampleFeature) != sampleLabel) { testError++; } // Update classifier->update(sampleFeature, sampleLabel); } end = std::chrono::system_clock::now(); std::cout << "Online test error: " << testError << "/" << datasetTest.numSamples << " = " << testError/datasetTest.numSamples*100 << "%" << std::endl; std::cout << "Elapsed time: " << std::chrono::duration<float>(end - start).count() << " seconds." << std::endl; std::cout << std::endl; } // Cleanup delete classifier; return 0; }
/////////////////////////////////////////////////////////// // uses particles, variance, ind // uses newPoints, newIndices, trees, Ndens void gibbs2(unsigned int _Ndens, const BallTreeDensity* _trees, unsigned long Np, unsigned int Niter, double *_pts, BallTree::index *_ind, double *_randU, double* _randN) { unsigned int i,j,l; unsigned long s, maxNp; Ndens = _Ndens; // SET UP GLOBALS trees = _trees; newPoints = _pts; newIndices = _ind; randU = _randU; randN = _randN; Ndim = trees[0].Ndim(); // dimension of densities maxNp = 0; // largest # of particles we deal with for (unsigned int j=0; j<Ndens; j++) // compute Max Np over all densities if (maxNp < trees[j].Npts()) maxNp = trees[j].Npts(); ind = new BallTree::index[Ndens]; // ALLOCATE GLOBALS p = new double[maxNp]; Nlevels = (unsigned int) (log((double)maxNp)/log((double)2))+1; // how many levels to a balanced binary tree? particles = new double[Ndim*Ndens]; variance = new double[Ndim*Ndens]; dNpts = new unsigned long[Ndens]; levelList = new BallTree::index*[Ndens]; levelListNew = new BallTree::index*[Ndens]; for (j=0;j<Ndens;j++) { levelList[j] = new BallTree::index[maxNp]; levelListNew[j] = new BallTree::index[maxNp]; } for (s=0; s<Np; s++) { levelInit(); initIndices(); calcIndices(); /////////////////////////////////////////////////////////////// // Perform Gibbs sampling only if multiple densities in product /////////////////////////////////////////////////////////////// samplePoint(newPoints); for (l=0;l<Nlevels;l++) { levelDown(); for (i=0;i<Niter;i++) { sampleIndices(newPoints); samplePoint(newPoints); }} for (unsigned int j=0; j<Ndens; j++) // save and newIndices[j] = trees[j].getIndexOf(ind[j])+1; // return particle label newIndices += Ndens; // move pointers to next sample newPoints += Ndim; } for (j=0;j<Ndens;j++) { delete[] levelList[j]; delete[] levelListNew[j]; } delete[] levelList; delete[] levelListNew; delete[] dNpts; delete[] ind; delete[] p; delete[] particles; delete[] variance; };
// Robustly estimates a solution for iProblem by random sampling and consensus.
//
// Repeatedly draws sampleSize distinct data-point indices, asks the problem
// for a solution on that minimal set, and scores the solution by the number
// of data points whose squared error is within the threshold. The solution
// with the most inliers is kept, and the number of iterations still needed is
// re-estimated from the inlier ratio each time the best score improves.
//
// Threshold: mMaximumError if it is non-negative; otherwise it is derived
// from the median of the squared errors as 1.4826*sqrt(median)*4.6851
// (NOTE(review): these look like the usual MAD consistency factor and the
// Tukey biweight tuning constant -- confirm intent).
//
// Returns a Result with:
//   mSuccess        true if at least one sample produced a scored solution
//   mNumIterations  number of scored iterations (skipped samples not counted)
//   mInliers        data-point indices of the inliers of the best solution
//   mSolution       the best solution, re-estimated from all inliers when
//                   mRefineUsingInliers is set
// If there are fewer data points than the sample size, an unsuccessful empty
// result is returned.
//
// Assumes computeSquaredErrors() returns either an empty vector (invalid
// sample) or one entry per data point, in data-point order.
Result solve(const Problem& iProblem) const {
  // set up initial (empty) result
  Result result;
  result.mSuccess = false;
  result.mNumIterations = 0;

  // ensure that there are enough data points to proceed
  const int sampleSize = iProblem.getSampleSize();
  const int n = iProblem.getNumDataPoints();
  if (n < sampleSize) {
    return result;
  }

  const double epsilon = 1e-10;

  // best results are currently invalid
  double bestScore = -1;
  bool success = false;

  // start number of iterations as infinite, then reduce as we go
  double numIterationsNeeded = 1e10;
  int iterationCount = 0;
  int skippedSampleCount = 0;

  // for random sample index generation
  std::vector<int> allIndices(n);

  // iterate until adaptive number of iterations are exceeded
  while (iterationCount < numIterationsNeeded) {

    // determine random sample indices: swapping within a fresh identity
    // permutation guarantees the first sampleSize entries are distinct
    for (int i = 0; i < n; ++i) {
      allIndices[i] = i;
    }
    for (int i = 0; i < sampleSize; ++i) {
      int randIndex = std::rand() % n;
      std::swap(allIndices[i], allIndices[randIndex]);
    }
    std::vector<int> sampleIndices(allIndices.begin(),
                                   allIndices.begin() + sampleSize);

    // compute solution on minimal set
    typename Problem::Solution solution = iProblem.estimate(sampleIndices);

    // compute errors over all data points
    std::vector<double> errors2 = iProblem.computeSquaredErrors(solution);

    // check whether this is a valid sample
    // TODO: this should be done via a method in Problem class, but would
    // require changing all existing usages to include that method
    if (errors2.size() == 0) {
      // give up after too many consecutive invalid samples
      ++skippedSampleCount;
      if (skippedSampleCount >= mMaximumIterations) break;
      continue;
    }
    skippedSampleCount = 0;

    // compute error threshold to be applied to each term
    double thresh = mMaximumError;
    if (thresh < 0) {
      // Sort a COPY: errors2 must stay in data-point order, because the
      // inlier loop below uses its position as the data-point index.
      std::vector<double> sortedErrors2(errors2);
      std::sort(sortedErrors2.begin(), sortedErrors2.end());

      // Median of the sorted values; for an even count this is the mean of
      // the two middle elements (indices count/2 - 1 and count/2).
      const std::vector<double>::size_type count = sortedErrors2.size();
      const std::vector<double>::size_type mid = count / 2;
      const double median = (count % 2 == 0)
          ? (0.5*(sortedErrors2[mid-1]+sortedErrors2[mid]))
          : sortedErrors2[mid];
      thresh = 1.4826*std::sqrt(median)*4.6851;
    }
    // errors are squared, so square the threshold as well
    thresh *= thresh;

    // determine inliers
    std::vector<int> inliers;
    inliers.reserve(n);
    for (int i = 0; i < n; ++i) {
      if (errors2[i] <= thresh) {
        inliers.push_back(i);
      }
    }

    // if this is the best score, update solution and convergence criteria
    double score = inliers.size();
    if (score > bestScore) {
      bestScore = score;
      result.mInliers = inliers;
      result.mSolution = solution;
      success = true;

      // probability that a random minimal sample contains an outlier,
      // clamped away from 0 and 1 so that the logarithm stays finite
      double inlierProbability = double(inliers.size()) / n;
      double anyOutlierProbability = 1 - pow(inlierProbability,sampleSize);
      anyOutlierProbability = std::min(anyOutlierProbability, 1-epsilon);
      anyOutlierProbability = std::max(anyOutlierProbability, epsilon);
      numIterationsNeeded =
        log(1-mGoodSolutionProbability) / log(anyOutlierProbability);
    }

    // bump up iteration count and terminate if it exceeds hard max
    ++iterationCount;
    if (iterationCount > mMaximumIterations) {
      break;
    }
  }

  // finish off result params
  result.mSuccess = success;
  result.mNumIterations = iterationCount;

  // refine result using all inliers if specified
  if (result.mSuccess && mRefineUsingInliers) {
    result.mSolution = iProblem.estimate(result.mInliers);
  }

  // done
  return result;
}