Example #1
File: main.cpp Project: rokm/onyx
#include <algorithm>
#include <chrono>
#include <fstream>
#include <iostream>
#include <numeric>
#include <random>
#include <string>
#include <vector>

#include <boost/program_options.hpp>

#include <Eigen/Core>

// Project-specific headers declaring Onyx::LinearLaRank::Classifier and
// Onyx::Example::Dataset are assumed to be included here as well.

int main (int argc, char **argv)
{
    Onyx::LinearLaRank::Classifier *classifier;

    Onyx::Example::Dataset datasetTrain;
    Onyx::Example::Dataset datasetTest;

    std::chrono::time_point<std::chrono::system_clock> start, end; // Timings

    std::string filenameTrainingData;
    std::string filenameTrainingLabels;
    std::string filenameTestData;
    std::string filenameTestLabels;

    std::string saveClassifier;
    std::string loadClassifier;

    bool doTraining = true; // Enabled by default
    bool doTesting = true; // Enabled by default
    bool doOnlineTesting = false; // Disabled by default

    unsigned int numEpochs = 10;
    double C = 1.0;
    double tau = 0.0001;

    // Random number generator (with random seed)
    std::mt19937 random_number_generator(std::random_device{}());

    // *** Print banner ***
    std::cout << "Onyx v.1.0, (C) 2015 Rok Mandeljc <*****@*****.**>" << std::endl;
    std::cout << std::endl;

    // *** Command-line parser ***
    boost::program_options::options_description commandLineArguments("Onyx - Online Classifier Application");
    boost::program_options::variables_map optionsMap;
    boost::program_options::positional_options_description positionalArguments;

    boost::program_options::options_description argArguments("Arguments");
    argArguments.add_options()
        ("help", "produce help message")
        ("training-data", boost::program_options::value<std::string>(&filenameTrainingData), "name of training .data file")
        ("training-labels", boost::program_options::value<std::string>(&filenameTrainingLabels), "name of training .labels file")
        ("test-data", boost::program_options::value<std::string>(&filenameTestData), "name of test .data file")
        ("test-labels", boost::program_options::value<std::string>(&filenameTestLabels), "name of test .labels file")
        ("save-classifier", boost::program_options::value<std::string>(&saveClassifier), "optional filename to store classifier")
        ("load-classifier", boost::program_options::value<std::string>(&loadClassifier), "optional filename to load classifier")
        ("epochs", boost::program_options::value<unsigned int>(&numEpochs)->default_value(numEpochs), "number of epochs for (re)training")
        ("training", boost::program_options::value<bool>(&doTraining)->default_value(doTraining), "enable training (if training data is available)")
        ("test", boost::program_options::value<bool>(&doTesting)->default_value(doTesting), "enable testing (if testing data is available)")
        ("online-test", boost::program_options::value<bool>(&doOnlineTesting)->default_value(doOnlineTesting), "enable on-line testing (if testing data is available)")
    ;
    commandLineArguments.add(argArguments);

    boost::program_options::options_description argParameters("Algorithm parameters");
    argParameters.add_options()
        ("C", boost::program_options::value<double>(&C)->default_value(C), "SVM regularization parameter")
        ("tau", boost::program_options::value<double>(&tau)->default_value(tau), "tolerance for choosing new support vectors")
    ;
    commandLineArguments.add(argParameters);

    positionalArguments.add("training-data", 1);
    positionalArguments.add("training-labels", 1);
    positionalArguments.add("test-data", 1);
    positionalArguments.add("test-labels", 1);

    // Parse command-line
    try {
        boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(commandLineArguments).positional(positionalArguments).run(), optionsMap);
    } catch (std::exception &error) {
        std::cout << commandLineArguments << std::endl << std::endl;
        std::cout << "Command-line error: " << error.what() << std::endl;
        return -1;
    }

    // Display help?
    if (optionsMap.count("help")) {
        std::cout << commandLineArguments << std::endl;
        return 1;
    }

    // Validate
    try {
        boost::program_options::notify(optionsMap);
    } catch (std::exception &error) {
        std::cout << commandLineArguments << std::endl << std::endl;
        std::cout << "Argument error: " << error.what() << std::endl;
        return -1;
    }

    bool trainingDataAvailable = !filenameTrainingData.empty() && !filenameTrainingLabels.empty();
    bool testingDataAvailable = !filenameTestData.empty() && !filenameTestLabels.empty();

    doTraining = doTraining && trainingDataAvailable;
    doTesting = doTesting && testingDataAvailable;
    doOnlineTesting = doOnlineTesting && testingDataAvailable;

    if (!doTraining && !doTesting && !doOnlineTesting) {
        std::cout << "Doing neither training nor testing; nothing to do!" << std::endl;
        return 1;
    }

    if (!doTraining && loadClassifier.empty()) {
        std::cout << "Neither training dataset nor pre-trained classifier provided!" << std::endl;
        return -1;
    }

    // *** Load datasets ***
    if (doTraining) {
        std::cout << "Loading training dataset..." << std::endl;
        try {
            datasetTrain.load(filenameTrainingData, filenameTrainingLabels);
        } catch (std::exception &error) {
            std::cout << "Failed to load training dataset: " << error.what() << std::endl;
            return -2;
        }
        std::cout << "Loaded training set:" << std::endl;
        std::cout << " data file: " << filenameTrainingData << std::endl;
        std::cout << " labels file: " << filenameTrainingLabels << std::endl;
        std::cout << " samples: " << datasetTrain.numSamples << std::endl;
        std::cout << " features: " << datasetTrain.numFeatures << std::endl;
        std::cout << " classes: " << datasetTrain.numClasses << std::endl;
        std::cout << std::endl;
    }
    if (doTesting || doOnlineTesting) {
        std::cout << "Loading testing dataset..." << std::endl;
        try {
            datasetTest.load(filenameTestData, filenameTestLabels);
        } catch (std::exception &error) {
            std::cout << "Failed to load testing dataset: " << error.what() << std::endl;
            return -2;
        }
        std::cout << "Loaded testing set:" << std::endl;
        std::cout << " data file: " << filenameTestData << std::endl;
        std::cout << " labels file: " << filenameTestLabels << std::endl;
        std::cout << " samples: " << datasetTest.numSamples << std::endl;
        std::cout << " features: " << datasetTest.numFeatures << std::endl;
        std::cout << " classes: " << datasetTest.numClasses << std::endl;
        std::cout << std::endl;
    }

    // *** Classifier ***
    classifier = Onyx::LinearLaRank::create_classifier();
    if (!loadClassifier.empty()) {
        // Load from file
        std::cout << "Loading classifier from file: " << loadClassifier << std::endl;
        std::cout << std::endl;

        std::ifstream stream(loadClassifier, std::ios::binary);
        classifier->loadFromStream(stream);
    } else {
        // Create new classifier
        std::cout << "Creating new classifier..." << std::endl;
        std::cout << std::endl;

        classifier->setC(C);
        classifier->setTau(tau);
    }

    // *** Training (with optional testing) ***
    if (doTraining) {
        start = std::chrono::system_clock::now();

        int sampleRatio = datasetTrain.numSamples / 10;
        std::vector<float> trainError(numEpochs, 0.0);
        std::vector<float> testError(numEpochs, 0.0);

        std::vector<int> sampleIndices(datasetTrain.numSamples);
        std::iota(sampleIndices.begin(), sampleIndices.end(), 0);

        for (unsigned int epoch = 0; epoch < numEpochs; epoch++) {
            std::cout << "Epoch " << epoch << std::endl;

            // *** Train ***
            // Randomly permute the sample indices
            std::vector<std::vector<int>::iterator> shuffledSampleIndices(sampleIndices.size());
            std::iota(shuffledSampleIndices.begin(), shuffledSampleIndices.end(), sampleIndices.begin());

            std::shuffle(shuffledSampleIndices.begin(), shuffledSampleIndices.end(), random_number_generator);

            for (unsigned int s = 0; s < shuffledSampleIndices.size(); s++) {
                int idx = *shuffledSampleIndices[s];
                const Eigen::VectorXf &sampleFeature = datasetTrain.features[idx];
                int sampleLabel = datasetTrain.labels[idx];

                // Estimate training error
                int label = classifier->predict(sampleFeature);

                if (label != sampleLabel) {
                    trainError[epoch]++;
                }

                // Update
                classifier->update(sampleFeature, sampleLabel, 1.0f);

                // Print progress
                if (s && sampleRatio && s % sampleRatio == 0) {
                    std::cout << "Epoch: " << epoch << ": ";
                    std::cout << (10 * s) / sampleRatio << "%";
                    std::cout << " -> training error: " << trainError[epoch];
                    std::cout << "/" << s;
                    std::cout << " = "  << trainError[epoch]/s*100 << "%";
                    std::cout << std::endl;
                }
            }

            if (doTesting) {
                // *** Test ***
                for (unsigned int s = 0; s < datasetTest.numSamples; s++) {
                    const Eigen::VectorXf &sampleFeature = datasetTest.features[s];
                    int sampleLabel = datasetTest.labels[s];

                    if (classifier->predict(sampleFeature) != sampleLabel) {
                        testError[epoch]++;
                    }
                }

                std::cout << "Test error: " << testError[epoch] << "/" << datasetTest.numSamples << " = " << testError[epoch]/datasetTest.numSamples*100 << "%" << std::endl;
                std::cout << std::endl;
            }
        }

        end = std::chrono::system_clock::now();
        std::cout << "Elapsed time: " << std::chrono::duration<float>(end - start).count() << " seconds." << std::endl;
    }

    // *** Save classifier ***
    if (!saveClassifier.empty()) {
        std::cout << "Saving classifier to file: " << saveClassifier << std::endl;
        std::ofstream stream(saveClassifier, std::ios::binary);
        classifier->saveToStream(stream);
        std::cout << "Done!" << std::endl;
    }

    // *** Test - only if we didn't do the training ***
    if (!doTraining && doTesting) {
        float testError = 0.0f;

        std::cout << "Performing off-line test..." << std::endl;

        // Experiment
        start = std::chrono::system_clock::now();

        for (unsigned int s = 0; s < datasetTest.numSamples; s++) {
            const Eigen::VectorXf &sampleFeature = datasetTest.features[s];
            int sampleLabel = datasetTest.labels[s];

            if (classifier->predict(sampleFeature) != sampleLabel) {
                testError++;
            }
        }

        end = std::chrono::system_clock::now();

        std::cout << "Test error: " << testError << "/" << datasetTest.numSamples << " = " << testError/datasetTest.numSamples*100 << "%" << std::endl;
        std::cout << "Elapsed time: " << std::chrono::duration<float>(end - start).count() << " seconds." << std::endl;
        std::cout << std::endl;
    }

    // *** Online test ***
    if (doOnlineTesting) {
        float testError = 0.0f;

        std::cout << "Performing on-line test..." << std::endl;

        // Randomly permute the sample indices
        std::vector<int> sampleIndices(datasetTest.numSamples);
        std::iota(sampleIndices.begin(), sampleIndices.end(), 0);

        std::vector<std::vector<int>::iterator> shuffledSampleIndices(sampleIndices.size());
        std::iota(shuffledSampleIndices.begin(), shuffledSampleIndices.end(), sampleIndices.begin());

        std::shuffle(shuffledSampleIndices.begin(), shuffledSampleIndices.end(), random_number_generator);

        // Experiment
        start = std::chrono::system_clock::now();

        for (unsigned int s = 0; s < shuffledSampleIndices.size(); s++) {
            int idx = *shuffledSampleIndices[s];
            const Eigen::VectorXf &sampleFeature = datasetTest.features[idx];
            int sampleLabel = datasetTest.labels[idx];

            // Predict
            if (classifier->predict(sampleFeature) != sampleLabel) {
                testError++;
            }

            // Update
            classifier->update(sampleFeature, sampleLabel);
        }

        end = std::chrono::system_clock::now();

        std::cout << "Online test error: " << testError << "/" << datasetTest.numSamples << " = " << testError/datasetTest.numSamples*100 << "%" << std::endl;
        std::cout << "Elapsed time: " << std::chrono::duration<float>(end - start).count() << " seconds." << std::endl;
        std::cout << std::endl;
    }

    // Cleanup
    delete classifier;

    return 0;
}
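For reference, the classifier API that main() relies on can be summarized as an abstract interface. The sketch below is inferred purely from the calls made above (create_classifier(), setC(), setTau(), update(), predict(), loadFromStream(), saveToStream()); the actual declarations in rokm/onyx may differ in qualifiers and defaults.

#include <iostream>
#include <Eigen/Core>

// Minimal sketch of the interface implied by main() above; not the
// authoritative header from the project.
namespace Onyx {
namespace LinearLaRank {

class Classifier
{
public:
    virtual ~Classifier () {}

    // Algorithm parameters, set when training from scratch
    virtual void setC (double C) = 0;
    virtual void setTau (double tau) = 0;

    // On-line update with a feature vector, its label, and an optional weight
    virtual void update (const Eigen::VectorXf &features, int label, float weight = 1.0f) = 0;

    // Predict the label of a feature vector
    virtual int predict (const Eigen::VectorXf &features) = 0;

    // (De)serialization
    virtual void saveToStream (std::ostream &stream) const = 0;
    virtual void loadFromStream (std::istream &stream) = 0;
};

// Factory function; main() releases the returned object with delete,
// so ownership rests with the caller
Classifier *create_classifier ();

} // namespace LinearLaRank
} // namespace Onyx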
Example #2
///////////////////////////////////////////////////////////
// uses particles, variance, ind
// uses newPoints, newIndices, trees, Ndens
void gibbs2(unsigned int _Ndens, const BallTreeDensity* _trees, 
            unsigned long Np, unsigned int Niter,
            double *_pts, BallTree::index *_ind,
            double *_randU, double* _randN)
{
  unsigned int i,j,l;
  unsigned long s, maxNp;

  Ndens = _Ndens;                       // SET UP GLOBALS
  trees = _trees;
  newPoints = _pts; newIndices = _ind;
  randU = _randU; randN = _randN;
  Ndim  = trees[0].Ndim();              // dimension of densities    
  maxNp = 0;                            // largest # of particles we deal with
  for (unsigned int j=0; j<Ndens; j++)  // compute Max Np over all densities
    if (maxNp < trees[j].Npts()) maxNp = trees[j].Npts();

  ind = new BallTree::index[Ndens];     // ALLOCATE GLOBALS
  p = new double[maxNp];
  
  Nlevels = (unsigned int) (log((double)maxNp)/log(2.0)) + 1;  // number of levels in a balanced binary tree over maxNp points

  particles = new double[Ndim*Ndens];
  variance  = new double[Ndim*Ndens];

  dNpts = new unsigned long[Ndens];
  levelList = new BallTree::index*[Ndens];
  levelListNew = new BallTree::index*[Ndens];
  for (j=0;j<Ndens;j++) { 
    levelList[j] = new BallTree::index[maxNp];
    levelListNew[j] = new BallTree::index[maxNp];
  }

  for (s=0; s<Np; s++) {                       

    levelInit();
    initIndices();
    calcIndices();

    ///////////////////////////////////////////////////////////////
    // Perform Gibbs sampling only if multiple densities in product
    ///////////////////////////////////////////////////////////////
    samplePoint(newPoints);
    for (l=0; l<Nlevels; l++) {
      levelDown();
      for (i=0; i<Niter; i++) {
        sampleIndices(newPoints);
        samplePoint(newPoints);
      }
    }

    for (unsigned int j=0; j<Ndens; j++)              // save and
      newIndices[j] = trees[j].getIndexOf(ind[j])+1;  // return particle label
    newIndices += Ndens;                              // move pointers to next sample
    newPoints  += Ndim;
  }

  for (j=0;j<Ndens;j++) { delete[] levelList[j];  delete[] levelListNew[j]; }
  delete[] levelList; delete[] levelListNew;
  delete[] dNpts;

  delete[] ind; delete[] p; delete[] particles; delete[] variance;
}
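One detail worth flagging in gibbs2() is the Nlevels computation, which derives the depth of a balanced binary tree over maxNp points via floating-point logarithms; log(n)/log(2) can land just below the exact value at powers of two and round down. A minimal integer-arithmetic equivalent is sketched below (treeLevels is a hypothetical helper, not part of this codebase):

#include <cstdint>

// Number of levels in a balanced binary tree over n points, i.e.
// floor(log2(n)) + 1, computed without floating point so that exact
// powers of two cannot round down.
static unsigned int treeLevels (std::uint64_t n)
{
    unsigned int levels = 1;  // a single point is one level
    while (n > 1) {
        n >>= 1;
        ++levels;
    }
    return levels;
}

Either form yields floor(log2(maxNp)) + 1 whenever the floating-point version evaluates exactly.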
Example #3
  Result solve(const Problem& iProblem) const {
    // set up initial (empty) result
    Result result;
    result.mSuccess = false;
    result.mNumIterations = 0;

    // ensure that there are enough data points to proceed
    const int sampleSize = iProblem.getSampleSize();
    const int n = iProblem.getNumDataPoints();
    if (n < sampleSize) {
      return result;
    }

    const double epsilon = 1e-10;

    // best results are currently invalid
    double bestScore = -1;
    bool success = false;

    // start number of iterations as infinite, then reduce as we go
    double numIterationsNeeded = 1e10;
    int iterationCount = 0;
    int skippedSampleCount = 0;

    // for random sample index generation
    std::vector<int> allIndices(n);

    // iterate until adaptive number of iterations are exceeded
    while (iterationCount < numIterationsNeeded) {

      // determine random sample indices
      for (int i = 0; i < n; ++i) {
        allIndices[i] = i;
      }
      for (int i = 0; i < sampleSize; ++i) {
        int randIndex = std::rand() % n;
        std::swap(allIndices[i], allIndices[randIndex]);
      }
      std::vector<int> sampleIndices(allIndices.begin(),
                                     allIndices.begin() + sampleSize);

      // compute solution on minimal set
      typename Problem::Solution solution = iProblem.estimate(sampleIndices);

      // compute errors over all data points
      std::vector<double> errors2 = iProblem.computeSquaredErrors(solution);

      // check whether this is a valid sample
      // TODO: this should be done via a method in Problem class, but would
      // require changing all existing usages to include that method
      if (errors2.size() == 0) {
        ++skippedSampleCount;
        if (skippedSampleCount >= mMaximumIterations) break;
        continue;
      }
      skippedSampleCount = 0;

      // compute error threshold to be applied to each term
      double thresh = mMaximumError;
      if (thresh < 0) {
        std::sort(errors2.begin(), errors2.end());
        // median of the sorted squared errors (average of the two middle
        // elements for even n)
        double median = (n % 2 == 0) ?
          (0.5*(errors2[n/2-1]+errors2[n/2])) : errors2[n/2];
        // robust threshold: 1.4826*sqrt(median) is the MAD-based estimate of
        // the error standard deviation, scaled by the usual 4.6851 cutoff
        thresh = 1.4826*std::sqrt(median)*4.6851;
      }
      thresh *= thresh;

      // determine inliers
      std::vector<int> inliers;
      inliers.reserve(n);
      for (int i = 0; i < n; ++i) {
        if (errors2[i] <= thresh) {
          inliers.push_back(i);
        }
      }

      // if this is the best score, update solution and convergence criteria
      double score = inliers.size();
      if (score > bestScore) {
        bestScore = score;
        result.mInliers = inliers;
        result.mSolution = solution;
        success = true;
        // adaptive stopping rule: with inlier ratio w and sample size s, a
        // drawn sample is all-inlier with probability w^s, so reaching a good
        // sample with confidence p takes about log(1-p)/log(1-w^s) iterations
        double inlierProbability = double(inliers.size()) / n;
        double anyOutlierProbability = 1 - std::pow(inlierProbability, sampleSize);
        anyOutlierProbability = std::min(anyOutlierProbability, 1-epsilon);
        anyOutlierProbability = std::max(anyOutlierProbability, epsilon);
        numIterationsNeeded =
          std::log(1-mGoodSolutionProbability) / std::log(anyOutlierProbability);
      }

      // bump up iteration count and terminate if it exceeds hard max
      ++iterationCount;
      if (iterationCount > mMaximumIterations) {
        break;
      }
    }

    // finish off result params
    result.mSuccess = success;
    result.mNumIterations = iterationCount;

    // refine result using all inliers if specified
    if (result.mSuccess && mRefineUsingInliers) {
      result.mSolution = iProblem.estimate(result.mInliers);
    }

    // done
    return result;
  }
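solve() above is an adaptive RANSAC loop written against a generic Problem type, which must expose getSampleSize(), getNumDataPoints(), estimate(), a nested Solution type, and computeSquaredErrors(). As a concrete illustration of that contract, here is a hypothetical minimal Problem for fitting a 2-D line y = a*x + b; the struct name and all of its fields are invented for this sketch.

#include <cmath>
#include <vector>

// Hypothetical minimal Problem type for the solver above: fitting a 2-D line
// y = a*x + b to points (xs[i], ys[i]). Only the four members that solve()
// actually calls are required; everything else here is illustrative.
struct LineFitProblem {
  struct Solution { double a = 0, b = 0; };

  std::vector<double> xs, ys;

  int getSampleSize() const { return 2; }                 // two points define a line
  int getNumDataPoints() const { return (int)xs.size(); }

  // least-squares fit over the given subset (exact for a two-point sample)
  Solution estimate(const std::vector<int>& indices) const {
    double sx = 0, sy = 0, sxx = 0, sxy = 0;
    for (int i : indices) {
      sx += xs[i]; sy += ys[i];
      sxx += xs[i]*xs[i]; sxy += xs[i]*ys[i];
    }
    const double m = (double)indices.size();
    const double denom = m*sxx - sx*sx;
    Solution s;
    if (std::abs(denom) < 1e-12) return s;  // degenerate (vertical) sample
    s.a = (m*sxy - sx*sy) / denom;
    s.b = (sy - s.a*sx) / m;
    return s;
  }

  // squared vertical residual of every data point under the candidate line
  std::vector<double> computeSquaredErrors(const Solution& s) const {
    std::vector<double> errors2(xs.size());
    for (size_t i = 0; i < xs.size(); ++i) {
      const double r = ys[i] - (s.a*xs[i] + s.b);
      errors2[i] = r*r;
    }
    return errors2;
  }
};

Returning an empty vector from computeSquaredErrors() is the implicit signal for an invalid sample, which the skippedSampleCount logic above handles; this sketch instead reports a degenerate sample through the default solution.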