Filter::Filter(InputSet& inputSet) : inputSet_(inputSet) {
    // Get the input set's number of samples, features and labels
    const unsigned int nbSamples = inputSet.nbSamples();
    const unsigned int nbFeatures = inputSet.nbFeatures();
    const unsigned int nbLabels = inputSet.nbLabels();

    // Resize the initial stack of samples
    sampleStack_.resize(1);
    sampleStack_[0].resize(nbSamples);

    // Make the indices range from 0 to nbSamples - 1
    for(unsigned int s = 0; s < nbSamples; ++s) {
        sampleStack_[0][s] = s;
    }

    // Resize the initial stack of features
    featureStack_.resize(1);
    featureStack_[0].resize(nbFeatures);

    // Make the indices range from 0 to nbFeatures - 1
    for(unsigned int f = 0; f < nbFeatures; ++f) {
        featureStack_[0][f] = f;
    }

    // Set the number of labels (push it twice so as to be sure to never
    // overwrite it)
    nbLabelStack_.push_back(nbLabels);
    nbLabelStack_.push_back(nbLabels);

    // Set the number of images and heuristics
    nbImages_ = inputSet.nbImages();
    nbHeuristics_ = inputSet.nbHeuristics();
}
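// The sample and feature stacks initialized above support a push/pop
// discipline on the input set: a filter pushes a subset of indices, works on
// the reduced view, then pops to restore the previous one. A minimal sketch
// of that usage, assuming only the pushSamples/popSamples calls seen in this
// file (the function itself is hypothetical):
void exampleStackDiscipline(InputSet& inputSet) {
    // Select the first half of the samples
    std::vector<unsigned int> subset;

    for(unsigned int s = 0; s < inputSet.nbSamples() / 2; ++s) {
        subset.push_back(s);
    }

    // The input set now exposes only the selected samples
    inputSet.pushSamples(subset);

    // ... train or evaluate on the reduced view ...

    // Restore the previous view
    inputSet.popSamples();
}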
Statistical::Statistical(InputSet& inputSet) : Filter(inputSet) {
    // Get the number of samples and features
    const unsigned int nbSamples = inputSet.nbSamples();
    const unsigned int nbFeatures = inputSet.nbFeatures();

    // Used to store the current sum and sum of squares of every feature
    means_.resize(nbFeatures);
    stds_.resize(nbFeatures);

    // Accumulate the sums and sums of squares one sample at a time
    for(unsigned int s = 0; s < nbSamples; ++s) {
        std::vector<unsigned int> sample(1, s);
        inputSet.pushSamples(sample);

        const scalar_t* features = inputSet.features(0);

        for(unsigned int f = 0; f < nbFeatures; ++f) {
            means_[f] += features[f];
            stds_[f] += features[f] * features[f];
        }

        inputSet.popSamples();
    }

    // Keep only the features with a strictly positive variance
    std::vector<unsigned int> featureStack;

    for(unsigned int f = 0; f < nbFeatures; ++f) {
        scalar_t mean = means_[f] / nbSamples;
        scalar_t variance = (stds_[f] - means_[f] * mean) / (nbSamples - 1);

        if(variance > 0) {
            means_[f] = mean;
            stds_[f] = std::sqrt(variance);
            featureStack.push_back(f);
        }
    }

    // There must be at least one feature
    assert(!featureStack.empty());

    // Push the selected features if needed
    if(featureStack.size() < nbFeatures) {
        featureStack_.push_back(featureStack);
    }
}
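// The constructor relies on the one-pass identity for the unbiased sample
// variance: var = (sum(x^2) - n * mean^2) / (n - 1), with mean = sum(x) / n.
// A minimal standalone check of that identity (not part of the class; double
// stands in for the library's scalar_t):

#include <cassert>
#include <cmath>

void exampleOnePassVariance() {
    const double x[] = {1.0, 2.0, 4.0};
    const unsigned int n = 3;

    // One pass: accumulate the sum and the sum of squares
    double sum = 0, sumSq = 0;
    for(unsigned int i = 0; i < n; ++i) {
        sum += x[i];
        sumSq += x[i] * x[i];
    }

    const double mean = sum / n;
    const double variance = (sumSq - sum * mean) / (n - 1);

    // Two passes: the textbook definition sum((x - mean)^2) / (n - 1)
    double twoPass = 0;
    for(unsigned int i = 0; i < n; ++i) {
        twoPass += (x[i] - mean) * (x[i] - mean);
    }
    twoPass /= n - 1;

    // Both formulas agree up to rounding
    assert(std::abs(variance - twoPass) < 1e-12);
}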
void C45Tree::train(InputSet& inputSet) {
    // In case the tree was already trained
    delete children_[0];
    delete children_[1];
    children_[0] = 0;
    children_[1] = 0;

    // Get the number of samples and labels
    const unsigned int nbSamples = inputSet.nbSamples();
    const unsigned int nbLabels = inputSet.nbLabels();

    // Set the maximal depth if needed
    bool maxDepth = false;

    if(!maxDepth_) {
        maxDepth_ = std::ceil(std::log(double(nbLabels)) / std::log(2.0));
        maxDepth = true;
    }

    // Get the labels and the weights of the samples
    const unsigned int* labels = inputSet.labels();
    const scalar_t* weights = inputSet.weights();

    // Make the weights sum to nbSamples as in Quinlan's implementation
    std::vector<scalar_t> oldWeights(nbSamples);
    scalar_t norm = std::accumulate(weights, weights + nbSamples, scalar_t());
    std::transform(weights, weights + nbSamples, oldWeights.begin(),
                   std::bind2nd(std::multiplies<scalar_t>(),
                                nbSamples / norm));
    inputSet.swapWeights(oldWeights);
    weights = inputSet.weights();

    // Compute the frequency of appearance of each label
    distr_.clear();
    distr_.resize(nbLabels, 0);

    for(unsigned int s = 0; s < nbSamples; ++s) {
        distr_[labels[s]] += weights[s];
    }

    // Determine the label with the largest frequency
    label_ = std::max_element(distr_.begin(), distr_.end()) - distr_.begin();
    sumWeights_ = nbSamples;

    // Vector of indices over the samples
    std::vector<unsigned int> indices(nbSamples);

    // Make the indices range from 0 to nbSamples - 1
    for(unsigned int s = 0; s < nbSamples; ++s) {
        indices[s] = s;
    }

    // Create the root (the children will follow recursively)
    make(inputSet, &indices[0], nbSamples);

    if(confidence_ < 1) {
        // Make the indices range from 0 to nbSamples - 1
        for(unsigned int s = 0; s < nbSamples; ++s) {
            indices[s] = s;
        }

        // Prune the tree previously created
        prune(inputSet, &indices[0], nbSamples, true);
    }

    // Restore the original weights
    inputSet.swapWeights(oldWeights);

    // Restore maxDepth
    if(maxDepth) {
        maxDepth_ = 0;
    }
}
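// make() is not shown here; C4.5 chooses splits by information gain (or gain
// ratio), which is built on the entropy of weighted label frequencies such as
// distr_ above. A minimal sketch of that entropy computation (the helper name
// is hypothetical; double stands in for the library's scalar_t):

#include <cmath>
#include <vector>

double weightedEntropy(const std::vector<double>& distr) {
    // Total weight over all labels
    double sum = 0;
    for(unsigned int l = 0; l < distr.size(); ++l) {
        sum += distr[l];
    }

    // Entropy in bits: -sum(p * log2(p)) over the labels with p > 0
    double entropy = 0;
    for(unsigned int l = 0; l < distr.size(); ++l) {
        if(distr[l] > 0) {
            const double p = distr[l] / sum;
            entropy -= p * std::log(p) / std::log(2.0);
        }
    }

    return entropy;
}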
void LinearSVM::train(InputSet& inputSet) {
    // Sample features from every heuristic in order for the matrix of
    // features to fit in memory
    if(inputSet.nbFeatures() > NB_FEATURES_MAX / inputSet.nbSamples()) {
        inputSet.sampleFeatures(NB_FEATURES_MAX / inputSet.nbSamples(),
                                indices_);
        inputSet.pushFeatures(indices_);
    }
    else {
        indices_.clear();
    }

    // Get the number of features, samples and labels
    const unsigned int nbFeatures = inputSet.nbFeatures();
    const unsigned int nbSamples = inputSet.nbSamples();
    const unsigned int nbLabels = inputSet.nbLabels();

    // Delete the previous model
    if(model_) {
        free_and_destroy_model(&model_);
    }

    // Create a new problem
    problem prob;

    // Recopy the number of samples and features
    prob.l = nbSamples;
    prob.n = nbFeatures;

    // Recopy the labels
    std::vector<int> labels(inputSet.labels(), inputSet.labels() + nbSamples);
    prob.y = &labels[0];

    // Recopy the features (as we want to normalize them)
    std::vector<scalar_t> matrix;
    inputSet.swapFeatures(matrix);

    // Create samples as expected by liblinear
    std::vector<feature_node> samples(nbSamples);
    prob.x = &samples[0];

    // Compute the mean norm
    scalar_t meanNorm = 0;

    for(unsigned int s = 0; s < nbSamples; ++s) {
        samples[s].dim = nbFeatures;
        samples[s].values = &matrix[s * nbFeatures];

        // Add the norm of that sample
        meanNorm += nrm2(samples[s].dim, samples[s].values, 1);
    }

    // Divide the sum of the norms by the number of samples
    meanNorm /= nbSamples;

    std::cout << "[LinearSVM::train] mean(norm): " << meanNorm << '.'
              << std::endl;

    // Rescale the features so that their mean norm is 1
    std::transform(matrix.begin(), matrix.end(), matrix.begin(),
                   std::bind2nd(std::divides<scalar_t>(), meanNorm));

    // Set the bias to the default value (liblinear doesn't seem to handle
    // the bias parameter value correctly)
    prob.bias = -1;

    // A negative C requests cross-validation
    bool crossValidate = parameters_.C < 0;

    // Set C to a default value in order to pass the parameter check
    if(crossValidate) {
        parameters_.C = 1;
    }

    // Make sure that the parameters are correct
    assert(!check_parameter(&prob, &parameters_));

    // If C was below zero, use 5-fold cross-validation to determine it
    if(crossValidate) {
        std::vector<int> target(nbSamples); // The predicted labels
        unsigned int nbErrorsMin = nbSamples + 1; // Initialize past the maximum

        for(parameters_.C = 1000; parameters_.C >= 0.01; parameters_.C /= 10) {
            cross_validation(&prob, &parameters_, 5, &target[0]);

            // Count the number of errors
            unsigned int nbErrors = 0;

            for(unsigned int s = 0; s < nbSamples; ++s) {
                if(target[s] != labels[s]) {
                    ++nbErrors;
                }
            }

            std::cout << "[LinearSVM::train] 5-fold cross-validation error "
                         "for C = " << parameters_.C << ": "
                      << nbErrors * 100.0f / nbSamples << "%." << std::endl;

            // The new C is better than the previous one
            if(nbErrors < nbErrorsMin) {
                nbErrorsMin = nbErrors;
            }
            // The optimal C was found
            else {
                break;
            }
        }

        // The loop always goes one step past the optimal C, so step back
        parameters_.C *= 10;

        // Print C to the log
        std::cout << "[LinearSVM::train] optimal C as determined by 5-fold "
                     "cross-validation: " << parameters_.C << '.' << std::endl;
    }

    // Train the svm
    model_ = ::train(&prob, &parameters_);
    assert(model_);

    // Reset C so that it will be cross-validated again
    if(crossValidate) {
        parameters_.C = -1;
    }

    // Save liblinear's label ordering
    map_.clear();
    map_.resize(nbLabels);
    get_labels(model_, &map_[0]);

    // Pop the selected features if required
    if(!indices_.empty()) {
        inputSet.popFeatures();
    }
}
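// get_labels stores liblinear's internal label ordering in map_: the i-th
// decision value produced by the model corresponds to input-set label
// map_[i], so predictions must be reordered before use. A hedged sketch of
// that reordering (everything except map_'s meaning is an assumption; the
// helper is hypothetical):

#include <vector>

void reorderDecisionValues(const std::vector<int>& map,
                           const double* decisionValues,
                           std::vector<double>& distr) {
    distr.resize(map.size());

    // distr[label] receives the decision value liblinear computed for it
    for(unsigned int i = 0; i < map.size(); ++i) {
        distr[map[i]] = decisionValues[i];
    }
}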