// Builds a filter on top of inputSet. The sample stack and the feature stack
// each start with a single level mapping every index to itself, the label
// stack starts with the input set's number of labels (stored twice), and the
// numbers of images and heuristics are copied from the input set.
Filter::Filter(InputSet& inputSet) : inputSet_(inputSet) {
    // Query the dimensions of the input set once
    const unsigned int sampleCount = inputSet.nbSamples();
    const unsigned int featureCount = inputSet.nbFeatures();
    const unsigned int labelCount = inputSet.nbLabels();

    // First level of the sample stack: the identity mapping
    // 0, 1, ..., sampleCount - 1
    sampleStack_.resize(1);
    sampleStack_[0].resize(sampleCount);

    for(unsigned int i = 0; i < sampleCount; ++i) {
        sampleStack_[0][i] = i;
    }

    // First level of the feature stack: the identity mapping
    // 0, 1, ..., featureCount - 1
    featureStack_.resize(1);
    featureStack_[0].resize(featureCount);

    for(unsigned int i = 0; i < featureCount; ++i) {
        featureStack_[0][i] = i;
    }

    // The number of labels is pushed twice so that the original value is
    // never overwritten
    nbLabelStack_.push_back(labelCount);
    nbLabelStack_.push_back(labelCount);

    // Remember the number of images and heuristics
    nbImages_ = inputSet.nbImages();
    nbHeuristics_ = inputSet.nbHeuristics();
}
// Writes the decision values of the trained model for one sample into distr.
//
// inputSet  the input set providing the features of the sample
// sample    index of the sample to classify
// distr     output array, written at the positions given by map_ (one entry
//           per label of the input set)
//
// If a subset of features was selected at training time (indices_ is not
// empty) it is pushed on the input set for the duration of the call.
void LinearSVM::distribution(InputSet& inputSet,
                             unsigned int sample,
                             scalar_t* distr) const {
    // Restrict the input set to the selected features, if any
    const bool useSelection = !indices_.empty();

    if(useSelection) {
        inputSet.pushFeatures(indices_);
    }

    // Dimensions as seen through the (possibly restricted) input set
    const unsigned int featureCount = inputSet.nbFeatures();
    const unsigned int labelCount = inputSet.nbLabels();

    // A model must have been trained...
    assert(model_);

    // ...and it must not know more classes than the input set has labels
    assert(static_cast<unsigned int>(get_nr_class(model_)) <= labelCount);

    // Wrap the features of the sample in a node (the const_cast is needed
    // because the node stores a non-const pointer)
    feature_node node;
    node.dim = featureCount;
    node.values = const_cast<scalar_t*>(inputSet.features(sample));

    // Decision values computed by the model
    std::vector<double> scores(labelCount);
    predict_values(model_, &node, &scores[0]);

    // Scatter the decision values into the distribution through map_
    for(unsigned int l = 0; l < labelCount; ++l) {
        distr[map_[l]] = scores[l];
    }

    // Undo the feature selection
    if(useSelection) {
        inputSet.popFeatures();
    }
}
// Copies into distr the label distribution of the leaf reached by a sample.
//
// inputSet  the input set providing the features of the sample
// sample    index of the sample to classify
// distr     output array, receives distr_.size() values of the reached leaf
void C45Tree::distribution(InputSet& inputSet,
                           unsigned int sample,
                           scalar_t* distr) const {
    // Feature values of the sample to classify
    const scalar_t* values = inputSet.features(sample);

    // Walk down from this node until a node without children is reached
    const C45Tree* node = this;

    while(node->children_[0]) {
        assert(node->feature_ < inputSet.nbFeatures());

        // Values not greater than the split go to the first child
        const bool goFirst = values[node->feature_] <= node->split_;
        node = goFirst ? node->children_[0] : node->children_[1];
    }

    // Copy the distribution stored in the leaf
    assert(node->distr_.size() == inputSet.nbLabels());
    std::copy(node->distr_.begin(), node->distr_.end(), distr);
}
// Estimates the error of the subtree rooted at this node on the given samples
// and, when update is true, prunes it bottom-up.
//
// inputSet   the input set providing labels, weights and feature values
// indices    the nbSamples sample indices reaching this node; the array is
//            partitioned in place around the split of every visited node
//            (also when update is false)
// nbSamples  number of entries in indices
// update     if false the subtree is only evaluated; if true label_,
//            sumWeights_ and distr_ are refreshed and the node may be
//            replaced by a leaf or by its heaviest child
//
// Returns the error estimate of the (possibly pruned) subtree, i.e. the
// weight of the misclassified samples plus the correction given by addErrs.
scalar_t C45Tree::prune(const InputSet& inputSet,
                        unsigned int* indices,
                        unsigned int nbSamples,
                        bool update) {
    const unsigned int nbLabels = inputSet.nbLabels();
    const unsigned int* labels = inputSet.labels();
    const scalar_t* weights = inputSet.weights();

    // Weighted frequency of every label among the samples of this node
    // (accumulated in double's for better precision)
    std::vector<double> histogram(nbLabels, 0);

    for(unsigned int s = 0; s < nbSamples; ++s) {
        const unsigned int sample = indices[s];
        histogram[labels[sample]] += weights[sample];
    }

    // Most frequent label and total weight
    const unsigned int majority =
        std::max_element(histogram.begin(), histogram.end()) -
        histogram.begin();
    const double totalWeight =
        std::accumulate(histogram.begin(), histogram.end(), 0.0);

    if(update) {
        label_ = majority;
        sumWeights_ = totalWeight;
    }

    // Error if this node were a leaf, plus the uncertainty term so as to
    // obtain an upper bound on the error
    double leafError = totalWeight - histogram[majority];
    leafError += addErrs(totalWeight, leafError);

    // A leaf has nothing below it to prune
    if(!children_[0]) {
        if(update) {
            distr_.assign(histogram.begin(), histogram.end());
        }

        return leafError;
    }

    // Internal node: prune the children first (bottom-up traversal). The
    // indices are partitioned in place, the first rightIndex entries go to
    // the first child and the remaining ones to the second child.
    const unsigned int rightIndex =
        Utils::partition(indices, inputSet.samples(feature_), nbSamples,
                         split_);

    scalar_t treeError =
        children_[0]->prune(inputSet, indices, rightIndex, update);
    treeError += children_[1]->prune(inputSet, indices + rightIndex,
                                     nbSamples - rightIndex, update);

    // When only evaluating there is nothing more to do
    if(!update) {
        return treeError;
    }

    // Pick the child with the largest weight (the first one on ties)
    const bool firstIsHeavier =
        !(children_[0]->sumWeights_ < children_[1]->sumWeights_);
    C45Tree* heavy = children_[firstIsHeavier ? 0 : 1];
    C45Tree* light = children_[firstIsHeavier ? 1 : 0];

    // Error obtained by sending all the samples down the heaviest child
    const scalar_t branchError =
        heavy->prune(inputSet, indices, nbSamples, false);

    // The +0.1 constant comes directly from Dr. Quinlan's implementation
    if(leafError <= branchError + 0.1 && leafError <= treeError + 0.1) {
        // Replace the subtree by a leaf
        distr_.assign(histogram.begin(), histogram.end());

        delete children_[0];
        delete children_[1];
        children_[0] = 0;
        children_[1] = 0;

        return leafError;
    }

    // Keeping the subtree as it is remains the best option
    if(!(branchError <= treeError + 0.1)) {
        return treeError;
    }

    // Replace the subtree by its heaviest child
    delete light;

    // Take over the split and the children of the heaviest child...
    feature_ = heavy->feature_;
    split_ = heavy->split_;
    children_[0] = heavy->children_[0];
    children_[1] = heavy->children_[1];

    // ...and delete only its top node (its children now belong to this node)
    heavy->children_[0] = 0;
    heavy->children_[1] = 0;
    delete heavy;

    // Update the grafted subtree with all the samples of this node
    prune(inputSet, indices, nbSamples, true);

    return branchError;
}
void C45Tree::make(const InputSet& inputSet, unsigned int* indices, unsigned int nbSamples) { // Stop if there is only one label (the tree is a leaf), or if we have // reached the maximum depth assert(distr_.size() > label_); if(Utils::geq(distr_[label_], sumWeights_) || !maxDepth_) { return; } // Get the number of features and labels const unsigned int nbFeatures = inputSet.nbFeatures(); const unsigned int nbLabels = inputSet.nbLabels(); // Get the labels and the weights of the samples const unsigned int* labels = inputSet.labels(); const scalar_t* weights = inputSet.weights(); // Compute the node's information from the frequency of each label scalar_t information = 0; for(unsigned int l = 0; l < nbLabels; ++l) { information += Utils::entropy(distr_[l]); } information -= Utils::entropy(sumWeights_); // Current best split in node scalar_t largestGain = 0; scalar_t sumWeights0 = 0; std::vector<scalar_t> distr0; // Frequencies of each label before the split. The frequencies of the labels // after the split can be directly calculated by subtracting to the total // frequencies std::vector<double> partialDistr(nbLabels); // Use double's for better // precision // Try to split on every feature for(unsigned int f = 0; f < nbFeatures; ++f) { // Set the partial frequencies to zero std::fill_n(partialDistr.begin(), nbLabels, 0); // Get the samples (the values of the current feature) const scalar_t* samples = inputSet.samples(f); // Sort the indices according to the current feature Utils::sort(indices, samples, nbSamples); // Update the weight of the samples before the split by adding the // weight of each sample at every iteration. 
The weight of the samples // after the split is just the sum of the weights minus that weight double leftWeight = 0; // Use a double for better precision for(unsigned int s = 0; s < nbSamples - 1; ++s) { const unsigned int index = indices[s]; const unsigned int nextIndex = indices[s + 1]; const unsigned int label = labels[index]; const scalar_t weight = weights[index]; partialDistr[label] += weight; leftWeight += weight; const double rightWeight = sumWeights_ - leftWeight; // Make sure the weights of the leaves are sufficient and try only // to split in-between two samples with different feature if(leftWeight >= minWeight_ && rightWeight >= minWeight_ && Utils::less(samples[index], samples[nextIndex])) { scalar_t leftInfo = 0, rightInfo = 0; for(unsigned int l = 0; l < nbLabels; ++l) { leftInfo += Utils::entropy(partialDistr[l]); rightInfo += Utils::entropy(distr_[l] - partialDistr[l]); } leftInfo -= Utils::entropy(leftWeight); rightInfo -= Utils::entropy(rightWeight); // Gain of the split scalar_t gain = information - leftInfo - rightInfo; // Gain ratio criterion // gain /= Utils::entropy(leftWeight) + // Utils::entropy(rightWeight) - // Utils::entropy(sumWeights_); // If it is the best gain so far... 
if(gain > largestGain) { largestGain = gain; feature_ = f; split_ = (samples[index] + samples[nextIndex]) / 2; sumWeights0 = leftWeight; distr0.assign(partialDistr.begin(), partialDistr.end()); } } } } // If no good split, create a leaf if(largestGain < Utils::epsilon) { return; } // Create the two children of the tree children_[0] = new C45Tree(minWeight_, confidence_, maxDepth_ - 1); children_[1] = new C45Tree(minWeight_, confidence_, maxDepth_ - 1); // Assign them the correct label, sumWeights and distribution children_[0]->label_ = std::max_element(distr0.begin(), distr0.end()) - distr0.begin(); // Ok since the distribution is only needed for leaves std::transform(distr_.begin(), distr_.end(), distr0.begin(), distr_.begin(), std::minus<scalar_t>()); children_[1]->label_ = std::max_element(distr_.begin(), distr_.end()) - distr_.begin(); children_[0]->sumWeights_ = sumWeights0; children_[1]->sumWeights_ = sumWeights_ - sumWeights0; children_[0]->distr_.swap(distr0); children_[1]->distr_.swap(distr_); // Build the indices for the left and the right children by partitioning // the indices in-place around the pivot split0 unsigned int rightIndex = Utils::partition(indices, inputSet.samples(feature_), nbSamples, split_); // Create the two children recursively children_[0]->make(inputSet, indices, rightIndex); children_[1]->make(inputSet, indices + rightIndex, nbSamples - rightIndex); }
// Trains the tree on the samples of inputSet, discarding any previously
// trained children.
//
// The weights of the input set are temporarily replaced by a rescaled copy
// summing to the number of samples (as in Dr. Quinlan's implementation) and
// are restored before returning. If maxDepth_ is 0 it is temporarily set to
// ceil(log2(number of labels)) and reset to 0 afterwards. The tree is pruned
// after construction only if confidence_ is below 1.
void C45Tree::train(InputSet& inputSet) {
    // In case the tree was already trained
    delete children_[0];
    delete children_[1];
    children_[0] = 0;
    children_[1] = 0;

    // Get the number of samples and labels
    const unsigned int nbSamples = inputSet.nbSamples();
    const unsigned int nbLabels = inputSet.nbLabels();

    // Set the maximal depth if needed
    bool maxDepth = false;

    if(!maxDepth_) {
        maxDepth_ = std::ceil(std::log(double(nbLabels)) / std::log(2.0));
        maxDepth = true;
    }

    // Get the labels and the weights of the samples
    const unsigned int* labels = inputSet.labels();
    const scalar_t* weights = inputSet.weights();

    // Make the weights sum to nbSamples as in Dr. Quinlan's implementation.
    // oldWeights first receives the rescaled weights; after the swap below it
    // holds the original weights of the input set until they are restored.
    // (A plain loop is used instead of std::bind2nd, which was removed from
    // the standard library in C++17.)
    std::vector<scalar_t> oldWeights(nbSamples);

    scalar_t norm = std::accumulate(weights, weights + nbSamples, scalar_t());
    const scalar_t scale = nbSamples / norm;

    for(unsigned int s = 0; s < nbSamples; ++s) {
        oldWeights[s] = weights[s] * scale;
    }

    inputSet.swapWeights(oldWeights);
    weights = inputSet.weights();

    // Compute the frequency of appearance of each label
    distr_.clear();
    distr_.resize(nbLabels, 0);

    for(unsigned int s = 0; s < nbSamples; ++s) {
        distr_[labels[s]] += weights[s];
    }

    // Determine the label with the largest frequency
    label_ = std::max_element(distr_.begin(), distr_.end()) - distr_.begin();
    sumWeights_ = nbSamples;

    // Vector of indices over the samples, ranging from 0 to nbSamples - 1
    std::vector<unsigned int> indices(nbSamples);

    for(unsigned int s = 0; s < nbSamples; ++s) {
        indices[s] = s;
    }

    // Taking the address of the first element of an empty vector is
    // undefined, so pass a null pointer when there are no samples
    unsigned int* const firstIndex = indices.empty() ? 0 : &indices[0];

    // Create the root (the children will follow recursively)
    make(inputSet, firstIndex, nbSamples);

    if(confidence_ < 1) {
        // make() reordered the indices, make them range from 0 to
        // nbSamples - 1 again
        for(unsigned int s = 0; s < nbSamples; ++s) {
            indices[s] = s;
        }

        // Prune the tree previously created
        prune(inputSet, firstIndex, nbSamples, true);
    }

    // Restore the original weights
    inputSet.swapWeights(oldWeights);

    // Restore maxDepth
    if(maxDepth) {
        maxDepth_ = 0;
    }
}
// Trains a linear SVM on the samples of inputSet, replacing any previous
// model.
//
// If the matrix of features would hold more than NB_FEATURES_MAX values, only
// NB_FEATURES_MAX / nbSamples sampled features are used; their indices are
// kept in indices_. The input set must contain at least one sample, since
// that budget is obtained by dividing by the number of samples.
//
// The features are rescaled so that the mean norm of a sample is 1. If
// parameters_.C is negative, C is selected by 5-folds cross-validation among
// 1000, 100, ..., 0.01 (stopping as soon as the error no longer decreases)
// and parameters_.C is reset to -1 after training. The labels reported by the
// trained model are stored in map_.
void LinearSVM::train(InputSet& inputSet) {
    // Sample features from every heuristic in order for the matrix of
    // features to fit in memory
    if(inputSet.nbFeatures() > NB_FEATURES_MAX / inputSet.nbSamples()) {
        inputSet.sampleFeatures(NB_FEATURES_MAX / inputSet.nbSamples(),
                                indices_);
        inputSet.pushFeatures(indices_);
    }
    else {
        indices_.clear();
    }

    // Get the number of features, samples and labels
    const unsigned int nbFeatures = inputSet.nbFeatures();
    const unsigned int nbSamples = inputSet.nbSamples();
    const unsigned int nbLabels = inputSet.nbLabels();

    // Delete the previous model
    if(model_) {
        free_and_destroy_model(&model_);
    }

    // Create a new problem
    problem prob;

    // Recopy the number of samples and features
    prob.l = nbSamples;
    prob.n = nbFeatures;

    // Recopy the labels (as int's)
    std::vector<int> labels(inputSet.labels(), inputSet.labels() + nbSamples);
    prob.y = &labels[0];

    // Take the matrix of features out of the input set (it is swapped with an
    // empty vector) so that it can be normalized in place.
    // NOTE(review): the matrix is not swapped back before returning --
    // presumably the input set reloads its features on demand; confirm.
    std::vector<scalar_t> matrix;
    inputSet.swapFeatures(matrix);

    // Create samples as expected by liblinear, sample s starting at offset
    // s * nbFeatures in the matrix
    std::vector<feature_node> samples(nbSamples);
    prob.x = &samples[0];

    // Compute the mean norm
    scalar_t meanNorm = 0;

    for(unsigned int s = 0; s < nbSamples; ++s) {
        samples[s].dim = nbFeatures;
        samples[s].values = &matrix[s * nbFeatures];

        // Add the norm of that sample
        meanNorm += nrm2(samples[s].dim, samples[s].values, 1);
    }

    // Divide the sum of the norms by the number of samples
    meanNorm /= nbSamples;

    std::cout << "[LinearSVM::train] mean(norm): " << meanNorm << '.'
              << std::endl;

    // Rescale the features so that their mean norm is 1. Skipped when the
    // mean norm is not positive, as dividing by zero would turn the features
    // into NaN's. (A plain loop is used instead of std::bind2nd, which was
    // removed from the standard library in C++17.)
    if(meanNorm > 0) {
        for(std::vector<scalar_t>::iterator it = matrix.begin();
            it != matrix.end(); ++it) {
            *it /= meanNorm;
        }
    }

    // Sets the bias to the default value (liblinear doesn't seem to handle
    // the bias parameter value correctly)
    prob.bias = -1;

    // A negative C requests cross-validation
    bool crossValidate = parameters_.C < 0;

    // Sets C to a default value in order to pass the parameter check
    if(crossValidate) {
        parameters_.C = 1;
    }

    // Make sure that the parameters are correct (check_parameter is expected
    // to return a null value when there is no problem with them)
    assert(!check_parameter(&prob, &parameters_));

    // If C was below zero, use 5-folds cross-validation to determine it
    if(crossValidate) {
        std::vector<int> target(nbSamples); // The predicted labels
        unsigned int nbErrorsMin = nbSamples + 1; // Initialize past the
                                                  // maximum

        for(parameters_.C = 1000; parameters_.C >= 0.01;
            parameters_.C /= 10) {
            cross_validation(&prob, &parameters_, 5, &target[0]);

            // Count the number of errors
            unsigned int nbErrors = 0;

            for(unsigned int s = 0; s < nbSamples; ++s) {
                if(target[s] != labels[s]) {
                    ++nbErrors;
                }
            }

            std::cout << "[LinearSVM::train] 5 folds cross-validation error "
                         "for C = " << parameters_.C << ": "
                      << nbErrors * 100.0f / nbSamples << "%." << std::endl;

            // The new C is better than the previous one
            if(nbErrors < nbErrorsMin) {
                nbErrorsMin = nbErrors;
            }
            // The optimal C was found
            else {
                break;
            }
        }

        // C got divided one time too much (whether the loop stopped on the
        // break or ran to its end, C is one step past the best value)
        parameters_.C *= 10;

        // Print C to the log
        std::cout << "[LinearSVM::train] optimal C as determined by 5 folds "
                     "cross-validation: " << parameters_.C << '.'
                  << std::endl;
    }

    // Train the svm
    model_ = ::train(&prob, &parameters_);
    assert(model_);

    // Reset C so that it will be cross-validated again
    if(crossValidate) {
        parameters_.C = -1;
    }

    // Save the labels as ordered by the model
    map_.clear();
    map_.resize(nbLabels);
    get_labels(model_, &map_[0]);

    // Pop the selected features if required
    if(!indices_.empty()) {
        inputSet.popFeatures();
    }
}