Filter::Filter(InputSet& inputSet) : inputSet_(inputSet) {
    // Get the input set's number of samples, features and labels
    const unsigned int nbSamples = inputSet.nbSamples();
    const unsigned int nbFeatures = inputSet.nbFeatures();
    const unsigned int nbLabels = inputSet.nbLabels();

    // Resize the initial stack of samples
    sampleStack_.resize(1);
    sampleStack_[0].resize(nbSamples);

    // Make the indices range from 0 to nbSamples - 1
    for(unsigned int s = 0; s < nbSamples; ++s) {
        sampleStack_[0][s] = s;
    }

    // Resize the initial stack of features
    featureStack_.resize(1);
    featureStack_[0].resize(nbFeatures);

    // Make the indices range from 0 to nbFeatures - 1
    for(unsigned int f = 0; f < nbFeatures; ++f) {
        featureStack_[0][f] = f;
    }

    // Set the number of labels
    nbLabelStack_.push_back(nbLabels); // Push it twice so as to be sure to
    nbLabelStack_.push_back(nbLabels); // never overwrite it

    // Set the number of images and heuristics
    nbImages_ = inputSet.nbImages();
    nbHeuristics_ = inputSet.nbHeuristics();
}
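// Illustrative sketch (not part of the library): the stacks initialized above
// support a push/pop discipline that lets filters temporarily restrict the
// visible samples or features and then restore the previous view. The
// pushSamples()/popSamples() calls mirror the usage in Statistical's
// constructor below; the function name and the chosen indices are assumptions.
void filterStackSketch(InputSet& inputSet) {
    std::vector<unsigned int> subset;
    subset.push_back(0);
    subset.push_back(2);
    inputSet.pushSamples(subset); // The visible samples are now {0, 2}
    // ... work on the restricted view ...
    inputSet.popSamples();        // Restore the previous view
}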
Statistical::Statistical(InputSet& inputSet) : Filter(inputSet) {
    // Get the number of samples and features
    const unsigned int nbSamples = inputSet.nbSamples();
    const unsigned int nbFeatures = inputSet.nbFeatures();

    // Used to store the current sum and sum of squares of every feature
    means_.resize(nbFeatures);
    stds_.resize(nbFeatures);

    for(unsigned int s = 0; s < nbSamples; ++s) {
        std::vector<unsigned int> sample(1, s);
        inputSet.pushSamples(sample);

        const scalar_t* features = inputSet.features(0);

        for(unsigned int f = 0; f < nbFeatures; ++f) {
            means_[f] += features[f];
            stds_[f] += features[f] * features[f];
        }

        inputSet.popSamples();
    }

    std::vector<unsigned int> featureStack;

    for(unsigned int f = 0; f < nbFeatures; ++f) {
        scalar_t mean = means_[f] / nbSamples;
        scalar_t variance = (stds_[f] - means_[f] * mean) / (nbSamples - 1);

        if(variance > 0) {
            means_[f] = mean;
            stds_[f] = std::sqrt(variance);
            featureStack.push_back(f);
        }
    }

    // There must be at least one feature
    assert(!featureStack.empty());

    // Push the selected features if needed
    if(featureStack.size() < nbFeatures) {
        featureStack_.push_back(featureStack);
    }
}
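// Minimal standalone sketch of the one-pass moment computation above: it
// relies on the identity var = (sum(x^2) - sum(x) * mean) / (n - 1), which
// gives the unbiased sample variance from a single pass over the data. The
// function name and the double accumulators are illustrative assumptions,
// not library code; requires <cmath> and <vector>.
void meanAndStdSketch(const std::vector<double>& x, double& mean, double& std) {
    double sum = 0, sumSq = 0;

    for(std::size_t i = 0; i < x.size(); ++i) {
        sum += x[i];
        sumSq += x[i] * x[i];
    }

    mean = sum / x.size();
    std = std::sqrt((sumSq - sum * mean) / (x.size() - 1)); // Unbiased estimate
}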
void LinearSVM::distribution(InputSet& inputSet, unsigned int sample,
                             scalar_t* distr) const {
    // Push the selected features if required
    if(!indices_.empty()) {
        inputSet.pushFeatures(indices_);
    }

    // Get the number of features and labels
    const unsigned int nbFeatures = inputSet.nbFeatures();
    const unsigned int nbLabels = inputSet.nbLabels();

    // Make sure that we have a model
    assert(model_);

    // Make sure that the model doesn't have more labels than the input set
    assert(static_cast<unsigned int>(get_nr_class(model_)) <= nbLabels);

    // Create a node
    feature_node node;
    node.dim = nbFeatures;
    node.values = const_cast<scalar_t*>(inputSet.features(sample));

    // The decision values predicted for each label
    std::vector<double> predictions(nbLabels);
    predict_values(model_, &node, &predictions[0]);

    // Update the distribution according to the predictions
    for(unsigned int l = 0; l < nbLabels; ++l) {
        distr[map_[l]] = predictions[l];
    }

    // Pop the selected features if required
    if(!indices_.empty()) {
        inputSet.popFeatures();
    }
}
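// Hypothetical illustration (all values invented) of the remapping above:
// liblinear numbers classes in the order it first encounters them during
// training, so decision values come back in that internal order, and map_
// (filled by get_labels() in train() below) translates them back to the
// input set's label indices.
void remapSketch() {
    const int map[3] = {2, 0, 1};                  // liblinear order -> label
    const double predictions[3] = {0.7, 0.1, 0.2}; // In liblinear's order
    double distr[3];

    for(unsigned int l = 0; l < 3; ++l) {
        distr[map[l]] = predictions[l];
    }

    // distr is now {0.1, 0.2, 0.7}, indexed by the input set's labels
}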
void C45Tree::distribution(InputSet& inputSet, unsigned int sample,
                           scalar_t* distr) const {
    // Get the features of the sample to classify
    const scalar_t* features = inputSet.features(sample);

    // The currently selected tree
    const C45Tree* current = this;

    // If the tree has children, find which one is selected
    while(current->children_[0]) {
        assert(current->feature_ < inputSet.nbFeatures());

        if(features[current->feature_] <= current->split_) {
            current = current->children_[0];
        }
        else {
            current = current->children_[1];
        }
    }

    // Recopy the distribution
    assert(current->distr_.size() == inputSet.nbLabels());
    std::copy(current->distr_.begin(), current->distr_.end(), distr);
}
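// Minimal standalone sketch of the same descent logic on a hand-built stump
// (plain structs rather than the C45Tree class, whose members are private;
// all names and values here are illustrative assumptions).
struct StumpSketch {
    unsigned int feature;   // Feature tested at the root
    double split;           // Threshold: <= goes left, > goes right
    double leafDistr[2][2]; // Label distributions of the two leaves
};

const double* classifySketch(const StumpSketch& s, const double* features) {
    // Mirrors the test in C45Tree::distribution() above
    return s.leafDistr[features[s.feature] <= s.split ? 0 : 1];
}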
void C45Tree::make(const InputSet& inputSet, unsigned int* indices,
                   unsigned int nbSamples) {
    // Stop if there is only one label (the tree is a leaf), or if we have
    // reached the maximum depth
    assert(distr_.size() > label_);

    if(Utils::geq(distr_[label_], sumWeights_) || !maxDepth_) {
        return;
    }

    // Get the number of features and labels
    const unsigned int nbFeatures = inputSet.nbFeatures();
    const unsigned int nbLabels = inputSet.nbLabels();

    // Get the labels and the weights of the samples
    const unsigned int* labels = inputSet.labels();
    const scalar_t* weights = inputSet.weights();

    // Compute the node's information from the frequency of each label
    scalar_t information = 0;

    for(unsigned int l = 0; l < nbLabels; ++l) {
        information += Utils::entropy(distr_[l]);
    }

    information -= Utils::entropy(sumWeights_);

    // Current best split in node
    scalar_t largestGain = 0;
    scalar_t sumWeights0 = 0;
    std::vector<scalar_t> distr0;

    // Frequencies of each label before the split. The frequencies of the
    // labels after the split can be directly calculated by subtracting them
    // from the total frequencies
    std::vector<double> partialDistr(nbLabels); // Use double's for better
                                                // precision

    // Try to split on every feature
    for(unsigned int f = 0; f < nbFeatures; ++f) {
        // Set the partial frequencies to zero
        std::fill_n(partialDistr.begin(), nbLabels, 0);

        // Get the samples (the values of the current feature)
        const scalar_t* samples = inputSet.samples(f);

        // Sort the indices according to the current feature
        Utils::sort(indices, samples, nbSamples);

        // Update the weight of the samples before the split by adding the
        // weight of each sample at every iteration. The weight of the samples
        // after the split is just the sum of the weights minus that weight
        double leftWeight = 0; // Use a double for better precision

        for(unsigned int s = 0; s < nbSamples - 1; ++s) {
            const unsigned int index = indices[s];
            const unsigned int nextIndex = indices[s + 1];
            const unsigned int label = labels[index];
            const scalar_t weight = weights[index];

            partialDistr[label] += weight;
            leftWeight += weight;

            const double rightWeight = sumWeights_ - leftWeight;

            // Make sure the weights of the leaves are sufficient, and only
            // try to split in-between two samples with different feature
            // values
            if(leftWeight >= minWeight_ && rightWeight >= minWeight_ &&
               Utils::less(samples[index], samples[nextIndex])) {
                scalar_t leftInfo = 0, rightInfo = 0;

                for(unsigned int l = 0; l < nbLabels; ++l) {
                    leftInfo += Utils::entropy(partialDistr[l]);
                    rightInfo += Utils::entropy(distr_[l] - partialDistr[l]);
                }

                leftInfo -= Utils::entropy(leftWeight);
                rightInfo -= Utils::entropy(rightWeight);

                // Gain of the split
                scalar_t gain = information - leftInfo - rightInfo;

                // Gain ratio criterion
                // gain /= Utils::entropy(leftWeight) +
                //         Utils::entropy(rightWeight) -
                //         Utils::entropy(sumWeights_);

                // If it is the best gain so far...
                if(gain > largestGain) {
                    largestGain = gain;
                    feature_ = f;
                    split_ = (samples[index] + samples[nextIndex]) / 2;
                    sumWeights0 = leftWeight;
                    distr0.assign(partialDistr.begin(), partialDistr.end());
                }
            }
        }
    }

    // If no good split, create a leaf
    if(largestGain < Utils::epsilon) {
        return;
    }

    // Create the two children of the tree
    children_[0] = new C45Tree(minWeight_, confidence_, maxDepth_ - 1);
    children_[1] = new C45Tree(minWeight_, confidence_, maxDepth_ - 1);

    // Assign them the correct label, sumWeights and distribution
    children_[0]->label_ = std::max_element(distr0.begin(), distr0.end()) -
                           distr0.begin();

    // Ok since the distribution is only needed for leaves
    std::transform(distr_.begin(), distr_.end(), distr0.begin(),
                   distr_.begin(), std::minus<scalar_t>());

    children_[1]->label_ = std::max_element(distr_.begin(), distr_.end()) -
                           distr_.begin();

    children_[0]->sumWeights_ = sumWeights0;
    children_[1]->sumWeights_ = sumWeights_ - sumWeights0;

    children_[0]->distr_.swap(distr0);
    children_[1]->distr_.swap(distr_);

    // Build the indices for the left and the right children by partitioning
    // the indices in-place around the pivot split_
    unsigned int rightIndex = Utils::partition(indices,
                                               inputSet.samples(feature_),
                                               nbSamples, split_);

    // Create the two children recursively
    children_[0]->make(inputSet, indices, rightIndex);
    children_[1]->make(inputSet, indices + rightIndex,
                       nbSamples - rightIndex);
}
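// Sketch of the contract assumed of Utils::partition() above (the actual
// implementation lives in Utils and may differ): reorder 'indices' so that
// the samples whose feature value is <= the pivot come first, and return how
// many there are. std::partition from <algorithm> can express the same
// contract; the names below are illustrative assumptions.
struct LeqPivotSketch {
    const scalar_t* samples;
    scalar_t pivot;

    bool operator()(unsigned int i) const {
        return samples[i] <= pivot;
    }
};

unsigned int partitionSketch(unsigned int* indices, const scalar_t* samples,
                             unsigned int nbSamples, scalar_t pivot) {
    LeqPivotSketch pred = {samples, pivot};
    return std::partition(indices, indices + nbSamples, pred) - indices;
}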
void LinearSVM::train(InputSet& inputSet) {
    // Sample features from every heuristic in order for the matrix of
    // features to fit in memory
    if(inputSet.nbFeatures() > NB_FEATURES_MAX / inputSet.nbSamples()) {
        inputSet.sampleFeatures(NB_FEATURES_MAX / inputSet.nbSamples(),
                                indices_);
        inputSet.pushFeatures(indices_);
    }
    else {
        indices_.clear();
    }

    // Get the number of features, samples and labels
    const unsigned int nbFeatures = inputSet.nbFeatures();
    const unsigned int nbSamples = inputSet.nbSamples();
    const unsigned int nbLabels = inputSet.nbLabels();

    // Delete the previous model
    if(model_) {
        free_and_destroy_model(&model_);
    }

    // Create a new problem
    problem prob;

    // Recopy the number of samples and features
    prob.l = nbSamples;
    prob.n = nbFeatures;

    // Recopy the labels
    std::vector<int> labels(inputSet.labels(), inputSet.labels() + nbSamples);
    prob.y = &labels[0];

    // Recopy the features (as we want to normalize them)
    std::vector<scalar_t> matrix;
    inputSet.swapFeatures(matrix);

    // Create samples as expected by liblinear
    std::vector<feature_node> samples(nbSamples);
    prob.x = &samples[0];

    // Compute the mean norm
    scalar_t meanNorm = 0;

    for(unsigned int s = 0; s < nbSamples; ++s) {
        samples[s].dim = nbFeatures;
        samples[s].values = &matrix[s * nbFeatures];

        // Add the norm of that sample
        meanNorm += nrm2(samples[s].dim, samples[s].values, 1);
    }

    // Divide the sum of the norms by the number of samples
    meanNorm /= nbSamples;

    std::cout << "[LinearSVM::train] mean(norm): " << meanNorm << '.'
              << std::endl;

    // Rescale the features so that their mean norm is 1
    std::transform(matrix.begin(), matrix.end(), matrix.begin(),
                   std::bind2nd(std::divides<scalar_t>(), meanNorm));

    // Set the bias to the default value (liblinear doesn't seem to handle
    // the bias parameter value correctly)
    prob.bias = -1;

    // A negative C means that it should be determined by cross-validation
    bool crossValidate = parameters_.C < 0;

    // Set C to a default value in order to pass the parameter check
    if(crossValidate) {
        parameters_.C = 1;
    }

    // Make sure that there is no problem with the parameters
    assert(!check_parameter(&prob, &parameters_));

    // If C was below zero, use 5-fold cross-validation to determine it
    if(crossValidate) {
        std::vector<int> target(nbSamples); // The predicted labels
        unsigned int nbErrorsMin = nbSamples + 1; // Initialize past the
                                                  // maximum

        for(parameters_.C = 1000; parameters_.C >= 0.01; parameters_.C /= 10) {
            cross_validation(&prob, &parameters_, 5, &target[0]);

            // Count the number of errors
            unsigned int nbErrors = 0;

            for(unsigned int s = 0; s < nbSamples; ++s) {
                if(target[s] != labels[s]) {
                    ++nbErrors;
                }
            }

            std::cout << "[LinearSVM::train] 5-fold cross-validation error "
                         "for C = " << parameters_.C << ": "
                      << nbErrors * 100.0f / nbSamples << "%." << std::endl;

            // The new C is better than the previous one
            if(nbErrors < nbErrorsMin) {
                nbErrorsMin = nbErrors;
            }
            // The optimal C was found
            else {
                break;
            }
        }

        // C got divided one time too much
        parameters_.C *= 10;

        // Print C to the log
        std::cout << "[LinearSVM::train] optimal C as determined by 5-fold "
                     "cross-validation: " << parameters_.C << '.'
                  << std::endl;
    }

    // Train the svm
    model_ = ::train(&prob, &parameters_);
    assert(model_);

    // Reset C so that it will be cross-validated again
    if(crossValidate) {
        parameters_.C = -1;
    }

    // Save the liblinear labels
    map_.clear();
    map_.resize(nbLabels);
    get_labels(model_, &map_[0]);

    // Pop the selected features if required
    if(!indices_.empty()) {
        inputSet.popFeatures();
    }
}
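// Hypothetical usage sketch tying train() and distribution() together. The
// way the InputSet is obtained and the LinearSVM constructor's signature are
// assumptions (only the train() and distribution() calls are from this
// file); the search grid mentioned in the comment is the one hard-coded in
// train() above: C in {1000, 100, 10, 1, 0.1, 0.01}, stopping at the first
// increase of the cross-validation error.
void linearSVMUsageSketch(InputSet& inputSet) {
    LinearSVM svm; // Assumed to default to C < 0, i.e. cross-validated C
    svm.train(inputSet); // Normalizes the features and trains liblinear

    std::vector<scalar_t> distr(inputSet.nbLabels());
    svm.distribution(inputSet, 0, &distr[0]); // Scores of the first sample
}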