Filter::Filter(InputSet& inputSet) : inputSet_(inputSet) {
    // Get the input set's number of samples, features and labels
    const unsigned int nbSamples = inputSet.nbSamples();
    const unsigned int nbFeatures = inputSet.nbFeatures();
    const unsigned int nbLabels = inputSet.nbLabels();

    // Resize the initial stack of samples
    sampleStack_.resize(1);
    sampleStack_[0].resize(nbSamples);

    // Make the indices range from 0 to nbSamples - 1
    for(unsigned int s = 0; s < nbSamples; ++s) {
        sampleStack_[0][s] = s;
    }

    // Resize the initial stack of features
    featureStack_.resize(1);
    featureStack_[0].resize(nbFeatures);

    // Make the indices range from 0 to nbFeatures - 1
    for(unsigned int f = 0; f < nbFeatures; ++f) {
        featureStack_[0][f] = f;
    }

    // Set the number of labels
    nbLabelStack_.push_back(nbLabels); // Push it twice so as to be sure to
    nbLabelStack_.push_back(nbLabels); // never overwrite it

    // Set the number of images and heuristics
    nbImages_ = inputSet.nbImages();
    nbHeuristics_ = inputSet.nbHeuristics();
}
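// Illustrative sketch (not part of the library): the stacks initialized above
// support a push/pop discipline that lets filters temporarily restrict the
// visible samples or features and then restore the previous view. The
// pushSamples()/popSamples() calls mirror the usage in Statistical's
// constructor below; the function name and the chosen indices are assumptions.
void filterStackSketch(InputSet& inputSet) {
    std::vector<unsigned int> subset;
    subset.push_back(0);
    subset.push_back(2);
    inputSet.pushSamples(subset); // The visible samples are now {0, 2}
    // ... work on the restricted view ...
    inputSet.popSamples();        // Restore the previous view
}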
Statistical::Statistical(InputSet& inputSet) : Filter(inputSet) {
    // Get the number of samples and features
    const unsigned int nbSamples = inputSet.nbSamples();
    const unsigned int nbFeatures = inputSet.nbFeatures();

    // Used to store the current sum and sum of squares of every feature
    means_.resize(nbFeatures);
    stds_.resize(nbFeatures);

    for(unsigned int s = 0; s < nbSamples; ++s) {
        std::vector<unsigned int> sample(1, s);
        inputSet.pushSamples(sample);

        const scalar_t* features = inputSet.features(0);

        for(unsigned int f = 0; f < nbFeatures; ++f) {
            means_[f] += features[f];
            stds_[f] += features[f] * features[f];
        }

        inputSet.popSamples();
    }

    std::vector<unsigned int> featureStack;

    for(unsigned int f = 0; f < nbFeatures; ++f) {
        scalar_t mean = means_[f] / nbSamples;
        scalar_t variance = (stds_[f] - means_[f] * mean) / (nbSamples - 1);

        if(variance > 0) {
            means_[f] = mean;
            stds_[f] = std::sqrt(variance);
            featureStack.push_back(f);
        }
    }

    // There must be at least one feature
    assert(!featureStack.empty());

    // Push the selected features if needed
    if(featureStack.size() < nbFeatures) {
        featureStack_.push_back(featureStack);
    }
}
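// Minimal standalone sketch of the one-pass moment computation above: it
// relies on the identity var = (sum(x^2) - sum(x) * mean) / (n - 1), which
// gives the unbiased sample variance from a single pass over the data. The
// function name and the double accumulators are illustrative assumptions,
// not library code; requires <cmath> and <vector>.
void meanAndStdSketch(const std::vector<double>& x, double& mean, double& std) {
    double sum = 0, sumSq = 0;

    for(std::size_t i = 0; i < x.size(); ++i) {
        sum += x[i];
        sumSq += x[i] * x[i];
    }

    mean = sum / x.size();
    std = std::sqrt((sumSq - sum * mean) / (x.size() - 1)); // Unbiased estimate
}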
void LinearSVM::distribution(InputSet& inputSet, unsigned int sample,
                             scalar_t* distr) const {
    // Push the selected features if required
    if(!indices_.empty()) {
        inputSet.pushFeatures(indices_);
    }

    // Get the number of features and labels
    const unsigned int nbFeatures = inputSet.nbFeatures();
    const unsigned int nbLabels = inputSet.nbLabels();

    // Make sure that we have a model
    assert(model_);

    // Make sure that the model doesn't have more labels than the input set
    assert(static_cast<unsigned int>(get_nr_class(model_)) <= nbLabels);

    // Create a node
    feature_node node;
    node.dim = nbFeatures;
    node.values = const_cast<scalar_t*>(inputSet.features(sample));

    // The decision values predicted for each label
    std::vector<double> predictions(nbLabels);
    predict_values(model_, &node, &predictions[0]);

    // Update the distribution according to the predictions
    for(unsigned int l = 0; l < nbLabels; ++l) {
        distr[map_[l]] = predictions[l];
    }

    // Pop the selected features if required
    if(!indices_.empty()) {
        inputSet.popFeatures();
    }
}
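// Hypothetical illustration (all values invented) of the remapping above:
// liblinear numbers classes in the order it first encounters them during
// training, so decision values come back in that internal order, and map_
// (filled by get_labels() in train() below) translates them back to the
// input set's label indices.
void remapSketch() {
    const int map[3] = {2, 0, 1};                  // liblinear order -> label
    const double predictions[3] = {0.7, 0.1, 0.2}; // In liblinear's order
    double distr[3];

    for(unsigned int l = 0; l < 3; ++l) {
        distr[map[l]] = predictions[l];
    }

    // distr is now {0.1, 0.2, 0.7}, indexed by the input set's labels
}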
void C45Tree::distribution(InputSet& inputSet, unsigned int sample,
                           scalar_t* distr) const {
    // Get the features of the sample to classify
    const scalar_t* features = inputSet.features(sample);

    // The currently selected tree
    const C45Tree* current = this;

    // If the tree has children, find which one is selected
    while(current->children_[0]) {
        assert(current->feature_ < inputSet.nbFeatures());

        if(features[current->feature_] <= current->split_) {
            current = current->children_[0];
        }
        else {
            current = current->children_[1];
        }
    }

    // Recopy the distribution
    assert(current->distr_.size() == inputSet.nbLabels());
    std::copy(current->distr_.begin(), current->distr_.end(), distr);
}
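// Minimal standalone sketch of the same descent logic on a hand-built stump
// (plain structs rather than the C45Tree class, whose members are private;
// all names and values here are illustrative assumptions).
struct StumpSketch {
    unsigned int feature;   // Feature tested at the root
    double split;           // Threshold: <= goes left, > goes right
    double leafDistr[2][2]; // Label distributions of the two leaves
};

const double* classifySketch(const StumpSketch& s, const double* features) {
    // Mirrors the test in C45Tree::distribution() above
    return s.leafDistr[features[s.feature] <= s.split ? 0 : 1];
}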
void C45Tree::make(const InputSet& inputSet, unsigned int* indices,
                   unsigned int nbSamples) {
    // Stop if there is only one label (the tree is a leaf), or if we have
    // reached the maximum depth
    assert(distr_.size() > label_);

    if(Utils::geq(distr_[label_], sumWeights_) || !maxDepth_) {
        return;
    }

    // Get the number of features and labels
    const unsigned int nbFeatures = inputSet.nbFeatures();
    const unsigned int nbLabels = inputSet.nbLabels();

    // Get the labels and the weights of the samples
    const unsigned int* labels = inputSet.labels();
    const scalar_t* weights = inputSet.weights();

    // Compute the node's information from the frequency of each label
    scalar_t information = 0;

    for(unsigned int l = 0; l < nbLabels; ++l) {
        information += Utils::entropy(distr_[l]);
    }

    information -= Utils::entropy(sumWeights_);

    // Current best split in node
    scalar_t largestGain = 0;
    scalar_t sumWeights0 = 0;
    std::vector<scalar_t> distr0;

    // Frequencies of each label before the split. The frequencies of the
    // labels after the split can be directly calculated by subtracting them
    // from the total frequencies
    std::vector<double> partialDistr(nbLabels); // Use double's for better
                                                // precision

    // Try to split on every feature
    for(unsigned int f = 0; f < nbFeatures; ++f) {
        // Set the partial frequencies to zero
        std::fill_n(partialDistr.begin(), nbLabels, 0);

        // Get the samples (the values of the current feature)
        const scalar_t* samples = inputSet.samples(f);

        // Sort the indices according to the current feature
        Utils::sort(indices, samples, nbSamples);

        // Update the weight of the samples before the split by adding the
        // weight of each sample at every iteration. The weight of the samples
        // after the split is just the sum of the weights minus that weight
        double leftWeight = 0; // Use a double for better precision

        for(unsigned int s = 0; s < nbSamples - 1; ++s) {
            const unsigned int index = indices[s];
            const unsigned int nextIndex = indices[s + 1];
            const unsigned int label = labels[index];
            const scalar_t weight = weights[index];

            partialDistr[label] += weight;
            leftWeight += weight;

            const double rightWeight = sumWeights_ - leftWeight;

            // Make sure the weights of the leaves are sufficient, and only
            // try to split in-between two samples with different feature
            // values
            if(leftWeight >= minWeight_ && rightWeight >= minWeight_ &&
               Utils::less(samples[index], samples[nextIndex])) {
                scalar_t leftInfo = 0, rightInfo = 0;

                for(unsigned int l = 0; l < nbLabels; ++l) {
                    leftInfo += Utils::entropy(partialDistr[l]);
                    rightInfo += Utils::entropy(distr_[l] - partialDistr[l]);
                }

                leftInfo -= Utils::entropy(leftWeight);
                rightInfo -= Utils::entropy(rightWeight);

                // Gain of the split
                scalar_t gain = information - leftInfo - rightInfo;

                // Gain ratio criterion
                // gain /= Utils::entropy(leftWeight) +
                //         Utils::entropy(rightWeight) -
                //         Utils::entropy(sumWeights_);

                // If it is the best gain so far...
                if(gain > largestGain) {
                    largestGain = gain;
                    feature_ = f;
                    split_ = (samples[index] + samples[nextIndex]) / 2;
                    sumWeights0 = leftWeight;
                    distr0.assign(partialDistr.begin(), partialDistr.end());
                }
            }
        }
    }

    // If no good split, create a leaf
    if(largestGain < Utils::epsilon) {
        return;
    }

    // Create the two children of the tree
    children_[0] = new C45Tree(minWeight_, confidence_, maxDepth_ - 1);
    children_[1] = new C45Tree(minWeight_, confidence_, maxDepth_ - 1);

    // Assign them the correct label, sumWeights and distribution
    children_[0]->label_ = std::max_element(distr0.begin(), distr0.end()) -
                           distr0.begin();

    // Ok since the distribution is only needed for leaves
    std::transform(distr_.begin(), distr_.end(), distr0.begin(),
                   distr_.begin(), std::minus<scalar_t>());

    children_[1]->label_ = std::max_element(distr_.begin(), distr_.end()) -
                           distr_.begin();

    children_[0]->sumWeights_ = sumWeights0;
    children_[1]->sumWeights_ = sumWeights_ - sumWeights0;

    children_[0]->distr_.swap(distr0);
    children_[1]->distr_.swap(distr_);

    // Build the indices for the left and the right children by partitioning
    // the indices in-place around the pivot split_
    unsigned int rightIndex = Utils::partition(indices,
                                               inputSet.samples(feature_),
                                               nbSamples, split_);

    // Create the two children recursively
    children_[0]->make(inputSet, indices, rightIndex);
    children_[1]->make(inputSet, indices + rightIndex,
                       nbSamples - rightIndex);
}
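// Sketch of the contract assumed of Utils::partition() above (the actual
// implementation lives in Utils and may differ): reorder 'indices' so that
// the samples whose feature value is <= the pivot come first, and return how
// many there are. std::partition from <algorithm> can express the same
// contract; the names below are illustrative assumptions.
struct LeqPivotSketch {
    const scalar_t* samples;
    scalar_t pivot;

    bool operator()(unsigned int i) const {
        return samples[i] <= pivot;
    }
};

unsigned int partitionSketch(unsigned int* indices, const scalar_t* samples,
                             unsigned int nbSamples, scalar_t pivot) {
    LeqPivotSketch pred = {samples, pivot};
    return std::partition(indices, indices + nbSamples, pred) - indices;
}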
void LinearSVM::train(InputSet& inputSet) {
    // Sample features from every heuristic in order for the matrix of
    // features to fit in memory
    if(inputSet.nbFeatures() > NB_FEATURES_MAX / inputSet.nbSamples()) {
        inputSet.sampleFeatures(NB_FEATURES_MAX / inputSet.nbSamples(),
                                indices_);
        inputSet.pushFeatures(indices_);
    }
    else {
        indices_.clear();
    }

    // Get the number of features, samples and labels
    const unsigned int nbFeatures = inputSet.nbFeatures();
    const unsigned int nbSamples = inputSet.nbSamples();
    const unsigned int nbLabels = inputSet.nbLabels();

    // Delete the previous model
    if(model_) {
        free_and_destroy_model(&model_);
    }

    // Create a new problem
    problem prob;

    // Recopy the number of samples and features
    prob.l = nbSamples;
    prob.n = nbFeatures;

    // Recopy the labels
    std::vector<int> labels(inputSet.labels(), inputSet.labels() + nbSamples);
    prob.y = &labels[0];

    // Recopy the features (as we want to normalize them)
    std::vector<scalar_t> matrix;
    inputSet.swapFeatures(matrix);

    // Create samples as expected by liblinear
    std::vector<feature_node> samples(nbSamples);
    prob.x = &samples[0];

    // Compute the mean norm
    scalar_t meanNorm = 0;

    for(unsigned int s = 0; s < nbSamples; ++s) {
        samples[s].dim = nbFeatures;
        samples[s].values = &matrix[s * nbFeatures];

        // Add the norm of that sample
        meanNorm += nrm2(samples[s].dim, samples[s].values, 1);
    }

    // Divide the sum of the norms by the number of samples
    meanNorm /= nbSamples;

    std::cout << "[LinearSVM::train] mean(norm): " << meanNorm << '.'
              << std::endl;

    // Rescale the features so that their mean norm is 1
    std::transform(matrix.begin(), matrix.end(), matrix.begin(),
                   std::bind2nd(std::divides<scalar_t>(), meanNorm));

    // Set the bias to the default value (liblinear doesn't seem to handle
    // the bias parameter value correctly)
    prob.bias = -1;

    // A negative C means that it should be determined by cross-validation
    bool crossValidate = parameters_.C < 0;

    // Set C to a default value in order to pass the parameter check
    if(crossValidate) {
        parameters_.C = 1;
    }

    // Make sure that there is no problem with the parameters
    assert(!check_parameter(&prob, &parameters_));

    // If C was below zero, use 5-fold cross-validation to determine it
    if(crossValidate) {
        std::vector<int> target(nbSamples); // The predicted labels
        unsigned int nbErrorsMin = nbSamples + 1; // Initialize past the
                                                  // maximum

        for(parameters_.C = 1000; parameters_.C >= 0.01; parameters_.C /= 10) {
            cross_validation(&prob, &parameters_, 5, &target[0]);

            // Count the number of errors
            unsigned int nbErrors = 0;

            for(unsigned int s = 0; s < nbSamples; ++s) {
                if(target[s] != labels[s]) {
                    ++nbErrors;
                }
            }

            std::cout << "[LinearSVM::train] 5-fold cross-validation error "
                         "for C = " << parameters_.C << ": "
                      << nbErrors * 100.0f / nbSamples << "%." << std::endl;

            // The new C is better than the previous one
            if(nbErrors < nbErrorsMin) {
                nbErrorsMin = nbErrors;
            }
            // The optimal C was found
            else {
                break;
            }
        }

        // C got divided one time too much
        parameters_.C *= 10;

        // Print C to the log
        std::cout << "[LinearSVM::train] optimal C as determined by 5-fold "
                     "cross-validation: " << parameters_.C << '.'
                  << std::endl;
    }

    // Train the svm
    model_ = ::train(&prob, &parameters_);
    assert(model_);

    // Reset C so that it will be cross-validated again
    if(crossValidate) {
        parameters_.C = -1;
    }

    // Save the liblinear labels
    map_.clear();
    map_.resize(nbLabels);
    get_labels(model_, &map_[0]);

    // Pop the selected features if required
    if(!indices_.empty()) {
        inputSet.popFeatures();
    }
}
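// Hypothetical usage sketch tying train() and distribution() together. The
// way the InputSet is obtained and the LinearSVM constructor's signature are
// assumptions (only the train() and distribution() calls are from this
// file); the search grid mentioned in the comment is the one hard-coded in
// train() above: C in {1000, 100, 10, 1, 0.1, 0.01}, stopping at the first
// increase of the cross-validation error.
void linearSVMUsageSketch(InputSet& inputSet) {
    LinearSVM svm; // Assumed to default to C < 0, i.e. cross-validated C
    svm.train(inputSet); // Normalizes the features and trains liblinear

    std::vector<scalar_t> distr(inputSet.nbLabels());
    svm.distribution(inputSet, 0, &distr[0]); // Scores of the first sample
}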