C++ (Cpp) InputSet::nbLabels示例

示例#1

0

显示文件

文件： Filter.cpp 项目： idiap/mash

// Creates a filter that initially selects everything in the input set: the
// bottom entry of every stack covers all the samples, all the features and all
// the labels.
Filter::Filter(InputSet& inputSet) : inputSet_(inputSet) {
	// Query the dimensions of the input set once
	const unsigned int sampleCount = inputSet.nbSamples();
	const unsigned int featureCount = inputSet.nbFeatures();
	const unsigned int labelCount = inputSet.nbLabels();

	// Bottom of the sample stack: the identity selection 0 .. sampleCount - 1
	sampleStack_.resize(1);
	sampleStack_.front().resize(sampleCount);

	for(unsigned int index = 0; index != sampleCount; ++index) {
		sampleStack_.front()[index] = index;
	}

	// Bottom of the feature stack: the identity selection 0 .. featureCount - 1
	featureStack_.resize(1);
	featureStack_.front().resize(featureCount);

	for(unsigned int index = 0; index != featureCount; ++index) {
		featureStack_.front()[index] = index;
	}

	// The number of labels is stored twice so that the bottom entry is never
	// the one being overwritten
	for(unsigned int copy = 0; copy != 2; ++copy) {
		nbLabelStack_.push_back(labelCount);
	}

	// Cache the number of images and heuristics
	nbImages_ = inputSet.nbImages();
	nbHeuristics_ = inputSet.nbHeuristics();
}

示例#2

0

显示文件

文件： LinearSVM.cpp 项目： idiap/mash

// Computes the SVM decision values of one sample and stores them in distr,
// indexed by input-set label.
//
// inputSet - the set the sample belongs to; if train() selected a subset of
//            features (indices_ not empty) that subset is pushed for the
//            duration of the call and popped before returning
// sample   - index of the sample to classify
// distr    - output array, written at the positions map_[0 .. nbLabels)
//
// The model must have been trained (model_ non-null) and must not know more
// classes than the input set has labels.
void LinearSVM::distribution(InputSet& inputSet,
							 unsigned int sample,
							 scalar_t* distr) const {
	// Push the selected features if required
	if(!indices_.empty()) {
		inputSet.pushFeatures(indices_);
	}

	// Get the number of features and labels
	const unsigned int nbFeatures = inputSet.nbFeatures();
	const unsigned int nbLabels = inputSet.nbLabels();

	// Make sure that we have a model
	assert(model_);

	// Make sure that the model does not have more classes than there are labels
	assert(static_cast<unsigned int>(get_nr_class(model_)) <= nbLabels);

	// Make sure that the label map covers every index read below
	assert(map_.size() >= nbLabels);

	// Create a node pointing directly at the features of the sample (the
	// const_cast is only there to satisfy the liblinear structure)
	feature_node node;
	node.dim = nbFeatures;
	node.values = const_cast<scalar_t*>(inputSet.features(sample));

	// The predicted decision values, zero-initialized. predict_values is
	// presumably only filling one entry per class of the model, so the tail
	// stays at zero when the model has fewer classes than nbLabels.
	std::vector<double> predictions(nbLabels);
	predict_values(model_, &node, &predictions[0]);

	// Update the distribution according to the predictions.
	// train() zero-fills map_ before get_labels() writes the model's labels at
	// its front, so the unused tail of map_ maps to label 0 with a prediction
	// of 0. Walking from the back writes that padding first and the real
	// classes last, so the padding can no longer clobber the score of the
	// class that is genuinely mapped to label 0.
	for(unsigned int l = nbLabels; l-- > 0;) {
		distr[map_[l]] = predictions[l];
	}

	// Pop the selected features if required
	if(!indices_.empty()) {
		inputSet.popFeatures();
	}
}

示例#3

0

显示文件

文件： C45Tree.cpp 项目： idiap/mash

// Classifies one sample by walking down the tree and copies the label
// distribution stored in the leaf that is reached into distr.
void C45Tree::distribution(InputSet& inputSet,
						   unsigned int sample,
						   scalar_t* distr) const {
	// Feature values of the sample to classify
	const scalar_t* values = inputSet.features(sample);

	// Descend from this node until a node without children is reached
	const C45Tree* node = this;

	for(; node->children_[0];) {
		assert(node->feature_ < inputSet.nbFeatures());

		// Values up to (and including) the split go to the first child, the
		// others to the second one
		const bool goesLeft = values[node->feature_] <= node->split_;
		node = node->children_[goesLeft ? 0 : 1];
	}

	// Hand the distribution of the leaf back to the caller
	assert(node->distr_.size() == inputSet.nbLabels());
	std::copy(node->distr_.begin(), node->distr_.end(), distr);
}

示例#4

0

显示文件

文件： C45Tree.cpp 项目： idiap/mash

// Estimates the (pessimistic) error of the subtree rooted at this node on the
// samples indices[0 .. nbSamples) and, when update is true, prunes that
// subtree bottom-up.
//
// inputSet  - provides the labels, the weights and the per-feature values of
//             the samples
// indices   - indices of the samples reaching this node; they get reordered
//             in place when they are partitioned between the two children
// nbSamples - number of entries in indices
// update    - when true the node is modified (label_, sumWeights_, distr_ and
//             possibly children_); when false the existing subtree is only
//             evaluated
//
// Returns the error of a leaf if the node is (or has just become) a leaf, the
// error of the biggest branch if the node was replaced by it, and otherwise
// the sum of the errors of the two children.
scalar_t C45Tree::prune(const InputSet& inputSet,
						unsigned int* indices,
						unsigned int nbSamples,
						bool update) {
	// Get the number of labels
	const unsigned int nbLabels = inputSet.nbLabels();

	// Get the labels and the weights of the samples
	const unsigned int* labels = inputSet.labels();
	const scalar_t* weights = inputSet.weights();

	// Compute the frequency of appearance of each label
	std::vector<double> distr(nbLabels, 0); // Use double's for better precision

	// Accumulate the weight of every sample reaching this node on its label
	for(unsigned int s = 0; s < nbSamples; ++s) {
		distr[labels[indices[s]]] += weights[indices[s]];
	}

	// The label with the largest accumulated weight
	unsigned int label = std::max_element(distr.begin(), distr.end()) -
						 distr.begin();

	// Total weight of the samples reaching this node
	double sumWeights = std::accumulate(distr.begin(), distr.end(), 0.0);

	// Update the node if needed
	if(update) {
		label_ = label;
		sumWeights_ = sumWeights;
	}

	// The error if the tree was a leaf: the weight of every sample that does
	// not carry the majority label
	double leafError = sumWeights - distr[label];

	// Add the uncertainty to obtain an upper bound on the error
	leafError += addErrs(sumWeights, leafError);

	// If the tree is a leaf
	if(!children_[0]) {
		// Update the node if needed
		if(update) {
			distr_.assign(distr.begin(), distr.end());
		}

		return leafError;
	}
	// If the tree has children first prune them (bottom-up traversal)
	else {
		// Build the indices for the left and the right children by partitioning
		// the indices in-place
		unsigned int rightIndex = Utils::partition(indices,
												   inputSet.samples(feature_),
												   nbSamples,
												   split_);

		// Prune the left child
		scalar_t treeError = children_[0]->prune(inputSet, indices, rightIndex,
												 update);

		// Prune the right child
		treeError += children_[1]->prune(inputSet, indices + rightIndex,
										 nbSamples - rightIndex, update);

		// When only evaluating, the error of the subtree is all that is needed
		if(!update) {
			return treeError;
		}

		// Compute the classification error on the biggest branch (the child
		// with the largest sum of weights)
		C45Tree* biggestBranch = children_[0];
		C45Tree* smallestBranch = children_[1];

		if(biggestBranch->sumWeights_ < smallestBranch->sumWeights_) {
			std::swap(biggestBranch, smallestBranch);
		}

		// Evaluate (without modifying it) the biggest branch on all the samples
		// reaching this node, as if it were standing in for the whole subtree
		scalar_t branchError = biggestBranch->prune(inputSet, indices,
													nbSamples, false);

		// The +0.1 constant comes directly from Dr. Quinlan's implementation
		if(leafError <= branchError + 0.1 && leafError <= treeError + 0.1) {
			// Replace the tree by a leaf: store its distribution and release
			// both children
			distr_.assign(distr.begin(), distr.end());
			delete children_[0];
			delete children_[1];
			children_[0] = 0;
			children_[1] = 0;

			return leafError;
		}
		else if(branchError <= treeError + 0.1) {
			// Replace the tree by its biggest branch
			delete smallestBranch;

			// Recopy the biggest branch
			feature_ = biggestBranch->feature_;
			split_ = biggestBranch->split_;
			children_[0] = biggestBranch->children_[0];
			children_[1] = biggestBranch->children_[1];

			// Delete the biggest branch top node (its children are detached
			// first since they now belong to this node)
			biggestBranch->children_[0] = 0;
			biggestBranch->children_[1] = 0;
			delete biggestBranch;

			// Update the subtrees, which now see all the samples of this node
			prune(inputSet, indices, nbSamples, true);

			return branchError;
		}
		else {
			// Keep the subtree as it is
			return treeError;
		}
	}
}

示例#5

0

显示文件

文件： C45Tree.cpp 项目： idiap/mash

// Recursively grows the subtree rooted at this node from the samples
// indices[0 .. nbSamples).
//
// The caller must already have set label_, sumWeights_ and distr_ of this
// node. The node is left as a leaf if it only contains one label, if the
// maximum depth is reached or if no split brings a sufficient gain. Otherwise
// the best (feature, threshold) pair is stored in feature_/split_, two
// children are created and built recursively.
//
// inputSet  - provides the labels, the weights and the per-feature values of
//             the samples
// indices   - indices of the samples reaching this node; they are reordered
//             in place (sorted per feature, then partitioned around the split)
// nbSamples - number of entries in indices
void C45Tree::make(const InputSet& inputSet,
				   unsigned int* indices,
				   unsigned int nbSamples) {
	// Stop if there is only one label (the tree is a leaf), or if we have
	// reached the maximum depth
	assert(distr_.size() > label_);
	if(Utils::geq(distr_[label_], sumWeights_) || !maxDepth_) {
		return;
	}

	// Get the number of features and labels
	const unsigned int nbFeatures = inputSet.nbFeatures();
	const unsigned int nbLabels = inputSet.nbLabels();

	// Get the labels and the weights of the samples
	const unsigned int* labels = inputSet.labels();
	const scalar_t* weights = inputSet.weights();

	// Compute the node's information from the frequency of each label
	scalar_t information = 0;

	for(unsigned int l = 0; l < nbLabels; ++l) {
		information += Utils::entropy(distr_[l]);
	}

	information -= Utils::entropy(sumWeights_);

	// Current best split in node
	scalar_t largestGain = 0;
	scalar_t sumWeights0 = 0;
	std::vector<scalar_t> distr0;

	// Frequencies of each label before the split. The frequencies of the labels
	// after the split can be directly calculated by subtracting to the total
	// frequencies
	std::vector<double> partialDistr(nbLabels); // Use double's for better
												// precision

	// Try to split on every feature
	for(unsigned int f = 0; f < nbFeatures; ++f) {
		// Set the partial frequencies to zero
		std::fill_n(partialDistr.begin(), nbLabels, 0);

		// Get the samples (the values of the current feature)
		const scalar_t* samples = inputSet.samples(f);

		// Sort the indices according to the current feature
		Utils::sort(indices, samples, nbSamples);

		// Update the weight of the samples before the split by adding the
		// weight of each sample at every iteration. The weight of the samples
		// after the split is just the sum of the weights minus that weight
		double leftWeight = 0; // Use a double for better precision

		// Every candidate split lies between two consecutive samples. The bound
		// is written s + 1 < nbSamples (and not s < nbSamples - 1) so that an
		// empty set of samples does not make the unsigned subtraction wrap
		// around and send the loop far out of bounds
		for(unsigned int s = 0; s + 1 < nbSamples; ++s) {
			const unsigned int index = indices[s];
			const unsigned int nextIndex = indices[s + 1];
			const unsigned int label = labels[index];
			const scalar_t weight = weights[index];

			partialDistr[label] += weight;
			leftWeight += weight;
			const double rightWeight = sumWeights_ - leftWeight;

			// Make sure the weights of the leaves are sufficient and try only
			// to split in-between two samples with different feature
			if(leftWeight >= minWeight_ && rightWeight >= minWeight_ &&
			   Utils::less(samples[index], samples[nextIndex])) {
				scalar_t leftInfo = 0, rightInfo = 0;

				for(unsigned int l = 0; l < nbLabels; ++l) {
					leftInfo += Utils::entropy(partialDistr[l]);
					rightInfo += Utils::entropy(distr_[l] - partialDistr[l]);
				}

				leftInfo -= Utils::entropy(leftWeight);
				rightInfo -= Utils::entropy(rightWeight);

				// Gain of the split
				scalar_t gain = information - leftInfo - rightInfo;

				// Gain ratio criterion
				//	gain /= Utils::entropy(leftWeight) +
				//			Utils::entropy(rightWeight) -
				//			Utils::entropy(sumWeights_);

				// If it is the best gain so far...
				if(gain > largestGain) {
					largestGain = gain;
					feature_ = f;
					// Split halfway between the two neighbouring values
					split_ = (samples[index] + samples[nextIndex]) / 2;
					sumWeights0 = leftWeight;
					distr0.assign(partialDistr.begin(), partialDistr.end());
				}
			}
		}
	}

	// If no good split, create a leaf
	if(largestGain < Utils::epsilon) {
		return;
	}

	// Create the two children of the tree
	children_[0] = new C45Tree(minWeight_, confidence_, maxDepth_ - 1);
	children_[1] = new C45Tree(minWeight_, confidence_, maxDepth_ - 1);

	// Assign them the correct label, sumWeights and distribution
	children_[0]->label_ = std::max_element(distr0.begin(), distr0.end()) -
						   distr0.begin();

	// Turn distr_ into the distribution of the right child (total minus left).
	// Ok since the distribution is only needed for leaves
	std::transform(distr_.begin(), distr_.end(), distr0.begin(), distr_.begin(),
				   std::minus<scalar_t>());

	children_[1]->label_ = std::max_element(distr_.begin(), distr_.end()) -
						   distr_.begin();

	children_[0]->sumWeights_ = sumWeights0;
	children_[1]->sumWeights_ = sumWeights_ - sumWeights0;

	// Hand the distributions over to the children without copying them
	children_[0]->distr_.swap(distr0);
	children_[1]->distr_.swap(distr_);

	// Build the indices for the left and the right children by partitioning
	// the indices in-place around the selected split
	unsigned int rightIndex = Utils::partition(indices,
											   inputSet.samples(feature_),
											   nbSamples,
											   split_);

	// Create the two children recursively
	children_[0]->make(inputSet, indices, rightIndex);
	children_[1]->make(inputSet, indices + rightIndex, nbSamples - rightIndex);
}

示例#6

0

显示文件

文件： C45Tree.cpp 项目： idiap/mash

// Trains the tree on the whole input set, discarding any previously trained
// tree.
//
// The weights of the input set are temporarily rescaled so that they sum to
// the number of samples, the tree is grown with make() and, if confidence_ is
// below 1, pruned with prune(). The original weights are put back before
// returning. If maxDepth_ is 0 a depth of ceil(log2(nbLabels)) is used for the
// duration of the call.
//
// NOTE(review): weights summing to zero make the rescaling factor non-finite;
// this case is not handled here.
void C45Tree::train(InputSet& inputSet) {
	// In case the tree was already trained
	delete children_[0];
	delete children_[1];
	children_[0] = 0;
	children_[1] = 0;

	// Get the number of samples and labels
	const unsigned int nbSamples = inputSet.nbSamples();
	const unsigned int nbLabels = inputSet.nbLabels();

	// Set the maximal depth if needed
	bool maxDepth = false;

	if(!maxDepth_) {
		maxDepth_ = std::ceil(std::log(double(nbLabels)) / std::log(2.0));
		maxDepth = true;
	}

	// Get the labels and the weights of the samples
	const unsigned int* labels = inputSet.labels();
	const scalar_t* weights = inputSet.weights();

	// Make the weights sum to nbSamples as in Dr. Quinlan's implementation.
	// The vector first receives the rescaled weights; after the swap below it
	// holds the original ones, hence its name
	std::vector<scalar_t> oldWeights(nbSamples);

	const scalar_t norm = std::accumulate(weights, weights + nbSamples,
										  scalar_t());
	const scalar_t scale = nbSamples / norm;

	// Explicit loop rather than std::bind2nd, which was removed in C++17
	for(unsigned int s = 0; s < nbSamples; ++s) {
		oldWeights[s] = weights[s] * scale;
	}

	inputSet.swapWeights(oldWeights);

	// The weights pointer has to be fetched again after the swap
	weights = inputSet.weights();

	// Compute the frequency of appearance of each label
	distr_.clear();
	distr_.resize(nbLabels, 0);

	for(unsigned int s = 0; s < nbSamples; ++s) {
		distr_[labels[s]] += weights[s];
	}

	// Determine the label with the largest frequency
	label_ = std::max_element(distr_.begin(), distr_.end()) - distr_.begin();
	sumWeights_ = nbSamples;

	// Vector of indices over the samples
	std::vector<unsigned int> indices(nbSamples);

	// Make the indices range from 0 to nbSamples - 1
	for(unsigned int s = 0; s < nbSamples; ++s) {
		indices[s] = s;
	}

	// Create the root (the children will follow recursively)
	make(inputSet, &indices[0], nbSamples);

	if(confidence_ < 1) {
		// make() reordered the indices, so make them range from 0 to
		// nbSamples - 1 again
		for(unsigned int s = 0; s < nbSamples; ++s) {
			indices[s] = s;
		}

		// Prune the tree previously created
		prune(inputSet, &indices[0], nbSamples, true);
	}

	// Restore the original weights
	inputSet.swapWeights(oldWeights);

	// Restore maxDepth
	if(maxDepth) {
		maxDepth_ = 0;
	}
}

示例#7

0

显示文件

文件： LinearSVM.cpp 项目： idiap/mash

// Trains a linear SVM (liblinear) on the input set, replacing any previous
// model.
//
// Steps:
//  - if the matrix of features would exceed NB_FEATURES_MAX entries, a subset
//    of features is sampled into indices_ and pushed for the duration of the
//    call (indices_ is cleared otherwise);
//  - the matrix of features is swapped out of the input set and rescaled so
//    that the mean norm of the samples is 1;
//  - if parameters_.C is negative, C is selected by 5-folds cross-validation
//    and parameters_.C is reset to -1 afterwards;
//  - the labels known to the model are stored in map_ (padded with zeros up to
//    nbLabels entries).
//
// The input set must contain at least one sample.
void LinearSVM::train(InputSet& inputSet) {
	// The sampling test below divides by the number of samples
	assert(inputSet.nbSamples() > 0);

	// Sample features from every heuristic in order for the matrix of features
	// to fit in memory
	if(inputSet.nbFeatures() > NB_FEATURES_MAX / inputSet.nbSamples()) {
		inputSet.sampleFeatures(NB_FEATURES_MAX / inputSet.nbSamples(), indices_);
		inputSet.pushFeatures(indices_);
	}
	else {
		indices_.clear();
	}

	// Get the number of features, samples and labels
	const unsigned int nbFeatures = inputSet.nbFeatures();
	const unsigned int nbSamples = inputSet.nbSamples();
	const unsigned int nbLabels = inputSet.nbLabels();

	// Delete the previous model
	if(model_) {
		free_and_destroy_model(&model_);
	}

	// Create a new problem
	problem prob;

	// Recopy the number of samples and features
	prob.l = nbSamples;
	prob.n = nbFeatures;

	// Recopy the labels
	std::vector<int> labels(inputSet.labels(), inputSet.labels() + nbSamples);
	prob.y = &labels[0];

	// Take the features out of the input set (as we want to normalize them)
	std::vector<scalar_t> matrix;
	inputSet.swapFeatures(matrix);

	// Create samples as expected by liblinear
	std::vector<feature_node> samples(nbSamples);
	prob.x = &samples[0];

	// Compute the mean norm
	scalar_t meanNorm = 0;

	for(unsigned int s = 0; s < nbSamples; ++s) {
		// Each sample is a row of nbFeatures consecutive values of the matrix
		samples[s].dim = nbFeatures;
		samples[s].values = &matrix[s * nbFeatures];

		// Add the norm of that sample
		meanNorm += nrm2(samples[s].dim, samples[s].values, 1);
	}

	// Divide the sum of the norms by the number of samples
	meanNorm /= nbSamples;

	std::cout << "[LinearSVM::train] mean(norm): " << meanNorm << '.'
			  << std::endl;

	// Rescale the features so that their mean norm is 1. A mean norm of zero
	// means that every feature is zero: dividing would turn the whole matrix
	// into NaNs (0 / 0), so the matrix is left untouched in that case.
	// (Explicit loop rather than std::bind2nd, which was removed in C++17.)
	if(meanNorm > 0) {
		for(std::vector<scalar_t>::size_type i = 0; i < matrix.size(); ++i) {
			matrix[i] /= meanNorm;
		}
	}

	// Sets the bias to the default value (liblinear doesn't seem to handle the
	// bias parameter value correctly)
	prob.bias = -1;

	// A negative C requests its selection by cross-validation
	bool crossValidate = parameters_.C < 0;

	// Sets C to a default value in order to pass the parameter check
	if(crossValidate) {
		parameters_.C = 1;
	}

	// Make sure that the parameters are correct (check_parameter returns an
	// error message if there is a problem with them)
	assert(!check_parameter(&prob, &parameters_));

	// If C is below zero, use 5-folds cross-validation to determine it
	if(crossValidate) {
		std::vector<int> target(nbSamples); // The predicted labels
		unsigned int nbErrorsMin = nbSamples + 1; // Initialize past the maximum

		// Try decreasing values of C and stop as soon as the error stops
		// improving
		for(parameters_.C = 1000; parameters_.C >= 0.01; parameters_.C /= 10) {
			cross_validation(&prob, &parameters_, 5, &target[0]);

			// Count the number of errors
			unsigned int nbErrors = 0;

			for(unsigned int s = 0; s < nbSamples; ++s) {
				if(target[s] != labels[s]) {
					++nbErrors;
				}
			}

			std::cout << "[LinearSVM::train] 5 folds cross-validation error "
						 "for C = " << parameters_.C << ": "
					  << nbErrors * 100.0f / nbSamples << "%." << std::endl;

			// The new C is better than the previous one
			if(nbErrors < nbErrorsMin) {
				nbErrorsMin = nbErrors;
			}
			// The optimal C was found
			else {
				break;
			}
		}

		// C is one step past the best value, either because the last value
		// tried was not an improvement or because the loop ran to its end
		parameters_.C *= 10;

		// Print C to the log
		std::cout << "[LinearSVM::train] optimal C as determined by 5 folds "
			   		 "cross-validation: " << parameters_.C << '.' << std::endl;
	}

	// Train the svm
	model_ = ::train(&prob, &parameters_);
	assert(model_);

	// Reset C so that it will be cross-validated again
	if(crossValidate) {
		parameters_.C = -1;
	}

	// Save the liblinear labels. get_labels writes one entry per class of the
	// model, so the model must not have more classes than map_ has room for
	// (same check as in distribution())
	assert(static_cast<unsigned int>(get_nr_class(model_)) <= nbLabels);

	map_.clear();
	map_.resize(nbLabels);
	get_labels(model_, &map_[0]);

	// Pop the selected features if required
	if(!indices_.empty()) {
		inputSet.popFeatures();
	}
}