Example #1
Filter::Filter(InputSet& inputSet) : inputSet_(inputSet) {
	// Get the input set's number of samples, features and labels
	const unsigned int nbSamples = inputSet.nbSamples();
	const unsigned int nbFeatures = inputSet.nbFeatures();
	const unsigned int nbLabels = inputSet.nbLabels();

	// Resize the initial stack of samples
	sampleStack_.resize(1);
	sampleStack_[0].resize(nbSamples);

	// Make the indices range from 0 to nbSamples - 1
	for(unsigned int s = 0; s < nbSamples; ++s) {
		sampleStack_[0][s] = s;
	}

	// Resize the initial stack of features
	featureStack_.resize(1);
	featureStack_[0].resize(nbFeatures);

	// Make the indices range from 0 to nbFeatures - 1
	for(unsigned int f = 0; f < nbFeatures; ++f) {
		featureStack_[0][f] = f;
	}

	// Set the number of labels
	nbLabelStack_.push_back(nbLabels); // Push it twice so as to be sure to
	nbLabelStack_.push_back(nbLabels); // never overwrite it

	// Set the number of images and heuristics
	nbImages_ = inputSet.nbImages();
	nbHeuristics_ = inputSet.nbHeuristics();
}
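The two index-initialization loops in this constructor fill a vector with consecutive indices. As a side note, the same effect can be obtained with std::iota; a minimal standalone sketch, with a made-up size and using only the standard library:

#include <numeric>
#include <vector>
#include <cstdio>

int main() {
	const unsigned int nbSamples = 5; // made-up size

	// Equivalent to the explicit loop above: indices[s] = s
	std::vector<unsigned int> indices(nbSamples);
	std::iota(indices.begin(), indices.end(), 0u);

	for(unsigned int s = 0; s < nbSamples; ++s) {
		std::printf("%u ", indices[s]); // prints: 0 1 2 3 4
	}

	std::printf("\n");

	return 0;
}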
Example #2
Statistical::Statistical(InputSet& inputSet) : Filter(inputSet) {
	// Get the number of samples and features
	const unsigned int nbSamples = inputSet.nbSamples();
	const unsigned int nbFeatures = inputSet.nbFeatures();

	// First used to store the sum and sum of squares of every feature,
	// then the mean and standard deviation computed from them
	means_.resize(nbFeatures);
	stds_.resize(nbFeatures);

	for(unsigned int s = 0; s < nbSamples; ++s) {
		std::vector<unsigned int> sample(1, s);
		inputSet.pushSamples(sample);

		const scalar_t* features = inputSet.features(0);

		for(unsigned int f = 0; f < nbFeatures; ++f) {
			means_[f] += features[f];
			stds_[f] += features[f] * features[f];
		}

		inputSet.popSamples();
	}

	std::vector<unsigned int> featureStack;

	for(unsigned int f = 0; f < nbFeatures; ++f) {
		scalar_t mean = means_[f] / nbSamples;
		scalar_t variance = (stds_[f] - means_[f] * mean) / (nbSamples - 1);

		if(variance > 0) {
			means_[f] = mean;
			stds_[f] = std::sqrt(variance);

			featureStack.push_back(f);
		}
	}

	// There must be at least one feature with positive variance
	assert(!featureStack.empty());

	// Push the selected features if needed
	if(featureStack.size() < nbFeatures) {
		featureStack_.push_back(featureStack);
	}
}
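The constructor above accumulates per-feature sums and sums of squares in one pass over the samples, then converts them using mean = sum / n and variance = (sumSq - sum * mean) / (n - 1). A minimal standalone sketch of that conversion, with made-up values for a single feature and only the standard library:

#include <cmath>
#include <cstdio>

int main() {
	// Made-up values of a single feature across four samples
	const double values[] = {1.0, 2.0, 4.0, 7.0};
	const unsigned int nbSamples = 4;

	double sum = 0;
	double sumSq = 0;

	// Single pass: accumulate the sum and the sum of squares
	for(unsigned int s = 0; s < nbSamples; ++s) {
		sum += values[s];
		sumSq += values[s] * values[s];
	}

	// Same formulas as in Statistical::Statistical above
	const double mean = sum / nbSamples;
	const double variance = (sumSq - sum * mean) / (nbSamples - 1);

	// Prints: mean = 3.5, std = 2.64575
	std::printf("mean = %g, std = %g\n", mean, std::sqrt(variance));

	return 0;
}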
Example #3
void C45Tree::train(InputSet& inputSet) {
	// In case the tree was already trained
	delete children_[0];
	delete children_[1];
	children_[0] = 0;
	children_[1] = 0;

	// Get the number of samples and labels
	const unsigned int nbSamples = inputSet.nbSamples();
	const unsigned int nbLabels = inputSet.nbLabels();

	// Set the maximal depth if needed
	bool maxDepth = false;

	if(!maxDepth_) {
		maxDepth_ = std::ceil(std::log(double(nbLabels)) / std::log(2.0));
		maxDepth = true;
	}

	// Get the labels and the weights of the samples
	const unsigned int* labels = inputSet.labels();
	const scalar_t* weights = inputSet.weights();

	// Make the weights sum to nbSamples as in Dr. Quinlan's implementation
	std::vector<scalar_t> oldWeights(nbSamples);

	scalar_t norm = std::accumulate(weights, weights + nbSamples, scalar_t());

	std::transform(weights, weights + nbSamples, oldWeights.begin(),
				   std::bind2nd(std::multiplies<scalar_t>(), nbSamples / norm));

	inputSet.swapWeights(oldWeights);

	weights = inputSet.weights();

	// Compute the (weighted) frequency of appearance of each label
	distr_.clear();
	distr_.resize(nbLabels, 0);

	for(unsigned int s = 0; s < nbSamples; ++s) {
		distr_[labels[s]] += weights[s];
	}

	// Determine the label with the largest frequency
	label_ = std::max_element(distr_.begin(), distr_.end()) - distr_.begin();
	sumWeights_ = nbSamples;

	// Vector of indices over the samples
	std::vector<unsigned int> indices(nbSamples);

	// Make the indices range from 0 to nbSamples - 1
	for(unsigned int s = 0; s < nbSamples; ++s) {
		indices[s] = s;
	}

	// Create the root (the children will follow recursively)
	make(inputSet, &indices[0], nbSamples);

	if(confidence_ < 1) {
		// Make the indices range from 0 to nbSamples - 1
		for(unsigned int s = 0; s < nbSamples; ++s) {
			indices[s] = s;
		}

		// Prune the tree previously created
		prune(inputSet, &indices[0], nbSamples, true);
	}

	// Restore the original weights
	inputSet.swapWeights(oldWeights);

	// Restore maxDepth
	if(maxDepth) {
		maxDepth_ = 0;
	}
}
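The weight handling above rescales the sample weights so that they sum to nbSamples before the tree is grown, as in Quinlan's C4.5, and restores the original weights afterwards via swapWeights. A minimal standalone sketch of the rescaling step, with hypothetical weights and only the standard library:

#include <numeric>
#include <vector>
#include <cstdio>

int main() {
	// Hypothetical sample weights
	std::vector<double> weights;
	weights.push_back(0.1);
	weights.push_back(0.3);
	weights.push_back(0.2);
	weights.push_back(0.4);

	const unsigned int nbSamples = static_cast<unsigned int>(weights.size());

	// Rescale the weights so that they sum to nbSamples, as in C45Tree::train
	const double norm = std::accumulate(weights.begin(), weights.end(), 0.0);

	for(unsigned int s = 0; s < nbSamples; ++s) {
		weights[s] *= nbSamples / norm;
	}

	// Prints: sum = 4
	std::printf("sum = %g\n",
				std::accumulate(weights.begin(), weights.end(), 0.0));

	return 0;
}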
Example #4
void LinearSVM::train(InputSet& inputSet) {
	// Sample features from every heuristic in order for the matrix of features
	// to fit in memory
	if(inputSet.nbFeatures() > NB_FEATURES_MAX / inputSet.nbSamples()) {
		inputSet.sampleFeatures(NB_FEATURES_MAX / inputSet.nbSamples(), indices_);
		inputSet.pushFeatures(indices_);
	}
	else {
		indices_.clear();
	}

	// Get the number of features, samples and labels
	const unsigned int nbFeatures = inputSet.nbFeatures();
	const unsigned int nbSamples = inputSet.nbSamples();
	const unsigned int nbLabels = inputSet.nbLabels();

	// Delete the previous model
	if(model_) {
		free_and_destroy_model(&model_);
	}

	// Create a new problem
	problem prob;

	// Copy the number of samples and features
	prob.l = nbSamples;
	prob.n = nbFeatures;

	// Copy the labels
	std::vector<int> labels(inputSet.labels(), inputSet.labels() + nbSamples);
	prob.y = &labels[0];

	// Copy the features (as we want to normalize them)
	std::vector<scalar_t> matrix;
	inputSet.swapFeatures(matrix);

	// Create samples as expected by liblinear
	std::vector<feature_node> samples(nbSamples);
	prob.x = &samples[0];

	// Compute the mean norm
	scalar_t meanNorm = 0;

	for(unsigned int s = 0; s < nbSamples; ++s) {
		samples[s].dim = nbFeatures;
		samples[s].values = &matrix[s * nbFeatures];

		// Add the mean norm of that sample
		meanNorm += nrm2(samples[s].dim, samples[s].values, 1);
	}

	// Divide the sum of the norms by the number of samples
	meanNorm /= nbSamples;

	std::cout << "[LinearSVM::train] mean(norm): " << meanNorm << '.'
			  << std::endl;

	// Rescale the features so that their mean norm is 1
	std::transform(matrix.begin(), matrix.end(), matrix.begin(),
				   std::bind2nd(std::divides<scalar_t>(), meanNorm));

	// Set the bias to the default value (liblinear doesn't seem to handle the
	// bias parameter value correctly)
	prob.bias = -1;

	// A negative C means that it should be selected by cross-validation
	bool crossValidate = parameters_.C < 0;

	// Set C to a default value in order to pass the parameter check
	if(crossValidate) {
		parameters_.C = 1;
	}

	// Make sure that the parameters are correct
	assert(!check_parameter(&prob, &parameters_));

	// If C is below zero, use 5-fold cross-validation to determine it
	if(crossValidate) {
		std::vector<int> target(nbSamples); // The predicted labels
		unsigned int nbErrorsMin = nbSamples + 1; // Initialize past the maximum

		for(parameters_.C = 1000; parameters_.C >= 0.01; parameters_.C /= 10) {
			cross_validation(&prob, &parameters_, 5, &target[0]);

			// Count the number of errors
			unsigned int nbErrors = 0;

			for(unsigned int s = 0; s < nbSamples; ++s) {
				if(target[s] != labels[s]) {
					++nbErrors;
				}
			}

			std::cout << "[LinearSVM::train] 5 folds cross-validation error "
						 "for C = " << parameters_.C << ": "
					  << nbErrors * 100.0f / nbSamples << "%." << std::endl;

			// The new C is better than the previous one
			if(nbErrors < nbErrorsMin) {
				nbErrorsMin = nbErrors;
			}
			// The optimal C was found
			else {
				break;
			}
		}

		// C was divided one time too many
		parameters_.C *= 10;

		// Print C to the log
		std::cout << "[LinearSVM::train] optimal C as determined by 5 folds "
			   		 "cross-validation: " << parameters_.C << '.' << std::endl;
	}

	// Train the svm
	model_ = ::train(&prob, &parameters_);
	assert(model_);

	// Reset C so that it will be cross-validated again
	if(crossValidate) {
		parameters_.C = -1;
	}

	// Save the liblinear label ordering
	map_.clear();
	map_.resize(nbLabels);
	get_labels(model_, &map_[0]);

	// Pop the selected features if required
	if(!indices_.empty()) {
		inputSet.popFeatures();
	}
}
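The cross-validation block above searches for C on a coarse logarithmic grid, dividing by 10 from 1000 down to 0.01 and stopping as soon as the error stops improving; because the loop overshoots by one step, C is multiplied back by 10 afterwards. A minimal standalone sketch of the same search pattern, where validationError() is a hypothetical stand-in for the actual 5-fold cross-validation call:

#include <cmath>
#include <cstdio>

// Hypothetical stand-in for the 5-fold cross-validation error; any roughly
// unimodal function of C is enough to illustrate the search pattern
static double validationError(double C) {
	return std::fabs(std::log10(C) - 1.0); // pretend the best C is 10
}

int main() {
	double bestError = 1e30; // initialize past any possible error
	double C;

	// Coarse logarithmic grid, as in LinearSVM::train above
	for(C = 1000; C >= 0.01; C /= 10) {
		const double error = validationError(C);

		std::printf("C = %g: error = %g\n", C, error);

		if(error < bestError) {
			bestError = error; // this C improves on the previous one
		}
		else {
			break; // the previous C was optimal
		}
	}

	// C was divided one time too many before the loop stopped
	C *= 10;

	// Prints: selected C = 10
	std::printf("selected C = %g\n", C);

	return 0;
}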