예제 #1
0
파일: C45Tree.cpp 프로젝트: idiap/mash
/**
 * Prunes the subtree rooted at this node with the samples that reach it and
 * returns the estimated error of the resulting subtree.
 *
 * The children are pruned first (bottom-up). When update is true the node is
 * then either collapsed into a leaf, replaced by its heaviest child, or kept
 * as is, depending on which alternative has the lowest estimated error (with
 * a tolerance of 0.1 in favour of the simpler alternative).
 *
 * @param inputSet	Source of the labels, weights and feature values.
 * @param indices	Indices of the samples reaching this node. The array is
 *					reordered in place by the partitioning, whatever the
 *					value of update.
 * @param nbSamples	Number of entries in indices.
 * @param update	If true the nodes are updated (label, sum of weights,
 *					distribution, structure); if false the nodes are left
 *					untouched and only the error is evaluated.
 * @return			The estimated error (leaf, branch or subtree error,
 *					whichever alternative was retained).
 */
scalar_t C45Tree::prune(const InputSet& inputSet,
						unsigned int* indices,
						unsigned int nbSamples,
						bool update) {
	const unsigned int nbLabels = inputSet.nbLabels();
	const unsigned int* const labels = inputSet.labels();
	const scalar_t* const weights = inputSet.weights();

	// Weighted frequency of each label among the samples of this node
	// (accumulated in double's for better precision)
	std::vector<double> histogram(nbLabels, 0);

	for(unsigned int s = 0; s < nbSamples; ++s) {
		const unsigned int sample = indices[s];
		histogram[labels[sample]] += weights[sample];
	}

	// Most frequent label and total weight of the node
	const unsigned int majority = static_cast<unsigned int>(
		std::max_element(histogram.begin(), histogram.end()) -
		histogram.begin());

	double totalWeight = std::accumulate(histogram.begin(), histogram.end(),
										 0.0);

	if(update) {
		label_ = majority;
		sumWeights_ = totalWeight;
	}

	// Error of the node if it were a leaf: the weight of the samples not
	// carrying the majority label, plus the uncertainty returned by addErrs
	// so as to obtain an upper bound on the error
	double misclassified = totalWeight - histogram[majority];
	const double leafError = misclassified +
							 addErrs(totalWeight, misclassified);

	// A leaf has nothing to prune; only its distribution may be refreshed
	if(!children_[0]) {
		if(update) {
			distr_.assign(histogram.begin(), histogram.end());
		}

		return leafError;
	}

	// Internal node: split the indices in-place between the two children...
	const unsigned int rightIndex = Utils::partition(indices,
													 inputSet.samples(feature_),
													 nbSamples,
													 split_);

	// ...and prune the left child, then the right one (bottom-up traversal)
	const scalar_t leftError = children_[0]->prune(inputSet, indices,
												   rightIndex, update);

	const scalar_t rightError = children_[1]->prune(inputSet,
													indices + rightIndex,
													nbSamples - rightIndex,
													update);

	const scalar_t treeError = leftError + rightError;

	// In evaluation mode the structure is never modified
	if(!update) {
		return treeError;
	}

	// Identify the child holding the largest weight (the left one on ties)
	const bool rightIsHeavier =
		children_[0]->sumWeights_ < children_[1]->sumWeights_;

	C45Tree* const heavyChild = children_[rightIsHeavier ? 1 : 0];
	C45Tree* const lightChild = children_[rightIsHeavier ? 0 : 1];

	// Error obtained if all the samples of this node went through the
	// heaviest child (evaluation only, the child is not modified)
	const scalar_t branchError = heavyChild->prune(inputSet, indices,
												   nbSamples, false);

	// The +0.1 constant comes directly from Dr. Quinlan's implementation
	const bool leafBeatsBranch = leafError <= branchError + 0.1;
	const bool leafBeatsTree = leafError <= treeError + 0.1;

	if(leafBeatsBranch && leafBeatsTree) {
		// Collapse the subtree into a leaf
		distr_.assign(histogram.begin(), histogram.end());

		delete children_[0];
		delete children_[1];
		children_[0] = 0;
		children_[1] = 0;

		return leafError;
	}

	if(branchError <= treeError + 0.1) {
		// Replace the subtree by its heaviest child
		delete lightChild;

		// Take over the split and the children of the heaviest child
		feature_ = heavyChild->feature_;
		split_ = heavyChild->split_;
		children_[0] = heavyChild->children_[0];
		children_[1] = heavyChild->children_[1];

		// Detach the grandchildren before deleting the child's top node so
		// that they are not deleted along with it
		heavyChild->children_[0] = 0;
		heavyChild->children_[1] = 0;
		delete heavyChild;

		// Update the grafted subtree with all the samples of this node
		prune(inputSet, indices, nbSamples, true);

		return branchError;
	}

	// Keeping the (already pruned) subtree is the best option
	return treeError;
}
예제 #2
0
파일: C45Tree.cpp 프로젝트: idiap/mash
/**
 * Recursively grows the subtree rooted at this node.
 *
 * The node's label_, sumWeights_ and distr_ must already describe the samples
 * referenced by indices when this function is called (this is what the
 * function itself does for the children it creates).
 *
 * Every feature is scanned for the threshold, halfway between two consecutive
 * distinct sample values, that yields the largest gain while leaving a weight
 * of at least minWeight_ on each side. If such a split is found, two children
 * are created and built recursively; otherwise the node remains a leaf.
 *
 * @param inputSet	Source of the labels, weights and feature values.
 * @param indices	Indices of the samples reaching this node. The array is
 *					reordered in place (sorted for every feature, then
 *					partitioned around the selected split).
 * @param nbSamples	Number of entries in indices.
 */
void C45Tree::make(const InputSet& inputSet,
				   unsigned int* indices,
			 	   unsigned int nbSamples) {
	// Stop if there is only one label (the tree is a leaf), or if we have
	// reached the maximum depth
	assert(distr_.size() > label_);
	if(Utils::geq(distr_[label_], sumWeights_) || !maxDepth_) {
		return;
	}

	// Get the number of features and labels
	const unsigned int nbFeatures = inputSet.nbFeatures();
	const unsigned int nbLabels = inputSet.nbLabels();

	// Get the labels and the weights of the samples
	const unsigned int* labels = inputSet.labels();
	const scalar_t* weights = inputSet.weights();

	// Compute the node's information from the frequency of each label
	scalar_t information = 0;

	for(unsigned int l = 0; l < nbLabels; ++l) {
		information += Utils::entropy(distr_[l]);
	}

	information -= Utils::entropy(sumWeights_);

	// Current best split in node
	scalar_t largestGain = 0;
	scalar_t sumWeights0 = 0;
	std::vector<scalar_t> distr0;

	// Frequencies of each label before the split. The frequencies of the labels
	// after the split can be directly calculated by subtracting from the total
	// frequencies
	std::vector<double> partialDistr(nbLabels); // Use double's for better
												// precision

	// Try to split on every feature
	for(unsigned int f = 0; f < nbFeatures; ++f) {
		// Set the partial frequencies to zero
		std::fill_n(partialDistr.begin(), nbLabels, 0);

		// Get the samples (the values of the current feature)
		const scalar_t* samples = inputSet.samples(f);

		// Sort the indices according to the current feature
		Utils::sort(indices, samples, nbSamples);

		// Update the weight of the samples before the split by adding the
		// weight of each sample at every iteration. The weight of the samples
		// after the split is just the sum of the weights minus that weight
		double leftWeight = 0; // Use a double for better precision

		// A split lies between samples s and s + 1, hence the last sample is
		// never visited. The bound is written 's + 1 < nbSamples' rather than
		// 's < nbSamples - 1' so that an empty sample set does not make the
		// unsigned bound wrap around and read out of bounds
		for(unsigned int s = 0; s + 1 < nbSamples; ++s) {
			const unsigned int index = indices[s];
			const unsigned int nextIndex = indices[s + 1];
			const unsigned int label = labels[index];
			const scalar_t weight = weights[index];

			partialDistr[label] += weight;
			leftWeight += weight;
			const double rightWeight = sumWeights_ - leftWeight;

			// Make sure the weights of the leaves are sufficient and try only
			// to split in-between two samples with different feature values
			if(leftWeight >= minWeight_ && rightWeight >= minWeight_ &&
			   Utils::less(samples[index], samples[nextIndex])) {
				scalar_t leftInfo = 0, rightInfo = 0;

				for(unsigned int l = 0; l < nbLabels; ++l) {
					leftInfo += Utils::entropy(partialDistr[l]);
					rightInfo += Utils::entropy(distr_[l] - partialDistr[l]);
				}

				leftInfo -= Utils::entropy(leftWeight);
				rightInfo -= Utils::entropy(rightWeight);

				// Gain of the split
				scalar_t gain = information - leftInfo - rightInfo;

				// Gain ratio criterion
				//	gain /= Utils::entropy(leftWeight) +
				//			Utils::entropy(rightWeight) -
				//			Utils::entropy(sumWeights_);

				// If it is the best gain so far, remember the split (threshold
				// halfway between the two samples) and the left-hand statistics
				if(gain > largestGain) {
					largestGain = gain;
					feature_ = f;
					split_ = (samples[index] + samples[nextIndex]) / 2;
					sumWeights0 = leftWeight;
					distr0.assign(partialDistr.begin(), partialDistr.end());
				}
			}
		}
	}

	// If no good split, create a leaf
	if(largestGain < Utils::epsilon) {
		return;
	}

	// Create the two children of the tree
	children_[0] = new C45Tree(minWeight_, confidence_, maxDepth_ - 1);
	children_[1] = new C45Tree(minWeight_, confidence_, maxDepth_ - 1);

	// Assign them the correct label, sumWeights and distribution
	children_[0]->label_ = std::max_element(distr0.begin(), distr0.end()) -
						   distr0.begin();

	// Turn this node's distribution into the one of the right child (total
	// minus left). Ok since the distribution is only needed for leaves
	std::transform(distr_.begin(), distr_.end(), distr0.begin(), distr_.begin(),
				   std::minus<scalar_t>());

	children_[1]->label_ = std::max_element(distr_.begin(), distr_.end()) -
						   distr_.begin();

	children_[0]->sumWeights_ = sumWeights0;
	children_[1]->sumWeights_ = sumWeights_ - sumWeights0;

	// Hand the distributions over to the children without copying them
	children_[0]->distr_.swap(distr0);
	children_[1]->distr_.swap(distr_);

	// Build the indices for the left and the right children by partitioning
	// the indices in-place around the selected split
	unsigned int rightIndex = Utils::partition(indices,
											   inputSet.samples(feature_),
											   nbSamples,
											   split_);

	// Create the two children recursively
	children_[0]->make(inputSet, indices, rightIndex);
	children_[1]->make(inputSet, indices + rightIndex, nbSamples - rightIndex);
}