bool DecisionTreeClusterNode::computeSplit( const UINT &numSplittingSteps, const ClassificationData &trainingData, const Vector< UINT > &features, const Vector< UINT > &classLabels, UINT &featureIndex, Float &minError ){

    const UINT M = trainingData.getNumSamples();
    const UINT N = features.getSize();
    const UINT K = classLabels.getSize();

    if( N == 0 ) return false;
    if( K == 0 ) return false;

    minError = grt_numeric_limits< Float >::max();
    Random random;
    UINT bestFeatureIndex = 0;
    Float bestThreshold = 0;
    Float threshold = 0;
    Float error = 0;
    Vector< UINT > groupIndex(M);
    Vector< MinMax > ranges = trainingData.getRanges();
    MatrixFloat data(M,1); //This will store our temporary data for each dimension

    //Randomly select which features we want to use
    const UINT numRandomFeatures = numSplittingSteps > N ? N : numSplittingSteps;
    Vector< UINT > randomFeatures = random.getRandomSubset( 0, N, numRandomFeatures );

    //Loop over each random feature and try to find the best split point
    for(UINT n=0; n<numRandomFeatures; n++){

        featureIndex = features[ randomFeatures[n] ];

        //Copy this feature dimension into a temporary single-column dataset
        for(UINT i=0; i<M; i++){
            data[i][0] = trainingData[i][featureIndex];
        }

        if( computeError( trainingData, data, classLabels, ranges, groupIndex, featureIndex, threshold, error ) ){
            //Store the best threshold and feature index
            if( error < minError ){
                minError = error;
                bestThreshold = threshold;
                bestFeatureIndex = featureIndex;
            }
        }
    }

    //Set the best feature index that will be returned to the DecisionTree that called this function
    featureIndex = bestFeatureIndex;

    //Store the node size, feature index, best threshold and class probabilities for this node
    set( M, featureIndex, bestThreshold, trainingData.getClassProbabilities(classLabels) );

    return true;
}
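//The random feature subset above caps the search at numSplittingSteps features.
//Below is a minimal standalone sketch of that selection pattern, assuming only
//the C++ standard library; pickRandomFeatures and its parameters are
//illustrative names, not part of the GRT API.

#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

//Return up to maxFeatures distinct indices drawn uniformly from [0, numFeatures)
static std::vector<unsigned int> pickRandomFeatures( unsigned int numFeatures, unsigned int maxFeatures ){
    std::vector<unsigned int> indices( numFeatures );
    std::iota( indices.begin(), indices.end(), 0 ); //Fill with 0,1,...,numFeatures-1
    static std::mt19937 rng( std::random_device{}() );
    std::shuffle( indices.begin(), indices.end(), rng );
    indices.resize( std::min( maxFeatures, numFeatures ) ); //Keep the first maxFeatures indices
    return indices;
}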
bool DecisionTreeThresholdNode::computeBestSplitBestRandomSplit( const UINT &numSplittingSteps, const ClassificationData &trainingData, const Vector< UINT > &features, const Vector< UINT > &classLabels, UINT &featureIndex, Float &minError ){

    const UINT M = trainingData.getNumSamples();
    const UINT N = features.getSize();
    const UINT K = classLabels.getSize();

    if( N == 0 ) return false;
    if( K == 0 ) return false;

    minError = grt_numeric_limits< Float >::max();
    UINT bestFeatureIndex = 0;
    Float bestThreshold = 0;
    Float threshold = 0;
    Float error = 0;
    Float giniIndexL = 0;
    Float giniIndexR = 0;
    Float weightL = 0;
    Float weightR = 0;
    Random random;
    Vector< UINT > groupIndex(M);
    VectorFloat groupCounter(2,0);
    MatrixFloat classProbabilities(K,2);

    //Try numSplittingSteps random feature/threshold pairs and keep the best split point
    for(UINT m=0; m<numSplittingSteps; m++){

        //Choose a random feature
        const UINT n = random.getRandomNumberInt(0,N);
        featureIndex = features[n];

        //Randomly choose the threshold; it is based on a randomly selected sample with some random scaling
        threshold = trainingData[ random.getRandomNumberInt(0,M) ][ featureIndex ] * random.getRandomNumberUniform(0.8,1.2);

        //Iterate over each sample and work out if it should be in the lhs (0) or rhs (1) group
        groupCounter[0] = groupCounter[1] = 0;
        classProbabilities.setAllValues(0);
        for(UINT i=0; i<M; i++){
            groupIndex[i] = trainingData[ i ][ featureIndex ] >= threshold ? 1 : 0;
            groupCounter[ groupIndex[i] ]++;
            classProbabilities[ getClassLabelIndexValue(trainingData[i].getClassLabel(),classLabels) ][ groupIndex[i] ]++;
        }

        //Compute the class probabilities for the lhs group and rhs group
        for(UINT k=0; k<K; k++){
            classProbabilities[k][0] = groupCounter[0]>0 ? classProbabilities[k][0]/groupCounter[0] : 0;
            classProbabilities[k][1] = groupCounter[1]>0 ? classProbabilities[k][1]/groupCounter[1] : 0;
        }

        //Compute the Gini index for the lhs and rhs groups, weighted by each group's share of the samples
        giniIndexL = giniIndexR = 0;
        for(UINT k=0; k<K; k++){
            giniIndexL += classProbabilities[k][0] * (1.0-classProbabilities[k][0]);
            giniIndexR += classProbabilities[k][1] * (1.0-classProbabilities[k][1]);
        }
        weightL = groupCounter[0]/M;
        weightR = groupCounter[1]/M;
        error = (giniIndexL*weightL) + (giniIndexR*weightR);

        //Store the best threshold and feature index
        if( error < minError ){
            minError = error;
            bestThreshold = threshold;
            bestFeatureIndex = featureIndex;
        }
    }

    //Set the best feature index that will be returned to the DecisionTree that called this function
    featureIndex = bestFeatureIndex;

    //Store the node size, feature index, best threshold and class probabilities for this node
    set( M, featureIndex, bestThreshold, trainingData.getClassProbabilities(classLabels) );

    return true;
}
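//Both threshold-split methods score a candidate split with the class-weighted
//Gini impurity computed above. Below is a minimal standalone sketch of that
//error measure, assuming plain std::vector inputs; weightedGiniError is an
//illustrative name, not part of the GRT API.

#include <cstddef>
#include <vector>

//Weighted Gini impurity of a binary split. classProbs[k][g] is the probability
//of class k within group g (0 = lhs, 1 = rhs), groupCounts holds the number of
//samples in each group, and M is the total number of samples.
static float weightedGiniError( const std::vector< std::vector<float> > &classProbs,
                                const std::vector<float> &groupCounts, unsigned int M ){
    float giniL = 0, giniR = 0;
    for( std::size_t k=0; k<classProbs.size(); k++ ){
        giniL += classProbs[k][0] * (1.0f - classProbs[k][0]);
        giniR += classProbs[k][1] * (1.0f - classProbs[k][1]);
    }
    //Weight each group's impurity by the fraction of samples it contains, so a
    //pure split of most of the data beats a pure split of just a few samples
    return giniL * (groupCounts[0]/M) + giniR * (groupCounts[1]/M);
}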
bool DecisionTreeThresholdNode::computeBestSplitBestIterativeSplit( const UINT &numSplittingSteps, const ClassificationData &trainingData, const Vector< UINT > &features, const Vector< UINT > &classLabels, UINT &featureIndex, Float &minError ){

    const UINT M = trainingData.getNumSamples();
    const UINT N = features.getSize();
    const UINT K = classLabels.getSize();

    if( N == 0 ) return false;
    if( K == 0 ) return false;

    minError = grt_numeric_limits< Float >::max();
    UINT bestFeatureIndex = 0;
    Float bestThreshold = 0;
    Float threshold = 0;
    Float error = 0;
    Float minRange = 0;
    Float maxRange = 0;
    Float step = 0;
    Float giniIndexL = 0;
    Float giniIndexR = 0;
    Float weightL = 0;
    Float weightR = 0;
    Vector< UINT > groupIndex(M);
    VectorFloat groupCounter(2,0);
    Vector< MinMax > ranges = trainingData.getRanges();
    MatrixFloat classProbabilities(K,2);

    //Loop over each feature and try to find the best split point
    for(UINT n=0; n<N; n++){

        featureIndex = features[n];

        //Use the range of the selected feature dimension to set up the threshold sweep
        minRange = ranges[featureIndex].minValue;
        maxRange = ranges[featureIndex].maxValue;
        step = (maxRange-minRange)/Float(numSplittingSteps);
        threshold = minRange;

        while( threshold <= maxRange ){

            //Iterate over each sample and work out if it should be in the lhs (0) or rhs (1) group
            groupCounter[0] = groupCounter[1] = 0;
            classProbabilities.setAllValues(0);
            for(UINT i=0; i<M; i++){
                groupIndex[i] = trainingData[ i ][ featureIndex ] >= threshold ? 1 : 0;
                groupCounter[ groupIndex[i] ]++;
                classProbabilities[ getClassLabelIndexValue(trainingData[i].getClassLabel(),classLabels) ][ groupIndex[i] ]++;
            }

            //Compute the class probabilities for the lhs group and rhs group
            for(UINT k=0; k<K; k++){
                classProbabilities[k][0] = groupCounter[0]>0 ? classProbabilities[k][0]/groupCounter[0] : 0;
                classProbabilities[k][1] = groupCounter[1]>0 ? classProbabilities[k][1]/groupCounter[1] : 0;
            }

            //Compute the Gini index for the lhs and rhs groups, weighted by each group's share of the samples
            giniIndexL = giniIndexR = 0;
            for(UINT k=0; k<K; k++){
                giniIndexL += classProbabilities[k][0] * (1.0-classProbabilities[k][0]);
                giniIndexR += classProbabilities[k][1] * (1.0-classProbabilities[k][1]);
            }
            weightL = groupCounter[0]/M;
            weightR = groupCounter[1]/M;
            error = (giniIndexL*weightL) + (giniIndexR*weightR);

            //Store the best threshold and feature index
            if( error < minError ){
                minError = error;
                bestThreshold = threshold;
                bestFeatureIndex = featureIndex;
            }

            //Update the threshold
            threshold += step;
        }
    }

    //Set the best feature index that will be returned to the DecisionTree that called this function
    featureIndex = bestFeatureIndex;

    //Store the node size, feature index, best threshold and class probabilities for this node
    set( M, featureIndex, bestThreshold, trainingData.getClassProbabilities(classLabels) );

    return true;
}
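//The iterative search above sweeps numSplittingSteps evenly spaced thresholds
//per feature, so it visits on the order of N * numSplittingSteps * M samples,
//versus numSplittingSteps * M for the random search. Below is a minimal
//standalone sketch of the candidate-threshold enumeration, assuming only the
//C++ standard library; enumerateThresholds is an illustrative name, not part
//of the GRT API.

#include <vector>

//Return numSteps+1 evenly spaced thresholds covering [minValue, maxValue].
//Indexing by an integer avoids the floating-point accumulation drift that can
//make a `threshold += step` sweep skip the final candidate.
static std::vector<float> enumerateThresholds( float minValue, float maxValue, unsigned int numSteps ){
    const float step = (maxValue - minValue) / float(numSteps);
    std::vector<float> thresholds;
    thresholds.reserve( numSteps + 1 );
    for( unsigned int i=0; i<=numSteps; i++ ){
        thresholds.push_back( minValue + step * float(i) );
    }
    return thresholds;
}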