bool DecisionTreeClusterNode::computeError( const ClassificationData &trainingData, MatrixFloat &data, const Vector< UINT > &classLabels, Vector< MinMax > ranges, Vector< UINT > groupIndex, const UINT featureIndex, Float &threshold, Float &error ){
    
    error = 0;
    threshold = 0;
    
    const UINT M = trainingData.getNumSamples();
    const UINT K = (UINT)classLabels.size();
    
    Float giniIndexL = 0;
    Float giniIndexR = 0;
    Float weightL = 0;
    Float weightR = 0;
    VectorFloat groupCounter(2,0);
    MatrixFloat classProbabilities(K,2);
    
    //Use this data to train a KMeans cluster with 2 clusters
    KMeans kmeans;
    kmeans.setNumClusters( 2 );
    kmeans.setComputeTheta( true );
    kmeans.setMinChange( 1.0e-5 );
    kmeans.setMinNumEpochs( 1 );
    kmeans.setMaxNumEpochs( 100 );
    
    //Disable the logging to clean things up
    kmeans.setTrainingLoggingEnabled( false );
    
    if( !kmeans.train_( data ) ){
        errorLog << __GRT_LOG__ << " Failed to train KMeans model for feature: " << featureIndex << std::endl;
        return false;
    }
    
    //Set the split threshold as the mid point between the two clusters
    const MatrixFloat &clusters = kmeans.getClusters();
    threshold = 0;
    for(UINT i=0; i<clusters.getNumRows(); i++){
        threshold += clusters[i][0];
    }
    threshold /= clusters.getNumRows();
    
    //Iterate over each sample and work out if it should be in the lhs (0) or rhs (1) group based on the current threshold
    groupCounter[0] = groupCounter[1] = 0;
    classProbabilities.setAllValues(0);
    for(UINT i=0; i<M; i++){
        groupIndex[i] = trainingData[ i ][ featureIndex ] >= threshold ? 1 : 0;
        groupCounter[ groupIndex[i] ]++;
        classProbabilities[ getClassLabelIndexValue(trainingData[i].getClassLabel(),classLabels) ][ groupIndex[i] ]++;
    }
    
    //Compute the class probabilities for the lhs group and rhs group
    for(UINT k=0; k<K; k++){
        classProbabilities[k][0] = groupCounter[0]>0 ? classProbabilities[k][0]/groupCounter[0] : 0;
        classProbabilities[k][1] = groupCounter[1]>0 ? classProbabilities[k][1]/groupCounter[1] : 0;
    }
    
    //Compute the Gini index for the lhs and rhs groups
    giniIndexL = giniIndexR = 0;
    for(UINT k=0; k<K; k++){
        giniIndexL += classProbabilities[k][0] * (1.0-classProbabilities[k][0]);
        giniIndexR += classProbabilities[k][1] * (1.0-classProbabilities[k][1]);
    }
    weightL = groupCounter[0]/M;
    weightR = groupCounter[1]/M;
    error = (giniIndexL*weightL) + (giniIndexR*weightR);
    
    return true;
}
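Taken on its own, the scoring logic above reduces to: assign each sample to the left or right group at the candidate threshold, compute per-group class probabilities, and return the size-weighted sum of the two Gini indexes. The following standalone sketch reproduces that computation with standard-library types instead of the GRT VectorFloat/MatrixFloat containers; the toy feature values, labels, and threshold are made up for the example.

//--- Sketch: weighted two-way Gini impurity (standalone toy example) ---
#include <cstdio>
#include <vector>

//Gini impurity of one group: sum_k p_k * (1 - p_k)
static double gini( const std::vector<double> &counts, double total ){
    double g = 0;
    for(double c : counts){
        const double p = total > 0 ? c/total : 0;
        g += p * (1.0-p);
    }
    return g;
}

int main(){
    //Toy data: one feature value and one class index (0 or 1) per sample
    const std::vector<double> feature = { 0.1, 0.2, 0.3, 0.8, 0.9, 1.0 };
    const std::vector<int>    label   = { 0,   0,   0,   1,   1,   1   };
    const double threshold = 0.5; //candidate split point
    const int K = 2;              //number of classes

    std::vector< std::vector<double> > classCounts( 2, std::vector<double>(K,0) );
    std::vector<double> groupCounter(2,0);

    //Assign each sample to the lhs (0) or rhs (1) group, as computeError does
    for(size_t i=0; i<feature.size(); i++){
        const int group = feature[i] >= threshold ? 1 : 0;
        groupCounter[group]++;
        classCounts[group][ label[i] ]++;
    }

    //The split error is the size-weighted sum of the per-group Gini indexes
    const double M = (double)feature.size();
    const double error = gini(classCounts[0],groupCounter[0]) * (groupCounter[0]/M)
                       + gini(classCounts[1],groupCounter[1]) * (groupCounter[1]/M);

    std::printf("split error = %f\n", error); //prints 0.0: this split is pure
    return 0;
}

With this toy data the split is pure, so the error is 0; moving the threshold to 0.25 would put the 0.3 sample (class 0) on the right-hand side and raise the error to 0.25.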
bool DecisionTreeThresholdNode::computeBestSplitBestRandomSplit( const UINT &numSplittingSteps, const ClassificationData &trainingData, const Vector< UINT > &features, const Vector< UINT > &classLabels, UINT &featureIndex, Float &minError ){
    
    const UINT M = trainingData.getNumSamples();
    const UINT N = (UINT)features.size();
    const UINT K = (UINT)classLabels.size();
    
    if( N == 0 ) return false;
    
    minError = grt_numeric_limits< Float >::max();
    UINT bestFeatureIndex = 0;
    Float bestThreshold = 0;
    Float threshold = 0;
    Float error = 0;
    Float giniIndexL = 0;
    Float giniIndexR = 0;
    Float weightL = 0;
    Float weightR = 0;
    Random random;
    Vector< UINT > groupIndex(M);
    VectorFloat groupCounter(2,0);
    MatrixFloat classProbabilities(K,2);
    
    //Loop over a number of random splits and try to find the best split point
    UINT m,n;
    const UINT numFeatures = features.getSize();
    for(m=0; m<numSplittingSteps; m++){
        //Choose a random feature
        n = random.getRandomNumberInt(0,numFeatures);
        featureIndex = features[n];
        
        //Randomly choose the threshold, based on a randomly selected sample with some random scaling
        threshold = trainingData[ random.getRandomNumberInt(0,M) ][ featureIndex ] * random.getRandomNumberUniform(0.8,1.2);
        
        //Iterate over each sample and work out if it should be in the lhs (0) or rhs (1) group
        groupCounter[0] = groupCounter[1] = 0;
        classProbabilities.setAllValues(0);
        for(UINT i=0; i<M; i++){
            groupIndex[i] = trainingData[ i ][ featureIndex ] >= threshold ? 1 : 0;
            groupCounter[ groupIndex[i] ]++;
            classProbabilities[ getClassLabelIndexValue(trainingData[i].getClassLabel(),classLabels) ][ groupIndex[i] ]++;
        }
        
        //Compute the class probabilities for the lhs group and rhs group
        for(UINT k=0; k<K; k++){
            classProbabilities[k][0] = groupCounter[0]>0 ? classProbabilities[k][0]/groupCounter[0] : 0;
            classProbabilities[k][1] = groupCounter[1]>0 ? classProbabilities[k][1]/groupCounter[1] : 0;
        }
        
        //Compute the Gini index for the lhs and rhs groups
        giniIndexL = giniIndexR = 0;
        for(UINT k=0; k<K; k++){
            giniIndexL += classProbabilities[k][0] * (1.0-classProbabilities[k][0]);
            giniIndexR += classProbabilities[k][1] * (1.0-classProbabilities[k][1]);
        }
        weightL = groupCounter[0]/M;
        weightR = groupCounter[1]/M;
        error = (giniIndexL*weightL) + (giniIndexR*weightR);
        
        //Store the best threshold and feature index
        if( error < minError ){
            minError = error;
            bestThreshold = threshold;
            bestFeatureIndex = featureIndex;
        }
    }
    
    //Set the best feature index that will be returned to the DecisionTree that called this function
    featureIndex = bestFeatureIndex;
    
    //Store the node size, feature index, best threshold and class probabilities for this node
    set(M,featureIndex,bestThreshold,trainingData.getClassProbabilities(classLabels));
    
    return true;
}
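The only piece of this random-split strategy that is not shared with the iterative version below is how candidate thresholds are generated: each one is the feature value of a randomly chosen sample, jittered by a uniform random factor in [0.8, 1.2). Here is a minimal sketch of that generation step, using std::random in place of the GRT Random class; the feature column is a made-up toy example.

//--- Sketch: random threshold generation (standalone toy example) ---
#include <cstdio>
#include <random>
#include <vector>

int main(){
    //Toy feature column; in the code above this is one column of the training data
    const std::vector<double> featureColumn = { 0.4, 1.3, 2.2, 0.9, 1.7 };

    std::mt19937 rng(42); //fixed seed so the sketch is reproducible
    std::uniform_int_distribution<size_t> pickSample( 0, featureColumn.size()-1 );
    std::uniform_real_distribution<double> scale( 0.8, 1.2 );

    //Each candidate threshold is a randomly selected sample's feature value,
    //scaled by a random factor, mirroring the threshold choice above
    for(int m=0; m<5; m++){
        const double threshold = featureColumn[ pickSample(rng) ] * scale(rng);
        std::printf("candidate threshold %d: %f\n", m, threshold);
    }
    return 0;
}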
bool RegressionTree::computeBestSplitBestIterativeSplit( const RegressionData &trainingData, const Vector< UINT > &features, UINT &featureIndex, Float &threshold, Float &minError ){
    
    const UINT M = trainingData.getNumSamples();
    const UINT N = (UINT)features.size();
    
    if( N == 0 ) return false;
    
    minError = grt_numeric_limits< Float >::max();
    UINT bestFeatureIndex = 0;
    UINT groupID = 0;
    Float bestThreshold = 0;
    Float error = 0;
    Float minRange = 0;
    Float maxRange = 0;
    Float step = 0;
    Vector< UINT > groupIndex(M);
    VectorFloat groupCounter(2,0);
    VectorFloat groupMean(2,0);
    VectorFloat groupMSE(2,0);
    Vector< MinMax > ranges = trainingData.getInputRanges();
    
    //Loop over each feature and try to find the best split point
    for(UINT n=0; n<N; n++){
        featureIndex = features[n];
        minRange = ranges[featureIndex].minValue;
        maxRange = ranges[featureIndex].maxValue;
        step = (maxRange-minRange)/Float(numSplittingSteps);
        threshold = minRange;
        while( threshold <= maxRange ){
            
            //Reset the accumulators for this candidate threshold
            groupCounter[0] = groupCounter[1] = 0;
            groupMean[0] = groupMean[1] = 0;
            groupMSE[0] = groupMSE[1] = 0;
            
            //Iterate over each sample and work out what group it falls into
            for(UINT i=0; i<M; i++){
                groupID = trainingData[i].getInputVector()[featureIndex] >= threshold ? 1 : 0;
                groupIndex[i] = groupID;
                groupMean[ groupID ] += trainingData[i].getInputVector()[featureIndex];
                groupCounter[ groupID ]++;
            }
            groupMean[0] /= groupCounter[0] > 0 ? groupCounter[0] : 1;
            groupMean[1] /= groupCounter[1] > 0 ? groupCounter[1] : 1;
            
            //Compute the MSE for each group
            for(UINT i=0; i<M; i++){
                groupMSE[ groupIndex[i] ] += grt_sqr( groupMean[ groupIndex[i] ] - trainingData[ i ].getInputVector()[ featureIndex ] );
            }
            groupMSE[0] /= groupCounter[0] > 0 ? groupCounter[0] : 1;
            groupMSE[1] /= groupCounter[1] > 0 ? groupCounter[1] : 1;
            
            error = sqrt( groupMSE[0] + groupMSE[1] );
            
            //Store the best threshold and feature index
            if( error < minError ){
                minError = error;
                bestThreshold = threshold;
                bestFeatureIndex = featureIndex;
            }
            
            //Update the threshold
            threshold += step;
        }
    }
    
    //Set the best feature index and threshold
    featureIndex = bestFeatureIndex;
    threshold = bestThreshold;
    
    return true;
}
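For reference, one iteration of the loop above reduces to a two-pass computation: assign groups at the threshold and accumulate the per-group means, then accumulate each group's mean squared error and combine the two. Note that the code measures the spread of the input feature values within each group, not the regression targets; the standalone sketch below mirrors that behavior, with a made-up toy feature column and threshold.

//--- Sketch: per-threshold MSE split score (standalone toy example) ---
#include <cmath>
#include <cstdio>
#include <vector>

int main(){
    const std::vector<double> feature = { 0.2, 0.4, 0.5, 1.4, 1.6, 1.8 };
    const double threshold = 1.0;

    double mean[2] = { 0, 0 }, mse[2] = { 0, 0 }, count[2] = { 0, 0 };
    std::vector<int> group( feature.size() );

    //First pass: assign groups and accumulate the per-group means
    for(size_t i=0; i<feature.size(); i++){
        group[i] = feature[i] >= threshold ? 1 : 0;
        mean[ group[i] ] += feature[i];
        count[ group[i] ]++;
    }
    mean[0] /= count[0] > 0 ? count[0] : 1;
    mean[1] /= count[1] > 0 ? count[1] : 1;

    //Second pass: accumulate the per-group mean squared error
    for(size_t i=0; i<feature.size(); i++){
        const double d = mean[ group[i] ] - feature[i];
        mse[ group[i] ] += d * d;
    }
    mse[0] /= count[0] > 0 ? count[0] : 1;
    mse[1] /= count[1] > 0 ? count[1] : 1;

    //The split score combines the two group errors, as in the loop above
    const double error = std::sqrt( mse[0] + mse[1] );
    std::printf("split error = %f\n", error);
    return 0;
}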
bool DecisionTreeThresholdNode::computeBestSplitBestIterativeSplit( const UINT &numSplittingSteps, const ClassificationData &trainingData, const Vector< UINT > &features, const Vector< UINT > &classLabels, UINT &featureIndex, Float &minError ){
    
    const UINT M = trainingData.getNumSamples();
    const UINT N = features.getSize();
    const UINT K = classLabels.getSize();
    
    if( N == 0 ) return false;
    
    minError = grt_numeric_limits< Float >::max();
    UINT bestFeatureIndex = 0;
    Float bestThreshold = 0;
    Float threshold = 0;
    Float error = 0;
    Float minRange = 0;
    Float maxRange = 0;
    Float step = 0;
    Float giniIndexL = 0;
    Float giniIndexR = 0;
    Float weightL = 0;
    Float weightR = 0;
    Vector< UINT > groupIndex(M);
    VectorFloat groupCounter(2,0);
    Vector< MinMax > ranges = trainingData.getRanges();
    MatrixFloat classProbabilities(K,2);
    
    //Loop over each feature and try to find the best split point
    for(UINT n=0; n<N; n++){
        featureIndex = features[n];
        minRange = ranges[featureIndex].minValue;
        maxRange = ranges[featureIndex].maxValue;
        step = (maxRange-minRange)/Float(numSplittingSteps);
        threshold = minRange;
        while( threshold <= maxRange ){
            
            //Iterate over each sample and work out if it should be in the lhs (0) or rhs (1) group
            groupCounter[0] = groupCounter[1] = 0;
            classProbabilities.setAllValues(0);
            for(UINT i=0; i<M; i++){
                groupIndex[i] = trainingData[ i ][ featureIndex ] >= threshold ? 1 : 0;
                groupCounter[ groupIndex[i] ]++;
                classProbabilities[ getClassLabelIndexValue(trainingData[i].getClassLabel(),classLabels) ][ groupIndex[i] ]++;
            }
            
            //Compute the class probabilities for the lhs group and rhs group
            for(UINT k=0; k<K; k++){
                classProbabilities[k][0] = groupCounter[0]>0 ? classProbabilities[k][0]/groupCounter[0] : 0;
                classProbabilities[k][1] = groupCounter[1]>0 ? classProbabilities[k][1]/groupCounter[1] : 0;
            }
            
            //Compute the Gini index for the lhs and rhs groups
            giniIndexL = giniIndexR = 0;
            for(UINT k=0; k<K; k++){
                giniIndexL += classProbabilities[k][0] * (1.0-classProbabilities[k][0]);
                giniIndexR += classProbabilities[k][1] * (1.0-classProbabilities[k][1]);
            }
            weightL = groupCounter[0]/M;
            weightR = groupCounter[1]/M;
            error = (giniIndexL*weightL) + (giniIndexR*weightR);
            
            //Store the best threshold and feature index
            if( error < minError ){
                minError = error;
                bestThreshold = threshold;
                bestFeatureIndex = featureIndex;
            }
            
            //Update the threshold
            threshold += step;
        }
    }
    
    //Set the best feature index that will be returned to the DecisionTree that called this function
    featureIndex = bestFeatureIndex;
    
    //Store the node size, feature index, best threshold and class probabilities for this node
    set(M,featureIndex,bestThreshold,trainingData.getClassProbabilities(classLabels));
    
    return true;
}
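The outer structure shared by both iterative methods is a simple sweep: step the threshold from the feature's minimum to its maximum in numSplittingSteps increments and keep the candidate with the lowest score. The sketch below reproduces that sweep in standalone form, scoring each candidate with the binary-class weighted Gini impurity; the toy data, labels, and step count are made up for the example.

//--- Sketch: iterative threshold sweep (standalone toy example) ---
#include <algorithm>
#include <cstdio>
#include <vector>

//Weighted two-way Gini impurity for binary labels at a given threshold
static double splitError( const std::vector<double> &x, const std::vector<int> &y, double threshold ){
    double count[2] = { 0, 0 }, pos[2] = { 0, 0 };
    for(size_t i=0; i<x.size(); i++){
        const int g = x[i] >= threshold ? 1 : 0;
        count[g]++;
        pos[g] += y[i];
    }
    double error = 0;
    for(int g=0; g<2; g++){
        const double p = count[g] > 0 ? pos[g]/count[g] : 0;
        error += 2.0 * p * (1.0-p) * (count[g] / x.size()); //2p(1-p) is the binary Gini index
    }
    return error;
}

int main(){
    const std::vector<double> x = { 0.1, 0.3, 0.4, 0.6, 0.8, 0.9 };
    const std::vector<int>    y = { 0,   0,   0,   1,   1,   1   };
    const int numSplittingSteps = 10;

    const double minRange = *std::min_element( x.begin(), x.end() );
    const double maxRange = *std::max_element( x.begin(), x.end() );
    const double step = (maxRange-minRange)/numSplittingSteps;

    //Sweep the threshold across the feature's range, keeping the best split
    double bestThreshold = minRange, minError = 1e30;
    for(double threshold=minRange; threshold<=maxRange; threshold+=step){
        const double error = splitError( x, y, threshold );
        if( error < minError ){ minError = error; bestThreshold = threshold; }
    }
    std::printf("best threshold = %f, error = %f\n", bestThreshold, minError);
    return 0;
}

Because the sweep is linear in the number of steps, numSplittingSteps trades split quality against training time; the random-split variant above makes the opposite trade by sampling candidate thresholds rather than enumerating them.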