static bool bestSplittingDimension(const vector< vector< nl_vector > > & inputs,
                                   const vector< double > & outputs,
                                   const vector< unsigned int > & indices,
                                   const unsigned int scaleIndex,
                                   const unsigned int minSize,
                                   const rf_rgrsn_tree_parameter & para,
                                   const ff_tree_cost_function & costFunction,
                                   // output
                                   unsigned int & splitDim,
                                   double & splitThreshold,
                                   double & loss,
                                   vector< unsigned int > & leftIndices,
                                   vector< unsigned int > & rightIndices)
{
    assert(indices.size() >= minSize);

    // randomly generate the dimensions that are used in the split
    unsigned int nDims = (unsigned int)inputs[0][scaleIndex].size();
    vector< unsigned int > randomDimensions(nDims);
    for (unsigned int i = 0; i < nDims; ++i) {
        randomDimensions[i] = i;
    }
    random_shuffle(randomDimensions.begin(), randomDimensions.end());

    unsigned int subDims = (unsigned int)sqrt(double(nDims));
    assert(subDims > 0);
    assert(subDims <= nDims);

    // keep only a random subset of the dimensions, to avoid always picking the same dimension
    vector< unsigned int > randomSubset(randomDimensions.begin(),
                                        randomDimensions.begin() + subDims);

    // loop over each candidate dimension, keeping the split with the smallest loss;
    // loss must be initialized here, otherwise the comparison below reads an
    // indeterminate value
    loss = INT_MAX;
    bool isSplitOk = false;
    for (size_t i = 0; i < randomSubset.size(); i++) {
        vector<unsigned int> curLeftIndices;
        vector<unsigned int> curRightIndices;
        double curLoss = INT_MAX;
        double threshold = 0.0;
        bool isSplit = bestSplittingInDimenstionWithRandom(inputs, outputs, indices,
                                                           scaleIndex, randomSubset[i],
                                                           minSize, para, costFunction,
                                                           curLoss, threshold,
                                                           curLeftIndices, curRightIndices);
        if (isSplit && curLoss < loss) {
            loss = curLoss;
            leftIndices = curLeftIndices;
            rightIndices = curRightIndices;
            splitDim = randomSubset[i];
            splitThreshold = threshold;
            isSplitOk = true;
        }
    }
    return isSplitOk;
}
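// The function above implements the random-subspace rule of random forests:
// at each node, only ~sqrt(D) randomly chosen dimensions are considered as
// split candidates. Below is a minimal standalone sketch of that selection
// step. It uses std::shuffle, since std::random_shuffle (used above) was
// deprecated in C++14 and removed in C++17; the function name and the
// explicit RNG parameter are illustrative, not part of the original codebase.
#include <algorithm>
#include <cmath>
#include <numeric>
#include <random>
#include <vector>

std::vector<unsigned int> sampleSplitDimensions(unsigned int nDims, std::mt19937 & rng)
{
    std::vector<unsigned int> dims(nDims);
    std::iota(dims.begin(), dims.end(), 0u);      // 0, 1, ..., nDims-1
    std::shuffle(dims.begin(), dims.end(), rng);  // unbiased random permutation
    // Random forests commonly examine about sqrt(D) candidate dimensions per node.
    unsigned int subDims = std::max(1u, (unsigned int)std::sqrt((double)nDims));
    dims.resize(subDims);
    return dims;
}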
/* Find the best split for node root along with other relevant information */
split_t bestSplit(tree_t* t, node_t* root, dataset_t* d){
    split_t ret;
    int ii,i,j,ex,prev,prevex;
    float posleft,negleft,poszero,negzero,posnonzero,negnonzero;
    float threshold;
    float total = root->pos+root->neg;
    evpair_t* fi;

    ret.feature = -1;
    /* First compute the entropy of the parent */
    ret.gain = -entropy(root->pos/total);

    /* Select a random subset of features */
    if(t->committee == RANDOMFOREST)
        randomSubset(t->feats, d->nfeat, t->fpn, t->used);

    for(ii=0; ii<t->fpn; ii++){
        i=t->feats[ii];
        if(t->used[i])
            continue;
        fi=d->feature[i];
        if(d->cont[i]){ /* The feature is continuous */
            /* Find the first valid example */
            prevex = -1;
            for(j=0; j<d->size[i]; j++){
                ex = fi[j].example;
                if(t->valid[ex]>0){
                    prevex = ex;
                    break;
                }
            }
            if (prevex<0)
                continue;
            prev = j;
            /* Calculate the mass allocated to the zero value.
             * We start with the mass allocated to the nonzero values. */
            posnonzero = FLT_EPSILON;
            negnonzero = FLT_EPSILON;
            for(j=prev; j<d->size[i]; j++){
                ex = fi[j].example;
                if(t->valid[ex]<=0)
                    continue;
                if(d->target[ex])
                    posnonzero += d->weight[ex];
                else
                    negnonzero += d->weight[ex];
            }
            /* The mass allocated to the zero value is the rest */
            poszero = max(FLT_EPSILON, root->pos - posnonzero);
            negzero = max(FLT_EPSILON, root->neg - negnonzero);
            /* Initialize counts */
            posleft = FLT_EPSILON;
            negleft = FLT_EPSILON;
            /* Add the mass allocated to zero if the first valid example is > 0 */
            if (fi[prev].value > 0){
                posleft += poszero;
                negleft += negzero;
                /* Also check the split between 0 and value */
                threshold = 0.5*(0 + fi[prev].value);
                updateSplit(i,threshold,posleft,negleft,root,&ret);
            }
            for(j=prev+1; j<d->size[i]; j++){
                ex = fi[j].example;
                if(t->valid[ex]<=0)
                    continue;
                if(d->target[prevex])
                    posleft += d->weight[prevex];
                else
                    negleft += d->weight[prevex];
                if (fi[prev].value < 0 && 0 < fi[j].value){
                    /* First check the split between the previous value and 0 */
                    threshold = 0.5*(fi[prev].value + 0);
                    updateSplit(i,threshold,posleft,negleft,root,&ret);
                    posleft += poszero;
                    negleft += negzero;
                    /* Now check the split between 0 and the current value */
                    threshold = 0.5*(0 + fi[j].value);
                    updateSplit(i,threshold,posleft,negleft,root,&ret);
                }
                /* Check the split between the two values if they are different.
                 * The extra condition d->target[ex] != d->target[prevex] is not
                 * used because it's not correct if the examples don't take
                 * unique values. */
                if(fi[j].value != fi[prev].value){
                    threshold = 0.5*(fi[j].value + fi[prev].value);
                    updateSplit(i,threshold,posleft,negleft,root,&ret);
                }
                prev = j;
                prevex = ex;
            }
        }
        else{ /* The feature is binary */
            /* These values are not used in the computation of entropy,
             * so they don't need to be smoothed. */
            float posright = 0;
            float negright = 0;
            /* Count the positive and negative mass that will go to the right */
            for(j=0; j<d->size[i]; j++){
                ex = fi[j].example;
                if(t->valid[ex]<=0)
                    continue;
                if(d->target[ex])
                    posright += d->weight[ex];
                else
                    negright += d->weight[ex];
            }
            /* The mass that will go to the left is the rest */
            posleft = max(FLT_EPSILON, root->pos - posright);
            negleft = max(FLT_EPSILON, root->neg - negright);
            updateSplit(i,0.5,posleft,negleft,root,&ret);
        }
    }
    return ret;
}
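/* The split search above delegates the scoring to entropy() and updateSplit(),
 * which are not shown. The sketch below is a hedged guess at the quantity
 * updateSplit() maximizes: the information gain of a candidate threshold,
 * computed from the positive/negative mass on each side of the split. The
 * names binaryEntropy and splitGain are illustrative, not the original
 * helpers. */
#include <cmath>

static double binaryEntropy(double p)
{
    if (p <= 0.0 || p >= 1.0) return 0.0;
    return -p * std::log2(p) - (1.0 - p) * std::log2(1.0 - p);
}

/* Gain of splitting a node with total masses (pos, neg) so that
 * (posleft, negleft) goes to the left child; the right child gets the rest. */
static double splitGain(double pos, double neg, double posleft, double negleft)
{
    double total = pos + neg;
    double posright = pos - posleft, negright = neg - negleft;
    double left = posleft + negleft, right = posright + negright;
    if (left <= 0.0 || right <= 0.0) return 0.0;  /* degenerate split */
    return binaryEntropy(pos / total)
         - (left / total) * binaryEntropy(posleft / left)
         - (right / total) * binaryEntropy(posright / right);
}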
void stepCustom(cliqueTreeType& currentTree, graphType& graph,
                std::vector<mpfr_class>& exactValues, int nVertices,
                boost::mt19937& randomSource, working& temp, int edgeLimit)
{
    boost::random::uniform_int_distribution<> randomVertexDist(0, nVertices-1);
    boost::random::bernoulli_distribution<> standardBernoulli;
    boost::random::uniform_real_distribution<> standardUniform;
    //Pick two distinct vertices uniformly at random
    int randomVertex1, randomVertex2;
    do
    {
        randomVertex1 = randomVertexDist(randomSource);
        randomVertex2 = randomVertexDist(randomSource);
    } while(randomVertex1 == randomVertex2);

    std::size_t original_edges = boost::num_edges(graph);
    std::pair<graphType::edge_descriptor, bool> existingEdge = boost::edge(randomVertex1, randomVertex2, graph);
    cliqueTreeAdjacencyMatrix& copied = temp.copied;
    //Here we remove edges
    if(existingEdge.second)
    {
        int cliqueVertex = -1;
        //Can we remove this edge?
        if(currentTree.canRemoveEdge(randomVertex1, randomVertex2, temp.counts1, cliqueVertex))
        {
            //Form tree of removable edge subsets and work out the counts.
            currentTree.formRemovalTree(temp.stateCounts, copied, randomVertex1, randomVertex2, temp.uniqueSubsets, temp.removalTemporaries);
            //The maximum number of other removable edges, in addition to edge (u, v).
            int maximumOtherRemovableEdges = 0;
            for(int i = 0; i < nVertices; i++)
            {
                if(temp.stateCounts[i] == 0) break;
                maximumOtherRemovableEdges = i;
            }
            //The number of edges to actually remove for the proposal.
            int extraToRemove = 0;
            //Compute probabilities and normalizing constant.
            temp.probabilities.clear();
            double sum1 = 0;
            for(int i = 0; i < maximumOtherRemovableEdges + 1; i++)
            {
                temp.probabilities.push_back(mpfr_class(exactValues[original_edges] / exactValues[original_edges - i - 1]).convert_to<double>());
                sum1 += temp.probabilities.back();
            }
            if(maximumOtherRemovableEdges > 0)
            {
                boost::random::discrete_distribution<> extraNumberToRemoveDist(temp.probabilities.begin(), temp.probabilities.end());
                extraToRemove = extraNumberToRemoveDist(randomSource);
            }
            //The acceptance probability for the Metropolis-Hastings proposal.
            mpfr_class acceptanceProbability = 0;
            //If we actually only remove a single edge then things are more complicated.
            //There are two ways this can happen; the second corresponds to swapping
            //randomVertex1 and randomVertex2.
            if(extraToRemove == 0)
            {
                //Form tree of removable edge subsets and work out the counts.
                currentTree.formRemovalTree(temp.stateCounts, copied, randomVertex2, randomVertex1, temp.uniqueSubsets, temp.removalTemporaries);
                int maximumOtherRemovableEdges2 = 0;
                for(int i = 0; i < nVertices; i++)
                {
                    if(temp.stateCounts[i] == 0) break;
                    maximumOtherRemovableEdges2 = i;
                }
                double sum2 = 0;
                for(int i = 0; i < maximumOtherRemovableEdges2 + 1; i++)
                {
                    sum2 += mpfr_class(exactValues[original_edges] / exactValues[original_edges - i - 1]).convert_to<double>();
                }
                acceptanceProbability = 1/(0.5 * (1/sum1 + 1/sum2));
            }
            else acceptanceProbability = sum1 * temp.stateCounts[extraToRemove];
            if(acceptanceProbability >= 1 || standardUniform(randomSource) <= acceptanceProbability.convert_to<double>())
            {
                currentTree.removeEdgeKnownCliqueVertex(randomVertex1, randomVertex2, temp.colourVector, temp.counts2, cliqueVertex);
                boost::remove_edge(randomVertex1, randomVertex2, graph);
                if(extraToRemove != 0)
                {
                    boost::random::uniform_int_distribution<> randomSubset(0, temp.stateCounts[extraToRemove] - 1);
                    int index = randomSubset(randomSource);
                    bitsetType chosenSubset;
                    for(std::unordered_set<bitsetType>::iterator i = temp.uniqueSubsets.begin(); i != temp.uniqueSubsets.end(); i++)
                    {
                        if((int)i->count() == extraToRemove+1)
                        {
                            if(index == 0)
                            {
                                chosenSubset = *i;
                                break;
                            }
                            index--;
                        }
                    }
                    //This is 1 rather than 0, because the edge (randomVertex1, randomVertex2) is already deleted.
                    while(chosenSubset.count() > 1)
                    {
                        currentTree.canRemoveEdge(randomVertex1, randomVertex2, temp.counts1, cliqueVertex);
                        for(int i = 0; i < nVertices; i++)
                        {
                            if(chosenSubset[i] && temp.counts1[i] == 1)
                            {
                                chosenSubset[i] = false;
                                currentTree.tryRemoveEdge(randomVertex1, i, temp.colourVector, temp.counts2);
                                boost::remove_edge(randomVertex1, i, graph);
                            }
                        }
                    }
                }
            }
        }
    }
    //Here we add edges
    else
    {
        cliqueTreeAdjacencyMatrix& copied2 = temp.copied2;
        bitsetType newEdgesVertex1;
        currentTree.unionMinimalSeparators(randomVertex1, randomVertex2, newEdgesVertex1, temp.vertexSequence, temp.edgeSequence, temp.addEdges, temp.removeEdges, temp.unionMinimalSepTemp);
        int increaseInEdges = 1;
        for(int i = 0; i < nVertices; i++)
        {
            if(newEdgesVertex1[i] && !boost::edge(i, randomVertex1, graph).second) increaseInEdges++;
        }
        if((int)original_edges + increaseInEdges <= edgeLimit)
        {
            mpfr_class acceptanceProbability;
            if(increaseInEdges != 1)
            {
                copied.makeCopy(currentTree);
                //Actually add the edges to the copy
                copied.addEdge(randomVertex1, randomVertex2, newEdgesVertex1, temp.vertexSequence, temp.edgeSequence, temp.addEdges, temp.removeEdges, temp.unionMinimalSepTemp, true);
                copied.formRemovalTree(temp.stateCounts, copied2, randomVertex1, randomVertex2, temp.uniqueSubsets, temp.removalTemporaries);
                //Work out how many more edges can be removed from the original
                int otherBackwardsCanRemove = 0;
                for(int i = 0; i < nVertices; i++)
                {
                    if(temp.stateCounts[i] == 0) break;
                    otherBackwardsCanRemove = i;
                }
                double sum = 0;
                for(int i = 0; i < otherBackwardsCanRemove + 1; i++)
                {
                    sum += mpfr_class(exactValues[original_edges + increaseInEdges] / exactValues[original_edges + increaseInEdges - i - 1]).convert_to<double>();
                }
                acceptanceProbability = 1/(sum * temp.stateCounts[increaseInEdges - 1]);
            }
            else
            {
                copied.makeCopy(currentTree);
                //Actually add the edges to the copy
                copied.addEdge(randomVertex1, randomVertex2, newEdgesVertex1, temp.vertexSequence, temp.edgeSequence, temp.addEdges, temp.removeEdges, temp.unionMinimalSepTemp, true);
                //Form removal tree with the vertices the same way round
                copied.formRemovalTree(temp.stateCounts, copied2, randomVertex1, randomVertex2, temp.uniqueSubsets, temp.removalTemporaries);
                //Work out how many more edges can be removed from the original
                int otherBackwardsCanRemove = 0;
                for(int i = 0; i < nVertices; i++)
                {
                    if(temp.stateCounts[i] == 0) break;
                    otherBackwardsCanRemove = i;
                }
                double sum1 = 0;
                for(int i = 0; i < otherBackwardsCanRemove + 1; i++)
                {
                    sum1 += mpfr_class(exactValues[original_edges + increaseInEdges] / exactValues[original_edges + increaseInEdges - i - 1]).convert_to<double>();
                }
                //Form removal tree with the vertices reversed
                copied.formRemovalTree(temp.stateCounts, copied2, randomVertex2, randomVertex1, temp.uniqueSubsets, temp.removalTemporaries);
                //Work out how many more edges can be removed from the original
                int otherBackwardsCanRemove2 = 0;
                for(int i = 0; i < nVertices; i++)
                {
                    if(temp.stateCounts[i] == 0) break;
                    otherBackwardsCanRemove2 = i;
                }
                double sum2 = 0;
                for(int i = 0; i < otherBackwardsCanRemove2 + 1; i++)
                {
                    sum2 += mpfr_class(exactValues[original_edges + increaseInEdges] / exactValues[original_edges + increaseInEdges - i - 1]).convert_to<double>();
                }
                acceptanceProbability = 0.5 * (1/sum1 + 1/sum2);
            }
            if (acceptanceProbability >= 1 || standardUniform(randomSource) <= acceptanceProbability.convert_to<double>())
            {
                currentTree.swap(copied);
                newEdgesVertex1[randomVertex2] = true;
                for(int i = 0; i < nVertices; i++)
                {
                    if(newEdgesVertex1[i])
                    {
                        boost::add_edge(randomVertex1, i, graph);
                    }
                }
            }
        }
    }
#ifndef NDEBUG
    currentTree.check();
#endif
}
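// Both branches of stepCustom end with the same Metropolis-Hastings
// accept/reject test: accept outright if the acceptance ratio is at least 1,
// otherwise accept with probability equal to the ratio. A generic sketch of
// that test is below; it is illustrative only - in the code above the ratio
// is an mpfr_class and the RNG is a boost::mt19937.
#include <random>

template <typename RNG>
bool metropolisAccept(double acceptanceRatio, RNG & rng)
{
    if (acceptanceRatio >= 1.0) return true;  // certain acceptance
    std::uniform_real_distribution<double> u(0.0, 1.0);
    return u(rng) <= acceptanceRatio;         // accept with probability = ratio
}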