//recursively insert nodes void Tree::createBranch(boost::shared_ptr<Node> parentNode, DataFrame &dfsplit, const int nrFeat, bool verbose) { vector<int> featsubset = LUtils::sample(rng, nrFeat, dfsplit.nrcols - 1, false); if (verbose) { cout << "Feature subset: "; for (unsigned i = 0; i < featsubset.size(); ++i) { cout << " " << dfsplit.header[featsubset[i]]; } cout << endl; } DataFrame leftDF; DataFrame rightDF; dfsplit.splitFrame(parentNode->splitvalue, parentNode->feature, leftDF, rightDF); tree_size++; //LEFT BRANCH if (verbose && leftDF.nrrows > 0) { cout << "...Creating left branch: Feature: " << dfsplit.header[parentNode->feature] << " Value:" << parentNode->splitvalue << " n:" << leftDF.nrrows << " with prediction:" << leftDF.cm << endl; //leftDF.printSummary(); } if (leftDF.nrrows == 0) { //happens if one of the nodes is "practically" pure if (verbose) { cout << "No data in left node, right node:" << rightDF.nrrows << endl; cout << "Left node: Parent node is terminal." << endl; } parentNode->isTerminal = true; tnodecount++; return; } else if (leftDF.nrrows <= min_node || parentNode->depth + 1 > max_depth || leftDF.distinct[leftDF.classCol] < 2) { if (verbose) cout << "Terminal node, cm: " << leftDF.cm << endl; boost::shared_ptr<Node> left = boost::make_shared<Node>( parentNode->depth + 1, leftDF.cm); left->isTerminal = true; left->nrsamples = leftDF.nrrows; parentNode->left = left; tnodecount++; } else { DataFrame::FeatureResult featResulta = leftDF.findBestFeature( featsubset, entropy_loss); boost::shared_ptr<Node> left = boost::make_shared<Node>( featResulta.opt_feat, featResulta.opt_split, featResulta.loss, parentNode->depth + 1, leftDF.header[featResulta.opt_feat], leftDF.nrrows, leftDF.cm); parentNode->left = left; createBranch(left, leftDF, nrFeat, verbose); } //RIGHT BRANCH if (verbose && rightDF.nrrows > 0) { cout << "...Creating right branch: Feature: " << dfsplit.header[parentNode->feature] << " Value:" << parentNode->splitvalue << " n:" << rightDF.nrrows << " with prediction:" << rightDF.cm << endl; //rightDF.printSummary(); } if (rightDF.nrrows == 0) { //happens if one of the nodes is "practically" pure if (verbose) { cout << "No data in right node, left node:" << leftDF.nrrows << endl; cout << "Right node: Parent node is terminal." << endl; } parentNode->isTerminal = true; tnodecount++; return; } else if (rightDF.nrrows <= min_node || parentNode->depth + 1 > max_depth || rightDF.distinct[rightDF.classCol] < 2) { if (verbose) cout << "Terminal node, cm: " << rightDF.cm << endl; boost::shared_ptr<Node> right = boost::make_shared<Node>( parentNode->depth + 1, rightDF.cm); right->isTerminal = true; right->nrsamples = rightDF.nrrows; parentNode->right = right; tnodecount++; } else { DataFrame::FeatureResult featResultb = rightDF.findBestFeature( featsubset, entropy_loss); if (verbose) cout << "Terminal node, cm: " << rightDF.cm << endl; boost::shared_ptr<Node> right = boost::make_shared<Node>( featResultb.opt_feat, featResultb.opt_split, featResultb.loss, parentNode->depth + 1, rightDF.header[featResultb.opt_feat], rightDF.nrrows, rightDF.cm); parentNode->right = right; createBranch(right, rightDF, nrFeat, verbose); } //if we reach this point, we should return return; }