/*
 * Recursively removes prunable nodes from the subtree rooted at `node`.
 *
 * node    - root of the subtree to process; may be NULL.
 * simpler - out-parameter; on return *simpler is the node that should take
 *           the place of `node` in its parent (NULL, `node` itself, or a
 *           replacement found further down the tree).
 *
 * A node for which is_prunable() is true is released with node_finalize()
 * and replaced by whatever its children reduce to.
 */
void prune_node(node_t *node, node_t **simpler)
{
    if (node == NULL) {
        // nothing to prune: the parent's pointer stays NULL
        *simpler = NULL;
        return;
    }

    if (!is_prunable(node)) {
        // keep this node, but let every child be replaced by its pruned form
        for (int i = 0; i < node->n_children; ++i) {
            prune_node(node->children[i], simpler);
            // store the replacement, in case the child was prunable
            node->children[i] = *simpler;
        }
        *simpler = node;
        return;
    }

    // A prunable node with no children has no replacement. Reset the
    // out-parameter so the caller does not pick up a stale pointer left
    // behind by an earlier sibling (or by the initial value).
    *simpler = NULL;

    // NOTE(review): when a prunable node has several children, only the
    // replacement reported for the last child survives in *simpler -
    // confirm is_prunable() only accepts nodes with at most one child.
    for (int i = 0; i < node->n_children; i++) {
        prune_node(node->children[i], simpler);
    }

    // clean up the node we are removing
    node_finalize(node);
}
/*
 * Simplifies the tree rooted at `root` and stores the root of the resulting
 * tree in *simplified.
 *
 * simplified - out-parameter receiving the root of the simplified tree.
 * root       - root of the tree to simplify.
 *
 * prune_node() may finalize `root` itself and report a different node through
 * *simplified, so after pruning only *simplified is used; `root` must not be
 * touched again.
 */
void simplify_tree ( node_t **simplified, node_t *root )
{
    *simplified = root;

    // prune all redundant nodes first; *simplified now holds the surviving root
    prune_node ( root, simplified );

    // go through the whole (pruned) tree, doing our work on each node;
    // skipped when pruning left no tree at all
    node_t *pruned_root = *simplified;
    if ( pruned_root != NULL )
    {
        dfs ( pruned_root, pruned_root );
    }
}
// this is a public function for attempting to prune the decision tree and // improve classification int dt_prune(decision_tree *dt, data_set *validation_data) { return prune_node(dt, dt->root, validation_data); }
// this is a private function that recursively prunes nodes top-down // and only accepts a pruning if it increases the prediction score of the // validation data // returns the number of nodes successfully pruned int prune_node(decision_tree *dt, dt_node *node, data_set *validation_data) { // the score with both subtrees still attached float primary_score = dt_score(dt, validation_data); // save subtrees so that we can restore them if classification score // didn't improve dt_node *left = node->left; dt_node *right = node->right; int right_prune_count = 0; int left_prune_count = 0; if(left != NULL) { node->left = NULL; // score the decision tree with the missing subtree float left_prune_score = dt_score(dt, validation_data); if(left_prune_score >= primary_score) { // found a good prune! left_prune_count = count_nodes(left); float diff = left_prune_score - primary_score; if(diff > 0.0002 || left_prune_count > 10) { printf("Improved score by %.4f, dropped %d nodes\n", diff, left_prune_count); } // throw away the subtree now that we don't need it dt_free_node(left); } else { // prune was no good, so restore the subtree and recurse node->left = left; left_prune_count = prune_node(dt, node->left, validation_data); } } if(right != NULL) { // basically the same as above, but for the right subtree node->right = NULL; float right_prune_score = dt_score(dt, validation_data); if(right_prune_score >= primary_score) { right_prune_count = count_nodes(right); float diff = right_prune_score - primary_score; if(diff > 0.0002 || right_prune_count > 10) { printf("Improved score by %.4f, dropped %d nodes\n", diff, right_prune_count); } dt_free_node(right); } else { node->right = right; right_prune_count = prune_node(dt, node->right, validation_data); } } // need to see if we're a leaf now if(node->left == NULL && node->right == NULL) { node->prediction_value = guess_node_class(dt, node); node->is_leaf = 1; } return left_prune_count + right_prune_count; }