// Builds a regression tree over train_set, growing it one full level at a time.
//
// Each iteration picks the single feature that, used as the split feature for
// the WHOLE current layer, minimizes the summed training error, commits that
// split, records it in feature_id_at_depth, and removes the feature from the
// candidate pool (so every depth splits on a distinct feature).
// Growth stops when the leaf budget, the depth limit, or the feature pool is
// exhausted; the final layer is then marked as leaves.
tree::tree(data_set& train_set, int max_leafs, int max_depth) : max_leafs(max_leafs)
{
    // Candidate split features: one index per column of the feature vector.
    std::set<int> features;
    for (size_t i = 0; i < train_set[0].features.size(); i++)
    {
        features.insert(i);
    }
    leafs = 1;
    int depth = 0;
    // Root owns the entire training range plus its aggregate statistics.
    root = new node(0);
    root->data_begin = train_set.begin();
    root->data_end = train_set.end();
    root->calc_avg();
    root->node_mse = calc_mse(root->data_begin, root->data_end, root->output_value, root->size);
    std::vector<node*> layer;
    layer.push_back(root);
    layers.push_back(layer);
    while (leafs < max_leafs && depth < max_depth && !features.empty())
    {
        float min_error = INF;
        int best_feature = -1;
        make_layer(depth);
        for (std::set<int>::iterator cur_split_feature = features.begin(); cur_split_feature != features.end(); cur_split_feature++) //choose best split feature at current depth
        {
            // Total error across the layer if every node split on this feature.
            float cur_error = 0;
            for (size_t i = 0; i < layers[depth].size(); i++)
            {
                cur_error += layers[depth][i]->split(*cur_split_feature);
            }
            if (cur_error < min_error)
            {
                min_error = cur_error;
                best_feature = *cur_split_feature;
            }
        }
        // Re-split the layer with the winning feature so the child nodes end
        // up holding the state of the BEST split, not the last one tried.
        // NOTE(review): if every candidate yielded cur_error == INF,
        // best_feature would stay -1 and split(-1) would index features out
        // of range — presumably unreachable for non-empty data; confirm.
        for (size_t i = 0; i < layers[depth].size(); i++)
        {
            layers[depth][i]->split(best_feature);
        }
        feature_id_at_depth.push_back(best_feature);
        features.erase(best_feature);
        depth++;
        //std::cout << "level " << depth << " created. training error: " << min_error << " best feat: " << best_feature << " split_val: "
        //<< root->split_value << std::endl;
    }
    // Everything on the deepest layer becomes a leaf.
    for (size_t i = 0; i < layers.back().size(); i++)
    {
        layers.back()[i]->is_leaf = true;
    }
    //std::cout << "leafs before pruning: " << leafs << std::endl;
    //prune(root);
    //std::cout << "new tree! leafs after pruning: " << leafs << std::endl;
    // Drop any empty trailing layers.
    while (layers.back().empty())
    {
        layers.pop_back();
    }
}
float node::split(int split_feature_id) { if (is_leaf) { return calc_mse(data_begin, data_end, output_value, size); } std::sort(data_begin, data_end, test_comparator(split_feature_id)); float l_sum = 0; float l_size = 0; float r_sum = sum; float r_size = size; float best_mse = INF; for (data_set::iterator cur_test = data_begin + 1; cur_test != data_end; cur_test++) //try all possible splits { l_sum += (cur_test - 1)->anwser; l_size++; r_sum -= (cur_test - 1)->anwser; r_size--; if (cur_test->features[split_feature_id] == (cur_test - 1)->features[split_feature_id]) { continue; } float l_avg = l_sum / l_size; float r_avg = r_sum / r_size; float l_mse = calc_mse(data_begin, cur_test, l_avg, l_size); float r_mse = calc_mse(cur_test, data_end, r_avg, r_size); float cur_mse = l_mse + r_mse; if (cur_mse < best_mse) { best_mse = cur_mse; split_value = cur_test->features[split_feature_id]; left->data_begin = data_begin; left->data_end = cur_test; left->output_value = l_avg; left->size = l_size; left->sum = l_sum; left->node_mse = l_mse; left->is_leaf = (l_size == 1) ? true : false; right->data_begin = cur_test; right->data_end = data_end; right->output_value = r_avg; right->size = r_size; right->sum = r_sum; right->node_mse = r_mse; right->is_leaf = (r_size == 1) ? true : false; } } if (best_mse == INF) { best_mse = node_mse; split_value = (data_begin + 1)->features[split_feature_id]; left->output_value = output_value; left->size = 0; left->is_leaf = true; right->data_begin = data_begin; right->data_end = data_end; right->output_value = output_value; right->size = size; right->sum = sum; right->node_mse = node_mse; right->is_leaf = (size <= 1) ? true : false; } return best_mse; }
// Recomputes the cached error metrics for this evaluation.
// Presumably `_e` holds the per-point errors and `_d` the actual values the
// percentage error is scaled against — TODO confirm against calc_* definitions.
void double_es::evaluate()
{
    _mae = calc_mae(_e);       // mean absolute error
    _mse = calc_mse(_e);       // mean squared error
    _mape = calc_mape(_d, _e); // mean absolute percentage error (needs actuals)
}