Exemplo n.º 1
0
/**
 * Builds a regression tree level by level from the given training data.
 *
 * All nodes of one level share a single split feature: every remaining
 * candidate feature is tried on every node of the current layer, the feature
 * with the smallest summed node::split() result is kept, applied once more so
 * the children reflect it, and then removed from the candidate set so that
 * deeper levels cannot reuse it.
 *
 * @param train_set  training samples; the root node stores iterators into this
 *                   container, so it must outlive the tree.
 * @param max_leafs  growth stops once the `leafs` counter reaches this value.
 *                   NOTE(review): `leafs` is only initialised to 1 here;
 *                   presumably make_layer() or the nodes update it - confirm.
 * @param max_depth  maximum number of split levels to create.
 */
tree::tree(data_set& train_set, int max_leafs, int max_depth) : max_leafs(max_leafs)
{
	// Candidate split features are the feature indices of the first sample.
	// An empty training set has no first sample, so reading train_set[0] would
	// be out of bounds; in that case the candidate set stays empty and the
	// growth loop below is skipped entirely.
	std::set<int> features;
	if (train_set.begin() != train_set.end())
	{
		for (size_t i = 0; i < train_set[0].features.size(); ++i)
		{
			features.insert(static_cast<int>(i));
		}
	}

	leafs = 1;
	int depth = 0;

	// The root covers the whole training set.
	root = new node(0);
	root->data_begin = train_set.begin();
	root->data_end = train_set.end();
	root->calc_avg();
	root->node_mse = calc_mse(root->data_begin, root->data_end, root->output_value, root->size);

	std::vector<node*> layer;
	layer.push_back(root);
	layers.push_back(layer);

	while (leafs < max_leafs && depth < max_depth && !features.empty())
	{
		float min_error = INF;
		int best_feature = -1;
		make_layer(depth);

		// Choose the best split feature at the current depth: the one whose
		// split() results, summed over all nodes of this layer, are smallest.
		for (std::set<int>::iterator cur_split_feature = features.begin();
			cur_split_feature != features.end(); ++cur_split_feature)
		{
			float cur_error = 0;
			for (size_t i = 0; i < layers[depth].size(); ++i)
			{
				cur_error += layers[depth][i]->split(*cur_split_feature);
			}
			if (cur_error < min_error)
			{
				min_error = cur_error;
				best_feature = *cur_split_feature;
			}
		}

		// NOTE(review): if no candidate ever compares below INF (e.g. every
		// summed error is NaN or >= INF), best_feature is still -1 here and is
		// passed to split() unchanged - confirm whether that can happen.

		// The trial calls above leave each node configured for whichever
		// feature was tried last, so re-apply the winning feature.
		for (size_t i = 0; i < layers[depth].size(); ++i)
		{
			layers[depth][i]->split(best_feature);
		}
		feature_id_at_depth.push_back(best_feature);
		features.erase(best_feature);
		depth++;
		//std::cout << "level " << depth << " created. training error: " << min_error << " best feat: " << best_feature << " split_val: "
			//<< root->split_value << std::endl;
	}

	// Every node of the last layer becomes a leaf.
	// NOTE(review): this runs before empty trailing layers are removed, so if
	// the last layer is empty nothing is marked here - confirm this is intended.
	for (size_t i = 0; i < layers.back().size(); ++i)
	{
		layers.back()[i]->is_leaf = true;
	}
	//std::cout << "leafs before pruning: " << leafs << std::endl;
	//prune(root);
	//std::cout << "new tree! leafs after pruning: " << leafs << std::endl;

	// Drop empty trailing layers. The root layer is never empty, so the extra
	// emptiness check is purely defensive.
	while (!layers.empty() && layers.back().empty())
	{
		layers.pop_back();
	}
}
Exemplo n.º 2
0
float node::split(int split_feature_id)
{
	if (is_leaf)
	{
		return calc_mse(data_begin, data_end, output_value, size);
	}
	std::sort(data_begin, data_end, test_comparator(split_feature_id));
	float l_sum = 0;
	float l_size = 0;
	float r_sum = sum;
	float r_size = size;
	float best_mse = INF; 
	for (data_set::iterator cur_test = data_begin + 1; cur_test != data_end; cur_test++) //try all possible splits
	{
		l_sum += (cur_test - 1)->anwser;
		l_size++;
		r_sum -= (cur_test - 1)->anwser;
		r_size--;
		if (cur_test->features[split_feature_id] == (cur_test - 1)->features[split_feature_id])
		{
			continue;
		}
		float l_avg = l_sum / l_size;
		float r_avg = r_sum / r_size;
		float l_mse = calc_mse(data_begin, cur_test, l_avg, l_size);
		float r_mse = calc_mse(cur_test, data_end, r_avg, r_size);
		float cur_mse = l_mse + r_mse;
		if (cur_mse < best_mse)
		{
			best_mse = cur_mse;
			split_value = cur_test->features[split_feature_id];
			left->data_begin = data_begin;
			left->data_end = cur_test;
			left->output_value = l_avg;
			left->size = l_size;
			left->sum = l_sum;
			left->node_mse = l_mse;
			left->is_leaf = (l_size == 1) ? true : false;
			right->data_begin = cur_test;
			right->data_end = data_end;
			right->output_value = r_avg;
			right->size = r_size;
			right->sum = r_sum;
			right->node_mse = r_mse;
			right->is_leaf = (r_size == 1) ? true : false;
		}
	}
	if (best_mse == INF)
	{
		best_mse = node_mse;
		split_value = (data_begin + 1)->features[split_feature_id];
		left->output_value = output_value;
		left->size = 0;
		left->is_leaf = true;
		right->data_begin = data_begin;
		right->data_end = data_end;
		right->output_value = output_value;
		right->size = size;
		right->sum = sum;
		right->node_mse = node_mse;
		right->is_leaf = (size <= 1) ? true : false;
	}
	return best_mse;
}
Exemplo n.º 3
0
void double_es::evaluate() {
    _mae = calc_mae(_e);
    _mse = calc_mse(_e);
    _mape = calc_mape(_d, _e);
}