/*
 * Recursively grow one node of a (classification or regression) decision
 * tree over the weighted sample examples[0..size).
 *
 * parent supplies the fallback distribution/mean when this branch receives
 * no examples; args carries the domain, the stopping parameters and the
 * attr_split_so_far scratch array (used to forbid re-splitting a discrete
 * attribute along the current path).  Returns a freshly malloc'd node;
 * stopping paths convert it to a leaf via make_predictor.
 */
struct SimpleTreeNode *
build_tree(struct Example *examples, int size, int depth, struct SimpleTreeNode *parent, struct Args *args)
{
    int i, cls_vals, best_attr;
    float cls_entropy, cls_mse, best_score, score, size_weight, best_split, split;
    struct SimpleTreeNode *node;
    struct Example *ex, *ex_end;
    TVarList::const_iterator it;

    cls_vals = args->domain->classVar->noOfValues();

    ASSERT(node = (SimpleTreeNode *)malloc(sizeof *node));

    /*
     * FIX: initialize both impurity measures and the best-split bookkeeping.
     * Only one of cls_entropy/cls_mse is computed below, yet both appear in
     * the attribute-scoring ternaries, and best_attr/best_split must never
     * be read while indeterminate.
     */
    cls_entropy = cls_mse = 0.0;
    best_split = 0.0;
    best_attr = 0;

    if (args->type == Classification) {
        /* FIX: element type is float, not float * (sizeof(float *) merely
         * over-allocated, but was still wrong). */
        ASSERT(node->dist = (float *)calloc(cls_vals, sizeof *node->dist));

        if (size == 0) {
            /* empty branch: predict with the parent's class distribution */
            assert(parent);
            node->type = PredictorNode;
            node->children_size = 0;
            memcpy(node->dist, parent->dist, cls_vals * sizeof *node->dist);
            return node;
        }

        /* class distribution (examples with an unknown class are skipped) */
        size_weight = 0.0;
        for (ex = examples, ex_end = examples + size; ex < ex_end; ex++)
            if (!ex->example->getClass().isSpecial()) {
                node->dist[ex->example->getClass().intV] += ex->weight;
                size_weight += ex->weight;
            }

        /* stopping criterion: majority class */
        for (i = 0; i < cls_vals; i++)
            if (node->dist[i] / size_weight >= args->maxMajority)
                return make_predictor(node, examples, size, args);

        cls_entropy = entropy(node->dist, cls_vals);
    } else {
        float n, sum, sum2, cls_val;

        assert(args->type == Regression);
        if (size == 0) {
            /* empty branch: predict with the parent's running mean */
            assert(parent);
            node->type = PredictorNode;
            node->children_size = 0;
            node->n = parent->n;
            node->sum = parent->sum;
            return node;
        }

        /* weighted count / sum / sum of squares of the known class values */
        n = sum = sum2 = 0.0;
        for (ex = examples, ex_end = examples + size; ex < ex_end; ex++)
            if (!ex->example->getClass().isSpecial()) {
                cls_val = ex->example->getClass().floatV;
                n += ex->weight;
                sum += ex->weight * cls_val;
                sum2 += ex->weight * cls_val * cls_val;
            }

        node->n = n;
        node->sum = sum;
        cls_mse = (sum2 - sum * sum / n) / n;

        /* stopping criterion: class variance is (numerically) zero */
        if (cls_mse < 1e-5) {
            return make_predictor(node, examples, size, args);
        }
    }

    /* stopping criterion: depth exceeds limit */
    if (depth == args->maxDepth)
        return make_predictor(node, examples, size, args);

    /* score attributes */
    best_score = -INFINITY;

    for (i = 0, it = args->domain->attributes->begin(); it != args->domain->attributes->end(); it++, i++) {
        if (!args->attr_split_so_far[i]) {
            /* select random subset of attributes */
            if (args->randomGenerator->randdouble() < args->skipProb)
                continue;

            if ((*it)->varType == TValue::INTVAR) {
                score = args->type == Classification ?
                        gain_ratio_d(examples, size, i, cls_entropy, args) :
                        mse_d(examples, size, i, cls_mse, args);
                if (score > best_score) {
                    best_score = score;
                    best_attr = i;
                }
            } else if ((*it)->varType == TValue::FLOATVAR) {
                score = args->type == Classification ?
                        gain_ratio_c(examples, size, i, cls_entropy, args, &split) :
                        mse_c(examples, size, i, cls_mse, args, &split);
                if (score > best_score) {
                    best_score = score;
                    best_split = split;
                    best_attr = i;
                }
            }
        }
    }

    /* no attribute yielded a usable split */
    if (best_score == -INFINITY)
        return make_predictor(node, examples, size, args);

    if (args->domain->attributes->at(best_attr)->varType == TValue::INTVAR) {
        struct Example *child_examples, *child_ex;
        int attr_vals;
        float size_known, *attr_dist;

        /* printf("* %2d %3s %3d %f\n", depth, args->domain->attributes->at(best_attr)->get_name().c_str(), size, best_score); */

        attr_vals = args->domain->attributes->at(best_attr)->noOfValues();

        node->type = DiscreteNode;
        node->split_attr = best_attr;
        node->children_size = attr_vals;

        ASSERT(child_examples = (struct Example *)calloc(size, sizeof *child_examples));
        ASSERT(node->children = (SimpleTreeNode **)calloc(attr_vals, sizeof *node->children));
        ASSERT(attr_dist = (float *)calloc(attr_vals, sizeof *attr_dist));

        /* attribute distribution */
        size_known = 0;
        for (ex = examples, ex_end = examples + size; ex < ex_end; ex++)
            if (!ex->example->values[best_attr].isSpecial()) {
                attr_dist[ex->example->values[best_attr].intV] += ex->weight;
                size_known += ex->weight;
            }

        args->attr_split_so_far[best_attr] = 1;

        for (i = 0; i < attr_vals; i++) {
            /* create a new example table; examples with an unknown value
             * of best_attr go to every child with fractional weight */
            for (ex = examples, ex_end = examples + size, child_ex = child_examples; ex < ex_end; ex++) {
                if (ex->example->values[best_attr].isSpecial()) {
                    *child_ex = *ex;
                    child_ex->weight *= attr_dist[i] / size_known;
                    child_ex++;
                } else if (ex->example->values[best_attr].intV == i) {
                    *child_ex++ = *ex;
                }
            }

            node->children[i] = build_tree(child_examples, child_ex - child_examples, depth + 1, node, args);
        }

        /* restore scratch state for sibling branches */
        args->attr_split_so_far[best_attr] = 0;

        free(attr_dist);
        free(child_examples);
    } else {
        struct Example *examples_lt, *examples_ge, *ex_lt, *ex_ge;
        float size_lt, size_ge;

        /* printf("* %2d %3s %3d %f %f\n", depth, args->domain->attributes->at(best_attr)->get_name().c_str(), size, best_split, best_score); */

        assert(args->domain->attributes->at(best_attr)->varType == TValue::FLOATVAR);

        ASSERT(examples_lt = (struct Example *)calloc(size, sizeof *examples));
        ASSERT(examples_ge = (struct Example *)calloc(size, sizeof *examples));

        size_lt = size_ge = 0.0;
        for (ex = examples, ex_end = examples + size; ex < ex_end; ex++)
            if (!ex->example->values[best_attr].isSpecial()) {
                if (ex->example->values[best_attr].floatV < best_split)
                    size_lt += ex->weight;
                else
                    size_ge += ex->weight;
            }

        /* examples with an unknown value of best_attr go to both children
         * with weight scaled by each side's share */
        for (ex = examples, ex_end = examples + size, ex_lt = examples_lt, ex_ge = examples_ge; ex < ex_end; ex++)
            if (ex->example->values[best_attr].isSpecial()) {
                *ex_lt = *ex;
                *ex_ge = *ex;
                ex_lt->weight *= size_lt / (size_lt + size_ge);
                ex_ge->weight *= size_ge / (size_lt + size_ge);
                ex_lt++;
                ex_ge++;
            } else if (ex->example->values[best_attr].floatV < best_split) {
                *ex_lt++ = *ex;
            } else {
                *ex_ge++ = *ex;
            }

        /*
         * FIX: check there was an actual reduction of size in the two
         * subsets.  This test fails when all of best_attr's values are the
         * same (and equal best_split), so the data is split into 0 | n size
         * subsets and recursing would lead to infinite recursion.  Fall
         * back to a predictor node instead.
         */
        if ((ex_lt - examples_lt) < size && (ex_ge - examples_ge) < size) {
            node->type = ContinuousNode;
            node->split_attr = best_attr;
            node->split = best_split;
            node->children_size = 2;
            ASSERT(node->children = (SimpleTreeNode **)calloc(2, sizeof *node->children));

            node->children[0] = build_tree(examples_lt, ex_lt - examples_lt, depth + 1, node, args);
            node->children[1] = build_tree(examples_ge, ex_ge - examples_ge, depth + 1, node, args);
        } else {
            node = make_predictor(node, examples, size, args);
        }

        free(examples_lt);
        free(examples_ge);
    }

    return node;
}
/* ------------------------------------------------------------------ */
/* Variant 2: standalone plain-C implementation of the tree builder.  */
/* ------------------------------------------------------------------ */
/*
 * Recursively grow one node of a (classification or regression) decision
 * tree over the weighted sample examples[0..size).
 *
 * parent supplies the fallback distribution/mean for an empty branch; args
 * carries the domain description, stopping parameters and the
 * attr_split_so_far scratch array (forbids re-splitting a discrete
 * attribute along the current path).  Returns a freshly malloc'd node;
 * stopping paths convert it to a leaf via make_predictor.
 * Missing values are encoded as NaN (hence the isnan() tests).
 */
struct SimpleTreeNode *
build_tree_(struct Example *examples, int size, int depth, struct SimpleTreeNode *parent, struct Args *args)
{
	int i, cls_vals, best_attr;
	float cls_entropy, cls_mse, best_score, score, size_weight, best_split, split;
	struct SimpleTreeNode *node;
	struct Example *ex, *ex_end;

	cls_vals = args->cls_vals;

	ASSERT(node = (struct SimpleTreeNode *)malloc(sizeof *node));

	/* only one of the two impurity measures is computed below, but both
	 * appear in the scoring ternaries — initialize both */
	cls_mse = cls_entropy = 0.0;
	if (args->type == Classification) {
		ASSERT(node->dist = (float *)calloc(cls_vals, sizeof(float)));

		if (size == 0) {
			/* empty branch: predict with the parent's class distribution */
			assert(parent);
			node->type = PredictorNode;
			node->children_size = 0;
			memcpy(node->dist, parent->dist, cls_vals * sizeof *node->dist);
			return node;
		}

		/* class distribution (examples with an unknown class are skipped) */
		size_weight = 0.0;
		for (ex = examples, ex_end = examples + size; ex < ex_end; ex++)
			if (!isnan(ex->y)) {
				node->dist[(int)ex->y] += ex->weight;
				size_weight += ex->weight;
			}

		/* stopping criterion: majority class */
		for (i = 0; i < cls_vals; i++)
			if (node->dist[i] / size_weight >= args->max_majority)
				return make_predictor(node, examples, size, args);

		cls_entropy = entropy(node->dist, cls_vals);
	} else {
		float n, sum, sum2, cls_val;

		assert(args->type == Regression);
		if (size == 0) {
			/* empty branch: predict with the parent's running mean */
			assert(parent);
			node->type = PredictorNode;
			node->children_size = 0;
			node->n = parent->n;
			node->sum = parent->sum;
			return node;
		}

		/* weighted count / sum / sum of squares of the known class values */
		n = sum = sum2 = 0.0;
		for (ex = examples, ex_end = examples + size; ex < ex_end; ex++)
			if (!isnan(ex->y)) {
				cls_val = ex->y;
				n += ex->weight;
				sum += ex->weight * cls_val;
				sum2 += ex->weight * cls_val * cls_val;
			}

		node->n = n;
		node->sum = sum;
		cls_mse = (sum2 - sum * sum / n) / n;

		/* stopping criterion: class variance is (numerically) zero */
		if (cls_mse < 1e-5) {
			return make_predictor(node, examples, size, args);
		}
	}

	/* stopping criterion: depth exceeds limit */
	if (depth == args->max_depth)
		return make_predictor(node, examples, size, args);

	/* score attributes */
	best_score = -INFINITY;
	best_split = 0;
	best_attr = 0;

	for (i = 0; i < args->num_attrs; i++) {
		if (!args->attr_split_so_far[i]) {
			/* select random subset of attributes */
			if ((double)rand() / (double)RAND_MAX < args->skip_prob)
				continue;

			if (args->domain[i] == IntVar) {
				score = args->type == Classification ?
				  gain_ratio_d(examples, size, i, cls_entropy, args) :
				  mse_d(examples, size, i, cls_mse, args);
				if (score > best_score) {
					best_score = score;
					best_attr = i;
				}
			} else if (args->domain[i] == FloatVar) {
				/* continuous attributes also yield the split threshold */
				score = args->type == Classification ?
				  gain_ratio_c(examples, size, i, cls_entropy, args, &split) :
				  mse_c(examples, size, i, cls_mse, args, &split);
				if (score > best_score) {
					best_score = score;
					best_split = split;
					best_attr = i;
				}
			}
		}
	}

	/* no attribute yielded a usable split */
	if (best_score == -INFINITY)
		return make_predictor(node, examples, size, args);

	if (args->domain[best_attr] == IntVar) {
		struct Example *child_examples, *child_ex;
		int attr_vals;
		float size_known, *attr_dist;

		// printf("* %2d %3d %3d %f\n", depth, best_attr, size, best_score);

		attr_vals = args->attr_vals[best_attr];

		node->type = DiscreteNode;
		node->split_attr = best_attr;
		node->children_size = attr_vals;

		ASSERT(child_examples = (struct Example *)calloc(size, sizeof *child_examples));
		ASSERT(node->children = (struct SimpleTreeNode **)calloc(attr_vals, sizeof *node->children));
		ASSERT(attr_dist = (float *)calloc(attr_vals, sizeof *attr_dist));

		/* attribute distribution */
		size_known = 0;
		for (ex = examples, ex_end = examples + size; ex < ex_end; ex++)
			if (!isnan(ex->x[best_attr])) {
				attr_dist[(int)ex->x[best_attr]] += ex->weight;
				size_known += ex->weight;
			}

		/* mark the attribute used so deeper levels don't re-split on it */
		args->attr_split_so_far[best_attr] = 1;

		for (i = 0; i < attr_vals; i++) {
			/* create a new example table; examples with an unknown value
			 * of best_attr go to every child with fractional weight */
			for (ex = examples, ex_end = examples + size, child_ex = child_examples; ex < ex_end; ex++) {
				if (isnan(ex->x[best_attr])) {
					*child_ex = *ex;
					child_ex->weight *= attr_dist[i] / size_known;
					child_ex++;
				} else if ((int)ex->x[best_attr] == i) {
					*child_ex++ = *ex;
				}
			}

			node->children[i] = build_tree_(child_examples, child_ex - child_examples, depth + 1, node, args);
		}
					
		/* restore scratch state for sibling branches */
		args->attr_split_so_far[best_attr] = 0;

		free(attr_dist);
		free(child_examples);
	} else {
		struct Example *examples_lt, *examples_ge, *ex_lt, *ex_ge;
		float size_lt, size_ge;

		// printf("* %2d %3d %3d %f %f\n", depth, best_attr, size, best_split, best_score);

		assert(args->domain[best_attr] == FloatVar);

		ASSERT(examples_lt = (struct Example *)calloc(size, sizeof *examples));
		ASSERT(examples_ge = (struct Example *)calloc(size, sizeof *examples));

		/* weight mass on each side of the threshold (known values only) */
		size_lt = size_ge = 0.0;
		for (ex = examples, ex_end = examples + size; ex < ex_end; ex++)
			if (!isnan(ex->x[best_attr])) {
				if (ex->x[best_attr] < best_split)
					size_lt += ex->weight;
				else
					size_ge += ex->weight;
			}

		/* examples with an unknown value of best_attr go to both children
		 * with weight scaled by each side's share */
		for (ex = examples, ex_end = examples + size, ex_lt = examples_lt, ex_ge = examples_ge; ex < ex_end; ex++)
			if (isnan(ex->x[best_attr])) {
				*ex_lt = *ex;
				*ex_ge = *ex;
				ex_lt->weight *= size_lt / (size_lt + size_ge);
				ex_ge->weight *= size_ge / (size_lt + size_ge);
				ex_lt++;
				ex_ge++;
			} else if (ex->x[best_attr] < best_split) {
				*ex_lt++ = *ex;
			} else {
				*ex_ge++ = *ex;
			}

		/*
		 * Check there was an actual reduction of size in the the two subsets.
		 * This test fails when all best_attr's (the only attr) values  are
		 * the same (and equal best_split) so the data is split in 0 | n size
		 * subsets and recursing would lead to an infinite recursion.
		 */
		if ((ex_lt - examples_lt) < size && (ex_ge - examples_ge) < size) {
			node->type = ContinuousNode;
			node->split_attr = best_attr;
			node->split = best_split;
			node->children_size = 2;
			ASSERT(node->children = (struct SimpleTreeNode **)calloc(2, sizeof *node->children));

			node->children[0] = build_tree_(examples_lt, ex_lt - examples_lt, depth + 1, node, args);
			node->children[1] = build_tree_(examples_ge, ex_ge - examples_ge, depth + 1, node, args);
		} else {
			/* degenerate split: fall back to a predictor node */
			node = make_predictor(node, examples, size, args);
		}

		free(examples_lt);
		free(examples_ge);
	}

	return node;
}