/*
 * Smoke-test driver for the statistics helpers defined elsewhere in this
 * file (sum, mean, variance_mean, standard_dev_mean, mse_c, nse_c, kge_c,
 * covariance, heaviside).  Each result is printed to stdout for manual
 * inspection; the program always returns 0.
 */
int main(void)
{
    double data[8] = {2, 4, 4, 4, 5, 5, 7, 9};

    double x = sum(data, 8);
    printf("%f\n", x);
    double m = mean(data, 8);
    printf("%f\n", m);
    double v = variance_mean(data, m, 8);
    printf("%f\n", v);
    double sd = standard_dev_mean(data, m, 8);
    printf("%f\n", sd);

    /* observed vs. simulated series for the error/efficiency metrics */
    double obs[5] = {13, 17, 18, 20, 24};
    double sim[5] = {12, 15, 20, 22, 24};
    double mean_square_error = mse_c(obs, sim, 5);
    printf("%f\n", mean_square_error);

    /* identical series: NSE and KGE should report a perfect score */
    double pobs[5] = {1, 2, 3, 4, 5};
    double psim[5] = {1, 2, 3, 4, 5};
    double ns = nse_c(pobs, psim, 5);
    printf("%f\n", ns);

    double kge = kge_c(obs, sim, 5);
    printf("KGE: %f\n", kge);
    kge = kge_c(pobs, psim, 5);
    printf("Perfect KGE: %f\n", kge);

    double cv = covariance(obs, sim, 5);
    printf("Covariance: %f\n", cv);
    cv = covariance(pobs, psim, 5);
    printf("Covariance: %f\n", cv);

    /*
     * BUG FIX: h_data has only 5 elements, but heaviside() was previously
     * called with a size of 6, reading one element past the end of h_data
     * (undefined behavior).  Pass the true element count; only 5 outputs
     * were ever printed, so out_data shrinks to match.
     */
    double h_data[5] = {-3, -2, -1, 1, 2};
    double out_data[5] = {0, 0, 0, 0, 0};
    heaviside(h_data, out_data, 5);
    for (int i = 0; i < 5; i++) {
        printf("Heaviside %d: %f\n", i, out_data[i]);
    }
    return 0;
}
struct SimpleTreeNode * build_tree(struct Example *examples, int size, int depth, struct SimpleTreeNode *parent, struct Args *args) { int i, cls_vals, best_attr; float cls_entropy, cls_mse, best_score, score, size_weight, best_split, split; struct SimpleTreeNode *node; struct Example *ex, *ex_end; TVarList::const_iterator it; cls_vals = args->domain->classVar->noOfValues(); ASSERT(node = (SimpleTreeNode *)malloc(sizeof *node)); if (args->type == Classification) { ASSERT(node->dist = (float *)calloc(cls_vals, sizeof(float *))); if (size == 0) { assert(parent); node->type = PredictorNode; node->children_size = 0; memcpy(node->dist, parent->dist, cls_vals * sizeof *node->dist); return node; } /* class distribution */ size_weight = 0.0; for (ex = examples, ex_end = examples + size; ex < ex_end; ex++) if (!ex->example->getClass().isSpecial()) { node->dist[ex->example->getClass().intV] += ex->weight; size_weight += ex->weight; } /* stopping criterion: majority class */ for (i = 0; i < cls_vals; i++) if (node->dist[i] / size_weight >= args->maxMajority) return make_predictor(node, examples, size, args); cls_entropy = entropy(node->dist, cls_vals); } else { float n, sum, sum2, cls_val; assert(args->type == Regression); if (size == 0) { assert(parent); node->type = PredictorNode; node->children_size = 0; node->n = parent->n; node->sum = parent->sum; return node; } n = sum = sum2 = 0.0; for (ex = examples, ex_end = examples + size; ex < ex_end; ex++) if (!ex->example->getClass().isSpecial()) { cls_val = ex->example->getClass().floatV; n += ex->weight; sum += ex->weight * cls_val; sum2 += ex->weight * cls_val * cls_val; } node->n = n; node->sum = sum; cls_mse = (sum2 - sum * sum / n) / n; if (cls_mse < 1e-5) { return make_predictor(node, examples, size, args); } } /* stopping criterion: depth exceeds limit */ if (depth == args->maxDepth) return make_predictor(node, examples, size, args); /* score attributes */ best_score = -INFINITY; for (i = 0, it = 
args->domain->attributes->begin(); it != args->domain->attributes->end(); it++, i++) { if (!args->attr_split_so_far[i]) { /* select random subset of attributes */ if (args->randomGenerator->randdouble() < args->skipProb) continue; if ((*it)->varType == TValue::INTVAR) { score = args->type == Classification ? gain_ratio_d(examples, size, i, cls_entropy, args) : mse_d(examples, size, i, cls_mse, args); if (score > best_score) { best_score = score; best_attr = i; } } else if ((*it)->varType == TValue::FLOATVAR) { score = args->type == Classification ? gain_ratio_c(examples, size, i, cls_entropy, args, &split) : mse_c(examples, size, i, cls_mse, args, &split); if (score > best_score) { best_score = score; best_split = split; best_attr = i; } } } } if (best_score == -INFINITY) return make_predictor(node, examples, size, args); if (args->domain->attributes->at(best_attr)->varType == TValue::INTVAR) { struct Example *child_examples, *child_ex; int attr_vals; float size_known, *attr_dist; /* printf("* %2d %3s %3d %f\n", depth, args->domain->attributes->at(best_attr)->get_name().c_str(), size, best_score); */ attr_vals = args->domain->attributes->at(best_attr)->noOfValues(); node->type = DiscreteNode; node->split_attr = best_attr; node->children_size = attr_vals; ASSERT(child_examples = (struct Example *)calloc(size, sizeof *child_examples)); ASSERT(node->children = (SimpleTreeNode **)calloc(attr_vals, sizeof *node->children)); ASSERT(attr_dist = (float *)calloc(attr_vals, sizeof *attr_dist)); /* attribute distribution */ size_known = 0; for (ex = examples, ex_end = examples + size; ex < ex_end; ex++) if (!ex->example->values[best_attr].isSpecial()) { attr_dist[ex->example->values[best_attr].intV] += ex->weight; size_known += ex->weight; } args->attr_split_so_far[best_attr] = 1; for (i = 0; i < attr_vals; i++) { /* create a new example table */ for (ex = examples, ex_end = examples + size, child_ex = child_examples; ex < ex_end; ex++) { if 
(ex->example->values[best_attr].isSpecial()) { *child_ex = *ex; child_ex->weight *= attr_dist[i] / size_known; child_ex++; } else if (ex->example->values[best_attr].intV == i) { *child_ex++ = *ex; } } node->children[i] = build_tree(child_examples, child_ex - child_examples, depth + 1, node, args); } args->attr_split_so_far[best_attr] = 0; free(attr_dist); free(child_examples); } else { struct Example *examples_lt, *examples_ge, *ex_lt, *ex_ge; float size_lt, size_ge; /* printf("* %2d %3s %3d %f %f\n", depth, args->domain->attributes->at(best_attr)->get_name().c_str(), size, best_split, best_score); */ assert(args->domain->attributes->at(best_attr)->varType == TValue::FLOATVAR); ASSERT(examples_lt = (struct Example *)calloc(size, sizeof *examples)); ASSERT(examples_ge = (struct Example *)calloc(size, sizeof *examples)); size_lt = size_ge = 0.0; for (ex = examples, ex_end = examples + size; ex < ex_end; ex++) if (!ex->example->values[best_attr].isSpecial()) if (ex->example->values[best_attr].floatV < best_split) size_lt += ex->weight; else size_ge += ex->weight; for (ex = examples, ex_end = examples + size, ex_lt = examples_lt, ex_ge = examples_ge; ex < ex_end; ex++) if (ex->example->values[best_attr].isSpecial()) { *ex_lt = *ex; *ex_ge = *ex; ex_lt->weight *= size_lt / (size_lt + size_ge); ex_ge->weight *= size_ge / (size_lt + size_ge); ex_lt++; ex_ge++; } else if (ex->example->values[best_attr].floatV < best_split) { *ex_lt++ = *ex; } else { *ex_ge++ = *ex; } node->type = ContinuousNode; node->split_attr = best_attr; node->split = best_split; node->children_size = 2; ASSERT(node->children = (SimpleTreeNode **)calloc(2, sizeof *node->children)); node->children[0] = build_tree(examples_lt, ex_lt - examples_lt, depth + 1, node, args); node->children[1] = build_tree(examples_ge, ex_ge - examples_ge, depth + 1, node, args); free(examples_lt); free(examples_ge); } return node; }
/*
 * Recursively build a simple decision tree over `examples` (plain-C port;
 * missing values are NaN in ex->x / ex->y rather than "special" TValues).
 *
 * examples : weighted training examples for this node (may be empty)
 * size     : number of entries in `examples`
 * depth    : current recursion depth (growth stops at args->max_depth)
 * parent   : parent node, used to inherit a prediction when size == 0
 * args     : learner configuration and shared scratch state
 *
 * Returns a malloc'd SimpleTreeNode; ownership passes to the caller —
 * presumably freed by a matching tree-destroy routine elsewhere (TODO
 * confirm).  Examples with an unknown (NaN) split-attribute value are
 * pushed into every child with weight scaled by the child's share of the
 * known mass.  args->attr_split_so_far is temporarily marked while
 * recursing so a discrete attribute is not reused along one path.
 */
struct SimpleTreeNode *
build_tree_(struct Example *examples, int size, int depth, struct SimpleTreeNode *parent, struct Args *args)
{
    int i, cls_vals, best_attr;
    float cls_entropy, cls_mse, best_score, score, size_weight, best_split, split;
    struct SimpleTreeNode *node;
    struct Example *ex, *ex_end;

    cls_vals = args->cls_vals;

    ASSERT(node = (struct SimpleTreeNode *)malloc(sizeof *node));

    /* initialized up front so neither branch can read them unset */
    cls_mse = cls_entropy = 0.0;

    if (args->type == Classification) {
        ASSERT(node->dist = (float *)calloc(cls_vals, sizeof(float)));

        if (size == 0) {
            /* no examples reached this node: predict with the parent's
             * class distribution */
            assert(parent);
            node->type = PredictorNode;
            node->children_size = 0;
            memcpy(node->dist, parent->dist, cls_vals * sizeof *node->dist);
            return node;
        }

        /* class distribution (NaN class value = unknown, skipped) */
        size_weight = 0.0;
        for (ex = examples, ex_end = examples + size; ex < ex_end; ex++)
            if (!isnan(ex->y)) {
                node->dist[(int)ex->y] += ex->weight;
                size_weight += ex->weight;
            }

        /* stopping criterion: majority class */
        for (i = 0; i < cls_vals; i++)
            if (node->dist[i] / size_weight >= args->max_majority)
                return make_predictor(node, examples, size, args);

        cls_entropy = entropy(node->dist, cls_vals);
    } else {
        float n, sum, sum2, cls_val;

        assert(args->type == Regression);
        if (size == 0) {
            /* no examples: inherit the parent's running mean statistics */
            assert(parent);
            node->type = PredictorNode;
            node->children_size = 0;
            node->n = parent->n;
            node->sum = parent->sum;
            return node;
        }

        /* weighted first and second moments of the class value */
        n = sum = sum2 = 0.0;
        for (ex = examples, ex_end = examples + size; ex < ex_end; ex++)
            if (!isnan(ex->y)) {
                cls_val = ex->y;
                n += ex->weight;
                sum += ex->weight * cls_val;
                sum2 += ex->weight * cls_val * cls_val;
            }

        node->n = n;
        node->sum = sum;
        cls_mse = (sum2 - sum * sum / n) / n;

        /* stopping criterion: (near-)zero variance */
        if (cls_mse < 1e-5) {
            return make_predictor(node, examples, size, args);
        }
    }

    /* stopping criterion: depth exceeds limit */
    if (depth == args->max_depth)
        return make_predictor(node, examples, size, args);

    /* score attributes */
    best_score = -INFINITY;
    best_split = 0;
    best_attr = 0;
    for (i = 0; i < args->num_attrs; i++) {
        if (!args->attr_split_so_far[i]) {
            /* select random subset of attributes */
            if ((double)rand() / (double)RAND_MAX < args->skip_prob)
                continue;

            if (args->domain[i] == IntVar) {
                score = args->type == Classification ?
                    gain_ratio_d(examples, size, i, cls_entropy, args) :
                    mse_d(examples, size, i, cls_mse, args);
                if (score > best_score) {
                    best_score = score;
                    best_attr = i;
                }
            } else if (args->domain[i] == FloatVar) {
                score = args->type == Classification ?
                    gain_ratio_c(examples, size, i, cls_entropy, args, &split) :
                    mse_c(examples, size, i, cls_mse, args, &split);
                if (score > best_score) {
                    best_score = score;
                    best_split = split;
                    best_attr = i;
                }
            }
        }
    }

    /* stopping criterion: no useful attribute found */
    if (best_score == -INFINITY)
        return make_predictor(node, examples, size, args);

    if (args->domain[best_attr] == IntVar) {
        struct Example *child_examples, *child_ex;
        int attr_vals;
        float size_known, *attr_dist;

        // printf("* %2d %3d %3d %f\n", depth, best_attr, size, best_score);

        attr_vals = args->attr_vals[best_attr];

        node->type = DiscreteNode;
        node->split_attr = best_attr;
        node->children_size = attr_vals;

        ASSERT(child_examples = (struct Example *)calloc(size, sizeof *child_examples));
        ASSERT(node->children = (struct SimpleTreeNode **)calloc(attr_vals, sizeof *node->children));
        ASSERT(attr_dist = (float *)calloc(attr_vals, sizeof *attr_dist));

        /* attribute distribution */
        size_known = 0;
        for (ex = examples, ex_end = examples + size; ex < ex_end; ex++)
            if (!isnan(ex->x[best_attr])) {
                attr_dist[(int)ex->x[best_attr]] += ex->weight;
                size_known += ex->weight;
            }

        args->attr_split_so_far[best_attr] = 1;

        for (i = 0; i < attr_vals; i++) {
            /* create a new example table: matching examples plus all
             * unknown-value examples with proportionally reduced weight */
            for (ex = examples, ex_end = examples + size, child_ex = child_examples; ex < ex_end; ex++) {
                if (isnan(ex->x[best_attr])) {
                    *child_ex = *ex;
                    child_ex->weight *= attr_dist[i] / size_known;
                    child_ex++;
                } else if ((int)ex->x[best_attr] == i) {
                    *child_ex++ = *ex;
                }
            }

            node->children[i] = build_tree_(child_examples, child_ex - child_examples, depth + 1, node, args);
        }

        args->attr_split_so_far[best_attr] = 0;

        free(attr_dist);
        free(child_examples);
    } else {
        struct Example *examples_lt, *examples_ge, *ex_lt, *ex_ge;
        float size_lt, size_ge;

        // printf("* %2d %3d %3d %f %f\n", depth, best_attr, size, best_split, best_score);

        assert(args->domain[best_attr] == FloatVar);

        ASSERT(examples_lt = (struct Example *)calloc(size, sizeof *examples));
        ASSERT(examples_ge = (struct Example *)calloc(size, sizeof *examples));

        /* total known weight below / at-or-above the split point */
        size_lt = size_ge = 0.0;
        for (ex = examples, ex_end = examples + size; ex < ex_end; ex++)
            if (!isnan(ex->x[best_attr])) {
                if (ex->x[best_attr] < best_split)
                    size_lt += ex->weight;
                else
                    size_ge += ex->weight;
            }

        for (ex = examples, ex_end = examples + size, ex_lt = examples_lt, ex_ge = examples_ge; ex < ex_end; ex++)
            if (isnan(ex->x[best_attr])) {
                /* unknown value: send to both sides with scaled weight */
                *ex_lt = *ex;
                *ex_ge = *ex;
                ex_lt->weight *= size_lt / (size_lt + size_ge);
                ex_ge->weight *= size_ge / (size_lt + size_ge);
                ex_lt++;
                ex_ge++;
            } else if (ex->x[best_attr] < best_split) {
                *ex_lt++ = *ex;
            } else {
                *ex_ge++ = *ex;
            }

        /*
         * Check there was an actual reduction of size in the the two subsets.
         * This test fails when all best_attr's (the only attr) values are
         * the same (and equal best_split) so the data is split in 0 | n size
         * subsets and recursing would lead to an infinite recursion.
         */
        if ((ex_lt - examples_lt) < size && (ex_ge - examples_ge) < size) {
            node->type = ContinuousNode;
            node->split_attr = best_attr;
            node->split = best_split;
            node->children_size = 2;
            ASSERT(node->children = (struct SimpleTreeNode **)calloc(2, sizeof *node->children));

            node->children[0] = build_tree_(examples_lt, ex_lt - examples_lt, depth + 1, node, args);
            node->children[1] = build_tree_(examples_ge, ex_ge - examples_ge, depth + 1, node, args);
        } else {
            /* degenerate split: turn this node into a leaf predictor */
            node = make_predictor(node, examples, size, args);
        }

        free(examples_lt);
        free(examples_ge);
    }

    return node;
}