void Data::sort() { // Reserve memory index_data = new size_t[num_cols_no_sparse * num_rows]; // For all columns, get unique values and save index for each observation for (size_t col = 0; col < num_cols_no_sparse; ++col) { // Get all unique values std::vector<double> unique_values(num_rows); for (size_t row = 0; row < num_rows; ++row) { unique_values[row] = get(row, col); } std::sort(unique_values.begin(), unique_values.end()); unique_values.erase(unique(unique_values.begin(), unique_values.end()), unique_values.end()); // Get index of unique value for (size_t row = 0; row < num_rows; ++row) { size_t idx = std::lower_bound(unique_values.begin(), unique_values.end(), get(row, col)) - unique_values.begin(); index_data[col * num_rows + row] = idx; } // Save unique values unique_data_values.push_back(unique_values); if (unique_values.size() > max_num_unique_values) { max_num_unique_values = unique_values.size(); } } }
/*
 * Determine the most frequent value of SAMPLE_RESULT_FIELD in a sample set.
 *
 * samples, count: the sample set to inspect.
 * field:          receives SAMPLE_RESULT_FIELD.
 * val:            receives the value with the highest occurrence count, or -1
 *                 when no value occurs at least once. On a tie the value that
 *                 appears first in the unique-value list wins.
 */
static void majority_result(const struct sample *samples, int count, unsigned *field, int *val)
{
	int unique = 0;
	int *vals = unique_values(samples, count, &unique, SAMPLE_RESULT_FIELD);

	*field = SAMPLE_RESULT_FIELD;
	*val = -1;

	/*
	 * Count and compare in a single pass. This avoids the temporary
	 * per-value count array, whose allocation was never checked for
	 * failure before being written to.
	 *
	 * NOTE(review): the NULL guard assumes unique_values() may return NULL
	 * on failure -- confirm against its implementation.
	 */
	if (vals != NULL) {
		int best = 0;
		for (int i = 0; i < unique; i++) {
			int occurrences = value_count(samples, count, vals[i], SAMPLE_RESULT_FIELD);
			/* Strictly greater: the earliest value keeps the lead on ties */
			if (occurrences > best) {
				*val = vals[i];
				best = occurrences;
			}
		}
	}

	free(vals);
}
static struct decision* dt_parse_samples(const struct sample *samples, int max, struct where *where) { bool ambiguous = is_set_ambiguous(samples, max); int best_field = best_field_where(samples, max, where); if (best_field < 0 || !ambiguous) { if (!ambiguous) printf("Non-ambiguous set:\n"); else printf("No best field:\n"); print_set_info(samples, max, where); struct decision *d = majority_result_node(samples, max); printf("\tLeaf with majority value %i -> %i\n", d->field, d->value); return d; } // The first call has no defined where, and it must be explicitly // deleted. Other calls only need append a new where-clause and // give it proper filters. struct where *w = where_alloc(); if (where) where_append(where, w); else where = w; w->field = best_field; // Get all the unique values from the set int unique = 0; int *vals = unique_values(samples, max, &unique, best_field); // The decision tree we are returning struct decision *dec = NULL; for (int i=0; i<unique; i++) { // Create a subset filtered for s->{best_field} = V[i] w->value = vals[i]; int wmax = 0; struct sample *wsamples = filter_where(samples, max, where, &wmax); // If the filtered subset is equal to the superset, the training // data is ambiguous. 
Return a leaf node with the majority result if (wmax == max) { printf("Ambiguity in training set:\n\t"); print_set_info(samples, max, where); dec = majority_result_node(samples, max); printf("\tassigning majority value %i=%i\n\n", dec->field, dec->value); goto dt_parse_samples_cleanup; } // Create a branch-node struct decision *d = dt_alloc(); d->field = best_field; d->value = vals[i]; // Append the branch to the tree if (!dec) dec = d; else dt_append_next(dec, d); // Create a subtree struct decision *sub = dt_parse_samples(wsamples, wmax, where); d->dest = sub; // Reference "dec" from all sibling nodes of sub while (sub) { sub->parent = dec; sub = sub->next; } free(wsamples); } dt_parse_samples_cleanup: if (where != w) where_destroy(where_pop(where)); else where_destroy(w); free(vals); return dec; }