Ejemplo n.º 1
0
// Builds, for every non-sparse column, the sorted list of distinct values and
// the position of each observation's value within that list.
//
// Writes:
//   index_data            - newly allocated array with num_cols_no_sparse * num_rows
//                           entries; the entry for (row, col) is stored at
//                           col * num_rows + row.
//   unique_data_values    - one sorted, de-duplicated vector appended per column.
//   max_num_unique_values - raised to the largest distinct-value count seen.
//
// NOTE(review): index_data is overwritten without releasing any earlier buffer;
// presumably sort() runs at most once per object - confirm against callers.
void Data::sort() {

  // Reserve memory
  index_data = new size_t[num_cols_no_sparse * num_rows];

  // For all columns, get unique values and save index for each observation
  for (size_t col = 0; col < num_cols_no_sparse; ++col) {

    // Collect the whole column, then sort and drop duplicates
    std::vector<double> unique_values(num_rows);
    for (size_t row = 0; row < num_rows; ++row) {
      unique_values[row] = get(row, col);
    }
    std::sort(unique_values.begin(), unique_values.end());
    // Qualified call: do not depend on ADL finding std::unique via the iterator type
    unique_values.erase(std::unique(unique_values.begin(), unique_values.end()), unique_values.end());

    // Every value of the column is present in unique_values, so lower_bound
    // lands exactly on it and the offset is that value's index
    for (size_t row = 0; row < num_rows; ++row) {
      size_t idx = std::lower_bound(unique_values.begin(), unique_values.end(), get(row, col)) - unique_values.begin();
      index_data[col * num_rows + row] = idx;
    }

    // Read the size before the vector is moved from
    const size_t num_unique = unique_values.size();
    if (num_unique > max_num_unique_values) {
      max_num_unique_values = num_unique;
    }

    // Save unique values; the local is not used again, so hand over its
    // storage instead of copying it
    unique_data_values.push_back(std::move(unique_values));
  }
}
Ejemplo n.º 2
0
Archivo: dtree.c Proyecto: pimms/aidt
/*
 * Finds the value of SAMPLE_RESULT_FIELD that occurs most often in a sample set.
 *
 * samples, count  the sample set, forwarded to unique_values() and value_count()
 * field           out: always set to SAMPLE_RESULT_FIELD
 * val             out: the value with the highest count as reported by
 *                 value_count(); -1 when no candidate has a positive count.
 *                 On a tie the candidate listed first by unique_values() wins.
 *
 * The counts are evaluated on the fly, so no scratch buffer is allocated and
 * there is no allocation failure to handle here.
 */
static void
majority_result(const struct sample *samples, int count, unsigned *field, int *val)
{
    /* Start at 0 so the loop is skipped if unique_values() leaves it untouched */
    int unique = 0;
    int *vals = unique_values(samples, count, &unique, SAMPLE_RESULT_FIELD);

    *field = SAMPLE_RESULT_FIELD;
    *val = -1;

    /* Nothing to scan (and nothing to free) without a candidate list */
    if (vals == NULL)
        return;

    int best = 0;
    for (int i = 0; i < unique; i++) {
        int occurs = value_count(samples, count, vals[i], SAMPLE_RESULT_FIELD);

        /* Strict comparison keeps the earliest candidate on ties */
        if (occurs > best) {
            *val = vals[i];
            best = occurs;
        }
    }

    /* The list returned by unique_values() is released by this function */
    free(vals);
}
Ejemplo n.º 3
0
Archivo: dtree.c Proyecto: pimms/aidt
/*
 * Recursively builds a decision (sub)tree from a sample set.
 *
 * samples, max  the sample set and its length
 * where         chain of where-clauses describing the filters applied so far;
 *               NULL on the outermost call. A clause for this level is
 *               appended on entry and removed again before returning.
 *
 * Returns a leaf produced by majority_result_node() when best_field_where()
 * yields a negative field, when is_set_ambiguous() reports false, or when
 * filtering on a value does not shrink the set. Otherwise returns the first of
 * a list of branch nodes (one per distinct value of the chosen field), each
 * with its subtree attached through `dest`.
 */
static struct decision*
dt_parse_samples(const struct sample *samples, int max, struct where *where)
{
    bool ambiguous = is_set_ambiguous(samples, max);
    int best_field = best_field_where(samples, max, where);

    if (best_field < 0 || !ambiguous)  {
        if (!ambiguous)
            printf("Non-ambiguous set:\n");
        else
            printf("No best field:\n");
        print_set_info(samples, max, where);

        struct decision *d = majority_result_node(samples, max);
        printf("\tLeaf with majority value %i -> %i\n", d->field, d->value);
        return d;
    }


    // The first call has no defined where, and it must be explicitly
    // deleted. Other calls only need append a new where-clause and
    // give it proper filters.
    struct where *w = where_alloc();
    if (where)
        where_append(where, w);
    else
        where = w;
    w->field = best_field;

    // Get all the unique values from the set
    int unique = 0;
    int *vals = unique_values(samples, max, &unique, best_field);

    // The decision tree we are returning
    struct decision *dec = NULL;

    for (int i=0; i<unique; i++) {
        // Create a subset filtered for s->{best_field} = V[i]
        w->value = vals[i];
        int wmax = 0;
        struct sample *wsamples = filter_where(samples, max, where, &wmax);

        // If the filtered subset is equal to the superset, the training
        // data is ambiguous. Return a leaf node with the majority result
        if (wmax == max) {
            printf("Ambiguity in training set:\n\t");
            print_set_info(samples, max, where);
            dec = majority_result_node(samples, max);

            printf("\tassigning majority value %i=%i\n\n",
                   dec->field, dec->value);

            // The jump below skips the free() at the bottom of the loop
            // body, so the filtered subset has to be released here.
            free(wsamples);
            goto dt_parse_samples_cleanup;
        }

        // Create a branch-node
        struct decision *d = dt_alloc();
        d->field = best_field;
        d->value = vals[i];

        // Append the branch to the tree
        if (!dec)
            dec = d;
        else
            dt_append_next(dec, d);

        // Create a subtree
        struct decision *sub = dt_parse_samples(wsamples, wmax, where);
        d->dest = sub;

        // Reference "dec" from all sibling nodes of sub
        // NOTE(review): parent is set to dec (the first branch of this
        // level), not to d (the branch owning this subtree) - confirm that
        // this is intended.
        while (sub) {
            sub->parent = dec;
            sub = sub->next;
        }

        free(wsamples);
    }

dt_parse_samples_cleanup:
    // Remove this level's clause: pop it off the caller's chain, or destroy
    // it directly when this call created the chain itself.
    if (where != w)
        where_destroy(where_pop(where));
    else
        where_destroy(w);
    free(vals);
    return dec;
}