Ejemplo n.º 1
0
/** Give me a data set and a model, and I'll give you the jackknifed covariance matrix of the model parameters.

The basic algorithm for the jackknife (glossing over the details): create a sequence of data
sets, each with exactly one observation removed, and then produce a new set of parameter estimates 
using that slightly shortened data set. Then, find the covariance matrix of the derived parameters.

\li Jackknife or bootstrap? As a broad rule of thumb, the jackknife works best on models
    that are closer to linear. The worse a linear approximation does (at the given data),
    the worse the jackknife approximates the variance.

\param in	    The data set. An \ref apop_data set where each row is a single data point.
\param model    An \ref apop_model, that will be used internally by \ref apop_estimate.
            
\exception out->error=='n'   \c NULL input data.
\return         An \c apop_data set whose matrix element is the estimated covariance matrix of the parameters.
\see apop_bootstrap_cov

For example:
\include jack.c
*/
apop_data * apop_jackknife_cov(apop_data *in, apop_model *model){
    Apop_stopif(!in, apop_return_data_error(n), 0, "The data input can't be NULL.");
    Get_vmsizes(in); //msize1, msize2, vsize
    apop_model *e = apop_model_copy(model);
    int i, n = GSL_MAX(msize1, GSL_MAX(vsize, in->textsize[0]));
    apop_model *overall_est = e->parameters ? e : apop_estimate(in, e);//if not estimated, do so
    gsl_vector *overall_params = apop_data_pack(overall_est->parameters);
    gsl_vector_scale(overall_params, n); //do it just once.
    gsl_vector *pseudoval = gsl_vector_alloc(overall_params->size);

    //Copy the original, minus the first row.
    apop_data *subset = apop_data_copy(Apop_rs(in, 1, n-1));
    apop_name *tmpnames = in->names; 
    in->names = NULL;  //save on some copying below.

    apop_data *array_of_boots = apop_data_alloc(n, overall_params->size);

    for(i = -1; i< n-1; i++){
        //Get a view of row i, and copy it to position i-1 in the short matrix.
        if (i >= 0) apop_data_memcpy(Apop_r(subset, i), Apop_r(in, i));
        apop_model *est = apop_estimate(subset, e);
        gsl_vector *estp = apop_data_pack(est->parameters);
        gsl_vector_memcpy(pseudoval, overall_params);// *n above.
        gsl_vector_scale(estp, n-1);
        gsl_vector_sub(pseudoval, estp);
        gsl_matrix_set_row(array_of_boots->matrix, i+1, pseudoval);
        apop_model_free(est);
        gsl_vector_free(estp);
    }
    in->names = tmpnames;
    apop_data *out = apop_data_covariance(array_of_boots);
    gsl_matrix_scale(out->matrix, 1./(n-1.));
    apop_data_free(subset);
    gsl_vector_free(pseudoval);
    apop_data_free(array_of_boots);
    if (e!=overall_est)
        apop_model_free(overall_est);
    apop_model_free(e);
    gsl_vector_free(overall_params);
    return out;
}
Ejemplo n.º 2
0
/** Sort an \ref apop_data set on an arbitrary sequence of columns. 

The \c sort_order set is a one-row data set that should look like the data set being
sorted. The easiest way to generate it is to use \ref Apop_r to pull one row of the
table, then copy and fill it. For each column you want used in the sort, assign a ranking giving whether the column should be sorted first, second, .... Columns you don't want used in the sorting should be set to \c NAN. Ties are broken by the earlier element in the default order (see below).

E.g., to sort by the last column of a five-column matrix first, then the next-to-last column, then the next-to-next-to-last, then by the first text column, then by the second text column:

\code
apop_data *sort_order = apop_data_copy(Apop_r(data, 0));
sort_order->vector = NULL; //so it will be skipped.
Apop_data_fill(sort_order, NAN, NAN, 3, 2, 1);
apop_text_add(sort_order, 0, 0, "4");
apop_text_add(sort_order, 0, 1, "5");
apop_data_sort(data, sort_order);
\endcode

I use only comparisons, not the actual numeric values, so you can use any sequence of
numbers: (1, 2, 3) and (-1.32, 0, 27) work identically.

\li Strings are sorted case-insensitively, using \c strcasecmp. [exercise for the reader: modify the source to use Glib's locale-correct string sorting.]

\li The setup generates a lexicographic sort using the columns you specify. If you would like a different sort order, such as Euclidian distance to the origin, you can generate a new column expressing your preferred metric, and then sorting on that. See the example below.

\param data The data set to be sorted. If \c NULL, this function is a no-op that returns \c NULL.
\param sort_order A \ref apop_data set describing the order in which columns are used for sorting, as above. If \c NULL, then sort by the vector, then each matrix column, then text, then weights, then row names.
\param inplace If 'n', make a copy, else sort in place. (default: 'y').
\param asc If 'a', ascending; if 'd', descending. This is applied to all columns; column-by-column application is to do. (default: 'a').
\param col_order For internal use only. In your call, it should be \c NULL; the \ref designated syntax will takes care of it for you.

\return A pointer to the sorted data set. If <tt>inplace=='y'</tt> (the default), then this is the same as the input set.


A few examples:

\include "sort_example.c"

\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD apop_data *apop_data_sort(apop_data *data, apop_data *sort_order, char asc, char inplace, double *col_order){
    apop_data * apop_varad_var(data, NULL);
    Apop_stopif(!data, return NULL, 1, "You gave me NULL data to sort. Returning NULL");
    apop_data * apop_varad_var(sort_order, NULL);
    char apop_varad_var(inplace, 'y');
    char apop_varad_var(asc, 'a');
    double * apop_varad_var(col_order, NULL);
APOP_VAR_ENDHEAD
    if (!data) return NULL;

    apop_data *out = inplace=='n' ? apop_data_copy(data) : data;

    apop_data *xx = sort_order ? sort_order : out;
    Get_vmsizes(xx); //firstcol, msize2
    int cols_to_sort_ct = msize2 - firstcol +1 + !!(xx->weights) + xx->textsize[1] + !!xx->names->rowct;
    double so[cols_to_sort_ct];
    if (!col_order){
        generate_sort_order(out, sort_order, cols_to_sort_ct, so);
        col_order = so;
    }

    bool is_text = ((int)*col_order != *col_order);
    bool is_name = (*col_order == 0.2);

    gsl_vector_view c;
    gsl_vector *cc = NULL;
    if (!is_text && *col_order>=0){
        c = gsl_matrix_column(out->matrix, *col_order);
        cc = &c.vector;
    }
    gsl_vector *thiscol =   cc               ? cc
                          : (*col_order==-2) ? out->weights
                          : (*col_order==-1) ? out->vector
                                             : NULL;

    size_t height = thiscol   ? thiscol->size
                    : is_name ? out->names->rowct
                              : *out->textsize;

    gsl_permutation *p = gsl_permutation_alloc(height);
    if (!is_text) gsl_sort_vector_index (p, thiscol);
    else {
        gsl_permutation_init(p);
        d = out;
        offset = is_name ? -1 : *col_order-0.5;        
        qsort(p->data, height, sizeof(size_t), compare_strings);
    }

    size_t *perm = p->data;
    if (asc=='d' || asc=='D') //reverse the perm matrix.
        for (size_t j=0; j< height/2; j++){
            double t         = perm[j];
            perm[j]          = perm[height-1-j];
            perm[height-1-j] = t;
        }
    rearrange(out, height, perm);
    gsl_permutation_free(p);
    if (col_order[1] == -100) return out;

    /*Second pass:
    find blocks where all are of the same value.
    After you pass a block of size > 1 row where all vals in this col are identical,
    sort that block, using the rest of the sort order. */
    int bottom=0;
    if (!is_text){
        double last_val = gsl_vector_get(thiscol, 0);
        for (int i=1; i< height+1; i++){
            double this_val=0;
            if ((i==height || (this_val=gsl_vector_get(thiscol, i)) != last_val) 
                    && bottom != i-1){
                apop_data_sort_base(Apop_rs(out, bottom, i-bottom), sort_order, 'a', 'y', col_order+1);
            }
            if (last_val != this_val) bottom = i;
            last_val = this_val;
        }
    } else {
        char *last_val =  is_name ? out->names->row[0] : out->text[0][(int)(*col_order-0.5)];
        for (int i=1; i< height+1; i++){
            char *this_val = i==height ? NULL : is_name ? out->names->row[i] : out->text[i][(int)(*col_order-0.5)];
            if ((i==height || strcasecmp(this_val, last_val)) 
                    && bottom != i-1){
                apop_data_sort_base(Apop_rs(out, bottom, i-bottom), sort_order, 'a', 'y', col_order+1);
            }
            if (this_val && strcmp(last_val, this_val)) bottom = i;
            last_val = this_val;
        }
    }
    return out;
}