Ejemplo n.º 1
0
/** Run the Kolmogorov-Smirnov test to determine whether two distributions are identical.

\param m1, m2  Two models, most likely of \ref apop_pmf type. I will ue the cdf method, so if your function doesn't have one, expect this to run the slow default. I run it for each row of each data set, so if your model has a \c NULL at the data, I won't know what to do. 
 
\return An \ref apop_data set including the \f$p\f$-value from the Kolmogorov test that the two distributions are equal.

\li I assume that the data sets are sorted.

\include ks_tests.c

\ingroup histograms
*/
apop_data *apop_test_kolmogorov(apop_model *m1, apop_model *m2){
    //version for not a pair of histograms
    Apop_assert(m1->data, "I will test the CDF at each point in the data set, but the first model has a NULL data set. "
                          "Maybe generate, then apop_data_sort, a few thousand random draws?");
    Apop_assert(m2->data, "I will test the CDF at each point in the data set, but the second model has a NULL data set. "
                          "Maybe generate, then apop_data_sort, a few thousand random draws?");
    int maxsize1, maxsize2;
    {Get_vmsizes(m1->data); maxsize1 = maxsize;}//copy one of the macro's variables 
    {Get_vmsizes(m2->data); maxsize2 = maxsize;}//  to the full function's scope.
    double largest_diff=GSL_NEGINF;
    for (size_t i=0; i< maxsize1; i++){
        Apop_data_row(m1->data, i, arow);
        largest_diff = GSL_MAX(largest_diff, fabs(apop_cdf(arow, m1)-apop_cdf(arow, m2)));
    }
    for (size_t i=0; i< maxsize2; i++){     //There should be matched data rows, so there is redundancy.
        Apop_data_row(m2->data, i, arow); // Feel free to submit a smarter version.
        largest_diff = GSL_MAX(largest_diff, fabs(apop_cdf(arow, m1)-apop_cdf(arow, m2)));
    }
    apop_data *out = apop_data_alloc();
    sprintf(out->names->title, "Kolmogorov-Smirnov test");
    apop_data_add_named_elmt(out, "max distance", largest_diff);
    double ps = psmirnov2x(largest_diff, maxsize1, maxsize2);
    apop_data_add_named_elmt(out, "p value, 2 tail", 1-ps);
    apop_data_add_named_elmt(out, "confidence, 2 tail", ps);
    return out;
}
Ejemplo n.º 2
0
/* \adoc cdf At the moment, only implemented for the Binomial.
  Let the first element of the data set (top of the vector or point (0,0) in the
  matrix, your pick) be $L$; then I return the sum of the odds of a draw from the given
  Binomial distribution returning $0, 1, \dots, L$ hits.  */
static double binomial_cdf(apop_data *d, apop_model *est){
    Nullcheck_mpd(d, est, GSL_NAN)
    Get_vmsizes(d); //firstcol
    double hitcount = apop_data_get(d, .col=firstcol);
    double n = gsl_vector_get(est->parameters->vector, 0);
    double p = gsl_vector_get(est->parameters->vector, 1);
    return gsl_cdf_binomial_P(hitcount, p, n);
}
Ejemplo n.º 3
0
apop_model* apop_chi_estimate(apop_data *d, apop_model *m){
    Apop_assert(d, "No data with which to count df. (the default estimation method)");
    Get_vmsizes(d); //vsize, msize1, msize2
    apop_model *out = apop_model_copy(*m);
    apop_data_add_named_elmt(out->parameters, "df", tsize - 1);
    apop_data_add_named_elmt(out->info, "log likelihood", out->log_likelihood(d, out));
    return out;
}
Ejemplo n.º 4
0
static apop_data *colmeans(apop_data *in){
    Get_vmsizes(in); //maxsize
    apop_data *sums = apop_data_summarize(in);
    Apop_col_tv(sums, "mean", means);
    apop_data *out = apop_matrix_to_data(apop_vector_to_matrix(means, 'r'));
    apop_name_stack(out->names, in->names, 'c', 'c');
    apop_data *cov = apop_data_add_page(out, apop_data_covariance(in), "<Covariance>");
    gsl_matrix_scale(cov->matrix, 1/sqrt(maxsize));
    return out;
}
Ejemplo n.º 5
0
/** If there is an NaN anywhere in the row of data (including the matrix, the vector, the weights, and the text) then delete the row from the data set.

\li If every row has an NaN, then this returns \c NULL.
\li If \c apop_opts.db_nan is not \c NULL, then I will use that as a regular expression to check the text elements for bad data as well.
\li If \c inplace = 'y', then I'll free each element of the input data
    set and refill it with the pruned elements. I'll still take up (up to)
    twice the size of the data set in memory during the function. If
    every row has an NaN, then your \c apop_data set will end up with
    \c NULL vector, matrix, \dots. if \c inplace = 'n', then the original data set is left unmolested.
\li I only look at the first page of data (i.e. the \c more element is ignored).
\li This function uses the \ref designated syntax for inputs.

    \param d    The data, with NaNs
    \param inplace If \c 'y', clear out the pointer-to-\ref apop_data that
    you sent in and refill with the pruned data. If \c 'n', leave the
    set alone and return a new data set.
    \return     A (potentially shorter) copy of the data set, without
    NaNs. If <tt>inplace=='y'</tt>, redundant with the input. If the entire data set is
    cleared out, then this will be \c NULL.
*/
APOP_VAR_HEAD apop_data * apop_data_listwise_delete(apop_data *d, char inplace){
    apop_data * apop_varad_var(d, NULL);
    if (!d) return NULL;
    char apop_varad_var(inplace, 'n');
APOP_VAR_ENDHEAD
    Get_vmsizes(d) //defines firstcol, vsize, wsize, msize1, msize2.
    apop_assert_c(msize1 || vsize || d->textsize[0], NULL, 0, 
            "You sent to apop_data_listwise_delete a data set with NULL matrix, NULL vector, and no text. "
            "Confused, it is returning NULL.");
    //find out where the NaNs are
    int len = GSL_MAX(vsize ? vsize : msize1, d->textsize[0]); //still some size assumptions here.
    int not_empty = 0;
    int *marked = calloc(len, sizeof(int));
    for (int i=0; i< (vsize ? vsize: msize1); i++)
        for (int j=firstcol; j <msize2; j++){
            if (gsl_isnan(apop_data_get(d, i, j))){
                    marked[i] = 1;
                    break;
            }
        }
    for (int i=0; i< wsize; i++)
        if (gsl_isnan(gsl_vector_get(d->weights, i)))
            marked[i] = 1;
    if (d->textsize[0] && apop_opts.db_nan){
        regex_t    rex;
        int compiled_ok = !regcomp(&rex, apop_opts.db_nan, REG_EXTENDED +  REG_ICASE + REG_NOSUB);
        apop_assert(compiled_ok, "apop_opts.db_nan needs to be a regular expression that "
                                "I can use to check the text element of your data set for "
                                "NaNs, But compiling %s into a regex failed. Or, set "
                                "apop_opts.db_nan=NULL to bypass text checking.", apop_opts.db_nan);
        for(int i=0; i< d->textsize[0]; i++)
            if (!marked[i])
                for(int j=0; j< d->textsize[1]; j++)
                    if (!regexec(&rex, d->text[i][j], 0, 0, 0)){
                        marked[i] ++;
                        break;
                    }
        regfree(&rex);
    }

    //check that at least something isn't NULL.
    for (int i=0; i< len; i++)
        if (!marked[i]){
            not_empty ++;
            break;
        }
    if (!not_empty){
        free(marked);
        return NULL;
    }
    apop_data *out = (inplace=='y'|| inplace=='Y') ? d : apop_data_copy(d);
    apop_data_rm_rows(out, marked);
    free(marked);
    return out;
}
Ejemplo n.º 6
0
 apop_data * apop_bootstrap_cov_base(apop_data * data, apop_model *model, gsl_rng *rng, int iterations, char keep_boots, char ignore_nans, apop_data **boot_store){
#endif
    Get_vmsizes(data); //vsize, msize1, msize2
    apop_model *e = apop_model_copy(model);
    apop_data *subset = apop_data_copy(data);
    apop_data *array_of_boots = NULL,
              *summary;
    //prevent and infinite regression of covariance calculation.
    Apop_model_add_group(e, apop_parts_wanted); //default wants for nothing.
    size_t i, nan_draws=0;
    apop_name *tmpnames = (data && data->names) ? data->names : NULL; //save on some copying below.
    if (data && data->names) data->names = NULL;

    int height = GSL_MAX(msize1, GSL_MAX(vsize, (data?(*data->textsize):0)));
	for (i=0; i<iterations && nan_draws < iterations; i++){
		for (size_t j=0; j< height; j++){       //create the data set
			size_t randrow	= gsl_rng_uniform_int(rng, height);
            apop_data_memcpy(Apop_r(subset, j), Apop_r(data, randrow));
		}
		//get the parameter estimates.
		apop_model *est = apop_estimate(subset, e);
        gsl_vector *estp = apop_data_pack(est->parameters);
        if (!gsl_isnan(apop_sum(estp))){
            if (i==0){
                array_of_boots	      = apop_data_alloc(iterations, estp->size);
                apop_name_stack(array_of_boots->names, est->parameters->names, 'c', 'v');
                apop_name_stack(array_of_boots->names, est->parameters->names, 'c', 'c');
                apop_name_stack(array_of_boots->names, est->parameters->names, 'c', 'r');
            }
            gsl_matrix_set_row(array_of_boots->matrix, i, estp);
        } else if (ignore_nans=='y'){
            i--; 
            nan_draws++;
        }
        apop_model_free(est);
        gsl_vector_free(estp);
	}
    if(data) data->names = tmpnames;
    apop_data_free(subset);
    apop_model_free(e);
    int set_error=0;
    Apop_stopif(i == 0 && nan_draws == iterations, apop_return_data_error(N),
                1, "I ran into %i NaNs and no not-NaN estimations, and so stopped. "
                       , iterations);
    Apop_stopif(nan_draws == iterations,  set_error++;
            apop_matrix_realloc(array_of_boots->matrix, i, array_of_boots->matrix->size2),
                1, "I ran into %i NaNs, and so stopped. Returning results based "
                       "on %zu bootstrap iterations.", iterations, i);
	summary	= apop_data_covariance(array_of_boots);
    if (boot_store) *boot_store = array_of_boots;
    else            apop_data_free(array_of_boots);
    if (set_error) summary->error = 'N';
	return summary;
}
Ejemplo n.º 7
0
apop_model* apop_t_estimate(apop_data *d, apop_model *m){
    Apop_assert(d, "No data with which to count df. (the default estimation method)");
    Get_vmsizes(d); //vsize, msize1, msize2, tsize
    apop_model *out = apop_model_copy(*m);
    double vmu = vsize ? apop_mean(d->vector) : 0;
    double v_sum_sq = vsize ? apop_var(d->vector)*(vsize-1) : 0;
    double m_sum_sq = 0;
    double mmu = 0;
   if (msize1) {
       apop_matrix_mean_and_var(d->matrix, &mmu, &m_sum_sq);
       m_sum_sq *= msize1*msize2-1;
   }
    apop_data_add_names(out->parameters, 'r', "mean", "standard deviation", "df");
    apop_data_set(out->parameters, 0, -1, (vmu *vsize + mmu * msize1*msize2)/tsize);
    apop_data_set(out->parameters, 1, -1, sqrt((v_sum_sq*vsize + m_sum_sq * msize1*msize2)/(tsize-1))); 
    apop_data_set(out->parameters, 2, -1, tsize-1);
    apop_data_add_named_elmt(out->info, "log likelihood", out->log_likelihood(d, out));
    return out;
}
Ejemplo n.º 8
0
/** Give me a data set and a model, and I'll give you the jackknifed covariance matrix of the model parameters.

The basic algorithm for the jackknife (glossing over the details): create a sequence of data
sets, each with exactly one observation removed, and then produce a new set of parameter estimates 
using that slightly shortened data set. Then, find the covariance matrix of the derived parameters.

\li Jackknife or bootstrap? As a broad rule of thumb, the jackknife works best on models
    that are closer to linear. The worse a linear approximation does (at the given data),
    the worse the jackknife approximates the variance.

\param in	    The data set. An \ref apop_data set where each row is a single data point.
\param model    An \ref apop_model, that will be used internally by \ref apop_estimate.
            
\exception out->error=='n'   \c NULL input data.
\return         An \c apop_data set whose matrix element is the estimated covariance matrix of the parameters.
\see apop_bootstrap_cov

For example:
\include jack.c
*/
apop_data * apop_jackknife_cov(apop_data *in, apop_model *model){
    Apop_stopif(!in, apop_return_data_error(n), 0, "The data input can't be NULL.");
    Get_vmsizes(in); //msize1, msize2, vsize
    apop_model *e = apop_model_copy(model);
    int i, n = GSL_MAX(msize1, GSL_MAX(vsize, in->textsize[0]));
    apop_model *overall_est = e->parameters ? e : apop_estimate(in, e);//if not estimated, do so
    gsl_vector *overall_params = apop_data_pack(overall_est->parameters);
    gsl_vector_scale(overall_params, n); //do it just once.
    gsl_vector *pseudoval = gsl_vector_alloc(overall_params->size);

    //Copy the original, minus the first row.
    apop_data *subset = apop_data_copy(Apop_rs(in, 1, n-1));
    apop_name *tmpnames = in->names; 
    in->names = NULL;  //save on some copying below.

    apop_data *array_of_boots = apop_data_alloc(n, overall_params->size);

    for(i = -1; i< n-1; i++){
        //Get a view of row i, and copy it to position i-1 in the short matrix.
        if (i >= 0) apop_data_memcpy(Apop_r(subset, i), Apop_r(in, i));
        apop_model *est = apop_estimate(subset, e);
        gsl_vector *estp = apop_data_pack(est->parameters);
        gsl_vector_memcpy(pseudoval, overall_params);// *n above.
        gsl_vector_scale(estp, n-1);
        gsl_vector_sub(pseudoval, estp);
        gsl_matrix_set_row(array_of_boots->matrix, i+1, pseudoval);
        apop_model_free(est);
        gsl_vector_free(estp);
    }
    in->names = tmpnames;
    apop_data *out = apop_data_covariance(array_of_boots);
    gsl_matrix_scale(out->matrix, 1./(n-1.));
    apop_data_free(subset);
    gsl_vector_free(pseudoval);
    apop_data_free(array_of_boots);
    if (e!=overall_est)
        apop_model_free(overall_est);
    apop_model_free(e);
    gsl_vector_free(overall_params);
    return out;
}
Ejemplo n.º 9
0
/** Make random draws from an \ref apop_model, and bin them using a binspec in the style
 of \ref apop_data_to_bins. If you have a data set that used the same binspec, you now have synced histograms, which you can plot or sensibly test hypotheses about.

The output is normalized to integrate to one.

\param binspec A description of the bins in which to place the draws; see \ref apop_data_to_bins. (default: as in \ref apop_data_to_bins.)
\param model The model to be drawn from. Because this function works via random draws, the model needs to have a 
\c draw method. (No default)
\param draws The number of random draws to make. (arbitrary default = 10,000)
\param bin_count If no bin spec, the number of bins to use (default: as per \ref apop_data_to_bins, \f$\sqrt(N)\f$)
\param rng The \c gsl_rng used to make random draws. (default: see note on \ref autorng)

\return An \ref apop_pmf model.

\li This function uses the \ref designated syntax for inputs.

\ingroup histograms
*/
APOP_VAR_HEAD apop_model *apop_model_to_pmf(apop_model *model, apop_data *binspec, long int draws, int bin_count, gsl_rng *rng){
    apop_model* apop_varad_var(model, NULL);
    Apop_assert(model && model->draw, "The second argument needs to be an apop_model with a 'draw' function "
                              "that I can use to make random draws.");
    apop_data* apop_varad_var(binspec, NULL);
    int apop_varad_var(bin_count, 0);
    long int apop_varad_var(draws, 1e4);
    gsl_rng *apop_varad_var(rng, NULL)
    static gsl_rng *spare = NULL;
    if (!rng && !spare) 
        spare = apop_rng_alloc(++apop_opts.rng_seed);
    if (!rng) rng = spare;
APOP_VAR_ENDHEAD
    Get_vmsizes(binspec);
    apop_data *outd = apop_data_alloc(draws, model->dsize); 
    for (long int i=0; i< draws; i++){
        Apop_row(outd, i, ach);
        apop_draw(ach->data, rng, model);
    }
    apop_data *outbinned = apop_data_to_bins(outd, binspec, .bin_count=bin_count);
    apop_data_free(outd);
    apop_vector_normalize(outbinned->weights);
    return apop_estimate(outbinned, apop_pmf);
} 
Ejemplo n.º 10
0
/** Create a histogram from data by putting data into bins of fixed width. 

\param indata The input data that will be binned. This is copied and the copy will be modified.
\param close_top_bin Normally, a bin covers the range from the point equal to its minimum to points strictly less than
the minimum plus the width.  if \c 'y', then the top bin includes points less than or equal to the upper bound. This solves the problem of displaying histograms where the top bin is just one point.
\param binspec This is an \ref apop_data set with the same number of columns as \c indata. 
If you want a fixed size for the bins, then the first row of the bin spec is the bin width for each column.
This allows you to specify a width for each dimension, or specify the same size for all with something like:

\param bin_count If you don't provide a bin spec, I'll provide this many evenly-sized bins. Default: \f$\sqrt(N)\f$.  \code
Apop_data_row(indata, 0, firstrow);
apop_data *binspec = apop_data_copy(firstrow);
gsl_matrix_set_all(binspec->matrix, 10); //bins of size 10 for all dim.s
apop_data_to_bins(indata, binspec);
\endcode
The presumption is that the first bin starts at zero in all cases. You can add a second row to the spec to give the offset for each dimension.  Default: NULL. if no binspec and no binlist, then a grid with offset equal to the min of the column, and bin size such that it takes \f$\sqrt{N}\f$ bins to cover the range to the max element. 


\return A pointer to a binned \ref apop_data set.  If you didn't give me a binspec, then I attach one to the output set as a page named \c \<binspec\>, so you can snap a second data set to the same grid using 
\code
apop_data_to_bins(first_set, NULL);
apop_data_to_bins(second_set, apop_data_get_page(first_set, "<binspec>"));
\endcode


  The text segment, if any, is not binned. I use \ref apop_data_pmf_compress as the final step in the binning, 
  and that does respect the text segment. 

Here is a sample program highlighting the difference between \ref apop_data_to_bins and \ref apop_data_pmf_compress .

\include binning.c
*/
APOP_VAR_HEAD apop_data *apop_data_to_bins(apop_data *indata, apop_data *binspec, int bin_count, char close_top_bin){
    apop_data *apop_varad_var(indata, NULL);
    Apop_assert_c(indata, NULL, 1, "NULL input data set, so returning NULL output data set.");
    apop_data *apop_varad_var(binspec, NULL);
    char apop_varad_var(close_top_bin, 'n');
    int apop_varad_var(bin_count, 0);
APOP_VAR_ENDHEAD
    Get_vmsizes(indata); //firstcol, vsize, msize1, msize2
    double binwidth, offset, max=0;
    apop_data *out = apop_data_copy(indata);
    apop_data *bs = binspec ? binspec
                    : apop_data_add_page(out, 
                        apop_data_alloc(vsize? 2: 0, msize1? 2: 0, indata->matrix ? msize2: 0),
                        "<binspec>");
    for (int j= firstcol; j< msize2; j++){
        Apop_col(out, j, onecol);
        if (binspec){
           binwidth = apop_data_get(binspec, 0, j);
           offset = ((binspec->vector && binspec->vector->size==2 )
                   ||(binspec->matrix && binspec->matrix->size1==2)) ? apop_data_get(binspec, 1, j) : 0;
        } else {
            Apop_col(bs, j, abin);
            max = gsl_vector_max(onecol);
            offset = abin->data[1] = gsl_vector_min(onecol);
            binwidth = abin->data[0] = (max - offset)/(bin_count ? bin_count : sqrt(onecol->size));
        }
        for (int i=0; i< onecol->size; i++){
            double val = gsl_vector_get(onecol, i);
            if (close_top_bin=='y' && val == max && val!=offset) 
                val -= 2*GSL_DBL_EPSILON;
            gsl_vector_set(onecol, i, (floor((val -offset)/binwidth))*binwidth+offset);
        }
    }
    apop_data_pmf_compress(out);
    return out;
}
Ejemplo n.º 11
0
/** Sort an \ref apop_data set on an arbitrary sequence of columns. 

The \c sort_order set is a one-row data set that should look like the data set being
sorted. The easiest way to generate it is to use \ref Apop_r to pull one row of the
table, then copy and fill it. For each column you want used in the sort, assign a ranking giving whether the column should be sorted first, second, .... Columns you don't want used in the sorting should be set to \c NAN. Ties are broken by the earlier element in the default order (see below).

E.g., to sort by the last column of a five-column matrix first, then the next-to-last column, then the next-to-next-to-last, then by the first text column, then by the second text column:

\code
apop_data *sort_order = apop_data_copy(Apop_r(data, 0));
sort_order->vector = NULL; //so it will be skipped.
Apop_data_fill(sort_order, NAN, NAN, 3, 2, 1);
apop_text_add(sort_order, 0, 0, "4");
apop_text_add(sort_order, 0, 1, "5");
apop_data_sort(data, sort_order);
\endcode

I use only comparisons, not the actual numeric values, so you can use any sequence of
numbers: (1, 2, 3) and (-1.32, 0, 27) work identically.

\li Strings are sorted case-insensitively, using \c strcasecmp. [exercise for the reader: modify the source to use Glib's locale-correct string sorting.]

\li The setup generates a lexicographic sort using the columns you specify. If you would like a different sort order, such as Euclidian distance to the origin, you can generate a new column expressing your preferred metric, and then sorting on that. See the example below.

\param data The data set to be sorted. If \c NULL, this function is a no-op that returns \c NULL.
\param sort_order A \ref apop_data set describing the order in which columns are used for sorting, as above. If \c NULL, then sort by the vector, then each matrix column, then text, then weights, then row names.
\param inplace If 'n', make a copy, else sort in place. (default: 'y').
\param asc If 'a', ascending; if 'd', descending. This is applied to all columns; column-by-column application is to do. (default: 'a').
\param col_order For internal use only. In your call, it should be \c NULL; the \ref designated syntax will takes care of it for you.

\return A pointer to the sorted data set. If <tt>inplace=='y'</tt> (the default), then this is the same as the input set.


A few examples:

\include "sort_example.c"

\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD apop_data *apop_data_sort(apop_data *data, apop_data *sort_order, char asc, char inplace, double *col_order){
    apop_data * apop_varad_var(data, NULL);
    Apop_stopif(!data, return NULL, 1, "You gave me NULL data to sort. Returning NULL");
    apop_data * apop_varad_var(sort_order, NULL);
    char apop_varad_var(inplace, 'y');
    char apop_varad_var(asc, 'a');
    double * apop_varad_var(col_order, NULL);
APOP_VAR_ENDHEAD
    if (!data) return NULL;

    apop_data *out = inplace=='n' ? apop_data_copy(data) : data;

    apop_data *xx = sort_order ? sort_order : out;
    Get_vmsizes(xx); //firstcol, msize2
    int cols_to_sort_ct = msize2 - firstcol +1 + !!(xx->weights) + xx->textsize[1] + !!xx->names->rowct;
    double so[cols_to_sort_ct];
    if (!col_order){
        generate_sort_order(out, sort_order, cols_to_sort_ct, so);
        col_order = so;
    }

    bool is_text = ((int)*col_order != *col_order);
    bool is_name = (*col_order == 0.2);

    gsl_vector_view c;
    gsl_vector *cc = NULL;
    if (!is_text && *col_order>=0){
        c = gsl_matrix_column(out->matrix, *col_order);
        cc = &c.vector;
    }
    gsl_vector *thiscol =   cc               ? cc
                          : (*col_order==-2) ? out->weights
                          : (*col_order==-1) ? out->vector
                                             : NULL;

    size_t height = thiscol   ? thiscol->size
                    : is_name ? out->names->rowct
                              : *out->textsize;

    gsl_permutation *p = gsl_permutation_alloc(height);
    if (!is_text) gsl_sort_vector_index (p, thiscol);
    else {
        gsl_permutation_init(p);
        d = out;
        offset = is_name ? -1 : *col_order-0.5;        
        qsort(p->data, height, sizeof(size_t), compare_strings);
    }

    size_t *perm = p->data;
    if (asc=='d' || asc=='D') //reverse the perm matrix.
        for (size_t j=0; j< height/2; j++){
            double t         = perm[j];
            perm[j]          = perm[height-1-j];
            perm[height-1-j] = t;
        }
    rearrange(out, height, perm);
    gsl_permutation_free(p);
    if (col_order[1] == -100) return out;

    /*Second pass:
    find blocks where all are of the same value.
    After you pass a block of size > 1 row where all vals in this col are identical,
    sort that block, using the rest of the sort order. */
    int bottom=0;
    if (!is_text){
        double last_val = gsl_vector_get(thiscol, 0);
        for (int i=1; i< height+1; i++){
            double this_val=0;
            if ((i==height || (this_val=gsl_vector_get(thiscol, i)) != last_val) 
                    && bottom != i-1){
                apop_data_sort_base(Apop_rs(out, bottom, i-bottom), sort_order, 'a', 'y', col_order+1);
            }
            if (last_val != this_val) bottom = i;
            last_val = this_val;
        }
    } else {
        char *last_val =  is_name ? out->names->row[0] : out->text[0][(int)(*col_order-0.5)];
        for (int i=1; i< height+1; i++){
            char *this_val = i==height ? NULL : is_name ? out->names->row[i] : out->text[i][(int)(*col_order-0.5)];
            if ((i==height || strcasecmp(this_val, last_val)) 
                    && bottom != i-1){
                apop_data_sort_base(Apop_rs(out, bottom, i-bottom), sort_order, 'a', 'y', col_order+1);
            }
            if (this_val && strcmp(last_val, this_val)) bottom = i;
            last_val = this_val;
        }
    }
    return out;
}
Ejemplo n.º 12
0
apop_data* multiple_imputation_variance_base(multiple_imputation_variance_t in){
    /*The first half of this is filling in the values. In an attempt at versatility, I allow users to 
      give any named column, be it numeric or text, for every piece of input info. That means a whole lot 
      of checking around to determine what goes where---and a macro.  */

    Apop_assert_c(in.base_data,NULL, 1, "It doesn't make sense to impute over a NULL data set.");
    Apop_assert_c(in.fill_ins, NULL, 1, "Didn't receive a fill-in table. Returning NULL.");
    data_to_data stat = in.stat? in.stat : colmeans;

//At the end of this macro, you've got rowcol and rowtype, valuecol and valuetype, &c.
#define apop_setup_one_colthing(c) \
    int c##col = apop_name_find(in.fill_ins->names, in.c##_name, 'c');   \
    int c##type = 'd';         \
    if (c##col==-2){           \
        c##col = apop_name_find(in.fill_ins->names, in.c##_name, 't');   \
        c##type = 't';         \
       Apop_assert(c##col!=-2, "I couldn't find the c##_name %s in the column/text names of your fill_in table.", in.c##_name);    \
    }

    apop_setup_one_colthing(row)
    apop_setup_one_colthing(col)
    apop_setup_one_colthing(value)
    apop_setup_one_colthing(imputation)

    Apop_assert(!(rowtype=='t' && !in.base_data->names->rowct),
            "the rowname you gave refers to text, so I will be searching for a row name in the base data."
            " But the base_data set has no row names.");
    Apop_assert(!(coltype=='t' && !in.base_data->names->colct),
            "the colname you gave refers to text, so I will be searching for a column name in the base data."
            " But the base_data set has no column names.");

    //get a list of unique imputation markers.
    gsl_vector *imps = NULL;
    apop_data *impt = NULL; 
    if (imputationtype == 'd'){
        Apop_col_v(in.fill_ins, imputationcol, ic);
        imps = apop_vector_unique_elements(ic);
    } else impt = apop_text_unique_elements(in.fill_ins, imputationcol);

    int len = imps ? imps->size : impt->textsize[0];
    int thisimp=-2; char *thisimpt=NULL;
	apop_data *estimates[len];
    for (int impctr=0; impctr< len; impctr++){
        if (imps) thisimp  = gsl_vector_get(imps, impctr);
        else      thisimpt = impt->text[impctr][0];
        Get_vmsizes(in.fill_ins); //masxize
        int fillsize = maxsize ? maxsize : in.fill_ins->textsize[0];
        for (int i=0; i< fillsize; i++){
            if (!(thisimpt && apop_strcmp(in.fill_ins->text[i][imputationcol], thisimpt))
                && !(imps && thisimp==apop_data_get(in.fill_ins, i, imputationcol)))
                continue;
            int thisrow = (rowtype=='d') ? 
                                apop_data_get(in.fill_ins, i, rowcol)
                               :apop_name_find(in.base_data->names, in.fill_ins->text[i][rowcol], 'r');
            int thiscol = (coltype=='d') ? 
                                apop_data_get(in.fill_ins, i, colcol)
                               :apop_name_find(in.base_data->names, in.fill_ins->text[i][colcol], 'c');
            if (valuetype=='d') apop_data_set(in.base_data, thisrow, thiscol, 
                                            apop_data_get(in.fill_ins, i, valuecol));
            else apop_text_add(in.base_data, rowcol, colcol, in.fill_ins->text[i][valuecol]);
        }
        //OK, base_data is now filled in. Estimate the statistic for it.
		estimates[impctr] = stat(in.base_data);
    }


    //Part II: find the mean of the statistics and the total variance of the cov matrix.
	gsl_vector *vals = gsl_vector_alloc(len);
    apop_data *out = apop_data_copy(estimates[0]);
	//take the simple mean of the main data set.
	{ //this limits the scope of the Get_vmsizes macro.
	 Get_vmsizes(estimates[0]); 
     for (int j=0; j < msize2; j++)
         for (int i=0; i < (vsize ? vsize : msize1); i++){
            for (int k=0; k< len; k++)
                gsl_vector_set(vals, k, apop_data_get(estimates[k], i, j));
             apop_data_set(out, i, j, apop_vector_mean(vals));
         }
	}
    apop_data *out_var = apop_data_get_page(estimates[0], "<Covariance>");
    int cov_is_labelled = out_var !=NULL;
    if (!cov_is_labelled){
        asprintf(&out->more->names->title, "<Covariance>");
        out_var = estimates[0]->more;
    }
	Get_vmsizes(out_var);
    for (int i=0; i < msize1; i++)
        for (int j=i; j < msize2; j++){
            for (int k=0; k< len; k++){
                apop_data *this_p = cov_is_labelled ? apop_data_get_page(estimates[k], "<Covariance>")
                                        : estimates[k]->more;
                gsl_vector_set(vals, k, apop_data_get(this_p, i, j));
            }
            double total_var = apop_vector_mean(vals) + apop_var(vals)/(1+1./len);
            apop_data_set(out_var, i, j, total_var);
            if (j != i)
                apop_data_set(out_var, j, i, total_var);
        }
    return out;	
}
Ejemplo n.º 13
0
/* This function prettyprints the \c apop_data set to a screen.

It is currently not in the documentation. It'd be nice to merge this w/apop_data_print.

This takes a lot of machinery. I write every last element to a text array, then measure column widths, then print to screen with padding to guarantee that everything lines up.  There's no way to have the first element of a column line up with the last unless you interrogate the width of every element in the column, so printing columns really can't be a one-pass process.

So, I produce an \ref apop_data set with no numeric elements and a text element to be
filled with the input data set, and then print that. That means that I'll be using
(more than) twice the memory to print this. If this is a problem, you can use \ref
apop_data_print to dump your data to a text file, and view the text file, or print
subsets.

For more machine-readable printing, see \ref apop_data_print.
*/
void apop_data_show(const apop_data *in){
    if (!in) {printf("NULL\n"); return;}
    Get_vmsizes(in) //vsize, msize1, msize2, tsize
//Take inventory and get sizes
    size_t hasrownames = (in->names && in->names->rowct) ? 1 : 0;
    size_t hascolnames = in->names && 
                    (in->names->vector || in->names->colct || in->names->textct);
    size_t hasweights = (in->weights != NULL);

    size_t outsize_r = GSL_MAX(in->matrix ? in->matrix->size1 : 0, in->vector ? in->vector->size: 0);
    outsize_r = GSL_MAX(outsize_r, in->textsize[0]);
    outsize_r = GSL_MAX(outsize_r, wsize);
    if (in->names) outsize_r = GSL_MAX(outsize_r, in->names->rowct);
    outsize_r += hascolnames;

    size_t outsize_c = msize2;
    outsize_c += in->textsize[1];
    outsize_c += (vsize>0);
    outsize_c += (wsize>0);
    outsize_c += hasrownames + hasweights;

//Write to the printout data set.
    apop_data *printout = apop_text_alloc(NULL , outsize_r, outsize_c);
    if (hasrownames)
        for (size_t i=0; i < in->names->rowct; i ++)
            apop_text_set(printout, i + hascolnames, 0, "%s", in->names->row[i]);
    for (size_t i=0; i < vsize; i ++) //vsize may be zero.
        apop_text_set(printout, i + hascolnames, hasrownames, "%g", gsl_vector_get(in->vector, i));
    for (size_t i=0; i < msize1; i ++) //msize1 may be zero.
        for (size_t j=0; j < msize2; j ++)
            apop_text_set(printout, i + hascolnames, hasrownames + (vsize >0)+ j, "%g", gsl_matrix_get(in->matrix, i, j));
    if (in->textsize[0])
        for (size_t i=0; i < in->textsize[0]; i ++)
            for (size_t j=0; j < in->textsize[1]; j ++)
                apop_text_set(printout, i + hascolnames, hasrownames + (vsize>0)+ msize2 + j, "%s", in->text[i][j]);
    if (hasweights)
        for (size_t i=0; i < in->weights->size; i ++)
            apop_text_set(printout, i + hascolnames, outsize_c-1, "%g", gsl_vector_get(in->weights, i));

//column names
    if (hascolnames){
        if (vsize && in->names->vector)
            apop_text_set(printout, 0 , hasrownames, "%s", in->names->vector);
        if (msize2 && in->names)
            for (size_t i=0; i < in->names->colct; i ++)
                apop_text_set(printout, 0 , hasrownames + (vsize>0) + i, "%s", in->names->col[i]);
        if (in->textsize[1] && in->names)
            for (size_t i=0; i < in->names->textct; i ++)
                apop_text_set(printout, 0 , hasrownames + (vsize>0) + msize2 + i, "%s", in->names->text[i]);
        if (hasweights)
            apop_text_set(printout, 0 , outsize_c-1, "Weights");
    }

//get column sizes
    int colsizes[outsize_c];
    for (size_t i=0; i < outsize_c; i ++){
        colsizes[i] = strlen(printout->text[0][i]);
        for (size_t j=1; j < outsize_r; j ++)
            colsizes[i] = GSL_MAX(colsizes[i], strlen(printout->text[j][i]));
    }

//Finally, print
    if (in->names && in->names->title && strlen(in->names->title))
        printf("\t%s\n\n", in->names->title);
    for (size_t j=0; j < outsize_r; j ++){
        for (size_t i=0; i < outsize_c; i ++){
            white_pad(colsizes[i] - strlen(printout->text[j][i]) + 1);//one spare space.
            printf("%s", printout->text[j][i]);
            if (i > 0 && i< outsize_c-1) 
                printf(" %s ", apop_opts.output_delimiter);
        }
        printf("\n");
    }

    if (in->more) {
        printf("\n");
        apop_data_show(in->more);
    }
    apop_data_free(printout);
}