示例#1
0
/** Run the Kolmogorov-Smirnov test to determine whether two distributions are identical.

\param m1, m2  Two models, most likely of \ref apop_pmf type. I will ue the cdf method, so if your function doesn't have one, expect this to run the slow default. I run it for each row of each data set, so if your model has a \c NULL at the data, I won't know what to do. 
 
\return An \ref apop_data set including the \f$p\f$-value from the Kolmogorov test that the two distributions are equal.

\li I assume that the data sets are sorted.

\include ks_tests.c

\ingroup histograms
*/
apop_data *apop_test_kolmogorov(apop_model *m1, apop_model *m2){
    //version for not a pair of histograms
    Apop_assert(m1->data, "I will test the CDF at each point in the data set, but the first model has a NULL data set. "
                          "Maybe generate, then apop_data_sort, a few thousand random draws?");
    Apop_assert(m2->data, "I will test the CDF at each point in the data set, but the second model has a NULL data set. "
                          "Maybe generate, then apop_data_sort, a few thousand random draws?");
    int maxsize1, maxsize2;
    {Get_vmsizes(m1->data); maxsize1 = maxsize;}//copy one of the macro's variables 
    {Get_vmsizes(m2->data); maxsize2 = maxsize;}//  to the full function's scope.
    double largest_diff=GSL_NEGINF;
    for (size_t i=0; i< maxsize1; i++){
        Apop_data_row(m1->data, i, arow);
        largest_diff = GSL_MAX(largest_diff, fabs(apop_cdf(arow, m1)-apop_cdf(arow, m2)));
    }
    for (size_t i=0; i< maxsize2; i++){     //There should be matched data rows, so there is redundancy.
        Apop_data_row(m2->data, i, arow); // Feel free to submit a smarter version.
        largest_diff = GSL_MAX(largest_diff, fabs(apop_cdf(arow, m1)-apop_cdf(arow, m2)));
    }
    apop_data *out = apop_data_alloc();
    sprintf(out->names->title, "Kolmogorov-Smirnov test");
    apop_data_add_named_elmt(out, "max distance", largest_diff);
    double ps = psmirnov2x(largest_diff, maxsize1, maxsize2);
    apop_data_add_named_elmt(out, "p value, 2 tail", 1-ps);
    apop_data_add_named_elmt(out, "confidence, 2 tail", ps);
    return out;
}
示例#2
0
/** Test the goodness-of-fit between two \ref apop_pmf models. 

If you send two histograms, I assume that the histograms are synced: for PMFs,
you've used \ref apop_data_to_bins to generate two histograms using the same binspec,
or you've used \ref apop_data_pmf_compress to guarantee that each observation value
appears exactly once in each data set.

In any case, you are confident that all values in the \c observed set appear in the \c
expected set with nonzero weight; otherwise this will return a \f$\chi^2\f$ statistic
of \c GSL_POSINF, indicating that it is impossible for the \c observed data to have
been drawn from the \c expected distribution.

\li If an observation row has weight zero, I skip it. if <tt>apop_opts.verbose >=1 </tt> I will show a warning.

  \ingroup histograms
*/
apop_data *apop_histograms_test_goodness_of_fit(apop_model *observed, apop_model *expected){
    int df = observed->data->weights->size;
    double diff = 0;
    for (int i=0; i< observed->data->weights->size; i++){
        Apop_data_row(observed->data, i, one_obs);
        double obs_val = gsl_vector_get(observed->data->weights, i);
        double exp_val = apop_p(one_obs, expected);
        if (exp_val == 0){
            diff = GSL_POSINF; 
            break;
        }
        if (obs_val==0){
            Apop_notify(1, "element %i of the observed data has weight zero. Skipping it.", i);
            df --;
        } else 
            diff += gsl_pow_2(obs_val - exp_val)/exp_val;
    }
    //Data gathered. Now output
    apop_data   *out    = apop_data_alloc();
    double      toptail = gsl_cdf_chisq_Q(diff, df-1);
    sprintf(out->names->title, "Goodness-of-fit test via Chi-squared statistic");
    apop_data_add_named_elmt(out, "Chi squared statistic", diff);
    apop_data_add_named_elmt(out, "df", df-1);
    apop_data_add_named_elmt(out, "p value",  toptail); 
    apop_data_add_named_elmt(out, "confidence", 1 - toptail);
    return out;
}
示例#3
0
/** This function sorts the whole of a \c apop_data set based on one column. Sorts in place, with little additional memory used.

 Uses the \c gsl_sort_vector_index function internally, and that function just ignores NaNs; therefore this function just leaves NaNs exactly where they lay.

 \param data    The input set to be modified. (No default, must not be \c NULL.)
 \param sortby  The column of data by which the sorting will take place. As usual, -1 indicates the vector element. (default: column zero of the matrix if there is a matrix; if there's a vector but no matrix, then -1).
 \param asc   If 'd' or 'D', sort in descending order; else sort in ascending order. (Default: ascending)
 \return A pointer to the data set, so you can do things like \c apop_data_show(apop_data_sort(d, -1)).

This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD apop_data * apop_data_sort(apop_data *data, int sortby, char asc){
    apop_data * apop_varad_var(data, NULL);
    apop_assert_s(data, "You gave me NULL data to sort.");
    int apop_varad_var(sortby, 0);
    if (sortby==0 && !data->matrix && data->vector) //you meant sort the vector
        sortby = -1;
    char apop_varad_var(asc, 0);
APOP_VAR_ENDHEAD
    size_t height  = (sortby==-1) ? data->vector->size: data->matrix->size1;
    size_t sorted[height];
    size_t i, *perm, start=0;
    gsl_permutation *p = gsl_permutation_alloc(height);
    memset(sorted, 0, sizeof(size_t)*height);
    if (sortby == -1)
        gsl_sort_vector_index (p, data->vector);
    else {
        APOP_COL(data, sortby, v);
        gsl_sort_vector_index (p, v);
    }
    perm = p->data;
    if (asc=='d' || asc=='D') //reverse the perm matrix.
        for (size_t j=0; j< height/2; j++){
            double t         = perm[j];
            perm[j]          = perm[height-1-j];
            perm[height-1-j] = t;
        }
    while (1){
        i     =
        start = find_min_unsorted(sorted, height, start);
        if (i==-1) 
            break;
        Apop_data_row(data, start, firstrow);
        apop_data *first_row_storage = apop_data_copy(firstrow);
        sorted[start]++;
        while (perm[i]!=start){
            //copy from perm[i] to i
            Apop_data_row(data, perm[i], onerow);
            apop_data_set_row(data, onerow, i);
            sorted[perm[i]]++;
            i = perm[i];
        }
        apop_data_set_row(data, first_row_storage, i);
        apop_data_free(first_row_storage);
    }
    gsl_permutation_free(p);
    return data;
}