Example #1
0
/** Run the Kolmogorov-Smirnov test to determine whether two distributions are identical.

\param m1, m2  Two models, most likely of \ref apop_pmf type. I will use the cdf method, so if your function doesn't have one, expect this to run the slow default. I run it for each row of each data set, so if your model has a \c NULL at the data, I won't know what to do. 
 
\return An \ref apop_data set titled "Kolmogorov-Smirnov test", holding the largest CDF distance found ("max distance"), the \f$p\f$-value from the Kolmogorov test that the two distributions are equal ("p value, 2 tail"), and its complement ("confidence, 2 tail").

\li I assume that the data sets are sorted.
\li The CDF of each model is evaluated exactly once per data row, so a model whose CDF is estimated via random draws is not sampled twice for the same comparison.

\include ks_tests.c

\ingroup histograms
*/
apop_data *apop_test_kolmogorov(apop_model *m1, apop_model *m2){
    //version for not a pair of histograms
    Apop_assert(m1->data, "I will test the CDF at each point in the data set, but the first model has a NULL data set. "
                          "Maybe generate, then apop_data_sort, a few thousand random draws?");
    Apop_assert(m2->data, "I will test the CDF at each point in the data set, but the second model has a NULL data set. "
                          "Maybe generate, then apop_data_sort, a few thousand random draws?");
    int maxsize1, maxsize2;
    {Get_vmsizes(m1->data); maxsize1 = maxsize;}//copy one of the macro's variables 
    {Get_vmsizes(m2->data); maxsize2 = maxsize;}//  to the full function's scope.
    double largest_diff=GSL_NEGINF;
    //GSL_MAX is a macro that expands its arguments more than once, so the CDF
    //difference goes into a local first; otherwise every CDF would be evaluated twice.
    for (int i=0; i< maxsize1; i++){
        Apop_data_row(m1->data, i, arow);
        double diff = fabs(apop_cdf(arow, m1)-apop_cdf(arow, m2));
        largest_diff = GSL_MAX(largest_diff, diff);
    }
    for (int i=0; i< maxsize2; i++){     //There should be matched data rows, so there is redundancy.
        Apop_data_row(m2->data, i, arow); // Feel free to submit a smarter version.
        double diff = fabs(apop_cdf(arow, m1)-apop_cdf(arow, m2));
        largest_diff = GSL_MAX(largest_diff, diff);
    }
    apop_data *out = apop_data_alloc();
    sprintf(out->names->title, "Kolmogorov-Smirnov test");
    apop_data_add_named_elmt(out, "max distance", largest_diff);
    //ps is reported as the confidence level; its complement is the p-value.
    double ps = psmirnov2x(largest_diff, maxsize1, maxsize2);
    apop_data_add_named_elmt(out, "p value, 2 tail", 1-ps);
    apop_data_add_named_elmt(out, "confidence, 2 tail", ps);
    return out;
}
Example #2
0
/* Check that two models have similar CDFs on an evenly spaced grid.
 *
 * Walks from 0 up to (but not including) max in steps of 1/30, evaluates
 * apop_cdf for both models at each grid point, and asserts that the two
 * values differ by less than 0.18.
 *
 * m1, m2: the models whose CDFs are compared.
 * max:    exclusive upper bound of the grid.
 */
void deciles(apop_model *m1, apop_model *m2, double max){
    double width = 30;
    for (double i=0; i< max; i+=1/width){
        apop_data *x = apop_data_falloc((1), i);
        double L = apop_cdf(x, m1);
        double R = apop_cdf(x, m2);
        //Release the per-iteration data set; it used to leak on every pass.
        apop_data_free(x);
        assert(fabs(L-R) < 0.18); //wide, I know.
    }
}
Example #3
0
#include <apop.h>

/* Read the text file "data" into table "d", fit an OLS model to it, and report
 * the confidence with which the coefficient in row 1 of the parameter set
 * differs from zero, using the CDF of that parameter's distribution.
 */
int main(void){
    apop_text_to_db(.text_file="data", .tabname="d");
    apop_data *data = apop_query_to_data("select * from d");
    apop_model *est = apop_estimate(data, apop_ols);
    apop_model_show(est);

    //Ask for the distribution of the parameter at index 1 only.
    Apop_settings_add_group(est, apop_pm, .index =1);  
    apop_model *first_param_distribution = apop_parameter_model(data, est);

    Apop_row(est->parameters, 1, param);
    double area_under_p = apop_cdf(param, first_param_distribution);

    //Overwrite the parameter with zero, then take the CDF there.
    //This must happen after area_under_p has been computed.
    apop_data_set(param, 0, -1, .val=0);
    double area_under_zero = apop_cdf(param, first_param_distribution);

    //The CDF difference is a fraction; scale by 100 so that the number printed
    //matches the "percent" in the message.
    printf("reject the null for x_1 with %g percent confidence.\n",
                                 100*2*fabs(area_under_p-area_under_zero));
}
Example #4
0
int main(){
    //Set up an apop_data set with only one number.
    //Most of these functions will only look at the first data point encountered.
    apop_data *onept = apop_data_falloc((1), 23);

    apop_model *norm = apop_model_set_parameters(apop_normal, 23, 138.8);
    double val = apop_cdf(onept, norm);
    assert(fabs(val - 0.5) < 1e-4);

    double tolerance = 1e-8;
    //Macroizing the sample routine above:
    #define model_val_cdf(model, value, cdf_result) {   \
        apop_data_set(onept, .val=(value));             \
        assert(fabs((apop_cdf(onept, model))-(cdf_result))< tolerance);   \
    }

    apop_model *uni = apop_model_set_parameters(apop_uniform, 20, 26);
    model_val_cdf(uni, 0, 0);
    model_val_cdf(uni, 20, 0);
    model_val_cdf(uni, 21, 1./6);
    model_val_cdf(uni, 23, 0.5);
    model_val_cdf(uni, 25, 5./6);
    model_val_cdf(uni, 26, 1);
    model_val_cdf(uni, 260, 1);

    //Improper uniform always returns 1/2.
    model_val_cdf(apop_improper_uniform, 0, 0.5);
    model_val_cdf(apop_improper_uniform, 228, 0.5);
    model_val_cdf(apop_improper_uniform, INFINITY, 0.5);

    apop_model *binom = apop_model_set_parameters(apop_binomial, 2001, 0.5);
    model_val_cdf(binom, 0, 0);
    model_val_cdf(binom, 1000, .5);
    model_val_cdf(binom, 2000, 1);

    apop_model *bernie = apop_model_set_parameters(apop_bernoulli, 0.75);
    //p(0)=.25; p(1)=.75; that determines the CDF.
    //Notice that the CDF's integral is over a closed interval.
    model_val_cdf(bernie, -1, 0);
    model_val_cdf(bernie, 0, 0.25);
    model_val_cdf(bernie, 0.1, 0.25);
    model_val_cdf(bernie, .99, 0.25);
    model_val_cdf(bernie, 1, 1);
    model_val_cdf(bernie, INFINITY, 1);

    //alpha=beta -> symmetry
    apop_model *beta = apop_model_set_parameters(apop_beta, 2, 2);
    model_val_cdf(beta, -INFINITY, 0);
    model_val_cdf(beta, 0.5, 0.5);
    model_val_cdf(beta, INFINITY, 1);

    //This beta distribution -> uniform
    apop_model *beta_uni = apop_model_set_parameters(apop_beta, 1, 1);
    model_val_cdf(beta_uni, 0, 0);
    model_val_cdf(beta_uni, 1./6, 1./6);
    model_val_cdf(beta_uni, 0.5, 0.5);
    model_val_cdf(beta_uni, 1, 1);


    beta_uni->cdf = NULL; //With no closed-form CDF; make random draws to estimate the CDF.
    Apop_model_add_group(beta_uni, apop_cdf, .draws=1e6); //extra draws to improve accuracy, but we have to lower our tolerance anyway.
    tolerance=1e-3;
    model_val_cdf(beta_uni, 0, 0);
    model_val_cdf(beta_uni, 1./6, 1./6);
    model_val_cdf(beta_uni, 0.5, 0.5);
    model_val_cdf(beta_uni, 1, 1);


    //sum of three symmetric distributions: still symmetric.
    apop_model *sum_of_three = apop_model_mixture(beta, apop_improper_uniform, beta_uni);
    model_val_cdf(sum_of_three, 0.5, 0.5);


    apop_data *threepts = apop_data_falloc((3,1), -1, 0, 1);
    apop_model *kernels = apop_estimate(threepts, apop_kernel_density);
    model_val_cdf(kernels, -5, 0);
    model_val_cdf(kernels, 0, 0.5);
    model_val_cdf(kernels, 10, 1);
}