/* Scan the first row_ct rows of pts and return the larger of widest_so_far and the
   biggest |CDF_m1(row) - CDF_m2(row)| seen.

   Each gap is computed once and stored before the comparison: GSL_MAX is a macro that
   evaluates its arguments twice, so handing it the apop_cdf calls directly would
   recompute both CDFs whenever a new maximum turns up. */
static double ks_widest_cdf_gap(apop_data *pts, size_t row_ct, apop_model *m1, apop_model *m2, double widest_so_far){
    for (size_t r=0; r< row_ct; r++){
        Apop_data_row(pts, r, one_row);
        double gap = fabs(apop_cdf(one_row, m1)-apop_cdf(one_row, m2));
        widest_so_far = GSL_MAX(widest_so_far, gap);
    }
    return widest_so_far;
}

/** Run the Kolmogorov-Smirnov test to determine whether two distributions are identical.

 \param m1, m2  Two models, most likely of \ref apop_pmf type. I will use the cdf method, so if your function doesn't have one, expect this to run the slow default. I run it for each row of each data set, so if your model has a \c NULL at the data, I won't know what to do.

 \return An \ref apop_data set including the \f$p\f$-value from the Kolmogorov test that the two distributions are equal. The named elements are <tt>"max distance"</tt> (the largest absolute difference between the two CDFs over all rows of both data sets), <tt>"p value, 2 tail"</tt>, and <tt>"confidence, 2 tail"</tt> (one minus the p-value).

 \li I assume that the data sets are sorted.

 \include ks_tests.c

 \ingroup histograms
*/
apop_data *apop_test_kolmogorov(apop_model *m1, apop_model *m2){
    //version for not a pair of histograms
    Apop_assert(m1->data, "I will test the CDF at each point in the data set, but the first model has a NULL data set. "
                          "Maybe generate, then apop_data_sort, a few thousand random draws?");
    Apop_assert(m2->data, "I will test the CDF at each point in the data set, but the second model has a NULL data set. "
                          "Maybe generate, then apop_data_sort, a few thousand random draws?");

    //Get_vmsizes declares its own locals (maxsize among them), so each use gets a
    //private scope and the one value needed is copied out to function scope.
    int maxsize1, maxsize2;
    {Get_vmsizes(m1->data); maxsize1 = maxsize;}
    {Get_vmsizes(m2->data); maxsize2 = maxsize;}

    //Both models' CDFs are compared at every row of the first data set, then at every
    //row of the second. There should be matched data rows, so there is redundancy.
    //Feel free to submit a smarter version.
    double largest_diff = GSL_NEGINF;
    largest_diff = ks_widest_cdf_gap(m1->data, maxsize1, m1, m2, largest_diff);
    largest_diff = ks_widest_cdf_gap(m2->data, maxsize2, m1, m2, largest_diff);

    apop_data *out = apop_data_alloc();
    sprintf(out->names->title, "Kolmogorov-Smirnov test");
    apop_data_add_named_elmt(out, "max distance", largest_diff);
    double ps = psmirnov2x(largest_diff, maxsize1, maxsize2);
    apop_data_add_named_elmt(out, "p value, 2 tail", 1-ps);
    apop_data_add_named_elmt(out, "confidence, 2 tail", ps);
    return out;
}
/* Check that two models' CDFs stay close to each other on a grid of points.

   The grid starts at 0 and advances in steps of 1/30 while the point is below max.
   At each point the CDF of m1 and of m2 is evaluated, and the run aborts (via assert)
   if the two differ by 0.18 or more.

   m1, m2: the two models to compare.
   max:    exclusive upper bound of the grid. */
void deciles(apop_model *m1, apop_model *m2, double max){
    double width = 30;
    for (double i=0; i< max; i+=1/width){
        apop_data *x = apop_data_falloc((1), i);
        double L = apop_cdf(x, m1);
        double R = apop_cdf(x, m2);
        apop_data_free(x); //a fresh data set is allocated per grid point; release it so the loop doesn't leak
        assert(fabs(L-R) < 0.18); //wide, I know.
    }
}
#include <apop.h>

/* Read the text file "data" into database table "d", fit an OLS model to it, print the
   estimate, then report a confidence level for rejecting the null that the parameter in
   row 1 of the estimate is zero.

   The confidence is twice the absolute difference between the parameter model's CDF at
   the estimated value and its CDF at zero. */
int main(void){
    apop_text_to_db(.text_file="data", .tabname="d");
    apop_data *data = apop_query_to_data("select * from d");
    apop_model *est = apop_estimate(data, apop_ols);
    apop_model_show(est);

    //Ask for the distribution of a single parameter (index 1).
    Apop_settings_add_group(est, apop_pm, .index =1);
    apop_model *first_param_distribution = apop_parameter_model(data, est);

    Apop_row(est->parameters, 1, param);
    double area_under_p = apop_cdf(param, first_param_distribution);

    //Overwrite the value in param with zero and evaluate the CDF again.
    //NOTE(review): param is taken from est->parameters, so this presumably writes
    //through to the estimate itself -- confirm before reusing est after this point.
    apop_data_set(param, 0, -1, .val=0);
    double area_under_zero = apop_cdf(param, first_param_distribution);

    //A difference of CDF values is a proportion; scale by 100 so that the number
    //printed matches the word "percent" in the message.
    double confidence = 2*fabs(area_under_p-area_under_zero);
    printf("reject the null for x_1 with %g percent confidence.\n", 100*confidence);
}
int main(){ //Set up an apop_data set with only one number. //Most of these functions will only look at the first data point encountered. apop_data *onept = apop_data_falloc((1), 23); apop_model *norm = apop_model_set_parameters(apop_normal, 23, 138.8); double val = apop_cdf(onept, norm); assert(fabs(val - 0.5) < 1e-4); double tolerance = 1e-8; //Macroizing the sample routine above: #define model_val_cdf(model, value, cdf_result) { \ apop_data_set(onept, .val=(value)); \ assert(fabs((apop_cdf(onept, model))-(cdf_result))< tolerance); \ } apop_model *uni = apop_model_set_parameters(apop_uniform, 20, 26); model_val_cdf(uni, 0, 0); model_val_cdf(uni, 20, 0); model_val_cdf(uni, 21, 1./6); model_val_cdf(uni, 23, 0.5); model_val_cdf(uni, 25, 5./6); model_val_cdf(uni, 26, 1); model_val_cdf(uni, 260, 1); //Improper uniform always returns 1/2. model_val_cdf(apop_improper_uniform, 0, 0.5); model_val_cdf(apop_improper_uniform, 228, 0.5); model_val_cdf(apop_improper_uniform, INFINITY, 0.5); apop_model *binom = apop_model_set_parameters(apop_binomial, 2001, 0.5); model_val_cdf(binom, 0, 0); model_val_cdf(binom, 1000, .5); model_val_cdf(binom, 2000, 1); apop_model *bernie = apop_model_set_parameters(apop_bernoulli, 0.75); //p(0)=.25; p(1)=.75; that determines the CDF. //Notice that the CDF's integral is over a closed interval. 
model_val_cdf(bernie, -1, 0); model_val_cdf(bernie, 0, 0.25); model_val_cdf(bernie, 0.1, 0.25); model_val_cdf(bernie, .99, 0.25); model_val_cdf(bernie, 1, 1); model_val_cdf(bernie, INFINITY, 1); //alpha=beta -> symmetry apop_model *beta = apop_model_set_parameters(apop_beta, 2, 2); model_val_cdf(beta, -INFINITY, 0); model_val_cdf(beta, 0.5, 0.5); model_val_cdf(beta, INFINITY, 1); //This beta distribution -> uniform apop_model *beta_uni = apop_model_set_parameters(apop_beta, 1, 1); model_val_cdf(beta_uni, 0, 0); model_val_cdf(beta_uni, 1./6, 1./6); model_val_cdf(beta_uni, 0.5, 0.5); model_val_cdf(beta_uni, 1, 1); beta_uni->cdf = NULL; //With no closed-form CDF; make random draws to estimate the CDF. Apop_model_add_group(beta_uni, apop_cdf, .draws=1e6); //extra draws to improve accuracy, but we have to lower our tolerance anyway. tolerance=1e-3; model_val_cdf(beta_uni, 0, 0); model_val_cdf(beta_uni, 1./6, 1./6); model_val_cdf(beta_uni, 0.5, 0.5); model_val_cdf(beta_uni, 1, 1); //sum of three symmetric distributions: still symmetric. apop_model *sum_of_three = apop_model_mixture(beta, apop_improper_uniform, beta_uni); model_val_cdf(sum_of_three, 0.5, 0.5); apop_data *threepts = apop_data_falloc((3,1), -1, 0, 1); apop_model *kernels = apop_estimate(threepts, apop_kernel_density); model_val_cdf(kernels, -5, 0); model_val_cdf(kernels, 0, 0.5); model_val_cdf(kernels, 10, 1); }