/* Estimate the min_distance model on a 5-by-2 data set, using the
   "NM simplex" MLE method, and display the estimated model. */
int main(){
    apop_data *points = apop_data_falloc((5, 2),
                                 1.1, 2.2,
                                 4.8, 7.4,
                                 2.9, 8.6,
                                -1.3, 3.7,
                                 2.9, 1.1);

    /* Attach the apop_mle settings group to the model before estimating. */
    Apop_model_add_group(min_distance, apop_mle,
                         .method= "NM simplex",
                         .tolerance=1e-5);

    apop_model *fitted = apop_estimate(points, min_distance);
    apop_model_show(fitted);
}
int main(){ gsl_rng *r = apop_rng_alloc(2312311); int empirical_size = 5e3; apop_model *expo = apop_model_set_parameters(apop_exponential, 1.7); assert (apop_kl_divergence(expo, expo) < 1e-4); apop_data *empirical = apop_data_alloc(empirical_size, 1); for (int i=0; i<empirical_size; i++) apop_draw(apop_data_ptr(empirical, i, 0), r, expo); apop_model *pmf = apop_estimate(empirical, apop_pmf); assert(apop_kl_divergence(pmf,expo) < 1e-4); apop_data_free(empirical); }
ykl_s make_yule(char const *zila, int y) { static gsl_matrix *indices; if (!indices) { indices = gsl_matrix_calloc(65,1); for (int i=0; i< 64; i++) gsl_matrix_set(indices, i,0, i); } apop_data *col = make_histo(zila, y); apop_data ww = (apop_data) { .weights=col->vector, .matrix=indices }; apop_data *d = apop_data_transpose(col); apop_data *exp = apop_data_rank_expand(d); apop_model *m = apop_estimate(exp, apop_yule); apop_model *n = apop_estimate(exp, apop_lognormal); ykl_s out = (ykl_s) { .yule=apop_data_get(m->parameters, .col=-1/*, .rowname="mu"*/), .ln=apop_data_get(n->parameters, .col=-1/*, .rowname="mu"*/), .lnstderr=sqrt(apop_data_get(n->parameters, .col=-1, .row=1/*, .rowname="mu"*/)), .kl = apop_kl_divergence(apop_estimate(&ww, apop_pmf), m), .lnkl = apop_kl_divergence(apop_estimate(&ww, apop_pmf), n), .mean = apop_matrix_mean(col->matrix) }; apop_data_free(d); apop_data_free(exp); apop_model_free(m); return out; } int main() { printf("zila|year|yule_p|kl_div|mu|ln_mu|ln_sigma|ln_kl\n"); apop_db_open("b.db"); apop_data *zilas = apop_query_to_text("select admname from ppl"); for (int i=0; i< *zilas->textsize; i++) for (int y=2001; y<= 2005; y++) { ykl_s ykl = make_yule(*zilas->text[i], y); printf("%20s| %i| %g| %g| %g| %g| %g|%g\n", *zilas->text[i], y, ykl.yule, ykl.kl, ykl.mean, ykl.ln, ykl.lnstderr, ykl.lnkl); } //apop_plot_histogram(m->data->weights, 64, .output_file="histo"); }
int main(){ apop_db_open("data-climate.db"); apop_data *data = apop_query_to_data("select pcp from precip"); apop_data_pmf_compress(data); //creates a weights vector apop_vector_normalize(data->weights); apop_data_sort(data); apop_model *pmf = apop_estimate(data, apop_pmf); FILE *outfile = fopen("out.h", "w"); apop_model_print(pmf, outfile); apop_model *kernel = apop_model_set_parameters(apop_normal, 0., 0.1); apop_model *k = apop_model_copy(apop_kernel_density); Apop_settings_add_group(k, apop_kernel_density, .base_pmf=pmf, .kernel=kernel); plot(k, "out.k"); printf("plot 'out.h' with lines title 'data', 'out.k' with lines title 'smoothed'\n"); }
int main(){ char outfile[] = "scatter.gplot"; apop_db_open("data-metro.db"); apop_data *data = apop_query_to_data("select riders, year from riders where station like 'Silver%%' and riders>0"); apop_db_close(); //The regression destroys your data, so copy it first. apop_data *data_copy = apop_data_copy(data); //Run OLS, display results on terminal apop_model *est = apop_estimate(data, apop_OLS); apop_model_show(est); //Prep the file with a header, then call the function. FILE *f = fopen(outfile, "w"); fprintf(f,"set term postscript;\n set output \"scatter.eps\"\n set yrange [0:*]\n"); apop_plot_line_and_scatter(data_copy, est, .output_pipe=f); fclose(f); }
int main(){ apop_model_print ( apop_estimate( apop_update( apop_model_draws( apop_model_mixture( apop_model_set_parameters(apop_poisson, 2.8), apop_model_set_parameters(apop_poisson, 2.0), apop_model_set_parameters(apop_poisson, 1.3) ), 1e4 ), truncate_model( apop_model_set_parameters(apop_normal, 2, 1), 0 ), apop_poisson )->data, apop_normal ) , NULL); }
int main(){ //bind together a Poisson and a Normal; //make a draw producing a 2-element vector apop_model *m1 = apop_model_set_parameters(apop_poisson, 3); apop_model *m2 = apop_model_set_parameters(apop_normal, -5, 1); apop_model *mm = apop_model_stack(m1, m2); int len = 1e5; gsl_rng *r = apop_rng_alloc(1); apop_data *draws = apop_data_alloc(len, 2); for (int i=0; i< len; i++){ Apop_row (draws, i, onev); apop_draw(onev->data, r, mm); assert((int)onev->data[0] == onev->data[0]); assert(onev->data[1]<0); } //The rest of the test script recovers the parameters. //First, set up a two-page data set: poisson data on p1, Normal on p2: apop_data *comeback = apop_data_alloc(); Apop_col(draws, 0,fishdraws) comeback->vector = apop_vector_copy(fishdraws); apop_data_add_page(comeback, apop_data_alloc(), "p2"); Apop_col(draws, 1, meandraws) comeback->more->vector = apop_vector_copy(meandraws); //set up the un-parameterized stacked model, including //the name at which to split the data set apop_model *estme = apop_model_stack(apop_model_copy(apop_poisson), apop_model_copy(apop_normal)); Apop_settings_add(estme, apop_stack, splitpage, "p2"); apop_model *ested = apop_estimate(comeback, *estme); //test that the parameters are as promised. apop_model *m1back = apop_settings_get(ested, apop_stack, model1); apop_model *m2back = apop_settings_get(ested, apop_stack, model2); assert(fabs(apop_data_get(m1back->parameters, .col=-1) - 3) < 1e-2); assert(fabs(apop_data_get(m2back->parameters, .col=-1) - -5) < 1e-2); assert(fabs(apop_data_get(m2back->parameters, .col=-1, .row=1) - 1) < 1e-2); }
int main(){ size_t ct = 5e4; //set up the model & params apop_data *params = apop_data_falloc((2,2,2), 8, 1, 0.5, 2, 0.5, 1); apop_model *pvm = apop_model_copy(apop_multivariate_normal); pvm->parameters = apop_data_copy(params); pvm->dsize = 2; apop_data *d = apop_model_draws(pvm, ct); //set up and estimate a model with fixed covariance matrix but free means gsl_vector_set_all(pvm->parameters->vector, GSL_NAN); apop_model *mep1 = apop_model_fix_params(pvm); apop_model *e1 = apop_estimate(d, mep1); //compare results printf("original params: "); apop_vector_print(params->vector); printf("estimated params: "); apop_vector_print(e1->parameters->vector); assert(apop_vector_distance(params->vector, e1->parameters->vector)<1e-2); }
/** Make random draws from an \ref apop_model, and bin them using a binspec in the style
 of \ref apop_data_to_bins. If you have a data set that used the same binspec, you now have
 synced histograms, which you can plot or sensibly test hypotheses about.

 The weights of the binned output are normalized (via \ref apop_vector_normalize) before
 the PMF is estimated, so the output is normalized to integrate to one.

\param binspec A description of the bins in which to place the draws; see \ref apop_data_to_bins. (default: \c NULL, handled as in \ref apop_data_to_bins.)
\param model The model to be drawn from. Because this function works via random draws, the model needs to have a \c draw method. (No default)
\param draws The number of random draws to make. (arbitrary default = 10,000)
\param bin_count If no bin spec, the number of bins to use (default: 0, which is passed on to \ref apop_data_to_bins; as per that function, \f$\sqrt{N}\f$)
\param rng The \c gsl_rng used to make random draws. (default: a function-static RNG allocated on first use from an incremented \c apop_opts.rng_seed; see note on \ref autorng)

\return An \ref apop_pmf model estimated from the binned draws.

\li This function uses the \ref designated syntax for inputs.
\ingroup histograms
*/
APOP_VAR_HEAD apop_model *apop_model_to_pmf(apop_model *model, apop_data *binspec, long int draws, int bin_count, gsl_rng *rng){
    apop_model* apop_varad_var(model, NULL);
    Apop_assert(model && model->draw, "The second argument needs to be an apop_model with a 'draw' function "
                              "that I can use to make random draws.");
    apop_data* apop_varad_var(binspec, NULL);
    int apop_varad_var(bin_count, 0);
    long int apop_varad_var(draws, 1e4);
    gsl_rng *apop_varad_var(rng, NULL)
    static gsl_rng *spare = NULL;
    if (!rng && !spare)
        spare = apop_rng_alloc(++apop_opts.rng_seed);
    if (!rng) rng = spare;
APOP_VAR_ENDHEAD
    /* NOTE(review): in the header section above, (a) the assertion message
       says "second argument" although model is the first parameter in the
       signature, and (b) the rng line has no trailing semicolon, so
       apop_varad_var presumably supplies one -- confirm both. */

    /* NOTE(review): nothing below visibly uses a name declared by this
       macro; confirm whether it is still needed. */
    Get_vmsizes(binspec);

    /* One row per draw, model->dsize columns per row. */
    apop_data *outd = apop_data_alloc(draws, model->dsize);
    for (long int i=0; i< draws; i++){
        Apop_row(outd, i, ach);
        apop_draw(ach->data, rng, model);
    }

    /* Bin the raw draws; they are no longer needed once binned. */
    apop_data *outbinned = apop_data_to_bins(outd, binspec, .bin_count=bin_count);
    apop_data_free(outd);

    /* Normalize the bin weights, then wrap the result in a PMF model. */
    apop_vector_normalize(outbinned->weights);
    return apop_estimate(outbinned, apop_pmf);
}
#include <apop.h>

/* Read the text file "data" into table d, fit OLS, and report the
   confidence with which the parameter in row 1 differs from zero, using the
   CDF of that parameter's distribution from apop_parameter_model(). */
int main(void){
    apop_text_to_db(.text_file="data", .tabname="d");
    apop_data *data = apop_query_to_data("select * from d");
    apop_model *est = apop_estimate(data, apop_ols);
    apop_model_show(est);

    /* Request the distribution of the parameter with index 1. */
    Apop_settings_add_group(est, apop_pm, .index =1);
    apop_model *first_param_distribution = apop_parameter_model(data, est);

    /* CDF at the estimated value, then at zero. */
    Apop_row(est->parameters, 1, param);
    double area_under_p = apop_cdf(param, first_param_distribution);
    apop_data_set(param, 0, -1, .val=0);
    double area_under_zero = apop_cdf(param, first_param_distribution);

    /* Twice the CDF difference is a fraction in [0, 1]; the message reports
       a percentage, so scale by 100. */
    double confidence_pct = 100*2*fabs(area_under_p-area_under_zero);
    printf("reject the null for x_1 with %g percent confidence.\n", confidence_pct);
}
/* Fit OLS to the reduced data set, project ols_data through that fit, and
   return the sum of squared differences from predicted, divided by
   p_dot_mse. The temporary fit is freed before returning. */
double cook_math(apop_data *reduced){
    apop_model *reduced_fit = apop_estimate(reduced, apop_ols);

    double distance =
        sum_squared_diff(project(ols_data, reduced_fit), predicted)/p_dot_mse;

    apop_model_free(reduced_fit);
    return distance;
}
//these work by checking that K-L divergence shrunk, and that individual margins are correct. void test_raking_further(){ apop_table_exists("rake_test", 'd'); apop_query("create table rake_test (first, second, weights);" "insert into rake_test values(1, 1, 10);" "insert into rake_test values(1, 2, 2);" "insert into rake_test values(2, 1, 15);" "insert into rake_test values(2, 2, 5);" ); //Synthetic data, starting at all ones. apop_data_print( apop_rake(.margin_table="rake_test", .count_col="weights", .contrasts=(char*[]){"first", "second"}, .contrast_ct=2), .output_file="raked", .output_type='d'); apop_model *base= apop_estimate(apop_query_to_mixed_data("mmw", "select * from rake_test"), apop_pmf); apop_model *fitted= apop_estimate(apop_query_to_mixed_data("mmw", "select * from raked"), apop_pmf); //apop_data_show(apop_query_to_data("select * from raked")); //individual margins should match. //KL divergence should be down from the initial table. #define rake_check \ Diff(apop_query_to_float("select sum(weights) from raked where first=1"), 12, 1e-4);\ Diff(apop_query_to_float("select sum(weights) from raked where first=2"), 20, 1e-4);\ Diff(apop_query_to_float("select sum(weights) from raked where second=1"), 25, 1e-4);\ Diff(apop_query_to_float("select sum(weights) from raked where second=2"), 7, 1e-4); \ assert(apop_kl_divergence(base, fitted) <= apop_kl_divergence(base, \ apop_estimate(apop_query_to_mixed_data("mmw", "select first, second, 1 from rake_test"), apop_pmf))); rake_check
Nullcheck_m(m, ); //apop_data_set(m->parameters, 0, -1, d->matrix->size1); //Start with cov matrix via mean of inputs; df=NaN apop_data_set(m->parameters, 0, -1, GSL_NAN); apop_data *summ=apop_data_summarize(d); Apop_col_t(summ, "mean", means); gsl_vector *t = m->parameters->vector; //mask this while unpacking m->parameters->vector=NULL; apop_data_unpack(means, m->parameters); m->parameters->vector=t; //Estimate a model with fixed cov matrix and blank (NaN) df. apop_model *modified_wish = apop_model_copy(m); modified_wish->log_likelihood = fixed_wishart_ll; apop_model *fixed_wish = apop_model_fix_params(modified_wish); apop_model *est_via_fix = apop_estimate(d, fixed_wish); //copy df from fixed version to the real thing; clean up. t->data[0] = apop_data_get(est_via_fix->parameters, 0, -1); gsl_matrix_scale(m->parameters->matrix, 1./t->data[0]); apop_data_free(summ); apop_model_free(modified_wish); apop_model_free(fixed_wish); } /* amodel apop_wishart The Wishart distribution, which is currently somewhat untested. Here's the likelihood function. \f$p\f$ is the dimension of the data and covariance matrix, \f$n\f$ is the degrees of freedom, \f$\mathbf{V}\f$ is the \f$p\times p\f$ matrix of Wishart parameters, and \f${\mathbf{W}}\f$ is the \f$p\times p\f$ matrix whose likelihood is being evaluated. \f$\Gamma_p(\cdot)\f$ is the \ref apop_multivariate_gamma
apop_data *rev(apop_data *in){ return apop_map(in, .fn_d=log, .part='a'); } /*The derivative of the transformed-to-base function. */ double inv(double in){return 1./in;} double rev_j(apop_data *in){ return fabs(apop_map_sum(in, .fn_d=inv, .part='a')); } int main(){ apop_model *ct = apop_model_coordinate_transform( .transformed_to_base= rev, .jacobian_to_base=rev_j, .base_model=apop_normal); //Apop_model_add_group(ct, apop_parts_wanted);//Speed up the MLE. //make fake data double mu=2, sigma=1; apop_data *d = draw_exponentiated_normal(mu, sigma, 2e5); //If we correctly replicated a Lognormal, mu and sigma will be right: apop_model *est = apop_estimate(d, ct); apop_model_free(ct); Diff(apop_data_get(est->parameters, 0), mu); Diff(apop_data_get(est->parameters, 1), sigma); /*The K-L divergence between our Lognormal and the stock Lognormal should be small. Try it with both the original params and the estimated ones. */ apop_model *ln = apop_model_set_parameters(apop_lognormal, mu, sigma); apop_model *ln2 = apop_model_copy(apop_lognormal); ln2->parameters = est->parameters; Diff(apop_kl_divergence(ln, ln2,.draw_ct=1000), 0); Diff(apop_kl_divergence(ln, est,.draw_ct=1000), 0); }
\adoc settings \ref apop_kernel_density_settings. \adoc Examples This example sets up and uses KDEs based on a Normal and a Uniform distribution. \include kernel.c */ static void apop_set_first_param(apop_data *in, apop_model *m){ m->parameters->vector->data[0] = in->vector ? in->vector->data[0] : gsl_matrix_get(in->matrix, 0, 0); } Apop_settings_init(apop_kernel_density, //If there's a PMF associated with the model, run with it. //else, generate one from the data. Apop_varad_set(base_pmf, apop_estimate(in.base_data, apop_pmf)); Apop_varad_set(kernel, apop_model_set_parameters(apop_normal, 0, 1)); Apop_varad_set(set_fn, apop_set_first_param); out->own_pmf = !in.base_pmf; out->own_kernel = !in.kernel; if (!out->kernel->parameters) apop_prep(out->base_data, out->kernel); ) Apop_settings_copy(apop_kernel_density, out->own_pmf = out->own_kernel = 0; ) Apop_settings_free(apop_kernel_density, if (in->own_pmf) apop_model_free(in->base_pmf); if (in->own_kernel) apop_model_free(in->kernel);
int main(){ //Set up an apop_data set with only one number. //Most of these functions will only look at the first data point encountered. apop_data *onept = apop_data_falloc((1), 23); apop_model *norm = apop_model_set_parameters(apop_normal, 23, 138.8); double val = apop_cdf(onept, norm); assert(fabs(val - 0.5) < 1e-4); double tolerance = 1e-8; //Macroizing the sample routine above: #define model_val_cdf(model, value, cdf_result) { \ apop_data_set(onept, .val=(value)); \ assert(fabs((apop_cdf(onept, model))-(cdf_result))< tolerance); \ } apop_model *uni = apop_model_set_parameters(apop_uniform, 20, 26); model_val_cdf(uni, 0, 0); model_val_cdf(uni, 20, 0); model_val_cdf(uni, 21, 1./6); model_val_cdf(uni, 23, 0.5); model_val_cdf(uni, 25, 5./6); model_val_cdf(uni, 26, 1); model_val_cdf(uni, 260, 1); //Improper uniform always returns 1/2. model_val_cdf(apop_improper_uniform, 0, 0.5); model_val_cdf(apop_improper_uniform, 228, 0.5); model_val_cdf(apop_improper_uniform, INFINITY, 0.5); apop_model *binom = apop_model_set_parameters(apop_binomial, 2001, 0.5); model_val_cdf(binom, 0, 0); model_val_cdf(binom, 1000, .5); model_val_cdf(binom, 2000, 1); apop_model *bernie = apop_model_set_parameters(apop_bernoulli, 0.75); //p(0)=.25; p(1)=.75; that determines the CDF. //Notice that the CDF's integral is over a closed interval. 
model_val_cdf(bernie, -1, 0); model_val_cdf(bernie, 0, 0.25); model_val_cdf(bernie, 0.1, 0.25); model_val_cdf(bernie, .99, 0.25); model_val_cdf(bernie, 1, 1); model_val_cdf(bernie, INFINITY, 1); //alpha=beta -> symmetry apop_model *beta = apop_model_set_parameters(apop_beta, 2, 2); model_val_cdf(beta, -INFINITY, 0); model_val_cdf(beta, 0.5, 0.5); model_val_cdf(beta, INFINITY, 1); //This beta distribution -> uniform apop_model *beta_uni = apop_model_set_parameters(apop_beta, 1, 1); model_val_cdf(beta_uni, 0, 0); model_val_cdf(beta_uni, 1./6, 1./6); model_val_cdf(beta_uni, 0.5, 0.5); model_val_cdf(beta_uni, 1, 1); beta_uni->cdf = NULL; //With no closed-form CDF; make random draws to estimate the CDF. Apop_model_add_group(beta_uni, apop_cdf, .draws=1e6); //extra draws to improve accuracy, but we have to lower our tolerance anyway. tolerance=1e-3; model_val_cdf(beta_uni, 0, 0); model_val_cdf(beta_uni, 1./6, 1./6); model_val_cdf(beta_uni, 0.5, 0.5); model_val_cdf(beta_uni, 1, 1); //sum of three symmetric distributions: still symmetric. apop_model *sum_of_three = apop_model_mixture(beta, apop_improper_uniform, beta_uni); model_val_cdf(sum_of_three, 0.5, 0.5); apop_data *threepts = apop_data_falloc((3,1), -1, 0, 1); apop_model *kernels = apop_estimate(threepts, apop_kernel_density); model_val_cdf(kernels, -5, 0); model_val_cdf(kernels, 0, 0.5); model_val_cdf(kernels, 10, 1); }