void betabinom(){ apop_model *beta = apop_model_set_parameters(apop_beta, 10, 5); apop_model *drawfrom = apop_model_copy(apop_multinomial); drawfrom->parameters = apop_data_falloc((2), 30, .4); drawfrom->dsize = 2; int draw_ct = 80; apop_data *draws = apop_model_draws(drawfrom, draw_ct); apop_model *betaup = apop_update(draws, beta, apop_binomial); apop_model_show(betaup); beta->more = apop_beta; beta->log_likelihood = fake_ll; apop_model *bi = apop_model_fix_params(apop_model_set_parameters(apop_binomial, 30, NAN)); apop_model *upd = apop_update(draws, beta, bi); apop_model *betaed = apop_estimate(upd->data, apop_beta); deciles(betaed, betaup, 1); beta->log_likelihood = NULL; apop_model *upd_r = apop_update(draws, beta, bi); betaed = apop_estimate(apop_data_pmf_expand(upd_r->data, 2000), apop_beta); deciles(betaed, betaup, 1); apop_data *d2 = apop_model_draws(upd, draw_ct*2); apop_model *d2m = apop_estimate(d2, apop_beta); deciles(d2m, betaup, 1); }
void gammaexpo(){ printf("gamma/exponential\n"); apop_model *gamma = apop_model_set_parameters(apop_gamma, 1, 0.4); apop_model *drawfrom = apop_model_set_parameters(apop_exponential, 0.4); int draw_ct = 120; apop_data *draws = apop_model_draws(drawfrom, draw_ct); apop_model *gammaup = apop_update(draws, gamma, apop_exponential); apop_model_show(gammaup); gamma->more = apop_gamma; gamma->log_likelihood = fake_ll; Apop_settings_add_group(gamma, apop_mcmc, .burnin=.1, .periods=1e5, .proposal=apop_model_set_parameters(apop_normal, 1, .001)); apop_model *upd = apop_update(draws, gamma, apop_exponential); apop_model *gammaed = apop_estimate(upd->data, apop_gamma); apop_model_show(gammaed); deciles(gammaed, gammaup, 3); Apop_settings_add_group(gamma, apop_mcmc, .burnin=.1, .periods=1e5, .proposal=apop_model_set_parameters(apop_normal, 1, .01)); gamma->log_likelihood = NULL; apop_model *upd_r = apop_update(draws, gamma, apop_exponential); apop_model *gammafied2 = apop_estimate(apop_data_pmf_expand(upd_r->data, 2000), apop_gamma); deciles(gammafied2, gammaup, 5); }
int main(){ gsl_rng *r = apop_rng_alloc(2468); double binom_start = 0.6; double beta_start_a = 0.3; double beta_start_b = 0.5; int i, draws = 1500; double n = 4000; //First, the easy estimation using the conjugate distribution table. apop_model *bin = apop_model_set_parameters(apop_binomial, n, binom_start); apop_model *beta = apop_model_set_parameters(apop_beta, beta_start_a, beta_start_b); apop_model *updated = apop_update(.prior= beta, .likelihood=bin,.rng=r); //Now estimate via Gibbs sampling. //Requires a one-parameter binomial, with n fixed, //and a data set of n data points with the right p. apop_model *bcopy = apop_model_set_parameters(apop_binomial, n, GSL_NAN); apop_data *bin_draws = apop_data_fill(apop_data_alloc(1,2), n*(1-binom_start), n*binom_start); bin = apop_model_fix_params(bcopy); apop_model_add_group(beta, apop_update, .burnin=.1, .periods=1e4); apop_model *out_h = apop_update(bin_draws, beta, bin, NULL); //We now have a histogram of values for p. What's the closest beta //distribution? apop_data *d = apop_data_alloc(0, draws, 1); for(i=0; i < draws; i ++) apop_draw(apop_data_ptr(d, i, 0), r, out_h); apop_model *out_beta = apop_estimate(d, apop_beta); //Finally, we can compare the conjugate and Gibbs results: apop_vector_normalize(updated->parameters->vector); apop_vector_normalize(out_beta->parameters->vector); double error = apop_vector_distance(updated->parameters->vector, out_beta->parameters->vector, .metric='m'); double updated_size = apop_vector_sum(updated->parameters->vector); Apop_assert(error/updated_size < 0.01, "The error is %g, which is too big.", error/updated_size); }
void gammafish(){ printf("gamma/poisson\n"); apop_model *gamma = apop_model_set_parameters(apop_gamma, 1.5, 2.2); apop_model *drawfrom = apop_model_set_parameters(apop_poisson, 3.1); int draw_ct = 90; apop_data *draws = apop_model_draws(drawfrom, draw_ct); apop_model *gammaup = apop_update(draws, gamma, apop_poisson); apop_model_show(gammaup); gamma->more = apop_gamma; gamma->log_likelihood = fake_ll; apop_model *proposal = apop_model_fix_params(apop_model_set_parameters(apop_normal, NAN, 1)); proposal->parameters = apop_data_falloc((1), .9); //apop_data_set(apop_settings_get(gamma, apop_mcmc, proposal)->parameters, .val=.9); Apop_settings_add_group(gamma, apop_mcmc, .burnin=.1, .periods=1e4, .proposal=proposal); apop_model *upd = apop_update(draws, gamma, apop_poisson); apop_model *gammafied = apop_estimate(upd->data, apop_gamma); deciles(gammafied, gammaup, 5); //Apop_settings_add_group(beta, apop_mcmc, .burnin=.4, .periods=1e4); gamma->log_likelihood = NULL; apop_model *upd_r = apop_update(draws, gamma, apop_poisson); apop_model *gammafied2 = apop_estimate(apop_data_pmf_expand(upd_r->data, 2000), apop_gamma); deciles(gammafied2, gammaup, 5); deciles(gammafied, gammafied2, 5); }
//Use this function to produce test data below. apop_data *draw_exponentiated_normal(double mu, double sigma, double draws){ apop_model *n01 = apop_model_set_parameters(apop_normal, mu, sigma); apop_data *d = apop_data_alloc(draws); gsl_rng *r = apop_rng_alloc(13); for (int i=0; i< draws; i++) apop_draw(gsl_vector_ptr(d->vector,i), r, n01); apop_vector_exp(d->vector); return d; }
/** The Beta distribution is useful for modeling because it is bounded between zero and one, and can be either unimodal (if the variance is low) or bimodal (if the variance is high), and can have either a slant toward the bottom or top of the range (depending on the mean). The distribution has two parameters, typically named \f$\alpha\f$ and \f$\beta\f$, which can be difficult to interpret. However, there is a one-to-one mapping between (alpha, beta) pairs and (mean, variance) pairs. Since we have good intuition about the meaning of means and variances, this function takes in a mean and variance, calculates alpha and beta behind the scenes, and returns a random draw from the appropriate Beta distribution. \param m The mean the Beta distribution should have. Notice that m is in [0,1]. \param v The variance which the Beta distribution should have. It is in (0, 1/12), where (1/12) is the variance of a Uniform(0,1) distribution. Funny things happen with variance near 1/12 and mean far from 1/2. \return Returns an \c apop_beta model with its parameters appropriately set. */ apop_model *apop_beta_from_mean_var(double m, double v){ Apop_assert(m<1&&m > 0, "You asked for a beta distribution " "with mean %g, but the mean of the beta will always " "be strictly between zero and one.", m); double k = (m * (1- m)/ v) -1; double alpha = m*k; double beta = k * (1-m); return apop_model_set_parameters(apop_beta, alpha, beta); }
//generate a vector that is the original vector + noise void add_noise(gsl_vector *in, gsl_rng *r, double size){ apop_model *nnoise = apop_model_set_parameters(apop_normal, 0, size); for (int i=0; i< in->size; i++){ double noise; apop_draw(&noise, r, nnoise); apop_vector_increment(in, i, noise); } apop_model_free(nnoise); }
int main(){ apop_model *uniform_20 = apop_model_set_parameters(apop_uniform, 0, 20); apop_data *d = apop_model_draws(uniform_20, 10); //Estimate a Normal distribution from the data: apop_model *N = apop_estimate(d, apop_normal); print_draws(N); //estimate a one-dimensional multivariate Normal from the data: apop_model *mvN = apop_estimate(d, apop_multivariate_normal); print_draws(mvN); //fixed parameter list: apop_model *std_normal = apop_model_set_parameters(apop_normal, 0, 1); print_draws(std_normal); //variable-size parameter list: apop_model *std_multinormal = apop_model_copy(apop_multivariate_normal); std_multinormal->msize1 = std_multinormal->msize2 = std_multinormal->vsize = std_multinormal->dsize = 3; std_multinormal->parameters = apop_data_falloc((3, 3, 3), 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1); print_draws(std_multinormal); //estimate a KDE using the defaults: apop_model *k = apop_estimate(d, apop_kernel_density); print_draws(k); /*the documentation tells us that a KDE estimation consists of filling an apop_kernel_density_settings group, so we can set it to use a Normal(μ, 2) kernel via: */ apop_model *k2 = apop_model_copy_set(apop_kernel_density, apop_kernel_density, .base_data=d, .kernel = apop_model_set_parameters(apop_normal, 0, 2)); print_draws(k2); }
/* Repeatedly perturb the parameters of `sim`, re-estimate `weibull` from
 * draws off the perturbed model, and collect the estimated parameter vectors.
 *
 * sim: the simulation model, passed by value; its parameter vector is
 *      overwritten with each draw from the prior, and its dsize is set from
 *      the second drawn parameter.
 * Returns an apop_pmf model estimated from the 100 collected parameter rows.
 * NOTE(review): `weibull` is a model defined elsewhere in the file.
 */
apop_model *fuzz(apop_model sim){
    int draws = 100;
    gsl_rng *r = apop_rng_alloc(1);
    apop_model *prior = apop_model_cross(
                            apop_model_set_parameters(apop_normal, 10, 2),
                            apop_model_set_parameters(apop_normal, 10, 2));
    apop_data *outdata = apop_data_alloc(draws, weibull->vsize);
    double *params = sim.parameters->vector->data;
    for (int i=0; i< draws; i++){
        /* Redraw until params[1]*2 <= params[0]^2. */
        do {
            apop_draw(params, r, prior);
        } while (params[1]*2 > pow(params[0], 2));
        sim.dsize=params[1];

        /* Hold on to the simulated data so it can be released once the
           estimate has been copied out; otherwise every iteration leaks a
           1000-draw data set. */
        apop_data *sim_draws = apop_model_draws(&sim, 1000);
        apop_model *est = apop_estimate(sim_draws, weibull);
        Apop_row_v(outdata, i, onerow);
        gsl_vector_memcpy(onerow, est->parameters->vector);
        apop_model_free(est);
        apop_data_free(sim_draws);
    }
    gsl_rng_free(r);
    return apop_estimate(outdata, apop_pmf);
}
int main(){ gsl_rng *r = apop_rng_alloc(2312311); int empirical_size = 5e3; apop_model *expo = apop_model_set_parameters(apop_exponential, 1.7); assert (apop_kl_divergence(expo, expo) < 1e-4); apop_data *empirical = apop_data_alloc(empirical_size, 1); for (int i=0; i<empirical_size; i++) apop_draw(apop_data_ptr(empirical, i, 0), r, expo); apop_model *pmf = apop_estimate(empirical, apop_pmf); assert(apop_kl_divergence(pmf,expo) < 1e-4); apop_data_free(empirical); }
int main(){ apop_model_print ( apop_estimate( apop_update( apop_model_draws( apop_model_mixture( apop_model_set_parameters(apop_poisson, 2.8), apop_model_set_parameters(apop_poisson, 2.0), apop_model_set_parameters(apop_poisson, 1.3) ), 1e4 ), truncate_model( apop_model_set_parameters(apop_normal, 2, 1), 0 ), apop_poisson )->data, apop_normal ) , NULL); }
int main(){ //bind together a Poisson and a Normal; //make a draw producing a 2-element vector apop_model *m1 = apop_model_set_parameters(apop_poisson, 3); apop_model *m2 = apop_model_set_parameters(apop_normal, -5, 1); apop_model *mm = apop_model_stack(m1, m2); int len = 1e5; gsl_rng *r = apop_rng_alloc(1); apop_data *draws = apop_data_alloc(len, 2); for (int i=0; i< len; i++){ Apop_row (draws, i, onev); apop_draw(onev->data, r, mm); assert((int)onev->data[0] == onev->data[0]); assert(onev->data[1]<0); } //The rest of the test script recovers the parameters. //First, set up a two-page data set: poisson data on p1, Normal on p2: apop_data *comeback = apop_data_alloc(); Apop_col(draws, 0,fishdraws) comeback->vector = apop_vector_copy(fishdraws); apop_data_add_page(comeback, apop_data_alloc(), "p2"); Apop_col(draws, 1, meandraws) comeback->more->vector = apop_vector_copy(meandraws); //set up the un-parameterized stacked model, including //the name at which to split the data set apop_model *estme = apop_model_stack(apop_model_copy(apop_poisson), apop_model_copy(apop_normal)); Apop_settings_add(estme, apop_stack, splitpage, "p2"); apop_model *ested = apop_estimate(comeback, *estme); //test that the parameters are as promised. apop_model *m1back = apop_settings_get(ested, apop_stack, model1); apop_model *m2back = apop_settings_get(ested, apop_stack, model2); assert(fabs(apop_data_get(m1back->parameters, .col=-1) - 3) < 1e-2); assert(fabs(apop_data_get(m2back->parameters, .col=-1) - -5) < 1e-2); assert(fabs(apop_data_get(m2back->parameters, .col=-1, .row=1) - 1) < 1e-2); }
int main(){ apop_db_open("data-climate.db"); apop_data *data = apop_query_to_data("select pcp from precip"); apop_data_pmf_compress(data); //creates a weights vector apop_vector_normalize(data->weights); apop_data_sort(data); apop_model *pmf = apop_estimate(data, apop_pmf); FILE *outfile = fopen("out.h", "w"); apop_model_print(pmf, outfile); apop_model *kernel = apop_model_set_parameters(apop_normal, 0., 0.1); apop_model *k = apop_model_copy(apop_kernel_density); Apop_settings_add_group(k, apop_kernel_density, .base_pmf=pmf, .kernel=kernel); plot(k, "out.k"); printf("plot 'out.h' with lines title 'data', 'out.k' with lines title 'smoothed'\n"); }
\adoc Examples This example sets up and uses KDEs based on a Normal and a Uniform distribution. \include kernel.c */ static void apop_set_first_param(apop_data *in, apop_model *m){ m->parameters->vector->data[0] = in->vector ? in->vector->data[0] : gsl_matrix_get(in->matrix, 0, 0); } Apop_settings_init(apop_kernel_density, //If there's a PMF associated with the model, run with it. //else, generate one from the data. Apop_varad_set(base_pmf, apop_estimate(in.base_data, apop_pmf)); Apop_varad_set(kernel, apop_model_set_parameters(apop_normal, 0, 1)); Apop_varad_set(set_fn, apop_set_first_param); out->own_pmf = !in.base_pmf; out->own_kernel = !in.kernel; if (!out->kernel->parameters) apop_prep(out->base_data, out->kernel); ) Apop_settings_copy(apop_kernel_density, out->own_pmf = out->own_kernel = 0; ) Apop_settings_free(apop_kernel_density, if (in->own_pmf) apop_model_free(in->base_pmf); if (in->own_kernel) apop_model_free(in->kernel); )
apop_data *rev(apop_data *in){ return apop_map(in, .fn_d=log, .part='a'); } /*The derivative of the transformed-to-base function. */ double inv(double in){return 1./in;} double rev_j(apop_data *in){ return fabs(apop_map_sum(in, .fn_d=inv, .part='a')); } int main(){ apop_model *ct = apop_model_coordinate_transform( .transformed_to_base= rev, .jacobian_to_base=rev_j, .base_model=apop_normal); //Apop_model_add_group(ct, apop_parts_wanted);//Speed up the MLE. //make fake data double mu=2, sigma=1; apop_data *d = draw_exponentiated_normal(mu, sigma, 2e5); //If we correctly replicated a Lognormal, mu and sigma will be right: apop_model *est = apop_estimate(d, ct); apop_model_free(ct); Diff(apop_data_get(est->parameters, 0), mu); Diff(apop_data_get(est->parameters, 1), sigma); /*The K-L divergence between our Lognormal and the stock Lognormal should be small. Try it with both the original params and the estimated ones. */ apop_model *ln = apop_model_set_parameters(apop_lognormal, mu, sigma); apop_model *ln2 = apop_model_copy(apop_lognormal); ln2->parameters = est->parameters; Diff(apop_kl_divergence(ln, ln2,.draw_ct=1000), 0); Diff(apop_kl_divergence(ln, est,.draw_ct=1000), 0); }
/* Draw ten values from a Uniform(0, 20) model, sort them, print the sorted
 * data to stderr, and return the data set holding the draws.
 */
apop_data *draw_some_data(){
    apop_model *source = apop_model_set_parameters(apop_uniform, 0, 20);
    apop_data *sample = apop_model_draws(source, 10);

    /* Print whatever apop_data_sort hands back; the caller gets `sample`. */
    apop_data *sorted = apop_data_sort(sample);
    apop_data_print(sorted, .output_pipe=stderr);

    return sample;
}
int main(){ //Set up an apop_data set with only one number. //Most of these functions will only look at the first data point encountered. apop_data *onept = apop_data_falloc((1), 23); apop_model *norm = apop_model_set_parameters(apop_normal, 23, 138.8); double val = apop_cdf(onept, norm); assert(fabs(val - 0.5) < 1e-4); double tolerance = 1e-8; //Macroizing the sample routine above: #define model_val_cdf(model, value, cdf_result) { \ apop_data_set(onept, .val=(value)); \ assert(fabs((apop_cdf(onept, model))-(cdf_result))< tolerance); \ } apop_model *uni = apop_model_set_parameters(apop_uniform, 20, 26); model_val_cdf(uni, 0, 0); model_val_cdf(uni, 20, 0); model_val_cdf(uni, 21, 1./6); model_val_cdf(uni, 23, 0.5); model_val_cdf(uni, 25, 5./6); model_val_cdf(uni, 26, 1); model_val_cdf(uni, 260, 1); //Improper uniform always returns 1/2. model_val_cdf(apop_improper_uniform, 0, 0.5); model_val_cdf(apop_improper_uniform, 228, 0.5); model_val_cdf(apop_improper_uniform, INFINITY, 0.5); apop_model *binom = apop_model_set_parameters(apop_binomial, 2001, 0.5); model_val_cdf(binom, 0, 0); model_val_cdf(binom, 1000, .5); model_val_cdf(binom, 2000, 1); apop_model *bernie = apop_model_set_parameters(apop_bernoulli, 0.75); //p(0)=.25; p(1)=.75; that determines the CDF. //Notice that the CDF's integral is over a closed interval. 
model_val_cdf(bernie, -1, 0); model_val_cdf(bernie, 0, 0.25); model_val_cdf(bernie, 0.1, 0.25); model_val_cdf(bernie, .99, 0.25); model_val_cdf(bernie, 1, 1); model_val_cdf(bernie, INFINITY, 1); //alpha=beta -> symmetry apop_model *beta = apop_model_set_parameters(apop_beta, 2, 2); model_val_cdf(beta, -INFINITY, 0); model_val_cdf(beta, 0.5, 0.5); model_val_cdf(beta, INFINITY, 1); //This beta distribution -> uniform apop_model *beta_uni = apop_model_set_parameters(apop_beta, 1, 1); model_val_cdf(beta_uni, 0, 0); model_val_cdf(beta_uni, 1./6, 1./6); model_val_cdf(beta_uni, 0.5, 0.5); model_val_cdf(beta_uni, 1, 1); beta_uni->cdf = NULL; //With no closed-form CDF; make random draws to estimate the CDF. Apop_model_add_group(beta_uni, apop_cdf, .draws=1e6); //extra draws to improve accuracy, but we have to lower our tolerance anyway. tolerance=1e-3; model_val_cdf(beta_uni, 0, 0); model_val_cdf(beta_uni, 1./6, 1./6); model_val_cdf(beta_uni, 0.5, 0.5); model_val_cdf(beta_uni, 1, 1); //sum of three symmetric distributions: still symmetric. apop_model *sum_of_three = apop_model_mixture(beta, apop_improper_uniform, beta_uni); model_val_cdf(sum_of_three, 0.5, 0.5); apop_data *threepts = apop_data_falloc((3,1), -1, 0, 1); apop_model *kernels = apop_estimate(threepts, apop_kernel_density); model_val_cdf(kernels, -5, 0); model_val_cdf(kernels, 0, 0.5); model_val_cdf(kernels, 10, 1); }