예제 #1
0
 /** Bootstrap the covariance matrix of a model's parameter estimates.

 Repeatedly builds a resampled data set by drawing rows of \c data with
 replacement, re-estimates a copy of \c model on it, stores the packed
 parameter vector as one row of a matrix of draws, and finally returns the
 covariance of those rows.

 \param data        The data set to resample; one observation per row.
 \param model       The model to estimate. It is copied; the input is not modified here.
 \param rng         Random number generator used to pick rows.
 \param iterations  Number of parameter draws to collect.
 \param keep_boots  Not referenced in this function body.
 \param ignore_nans If 'y', a draw whose packed parameters sum to NaN is discarded
                    and retried; the retry budget is \c iterations discards.
                    Otherwise the draw is stored as-is, NaNs included.
 \param boot_store  If not \c NULL, receives the matrix of draws (the caller then
                    owns it); otherwise the matrix is freed before returning.

 \return The covariance of the stored draws. <tt>error=='N'</tt> is set on the
         result when the NaN budget was exhausted; if no draw could be stored
         at all, an otherwise empty data set with <tt>error=='N'</tt> is returned.
 */
 apop_data * apop_bootstrap_cov_base(apop_data * data, apop_model *model, gsl_rng *rng, int iterations, char keep_boots, char ignore_nans, apop_data **boot_store){
#endif
    Get_vmsizes(data); //vsize, msize1, msize2
    apop_model *e = apop_model_copy(model);
    apop_data *subset = apop_data_copy(data);
    apop_data *array_of_boots = NULL,
              *summary;
    //Prevent an infinite regression of covariance calculation.
    Apop_model_add_group(e, apop_parts_wanted); //default wants for nothing.
    size_t i, nan_draws=0;
    //Detach the names while resampling so the row copies below skip them; restored after the loop.
    apop_name *tmpnames = (data && data->names) ? data->names : NULL;
    if (data && data->names) data->names = NULL;

    int height = GSL_MAX(msize1, GSL_MAX(vsize, (data?(*data->textsize):0)));
    for (i=0; i<iterations && nan_draws < iterations; i++){
        for (size_t j=0; j< height; j++){       //create the data set
            size_t randrow = gsl_rng_uniform_int(rng, height);
            apop_data_memcpy(Apop_r(subset, j), Apop_r(data, randrow));
        }
        //get the parameter estimates.
        apop_model *est = apop_estimate(subset, e);
        gsl_vector *estp = apop_data_pack(est->parameters);
        if (ignore_nans=='y' && gsl_isnan(apop_sum(estp))){
            //Discard this draw and redo slot i. When i is 0 the unsigned
            //wrap-around is undone by the loop's i++.
            i--;
            nan_draws++;
        } else {
            //Allocate on the first draw that is actually kept, not only when
            //i==0: the first kept draw may land in any slot, and a later
            //write must never find a NULL matrix.
            if (!array_of_boots){
                array_of_boots = apop_data_alloc(iterations, estp->size);
                apop_name_stack(array_of_boots->names, est->parameters->names, 'c', 'v');
                apop_name_stack(array_of_boots->names, est->parameters->names, 'c', 'c');
                apop_name_stack(array_of_boots->names, est->parameters->names, 'c', 'r');
            }
            //Kept draws are always written, so no row is left uninitialized.
            gsl_matrix_set_row(array_of_boots->matrix, i, estp);
        }
        apop_model_free(est);
        gsl_vector_free(estp);
    }
    if(data) data->names = tmpnames;
    apop_data_free(subset);
    apop_model_free(e);
    int set_error=0;
    //No draw was ever stored (all discarded, or zero iterations requested).
    Apop_stopif(!array_of_boots, apop_return_data_error(N),
                1, "I ran into %i NaNs and no not-NaN estimations, and so stopped. "
                       , iterations);
    //Budget exhausted part-way: trim the matrix to the i rows that were filled.
    Apop_stopif(nan_draws == iterations,  set_error++;
            apop_matrix_realloc(array_of_boots->matrix, i, array_of_boots->matrix->size2),
                1, "I ran into %i NaNs, and so stopped. Returning results based "
                       "on %zu bootstrap iterations.", iterations, i);
    summary = apop_data_covariance(array_of_boots);
    if (boot_store) *boot_store = array_of_boots;
    else            apop_data_free(array_of_boots);
    if (set_error) summary->error = 'N';
    return summary;
}
예제 #2
0
/* Estimate OLS on the reduced data set, project the file-level ols_data
   through that fit, and return the sum of squared differences between the new
   projection and the file-level `predicted`, divided by p_dot_mse.
   (Presumably a Cook's-distance style statistic, going by the name.) */
static double cook_math(apop_data *reduced){
  apop_model *fit = apop_estimate(reduced, apop_ols);
  apop_data *refit_prediction = project(ols_data, fit);
  const double scaled_diff = sum_squared_diff(refit_prediction->vector, predicted)/p_dot_mse;

  /* Both intermediates are local to this call; release them before returning. */
  apop_data_free(refit_prediction);
  apop_model_free(fit);
  return scaled_diff;
}
예제 #3
0
파일: iv.c 프로젝트: RayRacine/Apophenia
/* Add noise to every element of a vector, in place.

   in    The vector to perturb; each element is incremented by one draw.
   r     Random number generator handed to apop_draw.
   size  Second parameter given to the apop_normal noise model (the first is 0).
*/
void add_noise(gsl_vector *in, gsl_rng *r, double size){
    apop_model *nnoise = apop_model_set_parameters(apop_normal, 0, size);
    //in->size is a size_t, so the index is a size_t too (no signed/unsigned comparison).
    for (size_t i=0; i< in->size; i++){
        double noise;
        apop_draw(&noise, r, nnoise);
        apop_vector_increment(in, i, noise);
    }
    apop_model_free(nnoise);
}
예제 #4
0
/* The probability of d under m, obtained indirectly: make 100 draws from m,
   build a kernel density model over those draws (uniform kernel, set_midpoint
   as the setter), and evaluate apop_p of d against the smoothed model. */
long double p(apop_data *d, apop_model *m){
    const int n_draws = 100;
    apop_data *sample = apop_model_draws(m, n_draws);
    apop_model *kde = apop_model_copy_set(apop_kernel_density, apop_kernel_density,
            .base_data =sample, .kernel=apop_uniform, .set_fn=set_midpoint);

    double likelihood = apop_p(d, kde);

    //The draws and the smoothed model are only needed for this one evaluation.
    apop_data_free(sample);
    apop_model_free(kde);
    return likelihood;
}
예제 #5
0
/** Give me a data set and a model, and I'll give you the jackknifed covariance matrix of the model parameters.

The basic algorithm for the jackknife (glossing over the details): create a sequence of data
sets, each with exactly one observation removed, and then produce a new set of parameter estimates 
using that slightly shortened data set. Then, find the covariance matrix of the derived parameters.

In this implementation each leave-one-out estimate is turned into a pseudovalue,
<tt>n*(overall parameters) - (n-1)*(leave-one-out parameters)</tt>, and the returned matrix is
the covariance of the \c n pseudovalues scaled by <tt>1/(n-1)</tt>.

\li Jackknife or bootstrap? As a broad rule of thumb, the jackknife works best on models
    that are closer to linear. The worse a linear approximation does (at the given data),
    the worse the jackknife approximates the variance.

\li If the input model already has a \c parameters element, those parameters are used as
    the overall estimate; otherwise the model is first estimated on the full data set.

\li The \c names element of \c in is detached for the duration of the call and put back
    before returning.

\param in	    The data set. An \ref apop_data set where each row is a single data point.
\param model    An \ref apop_model, that will be used internally by \ref apop_estimate. It is copied, not modified.
            
\exception out->error=='n'   \c NULL input data.
\return         An \c apop_data set whose matrix element is the estimated covariance matrix of the parameters.
\see apop_bootstrap_cov

For example:
\include jack.c
*/
apop_data * apop_jackknife_cov(apop_data *in, apop_model *model){
    Apop_stopif(!in, apop_return_data_error(n), 0, "The data input can't be NULL.");
    Get_vmsizes(in); //msize1, msize2, vsize
    apop_model *e = apop_model_copy(model);
    //n = number of observations: the largest of the matrix, vector, and text row counts.
    //NOTE(review): with n==1 the final 1./(n-1.) scaling divides by zero — confirm callers guarantee n>=2.
    int i, n = GSL_MAX(msize1, GSL_MAX(vsize, in->textsize[0]));
    apop_model *overall_est = e->parameters ? e : apop_estimate(in, e);//if not estimated, do so
    gsl_vector *overall_params = apop_data_pack(overall_est->parameters);
    gsl_vector_scale(overall_params, n); //do it just once: this is the n*overall term of every pseudovalue.
    gsl_vector *pseudoval = gsl_vector_alloc(overall_params->size);

    //Copy the original, minus the first row: this is the leave-out-row-0 data set.
    apop_data *subset = apop_data_copy(Apop_rs(in, 1, n-1));
    apop_name *tmpnames = in->names; 
    in->names = NULL;  //save on some copying below; restored after the loop.

    //One row of pseudovalues per omitted observation.
    apop_data *array_of_boots = apop_data_alloc(n, overall_params->size);

    for(i = -1; i< n-1; i++){
        //On pass i=-1 the subset already omits row 0. On each later pass, copy
        //row i of the input over position i of the subset; the subset then holds
        //every input row except row i+1.
        if (i >= 0) apop_data_memcpy(Apop_r(subset, i), Apop_r(in, i));
        apop_model *est = apop_estimate(subset, e);
        gsl_vector *estp = apop_data_pack(est->parameters);
        gsl_vector_memcpy(pseudoval, overall_params);// *n above.
        gsl_vector_scale(estp, n-1);
        gsl_vector_sub(pseudoval, estp); //pseudoval = n*overall - (n-1)*leave-one-out
        gsl_matrix_set_row(array_of_boots->matrix, i+1, pseudoval); //row i+1 <=> observation i+1 omitted
        apop_model_free(est);
        gsl_vector_free(estp);
    }
    in->names = tmpnames;
    apop_data *out = apop_data_covariance(array_of_boots);
    gsl_matrix_scale(out->matrix, 1./(n-1.));
    apop_data_free(subset);
    gsl_vector_free(pseudoval);
    apop_data_free(array_of_boots);
    //overall_est is either e itself or a separate estimate; free it only in the latter case.
    if (e!=overall_est)
        apop_model_free(overall_est);
    apop_model_free(e);
    gsl_vector_free(overall_params);
    return out;
}
예제 #6
0
/* Run the simulation model 100 times with parameters drawn from a prior, fit
   the file-level `weibull` model to each run's 1000 draws, and return an
   apop_pmf model estimated over the 100 fitted parameter vectors.

   sim  The simulation model, passed by value. Its parameter vector is reached
        through the sim.parameters pointer and is overwritten on every draw;
        sim.dsize is set from the second drawn parameter.

   The prior is the cross of two apop_normal models, each set with parameters
   (10, 2). Draws are rejected while params[1]*2 > params[0]^2.

   The returned PMF is estimated on the table of fitted parameters, which is
   therefore left allocated when this function returns.
*/
apop_model *fuzz(apop_model sim){
    int draws = 100;
    gsl_rng *r = apop_rng_alloc(1);
    apop_model *prior = apop_model_cross(
                            apop_model_set_parameters(apop_normal, 10, 2),
                            apop_model_set_parameters(apop_normal, 10, 2));
    apop_data *outdata = apop_data_alloc(draws, weibull->vsize);
    double *params = sim.parameters->vector->data;
    for (int i=0; i< draws; i++){
        do {
            apop_draw(params, r, prior);
        } while (params[1]*2 > pow(params[0], 2));
        sim.dsize=params[1];
        //Hold the simulated draws in a variable so they can be released;
        //passing the call result straight to apop_estimate leaked them.
        apop_data *sim_draws = apop_model_draws(&sim, 1000);
        apop_model *est = apop_estimate(sim_draws, weibull);
        Apop_row_v(outdata, i, onerow);
        gsl_vector_memcpy(onerow, est->parameters->vector);
        apop_model_free(est);
        apop_data_free(sim_draws);
    }
    //Neither the prior nor the RNG is used past this point.
    apop_model_free(prior);
    gsl_rng_free(r);
    return apop_estimate(outdata, apop_pmf);
}
예제 #7
0
파일: stats.c 프로젝트: b-k/luminosity
ykl_s make_yule(char const *zila, int y) {
    static gsl_matrix *indices;
    if (!indices) {
        indices = gsl_matrix_calloc(65,1);
        for (int i=0; i< 64; i++) gsl_matrix_set(indices, i,0, i);
    }
    apop_data *col = make_histo(zila, y);
    apop_data ww = (apop_data) {
        .weights=col->vector, .matrix=indices
    };
    apop_data *d = apop_data_transpose(col);
    apop_data *exp = apop_data_rank_expand(d);
    apop_model *m = apop_estimate(exp, apop_yule);
    apop_model *n = apop_estimate(exp, apop_lognormal);
    ykl_s out = (ykl_s) {
        .yule=apop_data_get(m->parameters, .col=-1/*, .rowname="mu"*/),
         .ln=apop_data_get(n->parameters, .col=-1/*, .rowname="mu"*/),
          .lnstderr=sqrt(apop_data_get(n->parameters, .col=-1, .row=1/*, .rowname="mu"*/)),
           .kl = apop_kl_divergence(apop_estimate(&ww, apop_pmf), m),
            .lnkl = apop_kl_divergence(apop_estimate(&ww, apop_pmf), n),
             .mean = apop_matrix_mean(col->matrix)
    };
    apop_data_free(d);
    apop_data_free(exp);
    apop_model_free(m);
    return out;
}

int main() {
    printf("zila|year|yule_p|kl_div|mu|ln_mu|ln_sigma|ln_kl\n");
    apop_db_open("b.db");
    apop_data *zilas = apop_query_to_text("select admname from ppl");
    for (int i=0; i< *zilas->textsize; i++)
        for (int y=2001; y<= 2005; y++) {
            ykl_s ykl = make_yule(*zilas->text[i], y);
            printf("%20s| %i| %g| %g| %g| %g| %g|%g\n", *zilas->text[i], y, ykl.yule, ykl.kl, ykl.mean, ykl.ln, ykl.lnstderr, ykl.lnkl);
        }
    //apop_plot_histogram(m->data->weights, 64, .output_file="histo");
}
예제 #8
0
/* Estimate OLS on the reduced data set, project the file-level ols_data
   through that fit, and return the sum of squared differences between that
   projection and the file-level `predicted`, divided by p_dot_mse.
   NOTE(review): the value returned by project() is handed straight to
   sum_squared_diff and never released here — confirm who owns it. */
double cook_math(apop_data *reduced){
    apop_model *fit = apop_estimate(reduced, apop_ols);
    const double scaled_diff = sum_squared_diff(project(ols_data, fit), predicted)/p_dot_mse;

    apop_model_free(fit);
    return scaled_diff;
}