int main(){ int rep_ct = 10000; gsl_rng *r = apop_rng_alloc(0); apop_db_open("data-census.db"); gsl_vector *base_data = apop_query_to_vector("select in_per_capita from income where sumlevel+0.0 =40"); double RI = apop_query_to_float("select in_per_capita from income where sumlevel+0.0 =40 and geo_id2+0.0=44"); gsl_vector *boot_sample = gsl_vector_alloc(base_data->size); gsl_vector *replications = gsl_vector_alloc(rep_ct); for (int i=0; i< rep_ct; i++){ one_boot(base_data, r, boot_sample); gsl_vector_set(replications, i, apop_mean(boot_sample)); } double stderror = sqrt(apop_var(replications)); double mean = apop_mean(replications); printf("mean: %g; standard error: %g; (RI-mean)/stderr: %g; p value: %g\n", mean, stderror, (RI-mean)/stderror, 2*gsl_cdf_gaussian_Q(fabs(RI-mean), stderror)); }
/* Default estimation routine for the t distribution: fill in the mean,
 * standard deviation, and degrees of freedom from the data.
 *
 * d: the data set; both its vector and its matrix (when present) are treated
 *    as one pooled sample of tsize = vsize + msize1*msize2 observations.
 * m: the model to copy; the copy (not m itself) receives the parameters.
 *
 * Returns a copy of m whose parameters vector holds
 *   [0] the pooled mean, [1] the pooled sample standard deviation,
 *   [2] tsize-1 degrees of freedom,
 * and whose info page has a "log likelihood" element.
 */
apop_model* apop_t_estimate(apop_data *d, apop_model *m){
    Apop_assert(d, "No data with which to count df. (the default estimation method)");
    Get_vmsizes(d); //vsize, msize1, msize2, tsize
    apop_model *out = apop_model_copy(*m);

    int msize = msize1*msize2;

    // Per-part means and sums of squared deviations about each part's own mean.
    // A one-element part has zero spread; skip apop_var there to avoid 0/0.
    double vmu = vsize ? apop_mean(d->vector) : 0;
    double v_sum_sq = (vsize > 1) ? apop_var(d->vector)*(vsize-1) : 0;
    double m_sum_sq = 0;
    double mmu = 0;
    if (msize1) {
        apop_matrix_mean_and_var(d->matrix, &mmu, &m_sum_sq);
        m_sum_sq = (msize > 1) ? m_sum_sq*(msize-1) : 0;
    }

    // Pool the two parts. The total sum of squares about the pooled mean is
    // each part's own sum of squares plus n_part*(part mean - pooled mean)^2.
    // (The sums of squares must not be re-weighted by the element counts:
    // doing so inflates the standard deviation by roughly sqrt(n).)
    double mu = (vmu*vsize + mmu*msize)/tsize;
    double total_sum_sq = v_sum_sq + vsize*(vmu-mu)*(vmu-mu)
                        + m_sum_sq + msize*(mmu-mu)*(mmu-mu);

    apop_data_add_names(out->parameters, 'r', "mean", "standard deviation", "df");
    apop_data_set(out->parameters, 0, -1, mu);
    apop_data_set(out->parameters, 1, -1, sqrt(total_sum_sq/(tsize-1)));
    apop_data_set(out->parameters, 2, -1, tsize-1);
    apop_data_add_named_elmt(out->info, "log likelihood", out->log_likelihood(d, out));
    return out;
}
/* Return the mean of `in` divided by the square root of its variance
 * (as computed by apop_mean and apop_var).
 * NOTE(review): no sqrt(n) scaling is applied here; confirm that callers
 * expect mean/sd rather than mean/(sd/sqrt(n)).
 */
double find_tstat(gsl_vector *in){
    double center = apop_mean(in);
    double spread = sqrt(apop_var(in));
    return center/spread;
}
/* Combine a statistic across multiple imputations of missing data.
 *
 * For each distinct imputation marker in in.fill_ins, the fill-in values
 * carrying that marker are written into in.base_data (which is modified in
 * place), and the statistic (in.stat, or colmeans if none is given) is
 * computed on the completed data. The returned data set holds
 *   - the element-wise mean of the statistic across imputations, and
 *   - on its covariance page, the total variance: the mean of the
 *     per-imputation (within) covariances plus (1 + 1/m) times the
 *     between-imputation variance of those covariances' entries, where m is
 *     the number of imputations.
 *
 * in.row_name, in.col_name, in.value_name and in.imputation_name each name a
 * column of in.fill_ins, which may be a numeric column or a text column.
 *
 * Returns NULL if in.base_data or in.fill_ins is NULL.
 * NOTE(review): the per-imputation outputs of stat() are not freed here,
 * because their ownership cannot be determined from this function alone.
 */
apop_data* multiple_imputation_variance_base(multiple_imputation_variance_t in){
    /*The first half of this is filling in the values. In an attempt at versatility, I allow
      users to give any named column, be it numeric or text, for every piece of input info.
      That means a whole lot of checking around to determine what goes where---and a macro. */
    Apop_assert_c(in.base_data,NULL, 1, "It doesn't make sense to impute over a NULL data set.");
    Apop_assert_c(in.fill_ins, NULL, 1, "Didn't receive a fill-in table. Returning NULL.");
    data_to_data stat = in.stat? in.stat : colmeans;

    //At the end of this macro, you've got rowcol and rowtype, valuecol and valuetype, &c.
    //Each name is looked up first among the numeric column names ('d'), then the text names ('t').
#define apop_setup_one_colthing(c) \
    int c##col = apop_name_find(in.fill_ins->names, in.c##_name, 'c'); \
    int c##type = 'd'; \
    if (c##col==-2){ \
        c##col = apop_name_find(in.fill_ins->names, in.c##_name, 't'); \
        c##type = 't'; \
        Apop_assert(c##col!=-2, "I couldn't find the c##_name %s in the column/text names of your fill_in table.", in.c##_name); \
    }

    apop_setup_one_colthing(row)
    apop_setup_one_colthing(col)
    apop_setup_one_colthing(value)
    apop_setup_one_colthing(imputation)

    Apop_assert(!(rowtype=='t' && !in.base_data->names->rowct),
            "the rowname you gave refers to text, so I will be searching for a row name in the base data."
            " But the base_data set has no row names.");
    Apop_assert(!(coltype=='t' && !in.base_data->names->colct),
            "the colname you gave refers to text, so I will be searching for a column name in the base data."
            " But the base_data set has no column names.");

    //get a list of unique imputation markers.
    gsl_vector *imps = NULL;
    apop_data *impt = NULL;
    if (imputationtype == 'd'){
        Apop_col_v(in.fill_ins, imputationcol, ic);
        imps = apop_vector_unique_elements(ic);
    } else
        impt = apop_text_unique_elements(in.fill_ins, imputationcol);

    int len = imps ? imps->size : (impt ? impt->textsize[0] : 0);
    //A zero-length estimates[] array and the estimates[0] accesses below would be undefined.
    Apop_assert(len > 0, "I found no imputation markers in the fill-in table, so there is nothing to impute.");

    int thisimp=-2;
    char *thisimpt=NULL;
    apop_data *estimates[len];
    for (int impctr=0; impctr< len; impctr++){
        if (imps) thisimp = gsl_vector_get(imps, impctr);
        else      thisimpt = impt->text[impctr][0];

        Get_vmsizes(in.fill_ins); //maxsize
        int fillsize = maxsize ? maxsize : in.fill_ins->textsize[0];
        for (int i=0; i< fillsize; i++){
            //Skip fill-in rows that belong to a different imputation.
            if (!(thisimpt && apop_strcmp(in.fill_ins->text[i][imputationcol], thisimpt))
                && !(imps && thisimp==apop_data_get(in.fill_ins, i, imputationcol)))
                continue;

            //Resolve the target cell in base_data, either by number or by name.
            int thisrow = (rowtype=='d')
                            ? apop_data_get(in.fill_ins, i, rowcol)
                            : apop_name_find(in.base_data->names, in.fill_ins->text[i][rowcol], 'r');
            int thiscol = (coltype=='d')
                            ? apop_data_get(in.fill_ins, i, colcol)
                            : apop_name_find(in.base_data->names, in.fill_ins->text[i][colcol], 'c');

            if (valuetype=='d')
                apop_data_set(in.base_data, thisrow, thiscol,
                              apop_data_get(in.fill_ins, i, valuecol));
            else
                //Write to the resolved cell; rowcol/colcol are column indices
                //into the fill-in table, not coordinates in base_data.
                apop_text_add(in.base_data, thisrow, thiscol, in.fill_ins->text[i][valuecol]);
        }
        //OK, base_data is now filled in. Estimate the statistic for it.
        estimates[impctr] = stat(in.base_data);
    }

    //Part II: find the mean of the statistics and the total variance of the cov matrix.
    gsl_vector *vals = gsl_vector_alloc(len);
    apop_data *out = apop_data_copy(estimates[0]);

    //take the simple mean of the main data set.
    { //this limits the scope of the Get_vmsizes macro.
        Get_vmsizes(estimates[0]);
        for (int j=0; j < msize2; j++)
            for (int i=0; i < (vsize ? vsize : msize1); i++){
                for (int k=0; k< len; k++)
                    gsl_vector_set(vals, k, apop_data_get(estimates[k], i, j));
                apop_data_set(out, i, j, apop_vector_mean(vals));
            }
    }

    //Locate the covariance page of the data set being returned. If the statistic
    //didn't label one, the first extra page is taken to be the covariance and is titled here.
    int cov_is_labelled = apop_data_get_page(estimates[0], "<Covariance>") != NULL;
    apop_data *out_var;
    if (cov_is_labelled)
        out_var = apop_data_get_page(out, "<Covariance>");
    else {
        Apop_assert(out->more, "The statistic returned no covariance page, so I can't find the total variance.");
        asprintf(&out->more->names->title, "<Covariance>");
        out_var = out->more;
    }
    Apop_assert(out_var, "I couldn't find the covariance page in the output data set.");

    Get_vmsizes(out_var);
    for (int i=0; i < msize1; i++)
        for (int j=i; j < msize2; j++){
            for (int k=0; k< len; k++){
                apop_data *this_p = cov_is_labelled
                                      ? apop_data_get_page(estimates[k], "<Covariance>")
                                      : estimates[k]->more;
                gsl_vector_set(vals, k, apop_data_get(this_p, i, j));
            }
            //Total variance = within + (1 + 1/m) * between.
            //With one imputation there is no between-imputation variance to measure.
            double within = apop_vector_mean(vals);
            double between = (len > 1) ? apop_var(vals) : 0;
            double total_var = within + (1+1./len)*between;
            //Fill both triangles so the output stays symmetric.
            apop_data_set(out_var, i, j, total_var);
            if (j != i)
                apop_data_set(out_var, j, i, total_var);
        }

    //Release the scratch space and the marker lists allocated above.
    gsl_vector_free(vals);
    if (imps) gsl_vector_free(imps);
    if (impt) apop_data_free(impt);
    return out;
}