apop_model* apop_fdist_estimate(apop_data *d, apop_model *m){ Apop_assert(d, "No data with which to count df. (the default estimation method)"); apop_name_add(m->parameters->names, "df", 'r'); apop_name_add(m->parameters->names, "df2", 'r'); apop_data_set(m->parameters, 0, -1, d->vector->size -1); apop_data_set(m->parameters, 1, -1, d->matrix->size1 * d->matrix->size2 -1); apop_data_add_named_elmt(m->info, "log likelihood", apop_f_distribution.log_likelihood(d, m)); return m; }
void FishModel::AddTiles(const FishHookTiles& f) { size_t nrows = f.size(); size_t ncols = f.NumCovariates(); // one for data, one for intercept mat = apop_data_alloc(0, nrows, (int)ncols+2); //vec, nrow, ncol // add the observed counts size_t i = 0; // row id (tile) for (i = 0; i < nrows; ++i) { float scaled_events = f.at(i).events; //scaled_events = f.at(i).covered > 0 ? scaled_events / f.at(i).covered : 0; apop_data_set(mat, i, (int)(ncols+1), scaled_events, NULL, NULL, NULL); } // set the covariates i = 0; // row id (tile) int j = 1; // column id (covariate) (0 is obs counts) for (const auto& r : f) { // loop the bins if (i == nrows) break;//debug, to break early j = 1; for (const auto& c : r) { // loop the covariates apop_data_set(mat, i, j++, c.second, NULL, NULL, NULL); } ++i; // update row interator } // add the intercept term for (i = 0; i < nrows; ++i) apop_data_set(mat, i, 0, 1.0, NULL, NULL, NULL); // add column names apop_name_add(mat->names, "events", 'c'); for (const auto& c : f[0]) apop_name_add(mat->names, c.first.c_str(), 'c'); apop_name_add(mat->names, "intercept", 'c'); // add row names std::stringstream ss; for (const auto& r : f) { ss << r.ChrName(SeqLib::BamHeader()) << ":" << r.pos1 << "-" << r.pos2; apop_name_add(mat->names, ss.str().c_str(), 'r'); ss.str(std::string()); } }
apop_data *draw_some_data(){ gsl_rng *r = apop_rng_alloc(7); apop_data *d = apop_data_alloc(0,10,1); for (int i=0; i<10; i++) apop_data_set(d, i, 0, gsl_rng_uniform(r)*20); apop_data_print(d, .output_pipe=stderr); return d; }
apop_model* apop_t_estimate(apop_data *d, apop_model *m){ Apop_assert(d, "No data with which to count df. (the default estimation method)"); Get_vmsizes(d); //vsize, msize1, msize2, tsize apop_model *out = apop_model_copy(*m); double vmu = vsize ? apop_mean(d->vector) : 0; double v_sum_sq = vsize ? apop_var(d->vector)*(vsize-1) : 0; double m_sum_sq = 0; double mmu = 0; if (msize1) { apop_matrix_mean_and_var(d->matrix, &mmu, &m_sum_sq); m_sum_sq *= msize1*msize2-1; } apop_data_add_names(out->parameters, 'r', "mean", "standard deviation", "df"); apop_data_set(out->parameters, 0, -1, (vmu *vsize + mmu * msize1*msize2)/tsize); apop_data_set(out->parameters, 1, -1, sqrt((v_sum_sq*vsize + m_sum_sq * msize1*msize2)/(tsize-1))); apop_data_set(out->parameters, 2, -1, tsize-1); apop_data_add_named_elmt(out->info, "log likelihood", out->log_likelihood(d, out)); return out; }
void plot(apop_model *k, apop_model *k2){ apop_data *onept = apop_data_alloc(0,1,1); FILE *outtab = fopen("kerneldata", "w"); for (double i=0; i<20; i+=0.01){ apop_data_set(onept,0,0, i); fprintf(outtab, "%g %g %g\n", i, apop_p(onept, k), apop_p(onept, k2)); } fclose(outtab); printf("plot 'kerneldata' using 1:2\n" "replot 'kerneldata' using 1:3\n"); }
static long double multinomial_constraint(apop_data *data, apop_model *b){ //constraint is that 0 < all elmts, and 1>all ps. int size = b->parameters->vector->size; static threadlocal apop_data *constr; if (constr && constr->matrix->size2 != size) apop_data_free(constr); if (!constr){ constr = apop_data_calloc(size*2-1, size*2-1, size); //top half: 0 < [param], including param 0 gsl_matrix_set_identity(Apop_subm(constr->matrix, 0, 0, size, size)); //bottom (almost) half: 1 >= [param], excluding param 0 for (int i=size; i < size*2-1; i++){ apop_data_set(constr, i, -1, -1); apop_data_set(constr, i, i-size+1, -1); } } return apop_linear_constraint(b->parameters->vector, constr); }
static void make_covar(apop_model *est){ int size = est->parameters->vector->size; //the trick where we turn the params into a p-vector double * pv = est->parameters->vector->data; int n = pv[0]; pv[0] = 1 - (apop_sum(est->parameters->vector)-n); apop_data *cov = apop_data_add_page(est->parameters, apop_data_alloc(size, size), "<Covariance>"); for (int i=0; i < size; i++){ double p = apop_data_get(est->parameters, i, -1); apop_data_set(cov, i, i, n * p *(1-p)); for (int j=i+1; j < size; j++){ double pj = apop_data_get(est->parameters, j, -1); double thiscell = -n*p*pj; apop_data_set(cov, i, j, thiscell); apop_data_set(cov, j, i, thiscell); } } pv[0]=n; }
apop_data *make_histo(char const *zila, int year) { apop_data *d = apop_data_calloc(65,1); for (int i=0; i< 64; i++) { char *col; asprintf(&col, "year_%i_num_intensity_%i", year, i); double val=apop_query_to_float("select %s from ppl where " "ADMName='%s'", col, zila); Apop_stopif(isnan(val), continue, 0, "couldn't find %s for %s", col, zila); apop_data_set(d, i+1, 0, val); free(col); } return d; }
static void * process_result_set_data (MYSQL *conn, MYSQL_RES *res_set) { MYSQL_ROW row; unsigned int j=0; unsigned int num_fields = mysql_num_fields(res_set); apop_data *out =apop_data_alloc(0, mysql_num_rows (res_set), num_fields); while ((row = mysql_fetch_row (res_set)) ) { for (size_t i = 0; i < mysql_num_fields (res_set); i++) apop_data_set(out, j , i, atof(row[i])); j++; } MYSQL_FIELD *fields = mysql_fetch_fields(res_set); for(size_t i = 0; i < num_fields; i++) apop_name_add(out->names, fields[i].name, 'c'); check_and_clean(apop_data_free(out)) }
/* This sample produces a dummy times table, gets a summary, and prunes the summary table. If you are not a test script, uncomment the last line to display the pruned table. */ void test_prune_cols(){ int i, j; apop_data *d = apop_data_alloc(0, 10, 4); for (i=0; i< 10; i++) for (j=0; j< 4; j++) apop_data_set(d, i, j, i*j); apop_data *summary = apop_data_summarize(d); apop_data_prune_columns(summary, "mean", "median"); assert(apop_name_find(summary->names, "mean", 'c')!=-2); assert(apop_name_find(summary->names, "median", 'c')!=-2); assert(apop_name_find(summary->names, "max", 'c')==-2); //not found assert(apop_name_find(summary->names, "variance", 'c')==-2); //not found assert(apop_data_get(summary, .row=0, .colname="mean")==0); assert(apop_data_get(summary, .row=1, .colname="median")==4); assert(apop_data_get(summary, .row=2, .colname="median")==8); //apop_data_show(summary); }
void set_uniform_edges(apop_data * r, apop_model *unif){ apop_data_set(unif->parameters, 0, -1, r->matrix->data[0]-0.5); apop_data_set(unif->parameters, 1, -1, r->matrix->data[0]+0.5); }
apop_data* multiple_imputation_variance_base(multiple_imputation_variance_t in){ /*The first half of this is filling in the values. In an attempt at versatility, I allow users to give any named column, be it numeric or text, for every piece of input info. That means a whole lot of checking around to determine what goes where---and a macro. */ Apop_assert_c(in.base_data,NULL, 1, "It doesn't make sense to impute over a NULL data set."); Apop_assert_c(in.fill_ins, NULL, 1, "Didn't receive a fill-in table. Returning NULL."); data_to_data stat = in.stat? in.stat : colmeans; //At the end of this macro, you've got rowcol and rowtype, valuecol and valuetype, &c. #define apop_setup_one_colthing(c) \ int c##col = apop_name_find(in.fill_ins->names, in.c##_name, 'c'); \ int c##type = 'd'; \ if (c##col==-2){ \ c##col = apop_name_find(in.fill_ins->names, in.c##_name, 't'); \ c##type = 't'; \ Apop_assert(c##col!=-2, "I couldn't find the c##_name %s in the column/text names of your fill_in table.", in.c##_name); \ } apop_setup_one_colthing(row) apop_setup_one_colthing(col) apop_setup_one_colthing(value) apop_setup_one_colthing(imputation) Apop_assert(!(rowtype=='t' && !in.base_data->names->rowct), "the rowname you gave refers to text, so I will be searching for a row name in the base data." " But the base_data set has no row names."); Apop_assert(!(coltype=='t' && !in.base_data->names->colct), "the colname you gave refers to text, so I will be searching for a column name in the base data." " But the base_data set has no column names."); //get a list of unique imputation markers. gsl_vector *imps = NULL; apop_data *impt = NULL; if (imputationtype == 'd'){ Apop_col_v(in.fill_ins, imputationcol, ic); imps = apop_vector_unique_elements(ic); } else impt = apop_text_unique_elements(in.fill_ins, imputationcol); int len = imps ? imps->size : impt->textsize[0]; int thisimp=-2; char *thisimpt=NULL; apop_data *estimates[len]; for (int impctr=0; impctr< len; impctr++){ if (imps) thisimp = gsl_vector_get(imps, impctr); else thisimpt = impt->text[impctr][0]; Get_vmsizes(in.fill_ins); //masxize int fillsize = maxsize ? maxsize : in.fill_ins->textsize[0]; for (int i=0; i< fillsize; i++){ if (!(thisimpt && apop_strcmp(in.fill_ins->text[i][imputationcol], thisimpt)) && !(imps && thisimp==apop_data_get(in.fill_ins, i, imputationcol))) continue; int thisrow = (rowtype=='d') ? apop_data_get(in.fill_ins, i, rowcol) :apop_name_find(in.base_data->names, in.fill_ins->text[i][rowcol], 'r'); int thiscol = (coltype=='d') ? apop_data_get(in.fill_ins, i, colcol) :apop_name_find(in.base_data->names, in.fill_ins->text[i][colcol], 'c'); if (valuetype=='d') apop_data_set(in.base_data, thisrow, thiscol, apop_data_get(in.fill_ins, i, valuecol)); else apop_text_add(in.base_data, rowcol, colcol, in.fill_ins->text[i][valuecol]); } //OK, base_data is now filled in. Estimate the statistic for it. estimates[impctr] = stat(in.base_data); } //Part II: find the mean of the statistics and the total variance of the cov matrix. gsl_vector *vals = gsl_vector_alloc(len); apop_data *out = apop_data_copy(estimates[0]); //take the simple mean of the main data set. { //this limits the scope of the Get_vmsizes macro. Get_vmsizes(estimates[0]); for (int j=0; j < msize2; j++) for (int i=0; i < (vsize ? vsize : msize1); i++){ for (int k=0; k< len; k++) gsl_vector_set(vals, k, apop_data_get(estimates[k], i, j)); apop_data_set(out, i, j, apop_vector_mean(vals)); } } apop_data *out_var = apop_data_get_page(estimates[0], "<Covariance>"); int cov_is_labelled = out_var !=NULL; if (!cov_is_labelled){ asprintf(&out->more->names->title, "<Covariance>"); out_var = estimates[0]->more; } Get_vmsizes(out_var); for (int i=0; i < msize1; i++) for (int j=i; j < msize2; j++){ for (int k=0; k< len; k++){ apop_data *this_p = cov_is_labelled ? apop_data_get_page(estimates[k], "<Covariance>") : estimates[k]->more; gsl_vector_set(vals, k, apop_data_get(this_p, i, j)); } double total_var = apop_vector_mean(vals) + apop_var(vals)/(1+1./len); apop_data_set(out_var, i, j, total_var); if (j != i) apop_data_set(out_var, j, i, total_var); } return out; }
static long double wishart_ll(apop_data *in, apop_model *m){ Nullcheck_mpd(in, m, GSL_NAN); wishartstruct_t ws = { .paraminv = apop_matrix_inverse(m->parameters->matrix), .len = sqrt(in->matrix->size2), .df = m->parameters->vector->data[0] }; double paramdet = apop_matrix_determinant(m->parameters->matrix); if (paramdet < 1e-3) return GSL_NEGINF; double ll = apop_map_sum(in, .fn_vp = one_wishart_row, .param=&ws, .part='r'); double k = log(ws.df)*ws.df/2.; k -= M_LN2 * ws.len* ws.df/2.; k -= log(paramdet) * ws.df/2.; k -= apop_multivariate_lngamma(ws.df/2., ws.len); return ll + k*in->matrix->size1; } static int apop_wishart_draw(double *out, gsl_rng *r, apop_model *m){ /* Translated from the Fortran by BK. Fortran comments: C SUBROUTINE DWSHRT(D, N, NP, NNP, SB, SA) C C ALGORITHM AS 53 APPL. STATIST. (1972) VOL.21, NO.3 C C Wishart variate generator. On output, SA is an upper-triangular C matrix of size NP * NP [...] C whose elements have a Wishart(N, SIGMA) distribution. */ Nullcheck_mp(m, ); int np = m->parameters->matrix->size1; int n = m->parameters->vector->data[0]; if (!m->more) { gsl_matrix *ccc = apop_matrix_copy(m->parameters->matrix); gsl_linalg_cholesky_decomp(ccc); for (int i=0; i < ccc->size1; i++) //zero out the upper diagonal for (int j=i+1; j < ccc->size2; j++) gsl_matrix_set(ccc, i, j, 0); m->more = apop_matrix_to_data(ccc); m->more_size = sizeof(apop_data); } apop_data *Chol = m->more; apop_data *rmatrix = apop_data_calloc(np, np); Staticdef(apop_model *, std_normal, apop_model_set_parameters(apop_normal, 0, 1)); //C Load diagonal elements with square root of chi-square variates for(int i = 0; i< np; i++){ int DF = n - i; apop_data_set(rmatrix, i, i, sqrt(gsl_ran_chisq(r, DF))); } for(int i = 1; i< np; i++) //off-diagonal triangles: Normals. for(int j = 0; j< i; j++){ double ndraw; apop_draw(&ndraw, r, std_normal); assert (!gsl_isnan(ndraw)); apop_data_set(rmatrix, i, j, ndraw); } //Now find C * rand * rand' * C' apop_data *cr = apop_dot(Chol, rmatrix); apop_data *crr = apop_dot(cr, rmatrix, .form2='t'); apop_data *crrc = apop_dot(crr, Chol, .form2='t'); memmove(out, crrc->matrix->data, sizeof(double)*np*np); apop_data_free(rmatrix); apop_data_free(cr); apop_data_free(crrc); apop_data_free(crr); return 0; }
//For centering a uniform distribution around a point. //Cut/pasted from the Apophenia documentation, but narrowed range to ±.0.8. This also took some tweaking. //The uniform is not a good choice (and ruins the covariance estimate), but the premise //was that we don't know the formula for a Normal distribution. void set_midpoint(apop_data * in, apop_model *m){ apop_data_set(m->parameters, 0, -1, apop_data_get(in)-0.07); apop_data_set(m->parameters, 1, -1, apop_data_get(in)+0.07); }
void set_uniform_edges(apop_data_row r, apop_model *unif){ apop_data_set(unif->parameters, 0, -1, r.matrix_row.data[0]-0.5); apop_data_set(unif->parameters, 1, -1, r.matrix_row.data[0]+0.5); }