int main(){
    apop_data *d = apop_text_alloc(apop_data_alloc(6), 6, 1);
    apop_data_fill(d, 1, 2, 3, 3, 1, 2);
    apop_text_fill(d, "A", "A", "A", "A", "A", "B");
    asprintf(&d->names->title, "Original data set");
    printdata(d);

    //binned, where bin ends are equidistant but not necessarily in the data
    apop_data *binned = apop_data_to_bins(d, NULL);
    asprintf(&binned->names->title, "Post binning");
    printdata(binned);
    assert(apop_sum(binned->weights)==6);
    assert(fabs(  //equal distance between bins
              (apop_data_get(binned, 1, -1) - apop_data_get(binned, 0, -1))
            - (apop_data_get(binned, 2, -1) - apop_data_get(binned, 1, -1))) < 1e-5);

    //compressed, where the data is as in the original, but weights
    //are redone to accommodate repeated observations.
    apop_data_pmf_compress(d);
    asprintf(&d->names->title, "Post compression");
    printdata(d);
    assert(apop_sum(d->weights)==6);

    apop_model *d_as_pmf = apop_estimate(d, apop_pmf);
    Apop_row(d, 0, firstrow); //1A
    assert(fabs(apop_p(firstrow, d_as_pmf) - 2./6) < 1e-5);
}
static int multinomial_rng(double *out, gsl_rng *r, apop_model* est){
    Nullcheck_mp(est, 1);
    double *p = est->parameters->vector->data;
    //the trick where we turn the params into a p-vector
    int N = p[0];
    if (est->parameters->vector->size == 2) {
        *out = gsl_ran_binomial_knuth(r, 1-gsl_vector_get(est->parameters->vector, 1), N);
        out[1] = N-*out;
        goto done;
    }
    //else, multinomial
    //cut/pasted/modded from the GSL. Copyright them.
    p[0] = 1 - (apop_sum(est->parameters->vector)-N);
    double sum_p = 0.0;
    int sum_n = 0;
    for (int i = 0; i < est->parameters->vector->size; i++) {
        out[i] = (p[i] > 0)
                    ? gsl_ran_binomial(r, p[i] / (1 - sum_p), N - sum_n)
                    : 0;
        sum_p += p[i];
        sum_n += out[i];
    }
done:
    p[0] = N;
    return 0;
}
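/* A hedged usage sketch, not part of the library source: draw one set of bin
   counts from a hand-parameterized apop_multinomial via the public apop_draw
   interface, which is assumed to dispatch to multinomial_rng above. Only
   functions already used elsewhere in this file (apop_data_alloc,
   apop_data_fill, apop_model_copy), plus apop_draw and apop_rng_alloc, are
   assumed available. */
int main(){
    apop_model *m = apop_model_copy(apop_multinomial);
    m->parameters = apop_data_alloc(3);          //parameter vector: [N, p_1, p_2]
    apop_data_fill(m->parameters, 10, .3, .2);   //N=10 draws; p_0 is implicitly 1-.3-.2=.5
    gsl_rng *r = apop_rng_alloc(23);
    double draw[3];                              //one count per bin
    apop_draw(draw, r, m);
    printf("bin counts: %g %g %g\n", draw[0], draw[1], draw[2]);
}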
static void multinomial_show(apop_model *est, FILE *out){
    double *p = est->parameters->vector->data;
    int N = p[0];
    p[0] = 1 - (apop_sum(est->parameters->vector)-N);
    fprintf(out, "%s, with %i draws.\nBin odds:\n", est->name, N);
    apop_vector_print(est->parameters->vector, .output_pipe=out);
    p[0] = N;
}
void rescale_cp_and_merge_into_candidate(apop_data *candidate, apop_data **cp, double scale_to){
    if (!*cp) return;
    double cp_weight_sum = apop_sum((*cp)->weights);
    if (cp_weight_sum) {
        gsl_vector_scale((*cp)->weights, scale_to/cp_weight_sum);
        merge_in_weights_so_far(candidate, *cp); //append cp into candidate
    }
    if (!(*cp)->weights || (candidate ? candidate->weights->size : 0) != (*cp)->weights->size)
        apop_data_free(*cp);
}
apop_data * apop_bootstrap_cov_base(apop_data * data, apop_model *model, gsl_rng *rng,
                int iterations, char keep_boots, char ignore_nans, apop_data **boot_store){
    Get_vmsizes(data); //vsize, msize1, msize2
    apop_model *e = apop_model_copy(model);
    apop_data *subset = apop_data_copy(data);
    apop_data *array_of_boots = NULL, *summary;
    //prevent an infinite regress of covariance calculations.
    Apop_model_add_group(e, apop_parts_wanted); //default wants for nothing.
    size_t i, nan_draws=0;
    apop_name *tmpnames = (data && data->names) ? data->names : NULL; //save on some copying below.
    if (data && data->names) data->names = NULL;

    int height = GSL_MAX(msize1, GSL_MAX(vsize, (data ? (*data->textsize) : 0)));
    for (i=0; i < iterations && nan_draws < iterations; i++){
        for (size_t j=0; j < height; j++){ //create the data set
            size_t randrow = gsl_rng_uniform_int(rng, height);
            apop_data_memcpy(Apop_r(subset, j), Apop_r(data, randrow));
        }
        //get the parameter estimates.
        apop_model *est = apop_estimate(subset, e);
        gsl_vector *estp = apop_data_pack(est->parameters);
        if (!gsl_isnan(apop_sum(estp))){
            if (i==0){
                array_of_boots = apop_data_alloc(iterations, estp->size);
                apop_name_stack(array_of_boots->names, est->parameters->names, 'c', 'v');
                apop_name_stack(array_of_boots->names, est->parameters->names, 'c', 'c');
                apop_name_stack(array_of_boots->names, est->parameters->names, 'c', 'r');
            }
            gsl_matrix_set_row(array_of_boots->matrix, i, estp);
        } else if (ignore_nans=='y'){
            i--;
            nan_draws++;
        }
        apop_model_free(est);
        gsl_vector_free(estp);
    }
    if (data) data->names = tmpnames;
    apop_data_free(subset);
    apop_model_free(e);
    int set_error=0;
    Apop_stopif(i == 0 && nan_draws == iterations, apop_return_data_error(N),
            1, "I ran into %i NaNs and no non-NaN estimates, and so stopped.", iterations);
    Apop_stopif(nan_draws == iterations, set_error++;
            apop_matrix_realloc(array_of_boots->matrix, i, array_of_boots->matrix->size2),
            1, "I ran into %i NaNs, and so stopped. Returning results based "
               "on %zu bootstrap iterations.", iterations, i);
    summary = apop_data_covariance(array_of_boots);
    if (boot_store) *boot_store = array_of_boots;
    else            apop_data_free(array_of_boots);
    if (set_error) summary->error = 'N';
    return summary;
}
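/* A hedged usage sketch, not part of the library source: bootstrap the
   covariance of a Normal model's (mu, sigma) estimates. It assumes the usual
   Apophenia convention that apop_bootstrap_cov is the optional-argument
   wrapper around apop_bootstrap_cov_base above, and that apop_normal and
   apop_data_show are available as in the standard headers. */
int main(){
    apop_data *d = apop_data_alloc(6);           //a six-element data vector
    apop_data_fill(d, 1, 2, 3, 3, 1, 2);
    apop_data *cov = apop_bootstrap_cov(d, apop_normal, .iterations=500);
    apop_data_show(cov);                         //2x2 covariance of the bootstrapped estimates
    apop_data_free(cov);
    apop_data_free(d);
}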
apop_data *kappa_and_pi(apop_data const *tab_in){
    apop_data *out = apop_data_alloc();
    Apop_stopif(!tab_in, out->error='n'; return out, 0, "NULL input. Returning output with 'n' error code.");
    Apop_stopif(!tab_in->matrix, out->error='m'; return out, 0, "NULL input matrix. Returning output with 'm' error code.");
    Apop_stopif(tab_in->matrix->size1 != tab_in->matrix->size2, out->error='s'; return out, 0,
            "Input rows=%zu; input cols=%zu; these need to be equal. "
            "Returning output with error code 's'.", tab_in->matrix->size1, tab_in->matrix->size2);

    apop_data *tab = apop_data_copy(tab_in);
    double total = apop_matrix_sum(tab->matrix);
    gsl_matrix_scale(tab->matrix, 1./total);
    double p_o = 0, p_e = 0, scott_pe = 0, ia = 0, row_ent = 0, col_ent = 0;
    for (int c=0; c < tab->matrix->size1; c++){
        double this_obs = apop_data_get(tab, c, c);
        p_o += this_obs;

        Apop_row_v(tab, c, row);
        Apop_col_v(tab, c, col);
        double rsum = apop_sum(row);
        double csum = apop_sum(col);
        p_e += rsum * csum;
        scott_pe += pow((rsum+csum)/2, 2);
        ia += this_obs * log2(this_obs/(rsum * csum));
        row_ent -= rsum * log2(rsum);
        col_ent -= csum * log2(csum);
    }
    apop_data_free(tab);

    asprintf(&out->names->title, "Scott's π and Cohen's κ");
    apop_data_add_named_elmt(out, "total count", total);
    apop_data_add_named_elmt(out, "percent agreement", p_o);
    apop_data_add_named_elmt(out, "κ", ((p_e==1) ? 0 : (p_o - p_e) / (1-p_e)));
    apop_data_add_named_elmt(out, "π", ((p_e==1) ? 0 : (p_o - scott_pe) / (1-scott_pe)));
    apop_data_add_named_elmt(out, "P_I", ia/((row_ent+col_ent)/2));
    apop_data_add_named_elmt(out, "Cohen's p_e", p_e);
    apop_data_add_named_elmt(out, "Scott's p_e", scott_pe);
    apop_data_add_named_elmt(out, "information in agreement", ia);
    apop_data_add_named_elmt(out, "row entropy", row_ent);
    apop_data_add_named_elmt(out, "column entropy", col_ent);
    return out;
}
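/* A hedged illustration, not part of the library source: feed a 2x2
   rater-agreement table to kappa_and_pi (not declared static above, so it is
   assumed linkable from here) and print the resulting named vector. */
int main(){
    apop_data *tab = apop_data_alloc(2, 2);
    apop_data_fill(tab, 20,  5,
                         4, 21);        //the two raters agree on 41 of 50 items
    apop_data *stats = kappa_and_pi(tab);
    apop_data_show(stats);              //total count, percent agreement, κ, π, entropies, ...
    apop_data_free(stats);
    apop_data_free(tab);
}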
static double multinomial_log_likelihood(apop_data *d, apop_model *params){
    Nullcheck_mpd(d, params, GSL_NAN);
    double *pv = params->parameters->vector->data;
    double n = pv[0];
    Apop_assert_c(pv[1] <= 1, GSL_NAN, 1,
            "The input parameters should be [n, p_1, (...)], but "
            "element 1 of the parameter vector is >1."); //mostly makes sense for the binomial.

    if (n==2) return apop_map_sum(d, .fn_vp=binomial_ll, .param=params->parameters->vector);

    //making the params a p-vector. Put n back at the end.
    pv[0] = 1 - (apop_sum(params->parameters->vector)-n);
    double out = apop_map_sum(d, .fn_vp=multinomial_ll, .param=params);
    pv[0] = n;
    return out;
}
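/* For reference (standard result, not from the source): with N draws and bin
   probabilities p_0 ... p_k, the multinomial log-likelihood of one observed
   count vector x is
       ln P(x | N, p) = ln N! - Σ_i ln x_i! + Σ_i x_i ln p_i.
   The per-row helpers binomial_ll and multinomial_ll used above are assumed
   to evaluate this after the p-vector trick restores p_0. */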
int main(){
    int len = 3000;
    gsl_vector *v = gsl_vector_alloc(len);
    for (double i=0; i < len; i++) gsl_vector_set(v, i, 1./(i+1));
    double square;
    gsl_blas_ddot(v, v, &square);
    printf("1 + (1/2)^2 + (1/3)^2 + ... = %g\n", square);
    double pi_over_six = gsl_pow_2(M_PI)/6.;
    Diff(square, pi_over_six);

    /* Now using apop_dot, in a few forms.
       First, vector-as-data dot itself.
       If one of the inputs is a vector, apop_dot puts the output in a vector-as-data: */
    apop_data *v_as_data = &(apop_data){.vector=v};
    apop_data *vdotv = apop_dot(v_as_data, v_as_data);
    Diff(gsl_vector_get(vdotv->vector, 0), pi_over_six);

    /* Wrap the matrix in an apop_data set. */
    gsl_matrix *v_as_matrix = apop_vector_to_matrix(v);
    apop_data dm = (apop_data){.matrix=v_as_matrix};

    // (1 X len) vector dot (len X 1) matrix --- produces a scalar (one-item vector).
    apop_data *mdotv = apop_dot(v_as_data, &dm);
    double scalarval = apop_data_get(mdotv);
    Diff(scalarval, pi_over_six);

    //(len X 1) dot (len X 1) --- bad dimensions.
    apop_opts.verbose=-1; //don't print an error.
    apop_data *mdotv2 = apop_dot(&dm, v_as_data);
    apop_opts.verbose=0; //back to safety.
    assert(mdotv2->error);

    // If we want (len X 1) dot (1 X len) --> (len X len),
    // use apop_vector_to_matrix.
    apop_data dmr = (apop_data){.matrix=apop_vector_to_matrix(v, .row_col='r')};
    apop_data *product_matrix = apop_dot(&dm, &dmr);
    //The trace is the sum of squares:
    gsl_vector_view trace = gsl_matrix_diagonal(product_matrix->matrix);
    double tracesum = apop_sum(&trace.vector);
    Diff(tracesum, pi_over_six);

    apop_data_free(product_matrix);
    gsl_matrix_free(dmr.matrix);
}
static double one_wishart_row(gsl_vector *in, void *ws_in){
    wishartstruct_t *ws = ws_in;
    gsl_matrix *invparams_dot_data = gsl_matrix_alloc(ws->len, ws->len);
    apop_data *square = apop_data_alloc(ws->len, ws->len);
    apop_data_unpack(in, square);
    double datadet = apop_matrix_determinant(square->matrix);
    assert(datadet);

    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1, ws->paraminv, square->matrix, 0, invparams_dot_data);
    gsl_vector_view diag = gsl_matrix_diagonal(invparams_dot_data);
    double trace = apop_sum(&diag.vector);
    gsl_matrix_free(invparams_dot_data);
    apop_data_free(square);

    double out = log(datadet) * (ws->df - ws->len - 1.)/2. - trace*ws->df/2.;
    assert(isfinite(out));
    return out;
}
static void make_covar(apop_model *est){
    int size = est->parameters->vector->size;
    //the trick where we turn the params into a p-vector
    double *pv = est->parameters->vector->data;
    int n = pv[0];
    pv[0] = 1 - (apop_sum(est->parameters->vector)-n);

    apop_data *cov = apop_data_add_page(est->parameters, apop_data_alloc(size, size), "<Covariance>");
    for (int i=0; i < size; i++){
        double p = apop_data_get(est->parameters, i, -1);
        apop_data_set(cov, i, i, n * p * (1-p));
        for (int j=i+1; j < size; j++){
            double pj = apop_data_get(est->parameters, j, -1);
            double thiscell = -n*p*pj;
            apop_data_set(cov, i, j, thiscell);
            apop_data_set(cov, j, i, thiscell);
        }
    }
    pv[0] = n;
}
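/* For reference (standard result, not from the source): for a multinomial with
   N draws and bin probabilities p_i, the count covariances that make_covar
   fills in above are
       Var(X_i)      =  N p_i (1 - p_i)
       Cov(X_i, X_j) = -N p_i p_j    for i != j. */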
fprintf(out, "%s, with %i draws.\nBin odds:\n", est->name, N); apop_vector_print(est->parameters->vector, .output_pipe=out); p[0]=N; } double avs(gsl_vector *v){return (double) apop_vector_sum(v);} /* \amodel apop_multinomial The \f$n\f$--option generalization of the \ref apop_binomial "Binomial distribution". \adoc estimated_info Reports <tt>log likelihood</tt>. */ static void multinomial_estimate(apop_data * data, apop_model *est){ Nullcheck_mpd(data, est, ); Get_vmsizes(data); //vsize, msize1 est->parameters= apop_map(data, .fn_v=avs, .part='c'); gsl_vector *v = est->parameters->vector; int n = apop_sum(v)/data->matrix->size1; //size of one row apop_vector_normalize(v); apop_name_add(est->parameters->names, "n", 'r'); apop_data_set(est->parameters, .val=n); //zeroth item is now n, not p_0 char name[100]; for(int i=1; i < v->size; i++){ sprintf(name, "p%i", i); apop_name_add(est->parameters->names, name, 'r'); } est->dsize = n; make_covar(est); apop_data_add_named_elmt(est->info, "log likelihood", multinomial_log_likelihood(data, est)); } static void multinom_prep(apop_data *data, apop_model *params){ apop_model_print_vtable_add(multinomial_show, params);