Ejemplo n.º 1
0
int main(){
    gsl_rng *r = apop_rng_alloc(10);
    size_t i, ct = 5e4;

    //set up the model & params
    apop_data *d  = apop_data_alloc(ct,2);
    apop_data *params = apop_data_alloc(2,2,2);
    apop_data_fill(params, 8,  1, 0.5,
                           2,  0.5, 1);
    apop_model *pvm = apop_model_copy(apop_multivariate_normal);
    pvm->parameters = apop_data_copy(params);

    //make random draws from the multivar. normal
    //this `pull a row view, fill its data element' form works for rows but not cols.
    for(i=0; i< ct; i++){
        Apop_row(d, i, onerow);
        apop_draw(onerow->data, r, pvm);
    }

    //set up and estimate a model with fixed covariance matrix but free means
    gsl_vector_set_all(pvm->parameters->vector, GSL_NAN);
    apop_model *mep1   = apop_model_fix_params(pvm);
    apop_model *e1  = apop_estimate(d, *mep1);
    
    //compare results
    printf("original params: ");
    apop_vector_show(params->vector);
    printf("estimated params: ");
    apop_vector_show(e1->parameters->vector);
}
Ejemplo n.º 2
0
int main(){
    gsl_rng *r = apop_rng_alloc(2468);
    double binom_start = 0.6;
    double beta_start_a = 0.3;
    double beta_start_b = 0.5;
    int i, draws = 1500;
    double n = 4000;
    //First, the easy estimation using the conjugate distribution table.
    apop_model *bin = apop_model_set_parameters(apop_binomial, n, binom_start);
    apop_model *beta = apop_model_set_parameters(apop_beta, beta_start_a, beta_start_b);
    apop_model *updated = apop_update(.prior= beta, .likelihood=bin,.rng=r);

    //Now estimate via Gibbs sampling. 
    //Requires a one-parameter binomial, with n fixed,
    //and a data set of n data points with the right p.
    apop_model *bcopy = apop_model_set_parameters(apop_binomial, n, GSL_NAN);
    apop_data *bin_draws = apop_data_fill(apop_data_alloc(1,2), n*(1-binom_start), n*binom_start);
    bin = apop_model_fix_params(bcopy);
    apop_model_add_group(beta, apop_update, .burnin=.1, .periods=1e4);
    apop_model *out_h = apop_update(bin_draws, beta, bin, NULL);

    //We now have a histogram of values for p. What's the closest beta
    //distribution?
    apop_data *d = apop_data_alloc(0, draws, 1);
    for(i=0; i < draws; i ++)
        apop_draw(apop_data_ptr(d, i, 0), r, out_h);
    apop_model *out_beta = apop_estimate(d, apop_beta);
    //Finally, we can compare the conjugate and Gibbs results:
    apop_vector_normalize(updated->parameters->vector);
    apop_vector_normalize(out_beta->parameters->vector);
    double error = apop_vector_distance(updated->parameters->vector, out_beta->parameters->vector, .metric='m');
    double updated_size = apop_vector_sum(updated->parameters->vector);
    Apop_assert(error/updated_size < 0.01, "The error is %g, which is too big.", error/updated_size);
}
Ejemplo n.º 3
0
apop_data *apop_data_pmf_expand(apop_data *in, int factor){
    apop_data *expanded = apop_data_alloc();
    apop_vector_normalize(in->weights);
    for (int i=0; i< in->weights->size;i++){
        int wt = gsl_vector_get(in->weights, i)* factor;
        if (wt){
            apop_data *next = apop_data_alloc(wt);
            gsl_vector_set_all(next->vector, apop_data_get(in, i));
            apop_data_stack(expanded, next, .inplace='y');
        }
    }
    if (expanded->vector) return expanded;
    else return NULL;
}
Ejemplo n.º 4
0
static void * process_result_set_chars (MYSQL *conn, MYSQL_RES *res_set) {
  MYSQL_ROW        row;
  unsigned int     currentrow = 0;
  size_t total_cols       = mysql_num_fields(res_set);
  size_t total_rows       = mysql_num_rows(res_set);
  char ***out      = malloc(sizeof(char**) * total_rows );
  apop_data *output= apop_data_alloc(0,0,0);
    while ((row = mysql_fetch_row (res_set)) ) {
		out[currentrow]	= malloc(sizeof(char*) * total_cols);
		for (size_t jj=0;jj<total_cols;jj++){
			if (row[jj]==NULL){
				out[currentrow][jj]	= malloc(sizeof(char));
				strcpy(out[currentrow][jj], "\0");
			} else {
				out[currentrow][jj]	= malloc(1+strlen(row[jj]));
				strcpy(out[currentrow][jj], row[jj]);
			}
		}
		currentrow++;
    }
    output->text        = out;
    output->textsize[0] = total_rows;
    output->textsize[1] = total_cols;
    check_and_clean(;)
}
Ejemplo n.º 5
0
int main(){
    apop_data *d = apop_data_alloc(10, 100);
    gsl_rng *r = apop_rng_alloc(3242);
    for (int i=0; i< 10; i++){
        row_offset = gsl_rng_uniform(r)*2 -1; //declared and used above.
        apop_vector_apply(Apop_rv(d, i), offset_rng);
    }

    int df = d->matrix->size2-1;
    apop_data *means = apop_map(d, .fn_v = mu, .part ='r');
    apop_data *tstats = apop_map(d, .fn_v = find_tstat, .part ='r');
    apop_data *confidences = apop_map(tstats, .fn_dp = conf, .param = &df);

    printf("means:\n"); apop_data_show(means);
    printf("\nt stats:\n"); apop_data_show(tstats);
    printf("\nconfidences:\n"); apop_data_show(confidences);

    //Some sanity checks, for Apophenia's test suite.
    for (int i=0; i< 10; i++){
        //sign of mean == sign of t stat.
        assert(apop_data_get(means, i, -1) * apop_data_get(tstats, i, -1) >=0);

        //inverse of P-value should be the t statistic.
        assert(fabs(gsl_cdf_tdist_Pinv(apop_data_get(confidences, i, -1), 99) 
                    - apop_data_get(tstats, i, -1)) < 1e-5);
    }
}
Ejemplo n.º 6
0
int main(){
    apop_data *d = apop_text_alloc(apop_data_alloc(6), 6, 1);
    apop_data_fill(d,   1,   2,   3,   3,   1,   2);
    apop_text_fill(d,  "A", "A", "A", "A", "A", "B");

    asprintf(&d->names->title, "Original data set");
    printdata(d);

        //binned, where bin ends are equidistant but not necessarily in the data
    apop_data *binned = apop_data_to_bins(d, NULL);
    asprintf(&binned->names->title, "Post binning");
    printdata(binned);
    assert(apop_sum(binned->weights)==6);
    assert(fabs(//equal distance between bins
              (apop_data_get(binned, 1, -1) - apop_data_get(binned, 0, -1))
            - (apop_data_get(binned, 2, -1) - apop_data_get(binned, 1, -1))) < 1e-5);

        //compressed, where the data is as in the original, but weights 
        //are redome to accommodate repeated observations.
    apop_data_pmf_compress(d);
    asprintf(&d->names->title, "Post compression");
    printdata(d);
    assert(apop_sum(d->weights)==6);

    apop_model *d_as_pmf = apop_estimate(d, apop_pmf);
    Apop_row(d, 0, firstrow); //1A
    assert(fabs(apop_p(firstrow, d_as_pmf) - 2./6 < 1e-5));
}
Ejemplo n.º 7
0
/** Run the Kolmogorov-Smirnov test to determine whether two distributions are identical.

\param m1, m2  Two models, most likely of \ref apop_pmf type. I will ue the cdf method, so if your function doesn't have one, expect this to run the slow default. I run it for each row of each data set, so if your model has a \c NULL at the data, I won't know what to do. 
 
\return An \ref apop_data set including the \f$p\f$-value from the Kolmogorov test that the two distributions are equal.

\li I assume that the data sets are sorted.

\include ks_tests.c

\ingroup histograms
*/
apop_data *apop_test_kolmogorov(apop_model *m1, apop_model *m2){
    //version for not a pair of histograms
    Apop_assert(m1->data, "I will test the CDF at each point in the data set, but the first model has a NULL data set. "
                          "Maybe generate, then apop_data_sort, a few thousand random draws?");
    Apop_assert(m2->data, "I will test the CDF at each point in the data set, but the second model has a NULL data set. "
                          "Maybe generate, then apop_data_sort, a few thousand random draws?");
    int maxsize1, maxsize2;
    {Get_vmsizes(m1->data); maxsize1 = maxsize;}//copy one of the macro's variables 
    {Get_vmsizes(m2->data); maxsize2 = maxsize;}//  to the full function's scope.
    double largest_diff=GSL_NEGINF;
    for (size_t i=0; i< maxsize1; i++){
        Apop_data_row(m1->data, i, arow);
        largest_diff = GSL_MAX(largest_diff, fabs(apop_cdf(arow, m1)-apop_cdf(arow, m2)));
    }
    for (size_t i=0; i< maxsize2; i++){     //There should be matched data rows, so there is redundancy.
        Apop_data_row(m2->data, i, arow); // Feel free to submit a smarter version.
        largest_diff = GSL_MAX(largest_diff, fabs(apop_cdf(arow, m1)-apop_cdf(arow, m2)));
    }
    apop_data *out = apop_data_alloc();
    sprintf(out->names->title, "Kolmogorov-Smirnov test");
    apop_data_add_named_elmt(out, "max distance", largest_diff);
    double ps = psmirnov2x(largest_diff, maxsize1, maxsize2);
    apop_data_add_named_elmt(out, "p value, 2 tail", 1-ps);
    apop_data_add_named_elmt(out, "confidence, 2 tail", ps);
    return out;
}
Ejemplo n.º 8
0
/** Test the goodness-of-fit between two \ref apop_pmf models. 

If you send two histograms, I assume that the histograms are synced: for PMFs,
you've used \ref apop_data_to_bins to generate two histograms using the same binspec,
or you've used \ref apop_data_pmf_compress to guarantee that each observation value
appears exactly once in each data set.

In any case, you are confident that all values in the \c observed set appear in the \c
expected set with nonzero weight; otherwise this will return a \f$\chi^2\f$ statistic
of \c GSL_POSINF, indicating that it is impossible for the \c observed data to have
been drawn from the \c expected distribution.

\li If an observation row has weight zero, I skip it. if <tt>apop_opts.verbose >=1 </tt> I will show a warning.

  \ingroup histograms
*/
apop_data *apop_histograms_test_goodness_of_fit(apop_model *observed, apop_model *expected){
    int df = observed->data->weights->size;
    double diff = 0;
    for (int i=0; i< observed->data->weights->size; i++){
        Apop_data_row(observed->data, i, one_obs);
        double obs_val = gsl_vector_get(observed->data->weights, i);
        double exp_val = apop_p(one_obs, expected);
        if (exp_val == 0){
            diff = GSL_POSINF; 
            break;
        }
        if (obs_val==0){
            Apop_notify(1, "element %i of the observed data has weight zero. Skipping it.", i);
            df --;
        } else 
            diff += gsl_pow_2(obs_val - exp_val)/exp_val;
    }
    //Data gathered. Now output
    apop_data   *out    = apop_data_alloc();
    double      toptail = gsl_cdf_chisq_Q(diff, df-1);
    sprintf(out->names->title, "Goodness-of-fit test via Chi-squared statistic");
    apop_data_add_named_elmt(out, "Chi squared statistic", diff);
    apop_data_add_named_elmt(out, "df", df-1);
    apop_data_add_named_elmt(out, "p value",  toptail); 
    apop_data_add_named_elmt(out, "confidence", 1 - toptail);
    return out;
}
Ejemplo n.º 9
0
//Use this function to produce test data below.
apop_data *draw_exponentiated_normal(double mu, double sigma, double draws){
    apop_model *n01 = apop_model_set_parameters(apop_normal, mu, sigma);
    apop_data *d = apop_data_alloc(draws);
    gsl_rng *r = apop_rng_alloc(13);
    for (int i=0; i< draws; i++) apop_draw(gsl_vector_ptr(d->vector,i), r, n01);
    apop_vector_exp(d->vector);
    return d;
}
Ejemplo n.º 10
0
apop_data *draw_some_data(){
    gsl_rng *r = apop_rng_alloc(7);
    apop_data *d  = apop_data_alloc(0,10,1); 
    for (int i=0; i<10; i++)
        apop_data_set(d, i, 0, gsl_rng_uniform(r)*20);
    apop_data_print(d, .output_pipe=stderr);
    return d;
}
Ejemplo n.º 11
0
void plot(apop_model *k, apop_model *k2){
    apop_data *onept = apop_data_alloc(0,1,1);
    FILE *outtab = fopen("kerneldata", "w");
    for (double i=0; i<20; i+=0.01){
        apop_data_set(onept,0,0, i);
        fprintf(outtab, "%g %g %g\n", i, apop_p(onept, k), apop_p(onept, k2));
    }
    fclose(outtab);
    printf("plot 'kerneldata' using 1:2\n"
           "replot 'kerneldata' using 1:3\n");
}
Ejemplo n.º 12
0
 apop_data * apop_bootstrap_cov_base(apop_data * data, apop_model *model, gsl_rng *rng, int iterations, char keep_boots, char ignore_nans, apop_data **boot_store){
#endif
    Get_vmsizes(data); //vsize, msize1, msize2
    apop_model *e = apop_model_copy(model);
    apop_data *subset = apop_data_copy(data);
    apop_data *array_of_boots = NULL,
              *summary;
    //prevent and infinite regression of covariance calculation.
    Apop_model_add_group(e, apop_parts_wanted); //default wants for nothing.
    size_t i, nan_draws=0;
    apop_name *tmpnames = (data && data->names) ? data->names : NULL; //save on some copying below.
    if (data && data->names) data->names = NULL;

    int height = GSL_MAX(msize1, GSL_MAX(vsize, (data?(*data->textsize):0)));
	for (i=0; i<iterations && nan_draws < iterations; i++){
		for (size_t j=0; j< height; j++){       //create the data set
			size_t randrow	= gsl_rng_uniform_int(rng, height);
            apop_data_memcpy(Apop_r(subset, j), Apop_r(data, randrow));
		}
		//get the parameter estimates.
		apop_model *est = apop_estimate(subset, e);
        gsl_vector *estp = apop_data_pack(est->parameters);
        if (!gsl_isnan(apop_sum(estp))){
            if (i==0){
                array_of_boots	      = apop_data_alloc(iterations, estp->size);
                apop_name_stack(array_of_boots->names, est->parameters->names, 'c', 'v');
                apop_name_stack(array_of_boots->names, est->parameters->names, 'c', 'c');
                apop_name_stack(array_of_boots->names, est->parameters->names, 'c', 'r');
            }
            gsl_matrix_set_row(array_of_boots->matrix, i, estp);
        } else if (ignore_nans=='y'){
            i--; 
            nan_draws++;
        }
        apop_model_free(est);
        gsl_vector_free(estp);
	}
    if(data) data->names = tmpnames;
    apop_data_free(subset);
    apop_model_free(e);
    int set_error=0;
    Apop_stopif(i == 0 && nan_draws == iterations, apop_return_data_error(N),
                1, "I ran into %i NaNs and no not-NaN estimations, and so stopped. "
                       , iterations);
    Apop_stopif(nan_draws == iterations,  set_error++;
            apop_matrix_realloc(array_of_boots->matrix, i, array_of_boots->matrix->size2),
                1, "I ran into %i NaNs, and so stopped. Returning results based "
                       "on %zu bootstrap iterations.", iterations, i);
	summary	= apop_data_covariance(array_of_boots);
    if (boot_store) *boot_store = array_of_boots;
    else            apop_data_free(array_of_boots);
    if (set_error) summary->error = 'N';
	return summary;
}
Ejemplo n.º 13
0
int main(){
    gsl_rng *r = apop_rng_alloc(2312311);
    int empirical_size = 5e3;
    apop_model *expo = apop_model_set_parameters(apop_exponential, 1.7);
    assert (apop_kl_divergence(expo, expo) < 1e-4);
    apop_data *empirical = apop_data_alloc(empirical_size, 1);
    for (int i=0; i<empirical_size; i++)
        apop_draw(apop_data_ptr(empirical, i, 0), r, expo);
    apop_model *pmf = apop_estimate(empirical, apop_pmf);
    assert(apop_kl_divergence(pmf,expo) < 1e-4);
    apop_data_free(empirical);
}
Ejemplo n.º 14
0
void plot(apop_model *k, char *outname){
    double max = 4.3, increment = 0.01;
    apop_data *out = apop_data_alloc(max/increment+1);
    out->weights =  gsl_vector_alloc(max/increment+1);
    for (int c=0; c<max/increment; c++){
        Apop_row(out, c, a_point);
        gsl_vector_set(a_point->vector, 0, c* increment);
        gsl_vector_set(a_point->weights, 0, apop_p(a_point, k));
    }
    apop_vector_normalize(out->weights); //Q: why is this always necessary?
    apop_data_print(out, .output_name=outname);
}
Ejemplo n.º 15
0
int main(){
    //bind together a Poisson and a Normal;
    //make a draw producing a 2-element vector
    apop_model *m1 = apop_model_set_parameters(apop_poisson, 3);
    apop_model *m2 = apop_model_set_parameters(apop_normal, -5, 1);
    apop_model *mm = apop_model_stack(m1, m2);
    int len = 1e5;
    gsl_rng *r = apop_rng_alloc(1);
    apop_data *draws = apop_data_alloc(len, 2);
    for (int i=0; i< len; i++){
        Apop_row (draws, i, onev);
        apop_draw(onev->data, r, mm);
        assert((int)onev->data[0] == onev->data[0]);
        assert(onev->data[1]<0);
    }

    //The rest of the test script recovers the parameters.
    //First, set up a two-page data set: poisson data on p1, Normal on p2:
    apop_data *comeback = apop_data_alloc();
    Apop_col(draws, 0,fishdraws)
    comeback->vector = apop_vector_copy(fishdraws);
    apop_data_add_page(comeback, apop_data_alloc(), "p2");
    Apop_col(draws, 1, meandraws)
    comeback->more->vector = apop_vector_copy(meandraws);

    //set up the un-parameterized stacked model, including
    //the name at which to split the data set
    apop_model *estme = apop_model_stack(apop_model_copy(apop_poisson), apop_model_copy(apop_normal));
    Apop_settings_add(estme, apop_stack, splitpage, "p2");
    apop_model *ested = apop_estimate(comeback, *estme);

    //test that the parameters are as promised.
    apop_model *m1back = apop_settings_get(ested, apop_stack, model1);
    apop_model *m2back = apop_settings_get(ested, apop_stack, model2);
    assert(fabs(apop_data_get(m1back->parameters, .col=-1) - 3) < 1e-2);
    assert(fabs(apop_data_get(m2back->parameters, .col=-1) - -5) < 1e-2);
    assert(fabs(apop_data_get(m2back->parameters, .col=-1, .row=1) - 1) < 1e-2);
}
Ejemplo n.º 16
0
static void * process_result_set_data (MYSQL *conn, MYSQL_RES *res_set) {
  MYSQL_ROW        row;
  unsigned int     j=0;
  unsigned int num_fields = mysql_num_fields(res_set);
  apop_data *out   =apop_data_alloc(0, mysql_num_rows (res_set), num_fields);
     while ((row = mysql_fetch_row (res_set)) ) {
             for (size_t i = 0; i < mysql_num_fields (res_set); i++) 
                 apop_data_set(out, j , i, atof(row[i]));
             j++;
        }
     MYSQL_FIELD *fields = mysql_fetch_fields(res_set);
     for(size_t i = 0; i < num_fields; i++)
         apop_name_add(out->names, fields[i].name, 'c');
     check_and_clean(apop_data_free(out))
}
Ejemplo n.º 17
0
/* This sample produces a dummy times table, gets a summary, and prunes the summary table.
If you are not a test script, uncomment the last line to display the pruned table.  */
void test_prune_cols(){
    int i, j;
    apop_data *d = apop_data_alloc(0, 10, 4);
    for (i=0; i< 10; i++)
        for (j=0; j< 4; j++)
            apop_data_set(d, i, j, i*j);
    apop_data *summary = apop_data_summarize(d);
    apop_data_prune_columns(summary, "mean", "median");
    assert(apop_name_find(summary->names, "mean", 'c')!=-2);
    assert(apop_name_find(summary->names, "median", 'c')!=-2);
    assert(apop_name_find(summary->names, "max", 'c')==-2); //not found
    assert(apop_name_find(summary->names, "variance", 'c')==-2); //not found
    assert(apop_data_get(summary, .row=0, .colname="mean")==0);
    assert(apop_data_get(summary, .row=1, .colname="median")==4);
    assert(apop_data_get(summary, .row=2, .colname="median")==8);
    //apop_data_show(summary);
}
Ejemplo n.º 18
0
static double one_wishart_row(gsl_vector *in, void *ws_in){
    wishartstruct_t *ws = ws_in;
    gsl_matrix *invparams_dot_data = gsl_matrix_alloc(ws->len, ws->len);
    apop_data *square= apop_data_alloc(ws->len, ws->len);
    apop_data_unpack(in, square);
    double datadet = apop_matrix_determinant(square->matrix);
    assert(datadet);

    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1, ws->paraminv, square->matrix, 0, invparams_dot_data);   
    gsl_vector_view diag = gsl_matrix_diagonal(invparams_dot_data);
    double trace = apop_sum(&diag.vector);
    gsl_matrix_free(invparams_dot_data);
    apop_data_free(square);
    double out= log(datadet) * (ws->df - ws->len -1.)/2. - trace*ws->df/2.;
    assert(isfinite(out));
    return out;
}
Ejemplo n.º 19
0
// produce matrix with one row missing, for each possible row
// Do some math on the new sub-matrix
// Ben Klemens - MWD 4.6
static gsl_vector *jack_iteration(gsl_matrix *m, math_fn do_math){
  int height = m->size1;
  gsl_vector *out = gsl_vector_alloc(height);
  apop_data *reduced = apop_data_alloc(0, (size_t)height - 1, (int)m->size2);
  Apop_submatrix(m, 1, 0, height - 1, m->size2, mv);
  gsl_matrix_memcpy(reduced->matrix, mv);
  for (int i=0; i< height; i++){
    if (i % 100 == 0)
      std::cerr << "...jacknife at " << SeqLib::AddCommas(i) << " of " << SeqLib::AddCommas(height) << std::endl;
    gsl_vector_set(out, i, do_math(reduced)); // returns scalar output of do_math
    if (i < height - 1){ // create a new submatrix with new row ommited
      Apop_matrix_row(m, i, onerow);
      gsl_matrix_set_row(reduced->matrix, i, onerow);
    }
  }
  return out;
}
Ejemplo n.º 20
0
void FishModel::AddTiles(const FishHookTiles& f) {

  size_t nrows = f.size();
  size_t ncols = f.NumCovariates();
  // one for data, one for intercept
  mat = apop_data_alloc(0, nrows, (int)ncols+2); //vec, nrow, ncol
 
  // add the observed counts
  size_t i = 0; // row id (tile)
  for (i = 0; i < nrows; ++i) {
    float scaled_events = f.at(i).events;
    //scaled_events = f.at(i).covered > 0 ? scaled_events / f.at(i).covered : 0;
    apop_data_set(mat, i, (int)(ncols+1), scaled_events, NULL, NULL, NULL);
  }

  // set the covariates
  i = 0; // row id (tile)
  int j = 1; // column id (covariate) (0 is obs counts)
  for (const auto& r : f) { // loop the bins
    if (i == nrows) break;//debug, to break early
    j = 1;
    for (const auto& c : r) { // loop the covariates
      apop_data_set(mat, i, j++, c.second, NULL, NULL, NULL);
    }
    ++i; // update row interator
  }

  // add the intercept term
  for (i = 0; i < nrows; ++i)
    apop_data_set(mat, i, 0, 1.0, NULL, NULL, NULL);

  // add column names
  apop_name_add(mat->names, "events", 'c');
  for (const auto& c : f[0])
    apop_name_add(mat->names, c.first.c_str(), 'c');
  apop_name_add(mat->names, "intercept", 'c');

  // add row names
  std::stringstream ss;
  for (const auto& r : f) {
    ss << r.ChrName(SeqLib::BamHeader()) << ":" << r.pos1 << "-" << r.pos2;
    apop_name_add(mat->names, ss.str().c_str(), 'r');
    ss.str(std::string());
  }

}
Ejemplo n.º 21
0
/** A convenience function for regular expression searching 

\li There are three common flavors of regular expression: Basic, Extended,
and Perl-compatible (BRE, ERE, PCRE). I use EREs, as per the specs of
your C library, which should match POSIX's ERE specification. 

For example, "p.val" will match "P value", "p.value", "p values" (and even "tempeval", so be
careful).

If you give a non-\c NULL address in which to place a table of paren-delimited substrings, I'll return them as a row in the text element of the returned \ref apop_data set. I'll return <em>all</em> the matches, filling the first row with substrings from the first application of your regex, then filling the next row with another set of matches (if any), and so on to the end of the string. Useful when parsing a list of items, for example.


\param string        The string to search (no default)
\param regex       The regular expression (no default)
\param substrings   Parens in the regex indicate that I should return matching substrings. Give me the _address_ of an \ref apop_data* set, and I will allocate and fill the text portion with matches. Default= \c NULL, meaning do not return substrings (even if parens exist in the regex). If no match, return an empty \ref apop_data set, so <tt>output->textsize[0]==0</tt>.
\param use_case         Should I be case sensitive, \c 'y' or \c 'n'? (default = \c 'n', which is not the POSIX default.)

\return         Count of matches found. 0 == no match. \c substrings may be allocated and filled if needed.
\ingroup names


\li If <tt>apop_opts.stop_on_warning='n'</tt> returns -1 on error (e.g., regex \c NULL or didn't compile).
\li If <tt>strings==NULL</tt>, I return 0---no match---and if \c substrings is provided, set it to \c NULL.

\li Here is the test function. Notice that the substring-pulling
function call passes \c &subs, not plain \c subs. Also, the non-match
has a zero-length blank in <tt>subs->text[0][1]</tt>.

\include test_regex.c

\li Each set of matches will be one row of the output data. E.g., given the regex <tt>([A-Za-z])([0-9])</tt>, the column zero of <tt>outdata</tt> will hold letters, and column one will hold numbers.
Use \ref apop_data_transpose to reverse this so that the letters are in <tt>outdata->text[0]</tt> and numbers in <tt>outdata->text[1]</tt>.
*/
APOP_VAR_HEAD int  apop_regex(const char *string, const char* regex, apop_data **substrings, const char use_case){
    const char * apop_varad_var(string, NULL);
    apop_data **apop_varad_var(substrings, NULL);
    if (!string) {
        if (substrings) *substrings=NULL;
        return 0;
    }
    const char * apop_varad_var(regex, NULL);
    Apop_assert_negone(regex, "You gave me a NULL regex.");
    const char apop_varad_var(use_case, 'n');
APOP_VAR_ENDHEAD
    regex_t re;
    int matchcount=count_parens(regex);
    int found, found_ct=0;
    regmatch_t result[matchcount];
    int compiled_ok = !regcomp(&re, regex, REG_EXTENDED 
                                            + (use_case=='y' ? 0 : REG_ICASE)
                                            + (substrings ? 0 : REG_NOSUB) );
    Apop_stopif(!compiled_ok, return -1, 0, "This regular expression didn't compile: \"%s\"", regex)

    int matchrow = 0;
    if (substrings) *substrings = apop_data_alloc();
    do {
        found_ct+=
        found    = !regexec(&re, string, matchcount+1, result, matchrow ? REG_NOTBOL : 0);
        if (substrings && found){
            *substrings = apop_text_alloc(*substrings, matchrow+1, matchcount);
            //match zero is the whole string; ignore.
            for (int i=0; i< matchcount; i++){
                if (result[i+1].rm_eo > 0){//GNU peculiarity: match-to-empty marked with -1.
                    int length_of_match = result[i+1].rm_eo - result[i+1].rm_so;
                    free((*substrings)->text[matchrow][i]);
                    (*substrings)->text[matchrow][i] = malloc(strlen(string)+1);
                    memcpy((*substrings)->text[matchrow][i], string + result[i+1].rm_so, length_of_match);
                    (*substrings)->text[matchrow][i][length_of_match] = '\0';
                } //else matches nothing; apop_text_alloc already made this cell this NULL.
            }
            string += result[0].rm_eo; //end of whole match;
            matchrow++;
        }
    } while (substrings && found && string[0]!='\0');
    regfree(&re);
    return found_ct;
}
Ejemplo n.º 22
0
//Now let's use the model: make five draws from it, find the probability of those 
//draws given various paramter values; find the optimal parameter given the input data.
int main(){
    apop_data *five_draws= apop_data_alloc(5,1);
    asprintf(&five_draws->names->title, "five draws");
    apop_model_draws(.model=apop_model_set_parameters(binom, 0.3), 
                     .draws=five_draws);
    apop_data_print(five_draws);
    printf("\n\n");

    #define showprob(p) printf("PDF(five draws|param=" #p ") = %g\n", \
            apop_p(five_draws, apop_model_set_parameters(binom, p)));

    showprob(0.2)
    showprob(0.25)
    showprob(0.3)
    showprob(0.35)
    showprob(0.5)

    printf("\n\n");
    Apop_model_add_group(binom, apop_mle, .step_size=0.1, /*.method="NM simplex",*/
        .tolerance=1e-7, /*.verbose='y',*/ .starting_pt=(double[]){0.4});
Ejemplo n.º 23
0
apop_model *fuzz(apop_model sim){
    int draws = 100;
    gsl_rng *r = apop_rng_alloc(1);
    apop_model *prior = apop_model_cross(
                            apop_model_set_parameters(apop_normal, 10, 2),
                            apop_model_set_parameters(apop_normal, 10, 2));
    apop_data *outdata = apop_data_alloc(draws, weibull->vsize);
    double *params = sim.parameters->vector->data;
    for (int i=0; i< draws; i++){
        do {
            apop_draw(params, r, prior);
        } while (params[1]*2 > pow(params[0], 2));
        sim.dsize=params[1];
        apop_model *est = apop_estimate(apop_model_draws(&sim, 1000), weibull);
        Apop_row_v(outdata, i, onerow);
        gsl_vector_memcpy(onerow, est->parameters->vector);
        apop_model_free(est);
    }
    return apop_estimate(outdata, apop_pmf);
}
Ejemplo n.º 24
0
/** Give me a data set and a model, and I'll give you the jackknifed covariance matrix of the model parameters.

The basic algorithm for the jackknife (glossing over the details): create a sequence of data
sets, each with exactly one observation removed, and then produce a new set of parameter estimates 
using that slightly shortened data set. Then, find the covariance matrix of the derived parameters.

\li Jackknife or bootstrap? As a broad rule of thumb, the jackknife works best on models
    that are closer to linear. The worse a linear approximation does (at the given data),
    the worse the jackknife approximates the variance.

\param in	    The data set. An \ref apop_data set where each row is a single data point.
\param model    An \ref apop_model, that will be used internally by \ref apop_estimate.
            
\exception out->error=='n'   \c NULL input data.
\return         An \c apop_data set whose matrix element is the estimated covariance matrix of the parameters.
\see apop_bootstrap_cov

For example:
\include jack.c
*/
apop_data * apop_jackknife_cov(apop_data *in, apop_model *model){
    Apop_stopif(!in, apop_return_data_error(n), 0, "The data input can't be NULL.");
    Get_vmsizes(in); //msize1, msize2, vsize
    apop_model *e = apop_model_copy(model);
    int i, n = GSL_MAX(msize1, GSL_MAX(vsize, in->textsize[0]));
    apop_model *overall_est = e->parameters ? e : apop_estimate(in, e);//if not estimated, do so
    gsl_vector *overall_params = apop_data_pack(overall_est->parameters);
    gsl_vector_scale(overall_params, n); //do it just once.
    gsl_vector *pseudoval = gsl_vector_alloc(overall_params->size);

    //Copy the original, minus the first row.
    apop_data *subset = apop_data_copy(Apop_rs(in, 1, n-1));
    apop_name *tmpnames = in->names; 
    in->names = NULL;  //save on some copying below.

    apop_data *array_of_boots = apop_data_alloc(n, overall_params->size);

    for(i = -1; i< n-1; i++){
        //Get a view of row i, and copy it to position i-1 in the short matrix.
        if (i >= 0) apop_data_memcpy(Apop_r(subset, i), Apop_r(in, i));
        apop_model *est = apop_estimate(subset, e);
        gsl_vector *estp = apop_data_pack(est->parameters);
        gsl_vector_memcpy(pseudoval, overall_params);// *n above.
        gsl_vector_scale(estp, n-1);
        gsl_vector_sub(pseudoval, estp);
        gsl_matrix_set_row(array_of_boots->matrix, i+1, pseudoval);
        apop_model_free(est);
        gsl_vector_free(estp);
    }
    in->names = tmpnames;
    apop_data *out = apop_data_covariance(array_of_boots);
    gsl_matrix_scale(out->matrix, 1./(n-1.));
    apop_data_free(subset);
    gsl_vector_free(pseudoval);
    apop_data_free(array_of_boots);
    if (e!=overall_est)
        apop_model_free(overall_est);
    apop_model_free(e);
    gsl_vector_free(overall_params);
    return out;
}
Ejemplo n.º 25
0
apop_data *kappa_and_pi(apop_data const *tab_in){
    apop_data *out = apop_data_alloc();
    Apop_stopif(!tab_in, out->error='n'; return out, 0, "NULL input. Returning output with 'n' error code.");
    Apop_stopif(!tab_in->matrix, out->error='m'; return out, 0, "NULL input matrix. Returning output with 'm' error code.");
    Apop_stopif(tab_in->matrix->size1 != tab_in->matrix->size2, out->error='s'; return out, 0, "Input rows=%zu; input cols=%zu; "
                    "these need to be equal. Returning output with error code 's'.", tab_in->matrix->size1, tab_in->matrix->size2);

    apop_data *tab = apop_data_copy(tab_in);
    double total = apop_matrix_sum(tab->matrix);
    gsl_matrix_scale(tab->matrix, 1./total);
    double p_o = 0, p_e = 0, scott_pe = 0, ia = 0, row_ent = 0, col_ent = 0;
    for (int c=0; c< tab->matrix->size1; c++){
        double this_obs = apop_data_get(tab, c, c);
        p_o += this_obs;

        Apop_row_v(tab, c, row);
        Apop_col_v(tab, c, col);
        double rsum = apop_sum(row);
        double csum = apop_sum(col);
        p_e += rsum * csum;
        scott_pe += pow((rsum+csum)/2, 2);

        ia += this_obs * log2(this_obs/(rsum * csum));
        row_ent -= rsum * log2(rsum);
        col_ent -= csum * log2(csum);
    }
    apop_data_free(tab);

    asprintf(&out->names->title, "Scott's π and Cohen's κ");
    apop_data_add_named_elmt(out, "total count", total);
    apop_data_add_named_elmt(out, "percent agreement", p_o);
    apop_data_add_named_elmt(out, "κ", ((p_e==1)? 0: (p_o - p_e) / (1-p_e) ));
    apop_data_add_named_elmt(out, "π", ((p_e==1)? 0: (p_o - scott_pe) / (1-scott_pe)));
    apop_data_add_named_elmt(out, "P_I", ia/((row_ent+col_ent)/2));
    apop_data_add_named_elmt(out, "Cohen's p_e", p_e);
    apop_data_add_named_elmt(out, "Scott's p_e", scott_pe);
    apop_data_add_named_elmt(out, "information in agreement", ia);
    apop_data_add_named_elmt(out, "row entropy", row_ent);
    apop_data_add_named_elmt(out, "column entropy", col_ent);
    return out;
}
Ejemplo n.º 26
0
static void make_covar(apop_model *est){
    int size = est->parameters->vector->size;
    //the trick where we turn the params into a p-vector
    double * pv = est->parameters->vector->data;
    int n = pv[0];
    pv[0] = 1 - (apop_sum(est->parameters->vector)-n);

    apop_data *cov = apop_data_add_page(est->parameters, 
                            apop_data_alloc(size, size), "<Covariance>");
    for (int i=0; i < size; i++){
        double p = apop_data_get(est->parameters, i, -1);
        apop_data_set(cov, i, i, n * p *(1-p));
        for (int j=i+1; j < size; j++){
            double pj = apop_data_get(est->parameters, j, -1);
            double thiscell = -n*p*pj;
            apop_data_set(cov, i, j, thiscell);
            apop_data_set(cov, j, i, thiscell);
        }
    }
    pv[0]=n;
}
Ejemplo n.º 27
0
/** Make random draws from an \ref apop_model, and bin them using a binspec in the style
 of \ref apop_data_to_bins. If you have a data set that used the same binspec, you now have synced histograms, which you can plot or sensibly test hypotheses about.

The output is normalized to integrate to one.

\param binspec A description of the bins in which to place the draws; see \ref apop_data_to_bins. (default: as in \ref apop_data_to_bins.)
\param model The model to be drawn from. Because this function works via random draws, the model needs to have a 
\c draw method. (No default)
\param draws The number of random draws to make. (arbitrary default = 10,000)
\param bin_count If no bin spec, the number of bins to use (default: as per \ref apop_data_to_bins, \f$\sqrt(N)\f$)
\param rng The \c gsl_rng used to make random draws. (default: see note on \ref autorng)

\return An \ref apop_pmf model.

\li This function uses the \ref designated syntax for inputs.

\ingroup histograms
*/
APOP_VAR_HEAD apop_model *apop_model_to_pmf(apop_model *model, apop_data *binspec, long int draws, int bin_count, gsl_rng *rng){
    apop_model* apop_varad_var(model, NULL);
    Apop_assert(model && model->draw, "The second argument needs to be an apop_model with a 'draw' function "
                              "that I can use to make random draws.");
    apop_data* apop_varad_var(binspec, NULL);
    int apop_varad_var(bin_count, 0);
    long int apop_varad_var(draws, 1e4);
    gsl_rng *apop_varad_var(rng, NULL)
    static gsl_rng *spare = NULL;
    if (!rng && !spare) 
        spare = apop_rng_alloc(++apop_opts.rng_seed);
    if (!rng) rng = spare;
APOP_VAR_ENDHEAD
    Get_vmsizes(binspec);
    apop_data *outd = apop_data_alloc(draws, model->dsize); 
    for (long int i=0; i< draws; i++){
        Apop_row(outd, i, ach);
        apop_draw(ach->data, rng, model);
    }
    apop_data *outbinned = apop_data_to_bins(outd, binspec, .bin_count=bin_count);
    apop_data_free(outd);
    apop_vector_normalize(outbinned->weights);
    return apop_estimate(outbinned, apop_pmf);
} 
Ejemplo n.º 28
0
/** Create a histogram from data by putting data into bins of fixed width. 

\param indata The input data that will be binned. This is copied and the copy will be modified.
\param close_top_bin Normally, a bin covers the range from the point equal to its minimum to points strictly less than
the minimum plus the width.  if \c 'y', then the top bin includes points less than or equal to the upper bound. This solves the problem of displaying histograms where the top bin is just one point.
\param binspec This is an \ref apop_data set with the same number of columns as \c indata. 
If you want a fixed size for the bins, then the first row of the bin spec is the bin width for each column.
This allows you to specify a width for each dimension, or specify the same size for all with something like:

\param bin_count If you don't provide a bin spec, I'll provide this many evenly-sized bins. Default: \f$\sqrt(N)\f$.  \code
Apop_data_row(indata, 0, firstrow);
apop_data *binspec = apop_data_copy(firstrow);
gsl_matrix_set_all(binspec->matrix, 10); //bins of size 10 for all dim.s
apop_data_to_bins(indata, binspec);
\endcode
The presumption is that the first bin starts at zero in all cases. You can add a second row to the spec to give the offset for each dimension.  Default: NULL. if no binspec and no binlist, then a grid with offset equal to the min of the column, and bin size such that it takes \f$\sqrt{N}\f$ bins to cover the range to the max element. 


\return A pointer to a binned \ref apop_data set.  If you didn't give me a binspec, then I attach one to the output set as a page named \c \<binspec\>, so you can snap a second data set to the same grid using 
\code
apop_data_to_bins(first_set, NULL);
apop_data_to_bins(second_set, apop_data_get_page(first_set, "<binspec>"));
\endcode


  The text segment, if any, is not binned. I use \ref apop_data_pmf_compress as the final step in the binning, 
  and that does respect the text segment. 

Here is a sample program highlighting the difference between \ref apop_data_to_bins and \ref apop_data_pmf_compress .

\include binning.c
*/
APOP_VAR_HEAD apop_data *apop_data_to_bins(apop_data *indata, apop_data *binspec, int bin_count, char close_top_bin){
    apop_data *apop_varad_var(indata, NULL);
    Apop_assert_c(indata, NULL, 1, "NULL input data set, so returning NULL output data set.");
    apop_data *apop_varad_var(binspec, NULL);
    char apop_varad_var(close_top_bin, 'n');
    int apop_varad_var(bin_count, 0);
APOP_VAR_ENDHEAD
    Get_vmsizes(indata); //firstcol, vsize, msize1, msize2
    double binwidth, offset, max=0;
    apop_data *out = apop_data_copy(indata);
    apop_data *bs = binspec ? binspec
                    : apop_data_add_page(out, 
                        apop_data_alloc(vsize? 2: 0, msize1? 2: 0, indata->matrix ? msize2: 0),
                        "<binspec>");
    for (int j= firstcol; j< msize2; j++){
        Apop_col(out, j, onecol);
        if (binspec){
           binwidth = apop_data_get(binspec, 0, j);
           offset = ((binspec->vector && binspec->vector->size==2 )
                   ||(binspec->matrix && binspec->matrix->size1==2)) ? apop_data_get(binspec, 1, j) : 0;
        } else {
            Apop_col(bs, j, abin);
            max = gsl_vector_max(onecol);
            offset = abin->data[1] = gsl_vector_min(onecol);
            binwidth = abin->data[0] = (max - offset)/(bin_count ? bin_count : sqrt(onecol->size));
        }
        for (int i=0; i< onecol->size; i++){
            double val = gsl_vector_get(onecol, i);
            if (close_top_bin=='y' && val == max && val!=offset) 
                val -= 2*GSL_DBL_EPSILON;
            gsl_vector_set(onecol, i, (floor((val -offset)/binwidth))*binwidth+offset);
        }
    }
    apop_data_pmf_compress(out);
    return out;
}
Ejemplo n.º 29
0
   */

typedef struct {
    apop_data *d1, *d2, *dangly_bit;
    _Bool need_to_free;
} twop_s;


/* A model must accept data in the form of each observation being a single row of
   data---i.e., each row is what you'd get from apop_draw.
    If the length of the data matrix is longer than the first model's stated msize2,
    then we have to unpack the draw into two new apop_data sets.
*/
twop_s unpack_a_draw(apop_data *d, apop_cross_settings *s){
    twop_s out = (twop_s){
        .d1 = (s->model1 ? apop_data_alloc(s->model1->vsize, s->model1->msize1, s->model1->msize2) : NULL),
        .d2 = (s->model2 ? apop_data_alloc(s->model2->vsize, s->model2->msize1, s->model2->msize2) : NULL),
        .need_to_free=1};
    int len1 = s->model1->vsize + s->model1->msize1 * s->model1->msize2;
    for (int i=0; i< d->matrix->size1; i++){
        apop_data_unpack(Apop_subvector(Apop_rv(d, i), 0, len1), Apop_r(out.d1, i));
        apop_data_unpack(Apop_subvector(Apop_rv(d, i), len1, d->matrix->size1), Apop_r(out.d2, i));
      }
    return out;
}

static twop_s get_second(apop_data *d, char *splitpage, apop_cross_settings *s){
    twop_s out = {.d1=d, .d2=d};
    if (splitpage) {
        if (d->matrix && (d->matrix->size2 > s->model1->msize2))
            return unpack_a_draw(d, s);
Ejemplo n.º 30
0
 */

#include "apop_internal.h"
#include <search.h> //lsearch; bsearch is in stdlib.

/** For many, it is a knee-jerk reaction to a parameter estimation to test whether each individual parameter differs from zero. This function does that.

\param est  The \ref apop_model, which includes pre-calculated parameter estimates, var-covar matrix, and the original data set.

Returns nothing. At the end of the routine, <tt>est->info->more</tt> includes a set of t-test values: p value, confidence (=1-pval), t statistic, standard deviation, one-tailed Pval, one-tailed confidence.

*/
void apop_estimate_parameter_tests (apop_model *est){
    Nullcheck_p(est, )
    if (!est->data) return;
    apop_data *ep = apop_data_add_page(est->info, apop_data_alloc(est->parameters->vector->size, 2), "<test info>");
    apop_name_add(ep->names, "p value", 'c');
    apop_name_add(ep->names, "confidence", 'c');
    apop_name_stack(ep->names, est->parameters->names, 'r', 'r');
    Get_vmsizes(est->data); //msize1, vsize
    int df = msize1 ? msize1 : vsize;
    df -= est->parameters->vector->size;
    df  = df < 1 ? 1 : df; //some models aren't data-oriented.
    apop_data_add_named_elmt(est->info, "df", df);

    apop_data *one_elmt = apop_data_calloc(1, 1);
    gsl_vector *param_v = apop_data_pack(est->parameters);
    for (size_t i=0; i< est->parameters->vector->size; i++){
        Apop_settings_add_group(est, apop_pm, .index=i);
        apop_model *m = apop_parameter_model(est->data, est);