示例#1
0
double one_weibull(double d, void *params){
    double lambda = apop_data_get(params, 0);
    double k = apop_data_get(params, 1);
    return logl(k) - logl(lambda)
        + (k-1)*(logl(d) - logl(lambda))
        - powl(d/lambda, k);
}
示例#2
0
int main(){
    apop_data *d = apop_data_alloc(10, 100);
    gsl_rng *r = apop_rng_alloc(3242);
    for (int i=0; i< 10; i++){
        row_offset = gsl_rng_uniform(r)*2 -1; //declared and used above.
        apop_vector_apply(Apop_rv(d, i), offset_rng);
    }

    int df = d->matrix->size2-1;
    apop_data *means = apop_map(d, .fn_v = mu, .part ='r');
    apop_data *tstats = apop_map(d, .fn_v = find_tstat, .part ='r');
    apop_data *confidences = apop_map(tstats, .fn_dp = conf, .param = &df);

    printf("means:\n"); apop_data_show(means);
    printf("\nt stats:\n"); apop_data_show(tstats);
    printf("\nconfidences:\n"); apop_data_show(confidences);

    //Some sanity checks, for Apophenia's test suite.
    for (int i=0; i< 10; i++){
        //sign of mean == sign of t stat.
        assert(apop_data_get(means, i, -1) * apop_data_get(tstats, i, -1) >=0);

        //inverse of P-value should be the t statistic.
        assert(fabs(gsl_cdf_tdist_Pinv(apop_data_get(confidences, i, -1), 99) 
                    - apop_data_get(tstats, i, -1)) < 1e-5);
    }
}
示例#3
0
int main(){
    apop_data *d = apop_text_alloc(apop_data_alloc(6), 6, 1);
    apop_data_fill(d,   1,   2,   3,   3,   1,   2);
    apop_text_fill(d,  "A", "A", "A", "A", "A", "B");

    asprintf(&d->names->title, "Original data set");
    printdata(d);

        //binned, where bin ends are equidistant but not necessarily in the data
    apop_data *binned = apop_data_to_bins(d, NULL);
    asprintf(&binned->names->title, "Post binning");
    printdata(binned);
    assert(apop_sum(binned->weights)==6);
    assert(fabs(//equal distance between bins
              (apop_data_get(binned, 1, -1) - apop_data_get(binned, 0, -1))
            - (apop_data_get(binned, 2, -1) - apop_data_get(binned, 1, -1))) < 1e-5);

        //compressed, where the data is as in the original, but weights 
        //are redome to accommodate repeated observations.
    apop_data_pmf_compress(d);
    asprintf(&d->names->title, "Post compression");
    printdata(d);
    assert(apop_sum(d->weights)==6);

    apop_model *d_as_pmf = apop_estimate(d, apop_pmf);
    Apop_row(d, 0, firstrow); //1A
    assert(fabs(apop_p(firstrow, d_as_pmf) - 2./6 < 1e-5));
}
示例#4
0
//Binomial draws. Input parameter gives the odds of a one; we always make 1,000 draws.
int rng(double *out, gsl_rng *r, apop_model *m){
    double prob = apop_data_get(m->parameters);
    int draws = 1000;
    *out = 0;
    for (int i=0; i< draws; i++) if (gsl_rng_uniform(r) < prob) *out += 1./draws;
    return 0;
}
示例#5
0
int main(){
    apop_opts.db_engine='s'; //SQLite only.
    apop_query("create table atab (a numeric)");
    for (int i=0; i< 1e5; i++)
        apop_query("insert into atab values(ran())");
    apop_query("create table powa as "
            "select a, pow(a, 2) as sq, pow(a, 0.5) as sqrt "
            "from atab");

    //compare the std dev of a uniform as reported by the 
    //database routine, the matrix routine, and math.
    double db_pop_stddev = apop_query_to_float("select stddev_pop(a) from powa");
    apop_data *d = apop_query_to_data("select * from powa");
    apop_data *cov = apop_data_covariance(d);
    double matrix_pop_stddev = sqrt(apop_data_get(cov)*(d->matrix->size1/(d->matrix->size1-1.)));
    assert(fabs(db_pop_stddev - matrix_pop_stddev) < 1e-4);
    double actual_stddev = sqrt(2*gsl_pow_3(.5)/3);
    assert(fabs(db_pop_stddev - actual_stddev) < 1e-3);

    float sq_mean = apop_query_to_float("select avg(sq) from powa");
    float actual_sq_mean = 1./3;
    assert(fabs(sq_mean - actual_sq_mean) < 1e-3);

    float sqrt_mean = apop_query_to_float("select avg(sqrt) from powa");
    float actual_sqrt_mean = 2./3;
    assert(fabs(sqrt_mean - actual_sqrt_mean) < 1e-3);
}
示例#6
0
int main() {
    /* This test is thanks to Nick Eriksson, who sent it to me in the form of a bug report. */
    apop_data * testdata = apop_data_falloc((2, 3),
                              30, 50, 45, 
                              34, 12, 17 );
    apop_data * t2 = apop_test_fisher_exact(testdata);
    assert(fabs(apop_data_get(t2,1) - 0.0001761) < 1e-6);
}
示例#7
0
double one_chi_sq(apop_data *d, int row, int col, int n){
    Apop_row_v(d, row, vr);
    Apop_col_v(d, col, vc);
    double rowexp = apop_vector_sum(vr)/n;
    double colexp = apop_vector_sum(vc)/n;
    double observed = apop_data_get(d, row, col);
    double expected = n * rowexp * colexp;
    return gsl_pow_2(observed - expected)/expected; 
}
示例#8
0
void make_draws(){
    apop_model *multinom = apop_model_copy(apop_multivariate_normal);
    multinom->parameters = apop_data_falloc((2, 2, 2), 
                                        1,  1, .1,
                                        8, .1,  1);
    multinom->dsize = 2;

    apop_model *d1 = apop_estimate(apop_model_draws(multinom), apop_multivariate_normal);
    for (int i=0; i< 2; i++)
        for (int j=-1; j< 2; j++)
            assert(fabs(apop_data_get(multinom->parameters, i, j)
                    - apop_data_get(d1->parameters, i, j)) < .25);
    multinom->draw = NULL; //so draw via MCMC
    apop_model *d2 = apop_estimate(apop_model_draws(multinom, 10000), apop_multivariate_normal);
    for (int i=0; i< 2; i++)
        for (int j=-1; j< 2; j++)
            assert(fabs(apop_data_get(multinom->parameters, i, j)
                    - apop_data_get(d2->parameters, i, j)) < .25);
}
示例#9
0
/** If there is an NaN anywhere in the row of data (including the matrix, the vector, the weights, and the text) then delete the row from the data set.

\li If every row has an NaN, then this returns \c NULL.
\li If \c apop_opts.db_nan is not \c NULL, then I will use that as a regular expression to check the text elements for bad data as well.
\li If \c inplace = 'y', then I'll free each element of the input data
    set and refill it with the pruned elements. I'll still take up (up to)
    twice the size of the data set in memory during the function. If
    every row has an NaN, then your \c apop_data set will end up with
    \c NULL vector, matrix, \dots. if \c inplace = 'n', then the original data set is left unmolested.
\li I only look at the first page of data (i.e. the \c more element is ignored).
\li This function uses the \ref designated syntax for inputs.

    \param d    The data, with NaNs
    \param inplace If \c 'y', clear out the pointer-to-\ref apop_data that
    you sent in and refill with the pruned data. If \c 'n', leave the
    set alone and return a new data set.
    \return     A (potentially shorter) copy of the data set, without
    NaNs. If <tt>inplace=='y'</tt>, redundant with the input. If the entire data set is
    cleared out, then this will be \c NULL.
*/
APOP_VAR_HEAD apop_data * apop_data_listwise_delete(apop_data *d, char inplace){
    apop_data * apop_varad_var(d, NULL);
    if (!d) return NULL;
    char apop_varad_var(inplace, 'n');
APOP_VAR_ENDHEAD
    Get_vmsizes(d) //defines firstcol, vsize, wsize, msize1, msize2.
    apop_assert_c(msize1 || vsize || d->textsize[0], NULL, 0, 
            "You sent to apop_data_listwise_delete a data set with NULL matrix, NULL vector, and no text. "
            "Confused, it is returning NULL.");
    //find out where the NaNs are
    int len = GSL_MAX(vsize ? vsize : msize1, d->textsize[0]); //still some size assumptions here.
    int not_empty = 0;
    int *marked = calloc(len, sizeof(int));
    for (int i=0; i< (vsize ? vsize: msize1); i++)
        for (int j=firstcol; j <msize2; j++){
            if (gsl_isnan(apop_data_get(d, i, j))){
                    marked[i] = 1;
                    break;
            }
        }
    for (int i=0; i< wsize; i++)
        if (gsl_isnan(gsl_vector_get(d->weights, i)))
            marked[i] = 1;
    if (d->textsize[0] && apop_opts.db_nan){
        regex_t    rex;
        int compiled_ok = !regcomp(&rex, apop_opts.db_nan, REG_EXTENDED +  REG_ICASE + REG_NOSUB);
        apop_assert(compiled_ok, "apop_opts.db_nan needs to be a regular expression that "
                                "I can use to check the text element of your data set for "
                                "NaNs, But compiling %s into a regex failed. Or, set "
                                "apop_opts.db_nan=NULL to bypass text checking.", apop_opts.db_nan);
        for(int i=0; i< d->textsize[0]; i++)
            if (!marked[i])
                for(int j=0; j< d->textsize[1]; j++)
                    if (!regexec(&rex, d->text[i][j], 0, 0, 0)){
                        marked[i] ++;
                        break;
                    }
        regfree(&rex);
    }

    //check that at least something isn't NULL.
    for (int i=0; i< len; i++)
        if (!marked[i]){
            not_empty ++;
            break;
        }
    if (!not_empty){
        free(marked);
        return NULL;
    }
    apop_data *out = (inplace=='y'|| inplace=='Y') ? d : apop_data_copy(d);
    apop_data_rm_rows(out, marked);
    free(marked);
    return out;
}
示例#10
0
static void make_covar(apop_model *est){
    int size = est->parameters->vector->size;
    //the trick where we turn the params into a p-vector
    double * pv = est->parameters->vector->data;
    int n = pv[0];
    pv[0] = 1 - (apop_sum(est->parameters->vector)-n);

    apop_data *cov = apop_data_add_page(est->parameters, 
                            apop_data_alloc(size, size), "<Covariance>");
    for (int i=0; i < size; i++){
        double p = apop_data_get(est->parameters, i, -1);
        apop_data_set(cov, i, i, n * p *(1-p));
        for (int j=i+1; j < size; j++){
            double pj = apop_data_get(est->parameters, j, -1);
            double thiscell = -n*p*pj;
            apop_data_set(cov, i, j, thiscell);
            apop_data_set(cov, j, i, thiscell);
        }
    }
    pv[0]=n;
}
示例#11
0
SEXP data_frame_from_apop_data(apop_data *in){
    if (!in) return R_NilValue;
    int numeric_rows = !!(in->vector) 
                        + (in->matrix ? in->matrix->size2 : 0)
                        + !!(in->weights);
    int text_rows = in->textsize[1];
    SEXP out, onerow;
    PROTECT(out = allocVector(VECSXP, numeric_rows + text_rows));
    int col_ct = 0;
    int firstcol = in->vector ? -1 : 0;
    int lastcol = in->matrix ? in->matrix->size2 : 0;
    for (int i= firstcol; i < lastcol; i++){
        int len = (i == -1) ? in->vector->size : in->matrix->size1;
        apop_data *factorpage = find_factor(onerow, in, i);
        if (factorpage){
            SET_VECTOR_ELT(out, col_ct++, (onerow = allocVector(INTSXP, len)));
            for (int j=0; j< len; j++) INTEGER(onerow)[j] = apop_data_get(in, j, i);
            set_factor(onerow, factorpage);
        } else {
            SET_VECTOR_ELT(out, col_ct++, (onerow = allocVector(REALSXP, len)));
            for (int j=0; j< len; j++) REAL(onerow)[j] = apop_data_get(in, j, i);
        }
    }
    for (int i= 0; i < text_rows; i++){
        int len = in->textsize[0];
        SEXP onerow;
        SET_VECTOR_ELT(out, col_ct++, (onerow =  allocVector(STRSXP, len)));
        for (int j=0; j< len; j++) 
            SET_STRING_ELT(onerow, j, mkChar(in->text[j][i])); //Do I need strdup?
    }
    if (in->weights){
        int len =  in->weights->size;
        SET_VECTOR_ELT(out, col_ct++, (onerow = allocVector(REALSXP, len)));
        for (int j=0; j< len; j++) 
            REAL(onerow)[j] = gsl_vector_get(in->weights, j);
    }
    handle_names(in, out);
    UNPROTECT(1);
    return out;
}
示例#12
0
apop_data *apop_data_pmf_expand(apop_data *in, int factor){
    apop_data *expanded = apop_data_alloc();
    apop_vector_normalize(in->weights);
    for (int i=0; i< in->weights->size;i++){
        int wt = gsl_vector_get(in->weights, i)* factor;
        if (wt){
            apop_data *next = apop_data_alloc(wt);
            gsl_vector_set_all(next->vector, apop_data_get(in, i));
            apop_data_stack(expanded, next, .inplace='y');
        }
    }
    if (expanded->vector) return expanded;
    else return NULL;
}
示例#13
0
/** Create a histogram from data by putting data into bins of fixed width. 

\param indata The input data that will be binned. This is copied and the copy will be modified.
\param close_top_bin Normally, a bin covers the range from the point equal to its minimum to points strictly less than
the minimum plus the width.  if \c 'y', then the top bin includes points less than or equal to the upper bound. This solves the problem of displaying histograms where the top bin is just one point.
\param binspec This is an \ref apop_data set with the same number of columns as \c indata. 
If you want a fixed size for the bins, then the first row of the bin spec is the bin width for each column.
This allows you to specify a width for each dimension, or specify the same size for all with something like:

\param bin_count If you don't provide a bin spec, I'll provide this many evenly-sized bins. Default: \f$\sqrt(N)\f$.  \code
Apop_data_row(indata, 0, firstrow);
apop_data *binspec = apop_data_copy(firstrow);
gsl_matrix_set_all(binspec->matrix, 10); //bins of size 10 for all dim.s
apop_data_to_bins(indata, binspec);
\endcode
The presumption is that the first bin starts at zero in all cases. You can add a second row to the spec to give the offset for each dimension.  Default: NULL. if no binspec and no binlist, then a grid with offset equal to the min of the column, and bin size such that it takes \f$\sqrt{N}\f$ bins to cover the range to the max element. 


\return A pointer to a binned \ref apop_data set.  If you didn't give me a binspec, then I attach one to the output set as a page named \c \<binspec\>, so you can snap a second data set to the same grid using 
\code
apop_data_to_bins(first_set, NULL);
apop_data_to_bins(second_set, apop_data_get_page(first_set, "<binspec>"));
\endcode


  The text segment, if any, is not binned. I use \ref apop_data_pmf_compress as the final step in the binning, 
  and that does respect the text segment. 

Here is a sample program highlighting the difference between \ref apop_data_to_bins and \ref apop_data_pmf_compress .

\include binning.c
*/
APOP_VAR_HEAD apop_data *apop_data_to_bins(apop_data *indata, apop_data *binspec, int bin_count, char close_top_bin){
    apop_data *apop_varad_var(indata, NULL);
    Apop_assert_c(indata, NULL, 1, "NULL input data set, so returning NULL output data set.");
    apop_data *apop_varad_var(binspec, NULL);
    char apop_varad_var(close_top_bin, 'n');
    int apop_varad_var(bin_count, 0);
APOP_VAR_ENDHEAD
    Get_vmsizes(indata); //firstcol, vsize, msize1, msize2
    double binwidth, offset, max=0;
    apop_data *out = apop_data_copy(indata);
    apop_data *bs = binspec ? binspec
                    : apop_data_add_page(out, 
                        apop_data_alloc(vsize? 2: 0, msize1? 2: 0, indata->matrix ? msize2: 0),
                        "<binspec>");
    for (int j= firstcol; j< msize2; j++){
        Apop_col(out, j, onecol);
        if (binspec){
           binwidth = apop_data_get(binspec, 0, j);
           offset = ((binspec->vector && binspec->vector->size==2 )
                   ||(binspec->matrix && binspec->matrix->size1==2)) ? apop_data_get(binspec, 1, j) : 0;
        } else {
            Apop_col(bs, j, abin);
            max = gsl_vector_max(onecol);
            offset = abin->data[1] = gsl_vector_min(onecol);
            binwidth = abin->data[0] = (max - offset)/(bin_count ? bin_count : sqrt(onecol->size));
        }
        for (int i=0; i< onecol->size; i++){
            double val = gsl_vector_get(onecol, i);
            if (close_top_bin=='y' && val == max && val!=offset) 
                val -= 2*GSL_DBL_EPSILON;
            gsl_vector_set(onecol, i, (floor((val -offset)/binwidth))*binwidth+offset);
        }
    }
    apop_data_pmf_compress(out);
    return out;
}
示例#14
0
int main(){
    int len = 3000;
    gsl_vector *v = gsl_vector_alloc(len);
    for (double i=0; i< len; i++) gsl_vector_set(v, i, 1./(i+1));
    double square;
    gsl_blas_ddot(v, v, &square);
    printf("1 + (1/2)^2 + (1/3)^2 + ...= %g\n", square);

    double pi_over_six = gsl_pow_2(M_PI)/6.;
    Diff(square, pi_over_six);

    /* Now using apop_dot, in a few forms.
       First, vector-as-data dot itself.
       If one of the inputs is a vector,
       apop_dot puts the output in a vector-as-data:*/
    apop_data *v_as_data = &(apop_data){.vector=v};
    apop_data *vdotv = apop_dot(v_as_data, v_as_data);
    Diff(gsl_vector_get(vdotv->vector, 0), pi_over_six);

    /* Wrap matrix in an apop_data set. */
    gsl_matrix *v_as_matrix = apop_vector_to_matrix(v);
    apop_data dm = (apop_data){.matrix=v_as_matrix};

    // (1 X len) vector dot (len X 1) matrix --- produce a scalar (one item vector).
    apop_data *mdotv = apop_dot(v_as_data, &dm);
    double scalarval = apop_data_get(mdotv);
    Diff(scalarval, pi_over_six);

    //(len X 1) dot (len X 1) --- bad dimensions.
    apop_opts.verbose=-1; //don't print an error.
    apop_data *mdotv2 = apop_dot(&dm, v_as_data);
    apop_opts.verbose=0; //back to safety.
    assert(mdotv2->error);

    // If we want (len X 1) dot (1 X len) --> (len X len),
    // use apop_vector_to_matrix.
    apop_data dmr = (apop_data){.matrix=apop_vector_to_matrix(v, .row_col='r')};
    apop_data *product_matrix = apop_dot(&dm, &dmr);
    //The trace is the sum of squares:
    gsl_vector_view trace = gsl_matrix_diagonal(product_matrix->matrix);
    double tracesum = apop_sum(&trace.vector);
    Diff(tracesum, pi_over_six);

    apop_data_free(product_matrix);
    gsl_matrix_free(dmr.matrix);
}
示例#15
0
apop_data *kappa_and_pi(apop_data const *tab_in){
    apop_data *out = apop_data_alloc();
    Apop_stopif(!tab_in, out->error='n'; return out, 0, "NULL input. Returning output with 'n' error code.");
    Apop_stopif(!tab_in->matrix, out->error='m'; return out, 0, "NULL input matrix. Returning output with 'm' error code.");
    Apop_stopif(tab_in->matrix->size1 != tab_in->matrix->size2, out->error='s'; return out, 0, "Input rows=%zu; input cols=%zu; "
                    "these need to be equal. Returning output with error code 's'.", tab_in->matrix->size1, tab_in->matrix->size2);

    apop_data *tab = apop_data_copy(tab_in);
    double total = apop_matrix_sum(tab->matrix);
    gsl_matrix_scale(tab->matrix, 1./total);
    double p_o = 0, p_e = 0, scott_pe = 0, ia = 0, row_ent = 0, col_ent = 0;
    for (int c=0; c< tab->matrix->size1; c++){
        double this_obs = apop_data_get(tab, c, c);
        p_o += this_obs;

        Apop_row_v(tab, c, row);
        Apop_col_v(tab, c, col);
        double rsum = apop_sum(row);
        double csum = apop_sum(col);
        p_e += rsum * csum;
        scott_pe += pow((rsum+csum)/2, 2);

        ia += this_obs * log2(this_obs/(rsum * csum));
        row_ent -= rsum * log2(rsum);
        col_ent -= csum * log2(csum);
    }
    apop_data_free(tab);

    asprintf(&out->names->title, "Scott's π and Cohen's κ");
    apop_data_add_named_elmt(out, "total count", total);
    apop_data_add_named_elmt(out, "percent agreement", p_o);
    apop_data_add_named_elmt(out, "κ", ((p_e==1)? 0: (p_o - p_e) / (1-p_e) ));
    apop_data_add_named_elmt(out, "π", ((p_e==1)? 0: (p_o - scott_pe) / (1-scott_pe)));
    apop_data_add_named_elmt(out, "P_I", ia/((row_ent+col_ent)/2));
    apop_data_add_named_elmt(out, "Cohen's p_e", p_e);
    apop_data_add_named_elmt(out, "Scott's p_e", scott_pe);
    apop_data_add_named_elmt(out, "information in agreement", ia);
    apop_data_add_named_elmt(out, "row entropy", row_ent);
    apop_data_add_named_elmt(out, "column entropy", col_ent);
    return out;
}
示例#16
0
/* The hard part is when your candidate point does not satisfy other
   constraints, so you need to translate the point until it meets the new hypersurface.
   How far is that? Project beta onto the new surface, and find the
   distance between that projection and the original surface. Then
   translate beta toward the original surface by that amount. The
   projection of the translated beta onto the new surface now also touches the old
   surface.
   */
static void get_candiate(gsl_vector *beta, apop_data *constraint, int current, gsl_vector *candidate, double margin){
    double k, ck, off_by, s;
    gsl_vector *pseudobeta        = NULL;
    gsl_vector *pseudocandidate   = NULL;
    gsl_vector *pseudocandidate2  = NULL;
    gsl_vector *fix               = NULL;
    Apop_row_v(constraint, current, cc);
    ck = gsl_vector_get(constraint->vector, current);
    find_nearest_point(beta, ck, cc, candidate);
    for (size_t i=0; i< constraint->vector->size; i++){
        if (i!=current){
            Apop_row_v(constraint, i, other);
            k   =apop_data_get(constraint, i, -1);
            if (binds(candidate, k, other, margin)){
                if (!pseudobeta){
                    pseudobeta          = gsl_vector_alloc(beta->size);
                    gsl_vector_memcpy(pseudobeta, beta);
                    pseudocandidate     = gsl_vector_alloc(beta->size);
                    pseudocandidate2    = gsl_vector_alloc(beta->size);
                    fix                 = gsl_vector_alloc(beta->size);
                }
                find_nearest_point(pseudobeta, k, other, pseudocandidate);
                find_nearest_point(pseudocandidate, ck, cc, pseudocandidate2);
                off_by  = apop_vector_distance(pseudocandidate, pseudocandidate2);
                s       = trig_bit(cc, other, off_by);
                gsl_vector_memcpy(fix, cc);
                gsl_vector_scale(fix, magnitude(cc));
                gsl_vector_scale(fix, s);
                gsl_vector_add(pseudobeta, fix);
                find_nearest_point(pseudobeta, k, other, candidate);
                gsl_vector_memcpy(pseudobeta, candidate);
            } 
        }
    }
    if (fix){ 
        gsl_vector_free(fix); gsl_vector_free(pseudobeta);
        gsl_vector_free(pseudocandidate); gsl_vector_free(pseudocandidate2);
    }
}
示例#17
0
文件: factors.c 项目: b-k/apophenia
int main(){
    apop_data *d1 = apop_text_alloc(NULL, 5, 1);
    apop_data *d2 = apop_text_alloc(NULL, 5, 1);
    apop_text_fill(d1, "A", "B", "C", "B", "B");
    apop_text_fill(d2, "B", "A", "D", "B", "B");
    apop_data_to_factors(d1);
    apop_data_show(d1);
    d2->more = apop_data_copy(apop_data_get_factor_names(d1, 0, 't'));
    printf("-----\n");
    apop_data_to_dummies(d2, .append='y');
    apop_data_show(d2);


    //some spot checks.
    assert(apop_data_get(d1, 2)==2);
    assert(apop_data_get(d2, 0, 0)==1);
    assert(apop_data_get(d2, 2, 0)==0);
    assert(apop_data_get(d2, 2, 1)==0);
    assert(apop_data_get(d2, 2, 2)==1);
    assert(apop_data_get(d2, 3, 0)==1);
}
示例#18
0
apop_data* multiple_imputation_variance_base(multiple_imputation_variance_t in){
    /*The first half of this is filling in the values. In an attempt at versatility, I allow users to 
      give any named column, be it numeric or text, for every piece of input info. That means a whole lot 
      of checking around to determine what goes where---and a macro.  */

    Apop_assert_c(in.base_data,NULL, 1, "It doesn't make sense to impute over a NULL data set.");
    Apop_assert_c(in.fill_ins, NULL, 1, "Didn't receive a fill-in table. Returning NULL.");
    data_to_data stat = in.stat? in.stat : colmeans;

//At the end of this macro, you've got rowcol and rowtype, valuecol and valuetype, &c.
#define apop_setup_one_colthing(c) \
    int c##col = apop_name_find(in.fill_ins->names, in.c##_name, 'c');   \
    int c##type = 'd';         \
    if (c##col==-2){           \
        c##col = apop_name_find(in.fill_ins->names, in.c##_name, 't');   \
        c##type = 't';         \
       Apop_assert(c##col!=-2, "I couldn't find the c##_name %s in the column/text names of your fill_in table.", in.c##_name);    \
    }

    apop_setup_one_colthing(row)
    apop_setup_one_colthing(col)
    apop_setup_one_colthing(value)
    apop_setup_one_colthing(imputation)

    Apop_assert(!(rowtype=='t' && !in.base_data->names->rowct),
            "the rowname you gave refers to text, so I will be searching for a row name in the base data."
            " But the base_data set has no row names.");
    Apop_assert(!(coltype=='t' && !in.base_data->names->colct),
            "the colname you gave refers to text, so I will be searching for a column name in the base data."
            " But the base_data set has no column names.");

    //get a list of unique imputation markers.
    gsl_vector *imps = NULL;
    apop_data *impt = NULL; 
    if (imputationtype == 'd'){
        Apop_col_v(in.fill_ins, imputationcol, ic);
        imps = apop_vector_unique_elements(ic);
    } else impt = apop_text_unique_elements(in.fill_ins, imputationcol);

    int len = imps ? imps->size : impt->textsize[0];
    int thisimp=-2; char *thisimpt=NULL;
	apop_data *estimates[len];
    for (int impctr=0; impctr< len; impctr++){
        if (imps) thisimp  = gsl_vector_get(imps, impctr);
        else      thisimpt = impt->text[impctr][0];
        Get_vmsizes(in.fill_ins); //masxize
        int fillsize = maxsize ? maxsize : in.fill_ins->textsize[0];
        for (int i=0; i< fillsize; i++){
            if (!(thisimpt && apop_strcmp(in.fill_ins->text[i][imputationcol], thisimpt))
                && !(imps && thisimp==apop_data_get(in.fill_ins, i, imputationcol)))
                continue;
            int thisrow = (rowtype=='d') ? 
                                apop_data_get(in.fill_ins, i, rowcol)
                               :apop_name_find(in.base_data->names, in.fill_ins->text[i][rowcol], 'r');
            int thiscol = (coltype=='d') ? 
                                apop_data_get(in.fill_ins, i, colcol)
                               :apop_name_find(in.base_data->names, in.fill_ins->text[i][colcol], 'c');
            if (valuetype=='d') apop_data_set(in.base_data, thisrow, thiscol, 
                                            apop_data_get(in.fill_ins, i, valuecol));
            else apop_text_add(in.base_data, rowcol, colcol, in.fill_ins->text[i][valuecol]);
        }
        //OK, base_data is now filled in. Estimate the statistic for it.
		estimates[impctr] = stat(in.base_data);
    }


    //Part II: find the mean of the statistics and the total variance of the cov matrix.
	gsl_vector *vals = gsl_vector_alloc(len);
    apop_data *out = apop_data_copy(estimates[0]);
	//take the simple mean of the main data set.
	{ //this limits the scope of the Get_vmsizes macro.
	 Get_vmsizes(estimates[0]); 
     for (int j=0; j < msize2; j++)
         for (int i=0; i < (vsize ? vsize : msize1); i++){
            for (int k=0; k< len; k++)
                gsl_vector_set(vals, k, apop_data_get(estimates[k], i, j));
             apop_data_set(out, i, j, apop_vector_mean(vals));
         }
	}
    apop_data *out_var = apop_data_get_page(estimates[0], "<Covariance>");
    int cov_is_labelled = out_var !=NULL;
    if (!cov_is_labelled){
        asprintf(&out->more->names->title, "<Covariance>");
        out_var = estimates[0]->more;
    }
	Get_vmsizes(out_var);
    for (int i=0; i < msize1; i++)
        for (int j=i; j < msize2; j++){
            for (int k=0; k< len; k++){
                apop_data *this_p = cov_is_labelled ? apop_data_get_page(estimates[k], "<Covariance>")
                                        : estimates[k]->more;
                gsl_vector_set(vals, k, apop_data_get(this_p, i, j));
            }
            double total_var = apop_vector_mean(vals) + apop_var(vals)/(1+1./len);
            apop_data_set(out_var, i, j, total_var);
            if (j != i)
                apop_data_set(out_var, j, i, total_var);
        }
    return out;	
}
示例#19
0
    apop_data_set(m->parameters, 0, -1, GSL_NAN);
    apop_data *summ=apop_data_summarize(d);
    Apop_col_t(summ, "mean", means);
    gsl_vector *t = m->parameters->vector; //mask this while unpacking
    m->parameters->vector=NULL;
    apop_data_unpack(means, m->parameters);
    m->parameters->vector=t;

    //Estimate a model with fixed cov matrix and blank (NaN) df.
    apop_model *modified_wish = apop_model_copy(m);
    modified_wish->log_likelihood = fixed_wishart_ll;
    apop_model *fixed_wish = apop_model_fix_params(modified_wish);
    apop_model *est_via_fix = apop_estimate(d, fixed_wish);

    //copy df from fixed version to the real thing; clean up.
    t->data[0] = apop_data_get(est_via_fix->parameters, 0, -1);
    gsl_matrix_scale(m->parameters->matrix, 1./t->data[0]);
    apop_data_free(summ);
    apop_model_free(modified_wish);
    apop_model_free(fixed_wish);
}

/* amodel apop_wishart The Wishart distribution, which is currently somewhat untested. 

Here's the likelihood function. \f$p\f$ is the dimension of the data and covariance
matrix, \f$n\f$ is the degrees of freedom, \f$\mathbf{V}\f$ is the \f$p\times p\f$
matrix of Wishart parameters, and \f${\mathbf{W}}\f$ is the \f$p\times p\f$ matrix whose
likelihood is being evaluated.  \f$\Gamma_p(\cdot)\f$ is the \ref apop_multivariate_gamma
"multivariate gamma function".

\f[
示例#20
0
static void apop_data_print_core(const apop_data *data, FILE *f, char displaytype){
    if (!data){
        fprintf(f, "NULL\n");
        return;
    }
    int i, j, L = 0, 
        start   = (data->vector)? -1 : 0,
        end     = (data->matrix)? data->matrix->size2 : 0,
        rowend  = (data->matrix)? data->matrix->size1 : (data->vector) ? data->vector->size : data->text ? data->textsize[0] : -1;
    if (data->names && data->names->title && strlen(data->names->title))
        fprintf(f, "\t%s\n\n", data->names->title);
    if (data->names && data->names->rowct)
        L   = get_max_strlen(data->names->row, data->names->rowct);
    if (data->names && data->names->rowct && (data->names->vector || data->names->colct || data->names->textct)){
        if ((apop_opts.db_name_column || *apop_opts.db_name_column=='\0') || 
                !strcmp(apop_opts.db_name_column, "row_names"))
            fprintf(f, "%*s  ", L+2, " ");
        else { fprintf(f, "%s", apop_opts.db_name_column); a_pipe(f, displaytype); }
    }
    if (data->vector && data->names && data->names->vector){
        fprintf(f, "%s", data->names->vector);
    }
    if (data->matrix){
        if (data->vector && data->names && data->names->colct){
            fprintf(f, "%c ", data->names->vector ? ' ' : '\t' );
            a_pipe(f, displaytype);
        }
        if (data->names) 
          for(i=0; i< data->names->colct; i++){
            if (i < data->names->colct -1)
                fprintf(f, "%s%s", data->names->col[i], apop_opts.output_delimiter);
            else
                fprintf(f, "%s", data->names->col[i]);
        }
    }
    if (data->textsize[1] && data->names && data->names->textct){
        if ((data->vector && data->names && data->names->vector) || (data->matrix && data->names->colct))
            a_pipe(f, displaytype);
        if (data->names)
          for(i=0; i< data->names->textct; i++){
            if (i < data->names->textct -1)
                fprintf(f, "%s%s", data->names->text[i], apop_opts.output_delimiter);
            else
                fprintf(f, "%s", data->names->text[i]);
        }
    }
    if(data->names && (data->names->vector || data->names->colct || data->names->textct))
        fprintf(f, "\n");
    for(j=0; j< rowend; j++){
        if (data->names && data->names->rowct > j)
            fprintf(f, "%*s%s", L+2, data->names->row[j], apop_opts.output_delimiter);
        for(i=start; i< end; i++){
            if ((i < 0 && j < data->vector->size) || (i>= 0 && j < data->matrix->size1 && i < data->matrix->size2))
                p_fn(f,  apop_data_get(data, j, i));
            else
                fprintf(f, " ");
            if (i==-1 && data->matrix) 
                a_pipe(f, displaytype);
            if (i < end-1)
                fprintf(f, "%s", apop_opts.output_delimiter);
        }
        if (data->text){
            if (data->vector || data->matrix)
                a_pipe(f, displaytype);
            if (j < data->textsize[0])
                for(i=0; i< data->textsize[1]; i++){
                    fprintf(f, "%s", data->text[j][i]);
                    if (i < data->textsize[1]-1) fprintf(f, "%s", apop_opts.output_delimiter);
                }
        }
        if (data->weights && j < data->weights->size){
            a_pipe(f, displaytype);
            p_fn(f, data->weights->data[j]);
        }
        fprintf(f, "\n");
    }
}
示例#21
0
long double apop_linear_constraint(gsl_vector *beta, apop_data * constraint, double margin){
#else
apop_varad_head(long double, apop_linear_constraint){
    static threadlocal apop_data *default_constraint;
    gsl_vector * apop_varad_var(beta, NULL);
    double apop_varad_var(margin, 0);
    apop_data * apop_varad_var(constraint, NULL);
    Apop_assert(beta, "The vector to be checked is NULL.");
    if (!constraint){
        if (default_constraint && beta->size != default_constraint->vector->size){
            apop_data_free(default_constraint);
            default_constraint = NULL;
        }
        if (!default_constraint){
            default_constraint = apop_data_alloc(0,beta->size, beta->size);
            default_constraint->vector = gsl_vector_calloc(beta->size);
            gsl_matrix_set_identity(default_constraint->matrix);
        }
        constraint = default_constraint;
    }
    return apop_linear_constraint_base(beta, constraint, margin);
}

 long double apop_linear_constraint_base(gsl_vector *beta, apop_data * constraint, double margin){
#endif
    static threadlocal gsl_vector *closest_pt = NULL;
    static threadlocal gsl_vector *candidate  = NULL;
    static threadlocal gsl_vector *fix        = NULL;
    int constraint_ct = constraint->matrix->size1;
    int bindlist[constraint_ct];
    int i, bound = 0;
    /* For added efficiency, keep a scratch vector or two on hand. */
    if (closest_pt==NULL || closest_pt->size != constraint->matrix->size2){
        closest_pt  = gsl_vector_calloc(beta->size);
        candidate   = gsl_vector_alloc(beta->size);
        fix         = gsl_vector_alloc(beta->size);
        closest_pt->data[0] = GSL_NEGINF;
    }
    /* Do any constraints bind?*/
    memset(bindlist, 0, sizeof(int)*constraint_ct);
    for (i=0; i< constraint_ct; i++){
        Apop_row_v(constraint, i, c);
        bound           +=
        bindlist[i]      = binds(beta, apop_data_get(constraint, i, -1), c, margin);
    }
    if (!bound) return 0;   //All constraints met.
    gsl_vector *base_beta = apop_vector_copy(beta);
    /* With only one constraint, it's easy. */
    if (constraint->vector->size==1){
        Apop_row_v(constraint, 0, c);
        find_nearest_point(base_beta, constraint->vector->data[0], c, beta);
        goto add_margin;
    }
    /* Finally, multiple constraints, at least one binding.
       For each surface, pick a candidate point.
       Check whether the point meets the other constraints. 
            if not, translate to a new point that works.
            [Do this by maintaining a pseudopoint that translates by the
            necessary amount.]
        Once you have a candidate point, compare its distance to the
        current favorite; keep the best.
     */
    for (i=0; i< constraint_ct; i++)
        if (bindlist[i]){
            get_candiate(base_beta, constraint, i, candidate, margin);
            if(apop_vector_distance(base_beta, candidate) < apop_vector_distance(base_beta, closest_pt))
                gsl_vector_memcpy(closest_pt, candidate);
        }
    gsl_vector_memcpy(beta, closest_pt);
add_margin:
    for (i=0; i< constraint_ct; i++){
        if(bindlist[i]){
            Apop_row_v(constraint, i, c);
            gsl_vector_memcpy(fix, c);
            gsl_vector_scale(fix, magnitude(fix));
            gsl_vector_scale(fix, margin);
            gsl_vector_add(beta, fix);
        }
    }
    long double out = apop_vector_distance(base_beta, beta);
    gsl_vector_free(base_beta);
    return out;
}
示例#22
0
//For centering a uniform distribution around a point. 
//Cut/pasted from the Apophenia documentation, but narrowed range to ±.0.8. This also took some tweaking.
//The uniform is not a good choice (and ruins the covariance estimate), but the premise
//was that we don't know the formula for a Normal distribution.
void set_midpoint(apop_data * in, apop_model *m){
    apop_data_set(m->parameters, 0, -1, apop_data_get(in)-0.07);
    apop_data_set(m->parameters, 1, -1, apop_data_get(in)+0.07);
}
示例#23
0
文件: jacobian.c 项目: b-k/apophenia
apop_data *rev(apop_data *in){ return apop_map(in, .fn_d=log, .part='a'); }

/*The derivative of the transformed-to-base function. */
double inv(double in){return 1./in;} 
double rev_j(apop_data *in){ return fabs(apop_map_sum(in, .fn_d=inv, .part='a')); }

int main(){
    apop_model *ct = apop_model_coordinate_transform(
                        .transformed_to_base= rev, .jacobian_to_base=rev_j,
                        .base_model=apop_normal);
    //Apop_model_add_group(ct, apop_parts_wanted);//Speed up the MLE.

    //make fake data
    double mu=2, sigma=1;
    apop_data *d = draw_exponentiated_normal(mu, sigma, 2e5);

    //If we correctly replicated a Lognormal, mu and sigma will be right:
    apop_model *est = apop_estimate(d, ct);
    apop_model_free(ct);
    Diff(apop_data_get(est->parameters, 0), mu);
    Diff(apop_data_get(est->parameters, 1), sigma);

    /*The K-L divergence between our Lognormal and the stock Lognormal
      should be small. Try it with both the original params and the estimated ones. */
    apop_model *ln = apop_model_set_parameters(apop_lognormal, mu, sigma);
    apop_model *ln2 = apop_model_copy(apop_lognormal);
    ln2->parameters = est->parameters;
    Diff(apop_kl_divergence(ln, ln2,.draw_ct=1000), 0);
    Diff(apop_kl_divergence(ln, est,.draw_ct=1000), 0);
}