Exemple #1
0
double one_chi_sq(apop_data *d, int row, int col, int n){
    Apop_row_v(d, row, vr);
    Apop_col_v(d, col, vc);
    double rowexp = apop_vector_sum(vr)/n;
    double colexp = apop_vector_sum(vc)/n;
    double observed = apop_data_get(d, row, col);
    double expected = n * rowexp * colexp;
    return gsl_pow_2(observed - expected)/expected; 
}
apop_data *kappa_and_pi(apop_data const *tab_in){
    apop_data *out = apop_data_alloc();
    Apop_stopif(!tab_in, out->error='n'; return out, 0, "NULL input. Returning output with 'n' error code.");
    Apop_stopif(!tab_in->matrix, out->error='m'; return out, 0, "NULL input matrix. Returning output with 'm' error code.");
    Apop_stopif(tab_in->matrix->size1 != tab_in->matrix->size2, out->error='s'; return out, 0, "Input rows=%zu; input cols=%zu; "
                    "these need to be equal. Returning output with error code 's'.", tab_in->matrix->size1, tab_in->matrix->size2);

    apop_data *tab = apop_data_copy(tab_in);
    double total = apop_matrix_sum(tab->matrix);
    gsl_matrix_scale(tab->matrix, 1./total);
    double p_o = 0, p_e = 0, scott_pe = 0, ia = 0, row_ent = 0, col_ent = 0;
    for (int c=0; c< tab->matrix->size1; c++){
        double this_obs = apop_data_get(tab, c, c);
        p_o += this_obs;

        Apop_row_v(tab, c, row);
        Apop_col_v(tab, c, col);
        double rsum = apop_sum(row);
        double csum = apop_sum(col);
        p_e += rsum * csum;
        scott_pe += pow((rsum+csum)/2, 2);

        ia += this_obs * log2(this_obs/(rsum * csum));
        row_ent -= rsum * log2(rsum);
        col_ent -= csum * log2(csum);
    }
    apop_data_free(tab);

    asprintf(&out->names->title, "Scott's π and Cohen's κ");
    apop_data_add_named_elmt(out, "total count", total);
    apop_data_add_named_elmt(out, "percent agreement", p_o);
    apop_data_add_named_elmt(out, "κ", ((p_e==1)? 0: (p_o - p_e) / (1-p_e) ));
    apop_data_add_named_elmt(out, "π", ((p_e==1)? 0: (p_o - scott_pe) / (1-scott_pe)));
    apop_data_add_named_elmt(out, "P_I", ia/((row_ent+col_ent)/2));
    apop_data_add_named_elmt(out, "Cohen's p_e", p_e);
    apop_data_add_named_elmt(out, "Scott's p_e", scott_pe);
    apop_data_add_named_elmt(out, "information in agreement", ia);
    apop_data_add_named_elmt(out, "row entropy", row_ent);
    apop_data_add_named_elmt(out, "column entropy", col_ent);
    return out;
}
Exemple #3
0
apop_data* multiple_imputation_variance_base(multiple_imputation_variance_t in){
    /*The first half of this is filling in the values. In an attempt at versatility, I allow users to 
      give any named column, be it numeric or text, for every piece of input info. That means a whole lot 
      of checking around to determine what goes where---and a macro.  */

    Apop_assert_c(in.base_data,NULL, 1, "It doesn't make sense to impute over a NULL data set.");
    Apop_assert_c(in.fill_ins, NULL, 1, "Didn't receive a fill-in table. Returning NULL.");
    data_to_data stat = in.stat? in.stat : colmeans;

//At the end of this macro, you've got rowcol and rowtype, valuecol and valuetype, &c.
#define apop_setup_one_colthing(c) \
    int c##col = apop_name_find(in.fill_ins->names, in.c##_name, 'c');   \
    int c##type = 'd';         \
    if (c##col==-2){           \
        c##col = apop_name_find(in.fill_ins->names, in.c##_name, 't');   \
        c##type = 't';         \
       Apop_assert(c##col!=-2, "I couldn't find the c##_name %s in the column/text names of your fill_in table.", in.c##_name);    \
    }

    apop_setup_one_colthing(row)
    apop_setup_one_colthing(col)
    apop_setup_one_colthing(value)
    apop_setup_one_colthing(imputation)

    Apop_assert(!(rowtype=='t' && !in.base_data->names->rowct),
            "the rowname you gave refers to text, so I will be searching for a row name in the base data."
            " But the base_data set has no row names.");
    Apop_assert(!(coltype=='t' && !in.base_data->names->colct),
            "the colname you gave refers to text, so I will be searching for a column name in the base data."
            " But the base_data set has no column names.");

    //get a list of unique imputation markers.
    gsl_vector *imps = NULL;
    apop_data *impt = NULL; 
    if (imputationtype == 'd'){
        Apop_col_v(in.fill_ins, imputationcol, ic);
        imps = apop_vector_unique_elements(ic);
    } else impt = apop_text_unique_elements(in.fill_ins, imputationcol);

    int len = imps ? imps->size : impt->textsize[0];
    int thisimp=-2; char *thisimpt=NULL;
	apop_data *estimates[len];
    for (int impctr=0; impctr< len; impctr++){
        if (imps) thisimp  = gsl_vector_get(imps, impctr);
        else      thisimpt = impt->text[impctr][0];
        Get_vmsizes(in.fill_ins); //masxize
        int fillsize = maxsize ? maxsize : in.fill_ins->textsize[0];
        for (int i=0; i< fillsize; i++){
            if (!(thisimpt && apop_strcmp(in.fill_ins->text[i][imputationcol], thisimpt))
                && !(imps && thisimp==apop_data_get(in.fill_ins, i, imputationcol)))
                continue;
            int thisrow = (rowtype=='d') ? 
                                apop_data_get(in.fill_ins, i, rowcol)
                               :apop_name_find(in.base_data->names, in.fill_ins->text[i][rowcol], 'r');
            int thiscol = (coltype=='d') ? 
                                apop_data_get(in.fill_ins, i, colcol)
                               :apop_name_find(in.base_data->names, in.fill_ins->text[i][colcol], 'c');
            if (valuetype=='d') apop_data_set(in.base_data, thisrow, thiscol, 
                                            apop_data_get(in.fill_ins, i, valuecol));
            else apop_text_add(in.base_data, rowcol, colcol, in.fill_ins->text[i][valuecol]);
        }
        //OK, base_data is now filled in. Estimate the statistic for it.
		estimates[impctr] = stat(in.base_data);
    }


    //Part II: find the mean of the statistics and the total variance of the cov matrix.
	gsl_vector *vals = gsl_vector_alloc(len);
    apop_data *out = apop_data_copy(estimates[0]);
	//take the simple mean of the main data set.
	{ //this limits the scope of the Get_vmsizes macro.
	 Get_vmsizes(estimates[0]); 
     for (int j=0; j < msize2; j++)
         for (int i=0; i < (vsize ? vsize : msize1); i++){
            for (int k=0; k< len; k++)
                gsl_vector_set(vals, k, apop_data_get(estimates[k], i, j));
             apop_data_set(out, i, j, apop_vector_mean(vals));
         }
	}
    apop_data *out_var = apop_data_get_page(estimates[0], "<Covariance>");
    int cov_is_labelled = out_var !=NULL;
    if (!cov_is_labelled){
        asprintf(&out->more->names->title, "<Covariance>");
        out_var = estimates[0]->more;
    }
	Get_vmsizes(out_var);
    for (int i=0; i < msize1; i++)
        for (int j=i; j < msize2; j++){
            for (int k=0; k< len; k++){
                apop_data *this_p = cov_is_labelled ? apop_data_get_page(estimates[k], "<Covariance>")
                                        : estimates[k]->more;
                gsl_vector_set(vals, k, apop_data_get(this_p, i, j));
            }
            double total_var = apop_vector_mean(vals) + apop_var(vals)/(1+1./len);
            apop_data_set(out_var, i, j, total_var);
            if (j != i)
                apop_data_set(out_var, j, i, total_var);
        }
    return out;	
}
Exemple #4
0
apop_data *apop_data_from_frame(SEXP in){
    apop_data *out;
    if (TYPEOF(in)==NILSXP) return NULL;

    PROTECT(in);
    assert(TYPEOF(in)==VECSXP); //I should write a check for this on the R side.
    int total_cols=LENGTH(in);
    int total_rows=LENGTH(VECTOR_ELT(in,0));
    int char_cols = 0;
    for (int i=0; i< total_cols; i++){
        SEXP this_col = VECTOR_ELT(in, i);
        char_cols += (TYPEOF(this_col)==STRSXP);
    }
    SEXP rl, cl;
    //const char *rn, *cn;
    //GetMatrixDimnames(in, &rl, &cl, &rn, &cn);
    PROTECT(cl = getAttrib(in, R_NamesSymbol));
    PROTECT(rl = getAttrib(in, R_RowNamesSymbol));

    int current_numeric_col=0, current_text_col=0, found_vector=0;

    if(cl !=R_NilValue && TYPEOF(cl)==STRSXP) //just check for now.
        for (int ndx=0; ndx < LENGTH(cl) && !found_vector; ndx++)
            if (!strcmp(translateChar(STRING_ELT(cl, ndx)), "Vector")) found_vector++;

    int matrix_cols= total_cols-char_cols-found_vector;
    out= apop_data_alloc((found_vector?total_rows:0), (matrix_cols?total_rows:0),  matrix_cols);
    if (char_cols) out=apop_text_alloc(out, total_rows, char_cols);

    if(rl !=R_NilValue)
        for (int ndx=0; ndx < LENGTH(rl); ndx++)
            if (TYPEOF(rl)==STRSXP)
                apop_name_add(out->names, translateChar(STRING_ELT(rl, ndx)), 'r');
            else //let us guess that it's a numeric list and hope the R Project one day documents this stuff.
                {char *ss; asprintf(&ss, "%i", ndx); apop_name_add(out->names, ss, 'r'); free(ss);}

    for (int i=0; i< total_cols; i++){
        const char *colname = NULL;
        if(cl !=R_NilValue)
            colname = translateChar(STRING_ELT(cl, i));
        SEXP this_col = VECTOR_ELT(in, i);
        if (TYPEOF(this_col) == STRSXP){
            //could this be via aliases instead of copying?
            //printf("col %i is chars\n", i);
            if(colname) apop_name_add(out->names, colname, 't');
            for (int j=0; j< total_rows; j++)
                apop_text_add(out, j, current_text_col,
				(STRING_ELT(this_col,j)==NA_STRING ? apop_opts.nan_string : translateChar(STRING_ELT(this_col, j))));
            current_text_col++;
            continue;
        } else {    //plain old matrix data.
            int col_in_question = current_numeric_col;
            if (colname && !strcmp(colname, "Vector")) {
                out->vector = gsl_vector_alloc(total_rows);
                col_in_question = -1;
            } else {current_numeric_col++;}
            Apop_col_v(out, col_in_question, onecol);
            if (TYPEOF(this_col) == INTSXP){
                //printf("col %i is ints\n", i);
                int *vals = INTEGER(this_col);
                for (int j=0; j< onecol->size; j++){
					//printf("%i\n",vals[j]);
                    gsl_vector_set(onecol, j, (vals[j]==NA_INTEGER ? GSL_NAN : vals[j]));
					}
            } else {
                double *vals = REAL(this_col);
                for (int j=0; j< onecol->size; j++)
                    gsl_vector_set(onecol, j, (ISNAN(vals[j])||ISNA(vals[j]) ? GSL_NAN : vals[j]));
            }
            if(colname && col_in_question > -1) apop_name_add(out->names, colname, 'c');
            else apop_name_add(out->names, colname, 'v'); //which is "vector".
        }
        //Factors
        SEXP ls = getAttrib(this_col, R_LevelsSymbol);
        if (ls){
            apop_data *end;//find last page for adding factors.
            for(end=out; end->more!=NULL; end=end->more);
            end->more = get_factors(ls, colname);
        }
    }
    UNPROTECT(3);
    return out;
}