double one_chi_sq(apop_data *d, int row, int col, int n){ Apop_row_v(d, row, vr); Apop_col_v(d, col, vc); double rowexp = apop_vector_sum(vr)/n; double colexp = apop_vector_sum(vc)/n; double observed = apop_data_get(d, row, col); double expected = n * rowexp * colexp; return gsl_pow_2(observed - expected)/expected; }
apop_data *kappa_and_pi(apop_data const *tab_in){ apop_data *out = apop_data_alloc(); Apop_stopif(!tab_in, out->error='n'; return out, 0, "NULL input. Returning output with 'n' error code."); Apop_stopif(!tab_in->matrix, out->error='m'; return out, 0, "NULL input matrix. Returning output with 'm' error code."); Apop_stopif(tab_in->matrix->size1 != tab_in->matrix->size2, out->error='s'; return out, 0, "Input rows=%zu; input cols=%zu; " "these need to be equal. Returning output with error code 's'.", tab_in->matrix->size1, tab_in->matrix->size2); apop_data *tab = apop_data_copy(tab_in); double total = apop_matrix_sum(tab->matrix); gsl_matrix_scale(tab->matrix, 1./total); double p_o = 0, p_e = 0, scott_pe = 0, ia = 0, row_ent = 0, col_ent = 0; for (int c=0; c< tab->matrix->size1; c++){ double this_obs = apop_data_get(tab, c, c); p_o += this_obs; Apop_row_v(tab, c, row); Apop_col_v(tab, c, col); double rsum = apop_sum(row); double csum = apop_sum(col); p_e += rsum * csum; scott_pe += pow((rsum+csum)/2, 2); ia += this_obs * log2(this_obs/(rsum * csum)); row_ent -= rsum * log2(rsum); col_ent -= csum * log2(csum); } apop_data_free(tab); asprintf(&out->names->title, "Scott's π and Cohen's κ"); apop_data_add_named_elmt(out, "total count", total); apop_data_add_named_elmt(out, "percent agreement", p_o); apop_data_add_named_elmt(out, "κ", ((p_e==1)? 0: (p_o - p_e) / (1-p_e) )); apop_data_add_named_elmt(out, "π", ((p_e==1)? 0: (p_o - scott_pe) / (1-scott_pe))); apop_data_add_named_elmt(out, "P_I", ia/((row_ent+col_ent)/2)); apop_data_add_named_elmt(out, "Cohen's p_e", p_e); apop_data_add_named_elmt(out, "Scott's p_e", scott_pe); apop_data_add_named_elmt(out, "information in agreement", ia); apop_data_add_named_elmt(out, "row entropy", row_ent); apop_data_add_named_elmt(out, "column entropy", col_ent); return out; }
apop_data* multiple_imputation_variance_base(multiple_imputation_variance_t in){ /*The first half of this is filling in the values. In an attempt at versatility, I allow users to give any named column, be it numeric or text, for every piece of input info. That means a whole lot of checking around to determine what goes where---and a macro. */ Apop_assert_c(in.base_data,NULL, 1, "It doesn't make sense to impute over a NULL data set."); Apop_assert_c(in.fill_ins, NULL, 1, "Didn't receive a fill-in table. Returning NULL."); data_to_data stat = in.stat? in.stat : colmeans; //At the end of this macro, you've got rowcol and rowtype, valuecol and valuetype, &c. #define apop_setup_one_colthing(c) \ int c##col = apop_name_find(in.fill_ins->names, in.c##_name, 'c'); \ int c##type = 'd'; \ if (c##col==-2){ \ c##col = apop_name_find(in.fill_ins->names, in.c##_name, 't'); \ c##type = 't'; \ Apop_assert(c##col!=-2, "I couldn't find the c##_name %s in the column/text names of your fill_in table.", in.c##_name); \ } apop_setup_one_colthing(row) apop_setup_one_colthing(col) apop_setup_one_colthing(value) apop_setup_one_colthing(imputation) Apop_assert(!(rowtype=='t' && !in.base_data->names->rowct), "the rowname you gave refers to text, so I will be searching for a row name in the base data." " But the base_data set has no row names."); Apop_assert(!(coltype=='t' && !in.base_data->names->colct), "the colname you gave refers to text, so I will be searching for a column name in the base data." " But the base_data set has no column names."); //get a list of unique imputation markers. gsl_vector *imps = NULL; apop_data *impt = NULL; if (imputationtype == 'd'){ Apop_col_v(in.fill_ins, imputationcol, ic); imps = apop_vector_unique_elements(ic); } else impt = apop_text_unique_elements(in.fill_ins, imputationcol); int len = imps ? imps->size : impt->textsize[0]; int thisimp=-2; char *thisimpt=NULL; apop_data *estimates[len]; for (int impctr=0; impctr< len; impctr++){ if (imps) thisimp = gsl_vector_get(imps, impctr); else thisimpt = impt->text[impctr][0]; Get_vmsizes(in.fill_ins); //masxize int fillsize = maxsize ? maxsize : in.fill_ins->textsize[0]; for (int i=0; i< fillsize; i++){ if (!(thisimpt && apop_strcmp(in.fill_ins->text[i][imputationcol], thisimpt)) && !(imps && thisimp==apop_data_get(in.fill_ins, i, imputationcol))) continue; int thisrow = (rowtype=='d') ? apop_data_get(in.fill_ins, i, rowcol) :apop_name_find(in.base_data->names, in.fill_ins->text[i][rowcol], 'r'); int thiscol = (coltype=='d') ? apop_data_get(in.fill_ins, i, colcol) :apop_name_find(in.base_data->names, in.fill_ins->text[i][colcol], 'c'); if (valuetype=='d') apop_data_set(in.base_data, thisrow, thiscol, apop_data_get(in.fill_ins, i, valuecol)); else apop_text_add(in.base_data, rowcol, colcol, in.fill_ins->text[i][valuecol]); } //OK, base_data is now filled in. Estimate the statistic for it. estimates[impctr] = stat(in.base_data); } //Part II: find the mean of the statistics and the total variance of the cov matrix. gsl_vector *vals = gsl_vector_alloc(len); apop_data *out = apop_data_copy(estimates[0]); //take the simple mean of the main data set. { //this limits the scope of the Get_vmsizes macro. Get_vmsizes(estimates[0]); for (int j=0; j < msize2; j++) for (int i=0; i < (vsize ? vsize : msize1); i++){ for (int k=0; k< len; k++) gsl_vector_set(vals, k, apop_data_get(estimates[k], i, j)); apop_data_set(out, i, j, apop_vector_mean(vals)); } } apop_data *out_var = apop_data_get_page(estimates[0], "<Covariance>"); int cov_is_labelled = out_var !=NULL; if (!cov_is_labelled){ asprintf(&out->more->names->title, "<Covariance>"); out_var = estimates[0]->more; } Get_vmsizes(out_var); for (int i=0; i < msize1; i++) for (int j=i; j < msize2; j++){ for (int k=0; k< len; k++){ apop_data *this_p = cov_is_labelled ? apop_data_get_page(estimates[k], "<Covariance>") : estimates[k]->more; gsl_vector_set(vals, k, apop_data_get(this_p, i, j)); } double total_var = apop_vector_mean(vals) + apop_var(vals)/(1+1./len); apop_data_set(out_var, i, j, total_var); if (j != i) apop_data_set(out_var, j, i, total_var); } return out; }
apop_data *apop_data_from_frame(SEXP in){ apop_data *out; if (TYPEOF(in)==NILSXP) return NULL; PROTECT(in); assert(TYPEOF(in)==VECSXP); //I should write a check for this on the R side. int total_cols=LENGTH(in); int total_rows=LENGTH(VECTOR_ELT(in,0)); int char_cols = 0; for (int i=0; i< total_cols; i++){ SEXP this_col = VECTOR_ELT(in, i); char_cols += (TYPEOF(this_col)==STRSXP); } SEXP rl, cl; //const char *rn, *cn; //GetMatrixDimnames(in, &rl, &cl, &rn, &cn); PROTECT(cl = getAttrib(in, R_NamesSymbol)); PROTECT(rl = getAttrib(in, R_RowNamesSymbol)); int current_numeric_col=0, current_text_col=0, found_vector=0; if(cl !=R_NilValue && TYPEOF(cl)==STRSXP) //just check for now. for (int ndx=0; ndx < LENGTH(cl) && !found_vector; ndx++) if (!strcmp(translateChar(STRING_ELT(cl, ndx)), "Vector")) found_vector++; int matrix_cols= total_cols-char_cols-found_vector; out= apop_data_alloc((found_vector?total_rows:0), (matrix_cols?total_rows:0), matrix_cols); if (char_cols) out=apop_text_alloc(out, total_rows, char_cols); if(rl !=R_NilValue) for (int ndx=0; ndx < LENGTH(rl); ndx++) if (TYPEOF(rl)==STRSXP) apop_name_add(out->names, translateChar(STRING_ELT(rl, ndx)), 'r'); else //let us guess that it's a numeric list and hope the R Project one day documents this stuff. {char *ss; asprintf(&ss, "%i", ndx); apop_name_add(out->names, ss, 'r'); free(ss);} for (int i=0; i< total_cols; i++){ const char *colname = NULL; if(cl !=R_NilValue) colname = translateChar(STRING_ELT(cl, i)); SEXP this_col = VECTOR_ELT(in, i); if (TYPEOF(this_col) == STRSXP){ //could this be via aliases instead of copying? //printf("col %i is chars\n", i); if(colname) apop_name_add(out->names, colname, 't'); for (int j=0; j< total_rows; j++) apop_text_add(out, j, current_text_col, (STRING_ELT(this_col,j)==NA_STRING ? apop_opts.nan_string : translateChar(STRING_ELT(this_col, j)))); current_text_col++; continue; } else { //plain old matrix data. int col_in_question = current_numeric_col; if (colname && !strcmp(colname, "Vector")) { out->vector = gsl_vector_alloc(total_rows); col_in_question = -1; } else {current_numeric_col++;} Apop_col_v(out, col_in_question, onecol); if (TYPEOF(this_col) == INTSXP){ //printf("col %i is ints\n", i); int *vals = INTEGER(this_col); for (int j=0; j< onecol->size; j++){ //printf("%i\n",vals[j]); gsl_vector_set(onecol, j, (vals[j]==NA_INTEGER ? GSL_NAN : vals[j])); } } else { double *vals = REAL(this_col); for (int j=0; j< onecol->size; j++) gsl_vector_set(onecol, j, (ISNAN(vals[j])||ISNA(vals[j]) ? GSL_NAN : vals[j])); } if(colname && col_in_question > -1) apop_name_add(out->names, colname, 'c'); else apop_name_add(out->names, colname, 'v'); //which is "vector". } //Factors SEXP ls = getAttrib(this_col, R_LevelsSymbol); if (ls){ apop_data *end;//find last page for adding factors. for(end=out; end->more!=NULL; end=end->more); end->more = get_factors(ls, colname); } } UNPROTECT(3); return out; }