void check_out_impute(char **origin, char **destin, int *imputation_number, char **subset, char **filltabin){ char *filltab = (filltabin && *filltabin) ? *filltabin : "filled"; Tea_stopif(!origin || !*origin, return, 0, "NULL origin table, but I need that."); char *id_column= get_key_word(NULL, "id"); const char *dest = destin ? *destin : NULL; int use_rowids = 0; if (!id_column) { use_rowids++; id_column = strdup("rowid"); } sprintf(apop_opts.db_name_column, "%s", id_column); begin_transaction(); if (dest && strcmp(*origin, dest)){ apop_table_exists(dest, 'd'); apop_query("create table %s as select %s * from %s %s %s", dest, use_rowids ? "rowid as id_col, " : " ", *origin, (subset && *subset) ? "where" : " ", (subset && *subset) ? *subset : " " ); } else dest = *origin; create_index(dest, use_rowids ? "id_col" : id_column); Tea_stopif(!apop_table_exists(filltab), return , 0, "No table named '%s'; did you already doMImpute()?", filltab); apop_data *fills = apop_query_to_text("select %s, field, value from %s where (draw=%i or draw = -1)" , id_column, filltab, *imputation_number); Tea_stopif(!fills || fills->error, return, 0, "Expected fill-in table " "%s, but couldn't query it.", filltab); for(int i=0; i< *fills->textsize; i++){ _Bool is_null = !strcmp(fills->text[i][1], apop_opts.nan_string); char tick = is_null ? ' ' : '\''; apop_query("update %s set %s = %c%s%c " "where cast(%s as numeric) = %s", dest, fills->text[i][0], tick, is_null ? "NULL" : fills->text[i][1], tick, id_column, fills->names->row[i]); } commit_transaction(); apop_data_free(fills); free(id_column); }
ykl_s make_yule(char const *zila, int y) { static gsl_matrix *indices; if (!indices) { indices = gsl_matrix_calloc(65,1); for (int i=0; i< 64; i++) gsl_matrix_set(indices, i,0, i); } apop_data *col = make_histo(zila, y); apop_data ww = (apop_data) { .weights=col->vector, .matrix=indices }; apop_data *d = apop_data_transpose(col); apop_data *exp = apop_data_rank_expand(d); apop_model *m = apop_estimate(exp, apop_yule); apop_model *n = apop_estimate(exp, apop_lognormal); ykl_s out = (ykl_s) { .yule=apop_data_get(m->parameters, .col=-1/*, .rowname="mu"*/), .ln=apop_data_get(n->parameters, .col=-1/*, .rowname="mu"*/), .lnstderr=sqrt(apop_data_get(n->parameters, .col=-1, .row=1/*, .rowname="mu"*/)), .kl = apop_kl_divergence(apop_estimate(&ww, apop_pmf), m), .lnkl = apop_kl_divergence(apop_estimate(&ww, apop_pmf), n), .mean = apop_matrix_mean(col->matrix) }; apop_data_free(d); apop_data_free(exp); apop_model_free(m); return out; } int main() { printf("zila|year|yule_p|kl_div|mu|ln_mu|ln_sigma|ln_kl\n"); apop_db_open("b.db"); apop_data *zilas = apop_query_to_text("select admname from ppl"); for (int i=0; i< *zilas->textsize; i++) for (int y=2001; y<= 2005; y++) { ykl_s ykl = make_yule(*zilas->text[i], y); printf("%20s| %i| %g| %g| %g| %g| %g|%g\n", *zilas->text[i], y, ykl.yule, ykl.kl, ykl.mean, ykl.ln, ykl.lnstderr, ykl.lnkl); } //apop_plot_histogram(m->data->weights, 64, .output_file="histo"); }
/** Look for likely typos among the user's spec-file keys.
 *
 * Each key in the \c keys table is compared against every entry of
 * \c ok_keys (a list terminated by an empty string). A key whose smallest
 * Levenshtein distance to an accepted key is positive but no larger than
 * \c max_lev_distance is reported through Apop_stopif and counted.
 *
 * \param max_lev_distance Largest edit distance still treated as a typo.
 * \return Number of suspected typos; 0 if there is no \c keys table or the
 *         query on it returns nothing.
 */
int check_levenshtein_distances(int max_lev_distance){
    int typo_counter = 0;
    if (!apop_table_exists("keys")) return 0;

    apop_data *userkeys = apop_query_to_text("select key from keys");
    if (!userkeys) return 0;   // nothing came back from the query

    for (size_t i = 0; i < *userkeys->textsize; i++){
        char *user_key = *userkeys->text[i];
        // Reset per key so a previous key's closest match cannot leak into this one.
        int min_distance = 100;
        char *closest = NULL;

        for (char **keyptr = ok_keys; strlen(*keyptr); keyptr++){
            int ld = levenshtein_distance(*keyptr, user_key);
            if (ld < min_distance){
                min_distance = ld;
                if (ld == 0) break;   // exact match: this key is fine
                closest = *keyptr;
            }
        }

        // closest is only set when some accepted key came within 100 edits.
        Apop_stopif(closest && min_distance > 0 && min_distance <= max_lev_distance,
                typo_counter++, 0,
                "You wrote %s for one of the keys in your spec file. Did you "
                "mean to write %s?", user_key, closest);
    }

    apop_data_free(userkeys);
    return typo_counter;
}
void check_out_impute(char **origin, char **destin, int *imputation_number, char **subset, char **filltabin){ char *filltab = (filltabin && *filltabin) ? *filltabin : "filled"; Apop_stopif(!origin || !*origin, return, 0, "NULL origin table, but I need that."); char *id_column= get_key_word(NULL, "id"); const char *dest = destin ? *destin : NULL; int use_rowids = 0; if (!id_column) { use_rowids++; id_column = strdup("rowid"); } sprintf(apop_opts.db_name_column, "%s", id_column); if (dest){ apop_table_exists(dest, 'd'); apop_query("create table %s as select %s * from %s %s %s", dest, use_rowids ? "rowid as id_col, " : " ", *origin, (subset && *subset) ? "where" : " ", (subset && *subset) ? *subset : " " ); } else dest = *origin; has_sqlite3_index(dest, use_rowids ? "id_col" : id_column, 'y'); Apop_stopif(!apop_table_exists(filltab), return , 0, "No table named '%s'; did you already doMImpute()?", filltab); apop_data *fills = apop_query_to_text("select %s, field, value from %s where draw+0.0=%i" , id_column, filltab, *imputation_number); Apop_stopif(!fills || fills->error, return, 0, "Expected fill-in table " "%s, but couldn't query it.", filltab); begin_transaction(); if (fills) for(int i=0; i< *fills->textsize; i++) apop_query("update %s set %s = '%s' " "where %s = %s", dest, fills->text[i][0], fills->text[i][1], id_column, fills->names->row[i]); commit_transaction(); apop_data_free(fills); free(id_column); }
/** This function creates a series of spec files with paste in macros used * instead of normal keys. The tests will ensure that the correct keys are * getting written to the keys table by running read_spec() and then using * apop functions to verify that the keys are indeed in the spec file */ void pastein_tests(){ char *spec1; asprintf(&spec1, "1.spec"); char *spec2; asprintf(&spec2, "2.spec"); char *spec3; asprintf(&spec3, "3.spec"); char *spec4; asprintf(&spec4, "4.spec"); char *spec5; asprintf(&spec5, "5.spec"); /* Standard test here: creating a macro with a few sub keys and calling it on its own * in the impute key. If something goes wrong here it's because there's something * fundamentally wrong with the paste in macro (because there's only one so there's * nothing too complex going on). */ write_a_file(spec1, "\n" "database: demo.db\n" "verbose: 2\n" "catagesex{\n" " min group size: 3\n" " draw count: 3\n" " seed: 2332\n" " categories {\n" " CATAGE\n" " SEX\n" " }\n" "}\n" "\n" "input {\n" " input file: dc_pums_08.csv\n" " output table: dc \n " " overwrite: y \n " "} \n " " \n" "fields { \n" "SCHL: int 0-24 \n" "WAGP: real\n" "\n}" "impute{\n" " input table: viewdc\n" " output table: imputes\n" " paste in: catagesex\n" " method: hot deck\n" " output vars: SCHL\n" "}\n" "impute{\n" " input table: viewdc\n" " output table: imputes\n" " paste in: catagesex\n" " method: hot deck\n" " output vars: WAGP\n" "}\n" ); /* Creating test here that uses two macros that are used concurrently but that do not * call each other. tables{...} and catagesex{...} are each used in impute{...} but * they do not "paste each other in". This will be tested in spec 3. 
*/ write_a_file(spec2, "\n" "database: demo.db\n" "verbose: 2\n" "catagesex{\n" " min group size: 3\n" " draw count: 3\n" " seed: 2332\n" " categories {\n" " CATAGE\n" " SEX\n" " }\n" "}\n" "tables{\n" " input table: viewdc\n" " output table: impuTable\n" //To account for analysts who like camel case "}\n" "\n" "input {\n" " paste in: tables\n" " input file: dc_pums_08.csv\n" " output table: dc \n " " overwrite: y \n " "} \n " " \n" "fields { \n" "SCHL: int 0-24 \n" "WAGP: real\n" "\n}" "impute{\n" " paste in: tables\n" " paste in: catagesex\n" " method: hot deck\n" " output vars: SCHL\n" "}\n" "impute{\n" " paste in: tables\n" " paste in: catagesex\n" " method: hot deck\n" " output vars: WAGP\n" "}\n" ); /* More complicated test that tests the ability of a macro to use another macro in its * own definition. For instance, it tests something along the lines of * catagesex{paste in: impute stuff \n paste in: categories} */ write_a_file(spec3, "\n" "database: demo.db\n" "verbose: 2\n" "imputestuff{\n" " min group size: 3\n" " draw count: 3\n" " seed: 2332\n" "}\n" "categoriesstuff {\n" " categories{\n" " CATAGE\n" " SEX\n" " }\n" "}\n" "catagesex{\n" " paste in: imputestuff\n" " paste in: categoriesstuff\n" "}\n" "\n" "input {\n" " input file: dc_pums_08.csv\n" " output table: dc \n " " overwrite: y \n " "} \n " " \n" "fields { \n" "SCHL: int 0-24 \n" "WAGP: real\n" "\n}" "impute{\n" " input table: viewdc\n" " output table: imputes\n" " paste in: catagesex\n" " method: hot deck\n" " output vars: SCHL\n" "}\n" "impute{\n" " input table: viewdc\n" " output table: imputes\n" " paste in: catagesex\n" " method: hot deck\n" " output vars: WAGP\n" "}\n" ); /* Tests whether it's possible to create a macro that comprises the entire spec file * (which, of course, is then pasted in on its own). This includes other macros that * are written within the overarching macro itself. Possibly overkill? 
But I think * it's worth it to test given that different analysts might include big portions of * the spec file separately and could decide to use a macro to do so. */ write_a_file(spec4, "\n" "database: demo.db\n" "wholeSpec{\n" "verbose: 2\n" "catagesex{\n" " min group size: 3\n" " draw count: 3\n" " seed: 2332\n" " categories {\n" " CATAGE\n" " SEX\n" " }\n" "}\n" "\n" "input {\n" " input file: dc_pums_08.csv\n" " output table: dc \n " " overwrite: y \n " "} \n " " \n" "fields { \n" "SCHL: int 0-24 \n" "WAGP: real\n" "\n}" "impute{\n" " input table: viewdc\n" " output table: imputes\n" " paste in: catagesex\n" " method: hot deck\n" " output vars: SCHL\n" "}\n" "impute{\n" " input table: viewdc\n" " output table: imputes\n" " paste in: catagesex\n" " method: hot deck\n" " output vars: WAGP\n" "}\n" "}\n" "paste in: wholeSpec\n" ); char *db_dummy; char *imp_min_grp, *imp_drw_cnt, *imp_seed, *imp_categories; read_spec(&spec1, &db_dummy); asprintf(&imp_min_grp, "impute/min group size"); asprintf(&imp_drw_cnt, "impute/draw count"); asprintf(&imp_seed, "impute/seed"); asprintf(&imp_categories, "impute/categories"); apop_data *spec1_keys1 = apop_query_to_text("select * from keys where key like " "'impute/m%%'"); printf("spec1_keys1->text[0][0] is given by: %s.\n", spec1_keys1->text[0][0]); assert(!strcmp(imp_min_grp, spec1_keys1->text[0][0])); apop_data *spec1_keys2 = apop_query_to_text("select * from keys where key like " "'impute/d%%'"); printf("spec1_keys2->text[0][0] is given by: %s.\n", spec1_keys2->text[0][0]); assert(!strcmp(imp_drw_cnt, spec1_keys2->text[0][0])); apop_data *spec1_keys3 = apop_query_to_text("select * from keys where key like " "'impute/s%%'"); printf("spec1_keys3->text[0][0] is given by: %s.\n", spec1_keys3->text[0][0]); assert(!strcmp(imp_seed, spec1_keys3->text[0][0])); apop_data *spec1_keys4 = apop_query_to_text("select * from keys where key like " "'impute/c%%'"); printf("spec1_keys4->text[0][0] is given by: %s.\n", spec1_keys4->text[0][0]); 
assert(!strcmp(imp_categories, spec1_keys4->text[0][0])); apop_data_free(spec1_keys1); apop_data_free(spec1_keys2); apop_data_free(spec1_keys3); apop_data_free(spec1_keys4); read_spec(&spec2, &db_dummy); char *inpt_inpt_table; char *inpt_otpt_table; asprintf(&inpt_inpt_table, "input/input table"); asprintf(&inpt_otpt_table, "input/output table"); apop_data *spec2_keys1 = apop_query_to_text("select * from keys where key like " "'impute/m%%'"); printf("spec2_keys1->text[0][0] is given by: %s.\n", spec2_keys1->text[0][0]); assert(!strcmp(imp_min_grp, spec2_keys1->text[0][0])); apop_data *spec2_keys2 = apop_query_to_text("select * from keys where key like " "'impute/d%%'"); printf("spec2_keys2->text[0][0] is given by: %s.\n", spec2_keys2->text[0][0]); assert(!strcmp(imp_drw_cnt, spec2_keys2->text[0][0])); apop_data *spec2_keys3 = apop_query_to_text("select * from keys where key like " "'impute/s%%'"); printf("spec2_keys3->text[0][0] is given by: %s.\n", spec2_keys3->text[0][0]); assert(!strcmp(imp_seed, spec2_keys3->text[0][0])); apop_data *spec2_keys4 = apop_query_to_text("select * from keys where key like " "'impute/c%%'"); printf("spec2_keys4->text[0][0] is given by: %s.\n", spec2_keys4->text[0][0]); assert(!strcmp(imp_categories, spec2_keys4->text[0][0])); apop_data *spec2_keys5 = apop_query_to_text("select * from keys where key like " "'input/input t%%'"); printf("spec2_keys5->text[0][0] is given by: %s.\n", spec2_keys5->text[0][0]); assert(!strcmp(inpt_inpt_table, spec2_keys5->text[0][0])); apop_data *spec2_keys6 = apop_query_to_text("select * from keys where key like " "'input/output t%%'"); printf("spec2_keys6->text[0][0] is given by: %s.\n", spec2_keys6->text[0][0]); assert(!strcmp(inpt_otpt_table, spec2_keys6->text[0][0])); apop_data_free(spec2_keys1); apop_data_free(spec2_keys2); apop_data_free(spec2_keys3); apop_data_free(spec2_keys4); apop_data_free(spec2_keys5); apop_data_free(spec2_keys6); read_spec(&spec3, &db_dummy); apop_data *spec3_keys1 = 
apop_query_to_text("select * from keys where key like " "'impute/m%%'"); printf("spec3_keys1->text[0][0] is given by: %s.\n", spec3_keys1->text[0][0]); assert(!strcmp(imp_min_grp, spec3_keys1->text[0][0])); apop_data *spec3_keys2 = apop_query_to_text("select * from keys where key like " "'impute/d%%'"); printf("spec3_keys2->text[0][0] is given by: %s.\n", spec3_keys2->text[0][0]); assert(!strcmp(imp_drw_cnt, spec3_keys2->text[0][0])); apop_data *spec3_keys3 = apop_query_to_text("select * from keys where key like " "'impute/s%%'"); printf("spec3_keys3->text[0][0] is given by: %s.\n", spec3_keys3->text[0][0]); assert(!strcmp(imp_seed, spec3_keys3->text[0][0])); apop_data *spec3_keys4 = apop_query_to_text("select * from keys where key like " "'impute/c%%'"); printf("spec3_keys4->text[0][0] is given by: %s.\n", spec3_keys4->text[0][0]); assert(!strcmp(imp_categories, spec3_keys4->text[0][0])); apop_data_free(spec3_keys1); apop_data_free(spec3_keys2); apop_data_free(spec3_keys3); apop_data_free(spec3_keys4); /* This is spec file that tests whether paste in works for pasting in the entire spec * file (without the database -- pasting in database has not been tested yet). spec 4 * paste in stuff is tested by just testing for an assortment of keys. */ read_spec(&spec4, &db_dummy); /* DV - ATTENTION: * This test is failing right now so I've put in an if statement below to exit when * there's no impute key to avoid a segfault in the testing. We need to fix the bug * that is preventing paste in from allowing an entire spec file (minus the database) * to be pasted in. 
*/ apop_data *spec4_keys1 = apop_query_to_text("select * from keys where key like " "'impute/m%%'"); if(get_key_word("impute", NULL) == NULL) return; printf("spec4_keys1->text[0][0] is given by: %s.\n", spec4_keys1->text[0][0]); assert(!strcmp(imp_min_grp, spec4_keys1->text[0][0])); apop_data *spec4_keys2 = apop_query_to_text("select * from keys where key like " "'impute/d%%'"); printf("spec4_keys2->text[0][0] is given by: %s.\n", spec4_keys2->text[0][0]); assert(!strcmp(imp_drw_cnt, spec4_keys2->text[0][0])); apop_data *spec4_keys3 = apop_query_to_text("select * from keys where key like " "'impute/s%%'"); printf("spec4_keys3->text[0][0] is given by: %s.\n", spec4_keys3->text[0][0]); assert(!strcmp(imp_seed, spec4_keys3->text[0][0])); apop_data *spec4_keys4 = apop_query_to_text("select * from keys where key like " "'impute/c%%'"); printf("spec4_keys4->text[0][0] is given by: %s.\n", spec4_keys4->text[0][0]); assert(!strcmp(imp_categories, spec4_keys4->text[0][0])); apop_data *spec4_keys5 = apop_query_to_text("select * from keys where key like " "'input/input t%%'"); printf("spec4_keys5->text[0][0] is given by: %s.\n", spec4_keys5->text[0][0]); assert(!strcmp(inpt_inpt_table, spec4_keys5->text[0][0])); apop_data *spec4_keys6 = apop_query_to_text("select * from keys where key like " "'input/output t%%'"); printf("spec4_keys6->text[0][0] is given by: %s.\n", spec4_keys6->text[0][0]); assert(!strcmp(inpt_otpt_table, spec4_keys6->text[0][0])); apop_data_free(spec4_keys1); apop_data_free(spec4_keys2); apop_data_free(spec4_keys3); apop_data_free(spec4_keys4); apop_data_free(spec4_keys5); apop_data_free(spec4_keys6); free(imp_min_grp); free(imp_drw_cnt); free(imp_seed); free(imp_categories); free(inpt_inpt_table); free(inpt_otpt_table); free(spec1); free(spec2); free(spec3); free(spec4); free(spec5); printf("Reached end of test.\n"); }