/* Splits the rows of x into a test dym and a training dym.

   Both results have the same number of columns as x.
     *r_test  receives num_test_rows rows of x, chosen without replacement.
     *r_train receives the remaining dym_rows(x) - num_test_rows rows.

   The row indices are chosen by make_train_and_test_rows(), which hands
   back both index sets sorted in increasing order, so rows are requested
   from x in their original relative order.  That helper reseeds the
   generator with a fixed value before shuffling, so the same inputs give
   the same split.

   Both dyms are newly made by mk_dym_from_subset_of_rows(); the caller
   presumably owns and must free them (NOTE(review): confirm against the
   dym API). */
void break_dym_into_train_and_test(dym *x,int num_test_rows,
                                   dym **r_train,dym **r_test)
{
  ivec *train_idx;
  ivec *test_idx;

  /* A NULL candidate set means "every row 0 .. dym_rows(x)-1". */
  make_train_and_test_rows(NULL,dym_rows(x),num_test_rows,
                           &train_idx,&test_idx);

  *r_train = mk_dym_from_subset_of_rows(x,train_idx);
  *r_test = mk_dym_from_subset_of_rows(x,test_idx);

  /* The index vectors were only scaffolding. */
  free_ivec(train_idx);
  free_ivec(test_idx);
}
/* Computes the training rows and test rows for one fold of a k-fold
   cross-validation (k == num_folds).

   PRE: train_and_test_rows is a natset (a strictly increasing set of
        natural numbers), in which case num_rows is ignored,
        OR train_and_test_rows is NULL, meaning "use all rows in the set
        { 0 , 1 , ... num_rows-1 }".
        The code does not check num_folds or fold_num; it assumes
        num_folds > 0 and 0 <= fold_num < num_folds.

   POST: *r_train_rows is the fold_num'th training set and *r_test_rows is
         the fold_num'th test set.  Both are newly made ivecs, sorted into
         increasing order.  Their union is the original set of rows.

   The candidate rows are shuffled after reseeding the generator with a
   fixed value, so the partition is deterministic: calling this with the
   same rows and num_folds while fold_num runs over 0 .. num_folds-1 gives
   test sets that are pairwise disjoint and whose union is the whole
   candidate set.  The generator is reseeded afterwards with a value drawn
   from it on entry, so callers do not get a fixed random stream as a side
   effect.

   Fold boundaries are computed from the actual number of candidate rows
   (ivec_size of the working copy), not from num_rows.  Using num_rows when
   a natset was supplied would size the outputs wrongly and index outside
   them whenever num_rows differs from the natset's size. */
void make_kfold_rows(ivec *train_and_test_rows,int num_rows,int num_folds,int fold_num,
                     ivec **r_train_rows,ivec **r_test_rows)
{
  int save_seed = int_random(300000);
  ivec *srows = (train_and_test_rows==NULL) ?
                mk_identity_ivec(num_rows) : mk_copy_ivec(train_and_test_rows);
  int srows_size = ivec_size(srows);
  /* Multiply in double so a large fold_num * srows_size cannot overflow int. */
  int start_i = (int) floor(fold_num * (double) srows_size / (double) num_folds);
  int end_i = (int) floor((fold_num+1) * (double) srows_size / (double) num_folds);
  int test_size = end_i - start_i;
  int i;

  *r_train_rows = mk_ivec(srows_size - test_size);
  *r_test_rows = mk_ivec(test_size);

  am_srand(12345);
  shuffle_ivec(srows);

  /* Shuffled positions [start_i,end_i) form the test fold; everything
     before and after that window is packed, in order, into the train set. */
  for ( i = 0 ; i < srows_size ; i++ )
  {
    int row = ivec_ref(srows,i);

    if ( i < start_i )
      ivec_set(*r_train_rows,i,row);
    else if ( i < end_i )
      ivec_set(*r_test_rows,i - start_i,row);
    else
      ivec_set(*r_train_rows,i - test_size,row);
  }

  free_ivec(srows);
  am_srand(save_seed);

  /* Return both sets in strictly increasing order. */
  ivec_sort(*r_train_rows,*r_train_rows);
  ivec_sort(*r_test_rows,*r_test_rows);
}
/* Splits a set of candidate rows into a test set of num_test_rows rows and
   a training set holding the remainder
   (size = number of candidate rows - num_test_rows).

   PRE: train_and_test_rows is a natset (a strictly increasing set of
        natural numbers), in which case num_rows is ignored,
        OR train_and_test_rows is NULL, meaning "use all rows in the set
        { 0 , 1 , ... num_rows-1 }".

   POST: *r_train_rows is the training set and *r_test_rows is the test
         set.  Both are newly made ivecs, sorted into increasing order,
         and their union is the original set of rows.

   The candidates are shuffled after reseeding the generator with a fixed
   value: the first num_test_rows shuffled entries become the test set and
   the rest become the training set.  The generator is then reseeded with a
   value drawn from it on entry. */
void make_train_and_test_rows(ivec *train_and_test_rows,
                              int num_rows,int num_test_rows,
                              ivec **r_train_rows,ivec **r_test_rows)
{
  int restore_seed = int_random(300000);
  ivec *shuffled;
  int total;
  int pos;

  if ( train_and_test_rows == NULL )
    shuffled = mk_identity_ivec(num_rows);
  else
    shuffled = mk_copy_ivec(train_and_test_rows);

  total = ivec_size(shuffled);

  *r_train_rows = mk_ivec(total - num_test_rows);
  *r_test_rows = mk_ivec(num_test_rows);

  am_srand(12345);
  shuffle_ivec(shuffled);

  /* Head of the shuffled list -> test set. */
  pos = 0;
  while ( pos < num_test_rows )
  {
    ivec_set(*r_test_rows,pos,ivec_ref(shuffled,pos));
    pos++;
  }

  /* Tail of the shuffled list -> training set. */
  pos = num_test_rows;
  while ( pos < total )
  {
    ivec_set(*r_train_rows,pos - num_test_rows,ivec_ref(shuffled,pos));
    pos++;
  }

  free_ivec(shuffled);
  am_srand(restore_seed);

  /* Hand both sets back in strictly increasing order. */
  ivec_sort(*r_train_rows,*r_train_rows);
  ivec_sort(*r_test_rows,*r_test_rows);
}
int main(int argc, char** argv) { /* working variables */ int i,j,ifile,nfile,cm,w,w2; FILE *fp; char *cbuf; double *alphavec=NULL; double **cimat=NULL, **semat=NULL, **eimat=NULL; double **ci0mat=NULL, **se0mat=NULL, **ei0mat=NULL; int nalpha; int *orderv; double *obsvec; /* auxiliary info */ char **fnamev; fnamev=NEW_A(argc-1,char*); nfile=0; /* args */ for(i=1;i<argc;i++) { if(argv[i][0] != '-') { fnamev[nfile]=argv[i]; nfile++; } else if(streq(argv[i],"-d")) { if(i+1>=argc || sscanf(argv[i+1],"%d",&debugmode) != 1) byebye(); i+=1; } else if(streq(argv[i],"-v")) { sw_verpose=1; } else if(streq(argv[i],"--no_au")) { sw_au=0; } else if(streq(argv[i],"--no_np")) { sw_bp=0; } else byebye(); } for(ifile=0;ifile<nfile;ifile++) { fp=openfp(fnamev[ifile],fext_ci,"r",&cbuf); printf("\n# reading %s",cbuf); cm=nalpha=0; orderv=fread_ivec(fp,&cm); obsvec=fread_vec(fp,&cm); alphavec=fread_vec(fp,&nalpha); cimat=fread_mat(fp,&cm,&nalpha); semat=fread_mat(fp,&cm,&nalpha); eimat=fread_mat(fp,&cm,&nalpha); ci0mat=fread_mat(fp,&cm,&nalpha); se0mat=fread_mat(fp,&cm,&nalpha); ei0mat=fread_mat(fp,&cm,&nalpha); fclose(fp); printf("\n#"); repchar(' ',17); w=nalpha*(sw_verpose?17:7); w2=w/2-2; if(sw_au) { repchar('-',w2); printf(" au "); repchar('-',w-w2-4); } printf(" |"); if(sw_bp) { repchar('-',w2); printf(" np "); repchar('-',w-w2-4); } printf("\n# %4s %4s","rank","item"); printf(" %6s","obs"); if(sw_au) { for(j=0;j<nalpha;j++) { printf(" %6.3f",alphavec[j]); if(sw_verpose) printf(" %4s %4s","se","ei"); } } printf(" |"); if(sw_bp) { for(j=0;j<nalpha;j++) { printf(" %6.3f",alphavec[j]); if(sw_verpose) printf(" %4s %4s","se","ei"); } } for(i=0;i<cm;i++) { printf("\n# %4d %4d",i+1,orderv[i]+1); printf(" %6.1f",obsvec[i]); if(sw_au){ for(j=0;j<nalpha;j++) { printf(" %6.1f",cimat[i][j]); if(sw_verpose) printf(" %4.1f %4.1f",semat[i][j],eimat[i][j]); } } printf(" |"); if(sw_bp){ for(j=0;j<nalpha;j++) { printf(" %6.1f",ci0mat[i][j]); if(sw_verpose) printf(" %4.1f 
%4.1f",se0mat[i][j],ei0mat[i][j]); } } } printf("\n"); free_vec(alphavec); free_mat(cimat); free_mat(semat); free_mat(eimat); free_mat(ci0mat); free_mat(se0mat); free_mat(ei0mat); free_ivec(orderv); free_vec(obsvec); } return 0; }