/*
 * Splits a collection of rows into one training set and one test set.
 *
 * The candidate rows are copied, shuffled after seeding the generator with a
 * fixed value (so the same inputs always give the same split), and then the
 * first num_test_rows shuffled entries become the test set while all the
 * remaining entries become the training set.
 *
 * PRE:  train_and_test_rows is either
 *         - a natset (a strictly increasing set of natural numbers), in
 *           which case num_rows is ignored, or
 *         - NULL, meaning "use all rows in { 0 , 1 , ... num_rows-1 }".
 *       num_test_rows is the desired test-set size; the training-set size is
 *       (number of candidate rows - num_test_rows).
 *
 * POST: *r_train_rows is the training set and *r_test_rows is the test set,
 *       both freshly made with mk_ivec (presumably the caller releases them
 *       with free_ivec -- confirm against callers).  The union of the two is
 *       the original set of rows, and both are returned sorted in increasing
 *       order.
 *
 * A value drawn from int_random() on entry is handed to am_srand() before
 * returning, presumably so that the caller's random sequence does not
 * continue from the fixed seed used for the shuffle.
 */
void make_train_and_test_rows(ivec *train_and_test_rows, int num_rows,
                              int num_test_rows,
                              ivec **r_train_rows, ivec **r_test_rows)
{
  int restore_seed = int_random(300000);
  ivec *shuffled;
  int total;
  int k;

  if ( train_and_test_rows != NULL )
    shuffled = mk_copy_ivec(train_and_test_rows);
  else
    shuffled = mk_identity_ivec(num_rows);

  total = ivec_size(shuffled);

  *r_train_rows = mk_ivec(total - num_test_rows);
  *r_test_rows = mk_ivec(num_test_rows);

  /* Fixed seed: the shuffle is deterministic. */
  am_srand(12345);
  shuffle_ivec(shuffled);

  /* Leading num_test_rows shuffled entries form the test set... */
  k = 0;
  while ( k < num_test_rows )
  {
    ivec_set(*r_test_rows,k,ivec_ref(shuffled,k));
    k += 1;
  }

  /* ...and everything after them forms the training set. */
  for ( k = num_test_rows ; k < total ; k++ )
  {
    int train_index = k - num_test_rows;
    ivec_set(*r_train_rows,train_index,ivec_ref(shuffled,k));
  }

  free_ivec(shuffled);
  am_srand(restore_seed);

  /* Sorting in place puts both results back into increasing order. */
  ivec_sort(*r_train_rows,*r_train_rows);
  ivec_sort(*r_test_rows,*r_test_rows);
}
/*
 * Set difference of two natsets: returns the elements of a that are not in b.
 *
 * PRE:  a and b are natsets (strictly increasing sets of natural numbers).
 * PRE:  b is a subset of a.
 *
 * POST: returns a newly made natset c such that
 *         c intersect b = empty
 *         c union b     = a
 *       (i.e. n is in the result if and only if n is in a but not in b).
 *       The result has ivec_size(a) - ivec_size(b) elements.
 *
 * If b turns out not to be a subset of a, my_error() is called.  This covers
 * b being larger than a, b containing a value that falls before or between
 * elements of a, and b containing a value larger than every element of a.
 *
 * This function is useful if you have a set of rows (record numbers) to use
 * as a test set (in b), you have all the rows for the test and train set in
 * a, and you want the remaining rows to use as a training set.
 */
ivec *mk_ivec_set_difference(ivec *a,ivec *b)
{
  int a_size = ivec_size(a);
  int b_size = ivec_size(b);
  int c_size = a_size - b_size;
  int a_ptr;
  int b_ptr = 0;
  int c_ptr = 0;
  ivec *c;

  /* A subset can never have more elements than its superset. */
  if ( c_size < 0 )
    my_error("mk_ivec_set_difference: b not a subset of a");

  c = mk_ivec(c_size);

  /* Walk both sorted sets in step; b_ptr is the next unmatched element of b. */
  for ( a_ptr = 0 ; a_ptr < a_size ; a_ptr++ )
  {
    int a_val = ivec_ref(a,a_ptr);

    if ( b_ptr < b_size && ivec_ref(b,b_ptr) == a_val )
    {
      /* a_val is in b: leave it out of the result. */
      b_ptr += 1;
    }
    else if ( b_ptr < b_size && ivec_ref(b,b_ptr) < a_val )
    {
      /* The pending element of b was skipped over, so it is not in a. */
      my_error("mk_ivec_set_difference: b not a subset of a");
    }
    else if ( c_ptr >= c_size )
    {
      /* c is already full, so some element of b can never be matched
         (it is larger than everything in a).  Without this check the
         ivec_set below would write past the end of c. */
      my_error("mk_ivec_set_difference: b not a subset of a");
    }
    else
    {
      ivec_set(c,c_ptr,a_val);
      c_ptr += 1;
    }
  }

  return c;
}
/*
 * Produces the training rows and test rows for one fold of k-fold
 * cross-validation (k == num_folds).
 *
 * The candidate rows are copied and shuffled after seeding the generator
 * with a fixed value, so every call with the same row set sees the same
 * shuffled order.  The shuffled positions [start_i, end_i) form the test set
 * for this fold, where
 *     start_i = floor(fold_num     * n / num_folds)
 *     end_i   = floor((fold_num+1) * n / num_folds)
 * and n is the number of candidate rows.  All other positions form the
 * training set.
 *
 * PRE:  train_and_test_rows is either
 *         - a natset (a strictly increasing set of natural numbers), in
 *           which case num_rows is ignored, or
 *         - NULL, meaning "use all rows in { 0 , 1 , ... num_rows-1 }".
 *       num_folds > 0 and 0 <= fold_num < num_folds (not checked here).
 *
 * POST: *r_train_rows is the fold_num'th training set and *r_test_rows is
 *       the fold_num'th test set, both freshly made with mk_ivec.  The union
 *       of the two is the original set of rows, and both are returned sorted
 *       in increasing order.
 *
 * If you call this repeatedly with the same row set and the same num_folds,
 * varying fold_num from 0 up to num_folds-1, the union of all the resulting
 * test sets is the original set of rows, and every pair of test sets has an
 * empty intersection.
 *
 * A value drawn from int_random() on entry is handed to am_srand() before
 * returning, presumably so that the caller's random sequence does not
 * continue from the fixed seed used for the shuffle.
 */
void make_kfold_rows(ivec *train_and_test_rows,int num_rows,int num_folds,int fold_num,
                     ivec **r_train_rows,ivec **r_test_rows)
{
  int save_seed = int_random(300000);
  ivec *srows = (train_and_test_rows==NULL) ? mk_identity_ivec(num_rows) :
                                              mk_copy_ivec(train_and_test_rows);
  int srows_size = ivec_size(srows);

  /* Fold boundaries are based on the number of rows actually in use
     (srows_size), not on num_rows, which is documented as ignored when an
     explicit row set is supplied.  The product is formed in double so that
     fold_num * srows_size cannot overflow an int. */
  int start_i = (int) floor((double) fold_num * srows_size / num_folds);
  int end_i = (int) floor(((double) fold_num + 1.0) * srows_size / num_folds);
  int num_test = end_i - start_i;
  int i;

  *r_train_rows = mk_ivec(srows_size - num_test);
  *r_test_rows = mk_ivec(num_test);

  /* Fixed seed: every fold of the same row set sees the same shuffle. */
  am_srand(12345);
  shuffle_ivec(srows);

  for ( i = 0 ; i < srows_size ; i++ )
  {
    int row = ivec_ref(srows,i);

    if ( i < start_i )
      ivec_set(*r_train_rows,i,row);            /* before the test window */
    else if ( i < end_i )
      ivec_set(*r_test_rows,i - start_i,row);   /* inside the test window */
    else
      ivec_set(*r_train_rows,i - num_test,row); /* after: close the gap   */
  }

  free_ivec(srows);
  am_srand(save_seed);

  /* Sorting in place puts both results back into increasing order. */
  ivec_sort(*r_train_rows,*r_train_rows);
  ivec_sort(*r_test_rows,*r_test_rows);
}
/* Counts the elements of iv that are equal to val and returns that count
   (0 when val does not occur in iv at all). */
int count_in_ivec( ivec *iv, int val)
{
  int matches = 0;
  int idx = 0;

  while ( idx < ivec_size(iv) )
  {
    if ( ivec_ref(iv,idx) == val )
      matches++;
    idx++;
  }

  return matches;
}
/*
 * Makes a new dym built from selected rows of x.
 *
 * Row i of the result is a copy of row ivec_ref(rows,i) of x, so the result
 * follows the order of the entries in "rows".  The result thus has
 * ivec_size(rows) rows and dym_cols(x) columns.  x itself is only read.
 */
dym *mk_dym_from_subset_of_rows(dym *x,ivec *rows)
{
  int subset_size = ivec_size(rows);
  dym *subset = mk_dym(subset_size,dym_cols(x));
  int dest;

  for ( dest = 0 ; dest < subset_size ; dest++ )
  {
    /* Pull the source row out as a temporary dyv, copy it into place,
       then release the temporary. */
    int src = ivec_ref(rows,dest);
    dyv *row_copy = mk_dyv_from_dym_row(x,src);

    copy_dyv_to_dym_row(row_copy,subset,dest);
    free_dyv(row_copy);
  }

  return subset;
}