/*
   qsortd - In-place ascending quicksort of data[first..last] (both bounds inclusive).

   first - Index of the first element of the range to sort
   last  - Index of the last element of the range to sort
   data  - Array holding the values; only data[first..last] is touched

   A range with fewer than two elements (first >= last) is already sorted,
   so the routine returns at once.  Without this guard an empty range
   (first > last) would read elements outside the requested range.
*/
void qsortd ( int first , int last , double *data )
{
   int lower, upper ;
   double ftemp, split ;

   if (first >= last)      // Zero or one element: nothing to do
      return ;

   // Pivot is the middle element; this form cannot overflow int for large indices
   split = data[first + (last - first) / 2] ;
   lower = first ;
   upper = last ;

   do {
      while ( split > data[lower] )   // Skip elements already on the correct (low) side
         ++lower ;
      while ( split < data[upper] )   // Skip elements already on the correct (high) side
         --upper ;

      if (lower == upper) {           // Pointers met on a pivot-valued element; step past it
         ++lower ;
         --upper ;
         }
      else if (lower < upper) {       // Out-of-place pair: exchange and advance both pointers
         ftemp = data[lower] ;
         data[lower++] = data[upper] ;
         data[upper--] = ftemp ;
         }
      } while ( lower <= upper ) ;

   // Recurse into each partition that still holds at least two elements
   if (first < upper)
      qsortd ( first , upper , data ) ;
   if (lower < last)
      qsortd ( lower , last , data ) ;
}
/**************************************************\
   Sorts an array of scores a[N][2] with Quick Sort.

   INPUT:
      a[][0] must contain the N scores (double)
      a[][1] must contain the index of each score
      (lo0, hi0): lowest and highest row indices to sort

   OUTPUT:
      a[][1] will contain the score indices arranged in
      order of increasing score

   Use qsortd2 to work with 2 vectors instead of the array.

   NOTE(review): rows are exchanged through swapQSd, which is
   defined elsewhere; presumably it swaps rows lo and hi of a.
\**************************************************/
void qsortd(double **a, int lo0, int hi0)
{
   int left, right;
   double pivot;

   /* Fewer than two rows: nothing to sort */
   if (hi0 <= lo0)
      return;

   left = lo0;
   right = hi0;
   pivot = a[(lo0 + hi0) / 2][0];     /* Score of the middle row */

   do {
      /* Advance from the left past scores below the pivot (bounded by hi0) */
      for ( ; left < hi0 && a[left][0] < pivot; ++left)
         ;

      /* Retreat from the right past scores above the pivot (bounded by lo0) */
      for ( ; right > lo0 && a[right][0] > pivot; --right)
         ;

      /* Pointers have not crossed: exchange the rows and move both inward */
      if (left <= right) {
         swapQSd(a, left, right);
         ++left;
         --right;
      }
   } while (left <= right);

   /* Sort whichever partitions still hold more than one row */
   if (lo0 < right)
      qsortd(a, lo0, right);
   if (left < hi0)
      qsortd(a, left, hi0);
}
int rr ( int type , // Type of study (SCREEN_RR_? in CONST.H): continuous, tails, discrete) int npred , // Number of predictors int *preds , // Their indices are here int targetvar , // Index of target variable int nbins_pred , // Number of predictor bins int nbins_target , // Number of target bins, 0 for 2 sign-based bins double tail_frac , // Tail fraction int mcpt_type , // 1=complete, 2=cyclic int mcpt_reps , // Number of MCPT replications, <=1 for no MCPT int max_pred // Max number of predictors in optimal subset ) { int i, j, k, n, ret_val, ivar, irep, varnum, max_threads, bins_dim ; int *index, *stepwise_mcpt_count, *solo_mcpt_count, *stepwise_ivar, *original_stepwise_ivar ; int *pred_bin, *redun_pred_bin, *target_bin, *bin_counts ; int *work_bin, nkept, best_ivar, *which_preds, *tail_n, *target_bin_ptr ; double *casework, *sorted, *mutual, *pred_thresholds, *target_thresholds, *target, *work_target ; double *crit, *relevance, *original_relevance, *current_crits, *sorted_crits, best_crit, dtemp ; double *pred_bounds, *target_bounds, *pred_marginal, *redun_pred_marginal, *target_marginal ; double *stepwise_crit, *original_stepwise_crit ; double sum_relevance, *original_sum_relevance, *sum_redundancy ; char msg[4096], msg2[4096] ; casework = NULL ; mutual = NULL ; index = NULL ; pred_thresholds = NULL ; target_thresholds = NULL ; pred_bin = NULL ; redun_pred_bin = NULL ; redun_pred_marginal = NULL ; work_bin = NULL ; target_bin = NULL ; bin_counts = NULL ; target = NULL ; tail_n = NULL ; if (max_pred > npred) // Watch out for careless user max_pred = npred ; ret_val = 0 ; max_threads = MAX_THREADS ; /* Print header */ audit ( "" ) ; audit ( "" ) ; audit ( "******************************************************************************" ) ; audit ( "* *" ) ; audit ( "* Computing relevance minus redundancy for optimal predictor subset *" ) ; if (type == SCREEN_RR_CONTINUOUS) audit ( "* Predictors and target are continuous *" ) ; else if (type == SCREEN_RR_TAILS) 
{ sprintf_s ( msg, "* %5.3lf predictor tails used *", tail_frac ) ; audit ( msg ) ; sprintf_s ( msg, "* %2d target bins *", nbins_target ) ; audit ( msg ) ; } else if (type == SCREEN_RR_DISCRETE) { sprintf_s ( msg, "* %2d predictor bins *", nbins_pred ) ; audit ( msg ) ; sprintf_s ( msg, "* %2d target bins *", nbins_target ) ; audit ( msg ) ; } sprintf_s ( msg, "* %5d predictor candidates *", npred ) ; audit ( msg ) ; sprintf_s ( msg, "* %7d best predictors will be printed *", max_pred ) ; audit ( msg ) ; if (mcpt_reps > 1) { if (mcpt_type == 1) sprintf_s ( msg, "* %5d replications of complete Monte-Carlo Permutation Test *", mcpt_reps ) ; else if (mcpt_type == 2) sprintf_s ( msg, "* %5d replications of cyclic Monte-Carlo Permutation Test *", mcpt_reps ) ; audit ( msg ) ; } else { sprintf_s ( msg, "* No Monte-Carlo Permutation Test *" ) ; audit ( msg ) ; } audit ( "* *" ) ; audit ( "******************************************************************************" ) ; /* Allocate memory needed for all types (CONTINUOUS, TAILS, DISCRETE) */ casework = (double *) malloc ( 2 * n_cases * sizeof(double) ) ; // Pred, sorted sorted = casework + n_cases ; mutual = (double *) malloc ( 10 * npred * sizeof(double) ) ; crit = mutual + npred ; current_crits = crit + npred ; sorted_crits = current_crits + npred ; stepwise_crit = sorted_crits + npred ; original_stepwise_crit = stepwise_crit + npred ; relevance = original_stepwise_crit + npred ; original_relevance = relevance + npred ; sum_redundancy = original_relevance + npred ; original_sum_relevance = sum_redundancy + npred ; index = (int *) malloc ( 6 * npred * sizeof(int) ) ; stepwise_mcpt_count = index + npred ; solo_mcpt_count = stepwise_mcpt_count + npred ; which_preds = solo_mcpt_count + npred ; stepwise_ivar = which_preds + npred ; original_stepwise_ivar = stepwise_ivar + npred ; if (casework == NULL || mutual == NULL || index == NULL) { audit ( "ERROR: Insufficient memory for Relevance minus Redundancy" ) ; ret_val = 
ERROR_INSUFFICIENT_MEMORY ; goto FINISH ; } /* For CONTINUOUS, allocate and save target */ if (type == SCREEN_RR_CONTINUOUS) { target = (double *) malloc ( 2 * n_cases * sizeof(double) ) ; work_target = target + n_cases ; if (target == NULL) { audit ( "ERROR: Insufficient memory for Relevance minus Redundancy" ) ; ret_val = ERROR_INSUFFICIENT_MEMORY ; goto FINISH ; } for (i=0 ; i<n_cases ; i++) // Extract target from database target[i] = database[i*n_vars+targetvar] ; } /* For binning types (TAILS, DISCRETE), allocate that memory and compute all bin information */ else if (type == SCREEN_RR_TAILS || type == SCREEN_RR_DISCRETE) { pred_thresholds = (double *) malloc ( 2 * nbins_pred * npred * sizeof(double) ) ; // pred_thresholds, pred_marginal pred_marginal = pred_thresholds + npred * nbins_pred ; // Not needed for computation but nice to print for user pred_bin = (int *) malloc ( npred * n_cases * sizeof(int) ) ; work_bin = (int *) malloc ( n_cases * sizeof(int) ) ; if (type == SCREEN_RR_TAILS) { assert ( nbins_pred == 2 ) ; k = 3 ; // We go trinary for redundancy } else k = nbins_pred ; if (k >= nbins_target) bins_dim = k * k ; else bins_dim = k * nbins_target ; bin_counts = (int *) malloc ( max_threads * bins_dim * sizeof(int) ) ; tail_n = (int *) malloc ( npred * sizeof(int) ) ; // We use tail_n[0] if DISCRETE, so we need it for eitherz if (type == SCREEN_RR_TAILS) { target_thresholds = (double *) malloc ( 2 * nbins_target * npred * sizeof(double) ) ; // target_thresholds, target_marginal target_marginal = target_thresholds + nbins_target * npred ; target_bin = (int *) malloc ( npred * n_cases * sizeof(int) ) ; // Target bin separate for each predictor redun_pred_bin = (int *) malloc ( npred * n_cases * sizeof(int) ) ; // Trinary for redundancy calculation redun_pred_marginal = (double *) malloc ( 3 * npred * sizeof(double) ) ; // Trinary } else if (type == SCREEN_RR_DISCRETE) { target_thresholds = (double *) malloc ( 2 * nbins_target * sizeof(double) ) ; // 
target_thresholds, target_marginal target_marginal = target_thresholds + nbins_target ; target_bin = (int *) malloc ( n_cases * sizeof(int) ) ; // Target bin the same for all predictors } if (pred_thresholds == NULL || target_thresholds == NULL || pred_bin == NULL || work_bin == NULL || target_bin == NULL || bin_counts == NULL) { audit ( "ERROR: Insufficient memory for Relevance minus Redundancy" ) ; ret_val = ERROR_INSUFFICIENT_MEMORY ; goto FINISH ; } /* Make an initial pass through the data to find predictor thresholds and permanently save bin indices for predictors and target. If tails-only, we must save the associated target subset indices, separately for each predictor. If not tails only, do target when ivar=-1. */ for (ivar=-1 ; ivar<npred ; ivar++) { if (ivar == -1) { // If this is target pass if (type == SCREEN_RR_TAILS) // But user specified tails only continue ; // then we process the targets separately for each predictor's subset } else varnum = preds[ivar] ; if (user_pressed_escape()) { audit ( "ERROR: User pressed ESCape during RELEVANCE MINUS REDUNDANCY" ) ; ret_val = ERROR_ESCAPE ; goto FINISH ; } // At this point, one of three things holds: // Case 1: ivar=-1 (which implies not TAILS): This is the target // Case 2: ivar>=0, not TAILS: This is a predictor // Case 3: ivar>=0, TAILS: This is a predictor AND we must save the corresponding target // ------> Case 1: ivar=-1 (which implies not TAILS): This is the target if (ivar == -1) { for (i=0 ; i<n_cases ; i++) // Extract target from database casework[i] = database[i*n_vars+targetvar] ; target_bounds = target_thresholds ; k = nbins_target ; partition ( n_cases , casework , &k , target_bounds , target_bin ) ; if (k <nbins_target) { sprintf_s ( msg, "ERROR: Numerous ties reduced target bins to %d", k ) ; audit ( msg ) ; ret_val = ERROR_SYNTAX ; goto FINISH ; } assert ( k == nbins_target ) ; tail_n[0] = n_cases ; // Later code is simplified if we save this as if TAILS } // ------> Case 2: ivar>=0, not 
TAILS: This is a predictor else if (ivar >= 0 && type != SCREEN_RR_TAILS) { for (i=0 ; i<n_cases ; i++) // Extract predictor from database casework[i] = database[i*n_vars+varnum] ; pred_bounds = pred_thresholds + ivar * nbins_pred ; k = nbins_pred ; partition ( n_cases , casework , &k , pred_bounds , pred_bin+ivar*n_cases ) ; if (k <nbins_pred) { sprintf_s ( msg, "ERROR: Numerous ties reduced predictor %s bins to %d", var_names[preds[ivar]], k ) ; audit ( msg ) ; ret_val = ERROR_SYNTAX ; goto FINISH ; } assert ( k == nbins_pred ) ; } // ------> Case 3: ivar>=0, TAILS: This is a predictor AND we must save the corresponding target else if (ivar >= 0 && type == SCREEN_RR_TAILS) { // Compute predictor bounds per tail fraction for (i=0 ; i<n_cases ; i++) // Extract predictor from database casework[i] = database[i*n_vars+varnum] ; qsortd ( 0 , n_cases-1 , casework ) ; pred_bounds = pred_thresholds + ivar * nbins_pred ; k = (int) (tail_frac * (n_cases+1)) - 1 ; if (k < 0) k = 0 ; pred_bounds[0] = casework[k] ; pred_bounds[1] = casework[n_cases-1-k] ; // Compute and save predictor bin indices; Also save target for soon computing its bounds and indices n = 0 ; for (i=0 ; i<n_cases ; i++) { if (database[i*n_vars+varnum] <= pred_bounds[0]) { pred_bin[ivar*n_cases+n] = 0 ; redun_pred_bin[ivar*n_cases+i] = 0 ; // Need this for intra-predictor redundancy } else if (database[i*n_vars+varnum] >= pred_bounds[1]) { pred_bin[ivar*n_cases+n] = 1 ; redun_pred_bin[ivar*n_cases+i] = 1 ; } else { redun_pred_bin[ivar*n_cases+i] = 2 ; continue ; } casework[n] = database[i*n_vars+targetvar] ; ++n ; } tail_n[ivar] = n ; // Compute the target bounds based on this 'predictor tail' subset of the entire dataset target_bounds = target_thresholds + ivar * nbins_target ; k = nbins_target ; partition ( n , casework , &k , target_bounds , target_bin+ivar*n_cases ) ; if (k <nbins_target) { sprintf_s ( msg, "ERROR: Numerous ties reduced target bins to %d", k ) ; audit ( msg ) ; ret_val = ERROR_SYNTAX ; 
goto FINISH ; } } else assert ( 1 == 0 ) ; } // For ivar (reading each variable) /* All thresholds (predictor and target) are computed and saved. The predictor and target bin indices are also saved. If not TAILS, the saved target bin indices are based on the entire dataset, and the saved target thresholds are similarly for the entire dataset. But if TAILS, each predictor candidate will have its own target subset and thresholds corresponding to that subset. Print the thresholds for the user's edification */ audit ( "" ) ; audit ( "" ) ; audit ( "The bounds that define bins are now shown" ) ; audit ( "" ) ; if (type == SCREEN_RR_TAILS) { audit ( "Target bounds are shown (after :) separately for each predictor candidate" ) ; audit ( "" ) ; audit ( " Variable Predictor bounds... : Target bounds" ) ; audit ( "" ) ; } else { audit ( "Target bounds are based on the entire dataset..." ) ; sprintf_s ( msg , "%12.5lf", target_thresholds[0] ) ; for (i=1 ; i<nbins_target-1 ; i++) { sprintf_s ( msg2 , " %12.5lf", target_thresholds[i] ) ; strcat_s ( msg , msg2 ) ; } audit ( msg ) ; audit ( "" ) ; audit ( " Variable Bounds..." ) ; audit ( "" ) ; } for (ivar=0 ; ivar<npred ; ivar++) { pred_bounds = pred_thresholds + ivar * nbins_pred ; sprintf_s ( msg, "%15s %12.5lf", var_names[preds[ivar]], pred_bounds[0] ) ; k = (type == SCREEN_RR_TAILS) ? 
2 : nbins_pred-1 ; for (i=1 ; i<k ; i++) { sprintf_s ( msg2 , " %12.5lf", pred_bounds[i] ) ; strcat_s ( msg , msg2 ) ; } if (type == SCREEN_RR_TAILS) { target_bounds = target_thresholds + ivar * nbins_target ; sprintf_s ( msg2 , " : %12.5lf", target_bounds[0] ) ; strcat_s ( msg , msg2 ) ; for (i=1 ; i<nbins_target-1 ; i++) { sprintf_s ( msg2 , " %12.5lf", target_bounds[i] ) ; strcat_s ( msg , msg2 ) ; } } // If TAILS audit ( msg ) ; } // For all predictors /* Compute marginals */ for (ivar=0 ; ivar<npred ; ivar++) { for (i=0 ; i<nbins_pred ; i++) pred_marginal[ivar*nbins_pred+i] = 0.0 ; if (ivar==0 || type == SCREEN_RR_TAILS) { for (i=0 ; i<nbins_target ; i++) target_marginal[ivar*nbins_target+i] = 0.0 ; } for (i=0 ; i<n_cases ; i++) { ++pred_marginal[ivar*nbins_pred+pred_bin[ivar*n_cases+i]] ; if (type == SCREEN_UNIVAR_TAILS) { ++target_marginal[ivar*nbins_target+target_bin[ivar*n_cases+i]] ; if (i == tail_n[ivar]-1) break ; } else if (ivar == 0) // Do target just once ++target_marginal[target_bin[i]] ; } // For all cases if (type == SCREEN_RR_TAILS) { // Trinary for (i=0 ; i<3 ; i++) redun_pred_marginal[ivar*3+i] = 0.0 ; for (i=0 ; i<n_cases ; i++) ++redun_pred_marginal[ivar*3+redun_pred_bin[ivar*n_cases+i]] ; } } for (ivar=0 ; ivar<npred ; ivar++) { // Divide counts by number of cases to get marginal if (type == SCREEN_UNIVAR_TAILS) { assert ( nbins_pred == 2 ) ; for (i=0 ; i<nbins_pred ; i++) pred_marginal[ivar*nbins_pred+i] /= tail_n[ivar] ; for (i=0 ; i<3 ; i++) redun_pred_marginal[ivar*3+i] /= n_cases ; } else { for (i=0 ; i<nbins_pred ; i++) pred_marginal[ivar*nbins_pred+i] /= n_cases ; } if (ivar==0 || type == SCREEN_UNIVAR_TAILS) { for (i=0 ; i<nbins_target ; i++) target_marginal[ivar*nbins_target+i] /= tail_n[ivar] ; } } /* Print the marginals for the user's edification */ audit ( "" ) ; audit ( "" ) ; audit ( "The marginal distributions are now shown." ) ; audit ( "If the data is continuous, the marginals will be nearly equal." 
) ; audit ( "Widely unequal marginals indicate potentially problematic ties." ) ; audit ( "" ) ; if (type == SCREEN_UNIVAR_TAILS) { audit ( "Target marginals are shown (after :) separately for each predictor candidate" ) ; audit ( "" ) ; audit ( " Variable Predictor marginals... : Target marginals" ) ; audit ( "" ) ; } else { audit ( "Target marginals are based on the entire dataset..." ) ; sprintf_s ( msg , "%12.5lf", target_marginal[0] ) ; for (i=1 ; i<nbins_target ; i++) { sprintf_s ( msg2 , " %12.5lf", target_marginal[i] ) ; strcat_s ( msg , msg2 ) ; } audit ( msg ) ; audit ( "" ) ; audit ( " Variable Marginal..." ) ; audit ( "" ) ; } for (ivar=0 ; ivar<npred ; ivar++) { sprintf_s ( msg, "%15s %12.5lf", var_names[preds[ivar]], pred_marginal[ivar*nbins_pred+0] ) ; for (i=1 ; i<nbins_pred ; i++) { sprintf_s ( msg2 , " %12.5lf", pred_marginal[ivar*nbins_pred+i] ) ; strcat_s ( msg , msg2 ) ; } if (type == SCREEN_UNIVAR_TAILS) { sprintf_s ( msg2 , " : %12.5lf", target_marginal[ivar*nbins_target+0] ) ; strcat_s ( msg , msg2 ) ; for (i=1 ; i<nbins_target ; i++) { sprintf_s ( msg2 , " %12.5lf", target_marginal[ivar*nbins_target+i] ) ; strcat_s ( msg , msg2 ) ; } } // If TAILS audit ( msg ) ; } // For all predictors disallow_menu = 0 ; mouse_cursor_arrow () ; end_progbar () ; } // If binning type (TAILS, DISCRETE) /* -------------------------------------------------------------------------------- Outer-most loop does MCPT replications -------------------------------------------------------------------------------- */ if (mcpt_reps < 1) mcpt_reps = 1 ; for (irep=0 ; irep<mcpt_reps ; irep++) { /* Shuffle target if in permutation run (irep>0) */ if (irep) { // If doing permuted runs, shuffle if (mcpt_type == 1) { // Complete if (type == SCREEN_UNIVAR_CONTINUOUS) { i = n_cases ; // Number remaining to be shuffled while (i > 1) { // While at least 2 left to shuffle j = (int) (unifrand_fast () * i) ; if (j >= i) j = i - 1 ; dtemp = target[--i] ; target[i] = target[j] ; 
target[j] = dtemp ; } } // If not using bins else if (type == SCREEN_UNIVAR_TAILS) { // Each predictor has its own target subset for (ivar=0 ; ivar<npred ; ivar++) { target_bin_ptr = target_bin + ivar * n_cases ; i = tail_n[ivar] ; // Number remaining to be shuffled while (i > 1) { // While at least 2 left to shuffle j = (int) (unifrand_fast () * i) ; if (j >= i) j = i - 1 ; k = target_bin_ptr[--i] ; target_bin_ptr[i] = target_bin_ptr[j] ; target_bin_ptr[j] = k ; } } } // Else if TAILS else { i = n_cases ; // Number remaining to be shuffled while (i > 1) { // While at least 2 left to shuffle j = (int) (unifrand_fast () * i) ; if (j >= i) j = i - 1 ; k = target_bin[--i] ; target_bin[i] = target_bin[j] ; target_bin[j] = k ; } } // Else discrete using entire dataset } // Type 1, Complete else if (mcpt_type == 2) { // Cyclic if (type == SCREEN_UNIVAR_CONTINUOUS) { j = (int) (unifrand_fast () * n_cases) ; if (j >= n_cases) j = n_cases - 1 ; for (i=0 ; i<n_cases ; i++) casework[i] = target[(i+j)%n_cases] ; for (i=0 ; i<n_cases ; i++) target[i] = casework[i] ; } // If continuous else if (type == SCREEN_UNIVAR_TAILS) { // Each predictor has its own target subset for (ivar=0 ; ivar<npred ; ivar++) { target_bin_ptr = target_bin + ivar * n_cases ; k = tail_n[ivar] ; j = (int) (unifrand_fast () * k) ; if (j >= k) j = k - 1 ; for (i=0 ; i<k ; i++) work_bin[i] = target_bin_ptr[(i+j)%k] ; for (i=0 ; i<k ; i++) target_bin_ptr[i] = work_bin[i] ; } } // Else if TAILS else { j = (int) (unifrand_fast () * n_cases) ; if (j >= n_cases) j = n_cases - 1 ; for (i=0 ; i<n_cases ; i++) work_bin[i] = target_bin[(i+j)%n_cases] ; for (i=0 ; i<n_cases ; i++) target_bin[i] = work_bin[i] ; } // Else discrete using entire dataset } // Type 2, Cyclic } // If in permutation run (irep > 0) /* ----------------------------------------------------------------------------------- First step: Compute and save criterion for all individual candidates 
----------------------------------------------------------------------------------- */ for (i=0 ; i<npred ; i++) // We'll test all candidates which_preds[i] = i ; if (type == SCREEN_RR_TAILS) ret_val = rr_threaded ( type , database , n_vars , preds , NULL , mcpt_reps , max_threads , n_cases , tail_n , npred , which_preds , nbins_pred , pred_bin , pred_marginal , nbins_target , target_bin , target_marginal , crit , bins_dim , bin_counts ) ; else ret_val = rr_threaded ( type , database , n_vars , preds , target , mcpt_reps , max_threads , n_cases , NULL , npred , which_preds , nbins_pred , pred_bin , pred_marginal , nbins_target , target_bin , target_marginal , crit , bins_dim , bin_counts ) ; if (user_pressed_escape() && ret_val == 0) ret_val = ERROR_ESCAPE ; if (ret_val) { audit ( "ERROR: User pressed ESCape during RELEVANCE MINUS REDUNDANCY" ) ; goto FINISH ; } /* The individual mutual information for each predictor has been computed and saved in crit. Update 'best' information for this replication. Print a sorted table if this is the first replication. Else update MCPT count. */ for (ivar=0 ; ivar<npred ; ivar++) { relevance[ivar] = crit[ivar] ; // Will need this for Step 2, addition of more predictors if (ivar == 0 || crit[ivar] > best_crit) { best_crit = crit[ivar] ; best_ivar = ivar ; } } stepwise_crit[0] = best_crit ; // Criterion for first var is largest MI stepwise_ivar[0] = best_ivar ; // It's this candidate sum_relevance = best_crit ; if (irep == 0) { // Original, unpermuted data original_stepwise_crit[0] = best_crit ; // Criterion for first var is largest MI original_stepwise_ivar[0] = best_ivar ; // It's this candidate original_sum_relevance[0] = sum_relevance ; stepwise_mcpt_count[0] = 1 ; // Initialize cumulative MCPT // We need original_relevance for printing final table. Other crits are just for this table. 
for (ivar=0 ; ivar<npred ; ivar++) { index[ivar] = ivar ; original_relevance[ivar] = sorted_crits[ivar] = current_crits[ivar] = crit[ivar] ; solo_mcpt_count[ivar] = 1 ; // Initialize solo MCPT } qsortdsi ( 0 , npred-1 , sorted_crits , index ) ; audit ( "" ) ; audit ( "" ) ; sprintf_s ( msg, "Initial candidates, in order of decreasing mutual information with %s", var_names[targetvar] ) ; audit ( msg ) ; audit ( "" ) ; audit ( " Variable MI" ) ; audit ( "" ) ; for (i=npred-1 ; i>=0 ; i--) { k = index[i] ; sprintf_s ( msg, "%15s %12.4lf", var_names[preds[k]], current_crits[k] ) ; audit ( msg ) ; } } // If irep=0 (original, unpermuted run) else { // Count for MCPT if (sum_relevance >= original_sum_relevance[0]) ++stepwise_mcpt_count[0] ; for (ivar=0 ; ivar<npred ; ivar++) { if (relevance[ivar] >= original_relevance[ivar]) ++solo_mcpt_count[ivar] ; } } // Permuted /* ----------------------------------------------------------------------------------- Second step: Iterate to add more candidates Note that redundancy of a candidate can change as predictors are added. This is because the kept set is increasing, so sum_redundancy changes. 
----------------------------------------------------------------------------------- */ for (i=0 ; i<npred ; i++) sum_redundancy[i] = 0.0 ; // sum_redundancy[i] is the total redundancy of candidate i with kept set for (nkept=1 ; nkept<max_pred ; nkept++) { // Main outermost loop /* Print candidates kept so far (if in unpermuted rep) */ if (irep == 0) { // Original, unpermuted audit ( "" ) ; audit ( "" ) ; audit ( "Predictors so far Relevance Redundancy Criterion" ) ; audit ( "" ) ; for (i=0 ; i<nkept ; i++) { k = stepwise_ivar[i] ; // Cannot print sum_redundancy/nkept here because sum froze but nkept keeps increasing sprintf_s ( msg, "%15s %12.4lf %12.4lf %12.4lf", var_names[preds[k]], relevance[k], relevance[k] - stepwise_crit[i], stepwise_crit[i] ) ; audit ( msg ) ; } } /* Build in which_preds the candidates not yet selected */ k = 0 ; // Candidate vector is all except those already kept for (i=0 ; i<npred ; i++) { for (j=0 ; j<nkept ; j++) { if (stepwise_ivar[j] == i) break ; } if (j == nkept) which_preds[k++] = i ; } assert ( k == npred - nkept ) ; /* Compute the MI of the most recently added predictor with each remaining candidate */ if (user_pressed_escape()) { ret_val = ERROR_ESCAPE ; audit ( "ERROR: User pressed ESCape or other serious error during RELEVANCE MINUS REDUNDANCY" ) ; goto FINISH ; } k = stepwise_ivar[nkept-1] ; // Index in preds of most recently added candidate if (type == SCREEN_RR_TAILS) // redun_pred_? 
is trinary ret_val = rr_threaded ( type , database , n_vars , preds , NULL , mcpt_reps , max_threads , n_cases , NULL , npred-nkept , which_preds , 3 , redun_pred_bin , redun_pred_marginal , 3 , redun_pred_bin+k*n_cases , redun_pred_marginal+k*3 , crit , bins_dim , bin_counts ) ; else { if (type == SCREEN_RR_CONTINUOUS) { for (i=0 ; i<n_cases ; i++) casework[i] = database[i*n_vars+preds[k]] ; } ret_val = rr_threaded ( type , database , n_vars , preds , casework , mcpt_reps , max_threads , n_cases , NULL , npred-nkept , which_preds , nbins_pred , pred_bin , pred_marginal , nbins_pred , pred_bin+k*n_cases , pred_marginal+k*nbins_pred , crit , bins_dim , bin_counts ) ; } if (user_pressed_escape() && ret_val == 0) ret_val = ERROR_ESCAPE ; if (ret_val) { audit ( "ERROR: User pressed ESCape or other serious error during RELEVANCE MINUS REDUNDANCY" ) ; goto FINISH ; } /* The redundancy of each remaining candidate with the most recently added predictor is now in crit. Cumulate the sum of redundancy. Then compute the criteria, sorting and printing if this is the unpermuted replication. 
*/ for (i=0 ; i<npred-nkept ; i++) { k = which_preds[i] ; // Index in preds of this candidate sum_redundancy[k] += crit[i] ; index[i] = k ; sorted_crits[i] = current_crits[i] = relevance[k] - sum_redundancy[k] / nkept ; if (i == 0 || current_crits[i] > best_crit) { best_crit = current_crits[i] ; best_ivar = k ; } } stepwise_crit[nkept] = best_crit ; stepwise_ivar[nkept] = best_ivar ; sum_relevance += relevance[best_ivar] ; if (irep == 0) { // Original, unpermuted original_stepwise_crit[nkept] = best_crit ; original_stepwise_ivar[nkept] = best_ivar ; original_sum_relevance[nkept] = sum_relevance ; stepwise_mcpt_count[nkept] = 1 ; qsortdsi ( 0 , npred-nkept-1 , sorted_crits , index ) ; audit ( "" ) ; audit ( "" ) ; audit ( "Additional candidates, in order of decreasing relevance minus redundancy" ) ; audit ( "" ) ; audit ( " Variable Relevance Redundancy Criterion" ) ; audit ( "" ) ; for (i=npred-nkept-1 ; i>=0 ; i--) { k = index[i] ; sprintf_s ( msg, "%15s %12.4lf %12.4lf %12.4lf", var_names[preds[k]], relevance[k], sum_redundancy[k] / nkept, relevance[k] - sum_redundancy[k] / nkept ) ; audit ( msg ) ; } } // If irep=0 (original, unpermuted run) else { // Count for MCPT if (sum_relevance >= original_sum_relevance[nkept]) ++stepwise_mcpt_count[nkept] ; } // Permuted } // Second step (for nkept): Iterate to add predictors to kept set } // For all MCPT replications /* -------------------------------------------------------------------------------- All computation is finished. Print. 
-------------------------------------------------------------------------------- */ audit ( "" ) ; audit ( "" ) ; /* Print final list of candidates and p-values */ audit ( "" ) ; audit ( "" ) ; sprintf_s ( msg, "----------> Final results predicting %s <----------", var_names[targetvar] ) ; audit ( msg ) ; audit ( "" ) ; if (mcpt_reps > 1) audit ( "Final predictors Relevance Redundancy Criterion Solo pval Group pval" ) ; else audit ( "Final predictors Relevance Redundancy Criterion" ) ; audit ( "" ) ; for (i=0 ; i<nkept ; i++) { // Cannot print sum_redundancy/nkept here because sum froze but nkept keeps increasing k = original_stepwise_ivar[i] ; if (mcpt_reps > 1) sprintf_s ( msg, "%15s %12.4lf %12.4lf %12.4lf %8.3lf %8.3lf", var_names[preds[k]], original_relevance[k], original_relevance[k] - original_stepwise_crit[i], original_stepwise_crit[i], (double) solo_mcpt_count[k] / (double) mcpt_reps, (double) stepwise_mcpt_count[i] / (double) mcpt_reps ) ; else sprintf_s ( msg, "%15s %12.4lf %12.4lf %12.4lf", var_names[preds[k]], original_relevance[k], original_relevance[k] - original_stepwise_crit[i], original_stepwise_crit[i] ) ; audit ( msg ) ; } /* Finished. Clean up and exit. */ FINISH: if (casework != NULL) free ( casework ) ; if (mutual != NULL) free ( mutual ) ; if (index != NULL) free ( index ) ; if (pred_thresholds != NULL) free ( pred_thresholds ) ; if (target_thresholds != NULL) free ( target_thresholds ) ; if (pred_bin != NULL) free ( pred_bin ) ; if (redun_pred_bin != NULL) free ( redun_pred_bin ) ; if (redun_pred_marginal != NULL) free ( redun_pred_marginal ) ; if (work_bin != NULL) free ( work_bin ) ; if (target_bin != NULL) free ( target_bin ) ; if (bin_counts != NULL) free ( bin_counts ) ; if (target != NULL) free ( target ) ; if (tail_n != NULL) free ( tail_n ) ; return ret_val ; }
/*
   ConfConf driver: checks by simulation the confidence attached to
   order-statistic quantile bounds and Kolmogorov-Smirnov thresholds.

   Command line: ConfConf ncases pval conf nreps

   It prints the pessimistic quantile bounds and KS thresholds, then draws
   nreps uniform samples and reports how often each bound was violated.
   Returns 0 on completion; calls exit(1) on a usage or parameter error
   or if the sample buffer cannot be allocated.
*/

int main (
   int argc ,     // Number of command line arguments (includes prog name)
   char *argv[]   // Arguments (prog name is argv[0])
   )
{
   int i, ncases, irep, nreps, m, n_lower, n_upper, n_ks2, n_ks_null, n_ks_alt ;
   double *x, pval, conf, pessimistic_lower, pessimistic_upper ;
   double ks_two, ks_one, D, Dp, Dm ;

   if (argc != 5) {
      printf ( "\nUsage: ConfConf ncases pval conf nreps" ) ;
      printf ( "\n ncases - Number of cases in the sample" ) ;
      printf ( "\n pval - Probability value (<0.5) for quantile test" ) ;
      printf ( "\n conf - Desired confidence value (<0.5) for both tests" ) ;
      printf ( "\n nreps - Number of replications" ) ;
      exit ( 1 ) ;
      }

   ncases = atoi ( argv[1] ) ;
   pval = atof ( argv[2] ) ;
   conf = atof ( argv[3] ) ;
   nreps = atoi ( argv[4] ) ;

   if (ncases < 10) {
      printf ( "\nERROR.. Must have at least 10 cases" ) ;
      exit ( 1 ) ;
      }

   // pval * ncases >= 1 guarantees m >= 1 below, so x[m-1] is a valid index
   if (pval * ncases < 1.0 || pval >= 0.5) {
      printf ( "\nERROR.. Pval too small or too large" ) ;
      exit ( 1 ) ;
      }

   if (conf <= 0.0 || conf >= 0.5) {
      printf ( "\nERROR.. Conf must be greater than 0 and less than 0.5" ) ;
      exit ( 1 ) ;
      }

   if (nreps < 1) {
      printf ( "\nERROR.. Must have at least 1 replication" ) ;
      exit ( 1 ) ;
      }

/*
   Allocate memory and initialize
*/

   x = (double *) malloc ( ncases * sizeof(double) ) ;
   if (x == NULL) {   // The sample buffer is written in every replication, so fail early
      printf ( "\nERROR.. Insufficient memory" ) ;
      exit ( 1 ) ;
      }

   m = (int) (pval * ncases) ;  // Conservative order statistic for bound
   pessimistic_lower = quantile_conf ( ncases , m , conf ) ;
   pessimistic_upper = 1.0 - pessimistic_lower ;

   ks_two = inverse_ks ( ncases , 1.0 - conf ) ;        // Two-tailed test
   ks_one = inverse_ks ( ncases , 1.0 - 2.0 * conf ) ;  // One-tailed test

   printf ( "\nSuppose the model predicts values near 0 for the null hypothesis" ) ;
   printf ( "\nand values near 1 for the alternative hypothesis." ) ;

   printf ( "\n\nIf the dataset represents the null hypothesis, the threshold" ) ;
   printf ( "\nfor rejecting the null at p=%.4lf is given by the %d'th order statistic.",
            pval, ncases - m + 1 ) ;
   printf ( "\nThis is a conservative estimate of the %.4lf quantile", 1.0-pval ) ;
   printf ( "\nThere is only a %.4lf chance that it will really be the %.4lf quantile or worse.",
            conf, pessimistic_upper ) ;

   printf ( "\n\nIf the dataset represents the alternative hypothesis, the threshold" ) ;
   printf ( "\nfor rejecting the alt at p=%.4lf is given by the %d'th order statistic.",
            pval, m ) ;
   printf ( "\nThis is a conservative estimate of the %.4lf quantile", pval ) ;
   printf ( "\nThere is only a %.4lf chance that it will really be the %.4lf quantile or worse.",
            conf, pessimistic_lower) ;

   printf ( "\n\nKS thresholds: two-tailed KS = %.4lf one-tailed KS = %.4lf",
            ks_two, ks_one ) ;

/*
   Now generate nreps samples.  Verify that our required confidence level
   is observed.  Note that the fact that this test uses a uniform distribution
   does not in any way limit its applicability to uniform distributions.
   If one were to generate cases from any other reasonable distribution,
   the pessimistic quantile bounds would have to be transformed similarly.
   The result is that the inequalities below would pass or fail identically.

   We count the number of times 'disaster' happens.
   Disaster is when the order statistic used for the threshold is toward
   the inside (center) of the distribution, meaning that if this order
   statistic had been used as a threshold, more of the distribution would
   be outside the threshold than the user expected.
   We expect disaster to happen with probability equal to the specified
   conf parameter.

   For the two-tailed Kolmogorov-Smirnov test, disaster is when the
   empirical CDF deviates (above or below) from the correct value by
   more than the conf-inspired value.

   For the one-tailed test in which the dataset is from the NULL distribution,
   disaster is when the empirical CDF exceeds the true CDF, a situation that
   would encourage false rejection of the null hypothesis.
   This is measured by D+.

   For the one-tailed test in which the dataset is from the ALT distribution,
   disaster is when the empirical CDF is less than the true CDF, a situation
   that would encourage false rejection of the alternative hypothesis.
   This is measured by D-.
*/

   n_lower = n_upper = n_ks2 = n_ks_null = n_ks_alt = 0 ;

   for (irep=0 ; irep<nreps ; irep++) {

      for (i=0 ; i<ncases ; i++)
         x[i] = unifrand () ;

      qsortd ( 0 , ncases-1 , x ) ;   // Order statistics are read from the sorted sample

      if (x[m-1] > pessimistic_lower)
         ++n_lower ;

      if (x[ncases-m] < pessimistic_upper)
         ++n_upper ;

      D = ks_test ( ncases , x , &Dp , &Dm ) ;
      if (D > ks_two)
         ++n_ks2 ;
      if (Dp > ks_one)
         ++n_ks_null ;
      if (Dm > ks_one)
         ++n_ks_alt ;
      }

   printf ( "\nPoint failure (expected=%.4lf) Lower=%.4lf Upper=%.4lf",
            conf, (double) n_lower / nreps, (double) n_upper / nreps) ;
   printf ( "\nKS failure: two-tailed = %.4lf NULL = %.4lf ALT = %.4lf",
            (double) n_ks2 / nreps, (double) n_ks_null / nreps, (double) n_ks_alt / nreps) ;

   free ( x ) ;
   return ( 0 ) ;
}
int main ( int argc , // Number of command line arguments (includes prog name) char *argv[] // Arguments (prog name is argv[0]) ) { int i, j, k, nvars, ncases, irep, nreps, ivar, nties, ties ; int n_indep_vars, idep, icand, *index, *mcpt_max_counts, *mcpt_same_counts, *mcpt_solo_counts ; double *data, *work, dtemp, *save_info, criterion, *crits ; char filename[256], **names, depname[256] ; FILE *fp ; MutualInformationAdaptive *mi_adapt ; /* Process command line parameters */ #if 1 if (argc != 5) { printf ( "\nUsage: MI_ONLY datafile n_indep depname nreps" ) ; printf ( "\n datafile - name of the text file containing the data" ) ; printf ( "\n The first line is variable names" ) ; printf ( "\n Subsequent lines are the data." ) ; printf ( "\n Delimiters can be space, comma, or tab" ) ; printf ( "\n n_indep - Number of independent vars, starting with the first" ) ; printf ( "\n depname - Name of the 'dependent' variable" ) ; printf ( "\n It must be AFTER the first n_indep variables" ) ; printf ( "\n nreps - Number of Monte-Carlo permutations, including unpermuted" ) ; exit ( 1 ) ; } strcpy ( filename , argv[1] ) ; n_indep_vars = atoi ( argv[2] ) ; strcpy ( depname , argv[3] ) ; nreps = atoi ( argv[4] ) ; #else strcpy ( filename , "..\\SYNTH.TXT" ) ; n_indep_vars = 7 ; strcpy ( depname , "SUM1234" ) ; nreps = 100 ; #endif _strupr ( depname ) ; /* These are used by MEM.CPP for runtime memory validation */ _fullpath ( mem_file_name , "MEM.LOG" , 256 ) ; fp = fopen ( mem_file_name , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MEM.LOG file for writing!" ) ; return EXIT_FAILURE ; } fclose ( fp ) ; mem_keep_log = 0 ; // Change this to 1 to keep a memory use log (slows execution!) mem_max_used = 0 ; /* Open the text file to which results will be written */ fp = fopen ( "MI_ONLY.LOG" , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MI_ONLY.LOG file for writing!" 
) ; return EXIT_FAILURE ; } /* Read the file and locate the index of the dependent variable */ if (readfile ( filename , &nvars , &names , &ncases , &data )) return EXIT_FAILURE ; for (idep=0 ; idep<nvars ; idep++) { if (! strcmp ( depname , names[idep] )) break ; } if (idep == nvars) { printf ( "\nERROR... Dependent variable %s is not in file", depname ) ; return EXIT_FAILURE ; } if (idep < n_indep_vars) { printf ( "\nERROR... Dependent variable %s must be beyond independent vars", depname ) ; return EXIT_FAILURE ; } /* Check each variable for ties. This is not needed for the algorithm, but it is good to warn the user, because more than a very few tied values in any variable seriously degrades performance of the adaptive partitioning algorithm. */ MEMTEXT ( "MI_ONLY: Work" ) ; work = (double *) MALLOC ( ncases * sizeof(double) ) ; assert ( work != NULL ) ; ties = 0 ; assert ( work != NULL ) ; for (ivar=0 ; ivar<nvars ; ivar++) { if (ivar > n_indep_vars && ivar != idep) continue ; // Check only the variables selected by the user for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+ivar] ; qsortd ( 0 , ncases-1 , work ) ; nties = 0 ; for (i=1 ; i<ncases ; i++) { if (work[i] == work[i-1]) ++nties ; } if ((double) nties / (double) ncases > 0.05) { ++ties ; fprintf ( fp , "\nWARNING... 
%s has %.2lf percent ties!", names[ivar], 100.0 * nties / (double) ncases ) ; } } // For all variables if (ties) { fprintf ( fp , "\nThe presence of ties will seriously degrade" ) ; fprintf ( fp , "\nperformance of the adaptive partitioning algorithm\n\n" ) ; } /* Allocate scratch memory and create the MutualInformation object using the dependent variable crits - Mutual information criterion index - Indices that sort the criterion save_info - Ditto, this is univariate information, to be sorted mi_adapt - The MutualInformation object, constructed with the 'dependent' variable */ MEMTEXT ( "MI_ONLY work allocs plus MutualInformation" ) ; crits = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( crits != NULL ) ; index = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( index != NULL ) ; mcpt_max_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_max_counts != NULL ) ; mcpt_same_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_same_counts != NULL ) ; mcpt_solo_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_solo_counts != NULL ) ; save_info = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( save_info != NULL ) ; for (irep=0 ; irep<nreps ; irep++) { for (i=0 ; i<ncases ; i++) // Get the 'dependent' variable work[i] = data[i*nvars+idep] ; // Shuffle dependent variable if in permutation run (irep>0) if (irep) { // If doing permuted runs, shuffle i = ncases ; // Number remaining to be shuffled while (i > 1) { // While at least 2 left to shuffle j = (int) (unifrand () * i) ; if (j >= i) j = i - 1 ; dtemp = work[--i] ; work[i] = work[j] ; work[j] = dtemp ; } } // Here we use a tiny split theshold (instead of the usual 6.0) so that it picks up // small amounts of mutual information (perhaps including noise). // If we used 6.0, nearly all permutations of any reasonably sized dataset // would have a computed mutual information of zero. 
It's safe picking up // some noise because the permutation test will account for this. mi_adapt = new MutualInformationAdaptive ( ncases , work , 1 , 0.1 ) ; // Deliberately tiny for low information assert ( mi_adapt != NULL ) ; /* Compute and save the mutual information for the dependent variable with each individual independent variable candidate. */ for (icand=0 ; icand<n_indep_vars ; icand++) { // Try all candidates for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+icand] ; criterion = mi_adapt->mut_inf ( work , 1 ) ; save_info[icand] = criterion ; // We will sort this when all candidates are done if (irep == 0) { // If doing original (unpermuted), save criterion index[icand] = icand ; // Will need original indices when criteria are sorted crits[icand] = criterion ; mcpt_max_counts[icand] = mcpt_same_counts[icand] = mcpt_solo_counts[icand] = 1 ; // This is >= itself so count it now } else { if (criterion >= crits[icand]) ++mcpt_solo_counts[icand] ; } } // Initial list of all candidates delete mi_adapt ; mi_adapt = NULL ; if (irep == 0) // Find the indices that sort the candidates per criterion qsortdsi ( 0 , n_indep_vars-1 , save_info , index ) ; else { qsortd ( 0 , n_indep_vars-1 , save_info ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { if (save_info[icand] >= crits[index[icand]]) ++mcpt_same_counts[index[icand]] ; if (save_info[n_indep_vars-1] >= crits[index[icand]]) // Valid only for largest ++mcpt_max_counts[index[icand]] ; } } } // For all reps fprintf ( fp , "\nAdaptive partitioning mutual information of %s", depname); fprintf ( fp , "\n" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\nPredictors, in order of decreasing mutual information" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Information Solo pval Min pval Max pval" ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Do all candidates k = index[n_indep_vars-1-icand] ; // Index of sorted candidate fprintf ( fp , "\n%31s %11.5lf %12.4lf %10.4lf %10.4lf", names[k], crits[k], (double) 
mcpt_solo_counts[k] / nreps, (double) mcpt_same_counts[k] / nreps, (double) mcpt_max_counts[k] / nreps ) ; } MEMTEXT ( "MI_ONLY: Finish" ) ; fclose ( fp ) ; FREE ( work ) ; FREE ( crits ) ; FREE ( index ) ; FREE ( mcpt_max_counts ) ; FREE ( mcpt_same_counts ) ; FREE ( mcpt_solo_counts ) ; FREE ( save_info ) ; free_data ( nvars , names , data ) ; MEMCLOSE () ; printf ( "\n\nPress any key..." ) ; _getch () ; return EXIT_SUCCESS ; }
int main ( int argc , // Number of command line arguments (includes prog name) char *argv[] // Arguments (prog name is argv[0]) ) { int i, j, k, nvars, ncases, irep, nreps, nbins, nbins_dep, nbins_indep, *count ; int n_indep_vars, idep, icand, *index, *mcpt_max_counts, *mcpt_same_counts, *mcpt_solo_counts ; short int *bins_dep, *bins_indep ; double *data, *work, dtemp, *save_info, criterion, *crits ; double *ab, *bc, *b ; char filename[256], **names, depname[256] ; FILE *fp ; /* Process command line parameters */ #if 1 if (argc != 6) { printf ( "\nUsage: TRANSFER datafile n_indep depname nreps" ) ; printf ( "\n datafile - name of the text file containing the data" ) ; printf ( "\n The first line is variable names" ) ; printf ( "\n Subsequent lines are the data." ) ; printf ( "\n Delimiters can be space, comma, or tab" ) ; printf ( "\n n_indep - Number of independent vars, starting with the first" ) ; printf ( "\n depname - Name of the 'dependent' variable" ) ; printf ( "\n It must be AFTER the first n_indep variables" ) ; printf ( "\n nbins - Number of bins for all variables" ) ; printf ( "\n nreps - Number of Monte-Carlo permutations, including unpermuted" ) ; exit ( 1 ) ; } strcpy ( filename , argv[1] ) ; n_indep_vars = atoi ( argv[2] ) ; strcpy ( depname , argv[3] ) ; nbins = atoi ( argv[4] ) ; nreps = atoi ( argv[5] ) ; #else strcpy ( filename , "..\\SYNTH.TXT" ) ; n_indep_vars = 7 ; strcpy ( depname , "SUM1234" ) ; nbins = 2 ; nreps = 1 ; #endif _strupr ( depname ) ; /* These are used by MEM.CPP for runtime memory validation */ _fullpath ( mem_file_name , "MEM.LOG" , 256 ) ; fp = fopen ( mem_file_name , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MEM.LOG file for writing!" ) ; return EXIT_FAILURE ; } fclose ( fp ) ; mem_keep_log = 1 ; // Change this to 1 to keep a memory use log (slows execution!) 
mem_max_used = 0 ; /* Open the text file to which results will be written */ fp = fopen ( "TRANSFER.LOG" , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open TRANSFER.LOG file for writing!" ) ; return EXIT_FAILURE ; } /* Read the file and locate the index of the dependent variable */ if (readfile ( filename , &nvars , &names , &ncases , &data )) return EXIT_FAILURE ; for (idep=0 ; idep<nvars ; idep++) { if (! strcmp ( depname , names[idep] )) break ; } if (idep == nvars) { printf ( "\nERROR... Dependent variable %s is not in file", depname ) ; return EXIT_FAILURE ; } if (idep < n_indep_vars) { printf ( "\nERROR... Dependent variable %s must be beyond independent vars", depname ) ; return EXIT_FAILURE ; } /* Allocate scratch memory crits - Transfer Entropy criterion index - Indices that sort the criterion save_info - Ditto, this is univariate criteria, to be sorted */ MEMTEXT ( "TRANSFER work allocs" ) ; work = (double *) MALLOC ( ncases * sizeof(double) ) ; assert ( work != NULL ) ; crits = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( crits != NULL ) ; index = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( index != NULL ) ; bins_indep = (short int *) MALLOC ( ncases * sizeof(short int) ) ; assert ( bins_indep != NULL ) ; bins_dep = (short int *) MALLOC ( ncases * sizeof(short int) ) ; assert ( bins_dep != NULL ) ; mcpt_max_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_max_counts != NULL ) ; mcpt_same_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_same_counts != NULL ) ; mcpt_solo_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_solo_counts != NULL ) ; save_info = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( save_info != NULL ) ; count = (int *) MALLOC ( nbins * nbins * nbins * sizeof(int) ) ; assert ( count != NULL ) ; ab = (double *) MALLOC ( nbins * nbins * sizeof(double) ) ; assert ( ab != NULL ) ; bc = (double *) MALLOC ( 
nbins * nbins * sizeof(double) ) ; assert ( bc != NULL ) ; b = (double *) MALLOC ( nbins * sizeof(double) ) ; assert ( b != NULL ) ; /* Get the dependent variable and partition it */ for (i=0 ; i<ncases ; i++) // Get the 'dependent' variable work[i] = data[i*nvars+idep] ; nbins_dep = nbins ; partition ( ncases , work , &nbins_dep , NULL , bins_dep ) ; /* Replication loop is here */ for (irep=0 ; irep<nreps ; irep++) { /* Compute and save the transfer entropy of the dependent variable with each individual independent variable candidate. */ for (icand=0 ; icand<n_indep_vars ; icand++) { // Try all candidates for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+icand] ; // Shuffle independent variable if in permutation run (irep>0) if (irep) { // If doing permuted runs, shuffle i = ncases ; // Number remaining to be shuffled while (i > 1) { // While at least 2 left to shuffle j = (int) (unifrand () * i) ; if (j >= i) j = i - 1 ; dtemp = work[--i] ; work[i] = work[j] ; work[j] = dtemp ; } } nbins_indep = nbins ; partition ( ncases , work , &nbins_indep , NULL , bins_indep ) ; criterion = trans_ent ( ncases , nbins_indep , nbins_dep , bins_indep , bins_dep , 0 , 1 , 1 , count , ab , bc , b ) ; save_info[icand] = criterion ; // We will sort this when all candidates are done if (irep == 0) { // If doing original (unpermuted), save criterion index[icand] = icand ; // Will need original indices when criteria are sorted crits[icand] = criterion ; mcpt_max_counts[icand] = mcpt_same_counts[icand] = mcpt_solo_counts[icand] = 1 ; // This is >= itself so count it now } else { if (criterion >= crits[icand]) ++mcpt_solo_counts[icand] ; } } // Initial list of all candidates if (irep == 0) // Find the indices that sort the candidates per criterion qsortdsi ( 0 , n_indep_vars-1 , save_info , index ) ; else { qsortd ( 0 , n_indep_vars-1 , save_info ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { if (save_info[icand] >= crits[index[icand]]) ++mcpt_same_counts[index[icand]] ; if 
(save_info[n_indep_vars-1] >= crits[index[icand]]) // Valid only for largest ++mcpt_max_counts[index[icand]] ; } } } // For all reps fprintf ( fp , "\nTransfer entropy of %s", depname); fprintf ( fp , "\n" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\nPredictors, in order of decreasing transfer entropy" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Information Solo pval Min pval Max pval" ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Do all candidates k = index[n_indep_vars-1-icand] ; // Index of sorted candidate fprintf ( fp , "\n%31s %11.5lf %12.4lf %10.4lf %10.4lf", names[k], crits[k], (double) mcpt_solo_counts[k] / nreps, (double) mcpt_same_counts[k] / nreps, (double) mcpt_max_counts[k] / nreps ) ; } MEMTEXT ( "TRANSFER: Finish" ) ; fclose ( fp ) ; FREE ( work ) ; FREE ( crits ) ; FREE ( index ) ; FREE ( bins_indep ) ; FREE ( bins_dep ) ; FREE ( mcpt_max_counts ) ; FREE ( mcpt_same_counts ) ; FREE ( mcpt_solo_counts ) ; FREE ( save_info ) ; FREE ( count ) ; FREE ( ab ) ; FREE ( bc ) ; FREE ( b ) ; free_data ( nvars , names , data ) ; MEMCLOSE () ; printf ( "\n\nPress any key..." ) ; _getch () ; return EXIT_SUCCESS ; }
int main ( int argc , // Number of command line arguments (includes prog name) char *argv[] // Arguments (prog name is argv[0]) ) { int i, j, k, nvars, ncases, ndiv, maxkept, ivar, nties, ties ; int n_indep_vars, idep, icand, iother, ibest, *sortwork, nkept, *kept ; double *data, *work ; double *save_info, *univar_info, *pair_info, bestredun, redun, bestcrit ; double criterion, relevance, redundancy, *crits, *reduns ; char filename[256], **names, depname[256] ; char trial_name[256], *pair_found ; FILE *fp ; MutualInformationParzen *mi_parzen ; MutualInformationAdaptive *mi_adapt ; /* Process command line parameters */ #if 1 if (argc != 6) { printf ( "\nUsage: MI_CONT datafile n_indep depname ndiv maxkept" ) ; printf ( "\n datafile - name of the text file containing the data" ) ; printf ( "\n The first line is variable names" ) ; printf ( "\n Subsequent lines are the data." ) ; printf ( "\n Delimiters can be space, comma, or tab" ) ; printf ( "\n n_indep - Number of independent vars, starting with the first" ) ; printf ( "\n depname - Name of the 'dependent' variable" ) ; printf ( "\n It must be AFTER the first n_indep variables" ) ; printf ( "\n ndiv - Normally zero, to employ adaptive partitioning" ) ; printf ( "\n Specify 5 (for very few cases) to 15 (for an" ) ; printf ( "\n enormous number of cases) to use Parzen windows" ) ; printf ( "\n maxkept - Stepwise will allow at most this many predictors" ) ; exit ( 1 ) ; } strcpy ( filename , argv[1] ) ; n_indep_vars = atoi ( argv[2] ) ; strcpy ( depname , argv[3] ) ; ndiv = atoi ( argv[4] ) ; maxkept = atoi ( argv[5] ) ; #else strcpy ( filename , "..\\VARS.TXT" ) ; n_indep_vars = 8 ; strcpy ( depname , "DAY_RETURN" ) ; ndiv = 0 ; maxkept = 5 ; #endif _strupr ( depname ) ; /* These are used by MEM.CPP for runtime memory validation */ _fullpath ( mem_file_name , "MEM.LOG" , 256 ) ; fp = fopen ( mem_file_name , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MEM.LOG file for writing!" 
) ; return EXIT_FAILURE ; } fclose ( fp ) ; mem_keep_log = 1 ; mem_max_used = 0 ; /* Open the text file to which results will be written */ fp = fopen ( "MI_CONT.LOG" , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MI_CONT.LOG file for writing!" ) ; return EXIT_FAILURE ; } /* Read the file and locate the index of the 'dependent' variable */ if (readfile ( filename , &nvars , &names , &ncases , &data )) return EXIT_FAILURE ; for (idep=0 ; idep<nvars ; idep++) { if (! strcmp ( depname , names[idep] )) break ; } if (idep == nvars) { printf ( "\nERROR... Dependent variable %s is not in file", depname ) ; return EXIT_FAILURE ; } if (idep < n_indep_vars) { printf ( "\nERROR... Dependent variable %s must be beyond independent vars", depname ) ; return EXIT_FAILURE ; } /* If adaptive partitioning is specified, check each variable for ties. This is not needed for the algorithm, but it is good to warn the user, because more than a very few tied values in any variable seriously degrades performance of the adaptive partitioning algorithm. */ MEMTEXT ( "MI_CONT: Work" ) ; work = (double *) MALLOC ( ncases * sizeof(double) ) ; assert ( work != NULL ) ; if (ndiv == 0) { // If adaptive partitioning, check for ties ties = 0 ; assert ( work != NULL ) ; for (ivar=0 ; ivar<nvars ; ivar++) { if (ivar > n_indep_vars && ivar != idep) continue ; // Check only the variables selected by the user for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+ivar] ; qsortd ( 0 , ncases-1 , work ) ; nties = 0 ; for (i=1 ; i<ncases ; i++) { if (work[i] == work[i-1]) ++nties ; } if ((double) nties / (double) ncases > 0.05) { ++ties ; fprintf ( fp , "\nWARNING... 
%s has %.2lf percent ties!", names[ivar], 100.0 * nties / (double) ncases ) ; } } // For all variables if (ties) { fprintf ( fp , "\nThe presence of ties will seriously degrade" ) ; fprintf ( fp , "\nperformance of the adaptive partitioning algorithm\n\n" ) ; } } // If adaptive partitioning, so testing for ties in the data /* Allocate scratch memory and create the MutualInformation object using the dependent variable kept - Array of indices of variables kept so far crits - Ditto, criterion reduns - Ditto, redundancy sortwork - Temporary use for printing variable's information sorted save_info - Ditto, this is univariate information, to be sorted univar_info - Also univariate information, but not sorted, for use in stepwise pair_found - Flag: is there valid info in the corresponding element of the next array pair_info - Preserve pairwise information of indeps to avoid expensive recalculation mi_parzen - The MutualInformation object, constructed with the 'dependent' variable mi_adapt - Ditto, but used if adaptive partitioning */ MEMTEXT ( "MI_CONT 6 allocs plus MutualInformation" ) ; kept = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( kept != NULL ) ; crits = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( crits != NULL ) ; reduns = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( reduns != NULL ) ; sortwork = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( sortwork != NULL ) ; save_info = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( save_info != NULL ) ; univar_info = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( univar_info != NULL ) ; pair_found = (char *) MALLOC ( (n_indep_vars * (n_indep_vars+1) / 2) * sizeof(char) ) ; assert ( pair_found != NULL ) ; pair_info = (double *) MALLOC ( (n_indep_vars * (n_indep_vars+1) / 2) * sizeof(double) ) ; assert ( pair_info != NULL ) ; for (i=0 ; i<ncases ; i++) // Get the 'dependent' variable work[i] = data[i*nvars+idep] ; if (ndiv > 0) { 
mi_parzen = new MutualInformationParzen ( ncases , work , ndiv ) ; mi_adapt = NULL ; assert ( mi_parzen != NULL ) ; } else { mi_adapt = new MutualInformationAdaptive ( ncases , work , 0 , 6.0 ) ; mi_parzen = NULL ; assert ( mi_adapt != NULL ) ; } memset ( pair_found , 0 , (n_indep_vars * (n_indep_vars+1) / 2) * sizeof(char) ) ; if (ndiv > 0) fprintf ( fp , "\nParzen mutual information of %s (ndiv=%d)", depname, ndiv); else fprintf ( fp , "\nAdaptive partitioning mutual information of %s", depname); fprintf ( fp , "\n" ) ; fprintf ( fp , "\n---------------------------------------------------------------" ) ; fprintf ( fp , "\n" ) ; /* Compute and save the mutual information for the dependent variable with each individual independent variable candidate. Print the results, sort them, and print them again, this time sorted. */ fprintf ( fp , "\nInitial candidates, in order of appearance in data file" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Information" ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Try all candidates for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+icand] ; if (ndiv > 0) criterion = mi_parzen->mut_inf ( work ) ; else criterion = mi_adapt->mut_inf ( work , 0 ) ; printf ( "\n%s = %.5lf", names[icand], criterion ) ; fprintf ( fp , "\n%31s %.5lf", names[icand], criterion ) ; sortwork[icand] = icand ; save_info[icand] = univar_info[icand] = criterion ; } // Initial list of all candidates if (mi_parzen != NULL) { delete mi_parzen ; mi_parzen = NULL ; } if (mi_adapt != NULL) { delete mi_adapt ; mi_adapt = NULL ; } fprintf ( fp , "\n" ) ; fprintf ( fp , "\nInitial candidates, in order of decreasing mutual information" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Information" ) ; qsortdsi ( 0 , n_indep_vars-1 , save_info , sortwork ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Do all candidates k = sortwork[n_indep_vars-1-icand] ; // Index of sorted candidate fprintf ( fp , "\n%31s %.5lf", names[k], 
save_info[n_indep_vars-1-icand] ) ; } /* Initialize the 'kept' set to be the best variable, and then begin the main outer loop that adds variables one at a time */ kept[0] = sortwork[n_indep_vars-1] ; // Index of best single candidate crits[0] = save_info[n_indep_vars-1] ; reduns[0] = 0.0 ; nkept = 1 ; if (maxkept > n_indep_vars) // Guard against silly user maxkept = n_indep_vars ; while (nkept < maxkept) { fprintf ( fp , "\n" ) ; fprintf ( fp , "\nVariables so far Relevance Redundancy Criterion" ) ; for (i=0 ; i<nkept ; i++) fprintf ( fp , "\n%31s %10.5lf %10.5lf %10.5lf", names[kept[i]], crits[i] + reduns[i], reduns[i], crits[i] ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\nSearching for an additional candidate..." ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Relevance Redundancy Criterion" ) ; bestcrit = -1.e60 ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Try all candidates for (i=0 ; i<nkept ; i++) { // Is this candidate already kept? if (kept[i] == icand) break ; } if (i < nkept) // If this candidate 'icand' is already kept continue ; // Skip it strcpy ( trial_name , names[icand] ) ; // Its name for printing for (i=0 ; i<ncases ; i++) // Get its cases work[i] = data[i*nvars+icand] ; if (ndiv > 0) { mi_parzen = new MutualInformationParzen ( ncases , work , ndiv ) ; mi_adapt = NULL ; assert ( mi_parzen != NULL ) ; } else { mi_adapt = new MutualInformationAdaptive ( ncases , work , 0 , 6.0 ) ; mi_parzen = NULL ; assert ( mi_adapt != NULL ) ; } relevance = univar_info[icand] ; // We saved it during initial printing printf ( "\n%s relevance = %.5lf", trial_name, relevance ) ; // Compute the redundancy of this candidate // This is the mean of its redundancy with all kept variables redundancy = 0.0 ; for (iother=0 ; iother<nkept ; iother++) { // Process entire kept set j = kept[iother] ; // Index of a variable in the kept set if (icand > j) // pair_found and pair_info are k = icand*(icand+1)/2+j ; // symmetric, so k is the index else // into them k 
= j*(j+1)/2+icand ; if (pair_found[k]) // If we already computed it redun = pair_info[k] ; // Don't do it again else { // First time for this pair, so compute for (i=0 ; i<ncases ; i++) // Get its cases work[i] = data[i*nvars+j] ; // Variable already in kept set if (ndiv > 0) redun = mi_parzen->mut_inf ( work ) ; else redun = mi_adapt->mut_inf ( work , 0 ) ; pair_found[k] = 1 ; // Flag that this pair has been computed pair_info[k] = redun ; // And save the MI for this pair } // Else must compute redundancy redundancy += redun ; printf ( "\n %s <-> %s redundancy = %.5lf", names[icand], names[j], redun ) ; } // For all kept variables, computing mean redundancy if (mi_parzen != NULL) { delete mi_parzen ; mi_parzen = NULL ; } if (mi_adapt != NULL) { delete mi_adapt ; mi_adapt = NULL ; } redundancy /= nkept ; // It is the mean across all kept printf ( "\nRedundancy = %.5lf", redundancy ) ; criterion = relevance - redundancy ; fprintf ( fp , "\n%31s %10.5lf %10.5lf %10.5lf", trial_name, relevance, redundancy, criterion ) ; if (criterion > bestcrit) { // Did we just set a new record? 
bestcrit = criterion ; // If so, update the record bestredun = redundancy ; // Needed for printing results later ibest = icand ; // Keep track of the winning candidate } } // For all candidates // We now have the best candidate if (bestcrit <= 0.0) break ; kept[nkept] = ibest ; crits[nkept] = bestcrit ; reduns[nkept] = bestredun ; ++nkept ; } // While adding new variables fprintf ( fp , "\n" ) ; fprintf ( fp , "\nFinal set Relevance Redundancy Criterion" ) ; for (i=0 ; i<nkept ; i++) fprintf ( fp , "\n%31s %10.5lf %10.5lf %10.5lf", names[kept[i]], crits[i] + reduns[i], reduns[i], crits[i] ) ; MEMTEXT ( "MI_CONT: Finish" ) ; fclose ( fp ) ; FREE ( work ) ; FREE ( kept ) ; FREE ( crits ) ; FREE ( reduns ) ; FREE ( sortwork ) ; FREE ( save_info ) ; FREE ( univar_info ) ; FREE ( pair_found ) ; FREE ( pair_info ) ; if (mi_parzen != NULL) delete mi_parzen ; if (mi_adapt != NULL) delete mi_adapt ; free_data ( nvars , names , data ) ; MEMCLOSE () ; printf ( "\n\nPress any key..." ) ; _getch () ; return EXIT_SUCCESS ; }