/*
   qsortdsi - Quicksort a double array ascending, carrying an int array along

   Sorts data[first..last] (inclusive bounds) into non-decreasing order.
   Every exchange made in 'data' is mirrored in 'slave', so if slave[i] was
   initialized to i, then after the sort slave[i] is the original position
   of the value now in data[i].

   first, last - Inclusive index range to sort
   data        - Keys to sort; reordered in place
   slave       - Companion array; reordered in place exactly as data is

   A range holding fewer than two elements (including the empty range
   first > last, e.g. a caller passing 0 and n-1 with n==0) is left untouched.
   The sort is not stable: the relative order of tied keys is unspecified.
*/

void qsortdsi ( int first , int last , double *data , int *slave )
{
   int lower, upper, itemp ;
   double ftemp, split ;

   // Nothing to do for zero or one element.  This also keeps an empty range
   // from touching data[upper] with upper < first.
   if (first >= last)
      return ;

   // Pivot is the middle element; the midpoint is written this way so the
   // index sum cannot overflow for very large ranges.
   split = data[first + (last - first) / 2] ;
   lower = first ;
   upper = last ;

   do {
      while (split > data[lower])   // Skip items already on the correct (low) side
         ++lower ;
      while (split < data[upper])   // Skip items already on the correct (high) side
         --upper ;

      if (lower == upper) {         // Pointers met on a pivot-valued item; step past it
         ++lower ;
         --upper ;
         }
      else if (lower < upper) {     // Out-of-place pair: exchange keys and companions
         itemp = slave[lower] ;
         slave[lower] = slave[upper] ;
         slave[upper] = itemp ;
         ftemp = data[lower] ;
         data[lower] = data[upper] ;
         data[upper] = ftemp ;
         ++lower ;
         --upper ;
         }
      } while (lower <= upper) ;

   // Recurse on whichever partitions still hold at least two elements
   if (first < upper)
      qsortdsi ( first , upper , data , slave ) ;
   if (lower < last)
      qsortdsi ( lower , last , data , slave ) ;
}
/*
   ParzDens_3 constructor

   Saves a rank-transformed copy of three paired variables and sets the
   kernel constants used with them.

   n_tset              - Number of cases in each of the three input arrays
   tset0, tset1, tset2 - The three variables; read only, not modified
   n_div               - Sets the kernel width: the standard deviation used
                         is 2.0 / n_div in every dimension

   Each variable is copied, sorted, and every case is then replaced by
   inverse_normal_cdf ( (rank + 1) / (nd + 1) ), where rank is the case's
   0-based position in sorted order.  The three transformed variables are
   stored back to back in one allocation owned through d0 (d1 and d2 point
   into it).
*/

ParzDens_3::ParzDens_3 ( int n_tset , double *tset0 , double *tset1 , double *tset2 , int n_div )
{
   int i, idim, *indices ;
   double sigma, *raw[3], *normed[3] ;

   nd = n_tset ;

   d0 = (double *) malloc ( 3 * nd * sizeof(double) ) ;
   assert ( d0 != NULL ) ;        // Same assert-after-malloc convention used elsewhere in this file
   indices = (int *) malloc ( nd * sizeof(int) ) ;
   assert ( indices != NULL ) ;

   d1 = d0 + nd ;                 // Second and third variables live in the same block
   d2 = d1 + nd ;

   raw[0] = tset0 ;   normed[0] = d0 ;
   raw[1] = tset1 ;   normed[1] = d1 ;
   raw[2] = tset2 ;   normed[2] = d2 ;

/*
   Convert the data to a normal distribution, one variable at a time.
   After the sort, indices[i] is the original position of the i'th smallest
   case, so writing through indices[] puts each normal score back in the
   case's original slot.
*/

   for (idim=0 ; idim<3 ; idim++) {
      for (i=0 ; i<nd ; i++) {
         indices[i] = i ;
         normed[idim][i] = raw[idim][i] ;
         }
      qsortdsi ( 0 , nd-1 , normed[idim] , indices ) ;
      for (i=0 ; i<nd ; i++)
         normed[idim][indices[i]] = inverse_normal_cdf ( (i + 1.0) / (nd + 1) ) ;
      }

   free ( indices ) ;

   sigma = 2.0 / n_div ;
   var0 = var1 = var2 = sigma * sigma ;

   // 1 / (nd * (2 pi)^(3/2) * sqrt(var0 var1 var2))
   factor = 1.0 / (nd * 2.0 * PI * sqrt(2.0 * PI) * sqrt(var0 * var1 * var2) ) ;
}
int rr ( int type , // Type of study (SCREEN_RR_? in CONST.H): continuous, tails, discrete) int npred , // Number of predictors int *preds , // Their indices are here int targetvar , // Index of target variable int nbins_pred , // Number of predictor bins int nbins_target , // Number of target bins, 0 for 2 sign-based bins double tail_frac , // Tail fraction int mcpt_type , // 1=complete, 2=cyclic int mcpt_reps , // Number of MCPT replications, <=1 for no MCPT int max_pred // Max number of predictors in optimal subset ) { int i, j, k, n, ret_val, ivar, irep, varnum, max_threads, bins_dim ; int *index, *stepwise_mcpt_count, *solo_mcpt_count, *stepwise_ivar, *original_stepwise_ivar ; int *pred_bin, *redun_pred_bin, *target_bin, *bin_counts ; int *work_bin, nkept, best_ivar, *which_preds, *tail_n, *target_bin_ptr ; double *casework, *sorted, *mutual, *pred_thresholds, *target_thresholds, *target, *work_target ; double *crit, *relevance, *original_relevance, *current_crits, *sorted_crits, best_crit, dtemp ; double *pred_bounds, *target_bounds, *pred_marginal, *redun_pred_marginal, *target_marginal ; double *stepwise_crit, *original_stepwise_crit ; double sum_relevance, *original_sum_relevance, *sum_redundancy ; char msg[4096], msg2[4096] ; casework = NULL ; mutual = NULL ; index = NULL ; pred_thresholds = NULL ; target_thresholds = NULL ; pred_bin = NULL ; redun_pred_bin = NULL ; redun_pred_marginal = NULL ; work_bin = NULL ; target_bin = NULL ; bin_counts = NULL ; target = NULL ; tail_n = NULL ; if (max_pred > npred) // Watch out for careless user max_pred = npred ; ret_val = 0 ; max_threads = MAX_THREADS ; /* Print header */ audit ( "" ) ; audit ( "" ) ; audit ( "******************************************************************************" ) ; audit ( "* *" ) ; audit ( "* Computing relevance minus redundancy for optimal predictor subset *" ) ; if (type == SCREEN_RR_CONTINUOUS) audit ( "* Predictors and target are continuous *" ) ; else if (type == SCREEN_RR_TAILS) 
{ sprintf_s ( msg, "* %5.3lf predictor tails used *", tail_frac ) ; audit ( msg ) ; sprintf_s ( msg, "* %2d target bins *", nbins_target ) ; audit ( msg ) ; } else if (type == SCREEN_RR_DISCRETE) { sprintf_s ( msg, "* %2d predictor bins *", nbins_pred ) ; audit ( msg ) ; sprintf_s ( msg, "* %2d target bins *", nbins_target ) ; audit ( msg ) ; } sprintf_s ( msg, "* %5d predictor candidates *", npred ) ; audit ( msg ) ; sprintf_s ( msg, "* %7d best predictors will be printed *", max_pred ) ; audit ( msg ) ; if (mcpt_reps > 1) { if (mcpt_type == 1) sprintf_s ( msg, "* %5d replications of complete Monte-Carlo Permutation Test *", mcpt_reps ) ; else if (mcpt_type == 2) sprintf_s ( msg, "* %5d replications of cyclic Monte-Carlo Permutation Test *", mcpt_reps ) ; audit ( msg ) ; } else { sprintf_s ( msg, "* No Monte-Carlo Permutation Test *" ) ; audit ( msg ) ; } audit ( "* *" ) ; audit ( "******************************************************************************" ) ; /* Allocate memory needed for all types (CONTINUOUS, TAILS, DISCRETE) */ casework = (double *) malloc ( 2 * n_cases * sizeof(double) ) ; // Pred, sorted sorted = casework + n_cases ; mutual = (double *) malloc ( 10 * npred * sizeof(double) ) ; crit = mutual + npred ; current_crits = crit + npred ; sorted_crits = current_crits + npred ; stepwise_crit = sorted_crits + npred ; original_stepwise_crit = stepwise_crit + npred ; relevance = original_stepwise_crit + npred ; original_relevance = relevance + npred ; sum_redundancy = original_relevance + npred ; original_sum_relevance = sum_redundancy + npred ; index = (int *) malloc ( 6 * npred * sizeof(int) ) ; stepwise_mcpt_count = index + npred ; solo_mcpt_count = stepwise_mcpt_count + npred ; which_preds = solo_mcpt_count + npred ; stepwise_ivar = which_preds + npred ; original_stepwise_ivar = stepwise_ivar + npred ; if (casework == NULL || mutual == NULL || index == NULL) { audit ( "ERROR: Insufficient memory for Relevance minus Redundancy" ) ; ret_val = 
ERROR_INSUFFICIENT_MEMORY ; goto FINISH ; } /* For CONTINUOUS, allocate and save target */ if (type == SCREEN_RR_CONTINUOUS) { target = (double *) malloc ( 2 * n_cases * sizeof(double) ) ; work_target = target + n_cases ; if (target == NULL) { audit ( "ERROR: Insufficient memory for Relevance minus Redundancy" ) ; ret_val = ERROR_INSUFFICIENT_MEMORY ; goto FINISH ; } for (i=0 ; i<n_cases ; i++) // Extract target from database target[i] = database[i*n_vars+targetvar] ; } /* For binning types (TAILS, DISCRETE), allocate that memory and compute all bin information */ else if (type == SCREEN_RR_TAILS || type == SCREEN_RR_DISCRETE) { pred_thresholds = (double *) malloc ( 2 * nbins_pred * npred * sizeof(double) ) ; // pred_thresholds, pred_marginal pred_marginal = pred_thresholds + npred * nbins_pred ; // Not needed for computation but nice to print for user pred_bin = (int *) malloc ( npred * n_cases * sizeof(int) ) ; work_bin = (int *) malloc ( n_cases * sizeof(int) ) ; if (type == SCREEN_RR_TAILS) { assert ( nbins_pred == 2 ) ; k = 3 ; // We go trinary for redundancy } else k = nbins_pred ; if (k >= nbins_target) bins_dim = k * k ; else bins_dim = k * nbins_target ; bin_counts = (int *) malloc ( max_threads * bins_dim * sizeof(int) ) ; tail_n = (int *) malloc ( npred * sizeof(int) ) ; // We use tail_n[0] if DISCRETE, so we need it for eitherz if (type == SCREEN_RR_TAILS) { target_thresholds = (double *) malloc ( 2 * nbins_target * npred * sizeof(double) ) ; // target_thresholds, target_marginal target_marginal = target_thresholds + nbins_target * npred ; target_bin = (int *) malloc ( npred * n_cases * sizeof(int) ) ; // Target bin separate for each predictor redun_pred_bin = (int *) malloc ( npred * n_cases * sizeof(int) ) ; // Trinary for redundancy calculation redun_pred_marginal = (double *) malloc ( 3 * npred * sizeof(double) ) ; // Trinary } else if (type == SCREEN_RR_DISCRETE) { target_thresholds = (double *) malloc ( 2 * nbins_target * sizeof(double) ) ; // 
target_thresholds, target_marginal target_marginal = target_thresholds + nbins_target ; target_bin = (int *) malloc ( n_cases * sizeof(int) ) ; // Target bin the same for all predictors } if (pred_thresholds == NULL || target_thresholds == NULL || pred_bin == NULL || work_bin == NULL || target_bin == NULL || bin_counts == NULL) { audit ( "ERROR: Insufficient memory for Relevance minus Redundancy" ) ; ret_val = ERROR_INSUFFICIENT_MEMORY ; goto FINISH ; } /* Make an initial pass through the data to find predictor thresholds and permanently save bin indices for predictors and target. If tails-only, we must save the associated target subset indices, separately for each predictor. If not tails only, do target when ivar=-1. */ for (ivar=-1 ; ivar<npred ; ivar++) { if (ivar == -1) { // If this is target pass if (type == SCREEN_RR_TAILS) // But user specified tails only continue ; // then we process the targets separately for each predictor's subset } else varnum = preds[ivar] ; if (user_pressed_escape()) { audit ( "ERROR: User pressed ESCape during RELEVANCE MINUS REDUNDANCY" ) ; ret_val = ERROR_ESCAPE ; goto FINISH ; } // At this point, one of three things holds: // Case 1: ivar=-1 (which implies not TAILS): This is the target // Case 2: ivar>=0, not TAILS: This is a predictor // Case 3: ivar>=0, TAILS: This is a predictor AND we must save the corresponding target // ------> Case 1: ivar=-1 (which implies not TAILS): This is the target if (ivar == -1) { for (i=0 ; i<n_cases ; i++) // Extract target from database casework[i] = database[i*n_vars+targetvar] ; target_bounds = target_thresholds ; k = nbins_target ; partition ( n_cases , casework , &k , target_bounds , target_bin ) ; if (k <nbins_target) { sprintf_s ( msg, "ERROR: Numerous ties reduced target bins to %d", k ) ; audit ( msg ) ; ret_val = ERROR_SYNTAX ; goto FINISH ; } assert ( k == nbins_target ) ; tail_n[0] = n_cases ; // Later code is simplified if we save this as if TAILS } // ------> Case 2: ivar>=0, not 
TAILS: This is a predictor else if (ivar >= 0 && type != SCREEN_RR_TAILS) { for (i=0 ; i<n_cases ; i++) // Extract predictor from database casework[i] = database[i*n_vars+varnum] ; pred_bounds = pred_thresholds + ivar * nbins_pred ; k = nbins_pred ; partition ( n_cases , casework , &k , pred_bounds , pred_bin+ivar*n_cases ) ; if (k <nbins_pred) { sprintf_s ( msg, "ERROR: Numerous ties reduced predictor %s bins to %d", var_names[preds[ivar]], k ) ; audit ( msg ) ; ret_val = ERROR_SYNTAX ; goto FINISH ; } assert ( k == nbins_pred ) ; } // ------> Case 3: ivar>=0, TAILS: This is a predictor AND we must save the corresponding target else if (ivar >= 0 && type == SCREEN_RR_TAILS) { // Compute predictor bounds per tail fraction for (i=0 ; i<n_cases ; i++) // Extract predictor from database casework[i] = database[i*n_vars+varnum] ; qsortd ( 0 , n_cases-1 , casework ) ; pred_bounds = pred_thresholds + ivar * nbins_pred ; k = (int) (tail_frac * (n_cases+1)) - 1 ; if (k < 0) k = 0 ; pred_bounds[0] = casework[k] ; pred_bounds[1] = casework[n_cases-1-k] ; // Compute and save predictor bin indices; Also save target for soon computing its bounds and indices n = 0 ; for (i=0 ; i<n_cases ; i++) { if (database[i*n_vars+varnum] <= pred_bounds[0]) { pred_bin[ivar*n_cases+n] = 0 ; redun_pred_bin[ivar*n_cases+i] = 0 ; // Need this for intra-predictor redundancy } else if (database[i*n_vars+varnum] >= pred_bounds[1]) { pred_bin[ivar*n_cases+n] = 1 ; redun_pred_bin[ivar*n_cases+i] = 1 ; } else { redun_pred_bin[ivar*n_cases+i] = 2 ; continue ; } casework[n] = database[i*n_vars+targetvar] ; ++n ; } tail_n[ivar] = n ; // Compute the target bounds based on this 'predictor tail' subset of the entire dataset target_bounds = target_thresholds + ivar * nbins_target ; k = nbins_target ; partition ( n , casework , &k , target_bounds , target_bin+ivar*n_cases ) ; if (k <nbins_target) { sprintf_s ( msg, "ERROR: Numerous ties reduced target bins to %d", k ) ; audit ( msg ) ; ret_val = ERROR_SYNTAX ; 
goto FINISH ; } } else assert ( 1 == 0 ) ; } // For ivar (reading each variable) /* All thresholds (predictor and target) are computed and saved. The predictor and target bin indices are also saved. If not TAILS, the saved target bin indices are based on the entire dataset, and the saved target thresholds are similarly for the entire dataset. But if TAILS, each predictor candidate will have its own target subset and thresholds corresponding to that subset. Print the thresholds for the user's edification */ audit ( "" ) ; audit ( "" ) ; audit ( "The bounds that define bins are now shown" ) ; audit ( "" ) ; if (type == SCREEN_RR_TAILS) { audit ( "Target bounds are shown (after :) separately for each predictor candidate" ) ; audit ( "" ) ; audit ( " Variable Predictor bounds... : Target bounds" ) ; audit ( "" ) ; } else { audit ( "Target bounds are based on the entire dataset..." ) ; sprintf_s ( msg , "%12.5lf", target_thresholds[0] ) ; for (i=1 ; i<nbins_target-1 ; i++) { sprintf_s ( msg2 , " %12.5lf", target_thresholds[i] ) ; strcat_s ( msg , msg2 ) ; } audit ( msg ) ; audit ( "" ) ; audit ( " Variable Bounds..." ) ; audit ( "" ) ; } for (ivar=0 ; ivar<npred ; ivar++) { pred_bounds = pred_thresholds + ivar * nbins_pred ; sprintf_s ( msg, "%15s %12.5lf", var_names[preds[ivar]], pred_bounds[0] ) ; k = (type == SCREEN_RR_TAILS) ? 
2 : nbins_pred-1 ; for (i=1 ; i<k ; i++) { sprintf_s ( msg2 , " %12.5lf", pred_bounds[i] ) ; strcat_s ( msg , msg2 ) ; } if (type == SCREEN_RR_TAILS) { target_bounds = target_thresholds + ivar * nbins_target ; sprintf_s ( msg2 , " : %12.5lf", target_bounds[0] ) ; strcat_s ( msg , msg2 ) ; for (i=1 ; i<nbins_target-1 ; i++) { sprintf_s ( msg2 , " %12.5lf", target_bounds[i] ) ; strcat_s ( msg , msg2 ) ; } } // If TAILS audit ( msg ) ; } // For all predictors /* Compute marginals */ for (ivar=0 ; ivar<npred ; ivar++) { for (i=0 ; i<nbins_pred ; i++) pred_marginal[ivar*nbins_pred+i] = 0.0 ; if (ivar==0 || type == SCREEN_RR_TAILS) { for (i=0 ; i<nbins_target ; i++) target_marginal[ivar*nbins_target+i] = 0.0 ; } for (i=0 ; i<n_cases ; i++) { ++pred_marginal[ivar*nbins_pred+pred_bin[ivar*n_cases+i]] ; if (type == SCREEN_UNIVAR_TAILS) { ++target_marginal[ivar*nbins_target+target_bin[ivar*n_cases+i]] ; if (i == tail_n[ivar]-1) break ; } else if (ivar == 0) // Do target just once ++target_marginal[target_bin[i]] ; } // For all cases if (type == SCREEN_RR_TAILS) { // Trinary for (i=0 ; i<3 ; i++) redun_pred_marginal[ivar*3+i] = 0.0 ; for (i=0 ; i<n_cases ; i++) ++redun_pred_marginal[ivar*3+redun_pred_bin[ivar*n_cases+i]] ; } } for (ivar=0 ; ivar<npred ; ivar++) { // Divide counts by number of cases to get marginal if (type == SCREEN_UNIVAR_TAILS) { assert ( nbins_pred == 2 ) ; for (i=0 ; i<nbins_pred ; i++) pred_marginal[ivar*nbins_pred+i] /= tail_n[ivar] ; for (i=0 ; i<3 ; i++) redun_pred_marginal[ivar*3+i] /= n_cases ; } else { for (i=0 ; i<nbins_pred ; i++) pred_marginal[ivar*nbins_pred+i] /= n_cases ; } if (ivar==0 || type == SCREEN_UNIVAR_TAILS) { for (i=0 ; i<nbins_target ; i++) target_marginal[ivar*nbins_target+i] /= tail_n[ivar] ; } } /* Print the marginals for the user's edification */ audit ( "" ) ; audit ( "" ) ; audit ( "The marginal distributions are now shown." ) ; audit ( "If the data is continuous, the marginals will be nearly equal." 
) ; audit ( "Widely unequal marginals indicate potentially problematic ties." ) ; audit ( "" ) ; if (type == SCREEN_UNIVAR_TAILS) { audit ( "Target marginals are shown (after :) separately for each predictor candidate" ) ; audit ( "" ) ; audit ( " Variable Predictor marginals... : Target marginals" ) ; audit ( "" ) ; } else { audit ( "Target marginals are based on the entire dataset..." ) ; sprintf_s ( msg , "%12.5lf", target_marginal[0] ) ; for (i=1 ; i<nbins_target ; i++) { sprintf_s ( msg2 , " %12.5lf", target_marginal[i] ) ; strcat_s ( msg , msg2 ) ; } audit ( msg ) ; audit ( "" ) ; audit ( " Variable Marginal..." ) ; audit ( "" ) ; } for (ivar=0 ; ivar<npred ; ivar++) { sprintf_s ( msg, "%15s %12.5lf", var_names[preds[ivar]], pred_marginal[ivar*nbins_pred+0] ) ; for (i=1 ; i<nbins_pred ; i++) { sprintf_s ( msg2 , " %12.5lf", pred_marginal[ivar*nbins_pred+i] ) ; strcat_s ( msg , msg2 ) ; } if (type == SCREEN_UNIVAR_TAILS) { sprintf_s ( msg2 , " : %12.5lf", target_marginal[ivar*nbins_target+0] ) ; strcat_s ( msg , msg2 ) ; for (i=1 ; i<nbins_target ; i++) { sprintf_s ( msg2 , " %12.5lf", target_marginal[ivar*nbins_target+i] ) ; strcat_s ( msg , msg2 ) ; } } // If TAILS audit ( msg ) ; } // For all predictors disallow_menu = 0 ; mouse_cursor_arrow () ; end_progbar () ; } // If binning type (TAILS, DISCRETE) /* -------------------------------------------------------------------------------- Outer-most loop does MCPT replications -------------------------------------------------------------------------------- */ if (mcpt_reps < 1) mcpt_reps = 1 ; for (irep=0 ; irep<mcpt_reps ; irep++) { /* Shuffle target if in permutation run (irep>0) */ if (irep) { // If doing permuted runs, shuffle if (mcpt_type == 1) { // Complete if (type == SCREEN_UNIVAR_CONTINUOUS) { i = n_cases ; // Number remaining to be shuffled while (i > 1) { // While at least 2 left to shuffle j = (int) (unifrand_fast () * i) ; if (j >= i) j = i - 1 ; dtemp = target[--i] ; target[i] = target[j] ; 
target[j] = dtemp ; } } // If not using bins else if (type == SCREEN_UNIVAR_TAILS) { // Each predictor has its own target subset for (ivar=0 ; ivar<npred ; ivar++) { target_bin_ptr = target_bin + ivar * n_cases ; i = tail_n[ivar] ; // Number remaining to be shuffled while (i > 1) { // While at least 2 left to shuffle j = (int) (unifrand_fast () * i) ; if (j >= i) j = i - 1 ; k = target_bin_ptr[--i] ; target_bin_ptr[i] = target_bin_ptr[j] ; target_bin_ptr[j] = k ; } } } // Else if TAILS else { i = n_cases ; // Number remaining to be shuffled while (i > 1) { // While at least 2 left to shuffle j = (int) (unifrand_fast () * i) ; if (j >= i) j = i - 1 ; k = target_bin[--i] ; target_bin[i] = target_bin[j] ; target_bin[j] = k ; } } // Else discrete using entire dataset } // Type 1, Complete else if (mcpt_type == 2) { // Cyclic if (type == SCREEN_UNIVAR_CONTINUOUS) { j = (int) (unifrand_fast () * n_cases) ; if (j >= n_cases) j = n_cases - 1 ; for (i=0 ; i<n_cases ; i++) casework[i] = target[(i+j)%n_cases] ; for (i=0 ; i<n_cases ; i++) target[i] = casework[i] ; } // If continuous else if (type == SCREEN_UNIVAR_TAILS) { // Each predictor has its own target subset for (ivar=0 ; ivar<npred ; ivar++) { target_bin_ptr = target_bin + ivar * n_cases ; k = tail_n[ivar] ; j = (int) (unifrand_fast () * k) ; if (j >= k) j = k - 1 ; for (i=0 ; i<k ; i++) work_bin[i] = target_bin_ptr[(i+j)%k] ; for (i=0 ; i<k ; i++) target_bin_ptr[i] = work_bin[i] ; } } // Else if TAILS else { j = (int) (unifrand_fast () * n_cases) ; if (j >= n_cases) j = n_cases - 1 ; for (i=0 ; i<n_cases ; i++) work_bin[i] = target_bin[(i+j)%n_cases] ; for (i=0 ; i<n_cases ; i++) target_bin[i] = work_bin[i] ; } // Else discrete using entire dataset } // Type 2, Cyclic } // If in permutation run (irep > 0) /* ----------------------------------------------------------------------------------- First step: Compute and save criterion for all individual candidates 
----------------------------------------------------------------------------------- */ for (i=0 ; i<npred ; i++) // We'll test all candidates which_preds[i] = i ; if (type == SCREEN_RR_TAILS) ret_val = rr_threaded ( type , database , n_vars , preds , NULL , mcpt_reps , max_threads , n_cases , tail_n , npred , which_preds , nbins_pred , pred_bin , pred_marginal , nbins_target , target_bin , target_marginal , crit , bins_dim , bin_counts ) ; else ret_val = rr_threaded ( type , database , n_vars , preds , target , mcpt_reps , max_threads , n_cases , NULL , npred , which_preds , nbins_pred , pred_bin , pred_marginal , nbins_target , target_bin , target_marginal , crit , bins_dim , bin_counts ) ; if (user_pressed_escape() && ret_val == 0) ret_val = ERROR_ESCAPE ; if (ret_val) { audit ( "ERROR: User pressed ESCape during RELEVANCE MINUS REDUNDANCY" ) ; goto FINISH ; } /* The individual mutual information for each predictor has been computed and saved in crit. Update 'best' information for this replication. Print a sorted table if this is the first replication. Else update MCPT count. */ for (ivar=0 ; ivar<npred ; ivar++) { relevance[ivar] = crit[ivar] ; // Will need this for Step 2, addition of more predictors if (ivar == 0 || crit[ivar] > best_crit) { best_crit = crit[ivar] ; best_ivar = ivar ; } } stepwise_crit[0] = best_crit ; // Criterion for first var is largest MI stepwise_ivar[0] = best_ivar ; // It's this candidate sum_relevance = best_crit ; if (irep == 0) { // Original, unpermuted data original_stepwise_crit[0] = best_crit ; // Criterion for first var is largest MI original_stepwise_ivar[0] = best_ivar ; // It's this candidate original_sum_relevance[0] = sum_relevance ; stepwise_mcpt_count[0] = 1 ; // Initialize cumulative MCPT // We need original_relevance for printing final table. Other crits are just for this table. 
for (ivar=0 ; ivar<npred ; ivar++) { index[ivar] = ivar ; original_relevance[ivar] = sorted_crits[ivar] = current_crits[ivar] = crit[ivar] ; solo_mcpt_count[ivar] = 1 ; // Initialize solo MCPT } qsortdsi ( 0 , npred-1 , sorted_crits , index ) ; audit ( "" ) ; audit ( "" ) ; sprintf_s ( msg, "Initial candidates, in order of decreasing mutual information with %s", var_names[targetvar] ) ; audit ( msg ) ; audit ( "" ) ; audit ( " Variable MI" ) ; audit ( "" ) ; for (i=npred-1 ; i>=0 ; i--) { k = index[i] ; sprintf_s ( msg, "%15s %12.4lf", var_names[preds[k]], current_crits[k] ) ; audit ( msg ) ; } } // If irep=0 (original, unpermuted run) else { // Count for MCPT if (sum_relevance >= original_sum_relevance[0]) ++stepwise_mcpt_count[0] ; for (ivar=0 ; ivar<npred ; ivar++) { if (relevance[ivar] >= original_relevance[ivar]) ++solo_mcpt_count[ivar] ; } } // Permuted /* ----------------------------------------------------------------------------------- Second step: Iterate to add more candidates Note that redundancy of a candidate can change as predictors are added. This is because the kept set is increasing, so sum_redundancy changes. 
----------------------------------------------------------------------------------- */ for (i=0 ; i<npred ; i++) sum_redundancy[i] = 0.0 ; // sum_redundancy[i] is the total redundancy of candidate i with kept set for (nkept=1 ; nkept<max_pred ; nkept++) { // Main outermost loop /* Print candidates kept so far (if in unpermuted rep) */ if (irep == 0) { // Original, unpermuted audit ( "" ) ; audit ( "" ) ; audit ( "Predictors so far Relevance Redundancy Criterion" ) ; audit ( "" ) ; for (i=0 ; i<nkept ; i++) { k = stepwise_ivar[i] ; // Cannot print sum_redundancy/nkept here because sum froze but nkept keeps increasing sprintf_s ( msg, "%15s %12.4lf %12.4lf %12.4lf", var_names[preds[k]], relevance[k], relevance[k] - stepwise_crit[i], stepwise_crit[i] ) ; audit ( msg ) ; } } /* Build in which_preds the candidates not yet selected */ k = 0 ; // Candidate vector is all except those already kept for (i=0 ; i<npred ; i++) { for (j=0 ; j<nkept ; j++) { if (stepwise_ivar[j] == i) break ; } if (j == nkept) which_preds[k++] = i ; } assert ( k == npred - nkept ) ; /* Compute the MI of the most recently added predictor with each remaining candidate */ if (user_pressed_escape()) { ret_val = ERROR_ESCAPE ; audit ( "ERROR: User pressed ESCape or other serious error during RELEVANCE MINUS REDUNDANCY" ) ; goto FINISH ; } k = stepwise_ivar[nkept-1] ; // Index in preds of most recently added candidate if (type == SCREEN_RR_TAILS) // redun_pred_? 
is trinary ret_val = rr_threaded ( type , database , n_vars , preds , NULL , mcpt_reps , max_threads , n_cases , NULL , npred-nkept , which_preds , 3 , redun_pred_bin , redun_pred_marginal , 3 , redun_pred_bin+k*n_cases , redun_pred_marginal+k*3 , crit , bins_dim , bin_counts ) ; else { if (type == SCREEN_RR_CONTINUOUS) { for (i=0 ; i<n_cases ; i++) casework[i] = database[i*n_vars+preds[k]] ; } ret_val = rr_threaded ( type , database , n_vars , preds , casework , mcpt_reps , max_threads , n_cases , NULL , npred-nkept , which_preds , nbins_pred , pred_bin , pred_marginal , nbins_pred , pred_bin+k*n_cases , pred_marginal+k*nbins_pred , crit , bins_dim , bin_counts ) ; } if (user_pressed_escape() && ret_val == 0) ret_val = ERROR_ESCAPE ; if (ret_val) { audit ( "ERROR: User pressed ESCape or other serious error during RELEVANCE MINUS REDUNDANCY" ) ; goto FINISH ; } /* The redundancy of each remaining candidate with the most recently added predictor is now in crit. Cumulate the sum of redundancy. Then compute the criteria, sorting and printing if this is the unpermuted replication. 
*/ for (i=0 ; i<npred-nkept ; i++) { k = which_preds[i] ; // Index in preds of this candidate sum_redundancy[k] += crit[i] ; index[i] = k ; sorted_crits[i] = current_crits[i] = relevance[k] - sum_redundancy[k] / nkept ; if (i == 0 || current_crits[i] > best_crit) { best_crit = current_crits[i] ; best_ivar = k ; } } stepwise_crit[nkept] = best_crit ; stepwise_ivar[nkept] = best_ivar ; sum_relevance += relevance[best_ivar] ; if (irep == 0) { // Original, unpermuted original_stepwise_crit[nkept] = best_crit ; original_stepwise_ivar[nkept] = best_ivar ; original_sum_relevance[nkept] = sum_relevance ; stepwise_mcpt_count[nkept] = 1 ; qsortdsi ( 0 , npred-nkept-1 , sorted_crits , index ) ; audit ( "" ) ; audit ( "" ) ; audit ( "Additional candidates, in order of decreasing relevance minus redundancy" ) ; audit ( "" ) ; audit ( " Variable Relevance Redundancy Criterion" ) ; audit ( "" ) ; for (i=npred-nkept-1 ; i>=0 ; i--) { k = index[i] ; sprintf_s ( msg, "%15s %12.4lf %12.4lf %12.4lf", var_names[preds[k]], relevance[k], sum_redundancy[k] / nkept, relevance[k] - sum_redundancy[k] / nkept ) ; audit ( msg ) ; } } // If irep=0 (original, unpermuted run) else { // Count for MCPT if (sum_relevance >= original_sum_relevance[nkept]) ++stepwise_mcpt_count[nkept] ; } // Permuted } // Second step (for nkept): Iterate to add predictors to kept set } // For all MCPT replications /* -------------------------------------------------------------------------------- All computation is finished. Print. 
-------------------------------------------------------------------------------- */ audit ( "" ) ; audit ( "" ) ; /* Print final list of candidates and p-values */ audit ( "" ) ; audit ( "" ) ; sprintf_s ( msg, "----------> Final results predicting %s <----------", var_names[targetvar] ) ; audit ( msg ) ; audit ( "" ) ; if (mcpt_reps > 1) audit ( "Final predictors Relevance Redundancy Criterion Solo pval Group pval" ) ; else audit ( "Final predictors Relevance Redundancy Criterion" ) ; audit ( "" ) ; for (i=0 ; i<nkept ; i++) { // Cannot print sum_redundancy/nkept here because sum froze but nkept keeps increasing k = original_stepwise_ivar[i] ; if (mcpt_reps > 1) sprintf_s ( msg, "%15s %12.4lf %12.4lf %12.4lf %8.3lf %8.3lf", var_names[preds[k]], original_relevance[k], original_relevance[k] - original_stepwise_crit[i], original_stepwise_crit[i], (double) solo_mcpt_count[k] / (double) mcpt_reps, (double) stepwise_mcpt_count[i] / (double) mcpt_reps ) ; else sprintf_s ( msg, "%15s %12.4lf %12.4lf %12.4lf", var_names[preds[k]], original_relevance[k], original_relevance[k] - original_stepwise_crit[i], original_stepwise_crit[i] ) ; audit ( msg ) ; } /* Finished. Clean up and exit. */ FINISH: if (casework != NULL) free ( casework ) ; if (mutual != NULL) free ( mutual ) ; if (index != NULL) free ( index ) ; if (pred_thresholds != NULL) free ( pred_thresholds ) ; if (target_thresholds != NULL) free ( target_thresholds ) ; if (pred_bin != NULL) free ( pred_bin ) ; if (redun_pred_bin != NULL) free ( redun_pred_bin ) ; if (redun_pred_marginal != NULL) free ( redun_pred_marginal ) ; if (work_bin != NULL) free ( work_bin ) ; if (target_bin != NULL) free ( target_bin ) ; if (bin_counts != NULL) free ( bin_counts ) ; if (target != NULL) free ( target ) ; if (tail_n != NULL) free ( tail_n ) ; return ret_val ; }
/*
   ParzDens_2 constructor

   Saves a rank-transformed copy of two paired variables, sets the kernel
   constants, and (for more than 100 cases) tabulates the density on a grid
   for later bilinear interpolation.

   n_tset       - Number of cases in each input array
   tset0, tset1 - The two variables; read only, not modified
   n_div        - Sets the kernel width: the standard deviation used is
                  2.0 / n_div in both dimensions

   Each variable is copied, sorted, and every case is then replaced by
   inverse_normal_cdf ( (rank + 1) / (nd + 1) ), where rank is the case's
   0-based position in sorted order.  Both transformed variables share one
   allocation owned through d0 (d1 points into it).

   bilin is left NULL when nd <= 100 or when the grid work areas cannot be
   allocated; otherwise it is a new Bilinear built from the tabulated grid.
*/

ParzDens_2::ParzDens_2 ( int n_tset , double *tset0 , double *tset1 , int n_div )
{
   int i, j, k, idim, n_left, n_mid, n_right, *indices ;
   double *x, *y, *z, *raw[2], *normed[2] ;
   double xbot, xinc, ybot, yinc, xlow, xhigh, ylow, yhigh, sigma ;
   double diff0, diff1, sum ;

   nd = n_tset ;
   bilin = NULL ;

   d0 = (double *) malloc ( 2 * nd * sizeof(double) ) ;
   assert ( d0 != NULL ) ;        // Same assert-after-malloc convention used elsewhere in this file
   indices = (int *) malloc ( nd * sizeof(int) ) ;
   assert ( indices != NULL ) ;

   d1 = d0 + nd ;                 // Second variable lives in the same block

   raw[0] = tset0 ;   normed[0] = d0 ;
   raw[1] = tset1 ;   normed[1] = d1 ;

/*
   Convert the data to a normal distribution, one variable at a time.
   After the sort, indices[i] is the original position of the i'th smallest
   case, so writing through indices[] puts each normal score back in the
   case's original slot.
*/

   for (idim=0 ; idim<2 ; idim++) {
      for (i=0 ; i<nd ; i++) {
         indices[i] = i ;
         normed[idim][i] = raw[idim][i] ;
         }
      qsortdsi ( 0 , nd-1 , normed[idim] , indices ) ;
      for (i=0 ; i<nd ; i++)
         normed[idim][indices[i]] = inverse_normal_cdf ( (i + 1.0) / (nd + 1) ) ;
      }

   free ( indices ) ;

   sigma = 2.0 / n_div ;
   var0 = var1 = sigma * sigma ;
   xhigh = yhigh = 3.0 + 2.0 * sigma ;
   xlow = ylow = -xhigh ;

   // 1 / (nd * 2 pi * sqrt(var0 var1))
   factor = 1.0 / (nd * 2.0 * PI * sqrt ( var0 * var1 ) ) ;

   if (nd <= 100)                 // Few cases: no interpolation table
      return ;

/*
   We have a lot of cases, so prepare for bilinear interpolation
*/

   x = (double *) malloc ( P2RES * sizeof(double) ) ;
   y = (double *) malloc ( P2RES * sizeof(double) ) ;
   z = (double *) malloc ( P2RES * P2RES * sizeof(double) ) ;

   if (x == NULL  ||  y == NULL  ||  z == NULL) {
      if (x != NULL)
         free ( x ) ;
      if (y != NULL)
         free ( y ) ;
      if (z != NULL)
         free ( z ) ;
      return ;                    // If insufficient memory, do not interpolate
      }

/*
   Each axis gets P2RES grid points in three segments:
   the first 10 percent run from the low limit up toward -1.5,
   the next 80 percent continue on toward 1.5,
   and the remainder finish exactly at the high limit.
   The segment sizes are the same for both axes.
*/

   n_left = (int) (0.1 * P2RES) ;
   n_mid = (int) (0.8 * P2RES) ;
   n_right = P2RES - n_left - n_mid ;

   xinc = (-1.5 - xlow) / n_left ;
   for (i=0 ; i<n_left ; i++)
      x[i] = xlow + i * xinc ;

   xbot = x[n_left-1] ;
   xinc = (1.5 - xbot) / (n_mid + 1) ;
   for (i=0 ; i<n_mid ; i++)
      x[i+n_left] = xbot + (i+1) * xinc ;

   xbot = x[n_left+n_mid-1] ;
   xinc = (xhigh - xbot) / n_right ;
   for (i=0 ; i<n_right ; i++)
      x[i+n_left+n_mid] = xbot + (i+1) * xinc ;

   yinc = (-1.5 - ylow) / n_left ;
   for (i=0 ; i<n_left ; i++)
      y[i] = ylow + i * yinc ;

   ybot = y[n_left-1] ;
   yinc = (1.5 - ybot) / (n_mid + 1) ;
   for (i=0 ; i<n_mid ; i++)
      y[i+n_left] = ybot + (i+1) * yinc ;

   ybot = y[n_left+n_mid-1] ;
   yinc = (yhigh - ybot) / n_right ;
   for (i=0 ; i<n_right ; i++)
      y[i+n_left+n_mid] = ybot + (i+1) * yinc ;

/*
   Tabulate the kernel density at every grid point
*/

   for (i=0 ; i<P2RES ; i++) {
      for (j=0 ; j<P2RES ; j++) {
         sum = 0.0 ;
         for (k=0 ; k<nd ; k++) {
            diff0 = x[i] - d0[k] ;
            diff1 = y[j] - d1[k] ;
            sum += exp ( -0.5 * (diff0 * diff0 / var0 + diff1 * diff1 / var1 )) ;
            }
         z[i*P2RES+j] = factor * sum ;
         }
      }

   bilin = new Bilinear ( P2RES , x , P2RES , y , z , 1 ) ;

   // The work areas are released right after construction, as in the original
   free ( x ) ;
   free ( y ) ;
   free ( z ) ;
}
/*
   ParzDens_1 constructor

   Saves a rank-transformed copy of one variable, sets the kernel constants,
   and (for more than 100 cases) tabulates the density at 1001 points for
   later cubic-spline interpolation.

   n_tset - Number of cases in tset
   tset   - The variable; read only, not modified
   n_div  - Sets the kernel width: the standard deviation used is 2.0 / n_div

   The variable is copied, sorted, and every case is then replaced by
   inverse_normal_cdf ( (rank + 1) / (nd + 1) ), where rank is the case's
   0-based position in sorted order.

   spline is left NULL when nd <= 100 or when the tabulation work areas
   cannot be allocated; otherwise it is a new CubicSpline built from the
   tabulated points.
*/

ParzDens_1::ParzDens_1 ( int n_tset , double *tset , int n_div )
{
   int i, j, *indices ;
   double sigma, *x, *y, xbot, xinc, diff, sum ;

   nd = n_tset ;
   spline = NULL ;

   d = (double *) malloc ( nd * sizeof(double) ) ;
   assert ( d != NULL ) ;         // Same assert-after-malloc convention used elsewhere in this file
   indices = (int *) malloc ( nd * sizeof(int) ) ;
   assert ( indices != NULL ) ;

/*
   Convert the data to a normal distribution.
   After the sort, indices[i] is the original position of the i'th smallest
   case, so writing through indices[] puts each normal score back in the
   case's original slot.
*/

   for (i=0 ; i<nd ; i++) {
      indices[i] = i ;
      d[i] = tset[i] ;
      }

   qsortdsi ( 0 , nd-1 , d , indices ) ;

   for (i=0 ; i<nd ; i++)
      d[indices[i]] = inverse_normal_cdf ( (i + 1.0) / (nd + 1) ) ;

   free ( indices ) ;

   sigma = 2.0 / n_div ;
   var = sigma * sigma ;
   high = 3.0 + 3.0 * sigma ;
   low = -high ;

   // 1 / (nd * sqrt(2 pi var))
   factor = 1.0 / (nd * sqrt (2.0 * PI * var) ) ;

   if (nd <= 100)                 // Few cases: no interpolation table
      return ;

/*
   We have a lot of cases, so prepare for cubic spline interpolation
*/

   x = (double *) malloc ( 1001 * sizeof(double) ) ;
   y = (double *) malloc ( 1001 * sizeof(double) ) ;

   if (x == NULL  ||  y == NULL) {
      if (x != NULL)
         free ( x ) ;
      if (y != NULL)
         free ( y ) ;
      return ;                    // If insufficient memory, do not interpolate (spline stays NULL)
      }

/*
   The 1001 abscissas come in three segments:
   100 points from 'low' up toward -1.5,
   800 points continuing on toward 1.5,
   and 101 points finishing exactly at 'high'.
*/

   xinc = (-1.5 - low) / 100.0 ;
   for (i=0 ; i<100 ; i++)
      x[i] = low + i * xinc ;

   xbot = x[99] ;
   xinc = (1.5 - xbot) / 801.0 ;
   for (i=0 ; i<800 ; i++)
      x[i+100] = xbot + (i+1) * xinc ;

   xbot = x[899] ;
   xinc = (high - xbot) / 101.0 ;
   for (i=0 ; i<101 ; i++)
      x[i+900] = xbot + (i+1) * xinc ;

/*
   Tabulate the kernel density at every abscissa
*/

   for (i=0 ; i<1001 ; i++) {
      sum = 0.0 ;
      for (j=0 ; j<nd ; j++) {
         diff = x[i] - d[j] ;
         sum += exp ( -0.5 * diff * diff / var ) ;
         }
      y[i] = factor * sum ;
      }

   spline = new CubicSpline ( 1001 , x , y ) ;

   // The work areas are released right after construction, as in the original
   free ( x ) ;
   free ( y ) ;
}
int main ( int argc , // Number of command line arguments (includes prog name) char *argv[] // Arguments (prog name is argv[0]) ) { int i, j, k, depzero, indepzero, nvars, ncases, maxkept, ivar, *kept ; int n_indep_vars, idep, icand, iz, ibest, *sortwork, nkept, *last_indices ; double *data, *work, temp, p, error_entropy ; double *save_info, bestcrit ; double criterion, entropy, bound, *crits, *scores ; short int *bins_dep, *bins_indep, *xbins ; char filename[256], **names, depname[256] ; char trial_name[256] ; FILE *fp ; /* Process command line parameters */ #if 1 if (argc != 7) { printf ( "\nUsage: MI_BIN datafile n_indep depname depzero indepzero maxkept" ) ; printf ( "\n datafile - name of the text file containing the data" ) ; printf ( "\n The first line is variable names" ) ; printf ( "\n Subsequent lines are the data." ) ; printf ( "\n Delimiters can be space, comma, or tab" ) ; printf ( "\n n_indep - Number of independent vars, starting with the first" ) ; printf ( "\n depname - Name of the 'dependent' variable" ) ; printf ( "\n It must be AFTER the first n_indep variables" ) ; printf ( "\n depzero - If nonzero, dependent variable is split >0 vs <=0" ) ; printf ( "\n Else split is by optimal partition" ) ; printf ( "\n indepzero - Ditto, for independent variables" ) ; printf ( "\n maxkept - Stepwise will allow at most this many predictors" ) ; return EXIT_FAILURE ; } strcpy ( filename , argv[1] ) ; n_indep_vars = atoi ( argv[2] ) ; strcpy ( depname , argv[3] ) ; depzero = atoi ( argv[4] ) ; indepzero = atoi ( argv[5] ) ; maxkept = atoi ( argv[6] ) ; #else strcpy ( filename , "..\\VARS.TXT" ) ; strcpy ( depname , "DAY_RETURN" ) ; n_indep_vars = 8 ; depzero = 1 ; indepzero = 1 ; maxkept = 99 ; #endif _strupr ( depname ) ; /* Open the text file to which results will be written */ fp = fopen ( "MI_BIN.LOG" , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MI_BIN.LOG file for writing!" 
) ; return EXIT_FAILURE ; } /* Read the file and locate the index of the 'dependent' variable */ if (readfile ( filename , &nvars , &names , &ncases , &data )) return EXIT_FAILURE ; for (idep=0 ; idep<nvars ; idep++) { if (! strcmp ( depname , names[idep] )) break ; } if (idep == nvars) { printf ( "\nERROR... Dependent variable %s is not in file", depname ) ; return EXIT_FAILURE ; } if (idep < n_indep_vars) { printf ( "\nERROR... Dependent variable %s must be beyond independent vars", depname ) ; return EXIT_FAILURE ; } /* Allocate scratch memory bins_dep - Bin ids for the 'dependent' variable bins_indep - Bin ids for the 'independent' variables kept - Array of indices of variables kept so far crits - Ditto, criterion scores - Current (regularly updated) min I(Y;X|Z) for choosing best candidate last_indices - For each candidate, last index among Zs used to compute scores sortwork - Temporary use for printing variable's information sorted save_info - Ditto, this is univariate information, to be sorted */ work = (double *) malloc ( ncases * sizeof(double) ) ; assert ( work != NULL ) ; bins_dep = (short int *) malloc ( ncases * sizeof(short int) ) ; assert ( bins_dep != NULL ) ; bins_indep = (short int *) malloc ( ncases * n_indep_vars * sizeof(short int) ) ; assert ( bins_indep != NULL ) ; kept = (int *) malloc ( n_indep_vars * sizeof(int) ) ; assert ( kept != NULL ) ; crits = (double *) malloc ( n_indep_vars * sizeof(double) ) ; assert ( crits != NULL ) ; scores = (double *) malloc ( n_indep_vars * sizeof(double) ) ; assert ( scores != NULL ) ; last_indices = (int *) malloc ( n_indep_vars * sizeof(int) ) ; assert ( last_indices != NULL ) ; sortwork = (int *) malloc ( n_indep_vars * sizeof(int) ) ; assert ( sortwork != NULL ) ; save_info = (double *) malloc ( n_indep_vars * sizeof(double) ) ; assert ( save_info != NULL ) ; /* Compute the bin membership of all variables. 
If the user requested, we treat the variable as binary (two bins) using <=0 and >0 as the definition of bin membership. Otherwise we use partition() to do the split. */ if (depzero) { // The dependent variable is split at zero for (i=0 ; i<ncases ; i++) { if (data[i*nvars+idep] > 0.0) bins_dep[i] = (short int) 1 ; else bins_dep[i] = (short int) 0 ; } fprintf ( fp , "\n%s has been split at zero", names[idep] ) ; } else { // The dependent variable is to be partitioned for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+idep] ; k = 2 ; partition ( ncases , work , &k , NULL , bins_dep ) ; fprintf ( fp , "\n%s has been optimally partitioned", names[idep] ) ; } if (indepzero) { // The independent variable is split at zero fprintf ( fp , "\nIndependent variables have been split at zero"); for (ivar=0 ; ivar<n_indep_vars ; ivar++) { for (i=0 ; i<ncases ; i++) { if (data[i*nvars+ivar] > 0.0) bins_indep[ivar*ncases+i] = (short int) 1 ; else bins_indep[ivar*ncases+i] = (short int) 0 ; } } } else { fprintf ( fp , "\nIndependent variables have been given an optimal split"); for (ivar=0 ; ivar<n_indep_vars ; ivar++) { for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+ivar] ; k = 2 ; partition ( ncases , work , &k , NULL , bins_indep+ivar*ncases ) ; } } /* Compute and save the mutual information for the dependent variable with each individual independent variable candidate. Print the results, sort them, and print them again, this time sorted. Also compute the error entropy so we can use it for the Fano bound. We need to save the criterion of each in save_info because this is the array that will be sorted, and we also save it in scores because this is the array that will be used for future 'best variable' selection. While we're at it, initialize last_indices to -1 for each variable. This is explained in the big comment block later. 
*/ entropy = mutinf_b ( ncases , bins_dep , NULL , NULL ) ; fprintf ( fp , "\n\n\nMutual information of %s (Entropy = %.4lf)", depname, entropy ) ; fprintf ( fp , "\n\nInitial candidates, in order of appearance in data file" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Information Fano's bound" ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Try all candidates xbins = bins_indep + icand * ncases ; // This X candidate is here // Compute the error entropy k = 0 ; for (i=0 ; i<ncases ; i++) { if (bins_dep[i] == xbins[i]) ++k ; } if (k > 0 && k < ncases) { p = (double) k / (double) ncases ; error_entropy = -p * log(p) - (1.0 - p) * log(1.0-p) ; } else error_entropy = 0.0 ; criterion = mutinf_b ( ncases , bins_dep , xbins , NULL ) ; bound = (entropy - criterion - error_entropy) / log ( 2.0 ) ; if (bound < 0.0) bound = 0.0 ; printf ( "\n%s = %.5lf (%.5lf)", names[icand], criterion, bound ) ; fprintf ( fp , "\n%31s %11.5lf %13.5lf", names[icand], criterion, bound ) ; sortwork[icand] = icand ; scores[icand] = save_info[icand] = criterion ; last_indices[icand] = -1 ; } // Initial list of all candidates fprintf ( fp , "\n" ) ; fprintf ( fp , "\nInitial candidates, in order of decreasing mutual information" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Information" ) ; qsortdsi ( 0 , n_indep_vars-1 , save_info , sortwork ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Do all candidates k = sortwork[n_indep_vars-1-icand] ; // Index of sorted candidate fprintf ( fp , "\n%31s %.5lf", names[k], save_info[n_indep_vars-1-icand] ) ; } /* Initialize the 'kept' set to be the best variable, and then begin the main outer loop that adds variables one at a time. The criterion for picking the best next candidate (we want the max criterion) is the minimum value of I(Y;X|Z) across the set of variables kept so far. In this expression, Y is the dependent variable, X is the candidate, and Z is a member of the kept set. 
I(Y;X|Z) is large when X adds information about Y above and beyond what Z already adds. It is small if X adds nothing useful. So by letting Z be each member of the kept set, one at a time, and using the minimum I(Y;X|Z) found, we avoid adding a new variable whose information is already supplied. There is a cute trick for avoiding having to check every candidate against every Z. When a new Z is tested in computing the minimum across all Z, the minimum obviously cannot increase. So if the minimum across Z so far is already worse than the best candidate criterion so far, there is no point in continuing to test more Zs for a candidate. This candidate has already lost the competition for this round. Of course, we need to keep track of, for each candidate, the place where we have stopped testing it against Zs. This is because on a later round of adding a variable, the best so far may be small, and a candidate whose testing was stopped early on a prior round may need to be tested against more Zs to see if it might be the best now. */ kept[0] = sortwork[n_indep_vars-1] ; // Index of best single candidate crits[0] = save_info[n_indep_vars-1] ; // Its criterion value nkept = 1 ; if (maxkept > n_indep_vars) // Guard against silly user maxkept = n_indep_vars ; while (nkept < maxkept) { printf ( "\n\nLatest candidate: %s", names[kept[nkept-1]] ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\nVariables so far Criterion" ) ; for (i=0 ; i<nkept ; i++) fprintf ( fp , "\n%31s %10.5lf", names[kept[i]], crits[i] ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\nSearching for an additional candidate..." ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Criterion" ) ; bestcrit = -1.e60 ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Try all candidates for (i=0 ; i<nkept ; i++) { // Is this candidate already kept? 
if (kept[i] == icand) break ; } if (i < nkept) // If this candidate 'icand' is already kept continue ; // Skip it strcpy ( trial_name , names[icand] ) ; // Its name for printing printf ( "\n Testing candidate %s Starting score=%.5lf Tested thru %d", trial_name, scores[icand], last_indices[icand] ) ; // Compute I(Y;X|Z) for each Z in the kept set, and keep track of min // We've already done them through last_indices[icand], so start // with the next one up. Allow for early exit if icand already loses. for (iz=last_indices[icand]+1 ; iz<nkept ; iz++) { if (scores[icand] <= bestcrit) // Has this candidate already lost? break ; // If so, no need to keep doing Zs j = kept[iz] ; // Index of variable in the kept set temp = mutinf_b ( ncases , bins_dep , bins_indep + icand * ncases , bins_indep + j * ncases ) ; // I(Y;X|Z) if (temp < scores[icand]) scores[icand] = temp ; last_indices[icand] = iz ; printf ( "\n With kept %s I(Y;X|Z)=%.5lf score=%.5lf", names[j], temp, scores[icand] ) ; } // For all kept variables, computing min conditional mutual information criterion = scores[icand] ; printf ( "\n %s = %.5lf", trial_name, criterion ) ; fprintf ( fp , "\n%31s %10.5lf", trial_name, criterion ) ; if (criterion > bestcrit) { // Did we just set a new record? 
bestcrit = criterion ; // If so, update the record ibest = icand ; // Keep track of the winning candidate } } // For all candidates // We now have the best candidate if (bestcrit <= 0.0) break ; kept[nkept] = ibest ; crits[nkept] = bestcrit ; printf ( "\nAdded %s = %.5lf", names[ibest], bestcrit ) ; ++nkept ; } // While adding new variables fprintf ( fp , "\n" ) ; fprintf ( fp , "\nFinal set Criterion" ) ; for (i=0 ; i<nkept ; i++) fprintf ( fp , "\n%31s %10.5lf", names[kept[i]], crits[i] ) ; fclose ( fp ) ; free ( work ) ; free ( bins_dep ) ; free ( bins_indep ) ; free ( kept ) ; free ( crits ) ; free ( scores ) ; free ( last_indices ) ; free ( sortwork ) ; free ( save_info ) ; free_data ( nvars , names , data ) ; printf ( "\n\nPress any key..." ) ; _getch () ; return EXIT_SUCCESS ; }
int main ( int argc , // Number of command line arguments (includes prog name) char *argv[] // Arguments (prog name is argv[0]) ) { int i, j, k, nvars, ncases, irep, nreps, ivar, nties, ties ; int n_indep_vars, idep, icand, *index, *mcpt_max_counts, *mcpt_same_counts, *mcpt_solo_counts ; double *data, *work, dtemp, *save_info, criterion, *crits ; char filename[256], **names, depname[256] ; FILE *fp ; MutualInformationAdaptive *mi_adapt ; /* Process command line parameters */ #if 1 if (argc != 5) { printf ( "\nUsage: MI_ONLY datafile n_indep depname nreps" ) ; printf ( "\n datafile - name of the text file containing the data" ) ; printf ( "\n The first line is variable names" ) ; printf ( "\n Subsequent lines are the data." ) ; printf ( "\n Delimiters can be space, comma, or tab" ) ; printf ( "\n n_indep - Number of independent vars, starting with the first" ) ; printf ( "\n depname - Name of the 'dependent' variable" ) ; printf ( "\n It must be AFTER the first n_indep variables" ) ; printf ( "\n nreps - Number of Monte-Carlo permutations, including unpermuted" ) ; exit ( 1 ) ; } strcpy ( filename , argv[1] ) ; n_indep_vars = atoi ( argv[2] ) ; strcpy ( depname , argv[3] ) ; nreps = atoi ( argv[4] ) ; #else strcpy ( filename , "..\\SYNTH.TXT" ) ; n_indep_vars = 7 ; strcpy ( depname , "SUM1234" ) ; nreps = 100 ; #endif _strupr ( depname ) ; /* These are used by MEM.CPP for runtime memory validation */ _fullpath ( mem_file_name , "MEM.LOG" , 256 ) ; fp = fopen ( mem_file_name , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MEM.LOG file for writing!" ) ; return EXIT_FAILURE ; } fclose ( fp ) ; mem_keep_log = 0 ; // Change this to 1 to keep a memory use log (slows execution!) mem_max_used = 0 ; /* Open the text file to which results will be written */ fp = fopen ( "MI_ONLY.LOG" , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MI_ONLY.LOG file for writing!" 
) ; return EXIT_FAILURE ; } /* Read the file and locate the index of the dependent variable */ if (readfile ( filename , &nvars , &names , &ncases , &data )) return EXIT_FAILURE ; for (idep=0 ; idep<nvars ; idep++) { if (! strcmp ( depname , names[idep] )) break ; } if (idep == nvars) { printf ( "\nERROR... Dependent variable %s is not in file", depname ) ; return EXIT_FAILURE ; } if (idep < n_indep_vars) { printf ( "\nERROR... Dependent variable %s must be beyond independent vars", depname ) ; return EXIT_FAILURE ; } /* Check each variable for ties. This is not needed for the algorithm, but it is good to warn the user, because more than a very few tied values in any variable seriously degrades performance of the adaptive partitioning algorithm. */ MEMTEXT ( "MI_ONLY: Work" ) ; work = (double *) MALLOC ( ncases * sizeof(double) ) ; assert ( work != NULL ) ; ties = 0 ; assert ( work != NULL ) ; for (ivar=0 ; ivar<nvars ; ivar++) { if (ivar > n_indep_vars && ivar != idep) continue ; // Check only the variables selected by the user for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+ivar] ; qsortd ( 0 , ncases-1 , work ) ; nties = 0 ; for (i=1 ; i<ncases ; i++) { if (work[i] == work[i-1]) ++nties ; } if ((double) nties / (double) ncases > 0.05) { ++ties ; fprintf ( fp , "\nWARNING... 
%s has %.2lf percent ties!", names[ivar], 100.0 * nties / (double) ncases ) ; } } // For all variables if (ties) { fprintf ( fp , "\nThe presence of ties will seriously degrade" ) ; fprintf ( fp , "\nperformance of the adaptive partitioning algorithm\n\n" ) ; } /* Allocate scratch memory and create the MutualInformation object using the dependent variable crits - Mutual information criterion index - Indices that sort the criterion save_info - Ditto, this is univariate information, to be sorted mi_adapt - The MutualInformation object, constructed with the 'dependent' variable */ MEMTEXT ( "MI_ONLY work allocs plus MutualInformation" ) ; crits = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( crits != NULL ) ; index = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( index != NULL ) ; mcpt_max_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_max_counts != NULL ) ; mcpt_same_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_same_counts != NULL ) ; mcpt_solo_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_solo_counts != NULL ) ; save_info = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( save_info != NULL ) ; for (irep=0 ; irep<nreps ; irep++) { for (i=0 ; i<ncases ; i++) // Get the 'dependent' variable work[i] = data[i*nvars+idep] ; // Shuffle dependent variable if in permutation run (irep>0) if (irep) { // If doing permuted runs, shuffle i = ncases ; // Number remaining to be shuffled while (i > 1) { // While at least 2 left to shuffle j = (int) (unifrand () * i) ; if (j >= i) j = i - 1 ; dtemp = work[--i] ; work[i] = work[j] ; work[j] = dtemp ; } } // Here we use a tiny split theshold (instead of the usual 6.0) so that it picks up // small amounts of mutual information (perhaps including noise). // If we used 6.0, nearly all permutations of any reasonably sized dataset // would have a computed mutual information of zero. 
It's safe picking up // some noise because the permutation test will account for this. mi_adapt = new MutualInformationAdaptive ( ncases , work , 1 , 0.1 ) ; // Deliberately tiny for low information assert ( mi_adapt != NULL ) ; /* Compute and save the mutual information for the dependent variable with each individual independent variable candidate. */ for (icand=0 ; icand<n_indep_vars ; icand++) { // Try all candidates for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+icand] ; criterion = mi_adapt->mut_inf ( work , 1 ) ; save_info[icand] = criterion ; // We will sort this when all candidates are done if (irep == 0) { // If doing original (unpermuted), save criterion index[icand] = icand ; // Will need original indices when criteria are sorted crits[icand] = criterion ; mcpt_max_counts[icand] = mcpt_same_counts[icand] = mcpt_solo_counts[icand] = 1 ; // This is >= itself so count it now } else { if (criterion >= crits[icand]) ++mcpt_solo_counts[icand] ; } } // Initial list of all candidates delete mi_adapt ; mi_adapt = NULL ; if (irep == 0) // Find the indices that sort the candidates per criterion qsortdsi ( 0 , n_indep_vars-1 , save_info , index ) ; else { qsortd ( 0 , n_indep_vars-1 , save_info ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { if (save_info[icand] >= crits[index[icand]]) ++mcpt_same_counts[index[icand]] ; if (save_info[n_indep_vars-1] >= crits[index[icand]]) // Valid only for largest ++mcpt_max_counts[index[icand]] ; } } } // For all reps fprintf ( fp , "\nAdaptive partitioning mutual information of %s", depname); fprintf ( fp , "\n" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\nPredictors, in order of decreasing mutual information" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Information Solo pval Min pval Max pval" ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Do all candidates k = index[n_indep_vars-1-icand] ; // Index of sorted candidate fprintf ( fp , "\n%31s %11.5lf %12.4lf %10.4lf %10.4lf", names[k], crits[k], (double) 
mcpt_solo_counts[k] / nreps, (double) mcpt_same_counts[k] / nreps, (double) mcpt_max_counts[k] / nreps ) ; } MEMTEXT ( "MI_ONLY: Finish" ) ; fclose ( fp ) ; FREE ( work ) ; FREE ( crits ) ; FREE ( index ) ; FREE ( mcpt_max_counts ) ; FREE ( mcpt_same_counts ) ; FREE ( mcpt_solo_counts ) ; FREE ( save_info ) ; free_data ( nvars , names , data ) ; MEMCLOSE () ; printf ( "\n\nPress any key..." ) ; _getch () ; return EXIT_SUCCESS ; }
int main ( int argc , // Number of command line arguments (includes prog name) char *argv[] // Arguments (prog name is argv[0]) ) { int i, j, k, nvars, ncases, irep, nreps, nbins, nbins_dep, nbins_indep, *count ; int n_indep_vars, idep, icand, *index, *mcpt_max_counts, *mcpt_same_counts, *mcpt_solo_counts ; short int *bins_dep, *bins_indep ; double *data, *work, dtemp, *save_info, criterion, *crits ; double *ab, *bc, *b ; char filename[256], **names, depname[256] ; FILE *fp ; /* Process command line parameters */ #if 1 if (argc != 6) { printf ( "\nUsage: TRANSFER datafile n_indep depname nreps" ) ; printf ( "\n datafile - name of the text file containing the data" ) ; printf ( "\n The first line is variable names" ) ; printf ( "\n Subsequent lines are the data." ) ; printf ( "\n Delimiters can be space, comma, or tab" ) ; printf ( "\n n_indep - Number of independent vars, starting with the first" ) ; printf ( "\n depname - Name of the 'dependent' variable" ) ; printf ( "\n It must be AFTER the first n_indep variables" ) ; printf ( "\n nbins - Number of bins for all variables" ) ; printf ( "\n nreps - Number of Monte-Carlo permutations, including unpermuted" ) ; exit ( 1 ) ; } strcpy ( filename , argv[1] ) ; n_indep_vars = atoi ( argv[2] ) ; strcpy ( depname , argv[3] ) ; nbins = atoi ( argv[4] ) ; nreps = atoi ( argv[5] ) ; #else strcpy ( filename , "..\\SYNTH.TXT" ) ; n_indep_vars = 7 ; strcpy ( depname , "SUM1234" ) ; nbins = 2 ; nreps = 1 ; #endif _strupr ( depname ) ; /* These are used by MEM.CPP for runtime memory validation */ _fullpath ( mem_file_name , "MEM.LOG" , 256 ) ; fp = fopen ( mem_file_name , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MEM.LOG file for writing!" ) ; return EXIT_FAILURE ; } fclose ( fp ) ; mem_keep_log = 1 ; // Change this to 1 to keep a memory use log (slows execution!) 
mem_max_used = 0 ; /* Open the text file to which results will be written */ fp = fopen ( "TRANSFER.LOG" , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open TRANSFER.LOG file for writing!" ) ; return EXIT_FAILURE ; } /* Read the file and locate the index of the dependent variable */ if (readfile ( filename , &nvars , &names , &ncases , &data )) return EXIT_FAILURE ; for (idep=0 ; idep<nvars ; idep++) { if (! strcmp ( depname , names[idep] )) break ; } if (idep == nvars) { printf ( "\nERROR... Dependent variable %s is not in file", depname ) ; return EXIT_FAILURE ; } if (idep < n_indep_vars) { printf ( "\nERROR... Dependent variable %s must be beyond independent vars", depname ) ; return EXIT_FAILURE ; } /* Allocate scratch memory crits - Transfer Entropy criterion index - Indices that sort the criterion save_info - Ditto, this is univariate criteria, to be sorted */ MEMTEXT ( "TRANSFER work allocs" ) ; work = (double *) MALLOC ( ncases * sizeof(double) ) ; assert ( work != NULL ) ; crits = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( crits != NULL ) ; index = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( index != NULL ) ; bins_indep = (short int *) MALLOC ( ncases * sizeof(short int) ) ; assert ( bins_indep != NULL ) ; bins_dep = (short int *) MALLOC ( ncases * sizeof(short int) ) ; assert ( bins_dep != NULL ) ; mcpt_max_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_max_counts != NULL ) ; mcpt_same_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_same_counts != NULL ) ; mcpt_solo_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_solo_counts != NULL ) ; save_info = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( save_info != NULL ) ; count = (int *) MALLOC ( nbins * nbins * nbins * sizeof(int) ) ; assert ( count != NULL ) ; ab = (double *) MALLOC ( nbins * nbins * sizeof(double) ) ; assert ( ab != NULL ) ; bc = (double *) MALLOC ( 
nbins * nbins * sizeof(double) ) ; assert ( bc != NULL ) ; b = (double *) MALLOC ( nbins * sizeof(double) ) ; assert ( b != NULL ) ; /* Get the dependent variable and partition it */ for (i=0 ; i<ncases ; i++) // Get the 'dependent' variable work[i] = data[i*nvars+idep] ; nbins_dep = nbins ; partition ( ncases , work , &nbins_dep , NULL , bins_dep ) ; /* Replication loop is here */ for (irep=0 ; irep<nreps ; irep++) { /* Compute and save the transfer entropy of the dependent variable with each individual independent variable candidate. */ for (icand=0 ; icand<n_indep_vars ; icand++) { // Try all candidates for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+icand] ; // Shuffle independent variable if in permutation run (irep>0) if (irep) { // If doing permuted runs, shuffle i = ncases ; // Number remaining to be shuffled while (i > 1) { // While at least 2 left to shuffle j = (int) (unifrand () * i) ; if (j >= i) j = i - 1 ; dtemp = work[--i] ; work[i] = work[j] ; work[j] = dtemp ; } } nbins_indep = nbins ; partition ( ncases , work , &nbins_indep , NULL , bins_indep ) ; criterion = trans_ent ( ncases , nbins_indep , nbins_dep , bins_indep , bins_dep , 0 , 1 , 1 , count , ab , bc , b ) ; save_info[icand] = criterion ; // We will sort this when all candidates are done if (irep == 0) { // If doing original (unpermuted), save criterion index[icand] = icand ; // Will need original indices when criteria are sorted crits[icand] = criterion ; mcpt_max_counts[icand] = mcpt_same_counts[icand] = mcpt_solo_counts[icand] = 1 ; // This is >= itself so count it now } else { if (criterion >= crits[icand]) ++mcpt_solo_counts[icand] ; } } // Initial list of all candidates if (irep == 0) // Find the indices that sort the candidates per criterion qsortdsi ( 0 , n_indep_vars-1 , save_info , index ) ; else { qsortd ( 0 , n_indep_vars-1 , save_info ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { if (save_info[icand] >= crits[index[icand]]) ++mcpt_same_counts[index[icand]] ; if 
(save_info[n_indep_vars-1] >= crits[index[icand]]) // Valid only for largest ++mcpt_max_counts[index[icand]] ; } } } // For all reps fprintf ( fp , "\nTransfer entropy of %s", depname); fprintf ( fp , "\n" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\nPredictors, in order of decreasing transfer entropy" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Information Solo pval Min pval Max pval" ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Do all candidates k = index[n_indep_vars-1-icand] ; // Index of sorted candidate fprintf ( fp , "\n%31s %11.5lf %12.4lf %10.4lf %10.4lf", names[k], crits[k], (double) mcpt_solo_counts[k] / nreps, (double) mcpt_same_counts[k] / nreps, (double) mcpt_max_counts[k] / nreps ) ; } MEMTEXT ( "TRANSFER: Finish" ) ; fclose ( fp ) ; FREE ( work ) ; FREE ( crits ) ; FREE ( index ) ; FREE ( bins_indep ) ; FREE ( bins_dep ) ; FREE ( mcpt_max_counts ) ; FREE ( mcpt_same_counts ) ; FREE ( mcpt_solo_counts ) ; FREE ( save_info ) ; FREE ( count ) ; FREE ( ab ) ; FREE ( bc ) ; FREE ( b ) ; free_data ( nvars , names , data ) ; MEMCLOSE () ; printf ( "\n\nPress any key..." ) ; _getch () ; return EXIT_SUCCESS ; }
int main ( int argc , // Number of command line arguments (includes prog name) char *argv[] // Arguments (prog name is argv[0]) ) { int i, j, k, nvars, ncases, ndiv, maxkept, ivar, nties, ties ; int n_indep_vars, idep, icand, iother, ibest, *sortwork, nkept, *kept ; double *data, *work ; double *save_info, *univar_info, *pair_info, bestredun, redun, bestcrit ; double criterion, relevance, redundancy, *crits, *reduns ; char filename[256], **names, depname[256] ; char trial_name[256], *pair_found ; FILE *fp ; MutualInformationParzen *mi_parzen ; MutualInformationAdaptive *mi_adapt ; /* Process command line parameters */ #if 1 if (argc != 6) { printf ( "\nUsage: MI_CONT datafile n_indep depname ndiv maxkept" ) ; printf ( "\n datafile - name of the text file containing the data" ) ; printf ( "\n The first line is variable names" ) ; printf ( "\n Subsequent lines are the data." ) ; printf ( "\n Delimiters can be space, comma, or tab" ) ; printf ( "\n n_indep - Number of independent vars, starting with the first" ) ; printf ( "\n depname - Name of the 'dependent' variable" ) ; printf ( "\n It must be AFTER the first n_indep variables" ) ; printf ( "\n ndiv - Normally zero, to employ adaptive partitioning" ) ; printf ( "\n Specify 5 (for very few cases) to 15 (for an" ) ; printf ( "\n enormous number of cases) to use Parzen windows" ) ; printf ( "\n maxkept - Stepwise will allow at most this many predictors" ) ; exit ( 1 ) ; } strcpy ( filename , argv[1] ) ; n_indep_vars = atoi ( argv[2] ) ; strcpy ( depname , argv[3] ) ; ndiv = atoi ( argv[4] ) ; maxkept = atoi ( argv[5] ) ; #else strcpy ( filename , "..\\VARS.TXT" ) ; n_indep_vars = 8 ; strcpy ( depname , "DAY_RETURN" ) ; ndiv = 0 ; maxkept = 5 ; #endif _strupr ( depname ) ; /* These are used by MEM.CPP for runtime memory validation */ _fullpath ( mem_file_name , "MEM.LOG" , 256 ) ; fp = fopen ( mem_file_name , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MEM.LOG file for writing!" 
) ; return EXIT_FAILURE ; } fclose ( fp ) ; mem_keep_log = 1 ; mem_max_used = 0 ; /* Open the text file to which results will be written */ fp = fopen ( "MI_CONT.LOG" , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MI_CONT.LOG file for writing!" ) ; return EXIT_FAILURE ; } /* Read the file and locate the index of the 'dependent' variable */ if (readfile ( filename , &nvars , &names , &ncases , &data )) return EXIT_FAILURE ; for (idep=0 ; idep<nvars ; idep++) { if (! strcmp ( depname , names[idep] )) break ; } if (idep == nvars) { printf ( "\nERROR... Dependent variable %s is not in file", depname ) ; return EXIT_FAILURE ; } if (idep < n_indep_vars) { printf ( "\nERROR... Dependent variable %s must be beyond independent vars", depname ) ; return EXIT_FAILURE ; } /* If adaptive partitioning is specified, check each variable for ties. This is not needed for the algorithm, but it is good to warn the user, because more than a very few tied values in any variable seriously degrades performance of the adaptive partitioning algorithm. */ MEMTEXT ( "MI_CONT: Work" ) ; work = (double *) MALLOC ( ncases * sizeof(double) ) ; assert ( work != NULL ) ; if (ndiv == 0) { // If adaptive partitioning, check for ties ties = 0 ; assert ( work != NULL ) ; for (ivar=0 ; ivar<nvars ; ivar++) { if (ivar > n_indep_vars && ivar != idep) continue ; // Check only the variables selected by the user for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+ivar] ; qsortd ( 0 , ncases-1 , work ) ; nties = 0 ; for (i=1 ; i<ncases ; i++) { if (work[i] == work[i-1]) ++nties ; } if ((double) nties / (double) ncases > 0.05) { ++ties ; fprintf ( fp , "\nWARNING... 
%s has %.2lf percent ties!", names[ivar], 100.0 * nties / (double) ncases ) ; } } // For all variables if (ties) { fprintf ( fp , "\nThe presence of ties will seriously degrade" ) ; fprintf ( fp , "\nperformance of the adaptive partitioning algorithm\n\n" ) ; } } // If adaptive partitioning, so testing for ties in the data /* Allocate scratch memory and create the MutualInformation object using the dependent variable kept - Array of indices of variables kept so far crits - Ditto, criterion reduns - Ditto, redundancy sortwork - Temporary use for printing variable's information sorted save_info - Ditto, this is univariate information, to be sorted univar_info - Also univariate information, but not sorted, for use in stepwise pair_found - Flag: is there valid info in the corresponding element of the next array pair_info - Preserve pairwise information of indeps to avoid expensive recalculation mi_parzen - The MutualInformation object, constructed with the 'dependent' variable mi_adapt - Ditto, but used if adaptive partitioning */ MEMTEXT ( "MI_CONT 6 allocs plus MutualInformation" ) ; kept = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( kept != NULL ) ; crits = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( crits != NULL ) ; reduns = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( reduns != NULL ) ; sortwork = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( sortwork != NULL ) ; save_info = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( save_info != NULL ) ; univar_info = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( univar_info != NULL ) ; pair_found = (char *) MALLOC ( (n_indep_vars * (n_indep_vars+1) / 2) * sizeof(char) ) ; assert ( pair_found != NULL ) ; pair_info = (double *) MALLOC ( (n_indep_vars * (n_indep_vars+1) / 2) * sizeof(double) ) ; assert ( pair_info != NULL ) ; for (i=0 ; i<ncases ; i++) // Get the 'dependent' variable work[i] = data[i*nvars+idep] ; if (ndiv > 0) { 
mi_parzen = new MutualInformationParzen ( ncases , work , ndiv ) ; mi_adapt = NULL ; assert ( mi_parzen != NULL ) ; } else { mi_adapt = new MutualInformationAdaptive ( ncases , work , 0 , 6.0 ) ; mi_parzen = NULL ; assert ( mi_adapt != NULL ) ; } memset ( pair_found , 0 , (n_indep_vars * (n_indep_vars+1) / 2) * sizeof(char) ) ; if (ndiv > 0) fprintf ( fp , "\nParzen mutual information of %s (ndiv=%d)", depname, ndiv); else fprintf ( fp , "\nAdaptive partitioning mutual information of %s", depname); fprintf ( fp , "\n" ) ; fprintf ( fp , "\n---------------------------------------------------------------" ) ; fprintf ( fp , "\n" ) ; /* Compute and save the mutual information for the dependent variable with each individual independent variable candidate. Print the results, sort them, and print them again, this time sorted. */ fprintf ( fp , "\nInitial candidates, in order of appearance in data file" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Information" ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Try all candidates for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+icand] ; if (ndiv > 0) criterion = mi_parzen->mut_inf ( work ) ; else criterion = mi_adapt->mut_inf ( work , 0 ) ; printf ( "\n%s = %.5lf", names[icand], criterion ) ; fprintf ( fp , "\n%31s %.5lf", names[icand], criterion ) ; sortwork[icand] = icand ; save_info[icand] = univar_info[icand] = criterion ; } // Initial list of all candidates if (mi_parzen != NULL) { delete mi_parzen ; mi_parzen = NULL ; } if (mi_adapt != NULL) { delete mi_adapt ; mi_adapt = NULL ; } fprintf ( fp , "\n" ) ; fprintf ( fp , "\nInitial candidates, in order of decreasing mutual information" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Information" ) ; qsortdsi ( 0 , n_indep_vars-1 , save_info , sortwork ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Do all candidates k = sortwork[n_indep_vars-1-icand] ; // Index of sorted candidate fprintf ( fp , "\n%31s %.5lf", names[k], 
save_info[n_indep_vars-1-icand] ) ; } /* Initialize the 'kept' set to be the best variable, and then begin the main outer loop that adds variables one at a time */ kept[0] = sortwork[n_indep_vars-1] ; // Index of best single candidate crits[0] = save_info[n_indep_vars-1] ; reduns[0] = 0.0 ; nkept = 1 ; if (maxkept > n_indep_vars) // Guard against silly user maxkept = n_indep_vars ; while (nkept < maxkept) { fprintf ( fp , "\n" ) ; fprintf ( fp , "\nVariables so far Relevance Redundancy Criterion" ) ; for (i=0 ; i<nkept ; i++) fprintf ( fp , "\n%31s %10.5lf %10.5lf %10.5lf", names[kept[i]], crits[i] + reduns[i], reduns[i], crits[i] ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\nSearching for an additional candidate..." ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Relevance Redundancy Criterion" ) ; bestcrit = -1.e60 ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Try all candidates for (i=0 ; i<nkept ; i++) { // Is this candidate already kept? if (kept[i] == icand) break ; } if (i < nkept) // If this candidate 'icand' is already kept continue ; // Skip it strcpy ( trial_name , names[icand] ) ; // Its name for printing for (i=0 ; i<ncases ; i++) // Get its cases work[i] = data[i*nvars+icand] ; if (ndiv > 0) { mi_parzen = new MutualInformationParzen ( ncases , work , ndiv ) ; mi_adapt = NULL ; assert ( mi_parzen != NULL ) ; } else { mi_adapt = new MutualInformationAdaptive ( ncases , work , 0 , 6.0 ) ; mi_parzen = NULL ; assert ( mi_adapt != NULL ) ; } relevance = univar_info[icand] ; // We saved it during initial printing printf ( "\n%s relevance = %.5lf", trial_name, relevance ) ; // Compute the redundancy of this candidate // This is the mean of its redundancy with all kept variables redundancy = 0.0 ; for (iother=0 ; iother<nkept ; iother++) { // Process entire kept set j = kept[iother] ; // Index of a variable in the kept set if (icand > j) // pair_found and pair_info are k = icand*(icand+1)/2+j ; // symmetric, so k is the index else // into them k 
= j*(j+1)/2+icand ; if (pair_found[k]) // If we already computed it redun = pair_info[k] ; // Don't do it again else { // First time for this pair, so compute for (i=0 ; i<ncases ; i++) // Get its cases work[i] = data[i*nvars+j] ; // Variable already in kept set if (ndiv > 0) redun = mi_parzen->mut_inf ( work ) ; else redun = mi_adapt->mut_inf ( work , 0 ) ; pair_found[k] = 1 ; // Flag that this pair has been computed pair_info[k] = redun ; // And save the MI for this pair } // Else must compute redundancy redundancy += redun ; printf ( "\n %s <-> %s redundancy = %.5lf", names[icand], names[j], redun ) ; } // For all kept variables, computing mean redundancy if (mi_parzen != NULL) { delete mi_parzen ; mi_parzen = NULL ; } if (mi_adapt != NULL) { delete mi_adapt ; mi_adapt = NULL ; } redundancy /= nkept ; // It is the mean across all kept printf ( "\nRedundancy = %.5lf", redundancy ) ; criterion = relevance - redundancy ; fprintf ( fp , "\n%31s %10.5lf %10.5lf %10.5lf", trial_name, relevance, redundancy, criterion ) ; if (criterion > bestcrit) { // Did we just set a new record? 
bestcrit = criterion ; // If so, update the record bestredun = redundancy ; // Needed for printing results later ibest = icand ; // Keep track of the winning candidate } } // For all candidates // We now have the best candidate if (bestcrit <= 0.0) break ; kept[nkept] = ibest ; crits[nkept] = bestcrit ; reduns[nkept] = bestredun ; ++nkept ; } // While adding new variables fprintf ( fp , "\n" ) ; fprintf ( fp , "\nFinal set Relevance Redundancy Criterion" ) ; for (i=0 ; i<nkept ; i++) fprintf ( fp , "\n%31s %10.5lf %10.5lf %10.5lf", names[kept[i]], crits[i] + reduns[i], reduns[i], crits[i] ) ; MEMTEXT ( "MI_CONT: Finish" ) ; fclose ( fp ) ; FREE ( work ) ; FREE ( kept ) ; FREE ( crits ) ; FREE ( reduns ) ; FREE ( sortwork ) ; FREE ( save_info ) ; FREE ( univar_info ) ; FREE ( pair_found ) ; FREE ( pair_info ) ; if (mi_parzen != NULL) delete mi_parzen ; if (mi_adapt != NULL) delete mi_adapt ; free_data ( nvars , names , data ) ; MEMCLOSE () ; printf ( "\n\nPress any key..." ) ; _getch () ; return EXIT_SUCCESS ; }