double gamma ( int v ) { double x, y, z, vm1, root ; switch (v) { case 1: // Chi-square with 1 df is 2 gamma(.5) x = normal () ; return 0.5 * x * x ; case 2: // Gamma(1) is exponential(1) for (;;) { x = unifrand () ; if (x > 0.0) return -log ( x ) ; } default: // Valid for all real a>1 (a=v/2) vm1 = 0.5 * v - 1.0 ; root = sqrt ( v - 1.0 ) ; for (;;) { y = tan ( PI * unifrand () ) ; x = root * y + vm1 ; if (x <= 0.0) continue ; z = (1.0 + y * y) * exp ( vm1 * log(x/vm1) - root * y ) ; if (unifrand () <= z) return x ; } } }
double normal () { double x1, x2 ; for (;;) { x1 = unifrand () ; if (x1 <= 0.0) // Safety: log(0) is undefined continue ; x1 = sqrt ( -2.0 * log ( x1 )) ; x2 = cos ( 2.0 * PI * unifrand () ) ; return x1 * x2 ; } }
void normal_pair ( double *x1 , double *x2 ) { double u1, u2 ; for (;;) { u1 = unifrand () ; if (u1 <= 0.0) // Safety: log(0) is undefined continue ; u1 = sqrt ( -2.0 * log ( u1 )) ; u2 = 2.0 * PI * unifrand () ; *x1 = u1 * sin ( u2 ) ; *x2 = u1 * cos ( u2 ) ; return ; } }
/*
 * Generate one IPv4 address below the trie's network prefix.
 *
 * Starting at trie->root, one host bit is chosen per level: a unifrand()
 * draw is compared with the current node's p, giving 0 (left) or 1 (right).
 * Missing children are created on demand with p drawn from
 * genbeta(trie->beta), and the walk then descends into the chosen child
 * whether it was just created or already existed.  The original advanced
 * currnode only when it created a node, so an existing child was never
 * descended into and the stored probabilities below it were ignored.
 *
 * Returns trie->netaddr OR'ed with the generated host bits, or 0 when
 * trie is NULL.
 */
uint32_t generate_addressv4(struct gentrie *trie)
{
    /* assume that if there's a non-NULL trie passed that it has been
       properly initialized with initialize_trie */
    if (!trie)
        return 0;

    struct node *currnode = trie->root;
    int remaining_bits = 32 - trie->prefixlen;
    uint32_t xaddr = 0;

    while (remaining_bits > 0) {
        xaddr <<= 1;

        double pprime = unifrand();
        int leftright = pprime >= currnode->p;  // 0 if left, 1 if right
        xaddr |= leftright;

        /* don't bother to create (or visit) a node for the last bit */
        if (remaining_bits > 1) {
            /* if the branch doesn't exist yet, create it */
            if (!currnode->children[leftright]) {
                struct node *xnode = new_node();
                xnode->p = genbeta(trie->beta);
                currnode->children[leftright] = xnode;
            }
            /* always follow the chosen branch, new or pre-existing */
            currnode = currnode->children[leftright];
        }

        remaining_bits--;
    }

    return (trie->netaddr | xaddr);
}
/*
--------------------------------------------------------------------------------

   pick_parents - Draw two parents, without replacement, from choices

   Each selected slot is overwritten with the current last entry and the
   count is decremented, so on return *nchoices is two smaller.

   The random index is clamped to the last valid slot.  The other samplers
   in this file take the same precaution in case unifrand() returns 1.0,
   which would otherwise index one past the end of the array.

   NOTE: assumes *nchoices >= 2 on entry; this is not checked.

--------------------------------------------------------------------------------
*/

static void pick_parents (
   int *nchoices ,  // Number of choices (returned decremented by two)
   int *choices ,   // Array (nchoices long) of candidates for parent
   int *parent1 ,   // One parent returned here
   int *parent2     // and the other here
   )
{
   int k ;

   k = (int) (unifrand() * *nchoices) ; // Select position in choices array
   if (k >= *nchoices)                  // Cheap insurance in case
      k = *nchoices - 1 ;               // unifrand() returns 1
   *parent1 = choices[k] ;              // Then return that parent
   choices[k] = choices[--*nchoices] ;  // without replacement

   k = (int) (unifrand() * *nchoices) ;
   if (k >= *nchoices)
      k = *nchoices - 1 ;
   *parent2 = choices[k] ;
   choices[k] = choices[--*nchoices] ;
}
/*
--------------------------------------------------------------------------------

   fitness_to_choices - Convert expected selection frequencies to an
                        explicit array of parent indices

   fitness is modified: on return it holds whatever fractional
   expectation is left over for each individual.

   Changes from the original:
     - The integer-part fill stops when choices is full, and ignores a
       negative integer part, so it can no longer write past
       choices[popsize-1] or run away.
     - The random index in the second step is clamped, as the other
       samplers in this file do, in case unifrand() returns 1.0.
     - The unused local 'rn' is gone.

--------------------------------------------------------------------------------
*/

static void fitness_to_choices (
   int popsize ,      // Length of fitness, choices vectors
   float *fitness ,   // Input array of expected selection frequencies
   int *choices       // Output array of parents
   )
{
   int individual, expected, k ;

/*
   We build the choices array in two steps.  This, the first step, assigns
   parents according to the integer part of their expected frequencies.
*/

   k = 0 ;  // Will index choices array
   for (individual=0 ; individual<popsize ; individual++) {
      expected = (int) fitness[individual] ; // Assign this many now
      fitness[individual] -= expected ;      // Save fractional remainder
      for ( ; expected > 0  &&  k < popsize ; expected--) // Forcibly use the
         choices[k++] = individual ;         // int expected quantity, but
      }                                      // never overfill choices

/*
   The second step is to take care of the remaining fractional expected
   frequencies.  Pass through the population, randomly selecting members
   with probability equal to their remaining fractional expectation.
   It is tempting to think that the algorithm below could loop excessively
   due to a very small fitness.  But recall that the sum of the fitnesses will
   be AT LEAST as large as the number remaining to be selected, and generally
   much more.  Thus, the ones with very small fitness (likely to cause trouble)
   will never become the only remaining possibilities.

   NOTE: that argument relies on the caller supplying fitnesses that sum to
   at least popsize.  If no positive remainder is left while k < popsize,
   this loop cannot terminate.
*/

   while (k < popsize) {                        // Select until choices is full
      individual = (int) (unifrand() * popsize) ; // Randomly select individual
      if (individual >= popsize)                // Cheap insurance in case
         individual = popsize - 1 ;             // unifrand() returns 1
      if (fitness[individual] > 0.0) {          // Try members still having expectation
         if (fitness[individual] >= unifrand()) { // Selects with this prob
            choices[k++] = individual ;         // Bingo!  Select this individual
            fitness[individual] -= 1.0 ;        // and make it ineligible for future
            }
         }
      }
}
// Pick an index in [0, n-1] by drawing unifrand(rval_[n-1]) and returning
// the first position among the first n-1 entries of rval_ whose value is
// at least the draw; if none qualifies, n-1 is returned.
// NOTE(review): this looks like sampling from a cumulative table stored in
// rval_ -- confirm against the code that fills rval_.
int KSSingle::rvalrand(int n)
{
  const int last = n - 1;
  const double draw = unifrand(rval_[last]);

  int idx = 0;
  while (idx < last) {
    if (rval_[idx] >= draw) {
      return idx;
    }
    ++idx;
  }
  return last;
}
/*
--------------------------------------------------------------------------------

   mutate - Randomly flip bits in the chromosome of a single child

   The bytes of child are visited from last to first.  For each byte one
   unifrand() draw decides whether it mutates; if so, a single bit chosen
   by longrand() % 8 is toggled.

--------------------------------------------------------------------------------
*/

static void mutate (
   char *child ,     // Input/Output of the child
   int chromsize ,   // Number of variables in objective function
   float pmutate     // Probability of mutation
   )
{
   int gene ;

   for (gene=chromsize-1 ; gene>=0 ; gene--) {
      if (! (unifrand() < pmutate))   // Leave this gene alone?
         continue ;
      child[gene] ^= (char) 1 << (longrand() % 8) ; // Flip random bit
      }
}
void E0 ( int n , // Number of data points DataClass *data , // The data is here double *app , // Apparent error from testing tset double *excess // Excess error (add to app to get pop) ) { int i, rep, sub, ntot, *count ; int m = 200 ; double errsum ; DataClass *x ; if (m < n) m = n ; x = (DataClass *) malloc ( n * sizeof(DataClass) ) ; // Bootstraps here count = (int *) malloc ( n * sizeof(int) ) ; // Count uses in bootstrap errsum = 0.0 ; ntot = 0 ; for (rep=0 ; rep<m ; rep++) { memset ( count , 0 , n * sizeof(int) ) ; // Zero usage counter for (i=0 ; i<n ; i++) { // Bootstrap sample same size sub = unifrand() * n ; // Select this case if (sub >= n) // Cheap insurance in case sub = n-1 ; // unifrand() returns 1 x[i] = data[sub] ; // Get this case ++count[sub] ; // Count its use } train ( n , x ) ; // Train on bootstrap sample for (i=0 ; i<n ; i++) { // Check all cases if (! count[i]) { // If not used in training errsum += test ( 1 , data+i ) ; // Find its error ++ntot ; // Grand test count } } } errsum /= ntot ; // Mean of all tests train ( n , data ) ; // Also need the *app = test ( n , data ) ; // Apparent error *excess = errsum - *app ; free ( x ) ; free ( count ) ; }
/*
--------------------------------------------------------------------------------

   boot_bias_var - Bootstrap estimates of the bias and variance of a
                   user-supplied statistic

   nboot samples of size n are drawn with replacement from data.  user_t
   is evaluated on each one (third argument NULL), and the mean and
   variance of those values are accumulated.  The tally of how often each
   case was drawn is converted to a mean frequency, which is passed as the
   third argument of a final user_t call on an unmodified copy of the data.

   Note that user_t is called directly on the caller's data array to get
   rawstat, so if user_t reorders its input the caller's array is reordered.

--------------------------------------------------------------------------------
*/

void boot_bias_var (
   int n ,              // Number of cases in sample
   double *data ,       // The sample
   double (*user_t) (int , double * , double * ) , // Compute param
   int nboot ,          // Number of bootstrap replications
   double *rawstat ,    // Raw statistic of sample, theta-hat
   double *bias ,       // Output of bias estimate
   double *var ,        // Output of variance estimate
   double *work ,       // Work area n long
   double *work2 ,      // Work area nboot long
   double *freq         // Work area n long
   )
{
   int icase, iboot, ipick ;
   double theta_star, theta_mean, sum_sq, dev ;

   for (icase=0 ; icase<n ; icase++)      // Clear the usage tally
      freq[icase] = 0.0 ;

   theta_mean = 0.0 ;

   for (iboot=0 ; iboot<nboot ; iboot++) {   // Do all bootstrap reps

      for (icase=0 ; icase<n ; icase++) {    // Generate the bootstrap sample
         ipick = (int) (unifrand() * n) ;    // Select a case from the sample
         if (ipick >= n)                     // Should never happen,
            ipick = n - 1 ;                  // but be prepared
         work[icase] = data[ipick] ;         // Put bootstrap sample in work
         ++freq[ipick] ;                     // Tally for mean frequency
         }

      theta_star = user_t ( n , work , NULL ) ; // Estimator for this rep
      work2[iboot] = theta_star ;            // Keep for the two-pass variance
      theta_mean += theta_star ;             // Cumulate theta-hat star dot
      }

   theta_mean /= nboot ;

   sum_sq = 0.0 ;
   for (iboot=0 ; iboot<nboot ; iboot++) {   // Cumulate variance
      dev = work2[iboot] - theta_mean ;
      sum_sq += dev * dev ;
      }

   for (icase=0 ; icase<n ; icase++)         // Convert tally of usage
      freq[icase] /= nboot * n ;             // to mean frequency of use

   memcpy ( work , data , n * sizeof(double) ) ; // user_t may reorder, so preserve

   *rawstat = user_t ( n , data , NULL) ;    // Final but biased estimate
   *bias = theta_mean - user_t ( n , work , freq ) ;
   *var = sum_sq / (nboot - 1) ;
}
void boot ( int n , // Number of data points DataClass *data , // The data is here double *app , // Apparent error from testing tset double *excess // Excess error (add to app to get pop) ) { int i, rep, sub, *count ; int m = 200 ; double err, errsum ; DataClass *x ; if (m < n) // If the dataset is large m = n ; // Do enough reps to be thorough x = (DataClass *) malloc ( n * sizeof(DataClass) ) ; // Bootstraps here count = (int *) malloc ( n * sizeof(int) ) ; // Count uses in bootstrap errsum = 0.0 ; for (rep=0 ; rep<m ; rep++) { memset ( count , 0 , n * sizeof(int) ) ; // Zero usage counter for (i=0 ; i<n ; i++) { // Bootstrap sample same size sub = unifrand() * n ; // Select this case if (sub >= n) // Cheap insurance in case sub = n-1 ; // unifrand() returns 1 x[i] = data[sub] ; // Get this case ++count[sub] ; // Count its use } train ( n , x ) ; // Train on bootstrap sample for (i=0 ; i<n ; i++) { // Test all cases err = test ( 1 , data+i ) ; // Error of this case errsum += err * (1 - count[i]) ; // Bootstrap formula } } errsum /= (double) m * (double) n ; // Grand mean train ( n , data ) ; // Also return the *app = test ( n , data ) ; // Apparent error *excess = errsum ; free ( x ) ; free ( count ) ; }
/*
--------------------------------------------------------------------------------

   cauchy - Fill x with an n-variate random vector of heavy-tailed length

   n == 1 : x[0] = scale * tan of a uniform angle in (-PI/2, PI/2), with
            the angle shrunk by 0.99999999 to keep tan() finite.
   else   : rand_sphere() supplies the vector, which is then multiplied by
            scale * sqrt ( t / (1-t) ), t = beta ( n , 1 ).  If t is not
            below 1 the multiplier is 1.e10 (and scale is not applied).

--------------------------------------------------------------------------------
*/

void cauchy ( int n , double scale , double *x )
{
   int i ;
   double factor ;

   if (n == 1) {                                // Univariate special case
      factor = PI * unifrand () - 0.5 * PI ;    // Random angle
      x[0] = scale * tan ( 0.99999999 * factor ) ;
      return ;
      }

   rand_sphere ( n , x ) ;                      // Random vector

   factor = beta ( n , 1 ) ;
   factor = (factor < 1.0)  ?  scale * sqrt ( factor / (1.0 - factor) )  :  1.e10 ;

   for (i=n-1 ; i>=0 ; i--)                     // Stretch it to its length
      x[i] *= factor ;
}
/*
--------------------------------------------------------------------------------

   reproduce - Create one child from two parents by crossover

   For the first child, the within-byte split point, the crossover point,
   and the one-point versus two-point decision are chosen randomly and
   returned through split and crosspt (two-point is flagged by a negative
   crosspt).  For the second child those values are inputs and the parents
   swap roles.

   The chromosome is treated as circular.  Starting just after the
   crossover point, the child is built from 2 (one-point) or 4 (two-point)
   consecutive segments.  Within a segment, when *split is nonzero the
   first byte mixes the high bits of one parent with the low bits of the
   other, and the rest of the bytes are copied from the parent that
   supplied the low bits.  The parents swap roles after every segment.

--------------------------------------------------------------------------------
*/

static void reproduce (
   char *p1 ,         // Pointer to one parent
   char *p2 ,         // and the other
   int first_child ,  // Is this the first of their 2 children?
   int chromsize ,    // Number of genes in chromosome
   char *child ,      // Output of a child
   int *crosspt ,     // If first_child, output of xover pt, else input it.
   int *split         // In/out of within byte splitting point
   )
{
   int pos, nbits, iseg, nseg, remaining, seg_len[4] ;
   char left, right, *hi_src, *body_src, *swap ;

   left = right = 0 ;   // Used only when *split is nonzero

   if (! first_child) {              // Second child
      hi_src = p2 ;                  // so parents reverse roles
      body_src = p1 ;
      }
   else {                            // First child picks the random layout
      *split = longrand() % 8 ;      // We will split boundary bytes here
      *crosspt = 1 + unifrand() * chromsize ; // Randomly select cross pt
      if ((chromsize >= 16)  &&  (unifrand() < 0.33333)) // Two point?
         *crosspt = -*crosspt ;      // flag this for second child
      hi_src = p1 ;
      body_src = p2 ;
      }

/*
   Prepare for reproduction: splitting masks, segment lengths, start position
*/

   if (*split) {                     // right = the *split low-order bits
      right = 1 ;
      for (nbits=*split-1 ; nbits ; nbits--)
         right = (right << 1) | 1 ;
      left = 255 ^ right ;           // left = the remaining high-order bits
      }

   if (*crosspt > 0) {                       // Use one point crossover
      seg_len[0] = chromsize / 2 ;           // First half of child
      seg_len[1] = chromsize - seg_len[0] ;  // Second half gets the rest
      seg_len[2] = seg_len[3] = 0 ;          // No more segments
      pos = *crosspt - 1 ;                   // Build child starting here
      }
   else {                                    // Use two point crossover
      seg_len[0] = seg_len[1] = seg_len[2] = chromsize / 4 ;
      seg_len[3] = chromsize - seg_len[0] - seg_len[1] - seg_len[2] ;
      pos = -*crosspt - 1 ;                  // 2 point was flagged by neg
      }

   nseg = seg_len[3]  ?  4  :  2 ;   // Last two segments only if two point

/*
   Do reproduction here
*/

   for (iseg=0 ; iseg<nseg ; iseg++) {

      remaining = seg_len[iseg] ;

      if (*split) {                  // Boundary byte mixes both parents
         pos = (pos+1) % chromsize ;
         child[pos] = (left & hi_src[pos]) | (right & body_src[pos]) ;
         --remaining ;
         }

      while (remaining--) {          // Whole bytes from a single parent
         pos = (pos+1) % chromsize ;
         child[pos] = body_src[pos] ;
         }

      swap = hi_src ;                // Parents trade roles for next segment
      hi_src = body_src ;
      body_src = swap ;
      }
}
int main ( int argc , // Number of command line arguments (includes prog name) char *argv[] // Arguments (prog name is argv[0]) ) { int i, j, k, nvars, ncases, irep, nreps, nbins, nbins_dep, nbins_indep, *count ; int n_indep_vars, idep, icand, *index, *mcpt_max_counts, *mcpt_same_counts, *mcpt_solo_counts ; short int *bins_dep, *bins_indep ; double *data, *work, dtemp, *save_info, criterion, *crits ; double *ab, *bc, *b ; char filename[256], **names, depname[256] ; FILE *fp ; /* Process command line parameters */ #if 1 if (argc != 6) { printf ( "\nUsage: TRANSFER datafile n_indep depname nreps" ) ; printf ( "\n datafile - name of the text file containing the data" ) ; printf ( "\n The first line is variable names" ) ; printf ( "\n Subsequent lines are the data." ) ; printf ( "\n Delimiters can be space, comma, or tab" ) ; printf ( "\n n_indep - Number of independent vars, starting with the first" ) ; printf ( "\n depname - Name of the 'dependent' variable" ) ; printf ( "\n It must be AFTER the first n_indep variables" ) ; printf ( "\n nbins - Number of bins for all variables" ) ; printf ( "\n nreps - Number of Monte-Carlo permutations, including unpermuted" ) ; exit ( 1 ) ; } strcpy ( filename , argv[1] ) ; n_indep_vars = atoi ( argv[2] ) ; strcpy ( depname , argv[3] ) ; nbins = atoi ( argv[4] ) ; nreps = atoi ( argv[5] ) ; #else strcpy ( filename , "..\\SYNTH.TXT" ) ; n_indep_vars = 7 ; strcpy ( depname , "SUM1234" ) ; nbins = 2 ; nreps = 1 ; #endif _strupr ( depname ) ; /* These are used by MEM.CPP for runtime memory validation */ _fullpath ( mem_file_name , "MEM.LOG" , 256 ) ; fp = fopen ( mem_file_name , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MEM.LOG file for writing!" ) ; return EXIT_FAILURE ; } fclose ( fp ) ; mem_keep_log = 1 ; // Change this to 1 to keep a memory use log (slows execution!) 
mem_max_used = 0 ; /* Open the text file to which results will be written */ fp = fopen ( "TRANSFER.LOG" , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open TRANSFER.LOG file for writing!" ) ; return EXIT_FAILURE ; } /* Read the file and locate the index of the dependent variable */ if (readfile ( filename , &nvars , &names , &ncases , &data )) return EXIT_FAILURE ; for (idep=0 ; idep<nvars ; idep++) { if (! strcmp ( depname , names[idep] )) break ; } if (idep == nvars) { printf ( "\nERROR... Dependent variable %s is not in file", depname ) ; return EXIT_FAILURE ; } if (idep < n_indep_vars) { printf ( "\nERROR... Dependent variable %s must be beyond independent vars", depname ) ; return EXIT_FAILURE ; } /* Allocate scratch memory crits - Transfer Entropy criterion index - Indices that sort the criterion save_info - Ditto, this is univariate criteria, to be sorted */ MEMTEXT ( "TRANSFER work allocs" ) ; work = (double *) MALLOC ( ncases * sizeof(double) ) ; assert ( work != NULL ) ; crits = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( crits != NULL ) ; index = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( index != NULL ) ; bins_indep = (short int *) MALLOC ( ncases * sizeof(short int) ) ; assert ( bins_indep != NULL ) ; bins_dep = (short int *) MALLOC ( ncases * sizeof(short int) ) ; assert ( bins_dep != NULL ) ; mcpt_max_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_max_counts != NULL ) ; mcpt_same_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_same_counts != NULL ) ; mcpt_solo_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_solo_counts != NULL ) ; save_info = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( save_info != NULL ) ; count = (int *) MALLOC ( nbins * nbins * nbins * sizeof(int) ) ; assert ( count != NULL ) ; ab = (double *) MALLOC ( nbins * nbins * sizeof(double) ) ; assert ( ab != NULL ) ; bc = (double *) MALLOC ( 
nbins * nbins * sizeof(double) ) ; assert ( bc != NULL ) ; b = (double *) MALLOC ( nbins * sizeof(double) ) ; assert ( b != NULL ) ; /* Get the dependent variable and partition it */ for (i=0 ; i<ncases ; i++) // Get the 'dependent' variable work[i] = data[i*nvars+idep] ; nbins_dep = nbins ; partition ( ncases , work , &nbins_dep , NULL , bins_dep ) ; /* Replication loop is here */ for (irep=0 ; irep<nreps ; irep++) { /* Compute and save the transfer entropy of the dependent variable with each individual independent variable candidate. */ for (icand=0 ; icand<n_indep_vars ; icand++) { // Try all candidates for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+icand] ; // Shuffle independent variable if in permutation run (irep>0) if (irep) { // If doing permuted runs, shuffle i = ncases ; // Number remaining to be shuffled while (i > 1) { // While at least 2 left to shuffle j = (int) (unifrand () * i) ; if (j >= i) j = i - 1 ; dtemp = work[--i] ; work[i] = work[j] ; work[j] = dtemp ; } } nbins_indep = nbins ; partition ( ncases , work , &nbins_indep , NULL , bins_indep ) ; criterion = trans_ent ( ncases , nbins_indep , nbins_dep , bins_indep , bins_dep , 0 , 1 , 1 , count , ab , bc , b ) ; save_info[icand] = criterion ; // We will sort this when all candidates are done if (irep == 0) { // If doing original (unpermuted), save criterion index[icand] = icand ; // Will need original indices when criteria are sorted crits[icand] = criterion ; mcpt_max_counts[icand] = mcpt_same_counts[icand] = mcpt_solo_counts[icand] = 1 ; // This is >= itself so count it now } else { if (criterion >= crits[icand]) ++mcpt_solo_counts[icand] ; } } // Initial list of all candidates if (irep == 0) // Find the indices that sort the candidates per criterion qsortdsi ( 0 , n_indep_vars-1 , save_info , index ) ; else { qsortd ( 0 , n_indep_vars-1 , save_info ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { if (save_info[icand] >= crits[index[icand]]) ++mcpt_same_counts[index[icand]] ; if 
(save_info[n_indep_vars-1] >= crits[index[icand]]) // Valid only for largest ++mcpt_max_counts[index[icand]] ; } } } // For all reps fprintf ( fp , "\nTransfer entropy of %s", depname); fprintf ( fp , "\n" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\nPredictors, in order of decreasing transfer entropy" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Information Solo pval Min pval Max pval" ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Do all candidates k = index[n_indep_vars-1-icand] ; // Index of sorted candidate fprintf ( fp , "\n%31s %11.5lf %12.4lf %10.4lf %10.4lf", names[k], crits[k], (double) mcpt_solo_counts[k] / nreps, (double) mcpt_same_counts[k] / nreps, (double) mcpt_max_counts[k] / nreps ) ; } MEMTEXT ( "TRANSFER: Finish" ) ; fclose ( fp ) ; FREE ( work ) ; FREE ( crits ) ; FREE ( index ) ; FREE ( bins_indep ) ; FREE ( bins_dep ) ; FREE ( mcpt_max_counts ) ; FREE ( mcpt_same_counts ) ; FREE ( mcpt_solo_counts ) ; FREE ( save_info ) ; FREE ( count ) ; FREE ( ab ) ; FREE ( bc ) ; FREE ( b ) ; free_data ( nvars , names , data ) ; MEMCLOSE () ; printf ( "\n\nPress any key..." ) ; _getch () ; return EXIT_SUCCESS ; }
int main ( int argc , // Number of command line arguments (includes prog name) char *argv[] // Arguments (prog name is argv[0]) ) { int i, j, k, nvars, ncases, irep, nreps, ivar, nties, ties ; int n_indep_vars, idep, icand, *index, *mcpt_max_counts, *mcpt_same_counts, *mcpt_solo_counts ; double *data, *work, dtemp, *save_info, criterion, *crits ; char filename[256], **names, depname[256] ; FILE *fp ; MutualInformationAdaptive *mi_adapt ; /* Process command line parameters */ #if 1 if (argc != 5) { printf ( "\nUsage: MI_ONLY datafile n_indep depname nreps" ) ; printf ( "\n datafile - name of the text file containing the data" ) ; printf ( "\n The first line is variable names" ) ; printf ( "\n Subsequent lines are the data." ) ; printf ( "\n Delimiters can be space, comma, or tab" ) ; printf ( "\n n_indep - Number of independent vars, starting with the first" ) ; printf ( "\n depname - Name of the 'dependent' variable" ) ; printf ( "\n It must be AFTER the first n_indep variables" ) ; printf ( "\n nreps - Number of Monte-Carlo permutations, including unpermuted" ) ; exit ( 1 ) ; } strcpy ( filename , argv[1] ) ; n_indep_vars = atoi ( argv[2] ) ; strcpy ( depname , argv[3] ) ; nreps = atoi ( argv[4] ) ; #else strcpy ( filename , "..\\SYNTH.TXT" ) ; n_indep_vars = 7 ; strcpy ( depname , "SUM1234" ) ; nreps = 100 ; #endif _strupr ( depname ) ; /* These are used by MEM.CPP for runtime memory validation */ _fullpath ( mem_file_name , "MEM.LOG" , 256 ) ; fp = fopen ( mem_file_name , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MEM.LOG file for writing!" ) ; return EXIT_FAILURE ; } fclose ( fp ) ; mem_keep_log = 0 ; // Change this to 1 to keep a memory use log (slows execution!) mem_max_used = 0 ; /* Open the text file to which results will be written */ fp = fopen ( "MI_ONLY.LOG" , "wt" ) ; if (fp == NULL) { // Should never happen printf ( "\nCannot open MI_ONLY.LOG file for writing!" 
) ; return EXIT_FAILURE ; } /* Read the file and locate the index of the dependent variable */ if (readfile ( filename , &nvars , &names , &ncases , &data )) return EXIT_FAILURE ; for (idep=0 ; idep<nvars ; idep++) { if (! strcmp ( depname , names[idep] )) break ; } if (idep == nvars) { printf ( "\nERROR... Dependent variable %s is not in file", depname ) ; return EXIT_FAILURE ; } if (idep < n_indep_vars) { printf ( "\nERROR... Dependent variable %s must be beyond independent vars", depname ) ; return EXIT_FAILURE ; } /* Check each variable for ties. This is not needed for the algorithm, but it is good to warn the user, because more than a very few tied values in any variable seriously degrades performance of the adaptive partitioning algorithm. */ MEMTEXT ( "MI_ONLY: Work" ) ; work = (double *) MALLOC ( ncases * sizeof(double) ) ; assert ( work != NULL ) ; ties = 0 ; assert ( work != NULL ) ; for (ivar=0 ; ivar<nvars ; ivar++) { if (ivar > n_indep_vars && ivar != idep) continue ; // Check only the variables selected by the user for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+ivar] ; qsortd ( 0 , ncases-1 , work ) ; nties = 0 ; for (i=1 ; i<ncases ; i++) { if (work[i] == work[i-1]) ++nties ; } if ((double) nties / (double) ncases > 0.05) { ++ties ; fprintf ( fp , "\nWARNING... 
%s has %.2lf percent ties!", names[ivar], 100.0 * nties / (double) ncases ) ; } } // For all variables if (ties) { fprintf ( fp , "\nThe presence of ties will seriously degrade" ) ; fprintf ( fp , "\nperformance of the adaptive partitioning algorithm\n\n" ) ; } /* Allocate scratch memory and create the MutualInformation object using the dependent variable crits - Mutual information criterion index - Indices that sort the criterion save_info - Ditto, this is univariate information, to be sorted mi_adapt - The MutualInformation object, constructed with the 'dependent' variable */ MEMTEXT ( "MI_ONLY work allocs plus MutualInformation" ) ; crits = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( crits != NULL ) ; index = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( index != NULL ) ; mcpt_max_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_max_counts != NULL ) ; mcpt_same_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_same_counts != NULL ) ; mcpt_solo_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; assert ( mcpt_solo_counts != NULL ) ; save_info = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; assert ( save_info != NULL ) ; for (irep=0 ; irep<nreps ; irep++) { for (i=0 ; i<ncases ; i++) // Get the 'dependent' variable work[i] = data[i*nvars+idep] ; // Shuffle dependent variable if in permutation run (irep>0) if (irep) { // If doing permuted runs, shuffle i = ncases ; // Number remaining to be shuffled while (i > 1) { // While at least 2 left to shuffle j = (int) (unifrand () * i) ; if (j >= i) j = i - 1 ; dtemp = work[--i] ; work[i] = work[j] ; work[j] = dtemp ; } } // Here we use a tiny split theshold (instead of the usual 6.0) so that it picks up // small amounts of mutual information (perhaps including noise). // If we used 6.0, nearly all permutations of any reasonably sized dataset // would have a computed mutual information of zero. 
It's safe picking up // some noise because the permutation test will account for this. mi_adapt = new MutualInformationAdaptive ( ncases , work , 1 , 0.1 ) ; // Deliberately tiny for low information assert ( mi_adapt != NULL ) ; /* Compute and save the mutual information for the dependent variable with each individual independent variable candidate. */ for (icand=0 ; icand<n_indep_vars ; icand++) { // Try all candidates for (i=0 ; i<ncases ; i++) work[i] = data[i*nvars+icand] ; criterion = mi_adapt->mut_inf ( work , 1 ) ; save_info[icand] = criterion ; // We will sort this when all candidates are done if (irep == 0) { // If doing original (unpermuted), save criterion index[icand] = icand ; // Will need original indices when criteria are sorted crits[icand] = criterion ; mcpt_max_counts[icand] = mcpt_same_counts[icand] = mcpt_solo_counts[icand] = 1 ; // This is >= itself so count it now } else { if (criterion >= crits[icand]) ++mcpt_solo_counts[icand] ; } } // Initial list of all candidates delete mi_adapt ; mi_adapt = NULL ; if (irep == 0) // Find the indices that sort the candidates per criterion qsortdsi ( 0 , n_indep_vars-1 , save_info , index ) ; else { qsortd ( 0 , n_indep_vars-1 , save_info ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { if (save_info[icand] >= crits[index[icand]]) ++mcpt_same_counts[index[icand]] ; if (save_info[n_indep_vars-1] >= crits[index[icand]]) // Valid only for largest ++mcpt_max_counts[index[icand]] ; } } } // For all reps fprintf ( fp , "\nAdaptive partitioning mutual information of %s", depname); fprintf ( fp , "\n" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\nPredictors, in order of decreasing mutual information" ) ; fprintf ( fp , "\n" ) ; fprintf ( fp , "\n Variable Information Solo pval Min pval Max pval" ) ; for (icand=0 ; icand<n_indep_vars ; icand++) { // Do all candidates k = index[n_indep_vars-1-icand] ; // Index of sorted candidate fprintf ( fp , "\n%31s %11.5lf %12.4lf %10.4lf %10.4lf", names[k], crits[k], (double) 
mcpt_solo_counts[k] / nreps, (double) mcpt_same_counts[k] / nreps, (double) mcpt_max_counts[k] / nreps ) ; } MEMTEXT ( "MI_ONLY: Finish" ) ; fclose ( fp ) ; FREE ( work ) ; FREE ( crits ) ; FREE ( index ) ; FREE ( mcpt_max_counts ) ; FREE ( mcpt_same_counts ) ; FREE ( mcpt_solo_counts ) ; FREE ( save_info ) ; free_data ( nvars , names , data ) ; MEMCLOSE () ; printf ( "\n\nPress any key..." ) ; _getch () ; return EXIT_SUCCESS ; }
static double genbeta(double aa)
/*
**********************************************************************
     static double genbeta(double aa)
               GENerate BETa random deviate
                              Function
     Returns a single random deviate from the beta distribution.
     The code is written for two shape parameters A and B, but this
     version takes only aa and sets bb = aa, so both shapes are equal.
     The density of the beta is
               x^(a-1) * (1-x)^(b-1) / B(a,b) for 0 < x < 1
                              Arguments
     aa --> Shape parameter of the beta distribution (used for both
            A and B)
                              Method
     R. C. H. Cheng
     Generating Beta Variates with Nonintegral Shape Parameters
     Communications of the ACM, 21:317-322 (1978)
     (Algorithms BB and BC)
                              Notes
     Algorithm BB is used when MIN(aa,bb) > 1.0, otherwise algorithm BC.
     All working variables are function-static.  The setup quantities are
     recomputed only when aa differs from the previous call (qsame is 0),
     so state is carried from one call to the next.
     Non-positive parameters are NOT rejected: the test on
     (aa <= 0.0 || bb <= 0.0) reaches label S10 whichever way it goes.
**********************************************************************
*/
{
double bb = aa;   /* Second shape parameter is fixed equal to the first */
#define expmax 89.0      /* exp() arguments above this are not evaluated */
#define infnty 1.0E38    /* Value used for w in their place */
static double olda = -1.0;   /* Parameters seen on the previous call; */
static double oldb = -1.0;   /* -1.0 means no previous call */
static double genbet,a,alpha,b,beta,delta,gamma,k1,k2,r,s,t,u1,u2,v,w,y,z;
static long qsame;           /* Nonzero if parameters match previous call */
    qsame = olda == aa && oldb == bb;
    if(qsame) goto S20;
    /* Both outcomes of this test continue at S10; no error is raised */
    if(!(aa <= 0.0 || bb <= 0.0)) goto S10;
S10:
    olda = aa;
    oldb = bb;
S20:
    if(!(MIN(aa,bb) > 1.0)) goto S100;
/*
     Algorithm BB
     Initialize (skipped when the parameters are unchanged)
*/
    if(qsame) goto S30;
    a = MIN(aa,bb);
    b = MAX(aa,bb);
    alpha = a+b;
    beta = sqrt((alpha-2.0)/(2.0*a*b-alpha));
    gamma = a+1.0/beta;
S30:
S40:
    u1 = unifrand();
/*
     Step 1
*/
    u2 = unifrand();
    v = beta*log(u1/(1.0-u1));
    if(!(v > expmax)) goto S50;
    w = infnty;               /* exp(v) not evaluated above expmax */
    goto S60;
S50:
    w = a*exp(v);
S60:
    z = pow(u1,2.0)*u2;
    r = gamma*v-1.3862944;    /* 1.3862944 matches log(4) to the digits shown */
    s = a+r-w;
/*
     Step 2
*/
    if(s+2.609438 >= 5.0*z) goto S70;   /* 2.609438 matches 1+log(5) */
/*
     Step 3
*/
    t = log(z);
    if(s > t) goto S70;
/*
     Step 4 (reject and redraw if the test fails)
*/
    if(r+alpha*log(alpha/(b+w)) < t) goto S40;
S70:
/*
     Step 5 (the deviate depends on which parameter was assigned to a)
*/
    if(!(aa == a)) goto S80;
    genbet = w/(b+w);
    goto S90;
S80:
    genbet = b/(b+w);
S90:
    goto S230;
S100:
/*
     Algorithm BC
     Initialize (skipped when the parameters are unchanged)
*/
    if(qsame) goto S110;
    a = MAX(aa,bb);
    b = MIN(aa,bb);
    alpha = a+b;
    beta = 1.0/b;
    delta = 1.0+a-b;
    k1 = delta*(1.38889E-2+4.16667E-2*b)/(a*beta-0.777778);
    k2 = 0.25+(0.5+0.25/delta)*b;
S110:
S120:
    u1 = unifrand();
/*
     Step 1
*/
    u2 = unifrand();
    if(u1 >= 0.5) goto S130;
/*
     Step 2
*/
    y = u1*u2;
    z = u1*y;
    if(0.25*u2+z-y >= k1) goto S120;   /* Redraw */
    goto S170;
S130:
/*
     Step 3
*/
    z = pow(u1,2.0)*u2;
    if(!(z <= 0.25)) goto S160;
    v = beta*log(u1/(1.0-u1));
    if(!(v > expmax)) goto S140;
    w = infnty;               /* exp(v) not evaluated above expmax */
    goto S150;
S140:
    w = a*exp(v);
S150:
    goto S200;                /* Accepted without the Step 5 test */
S160:
    if(z >= k2) goto S120;    /* Redraw */
S170:
/*
     Step 4
     Step 5
*/
    v = beta*log(u1/(1.0-u1));
    if(!(v > expmax)) goto S180;
    w = infnty;               /* exp(v) not evaluated above expmax */
    goto S190;
S180:
    w = a*exp(v);
S190:
    if(alpha*(log(alpha/(b+w))+v)-1.3862944 < log(z)) goto S120;   /* Redraw */
S200:
/*
     Step 6 (the deviate depends on which parameter was assigned to a)
*/
    if(!(a == aa)) goto S210;
    genbet = w/(b+w);
    goto S220;
S210:
    genbet = b/(b+w);
S230:
S220:
    return genbet;
#undef expmax
#undef infnty
}
/*
--------------------------------------------------------------------------------

   ConfConf main - Monte-Carlo check of the confidence attached to
                   order-statistic thresholds and Kolmogorov-Smirnov bounds

   The pessimistic quantile bounds and KS thresholds are computed once
   (quantile_conf, inverse_ks).  Then nreps samples of ncases uniform
   deviates are drawn and the fraction of samples violating each bound is
   printed for comparison with conf.

   Change from the original: the malloc result is checked before use; the
   original wrote into the array without testing for NULL.

--------------------------------------------------------------------------------
*/

int main (
   int argc ,    // Number of command line arguments (includes prog name)
   char *argv[]  // Arguments (prog name is argv[0])
   )
{
   int i, ncases, irep, nreps, m, n_lower, n_upper, n_ks2, n_ks_null, n_ks_alt ;
   double *x, pval, conf, pessimistic_lower, pessimistic_upper ;
   double ks_two, ks_one, D, Dp, Dm ;

   if (argc != 5) {
      printf ( "\nUsage: ConfConf ncases pval conf nreps" ) ;
      printf ( "\n ncases - Number of cases in the sample" ) ;
      printf ( "\n pval - Probability value (<0.5) for quantile test" ) ;
      printf ( "\n conf - Desired confidence value (<0.5) for both tests" ) ;
      printf ( "\n nreps - Number of replications" ) ;
      exit ( 1 ) ;
      }

   ncases = atoi ( argv[1] ) ;
   pval = atof ( argv[2] ) ;
   conf = atof ( argv[3] ) ;
   nreps = atoi ( argv[4] ) ;

   if (ncases < 10) {
      printf ( "\nERROR.. Must have at least 10 cases" ) ;
      exit ( 1 ) ;
      }

   if (pval * ncases < 1.0 || pval >= 0.5) {
      printf ( "\nERROR.. Pval too small or too large" ) ;
      exit ( 1 ) ;
      }

   if (conf <= 0.0 || conf >= 0.5) {
      printf ( "\nERROR.. Conf must be greater than 0 and less than 0.5" ) ;
      exit ( 1 ) ;
      }

   if (nreps < 1) {
      printf ( "\nERROR.. Must have at least 1 replication" ) ;
      exit ( 1 ) ;
      }

/*
   Allocate memory and initialize
*/

   x = (double *) malloc ( ncases * sizeof(double) ) ;
   if (x == NULL) {
      printf ( "\nERROR.. Insufficient memory" ) ;
      exit ( 1 ) ;
      }

   m = (int) (pval * ncases) ;  // Conservative order statistic for bound
                                // (at least 1, per the pval check above)
   pessimistic_lower = quantile_conf ( ncases , m , conf ) ;
   pessimistic_upper = 1.0 - pessimistic_lower ;

   ks_two = inverse_ks ( ncases , 1.0 - conf ) ;        // Two-tailed test
   ks_one = inverse_ks ( ncases , 1.0 - 2.0 * conf ) ;  // One-tailed test

   printf ( "\nSuppose the model predicts values near 0 for the null hypothesis" ) ;
   printf ( "\nand values near 1 for the alternative hypothesis." ) ;

   printf ( "\n\nIf the dataset represents the null hypothesis, the threshold" ) ;
   printf ( "\nfor rejecting the null at p=%.4lf is given by the %d'th order statistic.",
            pval, ncases - m + 1 ) ;
   printf ( "\nThis is a conservative estimate of the %.4lf quantile", 1.0-pval ) ;
   printf ( "\nThere is only a %.4lf chance that it will really be the %.4lf quantile or worse.",
            conf, pessimistic_upper ) ;

   printf ( "\n\nIf the dataset represents the alternative hypothesis, the threshold" ) ;
   printf ( "\nfor rejecting the alt at p=%.4lf is given by the %d'th order statistic.",
            pval, m ) ;
   printf ( "\nThis is a conservative estimate of the %.4lf quantile", pval ) ;
   printf ( "\nThere is only a %.4lf chance that it will really be the %.4lf quantile or worse.",
            conf, pessimistic_lower) ;

   printf ( "\n\nKS thresholds: two-tailed KS = %.4lf one-tailed KS = %.4lf",
            ks_two, ks_one ) ;

/*
   Now generate nreps samples.  Verify that our required confidence level
   is observed.  Note that the fact that this test uses a uniform distribution
   does not in any way limit its applicability to uniform distributions.
   If one were to generate cases from any other reasonable distribution,
   the pessimistic quantile bounds would have to be transformed similarly.
   The result is that the inequalities below would pass or fail identically.

   We count the number of times 'disaster' happens.
   Disaster is when the order statistic used for the threshold is toward the
   inside (center) of the distribution, meaning that if this order statistic
   had been used as a threshold, more of the distribution would be outside
   the threshold than the user expected.  We expect disaster to happen with
   probability equal to the specified conf parameter.

   For the two-tailed Kolmogorov-Smirnov test, disaster is when the empirical
   CDF deviates (above or below) from the correct value by more than the
   conf-inspired value.

   For the one-tailed test in which the dataset is from the NULL distribution,
   disaster is when the empirical CDF exceeds the true CDF, a situation that
   would encourage false rejection of the null hypothesis.  This is measured
   by D+.

   For the one-tailed test in which the dataset is from the ALT distribution,
   disaster is when the empirical CDF is less than the true CDF, a situation
   that would encourage false rejection of the alternative hypothesis.
   This is measured by D-.
*/

   n_lower = n_upper = n_ks2 = n_ks_null = n_ks_alt = 0 ;

   for (irep=0 ; irep<nreps ; irep++) {

      for (i=0 ; i<ncases ; i++)
         x[i] = unifrand () ;

      qsortd ( 0 , ncases-1 , x ) ;   // Order statistics are read below

      if (x[m-1] > pessimistic_lower)
         ++n_lower ;

      if (x[ncases-m] < pessimistic_upper)
         ++n_upper ;

      D = ks_test ( ncases , x , &Dp , &Dm ) ;
      if (D > ks_two)
         ++n_ks2 ;
      if (Dp > ks_one)
         ++n_ks_null ;
      if (Dm > ks_one)
         ++n_ks_alt ;
      }

   printf ( "\nPoint failure (expected=%.4lf) Lower=%.4lf Upper=%.4lf", conf,
            (double) n_lower / nreps, (double) n_upper / nreps) ;
   printf ( "\nKS failure: two-tailed = %.4lf NULL = %.4lf ALT = %.4lf",
            (double) n_ks2 / nreps, (double) n_ks_null / nreps, (double) n_ks_alt / nreps) ;

   free ( x ) ;
   return ( 0 ) ;
}
int main ( int argc , // Number of command line arguments (includes prog name) char *argv[] // Arguments (prog name is argv[0]) ) { int i, j, k, nsamps, ntries, itype, divisor, itry, npart ; int isplit, nsplits, splits[10], nmiss ; short int *xbins, *ybins ; double param, ptie, *x, *y, x1, x2, result, prior_x1, p, sum, marg1, marg2 ; double ent, denom, cond, low0, low1, high0, high1, missfrac ; double right, wrong0, wrong1, cut0, cut1, cut2, cut3, cut4 ; double correctMI[10], total[10], bias[10], std_err[10] ; double lower0[10], upper0[10], lower1[10], upper1[10], miss[10] ; double outside0[10], outside1[10] ; MutualInformationDiscrete *mi ; /* Process command line parameters */ #if 1 if (argc != 6) { printf ( "\nUsage: TEST_DIS nsamples ntries type parameter ptie" ) ; printf ( "\n nsamples - Number of cases in the dataset" ) ; printf ( "\n ntried - Number of Monte-Carlo replications" ) ; printf ( "\n type - Type of test" ) ; printf ( "\n 0=bivariate normal with specified correlation" ) ; printf ( "\n 1=discrete bins with uniform error distribution" ) ; printf ( "\n 2=discrete bins with triangular error distribution" ) ; printf ( "\n 3=discrete bins with cyclic error distribution" ) ; printf ( "\n 4=discrete bins with attractive class error distribution" ) ; printf ( "\n parameter - Depends on type of test" ) ; printf ( "\n 0 - Correlation" ) ; printf ( "\n >0 - error probability" ) ; printf ( "\n ptie - If typ=0, probability of a tied case, else ignored" ) ; exit ( 1 ) ; } nsamps = atoi ( argv[1] ) ; ntries = atoi ( argv[2] ) ; itype = atoi ( argv[3] ) ; param = atof ( argv[4] ) ; ptie = atof ( argv[5] ) ; #else nsamps = 1000 ; ntries = 10000 ; itype = 2 ; param = 0.2 ; ptie = 0.0 ; #endif if ((nsamps <= 0) || (ntries <= 0) || (param < 0.0) || (param > 1.0) || (itype < 0) || (itype > 4) || (ptie < 0.0) || (ptie > 1.0)) { printf ( "\nUsage: TEST_DIS nsamples ntries type parameter ptie" ) ; exit ( 1 ) ; } if (itype > 0) { if (param > 0.5) { printf ( "\nNOTE... 
Reducing P(error) from %.4lf to 0.5", param ) ; printf ( "\nPress any key..." ) ; param = 0.5 ; _getch () ; } if (param == 0.0) // Prevent numerical problems param = 1.e-14 ; if (param == 1.0) param = 1.0 - 1.e-14 ; } /* Allocate memory and initialize */ divisor = ntries / 100 ; // This is for progress reports only if (divisor < 1) divisor = 1 ; x = (double *) malloc ( nsamps * sizeof(double) ) ; assert ( x != NULL ) ; y = (double *) malloc ( nsamps * sizeof(double) ) ; assert ( y != NULL ) ; xbins = (short int *) malloc ( nsamps * sizeof(short int) ) ; assert ( xbins != NULL ) ; ybins = (short int *) malloc ( nsamps * sizeof(short int) ) ; assert ( ybins != NULL ) ; /* Compute the different numbers of splits We increase them by doubling from 2, except that two bins causes various problems with the bound algorithms. So we increase the fist to 3 bins. */ splits[0] = 2 ; for (nsplits=1 ; nsplits<10 ; nsplits++) { if (nsamps / splits[nsplits-1] < 5) break ; splits[nsplits] = splits[nsplits-1] * 2 ; } splits[0] = 3 ; /* -------------------------------------------------------------------------------- Compute the correct mutual information according to the type -------------------------------------------------------------------------------- */ /* Bivariate normal */ if (itype == 0) { for (i=0 ; i<10 ; i++) correctMI[i] = -0.5 * log ( 1.0 - param * param ) ; } /* Errors are uniformly distributed to all possible error bins */ else if (itype == 1) { // Uniform error distribution for (i=0 ; i<nsplits ; i++) { j = splits[i] ; // Number of bins p = 1.0 - param ;// Probability of a correct decision p /= j ; // Probability of a given bin being chosen and correct // This is the diagonal of the confusion matrix sum = j * p * log(p*j*j) ; // Diagonal p = param ; // Probability of error (off diagonal) p /= j * (j-1) ; // Probability of a given bin being chosen and wrong // This is the off-diagonal elements sum += j * (j-1) * p * log(p*j*j) ; correctMI[i] = sum ; } } /* 90% of errors 
go in the upper triangle, 10% in the lower triangle */ else if (itype == 2) { // Triangular error distribution for (isplit=0 ; isplit<nsplits ; isplit++) { npart = splits[isplit] ; // Number of bins right = (1.0 - param) / npart ; wrong0 = 0.1 * param / (npart * (npart-1) / 2) ; wrong1 = 0.9 * param / (npart * (npart-1) / 2) ; sum = 0.0 ; for (i=0 ; i<npart ; i++) { marg1 = right + i * wrong0 + (npart - 1 - i) * wrong1 ; marg2 = right + (npart - 1 - i) * wrong0 ; for (j=0 ; j<npart ; j++) { if (j < i) sum += wrong0 * log(wrong0/(marg1*marg2)) ; else if (j == i) sum += right * log(right/(marg1*marg2)) ; else sum += wrong1 * log(wrong1/(marg1*marg2)) ; marg2 += wrong1 - wrong0 ; } } correctMI[isplit] = sum ; } } /* Half of the errors go one bin to the right of the correct bin, and the other half go two bins to the right (with wraparound) */ else if (itype == 3) { // itype=3; Cyclic error distribution for (i=0 ; i<nsplits ; i++) { j = splits[i] ; // Number of bins p = 1.0 - param ;// Probability of a correct decision p /= j ; // Probability of a given bin being chosen and correct // This is the diagonal of the confusion matrix sum = j * p * log(p*j*j) ; // Diagonal p = param ; // Probability of error (off diagonal) p /= 2 * j ; // Probability of this adjacent bin being chosen and wrong sum += 2 * j * p * log(p*j*j) ; correctMI[i] = sum ; } } /* This is a really complicated test of a couple classes being unnaturally attractive. For the first nbins-2 true classes, most of the errors go to the last (rightmost) class, and the rest of the errors go to the second-last class. All other members of the row are zero. For the second-last true class, most of the errors go to the last class, and the few remaining errors are evenly distributed across the remaining classes. For the last true class, all errors (and it just has a few) are evenly distributed across the other classes. 
This tests what happens when most of the errors land in a single class, and most of the remaining errors land in a different single class. */ else if (itype == 4) { // itype=4; Attractive class error distribution for (isplit=0 ; isplit<nsplits ; isplit++) { npart = splits[isplit] ; // Number of bins right = (1.0 - param) / npart ; wrong0 = 0.05 * param / ((npart-1) + 2 * (npart-2)) ; wrong1 = 0.95 * param / (npart-1) ; sum = 0.0 ; for (i=0 ; i<npart ; i++) { if (i < npart-2) { marg1 = right + wrong0 + wrong1 ; marg2 = right + 2 * wrong0 ; sum += right * log(right/(marg1*marg2)) ; marg2 = right + (npart-1) * wrong0 ; sum += wrong0 * log(wrong0/(marg1*marg2)) ; marg2 = right + (npart-1) * wrong1 ; sum += wrong1 * log(wrong1/(marg1*marg2)) ; } else if (i == npart-2) { marg1 = right + (npart-2) * wrong0 + wrong1 ; marg2 = right + 2.0 * wrong0 ; sum += (npart-2) * wrong0 * log(wrong0/(marg1*marg2)) ; marg2 = right + (npart-1) * wrong0 ; sum += right * log(right/(marg1*marg2)) ; marg2 = right + (npart-1) * wrong1 ; sum += wrong1 * log(wrong1/(marg1*marg2)) ; } else { marg1 = right + (npart-3) * wrong0 ; marg2 = right + 2.0 * wrong0 ; sum += (npart-2) * wrong0 * log(wrong0/(marg1*marg2)) ; marg2 = right + (npart-1) * wrong0 ; sum += wrong0 * log(wrong0/(marg1*marg2)) ; marg2 = right + (npart-1) * wrong1 ; sum += right * log(right/(marg1*marg2)) ; } } correctMI[isplit] = sum ; } } /* Main outer loop does all tries */ for (i=0 ; i<nsplits ; i++) total[i] = bias[i] = std_err[i] = lower0[i] = upper0[i] = lower1[i] = upper1[i] = miss[i] = outside0[i] = outside1[i] = 0.0 ; for (itry=1 ; itry<=ntries ; itry++) { if (((itry-1) % divisor) == 0) printf ( "\n\n\nTry %d of %d", itry, ntries ) ; if (itype == 0) { // If bivariate normal, generate the data prior_x1 = 0.5 ; // Arbitrary for (i=0 ; i<nsamps ; i++) { // Create bivariate sample with known correlation if (unifrand() < ptie) // Duplicate the prior observation for a tie? 
x1 = prior_x1 ; else { x1 = normal () ; prior_x1 = x1 ; } x2 = normal () ; if (i < nsamps/2) { // Equally split ties between X and Y x[i] = x1 ; y[i] = param * x1 + sqrt ( 1.0 - param * param ) * x2 ; } else { y[i] = x1 ; x[i] = param * x1 + sqrt ( 1.0 - param * param ) * x2 ; } } } for (isplit=0 ; isplit<nsplits ; isplit++) { if (itype == 0) { // Bivariate normal npart = splits[isplit] ; partition ( nsamps , x , &npart , NULL , xbins ) ; npart = splits[isplit] ; partition ( nsamps , y , &npart , NULL , ybins ) ; } else if (itype == 1) { // Uniform error distribution for (i=0 ; i<nsamps ; i++) x[i] = unifrand () ; npart = splits[isplit] ; partition ( nsamps , x , &npart , NULL , xbins ) ; for (j=0 ; j<nsamps ; j++) { if (unifrand() < param) { for (;;) { // This is an error ybins[j] = (short int) (0.999999999999 * unifrand() * npart) ; if (xbins[j] != ybins[j]) // Must not accidentally be right! break ; } } else // This is correct ybins[j] = xbins[j] ; } } else if (itype == 2) { // Triangular error distribution npart = splits[isplit] ; right = (1.0 - param) / npart ; wrong0 = 0.1 * param / (npart * (npart-1) / 2) ; // Lower triangle wrong1 = 0.9 * param / (npart * (npart-1) / 2) ; // Upper triangle for (j=0 ; j<nsamps ; j++) { cut0 = right + (npart-1) * wrong0 ; p = unifrand () ; for (k=0 ; k<npart ; k++) { if (p < cut0 || k == npart-1) { ybins[j] = k ; cut1 = k * wrong1 ; cut2 = cut1 + right ; cut3 = (npart-k-1) * wrong0 ; p = unifrand () * (cut2 + cut3) ; if (p < cut1) { i = (int) (p / cut1 * k) ; xbins[j] = i ; } else if (p < cut2) xbins[j] = k ; else { i = k + (int) (((p - cut2) / cut3) * (npart-k)) ; xbins[j] = i ; } break ; } cut0 += right + (k+1) * wrong1 + (npart-k-2) * wrong0 ; } } } else if (itype == 3) { // itype == 3 (Cyclic error distribution) for (i=0 ; i<nsamps ; i++) x[i] = unifrand () ; npart = splits[isplit] ; partition ( nsamps , x , &npart , NULL , xbins ) ; for (j=0 ; j<nsamps ; j++) { if (unifrand() < param) { if (unifrand() < 0.5) ybins[j] = 
(short int) ((xbins[j]+1) % splits[isplit]) ; else ybins[j] = (short int) ((xbins[j]+2) % splits[isplit]) ; } else ybins[j] = xbins[j] ; } } else if (itype == 4) { // itype == 4 (attractive class error distribution) npart = splits[isplit] ; right = (1.0 - param) / npart ; wrong0 = 0.05 * param / ((npart-1) + 2 * (npart-2)) ; wrong1 = 0.95 * param / (npart-1) ; cut0 = (npart-2) * (right + wrong0 + wrong1) ; cut1 = cut0 + (npart-2) * wrong0 + right + wrong1 ; cut2 = (npart-2) * wrong0 ; cut3 = cut2 + right ; cut4 = (npart-1) * wrong0 ; for (j=0 ; j<nsamps ; j++) { p = unifrand () ; if (p < cut0) { k = (int) (p / cut0 * (npart-2)) ; xbins[j] = k ; p = unifrand () * (right + wrong0 + wrong1) ; if (p < right) ybins[j] = k ; else if (p < right + wrong0) ybins[j] = npart-2 ; else ybins[j] = npart-1 ; } else if (p < cut1) { xbins[j] = npart-2 ; p = unifrand () * (cut3 + wrong1) ; if (p < cut2) { k = (int) (p / cut2 * (npart-2)) ; ybins[j] = k ; } else if (p < cut3) ybins[j] = npart-2 ; else ybins[j] = npart-1 ; } else { xbins[j] = npart-1 ; p = unifrand () * (cut4 + right) ; if (p < cut4) { k = (int) (p / cut4 * (npart-1)) ; ybins[j] = k ; } else ybins[j] = npart-1 ; } } } /* Create the MutualInformation object. Count errors. 
This is used only for type 0 (bivariate normal) */ mi = new MutualInformationDiscrete ( nsamps , ybins ) ; assert ( mi != NULL ) ; nmiss = 0 ; for (j=0 ; j<nsamps ; j++) { if (xbins[j] != ybins[j]) ++nmiss ; } missfrac = (double) nmiss / (double) nsamps ; miss[isplit] += missfrac ; /* Compute the mutual information, Y entropy, and conditional entropy Tally the mean mutual information and bias and standard error */ result = mi->mut_inf ( xbins ) ; // Mutual information ent = mi->entropy () ; // Y entropy cond = ent - result ; // Conditional entropy H(Y|X) // printf ( "\n\nent=%.5lf cond=%.5lf MI=%.5lf hPe=%.5lf cond_err=%.5lf", /*!!!!!!*/ // ent, cond, result, mi->hPe(xbins), mi->conditional_error(xbins)) ; /*!!!!!!*/ // printf ( "\nnumer0=%.5lf numer1=%.5lf lo den=%.5lf hi den=%.5lf", /*!!!!!!*/ // cond - log(2.0), cond - mi->conditional_error ( xbins ), // log(splits[isplit]-1.0), mi->HYe ( xbins )) ; /*!!!!!!*/ total[isplit] += result ; bias[isplit] += result - correctMI[isplit] ; std_err[isplit] += (result - correctMI[isplit]) * (result - correctMI[isplit]) ; /* Compute loose and tight lower and upper bounds */ low0 = (cond - log(2.0)) / log ( splits[isplit] - 1.0 ) ; low1 = (cond - mi->conditional_error ( xbins )) / log ( splits[isplit] - 1.0 ) ; denom = mi->HYe ( xbins ) + 1.e-30 ; high0 = cond / denom ; high1 = (cond - mi->conditional_error ( xbins )) / denom ; /* Don't allow nonsense lower bound Don't go beyond what a naive classifier based on equal priors could do */ if (low0 < 0.0) low0 = 0.0 ; if (high0 > 1.0 - 1.0 / splits[isplit]) high0 = 1.0 - 1.0 / splits[isplit] ; if (high1 > 1.0 - 1.0 / splits[isplit]) high1 = 1.0 - 1.0 / splits[isplit] ; /* In rare pathological cases, the limit we just did to the high bound may pull it under the low bound. Prevent this from happening. */ if (low0 > high0) low0 = high0 ; if (low1 > high1) low1 = high1 ; /* Cumulate the mean of the bounds. 
*/ lower0[isplit] += low0 ; lower1[isplit] += low1 ; upper0[isplit] += high0 ; upper1[isplit] += high1 ; /* Count how many times the true population value is outside the computed bounds. Note that I am not aware of any way of computing the expected error rate for a bivariate normal, so for the sake of doing something, I use the obtained error rate. Ultimately this will be very good, because it will be a reasonably good, asymptotically unbiased Monte-Carlo estimator. Unfortunately, it will take a while to get there, and meanwhile outages may be counted. Expect the outage count to be somewhat inaccurate. A better program would not start counting outages until after many replications, thus allowing the Monte-Carlo error rate to decently converge before being used. */ if (itype == 0) { // I'm not aware of any simple way to compute true error rate if (missfrac < low0 || missfrac > high0) ++outside0[isplit] ; if (missfrac < low1 || missfrac > high1) ++outside1[isplit] ; } else { // We know the true error rate, because it was specified if (param < low0 || param > high0) ++outside0[isplit] ; if (param < low1 || param > high1) ++outside1[isplit] ; } delete mi ; } // For all splits /* Print intermediate results to keep the user happy */ if ((((itry-1) % divisor) == 0) || (itry == ntries) ) { // Don't do this every try! Too slow. if (itry == ntries) printf ( "\n\nFinal... n=%d reps=%d type=%d param=%.4lf ptie=%.4lf\n", nsamps, ntries, itype, param, ptie) ; printf ( "\nSplits size Est. MI True MI Bias StdErr Lower Upper Outside" ) ; for (i=0 ; i<nsplits ; i++) { printf ( "\n%4d %6d %8.4lf %8.4lf %7.4lf %7.4lf %8.4lf %7.4lf %6.3lf", splits[i], nsamps/splits[i], total[i]/itry, correctMI[i], bias[i]/itry, sqrt ( std_err[i]/itry ), lower0[i]/itry, upper0[i]/itry, outside0[i]/itry ) ; printf ( "\n %8.4lf %7.4lf %6.3lf", lower1[i]/itry, upper1[i]/itry, outside1[i]/itry ) ; } } if (_kbhit ()) { // Has the user pressed a key? if (_getch() == 27) // The ESCape key? 
break ; } } // For all tries free ( x ) ; free ( y ) ; free ( xbins ) ; free ( ybins ) ; return EXIT_SUCCESS ; }