void test1d_resample (void) { size_t i; int status = 0; gsl_histogram *h; gsl_ieee_env_setup (); h = gsl_histogram_calloc_uniform (10, 0.0, 1.0); gsl_histogram_increment (h, 0.1); gsl_histogram_increment (h, 0.2); gsl_histogram_increment (h, 0.2); gsl_histogram_increment (h, 0.3); { gsl_histogram_pdf *p = gsl_histogram_pdf_alloc (10); gsl_histogram *hh = gsl_histogram_calloc_uniform (100, 0.0, 1.0); gsl_histogram_pdf_init (p, h); for (i = 0; i < 100000; i++) { double u = urand(); double x = gsl_histogram_pdf_sample (p, u); gsl_histogram_increment (hh, x); } for (i = 0; i < 100; i++) { double y = gsl_histogram_get (hh, i) / 2500; double x, xmax; size_t k; double ya; gsl_histogram_get_range (hh, i, &x, &xmax); gsl_histogram_find (h, x, &k); ya = gsl_histogram_get (h, k); if (ya == 0) { if (y != 0) { printf ("%d: %g vs %g\n", (int) i, y, ya); status = 1; } } else { double err = 1 / sqrt (gsl_histogram_get (hh, i)); double sigma = fabs ((y - ya) / (ya * err)); if (sigma > 3) { status = 1; printf ("%g vs %g err=%g sigma=%g\n", y, ya, err, sigma); } } } gsl_histogram_pdf_free (p) ; gsl_histogram_free (hh); gsl_test (status, "gsl_histogram_pdf_sample within statistical errors"); } gsl_histogram_free (h); }
//Run the generator void RandPSSMGen::RunGenerator() { int c, i, j, k,l,m,q,w,z, curr_len; double curr_depth; double x, r; int zeros=0; double col_sum=0; double firstDraw, secondDraw, thirdDraw, sum; gsl_histogram* width_hist; gsl_histogram_pdf* width_pdf; gsl_histogram* depth_hist; gsl_histogram_pdf* depth_pdf; double invariant_cols[6]; double total_cols[6]; double invariant_prob[6]; double abszero_cells[6]; double total_cells[6]; double abszero_prob[6]; gsl_histogram* first_edge_hist; gsl_histogram_pdf* first_edge_pdf; gsl_histogram* first_inner_hist; gsl_histogram_pdf* first_inner_pdf; gsl_histogram* second_edge_hist[5]; gsl_histogram_pdf* second_edge_pdf[5]; gsl_histogram* second_inner_hist[5]; gsl_histogram_pdf* second_inner_pdf[5]; gsl_histogram* third_edge_hist[5]; gsl_histogram_pdf* third_edge_pdf[5]; gsl_histogram* third_inner_hist[5]; gsl_histogram_pdf* third_inner_pdf[5]; FILE* out; bool edge; double known_zeros=0, known_total=0; double new_zeros=0, new_total=0; out = fopen(outFN, "w"); if(out==NULL) { printf("Error: cannot open file named %s\n", outFN); exit(1); } //How many random matrices? printf("%d Matrices Will Be Generated\n", numRandomMats); //Read in the matrices printf("%d Matrices Read In\n", numMatrices); //1) The first step is to read in the width distribution width_hist = gsl_histogram_alloc(7); //7 places in the histogram double width_range[8] = {3, 5, 8, 10, 12, 14, 16, 25}; gsl_histogram_set_ranges(width_hist, width_range, 8); for(i=0; i<numMatrices; i++) {//Go through each matrix, adding size to histogram gsl_histogram_increment(width_hist, (double)matrices[i]->len); } width_pdf= gsl_histogram_pdf_alloc(7); gsl_histogram_pdf_init(width_pdf, width_hist); //1.1) Find the sequence depth distribution depth_hist = gsl_histogram_alloc(7); //20 places in the histogram double depth_range[8] = {0,5,10,20,40,80,160,1000}; gsl_histogram_set_ranges(depth_hist, depth_range, 8); for(i=0; i<numMatrices; i++) {//Go through each matrix, adding each column depth to histogram for(j=0; j<matrices[i]->len; j++){ double sum=0; for(k=0; k<B; k++){ sum += matrices[i]->n[j][k]; } gsl_histogram_increment(depth_hist, sum); } } depth_pdf = gsl_histogram_pdf_alloc(7); gsl_histogram_pdf_init(depth_pdf, depth_hist); //2) The second step is to find the probability of invariance given the position of the column //Also find the probability of an absolute zero (not including the invariant columns) for(i=0; i<6; i++) { invariant_cols[i]=0; total_cols[i]=0; abszero_cells[i]=0; total_cells[i]=0; } bool inv=false; for(i=0; i<numMatrices; i++) { curr_len = matrices[i]->len; for(j=0; j<curr_len; j++){ //Is the column invariant? inv = Invariant(matrices[i]->n[j], zeros); //What column are we in? z = WhatColumn(j, curr_len); total_cols[z]++; invariant_cols[z]+=inv; //Find zeros in a variable column if(!inv) { total_cells[z]+=4; abszero_cells[z]+=zeros; } known_total+=4; known_zeros+=zeros; } } for(i=0; i<6; i++){ invariant_prob[i]=invariant_cols[i]/total_cols[i]; abszero_prob[i]=abszero_cells[i]/total_cells[i]; } printf("Known Zeros: %lf\n", known_zeros/known_total); //3) Fill the First, Second, and Third Draw Histograms. first_edge_hist = gsl_histogram_alloc(5); gsl_histogram_set_ranges_uniform (first_edge_hist, 0.0001, 0.99999); first_inner_hist = gsl_histogram_alloc(5); gsl_histogram_set_ranges_uniform (first_inner_hist, 0.0001, 0.99999); for(i=0; i<5; i++){ second_edge_hist[i] = gsl_histogram_alloc(5); gsl_histogram_set_ranges_uniform(second_edge_hist[i], 0.0001, 0.99999); second_inner_hist[i] = gsl_histogram_alloc(5); gsl_histogram_set_ranges_uniform(second_inner_hist[i], 0.0001, 0.99999); } for(i=0; i<5; i++){ third_edge_hist[i] = gsl_histogram_alloc(5); gsl_histogram_set_ranges_uniform(third_edge_hist[i], 0.0001, 0.99999); third_inner_hist[i] = gsl_histogram_alloc(5); gsl_histogram_set_ranges_uniform(third_inner_hist[i], 0.0001, 0.99999); } for(i=0; i<numMatrices; i++) { curr_len = matrices[i]->len; for(j=0; j<curr_len; j++){ if(WhatColumn(j, curr_len)==0) edge=true; else edge=false; //Discard Invariant Columns if(!Invariant(matrices[i]->n[j], zeros)) { col_sum = SumColumn(matrices[i]->n[j]); for(k=0; k<B; k++) { //Update first draw distribution firstDraw =matrices[i]->n[j][k]; if(firstDraw!=0){//Discard Zeros if(edge) gsl_histogram_increment(first_edge_hist, firstDraw/col_sum); else gsl_histogram_increment(first_inner_hist, firstDraw/col_sum); } //Update second draw distribution for(l=0; l<B; l++){ if(l!=k) { secondDraw = matrices[i]->n[j][l]; if(secondDraw!=0) { if(edge) gsl_histogram_increment(second_edge_hist[(int)floor((firstDraw/col_sum)*5)], secondDraw/col_sum); else gsl_histogram_increment(second_inner_hist[(int)floor((firstDraw/col_sum)*5)], secondDraw/col_sum); } sum = secondDraw + firstDraw; //Update third draw distribution for(m=0; m<B; m++){ if(m!=k && m!=l) { thirdDraw = matrices[i]->n[j][m]; if(thirdDraw!=0) { if(edge) gsl_histogram_increment(third_edge_hist[(int)floor((sum/col_sum)*5)], thirdDraw/col_sum); else gsl_histogram_increment(third_inner_hist[(int)floor((sum/col_sum)*5)], thirdDraw/col_sum); } } } } } } } } } //Start the PDFs here first_edge_pdf= gsl_histogram_pdf_alloc(5); gsl_histogram_pdf_init(first_edge_pdf, first_edge_hist); first_inner_pdf= gsl_histogram_pdf_alloc(5); gsl_histogram_pdf_init(first_inner_pdf, first_inner_hist); for(i=0; i<5; i++) { second_edge_pdf[i]= gsl_histogram_pdf_alloc(5); gsl_histogram_pdf_init(second_edge_pdf[i], second_edge_hist[i]); second_inner_pdf[i]= gsl_histogram_pdf_alloc(5); gsl_histogram_pdf_init(second_inner_pdf[i], second_inner_hist[i]); } for(i=0; i<5; i++) { third_edge_pdf[i]= gsl_histogram_pdf_alloc(5); gsl_histogram_pdf_init(third_edge_pdf[i], third_edge_hist[i]); third_inner_pdf[i]= gsl_histogram_pdf_alloc(5); gsl_histogram_pdf_init(third_inner_pdf[i], third_inner_hist[i]); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// //////// All information gathered... generating random samples from here on in ///////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////// Motif* newPSSM = new Motif(31); for(z=0; z<numRandomMats; z++) { double r; int base; int first, second, third, fourth; //first step: pick a length r=((double)rand())/RAND_MAX; curr_len = (int)gsl_histogram_pdf_sample(width_pdf, r); if(curr_len>30){curr_len=30;} for(i=0; i<curr_len; i++) { //Generate one column at a time //Reset the column for(j=0; j<B; j++) newPSSM->f[i][j]=0; if(WhatColumn(i, curr_len)==0) edge=true; else edge=false; //Is the column variable? r=((double)rand())/RAND_MAX; c = WhatColumn(i, curr_len); if(r<invariant_prob[c]) { //The column has been chosen as invariant //Which base is invariant? r = ((double)rand())/RAND_MAX; if(r<0.285){base=0;} else if(r<0.57){base=3;} else if(r<0.785){base=1;} else{base=2;} newPSSM->f[i][base]=1; for(j=0; j<B; j++) { if(j!=base) newPSSM->f[i][j]=0; } }else{//the column has been chosen as variable sum=0; //Which base will be the focus of the first draw? first = rand()%B; //Is the first draw an absolute zero? r=((double)rand())/RAND_MAX; if(r<abszero_prob[WhatColumn(i, curr_len)]){//the cell is zero newPSSM->f[i][first]=0; }else{//the cell isn't zero //Sample from the first cell pdf r=((double)rand())/RAND_MAX; if(edge) newPSSM->f[i][first] = gsl_histogram_pdf_sample(first_edge_pdf, r); else newPSSM->f[i][first] = gsl_histogram_pdf_sample(first_inner_pdf, r); } sum+=newPSSM->f[i][first]; //Onto the second draw second=rand()%B; while(second==first) { second=rand()%B;} r=((double)rand())/RAND_MAX; if(r<abszero_prob[WhatColumn(i, curr_len)]){//the cell is zero newPSSM->f[i][second]=0; }else{//the cell isn't zero //Sample from the first cell pdf r=((double)rand())/RAND_MAX; if(edge) newPSSM->f[i][second] = gsl_histogram_pdf_sample(second_edge_pdf[(int)floor((sum)*5)], r); else newPSSM->f[i][second] = gsl_histogram_pdf_sample(second_inner_pdf[(int)floor((sum)*5)], r); } sum+=newPSSM->f[i][second]; //NORMALIZING! Check if anything is over 1 at this stage! if(sum>1) { newPSSM->f[i][first] = newPSSM->f[i][first]/sum; newPSSM->f[i][second] = newPSSM->f[i][second]/sum; sum=1; }else{ //Deal with the third draw here third=rand()%B; while(third==first || third==second) { third=rand()%B;} r=((double)rand())/RAND_MAX; if(r<abszero_prob[WhatColumn(i, curr_len)]){//the cell is zero newPSSM->f[i][third]=0; }else{//the cell isn't zero //Sample from the first cell pdf r=((double)rand())/RAND_MAX; if(edge) newPSSM->f[i][third] = gsl_histogram_pdf_sample(third_edge_pdf[(int)floor((sum)*5)], r); else newPSSM->f[i][third] = gsl_histogram_pdf_sample(third_inner_pdf[(int)floor((sum)*5)], r); } sum+=newPSSM->f[i][third]; //NORMALIZING! Check if anything is over 1 at this stage! if(sum>1) { newPSSM->f[i][first] = newPSSM->f[i][first]/sum; newPSSM->f[i][second] = newPSSM->f[i][second]/sum; newPSSM->f[i][third] = newPSSM->f[i][third]/sum; sum=1; }else{ //Deal with the last base here fourth=0; while(fourth==first||fourth==second||fourth==third) fourth++; newPSSM->f[i][fourth]=1-sum; } } } Invariant(newPSSM->f[i], zeros); new_total+=4; new_zeros+=zeros; } //PSSM Generated! //Convert to n's r=((double)rand())/RAND_MAX; curr_depth = gsl_histogram_pdf_sample(depth_pdf, r); if(curr_depth<5){curr_len=5;} for(q=0; q<curr_len; q++){ for(w=0; w<B; w++){ newPSSM->n[q][w] = ceil(newPSSM->f[q][w]*curr_depth); } } //Output in TRANSFAC format fprintf(out, "DE\tRAND%d\n", z); for(q=0; q<curr_len; q++){ fprintf(out, "%d\t%lf\t%lf\t%lf\t%lf\tX\n", q, newPSSM->n[q][0],newPSSM->n[q][1],newPSSM->n[q][2],newPSSM->n[q][3]); } fprintf(out, "XX\n"); } printf("New Zeros: %lf\n", new_zeros/new_total); /////////////////// Memory cleaning area /////////////////////////////////////////////////////////////////// delete newPSSM; gsl_histogram_free(width_hist); gsl_histogram_pdf_free(width_pdf); gsl_histogram_free(first_edge_hist); gsl_histogram_pdf_free(first_edge_pdf); gsl_histogram_free(first_inner_hist); gsl_histogram_pdf_free(first_inner_pdf); for(i=0; i<5; i++) { gsl_histogram_free(second_edge_hist[i]); gsl_histogram_pdf_free(second_edge_pdf[i]); gsl_histogram_free(second_inner_hist[i]); gsl_histogram_pdf_free(second_inner_pdf[i]); } for(i=0; i<5; i++) { gsl_histogram_free(third_edge_hist[i]); gsl_histogram_pdf_free(third_edge_pdf[i]); gsl_histogram_free(third_inner_hist[i]); gsl_histogram_pdf_free(third_inner_pdf[i]); } fclose(out); }