void get_windowed_ins_probs2(int win_length) { // Allocate two arrays for ins. prob over each window of length win_length over each sequence. double* winned_ins_probs1 = (double*)malloc(sizeof(double) * global_aln_info.length1 - win_length + 1); double* winned_ins_probs2 = (double*)malloc(sizeof(double) * global_aln_info.length2 - win_length + 1); char win_ins_prob_file_name[100]; sprintf(win_ins_prob_file_name, "win_ins_probs1_%d.txt", win_length); FILE* win_ins_prob_file = fopen(win_ins_prob_file_name, "w"); for(int cnt1 = 1; cnt1 < global_aln_info.length1 - win_length + 1; cnt1++) { // cnt1 is the index of starting of current window. winned_ins_probs1[cnt1] = LOG_OF_ZERO; // cnt2 is the index in second sequence where window is betweenm i.e., // the window is assumed to be inserted between cnt2 and cnt2 + 1. // however I do not put the constraint that cnt1-1 is not aligned to cnt2 and cnt2 is not aligned to // cnt1 + win_length + 1. for(int cnt2 = 1; cnt2 < global_aln_info.length2 + 1; cnt2++) { // cur_win_ins_prob includes probability of all enumerations of alignments where // currently set window is inserted in first sequences. double cur_win_ins_prob = xlog_mul(global_aln_info.fore_hmm_array->probs[cnt1][cnt2][STATE_INS1], global_aln_info.back_hmm_array->probs[cnt1 + win_length][cnt2][STATE_INS1]); // Calculate the emission probability of inserted sequence by going over all inserted sequence. // emit the nucleotides in the inserted sequence is sequence 1. double int_emit_prob = xlog(1); for(int emit_cnt = cnt1 + 1; emit_cnt <= cnt1 + win_length; emit_cnt++) { double trans_emit_prob; if(emit_cnt == cnt1 + win_length) { trans_emit_prob = get_trans_emit_prob(STATE_INS1, STATE_INS1, emit_cnt, cnt2); } else { trans_emit_prob = get_trans_emit_prob(STATE_INS1, STATE_INS1, emit_cnt, cnt2); } int_emit_prob = xlog_mul(int_emit_prob, trans_emit_prob); } cur_win_ins_prob = xlog_mul(cur_win_ins_prob, int_emit_prob); winned_ins_probs1[cnt1] = xlog_sum(winned_ins_probs1[cnt1], cur_win_ins_prob); } fprintf(win_ins_prob_file, "%f ", xlog_div(winned_ins_probs1[cnt1], global_aln_info.op_prob)); } fprintf(win_ins_prob_file, "\n"); fclose(win_ins_prob_file); }
double* get_poisson_thresholds(t_chip_seq_chr_data* cur_chr_data, double target_fdr, int enrichment_mapped_fragment_length, int min_thresh, int max_thresh) { if(__DUMP_POISSON_BCKGRND_MSGS__) { fprintf(stderr, "Computing Poisson based thresholds %d,..., %d\n", min_thresh, max_thresh); } int n_wins = cur_chr_data->n_meg_wins(); double* thresholds_per_win = new double[n_wins+2]; for(int i_win = 0; i_win <= n_wins; i_win++) { //thresholds_per_win[i_win] = max_thresh; // Ensure that no peaks are selected at the initialization, this automatically handles the windows for which there is no data. thresholds_per_win[i_win] = 0; } // Count the number of reads per window. int* n_reads_per_window = get_n_reads_per_window(cur_chr_data->n_meg_wins(), cur_chr_data->chip_seq_fragments); // For each window, go over the threholds to find the threholds for which the probability is above 95%. for(int i_win = 0; i_win < n_wins; i_win++) { if(n_reads_per_window[i_win] == 0) { if(__DUMP_POISSON_BCKGRND_MSGS__) printf("There are no mapped reads in window %d\n", i_win); //getc(stdin); } if(cur_chr_data->n_uniquely_mappable_nucs_per_meg_win->at(i_win) == 0) { if(__DUMP_POISSON_BCKGRND_MSGS__) printf("There are no uniquely mappable nucleotides in window %d\n", i_win); //getc(stdin); } else { // This is the average of poisson distribution for the heights in this window. double avg_read_depth = (double)(n_reads_per_window[i_win] * enrichment_mapped_fragment_length) / (double)cur_chr_data->n_uniquely_mappable_nucs_per_meg_win->at(i_win); double lambda = avg_read_depth; double log_lambda = xlog(lambda); double log_target_cdf = xlog(1.0 - target_fdr); // Update the log_cdf value for thresholds smaller than min_threshold. double current_log_cdf = -1.0 * lambda; double current_factor = -1.0 * lambda; // This is the probability value differentially updated in CDF computation. for(int thresh = 1; thresh < min_thresh; thresh++) { double new_multiplier = xlog_div(log_lambda, xlog((double)thresh)); current_factor = xlog_mul(current_factor, new_multiplier); current_log_cdf = xlog_sum(current_log_cdf, current_factor); } // For thresholds between min and max thresholds, compare the value of (log)CDF with the (log)target FDR. bool found_thresh = false; for(int thresh = min_thresh; thresh <= max_thresh && !found_thresh; thresh++) { double new_multiplier = xlog_div(log_lambda, xlog((double)thresh)); current_factor = xlog_mul(current_factor, new_multiplier); current_log_cdf = xlog_sum(current_log_cdf, current_factor); if(current_log_cdf > log_target_cdf) { found_thresh = true; thresholds_per_win[i_win] = thresh; // Set the threshold in this window. } } // thresh loop. if(thresholds_per_win[i_win] > 0) { if(__DUMP_POISSON_BCKGRND_MSGS__) fprintf(stderr, "Window %d: %lf\n", i_win, thresholds_per_win[i_win]); } } // Check for positive number of reads in the window. } // i_win loop. delete [] n_reads_per_window; // Return the list of threshold arrays. return(thresholds_per_win); }