void get_windowed_ins_probs2(int win_length) { // Allocate two arrays for ins. prob over each window of length win_length over each sequence. double* winned_ins_probs1 = (double*)malloc(sizeof(double) * global_aln_info.length1 - win_length + 1); double* winned_ins_probs2 = (double*)malloc(sizeof(double) * global_aln_info.length2 - win_length + 1); char win_ins_prob_file_name[100]; sprintf(win_ins_prob_file_name, "win_ins_probs1_%d.txt", win_length); FILE* win_ins_prob_file = fopen(win_ins_prob_file_name, "w"); for(int cnt1 = 1; cnt1 < global_aln_info.length1 - win_length + 1; cnt1++) { // cnt1 is the index of starting of current window. winned_ins_probs1[cnt1] = LOG_OF_ZERO; // cnt2 is the index in second sequence where window is betweenm i.e., // the window is assumed to be inserted between cnt2 and cnt2 + 1. // however I do not put the constraint that cnt1-1 is not aligned to cnt2 and cnt2 is not aligned to // cnt1 + win_length + 1. for(int cnt2 = 1; cnt2 < global_aln_info.length2 + 1; cnt2++) { // cur_win_ins_prob includes probability of all enumerations of alignments where // currently set window is inserted in first sequences. double cur_win_ins_prob = xlog_mul(global_aln_info.fore_hmm_array->probs[cnt1][cnt2][STATE_INS1], global_aln_info.back_hmm_array->probs[cnt1 + win_length][cnt2][STATE_INS1]); // Calculate the emission probability of inserted sequence by going over all inserted sequence. // emit the nucleotides in the inserted sequence is sequence 1. double int_emit_prob = xlog(1); for(int emit_cnt = cnt1 + 1; emit_cnt <= cnt1 + win_length; emit_cnt++) { double trans_emit_prob; if(emit_cnt == cnt1 + win_length) { trans_emit_prob = get_trans_emit_prob(STATE_INS1, STATE_INS1, emit_cnt, cnt2); } else { trans_emit_prob = get_trans_emit_prob(STATE_INS1, STATE_INS1, emit_cnt, cnt2); } int_emit_prob = xlog_mul(int_emit_prob, trans_emit_prob); } cur_win_ins_prob = xlog_mul(cur_win_ins_prob, int_emit_prob); winned_ins_probs1[cnt1] = xlog_sum(winned_ins_probs1[cnt1], cur_win_ins_prob); } fprintf(win_ins_prob_file, "%f ", xlog_div(winned_ins_probs1[cnt1], global_aln_info.op_prob)); } fprintf(win_ins_prob_file, "\n"); fclose(win_ins_prob_file); }
// Backend function for computing alignment envelope. t_aln_env_result* t_phmm_aln::compute_alignment_envelope(int aln_env_type, t_pp_result* _pp_result, double log_threshold, int par) { if(_DUMP_ALN_ENV_UTILS_MESSAGES_) printf("Computing alignment envelope...\n"); // if pp_result is not supplied, recompute it. t_pp_result* pp_result = NULL; if(_pp_result == NULL) { pp_result = this->compute_posterior_probs(); } else { pp_result = _pp_result; } // alignment envelope type affects how the limits are set. // Limit indices are 1 based. int* low_limits = (int*)malloc(sizeof(int) * (this->l1() + 2)); int* high_limits = (int*)malloc(sizeof(int) * (this->l1() + 2)); // Initialize loop limits. for(int i = 0; i <= this->l1(); i++) { low_limits[i] = 0; high_limits[i] = 0; } if(aln_env_type == PROB_ALN_ENV) { // Compute alignment envelope. if(_DUMP_ALN_ENV_UTILS_MESSAGES_) printf("Allocating alignment envelope...\n"); bool** aln_env = (bool**)malloc((this->l1() + 1) * sizeof(bool*)); double n_aln_env_bytes = 0.0f; for(int i = 0; i <= this->l1(); i++) { int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size); int high_k = t_phmm_array::high_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size); aln_env[i] = (bool*)malloc((high_k - low_k + 1) * sizeof(bool)); n_aln_env_bytes += ((high_k - low_k + 1) * sizeof(bool)); aln_env[i] -= low_k; } if(_DUMP_ALN_ENV_UTILS_MESSAGES_) printf("Allocated %lf bytes for alignment envelope.\n", n_aln_env_bytes); if(_DUMP_ALN_ENV_UTILS_MESSAGES_) printf("Computing alignment envelope from probability planes.\n"); for(int i = 0; i <= this->l1(); i++) { int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size); int high_k = t_phmm_array::high_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size); for(int k = low_k; k <= high_k; k++) { //printf("(%d, %d): %f, %f\n", cnt1, cnt2, xlog_div(global_aln_info.aln_probs[cnt1][cnt2], global_aln_info.op_prob), log_threshold); double ins1_prob = pp_result->ins1_probs[i][k]; double ins2_prob = pp_result->ins2_probs[i][k]; double aln_prob = pp_result->aln_probs[i][k]; double three_plane_sum = xlog_sum(ins1_prob, xlog_sum(ins2_prob, aln_prob)); if(three_plane_sum < log_threshold) { aln_env[i][k] = false; } else { aln_env[i][k] = true; } } } //FILE* f_aln_env = fopen("aln_env.txt", "w"); //for(int i = 0; i <= this->l1(); i++) //{ // int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size); // int high_k = t_phmm_array::high_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size); // for(int k = low_k; k <= high_k; k++) // { // fprintf(f_aln_env, "%d ", aln_env[i][k]); // } // k loop // fprintf(f_aln_env, "\n"); //} // i loop //fclose(f_aln_env); if(_DUMP_ALN_ENV_UTILS_MESSAGES_) printf("Validating alignment envelope connectivity...\n"); // If alignment envelope is not connected, return NULL. if(!this->check_connection(aln_env)) { printf("Alignment envelope not connected.\n"); // If pp_result is allocated, free it. if(_pp_result == NULL) { this->free_pp_result(pp_result); } // Free the limits. free(low_limits); free(high_limits); // Free aln. env. since it is of no use any more. for(int i = 0; i <= this->l1(); i++) { int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size); aln_env[i] += low_k; free( aln_env[i] ); } free(aln_env); return(NULL); } if(_DUMP_ALN_ENV_UTILS_MESSAGES_) printf("Pruning alignment envelope...\n"); // Calculate pruned alignment envelope and set it to global_aln_info's alignment envelope, // calculate also the size of alignment envelope. //#define _PRUNE_ALN_ //#ifdef _PRUNE_ALN_ bool** pruned_aln_env = this->prune_aln_env(aln_env); //#else // copy_aln_env(aln_env); //#endif if(_DUMP_ALN_ENV_UTILS_MESSAGES_) printf("Releasing alignment envelope memory.\n"); // Free aln. env. since it is of no use any more. for(int i = 0; i <= this->l1(); i++) { int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size); aln_env[i] += low_k; free( aln_env[i] ); } free(aln_env); if(_DUMP_ALN_ENV_UTILS_MESSAGES_) printf("Computing loop limits.\n"); // Compute the loop limits. for(int i = 1; i <= this->l1(); i++) { int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size); int high_k = t_phmm_array::high_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size); for(int k = low_k; k <= high_k; k++) { if(pruned_aln_env[i][k]) { //fprintf(ll_file, "%d ", cnt2); // Dump low limit. low_limits[i] = k; break; } } for(int k = high_k; k >= low_k; k--) { if(pruned_aln_env[i][k]) { //fprintf(ll_file, "%d", cnt2); // Dump high limit. high_limits[i] = k; break; } } } // loop limit computation loop. // Free pruned aln. env. since it is of no use any more. if(_DUMP_ALN_ENV_UTILS_MESSAGES_) printf("Releasing pruned alignment envelope memory.\n"); for(int i = 1; i <= this->l1(); i++) { int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size); pruned_aln_env[i] += low_k; free(pruned_aln_env[i]); } free(pruned_aln_env); } // PROB_ALN_ENV else if(aln_env_type == BANDED_ALN_ENV) { // par argument contains band size. double band_size = (double)par; double floating_N1 = (double)this->l1(); double floating_N2 = (double)this->l2(); // Initialize loop limits. for(double i = 1.0f; i <= this->l1(); i++) { low_limits[(int)i] = (int) MAX(0, ((i * floating_N2 / floating_N1) - band_size)); high_limits[(int)i] = (int) MIN(floating_N2, ((i * floating_N2 / floating_N1) + band_size)); if(_DUMP_ALN_ENV_UTILS_MESSAGES_) printf("%d -> (%d, %d)\n", (int)i, low_limits[(int)i], high_limits[(int)i]); } //exit(0); } // BANDED_ALN_ENV else if(aln_env_type == FULL_ALN_ENV) { // Initialize loop limits. for(double i = 0.0f; i <= this->l1(); i++) { low_limits[(int)i] = 0; high_limits[(int)i] = this->l2(); } } // FULL_ALN_ENV else if(aln_env_type == MANUAL_ALN_ENV) { this->load_map_limits_from_map("aln_map.txt", low_limits, high_limits); } else { printf("Invalid alignment envelope type: %d\n", aln_env_type); exit(0); } // switch according to selected alignment envelope type. low_limits[0] = low_limits[1]; high_limits[0] = high_limits[1]; // Set low limits with values 1 to 0, so that the initialized values can be recursed correctly. for(int i = 0; i <= this->l1(); i++) { if(low_limits[i] == 1) { low_limits[i] = 0; } } // Allocate and set aln_env_result. t_aln_env_result* aln_env_result = (t_aln_env_result*)malloc(sizeof(t_aln_env_result)); aln_env_result->high_limits = high_limits; aln_env_result->low_limits = low_limits; //aln_env_result->pp_result = pp_result; // Check for alignment constraints in the alignment envelope. this->check_ins1_ins2(aln_env_result); // Dump the probability planes. (all of it) if(_DUMP_ALN_ENV_UTILS_MESSAGES_) { FILE* f_aln_probs = open_f("aln_plane_probs", "wb"); FILE* f_ins1_probs = open_f("ins1_plane_probs", "wb"); FILE* f_ins2_probs = open_f("ins2_plane_probs", "wb"); for(int i1 = 1; i1 <= this->l1(); i1++) { int low_i2 = t_phmm_array::low_phmm_limit(i1, l1(), l2(), this->phmm_band_constraint_size); int high_i2 = t_phmm_array::high_phmm_limit(i1, l1(), l2(), this->phmm_band_constraint_size); for(int i2 = low_i2; i2 <= high_i2; i2++) { if(pp_result->aln_probs[i1][i2] != xlog(0.0)) { double cur_aln_prob = pp_result->aln_probs[i1][i2]; fwrite(&i1, sizeof(int), 1, f_aln_probs); fwrite(&i2, sizeof(int), 1, f_aln_probs); fwrite(&cur_aln_prob, sizeof(double), 1, f_aln_probs); } if(pp_result->ins1_probs[i1][i2] != xlog(0.0)) { double cur_ins1_prob = pp_result->ins1_probs[i1][i2]; fwrite(&i1, sizeof(int), 1, f_ins1_probs); fwrite(&i2, sizeof(int), 1, f_ins1_probs); fwrite(&cur_ins1_prob, sizeof(double), 1, f_ins1_probs); } if(pp_result->ins2_probs[i1][i2] != xlog(0.0)) { double cur_ins2_prob = pp_result->ins2_probs[i1][i2]; fwrite(&i1, sizeof(int), 1, f_ins2_probs); fwrite(&i2, sizeof(int), 1, f_ins2_probs); fwrite(&cur_ins2_prob, sizeof(double), 1, f_ins2_probs); } } // i2 loop. } // i1 loop. fclose(f_aln_probs); fclose(f_ins1_probs); fclose(f_ins2_probs); FILE* f_lls = open_f("loop_limits.txt", "w"); // Dump the loop limits. for(int i = 0; i <= this->l1(); i++) { fprintf(f_lls, "%d %d %d\n", i, low_limits[i], high_limits[i]); } fclose(f_lls); } // message dump check. //printf("Dumping alignment map.\n"); //FILE* f_aln_map = open_f("aln_map.txt", "w"); //for(int i = 1; i <= this->l1(); i++) //{ // for(int j = 1; j <= this->l2(); j++) // { // if(j < low_limits[i]) // { // fprintf(f_aln_map, "0"); // } // else if(j <= high_limits[i]) // { // fprintf(f_aln_map, "1"); // } // else // { // fprintf(f_aln_map, "0"); // } // } // fprintf(f_aln_map, "\n"); //} //fclose(f_aln_map); if(_DUMP_ALN_ENV_UTILS_MESSAGES_) printf("Computed alignment envelope.\n"); for(int i = 2; i <= this->l1(); i++) { // fprintf(f_lls, "%d %d %d\n", i, low_limits[i], high_limits[i]); if(aln_env_result->low_limits[i] < aln_env_result->low_limits[i-1]) aln_env_result->low_limits[i] = aln_env_result->low_limits[i-1]; } for(int i = this->l1()-1; i >= 1; i--) { // fprintf(f_lls, "%d %d %d\n", i, low_limits[i], high_limits[i]); if(aln_env_result->high_limits[i] > aln_env_result->high_limits[i+1]) aln_env_result->high_limits[i] = aln_env_result->high_limits[i+1]; } return(aln_env_result); }
double* get_poisson_thresholds(t_chip_seq_chr_data* cur_chr_data, double target_fdr, int enrichment_mapped_fragment_length, int min_thresh, int max_thresh) { if(__DUMP_POISSON_BCKGRND_MSGS__) { fprintf(stderr, "Computing Poisson based thresholds %d,..., %d\n", min_thresh, max_thresh); } int n_wins = cur_chr_data->n_meg_wins(); double* thresholds_per_win = new double[n_wins+2]; for(int i_win = 0; i_win <= n_wins; i_win++) { //thresholds_per_win[i_win] = max_thresh; // Ensure that no peaks are selected at the initialization, this automatically handles the windows for which there is no data. thresholds_per_win[i_win] = 0; } // Count the number of reads per window. int* n_reads_per_window = get_n_reads_per_window(cur_chr_data->n_meg_wins(), cur_chr_data->chip_seq_fragments); // For each window, go over the threholds to find the threholds for which the probability is above 95%. for(int i_win = 0; i_win < n_wins; i_win++) { if(n_reads_per_window[i_win] == 0) { if(__DUMP_POISSON_BCKGRND_MSGS__) printf("There are no mapped reads in window %d\n", i_win); //getc(stdin); } if(cur_chr_data->n_uniquely_mappable_nucs_per_meg_win->at(i_win) == 0) { if(__DUMP_POISSON_BCKGRND_MSGS__) printf("There are no uniquely mappable nucleotides in window %d\n", i_win); //getc(stdin); } else { // This is the average of poisson distribution for the heights in this window. double avg_read_depth = (double)(n_reads_per_window[i_win] * enrichment_mapped_fragment_length) / (double)cur_chr_data->n_uniquely_mappable_nucs_per_meg_win->at(i_win); double lambda = avg_read_depth; double log_lambda = xlog(lambda); double log_target_cdf = xlog(1.0 - target_fdr); // Update the log_cdf value for thresholds smaller than min_threshold. double current_log_cdf = -1.0 * lambda; double current_factor = -1.0 * lambda; // This is the probability value differentially updated in CDF computation. for(int thresh = 1; thresh < min_thresh; thresh++) { double new_multiplier = xlog_div(log_lambda, xlog((double)thresh)); current_factor = xlog_mul(current_factor, new_multiplier); current_log_cdf = xlog_sum(current_log_cdf, current_factor); } // For thresholds between min and max thresholds, compare the value of (log)CDF with the (log)target FDR. bool found_thresh = false; for(int thresh = min_thresh; thresh <= max_thresh && !found_thresh; thresh++) { double new_multiplier = xlog_div(log_lambda, xlog((double)thresh)); current_factor = xlog_mul(current_factor, new_multiplier); current_log_cdf = xlog_sum(current_log_cdf, current_factor); if(current_log_cdf > log_target_cdf) { found_thresh = true; thresholds_per_win[i_win] = thresh; // Set the threshold in this window. } } // thresh loop. if(thresholds_per_win[i_win] > 0) { if(__DUMP_POISSON_BCKGRND_MSGS__) fprintf(stderr, "Window %d: %lf\n", i_win, thresholds_per_win[i_win]); } } // Check for positive number of reads in the window. } // i_win loop. delete [] n_reads_per_window; // Return the list of threshold arrays. return(thresholds_per_win); }