Ejemplo n.º 1
0
// For every window of win_length positions in the first sequence, accumulates
// (in log space, via the xlog_* helpers) the probability that the window is
// emitted as a run of STATE_INS1 insertions, summed over every position of the
// second sequence at which the run could be placed. Each window's value is
// divided (in log space) by global_aln_info.op_prob and written, space
// separated on a single line, to "win_ins_probs1_<win_length>.txt".
//
// win_length: number of consecutive positions of sequence 1 in a window.
//
// NOTE(review): despite the "2" in the name, only sequence 1 windows are
// processed here; the unused second-sequence buffer the original allocated
// (and leaked) has been removed.
void get_windowed_ins_probs2(int win_length)
{
	// Window start indices run over 1 .. length1 - win_length, so the buffer
	// needs (length1 - win_length + 1) entries to be indexable by cnt1 directly.
	const int n_slots = global_aln_info.length1 - win_length + 1;

	char win_ins_prob_file_name[100];
	snprintf(win_ins_prob_file_name, sizeof(win_ins_prob_file_name), "win_ins_probs1_%d.txt", win_length);
	FILE* win_ins_prob_file = fopen(win_ins_prob_file_name, "w");
	if(win_ins_prob_file == NULL)
	{
		fprintf(stderr, "Could not open %s for writing.\n", win_ins_prob_file_name);
		return;
	}

	// Only allocate when there is at least one window to process; the element
	// count is parenthesized so that sizeof scales the whole count.
	double* winned_ins_probs1 = NULL;
	if(n_slots > 1)
	{
		winned_ins_probs1 = (double*)malloc(sizeof(double) * (size_t)n_slots);
		if(winned_ins_probs1 == NULL)
		{
			fprintf(stderr, "Could not allocate windowed insertion probabilities.\n");
			fclose(win_ins_prob_file);
			return;
		}
	}

	for(int cnt1 = 1; cnt1 < n_slots; cnt1++)
	{
		// cnt1 is the index at which the current window starts.
		winned_ins_probs1[cnt1] = LOG_OF_ZERO;

		// cnt2 is the index in the second sequence where the window sits, i.e.
		// the window is assumed to be inserted between cnt2 and cnt2 + 1.
		// No constraint is imposed that cnt1 - 1 is not aligned to cnt2 or that
		// cnt2 is not aligned to cnt1 + win_length + 1.
		for(int cnt2 = 1; cnt2 < global_aln_info.length2 + 1; cnt2++)
		{
			// Forward score entering the window times backward score leaving it,
			// both taken in the INS1 state.
			double cur_win_ins_prob = xlog_mul(global_aln_info.fore_hmm_array->probs[cnt1][cnt2][STATE_INS1],
												global_aln_info.back_hmm_array->probs[cnt1 + win_length][cnt2][STATE_INS1]);

			// Accumulate the INS1 -> INS1 transition/emission terms for the
			// positions cnt1 + 1 .. cnt1 + win_length of the window. (The original
			// special-cased the last position with an identical call, so a single
			// call is equivalent.)
			double int_emit_prob = xlog(1);
			for(int emit_cnt = cnt1 + 1; emit_cnt <= cnt1 + win_length; emit_cnt++)
			{
				double trans_emit_prob = get_trans_emit_prob(STATE_INS1, STATE_INS1, emit_cnt, cnt2);
				int_emit_prob = xlog_mul(int_emit_prob, trans_emit_prob);
			}

			cur_win_ins_prob = xlog_mul(cur_win_ins_prob, int_emit_prob);

			winned_ins_probs1[cnt1] = xlog_sum(winned_ins_probs1[cnt1], cur_win_ins_prob);
		}

		fprintf(win_ins_prob_file, "%f ", xlog_div(winned_ins_probs1[cnt1], global_aln_info.op_prob));
	}

	fprintf(win_ins_prob_file, "\n");

	fclose(win_ins_prob_file);

	// free(NULL) is a no-op, so this is safe when no window was processed.
	free(winned_ins_probs1);
}
Ejemplo n.º 2
0
// Backend function for computing alignment envelope.
//
// Produces, for every index i of the first sequence (1 based, with index 0
// mirroring index 1), the lowest and highest second-sequence index that later
// computations are allowed to visit.
//
// Parameters:
//   aln_env_type  - PROB_ALN_ENV, BANDED_ALN_ENV, FULL_ALN_ENV or MANUAL_ALN_ENV.
//                   Any other value prints an error and terminates the process.
//   _pp_result    - posterior probabilities to use. When NULL they are computed
//                   here with compute_posterior_probs() and released again
//                   before this function returns.
//   log_threshold - PROB_ALN_ENV only: a cell is kept when the log-space sum of
//                   its aln/ins1/ins2 probabilities is not below this value.
//   par           - BANDED_ALN_ENV only: band half-width around the diagonal.
//
// Returns a malloc'ed t_aln_env_result whose low_limits/high_limits arrays are
// malloc'ed with l1() + 2 entries each, or NULL when the PROB_ALN_ENV envelope
// is not connected (in which case everything allocated here is released).
t_aln_env_result* t_phmm_aln::compute_alignment_envelope(int aln_env_type, 
														 t_pp_result* _pp_result, 
														 double log_threshold, 
														 int par)
{
	if(_DUMP_ALN_ENV_UTILS_MESSAGES_)
	{
		printf("Computing alignment envelope...\n");
	}

	// if pp_result is not supplied, recompute it.
	t_pp_result* pp_result = NULL;
	if(_pp_result == NULL)
	{
		pp_result = this->compute_posterior_probs();
	}
	else
	{
		pp_result = _pp_result;
	}

	// alignment envelope type affects how the limits are set.
	// Limit indices are 1 based.
	int* low_limits = (int*)malloc(sizeof(int) * (this->l1() + 2));
	int* high_limits = (int*)malloc(sizeof(int) * (this->l1() + 2));

	// Initialize loop limits. All l1() + 2 allocated entries are zeroed so that
	// no indeterminate value is read below (index 1 when l1() == 0) or handed
	// back to the caller (index l1() + 1).
	for(int i = 0; i <= this->l1() + 1; i++)
	{
		low_limits[i] = 0;
		high_limits[i] = 0;
	}

	if(aln_env_type == PROB_ALN_ENV)
	{
		// Compute alignment envelope.
		if(_DUMP_ALN_ENV_UTILS_MESSAGES_)
		{
			printf("Allocating alignment envelope...\n");
		}

		bool** aln_env = (bool**)malloc((this->l1() + 1) * sizeof(bool*));
		double n_aln_env_bytes = 0.0f;

		for(int i = 0; i <= this->l1(); i++)
		{
			int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size);
			int high_k = t_phmm_array::high_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size);
			aln_env[i] = (bool*)malloc((high_k - low_k + 1) * sizeof(bool));
			n_aln_env_bytes += ((high_k - low_k + 1) * sizeof(bool));

			// Shift the row pointer so that it can be indexed directly with k in
			// [low_k, high_k]; the shift is undone before the row is freed.
			aln_env[i] -= low_k;
		}

		if(_DUMP_ALN_ENV_UTILS_MESSAGES_)
		{
			printf("Allocated %lf bytes for alignment envelope.\n", n_aln_env_bytes);
		}

		if(_DUMP_ALN_ENV_UTILS_MESSAGES_)
		{
			printf("Computing alignment envelope from probability planes.\n");
		}

		for(int i = 0; i <= this->l1(); i++)
		{
			int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size);
			int high_k = t_phmm_array::high_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size);

			for(int k = low_k; k <= high_k; k++)
			{
				//printf("(%d, %d): %f, %f\n", cnt1, cnt2, xlog_div(global_aln_info.aln_probs[cnt1][cnt2], global_aln_info.op_prob), log_threshold);
				double ins1_prob = pp_result->ins1_probs[i][k];
				double ins2_prob = pp_result->ins2_probs[i][k];
				double aln_prob = pp_result->aln_probs[i][k];
				double three_plane_sum = xlog_sum(ins1_prob, xlog_sum(ins2_prob, aln_prob));

				// A cell belongs to the envelope when the summed probability of
				// the three planes reaches the threshold.
				aln_env[i][k] = !(three_plane_sum < log_threshold);
			}
		}

		//FILE* f_aln_env = fopen("aln_env.txt", "w");
		//for(int i = 0; i <= this->l1(); i++)
		//{
		//	int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size);
		//	int high_k = t_phmm_array::high_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size);

		//	for(int k = low_k; k <= high_k; k++)
		//	{
		//		fprintf(f_aln_env, "%d ", aln_env[i][k]);
		//	} // k loop

		//	fprintf(f_aln_env, "\n");
		//} // i loop
		//fclose(f_aln_env);

		if(_DUMP_ALN_ENV_UTILS_MESSAGES_)
		{
			printf("Validating alignment envelope connectivity...\n");
		}

		// If alignment envelope is not connected, return NULL.
		if(!this->check_connection(aln_env))
		{
			printf("Alignment envelope not connected.\n");

			// If pp_result is allocated, free it.
			if(_pp_result == NULL)
			{
				this->free_pp_result(pp_result);
			}

			// Free the limits.
			free(low_limits);
			free(high_limits);

			// Free aln. env. since it is of no use any more.
			for(int i = 0; i <= this->l1(); i++)
			{
				int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size);
				aln_env[i] += low_k;
				free( aln_env[i] );
			}	

			free(aln_env);	

			return(NULL);
		}

		if(_DUMP_ALN_ENV_UTILS_MESSAGES_)
		{
			printf("Pruning alignment envelope...\n");
		}

		// Calculate pruned alignment envelope and set it to global_aln_info's alignment envelope, 
		// calculate also the size of alignment envelope.
		//#define _PRUNE_ALN_
		//#ifdef _PRUNE_ALN_
		bool** pruned_aln_env = this->prune_aln_env(aln_env);
		//#else
		//	copy_aln_env(aln_env);
		//#endif

		if(_DUMP_ALN_ENV_UTILS_MESSAGES_)
		{
			printf("Releasing alignment envelope memory.\n");
		}

		// Free aln. env. since it is of no use any more.
		for(int i = 0; i <= this->l1(); i++)
		{
			int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size);
			aln_env[i] += low_k;
			free( aln_env[i] );
		}	

		free(aln_env);	

		if(_DUMP_ALN_ENV_UTILS_MESSAGES_)
		{
			printf("Computing loop limits.\n");
		}

		// Compute the loop limits: the first and the last k that survive pruning
		// in each row. Rows without any surviving cell keep their initial 0 limits.
		for(int i = 1; i <= this->l1(); i++)
		{
			int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size);
			int high_k = t_phmm_array::high_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size);

			for(int k = low_k; k <= high_k; k++)
			{
				if(pruned_aln_env[i][k])
				{
					//fprintf(ll_file, "%d ", cnt2); // Dump low limit.
					low_limits[i] = k;
					break;
				}
			}

			for(int k = high_k; k >= low_k; k--)
			{
				if(pruned_aln_env[i][k])
				{
					//fprintf(ll_file, "%d", cnt2); // Dump high limit.
					high_limits[i] = k;
					break;
				}
			}
		} // loop limit computation loop.

		// Free pruned aln. env. since it is of no use any more.
		if(_DUMP_ALN_ENV_UTILS_MESSAGES_)
		{
			printf("Releasing pruned alignment envelope memory.\n");
		}

		// NOTE(review): rows are released starting at index 1, unlike aln_env
		// which starts at 0 -- presumably prune_aln_env() allocates rows 1..l1()
		// only; confirm against its implementation.
		for(int i = 1; i <= this->l1(); i++)
		{
			int low_k = t_phmm_array::low_phmm_limit(i, l1(), l2(), this->phmm_band_constraint_size);
			pruned_aln_env[i] += low_k;
			free(pruned_aln_env[i]);
		}	

		free(pruned_aln_env);
	} // PROB_ALN_ENV
	else if(aln_env_type == BANDED_ALN_ENV)
	{
		// par argument contains band size.
		double band_size = (double)par;
		double floating_N1 = (double)this->l1();
		double floating_N2 = (double)this->l2();

		// Initialize loop limits: a band of +/- band_size around the scaled
		// diagonal i * N2 / N1, clamped to [0, N2].
		for(int i = 1; i <= this->l1(); i++)
		{
			double diag_pos = ((double)i * floating_N2 / floating_N1);

			low_limits[i] = (int) MAX(0, (diag_pos - band_size));
			high_limits[i] = (int) MIN(floating_N2, (diag_pos + band_size));			

			if(_DUMP_ALN_ENV_UTILS_MESSAGES_)
			{
				printf("%d -> (%d, %d)\n", i, low_limits[i], high_limits[i]);
			}
		}

		//exit(0);

	} // BANDED_ALN_ENV
	else if(aln_env_type == FULL_ALN_ENV)
	{
		// Initialize loop limits: every row spans the whole second sequence.
		for(int i = 0; i <= this->l1(); i++)
		{
			low_limits[i] = 0;
			high_limits[i] = this->l2();
		}
	} // FULL_ALN_ENV
	else if(aln_env_type == MANUAL_ALN_ENV)
	{
		this->load_map_limits_from_map("aln_map.txt", low_limits, high_limits);
	}
	else
	{
		printf("Invalid alignment envelope type: %d\n", aln_env_type);
		exit(0);
	} // switch according to selected alignment envelope type.

	// Index 0 mirrors index 1.
	low_limits[0] = low_limits[1];
	high_limits[0] = high_limits[1];

	// Set low limits with values 1 to 0, so that the initialized values can be recursed correctly.
	for(int i = 0; i <= this->l1(); i++)
	{
		if(low_limits[i] == 1)
		{
			low_limits[i] = 0;
		}
	}

	// Allocate and set aln_env_result.
	t_aln_env_result* aln_env_result = (t_aln_env_result*)malloc(sizeof(t_aln_env_result));
	aln_env_result->high_limits = high_limits;
	aln_env_result->low_limits = low_limits;
	//aln_env_result->pp_result = pp_result;

	// Check for alignment constraints in the alignment envelope.
	this->check_ins1_ins2(aln_env_result);

	// Dump the probability planes. (all of it)
	if(_DUMP_ALN_ENV_UTILS_MESSAGES_)
	{
		FILE* f_aln_probs = open_f("aln_plane_probs", "wb");
		FILE* f_ins1_probs = open_f("ins1_plane_probs", "wb");
		FILE* f_ins2_probs = open_f("ins2_plane_probs", "wb");
		for(int i1 = 1; i1 <= this->l1(); i1++)
		{
			int low_i2 = t_phmm_array::low_phmm_limit(i1, l1(), l2(), this->phmm_band_constraint_size);
			int high_i2 = t_phmm_array::high_phmm_limit(i1, l1(), l2(), this->phmm_band_constraint_size);

			for(int i2 = low_i2; i2 <= high_i2; i2++)
			{
				// Each non-zero entry is written as a binary (int i1, int i2, double log_prob) record.
				if(pp_result->aln_probs[i1][i2] != xlog(0.0))
				{
					double cur_aln_prob = pp_result->aln_probs[i1][i2];
					fwrite(&i1, sizeof(int), 1, f_aln_probs);
					fwrite(&i2, sizeof(int), 1, f_aln_probs);
					fwrite(&cur_aln_prob, sizeof(double), 1, f_aln_probs);
				}

				if(pp_result->ins1_probs[i1][i2] != xlog(0.0))
				{
					double cur_ins1_prob = pp_result->ins1_probs[i1][i2];
					fwrite(&i1, sizeof(int), 1, f_ins1_probs);
					fwrite(&i2, sizeof(int), 1, f_ins1_probs);
					fwrite(&cur_ins1_prob, sizeof(double), 1, f_ins1_probs);
				}

				if(pp_result->ins2_probs[i1][i2] != xlog(0.0))
				{
					double cur_ins2_prob = pp_result->ins2_probs[i1][i2];
					fwrite(&i1, sizeof(int), 1, f_ins2_probs);
					fwrite(&i2, sizeof(int), 1, f_ins2_probs);
					fwrite(&cur_ins2_prob, sizeof(double), 1, f_ins2_probs);
				}
			} // i2 loop.
		} // i1 loop.

		fclose(f_aln_probs);
		fclose(f_ins1_probs);
		fclose(f_ins2_probs);

		FILE* f_lls = open_f("loop_limits.txt", "w");

		// Dump the loop limits.
		for(int i = 0; i <= this->l1(); i++)
		{
			fprintf(f_lls, "%d %d %d\n", i, low_limits[i], high_limits[i]);
		}

		fclose(f_lls);
	} // message dump check.

	//printf("Dumping alignment map.\n");
	//FILE* f_aln_map = open_f("aln_map.txt", "w");

	//for(int i = 1; i <= this->l1(); i++)
	//{
	//	for(int j = 1; j <= this->l2(); j++)
	//	{
	//		if(j < low_limits[i])
	//		{
	//			fprintf(f_aln_map, "0");
	//		}
	//		else if(j <= high_limits[i])
	//		{
	//			fprintf(f_aln_map, "1");
	//		}
	//		else
	//		{
	//			fprintf(f_aln_map, "0");
	//		}
	//	}
	//	fprintf(f_aln_map, "\n");
	//}

	//fclose(f_aln_map);

	if(_DUMP_ALN_ENV_UTILS_MESSAGES_)
	{
		printf("Computed alignment envelope.\n");
	}

	// Make the low limits non-decreasing in i (forward sweep).
	for(int i = 2; i <= this->l1(); i++)
	{
		if(aln_env_result->low_limits[i] < aln_env_result->low_limits[i-1])
		{
			aln_env_result->low_limits[i] = aln_env_result->low_limits[i-1];
		}
	}

	// Make the high limits non-decreasing in i as well (backward sweep that
	// lowers any entry exceeding its successor).
	for(int i = this->l1()-1; i >= 1; i--)
	{
		if(aln_env_result->high_limits[i] > aln_env_result->high_limits[i+1])
		{
			aln_env_result->high_limits[i] = aln_env_result->high_limits[i+1];
		}
	}

	// The returned result does not reference the posterior probabilities, so
	// release them when they were computed locally (the not-connected path
	// above already does this; the success path used to leak them).
	if(_pp_result == NULL)
	{
		this->free_pp_result(pp_result);
	}

	return(aln_env_result);
}
Ejemplo n.º 3
0
// Computes one height threshold per window from a Poisson background model.
//
// For each window that has uniquely mappable nucleotides, the Poisson mean is
//   lambda = (reads in window * enrichment_mapped_fragment_length) / mappable nucleotides
// and the threshold is the smallest value in [min_thresh, max_thresh] whose
// cumulative Poisson probability exceeds (1 - target_fdr). The CDF is
// accumulated with the xlog_* helpers (presumably log-space arithmetic).
//
// Parameters:
//   cur_chr_data                      - supplies the window count, the fragments
//                                       and the per-window mappable nucleotide counts.
//   target_fdr                        - target tail probability; the CDF is compared
//                                       against 1 - target_fdr.
//   enrichment_mapped_fragment_length - fragment length used to turn a read count
//                                       into covered nucleotides.
//   min_thresh, max_thresh            - inclusive range of candidate thresholds.
//
// Returns a new[]-allocated array of n_meg_wins() + 2 doubles that the caller
// must release with delete[]. An entry stays 0 when the window has no uniquely
// mappable nucleotides or when no candidate threshold satisfies the criterion.
double* get_poisson_thresholds(t_chip_seq_chr_data* cur_chr_data,
								double target_fdr,
								int enrichment_mapped_fragment_length,
								int min_thresh,
								int max_thresh)
{
	if(__DUMP_POISSON_BCKGRND_MSGS__)
	{
		fprintf(stderr, "Computing Poisson based thresholds %d,..., %d\n", min_thresh, max_thresh);
	}

	int n_wins = cur_chr_data->n_meg_wins();

	// Zero every allocated entry (all n_wins + 2 of them) so that no
	// indeterminate value is returned; 0 also marks windows without a threshold.
	double* thresholds_per_win = new double[n_wins+2];
	for(int i_win = 0; i_win < n_wins + 2; i_win++)
	{
		//thresholds_per_win[i_win] = max_thresh; // Ensure that no peaks are selected at the initialization, this automatically handles the windows for which there is no data.
		thresholds_per_win[i_win] = 0;
	}

	// Count the number of reads per window.
	int* n_reads_per_window = get_n_reads_per_window(cur_chr_data->n_meg_wins(), cur_chr_data->chip_seq_fragments);

	// For each window, go over the thresholds to find the first one for which
	// the cumulative probability exceeds 1 - target_fdr.
	for(int i_win = 0; i_win < n_wins; i_win++)
	{
		if(n_reads_per_window[i_win] == 0)
		{
			if(__DUMP_POISSON_BCKGRND_MSGS__)
				printf("There are no mapped reads in window %d\n", i_win);
			//getc(stdin);
		}

		if(cur_chr_data->n_uniquely_mappable_nucs_per_meg_win->at(i_win) == 0)
		{
			if(__DUMP_POISSON_BCKGRND_MSGS__)
				printf("There are no uniquely mappable nucleotides in window %d\n", i_win);
			//getc(stdin);
		}
		else
		{
			// This is the average of poisson distribution for the heights in this window.
			// The factors are converted to double before multiplying: the int
			// product can overflow for windows with very many reads.
			double covered_nucs = (double)n_reads_per_window[i_win] * (double)enrichment_mapped_fragment_length;
			double avg_read_depth = covered_nucs / (double)cur_chr_data->n_uniquely_mappable_nucs_per_meg_win->at(i_win);
			double lambda = avg_read_depth;
			double log_lambda = xlog(lambda);

			double log_target_cdf = xlog(1.0 - target_fdr);

			// Both accumulators start at log(P(X = 0)) = -lambda. Each step
			// multiplies the running term by lambda / thresh, giving P(X = thresh),
			// and adds it to the CDF.
			// Update the log_cdf value for thresholds smaller than min_threshold.
			double current_log_cdf = -1.0 * lambda;
			double current_factor = -1.0 * lambda; // This is the probability value differentially updated in CDF computation.
			for(int thresh = 1; 
				thresh < min_thresh;
				thresh++)
			{
				double new_multiplier = xlog_div(log_lambda, xlog((double)thresh));
				current_factor = xlog_mul(current_factor, new_multiplier);
				current_log_cdf = xlog_sum(current_log_cdf, current_factor);
			}

			// For thresholds between min and max thresholds, compare the value of (log)CDF with the (log)target CDF.
			bool found_thresh = false;
			for(int thresh = min_thresh; 
				thresh <= max_thresh && !found_thresh; 
				thresh++)
			{
				double new_multiplier = xlog_div(log_lambda, xlog((double)thresh));
				current_factor = xlog_mul(current_factor, new_multiplier);
				current_log_cdf = xlog_sum(current_log_cdf, current_factor);

				if(current_log_cdf > log_target_cdf)
				{
					found_thresh = true;
					thresholds_per_win[i_win] = thresh; // Set the threshold in this window.
				}
			} // thresh loop.

			if(thresholds_per_win[i_win] > 0)
			{
				if(__DUMP_POISSON_BCKGRND_MSGS__)
					fprintf(stderr, "Window %d: %lf\n", i_win, thresholds_per_win[i_win]);
			}
		} // Check for uniquely mappable nucleotides in the window.
	} // i_win loop.

	// NOTE(review): assumes get_n_reads_per_window() allocates with new[] -- confirm.
	delete [] n_reads_per_window;

	// Return the per-window threshold array.
	return(thresholds_per_win);
}