Пример #1
0
// Build the single-sequence pairing probability ("spf") arrays for one sequence
// and load the folding priors into them.
// THE PRIORS COMING FROM SPF PROGRAM ARE IN LINEAR DOMAIN!!!
//
// Parameters:
//   seq_length         - number of nucleotides; stored in N. Rows are indexed 0..N.
//   seq_path           - sequence path; handed to RNA (when no probability file is
//                        given) and to t_folding_constraints.
//   _ppf_cli           - options object; max_n_separation_between_nucs and
//                        fold_env_prob_treshold are read from it here.
//   pairing_probs_file - binary file of sampled pair counts, or NULL to compute the
//                        probabilities with RNA::PartitionFunction().
//   mallocate          - when false nothing is allocated and nothing is loaded; the
//                        constructor only accumulates n_bytes_alloced and returns.
//
// Row i1 of every 2-D array holds (max_i2 - i1 + 2) entries and is pointer-shifted by
// -i1, so it is addressed directly with sequence indices i1 .. max_i2 + 1, where
// max_i2 = MIN(i1 + max_n_separation_between_nucs, N).
//
// NOTE(review): failures terminate with exit(0), which is kept to match the existing
// behavior of this file even though it reports success to the calling process.
t_spf_array::t_spf_array(int seq_length, 
						 char* seq_path, 
						 t_ppf_cli* _ppf_cli, 
						 char* pairing_probs_file, 
						 bool mallocate)
{
	this->ppf_cli = _ppf_cli;

	this->n_bytes_alloced = 0.0f;

	// Copy sequence length.
	this->N = seq_length;

	// The constraints object is only built at the very end of an allocating
	// construction; keep the member well defined on the accounting-only path.
	this->folding_constraints = NULL;

	if(mallocate)
	{
		this->pairing_array = (double**)malloc(sizeof(double*) * (N + 1));
		this->ind_unpairing_array = (double*)malloc(sizeof(double) * (N + 1));
		this->ind_pairing_array = (double*)malloc(sizeof(double) * (N + 1));
		this->fold_env = (bool**)malloc(sizeof(bool*) * (N + 1));
		this->str_coinc_env = (bool**)malloc(sizeof(bool*) * (N + 1));
	}
	else
	{
		this->pairing_array = NULL;
		this->ind_unpairing_array = NULL;
		this->ind_pairing_array = NULL;
		this->fold_env = NULL;
		this->str_coinc_env = NULL;
	}

	// Account for the top-level tables exactly once, whether or not they were
	// really allocated (the accounting-only mode needs the same figure).
	// NOTE(review): the short* term has no matching allocation in this constructor;
	// it is kept as in the original accounting - confirm what it stands for.
	this->n_bytes_alloced += ((sizeof(double*) * (N + 1)) + 
								(sizeof(double) * (N + 1)) + 
								(sizeof(double) * (N + 1)) +
								(sizeof(bool*) * (N + 1)) + 
								(sizeof(bool*) * (N + 1)) + 
								(sizeof(short*) * (N + 1)));

	// Allocate pairing and unpairing spf arrays.
	for(int i1 = 0; i1 <= N; i1++)
	{
		// Include the max_separation criterion in the allocation function.
		int min_i2 = i1;
		int max_i2 = MIN(i1 + ppf_cli->max_n_separation_between_nucs, N);
		int n_row_entries = max_i2 - min_i2 + 2;

		// Per-row accounting: one bool row (fold_env), one double row
		// (pairing_array) and one bool row (str_coinc_env).
		this->n_bytes_alloced += (sizeof(bool) * n_row_entries);
		this->n_bytes_alloced += (sizeof(double) * n_row_entries);
		this->n_bytes_alloced += (sizeof(bool) * n_row_entries);

		if(mallocate)
		{
			this->fold_env[i1] = (bool*)malloc(sizeof(bool) * n_row_entries);
			this->fold_env[i1] -= i1; // Do pointer shift for fold envelope.

			this->pairing_array[i1] = (double*)malloc(sizeof(double) * n_row_entries); // Allocate pairing prob.
			this->pairing_array[i1] -= i1; // Do pointer shift to access the array using sequence indices.

			this->str_coinc_env[i1] = (bool*)malloc(sizeof(bool) * n_row_entries);
			this->str_coinc_env[i1] -= i1; // Do pointer shift for structural coincidence envelope.

			this->ind_pairing_array[i1] = ZERO;
			this->ind_unpairing_array[i1] = ZERO;

			for(int i2 = min_i2; i2 <= max_i2; i2++)
			{
				this->pairing_array[i1][i2] = CONVERT_FROM_LIN(0.0); // Initialize the probabilities to 0.
				this->fold_env[i1][i2] = false; // Start with every pair excluded from the envelope.
				this->str_coinc_env[i1][i2] = false;
			}
		}
	} // i1 loop

	if(!mallocate)
	{
		return;
	}

	// Now arrays are allocated, do single partition function calculation for that sequence
	if(pairing_probs_file == NULL)
	{
		RNA* rna = new RNA(seq_path, 2);
		rna->PartitionFunction();

		// Load pairing array.
		// NOTE(review): unlike the file branch below, the value is stored without
		// CONVERT_FROM_LIN and fold_env is not updated here - confirm this is intended.
		for(int i = 1; i <= this->N; i++)
		{
			int min_j = i+1;
			int max_j = MIN(i + ppf_cli->max_n_separation_between_nucs, N);

			for(int j = min_j; j <= max_j; j++)
			{
				this->pairing_array[i][j] = rna->GetPairProbability(i, j);
			}
		}

		// The RNA object is only needed while copying the probabilities.
		delete rna;
	}
	else
	{
		/*
		Read the binary pair count file. Layout as consumed by the fread calls below
		(all values are native ints):
		[number of samples]
		[index 1] [index 2] [number of samples in which the two nucleotides pair]
		[index 1] [index 2] [number of samples in which the two nucleotides pair]
		...

		The linear pairing probability of a pair is its count divided by the
		number of samples.
		*/
		// Use the caller's path directly; copying it into a fixed-size buffer
		// could overflow on long paths.
		char* spf_array_fn = pairing_probs_file;

		FILE* spf_file = open_f(spf_array_fn, "rb");

		if(spf_file == NULL)
		{
			printf("Could not open single partition function %s @ %s(%d)\n", spf_array_fn, __FILE__, __LINE__);
			exit(0);
		}

		// Only records with i1 < i2 that lie within the allocated separation band
		// are stored; every other record is skipped.
		int i1 = 0;
		int i2 = 0;
		double current_lin_prob = ZERO;
		int n_samples = 0;
		int n_curr_pp_cnt = 0;

		if(fread(&n_samples, sizeof(int), 1, spf_file) != 1)
		{
			printf("Could not read number of samples from %s\n", spf_array_fn);
			exit(0);
		}

		// The sample count is the divisor for every probability below.
		if(n_samples <= 0)
		{
			printf("Invalid number of samples (%d) in %s\n", n_samples, spf_array_fn);
			exit(0);
		}

		printf("%d samples are processed to estimate base pairing probabilities.\n", n_samples);

		while(true)
		{
			// Failing to read the first field of a record is the normal end of file.
			if(fread(&i1, sizeof(int), 1, spf_file) != 1)
			{
				break;
			}
		
			if(fread(&i2, sizeof(int), 1, spf_file) != 1)
			{
				printf("Could not read i2 for i1=%d in %s\n", i1, spf_array_fn);
				exit(0);
			}

			if(fread(&n_curr_pp_cnt, sizeof(int), 1, spf_file) != 1)
			{
				printf("Could not read pair count for (%d, %d) in %s\n", i1, i2, spf_array_fn);
				exit(0);
			}

			current_lin_prob = (double)n_curr_pp_cnt / (double)n_samples;

			// Store the pair only when it addresses an allocated cell: both indices
			// inside 0..N, i1 < i2, and within the max_separation band.
			if(i1 >= 0 && 
				i2 > i1 && 
				i2 <= N && 
				(i2 - i1) <= ppf_cli->max_n_separation_between_nucs)
			{
				// It should be noted that probabilities in spf file are linear, might need to change them.
				this->pairing_array[i1][i2] = CONVERT_FROM_LIN(current_lin_prob);

				if(_DUMP_SPF_MESSAGES_)
				{
					printf("pp(%d, %d) = %.25f\n", i1, i2, current_lin_prob);
				}

				// Pairs at or above fold_env_prob_treshold are admitted to the fold envelope;
				// all others keep the false value set during allocation.
				if(this->pairing_array[i1][i2] >= CONVERT_FROM_LIN(ppf_cli->fold_env_prob_treshold))
				{
					this->fold_env[i1][i2] = true;
				}

				// Dump the stored value only here: outside this branch the cell is not allocated.
				if(_DUMP_SPF_MESSAGES_)
				{
					printf("P_pair(%d, %d) = %.25f\n", i1, i2, this->pairing_array[i1][i2]);
				}
			}
		}

		fclose(spf_file);
	} // read the pairing probabilities from external file.

	// Compute the pairing and coincidence ptr relocation maps with base pairing enforced for pairs that have 0.999 or higher probability of pairing.
	this->folding_constraints = new t_folding_constraints(seq_path, this->pairing_array, 0.999f);

	// Fill the individual pairing/unpairing arrays from the loaded pairing array.
	this->calculate_unpairing_probs();

	if(_DUMP_SPF_MESSAGES_)
	{
		printf("t_spf_array allocated %lf bytes\n", this->n_bytes_alloced);
	}

//	// Dump spf plane if desired.
//if(_DUMP_SPF_PLANES_)
//{
//	this->dump_spf_plane();
//	this->dump_fold_env();
//}
}
///////////////////////////////////////////////////////////////////////////////
// Load base pair probabilities from a partition function save file and turn
// them into one annotation code per nucleotide for every structure.
//
// file            - path of the partition function save file.
// structureStrand - strand whose pairs (GetPair) select which probabilities
//                   are looked up.
//
// On any failure the member `error` is set and a message is written to cerr.
///////////////////////////////////////////////////////////////////////////////
void Postscript_Annotation_Handler::readPartition( string file,
						   RNA* structureStrand ) {

  // Open the save file as its own strand, with a checker watching it.
  RNA* partStrand = new RNA( file.c_str(), PFS_TYPE );
  ErrorChecker<RNA>* partChecker = new ErrorChecker<RNA>( partStrand );

  // Annotation data is only read when the strand was created without error.
  error = partChecker->isErrorStatus();
  if( !error ) {

    if( structures == 0 ) {
      // Nothing to annotate.
      cerr << "No structures or pairs are present to annotate." << endl;
      error = true;
    } else {
      // One row per structure, one code per nucleotide, all starting at 'i'.
      probabilityAnnotations.assign( structures, vector<char>( length, 'i' ) );
    }

    // Walk every structure and every nucleotide in it (both 1-based); either
    // loop ends as soon as an error has been recorded.
    for( int structure = 1; ( structure <= structures ) && !error; structure++ ) {
      for( int nuc = 1; ( nuc <= length ) && !error; nuc++ ) {

        // Ask the structure strand for this nucleotide's partner.
        int partner = structureStrand->GetPair( nuc, structure );
        int status = structureStrand->GetErrorCode();
        if( status != 0 ) {
          cerr << endl << structureStrand->GetErrorMessage( status ) << endl;
          error = true;
          break;
        }

        // Only the lower-indexed side of a pair is processed; this also skips
        // unpaired nucleotides (partner 0), since nuc is at least 1.
        if( partner <= nuc ) { continue; }

        // Look up the probability of this pair in the partition data.
        double probability = partStrand->GetPairProbability( nuc, partner );
        error = partChecker->isErrorStatus();
        if( error ) { break; }

        // Bin the probability into its code letter.
        char shade;
        if( probability >= 0.99 ) { shade = 'a'; }
        else if( probability > 0.95 ) { shade = 'b'; }
        else if( probability > 0.90 ) { shade = 'c'; }
        else if( probability > 0.80 ) { shade = 'd'; }
        else if( probability > 0.70 ) { shade = 'e'; }
        else if( probability > 0.60 ) { shade = 'f'; }
        else if( probability > 0.50 ) { shade = 'g'; }
        else { shade = 'h'; }

        // Both partners of the pair carry the same code.
        probabilityAnnotations[structure - 1][nuc - 1] = shade;
        probabilityAnnotations[structure - 1][partner - 1] = shade;
      }
    }
  }

  // Add a final message so the user can tell the failure came from reading
  // the partition function annotation file.
  if( error ) {
    cerr << "Partition function save file not read successfully." << endl;
  }

  // Release the strand and its checker.
  delete partStrand;
  delete partChecker;

}