Beispiel #1
0
Datei: seq.c Projekt: CPFL/gmeme
/**************************************************************************
 * Copy a sequence object.  Memory must be freed by caller.
 *
 * The name, description, offset and raw sequence of the source are handed
 * to allocate_seq(); the weight and completeness flag are then copied, and
 * the optional intseq and gc arrays are duplicated entry by entry
 * (source_sequence->length entries each) when the source has them.
 **************************************************************************/
SEQ_T* copy_sequence
  (SEQ_T* source_sequence)
{
  // Build the basic object from the source's identifying fields.
  SEQ_T* duplicate = allocate_seq(source_sequence->name,
                                  source_sequence->desc,
                                  source_sequence->offset,
                                  source_sequence->sequence);

  // Scalar fields that allocate_seq() is not given.
  duplicate->weight = source_sequence->weight;
  duplicate->is_complete = source_sequence->is_complete;

  // Optional integer-encoded sequence.
  if (source_sequence->intseq != NULL) {
    duplicate->intseq = (int*)mm_malloc(sizeof(int) * source_sequence->length);
    int pos = 0;
    while (pos < source_sequence->length) {
      duplicate->intseq[pos] = source_sequence->intseq[pos];
      pos++;
    }
  }

  // Optional gc array.
  if (source_sequence->gc != NULL) {
    duplicate->gc = (int*)mm_malloc(sizeof(int) * source_sequence->length);
    int pos = 0;
    while (pos < source_sequence->length) {
      duplicate->gc[pos] = source_sequence->gc[pos];
      pos++;
    }
  }

  return duplicate;
}
Beispiel #2
0
Datei: seq.c Projekt: CPFL/gmeme
/**********************************************************************
  shuffle_sequence()

  Build a randomly permuted copy of a sequence.

  The generator is seeded with `seed`, a new sequence object named after
  the original (description "shuffled") is stored in *target, and its
  raw characters are overwritten by drawing, without replacement, from a
  scratch copy of the original characters.
**********************************************************************/
void shuffle_sequence(
  SEQ_T* seq,		/* original sequence IN */
  unsigned int seed,	/* seed IN */
  SEQ_T** target	/* target sequence OUT */
){
	my_srand(seed);
	assert(*target==NULL);
	// When the assertion is compiled out, release any previous target instead.
	if (*target != NULL){
		free_seq(*target);
	}

	*target = allocate_seq(get_seq_name(seq),"shuffled",get_seq_offset(seq),get_raw_sequence(seq));
	char *shuffled = get_raw_sequence(*target);

	/* Scratch pool holding the characters that have not been drawn yet. */
	char* pool = (char*)mm_calloc(get_seq_length(seq)+1,sizeof(char));
	strcpy(pool,get_raw_sequence(seq));
	pool[get_seq_length(seq)]='\0';

	int out_pos = 0;
	int remaining = get_seq_length(seq);
	while (remaining > 0) {
		// Draw one of the characters still in the pool.
		int pick = rand() % remaining;
		shuffled[out_pos++] = pool[pick];

		// Close the gap: slide everything after the drawn character
		// (terminator included) one position to the left.
		char *dst = pool + pick;
		char *src = dst + 1;
		while (*dst) {
			*dst++ = *src++;
		}
		remaining--;
	}
	myfree(pool);
}
/******************************************************************************
 * This function allocates and initializes a SEQ_T object from a FASTA and
 * a prior reader. The prior reader is optional and may be null.
 *
 * The FASTA reader is advanced to its next sequence and the first segment
 * (at most max_size) is read into a newly allocated sequence. When a prior
 * reader is given it is advanced as well, its sequence name must match the
 * FASTA sequence name (otherwise the program dies), and the first segment of
 * priors is read into the same sequence object.
 *
 * Returns a pointer to a new SEQ_T object, or NULL when the FASTA reader
 * has no further sequence.
 *****************************************************************************/
SEQ_T *get_next_seq_from_readers(
  DATA_BLOCK_READER_T *fasta_reader, 
  DATA_BLOCK_READER_T *prior_reader,
  size_t max_size
) {

  // Move to the next sequence in the fasta file.
  BOOLEAN_T got_seq = fasta_reader->go_to_next_sequence(fasta_reader);
  if (got_seq == FALSE) {
    // Reached EOF
    return NULL;
  }
  char *fasta_seq_name = NULL;
  fasta_reader->get_seq_name(fasta_reader, &fasta_seq_name);
  size_t seq_offset = get_current_pos_from_seq_reader_from_fasta(fasta_reader);
  SEQ_T *sequence = allocate_seq(
    fasta_seq_name, 
    NULL, // description
    seq_offset,
    NULL // raw sequence
  );
  // Read the first raw sequence segment into the sequence.
  read_one_fasta_segment_from_reader(
    fasta_reader,
    max_size, 
    0, // No buffer offset on first segment
    sequence
  );

  // Move to the next sequence in the priors file.
  if (prior_reader) {
    BOOLEAN_T got_priors_seq = prior_reader->go_to_next_sequence(prior_reader);
    if (got_priors_seq == FALSE) {
      die("Unable to read sequence from priors file.");
    }
    // Check that the sequence name from the FASTA reader matches
    // the sequence name from the prior reader.
    char *prior_seq_name = NULL;
    prior_reader->get_seq_name(prior_reader, &prior_seq_name);
    if (strcmp(fasta_seq_name, prior_seq_name) != 0) {
      die(
        "Sequence named %s from prior reader did not "
        "match sequence name %s from fasta reader\n",
        prior_seq_name,
        fasta_seq_name
      );
    }
    // Read the first segment of priors data into the sequence.
    read_one_priors_segment_from_reader(
      prior_reader,
      max_size,
      0, // No buffer offset on first segment
      sequence 
    );
    // The name is obtained through the same get_seq_name interface as the
    // FASTA name (which is released below), so it is released here too.
    myfree(prior_seq_name);
  }

  myfree(fasta_seq_name);
  return sequence;

}
Beispiel #4
0
/****************************************************************************
 * Extract a small alignment out of the middle of a larger alignment.
 *
 * start      Index of the first column to keep.
 * width      Number of columns to keep.
 * alignment  Alignment to extract from (not modified).
 *
 * Columns [start, start + width) of every sequence and of the consensus
 * string are copied into a newly allocated alignment that keeps the
 * original name and description.  No range checking is done on start
 * and width.
 *
 * Returns the new alignment; all temporary buffers are released here.
 ****************************************************************************/
ALIGNMENT_T* extract_subalignment
  (int start,
   int width,
   ALIGNMENT_T* alignment)
{
  int num_sequences = get_num_aligned_sequences(alignment);
  SEQ_T** sequences = get_alignment_sequences(alignment);
  SEQ_T** subsequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));

  // Extract the specified columns into a new list of sequences.
  // One scratch buffer is reused for every sequence.
  int i_seq = 0;
  char* subsequence = mm_malloc((width + 1) * sizeof(char));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_seq = sequences[i_seq];
    char* raw_seq = get_raw_sequence(this_seq);
    strncpy(subsequence, raw_seq + start, width);
    subsequence[width] = '\0'; // strncpy does not terminate a full-width copy.
    subsequences[i_seq] = 
      allocate_seq(get_seq_name(this_seq),
		   get_seq_description(this_seq),
		   get_seq_offset(this_seq), 
		   subsequence);
  }

  // Extract the consensus string in the specified columns.
  char* consensus = get_consensus_string(alignment);
  char* subconsensus = mm_malloc(sizeof(char) * (width + 1));
  strncpy(subconsensus, consensus + start, width);
  subconsensus[width] = '\0';

  // Allocate and return the new alignment.
  ALIGNMENT_T* subalignment 
    = allocate_alignment(get_alignment_name(alignment),
			 get_alignment_description(alignment),
			 num_sequences,
			 subsequences,
			 subconsensus);

  // Free local dynamic memory.  The new alignment was built from copies of
  // the sequences and of the consensus string, so every temporary
  // (including subconsensus) is released here.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    free_seq(subsequences[i_seq]);
  }
  myfree(subsequences);
  myfree(subsequence);
  myfree(subconsensus);
  return(subalignment);
}
Beispiel #5
0
/****************************************************************************
 * Build the consensus sequence of an alignment.
 *
 * threshold  Minimum fraction of sequences that must share the most
 *            frequent character of a column for that character to become
 *            the consensus character.
 * alignment  Alignment to summarize; must not be NULL.
 *
 * For every column the most frequent character is determined (ties go to
 * the character with the lowest code).  If its frequency relative to the
 * number of sequences reaches the threshold it is used, otherwise '-' is
 * written for that column.  Characters with a code of NUM_CHARS or above
 * are not counted.
 *
 * Returns a new sequence object named "Consensus".
 ****************************************************************************/
SEQ_T* get_consensus_sequence(double threshold, ALIGNMENT_T* alignment) {
  char* seq_string = NULL;
  unsigned char c = 0;
  unsigned char most_freq_char = 0;
  #define NUM_CHARS 127
  // Counters are ints: a char counter wraps as soon as more than 127
  // sequences share a character in one column.
  int char_counts[NUM_CHARS];
  int i = 0;
  int j = 0;
  double max_char_freq = 0.0;
  SEQ_T* consensus;
  assert(alignment != NULL);
  
  seq_string = mm_malloc(alignment->length * sizeof(char) + 1);
  if (seq_string == NULL) {
    die("Error allocating consensus sequence string\n");
  }
  // For each column in the alignment
  for (i = 0; i < alignment->length; i++) {
    most_freq_char = 0;
    memset(char_counts, 0, sizeof(char_counts));
    // Count character occurrences
    for (j = 0; j < alignment->num_sequences; j++) {
      c = get_seq_char(i, alignment->sequences[j]);
      // Guard the table: codes outside [0, NUM_CHARS) have no counter.
      if (c < NUM_CHARS) {
        char_counts[c]++;
      }
    }
    // Find the index of the character that occurs the most frequently.
    // Using >= keeps the earlier (lower-coded) character on ties.
    for (c = 0; c < NUM_CHARS; c++) {
      most_freq_char = char_counts[most_freq_char] >= char_counts[c] ? 
        most_freq_char : c;
    }
    // If the most frequent character exceeds the threshold
    // it will be the consensus character.
    max_char_freq = (double) char_counts[most_freq_char] / 
      (double) alignment->num_sequences;
    if (max_char_freq >= threshold) {
      seq_string[i] = most_freq_char;
    } else {
      // Otherwise the consensus is the gap character
      seq_string[i] = '-';
    }
  }
  seq_string[i] = '\0';
  consensus = allocate_seq("Consensus", "", 0, seq_string);
  if (seq_string != NULL) myfree(seq_string);
  return(consensus);
}
Beispiel #6
0
consensus_data * generate_consensus( char ** input_seq,
                           unsigned int n_seq,
                           unsigned min_cov,
                           unsigned K,
                           double min_idt) {
    unsigned int j;
    unsigned int seq_count;
    unsigned int aligned_seq_count;
    kmer_lookup * lk_ptr;
    seq_array sa_ptr;
    seq_addr_array sda_ptr;
    kmer_match * kmer_match_ptr;
    aln_range * arange;
    alignment * aln;
    align_tags_t ** tags_list;
    //char * consensus;
    consensus_data * consensus;
    double max_diff;
    max_diff = 1.0 - min_idt;

    seq_count = n_seq;
    //printf("XX n_seq %d\n", n_seq);
    //for (j=0; j < seq_count; j++) {
    //    printf("seq_len: %u %u\n", j, strlen(input_seq[j]));
    //};
    fflush(stdout);

    tags_list = calloc( seq_count, sizeof(align_tags_t *) );
    lk_ptr = allocate_kmer_lookup( 1 << (K * 2) );
    sa_ptr = allocate_seq( (seq_coor_t) strlen( input_seq[0]) );
    sda_ptr = allocate_seq_addr( (seq_coor_t) strlen( input_seq[0]) );
    add_sequence( 0, K, input_seq[0], strlen(input_seq[0]), sda_ptr, sa_ptr, lk_ptr);
    //mask_k_mer(1 << (K * 2), lk_ptr, 16);

    aligned_seq_count = 0;
    for (j=1; j < seq_count; j++) {

        //printf("seq_len: %ld %u\n", j, strlen(input_seq[j]));

        kmer_match_ptr = find_kmer_pos_for_seq(input_seq[j], strlen(input_seq[j]), K, sda_ptr, lk_ptr);
#define INDEL_ALLOWENCE_0 6

        arange = find_best_aln_range(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5);  // narrow band to avoid aligning through big indels

        //printf("1:%ld %ld %ld %ld\n", arange_->s1, arange_->e1, arange_->s2, arange_->e2);

        //arange = find_best_aln_range2(kmer_match_ptr, K, K * INDEL_ALLOWENCE_0, 5);  // narrow band to avoid aligning through big indels

        //printf("2:%ld %ld %ld %ld\n\n", arange->s1, arange->e1, arange->s2, arange->e2);

#define INDEL_ALLOWENCE_1 0.10
        if (arange->e1 - arange->s1 < 100 || arange->e2 - arange->s2 < 100 ||
            abs( (arange->e1 - arange->s1 ) - (arange->e2 - arange->s2) ) >
                   (int) (0.5 * INDEL_ALLOWENCE_1 * (arange->e1 - arange->s1 + arange->e2 - arange->s2))) {
            free_kmer_match( kmer_match_ptr);
            free_aln_range(arange);
            continue;
        }
        //printf("%ld %s\n", strlen(input_seq[j]), input_seq[j]);
        //printf("%ld %s\n\n", strlen(input_seq[0]), input_seq[0]);


#define INDEL_ALLOWENCE_2 150

        aln = align(input_seq[j]+arange->s1, arange->e1 - arange->s1 ,
                    input_seq[0]+arange->s2, arange->e2 - arange->s2 ,
                    INDEL_ALLOWENCE_2, 1);
        if (aln->aln_str_size > 500 && ((double) aln->dist / (double) aln->aln_str_size) < max_diff) {
            tags_list[aligned_seq_count] = get_align_tags( aln->q_aln_str,
                                                           aln->t_aln_str,
                                                           aln->aln_str_size,
                                                           arange, j,
                                                           0);
            aligned_seq_count ++;
        }
        /***
        for (k = 0; k < tags_list[j]->len; k++) {
            printf("%ld %d %c\n", tags_list[j]->align_tags[k].t_pos,
                                   tags_list[j]->align_tags[k].delta,
                                   tags_list[j]->align_tags[k].q_base);
        }
        ***/
        free_aln_range(arange);
        free_alignment(aln);
        free_kmer_match( kmer_match_ptr);
    }

    if (aligned_seq_count > 0) {
        consensus = get_cns_from_align_tags( tags_list, aligned_seq_count, strlen(input_seq[0]), min_cov );
    } else {
        // allocate an empty consensus sequence
        consensus = calloc( 1, sizeof(consensus_data) );
        consensus->sequence = calloc( 1, sizeof(char) );
        consensus->eqv = calloc( 1, sizeof(unsigned int) );
    }
    //free(consensus);
    free_seq_addr_array(sda_ptr);
    free_seq_array(sa_ptr);
    free_kmer_lookup(lk_ptr);
    for (j=0; j < aligned_seq_count; j++) {
        free_align_tags(tags_list[j]);
    }
    free(tags_list);
    return consensus;
}
Beispiel #7
0
/****************************************************************************
 * Generate random sequences from a cumulative letter distribution.
 *
 * When `out` is not NULL, nseqs sequences are written to it in FASTA
 * format (50 characters per line) and NULL is returned.
 *
 * When `out` is NULL, nothing is printed: the first generated sequence is
 * wrapped in a new SEQ_T, marked complete, and returned immediately.  The
 * caller owns the returned object.
 *
 * Sequence lengths are drawn from [min, max].  The random number
 * generator is reseeded only when `seed` is non-zero.
 ****************************************************************************/
SEQ_T *print_random_seqs (
  FILE *out,				// Stream to print on.
  int seed,				// Random number seed.
  int nseqs, 				// Number of sequences to print.
  int min,				// Minimum sequence length.
  int max,				// Maximum sequence length.
  char **letters,			// Array of letter strings.
  int r,				// Number of letter strings.
  int c,				// Length of letter strings.
  int order,				// Order of Markov model.
  double *cum				// Cumulative distribution(s) defining model.
)
{
  int i, j;
  int n;				// Length of sequence.
  char *buffer = NULL;			// Buffer for sequences.
  char *id = NULL;			// Sequence name.
  char first_letter = letters[0][0];	// First letter in alphabet.

  // Create the buffer for the string.
  mm_resize(buffer, max*(c-1)+1, char);

  /* set up random number generator */
  if (seed != 0) srand48(seed);

  /* print random sequences */
  for (i=0; i<nseqs; i++) {			/* sequence */

    // Decide length of sequence to print.
    n = (int) (min + drand48() * (max - min + 1));

    // Print FASTA ID line.
    if (out != NULL) {
      fprintf(out, ">SEQ_%-d %d\n", i+1, n);
    } else {
      mm_resize(id, 50, char);
      sprintf(id, ">SEQ_%-d %d\n", i+1, n);
    }

    /*
      Generate letters by
        1) random x ~ [0,1)
        2) binary search of cum for cum[i-1]<x<=cum[i]
        3) letter/codon is letters[i-1]
    */
    for (j=0; j<n; j++) {   // generate letters/codons
      double x = drand48(); // random number
      int lo = 0;
      int hi = r;
      int offset = 0; // Offset into cum array.

      if (order >= 1) { // Markov model.
        int start_ptr;
        // Find the offset into the cumulative prob array by looking
        // for the offset of the preceding "order" characters.
        buffer[j] = first_letter; // Now contains index into array.
        buffer[j+1] = '\0';
        start_ptr = j > order ? j-order : 0; // Start of index string.
        offset = s2i(buffer + start_ptr);
      }

      while (hi-lo > 1) {    // binary search
        int mid = (lo+hi)/2; // midpoint
        if (x > cum[mid+offset]) { lo = mid; } else { hi = mid; }
      }
      buffer[j] = letters[x<cum[lo+offset] ? lo : lo+1][0];
    } /* generate letters/codons */
    buffer[j] = '\0';

    // Print the sequence.
    if (out != NULL) {
      for (j=0; j<n; j+=50) {
        fprintf(out, "%-50.50s\n", buffer+j);
      }
    } else {
      SEQ_T *seq = allocate_seq(id, "", 0, buffer);
      set_complete(TRUE, seq);
      // The temporary name and sequence buffers are released before
      // returning; previously the name buffer was leaked on this path.
      myfree(id);
      myfree(buffer);
      return(seq);
    }

  } /* sequence */

  myfree(buffer);
  return(NULL);
} // print_random_seqs
Beispiel #8
0
/****************************************************************************
 * Remove from the alignment all columns that contain gaps for the
 * specified species.
 *
 * species    Name of the species whose gap columns are dropped; the
 *            program dies if it is not found in the alignment.
 * alignment  Alignment to filter (not modified).
 *
 * A column is dropped when the given species has '-' or '.' in it; the
 * remaining columns of every sequence and of the consensus string are
 * copied, in order, into a newly allocated alignment.
 *
 * Returns the new alignment; all temporary buffers are released here.
 ****************************************************************************/
ALIGNMENT_T* remove_alignment_gaps
  (char*        species,
   ALIGNMENT_T* alignment)
{
  // Locate this species in the alignment.
  int species_index = get_index_in_string_list(species, 
					       get_species_names(alignment));
  if (species_index == -1) {
    die("Can't find %s in alignment.\n", species);
  }
  SEQ_T* this_seq = get_alignment_sequence(species_index, alignment);

  // Get the dimensions of the original matrix.
  int num_sequences = get_num_aligned_sequences(alignment);
  int alignment_length = get_alignment_length(alignment);

  // Allocate memory for raw sequences that will constitute the new alignment.
  // The buffers are zero-filled, so the shortened strings end up terminated.
  char** raw_sequences = (char**)mm_malloc(sizeof(char*) * num_sequences);
  int i_seq = 0;
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    raw_sequences[i_seq] 
      = (char*)mm_calloc(alignment_length + 1, sizeof(char));
  }
  char* consensus = get_consensus_string(alignment);
  char* new_consensus 
    = (char*)mm_calloc(alignment_length + 1, sizeof(char));

  // Iterate over all columns.
  int i_column;
  int i_raw = 0; // Next free column in the output buffers.
  for (i_column = 0; i_column < alignment_length; i_column++) {

    // Is there a gap?
    char species_char = get_seq_char(i_column, this_seq);
    if ((species_char != '-') && (species_char != '.')) {

      // If no gap, then copy over this column.
      for (i_seq = 0; i_seq < num_sequences; i_seq++) {
	SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
	raw_sequences[i_seq][i_raw] = get_seq_char(i_column, this_sequence);
      }
      new_consensus[i_raw] = consensus[i_column];
      i_raw++;
    }
  }

  // Create new sequence objects.
  SEQ_T** new_sequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
    new_sequences[i_seq] = allocate_seq(get_seq_name(this_sequence),
					get_seq_description(this_sequence),
					get_seq_offset(this_sequence),
					raw_sequences[i_seq]);
  }

  // Allocate and return the new alignment.
  ALIGNMENT_T* new_alignment
    = allocate_alignment(get_alignment_name(alignment),
			 get_alignment_description(alignment),
			 num_sequences,
			 new_sequences,
			 new_consensus);
  
  // Free local dynamic memory.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    myfree(raw_sequences[i_seq]);
    free_seq(new_sequences[i_seq]);
  }
  myfree(raw_sequences);
  myfree(new_sequences);
  myfree(new_consensus);

  return(new_alignment);
}
Beispiel #9
0
/****************************************************************************
 * Allocate one alignment object. Name and description may be NULL.
 *
 * name              Alignment name; truncated to MAX_ALIGNMENT_NAME
 *                   characters (with a warning on stderr) if longer.
 * description       Alignment description; truncated to
 *                   MAX_ALIGNMENT_COMMENT characters if longer.
 * num_sequences     Number of entries in `sequences`; must be > 0.
 * sequences         Sequences to place in the alignment.  Each one is
 *                   re-created with allocate_seq(), so the caller keeps
 *                   ownership of the objects passed in.
 * consensus_string  Consensus string, stored through copy_string().
 *
 * All sequences are expected to have the same raw length as the first
 * one; this is checked with myassert().
 * 
 * Returns a pointer to the newly created alignment.
 ****************************************************************************/
ALIGNMENT_T* allocate_alignment(
   char* name,
   char* description,
   int num_sequences,
   SEQ_T** sequences,
   char* consensus_string
)
{
  assert(num_sequences > 0);
  assert(sequences != NULL);
  assert(consensus_string != NULL);

  // Allocate the alignment object.
  ALIGNMENT_T* new_alignment = (ALIGNMENT_T*)mm_malloc(sizeof(ALIGNMENT_T));
  if (new_alignment == NULL) {
    die("Error allocating alignment\n");
  }

  // Store the name, truncating if necessary.
  if (name != NULL) {
    strncpy(new_alignment->name, name, MAX_ALIGNMENT_NAME);
    new_alignment->name[MAX_ALIGNMENT_NAME] = '\0';
    if (strlen(new_alignment->name) != strlen(name)) {
      fprintf(stderr, "Warning: truncating alignment program name %s to %s.\n",
	      name, new_alignment->name);
    }
  } else {
    new_alignment->name[0] = '\0';
  }

  // Store the description, truncating if necessary.
  if (description != NULL) {
    strncpy(new_alignment->desc, description, MAX_ALIGNMENT_COMMENT);
    new_alignment->desc[MAX_ALIGNMENT_COMMENT] = '\0';
  } else {
    new_alignment->desc[0] = '\0';
  }

  // Store the sequences.
  new_alignment->sequences = (SEQ_T**) mm_malloc(num_sequences * sizeof(SEQ_T*));
  if (new_alignment->sequences == NULL) {
    die("Error allocating sequences\n");
  }
  new_alignment->num_sequences = num_sequences;
  int seq_length = (int) strlen(get_raw_sequence(sequences[0]));
  int i;
  for (i = 0; i < num_sequences; i++) {
    // strlen() yields a size_t; convert once so that the comparison is
    // int-to-int and the value matches the %d conversion in the message.
    int this_length = (int) strlen(get_raw_sequence(sequences[i]));
    myassert(TRUE,
	     this_length == seq_length,
	     "Sequence #1 (%s) is length=%d, but sequence #%d (%s) is length=%d.\n<%s>\n",
	     get_seq_name(sequences[0]), seq_length, i, 
	     get_seq_name(sequences[i]), this_length,
	     get_raw_sequence(sequences[i]));
    new_alignment->sequences[i] = 
      allocate_seq(get_seq_name(sequences[i]),
        get_seq_description(sequences[i]),
        get_seq_offset(sequences[i]), 
        get_raw_sequence(sequences[i])
      );
  }

  // Fill in the remaining fields.
  new_alignment->length = seq_length;
  copy_string(&(new_alignment->consensus_string), consensus_string);

  return(new_alignment);
}