예제 #1
0
/****************************************************************************
 * Create a new alignment with any sequence that contains nothing but
 * gap ('-') characters removed. Returns the new alignment. Does not
 * change the old alignment.
 *
 * If there are no all-gap sequences, the returned alignment is the
 * same object as the original alignment (no copy is made), so the caller
 * must compare the pointers before freeing both.
 *
 * Note: only '-' is treated as a gap here; '.' is not.
 ****************************************************************************/
static ALIGNMENT_T* remove_allgap_sequences(ALIGNMENT_T* alignment)
{
  ALIGNMENT_T* new_alignment;
  int i_aln;
  int l_aln = get_num_aligned_sequences(alignment);
  STRING_LIST_T* keeper_seqs = new_string_list();

  // Collect the names of all sequences that contain at least one non-gap.
  for (i_aln = 0; i_aln < l_aln; i_aln++) {
    SEQ_T* sequence = get_alignment_sequence(i_aln, alignment);
    int i_seq;
    int l_seq = get_seq_length(sequence);
    for (i_seq = 0; i_seq < l_seq; i_seq++) {
      if (get_seq_char(i_seq, sequence) != '-') {
        add_string(get_seq_name(sequence), keeper_seqs);
        break;  // one non-gap is enough to keep the sequence
      }
    }
  }

  // Build a reduced alignment only if something actually has to go.
  if (get_num_strings(keeper_seqs) < l_aln) {
    new_alignment = remove_alignment_seqs(keeper_seqs, alignment);
  } else {
    new_alignment = alignment;
  }

  // The keeper list is scratch state; release it on both paths
  // (it used to leak when no sequence was removed).
  free_string_list(keeper_seqs);

  return(new_alignment);
} // remove_allgap_sequences
예제 #2
0
// For every index i of txt, obtain get_seq_char(txt, i, order) and record it
// in the store via s.add_hits(seq, 1). Writes "ok" to stderr when finished.
//
//   txt   - text to scan (taken by value)
//   s     - store that receives one add_hits call per index of txt
//   order - forwarded unchanged to get_seq_char
void assimilate_text_chars(string txt, Store &s, int order){
	queue<string> seq;
	// Cast the length so the comparison is signed-vs-signed; the index stays
	// an int because that is what is handed to get_seq_char.
	for (int i = 0; i < static_cast<int>(txt.length()); i++){
		seq = get_seq_char(txt, i, order);
		s.add_hits(seq, 1);
	}
	cerr << "ok" << endl;
}
예제 #3
0
/****************************************************************************
 *  Report whether any sequence of the alignment holds, at column `index`,
 *  a character that alph_is_concrete() rejects for alphabet `alph`.
 *
 *  Returns TRUE as soon as such a character is found, FALSE if every
 *  sequence has a concrete character at that column.
 ****************************************************************************/
BOOLEAN_T alignment_site_ambiguous(ALPH_T alph, int index, ALIGNMENT_T* alignment) {
  int seq_idx = 0;

  while (seq_idx < alignment->num_sequences) {
    char site_char = get_seq_char(index, alignment->sequences[seq_idx]);
    if (!alph_is_concrete(alph, site_char)) {
      return(TRUE);
    }
    seq_idx++;
  }

  return(FALSE);
}
예제 #4
0
/****************************************************************************
 * Copy one column of the alignment into a caller-supplied buffer as a
 * null-terminated string: one character per sequence, in sequence order.
 *
 * The caller owns `alignment_col` and must make it large enough to hold
 * one character from each sequence in the alignment plus the trailing
 * null character. The buffer is supplied by the caller so that it can be
 * reused when this function is called repeatedly over the columns of the
 * alignment.
 ****************************************************************************/
void get_alignment_col(int col, char* alignment_col, ALIGNMENT_T* alignment) {
  int n_written = 0;

  while (n_written < alignment->num_sequences) {
    alignment_col[n_written] =
      get_seq_char(col, alignment->sequences[n_written]);
    n_written++;
  }

  // Terminate right after the last character copied.
  alignment_col[n_written] = '\0';
}
예제 #5
0
/****************************************************************************
 *  Report whether any sequence of the alignment has a gap character
 *  ('-' or '.') at column `index`.
 *
 *  Returns TRUE on the first gap found, FALSE if the column is gap free.
 ****************************************************************************/
BOOLEAN_T alignment_site_has_gaps(int index, ALIGNMENT_T* alignment) {
  int seq_idx;

  for (seq_idx = 0; seq_idx < alignment->num_sequences; seq_idx++) {
    switch (get_seq_char(index, alignment->sequences[seq_idx])) {
      case '-':
      case '.':
        return(TRUE);
      default:
        break;
    }
  }

  return(FALSE);
}
예제 #6
0
/****************************************************************************
 *  Get a cumulative count of gaps within one sequence of the alignment.
 *
 *  Returns an array of alignment->length ints obtained from mm_malloc;
 *  element i holds the number of gap characters ('-' or '.') found in
 *  columns 0..i (inclusive) of sequence `seqIndex`.
 *
 *  For a zero-length alignment no column is read and no element is
 *  written (the previous version unconditionally touched column 0).
 *  NOTE(review): the caller presumably owns and frees the result — confirm.
 ****************************************************************************/
int* get_cumulative_gap_count(int seqIndex, ALIGNMENT_T* alignment) {

  int* results = (int *)mm_malloc( sizeof(int) * alignment->length );
  int gaps_so_far = 0;
  int i;

  for (i = 0; i < alignment->length; i++) {
    char c = get_seq_char(i, alignment->sequences[seqIndex]);
    if (c == '-' || c == '.') {
      gaps_so_far++;
    }
    results[i] = gaps_so_far;
  }

  return results;
}
예제 #7
0
/****************************************************************************
 *  Build a consensus sequence for the alignment.
 *
 *  For each column the most frequent character is determined (ties go to
 *  the character with the lowest code). If its frequency, i.e. its count
 *  divided by the number of sequences, is at least `threshold`, it becomes
 *  the consensus character for that column; otherwise the column gets '-'.
 *
 *  Returns a sequence created by allocate_seq() with the name "Consensus".
 *  `alignment` must not be NULL.
 ****************************************************************************/
SEQ_T* get_consensus_sequence(double threshold, ALIGNMENT_T* alignment) {
  // Kept at its original value: the macro remains visible to whatever
  // follows this function in the translation unit. It is no longer used
  // to size the count table.
  #define NUM_CHARS 127
  // One counter per possible (8-bit) unsigned char value, so that any
  // character read from a sequence is a valid index.
  enum { NUM_CHAR_CODES = 256 };
  // int counters: char counters overflowed with more than 127 sequences.
  int char_counts[NUM_CHAR_CODES];
  char* seq_string = NULL;
  int col = 0;
  SEQ_T* consensus;
  assert(alignment != NULL);

  seq_string = mm_malloc(alignment->length * sizeof(char) + 1);
  if (seq_string == NULL) {
    die("Error allocating consensus sequence string\n");
  }

  // For each column in the alignment
  for (col = 0; col < alignment->length; col++) {
    int seq_idx;
    int code;
    int most_freq_code = 0;
    double max_char_freq;

    // Count character occurrences in this column
    memset(char_counts, 0, sizeof(char_counts));
    for (seq_idx = 0; seq_idx < alignment->num_sequences; seq_idx++) {
      unsigned char c =
        (unsigned char) get_seq_char(col, alignment->sequences[seq_idx]);
      char_counts[c]++;
    }

    // Find the most frequent character; strict '>' keeps the lowest code
    // on ties.
    for (code = 0; code < NUM_CHAR_CODES; code++) {
      if (char_counts[code] > char_counts[most_freq_code]) {
        most_freq_code = code;
      }
    }

    // The most frequent character is the consensus if it reaches the
    // threshold, otherwise the consensus is the gap character.
    max_char_freq = (double) char_counts[most_freq_code] /
      (double) alignment->num_sequences;
    if (max_char_freq >= threshold) {
      seq_string[col] = (char) most_freq_code;
    } else {
      seq_string[col] = '-';
    }
  }
  seq_string[col] = '\0';

  consensus = allocate_seq("Consensus", "", 0, seq_string);
  myfree(seq_string);
  return(consensus);
}
예제 #8
0
/****************************************************************************
*  Return an array containing the frequencies in the alignment for each
*  character of the alphabet.
*
*  A character is counted only when alph_index() maps it to an index in
*  [0, alph_size(alph, ALPH_SIZE)); everything else (e.g. gaps, ambiguity
*  codes) is ignored. Each frequency is the character's count divided by
*  the total number of counted characters.
*
*  If no character was counted at all, every frequency is set to 0.0
*  instead of dividing zero by zero.
*
*  NOTE(review): an earlier version of this comment said that the frequency
*  of ANY_BASE characters is stored in the last element of the array; no
*  such handling is present in this function — confirm intended behavior.
****************************************************************************/
ARRAY_T* get_alignment_freqs(ALPH_T alph, ALIGNMENT_T* alignment) {
  char c = 0;
  int aindex = 0;
  int asize = 0;
  int i = 0;
  int s = 0;
  int total_bases = 0;
  int* num_bases = NULL;
  ARRAY_T* freqs = NULL;

  // Initialize counts for each character in the alphabet
  asize = alph_size(alph, ALPH_SIZE);
  num_bases = mm_malloc(asize * sizeof(int));
  for (i = 0; i < asize; i++) {
    num_bases[i] = 0;
  }

  // Tally every countable character of every sequence
  for (s = 0; s < alignment->num_sequences; s++) {
    for (i = 0; i < alignment->length; i++) {
      c = get_seq_char(i, alignment->sequences[s]);
      aindex = alph_index(alph, c);
      // c might be an ambiguity code. We don't count ambiguity codes.
      if (aindex != -1 && aindex < asize) {
        num_bases[aindex]++;
        total_bases++;
      }
    }
  }

  // Convert counts to frequencies, guarding against an empty tally
  freqs = allocate_array(asize);
  for (i = 0; i < asize; i++) {
    double freq = 0.0;
    if (total_bases > 0) {
      freq = (double) num_bases[i] / (double) total_bases;
    }
    set_array_item(i, freq, freqs);
  }

  // Clean up the count of characters
  myfree(num_bases);

  return freqs;
}
예제 #9
0
파일: seq.c 프로젝트: CPFL/gmeme
/****************************************************************************
 *  Return an array containing the frequencies in the sequence for each
 *  character of the alphabet.
 *
 *  Gap characters ('-' and '.') are skipped, as are characters that
 *  alph_index() does not map to an index in [0, alph_size(alph, ALPH_SIZE)).
 *  Each frequency is the character's count divided by the total number of
 *  counted characters.
 *
 *  If no character was counted at all, every frequency is set to 0.0
 *  instead of dividing zero by zero.
 ****************************************************************************/
ARRAY_T* get_sequence_freqs
  (SEQ_T* seq, ALPH_T alph)
{
  char c = 0;
  int a_index = 0;
  int a_size = 0;
  int i = 0;
  int total_bases = 0;
  int* num_bases = NULL;
  ARRAY_T* freqs = NULL;

  // Initialize counts for each character in the alphabet
  a_size = alph_size(alph, ALPH_SIZE);
  num_bases = mm_malloc(a_size * sizeof(int));
  for (i = 0; i < a_size; i++) {
    num_bases[i] = 0;
  }

  for (i = 0; i < seq->length; i++) {
    c = get_seq_char(i, seq);
    // Skip both gap characters (the second test used to repeat '-').
    if (c == '-' || c == '.') continue;
    a_index = alph_index(alph, c);
    if (a_index == -1 || a_index >= a_size) continue;
    num_bases[a_index]++;
    total_bases++;
  }

  // Convert counts to frequencies, guarding against an empty tally
  freqs = allocate_array(a_size);
  for (i = 0; i < a_size; i++) {
    double freq = 0.0;
    if (total_bases > 0) {
      freq = (double) num_bases[i] / (double) total_bases;
    }
    set_array_item(i, freq, freqs);
  }

  // Clean up the count of characters
  myfree(num_bases);

  return freqs;
}
예제 #10
0
파일: seq.c 프로젝트: CPFL/gmeme
/****************************************************************************
 *  Incrementally compute the background frequencies of the alphabet
 *  characters over a set of sequences. Characters not in the alphabet are
 *  not counted.
 *
 *  The function works in two modes:
 *    - seq != NULL: the sequence's characters are added to the running
 *      state in *bgcalc (allocated on first use) and NULL is returned.
 *    - seq == NULL: the accumulated state in *bgcalc is converted into a
 *      background array, the state is freed, *bgcalc is set to NULL and
 *      the background is returned.
 *
 *  Counts are gathered in chunks of BG_CALC_CHUNK characters; each
 *  completed chunk is folded into a running average (calc->bg) weighted by
 *  the number of chunks seen so far (calc->weight).
 *
 *  NOTE(review): once calc->weight has reached LONG_MAX every call returns
 *  NULL immediately, including the finalizing call, which then does not
 *  free *bgcalc — confirm this is intended.
 *
 *  Pseudocode example:
 *    ALPH_T alph = ...
 *    BGCALC_T *bgcalc = NULL;
 *    for each seq:
 *      calculate_background(alph, seq, &bgcalc);
 *    ARRAY_T *bg = calculate_background(alph, NULL, &bgcalc);
 ****************************************************************************/
ARRAY_T* calculate_background(
  ALPH_T alph,
  SEQ_T* seq,
  BGCALC_T** bgcalc
){
  BGCALC_T *calc;
  int a_size, i, j, a_index;
  char c;
  double freq, chunk_part, chunk_freq;
  ARRAY_T *background;
  assert(bgcalc != NULL);
  assert(seq != NULL || *bgcalc != NULL);
  // get the alphabet size
  a_size = alph_size(alph, ALPH_SIZE);
  if (*bgcalc == NULL) {
    //allocate and initialize calc
    calc = mm_malloc(sizeof(BGCALC_T));
    calc->alph = alph;
    calc->chunk_seen = 0;
    calc->weight = 0;
    calc->chunk_counts = mm_malloc(a_size * sizeof(long));
    calc->bg = mm_malloc(a_size * sizeof(double));
    for (i = 0; i < a_size; ++i) {
      calc->chunk_counts[i] = 0;
      calc->bg[i] = 0;
    }
    *bgcalc = calc;
  } else {
    calc = *bgcalc;
    assert(alph == calc->alph);
    if (calc->weight == LONG_MAX) return NULL;
  }
  if (seq == NULL) {
    // no sequence so calculate the final result
    background = allocate_array(alph_size(alph, ALL_SIZE));
    if (calc->weight == 0) {
      if (calc->chunk_seen > 0) {
        // when we haven't had to approximate yet
        // just do a normal background calculation
        for (i = 0; i < a_size; i++) {
          freq = (double) calc->chunk_counts[i] / (double) calc->chunk_seen;
          set_array_item(i, freq, background);
        }
      } else {
        fputs("Uniform\n", stdout);
        // when there are no counts then return uniform
        freq = (double) 1 / (double) a_size;
        for (i = 0; i < a_size; i++) {
          set_array_item(i, freq, background);
        }
      }
    } else {
      if (calc->chunk_seen > 0) {
        // combine the frequencies for the existing chunks with the counts
        // for the partially completed chunk
        chunk_part = (double) calc->chunk_seen / (double) BG_CALC_CHUNK;
        for (i = 0; i < a_size; i++) {
          chunk_freq = (double) calc->chunk_counts[i] / 
              (double) calc->chunk_seen;
          freq = ((calc->bg[i] * calc->weight) + (chunk_freq * chunk_part)) / 
              (calc->weight + chunk_part);
          set_array_item(i, freq, background);
        }
      } else {
        // in the odd case we get to an integer number of chunks
        for (i = 0; i < a_size; i++) {
          set_array_item(i, calc->bg[i], background);
        }
      }
    }
    calc_ambigs(alph, FALSE, background);
    // free bgcalc structure
    free(calc->bg);
    free(calc->chunk_counts);
    free(calc);
    *bgcalc = NULL;
    return background;
  }
  // we have a sequence to add to the background calculation
  for (i = 0; i < seq->length; i++) {
    c = get_seq_char(i, seq);
    a_index = alph_index(alph, c);
    if (a_index == -1 || a_index >= a_size) continue;
    calc->chunk_counts[a_index]++;
    calc->chunk_seen++;
    if (calc->chunk_seen == BG_CALC_CHUNK) {
      // The loops below iterate over the alphabet with their own index j.
      // They must not touch i, which is the position in the sequence;
      // reusing it made the scan jump back to the start of the sequence
      // after every completed chunk.
      if (calc->weight == 0) {
        for (j = 0; j < a_size; j++) {
          calc->bg[j] = (double) calc->chunk_counts[j] / (double) BG_CALC_CHUNK;
        }
      } else {
        for (j = 0; j < a_size; j++) {
          chunk_freq = (double) calc->chunk_counts[j] / (double) BG_CALC_CHUNK;
          calc->bg[j] = (calc->bg[j] * calc->weight + chunk_freq) / 
              (calc->weight + 1);
        }
      }
      calc->weight++;
      // reset the counts for the next chunk
      for (j = 0; j < a_size; j++) {
        calc->chunk_counts[j] = 0;
      }
      calc->chunk_seen = 0;
      // I don't think it is feasible to reach this limit
      // but I guess I'd better check anyway
      if (calc->weight == LONG_MAX) {
        fprintf(stderr, "Sequence data set is so large that even the "
            "approximation designed for large datasets can't handle it!");
        return NULL;
      }
    }
  }
  return NULL;
}
예제 #11
0
/****************************************************************************
 * Remove from the alignment all columns that contain gaps for the
 * specified species.
 *
 * A column is dropped when the sequence named `species` has '-' or '.' in
 * it; every other column is copied, for all sequences and for the
 * consensus string. The original alignment is not modified; a new
 * alignment built with allocate_alignment() is returned.
 *
 * Calls die() if `species` is not among the alignment's species names.
 ****************************************************************************/
ALIGNMENT_T* remove_alignment_gaps
  (char*        species,
   ALIGNMENT_T* alignment)
{
  // Locate this species in the alignment.
  int species_index = get_index_in_string_list(species, 
                                               get_species_names(alignment));
  if (species_index == -1) {
    die("Can't find %s in alignment.\n", species);
  }
  SEQ_T* this_seq = get_alignment_sequence(species_index, alignment);

  // Get the dimensions of the original matrix.
  int num_sequences = get_num_aligned_sequences(alignment);
  int alignment_length = get_alignment_length(alignment);

  // Allocate memory for raw sequences that will constitute the new alignment.
  // These are zero-filled character buffers, so the element size is
  // sizeof(char) (it used to be sizeof(char*), over-allocating each buffer).
  // Zero-filling also null-terminates the shortened strings.
  char** raw_sequences = (char**)mm_malloc(sizeof(char*) * num_sequences);
  int i_seq = 0;
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    raw_sequences[i_seq] 
      = (char*)mm_calloc(alignment_length + 1, sizeof(char));
  }
  // NOTE(review): ownership of the string returned by get_consensus_string
  // is not visible here; it is only read, never freed — confirm.
  char* consensus = get_consensus_string(alignment);
  char* new_consensus 
    = (char*)mm_calloc(alignment_length + 1, sizeof(char));

  // Iterate over all columns.
  int i_column;
  int i_raw = 0;  // next free column in the gap-free output
  for (i_column = 0; i_column < alignment_length; i_column++) {

    // Is there a gap in the reference species?
    char ref_char = get_seq_char(i_column, this_seq);
    if ((ref_char != '-') && (ref_char != '.')) {

      // If no gap, then copy over this column.
      for (i_seq = 0; i_seq < num_sequences; i_seq++) {
        SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
        raw_sequences[i_seq][i_raw] = get_seq_char(i_column, this_sequence);
      }
      new_consensus[i_raw] = consensus[i_column];
      i_raw++;
    }
  }

  // Create new sequence objects.
  SEQ_T** new_sequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
    new_sequences[i_seq] = allocate_seq(get_seq_name(this_sequence),
                                        get_seq_description(this_sequence),
                                        get_seq_offset(this_sequence),
                                        raw_sequences[i_seq]);
  }

  // Allocate and return the new alignment.
  ALIGNMENT_T* new_alignment
    = allocate_alignment(get_alignment_name(alignment),
                         get_alignment_description(alignment),
                         num_sequences,
                         new_sequences,
                         new_consensus);
  
  // Free local dynamic memory.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    myfree(raw_sequences[i_seq]);
    free_seq(new_sequences[i_seq]);
  }
  myfree(raw_sequences);
  myfree(new_sequences);
  myfree(new_consensus);

  return(new_alignment);
}