/****************************************************************************
 * Create a new alignment with any sequence that contains nothing but
 * gap ('-') characters removed. Returns the new alignment. Does not
 * change the old alignment.
 *
 * If there are no all-gap sequences, the returned alignment is the
 * same object as the original alignment, so the caller must compare the
 * two pointers before freeing both.
 ****************************************************************************/
static ALIGNMENT_T* remove_allgap_sequences(ALIGNMENT_T* alignment)
{
  ALIGNMENT_T* new_alignment;
  int i_aln;
  int l_aln = get_num_aligned_sequences(alignment);
  STRING_LIST_T* keeper_seqs = new_string_list();

  // Collect the names of all sequences that contain at least one non-gap.
  for (i_aln = 0; i_aln < l_aln; i_aln++) {
    SEQ_T* sequence = get_alignment_sequence(i_aln, alignment);
    int i_seq;
    int l_seq = get_seq_length(sequence);

    for (i_seq = 0; i_seq < l_seq; i_seq++) {
      if (get_seq_char(i_seq, sequence) != '-') {
        // First non-gap found: the sequence is a keeper, stop scanning it.
        add_string(get_seq_name(sequence), keeper_seqs);
        break;
      }
    }
  }

  // Only build a new alignment when something actually has to be dropped.
  if (get_num_strings(keeper_seqs) < l_aln) {
    new_alignment = remove_alignment_seqs(keeper_seqs, alignment);
  } else {
    new_alignment = alignment;
  }

  // The keeper list is scratch data on both paths; previously it was only
  // released when sequences were removed and leaked otherwise.
  free_string_list(keeper_seqs);

  return(new_alignment);
} // remove_allgap_sequences
// Walk every index of txt, ask get_seq_char for the queue of strings
// associated with that index (for the given order), and record it in the
// store via add_hits with a count of 1. Writes "ok" to stderr when done.
// NOTE(review): what get_seq_char puts in the queue is not visible here.
void assimilate_text_chars(string txt, Store &s, int order){
  int pos = 0;
  // The length is re-read on every pass, exactly as the original loop did.
  while (static_cast<string::size_type>(pos) < txt.length()) {
    queue<string> window = get_seq_char(txt, pos, order);
    s.add_hits(window, 1);
    ++pos;
  }
  cerr << "ok" << endl;
}
/****************************************************************************
 * Report whether any sequence in the alignment has, at column `index`, a
 * character that alph_is_concrete() rejects for the given alphabet.
 *
 * Returns TRUE at the first such character, FALSE if every sequence passes
 * (which includes an alignment with no sequences).
 ****************************************************************************/
BOOLEAN_T alignment_site_ambiguous(ALPH_T alph, int index, ALIGNMENT_T* alignment)
{
  int row = 0;

  while (row < alignment->num_sequences) {
    char symbol = get_seq_char(index, alignment->sequences[row]);
    if (!alph_is_concrete(alph, symbol)) {
      return(TRUE);
    }
    row++;
  }
  return(FALSE);
}
/****************************************************************************
 * Copy one column of the alignment into a caller-supplied buffer as a
 * null-terminated string: one character per sequence, in sequence order,
 * followed by '\0'.
 *
 * The caller owns `alignment_col` and must make it at least
 * (number of sequences + 1) characters long. Nothing is allocated here so
 * that the buffer can be reused while iterating over all columns.
 ****************************************************************************/
void get_alignment_col(int col, char* alignment_col, ALIGNMENT_T* alignment)
{
  int row = 0;

  while (row < alignment->num_sequences) {
    alignment_col[row] = get_seq_char(col, alignment->sequences[row]);
    row++;
  }
  // row now indexes the slot just past the last character written.
  alignment_col[row] = '\0';
}
/****************************************************************************
 * Report whether column `index` of the alignment holds a gap character
 * ('-' or '.') in at least one sequence.
 *
 * Returns TRUE at the first gap found, FALSE when no sequence has a gap in
 * that column.
 ****************************************************************************/
BOOLEAN_T alignment_site_has_gaps(int index, ALIGNMENT_T* alignment)
{
  int row = 0;

  while (row < alignment->num_sequences) {
    char symbol = get_seq_char(index, alignment->sequences[row]);
    switch (symbol) {
      case '-':
      case '.':
        return(TRUE);
      default:
        break;
    }
    row++;
  }
  return(FALSE);
}
/****************************************************************************
 * Get a cumulative count of gaps within one sequence of the alignment.
 *
 * Returns a newly allocated array of alignment->length ints in which
 * element i is the number of gap characters ('-' or '.') found in
 * positions 0..i of sequence `seqIndex`. The caller owns the array.
 *
 * A zero-length alignment is handled without touching the array or the
 * sequence; the previous version always read and wrote position 0.
 ****************************************************************************/
int* get_cumulative_gap_count(int seqIndex, ALIGNMENT_T* alignment)
{
  int* results = (int *)mm_malloc( sizeof(int) * alignment->length );
  int gaps_so_far = 0;
  int i;

  for (i = 0; i < alignment->length; i++) {
    char c = get_seq_char(i, alignment->sequences[seqIndex]);
    if (c == '-' || c == '.') {
      gaps_so_far++;
    }
    results[i] = gaps_so_far;
  }

  return results;
}
/****************************************************************************
 * Build a consensus sequence for the alignment.
 *
 * For each column the most frequent character is found (ties go to the
 * character with the lowest code). If its share of the sequences is at
 * least `threshold` it becomes the consensus character for that column,
 * otherwise the column gets the gap character '-'.
 *
 * Returns a new sequence named "Consensus" created by allocate_seq().
 *
 * Counts are kept as ints in a table covering every unsigned char value.
 * The previous version used char counters (overflowing once a character
 * occurred more than CHAR_MAX times) in a 127-entry table (overrun by any
 * character code of 127 or above).
 ****************************************************************************/
SEQ_T* get_consensus_sequence(double threshold, ALIGNMENT_T* alignment)
{
  // Historical table size. Left defined and unchanged in case code later in
  // this file refers to it; it is no longer used by this function.
  #define NUM_CHARS 127
  // One counter per possible unsigned char value (8-bit chars).
  enum { CONSENSUS_TABLE_SIZE = 256 };

  char* seq_string = NULL;
  int char_counts[CONSENSUS_TABLE_SIZE];
  int most_freq_char = 0;
  int code = 0;
  int i = 0;
  int j = 0;
  double max_char_freq = 0.0;
  SEQ_T* consensus;

  assert(alignment != NULL);

  seq_string = mm_malloc(alignment->length * sizeof(char) + 1);
  if (seq_string == NULL) {
    die("Error allocating consensus sequence string\n");
  }

  // For each column in the alignment
  for (i = 0; i < alignment->length; i++) {

    memset(char_counts, 0, sizeof(char_counts));

    // Count character occurrences; go through unsigned char so that the
    // index is never negative on platforms where char is signed.
    for (j = 0; j < alignment->num_sequences; j++) {
      unsigned char c = (unsigned char) get_seq_char(i, alignment->sequences[j]);
      char_counts[c]++;
    }

    // Find the most frequent character; a strict comparison keeps the
    // lowest character code on ties.
    most_freq_char = 0;
    for (code = 1; code < CONSENSUS_TABLE_SIZE; code++) {
      if (char_counts[code] > char_counts[most_freq_char]) {
        most_freq_char = code;
      }
    }

    // If the most frequent character reaches the threshold
    // it will be the consensus character.
    max_char_freq = (double) char_counts[most_freq_char]
      / (double) alignment->num_sequences;
    if (max_char_freq >= threshold) {
      seq_string[i] = (char) most_freq_char;
    } else {
      // Otherwise the consensus is the gap character
      seq_string[i] = '-';
    }
  }
  seq_string[i] = '\0';

  consensus = allocate_seq("Consensus", "", 0, seq_string);

  // The scratch string is released here, as before; presumably
  // allocate_seq() keeps its own copy.
  myfree(seq_string);

  return(consensus);
}
/****************************************************************************
 * Return a newly allocated array holding, for each of the
 * alph_size(alph, ALPH_SIZE) alphabet positions, the fraction of counted
 * alignment characters that map to that position.
 *
 * A character is counted only when alph_index() gives it an index that is
 * not -1 and is below the alphabet size; everything else (e.g. gaps and
 * ambiguity codes) is skipped.
 *
 * NOTE(review): when no character is counted the fractions are computed
 * as 0/0; confirm that callers never pass such an alignment.
 ****************************************************************************/
ARRAY_T* get_alignment_freqs(ALPH_T alph, ALIGNMENT_T* alignment)
{
  int asize = 0;
  int total_bases = 0;
  int row = 0;
  int col = 0;
  int k = 0;
  int* num_bases = NULL;
  ARRAY_T* freqs = NULL;

  // One zeroed counter per alphabet position.
  asize = alph_size(alph, ALPH_SIZE);
  num_bases = mm_malloc(asize * sizeof(int));
  k = 0;
  while (k < asize) {
    num_bases[k] = 0;
    k++;
  }

  // Tally every countable character of every sequence.
  for (row = 0; row < alignment->num_sequences; row++) {
    for (col = 0; col < alignment->length; col++) {
      char symbol = get_seq_char(col, alignment->sequences[row]);
      int aindex = alph_index(alph, symbol);
      if (aindex == -1 || aindex >= asize) {
        continue; // not a countable alphabet position
      }
      num_bases[aindex]++;
      total_bases++;
    }
  }

  // Turn the tallies into fractions of the total.
  freqs = allocate_array(asize);
  k = 0;
  while (k < asize) {
    set_array_item(k, (double) num_bases[k] / (double) total_bases, freqs);
    k++;
  }

  // Clean up the count of characters
  myfree(num_bases);

  return freqs;
}
/****************************************************************************
 * Return an array containing the frequencies in the sequence for each
 * character of the alphabet. Gap characters ('-' and '.') and characters
 * that alph_index() does not map to a position below the alphabet size
 * are not counted.
 *
 * The returned array has alph_size(alph, ALPH_SIZE) elements and is owned
 * by the caller.
 *
 * NOTE(review): when no character is counted the fractions are computed
 * as 0/0; confirm that callers never pass such a sequence.
 ****************************************************************************/
ARRAY_T* get_sequence_freqs(SEQ_T* seq, ALPH_T alph)
{
  char c = 0;
  int a_index = 0;
  int a_size = 0;
  int i = 0;
  int total_bases = 0;
  int* num_bases = NULL;
  ARRAY_T* freqs = NULL;

  // Initialize counts for each character in the alphabet
  a_size = alph_size(alph, ALPH_SIZE);
  num_bases = mm_malloc(a_size * sizeof(int));
  for (i = 0; i < a_size; i++) {
    num_bases[i] = 0;
  }

  for (i = 0; i < seq->length; i++) {
    c = get_seq_char(i, seq);
    // Skip both gap symbols. The old test compared against '-' twice and
    // never checked '.', unlike the other gap checks in this file.
    if (c == '-' || c == '.') {
      continue;
    }
    a_index = alph_index(alph, c);
    if (a_index == -1 || a_index >= a_size) {
      continue;
    }
    num_bases[a_index]++;
    total_bases++;
  }

  freqs = allocate_array(a_size);
  for (i = 0; i < a_size; i++) {
    set_array_item(i, (double) num_bases[i] / (double) total_bases, freqs);
  }

  // Clean up the count of characters
  myfree(num_bases);

  return freqs;
}
/**************************************************************************** * Return an array containing the frequencies in the sequences for each * character of the alphabet. Characters not in the alphabet are not * counted. * * When seq is provided it returns null, otherwise it converts the accumulated * result in bgcalc into a background. * * * Pseudocode example: * ALPH_T alph = ... * BGCALC_T *bgcalc = NULL; * for each seq: * calculate_background(alph, seq, &bgcalc); * ARRAY_T *bg = calculate_background(NULL, &bgcalc); ****************************************************************************/ ARRAY_T* calculate_background( ALPH_T alph, SEQ_T* seq, BGCALC_T** bgcalc ){ BGCALC_T *calc; int a_size, i, a_index; char c; double freq, chunk_part, chunk_freq; ARRAY_T *background; assert(bgcalc != NULL); assert(seq != NULL || *bgcalc != NULL); // get the alphabet // get the alphabet size a_size = alph_size(alph, ALPH_SIZE); if (*bgcalc == NULL) { //allocate and initialize calc calc = mm_malloc(sizeof(BGCALC_T)); calc->alph = alph; calc->chunk_seen = 0; calc->weight = 0; calc->chunk_counts = mm_malloc(a_size * sizeof(long)); calc->bg = mm_malloc(a_size * sizeof(double)); for (i = 0; i < a_size; ++i) { calc->chunk_counts[i] = 0; calc->bg[i] = 0; } *bgcalc = calc; } else { calc = *bgcalc; assert(alph == calc->alph); if (calc->weight == LONG_MAX) return NULL; } if (seq == NULL) { // no sequence so calculate the final result background = allocate_array(alph_size(alph, ALL_SIZE)); if (calc->weight == 0) { if (calc->chunk_seen > 0) { // when we haven't had to approximate yet // just do a normal background calculation for (i = 0; i < a_size; i++) { freq = (double) calc->chunk_counts[i] / (double) calc->chunk_seen; set_array_item(i, freq, background); } } else { fputs("Uniform\n", stdout); // when there are no counts then return uniform freq = (double) 1 / (double) a_size; for (i = 0; i < a_size; i++) { set_array_item(i, freq, background); } } } else { if (calc->chunk_seen 
> 0) { // combine the frequencies for the existing chunks with the counts // for the partially completed chunk chunk_part = (double) calc->chunk_seen / (double) BG_CALC_CHUNK; for (i = 0; i < a_size; i++) { chunk_freq = (double) calc->chunk_counts[i] / (double) calc->chunk_seen; freq = ((calc->bg[i] * calc->weight) + (chunk_freq * chunk_part)) / (calc->weight + chunk_part); set_array_item(i, freq, background); } } else { // in the odd case we get to an integer number of chunks for (i = 0; i < a_size; i++) { set_array_item(i, calc->bg[i], background); } } } calc_ambigs(alph, FALSE, background); // free bgcalc structure free(calc->bg); free(calc->chunk_counts); free(calc); *bgcalc = NULL; return background; } // we have a sequence to add to the background calculation for (i = 0; i < seq->length; i++) { c = get_seq_char(i, seq); a_index = alph_index(alph, c); if (a_index == -1 || a_index >= a_size) continue; calc->chunk_counts[a_index]++; calc->chunk_seen++; if (calc->chunk_seen == BG_CALC_CHUNK) { if (calc->weight == 0) { for (i = 0; i < a_size; i++) { calc->bg[i] = (double) calc->chunk_counts[i] / (double) BG_CALC_CHUNK; } } else { for (i = 0; i < a_size; i++) { chunk_freq = (double) calc->chunk_counts[i] / (double) BG_CALC_CHUNK; calc->bg[i] = (calc->bg[i] * calc->weight + chunk_freq) / (calc->weight + 1); } } calc->weight++; // reset the counts for the next chunk for (i = 0; i < a_size; i++) { calc->chunk_counts[i] = 0; } calc->chunk_seen = 0; // I don't think it is feasible to reach this limit // but I guess I'd better check anyway if (calc->weight == LONG_MAX) { fprintf(stderr, "Sequence data set is so large that even the " "approximation designed for large datasets can't handle it!"); return NULL; } } } return NULL; }
/****************************************************************************
 * Remove from the alignment all columns that contain gaps for the
 * specified species.
 *
 * A column is dropped when the named species has '-' or '.' in it; every
 * other column is copied, for all sequences and for the consensus string.
 * Returns a new alignment built with allocate_alignment(); the original
 * alignment is not modified. Calls die() if the species is not found.
 ****************************************************************************/
ALIGNMENT_T* remove_alignment_gaps
  (char*        species,
   ALIGNMENT_T* alignment)
{
  // Locate this species in the alignment.
  int species_index = get_index_in_string_list(species,
                                               get_species_names(alignment));
  if (species_index == -1) {
    die("Can't find %s in alignment.\n", species);
  }
  SEQ_T* this_seq = get_alignment_sequence(species_index, alignment);

  // Get the dimensions of the original matrix.
  int num_sequences = get_num_aligned_sequences(alignment);
  int alignment_length = get_alignment_length(alignment);

  // Allocate memory for raw sequences that will constitute the new alignment.
  // These are zero-filled character buffers (one extra byte for the
  // terminator); they were previously sized with sizeof(char*), which
  // over-allocated each buffer by the size of a pointer.
  char** raw_sequences = (char**)mm_malloc(sizeof(char*) * num_sequences);
  int i_seq = 0;
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    raw_sequences[i_seq] = (char*)mm_calloc(alignment_length + 1, sizeof(char));
  }
  // NOTE(review): consensus is not freed here; presumably
  // get_consensus_string() returns storage owned by the alignment - confirm.
  char* consensus = get_consensus_string(alignment);
  char* new_consensus = (char*)mm_calloc(alignment_length + 1, sizeof(char));

  // Iterate over all columns.
  int i_column;
  int i_raw = 0;
  for (i_column = 0; i_column < alignment_length; i_column++) {

    // Is there a gap?
    char this_char = get_seq_char(i_column, this_seq);
    if ((this_char != '-') && (this_char != '.')) {

      // If no gap, then copy over this column.
      for (i_seq = 0; i_seq < num_sequences; i_seq++) {
        SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
        raw_sequences[i_seq][i_raw] = get_seq_char(i_column, this_sequence);
      }
      new_consensus[i_raw] = consensus[i_column];
      i_raw++;
    }
  }

  // Create new sequence objects.
  SEQ_T** new_sequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
    new_sequences[i_seq] = allocate_seq(get_seq_name(this_sequence),
                                        get_seq_description(this_sequence),
                                        get_seq_offset(this_sequence),
                                        raw_sequences[i_seq]);
  }

  // Allocate and return the new alignment.
  ALIGNMENT_T* new_alignment
    = allocate_alignment(get_alignment_name(alignment),
                         get_alignment_description(alignment),
                         num_sequences,
                         new_sequences,
                         new_consensus);

  // Free local dynamic memory. The sequences and consensus are released
  // after allocate_alignment(), as before; presumably it keeps its own
  // copies.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    myfree(raw_sequences[i_seq]);
    free_seq(new_sequences[i_seq]);
  }
  myfree(raw_sequences);
  myfree(new_sequences);
  myfree(new_consensus);

  return(new_alignment);
}