int main (int argc, char * argv[]) { // Parse the command line. if (argc != 4) { fprintf(stderr, "Usage: reduce-alignment <start> <width> <alignment>\n"); exit(1); } int start_position = atoi(argv[1]); int width = atoi(argv[2]); char* alignment_filename = argv[3]; // Read the alignment. ALIGNMENT_T* big_alignment = read_alignment_from_file( alignment_filename, FALSE, FALSE, NULL // pointer to ref_seq_index, not used. ); fprintf(stderr, "Read alignment of %d sequences and %d columns.\n", get_num_aligned_sequences(big_alignment), get_alignment_length(big_alignment)); if (start_position + width > get_alignment_length(big_alignment)) { fprintf(stderr, "Invalid coordinates: %d + %d > %d.\n", start_position, width, get_alignment_length(big_alignment)); exit(1); } // Extract the smaller alignment. ALIGNMENT_T* small_alignment = extract_subalignment(start_position, width, big_alignment); fprintf(stderr, "Created alignment of %d sequences and %d columns.\n", get_num_aligned_sequences(small_alignment), get_alignment_length(small_alignment)); // Print the alignment. print_clustalw(stdout, FALSE, small_alignment); // Free locally allocated memory. free_alignment(big_alignment); free_alignment(small_alignment); return(0); }
/****************************************************************************
 * Create a new alignment with any sequence that contains nothing but
 * gap ('-') characters removed.  Returns the new alignment.  Does not
 * change the old alignment.
 *
 * If there are no all-gap sequences, the returned alignment is the
 * same object as the original alignment, so the caller must compare the
 * two pointers before freeing either one.
 ****************************************************************************/
static ALIGNMENT_T* remove_allgap_sequences(ALIGNMENT_T* alignment)
{
  ALIGNMENT_T* new_alignment;
  int i_aln;
  int l_aln = get_num_aligned_sequences(alignment);
  STRING_LIST_T* keeper_seqs = new_string_list();

  // Identify the all-gap sequences.
  for (i_aln = 0; i_aln < l_aln; i_aln++) {
    SEQ_T* sequence = get_alignment_sequence(i_aln, alignment);
    int i_seq;
    int l_seq = get_seq_length(sequence);
    // Add sequence to keepers if it contains a non-gap.
    for (i_seq = 0; i_seq < l_seq; i_seq++) {
      if (get_seq_char(i_seq, sequence) != '-') {         // not gap?
        add_string(get_seq_name(sequence), keeper_seqs);  // non-gap: keeper
        break;
      }
    }
  }

  // Remove any sequences not in keeper list.
  if (get_num_strings(keeper_seqs) < l_aln) {
    new_alignment = remove_alignment_seqs(keeper_seqs, alignment);
  } else {
    new_alignment = alignment;
  }

  // The keeper list is a local scratch object; release it on both paths
  // (previously it was leaked when no sequence had to be removed).
  free_string_list(keeper_seqs);

  return(new_alignment);
} // remove_allgap_sequences
/****************************************************************************
 * Extract a small alignment out of the middle of a larger alignment.
 *
 *   start      offset of the first column to copy; it is added directly
 *              to the start of each raw sequence and of the consensus
 *   width      number of columns to copy
 *   alignment  the source alignment; it is not modified
 *
 * No bounds checking is done here: the caller must ensure that start and
 * width are non-negative and that start + width does not exceed the
 * alignment length.
 *
 * Returns a newly allocated alignment carrying the name and description of
 * the source alignment.
 ****************************************************************************/
ALIGNMENT_T* extract_subalignment
  (int          start,
   int          width,
   ALIGNMENT_T* alignment)
{
  int num_sequences = get_num_aligned_sequences(alignment);
  SEQ_T** sequences = get_alignment_sequences(alignment);
  SEQ_T** subsequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));

  // Extract the specified columns into a new list of sequences.
  // One scratch buffer is reused for every sequence.
  int i_seq = 0;
  char* subsequence = mm_malloc((width + 1) * sizeof(char));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_seq = sequences[i_seq];
    char* raw_seq = get_raw_sequence(this_seq);
    strncpy(subsequence, raw_seq + start, width);
    subsequence[width] = '\0';  // strncpy does not terminate a full-width copy
    subsequences[i_seq] = allocate_seq(get_seq_name(this_seq),
                                       get_seq_description(this_seq),
                                       get_seq_offset(this_seq),
                                       subsequence);
  }

  // Extract the consensus string in the specified columns.
  char* consensus = get_consensus_string(alignment);
  char* subconsensus = mm_malloc(sizeof(char) * (width + 1));
  strncpy(subconsensus, consensus + start, width);
  subconsensus[width] = '\0';

  // Allocate and return the new alignment.
  ALIGNMENT_T* subalignment =
    allocate_alignment(get_alignment_name(alignment),
                       get_alignment_description(alignment),
                       num_sequences,
                       subsequences,
                       subconsensus);

  // Free local dynamic memory.  The consensus buffer is released here just
  // as remove_alignment_gaps releases its own after allocate_alignment
  // (previously it was leaked).
  // NOTE(review): this relies on allocate_alignment copying its consensus
  // argument, as the existing frees of the sequences already assume.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    free_seq(subsequences[i_seq]);
  }
  myfree(subsequences);
  myfree(subsequence);
  myfree(subconsensus);

  return(subalignment);
}
/****************************************************************************
 * Build a list holding the name of every sequence in the alignment, in
 * alignment order.
 *
 * The list is created here with new_string_list(); the caller owns it and
 * should release it when done.
 ****************************************************************************/
STRING_LIST_T* get_species_names(ALIGNMENT_T* an_alignment)
{
  // Start from an empty list.
  STRING_LIST_T* names = new_string_list();

  // Walk the alignment, appending each sequence's name.
  int total = get_num_aligned_sequences(an_alignment);
  int idx = 0;
  while (idx < total) {
    add_string(get_seq_name(get_alignment_sequence(idx, an_alignment)), names);
    idx++;
  }

  return names;
}
/****************************************************************************
 * Read an alignment from a file.  Sort the sequences by sequence name if
 * requested.  Remove all gap sequences if requested.
 *
 *   filename            name of the CLUSTALW-format alignment file
 *   sort                if true, reorder the sequences by name
 *   remove_allgap_seqs  if true, drop sequences consisting only of gaps
 *   ref_seq_index       optional (may be NULL).  When sorting, on input it
 *                       holds the index of the reference sequence and on
 *                       output it receives that sequence's index in the
 *                       sorted order.  It is not adjusted if all-gap
 *                       sequences are subsequently removed.
 *
 * Returns the alignment; the caller is responsible for freeing it.
 ****************************************************************************/
ALIGNMENT_T* read_alignment_from_file
  (char      *filename,
   BOOLEAN_T  sort,
   BOOLEAN_T  remove_allgap_seqs,
   int*       ref_seq_index)
{
  int i;

  // Read the sequences.
  ALIGNMENT_T* alignment = read_alignment_from_clustalw_file(filename);

  if (sort) {
    // Create a temporary array to hold sorted sequence pointers.
    int num_sequences = get_num_aligned_sequences(alignment);
    SEQ_T** sequences = (SEQ_T**) mm_malloc(num_sequences * sizeof(SEQ_T*));

    // Sort the sequences by name.
    STRING_LIST_T* alignment_species = get_species_names(alignment);

    // Store the name of the reference sequence.  Callers that do not
    // track a reference sequence pass NULL, so guard the dereference.
    char *ref_name = NULL;
    if (ref_seq_index != NULL) {
      ref_name = get_nth_string(*ref_seq_index, alignment_species);
    }

    sort_string_list(alignment_species); // keep species alphabetical
    for (i = 0; i < num_sequences; i++) {
      char *name = get_nth_string(i, alignment_species);
      sequences[i] = get_alignment_sequence_by_name(name, alignment);
    }

    // Swap in the reordered pointer array; the sequences themselves are
    // the same objects, so only the old array is released.
    myfree(alignment->sequences);
    alignment->sequences = sequences;

    // Find the new index of the reference sequence.
    if (ref_seq_index != NULL) {
      *ref_seq_index = get_index_in_string_list(ref_name, alignment_species);
    }

    // The name list was only needed to drive the sort (previously leaked).
    // It is freed last because ref_name was obtained from it.
    free_string_list(alignment_species);
  }

  if (remove_allgap_seqs) {
    ALIGNMENT_T* new_alignment = remove_allgap_sequences(alignment);
    // remove_allgap_sequences returns its argument when nothing was
    // removed, so only free the original if a new object came back.
    if (new_alignment != alignment) {
      free_alignment(alignment);
      alignment = new_alignment;
    }
  }

  return(alignment);
} // read_alignment_from_file
/*************************************************************************
 * Build array containing the counts of columns in the alignment.
 *
 *   alph       alphabet used to size the table and to hash columns
 *   alignment  the alignment whose columns are counted (must not be NULL)
 *   counts     existing counts array to add to, or NULL to have a new
 *              array of size asize^num_seqs allocated here
 *
 * Ignores all columns containing gaps or ambiguity characters: [.-nNxX]
 *
 * Returns the counts array (the one passed in, or the newly allocated
 * one).  Caller is responsible for freeing the returned array.
 *************************************************************************/
static ARRAY_T* build_alignment_column_counts(
  ALPH_T alph,
  ALIGNMENT_T* alignment,
  ARRAY_T* counts
)
{
  assert(alignment != NULL);

  int asize = alph_size(alph, ALPH_SIZE);

  // Calculate number of possible alignment columns
  // and create storage for counting occurrences.
  int num_seqs = get_num_aligned_sequences(alignment);
  int num_alignment_cols = (int) pow((double) asize, (double) num_seqs);
  if (counts == NULL) {
    counts = allocate_array(num_alignment_cols);
  }

  // Count how many examples of each column occur in the alignment.
  // Skip columns that contain gaps or ambiguity characters.
  int alignment_length = get_alignment_length(alignment);
  char* alignment_col = mm_malloc(sizeof(char) * (num_seqs + 1));
  alignment_col[num_seqs] = 0;  // terminate so the column can be searched as a string
  int i, h;
  for (i = 0; i < alignment_length; i++) {
    get_alignment_col(i, alignment_col, alignment);
    // One scan for any of the excluded characters.
    if (strpbrk(alignment_col, "-.NnXx") != NULL) {
      continue;
    }
    h = hash_alignment_col(alph, alignment_col, num_seqs);
    incr_array_item(h, 1, counts);
  }

  // Release the column scratch buffer (previously leaked).
  myfree(alignment_col);

  return counts;
} // build_alignment_column_counts
/****************************************************************************
 * Remove from the alignment all columns that contain gaps for the
 * specified species.
 *
 *   species    name of the sequence whose gap columns ('-' or '.') are
 *              to be dropped
 *   alignment  the source alignment; it is not modified
 *
 * Calls die() if the species is not found in the alignment.
 *
 * Returns a newly allocated alignment with the same sequences, restricted
 * to the columns in which the given species has no gap.  The consensus
 * string is restricted to the same columns.
 ****************************************************************************/
ALIGNMENT_T* remove_alignment_gaps
  (char*        species,
   ALIGNMENT_T* alignment)
{
  // Locate this species in the alignment.  The name list is a temporary
  // built just for this lookup, so release it right away (previously it
  // was leaked).
  STRING_LIST_T* species_names = get_species_names(alignment);
  int species_index = get_index_in_string_list(species, species_names);
  free_string_list(species_names);
  if (species_index == -1) {
    die("Can't find %s in alignment.\n", species);
  }
  SEQ_T* this_seq = get_alignment_sequence(species_index, alignment);

  // Get the dimensions of the original matrix.
  int num_sequences = get_num_aligned_sequences(alignment);
  int alignment_length = get_alignment_length(alignment);

  // Allocate memory for raw sequences that will constitute the new
  // alignment.  Buffers are zero-filled and sized for the full length, so
  // the shortened strings are always terminated.  Elements are chars
  // (the original sized them as char*, over-allocating each buffer).
  char** raw_sequences = (char**)mm_malloc(sizeof(char*) * num_sequences);
  int i_seq = 0;
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    raw_sequences[i_seq] = (char*)mm_calloc(alignment_length + 1, sizeof(char));
  }
  char* consensus = get_consensus_string(alignment);
  char* new_consensus = (char*)mm_calloc(alignment_length + 1, sizeof(char));

  // Iterate over all columns.
  int i_column;
  int i_raw = 0;  // next free column in the output
  for (i_column = 0; i_column < alignment_length; i_column++) {

    // Is there a gap?
    char ref_char = get_seq_char(i_column, this_seq);
    if ((ref_char != '-') && (ref_char != '.')) {

      // If no gap, then copy over this column.
      for (i_seq = 0; i_seq < num_sequences; i_seq++) {
        SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
        raw_sequences[i_seq][i_raw] = get_seq_char(i_column, this_sequence);
      }
      new_consensus[i_raw] = consensus[i_column];
      i_raw++;
    }
  }

  // Create new sequence objects.
  SEQ_T** new_sequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
    new_sequences[i_seq] = allocate_seq(get_seq_name(this_sequence),
                                        get_seq_description(this_sequence),
                                        get_seq_offset(this_sequence),
                                        raw_sequences[i_seq]);
  }

  // Allocate and return the new alignment.
  ALIGNMENT_T* new_alignment =
    allocate_alignment(get_alignment_name(alignment),
                       get_alignment_description(alignment),
                       num_sequences,
                       new_sequences,
                       new_consensus);

  // Free local dynamic memory.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    myfree(raw_sequences[i_seq]);
    free_seq(new_sequences[i_seq]);
  }
  myfree(raw_sequences);
  myfree(new_sequences);
  myfree(new_consensus);

  return(new_alignment);
}