/****************************************************************************
 * Parse a command-line argument as a non-negative decimal integer.
 *
 * text: the argument string to parse.
 * what: short label for the argument, used only in the error message.
 *
 * Returns the parsed value.  Prints a message to stderr and exits with
 * status 1 if the string is empty, has trailing characters, is negative,
 * or does not fit in an int.
 ****************************************************************************/
static int parse_nonnegative_int(const char* text, const char* what)
{
  char* end = NULL;
  long value = strtol(text, &end, 10);

  // The int round-trip rejects values that a long can hold but an int cannot.
  if (end == text || *end != '\0' || value < 0 || (long)(int)value != value) {
    fprintf(stderr, "Invalid %s: \"%s\" (expected a non-negative integer).\n",
            what, text);
    exit(1);
  }
  return (int)value;
}

/****************************************************************************
 * reduce-alignment: read an alignment from a file, extract a window of
 * columns from it and print the resulting smaller alignment to stdout.
 *
 * Usage: reduce-alignment <start> <width> <alignment>
 *   <start>      position of the first column to extract
 *   <width>      number of columns to extract
 *   <alignment>  name of the alignment file
 *
 * Progress messages go to stderr.  Exits with status 1 on a usage error,
 * on a malformed number, or when the requested window does not fit inside
 * the alignment; returns 0 otherwise.
 ****************************************************************************/
int main
  (int    argc,
   char * argv[])
{

  // Parse the command line.
  if (argc != 4) {
    fprintf(stderr, "Usage: reduce-alignment <start> <width> <alignment>\n");
    exit(1);
  }
  int start_position = parse_nonnegative_int(argv[1], "start");
  int width = parse_nonnegative_int(argv[2], "width");
  char* alignment_filename = argv[3];

  // Read the alignment.
  ALIGNMENT_T* big_alignment = read_alignment_from_file(
    alignment_filename,
    FALSE,
    FALSE,
    NULL // pointer to ref_seq_index, not used.
  );
  int alignment_length = get_alignment_length(big_alignment);
  fprintf(stderr, "Read alignment of %d sequences and %d columns.\n",
          get_num_aligned_sequences(big_alignment),
          alignment_length);

  // Reject windows that run past the end of the alignment.  The test is
  // written without computing start + width so that it cannot overflow.
  if (width > alignment_length || start_position > alignment_length - width) {
    fprintf(stderr, "Invalid coordinates: %d + %d > %d.\n",
            start_position, width, alignment_length);
    exit(1);
  }

  // Extract the smaller alignment.
  ALIGNMENT_T* small_alignment = extract_subalignment(start_position,
                                                      width,
                                                      big_alignment);
  fprintf(stderr, "Created alignment of %d sequences and %d columns.\n",
          get_num_aligned_sequences(small_alignment),
          get_alignment_length(small_alignment));

  // Print the alignment.
  print_clustalw(stdout, FALSE, small_alignment);

  // Free locally allocated memory.
  free_alignment(big_alignment);
  free_alignment(small_alignment);

  return(0);
}
/* Example 2 */
/****************************************************************************
 * Create a new alignment with any sequence that contains nothing but 
 * gap ('-') characters removed. Returns the new alignment.  Does not 
 * change the old alignment.
 * If there are no all-gap sequences, the returned alignment is the
 * same object as the original alignment, so callers must compare the
 * two pointers before freeing the original.
 *
 * Sequences are selected by name: the names of all sequences holding at
 * least one non-gap character are collected and handed to
 * remove_alignment_seqs().  The temporary name list is always released
 * before returning.
 ****************************************************************************/
static ALIGNMENT_T* remove_allgap_sequences(ALIGNMENT_T* alignment)
{
  // Default result: the input itself, used when nothing has to be removed.
  ALIGNMENT_T* new_alignment = alignment;
  int i_aln;
  int l_aln = get_num_aligned_sequences(alignment);
  STRING_LIST_T* keeper_seqs = new_string_list();

  // Identify the sequences that are not all-gap.
  for (i_aln = 0; i_aln < l_aln; i_aln++) {
    SEQ_T* sequence = get_alignment_sequence(i_aln, alignment);
    int i_seq;
    int l_seq = get_seq_length(sequence);
    // Add sequence to keepers as soon as a non-gap character is seen.
    for (i_seq = 0; i_seq < l_seq; i_seq++) {
      if (get_seq_char(i_seq, sequence) != '-') {
        add_string(get_seq_name(sequence), keeper_seqs);
        break;
      }
    }
  }

  // Remove any sequences not in keeper list.
  if (get_num_strings(keeper_seqs) < l_aln) {
    new_alignment = remove_alignment_seqs(keeper_seqs, alignment);
  }

  // The keeper list is scratch data on both paths; previously it was only
  // released when sequences were actually removed.
  free_string_list(keeper_seqs);

  return(new_alignment);
} // remove_allgap_sequences
/* Example 3 */
/****************************************************************************
 * Extract a small alignment out of the middle of a larger alignment.
 *
 * start:     offset of the first column to copy (added directly to the
 *            start of each raw sequence and of the consensus string).
 * width:     number of columns to copy.
 * alignment: the source alignment; it is only read.
 *
 * Returns a newly allocated alignment carrying the same name, description,
 * sequence names, sequence descriptions and sequence offsets as the
 * original, restricted to the requested columns.
 *
 * No bounds checking is done here: the caller must guarantee that
 * 0 <= start, 0 <= width and start + width <= alignment length.
 *
 * NOTE(review): the temporary sequences and the sub-consensus buffer are
 * released before returning, which assumes allocate_alignment() keeps its
 * own copies (remove_alignment_gaps() relies on the same assumption).
 ****************************************************************************/
ALIGNMENT_T* extract_subalignment
  (int start,
   int width,
   ALIGNMENT_T* alignment)
{
  int num_sequences = get_num_aligned_sequences(alignment);
  SEQ_T** sequences = get_alignment_sequences(alignment);
  SEQ_T** subsequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));

  // Extract the specified columns into a new list of sequences.
  // A single scratch buffer is reused for every sequence.
  int i_seq = 0;
  char* subsequence = mm_malloc((width + 1) * sizeof(char));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_seq = sequences[i_seq];
    char* raw_seq = get_raw_sequence(this_seq);
    strncpy(subsequence, raw_seq + start, width);
    subsequence[width] = '\0';  // strncpy does not terminate a full copy.
    subsequences[i_seq] = 
      allocate_seq(get_seq_name(this_seq),
                   get_seq_description(this_seq),
                   get_seq_offset(this_seq), 
                   subsequence);
  }

  // Extract the consensus string in the specified columns.
  char* consensus = get_consensus_string(alignment);
  char* subconsensus = mm_malloc(sizeof(char) * (width + 1));
  strncpy(subconsensus, consensus + start, width);
  subconsensus[width] = '\0';

  // Allocate the new alignment.
  ALIGNMENT_T* subalignment 
    = allocate_alignment(get_alignment_name(alignment),
                         get_alignment_description(alignment),
                         num_sequences,
                         subsequences,
                         subconsensus);

  // Free local dynamic memory.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    free_seq(subsequences[i_seq]);
  }
  myfree(subsequences);
  myfree(subsequence);
  myfree(subconsensus);  // Previously leaked.
  return(subalignment);
}
/* Example 4 */
/****************************************************************************
 * Build a string list holding the name of every sequence in the alignment,
 * in the order the sequences appear in the alignment.
 *
 * The list is created here with new_string_list() and handed back to the
 * caller.
 ****************************************************************************/
STRING_LIST_T* get_species_names(ALIGNMENT_T* an_alignment) {
  // Start from an empty list.
  STRING_LIST_T* names = new_string_list();

  // Walk the sequences and record each one's name.
  const int total = get_num_aligned_sequences(an_alignment);
  int idx = 0;
  while (idx < total) {
    SEQ_T* current = get_alignment_sequence(idx, an_alignment);
    add_string(get_seq_name(current), names);
    idx++;
  }

  return names;
}
/* Example 5 */
/****************************************************************************
 * Read an alignment from a file.  Sort the sequences by sequence name if
 * requested.  Remove all gap sequences if requested.
 *
 * filename:           name of the ClustalW alignment file to read.
 * sort:               if true, reorder the sequences by name.
 * remove_allgap_seqs: if true, drop sequences made up only of gaps.
 * ref_seq_index:      in/out index of a reference sequence.  It is only
 *                     consulted when sorting: on input it is the index
 *                     before sorting, on output the index of the same
 *                     sequence name after sorting.  May be NULL when the
 *                     caller does not track a reference sequence.
 *
 * Returns the alignment that was read, after the requested processing.
 *
 * NOTE(review): *ref_seq_index is not adjusted when all-gap sequences are
 * removed afterwards, so it may be stale in that case -- confirm with
 * callers that combine both options.
 ****************************************************************************/
ALIGNMENT_T* read_alignment_from_file
  (char *filename, 
   BOOLEAN_T sort,
   BOOLEAN_T remove_allgap_seqs,
   int* ref_seq_index
  )
{
  int i;

  // Read the sequences.
  ALIGNMENT_T* alignment = read_alignment_from_clustalw_file(filename);

  if (sort) {
    // Create a temporary array to hold sorted sequence pointers.
    int num_sequences = get_num_aligned_sequences(alignment);
    SEQ_T** sequences = (SEQ_T**) mm_malloc(num_sequences * sizeof(SEQ_T*));

    // Sort the sequences by name.
    STRING_LIST_T* alignment_species = get_species_names(alignment);

    // Store the name of the reference sequence, if the caller tracks one.
    char *ref_name = NULL;
    if (ref_seq_index != NULL) {
      ref_name = get_nth_string(*ref_seq_index, alignment_species);
    }

    sort_string_list(alignment_species); 	// keep species alphabetical
    for (i=0; i<num_sequences; i++) { 
      char *name = get_nth_string(i, alignment_species);
      sequences[i] = get_alignment_sequence_by_name(name, alignment);
    }

    // Swap the alignment's sequence array for the sorted one.
    myfree(alignment->sequences);
    alignment->sequences = sequences;

    // Find the new index of the reference sequence.
    if (ref_seq_index != NULL) {
      *ref_seq_index = get_index_in_string_list(ref_name, alignment_species);
    }

    // The name list was only needed for sorting; previously it was leaked.
    free_string_list(alignment_species);
  }

  if (remove_allgap_seqs) {
    // The helper returns the same object when nothing was removed, so only
    // free the original when a genuinely new alignment came back.
    ALIGNMENT_T* new_alignment = remove_allgap_sequences(alignment);
    if (new_alignment != alignment) {
      free_alignment(alignment);
      alignment = new_alignment;
    }
  }

  return(alignment);
} // read_alignment_from_file
/* Example 6 */
/*************************************************************************
 *  Build array containing the counts of columns in the alignment.
 *  Caller is responsible for freeing the returned array.
 *  If input parameter "counts" is NULL, allocates the array with one
 *  entry per possible column (alphabet size raised to the number of
 *  sequences).  Otherwise, the counts are added to the existing counts
 *  in the "counts" array, which is then also the return value.
 *  Ignores all columns containing gaps or ambiguity characters:
 *    [.-nNxX]
 *
 *  NOTE(review): the number of possible columns is computed with pow()
 *  and truncated to int; it will overflow for large alphabets or many
 *  sequences.  A caller-supplied "counts" array is not checked for size.
 *************************************************************************/
static ARRAY_T* build_alignment_column_counts(
  ALPH_T alph,
  ALIGNMENT_T* alignment,
  ARRAY_T* counts 
) 
{

  assert(alignment != NULL);

  int asize = alph_size(alph, ALPH_SIZE);

  // Calculate number of possible alignment columns
  // and create storage for counting occurrences.
  int num_seqs = get_num_aligned_sequences(alignment);
  int num_alignment_cols = (int) pow((double) asize, (double) num_seqs);
  if (counts == NULL) {
    counts = allocate_array(num_alignment_cols);
  }

  // Count how many examples of each column occur in the alignment.
  // Skip columns that contain gaps or ambiguity characters.
  int alignment_length = get_alignment_length(alignment);
  char* alignment_col = mm_malloc(sizeof(char) * (num_seqs + 1));
  alignment_col[num_seqs] = 0;  // Terminate so strchr() can scan the column.
  int i, h;
  for(i = 0; i < alignment_length; i++) {
    get_alignment_col(i, alignment_col, alignment);
    if (strchr(alignment_col, '-') != NULL) { continue; }
    if (strchr(alignment_col, '.') != NULL) { continue; }
    if (strchr(alignment_col, 'N') != NULL) { continue; }
    if (strchr(alignment_col, 'n') != NULL) { continue; }
    if (strchr(alignment_col, 'X') != NULL) { continue; }
    if (strchr(alignment_col, 'x') != NULL) { continue; }
    // Map the column to its slot in the counts array and bump it.
    h = hash_alignment_col(alph, alignment_col, num_seqs);
    incr_array_item(h, 1, counts);
  }

  // Release the column scratch buffer; previously it was leaked.
  myfree(alignment_col);

  return counts;
} // build_alignment_column_counts
/* Example 7 */
/****************************************************************************
 * Remove from the alignment all columns that contain gaps for the
 * specified species.
 *
 * species:   name of the sequence whose gap columns are to be dropped.
 *            Both '-' and '.' count as gap characters.
 * alignment: the source alignment; it is only read.
 *
 * Returns a newly allocated alignment with the same name, description and
 * sequences as the original, restricted to the columns in which the given
 * species has no gap.  The consensus string is filtered the same way.
 * Calls die() if the species is not found in the alignment.
 *
 * NOTE(review): the temporary sequences and consensus buffer are released
 * before returning, which assumes allocate_seq() and allocate_alignment()
 * keep their own copies -- confirm.
 ****************************************************************************/
ALIGNMENT_T* remove_alignment_gaps
  (char*        species,
   ALIGNMENT_T* alignment)
{
  // Locate this species in the alignment.  The name list is only needed
  // for the lookup, so release it straight away (it used to be leaked).
  STRING_LIST_T* species_names = get_species_names(alignment);
  int species_index = get_index_in_string_list(species, species_names);
  free_string_list(species_names);
  if (species_index == -1) {
    die("Can't find %s in alignment.\n", species);
  }
  SEQ_T* this_seq = get_alignment_sequence(species_index, alignment);

  // Get the dimensions of the original matrix.
  int num_sequences = get_num_aligned_sequences(alignment);
  int alignment_length = get_alignment_length(alignment);

  // Allocate memory for raw sequences that will constitute the new alignment.
  // Buffers are zero-filled, so the shortened strings end up terminated.
  char** raw_sequences = (char**)mm_malloc(sizeof(char*) * num_sequences);
  int i_seq = 0;
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    raw_sequences[i_seq] 
      = (char*)mm_calloc(alignment_length + 1, sizeof(char));
  }
  char* consensus = get_consensus_string(alignment);
  char* new_consensus 
    = (char*)mm_calloc(alignment_length + 1, sizeof(char));

  // Iterate over all columns.
  int i_column;
  int i_raw = 0;  // Next free column in the output buffers.
  for (i_column = 0; i_column < alignment_length; i_column++) {

    // Is there a gap in the reference species at this column?
    char ref_char = get_seq_char(i_column, this_seq);
    if ((ref_char != '-') && (ref_char != '.')) {

      // If no gap, then copy over this column.
      for (i_seq = 0; i_seq < num_sequences; i_seq++) {
        SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
        raw_sequences[i_seq][i_raw] = get_seq_char(i_column, this_sequence);
      }
      new_consensus[i_raw] = consensus[i_column];
      i_raw++;
    }
  }

  // Create new sequence objects.
  SEQ_T** new_sequences = (SEQ_T**)mm_malloc(num_sequences * sizeof(SEQ_T*));
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    SEQ_T* this_sequence = get_alignment_sequence(i_seq, alignment);
    new_sequences[i_seq] = allocate_seq(get_seq_name(this_sequence),
                                        get_seq_description(this_sequence),
                                        get_seq_offset(this_sequence),
                                        raw_sequences[i_seq]);
  }

  // Allocate the new alignment.
  ALIGNMENT_T* new_alignment
    = allocate_alignment(get_alignment_name(alignment),
                         get_alignment_description(alignment),
                         num_sequences,
                         new_sequences,
                         new_consensus);
  
  // Free local dynamic memory.
  for (i_seq = 0; i_seq < num_sequences; i_seq++) {
    myfree(raw_sequences[i_seq]);
    free_seq(new_sequences[i_seq]);
  }
  myfree(raw_sequences);
  myfree(new_sequences);
  myfree(new_consensus);

  return(new_alignment);
}