int main
  (int    argc,
   char * argv[])
{

  // Parse the command line.
  if (argc != 4) {
    fprintf(stderr, "Usage: reduce-alignment <start> <width> <alignment>\n");
    exit(1);
  }
  int start_position = atoi(argv[1]);
  int width = atoi(argv[2]);
  char* alignment_filename = argv[3];

  // Read the alignment.
  ALIGNMENT_T* big_alignment = read_alignment_from_file(
    alignment_filename,
		FALSE, 
    FALSE, 
    NULL // pointer to ref_seq_index, not used.
  );
  fprintf(stderr, "Read alignment of %d sequences and %d columns.\n",
	  get_num_aligned_sequences(big_alignment),
	  get_alignment_length(big_alignment));

  if (start_position + width > get_alignment_length(big_alignment)) {
    fprintf(stderr, "Invalid coordinates: %d + %d > %d.\n",
	    start_position, width, get_alignment_length(big_alignment));
    exit(1);
  }

  // Extract the smaller alignment.
  ALIGNMENT_T* small_alignment = extract_subalignment(start_position,
						      width,
						      big_alignment);
  fprintf(stderr, "Created alignment of %d sequences and %d columns.\n",
	  get_num_aligned_sequences(small_alignment),
	  get_alignment_length(small_alignment));

  // Print the alignment.
  print_clustalw(stdout, FALSE, small_alignment);

  // Free locally allocated memory.
  free_alignment(big_alignment);
  free_alignment(small_alignment);

  return(0);
}
Beispiel #2
0
/****************************************************************************
 *  Return a list containing the empirical column frequency distributions
 *  for all alignments in the input.
 *
 *  Each file in the list of filenames is read and the species list is
 *  determined.  The counts of each occurring column are tallied.  
 *  All files with the same species lists get their counts combined.
 *
 *  The returned list contains one distribution per species list that 
 *  occurs in some alignment.
 ****************************************************************************/
OBJECT_LIST_T* get_alignment_column_freqs_list
  (ALPH_T alph, 
   STRING_LIST_T* filenames,
  BOOLEAN_T remove_allgap_seqs) 
{
  int file_index;
  int num_filenames = get_num_strings(filenames);
  ARRAY_T* alignment_column_freqs = NULL;
  OBJECT_LIST_T* alignment_column_freqs_list
    = new_object_list(equal_string_lists,
		      (void*)copy_string_list,
		      free_string_list,
		      free_array);

  // Consider each alignment in turn.
  for(file_index = 0; file_index < num_filenames; file_index++) { 
    char* filename = get_nth_string(file_index, filenames);

    if (verbosity >= NORMAL_VERBOSE && !(file_index % 1)) {
      fprintf(
	stderr, 
	"Computing column freqs: alignment file number %d of %d total files.\n",
	file_index+1, num_filenames
      );
    }

    // Read the alignment
    int ref_seq_index = 0;
    ALIGNMENT_T* alignment = 
      read_alignment_from_file(filename, TRUE, remove_allgap_seqs, &ref_seq_index);
    STRING_LIST_T* alignment_species = get_species_names(alignment);

    // Try to retrieve the counts so far for this list of species.
    alignment_column_freqs = 
      (ARRAY_T*)retrieve_object(
	alignment_species, 
	alignment_column_freqs_list
      );
    
    // Found counts for current species list?
    if (alignment_column_freqs) {
      // Add counts from current alignment.
      (void) build_alignment_column_counts(alph, alignment, alignment_column_freqs);
      // Note: objects in lists are references, so no need to re-store
      // after modification.
    } 
    // Didn't find counts for this species list, so create new array of counts.
    else {
      alignment_column_freqs = build_alignment_column_counts(alph, alignment, NULL);
      store_object(
	(void*)alignment_column_freqs,
	(void*)alignment_species,
	0.0, // Score
	alignment_column_freqs_list
      );
    }
    // free space used by alignment
    free_alignment(alignment);
  } // each filename
  fprintf(stderr, "\n");

  // Convert counts to frequencies by retrieving each array of counts
  // and dividing by the total counts for that list of species.
  while ( 
    (alignment_column_freqs = retrieve_next_object(alignment_column_freqs_list)
    ) != NULL )
  {
    int i;
    int num_freqs = get_array_length(alignment_column_freqs);
    double total_counts;

    // Get total counts.
    for (i=total_counts=0; i<num_freqs; i++) {
      total_counts += get_array_item(i, alignment_column_freqs);
    }

    // Get frequencies.
    for (i=0; i<num_freqs; i++) {
      double f = get_array_item(i, alignment_column_freqs);
      set_array_item(i, f/total_counts, alignment_column_freqs);

#ifdef DEBUG
      int asize = alph_size(alph, ALPH_SIZE);
      int num_leaves = NINT(log(num_freqs)/log(asize));
      char* alignment_col = mm_malloc((num_leaves + 1) * sizeof(char));
      unhash_alignment_col(
        alph,
        i, 				//col_index
	alignment_col,
	num_leaves
      );
      printf("%s %g %g\n", alignment_col, f, f/total_counts);
      myfree(alignment_col);
#endif
    } // get frequencies
  } // while more species lists

  return(alignment_column_freqs_list);
} // get_alignment_column_freqs_list