Пример #1
0
Файл: seq.c Проект: CPFL/gmeme
/**********************************************************************
  shuffle_sequence()

  Build a new sequence in *target whose residues are a random
  permutation of the residues of seq.

  seq     sequence to shuffle; it is only read.
  seed    value handed to my_srand() before any residue is drawn.
  target  receives the newly allocated sequence. It is expected to
          point at NULL on entry (asserted); when assertions are
          compiled out, a non-NULL value is released with free_seq().

  NOTE(review): the seed is installed with my_srand() but the draws
  come from rand() -- confirm that both use the same generator.
**********************************************************************/
void shuffle_sequence(
  SEQ_T* seq,		/* original sequence IN */
  unsigned int seed,	/* seed IN */
  SEQ_T** target	/* target sequence OUT */
){
	my_srand(seed);
	assert(*target==NULL);
	// Only reachable with a non-NULL target when assert() is disabled.
	if (*target != NULL){
		free_seq(*target);
	}

	// The target starts out as a copy of the original residues; they are
	// overwritten one by one below.
	*target = allocate_seq(get_seq_name(seq),"shuffled",get_seq_offset(seq),get_raw_sequence(seq));
	char *shuffled = get_raw_sequence(*target);

	// Working copy of the residues that have not been drawn yet.
	const int length = get_seq_length(seq);
	char *pool = (char*)mm_calloc(length+1,sizeof(char));
	strcpy(pool,get_raw_sequence(seq));
	pool[length]='\0';

	int out = 0;
	int remaining;
	for (remaining = length; remaining > 0; remaining--) {
		// Draw one of the residues still in the pool.
		int pick = rand() % remaining;
		shuffled[out] = pool[pick];
		out++;

		// Close the gap left by the drawn residue: slide everything after
		// it (terminator included) one position to the left.
		char *dst = pool + pick;
		char *src = dst + 1;
		while (*dst) {
			*dst = *src;
			dst++;
			src++;
		}
	}
	myfree(pool);
}
Пример #2
0
/**
 * Creates a motif for a given mod using a simple frequency matrix.
 *
 * A count matrix is built from mod_info->seq_list, normalized row by row
 * into frequencies, and used to allocate a motif named after
 * mod_info->mod_name. The resulting MOTIF_INFO_T records the raw sequence
 * of every entry in the list and the list size (fg_size), and is appended
 * to mod_info->motifinfos.
 *
 * summary   supplies the alphabet (summary->alph) and is forwarded to
 *           get_count_matrix().
 * options   options->eliminate_repeats selects how list items are
 *           interpreted: when set, each item is passed through
 *           hash_get_entry_value() to obtain the sequence; otherwise the
 *           item itself is used as the sequence.
 * mod_info  source of the sequences and destination of the new motif.
 *
 * Only the motif, seqs and fg_size members of the new MOTIF_INFO_T are
 * assigned here.
 */
void create_simple_motif(SUMMARY_T* summary,
                         MOMO_OPTIONS_T* options,
                         MOD_INFO_T * mod_info) {
  // Create the frequency matrix
  MATRIX_T* freqs = get_count_matrix(NULL, mod_info->seq_list, NULL, options, summary);
  normalize_rows(0.0, freqs);

  // Create the motif
  MOTIF_INFO_T* motifinfo = mm_malloc(sizeof(MOTIF_INFO_T));
  motifinfo->motif = allocate_motif(mod_info->mod_name, "", summary->alph, freqs, NULL);
  motifinfo->seqs = arraylst_create();

  // The loop only appends to motifinfo->seqs, so the size of the source
  // list is fixed and can be read once.
  const int num_seqs = arraylst_size(mod_info->seq_list);
  int i;
  for (i = 0; i < num_seqs; ++i) {
    void* item = arraylst_get(i, mod_info->seq_list);
    SEQ_T* seqobject = options->eliminate_repeats ? hash_get_entry_value(item) : item;
    arraylst_add(get_raw_sequence(seqobject), motifinfo->seqs);
  }
  motifinfo->fg_size = num_seqs;
  arraylst_add(motifinfo, mod_info->motifinfos);

  // clean up; the matrix is released here after allocate_motif() has used it
  free_matrix(freqs);
}
Пример #3
0
/****************************************************************************
 * Create a lookup table for converting an index into a sequence to an index
 * into the alignment. Note that because there are many alignment positions
 * that correspond to a sequence position we take the first occurrence.
 * JCH: I have added this function for the sake of the BLS scan mode
 * so that single mode matches in each sequence can be mapped back
 * to positions in the alignment.
 *
 * ref_seq_index  index of the sequence within the alignment.
 * an_alignment   alignment containing that sequence.
 *
 * Returns a table allocated with mm_malloc() holding align_length + 1
 * entries. table[0] is 0 and, for k >= 1, table[k] is the alignment column
 * of the k-th residue that is neither '-' nor '.'. Entries beyond the
 * number of such residues are left uninitialized. The caller receives the
 * only reference to the table.
 ****************************************************************************/
int* make_seq_to_alignment_table(int ref_seq_index, ALIGNMENT_T* an_alignment) {

  char* raw_seq = NULL;
  int align_length = 0;
  int align_index = 0;
  int seq_index = 0;
  int *table = NULL;
  SEQ_T* seq = NULL;

  align_length = get_alignment_length(an_alignment);
  seq = get_alignment_sequence(ref_seq_index, an_alignment);
  raw_seq = get_raw_sequence(seq);

  // Table is indexed by position in the sequence
  // Table values are the first corresponding
  // position in the alignment.
  // Slot 0 is written unconditionally and residues occupy slots 1..n, so a
  // gap-free sequence writes slot align_length: the table needs
  // align_length + 1 entries, not align_length.
  table = (int *) mm_malloc(((size_t) align_length + 1) * sizeof(int));
  seq_index = 0;
  table[seq_index] = 0;

  for (align_index = 0; align_index < align_length; align_index++) {
    if (raw_seq[align_index] != '-' && raw_seq[align_index] != '.') {
      seq_index++;
      table[seq_index] = align_index;
    }
  }

  return table;

}
Пример #4
0
/*
 * Write an alignment to outfile in blocks of OUTPUT_WIDTH columns.
 *
 * The first line holds the number of sequences and the alignment length.
 * The first block prints, for every sequence, its name truncated/padded
 * to 10 characters followed by the first OUTPUT_WIDTH columns. Every
 * later block prints the next OUTPUT_WIDTH columns of each sequence
 * behind an 11-space indent. Each block is followed by a blank line.
 *
 * the_alignment  alignment to print (num_sequences, length and sequences
 *                are read directly from the structure).
 * outfile        destination stream.
 */
void print_phylip_alignment
  (ALIGNMENT_T* the_alignment,
   FILE* outfile)
{
  char chunk[OUTPUT_WIDTH+1];
  int row;
  int column;

  fprintf(outfile, "%d %d\n", the_alignment->num_sequences, 
	  the_alignment->length);

  /* First block: identifier plus the leading columns of every sequence. */
  row = 0;
  while (row < the_alignment->num_sequences) {
    SEQ_T* current = the_alignment->sequences[row];

    /* Identifier, cut to at most 10 characters. */
    strncpy(chunk, get_seq_name(current), 10);
    chunk[10] = '\0';
    fprintf(outfile, "%-10s ", chunk);

    /* Leading columns of the sequence. */
    char* residues = get_raw_sequence(current);
    strncpy(chunk, residues, OUTPUT_WIDTH);
    chunk[OUTPUT_WIDTH] = '\0';
    fprintf(outfile, "%s\n", chunk);

    row++;
  }

  /* Blank line closing the first block. */
  fprintf(outfile, "\n");

  /* Remaining blocks, OUTPUT_WIDTH columns at a time. */
  column = OUTPUT_WIDTH;
  while (column < the_alignment->length) {

    for (row = 0; row < the_alignment->num_sequences; row++) {
      char* residues = get_raw_sequence(the_alignment->sequences[row]);
      strncpy(chunk, residues + column, OUTPUT_WIDTH);
      chunk[OUTPUT_WIDTH] = '\0';
      fprintf(outfile, "           %s\n", chunk);
    }

    /* Blank line closing this block. */
    fprintf(outfile, "\n");

    column += OUTPUT_WIDTH;
  }
}
Пример #5
0
Файл: seq.c Проект: CPFL/gmeme
/*
 * Return a newly allocated, NUL-terminated copy of the residues of
 * a_sequence from index start to index stop, both inclusive.
 *
 * start, stop  bounds of the range; stop must not be smaller than start
 *              (asserted). Neither bound is checked against the length
 *              of the sequence.
 * a_sequence   sequence to copy from; must not be NULL (asserted).
 *
 * The result is allocated with mm_malloc() and handed to the caller.
 */
char* get_raw_subsequence
  (int start, int stop, SEQ_T* a_sequence)
{
  assert(a_sequence != NULL);
  assert((stop - start) >= 0);

  const int span = stop - start + 1;  /* number of residues requested */
  char *first = get_raw_sequence(a_sequence) + start;
  char *copy = mm_malloc((span + 1) * sizeof(char));

  /* strncpy stops early at a NUL and pads the rest with zeros. */
  strncpy(copy, first, span);
  copy[span] = 0;
  return copy;
}
Пример #6
0
/****************************************************************************
 * Extract a small alignment out of the middle of a larger alignment.
 *
 * start      first column to keep.
 * width      number of columns to keep.
 * alignment  alignment to cut from; it is only read.
 *
 * Every sequence and the consensus string are cut to the columns
 * [start, start + width). The new alignment keeps the name and description
 * of the original and is returned to the caller.
 *
 * The temporary sequences are released before returning, which presumes
 * that allocate_alignment() keeps its own copies -- TODO confirm.
 * NOTE(review): subconsensus is never released in this function; confirm
 * that allocate_alignment() takes ownership of it, otherwise it leaks.
 ****************************************************************************/
ALIGNMENT_T* extract_subalignment
  (int start,
   int width,
   ALIGNMENT_T* alignment)
{
  const int n_seqs = get_num_aligned_sequences(alignment);
  SEQ_T** full_seqs = get_alignment_sequences(alignment);
  SEQ_T** cut_seqs = (SEQ_T**)mm_malloc(n_seqs * sizeof(SEQ_T*));

  // Cut the requested columns out of every sequence. One scratch buffer is
  // reused for all of them.
  char* columns = mm_malloc((width + 1) * sizeof(char));
  int idx;
  for (idx = 0; idx < n_seqs; idx++) {
    SEQ_T* full = full_seqs[idx];
    char* residues = get_raw_sequence(full);
    strncpy(columns, residues + start, width);
    columns[width] = '\0';
    cut_seqs[idx] = allocate_seq(get_seq_name(full),
                                 get_seq_description(full),
                                 get_seq_offset(full),
                                 columns);
  }

  // Cut the same columns out of the consensus string.
  char* consensus = get_consensus_string(alignment);
  char* subconsensus = mm_malloc(sizeof(char) * (width + 1));
  strncpy(subconsensus, consensus + start, width);
  subconsensus[width] = '\0';

  // Assemble the new alignment.
  ALIGNMENT_T* subalignment =
    allocate_alignment(get_alignment_name(alignment),
                       get_alignment_description(alignment),
                       n_seqs,
                       cut_seqs,
                       subconsensus);

  // Release the temporaries.
  for (idx = 0; idx < n_seqs; idx++) {
    free_seq(cut_seqs[idx]);
  }
  myfree(cut_seqs);
  myfree(columns);
  return subalignment;
}
Пример #7
0
/*
 * Build the log cumulative background array for one sequence, using a
 * Markov model of the given order derived from that same sequence.
 *
 * alph        alphabet of the sequence.
 * sdbg_order  order of the Markov model; a negative value is fatal.
 * sequence    sequence the model is derived from and scored against.
 *
 * Returns an array of get_seq_length(sequence) + 1 doubles allocated with
 * mm_malloc() and filled by calculate_log_cumulative_background(); the
 * caller receives the only reference to it.
 */
static double * log_cumulative_background(ALPH_T *alph, const int sdbg_order, SEQ_T *sequence) {
  if (sdbg_order < 0) {
    die("No such thing as a negative background order");
  }

  double *logcumback = mm_malloc(sizeof(double) * (get_seq_length(sequence)+1));
  const char *raw_seq = get_raw_sequence(sequence);

  // Background model: the first call is given the sequence, the second call
  // (no sequence) yields the model array -- presumably it finalizes the
  // accumulated state held in calc; verify against calculate_markov_model().
  BGCALC_T *calc = NULL;
  calculate_markov_model(alph, sdbg_order, 1.0, false, raw_seq, &calc);
  ARRAY_T *cp = calculate_markov_model(alph, sdbg_order, 1.0, false, NULL, &calc);

  // Add x-tuples to the model.
  extend_markov_model(alph, true, SUM_FREQS, cp);

  // Walk the array one prefix at a time: normalize the core letters of the
  // prefix and set the wildcard entry of the prefix to 1.0.
  int offset = 0;
  while (offset < get_array_length(cp)) {
    normalize_subarray(offset, alph_size_core(alph), 0, cp);
    set_array_item(offset + alph_wild(alph), 1.0, cp);
    offset += alph_size_wild(alph);
  }

  calculate_log_cumulative_background(alph, true, sdbg_order, cp, raw_seq, logcumback);
  free_array(cp);
  return logcumback;
}
Пример #8
0
/**
 * Deactivate every active sequence that does not have `letter` at position
 * `pos`, then recompute the count matrix for the list.
 *
 * letter        residue an active sequence must carry at `pos` to stay active.
 * pos           index into the raw sequence that is compared.
 * seqs          list of sequences; when options->eliminate_repeats is set the
 *               items are hash table entries whose value is the sequence,
 *               otherwise the items are the sequences themselves.
 * status_array  *status_array holds one status per list item; ACTIVE entries
 *               that fail the test are set to INACTIVE.
 * num_active    decremented once for every sequence that is deactivated.
 * count         matrix handed to get_count_matrix() for recomputation.
 * summary       forwarded to get_count_matrix().
 * options       supplies eliminate_repeats; forwarded to get_count_matrix().
 *
 * NOTE(review): `count` is passed by value, so the pointer returned by
 * get_count_matrix() is not visible to the caller. This is only correct if
 * get_count_matrix() updates the given matrix in place -- confirm.
 */
void remove_sequences_and_update_matrix(char letter,
                                        int pos,
                                        ARRAYLST_T* seqs,
                                        MOTIFX_STATUS_T** status_array,
                                        int* num_active,
                                        MATRIX_T* count,
                                        SUMMARY_T* summary,
                                        MOMO_OPTIONS_T* options) {

  int i;

  // Look through the sequences and deactivate the ones that do not match.
  for (i = 0; i < arraylst_size(seqs); ++i) {
    void* item = arraylst_get(i, seqs);
    SEQ_T* seq = (SEQ_T*) (options->eliminate_repeats
                           ? hash_get_entry_value((HASH_TABLE_ENTRY*) item)
                           : item);
    char* curr_seq = get_raw_sequence(seq);

    // For anything active that does not match the pattern, turn it inactive.
    MOTIFX_STATUS_T status = (*status_array)[i];
    if (status == ACTIVE && curr_seq[pos] != letter) {
      *num_active = *num_active - 1;
      (*status_array)[i] = INACTIVE;
    }
  }
  count = get_count_matrix(count, seqs, status_array, options, summary);
}
Пример #9
0
/*************************************************************************
 * Calculate the odds score for each motif-sized window at each
 * site in the sequence using the given nucleotide frequencies.
 *
 * This function is a lightweight version based on the one contained in
 * motiph-scoring. Several calculations that are unnecessary for gomo
 * have been removed in order to speed up the process
 *
 * Return value, by method:
 *   AVG_ODDS    sum of the window odds divided by (max_index + 1)
 *   MAX_ODDS    largest window odds, or 0.0 if none exceeds 0.0
 *   SUM_ODDS    sum of the window odds
 *   TOTAL_HITS  number of windows whose looked-up p-value is below
 *               threshold
 *   otherwise   0.0
 * where max_index = sequence length - motif length (clamped at 0) and
 * the windows scored are those starting at 0 .. max_index - 1.
 *
 * The bg_freqs parameter is not referenced in the body, and window_seq
 * is filled but never read.
 *
 * NOTE(review): the scan loop stops before the window starting at
 * max_index, while AVG_ODDS divides by max_index + 1. Confirm whether
 * the last window is meant to be skipped.
 *************************************************************************/
static double score_sequence(
    SEQ_T *seq,         // sequence to scan (IN)
    MOTIF_T *motif,     // motif already converted to odds values (IN)
    PSSM_T *m_pssm,     // motif pssm (IN)
    MATRIX_T *m_odds,   // motif odds (IN)
    int method,         // method used for scoring (IN)
    double threshold,   // Threshold to use in TOTAL_HITS mode with a PWM
    ARRAY_T *bg_freqs   // background model (unused in this function)
    )
{

  assert(seq != NULL);
  assert(motif != NULL);
  // TOTAL_HITS needs the PSSM; every other method needs the odds matrix.
  assert((method == TOTAL_HITS && m_pssm) || (method != TOTAL_HITS && m_odds));

  char* raw_seq = get_raw_sequence(seq);
  int seq_length = get_seq_length(seq);

  // Get the pv lookup table
  ARRAY_T* pv_lookup = NULL;
  if (NULL != m_pssm) {
    pv_lookup = m_pssm->pv;
    assert(get_array_length(pv_lookup) > 0);
  }

  // Prepare storage for the string representing the portion
  // of the reference sequence within the window.
  char* window_seq = (char *) mm_malloc(sizeof(char) * (get_motif_length(motif) + 1));
  window_seq[get_motif_length(motif)] = '\0';

  // Number of windows scored below; 0 when the motif is longer than the
  // sequence, in which case both arrays are requested with size 0.
  int max_index = seq_length - get_motif_length(motif);
  if (max_index < 0) max_index = 0;
  const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  double* odds =  (double*) mm_malloc(sizeof(double)*max_index);
  double* scaled_log_odds =  (double*) mm_malloc(sizeof(double)*max_index);

  // For each site in the sequence
  int seq_index;
  for (seq_index = 0; seq_index < max_index; seq_index++) {
    double odd = 1.0;
    scaled_log_odds[seq_index] = 0;

    // For each site in the motif window
    int motif_position;
    for (motif_position = 0; motif_position < get_motif_length(motif); motif_position++) {
      char c = raw_seq[seq_index + motif_position];
      window_seq[motif_position] = c;

      // Check for gaps at this site.
      // Breaking out keeps whatever was accumulated for the window so far:
      // the partial product in odd, or the partial sum in scaled_log_odds.
      if(c == '-' || c == '.') {
        break;
      }

      // Check for ambiguity codes at this site
      //TODO: This next call is very expensive - it takes up approx. 10% of a
      //      programme's running time. It should be fixed up somehow.
      // NOTE(review): an index equal to asize passes this test and is used
      // as a matrix column below -- confirm that '>' rather than '>=' is
      // intended.
      int aindex = alph_index(get_motif_alph(motif), c);
      if (aindex > asize) {
        break;
      }
      if (method == TOTAL_HITS) {
        //If we're in this mode, then we're using LOG ODDS.
        //scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, get_motif_freqs(motif));
        scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, m_pssm->matrix);
      } else {
        odd *= get_matrix_cell(motif_position, aindex, m_odds);
      }
    }
    odds[seq_index] = odd;
  }

  // return odds as requested (MAX or AVG scoring)
  double requested_odds = 0.0;
  if (method == AVG_ODDS){
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      requested_odds += odds[seq_index];
    }
    requested_odds /= max_index + 1;		// divisor is at least 1 because max_index >= 0, so this never divides by zero
  } else if (method == MAX_ODDS){
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      if (odds[seq_index] > requested_odds){
        requested_odds = odds[seq_index];
      }
    }
  } else if (method == SUM_ODDS) {
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      requested_odds += odds[seq_index];
    }
  } else if (method == TOTAL_HITS) {
    for (seq_index = 0; seq_index < max_index; seq_index++) {

      // Clamp the score to the last entry of the lookup table, then use its
      // integer part as the table index.
      // NOTE(review): only the upper end is clamped; a negative score would
      // index before the start of the table -- confirm scores are >= 0.
      if (scaled_log_odds[seq_index] >= (double)get_array_length(pv_lookup)) {
        scaled_log_odds[seq_index] = (double)(get_array_length(pv_lookup) - 1);
      } 
      double pvalue = get_array_item((int) scaled_log_odds[seq_index], pv_lookup);

      //Figure out how to calculate the p-value of a hit
      //fprintf(stderr, "m: %s pv_l len: %i scaled_log_odds: %g seq index: %i pvalue: %g\n", 
      //    get_motif_id(motif), get_array_length(pv_lookup), scaled_log_odds[seq_index], seq_index, pvalue);

      if (pvalue < threshold) {
        requested_odds++; //Add another hit.
      }

      // Per-window trace, only at the highest verbosity levels.
      if (verbosity > HIGHER_VERBOSE) {
        fprintf(stderr, "Window Data: %s\t%s\t%i\t%g\t%g\t%g\n",
            get_seq_name(seq), get_motif_id(motif), seq_index, scaled_log_odds[seq_index], pvalue, threshold);
      }
    }
  }

  myfree(odds);
  myfree(scaled_log_odds);
  myfree(window_seq);
  return requested_odds;
}
Пример #10
0
/**
 * Recursive function. Creates and stores a motif using the motif-x
 * algorithm until no more are left.
 *
 * Each call starts from an all-'X' pattern of options->width characters and
 * lets add_to_pattern() refine it. If at least one position ends up
 * different from 'X', a motif is built from the returned count matrix,
 * appended to mod_info->motifinfos together with the raw sequences that are
 * still ACTIVE, the status arrays and active counts are updated, the count
 * matrices are recomputed, and the function calls itself again. Otherwise
 * it returns without storing anything.
 *
 * phospho_seqs, bg_seqs        foreground and background sequence lists.
 * phospho_status, bg_status    per-sequence status arrays for the two lists.
 * phospho_count, bg_count      count matrices for the two lists.
 * num_active, num_bg_active    numbers of active sequences; reduced by the
 *                              number of sequences matched by each motif.
 * modname                      modification name, used in the motif name.
 * mod_info                     receives the motifs that are found.
 * options                      supplies width and eliminate_repeats.
 * summary                      supplies the alphabet and its letters.
 *
 * Every temporary buffer is released exactly once, before the recursive
 * call is made.
 *
 * NOTE(review): result_count_matrix is not released here; confirm who owns
 * the matrix returned by add_to_pattern().
 */
void create_motifx_motif(ARRAYLST_T* phospho_seqs,
                         ARRAYLST_T* bg_seqs,
                         MOTIFX_STATUS_T** phospho_status,
                         MOTIFX_STATUS_T** bg_status,
                         MATRIX_T* phospho_count,
                         MATRIX_T* bg_count,
                         int* num_active,
                         int* num_bg_active,
                         char* modname,
                         MOD_INFO_T* mod_info,
                         MOMO_OPTIONS_T* options,
                         SUMMARY_T* summary) {
  int i;
  int j;

  const char* alph_letters = summary->alph_letters;

  // Initialize pattern, sequence count, bg sequence count, and overall score for this motif.
  char* pattern = mm_malloc(options->width + 1);
  for (i = 0; i < options->width; ++i) {
    pattern[i] = 'X';
  }
  pattern[options->width] = '\0';
  // Working copies that add_to_pattern() updates through the pointers below.
  int num_active_copy = *num_active;
  int num_bg_active_copy = *num_bg_active;
  double motif_score = 0;

  // Set the pattern, num active copy, num bg active copy, motif score, and get a count of the sequences
  MATRIX_T* result_count_matrix = add_to_pattern(pattern, phospho_seqs, bg_seqs, phospho_status, bg_status, &num_active_copy, &num_bg_active_copy, phospho_count, bg_count, &motif_score, summary, options);

  // If any of the characters are not X, then we have found a pattern
  BOOLEAN_T found_pattern = FALSE;
  for (i = 0; i < options->width; ++i) {
    if (pattern[i] != 'X') {
      found_pattern = TRUE;
      break;
    }
  }

  // If there is a pattern, store it and update the bookkeeping.
  if (found_pattern) {
    // fill out the rest of the pattern (e.g. if you have pattern ..ASAAA, and realize the actual pattern is A.ASAAA
    const int num_letters = (int) strlen(alph_letters);
    for (i = 0; i < options->width; i++) {
      for (j = 0; j < num_letters; j++) {
        if ((int) get_matrix_cell_defcheck(i, j, result_count_matrix) == num_active_copy) {
          pattern[i] = alph_letters[j];
        }
      }
    }

    // create the pattern name: <left half>_<modname>_<right half>, where the
    // character between the two halves of the pattern is left out
    const size_t half = strlen(pattern) / 2;
    char* pattern_name = mm_malloc(strlen(pattern) + strlen(modname) + 3);
    pattern_name[0] = '\0';
    strncat(pattern_name, pattern, half);
    strncat(pattern_name, "_", 1);
    strncat(pattern_name, modname, strlen(modname));
    strncat(pattern_name, "_", 1);
    strncat(pattern_name, pattern + half + 1, half);

    // convert this count matrix into frequencies
    normalize_rows(0.0, result_count_matrix);

    // Store this motif
    MOTIF_INFO_T* motifinfo = mm_malloc(sizeof(MOTIF_INFO_T));
    MOTIF_T* motif = allocate_motif(pattern_name, "", summary->alph, result_count_matrix, NULL);
    set_motif_nsites(motif, num_active_copy);
    motifinfo->motif = motif;
    motifinfo->seqs = arraylst_create();
    motifinfo->score = motif_score;
    motifinfo->fg_match = num_active_copy;
    motifinfo->fg_size = *num_active;
    motifinfo->bg_match = num_bg_active_copy;
    motifinfo->bg_size = *num_bg_active;
    for (i = 0; i < arraylst_size(phospho_seqs); ++i) {
      MOTIFX_STATUS_T status = (*phospho_status)[i];
      if (status == ACTIVE) {
        SEQ_T* active_sequence = (options->eliminate_repeats) ? hash_get_entry_value(arraylst_get(i, phospho_seqs)) : arraylst_get(i, phospho_seqs);
        arraylst_add(get_raw_sequence(active_sequence), motifinfo->seqs);
      }
    }
    arraylst_add(motifinfo, mod_info->motifinfos);

    // delete the sequences from this motif. turn inactive into active.
    delete_sequences(phospho_status, arraylst_size(phospho_seqs));
    delete_sequences(bg_status, arraylst_size(bg_seqs));

    // update the count of number of actives
    *num_active = *num_active - num_active_copy;
    *num_bg_active = *num_bg_active - num_bg_active_copy;

    // recalculate phospho count and bg count.
    phospho_count = get_count_matrix(phospho_count, phospho_seqs, phospho_status, options, summary);
    bg_count = get_count_matrix(bg_count, bg_seqs, bg_status, options, summary);

    myfree(pattern_name);
  }

  // free up space: done once, on both paths, before recursing
  myfree(pattern);

  // try to create another motif.
  if (found_pattern) {
    create_motifx_motif(phospho_seqs,
                        bg_seqs,
                        phospho_status,
                        bg_status,
                        phospho_count,
                        bg_count,
                        num_active,
                        num_bg_active,
                        modname,
                        mod_info,
                        options,
                        summary);
  }
}
Пример #11
0
/*************************************************************************
 * Entry point for ama
 *************************************************************************/
int main(int argc, char *argv[]) {
  int max_seq_length = MAX_SEQ;
  STRING_LIST_T* selected_motifs = NULL;
  double pseudocount = 0.01;
  int output_format = CISML_FORMAT;
  program_name = "ama";
  int scoring = AVG_ODDS;
  BOOLEAN_T pvalues = FALSE;
  BOOLEAN_T normalize_scores = FALSE;
  BOOLEAN_T combine_duplicates = FALSE;
  int num_gc_bins = 1;
  int sdbg_order = -1;				// don't use sequence background
  BOOLEAN_T scan_both_strands = TRUE;
  ARRAY_T* pos_bg_freqs = NULL;
  ARRAY_T* rev_bg_freqs = NULL;
  clock_t c0, c1; /* measuring cpu_time */
  CISML_T *cisml;
  char * out_dir = NULL;
  BOOLEAN_T clobber = FALSE;
  int i;
  int last = 0;
  ALPH_T alph = INVALID_ALPH;

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/

  const int num_options = 16;
  cmdoption const motif_scan_options[] = {
    { "max-seq-length", REQUIRED_VALUE },
    { "motif", REQUIRED_VALUE },
    { "motif-pseudo", REQUIRED_VALUE },
    { "rma", NO_VALUE },
    { "pvalues", NO_VALUE },
    { "sdbg", REQUIRED_VALUE },
    { "norc", NO_VALUE },
    { "cs", NO_VALUE },
    { "o-format", REQUIRED_VALUE },
    { "o", REQUIRED_VALUE },
    { "oc", REQUIRED_VALUE },
    { "scoring", REQUIRED_VALUE },
    { "verbosity", REQUIRED_VALUE },
    { "gcbins", REQUIRED_VALUE },
    { "last", REQUIRED_VALUE },
    { "version", NO_VALUE }
  };

  int option_index = 0;

  // Define the usage message.
  char usage[] = "USAGE: ama [options] <motif file> <sequence file> [<background file>]\n"
    "\n"
    "   Options:\n"
    "     --sdbg <order>\t\t\tUse Markov background model of\n"
    "       \t\t\t\t\torder <order> derived from the sequence\n"
    "       \t\t\t\t\tto compute its likelihood ratios.\n"
    "       \t\t\t\t\tOverrides --pvalues, --gcbins and --rma;\n"
    "       \t\t\t\t\t<background file> is required unless\n"
    "       \t\t\t\t\t--sdbg is given.\n"
    "     --motif <id>\t\t\tUse only the motif identified by <id>.\n"
    "       \t\t\t\t\tThis option may be repeated.\n"
    "     --motif-pseudo <float>\t\tThe value <float> times the background\n"
    "       \t\t\t\t\tfrequency is added to the count of each\n"
    "       \t\t\t\t\tletter when creating the likelihood \n"
    "       \t\t\t\t\tratio matrix (default: %g).\n"
    "     --norc\t\t\t\tDisables the scanning of the reverse\n"
    "       \t\t\t\t\tcomplement strand.\n"
    "     --scoring [avg-odds|max-odds]\tIndicates whether the average or \n"
    "       \t\t\t\t\tthe maximum odds should be calculated\n"
    "       \t\t\t\t\t(default: avg-odds)\n"
    "     --rma\t\t\t\tScale motif scores to the range 0-1.\n"
    "       \t\t\t\t\t(Relative Motif Affinity).\n"
    "       \t\t\t\t\tMotif scores are scaled by the maximum\n"
    "       \t\t\t\t\tscore achievable by that PWM. (default:\n"
    "       \t\t\t\t\tmotif scores are not normalized)\n"
    "     --pvalues\t\t\t\tPrint p-value of avg-odds score in cisml\n"
    "       \t\t\t\t\toutput. Ignored for max-odds scoring.\n"
    "       \t\t\t\t\t(default: p-values are not printed)\n"
    "     --gcbins <bins>\t\t\tCompensate p-values for GC content of\n"
    "       \t\t\t\t\teach sequence using given number of \n"
    "       \t\t\t\t\tGC range bins. Recommended bins: 41.\n"
    "       \t\t\t\t\t(default: p-values are based on\n"
    "       \t\t\t\t\tfrequencies in background file)\n"
    "     --cs\t\t\t\tEnable combining sequences with same\n"
    "       \t\t\t\t\tidentifier by taking the average score\n"
    "       \t\t\t\t\tand the Sidac corrected p-value.\n"
    "     --o-format [gff|cisml]\t\tOutput file format (default: cisml)\n"
    "       \t\t\t\t\tignored if --o or --oc option used\n"
    "     --o <directory>\t\t\tOutput all available formats to\n"
    "       \t\t\t\t\t<directory>; give up if <directory>\n"
    "       \t\t\t\t\texists\n"
    "     --oc <directory>\t\t\tOutput all available formats to\n"
    "       \t\t\t\t\t<directory>; if <directory> exists\n"
    "       \t\t\t\t\toverwrite contents\n"
    "     --verbosity [1|2|3|4]\t\tControls amount of screen output\n"
    "       \t\t\t\t\t(default: %d)\n"
    "     --max-seq-length <int>\t\tSet the maximum length allowed for \n"
    "       \t\t\t\t\tinput sequences. (default: %d)\n"
    "     --last <int>\t\t\tUse only scores of (up to) last <n>\n"
    "       \t\t\t\t\tsequence positions to compute AMA.\n"
    "     --version   \t\t\tPrint version and exit.\n"
    "\n";

  // Parse the command line.
  if (simple_setopt(argc, argv, num_options, motif_scan_options) != NO_ERROR) {
    die("Error processing command line options: option name too long.\n");
  }
    
    BOOLEAN_T setoutputformat = FALSE;
    BOOLEAN_T setoutputdirectory = FALSE;

  while (TRUE) {
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      (void) simple_getopterror(&message);
      die("Error processing command line options (%s).\n", message);
    } else if (strcmp(option_name, "max-seq-length") == 0) {
	max_seq_length = atoi(option_value);
    } else if (strcmp(option_name, "norc") == 0) {
	scan_both_strands = FALSE;
    } else if (strcmp(option_name, "cs") == 0) {
		combine_duplicates = TRUE;
    } else if (strcmp(option_name, "motif") == 0) {
	if (selected_motifs == NULL) {
	  selected_motifs = new_string_list();
	}
	add_string(option_value, selected_motifs);
    } else if (strcmp(option_name, "motif-pseudo") == 0) {
	pseudocount = atof(option_value);
    } else if (strcmp(option_name, "o-format") == 0) {
        if (setoutputdirectory) {
            if (verbosity >= NORMAL_VERBOSE)
                fprintf(stderr, "output directory specified, ignoring --o-format\n");
        } else {
            setoutputformat = TRUE;
            if (strcmp(option_value, "gff") == 0)
                output_format = GFF_FORMAT;
            else if (strcmp(option_value, "cisml") == 0)
                output_format = CISML_FORMAT;
            else {
                if (verbosity >= NORMAL_VERBOSE)
                  fprintf(stderr, "Output format not known. Using standard instead (cisML).\n");
                  output_format = CISML_FORMAT;
            }
        }
    } else if (strcmp(option_name, "o") == 0 || strcmp(option_name, "oc") == 0) {
        setoutputdirectory = TRUE;
        if (setoutputformat) {
            if (verbosity >= NORMAL_VERBOSE)
                fprintf(stderr, "output directory specified, ignoring --o-format\n");
        }
        clobber = strcmp(option_name, "oc") == 0;
        out_dir = (char*) malloc (sizeof(char)*(strlen(option_value)+1));
        strcpy(out_dir, option_value);
        output_format = DIRECTORY_FORMAT;
    } else if (strcmp(option_name, "verbosity") == 0) {
	verbosity = atoi(option_value);
    } else if (strcmp(option_name, "scoring") == 0) {
      if (strcmp(option_value, "max-odds") == 0)
	scoring = MAX_ODDS;
      else if (strcmp(option_value, "avg-odds") == 0)
	scoring = AVG_ODDS;
      else if (strcmp(option_value, "sum-odds") == 0)
	scoring = SUM_ODDS;
	  else
	die("Specified scoring scheme not known.\n", message);
    } else if (strcmp(option_name, "pvalues") == 0) {
      pvalues = TRUE;
    } else if (strcmp(option_name, "rma") == 0) {
      normalize_scores = TRUE;
      fprintf(stderr, "Normalizing motif scores using RMA method.\n");
    } else if (strcmp(option_name, "gcbins") == 0) {
      num_gc_bins = atoi(option_value);
      pvalues = TRUE;
      if (num_gc_bins <= 1) die("Number of bins in --gcbins must be greater than 1.\n", message);
    } else if (strcmp(option_name, "sdbg") == 0) {
      sdbg_order = atoi(option_value);			// >=0 means use sequence bkg
    }
    else if (strcmp(option_name, "last") == 0) {
      int i = 0;
      if (option_value[0] == '-') ++i;
      while (option_value[i] != '\0') {
        if (!isdigit(option_value[i])) {
          die("Specified parameter 'last' contains non-numeric characters.\n");
        }
        ++i;
      }
      last = atoi(option_value);
      if (errno != 0) {
        die("Specified parameter 'last' could not be parsed as a number as:\n%s\n",strerror(errno));
      }
      if (last < 0) {
        die("Specified parameter 'last' had negative value (%d) when only postive or zero values are allowed \n", last);
      }
    }
    else if (strcmp(option_name, "version") == 0) {
      fprintf(stdout, VERSION "\n");
      exit(EXIT_SUCCESS);
    }
  }

  // --sdbg overrides --pvalues and --gcbins and --rma
  int req_args = 3;
  if (sdbg_order >= 0) {
    pvalues = FALSE;
    normalize_scores = FALSE;
    num_gc_bins = 1;
    req_args = 2;
  }

  // Check all required arguments given
  if (sdbg_order >= 0 && argc > option_index + req_args) {
    die("<background file> cannot be given together with --sdbg.\n");
  } else if (argc != option_index + req_args) {
    fprintf(stderr, usage, pseudocount, verbosity, max_seq_length);
    exit(EXIT_FAILURE);
  }

  // Get required arguments. 
  char* motif_filename = argv[option_index];
  option_index++;
  char* fasta_filename = argv[option_index];
  option_index++;
  char* bg_filename;
  if (req_args == 3) {			// required unless --sdbg given
    bg_filename = argv[option_index];
    option_index++;
  } else {
    bg_filename = "--uniform--";	// So PSSMs will use uniform background;
					// we can multiply them out later.
  }

  // measure time
  c0 = clock();

  // Set up hash tables for computing reverse complement if doing --sdbg
  if (sdbg_order >= 0) setup_hash_alph(DNAB);

  // Create cisml data structure for recording results
  cisml = allocate_cisml(program_name, motif_filename, fasta_filename);
  set_cisml_background_file(cisml, bg_filename);

  /**********************************************
   * Read the motifs and background model.
   **********************************************/
  int num_motifs = 0;
  MREAD_T *mread;
  ARRAYLST_T *motifs;
  PSSM_PAIR_T** pssm_pairs;	// note pssm_pairs is an array of pointers

  //this reads any meme file, xml, txt and html
  mread = mread_create(motif_filename, OPEN_MFILE);
  mread_set_bg_source(mread, bg_filename);
  mread_set_pseudocount(mread, pseudocount);

  motifs = mread_load(mread, NULL);
  alph = mread_get_alphabet(mread);
  pos_bg_freqs = mread_get_background(mread);

  mread_destroy(mread);

  num_motifs = arraylst_size(motifs);

  // allocate memory for PSSM pairs
  pssm_pairs = (PSSM_PAIR_T**)mm_malloc(sizeof(PSSM_PAIR_T*) * num_motifs);

  if (verbosity >= NORMAL_VERBOSE) 
    fprintf(stderr, "Number of motifs in file %d.\n", num_motifs);

  // make a CISML pattern to hold scores for each motif
  PATTERN_T** patterns = NULL;
  Resize(patterns, num_motifs, PATTERN_T*);
  int motif_index;
  for (motif_index = 0; motif_index < num_motifs; motif_index++) {
    MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs);
    patterns[motif_index] = allocate_pattern(get_motif_id(motif), "");
    add_cisml_pattern(cisml, patterns[motif_index]);
  }

  // make reverse complement motifs and background frequencies.
  if (scan_both_strands == TRUE) {
    add_reverse_complements(motifs);
    assert(arraylst_size(motifs) == (2 * num_motifs));
    rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs));
    complement_dna_freqs(pos_bg_freqs, rev_bg_freqs);
  }

  /**************************************************************
   * Convert motif matrices into log-odds matrices.
   * Scale them.
   * Compute the lookup tables for the PDF of scaled log-odds scores.
   **************************************************************/
  int ns = scan_both_strands ? 2 : 1;	// number of strands
  for (motif_index = 0; motif_index < num_motifs; motif_index++) {
    MOTIF_T *motif, *motif_rc;
    motif = (MOTIF_T*)arraylst_get(motif_index*ns, motifs);
    if (scan_both_strands)
      motif_rc = (MOTIF_T*)arraylst_get(motif_index*ns + 1, motifs);
    else
      motif_rc = NULL;
    /*
     *  Note: If scanning both strands, we complement the motif frequencies
     *  but not the background frequencies so the motif looks the same.
     *  However, the given frequencies are used in computing the p-values
     *  since they represent the frequencies on the negative strands.
     *  (If we instead were to complement the input sequence, keeping the
     *  the motif fixed, we would need to use the complemented frequencies
     *  in computing the p-values.  Is that any clearer?)
    */
    double range = 300;		// 100 is not very good; 1000 is great but too slow
    PSSM_T* pos_pssm =
      build_motif_pssm(
        motif, 
        pos_bg_freqs, 
        pos_bg_freqs, 
        NULL, // Priors not used
        0.0L, // alpha not used
        range, 
        num_gc_bins, 
        TRUE
      );
    PSSM_T* neg_pssm = (scan_both_strands ?
      build_motif_pssm(
        motif_rc, 
        rev_bg_freqs, 
        pos_bg_freqs, 
        NULL, // Priors not used
        0.0L, // alpha not used
        range, 
        num_gc_bins, 
        TRUE
      )
      : NULL
    );
    pssm_pairs[motif_index] = create_pssm_pair(pos_pssm, neg_pssm);
  }

  // Open the FASTA file for reading.
  FILE* fasta_file = NULL;
  if (open_file(fasta_filename, "r", FALSE, "FASTA", "sequences", &fasta_file) == 0) {
    die("Couldn't open the file %s.\n", fasta_filename);
  }
  if (verbosity >= NORMAL_VERBOSE) {
    if (last == 0) {
      fprintf(stderr, "Using entire sequence\n");
    } else {
      fprintf(stderr, "Limiting sequence to last %d positions.\n", last);
    }
  }

  /**************************************************************
   * Read in all sequences and score with all motifs
   **************************************************************/
  int seq_loading_num = 0;  // keeps track on the number of sequences read in total
  int seq_counter = 0;		// holds the index to the seq in the pattern
  int unique_seqs = 0;      // keeps track on the number of unique sequences
  BOOLEAN_T need_postprocessing = FALSE;
  SEQ_T* sequence = NULL;
  RBTREE_T* seq_ids = rbtree_create(rbtree_strcasecmp,NULL,free,rbtree_intcpy,free);
  RBNODE_T* seq_node;
  BOOLEAN_T created;
  while (read_one_fasta(alph, fasta_file, max_seq_length, &sequence)) {
    ++seq_loading_num;
	created = FALSE;
    char* seq_name = get_seq_name(sequence);
    int seq_len = get_seq_length(sequence);
    int scan_len;
    if (last != 0) {
      scan_len = last;
    } else {
      scan_len = seq_len;
    }
	  
	// red-black trees are only required if duplicates should be combined
	if (combine_duplicates){
		//lookup seq id and create new entry if required, return sequence index
		char *tmp_id = mm_malloc(strlen(seq_name)+1); // required copy for rb-tree
		strncpy(tmp_id,seq_name,strlen(seq_name)+1);
		seq_node = rbtree_lookup(seq_ids, tmp_id, TRUE, &created);
		if (created) {// assign it a loading number
			rbtree_set(seq_ids, seq_node, &unique_seqs);
			seq_counter = unique_seqs;
			++unique_seqs;
		} else {
			seq_counter = *((int*)rbnode_get(seq_node));
		}
	}
	  
    //
    // Set up sequence-dependent background model and compute
    // log cumulative probability of sequence.
    //
    double *logcumback = NULL;                    // array of log cumulative probs.
    if (sdbg_order >= 0) {
      Resize(logcumback, seq_len+1, double);
      char* raw_seq = get_raw_sequence(sequence);
      BOOLEAN rc = FALSE;
      double *a_cp = get_markov_from_sequence(raw_seq, alph_string(alph), rc, sdbg_order, 0);
      log_cum_back(raw_seq, a_cp, sdbg_order, logcumback);
      myfree(a_cp);
    }

    // Get the GC content of the sequence if binning p-values by GC
    // and store it in the sequence object.
    if (num_gc_bins > 1) {
      ARRAY_T *freqs = get_sequence_freqs(sequence, alph);
      set_total_gc_sequence(sequence,
        get_array_item(1,freqs) + get_array_item(2,freqs));	// f(C) + f(G)
      free_array(freqs);			// clean up
    } else {
      set_total_gc_sequence(sequence, -1);	// flag ignore
    }

    /**************************************************************
     * Process all motifs.
     **************************************************************/
    int ns = scan_both_strands ? 2 : 1;
    for (motif_index = 0; motif_index < num_motifs; motif_index++) {
      PATTERN_T *pattern = patterns[motif_index];
      MOTIF_T* motif = (MOTIF_T*)arraylst_get(ns*motif_index, motifs);
      char* motif_id = (scan_both_strands ? get_motif_st_id(motif) : get_motif_id(motif));
      if (verbosity >= HIGH_VERBOSE) {
        fprintf(stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif));
      }
      if ((selected_motifs == NULL) || (have_string(get_motif_id(motif), selected_motifs) == TRUE)) {
        if (verbosity >= HIGHER_VERBOSE) {
          fprintf(stderr, "Scanning %s sequence with length %d "
              "abbreviated to %d with motif %s with length %d.\n",
              seq_name, seq_len, scan_len, motif_id, get_motif_length(motif));
        }
		SCANNED_SEQUENCE_T* scanned_seq = NULL;

		
		if (!combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter){
			// Create a scanned_sequence record and save it in the pattern.
			scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern);
			set_scanned_sequence_length(scanned_seq, scan_len);
		} else {
			// get existing sequence record
			scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter];
			set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq)));
		}
		
		// check if scanned component of sequence has sufficient length for the motif
		if (scan_len < get_motif_length(motif)) {
			// set score to zero and p-value to 1 if not set yet
			if(!has_scanned_sequence_score(scanned_seq)){
				set_scanned_sequence_score(scanned_seq, 0.0);
			}
			if(pvalues && !has_scanned_sequence_pvalue(scanned_seq)){
				set_scanned_sequence_pvalue(scanned_seq, 1.0);
			} 
			add_scanned_sequence_scanned_position(scanned_seq); 
			if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) need_postprocessing = TRUE;
			if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "%s too short for motif %s. Score set to 0!\n", seq_name, motif_id);
		} else {  
			// scan the sequence using average/maximum motif affinity
			ama_sequence_scan(alph, sequence, logcumback, pssm_pairs[motif_index], scoring, 
							  pvalues, last, scanned_seq, &need_postprocessing);
		}

      } else {
        if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "Skipping motif %s.\n", motif_id);
      }
    } // All motifs parsed

    free_seq(sequence);
    if (sdbg_order >= 0) myfree(logcumback);

  } // read sequences
Пример #12
0
/*************************************************************************
 * Calculate the odds score for each motif-sized window at each
 * site in the sequence using the given nucleotide frequencies.
 *
 * This function is a lightweight version based on the one contained in
 * motiph-scoring. Several calculations that are unnecessary for gomo
 * have been removed in order to speed up the process.
 * Scores sequence with up to two motifs.
 *
 * A window containing a gap ('-' or '.') or a character whose alphabet
 * index is out of range gets odds of 0 and is removed from the count of
 * scored sites.
 *
 * Returns the mean (AVG_ODDS), maximum (MAX_ODDS) or sum (SUM_ODDS) of the
 * window odds, and 0.0 for any other method. If no window could be scored
 * (sequence or "last" region shorter than the motif, or every window
 * skipped), *isFeasible is set to FALSE and 0.0 is returned. *isFeasible
 * is not written in any other case.
 *************************************************************************/
double score_sequence(
  SEQ_T*        seq,		// sequence to scan (IN)
  double *logcumback,		// cumulative bkg probability of sequence (IN)
  PSSM_PAIR_T*  pssm_pair,	// pos and neg pssms (IN)
  int method, 			// method used for scoring (IN)
  int last, 			//score only last <n> or
				//score all if <n> is zero (IN)
  BOOLEAN_T* isFeasible		// FLAG indicating if there is at least one position
				// where the motif could be matched against (OUT)
)
{
  assert(pssm_pair != NULL);
  assert(seq != NULL);

  PSSM_T* pos_pssm = pssm_pair->pos_pssm;
  assert(pos_pssm != NULL);
  PSSM_T* neg_pssm = pssm_pair->neg_pssm;
  int n_motifs = neg_pssm ? 2 : 1;

  char* raw_seq = get_raw_sequence(seq);
  int seq_length = get_seq_length(seq);
  int w = get_num_rows(pos_pssm->matrix);	// motif width
  int n = seq_length - w + 1;			// one past the last window start;
						// <= 0 if the sequence is too short

  if (verbosity >= DUMP_VERBOSE) {
    fprintf(stderr, "Debug n_motifs: %d seq_length: %d w: %d n: %d.\n", n_motifs, seq_length, w, n);
  }

  // Get alphabet;
  char* alphabet = get_alphabet(FALSE);
  int alph_size = get_alph_size(ALPH_SIZE);

  // Dependent on the "last" parameter, change the starting point
  int start;
  int N_scored;
  if (last > 0 && last < seq_length) {
    start = seq_length - last;
    N_scored  = n_motifs * (last - w + 1);	// number of sites scored
  } else {
    start = 0;
    N_scored  = n_motifs * n;			// number of sites scored
  }
  // Note: N_scored is negative here whenever the scanned region is shorter
  // than the motif; the site loop below then does not execute at all.

  // For each motif (positive and reverse complement)
  double max_odds = 0.0;
  double sum_odds = 0.0;
  double requested_odds = 0.0;
  int i;

  if (verbosity >= HIGHER_VERBOSE) {
    fprintf(stderr, "Starting scan at position %d .\n", start);
  }

  for (i=0; i<n_motifs; i++) { 	// pos (and negative) motif
    PSSM_T* pssm = (i==0 ? pos_pssm : neg_pssm);	// choose +/- motif
    // For each site in the sequence
    int seq_index;
    for (seq_index = start; seq_index < n; seq_index++) {	// site 
      double odds = 1.0;
      // For each position in the motif window
      int motif_position;
      for (motif_position = 0; motif_position < w; motif_position++) { // column
        int i_site = seq_index + motif_position;
        char c = raw_seq[i_site];
        // Check for gaps at this site
        if (c == '-' || c == '.') { N_scored--; odds = 0; break; }
        // Check for ambiguity codes at this site
        int alph_index = alphabet_index(c, alphabet);
        if (alph_index >= alph_size || alph_index < 0) { N_scored--; odds = 0; break; }
        // multiply odds by value in appropriate motif cell
        odds *= get_matrix_cell(motif_position, alph_index, pssm->matrix);
      } // column
      //
      // Apply sequence-dependent background model.
      //
      if (logcumback) {
        int i_site = seq_index;
        double log_p = logcumback[i_site+w] - logcumback[i_site];	// log Pr(x | background)
        double adjust = exp(w*log(1/4.0) - log_p);	// Pr(x | uniform) / Pr(x | background)
        odds *= adjust;
      }
      // Add odds to growing sum.
      sum_odds += odds;				// sum of odds
      if (odds > max_odds) max_odds = odds;	// max of odds
    } // site
  } // motif

  if (verbosity >= HIGHER_VERBOSE) {
    fprintf(stderr, "Scored %d positions with the sum odds %f and the max odds %f.\n", N_scored, sum_odds, max_odds);
  }

  // Has there been anything matched at all?
  // "<= 0" rather than "== 0": a region shorter than the motif leaves
  // N_scored negative, which must also be reported as infeasible instead
  // of being used as the divisor for the mean.
  if (N_scored <= 0) {
    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr,"Sequence \'%s\' offers no location to match the motif against (sequence length too short?)\n",get_seq_name(seq));
    }
    *isFeasible = FALSE;
    return 0.0;
  }

  // return odds as requested (AVG, MAX or SUM scoring)
  if (method == AVG_ODDS) {
    requested_odds = sum_odds / N_scored;	// mean
  } else if (method == MAX_ODDS) {
    requested_odds = max_odds;			// maximum
  } else if (method == SUM_ODDS) {
    requested_odds = sum_odds;			// sum
  }

  return(requested_odds);
} // score_sequence
Пример #13
0
/*************************************************************************
 * Scan one sequence with a motif PSSM (and, when supplied, the PSSM for
 * the reverse strand), passing every window that scores to track_site().
 * Afterwards the recorded site(s) are credited to the site counts.
 *
 * Nothing is credited unless at least one site was recorded and the best
 * score reaches options->score_thresh. A single unit of count is split
 * evenly over the recorded sites (ties are averaged, not chosen at
 * random) and counts->total_sites grows by one.
 *************************************************************************/
static void score_sequence(
  CENTRIMO_OPTIONS_T *options,
  SEQ_T* sequence,
  PSSM_T*  pssm,
  PSSM_T*  rev_pssm,
  SEQ_SITES_T* seq_sites,
  SITE_COUNTS_T* counts
)
{
  // Every argument except rev_pssm is mandatory.
  assert(options != NULL);
  assert(sequence != NULL);
  assert(pssm != NULL);
  assert(seq_sites != NULL);
  assert(counts != NULL);

  // Pre-set the score; this keeps the Mac OS compiler quiet.
  double site_score = -BIG;

  char *residues = get_raw_sequence(sequence);
  int seq_len = get_seq_length(sequence);
  int width = pssm->w;

  // Begin with an empty record of sites for this sequence.
  seq_sites->best = -BIG;
  seq_sites->used = 0;

  // Try every window start on the forward strand and, if a reverse PSSM
  // was given, on the reverse strand as well.
  int n_windows = seq_len - width + 1;
  int start;
  for (start = 0; start < n_windows; start++) {
    char *window = residues + start;
    if (score_motif_site(options->alphabet, window, pssm, &site_score)) {
      track_site(seq_sites, site_score, start, '+');
    }
    if (rev_pssm != NULL
        && score_motif_site(options->alphabet, window, rev_pssm, &site_score)) {
      track_site(seq_sites, site_score, start, '-');
    }
  }

  // No recorded site, or the best one falls short of the threshold.
  if (!seq_sites->used || !(seq_sites->best >= options->score_thresh)) {
    return;
  }

  // Each recorded site receives an equal fraction of one count.
  double share = 1.0 / (double) seq_sites->used;
  int k;
  for (k = 0; k < seq_sites->used; k++) {
    SEQ_SITE_T *hit = &seq_sites->sites[k];
    // Slots are indexed by twice the site centre, so a motif of width 1
    // can land on the first index. Reverse-strand sites are mapped to
    // their position in the reverse complement unless no_flip is set.
    int keep_orientation = options->no_flip || hit->strand == '+';
    int slot = keep_orientation
      ? 2 * hit->start + width - 1
      : 2 * (seq_len - hit->start) - width - 1;
    counts->sites[slot] += share;
  }
  counts->total_sites++;
}
Пример #14
0
/****************************************************************************
 * Allocate one alignment object. Name and description may be NULL, in
 * which case the corresponding field is set to the empty string.
 *
 * num_sequences must be positive; sequences and consensus_string must not
 * be NULL (asserted). A new SEQ_T is allocated for every input sequence
 * from its name, description, offset and raw sequence, and the consensus
 * string is stored via copy_string(). All sequences are checked to have
 * the same raw length as the first one; that length becomes the alignment
 * length.
 * 
 * Returns a pointer to the newly created alignment.
 ****************************************************************************/
ALIGNMENT_T* allocate_alignment(
   char* name,
   char* description,
   int num_sequences,
   SEQ_T** sequences,
   char* consensus_string
)
{
  assert(num_sequences > 0);
  assert(sequences != NULL);
  assert(consensus_string != NULL);

  // Allocate the alignment object.
  ALIGNMENT_T* new_alignment = (ALIGNMENT_T*)mm_malloc(sizeof(ALIGNMENT_T));
  if (new_alignment == NULL) {
    die("Error allocating alignment\n");
  }

  // Store the name, truncating if necessary.
  // NOTE(review): writing the terminator at index MAX_ALIGNMENT_NAME assumes
  // the name field holds MAX_ALIGNMENT_NAME + 1 chars -- confirm against the
  // ALIGNMENT_T definition.
  if (name != NULL) {
    strncpy(new_alignment->name, name, MAX_ALIGNMENT_NAME);
    new_alignment->name[MAX_ALIGNMENT_NAME] = '\0';
    if (strlen(new_alignment->name) != strlen(name)) {
      fprintf(stderr, "Warning: truncating alignment program name %s to %s.\n",
	      name, new_alignment->name);
    }
  } else {
    new_alignment->name[0] = '\0';
  }

  // Store the description, truncating if necessary.
  // NOTE(review): same size assumption as for the name field, with
  // MAX_ALIGNMENT_COMMENT.
  if (description != NULL) {
    strncpy(new_alignment->desc, description, MAX_ALIGNMENT_COMMENT);
    new_alignment->desc[MAX_ALIGNMENT_COMMENT] = '\0';
  } else {
    new_alignment->desc[0] = '\0';
  }

  // Store the sequences.
  new_alignment->sequences = (SEQ_T**) mm_malloc(num_sequences * sizeof(SEQ_T*));
  if (new_alignment->sequences == NULL) {
    die("Error allocating sequences\n");
  }
  new_alignment->num_sequences = num_sequences;
  int seq_length = (int) strlen(get_raw_sequence(sequences[0]));
  int i;
  for (i = 0; i < num_sequences; i++) {
    char* raw = get_raw_sequence(sequences[i]);
    // strlen() returns size_t; cast to int so the comparison is not mixed
    // signed/unsigned and the "%d" conversions receive an int argument.
    // The reported index is 1-based to match the "Sequence #1" label used
    // for sequences[0].
    myassert(TRUE,
	     (int) strlen(raw) == seq_length,
	     "Sequence #1 (%s) is length=%d, but sequence #%d (%s) is length=%d.\n<%s>\n",
	     get_seq_name(sequences[0]), seq_length, i + 1,
	     get_seq_name(sequences[i]), (int) strlen(raw),
	     raw);
    new_alignment->sequences[i] = 
      allocate_seq(get_seq_name(sequences[i]),
        get_seq_description(sequences[i]),
        get_seq_offset(sequences[i]), 
        raw
      );
  }

  // Fill in the remaining fields.
  new_alignment->length = seq_length;
  copy_string(&(new_alignment->consensus_string), consensus_string);

  return(new_alignment);
}