Ejemplo n.º 1
0
/*************************************************************************
 * Converts the motif frequency matrix into a odds matrix: taken from old ama-scan.c
 *************************************************************************/
void convert_to_odds_matrix(MOTIF_T* motif, ARRAY_T* bg_freqs){
  const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  int motif_position_index,alph_index;
  MATRIX_T *freqs;
  freqs = get_motif_freqs(motif);

  const int num_motif_positions = get_num_rows(freqs);
  for (alph_index=0;alph_index<asize;++alph_index){
    double bg_likelihood = get_array_item(alph_index, bg_freqs);
    for (motif_position_index=0;motif_position_index<num_motif_positions;++motif_position_index){
      freqs->rows[motif_position_index]->items[alph_index] /= bg_likelihood;
    }
  }
}
Ejemplo n.º 2
0
/*************************************************************************
 * Copies the motif frequency matrix and converts it into a odds matrix
 *************************************************************************/
MATRIX_T* create_odds_matrix(MOTIF_T *motif, ARRAY_T* bg_freqs){
  const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  int pos, aidx;
  MATRIX_T *odds;
  
  odds = duplicate_matrix(get_motif_freqs(motif));
  const int num_pos = get_num_rows(odds);
  for (aidx = 0; aidx < asize; ++aidx) {
    double bg_likelihood = get_array_item(aidx, bg_freqs);
    for (pos = 0; pos < num_pos; ++pos) {
      odds->rows[pos]->items[aidx] /= bg_likelihood;
    }
  }
  return odds;
}
Ejemplo n.º 3
0
/*************************************************************************
 * Output JSON data for a motif
 *************************************************************************/
static void output_motif_json(JSONWR_T* json, MOTIF_STATS_T* stats, 
    SITE_COUNTS_T* counts) {
  //vars
  MOTIF_T *motif;
  MATRIX_T *freqs;
  int i, j, mlen, asize, end;
  motif = stats->motif;
  freqs = get_motif_freqs(motif);
  asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  jsonwr_start_object_value(json);
  jsonwr_lng_prop(json, "db", stats->db->id);
  jsonwr_str_prop(json, "id", get_motif_id(motif));
  if (*(get_motif_id2(motif))) {
    jsonwr_str_prop(json, "alt", get_motif_id2(motif));
  }
  mlen = get_motif_length(motif);
  jsonwr_lng_prop(json, "len", mlen);
  jsonwr_dbl_prop(json, "motif_evalue", get_motif_evalue(motif));
  jsonwr_dbl_prop(json, "motif_nsites", get_motif_nsites(motif));
  if (get_motif_url(motif) && *get_motif_url(motif)) {
    jsonwr_str_prop(json, "url", get_motif_url(motif));
  }
  jsonwr_property(json, "pwm");
  jsonwr_start_array_value(json);
  for (i = 0; i < mlen; i++) {
    jsonwr_start_array_value(json);
    for (j = 0; j < asize; j++) {
      jsonwr_dbl_value(json, get_matrix_cell(i, j, freqs));
    }
    jsonwr_end_array_value(json);
  }
  jsonwr_end_array_value(json);
  jsonwr_lng_prop(json, "bin_width", stats->central_window+1);
  jsonwr_dbl_prop(json, "bin_sites", stats->central_sites);
  jsonwr_lng_prop(json, "total_sites", counts->total_sites);
  jsonwr_dbl_prop(json, "log_pvalue", stats->log_adj_pvalue);
  jsonwr_dbl_prop(json, "max_prob", stats->max_prob);
  jsonwr_property(json, "sites");
  jsonwr_start_array_value(json);
  end = counts->allocated - (mlen - 1);
  for (i = (mlen - 1); i < end; i += 2) {
    jsonwr_dbl_value(json, counts->sites[i]);
  }
  jsonwr_end_array_value(json);
  jsonwr_end_object_value(json);
}
Ejemplo n.º 4
0
/*************************************************************************
 * Calculate the odds score for each motif-sized window at each
 * site in the sequence using the given nucleotide frequencies.
 *
 * This function is a lightweight version based on the one contained in
 * motiph-scoring. Several calculations that are unnecessary for gomo
 * have been removed in order to speed up the process
 *************************************************************************/
static double score_sequence(
    SEQ_T *seq,         // sequence to scan (IN)
    MOTIF_T *motif,     // motif already converted to odds values (IN)
    PSSM_T *m_pssm,     // motif pssm (IN)
    MATRIX_T *m_odds,   // motif odds (IN)
    int method,         // method used for scoring (IN)
    double threshold,   // Threshold to use in TOTAL_HITS mode with a PWM
    ARRAY_T *bg_freqs   //background model
    )
{

  assert(seq != NULL);
  assert(motif != NULL);
  assert((method == TOTAL_HITS && m_pssm) || (method != TOTAL_HITS && m_odds));

  char* raw_seq = get_raw_sequence(seq);
  int seq_length = get_seq_length(seq);

  // Get the pv lookup table
  ARRAY_T* pv_lookup = NULL;
  if (NULL != m_pssm) {
    pv_lookup = m_pssm->pv;
    assert(get_array_length(pv_lookup) > 0);
  }

  // Prepare storage for the string representing the portion
  // of the reference sequence within the window.
  char* window_seq = (char *) mm_malloc(sizeof(char) * (get_motif_length(motif) + 1));
  window_seq[get_motif_length(motif)] = '\0';

  int max_index = seq_length - get_motif_length(motif);
  if (max_index < 0) max_index = 0;
  const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  double* odds =  (double*) mm_malloc(sizeof(double)*max_index);
  double* scaled_log_odds =  (double*) mm_malloc(sizeof(double)*max_index);

  // For each site in the sequence
  int seq_index;
  for (seq_index = 0; seq_index < max_index; seq_index++) {
    double odd = 1.0;
    scaled_log_odds[seq_index] = 0;

    // For each site in the motif window
    int motif_position;
    for (motif_position = 0; motif_position < get_motif_length(motif); motif_position++) {
      char c = raw_seq[seq_index + motif_position];
      window_seq[motif_position] = c;

      // Check for gaps at this site
      if(c == '-' || c == '.') {
        break;
      }

      // Check for ambiguity codes at this site
      //TODO: This next call is very expensive - it takes up approx. 10% of a
      //      programme's running time. It should be fixed up somehow.
      int aindex = alph_index(get_motif_alph(motif), c);
      if (aindex > asize) {
        break;
      }
      if (method == TOTAL_HITS) {
        //If we're in this mode, then we're using LOG ODDS.
        //scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, get_motif_freqs(motif));
        scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, m_pssm->matrix);
      } else {
        odd *= get_matrix_cell(motif_position, aindex, m_odds);
      }
    }
    odds[seq_index] = odd;
  }

  // return odds as requested (MAX or AVG scoring)
  double requested_odds = 0.0;
  if (method == AVG_ODDS){
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      requested_odds += odds[seq_index];
    }
    requested_odds /= max_index + 1;		// Divide by 0 if max_index==0
  } else if (method == MAX_ODDS){
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      if (odds[seq_index] > requested_odds){
        requested_odds = odds[seq_index];
      }
    }
  } else if (method == SUM_ODDS) {
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      requested_odds += odds[seq_index];
    }
  } else if (method == TOTAL_HITS) {
    for (seq_index = 0; seq_index < max_index; seq_index++) {

      if (scaled_log_odds[seq_index] >= (double)get_array_length(pv_lookup)) {
        scaled_log_odds[seq_index] = (double)(get_array_length(pv_lookup) - 1);
      } 
      double pvalue = get_array_item((int) scaled_log_odds[seq_index], pv_lookup);

      //Figure out how to calculate the p-value of a hit
      //fprintf(stderr, "m: %s pv_l len: %i scaled_log_odds: %g seq index: %i pvalue: %g\n", 
      //    get_motif_id(motif), get_array_length(pv_lookup), scaled_log_odds[seq_index], seq_index, pvalue);

      if (pvalue < threshold) {
        requested_odds++; //Add another hit.
      }

      if (verbosity > HIGHER_VERBOSE) {
        fprintf(stderr, "Window Data: %s\t%s\t%i\t%g\t%g\t%g\n",
            get_seq_name(seq), get_motif_id(motif), seq_index, scaled_log_odds[seq_index], pvalue, threshold);
      }
    }
  }

  myfree(odds);
  myfree(scaled_log_odds);
  myfree(window_seq);
  return requested_odds;
}
Ejemplo n.º 5
0
/**************************************************************************
 * Dump sequence matches sorted by the name of the sequence.
 *
 * Outputs Columns:
 *   1) Trimmed lowercase sequence with uppercase matches.
 *   2) Position of the secondary match within the whole sequence.
 *   3) Sequence fragment that the primary matched.
 *   4) Strand of the primary match (+|-)
 *   5) Sequence fragment that the secondary matched.
 *   6) Strand of the secondary match (+|-)
 *   7) Is the primary match on the same strand as the secondary (s|o)
 *   8) Is the secondary match downstream or upstream (d|u)
 *   9) The gap between the primary and secondary matches
 *  10) The name of the sequence
 *  11) The p-value of the bin containing the match (adjusted for # of bins)
 *  ---if the FASTA input file sequence names are in Genome Browser format:
 *  12-14) Position of primary match in BED coordinates
 *  15) Position of primary match in Genome Browser coordinates
 *  16-18) Position of secondary match in BED coordinates
 *  19) Position of secondary match in Genome Browser coordinates
 *
 * If you wish to sort based on the gap column:
 * Sort individual output:
 *  sort -n -k 9,9 -o seqs_primary_secondary.txt seqs_primary_secondary.txt
 * Or sort all outputs:
 *  for f in seqs_*.txt; do sort -n -k 9,9 -o $f $f; done
 * Or to get just locations of primary motif in BED coordinates
 * where the secondary is on the opposite strand, upstream with a gap of 118bp:
 *   awk '$7=="o" && $8=="u" && $9==118 {print $12"\t"$13"\t"$14;}' seqs_primary_secondary.txt 
 *
 **************************************************************************/
static void dump_sequence_matches(FILE *out, int margin, int bin, 
    double sigthresh, BOOLEAN_T sig_only, RBTREE_T *sequences,
    MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif,
    ARRAY_T **matches) {
  RBNODE_T *node;
  SEQUENCE_T *sequence;
  int idx, seqlen, i, j, start, end, secondary, secondary_pos, primary_len, secondary_len, distance;
  BOOLEAN_T primary_rc, secondary_rc, downstream; 
  char *buffer, *seq, *primary_match, *secondary_match;
  ARRAY_T *secondary_array;
  ALPH_T *alph;
  // get the alphabet
  alph = get_motif_alph(primary_motif);
  // allocate a buffer for copying the trimmed sequence into and modify it
  seqlen = margin * 2 + get_motif_trimmed_length(primary_motif);
  buffer = (char*)mm_malloc(sizeof(char) * (seqlen + 1));
  // get the lengths of the motifs
  primary_len = get_motif_trimmed_length(primary_motif);
  secondary_len = get_motif_trimmed_length(secondary_motif->motif); 
  // allocate some strings for storing the matches
  primary_match = (char*)mm_malloc(sizeof(char) * (primary_len + 1));
  secondary_match = (char*)mm_malloc(sizeof(char) * (secondary_len + 1));
  // add null byte at the end of the match strings
  primary_match[primary_len] = '\0';
  secondary_match[secondary_len] = '\0';

  // iterate over all the sequences
  for (node = rbtree_first(sequences); node != NULL; node = rbtree_next(node)) {
    sequence = (SEQUENCE_T*)rbtree_value(node);
    primary_rc = get_array_item(0, sequence->primary_matches) < 0;

    //secondary = matches[sequence->index];
    secondary_array = matches[sequence->index];
    if (! secondary_array) continue;
    int n_secondary_matches = get_array_length(secondary_array);
    for (idx=0; idx<n_secondary_matches; idx++) {
      secondary = get_array_item(idx, secondary_array);
      secondary_rc = secondary < 0;
      secondary_pos = abs(secondary);

      // calculate the distance
      if (secondary_pos <= margin) {
        distance = margin - secondary_pos - secondary_len + 1;
        downstream = primary_rc;
      } else {
        distance = secondary_pos - margin - primary_len - 1;
        downstream = !primary_rc;
      }

      // copy the trimmed sequence
      seq = sequence->data;
      for (i = 0; i < seqlen; ++i) {
        buffer[i] = (alph_is_case_insensitive(alph) ? tolower(seq[i]) : seq[i]);
      }
      buffer[seqlen] = '\0';

      // uppercase primary
      start = margin;
      end = margin + primary_len;
      for (i = start, j = 0; i < end; ++i, ++j) {
        buffer[i] = (alph_is_case_insensitive(alph) ? toupper(buffer[i]) : buffer[i]);
        primary_match[j] = buffer[i];
      }

      // uppercase secondary
      // note orign was one, subtract 1 to make origin zero as required for arrays
      start = secondary_pos -1;
      end = start + secondary_len;
      for (i = start, j = 0; i < end; ++i, ++j) {
        buffer[i] = (alph_is_case_insensitive(alph) ? toupper(buffer[i]) : buffer[i]);
        secondary_match[j] = buffer[i];
      }

      // get the p-value of the seconndary match
      SPACING_T *spacings;
      if (secondary_rc == primary_rc) {
        spacings = downstream ? secondary_motif->spacings+(SAME+RIGHT) : secondary_motif->spacings+(SAME+LEFT); 
      } else {
        spacings = downstream ? secondary_motif->spacings+(OPPO+RIGHT) : secondary_motif->spacings+(OPPO+LEFT); 
      }
      double p_value = spacings->pvalue[distance/bin];

      // skip match if not significant and only reporting significant matches
      if (sig_only && (p_value > sigthresh)) continue;

      // output line to file
      fprintf(out, "%s    %3d    %s    %s    %s    %s    %s    %s    %3d    %s    %.1e", 
          buffer, 
          secondary_pos, 
          primary_match, 
          (primary_rc ? "-" : "+"), 
          secondary_match, 
          (secondary_rc ? "-" : "+"), 
          (secondary_rc == primary_rc ? "s" : "o"),
          (downstream ? "d" : "u"), 
          distance, 
          sequence->name,
          p_value
      );

      // Parse the sequence name to see if we can get genomic coordinates
      // and print additional columns with primary and secondary matches
      // in both BED and Genome Browser coordinates.
      char *chr_name;
      size_t chr_name_len;
      int start_pos, end_pos;
      if (parse_genomic_coordinates_helper(
          sequence->name,
          &chr_name,
          &chr_name_len,
          &start_pos,
          &end_pos))
      {
        // Get the start and end of the primary match in 
        // 0-relative, half-open genomic coordinates.
        int p_start = start_pos + fabs(get_array_item(0, sequence->primary_matches)) - 1;
        int p_end = p_start + primary_len;
        // Get the start and end of the secondary match in 
        // 0-relative, half-open genomic coordinates.
        int s_start, s_end;
        if ( (!primary_rc && downstream) || (primary_rc && !downstream) ) {
          s_start = p_end + distance;
          s_end = s_start + secondary_len;
        } else {
          s_end = p_start - distance;
          s_start = s_end - secondary_len;
        }
        fprintf(out, "    %s    %d    %d    %s:%d-%d", 
          chr_name, p_start, p_end, chr_name, p_start+1, p_end);
        fprintf(out, "    %s    %d    %d    %s:%d-%d\n", 
          chr_name, s_start, s_end, chr_name, s_start+1, s_end);
      } else {
        fprintf(out, "\n");
      }

    } // secondary match
  } // primary match

  free(buffer);
  free(primary_match);
  free(secondary_match);
}
Ejemplo n.º 6
0
/*************************************************************************
 * Entry point for ama
 *************************************************************************/
int main(int argc, char **argv) {
  AMA_OPTIONS_T options;
  ARRAYLST_T *motifs;
  clock_t c0, c1; // measuring cpu_time
  MOTIF_AND_PSSM_T *combo;
  CISML_T *cisml;
  PATTERN_T** patterns;
  PATTERN_T *pattern;
  FILE *fasta_file, *text_output, *cisml_output;
  int i, seq_loading_num, seq_counter, unique_seqs, seq_len, scan_len, x1, x2, y1, y2;
  char *seq_name, *path;
  bool need_postprocessing, created;
  SEQ_T *sequence;
  RBTREE_T *seq_ids;
  RBNODE_T *seq_node;
  double *logcumback;
  ALPH_T *alph;

  // process the command
  process_command_line(argc, argv, &options);

  // load DNA motifs
  motifs = load_motifs(&options);

  // get the alphabet
  if (arraylst_size(motifs) > 0) {
    combo = (MOTIF_AND_PSSM_T*)arraylst_get(0, motifs);
    alph = alph_hold(get_motif_alph(combo->motif));
  } else {
    alph = alph_dna();
  }

  // pick columns for GC operations
  x1 = -1; x2 = -1; y1 = -1; y2 = -1;
  if (alph_size_core(alph) == 4 && alph_size_pairs(alph) == 2) {
    x1 = 0; // A
    x2 = alph_complement(alph, x1); // T
    y1 = (x2 == 1 ? 2 : 1); // C
    y2 = alph_complement(alph, y1); // G
    assert(x1 != x2 && y1 != y2 && x1 != y1 && x2 != y2 && x1 != y2 && x2 != y1);
  }

  // record starting time
  c0 = clock();

  // Create cisml data structure for recording results
  cisml = allocate_cisml(PROGRAM_NAME, options.command_line, options.motif_filename, options.fasta_filename);
  set_cisml_background_file(cisml, options.bg_filename);

  // make a CISML pattern to hold scores for each motif
  for (i = 0; i < arraylst_size(motifs); i++) {
    combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs);
    add_cisml_pattern(cisml, allocate_pattern(get_motif_id(combo->motif), ""));
  }

  // Open the FASTA file for reading.
  fasta_file = NULL;
  if (!open_file(options.fasta_filename, "r", false, "FASTA", "sequences", &fasta_file)) {
    die("Couldn't open the file %s.\n", options.fasta_filename);
  }
  if (verbosity >= NORMAL_VERBOSE) {
    if (options.last == 0) {
      fprintf(stderr, "Using entire sequence\n");
    } else {
      fprintf(stderr, "Limiting sequence to last %d positions.\n", options.last);
    }
  }

  //
  // Read in all sequences and score with all motifs
  //
  seq_loading_num = 0;  // keeps track on the number of sequences read in total
  seq_counter = 0;      // holds the index to the seq in the pattern
  unique_seqs = 0;      // keeps track on the number of unique sequences
  need_postprocessing = false;
  sequence = NULL;
  logcumback = NULL;
  seq_ids = rbtree_create(rbtree_strcasecmp,rbtree_strcpy,free,rbtree_intcpy,free);
  while (read_one_fasta(alph, fasta_file, options.max_seq_length, &sequence)) {
    ++seq_loading_num;
    seq_name = get_seq_name(sequence);
    seq_len = get_seq_length(sequence);
    scan_len = (options.last != 0 ? options.last : seq_len);
    // red-black trees are only required if duplicates should be combined
    if (options.combine_duplicates){
      //lookup seq id and create new entry if required, return sequence index
      seq_node = rbtree_lookup(seq_ids, get_seq_name(sequence), true, &created);
      if (created) { // assign it a loading number
        rbtree_set(seq_ids, seq_node, &unique_seqs);
        seq_counter = unique_seqs;
        ++unique_seqs;
      } else {
        seq_counter = *((int*)rbnode_get(seq_node));
      }
    }
          
    //
    // Set up sequence-dependent background model and compute
    // log cumulative probability of sequence.
    // This needs the sequence in raw format.
    //
    if (options.sdbg_order >= 0)
      logcumback = log_cumulative_background(alph, options.sdbg_order, sequence);

    // Index the sequence, throwing away the raw format and ambiguous characters
    index_sequence(sequence, alph, SEQ_NOAMBIG);

    // Get the GC content of the sequence if binning p-values by GC
    // and store it in the sequence object.
    if (options.num_gc_bins > 1) {
      ARRAY_T *freqs = get_sequence_freqs(sequence, alph);
      set_total_gc_sequence(sequence, get_array_item(y1, freqs) + get_array_item(y2, freqs)); // f(C) + f(G)
      free_array(freqs);                        // clean up
    } else {
      set_total_gc_sequence(sequence, -1);      // flag ignore
    }

    // Scan with motifs.
    for (i = 0; i < arraylst_size(motifs); i++) {
      pattern = get_cisml_patterns(cisml)[i];
      combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs);
      if (verbosity >= HIGHER_VERBOSE) {
        fprintf(stderr, "Scanning %s sequence with length %d "
            "abbreviated to %d with motif %s with length %d.\n",
            seq_name, seq_len, scan_len, 
            get_motif_id(combo->motif), get_motif_length(combo->motif));
      }
      SCANNED_SEQUENCE_T* scanned_seq = NULL;
      if (!options.combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter) {
        // Create a scanned_sequence record and save it in the pattern.
        scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern);
        set_scanned_sequence_length(scanned_seq, scan_len);
      } else {
        // get existing sequence record
        scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter];
        set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq)));
      }
      
      // check if scanned component of sequence has sufficient length for the motif
      if (scan_len < get_motif_length(combo->motif)) {
        // set score to zero and p-value to 1 if not set yet
        if(!has_scanned_sequence_score(scanned_seq)){
          set_scanned_sequence_score(scanned_seq, 0.0);
        }
        if(options.pvalues && !has_scanned_sequence_pvalue(scanned_seq)){
          set_scanned_sequence_pvalue(scanned_seq, 1.0);
        } 
        add_scanned_sequence_scanned_position(scanned_seq); 
        if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) {
          need_postprocessing = true;
        }
        if (verbosity >= HIGH_VERBOSE) {
          fprintf(stderr, "%s too short for motif %s. Score set to 0.\n",
              seq_name, get_motif_id(combo->motif));
        }
      } else {
        // scan the sequence using average/maximum motif affinity
        ama_sequence_scan(alph, sequence, logcumback, combo->pssm_pair,
            options.scoring, options.pvalues, options.last, scanned_seq,
            &need_postprocessing);
      }
    } // All motifs scanned

    free_seq(sequence);
    if (options.sdbg_order >= 0) myfree(logcumback);

  } // read sequences

  fclose(fasta_file);
  if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "(%d) sequences read in.\n", seq_loading_num);
  if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Finished          \n");

        
  // if any sequence identifier was multiple times in the sequence set  then
  // postprocess of the data is required
  if (need_postprocessing || options.normalize_scores) {
    post_process(cisml, motifs, options.normalize_scores);
  }
        
  // output results
  if (options.output_format == DIRECTORY_FORMAT) {
    if (create_output_directory(options.out_dir, options.clobber, verbosity > QUIET_VERBOSE)) {
      // only warn in higher verbose modes
      fprintf(stderr, "failed to create output directory `%s' or already exists\n", options.out_dir);
      exit(1);
    }
    path = make_path_to_file(options.out_dir, text_filename);
    //FIXME check for errors: MEME doesn't either and we at least know we have a good directory
    text_output = fopen(path, "w");
    free(path);
    path = make_path_to_file(options.out_dir, cisml_filename);
    //FIXME check for errors
    cisml_output = fopen(path, "w");
    free(path);
    print_cisml(cisml_output, cisml, true, NULL, false);
    print_score(cisml, text_output);
    fclose(cisml_output);
    fclose(text_output);
  } else if (options.output_format == GFF_FORMAT) {
    print_score(cisml, stdout);
  } else if (options.output_format == CISML_FORMAT) {
    print_cisml(stdout, cisml, true, NULL, false);
  } else {
    die("Output format invalid!\n");
  }

  //
  // Clean up.
  //
  rbtree_destroy(seq_ids);
  arraylst_destroy(motif_and_pssm_destroy, motifs);
  free_cisml(cisml);
  rbtree_destroy(options.selected_motifs);
  alph_release(alph);
        
  // measure time
  if (verbosity >= NORMAL_VERBOSE) { // starting time
    c1 = clock();
    fprintf(stderr, "cycles (CPU);            %ld cycles\n", (long) c1);
    fprintf(stderr, "elapsed CPU time:        %f seconds\n", (float) (c1-c0) / CLOCKS_PER_SEC);
  }
  return 0;
}
Ejemplo n.º 7
0
/*************************************************************************
 * Build a completely connected HMM.
 *************************************************************************/
void build_complete_hmm
  (ARRAY_T* background,
   int spacer_states, 
   MOTIF_T *motifs,
   int nmotifs,
   MATRIX_T *transp_freq,
   MATRIX_T *spacer_ave,
   BOOLEAN_T fim,
   MHMM_T **the_hmm)
{
  ALPH_T    alph;
  int motif_states; // Total length of the motifs.
  int num_spacers;  // Total number of spacer states.
  int num_states;   // Total number of states in the model.
  int i_motif;      // Index of the current "from" motif.
  int j_motif;      // Index of the current "to" motif.
  int i_position;   // Index within the current motif or spacer.
  int i_state = 0;  // Index of the current state.

  assert(nmotifs > 0);
  alph = get_motif_alph(motifs);// get the alphabet from the first motif

  // Count the width of the motifs.
  for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++)
    motif_states += get_motif_length(motif_at(motifs, i_motif));
  // Count the spacer states adjacent to begin and end.
  num_spacers = nmotifs * 2;
  // Add the spacer states between motifs.
  num_spacers += nmotifs * nmotifs;
  // Total states = motifs + spacer_states + begin/end
  num_states = motif_states + (num_spacers * spacer_states) + 2;

  // Allocate the model.
  *the_hmm = allocate_mhmm(alph, num_states);

  // Record that this is a completely connected model.
  (*the_hmm)->type = COMPLETE_HMM;

  // Record the number of motifs in the model.
  (*the_hmm)->num_motifs = nmotifs;

  // Record the number of states in the model.
  (*the_hmm)->num_states = num_states;
  (*the_hmm)->num_spacers = ((nmotifs + 1) * (nmotifs + 1)) - 1;
  (*the_hmm)->spacer_states = spacer_states;

  // Put the background distribution into the model.
  copy_array(background, (*the_hmm)->background);

  // Build the begin state.
  build_complete_state(
      START_STATE, 
      i_state,
      alph,
      0, // expected length
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      0, // previous motif
      0, // next motif
      transp_freq,
      spacer_states,
      num_spacers,
      motifs,
      &((*the_hmm)->states[i_state]));
  i_state++;

  int from_motif_state, to_motif_state;
  // Build the spacer states. No transitions from the end state.
  for (i_motif = 0; i_motif <= nmotifs; i_motif++) {
    // No transitions to the start state.
    for (j_motif = 1; j_motif <= nmotifs+1; j_motif++) {
      // No transitions from start to end.
      if ((i_motif == 0) && (j_motif == nmotifs+1))
        continue;
      // Allow multi-state spacers.
      for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
        build_complete_state(
            SPACER_STATE, 
            i_state, 
            alph,
            get_matrix_cell(i_motif, j_motif, spacer_ave),
            background,
            SPACER_NUMSITES,
            NON_MOTIF_INDEX,
            i_position,
            nmotifs,
            i_motif,
            j_motif,
            transp_freq,
            spacer_states,
            num_spacers,
            motifs,
            &((*the_hmm)->states[i_state]));
      }
    }
  }

  // Build the motif states.
  for (i_motif = 0; i_motif < nmotifs; i_motif++) {
    MOTIF_T *this_motif = motif_at(motifs, i_motif);
    STATE_T state;
    for (i_position = 0; i_position < get_motif_length(this_motif); i_position++, i_state++) {
      if (i_position == 0) {
        state = START_MOTIF_STATE;
      } else if (i_position == (get_motif_length(this_motif) - 1)) {
        state = END_MOTIF_STATE;
      } else {
        state = MID_MOTIF_STATE;
      }
      build_complete_state(
          MID_MOTIF_STATE, 
          i_state,
          alph,
          0, // Expected spacer length. 
          get_matrix_row(i_position, get_motif_freqs(this_motif)),
          get_motif_nsites(this_motif),
          i_motif,
          i_position, 
          nmotifs,
          0, // Previous motif index.
          0, // Next motif index.
          transp_freq,
          spacer_states,
          num_spacers,
          motifs,
          &((*the_hmm)->states[i_state]));
    }
  }

  // Build the end state.
  build_complete_state(
      END_STATE, 
      i_state,
      alph,
      0, // Expected spacer length.
      NULL, // Emissions
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      0, // Previous motif index.
      0, // Next motif index.
      transp_freq,
      spacer_states,
      num_spacers,
      motifs,
      &((*the_hmm)->states[i_state]));
  i_state++;

  // Convert spacers to FIMs if requested.
  if (fim) {
    convert_to_fims(*the_hmm);
  }

  // Fill in the transition matrix.
  build_transition_matrix(*the_hmm);
}
Ejemplo n.º 8
0
/*************************************************************************
 * Build a linear HMM.
 *************************************************************************/
void build_linear_hmm
  (ARRAY_T*  background,
   ORDER_T*  order_spacing,
   int       spacer_states, 
   RBTREE_T* motifs, // motifs with key as in order_spacing
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  ALPH_T    alph;
  int       model_length; // Total number of states in the model.
  int       i_state;      // Index of the current state.
  int       i_order;      // Index within the order and spacing.
  int       i_position;   // Index within the current motif or spacer.
  int       motif_i;      // motif key in order spacing
  MOTIF_T  *motif;        // motif
  RBNODE_T *node;

  alph = get_motif_alph((MOTIF_T*)rbtree_value(rbtree_first(motifs)));

  // Calculate the total length of the model.
  model_length = 2; // start and end state
  for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) {
    motif_i = get_order_motif(order_spacing, i_order);
    motif = (MOTIF_T*)rbtree_get(motifs, &motif_i);
    model_length += get_motif_length(motif);
  }
  model_length += (get_order_occurs(order_spacing) + 1) * spacer_states;


  // Allocate the model.
  *the_hmm = allocate_mhmm(alph, model_length);
  check_sq_matrix((*the_hmm)->trans, model_length);

  // Record that this is a linear model.
  (*the_hmm)->type = LINEAR_HMM;

  // Record the number of motifs in the model. 
  // It doesn't want the distinct count
  (*the_hmm)->num_motifs = get_order_occurs(order_spacing);

  // Record the number of states in the model.
  (*the_hmm)->num_states = model_length;
  (*the_hmm)->num_spacers = get_order_occurs(order_spacing) + 1;
  (*the_hmm)->spacer_states = spacer_states;

  // Put the background distribution into the model.
  copy_array(background, (*the_hmm)->background);

  // Begin the model with a non-emitting state.
  i_state = 0;
  check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
  build_linear_state(
      alph,
      START_STATE,
      i_state,
      get_spacer_length(order_spacing, 0),
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION, // position within state (not relevant to start state)
      NULL, // no motif
      &((*the_hmm)->states[i_state]));
  ++i_state;

  // Build the first spacer.
  for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
    check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
    build_linear_state(
        alph,
        SPACER_STATE,
        i_state, 
        get_spacer_length(order_spacing, 0),
        background, 
        SPACER_NUMSITES,
        NON_MOTIF_INDEX,
        i_position, // position within spacer
        NULL, // no motif
        &((*the_hmm)->states[i_state]));
  }

  // Build each motif and subsequent spacer.
  for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) {
    STATE_T state;
    int spacer_len;
    motif_i = get_order_motif(order_spacing, i_order);
    motif = (MOTIF_T*)rbtree_get(motifs, &motif_i);

    // Build the motif.
    for (i_position = 0; i_position < get_motif_length(motif); i_position++, i_state++) {
      if (i_position == 0) {
        state = START_MOTIF_STATE;
        spacer_len = get_spacer_length(order_spacing, i_order);
      } else if (i_position == (get_motif_length(motif) - 1)) {
        state = END_MOTIF_STATE;
        spacer_len = get_spacer_length(order_spacing, i_order+1);
      } else {
        state = MID_MOTIF_STATE;
        spacer_len = 0;
      }
      check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
      build_linear_state(
          alph, 
          state, 
          i_state, 
          spacer_len, // Expected spacer length.
          get_matrix_row(i_position, get_motif_freqs(motif)),
          get_motif_nsites(motif),
          i_order,
          i_position, // position within motif (middle)
          motif,
          &((*the_hmm)->states[i_state]));
    }

    // Build the following spacer.
    for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
      check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
      build_linear_state(
          alph, 
          SPACER_STATE, 
          i_state, 
          get_spacer_length(order_spacing, i_order+1),
          background,
          SPACER_NUMSITES,
          NON_MOTIF_INDEX, 
          i_position, // position within spacer
          NULL, // no motif
          &((*the_hmm)->states[i_state]));
    }
  }

  check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
  // Finish up the model with a non-emitting end state.
  build_linear_state(
      alph, 
      END_STATE, 
      i_state, 
      get_spacer_length(order_spacing, i_order),
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION, // position within state (not relevant to end state)
      NULL, // no motif
      &((*the_hmm)->states[i_state]));
  ++i_state;
  assert(i_state == model_length);

  check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
  // Convert spacers to FIMs if requested.
  if (fim) {
    convert_to_fims(*the_hmm);
  }

  // Fill in the transition matrix.
  build_transition_matrix(*the_hmm);
}
Ejemplo n.º 9
0
/*************************************************************************
 * Build a star topology HMM.
 *************************************************************************/
void build_star_hmm
  (ARRAY_T*  background,
   int       spacer_states, 
   MOTIF_T*  motifs,
   int       nmotifs,
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  ALPH_T alph;
  int       motif_states; /* Total length of the motifs. */
  int       num_spacers;  /* Total number of spacer states. */
  int       num_states;   /* Total number of states in the model. */
  int       i_motif;      /* Index of the current "from" motif. */
  int       i_position;   /* Index within the current motif or spacer. */
  int       i_state = 0;  /* Index of the current state. */

  alph = get_motif_alph(motif_at(motifs, 0));

  /* Count the width of the motifs. */
  for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++)
    motif_states += get_motif_length(motif_at(motifs, i_motif));
  // Only 1 spacer.
  num_spacers = 1;
  /* Total states = motifs + spacer_states + begin/end */
  num_states = motif_states + (num_spacers * spacer_states) + 2;
  /* fprintf(stderr, "motif_states=%d num_spacers=%d num_states=%d\n",
	  motif_states, num_spacers, num_states); */

  /* Allocate the model. */
  *the_hmm = allocate_mhmm(alph, num_states);

  /* Record that this is a star model. */
  (*the_hmm)->type = STAR_HMM;

  /* Record the number of motifs in the model. */
  (*the_hmm)->num_motifs = nmotifs;

  /* Record the number of states in the model. */
  (*the_hmm)->num_states = num_states;
  (*the_hmm)->num_spacers = 1;
  (*the_hmm)->spacer_states = spacer_states;

  // Put the background distribution into the model.
  copy_array(background, (*the_hmm)->background);

  /* Build the begin state. */
  build_star_state(
    alph,
    START_STATE,
		i_state,
		0, // expected length
		NULL,
		0, // Number of sites.
		NON_MOTIF_INDEX,
		NON_MOTIF_POSITION,
		nmotifs,
		spacer_states,
		motifs,
		&((*the_hmm)->states[i_state])
  );
  i_state++;

  // Build the spacer state (state 0).  Allow multi-state spacers.
  for (i_position = 0; i_position < spacer_states; i_position++) {
    build_star_state(
      alph,
      SPACER_STATE, 
		  i_state, 
		  DEFAULT_SPACER_LENGTH,
		  background,
		  SPACER_NUMSITES,
		  NON_MOTIF_INDEX,
		  i_position,
		  nmotifs,
		  spacer_states,
		  motifs,
		  &((*the_hmm)->states[i_state])
    );
    i_state++;
  }

  /* Build the motif states. */
  for (i_motif = 0; i_motif < nmotifs; i_motif++) {
    MOTIF_T *this_motif = motif_at(motifs, i_motif);
    assert(get_motif_length(this_motif) > 1);
    i_position = 0;
    build_star_state(
      alph,
      START_MOTIF_STATE, 
		  i_state,
		  0, // Expected spacer length.
		  get_matrix_row(i_position, get_motif_freqs(this_motif)),
		  get_motif_nsites(this_motif),
		  i_motif,
		  i_position,
		  nmotifs,
		  spacer_states,
		  motifs,
		  &((*the_hmm)->states[i_state])
    );
    i_state++;
    for (i_position = 1; i_position < get_motif_length(this_motif) - 1; i_position++) {
      build_star_state(
        alph,
        MID_MOTIF_STATE, 
		    i_state,
		    0, // Expected spacer length. 
		    get_matrix_row(i_position, get_motif_freqs(this_motif)),
		    get_motif_nsites(this_motif),
		    i_motif,
		    i_position, 
		    nmotifs,
		    spacer_states,
		    motifs,
		    &((*the_hmm)->states[i_state])
      );
      i_state++;
    }
    build_star_state(
      alph,
      END_MOTIF_STATE, 
		  i_state,
		  0, // Expected spacer length.
		  get_matrix_row(i_position, get_motif_freqs(this_motif)),
		  get_motif_nsites(this_motif),
		  i_motif,
		  i_position,
		  nmotifs,
		  spacer_states,
		  motifs,
		  &((*the_hmm)->states[i_state])
    );
    i_state++;
  }

  /* Build the end state. */
  build_star_state(
    alph,
    END_STATE, 
		i_state,
		0, // Expected spacer length.
		NULL, // Emissions
		0, // Number of sites.
		NON_MOTIF_INDEX,
		NON_MOTIF_POSITION,
		nmotifs,
		spacer_states,
		motifs,
		&((*the_hmm)->states[i_state])
  );
  i_state++;

  /* Convert spacers to FIMs if requested. */
  if (fim) {
    convert_to_fims(*the_hmm);
  }

  /* Fill in the transition matrix. */
  build_transition_matrix(*the_hmm);
} // build_star_hmm