Example #1
0
/*************************************************************************
 * Map a motif number onto the index of its first or last HMM state.
 *
 * motif_num     - 0 denotes the start state, 1..nmotifs denote motifs and
 *                 nmotifs+1 denotes the end state.
 * start_or_end  - FALSE selects the first state of the motif, TRUE the
 *                 last one (the end state has a single state, so the flag
 *                 is ignored for it).
 * num_spacers   - number of spacers in the model.
 * spacer_states - number of states in each spacer.
 * motifs/nmotifs- the motif array and its length.
 *
 * The states are laid out as: start state, all spacer states, then the
 * motif states in motif order, so the result is one plus the number of
 * spacer states plus the widths of all motifs that come first.
 *************************************************************************/
static int motif_index
  (const int       motif_num,
   const BOOLEAN_T start_or_end,
   const int       num_spacers,
   const int       spacer_states,
   const MOTIF_T*  motifs,
   const int       nmotifs)
{
  MOTIF_T *motif_list = (MOTIF_T*)motifs;
  int state_index;
  int preceding;
  int m;

  assert(motif_num >= 0);
  assert(motif_num <= (nmotifs + 1));

  // The start state always has a fixed index.
  if (motif_num == 0) {
    return START_INDEX;
  }

  // Step over the start state and every spacer state.
  state_index = 1 + (num_spacers * spacer_states);

  // Step over each motif located before the requested one.
  preceding = motif_num - 1;
  m = 0;
  while (m < preceding) {
    state_index += get_motif_length(motif_at(motif_list, m));
    m++;
  }

  // For the last state of a real motif move to its final position.
  if (motif_num != (nmotifs + 1) && start_or_end) {
    state_index += get_motif_length(motif_at(motif_list, preceding)) - 1;
  }

  return state_index;
}    
Example #2
0
/**************************************************************************
 * Callback invoked when matching an opening matched_element tag for a 
 * CISML file of a secondary motif database. A hit must pass the checks:
 * 1) The current match is for a sequence/motif that we're interested in.
 * 2) A score is supplied.
 * 3) The score supplied is better or equal to the existing best score
 *    (larger scores are better).
 * 4) Consistent with CISML format so start and stop are larger than 0.
 * 5) The distance between the start and stop matches the motif.
 * 6) It fits within the margin region around the primary motif
 * 7) It does not overlap the primary motif. 
 * Provided all those checks pass then the hit is calculated relative to
 * the start of the matched region. If the score is equal to the current
 * best then the relative hit position is added to the list of best hits,
 * otherwise the list is cleared, the best score is updated and the hit
 * is added to the previously empty list.
 *
 * ctx is the SECONDARY_LOADER_T being filled in; start > stop marks a hit
 * on the reverse complement strand, which is stored as a negative
 * position. pvalue and clusterId are not used.
 **************************************************************************/
void match_secondary(void *ctx, long start, long stop, double *score, double *pvalue, char *clusterId) {
  SECONDARY_LOADER_T *loader = (SECONDARY_LOADER_T*)ctx;
  int lpos, rpos, rc, relative_position, match;
  //check if we're loading this match
  if (loader->current_sequence == NULL) return;
  //check if this match has enough information to be considered
  if (score == NULL) return;
  //check to see if the existing match is better
  if (loader->hit_count > 0 && loader->secondary_score > *score) return;
  //convert the coordinates of the match into easier to use ones
  if (start <= 0 || stop <= 0) {
    die("Expected start and stop fields in cisml to be 1 or larger.\n");
  }
  if (start < stop) {
    lpos = start;
    rpos = stop;
    rc = FALSE;
  } else {
    lpos = stop;
    rpos = start;
    rc = TRUE;
  }
  //check that gap makes sense
  if ((rpos - lpos + 1) != get_motif_length(loader->secondary_motif->motif)) {
    die("Motif %s has length %d but a match in a CISML file had a start of %ld and stop of %ld which evaluates to a length of %d\n", 
        get_motif_id(loader->secondary_motif->motif), get_motif_length(loader->secondary_motif->motif), start, stop, (rpos - lpos + 1) );
  }
  //check for overlap with the primary match
  //and that the secondary motif fits within the margin
  if (rpos < loader->primary_lpos) { // left side (upstream)
    if ((loader->primary_lpos - lpos) > loader->margin) return;//outside margin    
  } else if (lpos > loader->primary_rpos) { // right side (downstream)
    if ((rpos - loader->primary_rpos) > loader->margin) return;//outside margin
  } else {
    return;//overlap
  }
  //match seems valid and at least as good as anything we've seen previously
  //note that stored position is relative to the start of the margin, as if
  //this was scored on a trimmed sequence indexing from 1 
  //this has the advantage that we only need the width of the primary
  //motif and the size of the margin to calculate the offset
  relative_position = lpos - (loader->primary_lpos - loader->margin) + 1;
  //now make the scale pos/neg dependent on if the match is with a
  //reverse complement
  match = (rc ? -relative_position : relative_position);
  //a strictly better score (or the very first hit) restarts the list;
  //the comparison must be "new > best" because worse scores have already
  //been rejected above, so "best > new" could never be true here
  if (loader->hit_count == 0 || *score > loader->secondary_score) {
    loader->secondary_score = *score;
    loader->hit_count = 1;
    loader->hits[0] = match;
  } else if (loader->secondary_score == *score) {
    //a tie is appended, growing the list in steps of 10 when it is full
    if (loader->hit_count >= loader->hits_size) {
      loader->hits_size = loader->hit_count + 10;
      loader->hits = mm_realloc(loader->hits, sizeof(int) * loader->hits_size);
    }
    loader->hits[loader->hit_count++] = match;
  }
}
Example #3
0
/*
 * Write one HTML table row per motif to output: the motif id, its width,
 * its best possible match and, when available, the best possible match of
 * its reverse complement.
 *
 * Two consecutive motifs that share the same id are treated as a
 * forward/reverse-complement pair and are reported in a single row. A motif
 * without such a partner gets an empty reverse-complement cell.
 */
void mcast_print_motif_list(FILE * output, MOTIF_T* motifs, int num_motifs) {
  const char *indent = "               ";
  int i;
  fputs("\n", output);
  for (i = 0; i < num_motifs; i++) {
    MOTIF_T *motif = motif_at(motifs, i);
    MOTIF_T *rc_motif = NULL;
    char *motif_id = get_motif_id(motif);
    int width = get_motif_length(motif);
    char *rc_motif_id = NULL;
    // Peek at the next motif; it may be the reverse complement of this one.
    if (i < (num_motifs - 1)) {
      rc_motif = motif_at(motifs, i + 1);
      rc_motif_id = get_motif_id(rc_motif);
    }
    char *best_possible_match = get_best_possible_match(motif);
    char *colored_best_possible_match = color_dna_sequence(best_possible_match);
    char *best_possible_rc_match = NULL;
    char *colored_best_possible_rc_match = NULL;
    if (rc_motif_id && strcmp(motif_id, rc_motif_id) == 0) {
      ++i; // Pair of identical motif ids indicate forward/reverse pair.
      best_possible_rc_match = get_best_possible_match(rc_motif);
      colored_best_possible_rc_match = color_dna_sequence(best_possible_rc_match);
    }
    fprintf(output, "%s<tr>\n", indent);
    fprintf(output, "%s<td>%s</td>\n", indent, motif_id);
    fprintf(output, "%s<td>%d</td>\n", indent, width);
    fprintf(output, "%s<td class=\"sequence\">%s</td>\n", indent, colored_best_possible_match);
    // Passing a null pointer to %s is undefined behaviour, so an unpaired
    // motif prints an empty cell instead.
    fprintf(output, "%s<td class=\"sequence\">%s</td>\n", indent,
        (colored_best_possible_rc_match != NULL ? colored_best_possible_rc_match : ""));
    fprintf(output, "%s</tr>\n", indent);
    myfree(best_possible_match);
    myfree(best_possible_rc_match);
    myfree(colored_best_possible_match);
    myfree(colored_best_possible_rc_match);
  }
}
Example #4
0
/**************************************************************************
 * CISML callback for a matched_element tag belonging to the primary motif.
 *
 * Nothing happens unless a sequence is currently being loaded and the
 * element carries a score. Otherwise the hit is validated and recorded:
 *  - start and stop must both be at least 1 (fatal error otherwise);
 *  - the span between them must equal the primary motif width (fatal
 *    error otherwise);
 *  - the hit must leave "margin" free positions on both sides of the
 *    sequence, else it is silently dropped;
 *  - the first hit, or a hit with a larger score, replaces the list of
 *    best hits; a hit with an equal score is appended to it.
 * Hits with start > stop are on the reverse complement strand and are
 * stored as the negated left position. pvalue and clusterId are unused.
 **************************************************************************/
void match_primary(void *ctx, long start, long stop, double *score, double *pvalue, char *clusterId) {
  PRIMARY_LOADER_T *loader = (PRIMARY_LOADER_T*)ctx;
  int left, right, revcomp, signed_pos;

  // Ignore the element unless we are loading data and it has a score.
  if (loader->current_sequence == NULL || score == NULL) return;

  if (start <= 0 || stop <= 0) {
    die("Expected start and stop fields in cisml to be 1 or larger.\n");
  }

  // Order the coordinates; a reversed pair marks the opposite strand.
  revcomp = (start < stop ? FALSE : TRUE);
  left = (revcomp ? stop : start);
  right = (revcomp ? start : stop);

  // The matched span has to be exactly as wide as the motif.
  if ((right - left + 1) != get_motif_length(loader->motif)) {
    die("Motif %s has length %d but a match in a CISML file had a start of %ld and stop of %ld which evaluates to a length of %d\n", 
        get_motif_id(loader->motif), get_motif_length(loader->motif), start, stop, (right - left + 1) );
  }

  // Left margin: with a margin of 1 the motif may start at 2 at the
  // earliest, leaving room for a secondary motif of width 1 at position 1.
  if (left <= loader->margin) return;
  // Right margin: in a sequence of length 5 with a margin of 1 the motif
  // may end at 4 at the latest, leaving position 5 for a secondary motif.
  if (right > (loader->current_sequence->length - loader->margin)) return;

  signed_pos = (revcomp ? -left : left);

  // A first or strictly better hit restarts the list of best hits.
  if (loader->hit_count == 0 || *score > loader->current_score) {
    loader->current_score = *score;
    loader->hit_count = 1;
    loader->hits[0] = signed_pos;
    return;
  }

  // Anything that is not an exact tie is worse and is dropped.
  if (*score != loader->current_score) return;

  // Append the tie, growing the list by 10 slots when it is full.
  if (loader->hit_count >= loader->hits_size) {
    loader->hits_size = loader->hit_count + 10;
    loader->hits = mm_realloc(loader->hits, sizeof(int) * loader->hits_size);
  }
  loader->hits[loader->hit_count] = signed_pos;
  loader->hit_count++;
}
Example #5
0
/***********************************************************************
 * Say that the motif ID is printed above a given motif, roughly
 * centered. If the motif ID string is longer than the space left for
 * it, it is truncated on the right.
 * This function returns the character that appears in the nth
 * position of that motif ID string. Blank positions are reported
 * as '*' at either end of the motif and as '_' elsewhere.
 * If the motif was created from a double stranded source then
 * the strand is included (the full motif ID is used).
 *
 * position - 0-based column within the motif; must be smaller than
 *            the motif width.
 * a_motif  - the motif whose ID is being laid out.
 *
 * The character is computed directly rather than by formatting the
 * whole label into a temporary buffer. The result is the same, but a
 * start offset that exceeds the motif width (possible for very narrow
 * motifs because of the adjustment below) can no longer overrun a
 * buffer.
 ***********************************************************************/
static char get_motif_id_char
  (int      position,
   MOTIF_T* a_motif)
{
  const char *id;
  int   id_width, m_width, id_start, id_offset;
  char  return_char;

  m_width = get_motif_length(a_motif);
  assert(position >= 0);
  assert(position < m_width);

  id = get_full_motif_id(a_motif);
  id_width = (int)strlen(id);

  // Get position where ID starts relative to start of motif.
  id_start = id_width <= m_width ? ((m_width - id_width) / 2) : 0;
  // FIXME: (tlb) The following if() was put in to make the smoke tests of mhmm
  // pass.  It should be removed and the smoke test comparison files changed.
  if (m_width % 2 == 0 && id_width % 2 == 0) {
    id_start++; 
  } else {
    id_start += 2;
  }

  // Columns before id_start, and columns past the end of the ID, are blank;
  // every other column shows the corresponding ID character.
  id_offset = position - id_start;
  if (id_offset >= 0 && id_offset < id_width) {
    return_char = id[id_offset];
  } else {
    return_char = ' ';
  }

  // Make blanks visible: '*' marks the motif edges, '_' the interior.
  if (return_char == ' ') {
    if ((position == 0) || (position == (m_width - 1))) {
      return_char = '*';
    } else {
      return_char = '_';
    }
  }

  return(return_char);
}
Example #6
0
/*************************************************************************
 * Write one motif as a JSON object.
 *
 * The object holds the database id, the motif id (plus the alternate id
 * and URL when they are non-empty), the motif length, E-value and site
 * count, the frequency matrix as an array of rows ("pwm"), the
 * enrichment statistics taken from stats, and every second entry of
 * counts->sites between mlen-1 and allocated-(mlen-1) ("sites").
 *************************************************************************/
static void output_motif_json(JSONWR_T* json, MOTIF_STATS_T* stats, 
    SITE_COUNTS_T* counts) {
  MOTIF_T *motif = stats->motif;
  MATRIX_T *pwm = get_motif_freqs(motif);
  int n_letters = alph_size(get_motif_alph(motif), ALPH_SIZE);
  int width, row, col, pos, stop;
  char *alt;

  jsonwr_start_object_value(json);

  // Identification.
  jsonwr_lng_prop(json, "db", stats->db->id);
  jsonwr_str_prop(json, "id", get_motif_id(motif));
  alt = get_motif_id2(motif);
  if (alt[0] != '\0') {
    jsonwr_str_prop(json, "alt", alt);
  }

  // General motif properties.
  width = get_motif_length(motif);
  jsonwr_lng_prop(json, "len", width);
  jsonwr_dbl_prop(json, "motif_evalue", get_motif_evalue(motif));
  jsonwr_dbl_prop(json, "motif_nsites", get_motif_nsites(motif));
  if (get_motif_url(motif) && *get_motif_url(motif)) {
    jsonwr_str_prop(json, "url", get_motif_url(motif));
  }

  // Frequency matrix, one inner array per motif position.
  jsonwr_property(json, "pwm");
  jsonwr_start_array_value(json);
  row = 0;
  while (row < width) {
    jsonwr_start_array_value(json);
    col = 0;
    while (col < n_letters) {
      jsonwr_dbl_value(json, get_matrix_cell(row, col, pwm));
      col++;
    }
    jsonwr_end_array_value(json);
    row++;
  }
  jsonwr_end_array_value(json);

  // Enrichment statistics.
  jsonwr_lng_prop(json, "bin_width", stats->central_window+1);
  jsonwr_dbl_prop(json, "bin_sites", stats->central_sites);
  jsonwr_lng_prop(json, "total_sites", counts->total_sites);
  jsonwr_dbl_prop(json, "log_pvalue", stats->log_adj_pvalue);
  jsonwr_dbl_prop(json, "max_prob", stats->max_prob);

  // Site counts: every second slot inside the usable range.
  jsonwr_property(json, "sites");
  jsonwr_start_array_value(json);
  stop = counts->allocated - (width - 1);
  pos = width - 1;
  while (pos < stop) {
    jsonwr_dbl_value(json, counts->sites[pos]);
    pos += 2;
  }
  jsonwr_end_array_value(json);

  jsonwr_end_object_value(json);
}
Example #7
0
/*************************************************************************
 * Write the site counts of one motif as text.
 *
 * The first line names the database and the motif (with the alternate
 * id appended when it is non-empty). It is followed by one line per
 * reported slot of counts->sites: every second slot from w-1 up to, but
 * excluding, allocated-(w-1), where w is the motif width. Each line
 * holds (slot - sequence_length + 1) / 2 and the count in that slot.
 *************************************************************************/
static void output_site_counts(FILE* fh, int sequence_length, 
    MOTIF_DB_T* db, MOTIF_T* motif, SITE_COUNTS_T* counts) {
  char *alt_id;
  int width, slot, stop;

  // Header line.
  fprintf(fh, "DB %d MOTIF\t%s", db->id, get_motif_id(motif));
  alt_id = get_motif_id2(motif);
  if (alt_id[0] != '\0') {
    fprintf(fh, "\t%s", alt_id);
  }
  fprintf(fh, "\n");

  // One line per reported slot.
  width = get_motif_length(motif);
  stop = counts->allocated - (width - 1);
  slot = width - 1;
  while (slot < stop) {
    double offset = ((double)(slot - sequence_length + 1)) / 2.0;
    fprintf(fh, "% 6.1f\t%g\n", offset, counts->sites[slot]);
    slot += 2;
  }
}
Example #8
0
/**************************************************************************
 * CISML callback for an opening scanned_sequence tag in the results of a
 * secondary motif database.
 *
 * Does nothing when no secondary motif is selected. Otherwise the
 * sequence is looked up by accession; an unknown accession clears the
 * current sequence so that its matches are skipped. For a known sequence
 * the left and right bounds of the primary motif site are calculated and
 * cached, and the best-hit state is reset. Meeting a sequence that
 * already has secondary matches recorded is a fatal error.
 * Only ctx and accession are used.
 **************************************************************************/
void sequence_secondary(void *ctx, char *accession, char *name, char *db, char *lsId, double *score, double *pvalue, long *length) {
  SECONDARY_LOADER_T *loader = (SECONDARY_LOADER_T*)ctx;
  RBNODE_T *found;
  int primary;

  if (loader->secondary_motif == NULL) return;

  found = rbtree_lookup(loader->sequences, accession, FALSE, NULL);
  if (found == NULL) {
    // Not a sequence we are tracking.
    loader->current_sequence = NULL;
    return;
  }

  loader->current_sequence = (SEQUENCE_T*)rbtree_value(found);

  // The stored primary match is negative for reverse complement sites, so
  // its magnitude is the left edge of the primary motif.
  primary = loader->current_sequence->primary_match;
  if (primary < 0) {
    primary = -primary;
  }
  loader->primary_lpos = primary;
  loader->primary_rpos = loader->primary_lpos + get_motif_length(loader->primary_motif) - 1;

  if (loader->secondary_matches[loader->current_sequence->index] != 0) {
    die("Already seen this sequence!");
  }

  // Start this sequence with no best hit.
  loader->secondary_score = 0;
  loader->hit_count = 0;
}
Example #9
0
/*************************************************************************
 * Set up one state in a complete HMM, given the appropriate data.
 *
 * Fills in the pre-allocated state a_state: its type, motif width,
 * emission distribution (copied from freqs when freqs is not NULL),
 * site count, motif index and position, motif ID character, and the
 * index/probability arrays of its incoming and outgoing transitions.
 * When verbosity is at least NORMAL_VERBOSE a progress token for the
 * state is written to stderr.
 *
 * The arrays stored in itrans_in/itrans_out/trans_in/trans_out/emit are
 * allocated here.
 *
 * An unknown state_type is reported through die().
 *************************************************************************/
static void build_complete_state
  (STATE_T state_type,    // Type of state (START, SPACER,..)
   int i_state,           // State index.
   ALPH_T alph,           // alphabet
   int expected_length,   // For spacers, the expected length of output.
   ARRAY_T *freqs,        // Emission probability distrib.
   double num_sites,      // Number of sites for this emission.
   int i_motif,           // Index of motif this state is in.
   int i_position,        // Position of this state within motif
   int nmotifs,           // Total number of motifs.
   int prev_motif,        // Index of previous motif.
   int next_motif,        // Index of next motif.
   MATRIX_T *transp_freq, // Transition freq matrix.
   int spacer_states,     // Number of HMM states per spacer.
   int num_spacers,       // Total number of spacers in HMM.
   MOTIF_T *motifs,       // Motifs.
   MHMM_STATE_T *a_state) // State to be filled in (pre-allocated).
{
  MOTIF_T *motif; // The motif (for motif state)
  int j_motif;    // Index of the current motif.

  // Non-motif states (start, end, spacer) have no motif attached.
  if (i_motif != NON_MOTIF_INDEX) motif = motif_at(motifs, i_motif);
  else motif = NULL;

  // Tell the user what's up.
  if (verbosity >= NORMAL_VERBOSE) {
    switch (state_type) {
    case START_STATE :
      fprintf(stderr, "Building HMM: (0) ");
      break;
    case SPACER_STATE :
      fprintf(stderr, "%d ", i_state);
      break;
    case END_MOTIF_STATE :
      fprintf(stderr, "%d | ", i_state);
      break;
    case START_MOTIF_STATE :
    case MID_MOTIF_STATE :
      fprintf(stderr, "%d-", i_state);
      break;
    case END_STATE :
      fprintf(stderr, "(%d)\n", i_state);
      break;
    default:
      die("Invalid state!");
    }
  }

  // Record what type of state this is.
  a_state->type = state_type;

  // Record the motif width if this is a motif.
  // NOTE(review): motif is NULL here if a motif state type is combined with
  // i_motif == NON_MOTIF_INDEX; callers are assumed never to do that.
  if (state_type == START_MOTIF_STATE ||
      state_type == MID_MOTIF_STATE ||
      state_type == END_MOTIF_STATE) {
    a_state->w_motif = get_motif_length(motif);
  } else {
    a_state->w_motif = 1;
  }
  

  // Set up the emission distribution and a few other tidbits.
  if (freqs != NULL) { // Start and end states have no emissions.
    a_state->emit = allocate_array(alph_size(alph, ALL_SIZE));
    copy_array(freqs, a_state->emit);
  }
  a_state->num_sites = num_sites;
  a_state->i_motif = i_motif;
  a_state->i_position = i_position;

  // Record the motif ID character at this position.
  if ((state_type == START_STATE) ||
      (state_type == END_STATE) ||
      (state_type == SPACER_STATE)) {
    a_state->id_char = NON_MOTIF_ID_CHAR;
  } else { // motif state
    // NOTE(review): strncpy does not NUL-terminate when the source is at
    // least MAX_MOTIF_ID_LENGTH + 2 characters long; presumably the full
    // motif ID is always shorter than the motif_id buffer -- TODO confirm.
    strncpy(a_state->motif_id, get_full_motif_id(motif), MAX_MOTIF_ID_LENGTH + 2);
    a_state->id_char = get_motif_id_char(i_position, motif);
  }
  assert(a_state->id_char != '\0');

  // First set up the transitions into this state.
  switch (state_type) {
  case START_STATE :
    // Nothing leads into the start state.
    a_state->ntrans_in = 0;
    a_state->itrans_in = NULL;
    a_state->trans_in = NULL;
    break;
  case START_MOTIF_STATE :
    // Transitions come from any motif or from the start state.
    // Each one arrives through the last state of the spacer that joins
    // source j_motif to this motif (motif numbers are 1-based, 0 = start).
    a_state->ntrans_in = nmotifs + 1;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int) * (nmotifs + 1));
    a_state->trans_in = allocate_array(nmotifs + 1);
    for (j_motif = 0; j_motif < nmotifs + 1; j_motif++) {
      a_state->itrans_in[j_motif]
        = spacer_index(j_motif, i_motif + 1, TRUE, nmotifs, spacer_states);
      set_array_item(j_motif, 
                     get_matrix_cell(j_motif, i_motif + 1, transp_freq), 
                     a_state->trans_in);
    }
    break;
  case END_STATE :
    // Transitions come from any motif.
    // The end state is addressed as motif number nmotifs + 1.
    a_state->ntrans_in = nmotifs;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int) * nmotifs);
    a_state->trans_in = allocate_array(nmotifs);
    for (j_motif = 0; j_motif < nmotifs; j_motif++) {
      a_state->itrans_in[j_motif] = spacer_index(j_motif + 1,
                                                 nmotifs + 1, TRUE,
                                                 nmotifs, spacer_states);
      set_array_item(j_motif, 
                     get_matrix_cell(j_motif + 1, nmotifs + 1, transp_freq), 
                     a_state->trans_in);
    }
    break;
  case MID_MOTIF_STATE :
  case END_MOTIF_STATE :
    // Inside a motif the only predecessor is the previous motif position.
    a_state->ntrans_in = 1;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int));
    a_state->itrans_in[0] = i_state - 1;
    a_state->trans_in = allocate_array(1);
    set_array_item(0, 1.0, a_state->trans_in);
    break;
  case SPACER_STATE :
    a_state->ntrans_in = 2;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int) * 2);
    a_state->trans_in = allocate_array(2);
    // For multi-state spacers, incoming transition from previous state.
    // The first spacer state is entered from the last state of prev_motif.
    if (i_position != 0)
      a_state->itrans_in[0] = i_state - 1;
    else 
      a_state->itrans_in[0] = motif_index(prev_motif, TRUE, num_spacers,
                                          spacer_states, motifs, nmotifs);
    // The other transition is a self-transition.
    // NOTE(review): the incoming probabilities use
    // expected_length / spacer_states (an integer division, both operands
    // are int) while the outgoing ones below use expected_length unscaled;
    // confirm which of the two is intended.
    a_state->itrans_in[1] = i_state;
    set_array_item(0, 1.0 - self_trans(expected_length / spacer_states),
                   a_state->trans_in);
    set_array_item(1, self_trans(expected_length / spacer_states),
                   a_state->trans_in);
    break;
  default:
    die("Illegal state!");
  }

  // Then set up the transitions out of this state.
  switch (state_type) {
  case START_STATE :
    // Transitions go to each motif.
    // Each one enters the first state of the spacer from start (0) to
    // motif j_motif + 1.
    a_state->ntrans_out = nmotifs;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int) * nmotifs);
    a_state->trans_out = allocate_array(nmotifs);
    for (j_motif = 0; j_motif < nmotifs; j_motif++) {
      a_state->itrans_out[j_motif] = spacer_index(0, j_motif + 1, FALSE,
                                                  nmotifs, spacer_states);
      set_array_item(j_motif,
                     get_matrix_cell(0, j_motif + 1, transp_freq),
                     a_state->trans_out);
    }
    break;
  case END_MOTIF_STATE :
    // Can go to any other motif or to the end state.
    // Target numbers run from 1 to nmotifs + 1 (the end state).
    a_state->ntrans_out = nmotifs + 1;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int) * (nmotifs + 1));
    a_state->trans_out = allocate_array(nmotifs + 1);
    for (j_motif = 0; j_motif < nmotifs + 1; j_motif++) {
      a_state->itrans_out[j_motif] = spacer_index(i_motif + 1,
                                                  j_motif + 1, FALSE,
                                                  nmotifs, spacer_states);
      set_array_item(j_motif,
                     get_matrix_cell(i_motif + 1, j_motif + 1, transp_freq),
                     a_state->trans_out);
    }
    break;
  case START_MOTIF_STATE :
  case MID_MOTIF_STATE :
    // Inside a motif the only successor is the next motif position.
    a_state->ntrans_out = 1;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int));
    a_state->itrans_out[0] = i_state + 1;
    a_state->trans_out = allocate_array(1);
    set_array_item(0, 1.0, a_state->trans_out);
    break;
  case SPACER_STATE :
    a_state->ntrans_out = 2;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int) * 2);
    a_state->trans_out = allocate_array(2);
    // The first transition is a self-transition.
    a_state->itrans_out[0] = i_state;
    // For multi-state spacers, outgoing transition to next state.
    // The last spacer state leads to the first state of next_motif.
    if (i_position < spacer_states - 1)
      a_state->itrans_out[1] = i_state + 1;
    else 
      a_state->itrans_out[1] = motif_index(next_motif, FALSE, num_spacers,
                                           spacer_states, motifs, nmotifs);
    set_array_item(0, self_trans(expected_length), a_state->trans_out);
    set_array_item(1, 1.0 - self_trans(expected_length), a_state->trans_out);
    break;
  case END_STATE :
    // Nothing leaves the end state.
    a_state->ntrans_out = 0;
    a_state->itrans_out = NULL;
    a_state->trans_out = NULL;
    break;
  default:
    die("Illegal state!");
  }
}
Example #10
0
/*************************************************************************
 * Build a completely connected HMM.
 *
 * background    - background distribution; copied into the model and used
 *                 as the emission distribution of every spacer state.
 * spacer_states - number of HMM states per spacer.
 * motifs        - the motifs; nmotifs (> 0) is their number.
 * transp_freq   - transition frequency matrix, indexed by motif number
 *                 (0 = start, 1..nmotifs = motifs, nmotifs+1 = end).
 * spacer_ave    - expected spacer lengths, indexed the same way.
 * fim           - if TRUE, spacers are converted to FIMs after building.
 * the_hmm       - receives the newly allocated model.
 *
 * States are created in this order: the start state, the states of every
 * spacer (from each of start/motifs to each of motifs/end, except start
 * to end), the states of every motif, and finally the end state.
 *************************************************************************/
void build_complete_hmm
  (ARRAY_T* background,
   int spacer_states, 
   MOTIF_T *motifs,
   int nmotifs,
   MATRIX_T *transp_freq,
   MATRIX_T *spacer_ave,
   BOOLEAN_T fim,
   MHMM_T **the_hmm)
{
  ALPH_T    alph;
  int motif_states; // Total length of the motifs.
  int num_spacers;  // Total number of spacers in the model.
  int num_states;   // Total number of states in the model.
  int i_motif;      // Index of the current "from" motif.
  int j_motif;      // Index of the current "to" motif.
  int i_position;   // Index within the current motif or spacer.
  int i_state = 0;  // Index of the current state.

  assert(nmotifs > 0);
  alph = get_motif_alph(motifs);// get the alphabet from the first motif

  // Count the width of the motifs.
  for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++)
    motif_states += get_motif_length(motif_at(motifs, i_motif));
  // Count the spacers adjacent to begin and end.
  num_spacers = nmotifs * 2;
  // Add the spacers between motifs.
  num_spacers += nmotifs * nmotifs;
  // Total states = motifs + spacer_states + begin/end
  num_states = motif_states + (num_spacers * spacer_states) + 2;

  // Allocate the model.
  *the_hmm = allocate_mhmm(alph, num_states);

  // Record that this is a completely connected model.
  (*the_hmm)->type = COMPLETE_HMM;

  // Record the number of motifs in the model.
  (*the_hmm)->num_motifs = nmotifs;

  // Record the number of states in the model.
  (*the_hmm)->num_states = num_states;
  // Same value as num_spacers: (n+1)^2 - 1 == n*n + 2*n.
  (*the_hmm)->num_spacers = ((nmotifs + 1) * (nmotifs + 1)) - 1;
  (*the_hmm)->spacer_states = spacer_states;

  // Put the background distribution into the model.
  copy_array(background, (*the_hmm)->background);

  // Build the begin state.
  build_complete_state(
      START_STATE, 
      i_state,
      alph,
      0, // expected length
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      0, // previous motif
      0, // next motif
      transp_freq,
      spacer_states,
      num_spacers,
      motifs,
      &((*the_hmm)->states[i_state]));
  i_state++;

  // Build the spacer states. No transitions from the end state.
  for (i_motif = 0; i_motif <= nmotifs; i_motif++) {
    // No transitions to the start state.
    for (j_motif = 1; j_motif <= nmotifs+1; j_motif++) {
      // No transitions from start to end.
      if ((i_motif == 0) && (j_motif == nmotifs+1))
        continue;
      // Allow multi-state spacers.
      for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
        build_complete_state(
            SPACER_STATE, 
            i_state, 
            alph,
            get_matrix_cell(i_motif, j_motif, spacer_ave),
            background,
            SPACER_NUMSITES,
            NON_MOTIF_INDEX,
            i_position,
            nmotifs,
            i_motif,
            j_motif,
            transp_freq,
            spacer_states,
            num_spacers,
            motifs,
            &((*the_hmm)->states[i_state]));
      }
    }
  }

  // Build the motif states.
  for (i_motif = 0; i_motif < nmotifs; i_motif++) {
    MOTIF_T *this_motif = motif_at(motifs, i_motif);
    const int motif_width = get_motif_length(this_motif);
    for (i_position = 0; i_position < motif_width; i_position++, i_state++) {
      STATE_T state;
      // The first and last positions carry the transitions that connect
      // the motif to the spacers, so they need their own state types.
      if (i_position == 0) {
        state = START_MOTIF_STATE;
      } else if (i_position == (motif_width - 1)) {
        state = END_MOTIF_STATE;
      } else {
        state = MID_MOTIF_STATE;
      }
      // Pass the state type chosen above; always passing MID_MOTIF_STATE
      // would leave every motif without links to and from its spacers.
      build_complete_state(
          state, 
          i_state,
          alph,
          0, // Expected spacer length. 
          get_matrix_row(i_position, get_motif_freqs(this_motif)),
          get_motif_nsites(this_motif),
          i_motif,
          i_position, 
          nmotifs,
          0, // Previous motif index.
          0, // Next motif index.
          transp_freq,
          spacer_states,
          num_spacers,
          motifs,
          &((*the_hmm)->states[i_state]));
    }
  }

  // Build the end state.
  build_complete_state(
      END_STATE, 
      i_state,
      alph,
      0, // Expected spacer length.
      NULL, // Emissions
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      0, // Previous motif index.
      0, // Next motif index.
      transp_freq,
      spacer_states,
      num_spacers,
      motifs,
      &((*the_hmm)->states[i_state]));
  i_state++;

  // Convert spacers to FIMs if requested.
  if (fim) {
    convert_to_fims(*the_hmm);
  }

  // Fill in the transition matrix.
  build_transition_matrix(*the_hmm);
}
Example #11
0
File: ramen.c Project: CPFL/gmeme
/*
 * Read every sequence from args.sequence_filename and score it against
 * every motif.
 *
 * Fills in the file-level results: seq_ids receives the sequence names,
 * seq_fscores the number parsed from each sequence description, and
 * results[i][j] the scan score of motif i on sequence j. The motif array
 * is read in pairs (2*i and 2*i+1 are handed to the scanner together).
 * When args.linreg_normalise is set, each score is divided by the product,
 * over the motif columns, of the largest A/C/G/T frequency in that column.
 *
 * Terminates the program through ramen_terminate(1) if the sequence file
 * cannot be opened or the result table cannot be allocated.
 */
void ramen_scan_sequences() {
		static const char dna_letters[] = "ACGT";
		FILE* seq_file = NULL;
		MOTIF_T* motif = NULL;
		SEQ_T* sequence = NULL;
		SEQ_T** seq_list;
		int num_seqs;
		int i;
		int j;

		//Open the file.
		if (open_file(args.sequence_filename, "r", FALSE, "FASTA", "sequences", &seq_file) == 0) {
				fprintf(stderr, "Couldn't open the file %s.\n", args.sequence_filename);
				ramen_terminate(1);
		}

		//Start reading in the sequences
		//NOTE(review): seq_file is never closed and seq_list is never released
		//in this function; confirm whether read_many_fastas or a later
		//clean-up step owns them before adding an fclose/free here.
		read_many_fastas(ramen_alph, seq_file, MAX_SEQ_LENGTH, &num_seqs, &seq_list);


		seq_ids = new_string_list();
		seq_fscores = allocate_array(num_seqs);

		//Allocate the required space for results.
		//A zero-sized request may legitimately return NULL, so only a failed
		//non-empty request is treated as an error.
		results = malloc(sizeof(double*) * motifs.num);
		if (results == NULL && motifs.num > 0) {
				fprintf(stderr, "Out of memory while allocating the results table.\n");
				ramen_terminate(1);
		}
		for (i=0;i<motifs.num;i++) {
				results[i] = malloc(sizeof(double)*num_seqs);
				if (results[i] == NULL && num_seqs > 0) {
						fprintf(stderr, "Out of memory while allocating the results table.\n");
						ramen_terminate(1);
				}
		}

		for (j=0;j<num_seqs;j++) {

				fprintf(stderr, "\rScanning %i of %i sequences...", j+1, num_seqs);

				//copy the pointer into our current object for clarity
				sequence = seq_list[j];

				//Read the fluorescence data from the description field.
				add_string(get_seq_name(sequence),seq_ids);
				set_array_item(j,atof(get_seq_description(sequence)),seq_fscores);

				//Scan with each motif.
				for (i=0;i<motifs.num;i++) {
						int motifindex = i*2;

						results[i][j] = ramen_sequence_scan(sequence, motif_at(motifs.motifs, motifindex), 
											      motif_at(motifs.motifs, motifindex+1),
											      NULL, NULL, //No need to pass PSSM.
										              AVG_ODDS, 0, TRUE, 0, motifs.bg_freqs);

						if (TRUE == args.linreg_normalise) {
								int k;
								int l;
								double maxscore = 1;
								motif = motif_at(motifs.motifs,motifindex); 
								//The normaliser is the score of the best possible
								//site: the product of the largest frequency in
								//every column of the motif.
								for (k=0;k<get_motif_length(motif);k++) {
										double maxprob = 0;
										for (l=0;dna_letters[l]!='\0';l++) {
												double prob = get_matrix_cell(k, alph_index(ramen_alph, dna_letters[l]), get_motif_freqs(motif));
												if (maxprob < prob)
														maxprob = prob;
										}
										maxscore *= maxprob;
								}
								results[i][j] /= maxscore;
						}
				}
		}

}
Example #12
0
/*************************************************************************
 * Entry point for centrimo
 *
 * Reads the sequences and motif databases named by the command line,
 * scans every sequence with every motif, tallies the site positions and
 * writes the plain-text site counts, the HTML/JSON report (when the HTML
 * template can be opened) and the text summary.
 *
 * Returns 0 on success; unrecoverable problems are reported through die().
 *************************************************************************/
int main(int argc, char *argv[]) {
  CENTRIMO_OPTIONS_T options;
  SEQ_SITES_T seq_sites;
  SITE_COUNTS_T counts;
  int seqN, motifN, seqlen, db_i, motif_i, i;
  double log_pvalue_thresh;
  SEQ_T** sequences = NULL;
  ARRAY_T* bg_freqs = NULL;
  ARRAYLST_T *stats_list;
  MOTIF_DB_T **dbs, *db;
  MOTIF_STATS_T *stats;
  MOTIF_T *motif, *rev_motif;
  PSSM_T *pos_pssm, *rev_pssm;
  char *sites_path, *desc;
  FILE *sites_file;
  HTMLWR_T *html;
  JSONWR_T *json;

  // COMMAND LINE PROCESSING
  process_command_line(argc, argv, &options);

  // load the sequences
  read_sequences(options.alphabet, options.seq_source, &sequences, &seqN);
  // Without sequences the site-count array below would be sized from
  // (2 * 0) - 1 == -1, so stop here with a clear message instead.
  if (seqN <= 0) {
    die("No sequences were read; nothing to scan.\n");
  }
  // the length of the first sequence is used for all sequences
  seqlen = get_seq_length(sequences[0]);
  // calculate a sequence background (unless other background is given)
  if (!options.bg_source) {
    bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences);
  }

  // load the motifs
  motifN = 0;
  dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources));
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    char* db_source;
    db_source = (char*)arraylst_get(i, options.motif_sources);
    dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, 
        options.pseudocount, options.selected_motifs, options.alphabet);
    motifN += arraylst_size(dbs[i]->motifs);
  }
  // per-motif threshold on the log adjusted p-value: log(evalue / motifN)
  log_pvalue_thresh = log(options.evalue_thresh) - log(motifN);
  // Setup some things for double strand scanning
  if (options.scan_both_strands == TRUE) {
    // Set up hash tables for computing reverse complement
    setup_hash_alph(DNAB);
    setalph(0);
    // Correct background by averaging on freq. for both strands.
    average_freq_with_complement(options.alphabet, bg_freqs);
    normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs);
    calc_ambigs(options.alphabet, FALSE, bg_freqs);
  }
  // Create output directory
  if (create_output_directory(options.output_dirname, options.allow_clobber, 
        (verbosity >= NORMAL_VERBOSE))) {
    die("Couldn't create output directory %s.\n", options.output_dirname);
  }
  // open output files
  sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME);
  sites_file = fopen(sites_path, "w");
  if (sites_file == NULL) {
    // every motif writes to this file, so there is no point continuing
    die("Couldn't open %s for writing.\n", sites_path);
  }
  free(sites_path);
  // setup html monolith writer
  json = NULL;
  if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) {
    htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME);
    htmlwr_replace(html, "centrimo_data.js", "data");
    json = htmlwr_output(html);
    if (json == NULL) die("Template does not contain data section.\n");
  } else {
    DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n");
  }
  if (json) {
    // output some top level variables
    jsonwr_str_prop(json, "version", VERSION);
    jsonwr_str_prop(json, "revision", REVISION);
    jsonwr_str_prop(json, "release", ARCHIVE_DATE);
    jsonwr_str_array_prop(json, "cmd", argv, argc);
    jsonwr_property(json, "options");
    jsonwr_start_object_value(json);
    jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount);
    jsonwr_dbl_prop(json, "score", options.score_thresh);
    jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh);
    jsonwr_lng_prop(json, "maxbin", options.max_window+1);
    jsonwr_bool_prop(json, "norc", !options.scan_both_strands);
    jsonwr_bool_prop(json, "noflip", options.no_flip);
    jsonwr_end_object_value(json);
    // output the description
    desc = prepare_description(&options);
    if (desc) {
      jsonwr_str_prop(json, "job_description", desc);
      free(desc);
    }
    // output size metrics
    jsonwr_lng_prop(json, "seqlen", seqlen);
    jsonwr_lng_prop(json, "tested", motifN);
    // output the fasta db
    jsonwr_property(json, "sequence_db");
    jsonwr_start_object_value(json);
    jsonwr_str_prop(json, "source", options.seq_source);
    jsonwr_lng_prop(json, "count", seqN);
    jsonwr_end_object_value(json);
    // output the motif dbs
    jsonwr_property(json, "motif_dbs");
    jsonwr_start_array_value(json);
    for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
      db = dbs[db_i];
      jsonwr_start_object_value(json);
      jsonwr_str_prop(json, "source", db->source);
      jsonwr_lng_prop(json, "count", arraylst_size(db->motifs));
      jsonwr_end_object_value(json);
    }
    jsonwr_end_array_value(json);
    // start the motif array
    jsonwr_property(json, "motifs");
    jsonwr_start_array_value(json);
  }
  /**************************************************************
   * Tally the positions of the best sites for each of the 
   * selected motifs.
   **************************************************************/
  // prepare the sequence sites
  memset(&seq_sites, 0, sizeof(SEQ_SITES_T));
  // prepare the site counts: one slot per half-position offset
  counts.allocated = ((2 * seqlen) - 1);
  counts.sites = mm_malloc(sizeof(double) * counts.allocated);
  // prepare the motifs stats list
  stats_list = arraylst_create();
  // prepare the other vars
  motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL;
  for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
    db = dbs[db_i];
    for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) {
      motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs);
      DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n",  
          get_motif_id(motif), get_motif_length(motif));
      // reset the counts
      for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0;
      counts.total_sites = 0;
      // create the pssm 
      pos_pssm = make_pssm(bg_freqs, motif);
      // If required, do the same for the reverse complement motif.
      if (options.scan_both_strands) {
        rev_motif = dup_rc_motif(motif);
        rev_pssm = make_pssm(bg_freqs, rev_motif);
      }
      // scan the sequences
      for (i = 0; i < seqN; i++)
        score_sequence(&options, sequences[i], pos_pssm, rev_pssm, 
            &seq_sites, &counts);
      // DEBUG check that the sum of the sites is close to the site count
      double sum_check = 0, sum_diff;
      for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i];
      sum_diff = counts.total_sites - sum_check;
      if (sum_diff < 0) sum_diff = -sum_diff;
      if (sum_diff > 0.1) {
        fprintf(stderr, "Warning: site counts don't sum to accurate value! "
            "%g != %ld\n", sum_check, counts.total_sites);
      }
      // output the plain text site counts
      output_site_counts(sites_file, seqlen, db, motif, &counts);
      // compute the best central window
      stats = compute_stats(options.max_window, seqlen, db, motif, &counts);
      // Keep every motif that passes the threshold for the text report;
      // the JSON entry is only written when the HTML writer is available.
      if (stats->log_adj_pvalue <= log_pvalue_thresh) {
        if (json) output_motif_json(json, stats, &counts);
        arraylst_add(stats, stats_list);
      } else {
        free(stats);
      }
      // Free memory associated with this motif.
      free_pssm(pos_pssm);
      free_pssm(rev_pssm);
      destroy_motif(rev_motif);
    }
  }
  if (json) jsonwr_end_array_value(json);
  // finish writing sites
  fclose(sites_file);
  // finish writing html file
  if (html) {
    if (htmlwr_output(html) != NULL) {
      die("Found another JSON replacement!\n");
    }
    htmlwr_destroy(html);
  }
  // write text file
  output_centrimo_text(&options, motifN, stats_list);
  // Clean up.
  for (i = 0; i < seqN; ++i) {
    free_seq(sequences[i]); 
  }
  free(sequences);
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    free_db(dbs[i]);
  }
  free(dbs);
  free_array(bg_freqs);
  free(counts.sites);
  free(seq_sites.sites);
  arraylst_destroy(free, stats_list);
  cleanup_options(&options);
  return 0;

}
Example #13
0
/*************************************************************************
 * Fill in a single, pre-allocated state of a linear HMM.
 *
 * Sets the state's type, motif width, emission distribution, site
 * count, motif index / ID / position, and the lists of incoming and
 * outgoing transitions. Transition probabilities next to a spacer are
 * derived from self_trans(expected_length).
 *************************************************************************/
static void build_linear_state
  (ALPH_T   alph,           // alphabet
   STATE_T  state_type,     // Type of state (START, SPACER,...)
   int      i_state,        // The state index.
   int      expected_length,// For spacers, expected length of output.
   ARRAY_T* freqs,          // Emission probability distrib.
   double   num_sites,      // Number of sites for this emission.
   int      i_motif,        // Index of motif this state is in.
   int      i_position,     // Position of this state within a motif or spacer.
   MOTIF_T*  motif,         // Motif.
   MHMM_STATE_T * a_state)  // State to be filled in (pre-allocated).
{
  // When verbose, trace the state indices as the model is assembled.
  if (verbosity >= NORMAL_VERBOSE) {
    if (state_type == START_STATE) {
      fprintf(stderr, "Building HMM: 0 ");
    } else if (state_type == SPACER_STATE ||
               state_type == END_MOTIF_STATE) {
      fprintf(stderr, "%d ", i_state);
    } else if (state_type == START_MOTIF_STATE ||
               state_type == MID_MOTIF_STATE) {
      fprintf(stderr, "%d-", i_state);
    } else if (state_type == END_STATE) {
      fprintf(stderr, "%d\n", i_state);
    } else if (state_type == INVALID_STATE) {
      die("Invalid state.\n");
    }
  }

  // Remember the kind of state.
  a_state->type = state_type;

  // Motif states carry the width of their motif; everything else is 1 wide.
  switch (state_type) {
  case START_MOTIF_STATE :
  case MID_MOTIF_STATE :
  case END_MOTIF_STATE :
    a_state->w_motif = get_motif_length(motif);
    break;
  default:
    a_state->w_motif = 1;
    break;
  }

  // Emission arrays span the full alphabet.
  a_state->emit = allocate_array(alph_size(alph, ALL_SIZE));
  a_state->emit_odds = allocate_array(alph_size(alph, ALL_SIZE));
  if (state_type != START_STATE && state_type != END_STATE) {
    copy_array(freqs, a_state->emit);
  } else {
    // The start and end states do not emit: every entry is set to 1.
    int letter = 0;
    while (letter < alph_size(alph, ALL_SIZE)) {
      set_array_item(letter, 1.0, a_state->emit);
      letter++;
    }
  }
  a_state->num_sites = num_sites;

  // Motif index, ID string and ID character.
  a_state->i_motif = i_motif;
  switch (state_type) {
  case START_STATE :
  case END_STATE :
  case SPACER_STATE :
    strcpy(a_state->motif_id, NON_MOTIF_ID);
    a_state->id_char = NON_MOTIF_ID_CHAR;
    break;
  default: // a motif state
    strcpy(a_state->motif_id, get_full_motif_id(motif));
    a_state->id_char = get_motif_id_char(i_position, motif);
    break;
  }
  a_state->i_position = i_position;

  // Incoming transitions.
  if (state_type == START_STATE) {
    // Nothing leads into the start state.
    a_state->ntrans_in = 0;
    a_state->itrans_in = NULL;
    a_state->trans_in = NULL;
  } else if (state_type == START_MOTIF_STATE || state_type == END_STATE) {
    // Reached from either of the two preceding states.
    a_state->ntrans_in = 2;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int) * 2);
    a_state->itrans_in[0] = i_state - 2;
    a_state->itrans_in[1] = i_state - 1;
    a_state->trans_in = allocate_array(2);
    set_array_item(0, 1.0 - self_trans(expected_length), a_state->trans_in);
    set_array_item(1, 1.0 - self_trans(expected_length), a_state->trans_in);
  } else if (state_type == MID_MOTIF_STATE || state_type == END_MOTIF_STATE) {
    // Inside a motif the only way in is from the previous position.
    a_state->ntrans_in = 1;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int));
    a_state->itrans_in[0] = i_state - 1;
    a_state->trans_in = allocate_array(1);
    set_array_item(0, 1.0, a_state->trans_in);
  } else if (state_type == SPACER_STATE) {
    // A spacer is entered from the previous state or from itself.
    a_state->ntrans_in = 2;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int) * 2);
    a_state->itrans_in[0] = i_state - 1;
    a_state->itrans_in[1] = i_state;
    a_state->trans_in = allocate_array(2);
    set_array_item(0, 1.0 - self_trans(expected_length), a_state->trans_in);
    set_array_item(1, self_trans(expected_length), a_state->trans_in);
  } else {
    die("Invalid state type.\n");
  }

  // Outgoing transitions.
  if (state_type == START_STATE || state_type == END_MOTIF_STATE) {
    // Go to the next state or skip over it to the one after.
    a_state->ntrans_out = 2;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int) * 2);
    a_state->itrans_out[0] = i_state + 1;
    a_state->itrans_out[1] = i_state + 2;
    a_state->trans_out = allocate_array(2);
    set_array_item(0, self_trans(expected_length), a_state->trans_out);
    set_array_item(1, 1.0 - self_trans(expected_length), a_state->trans_out);
  } else if (state_type == START_MOTIF_STATE || state_type == MID_MOTIF_STATE) {
    // Inside a motif the only way out is to the next position.
    a_state->ntrans_out = 1;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int));
    a_state->itrans_out[0] = i_state + 1;
    a_state->trans_out = allocate_array(1);
    set_array_item(0, 1.0, a_state->trans_out);
  } else if (state_type == SPACER_STATE) {
    // A spacer either loops on itself or moves on.
    a_state->ntrans_out = 2;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int) * 2);
    a_state->itrans_out[0] = i_state;
    a_state->itrans_out[1] = i_state + 1;
    a_state->trans_out = allocate_array(2);
    set_array_item(0, self_trans(expected_length), a_state->trans_out);
    set_array_item(1, 1.0 - self_trans(expected_length), a_state->trans_out);
  } else if (state_type == END_STATE) {
    // Nothing leaves the end state.
    a_state->ntrans_out = 0;
    a_state->itrans_out = NULL;
    a_state->trans_out = NULL;
  } else {
    die("Invalid state type.\n");
  }
}
Example #14
0
/*************************************************************************
 * Build a linear HMM.
 *
 * The model is laid out as: start state, first spacer, then for every
 * occurrence in order_spacing the motif states followed by a spacer,
 * and finally the end state. The allocated model is returned through
 * the_hmm.
 *************************************************************************/
void build_linear_hmm
  (ARRAY_T*  background,
   ORDER_T*  order_spacing,
   int       spacer_states, 
   RBTREE_T* motifs, // motifs with key as in order_spacing
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  ALPH_T    alph;
  MHMM_T   *hmm;          // Shorthand for *the_hmm once allocated.
  MOTIF_T  *cur_motif;    // Motif of the current occurrence.
  int       total_states; // Number of states in the whole model.
  int       next_state;   // Index of the state being built.
  int       occ;          // Index within the order and spacing.
  int       pos;          // Index within the current motif or spacer.
  int       motif_key;    // Key of the motif in the order and spacing.

  alph = get_motif_alph((MOTIF_T*)rbtree_value(rbtree_first(motifs)));

  // Count the states: start + end, every motif column, and all spacers.
  total_states = 2;
  occ = 0;
  while (occ < get_order_occurs(order_spacing)) {
    motif_key = get_order_motif(order_spacing, occ);
    cur_motif = (MOTIF_T*)rbtree_get(motifs, &motif_key);
    total_states += get_motif_length(cur_motif);
    occ++;
  }
  total_states += (get_order_occurs(order_spacing) + 1) * spacer_states;

  // Allocate the model and publish it to the caller.
  *the_hmm = allocate_mhmm(alph, total_states);
  hmm = *the_hmm;
  check_sq_matrix(hmm->trans, total_states);

  // This is a linear model.
  hmm->type = LINEAR_HMM;

  // The motif count is the number of occurrences, not the number of
  // distinct motifs.
  hmm->num_motifs = get_order_occurs(order_spacing);

  // State and spacer bookkeeping.
  hmm->num_states = total_states;
  hmm->num_spacers = get_order_occurs(order_spacing) + 1;
  hmm->spacer_states = spacer_states;

  // Store the background distribution in the model.
  copy_array(background, hmm->background);

  // The model opens with a non-emitting start state.
  next_state = 0;
  check_sq_matrix(hmm->trans, hmm->num_states);
  build_linear_state(
      alph,
      START_STATE,
      next_state,
      get_spacer_length(order_spacing, 0),
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION, // position is meaningless for the start state
      NULL, // no motif
      &(hmm->states[next_state]));
  next_state++;

  // Leading spacer.
  pos = 0;
  while (pos < spacer_states) {
    check_sq_matrix(hmm->trans, hmm->num_states);
    build_linear_state(
        alph,
        SPACER_STATE,
        next_state, 
        get_spacer_length(order_spacing, 0),
        background, 
        SPACER_NUMSITES,
        NON_MOTIF_INDEX,
        pos, // position within spacer
        NULL, // no motif
        &(hmm->states[next_state]));
    pos++;
    next_state++;
  }

  // Every occurrence contributes its motif states and a trailing spacer.
  occ = 0;
  while (occ < get_order_occurs(order_spacing)) {
    STATE_T kind;
    int expected_gap;
    motif_key = get_order_motif(order_spacing, occ);
    cur_motif = (MOTIF_T*)rbtree_get(motifs, &motif_key);

    // Motif states: first column, interior columns, last column.
    pos = 0;
    while (pos < get_motif_length(cur_motif)) {
      if (pos == 0) {
        kind = START_MOTIF_STATE;
        expected_gap = get_spacer_length(order_spacing, occ);
      } else if (pos == (get_motif_length(cur_motif) - 1)) {
        kind = END_MOTIF_STATE;
        expected_gap = get_spacer_length(order_spacing, occ+1);
      } else {
        kind = MID_MOTIF_STATE;
        expected_gap = 0;
      }
      check_sq_matrix(hmm->trans, hmm->num_states);
      build_linear_state(
          alph, 
          kind, 
          next_state, 
          expected_gap, // Expected spacer length.
          get_matrix_row(pos, get_motif_freqs(cur_motif)),
          get_motif_nsites(cur_motif),
          occ,
          pos, // position within motif
          cur_motif,
          &(hmm->states[next_state]));
      pos++;
      next_state++;
    }

    // Spacer that follows this motif.
    pos = 0;
    while (pos < spacer_states) {
      check_sq_matrix(hmm->trans, hmm->num_states);
      build_linear_state(
          alph, 
          SPACER_STATE, 
          next_state, 
          get_spacer_length(order_spacing, occ+1),
          background,
          SPACER_NUMSITES,
          NON_MOTIF_INDEX, 
          pos, // position within spacer
          NULL, // no motif
          &(hmm->states[next_state]));
      pos++;
      next_state++;
    }
    occ++;
  }

  check_sq_matrix(hmm->trans, hmm->num_states);
  // The model closes with a non-emitting end state; occ now indexes the
  // final spacer.
  build_linear_state(
      alph, 
      END_STATE, 
      next_state, 
      get_spacer_length(order_spacing, occ),
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION, // position is meaningless for the end state
      NULL, // no motif
      &(hmm->states[next_state]));
  next_state++;
  assert(next_state == total_states);

  check_sq_matrix(hmm->trans, hmm->num_states);
  // Optionally turn the spacers into FIMs.
  if (fim) {
    convert_to_fims(hmm);
  }

  // Finally derive the transition matrix from the per-state lists.
  build_transition_matrix(hmm);
}
Example #15
0
/*************************************************************************
 * Build a star topology HMM.
 *
 * States are laid out as: begin state, the single spacer (which may
 * span several states), the states of every motif in turn, and the end
 * state. The allocated model is returned through the_hmm.
 *************************************************************************/
void build_star_hmm
  (ARRAY_T*  background,
   int       spacer_states, 
   MOTIF_T*  motifs,
   int       nmotifs,
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  ALPH_T    alph;
  MHMM_T   *hmm;               /* Shorthand for *the_hmm once allocated. */
  int       motif_width_sum;   /* Combined width of all motifs. */
  int       spacer_count;      /* Number of spacers in the model. */
  int       state_count;       /* Number of states in the model. */
  int       m;                 /* Index of the current motif. */
  int       pos;               /* Index within the current motif or spacer. */
  int       next = 0;          /* Index of the state being built. */

  alph = get_motif_alph(motif_at(motifs, 0));

  /* Add up the motif widths. */
  motif_width_sum = 0;
  m = 0;
  while (m < nmotifs) {
    motif_width_sum += get_motif_length(motif_at(motifs, m));
    m++;
  }
  /* A star model has exactly one spacer. */
  spacer_count = 1;
  /* motif columns + spacer states + begin/end */
  state_count = motif_width_sum + (spacer_count * spacer_states) + 2;

  /* Allocate the model and publish it to the caller. */
  *the_hmm = allocate_mhmm(alph, state_count);
  hmm = *the_hmm;

  /* Model bookkeeping. */
  hmm->type = STAR_HMM;
  hmm->num_motifs = nmotifs;
  hmm->num_states = state_count;
  hmm->num_spacers = 1;
  hmm->spacer_states = spacer_states;

  /* Store the background distribution in the model. */
  copy_array(background, hmm->background);

  /* Begin state. */
  build_star_state(
    alph,
    START_STATE,
    next,
    0, /* expected length */
    NULL,
    0, /* Number of sites. */
    NON_MOTIF_INDEX,
    NON_MOTIF_POSITION,
    nmotifs,
    spacer_states,
    motifs,
    &(hmm->states[next])
  );
  next++;

  /* The spacer, which may consist of several states. */
  pos = 0;
  while (pos < spacer_states) {
    build_star_state(
      alph,
      SPACER_STATE,
      next,
      DEFAULT_SPACER_LENGTH,
      background,
      SPACER_NUMSITES,
      NON_MOTIF_INDEX,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next])
    );
    next++;
    pos++;
  }

  /* Motif states: first column, interior columns, last column. */
  for (m = 0; m < nmotifs; m++) {
    MOTIF_T *cur = motif_at(motifs, m);
    assert(get_motif_length(cur) > 1);

    pos = 0;
    build_star_state(
      alph,
      START_MOTIF_STATE,
      next,
      0, /* Expected spacer length. */
      get_matrix_row(pos, get_motif_freqs(cur)),
      get_motif_nsites(cur),
      m,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next])
    );
    next++;

    pos = 1;
    while (pos < get_motif_length(cur) - 1) {
      build_star_state(
        alph,
        MID_MOTIF_STATE,
        next,
        0, /* Expected spacer length. */
        get_matrix_row(pos, get_motif_freqs(cur)),
        get_motif_nsites(cur),
        m,
        pos,
        nmotifs,
        spacer_states,
        motifs,
        &(hmm->states[next])
      );
      next++;
      pos++;
    }

    /* pos is left at the position following the interior columns. */
    build_star_state(
      alph,
      END_MOTIF_STATE,
      next,
      0, /* Expected spacer length. */
      get_matrix_row(pos, get_motif_freqs(cur)),
      get_motif_nsites(cur),
      m,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next])
    );
    next++;
  }

  /* End state. */
  build_star_state(
    alph,
    END_STATE,
    next,
    0, /* Expected spacer length. */
    NULL, /* Emissions */
    0, /* Number of sites. */
    NON_MOTIF_INDEX,
    NON_MOTIF_POSITION,
    nmotifs,
    spacer_states,
    motifs,
    &(hmm->states[next])
  );
  next++;

  /* Optionally turn the spacer into a FIM. */
  if (fim) {
    convert_to_fims(hmm);
  }

  /* Derive the transition matrix from the per-state lists. */
  build_transition_matrix(hmm);
} // build_star_hmm
Example #16
0
/*************************************************************************
 * Compute the enrichment of the central region
 *
 * Allocates and returns a MOTIF_STATS_T describing the most enriched
 * window centred on the middle of the sequences. Windows grow outwards
 * from the centre two bins at a time; the best log p-value found is
 * adjusted for the number of windows tested. The caller owns the
 * returned structure.
 *************************************************************************/
static MOTIF_STATS_T* compute_stats(int max_window, int sequence_length, 
    MOTIF_DB_T* db, MOTIF_T* motif, SITE_COUNTS_T* counts) {
  MOTIF_STATS_T *result;
  double in_window, peak, candidate;
  int idx, possible_bins, has_centre_bin, widest, centre, offset, used_bins;

  // allocate the result and give every field a default
  result = mm_malloc(sizeof(MOTIF_STATS_T));
  result->db = db;
  result->motif = motif;
  result->total_sites = counts->total_sites;
  result->n_win_tested = 0;
  result->max_prob = 0;
  result->central_sites = 0;
  result->central_prob = 0.0;
  result->central_window = 0;
  result->log_pvalue = 0;
  result->log_adj_pvalue = 0;

  // largest count seen in any single bin
  peak = 0;
  for (idx = 0; idx < counts->allocated; idx++) {
    if (counts->sites[idx] > peak) peak = counts->sites[idx];
  }
  // fraction of all sites that fall in that bin
  if (counts->total_sites == 0) {
    result->max_prob = 0;
  } else {
    result->max_prob = peak / (double)counts->total_sites;
  }

  // number of distinct positions a site of this motif can occupy
  possible_bins = sequence_length - get_motif_length(motif) + 1;
  // an odd number of positions means one of them is exactly central
  has_centre_bin = (possible_bins % 2);
  // window that covers every possible site
  result->all_window = possible_bins - has_centre_bin;
  // widest window for which a p-value is still computed
  widest = result->all_window - 2;
  // clamp the requested maximum (-1 means "no limit")
  if (max_window == -1 || max_window > widest) max_window = widest;
  if (max_window < 0) return result; // no windows to test!
  // how many windows will be tested
  if (has_centre_bin) {
    result->n_win_tested = (max_window / 2) + 1;
  } else {
    result->n_win_tested = (max_window + 1) / 2;
  }
  if (result->n_win_tested == 0) return result; // no windows to test!

  // index of the central bin in counts->sites
  centre = sequence_length - 1;
  // start with a value that any real window will beat
  result->log_pvalue = BIG;
  in_window = 0;
  used_bins = 0;
  if (has_centre_bin) {
    // the single central bin is the first window
    used_bins++;
    in_window  = counts->sites[centre];
    result->log_pvalue = window_enrichment(0, 
        in_window, counts->total_sites, used_bins, possible_bins);
    result->central_sites = in_window;
    result->central_prob = (double)used_bins / (double)possible_bins;
  }

  // widen the window by one bin on each side until the limit is reached
  used_bins += 2;
  offset = used_bins - 1;
  while (offset <= max_window) {
    in_window += counts->sites[centre - offset];
    in_window += counts->sites[centre + offset];
    candidate = window_enrichment(offset, 
        in_window, counts->total_sites, used_bins, possible_bins);
    // remember the window with the smallest log p-value
    if (candidate < result->log_pvalue) {
      result->log_pvalue = candidate;
      result->central_window = offset;
      result->central_sites = in_window;
      result->central_prob = (double)used_bins / (double)possible_bins;
    }
    used_bins += 2;
    offset += 2;
  }

  // adjust for the number of windows that were tested
  result->log_adj_pvalue = LOGEV(log(result->n_win_tested), result->log_pvalue);
  DEBUG_FMT(HIGHER_VERBOSE, "best bin: %d sites: %g "
      "log_adj_p-value: %g (%d tests)\n", result->central_window+1, 
      result->central_sites, result->log_adj_pvalue, 
      result->n_win_tested);
  return result;
}
Example #17
0
/*************************************************************************
 * Calculate the odds score for each motif-sized window at each
 * site in the sequence using the given nucleotide frequencies.
 *
 * This function is a lightweight version based on the one contained in
 * motiph-scoring. Several calculations that are unnecessary for gomo
 * have been removed in order to speed up the process
 *
 * Returns, depending on method: the average (AVG_ODDS), maximum
 * (MAX_ODDS) or sum (SUM_ODDS) of the per-window odds, or the number of
 * windows whose p-value is below threshold (TOTAL_HITS). Any other
 * method yields 0.
 *
 * A window stops accumulating at the first gap ('-' or '.') or at the
 * first character that is not a core alphabet symbol; whatever was
 * accumulated up to that point is kept as the window's score.
 *
 * NOTE(review): the scan visits start positions 0 .. max_index-1, so the
 * window starting at seq_length - motif_length is never scored, while
 * AVG_ODDS divides by max_index + 1. This is left unchanged so existing
 * scores stay the same - confirm whether it is intended.
 *************************************************************************/
static double score_sequence(
    SEQ_T *seq,         // sequence to scan (IN)
    MOTIF_T *motif,     // motif already converted to odds values (IN)
    PSSM_T *m_pssm,     // motif pssm (IN)
    MATRIX_T *m_odds,   // motif odds (IN)
    int method,         // method used for scoring (IN)
    double threshold,   // Threshold to use in TOTAL_HITS mode with a PWM
    ARRAY_T *bg_freqs   //background model (not used by this function)
    )
{

  assert(seq != NULL);
  assert(motif != NULL);
  assert((method == TOTAL_HITS && m_pssm) || (method != TOTAL_HITS && m_odds));

  char* raw_seq = get_raw_sequence(seq);
  int seq_length = get_seq_length(seq);
  // The motif width and alphabet do not change during the scan.
  const int motif_length = get_motif_length(motif);

  // Get the pv lookup table
  ARRAY_T* pv_lookup = NULL;
  if (NULL != m_pssm) {
    pv_lookup = m_pssm->pv;
    assert(get_array_length(pv_lookup) > 0);
  }

  int max_index = seq_length - motif_length;
  if (max_index < 0) max_index = 0;
  const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  double* odds =  (double*) mm_malloc(sizeof(double)*max_index);
  double* scaled_log_odds =  (double*) mm_malloc(sizeof(double)*max_index);

  // For each site in the sequence
  int seq_index;
  for (seq_index = 0; seq_index < max_index; seq_index++) {
    double odd = 1.0;
    scaled_log_odds[seq_index] = 0;

    // For each site in the motif window
    int motif_position;
    for (motif_position = 0; motif_position < motif_length; motif_position++) {
      char c = raw_seq[seq_index + motif_position];

      // Check for gaps at this site
      if(c == '-' || c == '.') {
        break;
      }

      // Check for ambiguity codes at this site
      //TODO: This next call is very expensive - it takes up approx. 10% of a
      //      programme's running time. It should be fixed up somehow.
      int aindex = alph_index(get_motif_alph(motif), c);
      // Only indices 0 .. asize-1 are core symbols. Index asize itself is
      // already outside the core alphabet, so it must be rejected too,
      // otherwise it is used as a matrix column below.
      if (aindex < 0 || aindex >= asize) {
        break;
      }
      if (method == TOTAL_HITS) {
        //If we're in this mode, then we're using LOG ODDS.
        scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, m_pssm->matrix);
      } else {
        odd *= get_matrix_cell(motif_position, aindex, m_odds);
      }
    }
    odds[seq_index] = odd;
  }

  // return odds as requested (MAX or AVG scoring)
  double requested_odds = 0.0;
  if (method == AVG_ODDS){
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      requested_odds += odds[seq_index];
    }
    // The +1 keeps the divisor non-zero when max_index == 0.
    requested_odds /= max_index + 1;
  } else if (method == MAX_ODDS){
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      if (odds[seq_index] > requested_odds){
        requested_odds = odds[seq_index];
      }
    }
  } else if (method == SUM_ODDS) {
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      requested_odds += odds[seq_index];
    }
  } else if (method == TOTAL_HITS) {
    for (seq_index = 0; seq_index < max_index; seq_index++) {

      // Clamp scores that fall beyond the end of the p-value table.
      if (scaled_log_odds[seq_index] >= (double)get_array_length(pv_lookup)) {
        scaled_log_odds[seq_index] = (double)(get_array_length(pv_lookup) - 1);
      } 
      double pvalue = get_array_item((int) scaled_log_odds[seq_index], pv_lookup);

      if (pvalue < threshold) {
        requested_odds++; //Add another hit.
      }

      if (verbosity > HIGHER_VERBOSE) {
        fprintf(stderr, "Window Data: %s\t%s\t%i\t%g\t%g\t%g\n",
            get_seq_name(seq), get_motif_id(motif), seq_index, scaled_log_odds[seq_index], pvalue, threshold);
      }
    }
  }

  myfree(odds);
  myfree(scaled_log_odds);
  return requested_odds;
}
Example #18
0
/*************************************************************************
 * Read the motifs named by opts->motif_filename and build their PSSMs.
 *
 * Every motif that is selected (all of them when opts->selected_motifs
 * is empty) gets a positive-strand PSSM and, when both strands are
 * scanned, a PSSM for its reverse complement. The returned list holds
 * one motif_and_pssm entry per selected motif; unselected motifs are
 * destroyed.
 *
 * opts->scan_both_strands and opts->num_gc_bins may be switched off
 * here when the motif alphabet cannot support them.
 *************************************************************************/
ARRAYLST_T* load_motifs(AMA_OPTIONS_T *opts) {
  ARRAYLST_T *motifs;
  ARRAY_T *pos_bg_freqs, *rev_bg_freqs;
  MREAD_T *mread;
  MOTIF_T *motif, *motif_rc;
  double range;
  PSSM_T *pos_pssm, *neg_pssm;
  int total_motifs;
  ALPH_T *alph;

  //
  // Read the motifs and background model.
  //
  //this reads any meme file, xml, txt and html
  mread = mread_create(opts->motif_filename, OPEN_MFILE);
  mread_set_bg_source(mread, opts->bg_filename);
  mread_set_pseudocount(mread, opts->pseudocount);

  // sanity check, since the rest of the code relies on the motifs being complementable
  alph = alph_hold(mread_get_alphabet(mread));
  if (alph == NULL) die("Unable to determine alphabet from motifs");
  if (opts->scan_both_strands && !alph_has_complement(alph)) {
    opts->scan_both_strands = false;
  }
  // GC binning needs 4 core symbols AND 2 complementary pairs, so it is
  // disabled as soon as either condition fails.
  if (opts->num_gc_bins > 1 && (alph_size_core(alph) != 4 || alph_size_pairs(alph) != 2)) {
    fprintf(stderr, "Warning: The motif alphabet does not have exactly 2 complementary pairs so \"GC binning\" will be disabled.\n");
    opts->num_gc_bins = 1;
  }

  pos_bg_freqs = mread_get_background(mread);
  rev_bg_freqs = NULL;
  if (opts->scan_both_strands) {
    // the reverse-strand background is the complement-swapped copy
    rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs));
    copy_array(pos_bg_freqs, rev_bg_freqs);
    complement_swap_freqs(alph, rev_bg_freqs, rev_bg_freqs);
  }

  // allocate memory for motifs
  motifs = arraylst_create();
  //
  // Convert motif matrices into log-odds matrices.
  // Scale them.
  // Compute the lookup tables for the PDF of scaled log-odds scores.
  //
  range = 300; // 100 is not very good; 1000 is great but too slow
  neg_pssm = NULL; // stays NULL unless both strands are scanned
  total_motifs = 0;
  while (mread_has_motif(mread)) {
    motif = mread_next_motif(mread);
    total_motifs++;
    if (rbtree_size(opts->selected_motifs) == 0 || rbtree_find(opts->selected_motifs, get_motif_id(motif)) != NULL) {
      if (verbosity >= HIGH_VERBOSE) {
        fprintf(stderr, "Using motif %s of width %d.\n", get_motif_id(motif), get_motif_length(motif));
      }
      pos_pssm =
        build_motif_pssm(
          motif, 
          pos_bg_freqs, 
          pos_bg_freqs, 
          NULL, // Priors not used
          0.0L, // alpha not used
          range, 
          opts->num_gc_bins, 
          true 
        );
      //
      //  Note: If scanning both strands, we complement the motif frequencies
      //  but not the background frequencies so the motif looks the same.
      //  However, the given frequencies are used in computing the p-values
      //  since they represent the frequencies on the negative strands.
      //  (If we instead were to complement the input sequence, keeping the
      //  the motif fixed, we would need to use the complemented frequencies
      //  in computing the p-values.  Is that any clearer?)
      //
      if (opts->scan_both_strands) {
        motif_rc = dup_rc_motif(motif);
        neg_pssm =
          build_motif_pssm(
            motif_rc, 
            rev_bg_freqs, 
            pos_bg_freqs, 
            NULL, // Priors not used
            0.0L, // alpha not used
            range, 
            opts->num_gc_bins, 
            true
          );
        // only the PSSM is kept; the reverse-complement motif is discarded
        destroy_motif(motif_rc);
      }
      arraylst_add(motif_and_pssm_create(motif, pos_pssm, neg_pssm), motifs);
    } else {
      if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "Skipping motif %s.\n",
          get_motif_id(motif));
      destroy_motif(motif);
    }
  }
  mread_destroy(mread);
  free_array(pos_bg_freqs);
  free_array(rev_bg_freqs);
  alph_release(alph);
  if (verbosity >= NORMAL_VERBOSE) {
    fprintf(stderr, "Loaded %d/%d motifs from %s.\n", 
        arraylst_size(motifs), total_motifs, opts->motif_filename);
  }
  return motifs;
}
Example #19
0
File: ama.c Project: a1aks/Haystack
/*************************************************************************
 * Entry point for ama
 *
 * Processes the command line, loads the motifs and the background model,
 * converts every motif into a PSSM (plus a reverse-complement PSSM unless
 * --norc is given) and then reads the FASTA file one sequence at a time,
 * scoring each sequence with each selected motif and recording the result
 * in a CISML structure.
 *************************************************************************/
int main(int argc, char *argv[]) {
  int max_seq_length = MAX_SEQ;
  STRING_LIST_T* selected_motifs = NULL;
  double pseudocount = 0.01;
  int output_format = CISML_FORMAT;
  program_name = "ama";
  int scoring = AVG_ODDS;
  BOOLEAN_T pvalues = FALSE;
  BOOLEAN_T normalize_scores = FALSE;
  BOOLEAN_T combine_duplicates = FALSE;
  int num_gc_bins = 1;
  int sdbg_order = -1;        // don't use sequence background
  BOOLEAN_T scan_both_strands = TRUE;
  ARRAY_T* pos_bg_freqs = NULL;
  ARRAY_T* rev_bg_freqs = NULL;
  clock_t c0, c1; /* measuring cpu_time */
  CISML_T *cisml;
  char * out_dir = NULL;
  BOOLEAN_T clobber = FALSE;
  int i;
  int last = 0;
  ALPH_T alph = INVALID_ALPH;

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/

  // Must match the number of entries in motif_scan_options.
  const int num_options = 16;
  cmdoption const motif_scan_options[] = {
    { "max-seq-length", REQUIRED_VALUE },
    { "motif", REQUIRED_VALUE },
    { "motif-pseudo", REQUIRED_VALUE },
    { "rma", NO_VALUE },
    { "pvalues", NO_VALUE },
    { "sdbg", REQUIRED_VALUE },
    { "norc", NO_VALUE },
    { "cs", NO_VALUE },
    { "o-format", REQUIRED_VALUE },
    { "o", REQUIRED_VALUE },
    { "oc", REQUIRED_VALUE },
    { "scoring", REQUIRED_VALUE },
    { "verbosity", REQUIRED_VALUE },
    { "gcbins", REQUIRED_VALUE },
    { "last", REQUIRED_VALUE },
    { "version", NO_VALUE }
  };

  int option_index = 0;

  // Define the usage message.
  // It is used as a format string: the placeholders receive the default
  // pseudocount, verbosity and maximum sequence length, in that order.
  char usage[] = "USAGE: ama [options] <motif file> <sequence file> [<background file>]\n"
    "\n"
    "   Options:\n"
    "     --sdbg <order>\t\t\tUse Markov background model of\n"
    "       \t\t\t\t\torder <order> derived from the sequence\n"
    "       \t\t\t\t\tto compute its likelihood ratios.\n"
    "       \t\t\t\t\tOverrides --pvalues, --gcbins and --rma;\n"
    "       \t\t\t\t\t<background file> is required unless\n"
    "       \t\t\t\t\t--sdbg is given.\n"
    "     --motif <id>\t\t\tUse only the motif identified by <id>.\n"
    "       \t\t\t\t\tThis option may be repeated.\n"
    "     --motif-pseudo <float>\t\tThe value <float> times the background\n"
    "       \t\t\t\t\tfrequency is added to the count of each\n"
    "       \t\t\t\t\tletter when creating the likelihood \n"
    "       \t\t\t\t\tratio matrix (default: %g).\n"
    "     --norc\t\t\t\tDisables the scanning of the reverse\n"
    "       \t\t\t\t\tcomplement strand.\n"
    "     --scoring [avg-odds|max-odds]\tIndicates whether the average or \n"
    "       \t\t\t\t\tthe maximum odds should be calculated\n"
    "       \t\t\t\t\t(default: avg-odds)\n"
    "     --rma\t\t\t\tScale motif scores to the range 0-1.\n"
    "       \t\t\t\t\t(Relative Motif Affinity).\n"
    "       \t\t\t\t\tMotif scores are scaled by the maximum\n"
    "       \t\t\t\t\tscore achievable by that PWM. (default:\n"
    "       \t\t\t\t\tmotif scores are not normalized)\n"
    "     --pvalues\t\t\t\tPrint p-value of avg-odds score in cisml\n"
    "       \t\t\t\t\toutput. Ignored for max-odds scoring.\n"
    "       \t\t\t\t\t(default: p-values are not printed)\n"
    "     --gcbins <bins>\t\t\tCompensate p-values for GC content of\n"
    "       \t\t\t\t\teach sequence using given number of \n"
    "       \t\t\t\t\tGC range bins. Recommended bins: 41.\n"
    "       \t\t\t\t\t(default: p-values are based on\n"
    "       \t\t\t\t\tfrequencies in background file)\n"
    "     --cs\t\t\t\tEnable combining sequences with same\n"
    "       \t\t\t\t\tidentifier by taking the average score\n"
    "       \t\t\t\t\tand the Sidac corrected p-value.\n"
    "     --o-format [gff|cisml]\t\tOutput file format (default: cisml)\n"
    "       \t\t\t\t\tignored if --o or --oc option used\n"
    "     --o <directory>\t\t\tOutput all available formats to\n"
    "       \t\t\t\t\t<directory>; give up if <directory>\n"
    "       \t\t\t\t\texists\n"
    "     --oc <directory>\t\t\tOutput all available formats to\n"
    "       \t\t\t\t\t<directory>; if <directory> exists\n"
    "       \t\t\t\t\toverwrite contents\n"
    "     --verbosity [1|2|3|4]\t\tControls amount of screen output\n"
    "       \t\t\t\t\t(default: %d)\n"
    "     --max-seq-length <int>\t\tSet the maximum length allowed for \n"
    "       \t\t\t\t\tinput sequences. (default: %d)\n"
    "     --last <int>\t\t\tUse only scores of (up to) last <n>\n"
    "       \t\t\t\t\tsequence positions to compute AMA.\n"
    "     --version   \t\t\tPrint version and exit.\n"
    "\n";

  // Parse the command line.
  if (simple_setopt(argc, argv, num_options, motif_scan_options) != NO_ERROR) {
    die("Error processing command line options: option name too long.\n");
  }

  // Track which of --o-format and --o/--oc have been seen so far, because
  // an output directory takes precedence over an explicit format.
  BOOLEAN_T setoutputformat = FALSE;
  BOOLEAN_T setoutputdirectory = FALSE;

  while (TRUE) {
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      (void) simple_getopterror(&message);
      die("Error processing command line options (%s).\n", message);
    } else if (strcmp(option_name, "max-seq-length") == 0) {
      max_seq_length = atoi(option_value);
    } else if (strcmp(option_name, "norc") == 0) {
      scan_both_strands = FALSE;
    } else if (strcmp(option_name, "cs") == 0) {
      combine_duplicates = TRUE;
    } else if (strcmp(option_name, "motif") == 0) {
      if (selected_motifs == NULL) {
        selected_motifs = new_string_list();
      }
      add_string(option_value, selected_motifs);
    } else if (strcmp(option_name, "motif-pseudo") == 0) {
      pseudocount = atof(option_value);
    } else if (strcmp(option_name, "o-format") == 0) {
      if (setoutputdirectory) {
        if (verbosity >= NORMAL_VERBOSE) {
          fprintf(stderr, "output directory specified, ignoring --o-format\n");
        }
      } else {
        setoutputformat = TRUE;
        if (strcmp(option_value, "gff") == 0) {
          output_format = GFF_FORMAT;
        } else if (strcmp(option_value, "cisml") == 0) {
          output_format = CISML_FORMAT;
        } else {
          // Unknown formats fall back to CISML regardless of verbosity;
          // only the warning is verbosity dependent.
          if (verbosity >= NORMAL_VERBOSE) {
            fprintf(stderr, "Output format not known. Using standard instead (cisML).\n");
          }
          output_format = CISML_FORMAT;
        }
      }
    } else if (strcmp(option_name, "o") == 0 || strcmp(option_name, "oc") == 0) {
      setoutputdirectory = TRUE;
      if (setoutputformat) {
        if (verbosity >= NORMAL_VERBOSE) {
          fprintf(stderr, "output directory specified, ignoring --o-format\n");
        }
      }
      clobber = strcmp(option_name, "oc") == 0;
      out_dir = (char*) malloc(sizeof(char) * (strlen(option_value) + 1));
      if (out_dir == NULL) {
        die("Unable to allocate memory for the output directory name.\n");
      }
      strcpy(out_dir, option_value);
      output_format = DIRECTORY_FORMAT;
    } else if (strcmp(option_name, "verbosity") == 0) {
      verbosity = atoi(option_value);
    } else if (strcmp(option_name, "scoring") == 0) {
      if (strcmp(option_value, "max-odds") == 0) {
        scoring = MAX_ODDS;
      } else if (strcmp(option_value, "avg-odds") == 0) {
        scoring = AVG_ODDS;
      } else if (strcmp(option_value, "sum-odds") == 0) {
        scoring = SUM_ODDS;
      } else {
        die("Specified scoring scheme not known.\n");
      }
    } else if (strcmp(option_name, "pvalues") == 0) {
      pvalues = TRUE;
    } else if (strcmp(option_name, "rma") == 0) {
      normalize_scores = TRUE;
      fprintf(stderr, "Normalizing motif scores using RMA method.\n");
    } else if (strcmp(option_name, "gcbins") == 0) {
      num_gc_bins = atoi(option_value);
      pvalues = TRUE;   // GC binning only makes sense with p-values
      if (num_gc_bins <= 1) die("Number of bins in --gcbins must be greater than 1.\n");
    } else if (strcmp(option_name, "sdbg") == 0) {
      sdbg_order = atoi(option_value);      // >=0 means use sequence bkg
    } else if (strcmp(option_name, "last") == 0) {
      // Accept an optional leading '-' followed by decimal digits only, so
      // that a negative value gets its own, more specific message below.
      int pos = (option_value[0] == '-') ? 1 : 0;
      if (option_value[pos] == '\0') {
        die("Specified parameter 'last' contains no digits.\n");
      }
      for (; option_value[pos] != '\0'; ++pos) {
        // <ctype.h> classifiers need a value representable as unsigned char.
        if (!isdigit((unsigned char) option_value[pos])) {
          die("Specified parameter 'last' contains non-numeric characters.\n");
        }
      }
      // atoi() gives no overflow indication and errno may be stale from an
      // earlier call, so clear errno and let strtol() report range errors.
      errno = 0;
      long parsed = strtol(option_value, NULL, 10);
      if (errno == 0 && (long) (int) parsed != parsed) {
        errno = ERANGE;   // fits in a long but not in an int
      }
      if (errno != 0) {
        die("Specified parameter 'last' could not be parsed as a number as:\n%s\n",strerror(errno));
      }
      last = (int) parsed;
      if (last < 0) {
        die("Specified parameter 'last' had negative value (%d) when only postive or zero values are allowed \n", last);
      }
    } else if (strcmp(option_name, "version") == 0) {
      fprintf(stdout, VERSION "\n");
      exit(EXIT_SUCCESS);
    }
  }

  // --sdbg overrides --pvalues and --gcbins and --rma
  int req_args = 3;
  if (sdbg_order >= 0) {
    pvalues = FALSE;
    normalize_scores = FALSE;
    num_gc_bins = 1;
    req_args = 2;
  }

  // Check all required arguments given
  if (sdbg_order >= 0 && argc > option_index + req_args) {
    die("<background file> cannot be given together with --sdbg.\n");
  } else if (argc != option_index + req_args) {
    fprintf(stderr, usage, pseudocount, verbosity, max_seq_length);
    exit(EXIT_FAILURE);
  }

  // Get required arguments.
  char* motif_filename = argv[option_index];
  option_index++;
  char* fasta_filename = argv[option_index];
  option_index++;
  char* bg_filename;
  if (req_args == 3) {      // required unless --sdbg given
    bg_filename = argv[option_index];
    option_index++;
  } else {
    bg_filename = "--uniform--";  // So PSSMs will use uniform background;
                                  // we can multiply them out later.
  }

  // measure time
  c0 = clock();

  // Set up hash tables for computing reverse complement if doing --sdbg
  if (sdbg_order >= 0) setup_hash_alph(DNAB);

  // Create cisml data structure for recording results
  cisml = allocate_cisml(program_name, motif_filename, fasta_filename);
  set_cisml_background_file(cisml, bg_filename);

  /**********************************************
   * Read the motifs and background model.
   **********************************************/
  int num_motifs = 0;
  MREAD_T *mread;
  ARRAYLST_T *motifs;
  PSSM_PAIR_T** pssm_pairs; // note pssm_pairs is an array of pointers

  //this reads any meme file, xml, txt and html
  mread = mread_create(motif_filename, OPEN_MFILE);
  mread_set_bg_source(mread, bg_filename);
  mread_set_pseudocount(mread, pseudocount);

  motifs = mread_load(mread, NULL);
  alph = mread_get_alphabet(mread);
  pos_bg_freqs = mread_get_background(mread);

  mread_destroy(mread);

  num_motifs = arraylst_size(motifs);

  // allocate memory for PSSM pairs
  pssm_pairs = (PSSM_PAIR_T**)mm_malloc(sizeof(PSSM_PAIR_T*) * num_motifs);

  if (verbosity >= NORMAL_VERBOSE)
    fprintf(stderr, "Number of motifs in file %d.\n", num_motifs);

  // make a CISML pattern to hold scores for each motif
  PATTERN_T** patterns = NULL;
  Resize(patterns, num_motifs, PATTERN_T*);
  int motif_index;
  for (motif_index = 0; motif_index < num_motifs; motif_index++) {
    MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs);
    patterns[motif_index] = allocate_pattern(get_motif_id(motif), "");
    add_cisml_pattern(cisml, patterns[motif_index]);
  }

  // make reverse complement motifs and background frequencies.
  // Afterwards the list holds 2 * num_motifs entries (asserted below); the
  // loops that follow read motif k at index 2k and its reverse complement
  // at index 2k + 1.
  if (scan_both_strands == TRUE) {
    add_reverse_complements(motifs);
    assert(arraylst_size(motifs) == (2 * num_motifs));
    rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs));
    complement_dna_freqs(pos_bg_freqs, rev_bg_freqs);
  }

  /**************************************************************
   * Convert motif matrices into log-odds matrices.
   * Scale them.
   * Compute the lookup tables for the PDF of scaled log-odds scores.
   **************************************************************/
  int ns = scan_both_strands ? 2 : 1; // number of strands
  for (motif_index = 0; motif_index < num_motifs; motif_index++) {
    MOTIF_T *motif, *motif_rc;
    motif = (MOTIF_T*)arraylst_get(motif_index*ns, motifs);
    if (scan_both_strands)
      motif_rc = (MOTIF_T*)arraylst_get(motif_index*ns + 1, motifs);
    else
      motif_rc = NULL;
    /*
     *  Note: If scanning both strands, we complement the motif frequencies
     *  but not the background frequencies so the motif looks the same.
     *  However, the given frequencies are used in computing the p-values
     *  since they represent the frequencies on the negative strands.
     *  (If we instead were to complement the input sequence, keeping the
     *  the motif fixed, we would need to use the complemented frequencies
     *  in computing the p-values.  Is that any clearer?)
    */
    double range = 300;   // 100 is not very good; 1000 is great but too slow
    PSSM_T* pos_pssm =
      build_motif_pssm(
        motif,
        pos_bg_freqs,
        pos_bg_freqs,
        NULL, // Priors not used
        0.0L, // alpha not used
        range,
        num_gc_bins,
        TRUE
      );
    PSSM_T* neg_pssm = (scan_both_strands ?
      build_motif_pssm(
        motif_rc,
        rev_bg_freqs,
        pos_bg_freqs,
        NULL, // Priors not used
        0.0L, // alpha not used
        range,
        num_gc_bins,
        TRUE
      )
      : NULL
    );
    pssm_pairs[motif_index] = create_pssm_pair(pos_pssm, neg_pssm);
  }

  // Open the FASTA file for reading.
  FILE* fasta_file = NULL;
  if (open_file(fasta_filename, "r", FALSE, "FASTA", "sequences", &fasta_file) == 0) {
    die("Couldn't open the file %s.\n", fasta_filename);
  }
  if (verbosity >= NORMAL_VERBOSE) {
    if (last == 0) {
      fprintf(stderr, "Using entire sequence\n");
    } else {
      fprintf(stderr, "Limiting sequence to last %d positions.\n", last);
    }
  }

  /**************************************************************
   * Read in all sequences and score with all motifs
   **************************************************************/
  int seq_loading_num = 0;  // keeps track on the number of sequences read in total
  int seq_counter = 0;      // holds the index to the seq in the pattern
  int unique_seqs = 0;      // keeps track on the number of unique sequences
  BOOLEAN_T need_postprocessing = FALSE;
  SEQ_T* sequence = NULL;
  RBTREE_T* seq_ids = rbtree_create(rbtree_strcasecmp,NULL,free,rbtree_intcpy,free);
  RBNODE_T* seq_node;
  BOOLEAN_T created;
  while (read_one_fasta(alph, fasta_file, max_seq_length, &sequence)) {
    ++seq_loading_num;
    created = FALSE;
    char* seq_name = get_seq_name(sequence);
    int seq_len = get_seq_length(sequence);
    // A non-zero --last replaces the sequence length as the scanned length.
    int scan_len;
    if (last != 0) {
      scan_len = last;
    } else {
      scan_len = seq_len;
    }

    // red-black trees are only required if duplicates should be combined
    if (combine_duplicates) {
      //lookup seq id and create new entry if required, return sequence index
      size_t id_size = strlen(seq_name) + 1;
      char *tmp_id = mm_malloc(id_size); // required copy for rb-tree
      memcpy(tmp_id, seq_name, id_size);
      seq_node = rbtree_lookup(seq_ids, tmp_id, TRUE, &created);
      // NOTE(review): the tree is created without a key-copy function, so it
      // presumably adopts tmp_id only when a new node is created; if so the
      // copy leaks whenever the identifier already exists. Confirm against
      // rbtree_lookup before adding a free() here.
      if (created) {// assign it a loading number
        rbtree_set(seq_ids, seq_node, &unique_seqs);
        seq_counter = unique_seqs;
        ++unique_seqs;
      } else {
        seq_counter = *((int*)rbnode_get(seq_node));
      }
    }

    //
    // Set up sequence-dependent background model and compute
    // log cumulative probability of sequence.
    //
    double *logcumback = NULL;                    // array of log cumulative probs.
    if (sdbg_order >= 0) {
      Resize(logcumback, seq_len+1, double);
      char* raw_seq = get_raw_sequence(sequence);
      BOOLEAN rc = FALSE;
      double *a_cp = get_markov_from_sequence(raw_seq, alph_string(alph), rc, sdbg_order, 0);
      log_cum_back(raw_seq, a_cp, sdbg_order, logcumback);
      myfree(a_cp);
    }

    // Get the GC content of the sequence if binning p-values by GC
    // and store it in the sequence object.
    if (num_gc_bins > 1) {
      ARRAY_T *freqs = get_sequence_freqs(sequence, alph);
      set_total_gc_sequence(sequence,
        get_array_item(1,freqs) + get_array_item(2,freqs)); // f(C) + f(G)
      free_array(freqs);      // clean up
    } else {
      set_total_gc_sequence(sequence, -1);  // flag ignore
    }

    /**************************************************************
     * Process all motifs.
     **************************************************************/
    int ns = scan_both_strands ? 2 : 1;
    for (motif_index = 0; motif_index < num_motifs; motif_index++) {
      PATTERN_T *pattern = patterns[motif_index];
      MOTIF_T* motif = (MOTIF_T*)arraylst_get(ns*motif_index, motifs);
      char* motif_id = (scan_both_strands ? get_motif_st_id(motif) : get_motif_id(motif));
      if (verbosity >= HIGH_VERBOSE) {
        fprintf(stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif));
      }
      if ((selected_motifs == NULL) || (have_string(get_motif_id(motif), selected_motifs) == TRUE)) {
        if (verbosity >= HIGHER_VERBOSE) {
          fprintf(stderr, "Scanning %s sequence with length %d "
              "abbreviated to %d with motif %s with length %d.\n",
              seq_name, seq_len, scan_len, motif_id, get_motif_length(motif));
        }
        SCANNED_SEQUENCE_T* scanned_seq = NULL;

        if (!combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter) {
          // Create a scanned_sequence record and save it in the pattern.
          scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern);
          set_scanned_sequence_length(scanned_seq, scan_len);
        } else {
          // get existing sequence record
          scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter];
          set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq)));
        }

        // check if scanned component of sequence has sufficient length for the motif
        if (scan_len < get_motif_length(motif)) {
          // set score to zero and p-value to 1 if not set yet
          if (!has_scanned_sequence_score(scanned_seq)) {
            set_scanned_sequence_score(scanned_seq, 0.0);
          }
          if (pvalues && !has_scanned_sequence_pvalue(scanned_seq)) {
            set_scanned_sequence_pvalue(scanned_seq, 1.0);
          }
          add_scanned_sequence_scanned_position(scanned_seq);
          if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) need_postprocessing = TRUE;
          if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "%s too short for motif %s. Score set to 0!\n", seq_name, motif_id);
        } else {
          // scan the sequence using average/maximum motif affinity
          ama_sequence_scan(alph, sequence, logcumback, pssm_pairs[motif_index], scoring,
                            pvalues, last, scanned_seq, &need_postprocessing);
        }

      } else {
        if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "Skipping motif %s.\n", motif_id);
      }
    } // All motifs parsed

    free_seq(sequence);
    if (sdbg_order >= 0) myfree(logcumback);

  } // read sequences
Example #20
0
/*************************************************************************
 * Entry point for ama
 *************************************************************************/
int main(int argc, char **argv) {
  AMA_OPTIONS_T options;
  ARRAYLST_T *motifs;
  clock_t c0, c1; // measuring cpu_time
  MOTIF_AND_PSSM_T *combo;
  CISML_T *cisml;
  PATTERN_T** patterns;
  PATTERN_T *pattern;
  FILE *fasta_file, *text_output, *cisml_output;
  int i, seq_loading_num, seq_counter, unique_seqs, seq_len, scan_len, x1, x2, y1, y2;
  char *seq_name, *path;
  bool need_postprocessing, created;
  SEQ_T *sequence;
  RBTREE_T *seq_ids;
  RBNODE_T *seq_node;
  double *logcumback;
  ALPH_T *alph;

  // process the command
  process_command_line(argc, argv, &options);

  // load DNA motifs
  motifs = load_motifs(&options);

  // get the alphabet
  if (arraylst_size(motifs) > 0) {
    combo = (MOTIF_AND_PSSM_T*)arraylst_get(0, motifs);
    alph = alph_hold(get_motif_alph(combo->motif));
  } else {
    alph = alph_dna();
  }

  // pick columns for GC operations
  x1 = -1; x2 = -1; y1 = -1; y2 = -1;
  if (alph_size_core(alph) == 4 && alph_size_pairs(alph) == 2) {
    x1 = 0; // A
    x2 = alph_complement(alph, x1); // T
    y1 = (x2 == 1 ? 2 : 1); // C
    y2 = alph_complement(alph, y1); // G
    assert(x1 != x2 && y1 != y2 && x1 != y1 && x2 != y2 && x1 != y2 && x2 != y1);
  }

  // record starting time
  c0 = clock();

  // Create cisml data structure for recording results
  cisml = allocate_cisml(PROGRAM_NAME, options.command_line, options.motif_filename, options.fasta_filename);
  set_cisml_background_file(cisml, options.bg_filename);

  // make a CISML pattern to hold scores for each motif
  for (i = 0; i < arraylst_size(motifs); i++) {
    combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs);
    add_cisml_pattern(cisml, allocate_pattern(get_motif_id(combo->motif), ""));
  }

  // Open the FASTA file for reading.
  fasta_file = NULL;
  if (!open_file(options.fasta_filename, "r", false, "FASTA", "sequences", &fasta_file)) {
    die("Couldn't open the file %s.\n", options.fasta_filename);
  }
  if (verbosity >= NORMAL_VERBOSE) {
    if (options.last == 0) {
      fprintf(stderr, "Using entire sequence\n");
    } else {
      fprintf(stderr, "Limiting sequence to last %d positions.\n", options.last);
    }
  }

  //
  // Read in all sequences and score with all motifs
  //
  seq_loading_num = 0;  // keeps track on the number of sequences read in total
  seq_counter = 0;      // holds the index to the seq in the pattern
  unique_seqs = 0;      // keeps track on the number of unique sequences
  need_postprocessing = false;
  sequence = NULL;
  logcumback = NULL;
  seq_ids = rbtree_create(rbtree_strcasecmp,rbtree_strcpy,free,rbtree_intcpy,free);
  while (read_one_fasta(alph, fasta_file, options.max_seq_length, &sequence)) {
    ++seq_loading_num;
    seq_name = get_seq_name(sequence);
    seq_len = get_seq_length(sequence);
    scan_len = (options.last != 0 ? options.last : seq_len);
    // red-black trees are only required if duplicates should be combined
    if (options.combine_duplicates){
      //lookup seq id and create new entry if required, return sequence index
      seq_node = rbtree_lookup(seq_ids, get_seq_name(sequence), true, &created);
      if (created) { // assign it a loading number
        rbtree_set(seq_ids, seq_node, &unique_seqs);
        seq_counter = unique_seqs;
        ++unique_seqs;
      } else {
        seq_counter = *((int*)rbnode_get(seq_node));
      }
    }
          
    //
    // Set up sequence-dependent background model and compute
    // log cumulative probability of sequence.
    // This needs the sequence in raw format.
    //
    if (options.sdbg_order >= 0)
      logcumback = log_cumulative_background(alph, options.sdbg_order, sequence);

    // Index the sequence, throwing away the raw format and ambiguous characters
    index_sequence(sequence, alph, SEQ_NOAMBIG);

    // Get the GC content of the sequence if binning p-values by GC
    // and store it in the sequence object.
    if (options.num_gc_bins > 1) {
      ARRAY_T *freqs = get_sequence_freqs(sequence, alph);
      set_total_gc_sequence(sequence, get_array_item(y1, freqs) + get_array_item(y2, freqs)); // f(C) + f(G)
      free_array(freqs);                        // clean up
    } else {
      set_total_gc_sequence(sequence, -1);      // flag ignore
    }

    // Scan with motifs.
    for (i = 0; i < arraylst_size(motifs); i++) {
      pattern = get_cisml_patterns(cisml)[i];
      combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs);
      if (verbosity >= HIGHER_VERBOSE) {
        fprintf(stderr, "Scanning %s sequence with length %d "
            "abbreviated to %d with motif %s with length %d.\n",
            seq_name, seq_len, scan_len, 
            get_motif_id(combo->motif), get_motif_length(combo->motif));
      }
      SCANNED_SEQUENCE_T* scanned_seq = NULL;
      if (!options.combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter) {
        // Create a scanned_sequence record and save it in the pattern.
        scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern);
        set_scanned_sequence_length(scanned_seq, scan_len);
      } else {
        // get existing sequence record
        scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter];
        set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq)));
      }
      
      // check if scanned component of sequence has sufficient length for the motif
      if (scan_len < get_motif_length(combo->motif)) {
        // set score to zero and p-value to 1 if not set yet
        if(!has_scanned_sequence_score(scanned_seq)){
          set_scanned_sequence_score(scanned_seq, 0.0);
        }
        if(options.pvalues && !has_scanned_sequence_pvalue(scanned_seq)){
          set_scanned_sequence_pvalue(scanned_seq, 1.0);
        } 
        add_scanned_sequence_scanned_position(scanned_seq); 
        if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) {
          need_postprocessing = true;
        }
        if (verbosity >= HIGH_VERBOSE) {
          fprintf(stderr, "%s too short for motif %s. Score set to 0.\n",
              seq_name, get_motif_id(combo->motif));
        }
      } else {
        // scan the sequence using average/maximum motif affinity
        ama_sequence_scan(alph, sequence, logcumback, combo->pssm_pair,
            options.scoring, options.pvalues, options.last, scanned_seq,
            &need_postprocessing);
      }
    } // All motifs scanned

    free_seq(sequence);
    if (options.sdbg_order >= 0) myfree(logcumback);

  } // read sequences

  fclose(fasta_file);
  if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "(%d) sequences read in.\n", seq_loading_num);
  if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Finished          \n");

        
  // if any sequence identifier was multiple times in the sequence set  then
  // postprocess of the data is required
  if (need_postprocessing || options.normalize_scores) {
    post_process(cisml, motifs, options.normalize_scores);
  }
        
  // output results
  if (options.output_format == DIRECTORY_FORMAT) {
    if (create_output_directory(options.out_dir, options.clobber, verbosity > QUIET_VERBOSE)) {
      // only warn in higher verbose modes
      fprintf(stderr, "failed to create output directory `%s' or already exists\n", options.out_dir);
      exit(1);
    }
    path = make_path_to_file(options.out_dir, text_filename);
    //FIXME check for errors: MEME doesn't either and we at least know we have a good directory
    text_output = fopen(path, "w");
    free(path);
    path = make_path_to_file(options.out_dir, cisml_filename);
    //FIXME check for errors
    cisml_output = fopen(path, "w");
    free(path);
    print_cisml(cisml_output, cisml, true, NULL, false);
    print_score(cisml, text_output);
    fclose(cisml_output);
    fclose(text_output);
  } else if (options.output_format == GFF_FORMAT) {
    print_score(cisml, stdout);
  } else if (options.output_format == CISML_FORMAT) {
    print_cisml(stdout, cisml, true, NULL, false);
  } else {
    die("Output format invalid!\n");
  }

  //
  // Clean up.
  //
  rbtree_destroy(seq_ids);
  arraylst_destroy(motif_and_pssm_destroy, motifs);
  free_cisml(cisml);
  rbtree_destroy(options.selected_motifs);
  alph_release(alph);
        
  // measure time
  if (verbosity >= NORMAL_VERBOSE) { // starting time
    c1 = clock();
    fprintf(stderr, "cycles (CPU);            %ld cycles\n", (long) c1);
    fprintf(stderr, "elapsed CPU time:        %f seconds\n", (float) (c1-c0) / CLOCKS_PER_SEC);
  }
  return 0;
}
Example #21
0
/*************************************************************************
 * Set up one state in a star HMM, given the appropriate data.
 *
 * Fills in the pre-allocated state: its type, motif width, emission
 * arrays, site count, motif index/position, motif ID, and the lists of
 * incoming and outgoing transitions with their probabilities.
 *************************************************************************/
static void build_star_state (
   ALPH_T    alph,            // Type of alphabet
   int       state_type,      // Type of state (START, SPACER,..)
   STATE_T   i_state,         // State index.
   int       expected_length, /* For spacers, the expected length of output. */
   ARRAY_T*  freqs,           // Emission probability distrib.
   double    num_sites,       // Number of sites for this emission.
   int       i_motif,         // Index of motif this state is in.
   int       i_position,      // Position of this state within motif
   int       nmotifs,         // Total number of motifs.
   int       spacer_states,   // Number of HMM states per spacer.
   MOTIF_T*  motifs,          // Motifs.
   MHMM_STATE_T* a_state      // State to be filled in (pre-allocated).
)
{
  const int num_spacers = 1;  // Total number of spacers in HMM.
  int m;                      // Motif counter used by the transition loops.

  // Size of the alphabet, including ambiguity codes.
  const int alph_len = alph_size(alph, ALL_SIZE);

  // The motif this state lies in; stays NULL when i_motif is NON_MOTIF_INDEX.
  MOTIF_T *motif = NULL;
  if (i_motif != NON_MOTIF_INDEX) {
    motif = motif_at(motifs, i_motif);
  }

  // Classify the state once; both flags are consulted further down.
  const int in_motif = (state_type == START_MOTIF_STATE) ||
                       (state_type == MID_MOTIF_STATE) ||
                       (state_type == END_MOTIF_STATE);
  const int outside_motif = (state_type == START_STATE) ||
                            (state_type == END_STATE) ||
                            (state_type == SPACER_STATE);

  // Tell the user what's up.
  if (verbosity >= NORMAL_VERBOSE) {
    if (state_type == START_STATE) {
      fprintf(stderr, "Building HMM: (0) ");
    } else if (state_type == SPACER_STATE) {
      fprintf(stderr, "%d ", i_state);
    } else if (state_type == END_MOTIF_STATE) {
      fprintf(stderr, "%d | ", i_state);
    } else if (state_type == START_MOTIF_STATE || state_type == MID_MOTIF_STATE) {
      fprintf(stderr, "%d-", i_state);
    } else if (state_type == END_STATE) {
      fprintf(stderr, "(%d)\n", i_state);
    }
  }

  // Record what type of state this is.
  a_state->type = state_type;

  // Motif states record the width of their motif; all others use 1.
  a_state->w_motif = in_motif ? get_motif_length(motif) : 1;

  // Set up the emission distribution and a few other tidbits.
  a_state->emit = allocate_array(alph_len);
  a_state->emit_odds = allocate_array(alph_len);
  if (freqs != NULL) { // Start and end states have no emissions.
    copy_array(freqs, a_state->emit);
  }
  a_state->num_sites = num_sites;
  a_state->i_motif = i_motif;
  a_state->i_position = i_position;

  // Record the motif ID and the ID character at this position.
  if (outside_motif) {
    a_state->id_char = NON_MOTIF_ID_CHAR;
    strcpy(a_state->motif_id, NON_MOTIF_ID);
  } else {
    strcpy(a_state->motif_id, get_full_motif_id(motif));
    a_state->id_char = get_motif_id_char(i_position, motif);
  }
  assert(a_state->id_char != '\0');

  //
  // Transitions INTO this state.
  //
  if (state_type == START_STATE) {
    // Nothing leads into the start state.
    a_state->ntrans_in = 0;
    a_state->itrans_in = NULL;
    a_state->trans_in = NULL;
  } else if (state_type == END_STATE || state_type == START_MOTIF_STATE) {
    // A single transition, coming from the spacer state.
    a_state->ntrans_in = 1;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int));
    a_state->trans_in = allocate_array(1);
    a_state->itrans_in[0] = SPACER_INDEX;
    // Distribute non-self loop probability evenly among motifs and end state.
    // NOTE(review): this uses expected_length / spacer_states, whereas the
    // spacer's outgoing probabilities below use expected_length undivided;
    // the two agree only when spacer_states is 1 -- confirm this is intended.
    const double enter_p =
      (1 - self_trans(expected_length / spacer_states)) / (nmotifs + 1);
    set_array_item(0, enter_p, a_state->trans_in);
  } else if (state_type == MID_MOTIF_STATE || state_type == END_MOTIF_STATE) {
    // A single, certain transition from the previous state.
    a_state->ntrans_in = 1;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int));
    a_state->itrans_in[0] = i_state - 1;
    a_state->trans_in = allocate_array(1);
    set_array_item(0, 1.0, a_state->trans_in);
  } else if (state_type == SPACER_STATE) {
    // The first spacer state is entered from itself, the start state and
    // every motif; later states of a multi-state spacer are entered only
    // from themselves and the preceding spacer state.
    const int first_of_spacer = (i_position == 0);
    a_state->ntrans_in = first_of_spacer ? nmotifs + 2 : 2;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int) * a_state->ntrans_in);
    a_state->trans_in = allocate_array(a_state->ntrans_in);

    // Slot 0 is always the self-transition.
    a_state->itrans_in[0] = i_state;
    set_array_item(0, self_trans(expected_length / spacer_states), a_state->trans_in);

    if (!first_of_spacer) {
      // Slot 1: from the previous spacer state.
      a_state->itrans_in[1] = i_state - 1;
      set_array_item(1, 1.0 - self_trans(expected_length / spacer_states), a_state->trans_in);
    } else {
      // Slot 1: from the start state.
      // NOTE(review): no probability is stored for this slot, so it keeps
      // whatever allocate_array put there -- confirm that is intended.
      a_state->itrans_in[1] = START_INDEX;
      // Slots 2..: from the last position of each motif.
      for (m = 0; m < nmotifs; m++) {
        a_state->itrans_in[m + 2] =
          motif_index(m + 1, TRUE, num_spacers, spacer_states, motifs, nmotifs);
        set_array_item(m + 2, 1.0, a_state->trans_in);
      }
    }
  }

  //
  // Transitions OUT of this state.
  //
  if (state_type == START_STATE || state_type == END_MOTIF_STATE) {
    // A single, certain transition to the spacer.
    a_state->ntrans_out = 1;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int));
    a_state->trans_out = allocate_array(1);
    a_state->itrans_out[0] = SPACER_INDEX;
    set_array_item(0, 1.0, a_state->trans_out);
  } else if (state_type == START_MOTIF_STATE || state_type == MID_MOTIF_STATE) {
    // A single, certain transition to the next state.
    a_state->ntrans_out = 1;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int));
    a_state->itrans_out[0] = i_state + 1;
    a_state->trans_out = allocate_array(1);
    set_array_item(0, 1.0, a_state->trans_out);
  } else if (state_type == SPACER_STATE) {
    // Transitions go to self, motifs and end (except for beginning
    // multi-state spacers, which only go to self and the next state).
    const int has_next_spacer = (i_position < spacer_states - 1);
    a_state->ntrans_out = has_next_spacer ? 2 : nmotifs + 2;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int) * a_state->ntrans_out);
    a_state->trans_out = allocate_array(a_state->ntrans_out);

    // Slot 0 is always the self-transition.
    a_state->itrans_out[0] = i_state;
    set_array_item(0, self_trans(expected_length), a_state->trans_out);

    if (has_next_spacer) {
      // Slot 1: on to the next state of the multi-state spacer.
      a_state->itrans_out[1] = i_state + 1;
      set_array_item(1, 1 - self_trans(expected_length), a_state->trans_out);
    } else {
      // The remaining probability is shared equally by the motifs and the
      // end state.
      const double leave_p = (1 - self_trans(expected_length)) / (nmotifs + 1);
      // Slots 1..nmotifs: out to the first position of each motif.
      for (m = 0; m < nmotifs; m++) {
        a_state->itrans_out[m + 1] =
          motif_index(m + 1, FALSE, num_spacers, spacer_states, motifs, nmotifs);
        set_array_item(m + 1, leave_p, a_state->trans_out);
      }
      // Final slot: out to the end state, which sits one index past the
      // last position of the last motif.
      a_state->itrans_out[m + 1] =
        motif_index(nmotifs, TRUE, num_spacers, spacer_states, motifs, nmotifs) + 1;
      set_array_item(m + 1, leave_p, a_state->trans_out);
    }
  } else if (state_type == END_STATE) {
    // Nothing leaves the end state.
    a_state->ntrans_out = 0;
    a_state->itrans_out = NULL;
    a_state->trans_out = NULL;
  }
} // build_star_state
Example #22
0
/*************************************************************************
 * Entry point for pmp_bf
 *
 * Usage: pmp_bf [options] <tree file> <MEME file>
 *
 * Parses the command line, reads a phylogenetic tree and a set of motifs,
 * and for each motif (optionally restricted by --motif) prints one line
 * per score value of the form
 *
 *   <motif_id> score <score> FPR <fpr> FNR <fnr>
 *
 * FPR is the p-value looked up in the PSSM built against the background
 * alignment-column frequencies; FNR is one minus the p-value looked up in
 * a second PSSM whose p-value table is recomputed against the
 * position-dependent (motif) column frequencies.
 *
 * Returns 0 on normal completion.  Prints the usage message and exits
 * with EXIT_FAILURE when the two positional arguments are missing, and
 * calls die() on option errors, an unknown model, a missing star-tree
 * target, or an empty motif file.
 *************************************************************************/
int main(int argc, char *argv[]) {

  char* bg_filename = NULL;
  // NOTE(review): motif_name is stored by --motif-name but is not read
  // anywhere else in this function; output lines use the motif id.
  char* motif_name = "motif"; // Use this motif name in the output.
  STRING_LIST_T* selected_motifs = NULL;
  double fg_rate = 1.0;
  double bg_rate = 1.0;
  double purine_pyrimidine = 1.0; // r
  double transition_transversion = 0.5; // R
  double pseudocount = 0.1;
  GAP_SUPPORT_T gap_support = SKIP_GAPS;
  MODEL_TYPE_T model_type = F81_MODEL;
  BOOLEAN_T use_halpern_bruno = FALSE;
  char* ustar_label = NULL; // TLB; create uniform star tree
  int i;

  program_name = "pmp_bf";

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/

  // Define command line options. (FIXME: Repeated code)
  cmdoption const pmp_options[] = {
    {"hb", NO_VALUE},
    {"ustar", REQUIRED_VALUE},
    {"model", REQUIRED_VALUE},
    {"pur-pyr", REQUIRED_VALUE},
    {"transition-transversion", REQUIRED_VALUE},
    {"bg", REQUIRED_VALUE},
    {"fg", REQUIRED_VALUE},
    {"motif", REQUIRED_VALUE},
    {"motif-name", REQUIRED_VALUE},
    {"bgfile", REQUIRED_VALUE},
    {"pseudocount", REQUIRED_VALUE},
    {"verbosity", REQUIRED_VALUE}
  };
  // Derive the option count from the table itself so that adding or
  // removing an entry can never leave the two out of sync.
  int n_options = (int)(sizeof(pmp_options) / sizeof(pmp_options[0]));

  int option_index = 0;

  // Define the usage message.
  char      usage[1000] = "";
  strcat(usage, "USAGE: pmp_bf [options] <tree file> <MEME file>\n");
  strcat(usage, "\n");
  strcat(usage, "   Options:\n");

  // Evolutionary model parameters.
  strcat(usage, "     --hb\n");
  strcat(usage, "     --model single|average|jc|k2|f81|f84|hky|tn");
  strcat(usage, " (default=f81)\n");
  strcat(usage, "     --pur-pyr <float> (default=1.0)\n");
  strcat(usage, "     --transition-transversion <float> (default=0.5)\n");
  strcat(usage, "     --bg <float> (default=1.0)\n");
  strcat(usage, "     --fg <float> (default=1.0)\n");

  // Motif parameters.
  strcat(usage, "     --motif <id> (default=all)\n");
  strcat(usage, "     --motif-name <string> (default from motif file)\n");

  // Miscellaneous parameters
  strcat(usage, "     --bgfile <background> (default from motif file)\n");
  strcat(usage, "     --pseudocount <float> (default=0.1)\n");
  strcat(usage, "     --ustar <label>\n"); // TLB; create uniform star tree
  strcat(usage, "     --verbosity [1|2|3|4] (default 2)\n");
  strcat(usage, "\n    Prints the FP and FN rate at each of 10000 score values.\n");
  // Keep this in step with the printf() in the per-motif loop below.
  strcat(usage, "\n    Output format: [<motif_id> score <score> FPR <fpr> FNR <fnr>]+\n");

  // Parse the command line.
  if (simple_setopt(argc, argv, n_options, pmp_options) != NO_ERROR) {
    die("Error processing command line options: option name too long.\n");
  }

  while (TRUE) { 
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      (void) simple_getopterror(&message);
      die("Error processing command line options (%s)\n", message);
    }
    
    if (strcmp(option_name, "model") == 0) {
      if (strcmp(option_value, "jc") == 0) {
        model_type = JC_MODEL;
      } else if (strcmp(option_value, "k2") == 0) {
        model_type = K2_MODEL;
      } else if (strcmp(option_value, "f81") == 0) {
        model_type = F81_MODEL;
      } else if (strcmp(option_value, "f84") == 0) {
        model_type = F84_MODEL;
      } else if (strcmp(option_value, "hky") == 0) {
        model_type = HKY_MODEL;
      } else if (strcmp(option_value, "tn") == 0) {
        model_type = TAMURA_NEI_MODEL;
      } else if (strcmp(option_value, "single") == 0) {
        model_type = SINGLE_MODEL;
      } else if (strcmp(option_value, "average") == 0) {
        model_type = AVERAGE_MODEL;
      } else {
        die("Unknown model: %s\n", option_value);
      }
    } else if (strcmp(option_name, "hb") == 0){
      use_halpern_bruno = TRUE;
    } else if (strcmp(option_name, "ustar") == 0){ // TLB; create uniform star tree
      ustar_label = option_value;
    } else if (strcmp(option_name, "pur-pyr") == 0){
      purine_pyrimidine = atof(option_value);
    } else if (strcmp(option_name, "transition-transversion") == 0){
      transition_transversion = atof(option_value);
    } else if (strcmp(option_name, "bg") == 0){
      bg_rate = atof(option_value);
    } else if (strcmp(option_name, "fg") == 0){
      fg_rate = atof(option_value);
    } else if (strcmp(option_name, "motif") == 0){
      // --motif may be given several times; collect every id.
      if (selected_motifs == NULL) {
        selected_motifs = new_string_list();
      }
      add_string(option_value, selected_motifs);
    } else if (strcmp(option_name, "motif-name") == 0){
      motif_name = option_value;
    } else if (strcmp(option_name, "bgfile") == 0){
      bg_filename = option_value;
    } else if (strcmp(option_name, "pseudocount") == 0){
      pseudocount = atof(option_value);
    } else if (strcmp(option_name, "verbosity") == 0){
      verbosity = atoi(option_value);
    }
  }

  // Must have tree and motif file names
  if (argc != option_index + 2) {
    fprintf(stderr, "%s", usage);
    exit(EXIT_FAILURE);
  } 

  /**********************************************
   * Read the phylogenetic tree.
   **********************************************/
  char* tree_filename = NULL;
  TREE_T* tree = NULL;
  tree_filename = argv[option_index];
  option_index++;
  tree = read_tree_from_file(tree_filename);

  // get the species names
  STRING_LIST_T* alignment_species = make_leaf_list(tree);
  char *root_label = get_label(tree); // in case target in center
  if (strlen(root_label)>0) add_string(root_label, alignment_species);
  //write_string_list(" ", alignment_species, stderr);

  // TLB; Convert the tree to a uniform star tree with
  // the target sequence at its center.
  if (ustar_label != NULL) {
    tree = convert_to_uniform_star_tree(tree, ustar_label);
    if (tree == NULL) {
      die("Tree or alignment missing target %s\n", ustar_label);
    }
    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, 
        "Target %s placed at center of uniform (d=%.3f) star tree:\n", 
          ustar_label, get_total_length(tree) / get_num_children(tree) 
      );
      write_tree(tree, stderr);
    }
  }

  /**********************************************
   * Read the motifs.
   **********************************************/
  char* meme_filename = argv[option_index];
  option_index++;

  MREAD_T *mread;
  ALPH_T alph;
  ARRAYLST_T *motifs;
  ARRAY_T *bg_freqs;

  mread = mread_create(meme_filename, OPEN_MFILE);
  mread_set_bg_source(mread, bg_filename);
  mread_set_pseudocount(mread, pseudocount);
  // read motifs
  motifs = mread_load(mread, NULL);
  alph = mread_get_alphabet(mread);
  bg_freqs = mread_get_background(mread);
  // check
  if (arraylst_size(motifs) == 0) die("No motifs in %s.", meme_filename);

  // TLB; need to resize bg_freqs array to ALPH_SIZE items
  // or copy array breaks in HB mode.  This throws away
  // the freqs for the ambiguous characters;
  int asize = alph_size(alph, ALPH_SIZE);
  resize_array(bg_freqs, asize);

  /**************************************************************
  * Compute probability distributions for each of the selected motifs.
  **************************************************************/
  int motif_index;
  for (motif_index = 0; motif_index < arraylst_size(motifs); motif_index++) {

    MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs);
    char* motif_id = get_motif_id(motif);
    char* bare_motif_id = motif_id;

    // We may have specified on the command line that
    // only certain motifs were to be used.
    if (selected_motifs != NULL) {
      if (*bare_motif_id == '+' || *bare_motif_id == '-') {
        // The selected motif id won't include a strand indicator.
        bare_motif_id++;
      }
      if (have_string(bare_motif_id, selected_motifs) == FALSE) {
        continue;
      }
    }

    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(
        stderr, 
        "Using motif %s of width %d.\n",
        motif_id, get_motif_length(motif)
      );
    }

    // Build an array of evolutionary models for each position in the motif.
    EVOMODEL_T** models = make_motif_models(
      motif, 
      bg_freqs,
      model_type,
      fg_rate, 
      bg_rate, 
      purine_pyrimidine, 
      transition_transversion, 
      use_halpern_bruno
    );

    // Get the frequencies under the background model (row 0) 
    // and position-dependent scores (rows 1..w)
    // for each possible alignment column.
    MATRIX_T* pssm_matrix = build_alignment_pssm_matrix(
      alph,
      alignment_species,
      get_motif_length(motif) + 1, 
      models, 
      tree, 
      gap_support
    );
    // Save row 0 (background column frequencies) before dropping it, so
    // that pssm_matrix is left holding only the per-position score rows.
    ARRAY_T* alignment_col_freqs = allocate_array(get_num_cols(pssm_matrix)); 
    copy_array(get_matrix_row(0, pssm_matrix), alignment_col_freqs);
    remove_matrix_row(0, pssm_matrix); // throw away first row
    //print_col_frequencies(alph, alignment_col_freqs);

    //
    // Get the position-dependent null model alignment column frequencies
    //
    int w = get_motif_length(motif);
    int ncols = get_num_cols(pssm_matrix); 
    MATRIX_T* pos_dep_bkg = allocate_matrix(w, ncols);
    for (i=0; i<w; i++) {
      // get the evo model corresponding to this column of the motif
      // and store it as the first evolutionary model.
      myfree(models[0]);
      // Use motif PSFM for equilibrium freqs. for model.
      // NOTE(review): site_specific_freqs is never released here; whether
      // make_model() copies it or takes ownership is not visible from
      // this function -- confirm before adding a free.
      ARRAY_T* site_specific_freqs = allocate_array(asize);
      int j = 0;
      for(j = 0; j < asize; j++) {
        double value = get_matrix_cell(i, j, get_motif_freqs(motif));
        set_array_item(j, value, site_specific_freqs);
      }
      if (use_halpern_bruno == FALSE) {
        models[0] = make_model(
          model_type,
          fg_rate,
          transition_transversion,
          purine_pyrimidine,
          site_specific_freqs,
          NULL
        );
      } else {
        // In Halpern-Bruno mode the background frequencies go in the
        // fifth argument and the site frequencies in the sixth.
        models[0] = make_model(
          model_type,
          fg_rate,
          transition_transversion,
          purine_pyrimidine,
          bg_freqs,
          site_specific_freqs
        );
      }
      // get the alignment column frequencies using this model
      MATRIX_T* tmp_pssm_matrix = build_alignment_pssm_matrix(
        alph,
        alignment_species,
        2, // only interested in freqs under bkg
        models, 
        tree, 
        gap_support
      );
      // assemble the position-dependent background alignment column freqs.
      set_matrix_row(i, get_matrix_row(0, tmp_pssm_matrix), pos_dep_bkg);
      // The temporary matrix is no longer needed once its row 0 is copied.
      free_matrix(tmp_pssm_matrix);
    }

    //
    // Compute and print the score distribution under the background model
    // and under the (position-dependent) motif model.
    //
    int range = 10000; // 10^4 gives same result as 10^5, but 10^3 differs

    // under background model
    PSSM_T* pssm = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range);

    // under position-dependent background (motif) model
    PSSM_T* pssm_pos_dep = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range);
    get_pv_lookup_pos_dep(
      pssm_pos_dep, 
      pos_dep_bkg, 
      NULL // no priors used
    );

    // print FP and FN distributions
    int num_items = get_pssm_pv_length(pssm_pos_dep);
    for (i=0; i<num_items; i++) {
      double pvf = get_pssm_pv(i, pssm);
      double pvt = get_pssm_pv(i, pssm_pos_dep);
      double fpr = pvf;
      double fnr = 1 - pvt;
      // Skip scores where FPR is (almost) 1 or FNR is exactly 0.
      if (fpr >= 0.99999 || fnr == 0) continue;
      printf("%s score %d FPR %.3g FNR %.3g\n", motif_id, i, fpr, fnr);
    }

    // free stuff
    // NOTE(review): pssm_matrix, alignment_col_freqs and pos_dep_bkg are
    // not released per motif.  Whether free_pssm() owns any of them is not
    // visible from here, so no frees were added -- confirm ownership.
    free_pssm(pssm);
    free_pssm(pssm_pos_dep);
    if (models != NULL) {
      int model_index;
      int num_models = get_motif_length(motif) + 1;
      for (model_index = 0; model_index < num_models; model_index++) {
        free_model(models[model_index]);
      }
      myfree(models);
    }

  } // motif

  arraylst_destroy(destroy_motif, motifs);

  /**********************************************
   * Clean up.
   **********************************************/
  // TLB may have encountered a memory corruption bug here
  // CEG has not been able to reproduce it. valgrind says all is well.
  free_array(bg_freqs);
  free_tree(TRUE, tree);
  free_string_list(selected_motifs);

  return(0);
} // main
Example #23
0
/*************************************************************************
 * Create one logo per motif read from a MEME file.
 *
 * meme_path  - path of the MEME file handed to read_meme_file2().
 * output_dir - directory the logos are written to; it is created via
 *              create_output_directory() and a '/' is appended to it
 *              when it does not already end in one.
 *
 * Each logo path is "<output_dir>/<LOGO_PREFIX><sanitised motif id>" and
 * is passed to CL_create2() as the output file path.
 *
 * Exits with status 1 if the path buffer cannot be allocated or the
 * output directory cannot be created.
 *************************************************************************/
void generate_ceq_logos(char *meme_path, char *output_dir) {
  int i, dir_len, prefix_len, path_len;
  ARRAY_T *background;
  BOOLEAN_T has_reverse_strand;
  char *path, *alphabet;
  double logo_height, logo_width;
  ARRAYLST_T *motifs;
  MOTIF_T *motif;

  motifs = arraylst_create();

  logo_height = LOGOHEIGHT;

  // Make the fixed part of the path.  The buffer has room for the
  // directory, an optional '/', the prefix, a motif id of up to
  // MAX_MOTIF_ID_LENGTH characters and the terminating NUL.
  dir_len = strlen(output_dir);
  prefix_len = strlen(LOGO_PREFIX);
  path_len = dir_len + 1 + prefix_len + MAX_MOTIF_ID_LENGTH + 1;
  path = malloc(sizeof(char)*path_len);
  if (path == NULL) {
    // Out of memory: nothing useful can be done without the path buffer.
    exit(1);
  }
  // path_len exceeds dir_len, so strncpy NUL-terminates (and zero-pads).
  strncpy(path, output_dir, path_len);
  // Only look at the last character when there is one; an empty
  // output_dir would otherwise read path[-1].
  if (dir_len > 0 && path[dir_len-1] != '/') {
    path[dir_len] = '/';
    path[++dir_len] = '\0';
  }
  strncpy(path+dir_len, LOGO_PREFIX, path_len - dir_len);

  // Read all motifs into an array.
  read_meme_file2(meme_path,
                  NULL, // bg file name
                  DEFAULT_PSEUDOCOUNTS,
                  REQUIRE_PSPM,
                  motifs, 
                  NULL, // motif occurrences
                  &has_reverse_strand,
                  &background);

  // global alphabet is set by read_meme_file
  alphabet = get_alphabet(FALSE);

  if (create_output_directory(output_dir, TRUE, (verbosity >= NORMAL_VERBOSE))) {
    // Failed to create output directory.
    exit(1);
  }

  for(i = 0; i < arraylst_size(motifs); i++) {
    motif = (MOTIF_T*)arraylst_get(i, motifs);
    // The logo width follows the motif length, capped at MAXLOGOWIDTH.
    logo_width = get_motif_length(motif);
    if (logo_width > MAXLOGOWIDTH) logo_width = MAXLOGOWIDTH;
    // Overwrite the tail of the path (after the prefix) with this motif's id.
    copy_and_sanatise_name(path+(dir_len+prefix_len), get_motif_id(motif), path_len - (dir_len + prefix_len)); 
    CL_create2(
      motif,              // motif
      "",                 // no title 
      NULL,               // no second motif
      "",                 // no x-axis label
      FALSE,              // no error bars
      FALSE,              // ssc
      logo_height,        // logo height (cm)
      logo_width,         // logo width (cm)
      alphabet,           // alphabet
      0,                  // no offset to second motif
      path,               // output file path
      "MEME (no SSC)"     // program name
    );
  }
  free_motifs(motifs);
  free_array(background); // not used 
  free(path);
}