Esempio n. 1
0
/*************************************************************************
 * Compute the HMM state index at which a motif begins or ends.
 *
 * motif_num     1-based motif number; 0 denotes the start state and
 *               nmotifs+1 denotes the end state.
 * start_or_end  FALSE selects the motif's first state, TRUE its last.
 * num_spacers   Number of spacers in the model.
 * spacer_states Number of states per spacer.
 * motifs        The motifs, in model order.
 * nmotifs       Number of motifs.
 *
 * The index is 1 (the start state) plus all spacer states, plus the
 * lengths of the motifs preceding the requested one.
 *************************************************************************/
static int motif_index
  (const int       motif_num,
   const BOOLEAN_T start_or_end,
   const int       num_spacers,
   const int       spacer_states,
   const MOTIF_T*  motifs,
   const int       nmotifs)
{
  int state_index;
  int preceding = 0;

  assert(motif_num >= 0);
  assert(motif_num <= (nmotifs + 1));

  // Motif number zero is the start state itself.
  if (motif_num == 0) {
    return START_INDEX;
  }

  // Step over the start state and every spacer state.
  state_index = (num_spacers * spacer_states) + 1;

  // Step over each motif that comes before the requested one.
  while (preceding < motif_num - 1) {
    state_index += get_motif_length(motif_at((MOTIF_T*)motifs, preceding));
    preceding++;
  }

  // When the last state of a real motif is wanted, advance to it. The end
  // state (nmotifs + 1) is a single state, so nothing is added for it.
  if (start_or_end && motif_num != (nmotifs + 1)) {
    state_index += get_motif_length(motif_at((MOTIF_T*)motifs, preceding)) - 1;
  }

  return state_index;
}    
Esempio n. 2
0
/******************************************************************************
  Print one HTML table row per motif: its id, its width, the best possible
  match and, when the next motif in the array carries the same id (a
  forward/reverse-complement pair), the best possible match of that partner.
  Motifs without a partner get an empty last cell.
 *****************************************************************************/
void mcast_print_motif_list(FILE * output, MOTIF_T* motifs, int num_motifs) {
  fputs("\n", output);
  int i;
  for (i = 0; i < num_motifs; i++) {
    MOTIF_T *motif = motif_at(motifs, i);
    MOTIF_T *rc_motif = NULL;
    char *motif_id = get_motif_id(motif);
    int width = get_motif_length(motif);
    char *rc_motif_id = NULL;
    // Peek at the following motif, if there is one.
    if (i < (num_motifs - 1)) {
      rc_motif = motif_at(motifs, i + 1);
      rc_motif_id = get_motif_id(rc_motif);
    }
    char *best_possible_match = get_best_possible_match(motif);
    char *colored_best_possible_match = color_dna_sequence(best_possible_match);
    char *best_possible_rc_match = NULL;
    char *colored_best_possible_rc_match = NULL;
    if (rc_motif_id && strcmp(motif_id, rc_motif_id) == 0) {
      ++i; // Pair of identical motif ids indicates a forward/reverse pair.
      best_possible_rc_match = get_best_possible_match(rc_motif);
      colored_best_possible_rc_match = color_dna_sequence(best_possible_rc_match);
    }
    const char *indent = "               ";
    fprintf(output, "%s<tr>\n", indent);
    fprintf(output, "%s<td>%s</td>\n", indent, motif_id);
    fprintf(output, "%s<td>%d</td>\n", indent, width);
    fprintf(output, "%s<td class=\"sequence\">%s</td>\n", indent, colored_best_possible_match);
    // Passing NULL to %s is undefined behaviour, so an unpaired motif
    // prints an empty cell instead.
    fprintf(output, "%s<td class=\"sequence\">%s</td>\n", indent,
            colored_best_possible_rc_match ? colored_best_possible_rc_match : "");
    fprintf(output, "%s</tr>\n", indent);
    myfree(best_possible_match);
    myfree(best_possible_rc_match);
    myfree(colored_best_possible_match);
    myfree(colored_best_possible_rc_match);
  }
}
Esempio n. 3
0
/******************************************************************************
  Emit one JavaScript statement per motif that registers it in the `motifs`
  array together with its best possible match. When two consecutive motifs
  share an id they are treated as a forward/reverse-complement pair and are
  emitted as a single entry carrying both matches; otherwise the
  reverse-complement match is the empty string.
 *****************************************************************************/
void mcast_print_motif_array(FILE *output, MOTIF_T *motifs, int num_motifs) {
   int idx = 0;
   fputs("\n", output);
   while (idx < num_motifs) {
     MOTIF_T *fwd = motif_at(motifs, idx);
     char *fwd_id = get_motif_id(fwd);
     MOTIF_T *partner = NULL;
     char *partner_id = NULL;
     char *fwd_match;
     char *partner_match = NULL;
     int is_pair = 0;

     // Look ahead one motif when there is one.
     if (idx < (num_motifs - 1)) {
       partner = motif_at(motifs, idx + 1);
       partner_id = get_motif_id(partner);
     }

     fwd_match = get_best_possible_match(fwd);

     // Identical consecutive ids mark a forward/reverse pair; consume both.
     if (partner_id != NULL && strcmp(fwd_id, partner_id) == 0) {
       is_pair = 1;
       ++idx;
       partner_match = get_best_possible_match(partner);
     }

     fprintf(
         output,
         "      motifs[\"%s\"] = new Motif(\"%s\", \"nucleotide\", \"%s\", \"%s\");\n",
         fwd_id,
         fwd_id,
         fwd_match,
         is_pair ? partner_match : ""
     );

     if (is_pair) {
       myfree(partner_match);
     }
     myfree(fwd_match);
     idx++;
   }
}
Esempio n. 4
0
File: ramen.c Progetto: CPFL/gmeme
/*
 * Fill the global `motifs` structure from the files named in
 * args.motif_filenames.
 *
 * Every file is read into one shared list, reverse complements are appended,
 * the list is turned into the motifs.motifs array, and each entry is
 * converted to an odds matrix against motifs.bg_freqs. On return motifs.num
 * holds the motif count from before reverse complements were added, while
 * the array holds twice that many entries.
 */
void ramen_load_motifs() {
  ARRAYLST_T *loaded;
  MREAD_T *reader;
  int forward_count;
  int file_idx;
  int motif_idx;

  memset(&motifs, 0, sizeof(ramen_motifs_t));
  loaded = arraylst_create();

  file_idx = 0;
  while (file_idx < args.number_motif_files) {
    reader = mread_create(args.motif_filenames[file_idx], OPEN_MFILE);

    // Background comes from a file when requested, otherwise from whatever
    // background has been recorded so far (none before the first file).
    if (args.bg_format != FILE_BG) {
      mread_set_background(reader, motifs.bg_freqs);
    } else {
      mread_set_bg_source(reader, args.bg_filename);
    }
    mread_set_pseudocount(reader, args.pseudocount);

    mread_load(reader, loaded);

    // Keep the background reported by the first reader that provides one.
    if (!motifs.bg_freqs) {
      motifs.bg_freqs = mread_get_background(reader);
    }

    mread_destroy(reader);
    file_idx++;
  }

  // Append the reverse complements to the list that was just read.
  forward_count = arraylst_size(loaded);
  add_reverse_complements(loaded);
  motifs.num = arraylst_size(loaded);

  // Move the motifs into an array, then release the list.
  motif_list_to_array(loaded, &(motifs.motifs), &(motifs.num));
  free_motifs(loaded);

  // Each original motif must have gained exactly one partner.
  assert(motifs.num / 2 == forward_count);

  // From here on motifs.num counts the original (pre-complement) motifs only.
  motifs.num = forward_count;

  // Convert every array entry, originals and complements, to odds matrices.
  for (motif_idx = 0; motif_idx < 2 * motifs.num; motif_idx++) {
    convert_to_odds_matrix(motif_at(motifs.motifs, motif_idx), motifs.bg_freqs);
  }
}
Esempio n. 5
0
/*************************************************************************
 * Build a completely connected HMM.
 *
 * background    Background distribution; copied into the model and used
 *               as the emission distribution of every spacer state.
 * spacer_states Number of HMM states per spacer.
 * motifs        The motifs; nmotifs must be greater than zero.
 * nmotifs       Number of motifs.
 * transp_freq   Transition frequency matrix, passed through to each state.
 * spacer_ave    Matrix of expected spacer lengths, indexed
 *               (from motif, to motif).
 * fim           If true, spacers are converted to FIMs after construction.
 * the_hmm       Output: receives the newly allocated model.
 *
 * States are laid out in this order: the begin state, the spacer states
 * for every (from, to) pair, the motif states, and finally the end state.
 *************************************************************************/
void build_complete_hmm
  (ARRAY_T* background,
   int spacer_states, 
   MOTIF_T *motifs,
   int nmotifs,
   MATRIX_T *transp_freq,
   MATRIX_T *spacer_ave,
   BOOLEAN_T fim,
   MHMM_T **the_hmm)
{
  ALPH_T    alph;
  int motif_states; // Total length of the motifs.
  int num_spacers;  // Total number of spacer states.
  int num_states;   // Total number of states in the model.
  int i_motif;      // Index of the current "from" motif.
  int j_motif;      // Index of the current "to" motif.
  int i_position;   // Index within the current motif or spacer.
  int i_state = 0;  // Index of the current state.

  assert(nmotifs > 0);
  alph = get_motif_alph(motifs);// get the alphabet from the first motif

  // Count the width of the motifs.
  for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++)
    motif_states += get_motif_length(motif_at(motifs, i_motif));
  // Count the spacer states adjacent to begin and end.
  num_spacers = nmotifs * 2;
  // Add the spacer states between motifs.
  num_spacers += nmotifs * nmotifs;
  // Total states = motifs + spacer_states + begin/end
  num_states = motif_states + (num_spacers * spacer_states) + 2;

  // Allocate the model.
  *the_hmm = allocate_mhmm(alph, num_states);

  // Record that this is a completely connected model.
  (*the_hmm)->type = COMPLETE_HMM;

  // Record the number of motifs in the model.
  (*the_hmm)->num_motifs = nmotifs;

  // Record the number of states in the model.
  (*the_hmm)->num_states = num_states;
  // Same value as num_spacers above: (n+1)^2 - 1 == 2n + n^2.
  (*the_hmm)->num_spacers = ((nmotifs + 1) * (nmotifs + 1)) - 1;
  (*the_hmm)->spacer_states = spacer_states;

  // Put the background distribution into the model.
  copy_array(background, (*the_hmm)->background);

  // Build the begin state.
  build_complete_state(
      START_STATE, 
      i_state,
      alph,
      0, // expected length
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      0, // previous motif
      0, // next motif
      transp_freq,
      spacer_states,
      num_spacers,
      motifs,
      &((*the_hmm)->states[i_state]));
  i_state++;

  // Build the spacer states. No transitions from the end state.
  for (i_motif = 0; i_motif <= nmotifs; i_motif++) {
    // No transitions to the start state.
    for (j_motif = 1; j_motif <= nmotifs+1; j_motif++) {
      // No transitions from start to end.
      if ((i_motif == 0) && (j_motif == nmotifs+1))
        continue;
      // Allow multi-state spacers.
      for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
        build_complete_state(
            SPACER_STATE, 
            i_state, 
            alph,
            get_matrix_cell(i_motif, j_motif, spacer_ave),
            background,
            SPACER_NUMSITES,
            NON_MOTIF_INDEX,
            i_position,
            nmotifs,
            i_motif,
            j_motif,
            transp_freq,
            spacer_states,
            num_spacers,
            motifs,
            &((*the_hmm)->states[i_state]));
      }
    }
  }

  // Build the motif states.
  for (i_motif = 0; i_motif < nmotifs; i_motif++) {
    MOTIF_T *this_motif = motif_at(motifs, i_motif);
    STATE_T state;
    for (i_position = 0; i_position < get_motif_length(this_motif); i_position++, i_state++) {
      // The first and last positions connect to the spacers; interior
      // positions only connect to their neighbours.
      // NOTE(review): a motif of width 1 is classified as START_MOTIF_STATE
      // only and so gets no outgoing spacer transitions -- confirm that
      // such motifs cannot reach this function.
      if (i_position == 0) {
        state = START_MOTIF_STATE;
      } else if (i_position == (get_motif_length(this_motif) - 1)) {
        state = END_MOTIF_STATE;
      } else {
        state = MID_MOTIF_STATE;
      }
      // Pass the state type chosen above. Hard-coding MID_MOTIF_STATE here
      // would wire every position as an interior state.
      build_complete_state(
          state, 
          i_state,
          alph,
          0, // Expected spacer length. 
          get_matrix_row(i_position, get_motif_freqs(this_motif)),
          get_motif_nsites(this_motif),
          i_motif,
          i_position, 
          nmotifs,
          0, // Previous motif index.
          0, // Next motif index.
          transp_freq,
          spacer_states,
          num_spacers,
          motifs,
          &((*the_hmm)->states[i_state]));
    }
  }

  // Build the end state.
  build_complete_state(
      END_STATE, 
      i_state,
      alph,
      0, // Expected spacer length.
      NULL, // Emissions
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      0, // Previous motif index.
      0, // Next motif index.
      transp_freq,
      spacer_states,
      num_spacers,
      motifs,
      &((*the_hmm)->states[i_state]));
  i_state++;

  // Convert spacers to FIMs if requested.
  if (fim) {
    convert_to_fims(*the_hmm);
  }

  // Fill in the transition matrix.
  build_transition_matrix(*the_hmm);
}
Esempio n. 6
0
/*************************************************************************
 * Set up one state in a complete HMM, given the appropriate data.
 *
 * Fills in the state's type, motif width, emission distribution (when
 * freqs is non-NULL), bookkeeping fields, motif id, and the index and
 * probability arrays for its incoming and outgoing transitions. The
 * transition layout depends on state_type; an unrecognised type is
 * reported through die().
 *************************************************************************/
static void build_complete_state
  (STATE_T state_type,    // Type of state (START, SPACER,..)
   int i_state,           // State index.
   ALPH_T alph,           // alphabet
   int expected_length,   // For spacers, the expected length of output.
   ARRAY_T *freqs,        // Emission probability distrib.
   double num_sites,      // Number of sites for this emission.
   int i_motif,           // Index of motif this state is in.
   int i_position,        // Position of this state within motif
   int nmotifs,           // Total number of motifs.
   int prev_motif,        // Index of previous motif.
   int next_motif,        // Index of next motif.
   MATRIX_T *transp_freq, // Transition freq matrix.
   int spacer_states,     // Number of HMM states per spacer.
   int num_spacers,       // Total number of spacers in HMM.
   MOTIF_T *motifs,       // Motifs.
   MHMM_STATE_T *a_state) // State to be filled in (pre-allocated).
{
  MOTIF_T *motif; // The motif (for motif state)
  int j_motif;    // Index of the current motif.

  if (i_motif != NON_MOTIF_INDEX) motif = motif_at(motifs, i_motif);
  else motif = NULL;

  // Tell the user what's up.
  if (verbosity >= NORMAL_VERBOSE) {
    switch (state_type) {
    case START_STATE :
      fprintf(stderr, "Building HMM: (0) ");
      break;
    case SPACER_STATE :
      fprintf(stderr, "%d ", i_state);
      break;
    case END_MOTIF_STATE :
      fprintf(stderr, "%d | ", i_state);
      break;
    case START_MOTIF_STATE :
    case MID_MOTIF_STATE :
      fprintf(stderr, "%d-", i_state);
      break;
    case END_STATE :
      fprintf(stderr, "(%d)\n", i_state);
      break;
    default:
      die("Invalid state!");
    }
  }

  // Record what type of state this is.
  a_state->type = state_type;

  // Record the motif width if this is a motif.
  if (state_type == START_MOTIF_STATE ||
      state_type == MID_MOTIF_STATE ||
      state_type == END_MOTIF_STATE) {
    a_state->w_motif = get_motif_length(motif);
  } else {
    a_state->w_motif = 1;
  }
  

  // Set up the emission distribution and a few other tidbits.
  if (freqs != NULL) { // Start and end states have no emissions.
    a_state->emit = allocate_array(alph_size(alph, ALL_SIZE));
    copy_array(freqs, a_state->emit);
  }
  a_state->num_sites = num_sites;
  a_state->i_motif = i_motif;
  a_state->i_position = i_position;

  // Record the motif ID character at this position.
  if ((state_type == START_STATE) ||
      (state_type == END_STATE) ||
      (state_type == SPACER_STATE)) {
    a_state->id_char = NON_MOTIF_ID_CHAR;
  } else { // motif state
    strncpy(a_state->motif_id, get_full_motif_id(motif), MAX_MOTIF_ID_LENGTH + 2);
    // strncpy() leaves the destination unterminated when the source is at
    // least as long as the limit, so terminate the last byte it wrote.
    a_state->motif_id[MAX_MOTIF_ID_LENGTH + 1] = '\0';
    a_state->id_char = get_motif_id_char(i_position, motif);
  }
  assert(a_state->id_char != '\0');

  // First set up the transitions into this state.
  switch (state_type) {
  case START_STATE :
    a_state->ntrans_in = 0;
    a_state->itrans_in = NULL;
    a_state->trans_in = NULL;
    break;
  case START_MOTIF_STATE :
    // Transitions come from any motif or from the start state.
    a_state->ntrans_in = nmotifs + 1;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int) * (nmotifs + 1));
    a_state->trans_in = allocate_array(nmotifs + 1);
    for (j_motif = 0; j_motif < nmotifs + 1; j_motif++) {
      a_state->itrans_in[j_motif]
        = spacer_index(j_motif, i_motif + 1, TRUE, nmotifs, spacer_states);
      set_array_item(j_motif, 
                     get_matrix_cell(j_motif, i_motif + 1, transp_freq), 
                     a_state->trans_in);
    }
    break;
  case END_STATE :
    // Transitions come from any motif.
    a_state->ntrans_in = nmotifs;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int) * nmotifs);
    a_state->trans_in = allocate_array(nmotifs);
    for (j_motif = 0; j_motif < nmotifs; j_motif++) {
      a_state->itrans_in[j_motif] = spacer_index(j_motif + 1,
                                                 nmotifs + 1, TRUE,
                                                 nmotifs, spacer_states);
      set_array_item(j_motif, 
                     get_matrix_cell(j_motif + 1, nmotifs + 1, transp_freq), 
                     a_state->trans_in);
    }
    break;
  case MID_MOTIF_STATE :
  case END_MOTIF_STATE :
    // The only way in is from the previous position of the same motif.
    a_state->ntrans_in = 1;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int));
    a_state->itrans_in[0] = i_state - 1;
    a_state->trans_in = allocate_array(1);
    set_array_item(0, 1.0, a_state->trans_in);
    break;
  case SPACER_STATE :
    a_state->ntrans_in = 2;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int) * 2);
    a_state->trans_in = allocate_array(2);
    // For multi-state spacers, incoming transition from previous state.
    if (i_position != 0)
      a_state->itrans_in[0] = i_state - 1;
    else 
      a_state->itrans_in[0] = motif_index(prev_motif, TRUE, num_spacers,
                                          spacer_states, motifs, nmotifs);
    // The other transition is a self-transition.
    a_state->itrans_in[1] = i_state;
    // NOTE(review): the inbound probabilities divide expected_length by
    // spacer_states (integer division) while the outbound ones below do
    // not -- confirm this asymmetry is intended.
    set_array_item(0, 1.0 - self_trans(expected_length / spacer_states),
                   a_state->trans_in);
    set_array_item(1, self_trans(expected_length / spacer_states),
                   a_state->trans_in);
    break;
  default:
    die("Illegal state!");
  }

  // Then set up the transitions out of this state.
  switch (state_type) {
  case START_STATE :
    // Transitions go to each motif.
    a_state->ntrans_out = nmotifs;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int) * nmotifs);
    a_state->trans_out = allocate_array(nmotifs);
    for (j_motif = 0; j_motif < nmotifs; j_motif++) {
      a_state->itrans_out[j_motif] = spacer_index(0, j_motif + 1, FALSE,
                                                  nmotifs, spacer_states);
      set_array_item(j_motif,
                     get_matrix_cell(0, j_motif + 1, transp_freq),
                     a_state->trans_out);
    }
    break;
  case END_MOTIF_STATE :
    // Can go to any other motif or to the end state.
    a_state->ntrans_out = nmotifs + 1;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int) * (nmotifs + 1));
    a_state->trans_out = allocate_array(nmotifs + 1);
    for (j_motif = 0; j_motif < nmotifs + 1; j_motif++) {
      a_state->itrans_out[j_motif] = spacer_index(i_motif + 1,
                                                  j_motif + 1, FALSE,
                                                  nmotifs, spacer_states);
      set_array_item(j_motif,
                     get_matrix_cell(i_motif + 1, j_motif + 1, transp_freq),
                     a_state->trans_out);
    }
    break;
  case START_MOTIF_STATE :
  case MID_MOTIF_STATE :
    // The only way out is to the next position of the same motif.
    a_state->ntrans_out = 1;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int));
    a_state->itrans_out[0] = i_state + 1;
    a_state->trans_out = allocate_array(1);
    set_array_item(0, 1.0, a_state->trans_out);
    break;
  case SPACER_STATE :
    a_state->ntrans_out = 2;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int) * 2);
    a_state->trans_out = allocate_array(2);
    // The first transition is a self-transition.
    a_state->itrans_out[0] = i_state;
    // For multi-state spacers, outgoing transition to next state.
    if (i_position < spacer_states - 1)
      a_state->itrans_out[1] = i_state + 1;
    else 
      a_state->itrans_out[1] = motif_index(next_motif, FALSE, num_spacers,
                                           spacer_states, motifs, nmotifs);
    set_array_item(0, self_trans(expected_length), a_state->trans_out);
    set_array_item(1, 1.0 - self_trans(expected_length), a_state->trans_out);
    break;
  case END_STATE :
    a_state->ntrans_out = 0;
    a_state->itrans_out = NULL;
    a_state->trans_out = NULL;
    break;
  default:
    die("Illegal state!");
  }
}
Esempio n. 7
0
/*************************************************************************
 * Build a star topology HMM.
 *
 * The model has a begin state, one spacer made of spacer_states states,
 * the states of every motif in order, and an end state. Spacer states
 * emit from the background distribution. When fim is true the spacers
 * are converted to FIMs before the transition matrix is filled in.
 * The newly allocated model is returned through the_hmm.
 *************************************************************************/
void build_star_hmm
  (ARRAY_T*  background,
   int       spacer_states, 
   MOTIF_T*  motifs,
   int       nmotifs,
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  ALPH_T  alph;
  MHMM_T *hmm;                // Shorthand for *the_hmm once allocated.
  int     total_width = 0;    // Sum of all motif lengths.
  int     star_spacers = 1;   // A star model has exactly one spacer.
  int     total_states;       // Number of states in the whole model.
  int     m;                  // Motif counter.
  int     pos;                // Position inside the current motif or spacer.
  int     next_state = 0;     // Index of the state about to be built.

  alph = get_motif_alph(motif_at(motifs, 0));

  // Add up the motif widths.
  for (m = 0; m < nmotifs; m++) {
    total_width += get_motif_length(motif_at(motifs, m));
  }

  // Motif states, plus the spacer's states, plus begin and end.
  total_states = total_width + (star_spacers * spacer_states) + 2;

  // Allocate the model and record its shape.
  *the_hmm = allocate_mhmm(alph, total_states);
  hmm = *the_hmm;
  hmm->type = STAR_HMM;
  hmm->num_motifs = nmotifs;
  hmm->num_states = total_states;
  hmm->num_spacers = 1;
  hmm->spacer_states = spacer_states;

  // Store the background distribution in the model.
  copy_array(background, hmm->background);

  // Begin state.
  build_star_state(
    alph,
    START_STATE,
    next_state,
    0, // expected length
    NULL,
    0, // Number of sites.
    NON_MOTIF_INDEX,
    NON_MOTIF_POSITION,
    nmotifs,
    spacer_states,
    motifs,
    &(hmm->states[next_state])
  );
  next_state++;

  // The single spacer, possibly made of several states.
  pos = 0;
  while (pos < spacer_states) {
    build_star_state(
      alph,
      SPACER_STATE,
      next_state,
      DEFAULT_SPACER_LENGTH,
      background,
      SPACER_NUMSITES,
      NON_MOTIF_INDEX,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next_state])
    );
    next_state++;
    pos++;
  }

  // Motif states: one start state, the interior states, one end state.
  for (m = 0; m < nmotifs; m++) {
    MOTIF_T *this_motif = motif_at(motifs, m);
    int width;

    assert(get_motif_length(this_motif) > 1);
    width = get_motif_length(this_motif);

    // First position of the motif.
    pos = 0;
    build_star_state(
      alph,
      START_MOTIF_STATE,
      next_state,
      0, // Expected spacer length.
      get_matrix_row(pos, get_motif_freqs(this_motif)),
      get_motif_nsites(this_motif),
      m,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next_state])
    );
    next_state++;

    // Interior positions.
    pos = 1;
    while (pos < width - 1) {
      build_star_state(
        alph,
        MID_MOTIF_STATE,
        next_state,
        0, // Expected spacer length.
        get_matrix_row(pos, get_motif_freqs(this_motif)),
        get_motif_nsites(this_motif),
        m,
        pos,
        nmotifs,
        spacer_states,
        motifs,
        &(hmm->states[next_state])
      );
      next_state++;
      pos++;
    }

    // Last position; pos was left here by the loop above.
    build_star_state(
      alph,
      END_MOTIF_STATE,
      next_state,
      0, // Expected spacer length.
      get_matrix_row(pos, get_motif_freqs(this_motif)),
      get_motif_nsites(this_motif),
      m,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next_state])
    );
    next_state++;
  }

  // End state.
  build_star_state(
    alph,
    END_STATE,
    next_state,
    0, // Expected spacer length.
    NULL, // Emissions
    0, // Number of sites.
    NON_MOTIF_INDEX,
    NON_MOTIF_POSITION,
    nmotifs,
    spacer_states,
    motifs,
    &(hmm->states[next_state])
  );
  next_state++;

  // Optionally turn the spacers into FIMs.
  if (fim) {
    convert_to_fims(hmm);
  }

  // Fill in the transition matrix.
  build_transition_matrix(hmm);
} // build_star_hmm
Esempio n. 8
0
/*************************************************************************
 * Set up one state in a star HMM, given the appropriate data.
 *
 * Fills in the state's type, motif width, emission arrays, bookkeeping
 * fields, motif id, and the index and probability arrays for its
 * incoming and outgoing transitions. In a star model every motif is
 * entered from, and returns to, the single spacer.
 *
 * NOTE(review): state_type is declared int and i_state STATE_T, the
 * reverse of build_complete_state. The types are left as they are
 * because any separate prototype is not visible from here.
 *************************************************************************/
static void build_star_state (
   ALPH_T    alph,            // Type of alphabet
   int       state_type,      // Type of state (START, SPACER,..)
   STATE_T   i_state,         // State index.
   int       expected_length, /* For spacers, the expected length of output. */
   ARRAY_T*  freqs,           // Emission probability distrib.
   double    num_sites,       // Number of sites for this emission.
   int       i_motif,         // Index of motif this state is in.
   int       i_position,      // Position of this state within motif
   int       nmotifs,         // Total number of motifs.
   int       spacer_states,   // Number of HMM states per spacer.
   MOTIF_T*  motifs,          // Motifs.
   MHMM_STATE_T* a_state
)     // State to be filled in (pre-allocated).
{
  int j_motif;          // Index of the current motif.
  int num_spacers = 1;  // Total number of spacers in HMM.
  double in_p;          // Probability of transition into a state 

  // Size of the alphabet, including ambiguity codes.
  int full_alph_size = alph_size(alph, ALL_SIZE);

  MOTIF_T *motif = NULL;
  if (i_motif != NON_MOTIF_INDEX) {
    motif = motif_at(motifs, i_motif);
  }

  // Tell the user what's up.
  if (verbosity >= NORMAL_VERBOSE) {
    switch (state_type) {
    case START_STATE :
      fprintf(stderr, "Building HMM: (0) ");
      break;
    case SPACER_STATE :
      fprintf(stderr, "%d ", i_state);
      break;
    case END_MOTIF_STATE :
      fprintf(stderr, "%d | ", i_state);
      break;
    case START_MOTIF_STATE :
    case MID_MOTIF_STATE :
      fprintf(stderr, "%d-", i_state);
      break;
    case END_STATE :
      fprintf(stderr, "(%d)\n", i_state);
      break;
    default:
      // Unknown types print nothing (unchanged behaviour).
      break;
    }
  }

  // Record what type of state this is.
  a_state->type = state_type;

  // Record the motif width if this is a motif.
  if (state_type == START_MOTIF_STATE ||
      state_type == MID_MOTIF_STATE ||
      state_type == END_MOTIF_STATE) {
    a_state->w_motif = get_motif_length(motif);
  } else {
    a_state->w_motif = 1;
  }

  // Set up the emission distribution and a few other tidbits.
  a_state->emit = allocate_array(full_alph_size);
  a_state->emit_odds = allocate_array(full_alph_size);
  if (freqs != NULL) { // Start and end states have no emissions.
    copy_array(freqs, a_state->emit);
  }

  a_state->num_sites = num_sites;
  a_state->i_motif = i_motif;
  a_state->i_position = i_position;

  // Record the motif ID character at this position.
  if ((state_type == START_STATE) ||
      (state_type == END_STATE) ||
      (state_type == SPACER_STATE)) {
    a_state->id_char = NON_MOTIF_ID_CHAR;
    strcpy(a_state->motif_id, NON_MOTIF_ID);
  } else {
    // NOTE(review): unbounded copy; build_complete_state limits the same
    // copy to MAX_MOTIF_ID_LENGTH + 2 bytes -- confirm ids always fit.
    strcpy(a_state->motif_id, get_full_motif_id(motif));
    a_state->id_char = get_motif_id_char(i_position, motif);
  }
  assert(a_state->id_char != '\0');

  // First set up the transitions into this state.
  switch (state_type) {
  case START_STATE :
    a_state->ntrans_in = 0;
    a_state->itrans_in = NULL;
    a_state->trans_in = NULL;
    break;
  case END_STATE :
  case START_MOTIF_STATE :
    // Transitions come from spacer state.
    a_state->ntrans_in = 1;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int));
    a_state->trans_in = allocate_array(1);
    a_state->itrans_in[0] = SPACER_INDEX;
    // Distribute non-self loop probability evenly among motifs and end state.
    in_p = (1 - self_trans(expected_length / spacer_states))/(nmotifs+1);
    set_array_item(0, in_p, a_state->trans_in);
    break;
  case MID_MOTIF_STATE :
  case END_MOTIF_STATE :
    // Transitions come from previous state.
    a_state->ntrans_in = 1;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int));
    a_state->itrans_in[0] = i_state - 1;
    a_state->trans_in = allocate_array(1);
    set_array_item(0, 1.0, a_state->trans_in);
    break;
  case SPACER_STATE :
    // Transitions come from start and each motif except for internal
    // multi-states.
    a_state->ntrans_in = (i_position != 0) ? 2 : nmotifs + 2;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int) * a_state->ntrans_in);
    a_state->trans_in = allocate_array(a_state->ntrans_in);

    // First transition is a self-transition. 
    a_state->itrans_in[0] = i_state;
    set_array_item(
      0, 
      self_trans(expected_length / spacer_states), 
      a_state->trans_in
    );

    // Next the transitions from all the motifs (or the previous spacer).
    if (i_position != 0) {
      a_state->itrans_in[1] = i_state - 1;
      set_array_item(1, 1.0 - self_trans(expected_length / spacer_states),
                     a_state->trans_in);
    } else {
      // From start state. The start state's only outgoing transition goes
      // to the spacer with probability 1.0 (see the outgoing switch below),
      // so record the same value on the inbound side instead of leaving
      // this slot at whatever allocate_array() put there.
      a_state->itrans_in[1] = START_INDEX;
      set_array_item(1, 1.0, a_state->trans_in);
      // From each motif.
      for (j_motif = 0; j_motif < nmotifs; j_motif++) {
        a_state->itrans_in[j_motif+2] = 
          motif_index(
            j_motif+1, 
            TRUE, 
            num_spacers, 
            spacer_states, 
            motifs, 
            nmotifs
          );
        set_array_item(j_motif+2, 1.0, a_state->trans_in);
      }
    }
    break;
  default:
    // Unknown types leave the inbound fields untouched (unchanged behaviour).
    break;
  }

  // Then set up the transitions out of this state.
  switch (state_type) {
  case START_STATE :
  case END_MOTIF_STATE :
    // Transition goes to spacer. 
    a_state->ntrans_out = 1;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int));
    a_state->trans_out = allocate_array(1);
    a_state->itrans_out[0] = SPACER_INDEX;
    set_array_item(0, 1.0, a_state->trans_out);
    break;
  case START_MOTIF_STATE :
  case MID_MOTIF_STATE :
    // Transition goes to the next position of the same motif.
    a_state->ntrans_out = 1;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int));
    a_state->itrans_out[0] = i_state + 1;
    a_state->trans_out = allocate_array(1);
    set_array_item(0, 1.0, a_state->trans_out);
    break;
  case SPACER_STATE :
    // Transitions go to self, motifs and end (except for beginning
    // multi-state spacers)
    a_state->ntrans_out = (i_position < spacer_states -1 ) ? 2 : nmotifs + 2;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int) * a_state->ntrans_out);
    a_state->trans_out = allocate_array(a_state->ntrans_out);

    // The first transition is a self-transition.
    a_state->itrans_out[0] = i_state;
    set_array_item(0, self_trans(expected_length), a_state->trans_out);

    // For multi-state spacers, outgoing transition to next state.
    if (i_position < spacer_states - 1) {
      a_state->itrans_out[1] = i_state + 1;
      set_array_item(1, 1-self_trans(expected_length), a_state->trans_out);
    } else {
      // Split what is left after the self-loop evenly between the motifs
      // and the end state.
      double out_p = (1 - self_trans(expected_length))/(nmotifs+1);
      // Out to each motif start.
      for (j_motif = 0; j_motif < nmotifs; j_motif++) {
        a_state->itrans_out[j_motif+1] = 
          motif_index(
            j_motif+1,
            FALSE,
            num_spacers,
            spacer_states,
            motifs,
            nmotifs
          );
        set_array_item(j_motif+1, out_p, a_state->trans_out);
      }
      // Out to end state, which directly follows the last motif's last state.
      a_state->itrans_out[j_motif+1] = 
        motif_index(nmotifs, TRUE, num_spacers, spacer_states, motifs, nmotifs) + 1;
      set_array_item(j_motif+1, out_p, a_state->trans_out);
    }
    break;
  case END_STATE :
    a_state->ntrans_out = 0;
    a_state->itrans_out = NULL;
    a_state->trans_out = NULL;
    break;
  default:
    // Unknown types leave the outbound fields untouched (unchanged behaviour).
    break;
  }
} // build_star_state
Esempio n. 9
0
File: ramen.c Progetto: CPFL/gmeme
/*
 * Run the linear-regression test for one motif.
 *
 * Regresses the per-sequence fluorescence scores (seq_fscores) against the
 * motif's per-sequence PWM scores (results[motif_num]), optionally taking
 * logs of either and optionally swapping the two axes, and returns a newly
 * allocated result holding the motif id, slope, intercept and MSE. When
 * args.repeats > 0, p is the fraction of shuffled repeats whose MSE was no
 * larger than the real one; otherwise p is -1.
 *
 * It's not self-contained, as it requires to hook into the global variables
 * args, results, motifs, seq_ids and seq_fscores.
 *
 * The caller owns the returned structure and its motif_id string.
 *
 * NOTE(review): the results of the malloc()/strdup() calls for r, x, y and
 * r->motif_id are used unchecked, as in the original code.
 */
ramen_result_t* ramen_do_linreg_test(int motif_num) {
  //Assorted vars
  int seq_num;
  int j;
  int motif_index = motif_num * 2; //This is a workaround to the change in the motif datastructure where it now
                      // goes +MOTIFA -MOTIFA +MOTIFB etc. rather than all + then all - motifs.

  //Vars for the regression
  double* x;
  double* y;
  double m = 0;
  double b = 0;
  double mse = 0;

  //Vars for scoring
  ramen_result_t* r;

  //Allocate memory or set initial values
  seq_num = get_num_strings(seq_ids); //number of sequences
  r = malloc(sizeof(ramen_result_t)); //allocate space, as a ptr to this will go in the array later
                    //that's why we don't free it in this loop.
  x = malloc(sizeof(double)*seq_num);
  y = malloc(sizeof(double)*seq_num);

  //Now we need to copy the scores into two double arrays
  //Use LOG macro so that log(0) 'works'
  for (j=0; j < seq_num; j++) {
    if (args.log_fscores == TRUE) {
      y[j] = LOG(get_array_item(j, seq_fscores));
    } else {
      y[j] = get_array_item(j, seq_fscores);
    }

    if (args.log_pwmscores == TRUE) {
      x[j] = LOG(results[motif_num][j]);
    } else {
      x[j] = results[motif_num][j];
    }
  }

  //Switch x&y if they're to be switched
  if (args.linreg_switchxy) {
    SWAP(double*, x, y);
  }

  // Optionally dump the (x, y) pairs to <dump_dir>/<motif id>.tsv.
  // TODO: Tidy and/or remove this for production
  if (args.linreg_dump_dir != NULL) {
    const char *dump_id = get_motif_id(motif_at(motifs.motifs, motif_index));
    // Directory + '/' + id + ".tsv" + terminating NUL.
    size_t filename_size = strlen(args.linreg_dump_dir) + strlen(dump_id) + sizeof("/.tsv");
    char *filename = malloc(filename_size);
    FILE *fh = NULL;

    if (filename != NULL) {
      snprintf(filename, filename_size, "%s/%s.tsv", args.linreg_dump_dir, dump_id);
      fh = fopen(filename, "w");
    }

    if (fh == NULL) {
      // The dump is a debugging aid; report the problem and carry on.
      fprintf(stderr, "Couldn't write the linreg dump for motif %s.\n", dump_id);
    } else {
      fputs("PWM_Score\tFluorescence_Score\n", fh);
      for (j=0; j < seq_num; j++) {
        fprintf(fh, "%.10e %.10e\n", x[j], y[j]);
      }
      fclose(fh);
    }
    free(filename);
  }

  // regress(n, x, y, &slope, &intercept); the return value is used as the MSE.
  mse = regress(seq_num, x, y, &m, &b);

  if (args.verbose >= 3) {
    printf("LinReg MSE of motif %s on %i seqs: %.4g (m: %.4g b: %.4g)\n",
           get_motif_id(motif_at(motifs.motifs, motif_index)), seq_num, mse, m, b);
  }

  //Record the fit for this motif
  r->motif_id = strdup(get_motif_id(motif_at(motifs.motifs, motif_index)));
  r->m = m; //Not p-values, but they'll do when we re-use this structure...
  r->b = b;
  r->mse = mse;
  r->p = -1;

  //Do stochastic sampling if required.
  if (args.repeats > 0) {
    int repeat_wins = 0;
    for (j=0;j<args.repeats;j++) {
      double repeat_mse = 0;
      shuffle(x,seq_num); //Shuffle and break the associations between x and y
      // m and b are overwritten here; the real fit is already stored in r.
      repeat_mse = regress(seq_num, x, y, &m, &b);
      if (repeat_mse <= mse) {
        repeat_wins++;
      }
    }
    r->p = repeat_wins*1.0/ args.repeats*1.0;
  }
  free(x);
  free(y);

  return r;
}
Esempio n. 10
0
File: ramen.c Progetto: CPFL/gmeme
void ramen_scan_sequences() {
		FILE* seq_file = NULL;
		MOTIF_T* motif = NULL;
		MOTIF_T* rev_motif = NULL;
		SEQ_T* sequence = NULL;
		SCANNED_SEQUENCE_T* scanned_seq = NULL;
		PATTERN_T* pattern;
		int i;
		int j;
		SEQ_T** seq_list;
		int num_seqs;
		int seq_len;
		//For the bdb_bg mode:
		ARRAY_T* seq_bg_freqs;
		double atcontent;
		double roundatcontent;
		double avg_seq_length = 0;

		//Open the file.
		if (open_file(args.sequence_filename, "r", FALSE, "FASTA", "sequences", &seq_file) == 0) {
				fprintf(stderr, "Couldn't open the file %s.\n", args.sequence_filename);
				ramen_terminate(1);
		}

		//Start reading in the sequences
		read_many_fastas(ramen_alph, seq_file, MAX_SEQ_LENGTH, &num_seqs, &seq_list);


		seq_ids = new_string_list();
		seq_fscores = allocate_array(num_seqs);

		//Allocate the required space for results
		results = malloc(sizeof(double*) * motifs.num);
		for (i=0;i<motifs.num;i++) {
				results[i] = malloc(sizeof(double)*num_seqs);
		}

		for (j=0;j<num_seqs;j++) {

				fprintf(stderr, "\rScanning %i of %i sequences...", j+1, num_seqs);

				//copy the pointer into our current object for clarity
				sequence = seq_list[j];

				//Read the fluorescence data from the description field.
				add_string(get_seq_name(sequence),seq_ids);
				seq_len = get_seq_length(sequence);
				set_array_item(j,atof(get_seq_description(sequence)),seq_fscores);

				//Scan with each motif.
				for (i=0;i<motifs.num;i++) {
						int motifindex = i*2;

						results[i][j] = ramen_sequence_scan(sequence, motif_at(motifs.motifs, motifindex), 
											      motif_at(motifs.motifs, motifindex+1),
											      NULL, NULL, //No need to pass PSSM.
										              AVG_ODDS, 0, TRUE, 0, motifs.bg_freqs);

						if (TRUE == args.linreg_normalise) {
								int k;
								double maxscore = 1;
								motif = motif_at(motifs.motifs,motifindex); 
								for (k=0;k<get_motif_length(motif);k++) {
										double maxprob = 0;
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif));
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif));
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif));
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif));
										maxscore *= maxprob;
								}
								results[i][j] /= maxscore;
						}
				}
		}

}