/************************************************************************* * Find the index of the starting or ending state of a given motif in * a given HMM. * 0 = START_STATE and nmotifs+1 = END_STATE *************************************************************************/ static int motif_index (const int motif_num, const BOOLEAN_T start_or_end, const int num_spacers, const int spacer_states, const MOTIF_T* motifs, const int nmotifs) { int i_motif; int return_value; assert(motif_num >= 0); assert(motif_num <= (nmotifs + 1)); if (motif_num == 0) return START_INDEX; // Skip the spacer states. return_value = (num_spacers * spacer_states) + 1; // Add the lengths of the preceding motifs. for (i_motif = 0; i_motif < motif_num - 1; i_motif++) return_value += get_motif_length(motif_at((MOTIF_T*)motifs, i_motif)); // If we're looking for the end of this motif, add its length as well. // unless it is the end state we're after which has only one state if (start_or_end && motif_num != (nmotifs + 1)) return_value += get_motif_length(motif_at((MOTIF_T*)motifs, i_motif)) - 1; // fprintf(stderr, "Motif %d -> %d\n", motif_num, return_value); return(return_value); }
/******************************************************************************
 * Print one HTML table row (<tr>) per motif to `output`.
 *
 * Each row holds the motif id, its width, the colored best possible match
 * and the colored best possible match of its reverse complement.  Two
 * consecutive motifs with the same id are treated as a forward/reverse
 * pair and share one row.  A motif without a reverse-complement partner
 * gets an empty last cell.
 *
 * output      Stream to write to.
 * motifs      Motif array, read via motif_at().
 * num_motifs  Number of motifs in the array.
 *****************************************************************************/
void mcast_print_motif_list(FILE * output, MOTIF_T* motifs, int num_motifs) {
  const char *indent = " ";
  int i;

  fputs("\n", output);

  for (i = 0; i < num_motifs; i++) {
    MOTIF_T *motif = motif_at(motifs, i);
    MOTIF_T *rc_motif = NULL;
    char *motif_id = get_motif_id(motif);
    char *rc_motif_id = NULL;
    int width = get_motif_length(motif);

    // Peek at the next motif; it may be the reverse complement of this one.
    if (i < (num_motifs - 1)) {
      rc_motif = motif_at(motifs, i + 1);
      rc_motif_id = get_motif_id(rc_motif);
    }

    char *best_possible_match = get_best_possible_match(motif);
    char *colored_best_possible_match
      = color_dna_sequence(best_possible_match);
    char *best_possible_rc_match = NULL;
    char *colored_best_possible_rc_match = NULL;

    if (rc_motif_id && strcmp(motif_id, rc_motif_id) == 0) {
      // A pair of identical motif ids indicates a forward/reverse pair;
      // consume the partner so it does not get a row of its own.
      ++i;
      best_possible_rc_match = get_best_possible_match(rc_motif);
      colored_best_possible_rc_match
        = color_dna_sequence(best_possible_rc_match);
    }

    fprintf(output, "%s<tr>\n", indent);
    fprintf(output, "%s<td>%s</td>\n", indent, motif_id);
    fprintf(output, "%s<td>%d</td>\n", indent, width);
    fprintf(output, "%s<td class=\"sequence\">%s</td>\n", indent,
            colored_best_possible_match);
    // Passing NULL to %s is undefined behaviour, so an unpaired motif
    // prints an empty cell.
    fprintf(output, "%s<td class=\"sequence\">%s</td>\n", indent,
            colored_best_possible_rc_match != NULL
              ? colored_best_possible_rc_match
              : "");
    fprintf(output, "%s</tr>\n", indent);

    myfree(best_possible_match);
    myfree(best_possible_rc_match);
    myfree(colored_best_possible_match);
    myfree(colored_best_possible_rc_match);
  }
}
/******************************************************************************
 * Emit JavaScript statements that fill the `motifs` array.
 *
 * One `motifs["id"] = new Motif(...)` line is written per motif.  When two
 * neighbouring motifs carry the same id they are taken to be a
 * forward/reverse pair: a single line is written holding both best possible
 * matches.  Otherwise the reverse-complement argument is an empty string.
 *
 * output      Stream to write to.
 * motifs      Motif array, read via motif_at().
 * num_motifs  Number of motifs in the array.
 *****************************************************************************/
void mcast_print_motif_array(FILE *output, MOTIF_T *motifs, int num_motifs) {
  int idx = 0;

  fputs("\n", output);

  while (idx < num_motifs) {
    MOTIF_T *fwd = motif_at(motifs, idx);
    MOTIF_T *partner = NULL;
    char *fwd_id = get_motif_id(fwd);
    char *partner_id = NULL;
    char *fwd_match = NULL;
    char *rc_match = NULL;
    const char *rc_text = "";
    int is_pair = 0;

    // Look ahead one motif, if there is one.
    if (idx < (num_motifs - 1)) {
      partner = motif_at(motifs, idx + 1);
      partner_id = get_motif_id(partner);
    }

    fwd_match = get_best_possible_match(fwd);

    if (partner_id && strcmp(fwd_id, partner_id) == 0) {
      // Matching ids mark a forward/reverse pair; skip over the partner.
      is_pair = 1;
      ++idx;
      rc_match = get_best_possible_match(partner);
      rc_text = rc_match;
    }

    fprintf(
      output,
      " motifs[\"%s\"] = new Motif(\"%s\", \"nucleotide\", \"%s\", \"%s\");\n",
      fwd_id,
      fwd_id,
      fwd_match,
      rc_text
    );

    if (is_pair) {
      myfree(rc_match);
    }
    myfree(fwd_match);

    ++idx;
  }
};
void ramen_load_motifs() { BOOLEAN_T read_file = FALSE; MREAD_T *mread; ARRAYLST_T* read_motifs; int num_motifs_before_rc; int i; int j; memset(&motifs, 0, sizeof(ramen_motifs_t)); read_motifs = arraylst_create(); for (i = 0; i < args.number_motif_files; i++) { mread = mread_create(args.motif_filenames[i], OPEN_MFILE); if (args.bg_format == FILE_BG) { mread_set_bg_source(mread, args.bg_filename); } else { mread_set_background(mread, motifs.bg_freqs); } mread_set_pseudocount(mread, args.pseudocount); mread_load(mread, read_motifs); if (!(motifs.bg_freqs)) motifs.bg_freqs = mread_get_background(mread); mread_destroy(mread); } // reverse complement the originals adding to the original read in list num_motifs_before_rc = arraylst_size(read_motifs); add_reverse_complements(read_motifs); motifs.num = arraylst_size(read_motifs); //Allocate array for the motifs motif_list_to_array(read_motifs, &(motifs.motifs), &(motifs.num)); //free the list of motifs free_motifs(read_motifs); // check reverse complements. assert(motifs.num / 2 == num_motifs_before_rc); // reset motif count to before rev comp motifs.num = num_motifs_before_rc; //Now, we need to convert the motifs into odds matrices if we're doing that kind of scoring for (i=0;i<2*motifs.num;i++) { convert_to_odds_matrix(motif_at(motifs.motifs, i), motifs.bg_freqs); } }
/************************************************************************* * Build a completely connected HMM. *************************************************************************/ void build_complete_hmm (ARRAY_T* background, int spacer_states, MOTIF_T *motifs, int nmotifs, MATRIX_T *transp_freq, MATRIX_T *spacer_ave, BOOLEAN_T fim, MHMM_T **the_hmm) { ALPH_T alph; int motif_states; // Total length of the motifs. int num_spacers; // Total number of spacer states. int num_states; // Total number of states in the model. int i_motif; // Index of the current "from" motif. int j_motif; // Index of the current "to" motif. int i_position; // Index within the current motif or spacer. int i_state = 0; // Index of the current state. assert(nmotifs > 0); alph = get_motif_alph(motifs);// get the alphabet from the first motif // Count the width of the motifs. for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++) motif_states += get_motif_length(motif_at(motifs, i_motif)); // Count the spacer states adjacent to begin and end. num_spacers = nmotifs * 2; // Add the spacer states between motifs. num_spacers += nmotifs * nmotifs; // Total states = motifs + spacer_states + begin/end num_states = motif_states + (num_spacers * spacer_states) + 2; // Allocate the model. *the_hmm = allocate_mhmm(alph, num_states); // Record that this is a completely connected model. (*the_hmm)->type = COMPLETE_HMM; // Record the number of motifs in the model. (*the_hmm)->num_motifs = nmotifs; // Record the number of states in the model. (*the_hmm)->num_states = num_states; (*the_hmm)->num_spacers = ((nmotifs + 1) * (nmotifs + 1)) - 1; (*the_hmm)->spacer_states = spacer_states; // Put the background distribution into the model. copy_array(background, (*the_hmm)->background); // Build the begin state. build_complete_state( START_STATE, i_state, alph, 0, // expected length NULL, // Emissions. 0, // Number of sites. 
NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, 0, // previous motif 0, // next motif transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); i_state++; int from_motif_state, to_motif_state; // Build the spacer states. No transitions from the end state. for (i_motif = 0; i_motif <= nmotifs; i_motif++) { // No transitions to the start state. for (j_motif = 1; j_motif <= nmotifs+1; j_motif++) { // No transitions from start to end. if ((i_motif == 0) && (j_motif == nmotifs+1)) continue; // Allow multi-state spacers. for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { build_complete_state( SPACER_STATE, i_state, alph, get_matrix_cell(i_motif, j_motif, spacer_ave), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, nmotifs, i_motif, j_motif, transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); } } } // Build the motif states. for (i_motif = 0; i_motif < nmotifs; i_motif++) { MOTIF_T *this_motif = motif_at(motifs, i_motif); STATE_T state; for (i_position = 0; i_position < get_motif_length(this_motif); i_position++, i_state++) { if (i_position == 0) { state = START_MOTIF_STATE; } else if (i_position == (get_motif_length(this_motif) - 1)) { state = END_MOTIF_STATE; } else { state = MID_MOTIF_STATE; } build_complete_state( MID_MOTIF_STATE, i_state, alph, 0, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(this_motif)), get_motif_nsites(this_motif), i_motif, i_position, nmotifs, 0, // Previous motif index. 0, // Next motif index. transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); } } // Build the end state. build_complete_state( END_STATE, i_state, alph, 0, // Expected spacer length. NULL, // Emissions 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, 0, // Previous motif index. 0, // Next motif index. 
transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); i_state++; // Convert spacers to FIMs if requested. if (fim) { convert_to_fims(*the_hmm); } // Fill in the transition matrix. build_transition_matrix(*the_hmm); }
/************************************************************************* * Set up one state in a complete HMM, given the appropriate data. *************************************************************************/ static void build_complete_state (STATE_T state_type, // Type of state (START, SPACER,..) int i_state, // State index. ALPH_T alph, // alphabet int expected_length, // For spacers, the expected length of output. ARRAY_T *freqs, // Emission probability distrib. double num_sites, // Number of sites for this emission. int i_motif, // Index of motif this state is in. int i_position, // Position of this state within motif int nmotifs, // Total number of motifs. int prev_motif, // Index of previous motif. int next_motif, // Index of next motif. MATRIX_T *transp_freq, // Transition freq matrix. int spacer_states, // Number of HMM states per spacer. int num_spacers, // Total number of spacers in HMM. MOTIF_T *motifs, // Motifs. MHMM_STATE_T *a_state) // State to be filled in (pre-allocated). { MOTIF_T *motif; // The motif (for motif state) int j_motif; // Index of the current motif. if (i_motif != NON_MOTIF_INDEX) motif = motif_at(motifs, i_motif); else motif = NULL; // Tell the user what's up. if (verbosity >= NORMAL_VERBOSE) { switch (state_type) { case START_STATE : fprintf(stderr, "Building HMM: (0) "); break; case SPACER_STATE : fprintf(stderr, "%d ", i_state); break; case END_MOTIF_STATE : fprintf(stderr, "%d | ", i_state); break; case START_MOTIF_STATE : case MID_MOTIF_STATE : fprintf(stderr, "%d-", i_state); break; case END_STATE : fprintf(stderr, "(%d)\n", i_state); break; default: die("Invalid state!"); } } // Record what type of state this is. a_state->type = state_type; // Record the motif width if this is a motif. 
if (state_type == START_MOTIF_STATE || state_type == MID_MOTIF_STATE || state_type == END_MOTIF_STATE) { a_state->w_motif = get_motif_length(motif); } else { a_state->w_motif = 1; } // Set up the emission distribution and a few other tidbits. if (freqs != NULL) { // Start and end states have no emissions. a_state->emit = allocate_array(alph_size(alph, ALL_SIZE)); copy_array(freqs, a_state->emit); } a_state->num_sites = num_sites; a_state->i_motif = i_motif; a_state->i_position = i_position; // Record the motif ID character at this position. if ((state_type == START_STATE) || (state_type == END_STATE) || (state_type == SPACER_STATE)) { a_state->id_char = NON_MOTIF_ID_CHAR; } else { // motif state strncpy(a_state->motif_id, get_full_motif_id(motif), MAX_MOTIF_ID_LENGTH + 2); a_state->id_char = get_motif_id_char(i_position, motif); } assert(a_state->id_char != '\0'); // First set up the transitions into this state. switch (state_type) { case START_STATE : a_state->ntrans_in = 0; a_state->itrans_in = NULL; a_state->trans_in = NULL; break; case START_MOTIF_STATE : // Transitions come from any motif or from the start state. a_state->ntrans_in = nmotifs + 1; a_state->itrans_in = (int *)mm_malloc(sizeof(int) * (nmotifs + 1)); a_state->trans_in = allocate_array(nmotifs + 1); for (j_motif = 0; j_motif < nmotifs + 1; j_motif++) { a_state->itrans_in[j_motif] = spacer_index(j_motif, i_motif + 1, TRUE, nmotifs, spacer_states); set_array_item(j_motif, get_matrix_cell(j_motif, i_motif + 1, transp_freq), a_state->trans_in); } break; case END_STATE : // Transitions come from any motif. 
a_state->ntrans_in = nmotifs; a_state->itrans_in = (int *)mm_malloc(sizeof(int) * nmotifs); a_state->trans_in = allocate_array(nmotifs); for (j_motif = 0; j_motif < nmotifs; j_motif++) { a_state->itrans_in[j_motif] = spacer_index(j_motif + 1, nmotifs + 1, TRUE, nmotifs, spacer_states); set_array_item(j_motif, get_matrix_cell(j_motif + 1, nmotifs + 1, transp_freq), a_state->trans_in); } break; case MID_MOTIF_STATE : case END_MOTIF_STATE : a_state->ntrans_in = 1; a_state->itrans_in = (int *)mm_malloc(sizeof(int)); a_state->itrans_in[0] = i_state - 1; a_state->trans_in = allocate_array(1); set_array_item(0, 1.0, a_state->trans_in); break; case SPACER_STATE : a_state->ntrans_in = 2; a_state->itrans_in = (int *)mm_malloc(sizeof(int) * 2); a_state->trans_in = allocate_array(2); // For multi-state spacers, incoming transition from previous state. if (i_position != 0) a_state->itrans_in[0] = i_state - 1; else a_state->itrans_in[0] = motif_index(prev_motif, TRUE, num_spacers, spacer_states, motifs, nmotifs); // The other transition is a self-transition. a_state->itrans_in[1] = i_state; set_array_item(0, 1.0 - self_trans(expected_length / spacer_states), a_state->trans_in); set_array_item(1, self_trans(expected_length / spacer_states), a_state->trans_in); break; default: die("Illegal state!"); } // Then set up the transitions out of this state. switch (state_type) { case START_STATE : // Transitions go to each motif. a_state->ntrans_out = nmotifs; a_state->itrans_out = (int *)mm_malloc(sizeof(int) * nmotifs); a_state->trans_out = allocate_array(nmotifs); for (j_motif = 0; j_motif < nmotifs; j_motif++) { a_state->itrans_out[j_motif] = spacer_index(0, j_motif + 1, FALSE, nmotifs, spacer_states); set_array_item(j_motif, get_matrix_cell(0, j_motif + 1, transp_freq), a_state->trans_out); } break; case END_MOTIF_STATE : // Can go to any other motif or to the end state. 
a_state->ntrans_out = nmotifs + 1; a_state->itrans_out = (int *)mm_malloc(sizeof(int) * (nmotifs + 1)); a_state->trans_out = allocate_array(nmotifs + 1); for (j_motif = 0; j_motif < nmotifs + 1; j_motif++) { a_state->itrans_out[j_motif] = spacer_index(i_motif + 1, j_motif + 1, FALSE, nmotifs, spacer_states); set_array_item(j_motif, get_matrix_cell(i_motif + 1, j_motif + 1, transp_freq), a_state->trans_out); } break; case START_MOTIF_STATE : case MID_MOTIF_STATE : a_state->ntrans_out = 1; a_state->itrans_out = (int *)mm_malloc(sizeof(int)); a_state->itrans_out[0] = i_state + 1; a_state->trans_out = allocate_array(1); set_array_item(0, 1.0, a_state->trans_out); break; case SPACER_STATE : a_state->ntrans_out = 2; a_state->itrans_out = (int *)mm_malloc(sizeof(int) * 2); a_state->trans_out = allocate_array(2); // The first transition is a self-transition. a_state->itrans_out[0] = i_state; // For multi-state spacers, outgoing transition to next state. if (i_position < spacer_states - 1) a_state->itrans_out[1] = i_state + 1; else a_state->itrans_out[1] = motif_index(next_motif, FALSE, num_spacers, spacer_states, motifs, nmotifs); set_array_item(0, self_trans(expected_length), a_state->trans_out); set_array_item(1, 1.0 - self_trans(expected_length), a_state->trans_out); break; case END_STATE : a_state->ntrans_out = 0; a_state->itrans_out = NULL; a_state->trans_out = NULL; break; default: die("Illegal state!"); } }
/*************************************************************************
 * Build a star topology HMM.
 *
 * The model has a single spacer made of spacer_states states.  States are
 * created in this order: start state, spacer states, the columns of each
 * motif in turn, end state.
 *
 * background     Background distribution; copied into the model and used
 *                as the emission distribution of the spacer states.
 * spacer_states  Number of HMM states in the spacer.
 * motifs         Motif array, read via motif_at().
 * nmotifs        Number of motifs in the array.
 * fim            If true, spacers are converted via convert_to_fims().
 * the_hmm        Output: receives the newly allocated model.
 *
 * Every motif is asserted to be longer than one column.
 *************************************************************************/
void build_star_hmm
  (ARRAY_T*  background,
   int       spacer_states,
   MOTIF_T*  motifs,
   int       nmotifs,
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  ALPH_T alph = get_motif_alph(motif_at(motifs, 0));
  MHMM_T *hmm;
  int total_width = 0;       /* Summed length of all motifs. */
  const int num_spacers = 1; /* A star model has exactly one spacer. */
  int num_states;            /* Number of states in the whole model. */
  int m;                     /* Motif counter. */
  int pos;                   /* Position within a motif or the spacer. */
  int next_state = 0;        /* Index of the state to build next. */

  /* Add up the motif widths. */
  for (m = 0; m < nmotifs; m++) {
    total_width += get_motif_length(motif_at(motifs, m));
  }

  /* Motif columns + spacer states + the begin and end states. */
  num_states = total_width + (num_spacers * spacer_states) + 2;

  /* Allocate the model and hand it back to the caller. */
  hmm = allocate_mhmm(alph, num_states);
  *the_hmm = hmm;

  /* Describe the model. */
  hmm->type = STAR_HMM;
  hmm->num_motifs = nmotifs;
  hmm->num_states = num_states;
  hmm->num_spacers = 1;
  hmm->spacer_states = spacer_states;

  /* Store the background distribution in the model. */
  copy_array(background, hmm->background);

  /* Begin state. */
  build_star_state(
    alph,
    START_STATE,
    next_state,
    0,    /* Expected length. */
    NULL, /* Emissions. */
    0,    /* Number of sites. */
    NON_MOTIF_INDEX,
    NON_MOTIF_POSITION,
    nmotifs,
    spacer_states,
    motifs,
    &(hmm->states[next_state])
  );
  next_state++;

  /* The spacer follows the start state; it may span several states. */
  pos = 0;
  while (pos < spacer_states) {
    build_star_state(
      alph,
      SPACER_STATE,
      next_state,
      DEFAULT_SPACER_LENGTH,
      background,
      SPACER_NUMSITES,
      NON_MOTIF_INDEX,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next_state])
    );
    next_state++;
    pos++;
  }

  /* Motif states: first column, interior columns, last column. */
  for (m = 0; m < nmotifs; m++) {
    MOTIF_T *this_motif = motif_at(motifs, m);

    assert(get_motif_length(this_motif) > 1);

    pos = 0;
    build_star_state(
      alph,
      START_MOTIF_STATE,
      next_state,
      0, /* Expected spacer length. */
      get_matrix_row(pos, get_motif_freqs(this_motif)),
      get_motif_nsites(this_motif),
      m,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next_state])
    );
    next_state++;

    for (pos = 1; pos < get_motif_length(this_motif) - 1; pos++) {
      build_star_state(
        alph,
        MID_MOTIF_STATE,
        next_state,
        0, /* Expected spacer length. */
        get_matrix_row(pos, get_motif_freqs(this_motif)),
        get_motif_nsites(this_motif),
        m,
        pos,
        nmotifs,
        spacer_states,
        motifs,
        &(hmm->states[next_state])
      );
      next_state++;
    }

    /* pos is left at the loop's exit value and names the last column. */
    build_star_state(
      alph,
      END_MOTIF_STATE,
      next_state,
      0, /* Expected spacer length. */
      get_matrix_row(pos, get_motif_freqs(this_motif)),
      get_motif_nsites(this_motif),
      m,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next_state])
    );
    next_state++;
  }

  /* End state. */
  build_star_state(
    alph,
    END_STATE,
    next_state,
    0,    /* Expected spacer length. */
    NULL, /* Emissions. */
    0,    /* Number of sites. */
    NON_MOTIF_INDEX,
    NON_MOTIF_POSITION,
    nmotifs,
    spacer_states,
    motifs,
    &(hmm->states[next_state])
  );
  next_state++;

  /* Convert spacers to FIMs if requested. */
  if (fim) {
    convert_to_fims(hmm);
  }

  /* Fill in the transition matrix. */
  build_transition_matrix(hmm);
} // build_star_hmm
/************************************************************************* * Set up one state in a star HMM, given the appropriate data. *************************************************************************/ static void build_star_state ( ALPH_T alph, // Type of alphabet int state_type, // Type of state (START, SPACER,..) STATE_T i_state, // State index. int expected_length, /* For spacers, the expected length of output. */ ARRAY_T* freqs, // Emission probability distrib. double num_sites, // Number of sites for this emission. int i_motif, // Index of motif this state is in. int i_position, // Position of this state within motif int nmotifs, // Total number of motifs. int spacer_states, // Number of HMM states per spacer. MOTIF_T* motifs, // Motifs. MHMM_STATE_T* a_state ) // State to be filled in (pre-allocated). { int j_motif; // Index of the current motif. int num_spacers = 1; // Total number of spacers in HMM. double in_p; // Probability of transition into a state // Size of the alphabet, including ambiguity codes. int full_alph_size = alph_size(alph, ALL_SIZE); MOTIF_T *motif = NULL; if (i_motif != NON_MOTIF_INDEX) { motif = motif_at(motifs, i_motif); } // Tell the user what's up. if (verbosity >= NORMAL_VERBOSE) { switch (state_type) { case START_STATE : fprintf(stderr, "Building HMM: (0) "); break; case SPACER_STATE : fprintf(stderr, "%d ", i_state); break; case END_MOTIF_STATE : fprintf(stderr, "%d | ", i_state); break; case START_MOTIF_STATE : case MID_MOTIF_STATE : fprintf(stderr, "%d-", i_state); break; case END_STATE : fprintf(stderr, "(%d)\n", i_state); break; } } // Record what type of state this is. a_state->type = state_type; // Record the motif width if this is a motif. if (state_type == START_MOTIF_STATE || state_type == MID_MOTIF_STATE || state_type == END_MOTIF_STATE) { a_state->w_motif = get_motif_length(motif); } else { a_state->w_motif = 1; } // Set up the emission distribution and a few other tidbits. 
a_state->emit = allocate_array(full_alph_size); a_state->emit_odds = allocate_array(full_alph_size); if (freqs != NULL) { // Start and end states have no emissions. copy_array(freqs, a_state->emit); } a_state->num_sites = num_sites; a_state->i_motif = i_motif; if (motif != NULL) { } a_state->i_position = i_position; // Record the motif ID character at this position. if ((state_type == START_STATE) || (state_type == END_STATE) || (state_type == SPACER_STATE)) { a_state->id_char = NON_MOTIF_ID_CHAR; strcpy(a_state->motif_id, NON_MOTIF_ID); } else { strcpy(a_state->motif_id, get_full_motif_id(motif)); a_state->id_char = get_motif_id_char(i_position, motif); } assert(a_state->id_char != '\0'); // First set up the transitions into this state. switch (state_type) { case START_STATE : a_state->ntrans_in = 0; a_state->itrans_in = NULL; a_state->trans_in = NULL; break; case END_STATE : case START_MOTIF_STATE : // Transitions come from spacer state. a_state->ntrans_in = 1; a_state->itrans_in = (int *)mm_malloc(sizeof(int)); a_state->trans_in = allocate_array(1); a_state->itrans_in[0] = SPACER_INDEX; // Distribute non-self loop probability evenly among motifs and end state. in_p = (1 - self_trans(expected_length / spacer_states))/(nmotifs+1); set_array_item(0, in_p, a_state->trans_in); break; case MID_MOTIF_STATE : case END_MOTIF_STATE : // Transitions come from previous state. a_state->ntrans_in = 1; a_state->itrans_in = (int *)mm_malloc(sizeof(int)); a_state->itrans_in[0] = i_state - 1; a_state->trans_in = allocate_array(1); set_array_item(0, 1.0, a_state->trans_in); break; case SPACER_STATE : // Transitions come from start and each motif except for internal // multi-states. a_state->ntrans_in = (i_position != 0) ? 2 : nmotifs + 2; a_state->itrans_in = (int *)mm_malloc(sizeof(int) * a_state->ntrans_in); a_state->trans_in = allocate_array(a_state->ntrans_in); // First transition is a self-transition. 
a_state->itrans_in[0] = i_state; set_array_item( 0, self_trans(expected_length / spacer_states), a_state->trans_in ); // Next the transitions from all the motifs (or the previous spacer). if (i_position != 0) { a_state->itrans_in[1] = i_state - 1; set_array_item(1, 1.0 - self_trans(expected_length / spacer_states), a_state->trans_in); } else { a_state->itrans_in[1] = START_INDEX; // From start state. // From each motif. for (j_motif = 0; j_motif < nmotifs; j_motif++) { a_state->itrans_in[j_motif+2] = motif_index( j_motif+1, TRUE, num_spacers, spacer_states, motifs, nmotifs ); set_array_item(j_motif+2, 1.0, a_state->trans_in); } } break; } // Then set up the transitions out of this state. switch (state_type) { case START_STATE : case END_MOTIF_STATE : // Transition goes to spacer. a_state->ntrans_out = 1; a_state->itrans_out = (int *)mm_malloc(sizeof(int)); a_state->trans_out = allocate_array(1); a_state->itrans_out[0] = SPACER_INDEX; set_array_item(0, 1.0, a_state->trans_out); break; case START_MOTIF_STATE : case MID_MOTIF_STATE : a_state->ntrans_out = 1; a_state->itrans_out = (int *)mm_malloc(sizeof(int)); a_state->itrans_out[0] = i_state + 1; a_state->trans_out = allocate_array(1); set_array_item(0, 1.0, a_state->trans_out); break; case SPACER_STATE : // Transitions go to self, motifs and end (except for beginning // multi-state spacers) a_state->ntrans_out = (i_position < spacer_states -1 ) ? 2 : nmotifs + 2; a_state->itrans_out = (int *)mm_malloc(sizeof(int) * a_state->ntrans_out); a_state->trans_out = allocate_array(a_state->ntrans_out); // The first transition is a self-transition. a_state->itrans_out[0] = i_state; set_array_item(0, self_trans(expected_length), a_state->trans_out); // For multi-state spacers, outgoing transition to next state. 
if (i_position < spacer_states - 1) { a_state->itrans_out[1] = i_state + 1; set_array_item(1, 1-self_trans(expected_length), a_state->trans_out); } else { double out_p = (1 - self_trans(expected_length))/(nmotifs+1); // Out to each motif start. for (j_motif = 0; j_motif < nmotifs; j_motif++) { a_state->itrans_out[j_motif+1] = motif_index( j_motif+1, FALSE, num_spacers, spacer_states, motifs, nmotifs ); set_array_item(j_motif+1, out_p, a_state->trans_out); } // Out to end state. a_state->itrans_out[j_motif+1] = motif_index(nmotifs, TRUE, num_spacers, spacer_states, motifs, nmotifs) + 1; set_array_item(j_motif+1, out_p, a_state->trans_out); } break; case END_STATE : a_state->ntrans_out = 0; a_state->itrans_out = NULL; a_state->trans_out = NULL; break; } } // build_star_state
/* * Using the linreg test, * * this method returns the lowest scoring subdivision of a set of sequences for a given motif. * It's not self-contained, as it requires to hook into the global variables results, motifs, seq_ids. */ ramen_result_t* ramen_do_linreg_test(int motif_num) { //Assorted vars int seq_num; int j,k; int motif_index = motif_num * 2; //This is a workaround to the change in the motif datastructure where it now // goes +MOTIFA -MOTIFA +MOTIFB etc. rather than all + then all - motifs. //Vars for the regression double* x; double* y; double m = 0; double b = 0; double mse = 0; //Vars for scoring ramen_result_t* r; //Allocate memory or set initial values seq_num = get_num_strings(seq_ids); //number of sequences r = malloc(sizeof(ramen_result_t)); //allocate space, as a ptr to this will go in the array later //that's why we don't free it in this loop. x = malloc(sizeof(double)*seq_num); y = malloc(sizeof(double)*seq_num); //Now we need to copy the scores into two double arrays //Use LOG macro so that log(0) 'works' for (j=0; j < seq_num; j++) { if (args.log_fscores == TRUE) { y[j] = LOG(get_array_item(j, seq_fscores)); } else { y[j] = get_array_item(j, seq_fscores); } if (args.log_pwmscores == TRUE) { x[j] = LOG(results[motif_num][j]); } else { x[j] = results[motif_num][j]; } } //Switch x&y if they're to be switched if (args.linreg_switchxy) { SWAP(double*, x, y); } // TODO: Tidy and/or remove this for production if(args.linreg_dump_dir > 0) { FILE *fh; char* filename; filename = malloc(sizeof(char)*(strlen(args.linreg_dump_dir) + 50)); sprintf(filename, "%s/%s.tsv", args.linreg_dump_dir, get_motif_id(motif_at(motifs.motifs, motif_index))); fh = fopen(filename, "w"); fputs("PWM_Score\tFluorescence_Score\n", fh); for (j=0; j < seq_num; j++) { fprintf(fh, "%.10e %.10e\n", x[j], y[j]); } fclose(fh); free(filename); } /*extern double regress( int n, / number of points / double *x, / x values / double *y, / y values / double *m, / slope / double *b / y 
intercept / );*/ mse = regress(seq_num, x, y, &m, &b); if (args.verbose >= 3) { printf("LinReg MSE of motif %s on %i seqs: %.4g (m: %.4g b: %.4g)\n", get_motif_id(motif_at(motifs.motifs, motif_index)), seq_num, mse, m, b); } //Add to our motif list if lowest MSE r->motif_id = strdup(get_motif_id(motif_at(motifs.motifs, motif_index))); r->m = m; //Not p-values, but they'll do when we re-use this structure... r->b = b; r->mse = mse; r->p = -1; //Do stochastic sampling if required. if (args.repeats > 0) { int repeat_wins = 0; for (j=0;j<args.repeats;j++) { double repeat_mse = 0; shuffle(x,seq_num); //Shuffle and break the associations between x and y repeat_mse = regress(seq_num, x, y, &m, &b); //fprintf(stderr, "Motif %d Repeat %d RMSE: %g MSE: %g\n",motif_index,j,repeat_mse,mse); if (repeat_mse <= mse) { repeat_wins++; } } r->p = repeat_wins*1.0/ args.repeats*1.0; } free(x); free(y); return r; }
void ramen_scan_sequences() { FILE* seq_file = NULL; MOTIF_T* motif = NULL; MOTIF_T* rev_motif = NULL; SEQ_T* sequence = NULL; SCANNED_SEQUENCE_T* scanned_seq = NULL; PATTERN_T* pattern; int i; int j; SEQ_T** seq_list; int num_seqs; int seq_len; //For the bdb_bg mode: ARRAY_T* seq_bg_freqs; double atcontent; double roundatcontent; double avg_seq_length = 0; //Open the file. if (open_file(args.sequence_filename, "r", FALSE, "FASTA", "sequences", &seq_file) == 0) { fprintf(stderr, "Couldn't open the file %s.\n", args.sequence_filename); ramen_terminate(1); } //Start reading in the sequences read_many_fastas(ramen_alph, seq_file, MAX_SEQ_LENGTH, &num_seqs, &seq_list); seq_ids = new_string_list(); seq_fscores = allocate_array(num_seqs); //Allocate the required space for results results = malloc(sizeof(double*) * motifs.num); for (i=0;i<motifs.num;i++) { results[i] = malloc(sizeof(double)*num_seqs); } for (j=0;j<num_seqs;j++) { fprintf(stderr, "\rScanning %i of %i sequences...", j+1, num_seqs); //copy the pointer into our current object for clarity sequence = seq_list[j]; //Read the fluorescence data from the description field. add_string(get_seq_name(sequence),seq_ids); seq_len = get_seq_length(sequence); set_array_item(j,atof(get_seq_description(sequence)),seq_fscores); //Scan with each motif. for (i=0;i<motifs.num;i++) { int motifindex = i*2; results[i][j] = ramen_sequence_scan(sequence, motif_at(motifs.motifs, motifindex), motif_at(motifs.motifs, motifindex+1), NULL, NULL, //No need to pass PSSM. 
AVG_ODDS, 0, TRUE, 0, motifs.bg_freqs); if (TRUE == args.linreg_normalise) { int k; double maxscore = 1; motif = motif_at(motifs.motifs,motifindex); for (k=0;k<get_motif_length(motif);k++) { double maxprob = 0; if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif)); maxscore *= maxprob; } results[i][j] /= maxscore; } } } }