/************************************************************************* * Find the index of the starting or ending state of a given motif in * a given HMM. * 0 = START_STATE and nmotifs+1 = END_STATE *************************************************************************/ static int motif_index (const int motif_num, const BOOLEAN_T start_or_end, const int num_spacers, const int spacer_states, const MOTIF_T* motifs, const int nmotifs) { int i_motif; int return_value; assert(motif_num >= 0); assert(motif_num <= (nmotifs + 1)); if (motif_num == 0) return START_INDEX; // Skip the spacer states. return_value = (num_spacers * spacer_states) + 1; // Add the lengths of the preceding motifs. for (i_motif = 0; i_motif < motif_num - 1; i_motif++) return_value += get_motif_length(motif_at((MOTIF_T*)motifs, i_motif)); // If we're looking for the end of this motif, add its length as well. // unless it is the end state we're after which has only one state if (start_or_end && motif_num != (nmotifs + 1)) return_value += get_motif_length(motif_at((MOTIF_T*)motifs, i_motif)) - 1; // fprintf(stderr, "Motif %d -> %d\n", motif_num, return_value); return(return_value); }
/************************************************************************** * Callback invoked when matching an opening matched_element tag for a * CISML file of a secondary motif database. A hit must pass the checks: * 1) The current match is for a sequence/motif that we're interested in. * 2) A score is supplied. * 3) The score supplied is better or equal to the existing best score. * 4) Consistant with CISML format so start and stop are larger than 0. * 5) The distance between the start and stop matches the motif. * 6) It fits within the margin region around the primary motif * 7) It does not overlap the primary motif. * Provided all those checks pass then the hit is calculated relative to * the start of the matched region. If the score is equal to the current * best then the relative hit position is added to the list of best hits, * otherwise the list is cleared, the best score is updated and the hit * is added to the previously empty list. **************************************************************************/ void match_secondary(void *ctx, long start, long stop, double *score, double *pvalue, char *clusterId) { SECONDARY_LOADER_T *loader = (SECONDARY_LOADER_T*)ctx; int lpos, rpos, rc, relative_position, match; //check if we're loading this match if (loader->current_sequence == NULL) return; //check if this match has enough information to be considered if (score == NULL) return; //check to see if the existing match is better if (loader->hit_count > 0 && loader->secondary_score > *score) return; //convert the coordinates of the match into easier to use ones if (start <= 0 || stop <= 0) { die("Expected start and stop fields in cisml to be 1 or larger.\n"); } if (start < stop) { lpos = start; rpos = stop; rc = FALSE; } else { lpos = stop; rpos = start; rc = TRUE; } //check that gap makes sense if ((rpos - lpos + 1) != get_motif_length(loader->secondary_motif->motif)) { die("Motif %s has length %d but a match in a CISML file had a start of %ld and stop of %ld 
which evaluates to a length of %d\n", get_motif_id(loader->secondary_motif->motif), get_motif_length(loader->secondary_motif->motif), start, stop, (rpos - lpos + 1) ); } //check for overlap with the primary match //and that the secondary motif fits within the margin if (rpos < loader->primary_lpos) { // left side (upstream) if ((loader->primary_lpos - lpos) > loader->margin) return;//outside margin } else if (lpos > loader->primary_rpos) { // right side (downstream) if ((rpos - loader->primary_rpos) > loader->margin) return;//outside margin } else { return;//overlap } //match seems valid and better than anything we've seen previous so update //note that stored position is relative to the start of the margin, as if //this was scored on a trimmed sequence indexing from 1 //this has the advantage that we only need the width of the primary //motif and the size of the margin to calculate the offset relative_position = lpos - (loader->primary_lpos - loader->margin) + 1; //now make the scale pos/neg dependent on if the match is with a //reverse complement match = (rc ? -relative_position : relative_position); if (loader->hit_count == 0 || loader->secondary_score > *score) { loader->secondary_score = *score; loader->hit_count = 1; loader->hits[0] = match; } else if (loader->secondary_score == *score) { if (loader->hit_count >= loader->hits_size) { loader->hits_size = loader->hit_count + 10; loader->hits = mm_realloc(loader->hits, sizeof(int) * loader->hits_size); } loader->hits[loader->hit_count++] = match; } }
/**************************************************************************
 * Print one HTML table row per motif to output.
 *
 * Each row holds the motif ID, its width, the coloured best possible match
 * and, when the next motif in the array has the same ID (a forward/reverse
 * pair), the coloured best possible match of that reverse motif.  Motifs
 * without a reverse partner get an empty final cell.
 **************************************************************************/
void mcast_print_motif_list(FILE * output, MOTIF_T* motifs, int num_motifs) {
  const char *indent = " ";
  int i;

  fputs("\n", output);
  for (i = 0; i < num_motifs; i++) {
    MOTIF_T *motif = motif_at(motifs, i);
    MOTIF_T *rc_motif = NULL;
    char *motif_id = get_motif_id(motif);
    int width = get_motif_length(motif);
    char *rc_motif_id = NULL;
    char *best_possible_match;
    char *colored_best_possible_match;
    char *best_possible_rc_match = NULL;
    char *colored_best_possible_rc_match = NULL;

    // Peek at the following motif, if there is one.
    if (i < (num_motifs - 1)) {
      rc_motif = motif_at(motifs, i + 1);
      rc_motif_id = get_motif_id(rc_motif);
    }

    best_possible_match = get_best_possible_match(motif);
    colored_best_possible_match = color_dna_sequence(best_possible_match);

    if (rc_motif_id && strcmp(motif_id, rc_motif_id) == 0) {
      ++i; // Pair of identical motif ids indicate forward/reverse pair.
      best_possible_rc_match = get_best_possible_match(rc_motif);
      colored_best_possible_rc_match =
        color_dna_sequence(best_possible_rc_match);
    }

    fprintf(output, "%s<tr>\n", indent);
    fprintf(output, "%s<td>%s</td>\n", indent, motif_id);
    fprintf(output, "%s<td>%d</td>\n", indent, width);
    fprintf(output, "%s<td class=\"sequence\">%s</td>\n", indent,
        colored_best_possible_match);
    // Passing NULL for %s is undefined behaviour, so emit an empty cell
    // when there is no reverse complement motif.
    fprintf(output, "%s<td class=\"sequence\">%s</td>\n", indent,
        colored_best_possible_rc_match ? colored_best_possible_rc_match : "");
    fprintf(output, "%s</tr>\n", indent);

    myfree(best_possible_match);
    myfree(best_possible_rc_match);
    myfree(colored_best_possible_match);
    myfree(colored_best_possible_rc_match);
  }
}
/************************************************************************** * Callback invoked when matching a matched_element tag in the CISML file * for the primary motif. If we are recording scores for this motif and * sequence then it: * 1) Checks that a score was supplied * 2) Checks that the start and stop are correctly spaced for the expected * motif. * 3) Checks that the hit does not overlap the margin on each end of the * sequence. * 4) If we don't have a best score, or this score is better: * - clear the list of best hits and add this one. * 5) Alternately if this score is equal to the existing one: * - add the hit to the list of best hits. **************************************************************************/ void match_primary(void *ctx, long start, long stop, double *score, double *pvalue, char *clusterId) { PRIMARY_LOADER_T *loader = (PRIMARY_LOADER_T*)ctx; int lpos, rpos, rc; //check we're actually loading data if (loader->current_sequence == NULL) return; //check that this match is worth investigating further if (score == NULL) return; if (start <= 0 || stop <= 0) { die("Expected start and stop fields in cisml to be 1 or larger.\n"); } if (start < stop) { lpos = start; rpos = stop; rc = FALSE; } else { lpos = stop; rpos = start; rc = TRUE; } //check that gap makes sense if ((rpos - lpos + 1) != get_motif_length(loader->motif)) { die("Motif %s has length %d but a match in a CISML file had a start of %ld and stop of %ld which evaluates to a length of %d\n", get_motif_id(loader->motif), get_motif_length(loader->motif), start, stop, (rpos - lpos + 1) ); } //check left margin // For example if we had a margin of 1 then the primary motif must start at // 2 or larger which would allow a secondary motif of length 1 to fit at // position 1 if (lpos <= loader->margin) return; //check right margin // For example if we had a sequence of length 5 and a margin of 1 then the // primary motif must finish at 4 or smaller which would allow a secondary // motif 
of length 1 to fit at position 5 if (rpos > (loader->current_sequence->length - loader->margin)) return; //now see if our existing best match is worse than this one if (loader->hit_count == 0 || *score > loader->current_score) { loader->current_score = *score; loader->hit_count = 1; loader->hits[0] = (rc ? -lpos : lpos); } else if (*score == loader->current_score) { if (loader->hit_count >= loader->hits_size) { loader->hits_size = loader->hit_count + 10; loader->hits = mm_realloc(loader->hits, sizeof(int) * loader->hits_size); } loader->hits[loader->hit_count++] = (rc ? -lpos : lpos); } }
/***********************************************************************
 * Say that the motif ID is printed centered above a given motif.
 * If the motif ID string is longer than the motif, we truncate
 * it on the right and align the first character over the start of
 * the motif.
 *
 * This function returns the character that appears in the nth
 * position of that motif ID string.  Blank positions are reported as
 * '*' at either end of the motif and as '_' elsewhere.
 *
 * The ID used is the one returned by get_full_motif_id().
 *
 * The character is computed directly rather than by formatting the
 * whole label into a temporary buffer: the previous sprintf() wrote past
 * the end of its (m_width + 1)-byte buffer whenever the ID offset
 * exceeded the motif width (e.g. a motif of width 1).
 ***********************************************************************/
static char get_motif_id_char
  (int      position,
   MOTIF_T* a_motif)
{
  const char *id;
  int id_width, m_width, id_start, id_pos;
  char return_char;

  assert(position < get_motif_length(a_motif));

  id = get_full_motif_id(a_motif);
  id_width = (int)strlen(id);
  m_width = get_motif_length(a_motif);

  // Get position where ID starts relative to start of motif.
  id_start = id_width <= m_width ? ((m_width - id_width) / 2) : 0;

  // FIXME: (tlb) The following if() was put in to make the smoke tests of mhmm
  // pass.  It should be removed and the smoke test comparison files changed.
  if (m_width % 2 == 0 && id_width % 2 == 0) {
    id_start++;
  } else {
    id_start += 2;
  }

  // The label is id_start blanks followed by the ID, truncated or padded
  // with blanks to the motif width.  Pick the requested character from it.
  id_pos = position - id_start;
  if (id_pos >= 0 && id_pos < id_width) {
    return_char = id[id_pos];
  } else {
    return_char = ' ';
  }

  // Replace blanks with visible markers.
  if (return_char == ' ') {
    if ((position == 0) || (position == (m_width - 1))) {
      return_char = '*';
    } else {
      return_char = '_';
    }
  }
  return(return_char);
}
/*************************************************************************
 * Write one motif as a JSON object.
 *
 * The object holds the database id, the motif id (plus the alternate id
 * and URL when they are non-empty), the motif length, E-value and site
 * count, the position weight matrix as an array of rows, the statistics
 * of the best central window and every second entry of the site counts
 * between index (len - 1) and (allocated - (len - 1)).
 *************************************************************************/
static void output_motif_json(JSONWR_T* json, MOTIF_STATS_T* stats,
    SITE_COUNTS_T* counts) {
  MOTIF_T *motif = stats->motif;
  MATRIX_T *freqs = get_motif_freqs(motif);
  int alph_len = alph_size(get_motif_alph(motif), ALPH_SIZE);
  int width, row, col, idx, stop;

  jsonwr_start_object_value(json);

  // identification
  jsonwr_lng_prop(json, "db", stats->db->id);
  jsonwr_str_prop(json, "id", get_motif_id(motif));
  if (get_motif_id2(motif)[0] != '\0') {
    jsonwr_str_prop(json, "alt", get_motif_id2(motif));
  }

  // basic motif properties
  width = get_motif_length(motif);
  jsonwr_lng_prop(json, "len", width);
  jsonwr_dbl_prop(json, "motif_evalue", get_motif_evalue(motif));
  jsonwr_dbl_prop(json, "motif_nsites", get_motif_nsites(motif));
  if (get_motif_url(motif) && *get_motif_url(motif)) {
    jsonwr_str_prop(json, "url", get_motif_url(motif));
  }

  // position weight matrix: one inner array per motif position
  jsonwr_property(json, "pwm");
  jsonwr_start_array_value(json);
  row = 0;
  while (row < width) {
    jsonwr_start_array_value(json);
    for (col = 0; col < alph_len; col++) {
      jsonwr_dbl_value(json, get_matrix_cell(row, col, freqs));
    }
    jsonwr_end_array_value(json);
    row++;
  }
  jsonwr_end_array_value(json);

  // central window statistics
  jsonwr_lng_prop(json, "bin_width", stats->central_window+1);
  jsonwr_dbl_prop(json, "bin_sites", stats->central_sites);
  jsonwr_lng_prop(json, "total_sites", counts->total_sites);
  jsonwr_dbl_prop(json, "log_pvalue", stats->log_adj_pvalue);
  jsonwr_dbl_prop(json, "max_prob", stats->max_prob);

  // site counts: every second slot, skipping (width - 1) slots at each end
  jsonwr_property(json, "sites");
  jsonwr_start_array_value(json);
  stop = counts->allocated - (width - 1);
  idx = width - 1;
  while (idx < stop) {
    jsonwr_dbl_value(json, counts->sites[idx]);
    idx += 2;
  }
  jsonwr_end_array_value(json);

  jsonwr_end_object_value(json);
}
/*************************************************************************
 * Write the site counts of one motif as plain text.
 *
 * A header line "DB <id> MOTIF\t<motif id>[\t<alternate id>]" is followed
 * by one "<position>\t<count>" line for every second entry of
 * counts->sites between index (w - 1) and (allocated - (w - 1)), where w
 * is the motif length.  The position printed for index i is
 * (i - sequence_length + 1) / 2.
 *************************************************************************/
static void output_site_counts(FILE* fh, int sequence_length,
    MOTIF_DB_T* db, MOTIF_T* motif, SITE_COUNTS_T* counts) {
  char *alt_id;
  int width, idx, stop;
  double position;

  // header line
  fprintf(fh, "DB %d MOTIF\t%s", db->id, get_motif_id(motif));
  alt_id = get_motif_id2(motif);
  if (alt_id[0]) {
    fprintf(fh, "\t%s", alt_id);
  }
  fputc('\n', fh);

  // one line per reported site position
  width = get_motif_length(motif);
  stop = counts->allocated - (width - 1);
  idx = width - 1;
  while (idx < stop) {
    position = ((double)(idx - sequence_length + 1)) / 2.0;
    fprintf(fh, "% 6.1f\t%g\n", position, counts->sites[idx]);
    idx += 2;
  }
}
/************************************************************************** * Callback invoked when matching an opening scanned_sequence tag for a * CISML file of a secondary motif database. It calcualtes and caches the * left and right bounds of the primary motif and stores the current * sequence. **************************************************************************/ void sequence_secondary(void *ctx, char *accession, char *name, char *db, char *lsId, double *score, double *pvalue, long *length) { SECONDARY_LOADER_T *loader = (SECONDARY_LOADER_T*)ctx; RBNODE_T *node; int pmatch; if (loader->secondary_motif == NULL) return; node = rbtree_lookup(loader->sequences, accession, FALSE, NULL); if (node != NULL) { loader->current_sequence = (SEQUENCE_T*)rbtree_value(node); pmatch = loader->current_sequence->primary_match; loader->primary_lpos = (pmatch < 0 ? -pmatch : pmatch); loader->primary_rpos = loader->primary_lpos + get_motif_length(loader->primary_motif) - 1; if (loader->secondary_matches[loader->current_sequence->index] != 0) { die("Already seen this sequence!"); } loader->secondary_score = 0; loader->hit_count = 0; } else { loader->current_sequence = NULL; } }
/************************************************************************* * Set up one state in a complete HMM, given the appropriate data. *************************************************************************/ static void build_complete_state (STATE_T state_type, // Type of state (START, SPACER,..) int i_state, // State index. ALPH_T alph, // alphabet int expected_length, // For spacers, the expected length of output. ARRAY_T *freqs, // Emission probability distrib. double num_sites, // Number of sites for this emission. int i_motif, // Index of motif this state is in. int i_position, // Position of this state within motif int nmotifs, // Total number of motifs. int prev_motif, // Index of previous motif. int next_motif, // Index of next motif. MATRIX_T *transp_freq, // Transition freq matrix. int spacer_states, // Number of HMM states per spacer. int num_spacers, // Total number of spacers in HMM. MOTIF_T *motifs, // Motifs. MHMM_STATE_T *a_state) // State to be filled in (pre-allocated). { MOTIF_T *motif; // The motif (for motif state) int j_motif; // Index of the current motif. if (i_motif != NON_MOTIF_INDEX) motif = motif_at(motifs, i_motif); else motif = NULL; // Tell the user what's up. if (verbosity >= NORMAL_VERBOSE) { switch (state_type) { case START_STATE : fprintf(stderr, "Building HMM: (0) "); break; case SPACER_STATE : fprintf(stderr, "%d ", i_state); break; case END_MOTIF_STATE : fprintf(stderr, "%d | ", i_state); break; case START_MOTIF_STATE : case MID_MOTIF_STATE : fprintf(stderr, "%d-", i_state); break; case END_STATE : fprintf(stderr, "(%d)\n", i_state); break; default: die("Invalid state!"); } } // Record what type of state this is. a_state->type = state_type; // Record the motif width if this is a motif. 
if (state_type == START_MOTIF_STATE || state_type == MID_MOTIF_STATE || state_type == END_MOTIF_STATE) { a_state->w_motif = get_motif_length(motif); } else { a_state->w_motif = 1; } // Set up the emission distribution and a few other tidbits. if (freqs != NULL) { // Start and end states have no emissions. a_state->emit = allocate_array(alph_size(alph, ALL_SIZE)); copy_array(freqs, a_state->emit); } a_state->num_sites = num_sites; a_state->i_motif = i_motif; a_state->i_position = i_position; // Record the motif ID character at this position. if ((state_type == START_STATE) || (state_type == END_STATE) || (state_type == SPACER_STATE)) { a_state->id_char = NON_MOTIF_ID_CHAR; } else { // motif state strncpy(a_state->motif_id, get_full_motif_id(motif), MAX_MOTIF_ID_LENGTH + 2); a_state->id_char = get_motif_id_char(i_position, motif); } assert(a_state->id_char != '\0'); // First set up the transitions into this state. switch (state_type) { case START_STATE : a_state->ntrans_in = 0; a_state->itrans_in = NULL; a_state->trans_in = NULL; break; case START_MOTIF_STATE : // Transitions come from any motif or from the start state. a_state->ntrans_in = nmotifs + 1; a_state->itrans_in = (int *)mm_malloc(sizeof(int) * (nmotifs + 1)); a_state->trans_in = allocate_array(nmotifs + 1); for (j_motif = 0; j_motif < nmotifs + 1; j_motif++) { a_state->itrans_in[j_motif] = spacer_index(j_motif, i_motif + 1, TRUE, nmotifs, spacer_states); set_array_item(j_motif, get_matrix_cell(j_motif, i_motif + 1, transp_freq), a_state->trans_in); } break; case END_STATE : // Transitions come from any motif. 
a_state->ntrans_in = nmotifs; a_state->itrans_in = (int *)mm_malloc(sizeof(int) * nmotifs); a_state->trans_in = allocate_array(nmotifs); for (j_motif = 0; j_motif < nmotifs; j_motif++) { a_state->itrans_in[j_motif] = spacer_index(j_motif + 1, nmotifs + 1, TRUE, nmotifs, spacer_states); set_array_item(j_motif, get_matrix_cell(j_motif + 1, nmotifs + 1, transp_freq), a_state->trans_in); } break; case MID_MOTIF_STATE : case END_MOTIF_STATE : a_state->ntrans_in = 1; a_state->itrans_in = (int *)mm_malloc(sizeof(int)); a_state->itrans_in[0] = i_state - 1; a_state->trans_in = allocate_array(1); set_array_item(0, 1.0, a_state->trans_in); break; case SPACER_STATE : a_state->ntrans_in = 2; a_state->itrans_in = (int *)mm_malloc(sizeof(int) * 2); a_state->trans_in = allocate_array(2); // For multi-state spacers, incoming transition from previous state. if (i_position != 0) a_state->itrans_in[0] = i_state - 1; else a_state->itrans_in[0] = motif_index(prev_motif, TRUE, num_spacers, spacer_states, motifs, nmotifs); // The other transition is a self-transition. a_state->itrans_in[1] = i_state; set_array_item(0, 1.0 - self_trans(expected_length / spacer_states), a_state->trans_in); set_array_item(1, self_trans(expected_length / spacer_states), a_state->trans_in); break; default: die("Illegal state!"); } // Then set up the transitions out of this state. switch (state_type) { case START_STATE : // Transitions go to each motif. a_state->ntrans_out = nmotifs; a_state->itrans_out = (int *)mm_malloc(sizeof(int) * nmotifs); a_state->trans_out = allocate_array(nmotifs); for (j_motif = 0; j_motif < nmotifs; j_motif++) { a_state->itrans_out[j_motif] = spacer_index(0, j_motif + 1, FALSE, nmotifs, spacer_states); set_array_item(j_motif, get_matrix_cell(0, j_motif + 1, transp_freq), a_state->trans_out); } break; case END_MOTIF_STATE : // Can go to any other motif or to the end state. 
a_state->ntrans_out = nmotifs + 1; a_state->itrans_out = (int *)mm_malloc(sizeof(int) * (nmotifs + 1)); a_state->trans_out = allocate_array(nmotifs + 1); for (j_motif = 0; j_motif < nmotifs + 1; j_motif++) { a_state->itrans_out[j_motif] = spacer_index(i_motif + 1, j_motif + 1, FALSE, nmotifs, spacer_states); set_array_item(j_motif, get_matrix_cell(i_motif + 1, j_motif + 1, transp_freq), a_state->trans_out); } break; case START_MOTIF_STATE : case MID_MOTIF_STATE : a_state->ntrans_out = 1; a_state->itrans_out = (int *)mm_malloc(sizeof(int)); a_state->itrans_out[0] = i_state + 1; a_state->trans_out = allocate_array(1); set_array_item(0, 1.0, a_state->trans_out); break; case SPACER_STATE : a_state->ntrans_out = 2; a_state->itrans_out = (int *)mm_malloc(sizeof(int) * 2); a_state->trans_out = allocate_array(2); // The first transition is a self-transition. a_state->itrans_out[0] = i_state; // For multi-state spacers, outgoing transition to next state. if (i_position < spacer_states - 1) a_state->itrans_out[1] = i_state + 1; else a_state->itrans_out[1] = motif_index(next_motif, FALSE, num_spacers, spacer_states, motifs, nmotifs); set_array_item(0, self_trans(expected_length), a_state->trans_out); set_array_item(1, 1.0 - self_trans(expected_length), a_state->trans_out); break; case END_STATE : a_state->ntrans_out = 0; a_state->itrans_out = NULL; a_state->trans_out = NULL; break; default: die("Illegal state!"); } }
/************************************************************************* * Build a completely connected HMM. *************************************************************************/ void build_complete_hmm (ARRAY_T* background, int spacer_states, MOTIF_T *motifs, int nmotifs, MATRIX_T *transp_freq, MATRIX_T *spacer_ave, BOOLEAN_T fim, MHMM_T **the_hmm) { ALPH_T alph; int motif_states; // Total length of the motifs. int num_spacers; // Total number of spacer states. int num_states; // Total number of states in the model. int i_motif; // Index of the current "from" motif. int j_motif; // Index of the current "to" motif. int i_position; // Index within the current motif or spacer. int i_state = 0; // Index of the current state. assert(nmotifs > 0); alph = get_motif_alph(motifs);// get the alphabet from the first motif // Count the width of the motifs. for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++) motif_states += get_motif_length(motif_at(motifs, i_motif)); // Count the spacer states adjacent to begin and end. num_spacers = nmotifs * 2; // Add the spacer states between motifs. num_spacers += nmotifs * nmotifs; // Total states = motifs + spacer_states + begin/end num_states = motif_states + (num_spacers * spacer_states) + 2; // Allocate the model. *the_hmm = allocate_mhmm(alph, num_states); // Record that this is a completely connected model. (*the_hmm)->type = COMPLETE_HMM; // Record the number of motifs in the model. (*the_hmm)->num_motifs = nmotifs; // Record the number of states in the model. (*the_hmm)->num_states = num_states; (*the_hmm)->num_spacers = ((nmotifs + 1) * (nmotifs + 1)) - 1; (*the_hmm)->spacer_states = spacer_states; // Put the background distribution into the model. copy_array(background, (*the_hmm)->background); // Build the begin state. build_complete_state( START_STATE, i_state, alph, 0, // expected length NULL, // Emissions. 0, // Number of sites. 
NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, 0, // previous motif 0, // next motif transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); i_state++; int from_motif_state, to_motif_state; // Build the spacer states. No transitions from the end state. for (i_motif = 0; i_motif <= nmotifs; i_motif++) { // No transitions to the start state. for (j_motif = 1; j_motif <= nmotifs+1; j_motif++) { // No transitions from start to end. if ((i_motif == 0) && (j_motif == nmotifs+1)) continue; // Allow multi-state spacers. for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { build_complete_state( SPACER_STATE, i_state, alph, get_matrix_cell(i_motif, j_motif, spacer_ave), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, nmotifs, i_motif, j_motif, transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); } } } // Build the motif states. for (i_motif = 0; i_motif < nmotifs; i_motif++) { MOTIF_T *this_motif = motif_at(motifs, i_motif); STATE_T state; for (i_position = 0; i_position < get_motif_length(this_motif); i_position++, i_state++) { if (i_position == 0) { state = START_MOTIF_STATE; } else if (i_position == (get_motif_length(this_motif) - 1)) { state = END_MOTIF_STATE; } else { state = MID_MOTIF_STATE; } build_complete_state( MID_MOTIF_STATE, i_state, alph, 0, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(this_motif)), get_motif_nsites(this_motif), i_motif, i_position, nmotifs, 0, // Previous motif index. 0, // Next motif index. transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); } } // Build the end state. build_complete_state( END_STATE, i_state, alph, 0, // Expected spacer length. NULL, // Emissions 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, 0, // Previous motif index. 0, // Next motif index. 
transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); i_state++; // Convert spacers to FIMs if requested. if (fim) { convert_to_fims(*the_hmm); } // Fill in the transition matrix. build_transition_matrix(*the_hmm); }
void ramen_scan_sequences() { FILE* seq_file = NULL; MOTIF_T* motif = NULL; MOTIF_T* rev_motif = NULL; SEQ_T* sequence = NULL; SCANNED_SEQUENCE_T* scanned_seq = NULL; PATTERN_T* pattern; int i; int j; SEQ_T** seq_list; int num_seqs; int seq_len; //For the bdb_bg mode: ARRAY_T* seq_bg_freqs; double atcontent; double roundatcontent; double avg_seq_length = 0; //Open the file. if (open_file(args.sequence_filename, "r", FALSE, "FASTA", "sequences", &seq_file) == 0) { fprintf(stderr, "Couldn't open the file %s.\n", args.sequence_filename); ramen_terminate(1); } //Start reading in the sequences read_many_fastas(ramen_alph, seq_file, MAX_SEQ_LENGTH, &num_seqs, &seq_list); seq_ids = new_string_list(); seq_fscores = allocate_array(num_seqs); //Allocate the required space for results results = malloc(sizeof(double*) * motifs.num); for (i=0;i<motifs.num;i++) { results[i] = malloc(sizeof(double)*num_seqs); } for (j=0;j<num_seqs;j++) { fprintf(stderr, "\rScanning %i of %i sequences...", j+1, num_seqs); //copy the pointer into our current object for clarity sequence = seq_list[j]; //Read the fluorescence data from the description field. add_string(get_seq_name(sequence),seq_ids); seq_len = get_seq_length(sequence); set_array_item(j,atof(get_seq_description(sequence)),seq_fscores); //Scan with each motif. for (i=0;i<motifs.num;i++) { int motifindex = i*2; results[i][j] = ramen_sequence_scan(sequence, motif_at(motifs.motifs, motifindex), motif_at(motifs.motifs, motifindex+1), NULL, NULL, //No need to pass PSSM. 
AVG_ODDS, 0, TRUE, 0, motifs.bg_freqs); if (TRUE == args.linreg_normalise) { int k; double maxscore = 1; motif = motif_at(motifs.motifs,motifindex); for (k=0;k<get_motif_length(motif);k++) { double maxprob = 0; if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif)); maxscore *= maxprob; } results[i][j] /= maxscore; } } } }
/************************************************************************* * Entry point for centrimo *************************************************************************/ int main(int argc, char *argv[]) { CENTRIMO_OPTIONS_T options; SEQ_SITES_T seq_sites; SITE_COUNTS_T counts; int seqN, motifN, seqlen, db_i, motif_i, i; double log_pvalue_thresh; SEQ_T** sequences = NULL; ARRAY_T* bg_freqs = NULL; ARRAYLST_T *stats_list; MOTIF_DB_T **dbs, *db; MREAD_T *mread; MOTIF_STATS_T *stats; MOTIF_T *motif, *rev_motif; PSSM_T *pos_pssm, *rev_pssm; char *sites_path, *desc; FILE *sites_file; HTMLWR_T *html; JSONWR_T *json; // COMMAND LINE PROCESSING process_command_line(argc, argv, &options); // load the sequences read_sequences(options.alphabet, options.seq_source, &sequences, &seqN); seqlen = (seqN ? get_seq_length(sequences[0]) : 0); // calculate a sequence background (unless other background is given) if (!options.bg_source) { bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences); } // load the motifs motifN = 0; dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources)); for (i = 0; i < arraylst_size(options.motif_sources); i++) { char* db_source; db_source = (char*)arraylst_get(i, options.motif_sources); dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, options.pseudocount, options.selected_motifs, options.alphabet); motifN += arraylst_size(dbs[i]->motifs); } log_pvalue_thresh = log(options.evalue_thresh) - log(motifN); // Setup some things for double strand scanning if (options.scan_both_strands == TRUE) { // Set up hash tables for computing reverse complement setup_hash_alph(DNAB); setalph(0); // Correct background by averaging on freq. for both strands. 
average_freq_with_complement(options.alphabet, bg_freqs); normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs); calc_ambigs(options.alphabet, FALSE, bg_freqs); } // Create output directory if (create_output_directory(options.output_dirname, options.allow_clobber, (verbosity >= NORMAL_VERBOSE))) { die("Couldn't create output directory %s.\n", options.output_dirname); } // open output files sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME); sites_file = fopen(sites_path, "w"); free(sites_path); // setup html monolith writer json = NULL; if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) { htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME); htmlwr_replace(html, "centrimo_data.js", "data"); json = htmlwr_output(html); if (json == NULL) die("Template does not contain data section.\n"); } else { DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n"); } if (json) { // output some top level variables jsonwr_str_prop(json, "version", VERSION); jsonwr_str_prop(json, "revision", REVISION); jsonwr_str_prop(json, "release", ARCHIVE_DATE); jsonwr_str_array_prop(json, "cmd", argv, argc); jsonwr_property(json, "options"); jsonwr_start_object_value(json); jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount); jsonwr_dbl_prop(json, "score", options.score_thresh); jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh); jsonwr_lng_prop(json, "maxbin", options.max_window+1); jsonwr_bool_prop(json, "norc", !options.scan_both_strands); jsonwr_bool_prop(json, "noflip", options.no_flip); jsonwr_end_object_value(json); // output the description desc = prepare_description(&options); if (desc) { jsonwr_str_prop(json, "job_description", desc); free(desc); } // output size metrics jsonwr_lng_prop(json, "seqlen", seqlen); jsonwr_lng_prop(json, "tested", motifN); // output the fasta db jsonwr_property(json, "sequence_db"); jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", 
options.seq_source); jsonwr_lng_prop(json, "count", seqN); jsonwr_end_object_value(json); // output the motif dbs jsonwr_property(json, "motif_dbs"); jsonwr_start_array_value(json); for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", db->source); jsonwr_lng_prop(json, "count", arraylst_size(db->motifs)); jsonwr_end_object_value(json); } jsonwr_end_array_value(json); // start the motif array jsonwr_property(json, "motifs"); jsonwr_start_array_value(json); } /************************************************************** * Tally the positions of the best sites for each of the * selected motifs. **************************************************************/ // prepare the sequence sites memset(&seq_sites, 0, sizeof(SEQ_SITES_T)); // prepare the site counts counts.allocated = ((2 * seqlen) - 1); counts.sites = mm_malloc(sizeof(double) * counts.allocated); // prepare the motifs stats list stats_list = arraylst_create(); // prepare the other vars motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL; for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) { motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs); DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n", get_motif_id(motif), get_motif_length(motif)); // reset the counts for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0; counts.total_sites = 0; // create the pssm pos_pssm = make_pssm(bg_freqs, motif); // If required, do the same for the reverse complement motif. 
if (options.scan_both_strands) { rev_motif = dup_rc_motif(motif); rev_pssm = make_pssm(bg_freqs, rev_motif); } // scan the sequences for (i = 0; i < seqN; i++) score_sequence(&options, sequences[i], pos_pssm, rev_pssm, &seq_sites, &counts); // DEBUG check that the sum of the sites is close to the site count double sum_check = 0, sum_diff; for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i]; sum_diff = counts.total_sites - sum_check; if (sum_diff < 0) sum_diff = -sum_diff; if (sum_diff > 0.1) { fprintf(stderr, "Warning: site counts don't sum to accurate value! " "%g != %ld", sum_check, counts.total_sites); } // output the plain text site counts output_site_counts(sites_file, seqlen, db, motif, &counts); // compute the best central window stats = compute_stats(options.max_window, seqlen, db, motif, &counts); // check if it passes the threshold if (json && stats->log_adj_pvalue <= log_pvalue_thresh) { output_motif_json(json, stats, &counts); arraylst_add(stats, stats_list); } else { free(stats); } // Free memory associated with this motif. free_pssm(pos_pssm); free_pssm(rev_pssm); destroy_motif(rev_motif); } } if (json) jsonwr_end_array_value(json); // finish writing sites fclose(sites_file); // finish writing html file if (html) { if (htmlwr_output(html) != NULL) { die("Found another JSON replacement!\n"); } htmlwr_destroy(html); } // write text file output_centrimo_text(&options, motifN, stats_list); // Clean up. for (i = 0; i < seqN; ++i) { free_seq(sequences[i]); } free(sequences); for (i = 0; i < arraylst_size(options.motif_sources); i++) { free_db(dbs[i]); } free(dbs); free_array(bg_freqs); free(counts.sites); free(seq_sites.sites); arraylst_destroy(free, stats_list); cleanup_options(&options); return 0; }
/************************************************************************* * Set up one state in a linear HMM, given the appropriate data. *************************************************************************/ static void build_linear_state (ALPH_T alph, // alphabet STATE_T state_type, // Type of state (START, SPACER,...) int i_state, // The state index. int expected_length,// For spacers, expected length of output. ARRAY_T* freqs, // Emission probability distrib. double num_sites, // Number of sites for this emission. int i_motif, // Index of motif this state is in. int i_position, // Position of this state within a motif or spacer. MOTIF_T* motif, // Motif. MHMM_STATE_T * a_state) // State to be filled in (pre-allocated). { if (verbosity >= NORMAL_VERBOSE) { switch (state_type) { case START_STATE : fprintf(stderr, "Building HMM: 0 "); break; case SPACER_STATE : case END_MOTIF_STATE : fprintf(stderr, "%d ", i_state); break; case START_MOTIF_STATE : case MID_MOTIF_STATE : fprintf(stderr, "%d-", i_state); break; case END_STATE : fprintf(stderr, "%d\n", i_state); break; case INVALID_STATE : die("Invalid state.\n"); } } /* Record what type of state this is. */ a_state->type = state_type; // Record the motif width if this is a motif. if (state_type == START_MOTIF_STATE || state_type == MID_MOTIF_STATE || state_type == END_MOTIF_STATE) { a_state->w_motif = get_motif_length(motif); } else { a_state->w_motif = 1; } /* Set up the emission distribution and a few other tidbits. */ a_state->emit = allocate_array(alph_size(alph, ALL_SIZE)); a_state->emit_odds = allocate_array(alph_size(alph, ALL_SIZE)); if (state_type == START_STATE || state_type == END_STATE) { /* Start and end don't have emissions. */ int i_alph; for (i_alph = 0; i_alph < alph_size(alph, ALL_SIZE); i_alph++) { set_array_item(i_alph, 1.0, a_state->emit); } } else { copy_array(freqs, a_state->emit); } a_state->num_sites = num_sites; /* Record the motif index and ID. 
*/ a_state->i_motif = i_motif; if ((state_type == START_STATE) || (state_type == END_STATE) || (state_type == SPACER_STATE)) { strcpy(a_state->motif_id, NON_MOTIF_ID); a_state->id_char = NON_MOTIF_ID_CHAR; } else { // a motif state strcpy(a_state->motif_id, get_full_motif_id(motif)); a_state->id_char = get_motif_id_char(i_position, motif); } a_state->i_position = i_position; /* First set up the transitions into this state. */ switch (state_type) { case START_STATE : a_state->ntrans_in = 0; a_state->itrans_in = NULL; a_state->trans_in = NULL; break; case START_MOTIF_STATE : case END_STATE : a_state->ntrans_in = 2; a_state->itrans_in = (int *)mm_malloc(sizeof(int) * 2); a_state->itrans_in[0] = i_state - 2; a_state->itrans_in[1] = i_state - 1; a_state->trans_in = allocate_array(2); set_array_item(0, 1.0 - self_trans(expected_length), a_state->trans_in); set_array_item(1, 1.0 - self_trans(expected_length), a_state->trans_in); break; case MID_MOTIF_STATE : case END_MOTIF_STATE : a_state->ntrans_in = 1; a_state->itrans_in = (int *)mm_malloc(sizeof(int)); a_state->itrans_in[0] = i_state - 1; a_state->trans_in = allocate_array(1); set_array_item(0, 1.0, a_state->trans_in); break; case SPACER_STATE : a_state->ntrans_in = 2; a_state->itrans_in = (int *)mm_malloc(sizeof(int) * 2); a_state->itrans_in[0] = i_state - 1; a_state->itrans_in[1] = i_state; a_state->trans_in = allocate_array(2); set_array_item(0, 1.0 - self_trans(expected_length), a_state->trans_in); set_array_item(1, self_trans(expected_length), a_state->trans_in); break; default: die("Invalid state type.\n"); } /* Then set up the transitions out of this state. 
*/ switch (state_type) { case START_STATE : case END_MOTIF_STATE : a_state->ntrans_out = 2; a_state->itrans_out = (int *)mm_malloc(sizeof(int) * 2); a_state->itrans_out[0] = i_state + 1; a_state->itrans_out[1] = i_state + 2; a_state->trans_out = allocate_array(2); set_array_item(0, self_trans(expected_length), a_state->trans_out); set_array_item(1, 1.0 - self_trans(expected_length), a_state->trans_out); break; case START_MOTIF_STATE : case MID_MOTIF_STATE : a_state->ntrans_out = 1; a_state->itrans_out = (int *)mm_malloc(sizeof(int)); a_state->itrans_out[0] = i_state + 1; a_state->trans_out = allocate_array(1); set_array_item(0, 1.0, a_state->trans_out); break; case SPACER_STATE : a_state->ntrans_out = 2; a_state->itrans_out = (int *)mm_malloc(sizeof(int) * 2); a_state->itrans_out[0] = i_state; a_state->itrans_out[1] = i_state + 1; a_state->trans_out = allocate_array(2); set_array_item(0, self_trans(expected_length), a_state->trans_out); set_array_item(1, 1.0 - self_trans(expected_length), a_state->trans_out); break; case END_STATE : a_state->ntrans_out = 0; a_state->itrans_out = NULL; a_state->trans_out = NULL; break; default: die("Invalid state type.\n"); } }
/************************************************************************* * Build a linear HMM. *************************************************************************/ void build_linear_hmm (ARRAY_T* background, ORDER_T* order_spacing, int spacer_states, RBTREE_T* motifs, // motifs with key as in order_spacing BOOLEAN_T fim, MHMM_T** the_hmm) { ALPH_T alph; int model_length; // Total number of states in the model. int i_state; // Index of the current state. int i_order; // Index within the order and spacing. int i_position; // Index within the current motif or spacer. int motif_i; // motif key in order spacing MOTIF_T *motif; // motif RBNODE_T *node; alph = get_motif_alph((MOTIF_T*)rbtree_value(rbtree_first(motifs))); // Calculate the total length of the model. model_length = 2; // start and end state for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) { motif_i = get_order_motif(order_spacing, i_order); motif = (MOTIF_T*)rbtree_get(motifs, &motif_i); model_length += get_motif_length(motif); } model_length += (get_order_occurs(order_spacing) + 1) * spacer_states; // Allocate the model. *the_hmm = allocate_mhmm(alph, model_length); check_sq_matrix((*the_hmm)->trans, model_length); // Record that this is a linear model. (*the_hmm)->type = LINEAR_HMM; // Record the number of motifs in the model. // It doesn't want the distinct count (*the_hmm)->num_motifs = get_order_occurs(order_spacing); // Record the number of states in the model. (*the_hmm)->num_states = model_length; (*the_hmm)->num_spacers = get_order_occurs(order_spacing) + 1; (*the_hmm)->spacer_states = spacer_states; // Put the background distribution into the model. copy_array(background, (*the_hmm)->background); // Begin the model with a non-emitting state. i_state = 0; check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, START_STATE, i_state, get_spacer_length(order_spacing, 0), NULL, // Emissions. 0, // Number of sites. 
NON_MOTIF_INDEX, NON_MOTIF_POSITION, // position within state (not relevant to start state) NULL, // no motif &((*the_hmm)->states[i_state])); ++i_state; // Build the first spacer. for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, SPACER_STATE, i_state, get_spacer_length(order_spacing, 0), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, // position within spacer NULL, // no motif &((*the_hmm)->states[i_state])); } // Build each motif and subsequent spacer. for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) { STATE_T state; int spacer_len; motif_i = get_order_motif(order_spacing, i_order); motif = (MOTIF_T*)rbtree_get(motifs, &motif_i); // Build the motif. for (i_position = 0; i_position < get_motif_length(motif); i_position++, i_state++) { if (i_position == 0) { state = START_MOTIF_STATE; spacer_len = get_spacer_length(order_spacing, i_order); } else if (i_position == (get_motif_length(motif) - 1)) { state = END_MOTIF_STATE; spacer_len = get_spacer_length(order_spacing, i_order+1); } else { state = MID_MOTIF_STATE; spacer_len = 0; } check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, state, i_state, spacer_len, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(motif)), get_motif_nsites(motif), i_order, i_position, // position within motif (middle) motif, &((*the_hmm)->states[i_state])); } // Build the following spacer. 
for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, SPACER_STATE, i_state, get_spacer_length(order_spacing, i_order+1), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, // position within spacer NULL, // no motif &((*the_hmm)->states[i_state])); } } check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); // Finish up the model with a non-emitting end state. build_linear_state( alph, END_STATE, i_state, get_spacer_length(order_spacing, i_order), NULL, // Emissions. 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, // position within state (not relevant to end state) NULL, // no motif &((*the_hmm)->states[i_state])); ++i_state; assert(i_state == model_length); check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); // Convert spacers to FIMs if requested. if (fim) { convert_to_fims(*the_hmm); } // Fill in the transition matrix. build_transition_matrix(*the_hmm); }
/*************************************************************************
 * Build a star topology HMM.
 *
 * State layout: start state, one spacer made of spacer_states states,
 * the columns of every motif in turn, then the end state. The new model
 * is returned through the_hmm. Each motif is asserted to be wider than
 * one column. Spacers are converted to FIMs when 'fim' is set, and the
 * transition matrix is filled in last.
 *************************************************************************/
void build_star_hmm
  (ARRAY_T*  background,
   int       spacer_states,
   MOTIF_T*  motifs,
   int       nmotifs,
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  ALPH_T    alph;
  MHMM_T*   hmm;                   /* Shorthand for *the_hmm. */
  const int num_spacers = 1;       /* A star model has a single spacer. */
  int       total_motif_width = 0; /* Summed width of all motifs. */
  int       total_states;          /* Number of states in the model. */
  int       next_state = 0;        /* Index of the next state to build. */
  int       m;                     /* Index of the current motif. */
  int       pos;                   /* Position within motif or spacer. */

  alph = get_motif_alph(motif_at(motifs, 0));

  /* Add up the motif widths. */
  for (m = 0; m < nmotifs; m++) {
    total_motif_width += get_motif_length(motif_at(motifs, m));
  }

  /* Total states = motif columns + spacer states + begin/end. */
  total_states = total_motif_width + (num_spacers * spacer_states) + 2;

  /* Allocate the model and hand it back to the caller. */
  hmm = allocate_mhmm(alph, total_states);
  *the_hmm = hmm;

  /* Model-level bookkeeping. */
  hmm->type = STAR_HMM;
  hmm->num_motifs = nmotifs;
  hmm->num_states = total_states;
  hmm->num_spacers = num_spacers;
  hmm->spacer_states = spacer_states;
  copy_array(background, hmm->background);

  /* Begin state. */
  build_star_state(
    alph,
    START_STATE,
    next_state,
    0,    // expected length
    NULL,
    0,    // Number of sites.
    NON_MOTIF_INDEX,
    NON_MOTIF_POSITION,
    nmotifs,
    spacer_states,
    motifs,
    &(hmm->states[next_state]));
  next_state++;

  /* The spacer, which may consist of several states. */
  pos = 0;
  while (pos < spacer_states) {
    build_star_state(
      alph,
      SPACER_STATE,
      next_state,
      DEFAULT_SPACER_LENGTH,
      background,
      SPACER_NUMSITES,
      NON_MOTIF_INDEX,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next_state]));
    next_state++;
    pos++;
  }

  /* Motif states: first column, interior columns, last column. */
  for (m = 0; m < nmotifs; m++) {
    MOTIF_T *this_motif = motif_at(motifs, m);
    int last_pos;

    assert(get_motif_length(this_motif) > 1);
    last_pos = get_motif_length(this_motif) - 1;

    pos = 0;
    build_star_state(
      alph,
      START_MOTIF_STATE,
      next_state,
      0, // Expected spacer length.
      get_matrix_row(pos, get_motif_freqs(this_motif)),
      get_motif_nsites(this_motif),
      m,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next_state]));
    next_state++;

    for (pos = 1; pos < last_pos; pos++) {
      build_star_state(
        alph,
        MID_MOTIF_STATE,
        next_state,
        0, // Expected spacer length.
        get_matrix_row(pos, get_motif_freqs(this_motif)),
        get_motif_nsites(this_motif),
        m,
        pos,
        nmotifs,
        spacer_states,
        motifs,
        &(hmm->states[next_state]));
      next_state++;
    }

    /* 'pos' is whatever the loop above left it at; for a motif that
       satisfies the assert this is the last column. */
    build_star_state(
      alph,
      END_MOTIF_STATE,
      next_state,
      0, // Expected spacer length.
      get_matrix_row(pos, get_motif_freqs(this_motif)),
      get_motif_nsites(this_motif),
      m,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next_state]));
    next_state++;
  }

  /* End state. */
  build_star_state(
    alph,
    END_STATE,
    next_state,
    0,    // Expected spacer length.
    NULL, // Emissions
    0,    // Number of sites.
    NON_MOTIF_INDEX,
    NON_MOTIF_POSITION,
    nmotifs,
    spacer_states,
    motifs,
    &(hmm->states[next_state]));
  next_state++;

  /* Convert spacers to FIMs if requested. */
  if (fim) {
    convert_to_fims(hmm);
  }

  /* Fill in the transition matrix. */
  build_transition_matrix(hmm);
} // build_star_hmm
/*************************************************************************
 * Compute the enrichment of the central region.
 *
 * Allocates and returns a MOTIF_STATS_T for 'motif'. Windows centred on
 * the middle of the site-count array are grown two bins at a time up to
 * max_window (-1 means "as large as possible"); each is scored with
 * window_enrichment() and the window with the smallest log p-value is
 * recorded. The result is then adjusted for the number of windows
 * tested. When no window can be tested the defaults set at the top are
 * returned unchanged.
 *************************************************************************/
static MOTIF_STATS_T* compute_stats(int max_window, int sequence_length,
    MOTIF_DB_T* db, MOTIF_T* motif, SITE_COUNTS_T* counts) {
  MOTIF_STATS_T *result;
  double peak_sites;  // largest count found in any single bin
  double in_window;   // running site count of the current window
  double log_p;       // log p-value of the current window
  int idx;
  int n_bins;         // bins the motif could possibly land in
  int odd_bins;       // n_bins % 2; non-zero when a central bin exists
  int widest;         // largest window that might have a p-value
  int centre;         // index of the central bin in counts->sites
  int half_width;     // offset of the current window edge from centre
  int bins_used;      // number of bins covered by the current window

  // allocate the result and give every field its default
  result = mm_malloc(sizeof(MOTIF_STATS_T));
  result->db = db;
  result->motif = motif;
  result->total_sites = counts->total_sites;
  result->n_win_tested = 0;
  result->max_prob = 0;
  result->central_sites = 0;
  result->central_prob = 0.0;
  result->central_window = 0;
  result->log_pvalue = 0;
  result->log_adj_pvalue = 0;

  // the fullest bin, as a fraction of all sites (left at 0 if no sites)
  peak_sites = 0;
  for (idx = 0; idx < counts->allocated; idx++) {
    if (counts->sites[idx] > peak_sites) peak_sites = counts->sites[idx];
  }
  if (counts->total_sites != 0) {
    result->max_prob = peak_sites / (double)counts->total_sites;
  }

  // number of bins the motif could possibly have landed in
  n_bins = sequence_length - get_motif_length(motif) + 1;
  // an odd bin count means there is a completely central bin
  odd_bins = (n_bins % 2);
  // the window that contains every site of this motif
  result->all_window = n_bins - odd_bins;
  // the biggest window which might have a p-value
  widest = result->all_window - 2;

  // clamp the requested maximum window
  if (max_window == -1 || max_window > widest) max_window = widest;
  if (max_window < 0) return result; // no windows to test!

  // how many windows are going to be tested
  if (odd_bins) {
    result->n_win_tested = (max_window / 2) + 1;
  } else {
    result->n_win_tested = (max_window + 1) / 2;
  }
  if (result->n_win_tested == 0) return result; // no windows to test!

  // the index of the bin in the centre
  centre = sequence_length - 1;

  // start from a value any real window will beat
  result->log_pvalue = BIG;
  in_window = 0;
  bins_used = 0;

  if (odd_bins) {
    // the single central bin is the first window
    bins_used++;
    in_window = counts->sites[centre];
    result->log_pvalue = window_enrichment(0, in_window,
        counts->total_sites, bins_used, n_bins);
    result->central_sites = in_window;
    result->central_prob = (double)bins_used / (double)n_bins;
  }

  // widen the window one bin on each side at a time, keeping the best
  bins_used += 2;
  half_width = bins_used - 1;
  while (half_width <= max_window) {
    in_window += counts->sites[centre - half_width];
    in_window += counts->sites[centre + half_width];
    log_p = window_enrichment(half_width, in_window,
        counts->total_sites, bins_used, n_bins);
    if (log_p < result->log_pvalue) {
      result->log_pvalue = log_p;
      result->central_window = half_width;
      result->central_sites = in_window;
      result->central_prob = (double)bins_used / (double)n_bins;
    }
    bins_used += 2;
    half_width += 2;
  }

  // adjust the best p-value for the number of windows tested
  result->log_adj_pvalue = LOGEV(log(result->n_win_tested),
      result->log_pvalue);
  DEBUG_FMT(HIGHER_VERBOSE, "best bin: %d sites: %g "
      "log_adj_p-value: %g (%d tests)\n", result->central_window+1,
      result->central_sites, result->log_adj_pvalue,
      result->n_win_tested);
  return result;
}
/************************************************************************* * Calculate the odds score for each motif-sized window at each * site in the sequence using the given nucleotide frequencies. * * This function is a lightweight version based on the one contained in * motiph-scoring. Several calculations that are unnecessary for gomo * have been removed in order to speed up the process *************************************************************************/ static double score_sequence( SEQ_T *seq, // sequence to scan (IN) MOTIF_T *motif, // motif already converted to odds values (IN) PSSM_T *m_pssm, // motif pssm (IN) MATRIX_T *m_odds, // motif odds (IN) int method, // method used for scoring (IN) double threshold, // Threshold to use in TOTAL_HITS mode with a PWM ARRAY_T *bg_freqs //background model ) { assert(seq != NULL); assert(motif != NULL); assert((method == TOTAL_HITS && m_pssm) || (method != TOTAL_HITS && m_odds)); char* raw_seq = get_raw_sequence(seq); int seq_length = get_seq_length(seq); // Get the pv lookup table ARRAY_T* pv_lookup = NULL; if (NULL != m_pssm) { pv_lookup = m_pssm->pv; assert(get_array_length(pv_lookup) > 0); } // Prepare storage for the string representing the portion // of the reference sequence within the window. 
char* window_seq = (char *) mm_malloc(sizeof(char) * (get_motif_length(motif) + 1)); window_seq[get_motif_length(motif)] = '\0'; int max_index = seq_length - get_motif_length(motif); if (max_index < 0) max_index = 0; const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE); double* odds = (double*) mm_malloc(sizeof(double)*max_index); double* scaled_log_odds = (double*) mm_malloc(sizeof(double)*max_index); // For each site in the sequence int seq_index; for (seq_index = 0; seq_index < max_index; seq_index++) { double odd = 1.0; scaled_log_odds[seq_index] = 0; // For each site in the motif window int motif_position; for (motif_position = 0; motif_position < get_motif_length(motif); motif_position++) { char c = raw_seq[seq_index + motif_position]; window_seq[motif_position] = c; // Check for gaps at this site if(c == '-' || c == '.') { break; } // Check for ambiguity codes at this site //TODO: This next call is very expensive - it takes up approx. 10% of a // programme's running time. It should be fixed up somehow. int aindex = alph_index(get_motif_alph(motif), c); if (aindex > asize) { break; } if (method == TOTAL_HITS) { //If we're in this mode, then we're using LOG ODDS. 
//scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, get_motif_freqs(motif)); scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, m_pssm->matrix); } else { odd *= get_matrix_cell(motif_position, aindex, m_odds); } } odds[seq_index] = odd; } // return odds as requested (MAX or AVG scoring) double requested_odds = 0.0; if (method == AVG_ODDS){ for (seq_index = 0; seq_index < max_index; seq_index++) { requested_odds += odds[seq_index]; } requested_odds /= max_index + 1; // Divide by 0 if max_index==0 } else if (method == MAX_ODDS){ for (seq_index = 0; seq_index < max_index; seq_index++) { if (odds[seq_index] > requested_odds){ requested_odds = odds[seq_index]; } } } else if (method == SUM_ODDS) { for (seq_index = 0; seq_index < max_index; seq_index++) { requested_odds += odds[seq_index]; } } else if (method == TOTAL_HITS) { for (seq_index = 0; seq_index < max_index; seq_index++) { if (scaled_log_odds[seq_index] >= (double)get_array_length(pv_lookup)) { scaled_log_odds[seq_index] = (double)(get_array_length(pv_lookup) - 1); } double pvalue = get_array_item((int) scaled_log_odds[seq_index], pv_lookup); //Figure out how to calculate the p-value of a hit //fprintf(stderr, "m: %s pv_l len: %i scaled_log_odds: %g seq index: %i pvalue: %g\n", // get_motif_id(motif), get_array_length(pv_lookup), scaled_log_odds[seq_index], seq_index, pvalue); if (pvalue < threshold) { requested_odds++; //Add another hit. } if (verbosity > HIGHER_VERBOSE) { fprintf(stderr, "Window Data: %s\t%s\t%i\t%g\t%g\t%g\n", get_seq_name(seq), get_motif_id(motif), seq_index, scaled_log_odds[seq_index], pvalue, threshold); } } } myfree(odds); myfree(scaled_log_odds); myfree(window_seq); return requested_odds; }
/*************************************************************************
 * Read the motifs in opts->motif_filename and build a PSSM for each
 * selected motif.
 *
 * A motif is used when opts->selected_motifs is empty or contains its
 * ID; all others are destroyed. For each motif used, a positive-strand
 * PSSM is built and, when both strands are scanned, a PSSM for the
 * reverse-complement motif as well. The results are bundled with
 * motif_and_pssm_create() and returned in a newly created list.
 *
 * Side effects on opts:
 *   - scan_both_strands is cleared if the alphabet has no complement.
 *   - num_gc_bins is reset to 1 (with a warning) unless the alphabet
 *     has 4 core symbols forming exactly 2 complementary pairs.
 *
 * Dies if the alphabet cannot be determined from the motif file.
 *************************************************************************/
ARRAYLST_T* load_motifs(AMA_OPTIONS_T *opts) {
  ARRAYLST_T *motifs;
  ARRAY_T *pos_bg_freqs, *rev_bg_freqs;
  MREAD_T *mread;
  MOTIF_T *motif, *motif_rc;
  double range;
  PSSM_T *pos_pssm, *neg_pssm;
  int total_motifs;
  ALPH_T *alph;

  //
  // Read the motifs and background model.
  //
  // this reads any meme file, xml, txt and html
  mread = mread_create(opts->motif_filename, OPEN_MFILE);
  mread_set_bg_source(mread, opts->bg_filename);
  mread_set_pseudocount(mread, opts->pseudocount);

  // sanity check, since the rest of the code relies on the motifs being
  // complementable
  alph = alph_hold(mread_get_alphabet(mread));
  if (alph == NULL) die("Unable to determine alphabet from motifs");
  if (opts->scan_both_strands && !alph_has_complement(alph)) {
    opts->scan_both_strands = false;
  }
  // GC binning is only kept for an alphabet with 4 core symbols AND 2
  // complementary pairs, so it is disabled when EITHER property fails.
  // (This used to test '&&', which let through alphabets that failed
  // only one of the two conditions.)
  if (opts->num_gc_bins > 1 &&
      (alph_size_core(alph) != 4 || alph_size_pairs(alph) != 2)) {
    fprintf(stderr, "Warning: The motif alphabet does not have exactly 2 complementary pairs so \"GC binning\" will be disabled.\n");
    opts->num_gc_bins = 1;
  }

  // Background for the given strand and, if needed, a complemented copy
  // for the reverse strand.
  pos_bg_freqs = mread_get_background(mread);
  rev_bg_freqs = NULL;
  if (opts->scan_both_strands) {
    rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs));
    copy_array(pos_bg_freqs, rev_bg_freqs);
    complement_swap_freqs(alph, rev_bg_freqs, rev_bg_freqs);
  }

  // allocate memory for motifs
  motifs = arraylst_create();

  //
  // Convert motif matrices into log-odds matrices.
  // Scale them.
  // Compute the lookup tables for the PDF of scaled log-odds scores.
  //
  range = 300; // 100 is not very good; 1000 is great but too slow
  neg_pssm = NULL; // stays NULL when only one strand is scanned
  total_motifs = 0;
  while (mread_has_motif(mread)) {
    motif = mread_next_motif(mread);
    total_motifs++;

    if (rbtree_size(opts->selected_motifs) == 0 ||
        rbtree_find(opts->selected_motifs, get_motif_id(motif)) != NULL) {
      if (verbosity >= HIGH_VERBOSE) {
        fprintf(stderr, "Using motif %s of width %d.\n",
                get_motif_id(motif), get_motif_length(motif));
      }
      pos_pssm = build_motif_pssm(
        motif,
        pos_bg_freqs,
        pos_bg_freqs,
        NULL, // Priors not used
        0.0L, // alpha not used
        range,
        opts->num_gc_bins,
        true
      );
      //
      // Note: If scanning both strands, we complement the motif frequencies
      // but not the background frequencies so the motif looks the same.
      // However, the given frequencies are used in computing the p-values
      // since they represent the frequencies on the negative strands.
      // (If we instead were to complement the input sequence, keeping the
      // motif fixed, we would need to use the complemented frequencies
      // in computing the p-values.)
      //
      if (opts->scan_both_strands) {
        motif_rc = dup_rc_motif(motif);
        neg_pssm = build_motif_pssm(
          motif_rc,
          rev_bg_freqs,
          pos_bg_freqs,
          NULL, // Priors not used
          0.0L, // alpha not used
          range,
          opts->num_gc_bins,
          true
        );
        // Only the PSSM is kept; the reverse-complement motif is
        // released straight away.
        destroy_motif(motif_rc);
      }
      // NOTE(review): the motif and PSSMs are presumably owned by the
      // list entry from here on - confirm against motif_and_pssm_create.
      arraylst_add(motif_and_pssm_create(motif, pos_pssm, neg_pssm), motifs);
    } else {
      if (verbosity >= HIGH_VERBOSE) {
        fprintf(stderr, "Skipping motif %s.\n", get_motif_id(motif));
      }
      destroy_motif(motif);
    }
  }

  // Release everything that was only needed while loading.
  mread_destroy(mread);
  free_array(pos_bg_freqs);
  free_array(rev_bg_freqs);
  alph_release(alph);

  if (verbosity >= NORMAL_VERBOSE) {
    fprintf(stderr, "Loaded %d/%d motifs from %s.\n",
            arraylst_size(motifs), total_motifs, opts->motif_filename);
  }
  return motifs;
}
/************************************************************************* * Entry point for ama *************************************************************************/ int main(int argc, char *argv[]) { int max_seq_length = MAX_SEQ; STRING_LIST_T* selected_motifs = NULL; double pseudocount = 0.01; int output_format = CISML_FORMAT; program_name = "ama"; int scoring = AVG_ODDS; BOOLEAN_T pvalues = FALSE; BOOLEAN_T normalize_scores = FALSE; BOOLEAN_T combine_duplicates = FALSE; int num_gc_bins = 1; int sdbg_order = -1; // don't use sequence background BOOLEAN_T scan_both_strands = TRUE; ARRAY_T* pos_bg_freqs = NULL; ARRAY_T* rev_bg_freqs = NULL; clock_t c0, c1; /* measuring cpu_time */ CISML_T *cisml; char * out_dir = NULL; BOOLEAN_T clobber = FALSE; int i; int last = 0; ALPH_T alph = INVALID_ALPH; /********************************************** * COMMAND LINE PROCESSING **********************************************/ const int num_options = 16; cmdoption const motif_scan_options[] = { { "max-seq-length", REQUIRED_VALUE }, { "motif", REQUIRED_VALUE }, { "motif-pseudo", REQUIRED_VALUE }, { "rma", NO_VALUE }, { "pvalues", NO_VALUE }, { "sdbg", REQUIRED_VALUE }, { "norc", NO_VALUE }, { "cs", NO_VALUE }, { "o-format", REQUIRED_VALUE }, { "o", REQUIRED_VALUE }, { "oc", REQUIRED_VALUE }, { "scoring", REQUIRED_VALUE }, { "verbosity", REQUIRED_VALUE }, { "gcbins", REQUIRED_VALUE }, { "last", REQUIRED_VALUE }, { "version", NO_VALUE } }; int option_index = 0; // Define the usage message. 
char usage[] = "USAGE: ama [options] <motif file> <sequence file> [<background file>]\n" "\n" " Options:\n" " --sdbg <order>\t\t\tUse Markov background model of\n" " \t\t\t\t\torder <order> derived from the sequence\n" " \t\t\t\t\tto compute its likelihood ratios.\n" " \t\t\t\t\tOverrides --pvalues, --gcbins and --rma;\n" " \t\t\t\t\t<background file> is required unless\n" " \t\t\t\t\t--sdbg is given.\n" " --motif <id>\t\t\tUse only the motif identified by <id>.\n" " \t\t\t\t\tThis option may be repeated.\n" " --motif-pseudo <float>\t\tThe value <float> times the background\n" " \t\t\t\t\tfrequency is added to the count of each\n" " \t\t\t\t\tletter when creating the likelihood \n" " \t\t\t\t\tratio matrix (default: %g).\n" " --norc\t\t\t\tDisables the scanning of the reverse\n" " \t\t\t\t\tcomplement strand.\n" " --scoring [avg-odds|max-odds]\tIndicates whether the average or \n" " \t\t\t\t\tthe maximum odds should be calculated\n" " \t\t\t\t\t(default: avg-odds)\n" " --rma\t\t\t\tScale motif scores to the range 0-1.\n" " \t\t\t\t\t(Relative Motif Affinity).\n" " \t\t\t\t\tMotif scores are scaled by the maximum\n" " \t\t\t\t\tscore achievable by that PWM. (default:\n" " \t\t\t\t\tmotif scores are not normalized)\n" " --pvalues\t\t\t\tPrint p-value of avg-odds score in cisml\n" " \t\t\t\t\toutput. Ignored for max-odds scoring.\n" " \t\t\t\t\t(default: p-values are not printed)\n" " --gcbins <bins>\t\t\tCompensate p-values for GC content of\n" " \t\t\t\t\teach sequence using given number of \n" " \t\t\t\t\tGC range bins. 
Recommended bins: 41.\n" " \t\t\t\t\t(default: p-values are based on\n" " \t\t\t\t\tfrequencies in background file)\n" " --cs\t\t\t\tEnable combining sequences with same\n" " \t\t\t\t\tidentifier by taking the average score\n" " \t\t\t\t\tand the Sidac corrected p-value.\n" " --o-format [gff|cisml]\t\tOutput file format (default: cisml)\n" " \t\t\t\t\tignored if --o or --oc option used\n" " --o <directory>\t\t\tOutput all available formats to\n" " \t\t\t\t\t<directory>; give up if <directory>\n" " \t\t\t\t\texists\n" " --oc <directory>\t\t\tOutput all available formats to\n" " \t\t\t\t\t<directory>; if <directory> exists\n" " \t\t\t\t\toverwrite contents\n" " --verbosity [1|2|3|4]\t\tControls amount of screen output\n" " \t\t\t\t\t(default: %d)\n" " --max-seq-length <int>\t\tSet the maximum length allowed for \n" " \t\t\t\t\tinput sequences. (default: %d)\n" " --last <int>\t\t\tUse only scores of (up to) last <n>\n" " \t\t\t\t\tsequence positions to compute AMA.\n" " --version \t\t\tPrint version and exit.\n" "\n"; // Parse the command line. if (simple_setopt(argc, argv, num_options, motif_scan_options) != NO_ERROR) { die("Error processing command line options: option name too long.\n"); } BOOLEAN_T setoutputformat = FALSE; BOOLEAN_T setoutputdirectory = FALSE; while (TRUE) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. 
c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { (void) simple_getopterror(&message); die("Error processing command line options (%s).\n", message); } else if (strcmp(option_name, "max-seq-length") == 0) { max_seq_length = atoi(option_value); } else if (strcmp(option_name, "norc") == 0) { scan_both_strands = FALSE; } else if (strcmp(option_name, "cs") == 0) { combine_duplicates = TRUE; } else if (strcmp(option_name, "motif") == 0) { if (selected_motifs == NULL) { selected_motifs = new_string_list(); } add_string(option_value, selected_motifs); } else if (strcmp(option_name, "motif-pseudo") == 0) { pseudocount = atof(option_value); } else if (strcmp(option_name, "o-format") == 0) { if (setoutputdirectory) { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "output directory specified, ignoring --o-format\n"); } else { setoutputformat = TRUE; if (strcmp(option_value, "gff") == 0) output_format = GFF_FORMAT; else if (strcmp(option_value, "cisml") == 0) output_format = CISML_FORMAT; else { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Output format not known. 
Using standard instead (cisML).\n"); output_format = CISML_FORMAT; } } } else if (strcmp(option_name, "o") == 0 || strcmp(option_name, "oc") == 0) { setoutputdirectory = TRUE; if (setoutputformat) { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "output directory specified, ignoring --o-format\n"); } clobber = strcmp(option_name, "oc") == 0; out_dir = (char*) malloc (sizeof(char)*(strlen(option_value)+1)); strcpy(out_dir, option_value); output_format = DIRECTORY_FORMAT; } else if (strcmp(option_name, "verbosity") == 0) { verbosity = atoi(option_value); } else if (strcmp(option_name, "scoring") == 0) { if (strcmp(option_value, "max-odds") == 0) scoring = MAX_ODDS; else if (strcmp(option_value, "avg-odds") == 0) scoring = AVG_ODDS; else if (strcmp(option_value, "sum-odds") == 0) scoring = SUM_ODDS; else die("Specified scoring scheme not known.\n", message); } else if (strcmp(option_name, "pvalues") == 0) { pvalues = TRUE; } else if (strcmp(option_name, "rma") == 0) { normalize_scores = TRUE; fprintf(stderr, "Normalizing motif scores using RMA method.\n"); } else if (strcmp(option_name, "gcbins") == 0) { num_gc_bins = atoi(option_value); pvalues = TRUE; if (num_gc_bins <= 1) die("Number of bins in --gcbins must be greater than 1.\n", message); } else if (strcmp(option_name, "sdbg") == 0) { sdbg_order = atoi(option_value); // >=0 means use sequence bkg } else if (strcmp(option_name, "last") == 0) { int i = 0; if (option_value[0] == '-') ++i; while (option_value[i] != '\0') { if (!isdigit(option_value[i])) { die("Specified parameter 'last' contains non-numeric characters.\n"); } ++i; } last = atoi(option_value); if (errno != 0) { die("Specified parameter 'last' could not be parsed as a number as:\n%s\n",strerror(errno)); } if (last < 0) { die("Specified parameter 'last' had negative value (%d) when only postive or zero values are allowed \n", last); } } else if (strcmp(option_name, "version") == 0) { fprintf(stdout, VERSION "\n"); exit(EXIT_SUCCESS); } } // --sdbg 
overrides --pvalues and --gcbins and --rma int req_args = 3; if (sdbg_order >= 0) { pvalues = FALSE; normalize_scores = FALSE; num_gc_bins = 1; req_args = 2; } // Check all required arguments given if (sdbg_order >= 0 && argc > option_index + req_args) { die("<background file> cannot be given together with --sdbg.\n"); } else if (argc != option_index + req_args) { fprintf(stderr, usage, pseudocount, verbosity, max_seq_length); exit(EXIT_FAILURE); } // Get required arguments. char* motif_filename = argv[option_index]; option_index++; char* fasta_filename = argv[option_index]; option_index++; char* bg_filename; if (req_args == 3) { // required unless --sdbg given bg_filename = argv[option_index]; option_index++; } else { bg_filename = "--uniform--"; // So PSSMs will use uniform background; // we can multiply them out later. } // measure time c0 = clock(); // Set up hash tables for computing reverse complement if doing --sdbg if (sdbg_order >= 0) setup_hash_alph(DNAB); // Create cisml data structure for recording results cisml = allocate_cisml(program_name, motif_filename, fasta_filename); set_cisml_background_file(cisml, bg_filename); /********************************************** * Read the motifs and background model. 
**********************************************/ int num_motifs = 0; MREAD_T *mread; ARRAYLST_T *motifs; PSSM_PAIR_T** pssm_pairs; // note pssm_pairs is an array of pointers //this reads any meme file, xml, txt and html mread = mread_create(motif_filename, OPEN_MFILE); mread_set_bg_source(mread, bg_filename); mread_set_pseudocount(mread, pseudocount); motifs = mread_load(mread, NULL); alph = mread_get_alphabet(mread); pos_bg_freqs = mread_get_background(mread); mread_destroy(mread); num_motifs = arraylst_size(motifs); // allocate memory for PSSM pairs pssm_pairs = (PSSM_PAIR_T**)mm_malloc(sizeof(PSSM_PAIR_T*) * num_motifs); if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Number of motifs in file %d.\n", num_motifs); // make a CISML pattern to hold scores for each motif PATTERN_T** patterns = NULL; Resize(patterns, num_motifs, PATTERN_T*); int motif_index; for (motif_index = 0; motif_index < num_motifs; motif_index++) { MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs); patterns[motif_index] = allocate_pattern(get_motif_id(motif), ""); add_cisml_pattern(cisml, patterns[motif_index]); } // make reverse complement motifs and background frequencies. if (scan_both_strands == TRUE) { add_reverse_complements(motifs); assert(arraylst_size(motifs) == (2 * num_motifs)); rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs)); complement_dna_freqs(pos_bg_freqs, rev_bg_freqs); } /************************************************************** * Convert motif matrices into log-odds matrices. * Scale them. * Compute the lookup tables for the PDF of scaled log-odds scores. **************************************************************/ int ns = scan_both_strands ? 
2 : 1; // number of strands for (motif_index = 0; motif_index < num_motifs; motif_index++) { MOTIF_T *motif, *motif_rc; motif = (MOTIF_T*)arraylst_get(motif_index*ns, motifs); if (scan_both_strands) motif_rc = (MOTIF_T*)arraylst_get(motif_index*ns + 1, motifs); else motif_rc = NULL; /* * Note: If scanning both strands, we complement the motif frequencies * but not the background frequencies so the motif looks the same. * However, the given frequencies are used in computing the p-values * since they represent the frequencies on the negative strands. * (If we instead were to complement the input sequence, keeping the * the motif fixed, we would need to use the complemented frequencies * in computing the p-values. Is that any clearer?) */ double range = 300; // 100 is not very good; 1000 is great but too slow PSSM_T* pos_pssm = build_motif_pssm( motif, pos_bg_freqs, pos_bg_freqs, NULL, // Priors not used 0.0L, // alpha not used range, num_gc_bins, TRUE ); PSSM_T* neg_pssm = (scan_both_strands ? build_motif_pssm( motif_rc, rev_bg_freqs, pos_bg_freqs, NULL, // Priors not used 0.0L, // alpha not used range, num_gc_bins, TRUE ) : NULL ); pssm_pairs[motif_index] = create_pssm_pair(pos_pssm, neg_pssm); } // Open the FASTA file for reading. 
FILE* fasta_file = NULL; if (open_file(fasta_filename, "r", FALSE, "FASTA", "sequences", &fasta_file) == 0) { die("Couldn't open the file %s.\n", fasta_filename); } if (verbosity >= NORMAL_VERBOSE) { if (last == 0) { fprintf(stderr, "Using entire sequence\n"); } else { fprintf(stderr, "Limiting sequence to last %d positions.\n", last); } } /************************************************************** * Read in all sequences and score with all motifs **************************************************************/ int seq_loading_num = 0; // keeps track on the number of sequences read in total int seq_counter = 0; // holds the index to the seq in the pattern int unique_seqs = 0; // keeps track on the number of unique sequences BOOLEAN_T need_postprocessing = FALSE; SEQ_T* sequence = NULL; RBTREE_T* seq_ids = rbtree_create(rbtree_strcasecmp,NULL,free,rbtree_intcpy,free); RBNODE_T* seq_node; BOOLEAN_T created; while (read_one_fasta(alph, fasta_file, max_seq_length, &sequence)) { ++seq_loading_num; created = FALSE; char* seq_name = get_seq_name(sequence); int seq_len = get_seq_length(sequence); int scan_len; if (last != 0) { scan_len = last; } else { scan_len = seq_len; } // red-black trees are only required if duplicates should be combined if (combine_duplicates){ //lookup seq id and create new entry if required, return sequence index char *tmp_id = mm_malloc(strlen(seq_name)+1); // required copy for rb-tree strncpy(tmp_id,seq_name,strlen(seq_name)+1); seq_node = rbtree_lookup(seq_ids, tmp_id, TRUE, &created); if (created) {// assign it a loading number rbtree_set(seq_ids, seq_node, &unique_seqs); seq_counter = unique_seqs; ++unique_seqs; } else { seq_counter = *((int*)rbnode_get(seq_node)); } } // // Set up sequence-dependent background model and compute // log cumulative probability of sequence. // double *logcumback = NULL; // array of log cumulative probs. 
if (sdbg_order >= 0) { Resize(logcumback, seq_len+1, double); char* raw_seq = get_raw_sequence(sequence); BOOLEAN rc = FALSE; double *a_cp = get_markov_from_sequence(raw_seq, alph_string(alph), rc, sdbg_order, 0); log_cum_back(raw_seq, a_cp, sdbg_order, logcumback); myfree(a_cp); } // Get the GC content of the sequence if binning p-values by GC // and store it in the sequence object. if (num_gc_bins > 1) { ARRAY_T *freqs = get_sequence_freqs(sequence, alph); set_total_gc_sequence(sequence, get_array_item(1,freqs) + get_array_item(2,freqs)); // f(C) + f(G) free_array(freqs); // clean up } else { set_total_gc_sequence(sequence, -1); // flag ignore } /************************************************************** * Process all motifs. **************************************************************/ int ns = scan_both_strands ? 2 : 1; for (motif_index = 0; motif_index < num_motifs; motif_index++) { PATTERN_T *pattern = patterns[motif_index]; MOTIF_T* motif = (MOTIF_T*)arraylst_get(ns*motif_index, motifs); char* motif_id = (scan_both_strands ? get_motif_st_id(motif) : get_motif_id(motif)); if (verbosity >= HIGH_VERBOSE) { fprintf(stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif)); } if ((selected_motifs == NULL) || (have_string(get_motif_id(motif), selected_motifs) == TRUE)) { if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scanning %s sequence with length %d " "abbreviated to %d with motif %s with length %d.\n", seq_name, seq_len, scan_len, motif_id, get_motif_length(motif)); } SCANNED_SEQUENCE_T* scanned_seq = NULL; if (!combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter){ // Create a scanned_sequence record and save it in the pattern. 
scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern); set_scanned_sequence_length(scanned_seq, scan_len); } else { // get existing sequence record scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter]; set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq))); } // check if scanned component of sequence has sufficient length for the motif if (scan_len < get_motif_length(motif)) { // set score to zero and p-value to 1 if not set yet if(!has_scanned_sequence_score(scanned_seq)){ set_scanned_sequence_score(scanned_seq, 0.0); } if(pvalues && !has_scanned_sequence_pvalue(scanned_seq)){ set_scanned_sequence_pvalue(scanned_seq, 1.0); } add_scanned_sequence_scanned_position(scanned_seq); if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) need_postprocessing = TRUE; if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "%s too short for motif %s. Score set to 0!\n", seq_name, motif_id); } else { // scan the sequence using average/maximum motif affinity ama_sequence_scan(alph, sequence, logcumback, pssm_pairs[motif_index], scoring, pvalues, last, scanned_seq, &need_postprocessing); } } else { if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "Skipping motif %s.\n", motif_id); } } // All motifs parsed free_seq(sequence); if (sdbg_order >= 0) myfree(logcumback); } // read sequences
/************************************************************************* * Entry point for ama *************************************************************************/ int main(int argc, char **argv) { AMA_OPTIONS_T options; ARRAYLST_T *motifs; clock_t c0, c1; // measuring cpu_time MOTIF_AND_PSSM_T *combo; CISML_T *cisml; PATTERN_T** patterns; PATTERN_T *pattern; FILE *fasta_file, *text_output, *cisml_output; int i, seq_loading_num, seq_counter, unique_seqs, seq_len, scan_len, x1, x2, y1, y2; char *seq_name, *path; bool need_postprocessing, created; SEQ_T *sequence; RBTREE_T *seq_ids; RBNODE_T *seq_node; double *logcumback; ALPH_T *alph; // process the command process_command_line(argc, argv, &options); // load DNA motifs motifs = load_motifs(&options); // get the alphabet if (arraylst_size(motifs) > 0) { combo = (MOTIF_AND_PSSM_T*)arraylst_get(0, motifs); alph = alph_hold(get_motif_alph(combo->motif)); } else { alph = alph_dna(); } // pick columns for GC operations x1 = -1; x2 = -1; y1 = -1; y2 = -1; if (alph_size_core(alph) == 4 && alph_size_pairs(alph) == 2) { x1 = 0; // A x2 = alph_complement(alph, x1); // T y1 = (x2 == 1 ? 2 : 1); // C y2 = alph_complement(alph, y1); // G assert(x1 != x2 && y1 != y2 && x1 != y1 && x2 != y2 && x1 != y2 && x2 != y1); } // record starting time c0 = clock(); // Create cisml data structure for recording results cisml = allocate_cisml(PROGRAM_NAME, options.command_line, options.motif_filename, options.fasta_filename); set_cisml_background_file(cisml, options.bg_filename); // make a CISML pattern to hold scores for each motif for (i = 0; i < arraylst_size(motifs); i++) { combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs); add_cisml_pattern(cisml, allocate_pattern(get_motif_id(combo->motif), "")); } // Open the FASTA file for reading. 
fasta_file = NULL; if (!open_file(options.fasta_filename, "r", false, "FASTA", "sequences", &fasta_file)) { die("Couldn't open the file %s.\n", options.fasta_filename); } if (verbosity >= NORMAL_VERBOSE) { if (options.last == 0) { fprintf(stderr, "Using entire sequence\n"); } else { fprintf(stderr, "Limiting sequence to last %d positions.\n", options.last); } } // // Read in all sequences and score with all motifs // seq_loading_num = 0; // keeps track on the number of sequences read in total seq_counter = 0; // holds the index to the seq in the pattern unique_seqs = 0; // keeps track on the number of unique sequences need_postprocessing = false; sequence = NULL; logcumback = NULL; seq_ids = rbtree_create(rbtree_strcasecmp,rbtree_strcpy,free,rbtree_intcpy,free); while (read_one_fasta(alph, fasta_file, options.max_seq_length, &sequence)) { ++seq_loading_num; seq_name = get_seq_name(sequence); seq_len = get_seq_length(sequence); scan_len = (options.last != 0 ? options.last : seq_len); // red-black trees are only required if duplicates should be combined if (options.combine_duplicates){ //lookup seq id and create new entry if required, return sequence index seq_node = rbtree_lookup(seq_ids, get_seq_name(sequence), true, &created); if (created) { // assign it a loading number rbtree_set(seq_ids, seq_node, &unique_seqs); seq_counter = unique_seqs; ++unique_seqs; } else { seq_counter = *((int*)rbnode_get(seq_node)); } } // // Set up sequence-dependent background model and compute // log cumulative probability of sequence. // This needs the sequence in raw format. // if (options.sdbg_order >= 0) logcumback = log_cumulative_background(alph, options.sdbg_order, sequence); // Index the sequence, throwing away the raw format and ambiguous characters index_sequence(sequence, alph, SEQ_NOAMBIG); // Get the GC content of the sequence if binning p-values by GC // and store it in the sequence object. 
if (options.num_gc_bins > 1) { ARRAY_T *freqs = get_sequence_freqs(sequence, alph); set_total_gc_sequence(sequence, get_array_item(y1, freqs) + get_array_item(y2, freqs)); // f(C) + f(G) free_array(freqs); // clean up } else { set_total_gc_sequence(sequence, -1); // flag ignore } // Scan with motifs. for (i = 0; i < arraylst_size(motifs); i++) { pattern = get_cisml_patterns(cisml)[i]; combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs); if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scanning %s sequence with length %d " "abbreviated to %d with motif %s with length %d.\n", seq_name, seq_len, scan_len, get_motif_id(combo->motif), get_motif_length(combo->motif)); } SCANNED_SEQUENCE_T* scanned_seq = NULL; if (!options.combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter) { // Create a scanned_sequence record and save it in the pattern. scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern); set_scanned_sequence_length(scanned_seq, scan_len); } else { // get existing sequence record scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter]; set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq))); } // check if scanned component of sequence has sufficient length for the motif if (scan_len < get_motif_length(combo->motif)) { // set score to zero and p-value to 1 if not set yet if(!has_scanned_sequence_score(scanned_seq)){ set_scanned_sequence_score(scanned_seq, 0.0); } if(options.pvalues && !has_scanned_sequence_pvalue(scanned_seq)){ set_scanned_sequence_pvalue(scanned_seq, 1.0); } add_scanned_sequence_scanned_position(scanned_seq); if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) { need_postprocessing = true; } if (verbosity >= HIGH_VERBOSE) { fprintf(stderr, "%s too short for motif %s. 
Score set to 0.\n", seq_name, get_motif_id(combo->motif)); } } else { // scan the sequence using average/maximum motif affinity ama_sequence_scan(alph, sequence, logcumback, combo->pssm_pair, options.scoring, options.pvalues, options.last, scanned_seq, &need_postprocessing); } } // All motifs scanned free_seq(sequence); if (options.sdbg_order >= 0) myfree(logcumback); } // read sequences fclose(fasta_file); if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "(%d) sequences read in.\n", seq_loading_num); if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Finished \n"); // if any sequence identifier was multiple times in the sequence set then // postprocess of the data is required if (need_postprocessing || options.normalize_scores) { post_process(cisml, motifs, options.normalize_scores); } // output results if (options.output_format == DIRECTORY_FORMAT) { if (create_output_directory(options.out_dir, options.clobber, verbosity > QUIET_VERBOSE)) { // only warn in higher verbose modes fprintf(stderr, "failed to create output directory `%s' or already exists\n", options.out_dir); exit(1); } path = make_path_to_file(options.out_dir, text_filename); //FIXME check for errors: MEME doesn't either and we at least know we have a good directory text_output = fopen(path, "w"); free(path); path = make_path_to_file(options.out_dir, cisml_filename); //FIXME check for errors cisml_output = fopen(path, "w"); free(path); print_cisml(cisml_output, cisml, true, NULL, false); print_score(cisml, text_output); fclose(cisml_output); fclose(text_output); } else if (options.output_format == GFF_FORMAT) { print_score(cisml, stdout); } else if (options.output_format == CISML_FORMAT) { print_cisml(stdout, cisml, true, NULL, false); } else { die("Output format invalid!\n"); } // // Clean up. 
// rbtree_destroy(seq_ids); arraylst_destroy(motif_and_pssm_destroy, motifs); free_cisml(cisml); rbtree_destroy(options.selected_motifs); alph_release(alph); // measure time if (verbosity >= NORMAL_VERBOSE) { // starting time c1 = clock(); fprintf(stderr, "cycles (CPU); %ld cycles\n", (long) c1); fprintf(stderr, "elapsed CPU time: %f seconds\n", (float) (c1-c0) / CLOCKS_PER_SEC); } return 0; }
/************************************************************************* * Set up one state in a star HMM, given the appropriate data. *************************************************************************/ static void build_star_state ( ALPH_T alph, // Type of alphabet int state_type, // Type of state (START, SPACER,..) STATE_T i_state, // State index. int expected_length, /* For spacers, the expected length of output. */ ARRAY_T* freqs, // Emission probability distrib. double num_sites, // Number of sites for this emission. int i_motif, // Index of motif this state is in. int i_position, // Position of this state within motif int nmotifs, // Total number of motifs. int spacer_states, // Number of HMM states per spacer. MOTIF_T* motifs, // Motifs. MHMM_STATE_T* a_state ) // State to be filled in (pre-allocated). { int j_motif; // Index of the current motif. int num_spacers = 1; // Total number of spacers in HMM. double in_p; // Probability of transition into a state // Size of the alphabet, including ambiguity codes. int full_alph_size = alph_size(alph, ALL_SIZE); MOTIF_T *motif = NULL; if (i_motif != NON_MOTIF_INDEX) { motif = motif_at(motifs, i_motif); } // Tell the user what's up. if (verbosity >= NORMAL_VERBOSE) { switch (state_type) { case START_STATE : fprintf(stderr, "Building HMM: (0) "); break; case SPACER_STATE : fprintf(stderr, "%d ", i_state); break; case END_MOTIF_STATE : fprintf(stderr, "%d | ", i_state); break; case START_MOTIF_STATE : case MID_MOTIF_STATE : fprintf(stderr, "%d-", i_state); break; case END_STATE : fprintf(stderr, "(%d)\n", i_state); break; } } // Record what type of state this is. a_state->type = state_type; // Record the motif width if this is a motif. if (state_type == START_MOTIF_STATE || state_type == MID_MOTIF_STATE || state_type == END_MOTIF_STATE) { a_state->w_motif = get_motif_length(motif); } else { a_state->w_motif = 1; } // Set up the emission distribution and a few other tidbits. 
a_state->emit = allocate_array(full_alph_size); a_state->emit_odds = allocate_array(full_alph_size); if (freqs != NULL) { // Start and end states have no emissions. copy_array(freqs, a_state->emit); } a_state->num_sites = num_sites; a_state->i_motif = i_motif; if (motif != NULL) { } a_state->i_position = i_position; // Record the motif ID character at this position. if ((state_type == START_STATE) || (state_type == END_STATE) || (state_type == SPACER_STATE)) { a_state->id_char = NON_MOTIF_ID_CHAR; strcpy(a_state->motif_id, NON_MOTIF_ID); } else { strcpy(a_state->motif_id, get_full_motif_id(motif)); a_state->id_char = get_motif_id_char(i_position, motif); } assert(a_state->id_char != '\0'); // First set up the transitions into this state. switch (state_type) { case START_STATE : a_state->ntrans_in = 0; a_state->itrans_in = NULL; a_state->trans_in = NULL; break; case END_STATE : case START_MOTIF_STATE : // Transitions come from spacer state. a_state->ntrans_in = 1; a_state->itrans_in = (int *)mm_malloc(sizeof(int)); a_state->trans_in = allocate_array(1); a_state->itrans_in[0] = SPACER_INDEX; // Distribute non-self loop probability evenly among motifs and end state. in_p = (1 - self_trans(expected_length / spacer_states))/(nmotifs+1); set_array_item(0, in_p, a_state->trans_in); break; case MID_MOTIF_STATE : case END_MOTIF_STATE : // Transitions come from previous state. a_state->ntrans_in = 1; a_state->itrans_in = (int *)mm_malloc(sizeof(int)); a_state->itrans_in[0] = i_state - 1; a_state->trans_in = allocate_array(1); set_array_item(0, 1.0, a_state->trans_in); break; case SPACER_STATE : // Transitions come from start and each motif except for internal // multi-states. a_state->ntrans_in = (i_position != 0) ? 2 : nmotifs + 2; a_state->itrans_in = (int *)mm_malloc(sizeof(int) * a_state->ntrans_in); a_state->trans_in = allocate_array(a_state->ntrans_in); // First transition is a self-transition. 
a_state->itrans_in[0] = i_state; set_array_item( 0, self_trans(expected_length / spacer_states), a_state->trans_in ); // Next the transitions from all the motifs (or the previous spacer). if (i_position != 0) { a_state->itrans_in[1] = i_state - 1; set_array_item(1, 1.0 - self_trans(expected_length / spacer_states), a_state->trans_in); } else { a_state->itrans_in[1] = START_INDEX; // From start state. // From each motif. for (j_motif = 0; j_motif < nmotifs; j_motif++) { a_state->itrans_in[j_motif+2] = motif_index( j_motif+1, TRUE, num_spacers, spacer_states, motifs, nmotifs ); set_array_item(j_motif+2, 1.0, a_state->trans_in); } } break; } // Then set up the transitions out of this state. switch (state_type) { case START_STATE : case END_MOTIF_STATE : // Transition goes to spacer. a_state->ntrans_out = 1; a_state->itrans_out = (int *)mm_malloc(sizeof(int)); a_state->trans_out = allocate_array(1); a_state->itrans_out[0] = SPACER_INDEX; set_array_item(0, 1.0, a_state->trans_out); break; case START_MOTIF_STATE : case MID_MOTIF_STATE : a_state->ntrans_out = 1; a_state->itrans_out = (int *)mm_malloc(sizeof(int)); a_state->itrans_out[0] = i_state + 1; a_state->trans_out = allocate_array(1); set_array_item(0, 1.0, a_state->trans_out); break; case SPACER_STATE : // Transitions go to self, motifs and end (except for beginning // multi-state spacers) a_state->ntrans_out = (i_position < spacer_states -1 ) ? 2 : nmotifs + 2; a_state->itrans_out = (int *)mm_malloc(sizeof(int) * a_state->ntrans_out); a_state->trans_out = allocate_array(a_state->ntrans_out); // The first transition is a self-transition. a_state->itrans_out[0] = i_state; set_array_item(0, self_trans(expected_length), a_state->trans_out); // For multi-state spacers, outgoing transition to next state. 
if (i_position < spacer_states - 1) { a_state->itrans_out[1] = i_state + 1; set_array_item(1, 1-self_trans(expected_length), a_state->trans_out); } else { double out_p = (1 - self_trans(expected_length))/(nmotifs+1); // Out to each motif start. for (j_motif = 0; j_motif < nmotifs; j_motif++) { a_state->itrans_out[j_motif+1] = motif_index( j_motif+1, FALSE, num_spacers, spacer_states, motifs, nmotifs ); set_array_item(j_motif+1, out_p, a_state->trans_out); } // Out to end state. a_state->itrans_out[j_motif+1] = motif_index(nmotifs, TRUE, num_spacers, spacer_states, motifs, nmotifs) + 1; set_array_item(j_motif+1, out_p, a_state->trans_out); } break; case END_STATE : a_state->ntrans_out = 0; a_state->itrans_out = NULL; a_state->trans_out = NULL; break; } } // build_star_state
/************************************************************************* * Entry point for pmp_bf *************************************************************************/ int main(int argc, char *argv[]) { char* bg_filename = NULL; char* motif_name = "motif"; // Use this motif name in the output. STRING_LIST_T* selected_motifs = NULL; double fg_rate = 1.0; double bg_rate = 1.0; double purine_pyrimidine = 1.0; // r double transition_transversion = 0.5; // R double pseudocount = 0.1; GAP_SUPPORT_T gap_support = SKIP_GAPS; MODEL_TYPE_T model_type = F81_MODEL; BOOLEAN_T use_halpern_bruno = FALSE; char* ustar_label = NULL; // TLB; create uniform star tree int i; program_name = "pmp_bf"; /********************************************** * COMMAND LINE PROCESSING **********************************************/ // Define command line options. (FIXME: Repeated code) // FIXME: Note that if you add or remove options you // must change n_options. int n_options = 12; cmdoption const pmp_options[] = { {"hb", NO_VALUE}, {"ustar", REQUIRED_VALUE}, {"model", REQUIRED_VALUE}, {"pur-pyr", REQUIRED_VALUE}, {"transition-transversion", REQUIRED_VALUE}, {"bg", REQUIRED_VALUE}, {"fg", REQUIRED_VALUE}, {"motif", REQUIRED_VALUE}, {"motif-name", REQUIRED_VALUE}, {"bgfile", REQUIRED_VALUE}, {"pseudocount", REQUIRED_VALUE}, {"verbosity", REQUIRED_VALUE} }; int option_index = 0; // Define the usage message. char usage[1000] = ""; strcat(usage, "USAGE: pmp [options] <tree file> <MEME file>\n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); // Evolutionary model parameters. strcat(usage, " --hb\n"); strcat(usage, " --model single|average|jc|k2|f81|f84|hky|tn"); strcat(usage, " (default=f81)\n"); strcat(usage, " --pur-pyr <float> (default=1.0)\n"); strcat(usage, " --transition-transversion <float> (default=0.5)\n"); strcat(usage, " --bg <float> (default=1.0)\n"); strcat(usage, " --fg <float> (default=1.0)\n"); // Motif parameters. 
strcat(usage, " --motif <id> (default=all)\n"); strcat(usage, " --motif-name <string> (default from motif file)\n"); // Miscellaneous parameters strcat(usage, " --bgfile <background> (default from motif file)\n"); strcat(usage, " --pseudocount <float> (default=0.1)\n"); strcat(usage, " --ustar <label>\n"); // TLB; create uniform star tree strcat(usage, " --verbosity [1|2|3|4] (default 2)\n"); strcat(usage, "\n Prints the FP and FN rate at each of 10000 score values.\n"); strcat(usage, "\n Output format: [<motif_id> score <score> FPR <fpr> TPR <tpr>]+\n"); // Parse the command line. if (simple_setopt(argc, argv, n_options, pmp_options) != NO_ERROR) { die("Error processing command line options: option name too long.\n"); } while (TRUE) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { (void) simple_getopterror(&message); die("Error processing command line options (%s)\n", message); } if (strcmp(option_name, "model") == 0) { if (strcmp(option_value, "jc") == 0) { model_type = JC_MODEL; } else if (strcmp(option_value, "k2") == 0) { model_type = K2_MODEL; } else if (strcmp(option_value, "f81") == 0) { model_type = F81_MODEL; } else if (strcmp(option_value, "f84") == 0) { model_type = F84_MODEL; } else if (strcmp(option_value, "hky") == 0) { model_type = HKY_MODEL; } else if (strcmp(option_value, "tn") == 0) { model_type = TAMURA_NEI_MODEL; } else if (strcmp(option_value, "single") == 0) { model_type = SINGLE_MODEL; } else if (strcmp(option_value, "average") == 0) { model_type = AVERAGE_MODEL; } else { die("Unknown model: %s\n", option_value); } } else if (strcmp(option_name, "hb") == 0){ use_halpern_bruno = TRUE; } else if (strcmp(option_name, "ustar") == 0){ // TLB; create uniform star tree ustar_label = option_value; } else if (strcmp(option_name, "pur-pyr") == 0){ 
purine_pyrimidine = atof(option_value); } else if (strcmp(option_name, "transition-transversion") == 0){ transition_transversion = atof(option_value); } else if (strcmp(option_name, "bg") == 0){ bg_rate = atof(option_value); } else if (strcmp(option_name, "fg") == 0){ fg_rate = atof(option_value); } else if (strcmp(option_name, "motif") == 0){ if (selected_motifs == NULL) { selected_motifs = new_string_list(); } add_string(option_value, selected_motifs); } else if (strcmp(option_name, "motif-name") == 0){ motif_name = option_value; } else if (strcmp(option_name, "bgfile") == 0){ bg_filename = option_value; } else if (strcmp(option_name, "pseudocount") == 0){ pseudocount = atof(option_value); } else if (strcmp(option_name, "verbosity") == 0){ verbosity = atoi(option_value); } } // Must have tree and motif file names if (argc != option_index + 2) { fprintf(stderr, "%s", usage); exit(EXIT_FAILURE); } /********************************************** * Read the phylogenetic tree. **********************************************/ char* tree_filename = NULL; TREE_T* tree = NULL; tree_filename = argv[option_index]; option_index++; tree = read_tree_from_file(tree_filename); // get the species names STRING_LIST_T* alignment_species = make_leaf_list(tree); char *root_label = get_label(tree); // in case target in center if (strlen(root_label)>0) add_string(root_label, alignment_species); //write_string_list(" ", alignment_species, stderr); // TLB; Convert the tree to a uniform star tree with // the target sequence at its center. if (ustar_label != NULL) { tree = convert_to_uniform_star_tree(tree, ustar_label); if (tree == NULL) die("Tree or alignment missing target %s\n", ustar_label); if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr, "Target %s placed at center of uniform (d=%.3f) star tree:\n", ustar_label, get_total_length(tree) / get_num_children(tree) ); write_tree(tree, stderr); } } /********************************************** * Read the motifs. 
**********************************************/ char* meme_filename = argv[option_index]; option_index++; int num_motifs = 0; MREAD_T *mread; ALPH_T alph; ARRAYLST_T *motifs; ARRAY_T *bg_freqs; mread = mread_create(meme_filename, OPEN_MFILE); mread_set_bg_source(mread, bg_filename); mread_set_pseudocount(mread, pseudocount); // read motifs motifs = mread_load(mread, NULL); alph = mread_get_alphabet(mread); bg_freqs = mread_get_background(mread); // check if (arraylst_size(motifs) == 0) die("No motifs in %s.", meme_filename); // TLB; need to resize bg_freqs array to ALPH_SIZE items // or copy array breaks in HB mode. This throws away // the freqs for the ambiguous characters; int asize = alph_size(alph, ALPH_SIZE); resize_array(bg_freqs, asize); /************************************************************** * Compute probability distributions for each of the selected motifs. **************************************************************/ int motif_index; for (motif_index = 0; motif_index < arraylst_size(motifs); motif_index++) { MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs); char* motif_id = get_motif_id(motif); char* bare_motif_id = motif_id; // We may have specified on the command line that // only certain motifs were to be used. if (selected_motifs != NULL) { if (*bare_motif_id == '+' || *bare_motif_id == '-') { // The selected motif id won't included a strand indicator. bare_motif_id++; } if (have_string(bare_motif_id, selected_motifs) == FALSE) { continue; } } if (verbosity >= NORMAL_VERBOSE) { fprintf( stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif) ); } // Build an array of evolutionary models for each position in the motif. EVOMODEL_T** models = make_motif_models( motif, bg_freqs, model_type, fg_rate, bg_rate, purine_pyrimidine, transition_transversion, use_halpern_bruno ); // Get the frequencies under the background model (row 0) // and position-dependent scores (rows 1..w) // for each possible alignment column. 
MATRIX_T* pssm_matrix = build_alignment_pssm_matrix( alph, alignment_species, get_motif_length(motif) + 1, models, tree, gap_support ); ARRAY_T* alignment_col_freqs = allocate_array(get_num_cols(pssm_matrix)); copy_array(get_matrix_row(0, pssm_matrix), alignment_col_freqs); remove_matrix_row(0, pssm_matrix); // throw away first row //print_col_frequencies(alph, alignment_col_freqs); // // Get the position-dependent null model alignment column frequencies // int w = get_motif_length(motif); int ncols = get_num_cols(pssm_matrix); MATRIX_T* pos_dep_bkg = allocate_matrix(w, ncols); for (i=0; i<w; i++) { // get the evo model corresponding to this column of the motif // and store it as the first evolutionary model. myfree(models[0]); // Use motif PSFM for equilibrium freqs. for model. ARRAY_T* site_specific_freqs = allocate_array(asize); int j = 0; for(j = 0; j < asize; j++) { double value = get_matrix_cell(i, j, get_motif_freqs(motif)); set_array_item(j, value, site_specific_freqs); } if (use_halpern_bruno == FALSE) { models[0] = make_model( model_type, fg_rate, transition_transversion, purine_pyrimidine, site_specific_freqs, NULL ); } else { models[0] = make_model( model_type, fg_rate, transition_transversion, purine_pyrimidine, bg_freqs, site_specific_freqs ); } // get the alignment column frequencies using this model MATRIX_T* tmp_pssm_matrix = build_alignment_pssm_matrix( alph, alignment_species, 2, // only interested in freqs under bkg models, tree, gap_support ); // assemble the position-dependent background alignment column freqs. set_matrix_row(i, get_matrix_row(0, tmp_pssm_matrix), pos_dep_bkg); // chuck the pssm (not his real name) free_matrix(tmp_pssm_matrix); } // // Compute and print the score distribution under the background model // and under the (position-dependent) motif model. 
// int range = 10000; // 10^4 gives same result as 10^5, but 10^3 differs // under background model PSSM_T* pssm = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range); // under position-dependent background (motif) model PSSM_T* pssm_pos_dep = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range); get_pv_lookup_pos_dep( pssm_pos_dep, pos_dep_bkg, NULL // no priors used ); // print FP and FN distributions int num_items = get_pssm_pv_length(pssm_pos_dep); for (i=0; i<num_items; i++) { double pvf = get_pssm_pv(i, pssm); double pvt = get_pssm_pv(i, pssm_pos_dep); double fpr = pvf; double fnr = 1 - pvt; if (fpr >= 0.99999 || fnr == 0) continue; printf("%s score %d FPR %.3g FNR %.3g\n", motif_id, i, fpr, fnr); } // free stuff free_pssm(pssm); free_pssm(pssm_pos_dep); if (models != NULL) { int model_index; int num_models = get_motif_length(motif) + 1; for (model_index = 0; model_index < num_models; model_index++) { free_model(models[model_index]); } myfree(models); } } // motif arraylst_destroy(destroy_motif, motifs); /********************************************** * Clean up. **********************************************/ // TLB may have encountered a memory corruption bug here // CEG has not been able to reproduce it. valgrind says all is well. free_array(bg_freqs); free_tree(TRUE, tree); free_string_list(selected_motifs); return(0); } // main
void generate_ceq_logos(char *meme_path, char *output_dir) { int i, dir_len, prefix_len, path_len; ARRAY_T *background; BOOLEAN_T has_reverse_strand; char *path, *alphabet; double logo_height, logo_width; ARRAYLST_T *motifs; MOTIF_T *motif; motifs = arraylst_create(); logo_height = LOGOHEIGHT; //make the path dir_len = strlen(output_dir); prefix_len = strlen(LOGO_PREFIX); path_len = dir_len + 1 + prefix_len + MAX_MOTIF_ID_LENGTH + 1; path = malloc(sizeof(char)*path_len); strncpy(path, output_dir, path_len); if (path[dir_len-1] != '/') { path[dir_len] = '/'; path[++dir_len] = '\0'; } strncpy(path+dir_len, LOGO_PREFIX, path_len - dir_len); // Read all motifs into an array. read_meme_file2(meme_path, NULL, // bg file name DEFAULT_PSEUDOCOUNTS, REQUIRE_PSPM, motifs, NULL,//motif occurrences &has_reverse_strand, &background); // global alphabet is set by read_meme_file alphabet = get_alphabet(FALSE); if (create_output_directory(output_dir, TRUE, (verbosity >= NORMAL_VERBOSE))) { // Failed to create output directory. exit(1); } for(i = 0; i < arraylst_size(motifs); i++) { motif = (MOTIF_T*)arraylst_get(i, motifs); logo_width = get_motif_length(motif); if (logo_width > MAXLOGOWIDTH) logo_width = MAXLOGOWIDTH; copy_and_sanatise_name(path+(dir_len+prefix_len), get_motif_id(motif), path_len - (dir_len + prefix_len)); CL_create2( motif, // motif "", // no title NULL, // no second motif "", // no x-axis label FALSE, // no error bars FALSE, // ssc logo_height, // logo height (cm) logo_width, // logo width (cm) alphabet, // alphabet 0, // no offset to second motif path, // output file path "MEME (no SSC)" // program name ); } free_motifs(motifs); free_array(background); // not used free(path); }