/*************************************************************************
 * Converts the motif frequency matrix into an odds matrix, in place:
 * every cell of the motif's frequency matrix is divided by the background
 * frequency of the letter in that column.
 * (taken from old ama-scan.c)
 *
 *   motif    - motif whose frequency matrix is overwritten
 *   bg_freqs - background frequencies, indexed by alphabet index
 *************************************************************************/
void convert_to_odds_matrix(MOTIF_T* motif, ARRAY_T* bg_freqs){
  const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  MATRIX_T *freqs = get_motif_freqs(motif);
  const int num_rows = get_num_rows(freqs);
  int letter;

  // Walk the matrix column by column so that each background value is
  // fetched only once.
  for (letter = 0; letter < asize; ++letter) {
    const double bg = get_array_item(letter, bg_freqs);
    int row = 0;
    while (row < num_rows) {
      freqs->rows[row]->items[letter] /= bg;
      ++row;
    }
  }
}
/*************************************************************************
 * Copies the motif frequency matrix and converts the copy into an odds
 * matrix by dividing every cell by the background frequency of the letter
 * in that column. The motif itself is left untouched.
 *
 *   motif    - motif supplying the frequency matrix
 *   bg_freqs - background frequencies, indexed by alphabet index
 *
 * Returns the matrix produced by duplicate_matrix(), now holding odds.
 *************************************************************************/
MATRIX_T* create_odds_matrix(MOTIF_T *motif, ARRAY_T* bg_freqs){
  const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  MATRIX_T *result = duplicate_matrix(get_motif_freqs(motif));
  const int nrows = get_num_rows(result);
  int col;

  for (col = 0; col < asize; ++col) {
    const double bg = get_array_item(col, bg_freqs);
    int row = 0;
    while (row < nrows) {
      result->rows[row]->items[col] /= bg;
      ++row;
    }
  }
  return result;
}
/*************************************************************************
 * Output JSON data for a motif
 *
 * Emits one JSON object containing, in this order: the database id, the
 * motif id, the alternate id (only when non-empty), the motif length,
 * E-value and site count, the URL (only when non-empty), the frequency
 * matrix as "pwm", the statistics stored in stats, and finally every
 * second entry of counts->sites between index (len - 1) and
 * counts->allocated - (len - 1).
 *************************************************************************/
static void output_motif_json(JSONWR_T* json, MOTIF_STATS_T* stats,
    SITE_COUNTS_T* counts) {
  MOTIF_T *motif = stats->motif;
  MATRIX_T *freqs = get_motif_freqs(motif);
  int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  int mlen, row, col, pos, stop;

  jsonwr_start_object_value(json);

  // identification
  jsonwr_lng_prop(json, "db", stats->db->id);
  jsonwr_str_prop(json, "id", get_motif_id(motif));
  if (*(get_motif_id2(motif)) != '\0') {
    jsonwr_str_prop(json, "alt", get_motif_id2(motif));
  }

  // general motif properties
  mlen = get_motif_length(motif);
  jsonwr_lng_prop(json, "len", mlen);
  jsonwr_dbl_prop(json, "motif_evalue", get_motif_evalue(motif));
  jsonwr_dbl_prop(json, "motif_nsites", get_motif_nsites(motif));
  if (get_motif_url(motif) != NULL && *get_motif_url(motif) != '\0') {
    jsonwr_str_prop(json, "url", get_motif_url(motif));
  }

  // frequency matrix: one inner array per motif position
  jsonwr_property(json, "pwm");
  jsonwr_start_array_value(json);
  row = 0;
  while (row < mlen) {
    jsonwr_start_array_value(json);
    for (col = 0; col < asize; col++) {
      jsonwr_dbl_value(json, get_matrix_cell(row, col, freqs));
    }
    jsonwr_end_array_value(json);
    row++;
  }
  jsonwr_end_array_value(json);

  // statistics
  jsonwr_lng_prop(json, "bin_width", stats->central_window+1);
  jsonwr_dbl_prop(json, "bin_sites", stats->central_sites);
  jsonwr_lng_prop(json, "total_sites", counts->total_sites);
  jsonwr_dbl_prop(json, "log_pvalue", stats->log_adj_pvalue);
  jsonwr_dbl_prop(json, "max_prob", stats->max_prob);

  // site counts: skip (mlen - 1) entries at both ends, step by two
  jsonwr_property(json, "sites");
  jsonwr_start_array_value(json);
  stop = counts->allocated - (mlen - 1);
  pos = mlen - 1;
  while (pos < stop) {
    jsonwr_dbl_value(json, counts->sites[pos]);
    pos += 2;
  }
  jsonwr_end_array_value(json);

  jsonwr_end_object_value(json);
}
/************************************************************************* * Calculate the odds score for each motif-sized window at each * site in the sequence using the given nucleotide frequencies. * * This function is a lightweight version based on the one contained in * motiph-scoring. Several calculations that are unnecessary for gomo * have been removed in order to speed up the process *************************************************************************/ static double score_sequence( SEQ_T *seq, // sequence to scan (IN) MOTIF_T *motif, // motif already converted to odds values (IN) PSSM_T *m_pssm, // motif pssm (IN) MATRIX_T *m_odds, // motif odds (IN) int method, // method used for scoring (IN) double threshold, // Threshold to use in TOTAL_HITS mode with a PWM ARRAY_T *bg_freqs //background model ) { assert(seq != NULL); assert(motif != NULL); assert((method == TOTAL_HITS && m_pssm) || (method != TOTAL_HITS && m_odds)); char* raw_seq = get_raw_sequence(seq); int seq_length = get_seq_length(seq); // Get the pv lookup table ARRAY_T* pv_lookup = NULL; if (NULL != m_pssm) { pv_lookup = m_pssm->pv; assert(get_array_length(pv_lookup) > 0); } // Prepare storage for the string representing the portion // of the reference sequence within the window. 
char* window_seq = (char *) mm_malloc(sizeof(char) * (get_motif_length(motif) + 1)); window_seq[get_motif_length(motif)] = '\0'; int max_index = seq_length - get_motif_length(motif); if (max_index < 0) max_index = 0; const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE); double* odds = (double*) mm_malloc(sizeof(double)*max_index); double* scaled_log_odds = (double*) mm_malloc(sizeof(double)*max_index); // For each site in the sequence int seq_index; for (seq_index = 0; seq_index < max_index; seq_index++) { double odd = 1.0; scaled_log_odds[seq_index] = 0; // For each site in the motif window int motif_position; for (motif_position = 0; motif_position < get_motif_length(motif); motif_position++) { char c = raw_seq[seq_index + motif_position]; window_seq[motif_position] = c; // Check for gaps at this site if(c == '-' || c == '.') { break; } // Check for ambiguity codes at this site //TODO: This next call is very expensive - it takes up approx. 10% of a // programme's running time. It should be fixed up somehow. int aindex = alph_index(get_motif_alph(motif), c); if (aindex > asize) { break; } if (method == TOTAL_HITS) { //If we're in this mode, then we're using LOG ODDS. 
//scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, get_motif_freqs(motif)); scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, m_pssm->matrix); } else { odd *= get_matrix_cell(motif_position, aindex, m_odds); } } odds[seq_index] = odd; } // return odds as requested (MAX or AVG scoring) double requested_odds = 0.0; if (method == AVG_ODDS){ for (seq_index = 0; seq_index < max_index; seq_index++) { requested_odds += odds[seq_index]; } requested_odds /= max_index + 1; // Divide by 0 if max_index==0 } else if (method == MAX_ODDS){ for (seq_index = 0; seq_index < max_index; seq_index++) { if (odds[seq_index] > requested_odds){ requested_odds = odds[seq_index]; } } } else if (method == SUM_ODDS) { for (seq_index = 0; seq_index < max_index; seq_index++) { requested_odds += odds[seq_index]; } } else if (method == TOTAL_HITS) { for (seq_index = 0; seq_index < max_index; seq_index++) { if (scaled_log_odds[seq_index] >= (double)get_array_length(pv_lookup)) { scaled_log_odds[seq_index] = (double)(get_array_length(pv_lookup) - 1); } double pvalue = get_array_item((int) scaled_log_odds[seq_index], pv_lookup); //Figure out how to calculate the p-value of a hit //fprintf(stderr, "m: %s pv_l len: %i scaled_log_odds: %g seq index: %i pvalue: %g\n", // get_motif_id(motif), get_array_length(pv_lookup), scaled_log_odds[seq_index], seq_index, pvalue); if (pvalue < threshold) { requested_odds++; //Add another hit. } if (verbosity > HIGHER_VERBOSE) { fprintf(stderr, "Window Data: %s\t%s\t%i\t%g\t%g\t%g\n", get_seq_name(seq), get_motif_id(motif), seq_index, scaled_log_odds[seq_index], pvalue, threshold); } } } myfree(odds); myfree(scaled_log_odds); myfree(window_seq); return requested_odds; }
/************************************************************************** * Dump sequence matches sorted by the name of the sequence. * * Outputs Columns: * 1) Trimmed lowercase sequence with uppercase matches. * 2) Position of the secondary match within the whole sequence. * 3) Sequence fragment that the primary matched. * 4) Strand of the primary match (+|-) * 5) Sequence fragment that the secondary matched. * 6) Strand of the secondary match (+|-) * 7) Is the primary match on the same strand as the secondary (s|o) * 8) Is the secondary match downstream or upstream (d|u) * 9) The gap between the primary and secondary matches * 10) The name of the sequence * 11) The p-value of the bin containing the match (adjusted for # of bins) * ---if the FASTA input file sequence names are in Genome Browser format: * 12-14) Position of primary match in BED coordinates * 15) Position of primary match in Genome Browser coordinates * 16-18) Position of secondary match in BED coordinates * 19) Position of secondary match in Genome Browser coordinates * * If you wish to sort based on the gap column: * Sort individual output: * sort -n -k 9,9 -o seqs_primary_secondary.txt seqs_primary_secondary.txt * Or sort all outputs: * for f in seqs_*.txt; do sort -n -k 9,9 -o $f $f; done * Or to get just locations of primary motif in BED coordinates * where the secondary is on the opposite strand, upstream with a gap of 118bp: * awk '$7=="o" && $8=="u" && $9==118 {print $12"\t"$13"\t"$14;}' seqs_primary_secondary.txt * **************************************************************************/ static void dump_sequence_matches(FILE *out, int margin, int bin, double sigthresh, BOOLEAN_T sig_only, RBTREE_T *sequences, MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif, ARRAY_T **matches) { RBNODE_T *node; SEQUENCE_T *sequence; int idx, seqlen, i, j, start, end, secondary, secondary_pos, primary_len, secondary_len, distance; BOOLEAN_T primary_rc, secondary_rc, downstream; char *buffer, 
*seq, *primary_match, *secondary_match; ARRAY_T *secondary_array; ALPH_T *alph; // get the alphabet alph = get_motif_alph(primary_motif); // allocate a buffer for copying the trimmed sequence into and modify it seqlen = margin * 2 + get_motif_trimmed_length(primary_motif); buffer = (char*)mm_malloc(sizeof(char) * (seqlen + 1)); // get the lengths of the motifs primary_len = get_motif_trimmed_length(primary_motif); secondary_len = get_motif_trimmed_length(secondary_motif->motif); // allocate some strings for storing the matches primary_match = (char*)mm_malloc(sizeof(char) * (primary_len + 1)); secondary_match = (char*)mm_malloc(sizeof(char) * (secondary_len + 1)); // add null byte at the end of the match strings primary_match[primary_len] = '\0'; secondary_match[secondary_len] = '\0'; // iterate over all the sequences for (node = rbtree_first(sequences); node != NULL; node = rbtree_next(node)) { sequence = (SEQUENCE_T*)rbtree_value(node); primary_rc = get_array_item(0, sequence->primary_matches) < 0; //secondary = matches[sequence->index]; secondary_array = matches[sequence->index]; if (! secondary_array) continue; int n_secondary_matches = get_array_length(secondary_array); for (idx=0; idx<n_secondary_matches; idx++) { secondary = get_array_item(idx, secondary_array); secondary_rc = secondary < 0; secondary_pos = abs(secondary); // calculate the distance if (secondary_pos <= margin) { distance = margin - secondary_pos - secondary_len + 1; downstream = primary_rc; } else { distance = secondary_pos - margin - primary_len - 1; downstream = !primary_rc; } // copy the trimmed sequence seq = sequence->data; for (i = 0; i < seqlen; ++i) { buffer[i] = (alph_is_case_insensitive(alph) ? tolower(seq[i]) : seq[i]); } buffer[seqlen] = '\0'; // uppercase primary start = margin; end = margin + primary_len; for (i = start, j = 0; i < end; ++i, ++j) { buffer[i] = (alph_is_case_insensitive(alph) ? 
toupper(buffer[i]) : buffer[i]); primary_match[j] = buffer[i]; } // uppercase secondary // note orign was one, subtract 1 to make origin zero as required for arrays start = secondary_pos -1; end = start + secondary_len; for (i = start, j = 0; i < end; ++i, ++j) { buffer[i] = (alph_is_case_insensitive(alph) ? toupper(buffer[i]) : buffer[i]); secondary_match[j] = buffer[i]; } // get the p-value of the seconndary match SPACING_T *spacings; if (secondary_rc == primary_rc) { spacings = downstream ? secondary_motif->spacings+(SAME+RIGHT) : secondary_motif->spacings+(SAME+LEFT); } else { spacings = downstream ? secondary_motif->spacings+(OPPO+RIGHT) : secondary_motif->spacings+(OPPO+LEFT); } double p_value = spacings->pvalue[distance/bin]; // skip match if not significant and only reporting significant matches if (sig_only && (p_value > sigthresh)) continue; // output line to file fprintf(out, "%s %3d %s %s %s %s %s %s %3d %s %.1e", buffer, secondary_pos, primary_match, (primary_rc ? "-" : "+"), secondary_match, (secondary_rc ? "-" : "+"), (secondary_rc == primary_rc ? "s" : "o"), (downstream ? "d" : "u"), distance, sequence->name, p_value ); // Parse the sequence name to see if we can get genomic coordinates // and print additional columns with primary and secondary matches // in both BED and Genome Browser coordinates. char *chr_name; size_t chr_name_len; int start_pos, end_pos; if (parse_genomic_coordinates_helper( sequence->name, &chr_name, &chr_name_len, &start_pos, &end_pos)) { // Get the start and end of the primary match in // 0-relative, half-open genomic coordinates. int p_start = start_pos + fabs(get_array_item(0, sequence->primary_matches)) - 1; int p_end = p_start + primary_len; // Get the start and end of the secondary match in // 0-relative, half-open genomic coordinates. 
int s_start, s_end; if ( (!primary_rc && downstream) || (primary_rc && !downstream) ) { s_start = p_end + distance; s_end = s_start + secondary_len; } else { s_end = p_start - distance; s_start = s_end - secondary_len; } fprintf(out, " %s %d %d %s:%d-%d", chr_name, p_start, p_end, chr_name, p_start+1, p_end); fprintf(out, " %s %d %d %s:%d-%d\n", chr_name, s_start, s_end, chr_name, s_start+1, s_end); } else { fprintf(out, "\n"); } } // secondary match } // primary match free(buffer); free(primary_match); free(secondary_match); }
/************************************************************************* * Entry point for ama *************************************************************************/ int main(int argc, char **argv) { AMA_OPTIONS_T options; ARRAYLST_T *motifs; clock_t c0, c1; // measuring cpu_time MOTIF_AND_PSSM_T *combo; CISML_T *cisml; PATTERN_T** patterns; PATTERN_T *pattern; FILE *fasta_file, *text_output, *cisml_output; int i, seq_loading_num, seq_counter, unique_seqs, seq_len, scan_len, x1, x2, y1, y2; char *seq_name, *path; bool need_postprocessing, created; SEQ_T *sequence; RBTREE_T *seq_ids; RBNODE_T *seq_node; double *logcumback; ALPH_T *alph; // process the command process_command_line(argc, argv, &options); // load DNA motifs motifs = load_motifs(&options); // get the alphabet if (arraylst_size(motifs) > 0) { combo = (MOTIF_AND_PSSM_T*)arraylst_get(0, motifs); alph = alph_hold(get_motif_alph(combo->motif)); } else { alph = alph_dna(); } // pick columns for GC operations x1 = -1; x2 = -1; y1 = -1; y2 = -1; if (alph_size_core(alph) == 4 && alph_size_pairs(alph) == 2) { x1 = 0; // A x2 = alph_complement(alph, x1); // T y1 = (x2 == 1 ? 2 : 1); // C y2 = alph_complement(alph, y1); // G assert(x1 != x2 && y1 != y2 && x1 != y1 && x2 != y2 && x1 != y2 && x2 != y1); } // record starting time c0 = clock(); // Create cisml data structure for recording results cisml = allocate_cisml(PROGRAM_NAME, options.command_line, options.motif_filename, options.fasta_filename); set_cisml_background_file(cisml, options.bg_filename); // make a CISML pattern to hold scores for each motif for (i = 0; i < arraylst_size(motifs); i++) { combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs); add_cisml_pattern(cisml, allocate_pattern(get_motif_id(combo->motif), "")); } // Open the FASTA file for reading. 
fasta_file = NULL; if (!open_file(options.fasta_filename, "r", false, "FASTA", "sequences", &fasta_file)) { die("Couldn't open the file %s.\n", options.fasta_filename); } if (verbosity >= NORMAL_VERBOSE) { if (options.last == 0) { fprintf(stderr, "Using entire sequence\n"); } else { fprintf(stderr, "Limiting sequence to last %d positions.\n", options.last); } } // // Read in all sequences and score with all motifs // seq_loading_num = 0; // keeps track on the number of sequences read in total seq_counter = 0; // holds the index to the seq in the pattern unique_seqs = 0; // keeps track on the number of unique sequences need_postprocessing = false; sequence = NULL; logcumback = NULL; seq_ids = rbtree_create(rbtree_strcasecmp,rbtree_strcpy,free,rbtree_intcpy,free); while (read_one_fasta(alph, fasta_file, options.max_seq_length, &sequence)) { ++seq_loading_num; seq_name = get_seq_name(sequence); seq_len = get_seq_length(sequence); scan_len = (options.last != 0 ? options.last : seq_len); // red-black trees are only required if duplicates should be combined if (options.combine_duplicates){ //lookup seq id and create new entry if required, return sequence index seq_node = rbtree_lookup(seq_ids, get_seq_name(sequence), true, &created); if (created) { // assign it a loading number rbtree_set(seq_ids, seq_node, &unique_seqs); seq_counter = unique_seqs; ++unique_seqs; } else { seq_counter = *((int*)rbnode_get(seq_node)); } } // // Set up sequence-dependent background model and compute // log cumulative probability of sequence. // This needs the sequence in raw format. // if (options.sdbg_order >= 0) logcumback = log_cumulative_background(alph, options.sdbg_order, sequence); // Index the sequence, throwing away the raw format and ambiguous characters index_sequence(sequence, alph, SEQ_NOAMBIG); // Get the GC content of the sequence if binning p-values by GC // and store it in the sequence object. 
if (options.num_gc_bins > 1) { ARRAY_T *freqs = get_sequence_freqs(sequence, alph); set_total_gc_sequence(sequence, get_array_item(y1, freqs) + get_array_item(y2, freqs)); // f(C) + f(G) free_array(freqs); // clean up } else { set_total_gc_sequence(sequence, -1); // flag ignore } // Scan with motifs. for (i = 0; i < arraylst_size(motifs); i++) { pattern = get_cisml_patterns(cisml)[i]; combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs); if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scanning %s sequence with length %d " "abbreviated to %d with motif %s with length %d.\n", seq_name, seq_len, scan_len, get_motif_id(combo->motif), get_motif_length(combo->motif)); } SCANNED_SEQUENCE_T* scanned_seq = NULL; if (!options.combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter) { // Create a scanned_sequence record and save it in the pattern. scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern); set_scanned_sequence_length(scanned_seq, scan_len); } else { // get existing sequence record scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter]; set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq))); } // check if scanned component of sequence has sufficient length for the motif if (scan_len < get_motif_length(combo->motif)) { // set score to zero and p-value to 1 if not set yet if(!has_scanned_sequence_score(scanned_seq)){ set_scanned_sequence_score(scanned_seq, 0.0); } if(options.pvalues && !has_scanned_sequence_pvalue(scanned_seq)){ set_scanned_sequence_pvalue(scanned_seq, 1.0); } add_scanned_sequence_scanned_position(scanned_seq); if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) { need_postprocessing = true; } if (verbosity >= HIGH_VERBOSE) { fprintf(stderr, "%s too short for motif %s. 
Score set to 0.\n", seq_name, get_motif_id(combo->motif)); } } else { // scan the sequence using average/maximum motif affinity ama_sequence_scan(alph, sequence, logcumback, combo->pssm_pair, options.scoring, options.pvalues, options.last, scanned_seq, &need_postprocessing); } } // All motifs scanned free_seq(sequence); if (options.sdbg_order >= 0) myfree(logcumback); } // read sequences fclose(fasta_file); if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "(%d) sequences read in.\n", seq_loading_num); if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Finished \n"); // if any sequence identifier was multiple times in the sequence set then // postprocess of the data is required if (need_postprocessing || options.normalize_scores) { post_process(cisml, motifs, options.normalize_scores); } // output results if (options.output_format == DIRECTORY_FORMAT) { if (create_output_directory(options.out_dir, options.clobber, verbosity > QUIET_VERBOSE)) { // only warn in higher verbose modes fprintf(stderr, "failed to create output directory `%s' or already exists\n", options.out_dir); exit(1); } path = make_path_to_file(options.out_dir, text_filename); //FIXME check for errors: MEME doesn't either and we at least know we have a good directory text_output = fopen(path, "w"); free(path); path = make_path_to_file(options.out_dir, cisml_filename); //FIXME check for errors cisml_output = fopen(path, "w"); free(path); print_cisml(cisml_output, cisml, true, NULL, false); print_score(cisml, text_output); fclose(cisml_output); fclose(text_output); } else if (options.output_format == GFF_FORMAT) { print_score(cisml, stdout); } else if (options.output_format == CISML_FORMAT) { print_cisml(stdout, cisml, true, NULL, false); } else { die("Output format invalid!\n"); } // // Clean up. 
// rbtree_destroy(seq_ids); arraylst_destroy(motif_and_pssm_destroy, motifs); free_cisml(cisml); rbtree_destroy(options.selected_motifs); alph_release(alph); // measure time if (verbosity >= NORMAL_VERBOSE) { // starting time c1 = clock(); fprintf(stderr, "cycles (CPU); %ld cycles\n", (long) c1); fprintf(stderr, "elapsed CPU time: %f seconds\n", (float) (c1-c0) / CLOCKS_PER_SEC); } return 0; }
/************************************************************************* * Build a completely connected HMM. *************************************************************************/ void build_complete_hmm (ARRAY_T* background, int spacer_states, MOTIF_T *motifs, int nmotifs, MATRIX_T *transp_freq, MATRIX_T *spacer_ave, BOOLEAN_T fim, MHMM_T **the_hmm) { ALPH_T alph; int motif_states; // Total length of the motifs. int num_spacers; // Total number of spacer states. int num_states; // Total number of states in the model. int i_motif; // Index of the current "from" motif. int j_motif; // Index of the current "to" motif. int i_position; // Index within the current motif or spacer. int i_state = 0; // Index of the current state. assert(nmotifs > 0); alph = get_motif_alph(motifs);// get the alphabet from the first motif // Count the width of the motifs. for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++) motif_states += get_motif_length(motif_at(motifs, i_motif)); // Count the spacer states adjacent to begin and end. num_spacers = nmotifs * 2; // Add the spacer states between motifs. num_spacers += nmotifs * nmotifs; // Total states = motifs + spacer_states + begin/end num_states = motif_states + (num_spacers * spacer_states) + 2; // Allocate the model. *the_hmm = allocate_mhmm(alph, num_states); // Record that this is a completely connected model. (*the_hmm)->type = COMPLETE_HMM; // Record the number of motifs in the model. (*the_hmm)->num_motifs = nmotifs; // Record the number of states in the model. (*the_hmm)->num_states = num_states; (*the_hmm)->num_spacers = ((nmotifs + 1) * (nmotifs + 1)) - 1; (*the_hmm)->spacer_states = spacer_states; // Put the background distribution into the model. copy_array(background, (*the_hmm)->background); // Build the begin state. build_complete_state( START_STATE, i_state, alph, 0, // expected length NULL, // Emissions. 0, // Number of sites. 
NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, 0, // previous motif 0, // next motif transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); i_state++; int from_motif_state, to_motif_state; // Build the spacer states. No transitions from the end state. for (i_motif = 0; i_motif <= nmotifs; i_motif++) { // No transitions to the start state. for (j_motif = 1; j_motif <= nmotifs+1; j_motif++) { // No transitions from start to end. if ((i_motif == 0) && (j_motif == nmotifs+1)) continue; // Allow multi-state spacers. for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { build_complete_state( SPACER_STATE, i_state, alph, get_matrix_cell(i_motif, j_motif, spacer_ave), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, nmotifs, i_motif, j_motif, transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); } } } // Build the motif states. for (i_motif = 0; i_motif < nmotifs; i_motif++) { MOTIF_T *this_motif = motif_at(motifs, i_motif); STATE_T state; for (i_position = 0; i_position < get_motif_length(this_motif); i_position++, i_state++) { if (i_position == 0) { state = START_MOTIF_STATE; } else if (i_position == (get_motif_length(this_motif) - 1)) { state = END_MOTIF_STATE; } else { state = MID_MOTIF_STATE; } build_complete_state( MID_MOTIF_STATE, i_state, alph, 0, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(this_motif)), get_motif_nsites(this_motif), i_motif, i_position, nmotifs, 0, // Previous motif index. 0, // Next motif index. transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); } } // Build the end state. build_complete_state( END_STATE, i_state, alph, 0, // Expected spacer length. NULL, // Emissions 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, 0, // Previous motif index. 0, // Next motif index. 
transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); i_state++; // Convert spacers to FIMs if requested. if (fim) { convert_to_fims(*the_hmm); } // Fill in the transition matrix. build_transition_matrix(*the_hmm); }
/************************************************************************* * Build a linear HMM. *************************************************************************/ void build_linear_hmm (ARRAY_T* background, ORDER_T* order_spacing, int spacer_states, RBTREE_T* motifs, // motifs with key as in order_spacing BOOLEAN_T fim, MHMM_T** the_hmm) { ALPH_T alph; int model_length; // Total number of states in the model. int i_state; // Index of the current state. int i_order; // Index within the order and spacing. int i_position; // Index within the current motif or spacer. int motif_i; // motif key in order spacing MOTIF_T *motif; // motif RBNODE_T *node; alph = get_motif_alph((MOTIF_T*)rbtree_value(rbtree_first(motifs))); // Calculate the total length of the model. model_length = 2; // start and end state for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) { motif_i = get_order_motif(order_spacing, i_order); motif = (MOTIF_T*)rbtree_get(motifs, &motif_i); model_length += get_motif_length(motif); } model_length += (get_order_occurs(order_spacing) + 1) * spacer_states; // Allocate the model. *the_hmm = allocate_mhmm(alph, model_length); check_sq_matrix((*the_hmm)->trans, model_length); // Record that this is a linear model. (*the_hmm)->type = LINEAR_HMM; // Record the number of motifs in the model. // It doesn't want the distinct count (*the_hmm)->num_motifs = get_order_occurs(order_spacing); // Record the number of states in the model. (*the_hmm)->num_states = model_length; (*the_hmm)->num_spacers = get_order_occurs(order_spacing) + 1; (*the_hmm)->spacer_states = spacer_states; // Put the background distribution into the model. copy_array(background, (*the_hmm)->background); // Begin the model with a non-emitting state. i_state = 0; check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, START_STATE, i_state, get_spacer_length(order_spacing, 0), NULL, // Emissions. 0, // Number of sites. 
NON_MOTIF_INDEX, NON_MOTIF_POSITION, // position within state (not relevant to start state) NULL, // no motif &((*the_hmm)->states[i_state])); ++i_state; // Build the first spacer. for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, SPACER_STATE, i_state, get_spacer_length(order_spacing, 0), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, // position within spacer NULL, // no motif &((*the_hmm)->states[i_state])); } // Build each motif and subsequent spacer. for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) { STATE_T state; int spacer_len; motif_i = get_order_motif(order_spacing, i_order); motif = (MOTIF_T*)rbtree_get(motifs, &motif_i); // Build the motif. for (i_position = 0; i_position < get_motif_length(motif); i_position++, i_state++) { if (i_position == 0) { state = START_MOTIF_STATE; spacer_len = get_spacer_length(order_spacing, i_order); } else if (i_position == (get_motif_length(motif) - 1)) { state = END_MOTIF_STATE; spacer_len = get_spacer_length(order_spacing, i_order+1); } else { state = MID_MOTIF_STATE; spacer_len = 0; } check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, state, i_state, spacer_len, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(motif)), get_motif_nsites(motif), i_order, i_position, // position within motif (middle) motif, &((*the_hmm)->states[i_state])); } // Build the following spacer. 
for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, SPACER_STATE, i_state, get_spacer_length(order_spacing, i_order+1), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, // position within spacer NULL, // no motif &((*the_hmm)->states[i_state])); } } check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); // Finish up the model with a non-emitting end state. build_linear_state( alph, END_STATE, i_state, get_spacer_length(order_spacing, i_order), NULL, // Emissions. 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, // position within state (not relevant to end state) NULL, // no motif &((*the_hmm)->states[i_state])); ++i_state; assert(i_state == model_length); check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); // Convert spacers to FIMs if requested. if (fim) { convert_to_fims(*the_hmm); } // Fill in the transition matrix. build_transition_matrix(*the_hmm); }
/*************************************************************************
 * Build a star topology HMM.
 *
 * State layout, in order: a start state, a single spacer made of
 * spacer_states states, the states of each motif, and an end state.
 *
 *   background    - background distribution, copied into the model
 *   spacer_states - number of states in the spacer
 *   motifs        - the motifs; the alphabet is taken from the first one
 *   nmotifs       - number of motifs
 *   fim           - if true, spacers are converted to FIMs
 *   the_hmm       - (OUT) receives the newly allocated model
 *************************************************************************/
void build_star_hmm
  (ARRAY_T*  background,
   int       spacer_states,
   MOTIF_T*  motifs,
   int       nmotifs,
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  ALPH_T alph;
  MHMM_T *hmm;          /* The model under construction. */
  int total_motif_len;  /* Total length of the motifs. */
  int spacer_count;     /* Number of spacers in the model. */
  int state_total;      /* Total number of states in the model. */
  int m;                /* Index of the current motif. */
  int pos;              /* Index within the current motif or spacer. */
  int next_state = 0;   /* Index of the state being built. */

  alph = get_motif_alph(motif_at(motifs, 0));

  /* Count the width of the motifs. */
  total_motif_len = 0;
  for (m = 0; m < nmotifs; m++) {
    total_motif_len += get_motif_length(motif_at(motifs, m));
  }

  /* A star model has exactly one spacer. */
  spacer_count = 1;
  /* Total states = motifs + spacer_states + begin/end */
  state_total = total_motif_len + (spacer_count * spacer_states) + 2;

  /* Allocate the model and fill in its header fields. */
  hmm = allocate_mhmm(alph, state_total);
  *the_hmm = hmm;
  hmm->type = STAR_HMM;
  hmm->num_motifs = nmotifs;
  hmm->num_states = state_total;
  hmm->num_spacers = 1;
  hmm->spacer_states = spacer_states;

  /* Put the background distribution into the model. */
  copy_array(background, hmm->background);

  /* Build the begin state. */
  build_star_state(
      alph,
      START_STATE,
      next_state,
      0, // expected length
      NULL,
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next_state]));
  next_state++;

  /* Build the spacer (spacer 0). Allow multi-state spacers. */
  for (pos = 0; pos < spacer_states; pos++, next_state++) {
    build_star_state(
        alph,
        SPACER_STATE,
        next_state,
        DEFAULT_SPACER_LENGTH,
        background,
        SPACER_NUMSITES,
        NON_MOTIF_INDEX,
        pos,
        nmotifs,
        spacer_states,
        motifs,
        &(hmm->states[next_state]));
  }

  /* Build the motif states: first column, middle columns, last column. */
  for (m = 0; m < nmotifs; m++) {
    MOTIF_T *this_motif = motif_at(motifs, m);
    assert(get_motif_length(this_motif) > 1);

    pos = 0;
    build_star_state(
        alph,
        START_MOTIF_STATE,
        next_state,
        0, // Expected spacer length.
        get_matrix_row(pos, get_motif_freqs(this_motif)),
        get_motif_nsites(this_motif),
        m,
        pos,
        nmotifs,
        spacer_states,
        motifs,
        &(hmm->states[next_state]));
    next_state++;

    for (pos = 1; pos < get_motif_length(this_motif) - 1; pos++, next_state++) {
      build_star_state(
          alph,
          MID_MOTIF_STATE,
          next_state,
          0, // Expected spacer length.
          get_matrix_row(pos, get_motif_freqs(this_motif)),
          get_motif_nsites(this_motif),
          m,
          pos,
          nmotifs,
          spacer_states,
          motifs,
          &(hmm->states[next_state]));
    }

    /* pos is left at the loop's exit value, which is used as the index
       of the motif's final column. */
    build_star_state(
        alph,
        END_MOTIF_STATE,
        next_state,
        0, // Expected spacer length.
        get_matrix_row(pos, get_motif_freqs(this_motif)),
        get_motif_nsites(this_motif),
        m,
        pos,
        nmotifs,
        spacer_states,
        motifs,
        &(hmm->states[next_state]));
    next_state++;
  }

  /* Build the end state. */
  build_star_state(
      alph,
      END_STATE,
      next_state,
      0,    // Expected spacer length.
      NULL, // Emissions
      0,    // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      spacer_states,
      motifs,
      &(hmm->states[next_state]));
  next_state++;

  /* Convert spacers to FIMs if requested. */
  if (fim) {
    convert_to_fims(hmm);
  }

  /* Fill in the transition matrix. */
  build_transition_matrix(hmm);
} // build_star_hmm