/************************************************************************** * Callback invoked when matching an opening pattern tag for a CISML file * of a secondary motif database. It checks that the motif should be scored, * clears out the list of sequence matches and stores the current motif. **************************************************************************/ void motif_secondary(void *ctx, char *accession, char *name, char *db, char *lsId, double *pvalue, double *score) { SECONDARY_LOADER_T *loader = (SECONDARY_LOADER_T*)ctx; SECONDARY_KEY_T key; RBNODE_T *node; PSSM_T *pssm; int i, seq_count; key.db_id = loader->db_id; key.motif_id = accession; node = rbtree_lookup(loader->secondary_motifs, &key, FALSE, NULL); if (node != NULL) { loader->secondary_motif = (SECONDARY_MOTIF_T*)rbtree_value(node); if (!(loader->secondary_motif->loaded)) { seq_count = rbtree_size(loader->sequences); for (i = 0; i < seq_count; ++i) loader->secondary_matches[i] = 0; if (loader->score_threshold_or_multiplier < 0 && loader->score_threshold_or_multiplier >= -1) { pssm = build_motif_pssm(loader->secondary_motif->motif, loader->background, loader->background, NULL, 0, PSSM_RANGE, 0, FALSE); loader->calculated_score_threshold = pssm_best_match_score(pssm) * (-loader->score_threshold_or_multiplier); free_pssm(pssm); } } else { die("Already seen CISML data for this motif!"); } } else { loader->secondary_motif = NULL; } }
/*****************************************************************************
 * Reads the frequency attributes of an element into ps->freqs, which is
 * allocated on first use with one slot per entry of ps->alph_ids.
 *
 * Attribute names that are not alphabet identifiers are skipped. A repeated
 * identifier, an unparsable value or a value outside [0, 1] is reported as
 * an attribute parse error, as is every identifier that never appeared.
 * When every identifier was seen and all values were good, the values must
 * sum to 1 within a tolerance of 0.001 per symbol.
 ****************************************************************************/
static void parse_freq_attrs(PS_T *ps, const char* tag, const xmlChar **attrs) {
  const int n_syms = rbtree_size(ps->alph_ids);
  const xmlChar **pair;
  RBNODE_T *entry;
  double total = 0.0;
  bool any_bad = false;
  int n_seen = 0;
  int k;

  // create the frequency array the first time through
  if (ps->freqs == NULL) ps->freqs = mm_malloc(sizeof(double) * n_syms);
  // -1 marks a symbol whose frequency has not been seen yet
  for (k = 0; k < n_syms; k++) ps->freqs[k] = -1;

  // attributes arrive as a NULL terminated list of name, value pairs
  for (pair = attrs; pair[0] != NULL; pair += 2) {
    const char *attr_text;
    char *stop;
    double parsed;
    int *slot = (int*)rbtree_get(ps->alph_ids, pair[0]);

    if (slot == NULL) continue; // not a symbol identifier
    assert(*slot < n_syms);

    if (ps->freqs[*slot] != -1) {
      dreme_attr_parse_error(ps, PARSE_ATTR_DUPLICATE, tag, (const char*)pair[0], NULL);
      continue;
    }
    n_seen++;

    attr_text = (const char*)pair[1];
    errno = 0; // strtod only sets errno on failure
    parsed = strtod(attr_text, &stop);
    // ERANGE is tolerated because frequencies can be extremely close to zero
    if (stop == attr_text || (errno && errno != ERANGE) || parsed < 0 || parsed > 1) {
      dreme_attr_parse_error(ps, PARSE_ATTR_BAD_VALUE, tag, (const char*)pair[0], (const char*)pair[1]);
      ps->freqs[*slot] = 0; // counts as seen even though the value is unusable
      any_bad = true;
      continue;
    }
    ps->freqs[*slot] = parsed;
    total += parsed;
  }

  if (n_seen < n_syms) {
    // report each symbol that never received a frequency
    entry = rbtree_first(ps->alph_ids);
    while (entry != NULL) {
      int *slot = (int*)rbtree_value(entry);
      if (ps->freqs[*slot] == -1) {
        dreme_attr_parse_error(ps, PARSE_ATTR_MISSING, tag, (char*)rbtree_key(entry), NULL);
      }
      entry = rbtree_next(entry);
    }
  } else if (!any_bad) {
    // the probabilities are written to 3 decimal places, so allow each
    // symbol to contribute a rounding error of at most 0.001 to the sum
    double off_by = total - 1;
    if (off_by < 0) off_by = -off_by;
    if (off_by > (0.001 * n_syms)) {
      error(ps, "Probabilities of %s do not sum to 1, got %g .\n", tag, total);
    }
  }
}
/**************************************************************************
 * Tallies the secondary matches into the spacing bins of the secondary
 * motif.
 *
 * matches is indexed by sequence index; zero means no match, otherwise the
 * magnitude is the match position (origin one) and a negative sign means
 * the reverse strand. The gap between the motifs counts from zero. The
 * side (LEFT or RIGHT) is expressed relative to the primary motif, so it
 * is swapped when the primary matched the reverse strand.
 **************************************************************************/
void bin_matches(int margin, int bin_size, RBTREE_T *sequences, MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif, int *matches) {
  const int p_len = get_motif_trimmed_length(primary_motif);
  const int s_len = get_motif_trimmed_length(secondary_motif->motif);
  const int furthest = margin - s_len; // largest gap that still fits in the margin
  RBNODE_T *it;

  for (it = rbtree_first(sequences); it != NULL; it = rbtree_next(it)) {
    SEQUENCE_T *seq = (SEQUENCE_T*)rbtree_value(it);
    int encoded = matches[seq->index];
    int p_rev, s_rev, pos, gap, quadrant;

    // nothing recorded for this sequence
    if (!encoded) continue;

    // unpack strand and position
    p_rev = seq->primary_match < 0;
    s_rev = encoded < 0;
    pos = (s_rev ? -encoded : encoded);

    if (pos <= margin) {
      // secondary lies before the primary in sequence coordinates
      gap = margin - pos - s_len + 1;
      quadrant = (p_rev ? RIGHT : LEFT);
    } else {
      // secondary lies after the primary in sequence coordinates
      gap = pos - margin - p_len - 1;
      quadrant = (p_rev ? LEFT : RIGHT);
    }

    // earlier filtering should make this impossible
    if (gap < 0 || gap > furthest) {
      die("Secondary motif match not within margin as it should be due to prior checks!");
    }

    quadrant |= (s_rev == p_rev ? SAME : OPPO);

    // count the match
    (secondary_motif->spacings + quadrant)->bins[(int)(gap / bin_size)] += 1;
    secondary_motif->total_spacings += 1;
  }
}
/***********************************************************************
 * Copies every motif held in a tree into a newly allocated array, in
 * tree order, and reports the array and its length through the output
 * parameters. Exists for backwards compatibility with code written
 * against the older array based interface.
 ***********************************************************************/
void motif_tree_to_array(RBTREE_T *motif_tree, MOTIF_T **motif_array, int *num) {
  const int total = rbtree_size(motif_tree);
  MOTIF_T *copies = mm_malloc(sizeof(MOTIF_T) * total);
  MOTIF_T *slot = copies;
  RBNODE_T *cur = rbtree_first(motif_tree);

  while (cur != NULL) {
    copy_motif((MOTIF_T*)rbtree_value(cur), slot);
    slot++;
    cur = rbtree_next(cur);
  }

  *motif_array = copies;
  *num = total;
}
/***************************************************************************** * MEME > training_set > /alphabet * Read in the number of symbols in the alphabet and if it is nucleotide or * amino-acid (RNA is apparently classed as nucleotide). ****************************************************************************/ void mxml_end_alphabet(void *ctx) { PARMSG_T *message; CTX_T *data; RBNODE_T *node; char *id, symbol; bool *exists; int i; data = (CTX_T*)ctx; if (data->alph == NULL) { // Custom alphabet alph_reader_done(data->alph_rdr); // report any errors that the alphabet reader found while (alph_reader_has_message(data->alph_rdr)) { message = alph_reader_next_message(data->alph_rdr); if (message->severity == SEVERITY_ERROR) { local_error(data, "Alphabet error: %s.\n", message->message); } else { local_warning(data, "Alphabet warning: %s.\n", message->message); } parmsg_destroy(message); } // try to get an alphabet data->alph = alph_reader_alphabet(data->alph_rdr); alph_reader_destroy(data->alph_rdr); data->alph_rdr = NULL; } else { // legacy alphabet exists = mm_malloc(sizeof(bool) * alph_size_core(data->alph)); // set list to false for (i = 0; i < alph_size_core(data->alph); i++) exists[i] = false; // check that id's were defined for all the core alphabet symbols for (node = rbtree_first(data->letter_lookup); node != NULL; node = rbtree_next(node)) { id = (char*)rbtree_key(node); symbol = ((char*)rbtree_value(node))[0]; if (exists[alph_indexc(data->alph, symbol)]) { // duplicate! local_error(data, "The letter identifier %s is not the first to refer to symbol %c.\n", id, symbol); } exists[alph_indexc(data->alph, symbol)] = true; } // now check for missing identifiers for (i = 0; i < alph_size_core(data->alph); i++) { if (!exists[i]) { // missing id for symbol local_error(data, "The symbol %c does not have an assigned identifier.\n", alph_char(data->alph, i)); } } free(exists); } }
/************************************************************************** * Callback invoked when matching an opening scanned_sequence tag in the * CISML file for the primary motif. Checks if the sequence is one we are * scoring and if so records it as the current sequence as well as clearing * the hits list. **************************************************************************/ void sequence_primary(void *ctx, char *accession, char *name, char *db, char *lsId, double *score, double *pvalue, long *length) { PRIMARY_LOADER_T *loader = (PRIMARY_LOADER_T*)ctx; if (!(loader->in_motif)) { loader->current_sequence = NULL; } else { RBNODE_T *node = rbtree_lookup(loader->sequences, name, FALSE, NULL); if (node) { loader->current_sequence = rbtree_value(node); if (loader->current_sequence->primary_match) die("Already seen this sequence! We can't process this information " "because the scoring information from the previous sighting has already been discarded.\n"); loader->current_score = 0; // reset the current score loader->hit_count = 0; //reset the hit count } else { loader->current_sequence = NULL; } } }
/**************************************************************************
 * Calculates the total number of p-value calculations the program will
 * perform; the result is used for a Bonferroni style multiple testing
 * correction.
 *
 * For each secondary motif the possible spacings in one quadrant are split
 * into bins of the given size (a final partial bin is counted as a bin).
 * At most test_max bins are tested per quadrant and there are 4 quadrants.
 **************************************************************************/
int calculate_test_count(int margin, int bin, int test_max, RBTREE_T *secondary_motifs) {
  int tests = 0;
  RBNODE_T *cur;

  for (cur = rbtree_first(secondary_motifs); cur != NULL; cur = rbtree_next(cur)) {
    SECONDARY_MOTIF_T *entry = (SECONDARY_MOTIF_T*)rbtree_value(cur);
    // how many distinct spacings fit in one quadrant
    int positions = margin - get_motif_trimmed_length(entry->motif) + 1;
    // how many bins those spacings occupy, rounding a partial bin up
    int bins = (int)(positions / bin);
    int tested;
    if (positions % bin) bins += 1;
    // no more than test_max bins are tested in each quadrant
    tested = bins;
    if (test_max < bins) tested = test_max;
    tests += tested * 4;
  }
  return tests;
}
/**************************************************************************
 * Compute the list of sequence ids for the most significant spacing.
 *
 * Every sequence whose secondary match falls in the quadrant and bin of the
 * first entry of secondary_motif->sigs has its index appended to
 * secondary_motif->seqs (growing the array and seq_count). Nothing is done
 * when the motif has no significant spacings.
 *
 * matches is indexed by sequence index; zero means no match, otherwise the
 * magnitude is the match position (origin one) and a negative sign means
 * the reverse strand.
 *
 * The distance and quadrant are derived exactly as in bin_matches so that
 * they select the same bin that was counted there: the distance counts from
 * zero on both sides and the side is expressed relative to the orientation
 * of the primary motif.
 **************************************************************************/
void compute_idset(int margin, int bin_size, RBTREE_T *sequences, MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif, int *matches) {
  int primary_len, secondary_len, secondary, secondary_pos, primary_rc, secondary_rc, quad, distance;
  RBNODE_T *node;
  SEQUENCE_T *sequence;

  // without a significant spacing there is nothing to collect
  if (secondary_motif->sig_count == 0) return;

  primary_len = get_motif_trimmed_length(primary_motif);
  secondary_len = get_motif_trimmed_length(secondary_motif->motif);

  // for each sequence
  for (node = rbtree_first(sequences); node != NULL; node = rbtree_next(node)) {
    sequence = (SEQUENCE_T*)rbtree_value(node);
    secondary = matches[sequence->index];
    // check for a match
    if (!secondary) continue;
    // convert the encoded form into easier to use form
    primary_rc = sequence->primary_match < 0;
    secondary_rc = secondary < 0;
    secondary_pos = (secondary_rc ? -secondary : secondary);
    // calculate the distance (counts from zero) and side
    // note that distance can be zero meaning the primary is next to the secondary
    if (secondary_pos <= margin) {
      distance = margin - secondary_pos - secondary_len + 1;
      quad = (primary_rc ? RIGHT : LEFT); // rotate reference direction
    } else {
      // the primary occupies positions margin+1 .. margin+primary_len, so
      // an adjacent secondary starts at margin+primary_len+1 (distance 0)
      distance = secondary_pos - margin - primary_len - 1;
      quad = (primary_rc ? LEFT : RIGHT); // rotate reference direction
    }
    // calculate the strand
    if (secondary_rc == primary_rc) {
      quad |= SAME;
    } else {
      quad |= OPPO;
    }
    // add the sequence id to the set if the bin matches
    if (quad == secondary_motif->sigs->quad && (distance / bin_size) == secondary_motif->sigs->bin) {
      secondary_motif->seq_count += 1;
      secondary_motif->seqs = (int*)mm_realloc(secondary_motif->seqs, sizeof(int) * secondary_motif->seq_count);
      secondary_motif->seqs[secondary_motif->seq_count-1] = sequence->index;
    }
  }
}
/************************************************************************** * Callback invoked when matching an opening scanned_sequence tag for a * CISML file of a secondary motif database. It calcualtes and caches the * left and right bounds of the primary motif and stores the current * sequence. **************************************************************************/ void sequence_secondary(void *ctx, char *accession, char *name, char *db, char *lsId, double *score, double *pvalue, long *length) { SECONDARY_LOADER_T *loader = (SECONDARY_LOADER_T*)ctx; RBNODE_T *node; int pmatch; if (loader->secondary_motif == NULL) return; node = rbtree_lookup(loader->sequences, accession, FALSE, NULL); if (node != NULL) { loader->current_sequence = (SEQUENCE_T*)rbtree_value(node); pmatch = loader->current_sequence->primary_match; loader->primary_lpos = (pmatch < 0 ? -pmatch : pmatch); loader->primary_rpos = loader->primary_lpos + get_motif_length(loader->primary_motif) - 1; if (loader->secondary_matches[loader->current_sequence->index] != 0) { die("Already seen this sequence!"); } loader->secondary_score = 0; loader->hit_count = 0; } else { loader->current_sequence = NULL; } }
/************************************************************************** * Dump sequence matches sorted by the name of the sequence. * * Outputs Columns: * 1) Trimmed lowercase sequence with uppercase matches. * 2) Position of the secondary match within the whole sequence. * 3) Sequence fragment that the primary matched. * 4) Strand of the primary match (+|-) * 5) Sequence fragment that the secondary matched. * 6) Strand of the secondary match (+|-) * 7) Is the primary match on the same strand as the secondary (s|o) * 8) Is the secondary match downstream or upstream (d|u) * 9) The gap between the primary and secondary matches * 10) The name of the sequence * 11) The p-value of the bin containing the match (adjusted for # of bins) * ---if the FASTA input file sequence names are in Genome Browser format: * 12-14) Position of primary match in BED coordinates * 15) Position of primary match in Genome Browser coordinates * 16-18) Position of secondary match in BED coordinates * 19) Position of secondary match in Genome Browser coordinates * * If you wish to sort based on the gap column: * Sort individual output: * sort -n -k 9,9 -o seqs_primary_secondary.txt seqs_primary_secondary.txt * Or sort all outputs: * for f in seqs_*.txt; do sort -n -k 9,9 -o $f $f; done * Or to get just locations of primary motif in BED coordinates * where the secondary is on the opposite strand, upstream with a gap of 118bp: * awk '$7=="o" && $8=="u" && $9==118 {print $12"\t"$13"\t"$14;}' seqs_primary_secondary.txt * **************************************************************************/ static void dump_sequence_matches(FILE *out, int margin, int bin, double sigthresh, BOOLEAN_T sig_only, RBTREE_T *sequences, MOTIF_T *primary_motif, SECONDARY_MOTIF_T *secondary_motif, ARRAY_T **matches) { RBNODE_T *node; SEQUENCE_T *sequence; int idx, seqlen, i, j, start, end, secondary, secondary_pos, primary_len, secondary_len, distance; BOOLEAN_T primary_rc, secondary_rc, downstream; char *buffer, 
*seq, *primary_match, *secondary_match; ARRAY_T *secondary_array; ALPH_T *alph; // get the alphabet alph = get_motif_alph(primary_motif); // allocate a buffer for copying the trimmed sequence into and modify it seqlen = margin * 2 + get_motif_trimmed_length(primary_motif); buffer = (char*)mm_malloc(sizeof(char) * (seqlen + 1)); // get the lengths of the motifs primary_len = get_motif_trimmed_length(primary_motif); secondary_len = get_motif_trimmed_length(secondary_motif->motif); // allocate some strings for storing the matches primary_match = (char*)mm_malloc(sizeof(char) * (primary_len + 1)); secondary_match = (char*)mm_malloc(sizeof(char) * (secondary_len + 1)); // add null byte at the end of the match strings primary_match[primary_len] = '\0'; secondary_match[secondary_len] = '\0'; // iterate over all the sequences for (node = rbtree_first(sequences); node != NULL; node = rbtree_next(node)) { sequence = (SEQUENCE_T*)rbtree_value(node); primary_rc = get_array_item(0, sequence->primary_matches) < 0; //secondary = matches[sequence->index]; secondary_array = matches[sequence->index]; if (! secondary_array) continue; int n_secondary_matches = get_array_length(secondary_array); for (idx=0; idx<n_secondary_matches; idx++) { secondary = get_array_item(idx, secondary_array); secondary_rc = secondary < 0; secondary_pos = abs(secondary); // calculate the distance if (secondary_pos <= margin) { distance = margin - secondary_pos - secondary_len + 1; downstream = primary_rc; } else { distance = secondary_pos - margin - primary_len - 1; downstream = !primary_rc; } // copy the trimmed sequence seq = sequence->data; for (i = 0; i < seqlen; ++i) { buffer[i] = (alph_is_case_insensitive(alph) ? tolower(seq[i]) : seq[i]); } buffer[seqlen] = '\0'; // uppercase primary start = margin; end = margin + primary_len; for (i = start, j = 0; i < end; ++i, ++j) { buffer[i] = (alph_is_case_insensitive(alph) ? 
toupper(buffer[i]) : buffer[i]); primary_match[j] = buffer[i]; } // uppercase secondary // note orign was one, subtract 1 to make origin zero as required for arrays start = secondary_pos -1; end = start + secondary_len; for (i = start, j = 0; i < end; ++i, ++j) { buffer[i] = (alph_is_case_insensitive(alph) ? toupper(buffer[i]) : buffer[i]); secondary_match[j] = buffer[i]; } // get the p-value of the seconndary match SPACING_T *spacings; if (secondary_rc == primary_rc) { spacings = downstream ? secondary_motif->spacings+(SAME+RIGHT) : secondary_motif->spacings+(SAME+LEFT); } else { spacings = downstream ? secondary_motif->spacings+(OPPO+RIGHT) : secondary_motif->spacings+(OPPO+LEFT); } double p_value = spacings->pvalue[distance/bin]; // skip match if not significant and only reporting significant matches if (sig_only && (p_value > sigthresh)) continue; // output line to file fprintf(out, "%s %3d %s %s %s %s %s %s %3d %s %.1e", buffer, secondary_pos, primary_match, (primary_rc ? "-" : "+"), secondary_match, (secondary_rc ? "-" : "+"), (secondary_rc == primary_rc ? "s" : "o"), (downstream ? "d" : "u"), distance, sequence->name, p_value ); // Parse the sequence name to see if we can get genomic coordinates // and print additional columns with primary and secondary matches // in both BED and Genome Browser coordinates. char *chr_name; size_t chr_name_len; int start_pos, end_pos; if (parse_genomic_coordinates_helper( sequence->name, &chr_name, &chr_name_len, &start_pos, &end_pos)) { // Get the start and end of the primary match in // 0-relative, half-open genomic coordinates. int p_start = start_pos + fabs(get_array_item(0, sequence->primary_matches)) - 1; int p_end = p_start + primary_len; // Get the start and end of the secondary match in // 0-relative, half-open genomic coordinates. 
int s_start, s_end; if ( (!primary_rc && downstream) || (primary_rc && !downstream) ) { s_start = p_end + distance; s_end = s_start + secondary_len; } else { s_end = p_start - distance; s_start = s_end - secondary_len; } fprintf(out, " %s %d %d %s:%d-%d", chr_name, p_start, p_end, chr_name, p_start+1, p_end); fprintf(out, " %s %d %d %s:%d-%d\n", chr_name, s_start, s_end, chr_name, s_start+1, s_end); } else { fprintf(out, "\n"); } } // secondary match } // primary match free(buffer); free(primary_match); free(secondary_match); }
/*
 * Load background file frequencies into the array.
 *
 * The file is read in chunks and split into lines (unix, windows and mac
 * line endings are accepted). Anything after a '#' is a comment. Lines that
 * match BGFREQ_RE supply a letter and its probability, which must be
 * greater than 0 and at most 1; other lines are ignored.
 *
 * alph        in/out: when it holds INVALID_ALPH the alphabet is guessed
 *             from the number of distinct letters found.
 * bg_filename path of the background file.
 * freqs       array to fill, or NULL to have one allocated. It must be able
 *             to hold the full alphabet including ambiguous letters.
 *
 * Returns the filled array. Any problem (unreadable file, illegal
 * probability, letter outside the alphabet, duplicate or missing letter)
 * is fatal.
 */
ARRAY_T* get_file_frequencies(ALPH_T *alph, char *bg_filename, ARRAY_T *freqs) {
  regmatch_t matches[4];
  STR_T *line;
  char chunk[BG_CHUNK_SIZE+1], letter[2], *key;
  int size, terminate, offset, i;
  FILE *fp;
  regex_t bgfreq;
  double freq;
  RBTREE_T *letters;
  RBNODE_T *node;

  regcomp_or_die("bg freq", &bgfreq, BGFREQ_RE, REG_EXTENDED);
  letters = rbtree_create(rbtree_strcasecmp, rbtree_strcpy, free, rbtree_dblcpy, free);
  line = str_create(100);
  if (!(fp = fopen(bg_filename, "r"))) {
    die("Unable to open background file \"%s\" for reading.\n", bg_filename);
  }
  terminate = feof(fp);
  while (!terminate) {
    size = fread(chunk, sizeof(char), BG_CHUNK_SIZE, fp);
    // a read error never sets the end-of-file indicator, so without this
    // check the loop would spin forever (e.g. when given a directory)
    if (ferror(fp)) {
      die("Error reading background file \"%s\".\n", bg_filename);
    }
    chunk[size] = '\0';
    terminate = feof(fp);
    offset = 0;
    // The second condition flushes a final line that has no newline and
    // ended exactly on a chunk boundary: it was held back as a partial
    // line and the read that discovers the end of file returns no data.
    while (offset < size || (terminate && str_len(line) > 0)) {
      // skip mac newline
      if (str_len(line) == 0 && chunk[offset] == '\r') {
        offset++;
        continue;
      }
      // find next new line
      for (i = offset; i < size; ++i) {
        if (chunk[i] == '\n') break;
      }
      // append portion up to the new line or end of chunk
      str_append(line, chunk+offset, i - offset);
      // read more if we didn't find a new line
      if (i == size && !terminate) break;
      // move the offset past the new line
      offset = i + 1;
      // handle windows new line
      if (str_char(line, -1) == '\r') str_truncate(line, -1);
      // remove everything to the right of a comment character
      for (i = 0; i < str_len(line); ++i) {
        if (str_char(line, i) == '#') {
          str_truncate(line, i);
          break;
        }
      }
      // check the line for a single letter followed by a number
      if (regexec_or_die("bg freq", &bgfreq, str_internal(line), 4, matches, 0)) {
        // parse the letter and frequency value
        regex_strncpy(matches+1, str_internal(line), letter, 2);
        freq = regex_dbl(matches+2, str_internal(line));
        // check the frequency is acceptable
        if (freq < 0 || freq > 1) {
          die("The background file lists the illegal probability %g for "
              "the letter %s.\n", freq, letter);
        } else if (freq == 0) {
          die("The background file lists a probability of zero for the "
              "letter %s\n", letter);
        }
        // still guards against values that compare false above (NaN)
        if (freq >= 0 && freq <= 1) rbtree_put(letters, letter, &freq);
      }
      str_clear(line);
    }
  }
  // finished with the file so clean up file parsing stuff
  fclose(fp);
  str_destroy(line, FALSE);
  regfree(&bgfreq);
  // guess the alphabet
  if (*alph == INVALID_ALPH) {
    switch (rbtree_size(letters)) {
      case PROTEIN_ASIZE:
        *alph = PROTEIN_ALPH;
        break;
      case DNA_ASIZE:
        *alph = DNA_ALPH;
        break;
      default:
        die("Number of single character entries in background does not match "
            "an alphabet.\n");
    }
  }
  // make the background; -1 marks an entry that has not been set
  if (freqs == NULL) freqs = allocate_array(alph_size(*alph, ALL_SIZE));
  assert(get_array_length(freqs) >= alph_size(*alph, ALL_SIZE));
  init_array(-1, freqs);
  for (node = rbtree_first(letters); node != NULL; node = rbtree_next(node)) {
    key = (char*)rbtree_key(node);
    i = alph_index(*alph, key[0]);
    freq = *((double*)rbtree_value(node));
    if (i == -1) {
      die("Background contains letter %s which is not in the %s alphabet.\n",
          key, alph_name(*alph));
    }
    if (get_array_item(i, freqs) != -1) {
      die("Background contains letter %s which has the same meaning as an "
          "already listed letter.\n", key);
    }
    set_array_item(i, freq, freqs);
  }
  // check that all items were set
  for (i = 0; i < alph_size(*alph, ALPH_SIZE); i++) {
    if (get_array_item(i, freqs) == -1) {
      die("Background is missing letter %c.\n", alph_char(*alph, i));
    }
  }
  // normalisation is deliberately not done, for backwards compatibility
  // (the AMA test was failing)
  // calculate the values of the ambiguous letters from the concrete ones
  calc_ambigs(*alph, FALSE, freqs);
  // cleanup
  rbtree_destroy(letters);
  // return result
  return freqs;
}
/************************************************************************* * Build a linear HMM. *************************************************************************/ void build_linear_hmm (ARRAY_T* background, ORDER_T* order_spacing, int spacer_states, RBTREE_T* motifs, // motifs with key as in order_spacing BOOLEAN_T fim, MHMM_T** the_hmm) { ALPH_T alph; int model_length; // Total number of states in the model. int i_state; // Index of the current state. int i_order; // Index within the order and spacing. int i_position; // Index within the current motif or spacer. int motif_i; // motif key in order spacing MOTIF_T *motif; // motif RBNODE_T *node; alph = get_motif_alph((MOTIF_T*)rbtree_value(rbtree_first(motifs))); // Calculate the total length of the model. model_length = 2; // start and end state for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) { motif_i = get_order_motif(order_spacing, i_order); motif = (MOTIF_T*)rbtree_get(motifs, &motif_i); model_length += get_motif_length(motif); } model_length += (get_order_occurs(order_spacing) + 1) * spacer_states; // Allocate the model. *the_hmm = allocate_mhmm(alph, model_length); check_sq_matrix((*the_hmm)->trans, model_length); // Record that this is a linear model. (*the_hmm)->type = LINEAR_HMM; // Record the number of motifs in the model. // It doesn't want the distinct count (*the_hmm)->num_motifs = get_order_occurs(order_spacing); // Record the number of states in the model. (*the_hmm)->num_states = model_length; (*the_hmm)->num_spacers = get_order_occurs(order_spacing) + 1; (*the_hmm)->spacer_states = spacer_states; // Put the background distribution into the model. copy_array(background, (*the_hmm)->background); // Begin the model with a non-emitting state. i_state = 0; check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, START_STATE, i_state, get_spacer_length(order_spacing, 0), NULL, // Emissions. 0, // Number of sites. 
NON_MOTIF_INDEX, NON_MOTIF_POSITION, // position within state (not relevant to start state) NULL, // no motif &((*the_hmm)->states[i_state])); ++i_state; // Build the first spacer. for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, SPACER_STATE, i_state, get_spacer_length(order_spacing, 0), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, // position within spacer NULL, // no motif &((*the_hmm)->states[i_state])); } // Build each motif and subsequent spacer. for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) { STATE_T state; int spacer_len; motif_i = get_order_motif(order_spacing, i_order); motif = (MOTIF_T*)rbtree_get(motifs, &motif_i); // Build the motif. for (i_position = 0; i_position < get_motif_length(motif); i_position++, i_state++) { if (i_position == 0) { state = START_MOTIF_STATE; spacer_len = get_spacer_length(order_spacing, i_order); } else if (i_position == (get_motif_length(motif) - 1)) { state = END_MOTIF_STATE; spacer_len = get_spacer_length(order_spacing, i_order+1); } else { state = MID_MOTIF_STATE; spacer_len = 0; } check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, state, i_state, spacer_len, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(motif)), get_motif_nsites(motif), i_order, i_position, // position within motif (middle) motif, &((*the_hmm)->states[i_state])); } // Build the following spacer. 
for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, SPACER_STATE, i_state, get_spacer_length(order_spacing, i_order+1), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, // position within spacer NULL, // no motif &((*the_hmm)->states[i_state])); } } check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); // Finish up the model with a non-emitting end state. build_linear_state( alph, END_STATE, i_state, get_spacer_length(order_spacing, i_order), NULL, // Emissions. 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, // position within state (not relevant to end state) NULL, // no motif &((*the_hmm)->states[i_state])); ++i_state; assert(i_state == model_length); check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); // Convert spacers to FIMs if requested. if (fim) { convert_to_fims(*the_hmm); } // Fill in the transition matrix. build_transition_matrix(*the_hmm); }