/*********************************************************************** * Convert transition counts to transition probabilities, and compute * average spacer lengths. * * Each matrix is indexed 0 ... n+1, where n is the number of motifs. * The entry at [i,j] corresponds to the transition from motif i to * motif j. Hence, after normalization, each row in the transition * matrix should sum to 1. ***********************************************************************/ static void normalize_spacer_counts( double trans_pseudo, double spacer_pseudo, // Pseudocount for self-loop. BOOLEAN_T keep_unused, MATRIX_T* transp_freq, MATRIX_T* spacer_ave ) { int i_row; int i_col; int num_rows; double total_spacer; double num_transitions; double ave_spacer; /* Divide the spacer lengths by the number of occurrences. */ num_rows = get_num_rows(transp_freq); for (i_row = 0; i_row < num_rows; i_row++) { for (i_col = 0; i_col < num_rows; i_col++) { total_spacer = get_matrix_cell(i_row, i_col, spacer_ave) + spacer_pseudo; num_transitions = get_matrix_cell(i_row, i_col, transp_freq); if (spacer_pseudo > 0) num_transitions++; if (num_transitions != 0.0) { ave_spacer = total_spacer / num_transitions; set_matrix_cell(i_row, i_col, ave_spacer, spacer_ave); } } } // Add pseudocounts. for (i_row = 0; i_row < num_rows; i_row++) { for (i_col = 0; i_col < num_rows; i_col++) { // Force some transitions to zero. if (// No transitions to the start state. (i_col == 0) || // No transitions from the end state. (i_row == num_rows - 1) || // No transition from start to end. ((i_row == 0) && (i_col == num_rows - 1))) { set_matrix_cell(i_row, i_col, 0.0, transp_freq); } else { // Only increment the used transitions. if ((keep_unused) || (get_matrix_cell(i_row, i_col, transp_freq) > 0.0)) { incr_matrix_cell(i_row, i_col, trans_pseudo, transp_freq); } } } } // Normalize rows. 
for (i_row = 0; i_row < num_rows - 1; i_row++) { if (array_total(get_matrix_row(i_row, transp_freq)) > 0.0) { normalize(SLOP, get_matrix_row(i_row, transp_freq)); } } }
/************************************************************************** * scale_pssm * * Scale and round the scores in a PSSM so that the score of a word * is in the range [0..w*range]. * * Returns the scaled PSSM. * **************************************************************************/ void scale_pssm( PSSM_T *pssm, // The PSSM. (IN/OUT) PRIOR_DIST_T *prior_dist, // Distribution of priors (IN) double alpha, // Fraction of all TFBS that are the TFBS of interest int range // The desired range. (IN) ) { int i, j; MATRIX_T* matrix = pssm->matrix; int r = pssm->w; int c = pssm->alphsize; double small = BIG; double large = -BIG; double scale, offset; // Get the largest and smallest scores in the PSSM. for (i=0; i<r; i++) { for (j=0; j<c; j++) { double x = get_matrix_cell(i, j, matrix); small = MIN(small, x); large = MAX(large, x); } } // Get the smallest and largest prior log-odds from the prior distribution // and use them to adjust small and large. if (prior_dist != NULL) { double min_lo_prior = get_min_lo_prior(prior_dist, alpha); double max_lo_prior = get_max_lo_prior(prior_dist, alpha); small = MIN(small, min_lo_prior); large = MAX(large, max_lo_prior); } // Find offset and scale factors so that PSSM scores for words is in the // range: [0..w*range] // To make LO=0 map back to 0, need offset*scale to be an integer. // So we make offset and scale integers. (TLB 31 May 2013) if (large == small) { small = large - 1; } // In case all motif entries are the same. offset = small = floor(small); // Make offset an integer. scale = floor(range/(large-small)); // Ensure scaled scores are <= range. // Scale and round the PSSM entries. for (i=0; i<r; i++) { for (j=0; j<c; j++) { double x = raw_to_scaled(get_matrix_cell(i, j, matrix), 1, scale, offset); set_matrix_cell(i, j, x, matrix); } } // return scale and offset of scores pssm->scale = scale; pssm->offset = offset; pssm->range = range; } // scale_pssm
/**************************************************************************
 * make_karlin_input
 *
 * Summarize a square scoring matrix for Karlin-Altschul statistics.
 *
 * Records the lowest and highest score in the matrix, the probability of
 * each integer score between them (assuming the two letters are drawn
 * independently from probs), and the expected score.
 *
 * Scores are converted to integers by truncation, so the matrix is
 * expected to hold integral values.  Dies unless the lowest score is
 * negative and the highest score is positive.
 *
 * The caller owns the returned structure and its prob array.
 **************************************************************************/
KARLIN_INPUT_T *make_karlin_input(
  MATRIX_T *matrix,             /* scoring matrix */
  ARRAY_T *probs                /* letter freq distribution */
)
{
  int i, j;
  double escore;
  long lowest, highest;
  ARRAY_T *score_probs;
  int nscores;
  int alen = get_num_rows(matrix);      /* size of alphabet */
  KARLIN_INPUT_T *karlin_input;         /* data to return */

  /* find the highest and lowest scores in the scoring matrix */
  lowest = 1;
  highest = -1;
  for (i = 0; i < alen; i++) {
    for (j = 0; j < alen; j++) {
      double s = get_matrix_cell(i, j, matrix);
      /* the comparison is done on the raw value; the stored bound is
         truncated toward zero */
      if (s < lowest) lowest = (long) s;
      if (s > highest) highest = (long) s;
    }
  }

  if (lowest >= 0) {
    die("Lowest score in scoring matrix must be negative, is %f.",
        (double) lowest);
  }
  if (highest <= 0) {
    die("Highest score in scoring matrix must be positive, is %f.",
        (double) highest);
  }

  /* allocate the array of score probabilities and set to 0 */
  nscores = highest - lowest + 1;
  score_probs = allocate_array(nscores);
  init_array(0, score_probs);

  /* compute the probabilities of different scores */
  escore = 0;
  for (i = 0; i < alen; i++) {
    for (j = 0; j < alen; j++) {
      int s = get_matrix_cell(i, j, matrix);    /* truncated to an integer */
      double pi = get_array_item(i, probs);
      double pj = get_array_item(j, probs);
      double sp = get_array_item(s - lowest, score_probs);
      /* cumulative prob. of score */
      set_array_item(s - lowest, sp + pi * pj, score_probs);
      escore += pi * pj * s;
    }
  }

  karlin_input = (KARLIN_INPUT_T *) mm_malloc(sizeof(KARLIN_INPUT_T));
  karlin_input->low = lowest;
  karlin_input->high = highest;
  karlin_input->escore = escore;
  karlin_input->prob = score_probs;

  return(karlin_input);
} /* make_karlin_input */
/***********************************************************************
 * Sanity-check a transition matrix.
 *
 * For every state except the last (the end state), the row total must
 * be one of the accepted values, and the transition from the end state
 * into that state must be zero.  Prints a warning to stderr and returns
 * FALSE at the first violation; returns TRUE otherwise.
 ***********************************************************************/
BOOLEAN_T verify_trans_matrix
  (BOOLEAN_T log_form,    /* Is the transition matrix in log form? */
   int       num_states,  /* Number of states in the (square) matrix. */
   MATRIX_T* trans)       /* The matrix. */
{
  const int end_state = num_states - 1;
  int i_state;
  PROB_T total;

  for (i_state = 0; i_state < end_state; i_state++) {

    if (log_form) {
      /* Cf. Rabiner, formula (43b), p. 265. */
      total = log_array_total(get_matrix_row(i_state, trans));
      if (!(almost_equal(total, 0.0, SLOP) ||
            almost_equal(total, 1.0, SLOP) ||           // Allow for FIMS.
            almost_equal(EXP2(total), 0.0, SLOP))) {
        fprintf(stderr,
                "Warning: Row %d of transition matrix differs from 0.0 by %g.\n",
                i_state, EXP2(total));
        return(FALSE);
      }

      /* All transitions from the end state must be zero. */
      if (get_matrix_cell(end_state, i_state, trans) > LOG_SMALL) {
        fprintf(stderr,
                "Warning: Transition %d from end state is non-zero (%g).\n",
                i_state, get_matrix_cell(end_state, i_state, trans));
        return(FALSE);
      }
    } else {
      /* Cf. Rabiner, formula (43b), p. 265. */
      total = array_total(get_matrix_row(i_state, trans));
      if (!(almost_equal(total, 1.0, SLOP) ||
            almost_equal(total, 2.0, SLOP) ||           // Allow FIMs.
            almost_equal(total, 0.0, SLOP))) {          // Allow inaccessible motifs.
        fprintf(stderr,
                "Warning: Row %d of transition matrix differs from 1.0 by %g.\n",
                i_state, 1.0 - total);
        return(FALSE);
      }

      /* All transitions from the end state must be zero. */
      if (!almost_equal(get_matrix_cell(end_state, i_state, trans),
                        0.0, SLOP)) {
        fprintf(stderr,
                "Warning: Transition %d from end state is non-zero (%g).\n",
                i_state, get_matrix_cell(end_state, i_state, trans));
        return(FALSE);
      }
    }
  }

  return(TRUE);
}
/**************************************************************************
 * get_min_pvalue
 *
 * Return the minimum p-value for a given pssm: the entry of the PSSM's
 * p-value table indexed by the sum of the largest score in each row.
 *
 **************************************************************************/
static double get_min_pvalue(
  PSSM_T *pssm                  // The PSSM.
)
{
  const int width = pssm->w;
  const int n_letters = pssm->alphsize;
  int best_total = 0;           // Integer accumulator: each add is truncated.
  int row;
  int col;

  // Add up the best score available at each position.
  for (row = 0; row < width; row++) {
    double row_best = -BIG;
    for (col = 0; col < n_letters; col++) {
      double value = get_matrix_cell(row, col, pssm->matrix);
      row_best = MAX(row_best, value);
    }
    best_total += row_best;
  }

  return get_array_item(best_total, pssm->pv);
} /* get_min_pvalue */
/**************************************************************************
 * hash_pssm_matrix_pos
 *
 * Recursively create a single position of a hashed PSSM.
 *
 * Each level of the recursion consumes one PSSM position and tries every
 * letter plus one extra "blank" symbol, accumulating the score and a
 * base-(alphabet length + 1) index.  When no positions remain, the
 * accumulated score is stored in hashed_pssm at [hashed_pos, index].
 * Positions past the right edge of the motif, and the blank symbol,
 * contribute a score of 0.
 *
 **************************************************************************/
static void hash_pssm_matrix_pos(
  MATRIX_T *pssm,               // pssm to hash
  MATRIX_T *hashed_pssm,        // hashed pssm
  int pos,                      // position in pssm
  int hashed_pos,               // position in hashed pssm
  int n,                        // number of columns to hash together
  double score,                 // cumulative score; call with 0
  int index                     // cumulative index; call with 0
)
{
  const int n_letters = get_num_cols(pssm);     // alphabet length
  const int width = get_num_rows(pssm);         // pssm width
  int symbol;

  // Nothing left to combine: record the finished entry.
  if (n == 0) {
    set_matrix_cell(hashed_pos, index, score, hashed_pssm);
    return;
  }

  // Combine the next column of the pssm: letters + blank.
  for (symbol = 0; symbol <= n_letters; symbol++) {
    double s = 0;
    // Only a real letter inside the motif has a score.
    if (pos < width && symbol != n_letters) {
      s = get_matrix_cell(pos, symbol, pssm);
    }
    hash_pssm_matrix_pos(
      pssm,
      hashed_pssm,
      pos + 1,                                  // position in old pssm
      hashed_pos,                               // position working on
      n - 1,                                    // positions remaining to hash
      score + s,                                // score so far
      index * (n_letters + 1) + symbol          // hashed alphabet index so far
    );
  }
} // hash_pssm_matrix_pos
/*************************************************************************
 * Calculate the log odds score for a single motif-sized window.
 *
 * Sums the scaled PSSM entries for the pssm->w characters starting at
 * seq and stores the unscaled score in *score.  Returns FALSE, leaving
 * *score untouched, if any character has an alphabet index of -1 or an
 * index that is not below alph_size(alph, ALPH_SIZE); returns TRUE
 * otherwise.
 *************************************************************************/
static inline BOOLEAN_T score_motif_site(
  ALPH_T alph,
  char *seq,
  PSSM_T *pssm,
  double *score // OUT
)
{
  const int n_core = alph_size(alph, ALPH_SIZE);
  MATRIX_T *scores = pssm->matrix;
  double scaled_total = 0.0;
  int pos;

  // Accumulate the scaled score over every position in the site.
  for (pos = 0; pos < pssm->w; pos++) {
    int letter = alph_index(alph, seq[pos]);
    // Gaps and ambiguity codes make the site unscorable.
    if (letter == -1 || letter >= n_core) {
      return FALSE;
    }
    scaled_total += get_matrix_cell(pos, letter, scores);
  }

  *score = get_unscaled_pssm_score(scaled_total, pssm);

  // A scaled score beyond the p-value table is clamped to the table's
  // last index and converted back to a raw score from there.
  if ((int) scaled_total >= get_array_length(pssm->pv)) {
    scaled_total = (float)(get_array_length(pssm->pv) - 1);
    *score = scaled_to_raw(scaled_total, pssm->w, pssm->scale, pssm->offset);
  }

  return TRUE;
}
/***********************************************************************
 * Apply a pseudocount to the motif pspm.
 *
 * Every frequency is converted to a count using the motif's site count
 * (or DEFAULT_SITE_COUNT when the motif has none), the pseudocount is
 * distributed according to the background, and the result is converted
 * back to a frequency.  A NULL background means a uniform one.
 * A pseudocount of zero leaves the motif unchanged.
 ***********************************************************************/
void apply_pseudocount_to_motif
  (MOTIF_T* motif, ARRAY_T *background, double pseudocount)
{
  ARRAY_T *uniform = NULL;      // Only set when we had to build a background.
  int n_letters;
  int width;
  int n_sites;
  int row;
  int col;
  double denominator;

  // no point in doing work when it makes no difference
  if (pseudocount == 0) {
    return;
  }
  assert(pseudocount > 0);

  // motif dimensions
  n_letters = alph_size(motif->alph, ALPH_SIZE);
  width = motif->length;

  // fall back to a uniform background if none is given
  if (background == NULL) {
    uniform = allocate_array(n_letters);
    get_uniform_frequencies(motif->alph, uniform);
    background = uniform;
  }

  // number of sites the frequencies are taken to represent
  if (motif->num_sites > 0) {
    n_sites = motif->num_sites;
  } else {
    n_sites = DEFAULT_SITE_COUNT;
  }
  denominator = n_sites + pseudocount;

  // frequency -> count (+ pseudocount share) -> frequency
  for (row = 0; row < width; ++row) {
    for (col = 0; col < n_letters; ++col) {
      double old_freq = get_matrix_cell(row, col, motif->freqs);
      double count = (old_freq * n_sites)
                   + (pseudocount * get_array_item(col, background));
      set_matrix_cell(row, col, count / denominator, motif->freqs);
    }
  }

  if (uniform != NULL) {
    free_array(uniform);
  }
}
/**************************************************************************
 * Return the position of letter within alpha1.
 * Dies if the letter does not occur there, so the pointer difference is
 * only ever computed for a successful lookup.
 **************************************************************************/
static int reorder_matrix_letter_index(
  const char *alpha1,           /* current alphabet */
  const char *alpha2,           /* new alphabet (for the error message) */
  char letter                   /* letter of alpha2 to look up */
)
{
  const char *hit = strchr(alpha1, letter);
  if (hit == NULL) {
    die("The new alphabet %s must be a subset of the old alphabet %s\n",
        alpha2, alpha1);
  }
  return (int)(hit - alpha1);
}

/**************************************************************************
 * reorder_matrix
 *
 * Build a new square matrix whose rows and columns follow the letter
 * order of alpha2, taking each value from the cell of in_matrix that
 * corresponds to the same pair of letters in alpha1.
 *
 * Dies if alpha2 is longer than alpha1 or contains a letter that is not
 * in alpha1.  The caller owns the returned matrix; in_matrix is not
 * modified.
 **************************************************************************/
MATRIX_T *reorder_matrix(
  const char *alpha1,           /* current alphabet */
  const char *alpha2,           /* new alphabet; must be subset */
  MATRIX_T *in_matrix           /* matrix to reorder */
)
{
  int i, j;
  int alen1 = strlen(alpha1);
  int alen2 = strlen(alpha2);
  MATRIX_T *out_matrix;

  if (alen2 > alen1) {
    die("The new alphabet %s must be a subset of the old alphabet %s.\n",
        alpha2, alpha1);
  }

  out_matrix = allocate_matrix(alen2, alen2);
  for (i = 0; i < alen2; i++) {
    /* validated lookup: never subtracts from a NULL strchr() result */
    int ii = reorder_matrix_letter_index(alpha1, alpha2, alpha2[i]);
    for (j = 0; j < alen2; j++) {
      int jj = reorder_matrix_letter_index(alpha1, alpha2, alpha2[j]);
      set_matrix_cell(i, j, get_matrix_cell(ii, jj, in_matrix), out_matrix);
    }
  }

  return(out_matrix);
} /* reorder_matrix */
/**************************************************************************
 * Get pseudocount frequencies.
 *
 * For each letter i of the basic alphabet the pseudocount frequency is
 *     g[i] = sum over j of (f[j] / b[j]) * target_freq[i][j].
 *
 * The target_freq matrix only has values for the basic alphabet.
 * The ambiguous character pseudocounts are filled in afterwards by
 * calc_ambigs().  The caller owns the returned array, which is sized
 * for the full alphabet including ambiguous characters.
 **************************************************************************/
ARRAY_T *get_pseudocount_freqs(
  ALPH_T alph,
  ARRAY_T *       f,            /* Foreground distribution. */
  ARRAY_T *       b,            /* Background distribution. */
  MATRIX_T *      target_freq   /* Target frequency matrix. */
)
{
  const int n_core = alph_size(alph, ALPH_SIZE);                // excludes ambigs
  ARRAY_T *pseudo = allocate_array(alph_size(alph, ALL_SIZE));  // includes ambigs
  int row;
  int col;

  /* Create pseudocount frequencies for the non-ambiguous letters. */
  for (row = 0; row < n_core; row++) {
    double acc = 0;

    for (col = 0; col < n_core; col++) {
      double q_rc = get_matrix_cell(row, col, target_freq);
      double f_c = get_array_item(col, f);
      double b_c = get_array_item(col, b);
      acc += (f_c / b_c) * q_rc;
    }

    set_array_item(row, acc, pseudo);
    if (SUBST_MATRIX_DEBUG) {
      printf("%g %g, ", get_array_item(row, f), acc);
    }
  }

  /* Fill in the ambiguous characters (takes the average pseudocount). */
  calc_ambigs(alph, FALSE, pseudo);
  if (SUBST_MATRIX_DEBUG) {
    printf("\n");
  }

  return(pseudo);               /* return the pseudocounts */
} /* get_pseudocount_freqs */
/**********************************************************************
  post_process()

  adjust/normalize scores and p-values

  For every pattern in cisml and every scanned sequence of that pattern:
    - if the sequence was scored more than once, replace its score by
      the average over the copies and correct its p-value for the
      number of copies (1 - (1 - p)^n);
    - if normalize_scores is set, divide the score by the product of
      the per-position maximum entries of the motif's positive-strand
      PSSM.

  motifs is indexed in parallel with the patterns of cisml and holds
  MOTIF_AND_PSSM_T items; it is only read when normalize_scores is set.
**********************************************************************/
void post_process(CISML_T* cisml, ARRAYLST_T* motifs, BOOLEAN_T normalize_scores){
  int m_index, seq_index;

  for (m_index = 0; m_index < get_cisml_num_patterns(cisml); ++m_index) {
    PATTERN_T* pattern = get_cisml_patterns(cisml)[m_index];
    double maxscore = 1;

    // FIXME: This should be done to the PSSM, not the individual scores!!!
    // Normalize the scores to RMA format if necessary.
    if (normalize_scores) {
      int k;
      MOTIF_AND_PSSM_T *combo = (MOTIF_AND_PSSM_T*)arraylst_get(m_index, motifs);
      PSSM_T* pssm = combo->pssm_pair->pos_pssm;

      for (k = 0; k < pssm->w; k++) {
        double maxprob = -BIG;  // These are scores, not probabilities!!!
        int a;
        for (a = 0; a < alph_size_core(pssm->alph); a++) {
          double prob = get_matrix_cell(k, a, pssm->matrix);
          if (maxprob < prob) maxprob = prob;
        }
        maxscore *= maxprob;
      }
    }

    // adjust each scanned sequence
    for (seq_index = 0;
         seq_index < get_pattern_num_scanned_sequences(pattern);
         ++seq_index) {
      SCANNED_SEQUENCE_T* scanned_seq =
        get_pattern_scanned_sequences(pattern)[seq_index];

      // only adjust scores and p-values if more than one copy was scored
      // num_scanned_positions is (mis-)used in ama to indicate the number
      // of times a sequence identifier occurred in the set
      if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 1L) {

        // take average score
        if (has_scanned_sequence_score(scanned_seq)) {
          double avg_odds = get_scanned_sequence_score(scanned_seq) /
            get_scanned_sequence_num_scanned_positions(scanned_seq);
          set_scanned_sequence_score(scanned_seq, avg_odds);
        }

        // adjust the minimum p-value for multiple hypothesis testing
        if (has_scanned_sequence_pvalue(scanned_seq)) {
          // 1 - (1 - p)^n, evaluated as -expm1(n * log1p(-p)).  The direct
          // form rounds 1 - p to 1 for very small p and then reports a
          // corrected p-value of exactly 0.
          double corr_pvalue = -expm1(
            get_scanned_sequence_num_scanned_positions(scanned_seq) *
            log1p(-get_scanned_sequence_pvalue(scanned_seq))
          );
          set_scanned_sequence_pvalue(scanned_seq, corr_pvalue);
        }
      }

      // normalize if requested
      if (normalize_scores) {
        set_scanned_sequence_score(
          scanned_seq,
          get_scanned_sequence_score(scanned_seq) / maxscore
        );
      }
    }
  }
}
/**************************************************************************
 * gen_pam_matrix
 *
 * Generate a PAM matrix for the DNA or protein alphabet at the given
 * PAM distance, either as a log-odds score matrix or as a target
 * (joint) frequency matrix.  The caller owns the returned matrix.
 **************************************************************************/
extern MATRIX_T* gen_pam_matrix(
  ALPH_T alph,                  /* alphabet */
  int dist,                     /* PAM distance */
  BOOLEAN_T logodds             /* true: generate log-odds matrix
                                   false: generate target frequency matrix */
)
{
  assert(alph == DNA_ALPH || alph == PROTEIN_ALPH);
  const BOOLEAN_T is_dna = (alph == DNA_ALPH);
  double *letter_freq = is_dna ? pam_dna_freq : pam_prot_freq;  // standard frequencies
  const int n_letters = alph_size(alph, ALPH_SIZE);     // length of standard alphabet
  // same as in "pam" Version 1.0.6
  const double bit_scale = dist < 170 ? 2/log(2) : 3/log(2);
  /* matrix becomes the result; mul keeps the PAM 1 matrix for the powers */
  MATRIX_T *matrix = allocate_matrix(n_letters, n_letters);
  MATRIX_T *mul = allocate_matrix(n_letters, n_letters);
  int row;
  int col;
  int step;

  /* initialize the matrix: PAM 1:
     due to roundoff, take the average of the two estimates of
     the joint frequency of i and j as the joint, then compute the
     conditionals for the matrix
  */
  for (row = 0; row < n_letters; row++) {
    for (col = 0; col <= row; col++) {
      double v_rc = is_dna ? trans[row][col] : dayhoff[row][col];
      double v_cr = is_dna ? trans[col][row] : dayhoff[col][row];
      /* use average to fix rndoff */
      double joint =
        ((v_rc * letter_freq[col]) + (v_cr * letter_freq[row]))/20000;

      set_matrix_cell(row, col, joint/letter_freq[col], matrix);
      if (row != col) {
        set_matrix_cell(col, row, joint/letter_freq[row], matrix);
      }
    }
  }

  /* take PAM matrix to desired power to scale it: dist - 1 multiplications */
  copy_matrix(matrix, mul);
  for (step = 1; step < dist; step++) {
    MATRIX_T *product = matrix_multiply(matrix, mul);
    SWAP(MATRIX_T*, product, matrix)
    free_matrix(product);
  }
  free_matrix(mul);

  /* convert to joint or logodds matrix:
     target:  J_ij = Pr(i,j) = Mij pr(j)
     logodds: L_ij = log (Pr(i,j)/(Pr(i)Pr(j)) = log (Mij Pr(j)/Pr(i)Pr(j))
                   = log(Mij/pr(i))
  */
  for (row = 0; row < n_letters; row++) {
    for (col = 0; col < n_letters; col++) {
      double value = get_matrix_cell(row, col, matrix);
      if (logodds) {
        value = nint(bit_scale * log((value+EPSILON)/letter_freq[row]));
      } else {
        value = value * letter_freq[col];
      }
      set_matrix_cell(row, col, value, matrix);
    }
  }

  return matrix;
} /* gen_pam_matrix */
/***********************************************************************
 * Converts a TRANSFAC motif to a MEME motif.
 *
 * Each row of the TRANSFAC counts matrix is turned into frequencies:
 *   freq = (count + pseudocount * bg[base]) / (row total + pseudocount)
 * Rows are normalized individually because motif columns may have
 * different totals.  Dies if the TRANSFAC motif has no counts matrix.
 *
 * Caller is responsible for freeing the returned MOTIF_T.
 ***********************************************************************/
MOTIF_T *convert_transfac_motif_to_meme_motif(
  char *id,
  int pseudocount,
  ARRAY_T *bg,
  TRANSFAC_MOTIF_T *motif
) {
  MATRIX_T *counts = get_transfac_counts(motif);
  if (counts == NULL) {
    die(
      "Unable to convert TRANSFAC motif %s to MEME motif: "
      "missing counts matrix.",
      id
    );
  }

  // Convert the motif counts to frequencies.
  int num_bases = get_num_cols(counts);
  int motif_width = get_num_rows(counts);
  int motif_position = 0;
  MATRIX_T *freqs = allocate_matrix(motif_width, num_bases);
  for (motif_position = 0; motif_position < motif_width; ++motif_position) {
    int i_base = 0;
    // Row total, kept as a double: an integer accumulator would truncate
    // on every addition and lose fractional counts, so the resulting
    // frequencies would no longer sum to 1.
    double num_seqs = 0.0; // motif columns may have different counts
    for (i_base = 0; i_base < num_bases; i_base++) {
      num_seqs += get_matrix_cell(motif_position, i_base, counts);
    }
    for (i_base = 0; i_base < num_bases; i_base++) {
      double freq =
        (get_matrix_cell(motif_position, i_base, counts)
        + (pseudocount * get_array_item(i_base, bg)))
        / (num_seqs + pseudocount);
      set_matrix_cell(motif_position, i_base, freq, freqs);
    }
  }

  MOTIF_T *meme_motif = allocate_motif(id, DNA_ALPH, NULL, freqs);
  calc_motif_ambigs(meme_motif);
  return meme_motif;
}
/***********************************************************************
 * Return one column of a motif, as a newly allocated array of counts.
 *
 * Each count is the motif's number of sites multiplied by the frequency
 * of the letter at the given position.  The caller owns the array.
 ***********************************************************************/
ARRAY_T* get_motif_counts
  (int      position,
   MOTIF_T* motif)
{
  const int n_letters = motif->alph_size;
  ARRAY_T *counts = allocate_array(n_letters);
  int letter;

  for (letter = 0; letter < n_letters; letter++) {
    set_array_item(
      letter,
      motif->num_sites * get_matrix_cell(position, letter, motif->freqs),
      counts
    );
  }

  return(counts);
}
/*************************************************************************
 * Output JSON data for a motif
 *
 * Writes one JSON object holding the motif's identification, its
 * position weight matrix ("pwm", one array per motif position), the
 * enrichment statistics from stats, and a "sites" array holding every
 * second entry of counts->sites between index (len - 1) and
 * counts->allocated - (len - 1).
 *************************************************************************/
static void output_motif_json(JSONWR_T* json, MOTIF_STATS_T* stats,
    SITE_COUNTS_T* counts)
{
  MOTIF_T *motif = stats->motif;
  MATRIX_T *freqs = get_motif_freqs(motif);
  const int n_letters = alph_size(get_motif_alph(motif), ALPH_SIZE);
  int width;
  int pos;
  int letter;
  int site;
  int stop;

  jsonwr_start_object_value(json);

  // identification
  jsonwr_lng_prop(json, "db", stats->db->id);
  jsonwr_str_prop(json, "id", get_motif_id(motif));
  if (*(get_motif_id2(motif))) {
    // the alternate name is only written when it is non-empty
    jsonwr_str_prop(json, "alt", get_motif_id2(motif));
  }
  width = get_motif_length(motif);
  jsonwr_lng_prop(json, "len", width);
  jsonwr_dbl_prop(json, "motif_evalue", get_motif_evalue(motif));
  jsonwr_dbl_prop(json, "motif_nsites", get_motif_nsites(motif));
  if (get_motif_url(motif) && *get_motif_url(motif)) {
    jsonwr_str_prop(json, "url", get_motif_url(motif));
  }

  // position weight matrix: an array of rows, one row per motif position
  jsonwr_property(json, "pwm");
  jsonwr_start_array_value(json);
  for (pos = 0; pos < width; pos++) {
    jsonwr_start_array_value(json);
    for (letter = 0; letter < n_letters; letter++) {
      jsonwr_dbl_value(json, get_matrix_cell(pos, letter, freqs));
    }
    jsonwr_end_array_value(json);
  }
  jsonwr_end_array_value(json);

  // statistics
  jsonwr_lng_prop(json, "bin_width", stats->central_window+1);
  jsonwr_dbl_prop(json, "bin_sites", stats->central_sites);
  jsonwr_lng_prop(json, "total_sites", counts->total_sites);
  jsonwr_dbl_prop(json, "log_pvalue", stats->log_adj_pvalue);
  jsonwr_dbl_prop(json, "max_prob", stats->max_prob);

  // site counts: skip (len - 1) entries at each end, then take every other one
  jsonwr_property(json, "sites");
  jsonwr_start_array_value(json);
  stop = counts->allocated - (width - 1);
  site = width - 1;
  while (site < stop) {
    jsonwr_dbl_value(json, counts->sites[site]);
    site += 2;
  }
  jsonwr_end_array_value(json);

  jsonwr_end_object_value(json);
}
/**************************************************************************
 * get_subst_target_matrix
 *
 * Obtain a score matrix (from score_filename, or a generated PAM matrix
 * of the given distance) and convert it to a target frequency matrix
 * using the background frequencies.  When SUBST_MATRIX_DEBUG is set the
 * score matrix and the sum of its entries are printed to stdout.
 *
 * The caller owns the returned matrix.
 **************************************************************************/
MATRIX_T *get_subst_target_matrix(
  char *score_filename,         /* name of score file */
  ALPH_T alph,                  /* alphabet */
  int dist,                     /* PAM distance (ignored if score_filename != NULL) */
  ARRAY_T *back                 /* background frequencies of standard alphabet */
)
{
  MATRIX_T *score = get_score_matrix(score_filename, alph, dist);
  MATRIX_T *target = convert_score_to_target(score, back);

  if (SUBST_MATRIX_DEBUG) {
    const int n_letters = alph_size(alph, ALPH_SIZE);
    double entry_sum = 0;
    int row;
    int col;

    /* where the scores came from */
    if (score_filename) {
      printf("From file %s\n", score_filename);
    } else {
      printf("Generated PAM %d\n", dist);
    }

    /* column headings */
    printf("%6c ", ' ');
    for (col = 0; col < n_letters; col++) {
      printf("%6c ", alph_char(alph, col));
    }
    printf("\n");

    /* one labelled line per row of the score matrix */
    for (row = 0; row < n_letters; row++) {
      printf("%6c ", alph_char(alph, row));
      for (col = 0; col < n_letters; col++) {
        double value = get_matrix_cell(row, col, score);
        entry_sum += value;
        printf("%6.4f ", value);
      }
      printf("\n");
    }
    printf("sum of entries = %f\n", entry_sum);
  }

  /* the score matrix was only an intermediate */
  free_matrix(score);

  return(target);
} /* get_subst_target_matrix */
/***********************************************************************
 * Takes a matrix of meme scores and converts them into letter
 * probabilities.
 *
 * The probability is obtained by:
 *   p = (2 ^ (s / 100)) * bg
 * after which the pseudo count is removed again, the value is clamped
 * to [0, 1], and each row is normalized.
 *
 * The caller owns the returned matrix.
 ***********************************************************************/
MATRIX_T* convert_scores_into_freqs
  (ALPH_T alph,
   MATRIX_T *scores,
   ARRAY_T *bg,
   int site_count,
   double pseudo_count)
{
  int n_letters, width;
  int pos, letter;
  double all_counts;
  MATRIX_T *freqs;

  assert(alph != INVALID_ALPH);
  assert(scores != NULL);
  assert(bg != NULL);

  width = get_num_rows(scores);
  n_letters = alph_size(alph, ALPH_SIZE);
  freqs = allocate_matrix(width, n_letters);
  all_counts = site_count + pseudo_count;

  for (pos = 0; pos < width; ++pos) {
    for (letter = 0; letter < n_letters; ++letter) {
      double bg_freq = get_array_item(letter, bg);
      double score = get_matrix_cell(pos, letter, scores);
      // convert to a probability
      double freq = pow(2.0, score / 100.0) * bg_freq;
      // remove the pseudo count
      freq = ((freq * all_counts) - (bg_freq * pseudo_count)) / site_count;
      // keep the value a valid probability
      if (freq < 0) {
        freq = 0;
      } else if (freq > 1) {
        freq = 1;
      }
      set_matrix_cell(pos, letter, freq, freqs);
    }
    // the row is complete, so it can be normalized straight away
    normalize_subarray(0, n_letters, 0.0, get_matrix_row(pos, freqs));
  }

  return freqs;
}
/************************************************************************
 * Count the non-zero transitions into or out of one state.
 *
 * For TRANS_IN the column of the state is examined, for TRANS_OUT its
 * row; only that single row or column is visited.  Returns 0 if
 * state_num is not a valid state index or in_or_out is neither
 * TRANS_IN nor TRANS_OUT.
 ************************************************************************/
static int count_trans
  (MATRIX_T* trans,      /* The transition matrix. */
   BOOLEAN_T log_form,   /* Is the transition matrix in log form? */
   int       num_states, /* Number of states in the (square) matrix. */
   int       state_num,  /* Index of the state we're interested in. */
   int       in_or_out)  /* Incoming or outgoing transitions? */
{
  int i_other;
  int ntrans = 0; /* The return value. */

  /* A state outside the matrix has no transitions to count. */
  if ((state_num < 0) || (state_num >= num_states)) {
    return(0);
  }

  if (in_or_out == TRANS_IN) {
    /* Walk down the state's column. */
    for (i_other = 0; i_other < num_states; i_other++) {
      if (!is_zero(get_matrix_cell(i_other, state_num, trans), log_form)) {
        ntrans++;
      }
    }
  } else if (in_or_out == TRANS_OUT) {
    /* Walk along the state's row. */
    for (i_other = 0; i_other < num_states; i_other++) {
      if (!is_zero(get_matrix_cell(state_num, i_other, trans), log_form)) {
        ntrans++;
      }
    }
  }

  return(ntrans);
} // count_trans
/**************************************************************************
 * convert_score_to_target
 *
 * Convert a score matrix into a target frequency matrix:
 *   target[i][j] = prob[i] * prob[j] * exp(lambda * score[i][j])
 * where lambda is obtained from karlin() for the given scores and
 * letter frequencies.  The caller owns the returned matrix.
 **************************************************************************/
MATRIX_T *convert_score_to_target(
  MATRIX_T *score,              /* score matrix */
  ARRAY_T *prob                 /* letter frequencies */
)
{
  const int n_letters = get_num_rows(score);    /* alphabet length */
  /* make input for karlin() */
  KARLIN_INPUT_T *stats = make_karlin_input(score, prob);
  double lambda, K, H;          /* only lambda is used below */
  MATRIX_T *target;             /* target freq. matrix */
  int row;
  int col;

  /* get lambda */
  karlin(stats->low, stats->high, stats->prob->items, &lambda, &K, &H);

  /* calculate target frequencies */
  target = allocate_matrix(n_letters, n_letters);
  for (row = 0; row < n_letters; row++) {
    double p_row = get_array_item(row, prob);
    for (col = 0; col < n_letters; col++) {
      double p_col = get_array_item(col, prob);
      double s_rc = get_matrix_cell(row, col, score);
      set_matrix_cell(row, col, p_row * p_col * exp(lambda * s_rc), target);
    }
  }

  // Free local dynamic memory.
  free_array(stats->prob);
  myfree(stats);

  return(target);
} /* convert_score_to_target */
/************************************************************************ * Compute the indices and values of transitions to or from a state. ************************************************************************/ void compute_ins_and_outs (MHMM_T* the_hmm, BOOLEAN_T log_form) /* Is the transition matrix in log form? */ { int i_row, i_col; int n = the_hmm->num_states; MATRIX_T *trans = the_hmm->trans; // // Visit the transition matrix cells just once each // to update ntrans, itrans and trans arrays. // This is quadratic in n. // for (i_row = 0; i_row < n; i_row++) { for (i_col = 0; i_col < n; i_col++) { double p; // The transition probability. int old_n, new_n; // Number of transitions. if (!is_zero((p = get_matrix_cell(i_row, i_col, trans)), log_form)) { MHMM_STATE_T * out_state = &(the_hmm->states[i_row]); MHMM_STATE_T * in_state = &(the_hmm->states[i_col]); // out old_n = out_state->ntrans_out; new_n = ++out_state->ntrans_out; mm_resize(out_state->itrans_out, new_n, int); out_state->trans_out = resize_array(out_state->trans_out, new_n); out_state->itrans_out[old_n] = i_col; set_array_item(old_n, p, out_state->trans_out); // in old_n = in_state->ntrans_in; new_n = ++in_state->ntrans_in; mm_resize(in_state->itrans_in, new_n, int); in_state->trans_in = resize_array(in_state->trans_in, new_n); in_state->itrans_in[old_n] = i_row; set_array_item(old_n, p, in_state->trans_in); } } // col } // row
/***********************************************************************
 * Takes a matrix of letter probabilities and converts them into meme
 * scores.
 *
 * A pseudo count is applied to each probability first.  Assuming the
 * resulting probability is nonzero the score is just:
 *   s = log2(p / bg) * 100
 * A probability that is still not positive is replaced by a tiny
 * positive value before taking the logarithm.
 *
 * The caller owns the returned matrix.
 ***********************************************************************/
MATRIX_T* convert_freqs_into_scores
  (ALPH_T alph,
   MATRIX_T *freqs,
   ARRAY_T *bg,
   int site_count,
   double pseudo_count)
{
  int n_letters, width;
  int pos, letter;
  double all_counts;
  MATRIX_T *scores;

  assert(alph != INVALID_ALPH);
  assert(freqs != NULL);
  assert(bg != NULL);

  width = get_num_rows(freqs);
  n_letters = alph_size(alph, ALPH_SIZE);
  scores = allocate_matrix(width, n_letters);
  all_counts = site_count + pseudo_count;

  for (pos = 0; pos < width; ++pos) {
    for (letter = 0; letter < n_letters; ++letter) {
      double bg_freq = get_array_item(letter, bg);
      double freq = get_matrix_cell(pos, letter, freqs);
      double score;

      // apply a pseudo count
      freq = ((pseudo_count * bg_freq) + (freq * site_count)) / all_counts;
      // if the background is correct this shouldn't happen
      if (freq <= 0) {
        freq = 0.0000005;
      }
      // convert to a score
      score = (log(freq / bg_freq) / log(2)) * 100;
      set_matrix_cell(pos, letter, score, scores);
    }
  }

  return scores;
}
/***************************************************************************** * MEME > motifs > motif > probabilities > alphabet_matrix > alphabet_array > /value * Lookup a letter and check it exists and does not have a probability. * Set the letter's score to the passed value. ****************************************************************************/ void mxml_probability_value(void *ctx, char *letter_id, double probability) { CTX_T *data; MATRIX_T *freqs; char *symbol; int index; data = (CTX_T*)ctx; freqs = data->mscope.motif->freqs; // lookup letter ID symbol = (char*)rbtree_get(data->letter_lookup, letter_id); if (symbol == NULL) { local_error(data, "Probability is not allowed for unknown letter identifier \"%s\".\n", letter_id); return; } index = alph_indexc(data->alph, symbol[0]); if (index < 0) { local_error(data, "Probability is not allowed for non-core letter %c.\n", symbol[0]); return; } if (get_matrix_cell(data->current_pos, index, freqs) != -1) { local_error(data, "Probability for letter %c in position %d has already been set.\n", symbol[0], data->current_pos + 1); return; } set_matrix_cell(data->current_pos, index, probability, freqs); }
/************************************************************************* * Entry point for pmp_bf *************************************************************************/ int main(int argc, char *argv[]) { char* bg_filename = NULL; char* motif_name = "motif"; // Use this motif name in the output. STRING_LIST_T* selected_motifs = NULL; double fg_rate = 1.0; double bg_rate = 1.0; double purine_pyrimidine = 1.0; // r double transition_transversion = 0.5; // R double pseudocount = 0.1; GAP_SUPPORT_T gap_support = SKIP_GAPS; MODEL_TYPE_T model_type = F81_MODEL; BOOLEAN_T use_halpern_bruno = FALSE; char* ustar_label = NULL; // TLB; create uniform star tree int i; program_name = "pmp_bf"; /********************************************** * COMMAND LINE PROCESSING **********************************************/ // Define command line options. (FIXME: Repeated code) // FIXME: Note that if you add or remove options you // must change n_options. int n_options = 12; cmdoption const pmp_options[] = { {"hb", NO_VALUE}, {"ustar", REQUIRED_VALUE}, {"model", REQUIRED_VALUE}, {"pur-pyr", REQUIRED_VALUE}, {"transition-transversion", REQUIRED_VALUE}, {"bg", REQUIRED_VALUE}, {"fg", REQUIRED_VALUE}, {"motif", REQUIRED_VALUE}, {"motif-name", REQUIRED_VALUE}, {"bgfile", REQUIRED_VALUE}, {"pseudocount", REQUIRED_VALUE}, {"verbosity", REQUIRED_VALUE} }; int option_index = 0; // Define the usage message. char usage[1000] = ""; strcat(usage, "USAGE: pmp [options] <tree file> <MEME file>\n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); // Evolutionary model parameters. strcat(usage, " --hb\n"); strcat(usage, " --model single|average|jc|k2|f81|f84|hky|tn"); strcat(usage, " (default=f81)\n"); strcat(usage, " --pur-pyr <float> (default=1.0)\n"); strcat(usage, " --transition-transversion <float> (default=0.5)\n"); strcat(usage, " --bg <float> (default=1.0)\n"); strcat(usage, " --fg <float> (default=1.0)\n"); // Motif parameters. 
strcat(usage, " --motif <id> (default=all)\n"); strcat(usage, " --motif-name <string> (default from motif file)\n"); // Miscellaneous parameters strcat(usage, " --bgfile <background> (default from motif file)\n"); strcat(usage, " --pseudocount <float> (default=0.1)\n"); strcat(usage, " --ustar <label>\n"); // TLB; create uniform star tree strcat(usage, " --verbosity [1|2|3|4] (default 2)\n"); strcat(usage, "\n Prints the FP and FN rate at each of 10000 score values.\n"); strcat(usage, "\n Output format: [<motif_id> score <score> FPR <fpr> TPR <tpr>]+\n"); // Parse the command line. if (simple_setopt(argc, argv, n_options, pmp_options) != NO_ERROR) { die("Error processing command line options: option name too long.\n"); } while (TRUE) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { (void) simple_getopterror(&message); die("Error processing command line options (%s)\n", message); } if (strcmp(option_name, "model") == 0) { if (strcmp(option_value, "jc") == 0) { model_type = JC_MODEL; } else if (strcmp(option_value, "k2") == 0) { model_type = K2_MODEL; } else if (strcmp(option_value, "f81") == 0) { model_type = F81_MODEL; } else if (strcmp(option_value, "f84") == 0) { model_type = F84_MODEL; } else if (strcmp(option_value, "hky") == 0) { model_type = HKY_MODEL; } else if (strcmp(option_value, "tn") == 0) { model_type = TAMURA_NEI_MODEL; } else if (strcmp(option_value, "single") == 0) { model_type = SINGLE_MODEL; } else if (strcmp(option_value, "average") == 0) { model_type = AVERAGE_MODEL; } else { die("Unknown model: %s\n", option_value); } } else if (strcmp(option_name, "hb") == 0){ use_halpern_bruno = TRUE; } else if (strcmp(option_name, "ustar") == 0){ // TLB; create uniform star tree ustar_label = option_value; } else if (strcmp(option_name, "pur-pyr") == 0){ 
purine_pyrimidine = atof(option_value); } else if (strcmp(option_name, "transition-transversion") == 0){ transition_transversion = atof(option_value); } else if (strcmp(option_name, "bg") == 0){ bg_rate = atof(option_value); } else if (strcmp(option_name, "fg") == 0){ fg_rate = atof(option_value); } else if (strcmp(option_name, "motif") == 0){ if (selected_motifs == NULL) { selected_motifs = new_string_list(); } add_string(option_value, selected_motifs); } else if (strcmp(option_name, "motif-name") == 0){ motif_name = option_value; } else if (strcmp(option_name, "bgfile") == 0){ bg_filename = option_value; } else if (strcmp(option_name, "pseudocount") == 0){ pseudocount = atof(option_value); } else if (strcmp(option_name, "verbosity") == 0){ verbosity = atoi(option_value); } } // Must have tree and motif file names if (argc != option_index + 2) { fprintf(stderr, "%s", usage); exit(EXIT_FAILURE); } /********************************************** * Read the phylogenetic tree. **********************************************/ char* tree_filename = NULL; TREE_T* tree = NULL; tree_filename = argv[option_index]; option_index++; tree = read_tree_from_file(tree_filename); // get the species names STRING_LIST_T* alignment_species = make_leaf_list(tree); char *root_label = get_label(tree); // in case target in center if (strlen(root_label)>0) add_string(root_label, alignment_species); //write_string_list(" ", alignment_species, stderr); // TLB; Convert the tree to a uniform star tree with // the target sequence at its center. if (ustar_label != NULL) { tree = convert_to_uniform_star_tree(tree, ustar_label); if (tree == NULL) die("Tree or alignment missing target %s\n", ustar_label); if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr, "Target %s placed at center of uniform (d=%.3f) star tree:\n", ustar_label, get_total_length(tree) / get_num_children(tree) ); write_tree(tree, stderr); } } /********************************************** * Read the motifs. 
**********************************************/ char* meme_filename = argv[option_index]; option_index++; int num_motifs = 0; MREAD_T *mread; ALPH_T alph; ARRAYLST_T *motifs; ARRAY_T *bg_freqs; mread = mread_create(meme_filename, OPEN_MFILE); mread_set_bg_source(mread, bg_filename); mread_set_pseudocount(mread, pseudocount); // read motifs motifs = mread_load(mread, NULL); alph = mread_get_alphabet(mread); bg_freqs = mread_get_background(mread); // check if (arraylst_size(motifs) == 0) die("No motifs in %s.", meme_filename); // TLB; need to resize bg_freqs array to ALPH_SIZE items // or copy array breaks in HB mode. This throws away // the freqs for the ambiguous characters; int asize = alph_size(alph, ALPH_SIZE); resize_array(bg_freqs, asize); /************************************************************** * Compute probability distributions for each of the selected motifs. **************************************************************/ int motif_index; for (motif_index = 0; motif_index < arraylst_size(motifs); motif_index++) { MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs); char* motif_id = get_motif_id(motif); char* bare_motif_id = motif_id; // We may have specified on the command line that // only certain motifs were to be used. if (selected_motifs != NULL) { if (*bare_motif_id == '+' || *bare_motif_id == '-') { // The selected motif id won't included a strand indicator. bare_motif_id++; } if (have_string(bare_motif_id, selected_motifs) == FALSE) { continue; } } if (verbosity >= NORMAL_VERBOSE) { fprintf( stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif) ); } // Build an array of evolutionary models for each position in the motif. EVOMODEL_T** models = make_motif_models( motif, bg_freqs, model_type, fg_rate, bg_rate, purine_pyrimidine, transition_transversion, use_halpern_bruno ); // Get the frequencies under the background model (row 0) // and position-dependent scores (rows 1..w) // for each possible alignment column. 
MATRIX_T* pssm_matrix = build_alignment_pssm_matrix( alph, alignment_species, get_motif_length(motif) + 1, models, tree, gap_support ); ARRAY_T* alignment_col_freqs = allocate_array(get_num_cols(pssm_matrix)); copy_array(get_matrix_row(0, pssm_matrix), alignment_col_freqs); remove_matrix_row(0, pssm_matrix); // throw away first row //print_col_frequencies(alph, alignment_col_freqs); // // Get the position-dependent null model alignment column frequencies // int w = get_motif_length(motif); int ncols = get_num_cols(pssm_matrix); MATRIX_T* pos_dep_bkg = allocate_matrix(w, ncols); for (i=0; i<w; i++) { // get the evo model corresponding to this column of the motif // and store it as the first evolutionary model. myfree(models[0]); // Use motif PSFM for equilibrium freqs. for model. ARRAY_T* site_specific_freqs = allocate_array(asize); int j = 0; for(j = 0; j < asize; j++) { double value = get_matrix_cell(i, j, get_motif_freqs(motif)); set_array_item(j, value, site_specific_freqs); } if (use_halpern_bruno == FALSE) { models[0] = make_model( model_type, fg_rate, transition_transversion, purine_pyrimidine, site_specific_freqs, NULL ); } else { models[0] = make_model( model_type, fg_rate, transition_transversion, purine_pyrimidine, bg_freqs, site_specific_freqs ); } // get the alignment column frequencies using this model MATRIX_T* tmp_pssm_matrix = build_alignment_pssm_matrix( alph, alignment_species, 2, // only interested in freqs under bkg models, tree, gap_support ); // assemble the position-dependent background alignment column freqs. set_matrix_row(i, get_matrix_row(0, tmp_pssm_matrix), pos_dep_bkg); // chuck the pssm (not his real name) free_matrix(tmp_pssm_matrix); } // // Compute and print the score distribution under the background model // and under the (position-dependent) motif model. 
// int range = 10000; // 10^4 gives same result as 10^5, but 10^3 differs // under background model PSSM_T* pssm = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range); // under position-dependent background (motif) model PSSM_T* pssm_pos_dep = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range); get_pv_lookup_pos_dep( pssm_pos_dep, pos_dep_bkg, NULL // no priors used ); // print FP and FN distributions int num_items = get_pssm_pv_length(pssm_pos_dep); for (i=0; i<num_items; i++) { double pvf = get_pssm_pv(i, pssm); double pvt = get_pssm_pv(i, pssm_pos_dep); double fpr = pvf; double fnr = 1 - pvt; if (fpr >= 0.99999 || fnr == 0) continue; printf("%s score %d FPR %.3g FNR %.3g\n", motif_id, i, fpr, fnr); } // free stuff free_pssm(pssm); free_pssm(pssm_pos_dep); if (models != NULL) { int model_index; int num_models = get_motif_length(motif) + 1; for (model_index = 0; model_index < num_models; model_index++) { free_model(models[model_index]); } myfree(models); } } // motif arraylst_destroy(destroy_motif, motifs); /********************************************** * Clean up. **********************************************/ // TLB may have encountered a memory corruption bug here // CEG has not been able to reproduce it. valgrind says all is well. free_array(bg_freqs); free_tree(TRUE, tree); free_string_list(selected_motifs); return(0); } // main
/************************************************************************* * Calculate the odds score for each motif-sized window at each * site in the sequence using the given nucleotide frequencies. * * This function is a lightweight version based on the one contained in * motiph-scoring. Several calculations that are unnecessary for gomo * have been removed in order to speed up the process *************************************************************************/ static double score_sequence( SEQ_T *seq, // sequence to scan (IN) MOTIF_T *motif, // motif already converted to odds values (IN) PSSM_T *m_pssm, // motif pssm (IN) MATRIX_T *m_odds, // motif odds (IN) int method, // method used for scoring (IN) double threshold, // Threshold to use in TOTAL_HITS mode with a PWM ARRAY_T *bg_freqs //background model ) { assert(seq != NULL); assert(motif != NULL); assert((method == TOTAL_HITS && m_pssm) || (method != TOTAL_HITS && m_odds)); char* raw_seq = get_raw_sequence(seq); int seq_length = get_seq_length(seq); // Get the pv lookup table ARRAY_T* pv_lookup = NULL; if (NULL != m_pssm) { pv_lookup = m_pssm->pv; assert(get_array_length(pv_lookup) > 0); } // Prepare storage for the string representing the portion // of the reference sequence within the window. 
char* window_seq = (char *) mm_malloc(sizeof(char) * (get_motif_length(motif) + 1)); window_seq[get_motif_length(motif)] = '\0'; int max_index = seq_length - get_motif_length(motif); if (max_index < 0) max_index = 0; const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE); double* odds = (double*) mm_malloc(sizeof(double)*max_index); double* scaled_log_odds = (double*) mm_malloc(sizeof(double)*max_index); // For each site in the sequence int seq_index; for (seq_index = 0; seq_index < max_index; seq_index++) { double odd = 1.0; scaled_log_odds[seq_index] = 0; // For each site in the motif window int motif_position; for (motif_position = 0; motif_position < get_motif_length(motif); motif_position++) { char c = raw_seq[seq_index + motif_position]; window_seq[motif_position] = c; // Check for gaps at this site if(c == '-' || c == '.') { break; } // Check for ambiguity codes at this site //TODO: This next call is very expensive - it takes up approx. 10% of a // programme's running time. It should be fixed up somehow. int aindex = alph_index(get_motif_alph(motif), c); if (aindex > asize) { break; } if (method == TOTAL_HITS) { //If we're in this mode, then we're using LOG ODDS. 
//scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, get_motif_freqs(motif)); scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, m_pssm->matrix); } else { odd *= get_matrix_cell(motif_position, aindex, m_odds); } } odds[seq_index] = odd; } // return odds as requested (MAX or AVG scoring) double requested_odds = 0.0; if (method == AVG_ODDS){ for (seq_index = 0; seq_index < max_index; seq_index++) { requested_odds += odds[seq_index]; } requested_odds /= max_index + 1; // Divide by 0 if max_index==0 } else if (method == MAX_ODDS){ for (seq_index = 0; seq_index < max_index; seq_index++) { if (odds[seq_index] > requested_odds){ requested_odds = odds[seq_index]; } } } else if (method == SUM_ODDS) { for (seq_index = 0; seq_index < max_index; seq_index++) { requested_odds += odds[seq_index]; } } else if (method == TOTAL_HITS) { for (seq_index = 0; seq_index < max_index; seq_index++) { if (scaled_log_odds[seq_index] >= (double)get_array_length(pv_lookup)) { scaled_log_odds[seq_index] = (double)(get_array_length(pv_lookup) - 1); } double pvalue = get_array_item((int) scaled_log_odds[seq_index], pv_lookup); //Figure out how to calculate the p-value of a hit //fprintf(stderr, "m: %s pv_l len: %i scaled_log_odds: %g seq index: %i pvalue: %g\n", // get_motif_id(motif), get_array_length(pv_lookup), scaled_log_odds[seq_index], seq_index, pvalue); if (pvalue < threshold) { requested_odds++; //Add another hit. } if (verbosity > HIGHER_VERBOSE) { fprintf(stderr, "Window Data: %s\t%s\t%i\t%g\t%g\t%g\n", get_seq_name(seq), get_motif_id(motif), seq_index, scaled_log_odds[seq_index], pvalue, threshold); } } } myfree(odds); myfree(scaled_log_odds); myfree(window_seq); return requested_odds; }
/************************************************************************* * Calculate the odds score for each motif-sized window at each * site in the sequence using the given nucleotide frequencies. * * This function is a lightweight version based on the one contained in * motiph-scoring. Several calculations that are unnecessary for gomo * have been removed in order to speed up the process. * Scores sequence with up to two motifs. *************************************************************************/ double score_sequence( SEQ_T* seq, // sequence to scan (IN) double *logcumback, // cumulative bkg probability of sequence (IN) PSSM_PAIR_T* pssm_pair, // pos and neg pssms (IN) int method, // method used for scoring (IN) int last, //score only last <n> or //score all if <n> is zero (IN) BOOLEAN_T* isFeasible // FLAG indicated if there is at least one position // where the motif could be matched against (OUT) ) { assert(pssm_pair != NULL); assert(seq != NULL); PSSM_T* pos_pssm = pssm_pair->pos_pssm; assert(pos_pssm != NULL); PSSM_T* neg_pssm = pssm_pair->neg_pssm; int n_motifs = neg_pssm ? 
2 : 1; char* raw_seq = get_raw_sequence(seq); int seq_length = get_seq_length(seq); int w = get_num_rows(pos_pssm->matrix); int n = seq_length - w + 1; if (verbosity >= DUMP_VERBOSE) { fprintf(stderr, "Debug n_motifs: %d seq_length: %d w: %d n: %d.\n", n_motifs, seq_length, w, n); } // Get alphabet; char* alphabet = get_alphabet(FALSE); int alph_size = get_alph_size(ALPH_SIZE); // Dependent on the "last" parameter, change the starting point int start; int N_scored; if (last > 0 && last < seq_length) { start = seq_length - last; N_scored = n_motifs * (last - w + 1); // number of sites scored } else { start = 0; N_scored = n_motifs * n; // number of sites scored } // For each motif (positive and reverse complement) double max_odds = 0.0; double sum_odds = 0.0; double requested_odds = 0.0; int i; if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Starting scan at position %d .\n", start); } for (i=0; i<n_motifs; i++) { // pos (and negative) motif PSSM_T* pssm = (i==0 ? pos_pssm : neg_pssm); // choose +/- motif // For each site in the sequence int seq_index; for (seq_index = start; seq_index < n; seq_index++) { // site double odds = 1.0; // For each position in the motif window int motif_position; for (motif_position = 0; motif_position < w; motif_position++) { // column int i_site = seq_index + motif_position; char c = raw_seq[i_site]; // Check for gaps at this site if (c == '-' || c == '.') { N_scored--; odds = 0; break; } // Check for ambiguity codes at this site int alph_index = alphabet_index(c, alphabet); if (alph_index >= alph_size || alph_index < 0) { N_scored--; odds = 0; break; } // multiple odds by value in appropriate motif cell odds *= get_matrix_cell(motif_position, alph_index, pssm->matrix); } // column // // Apply sequence-dependent background model. 
// if (logcumback) { int i_site = seq_index; double log_p = logcumback[i_site+w] - logcumback[i_site]; // log Pr(x | background) //printf("log_p:: %g motif_pos %d\n", log_p, motif_position); double adjust = exp(w*log(1/4.0) - log_p); // Pr(x | uniform) / Pr(x | background) odds *= adjust; } // Add odds to growing sum. sum_odds += odds; // sum of odds if (odds > max_odds) max_odds = odds; // max of odds } // site } // motif if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scored %d positions with the sum odds %f and the max odds %f.\n", N_scored, sum_odds, max_odds); } // has there been anything matched at all? if (N_scored == 0){ if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr,"Sequence \'%s\' offers no location to match the motif against (sequence length too short?)\n",get_seq_name(seq)); } *isFeasible = FALSE; return 0.0; // return odds as requested (MAX or AVG scoring) } else if (method == AVG_ODDS) { requested_odds = sum_odds / N_scored; // mean } else if (method == MAX_ODDS) { requested_odds = max_odds; // maximum } else if (method == SUM_ODDS) { requested_odds = sum_odds ; // sum } return(requested_odds); } // score_sequence
void ramen_scan_sequences() { FILE* seq_file = NULL; MOTIF_T* motif = NULL; MOTIF_T* rev_motif = NULL; SEQ_T* sequence = NULL; SCANNED_SEQUENCE_T* scanned_seq = NULL; PATTERN_T* pattern; int i; int j; SEQ_T** seq_list; int num_seqs; int seq_len; //For the bdb_bg mode: ARRAY_T* seq_bg_freqs; double atcontent; double roundatcontent; double avg_seq_length = 0; //Open the file. if (open_file(args.sequence_filename, "r", FALSE, "FASTA", "sequences", &seq_file) == 0) { fprintf(stderr, "Couldn't open the file %s.\n", args.sequence_filename); ramen_terminate(1); } //Start reading in the sequences read_many_fastas(ramen_alph, seq_file, MAX_SEQ_LENGTH, &num_seqs, &seq_list); seq_ids = new_string_list(); seq_fscores = allocate_array(num_seqs); //Allocate the required space for results results = malloc(sizeof(double*) * motifs.num); for (i=0;i<motifs.num;i++) { results[i] = malloc(sizeof(double)*num_seqs); } for (j=0;j<num_seqs;j++) { fprintf(stderr, "\rScanning %i of %i sequences...", j+1, num_seqs); //copy the pointer into our current object for clarity sequence = seq_list[j]; //Read the fluorescence data from the description field. add_string(get_seq_name(sequence),seq_ids); seq_len = get_seq_length(sequence); set_array_item(j,atof(get_seq_description(sequence)),seq_fscores); //Scan with each motif. for (i=0;i<motifs.num;i++) { int motifindex = i*2; results[i][j] = ramen_sequence_scan(sequence, motif_at(motifs.motifs, motifindex), motif_at(motifs.motifs, motifindex+1), NULL, NULL, //No need to pass PSSM. 
AVG_ODDS, 0, TRUE, 0, motifs.bg_freqs); if (TRUE == args.linreg_normalise) { int k; double maxscore = 1; motif = motif_at(motifs.motifs,motifindex); for (k=0;k<get_motif_length(motif);k++) { double maxprob = 0; if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif)); maxscore *= maxprob; } results[i][j] /= maxscore; } } } }
/************************************************************************* * Calculate the odds score for each motif-sized window at each * site in the sequence using the given nucleotide frequencies. * * This function is a lightweight version based on the one contained in * motiph-scoring. Several calculations that are unnecessary for gomo * have been removed in order to speed up the process. * Scores sequence with up to two motifs. *************************************************************************/ static double score_sequence( ALPH_T* alph, // alphabet (IN) SEQ_T* seq, // sequence to scan (IN) double *logcumback, // cumulative bkg probability of sequence (IN) PSSM_PAIR_T *pssm_pair, // pos and neg pssms (IN) SCORING_EN method, // method used for scoring (IN) int last, // score only last <n> or score all if <n> // is zero (IN) BOOLEAN_T* isFeasible // FLAG indicated if there is at least one position // where the motif could be matched against (OUT) ) { PSSM_T *pos_pssm, *neg_pssm, *pssm; int strands, seq_length, w, n, asize, strand, start, N_scored, s_pos, m_pos; double max_odds, sum_odds, requested_odds, odds, adjust, log_p; int8_t *isequence, *iseq; assert(pssm_pair != NULL); assert(seq != NULL); asize = alph_size_core(alph); pos_pssm = pssm_pair->pos_pssm; assert(pos_pssm != NULL); neg_pssm = pssm_pair->neg_pssm; strands = neg_pssm ? 
2 : 1; isequence = get_isequence(seq); seq_length = get_seq_length(seq); w = get_num_rows(pos_pssm->matrix); n = seq_length - w + 1; if (verbosity >= DUMP_VERBOSE) { fprintf(stderr, "Debug strands: %d seq_length: %d w: %d n: %d.\n", strands, seq_length, w, n); } // Dependent on the "last" parameter, change the starting point if (last > 0 && last < seq_length) { start = seq_length - last; N_scored = strands * (last - w + 1); // number of sites scored } else { start = 0; N_scored = strands * n; // number of sites scored } // For each motif (positive and reverse complement) max_odds = 0.0; sum_odds = 0.0; if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Starting scan at position %d .\n", start); } for (strand = 0; strand < strands; strand++) { // pos (and negative) motif pssm = (strand == 0 ? pos_pssm : neg_pssm); // choose +/- motif // For each site in the sequence for (s_pos = start; s_pos < n; s_pos++) { odds = 1.0; // For each position in the motif window for (m_pos = 0, iseq = isequence+s_pos; m_pos < w; m_pos++, iseq++) { if (*iseq == -1) { N_scored--; odds = 0; break; } // multiple odds by value in appropriate motif cell odds *= get_matrix_cell(m_pos, *iseq, pssm->matrix); } // Apply sequence-dependent background model. if (logcumback) { log_p = logcumback[s_pos+w] - logcumback[s_pos]; // log Pr(x | background) //printf("log_p:: %g motif_pos %d\n", log_p, m_pos); adjust = exp(w*log(1/4.0) - log_p); // Pr(x | uniform) / Pr(x | background) odds *= adjust; } // Add odds to growing sum. sum_odds += odds; // sum of odds if (odds > max_odds) max_odds = odds; // max of odds } // site } // strand if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scored %d positions with the sum odds %f and the " "max odds %f.\n", N_scored, sum_odds, max_odds); } // has there been anything matched at all? 
if (N_scored == 0) { if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr,"Sequence \'%s\' offers no location to match " "the motif against (sequence length too short?)\n", get_seq_name(seq)); } *isFeasible = false; return 0.0; // return odds as requested (MAX or AVG scoring) } else if (method == AVG_ODDS) { return sum_odds / N_scored; // mean } else if (method == MAX_ODDS) { return max_odds; // maximum } else if (method == SUM_ODDS) { return sum_odds; // sum } else { die("Unknown scoring method"); // should not get here... but the compiler will complain if I don't handle this case *isFeasible = false; return 0.0; } } // score_sequence
/************************************************************************* * Build a completely connected HMM. *************************************************************************/ void build_complete_hmm (ARRAY_T* background, int spacer_states, MOTIF_T *motifs, int nmotifs, MATRIX_T *transp_freq, MATRIX_T *spacer_ave, BOOLEAN_T fim, MHMM_T **the_hmm) { ALPH_T alph; int motif_states; // Total length of the motifs. int num_spacers; // Total number of spacer states. int num_states; // Total number of states in the model. int i_motif; // Index of the current "from" motif. int j_motif; // Index of the current "to" motif. int i_position; // Index within the current motif or spacer. int i_state = 0; // Index of the current state. assert(nmotifs > 0); alph = get_motif_alph(motifs);// get the alphabet from the first motif // Count the width of the motifs. for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++) motif_states += get_motif_length(motif_at(motifs, i_motif)); // Count the spacer states adjacent to begin and end. num_spacers = nmotifs * 2; // Add the spacer states between motifs. num_spacers += nmotifs * nmotifs; // Total states = motifs + spacer_states + begin/end num_states = motif_states + (num_spacers * spacer_states) + 2; // Allocate the model. *the_hmm = allocate_mhmm(alph, num_states); // Record that this is a completely connected model. (*the_hmm)->type = COMPLETE_HMM; // Record the number of motifs in the model. (*the_hmm)->num_motifs = nmotifs; // Record the number of states in the model. (*the_hmm)->num_states = num_states; (*the_hmm)->num_spacers = ((nmotifs + 1) * (nmotifs + 1)) - 1; (*the_hmm)->spacer_states = spacer_states; // Put the background distribution into the model. copy_array(background, (*the_hmm)->background); // Build the begin state. build_complete_state( START_STATE, i_state, alph, 0, // expected length NULL, // Emissions. 0, // Number of sites. 
NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, 0, // previous motif 0, // next motif transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); i_state++; int from_motif_state, to_motif_state; // Build the spacer states. No transitions from the end state. for (i_motif = 0; i_motif <= nmotifs; i_motif++) { // No transitions to the start state. for (j_motif = 1; j_motif <= nmotifs+1; j_motif++) { // No transitions from start to end. if ((i_motif == 0) && (j_motif == nmotifs+1)) continue; // Allow multi-state spacers. for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { build_complete_state( SPACER_STATE, i_state, alph, get_matrix_cell(i_motif, j_motif, spacer_ave), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, nmotifs, i_motif, j_motif, transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); } } } // Build the motif states. for (i_motif = 0; i_motif < nmotifs; i_motif++) { MOTIF_T *this_motif = motif_at(motifs, i_motif); STATE_T state; for (i_position = 0; i_position < get_motif_length(this_motif); i_position++, i_state++) { if (i_position == 0) { state = START_MOTIF_STATE; } else if (i_position == (get_motif_length(this_motif) - 1)) { state = END_MOTIF_STATE; } else { state = MID_MOTIF_STATE; } build_complete_state( MID_MOTIF_STATE, i_state, alph, 0, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(this_motif)), get_motif_nsites(this_motif), i_motif, i_position, nmotifs, 0, // Previous motif index. 0, // Next motif index. transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); } } // Build the end state. build_complete_state( END_STATE, i_state, alph, 0, // Expected spacer length. NULL, // Emissions 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, 0, // Previous motif index. 0, // Next motif index. 
transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); i_state++; // Convert spacers to FIMs if requested. if (fim) { convert_to_fims(*the_hmm); } // Fill in the transition matrix. build_transition_matrix(*the_hmm); }
/************************************************************************* * Set up one state in a complete HMM, given the appropriate data. *************************************************************************/ static void build_complete_state (STATE_T state_type, // Type of state (START, SPACER,..) int i_state, // State index. ALPH_T alph, // alphabet int expected_length, // For spacers, the expected length of output. ARRAY_T *freqs, // Emission probability distrib. double num_sites, // Number of sites for this emission. int i_motif, // Index of motif this state is in. int i_position, // Position of this state within motif int nmotifs, // Total number of motifs. int prev_motif, // Index of previous motif. int next_motif, // Index of next motif. MATRIX_T *transp_freq, // Transition freq matrix. int spacer_states, // Number of HMM states per spacer. int num_spacers, // Total number of spacers in HMM. MOTIF_T *motifs, // Motifs. MHMM_STATE_T *a_state) // State to be filled in (pre-allocated). { MOTIF_T *motif; // The motif (for motif state) int j_motif; // Index of the current motif. if (i_motif != NON_MOTIF_INDEX) motif = motif_at(motifs, i_motif); else motif = NULL; // Tell the user what's up. if (verbosity >= NORMAL_VERBOSE) { switch (state_type) { case START_STATE : fprintf(stderr, "Building HMM: (0) "); break; case SPACER_STATE : fprintf(stderr, "%d ", i_state); break; case END_MOTIF_STATE : fprintf(stderr, "%d | ", i_state); break; case START_MOTIF_STATE : case MID_MOTIF_STATE : fprintf(stderr, "%d-", i_state); break; case END_STATE : fprintf(stderr, "(%d)\n", i_state); break; default: die("Invalid state!"); } } // Record what type of state this is. a_state->type = state_type; // Record the motif width if this is a motif. 
if (state_type == START_MOTIF_STATE || state_type == MID_MOTIF_STATE || state_type == END_MOTIF_STATE) { a_state->w_motif = get_motif_length(motif); } else { a_state->w_motif = 1; } // Set up the emission distribution and a few other tidbits. if (freqs != NULL) { // Start and end states have no emissions. a_state->emit = allocate_array(alph_size(alph, ALL_SIZE)); copy_array(freqs, a_state->emit); } a_state->num_sites = num_sites; a_state->i_motif = i_motif; a_state->i_position = i_position; // Record the motif ID character at this position. if ((state_type == START_STATE) || (state_type == END_STATE) || (state_type == SPACER_STATE)) { a_state->id_char = NON_MOTIF_ID_CHAR; } else { // motif state strncpy(a_state->motif_id, get_full_motif_id(motif), MAX_MOTIF_ID_LENGTH + 2); a_state->id_char = get_motif_id_char(i_position, motif); } assert(a_state->id_char != '\0'); // First set up the transitions into this state. switch (state_type) { case START_STATE : a_state->ntrans_in = 0; a_state->itrans_in = NULL; a_state->trans_in = NULL; break; case START_MOTIF_STATE : // Transitions come from any motif or from the start state. a_state->ntrans_in = nmotifs + 1; a_state->itrans_in = (int *)mm_malloc(sizeof(int) * (nmotifs + 1)); a_state->trans_in = allocate_array(nmotifs + 1); for (j_motif = 0; j_motif < nmotifs + 1; j_motif++) { a_state->itrans_in[j_motif] = spacer_index(j_motif, i_motif + 1, TRUE, nmotifs, spacer_states); set_array_item(j_motif, get_matrix_cell(j_motif, i_motif + 1, transp_freq), a_state->trans_in); } break; case END_STATE : // Transitions come from any motif. 
a_state->ntrans_in = nmotifs; a_state->itrans_in = (int *)mm_malloc(sizeof(int) * nmotifs); a_state->trans_in = allocate_array(nmotifs); for (j_motif = 0; j_motif < nmotifs; j_motif++) { a_state->itrans_in[j_motif] = spacer_index(j_motif + 1, nmotifs + 1, TRUE, nmotifs, spacer_states); set_array_item(j_motif, get_matrix_cell(j_motif + 1, nmotifs + 1, transp_freq), a_state->trans_in); } break; case MID_MOTIF_STATE : case END_MOTIF_STATE : a_state->ntrans_in = 1; a_state->itrans_in = (int *)mm_malloc(sizeof(int)); a_state->itrans_in[0] = i_state - 1; a_state->trans_in = allocate_array(1); set_array_item(0, 1.0, a_state->trans_in); break; case SPACER_STATE : a_state->ntrans_in = 2; a_state->itrans_in = (int *)mm_malloc(sizeof(int) * 2); a_state->trans_in = allocate_array(2); // For multi-state spacers, incoming transition from previous state. if (i_position != 0) a_state->itrans_in[0] = i_state - 1; else a_state->itrans_in[0] = motif_index(prev_motif, TRUE, num_spacers, spacer_states, motifs, nmotifs); // The other transition is a self-transition. a_state->itrans_in[1] = i_state; set_array_item(0, 1.0 - self_trans(expected_length / spacer_states), a_state->trans_in); set_array_item(1, self_trans(expected_length / spacer_states), a_state->trans_in); break; default: die("Illegal state!"); } // Then set up the transitions out of this state. switch (state_type) { case START_STATE : // Transitions go to each motif. a_state->ntrans_out = nmotifs; a_state->itrans_out = (int *)mm_malloc(sizeof(int) * nmotifs); a_state->trans_out = allocate_array(nmotifs); for (j_motif = 0; j_motif < nmotifs; j_motif++) { a_state->itrans_out[j_motif] = spacer_index(0, j_motif + 1, FALSE, nmotifs, spacer_states); set_array_item(j_motif, get_matrix_cell(0, j_motif + 1, transp_freq), a_state->trans_out); } break; case END_MOTIF_STATE : // Can go to any other motif or to the end state. 
a_state->ntrans_out = nmotifs + 1; a_state->itrans_out = (int *)mm_malloc(sizeof(int) * (nmotifs + 1)); a_state->trans_out = allocate_array(nmotifs + 1); for (j_motif = 0; j_motif < nmotifs + 1; j_motif++) { a_state->itrans_out[j_motif] = spacer_index(i_motif + 1, j_motif + 1, FALSE, nmotifs, spacer_states); set_array_item(j_motif, get_matrix_cell(i_motif + 1, j_motif + 1, transp_freq), a_state->trans_out); } break; case START_MOTIF_STATE : case MID_MOTIF_STATE : a_state->ntrans_out = 1; a_state->itrans_out = (int *)mm_malloc(sizeof(int)); a_state->itrans_out[0] = i_state + 1; a_state->trans_out = allocate_array(1); set_array_item(0, 1.0, a_state->trans_out); break; case SPACER_STATE : a_state->ntrans_out = 2; a_state->itrans_out = (int *)mm_malloc(sizeof(int) * 2); a_state->trans_out = allocate_array(2); // The first transition is a self-transition. a_state->itrans_out[0] = i_state; // For multi-state spacers, outgoing transition to next state. if (i_position < spacer_states - 1) a_state->itrans_out[1] = i_state + 1; else a_state->itrans_out[1] = motif_index(next_motif, FALSE, num_spacers, spacer_states, motifs, nmotifs); set_array_item(0, self_trans(expected_length), a_state->trans_out); set_array_item(1, 1.0 - self_trans(expected_length), a_state->trans_out); break; case END_STATE : a_state->ntrans_out = 0; a_state->itrans_out = NULL; a_state->trans_out = NULL; break; default: die("Illegal state!"); } }
main(int argc, char **argv) { int i, j, alength; int dist = 0; ALPH_T alph = PROTEIN_ALPH; char *score_filename = NULL; char *alpha; MATRIX_T *matrix; ARRAY_T *probs; double *freqs; KARLIN_INPUT_T *karlin_input; int nscores; double sum; char usage[1000] = ""; // Define the usage message. strcat(usage, "USAGE: subst_matrix [options] <score file>\n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); strcat(usage, " --dna\n"); strcat(usage, " --dist <float>\n"); strcat(usage, "\n"); // Parse the command line. while (1) { int c; int option_index = 0; const char* option_name; // Define command line options. static struct option long_options[] = { {"dna", 0, 0, 0}, {"dist", 1, 0, 0}, }; // Read the next option, and break if we're done. c = getopt_long_only(argc, argv, "+", long_options, &option_index); if (c == -1) { break; } else if (c != 0) { die("Invalid return from getopt (%d)\n", c); } // Get the option name (we only use long options). option_name = long_options[option_index].name; if (strcmp(option_name, "dna") == 0) { alph = DNA_ALPH; } else if (strcmp(option_name, "dist") == 0) { dist = atoi(optarg); } else { die("Invalid option (%s).\n", option_name); } } // Read the single required argument. if (optind + 1 != argc) { fprintf(stderr, usage); exit(1); } score_filename = argv[optind]; alength = alph_size(alph, ALPH_SIZE); /* background frequencies */ probs = allocate_array(alength); freqs = alph == DNA_ALPH ? 
pam_dna_freq : pam_prot_freq; fill_array(freqs, probs); /* copy freqs into ARRAY_T */ if (dist > 1) { printf("From gen_pam_matrix:\n"); matrix = gen_pam_matrix(alph, dist, FALSE); printf("%6c ", ' '); for (i=0; i<alength; i++) { printf("%6c ", alph_char(alph, i)); } printf("\n"); sum = 0; for (i=0; i<alength; i++) { printf("%6c ", alph_char(alph, i)); for (j=0; j<alength; j++) { double x = get_matrix_cell(i,j,matrix); sum += x; printf("%6.4f ", x); } printf("\n"); } printf("sum of entries = %f\n", sum); } printf("From get_subst_target_matrix:\n"); matrix = get_subst_target_matrix(score_filename, alph, dist, probs); } /* main */