/***********************************************************************
 * Returns the string that is the best possible match to the given motif.
 *
 * For every motif position the core-alphabet letter with the largest
 * frequency is chosen.  The comparison is ">=", so when several letters
 * tie the one with the highest alphabet index is reported.
 *
 * The motif, its frequency matrix and the matrix dimensions are checked
 * with assertions before any field is used.
 *
 * Caller is responsible for freeing string.
 ***********************************************************************/
char *get_best_possible_match(MOTIF_T *motif) {
  int mpos, apos, asize;
  char *match_string;
  ALPH_SIZE_T size;

  // Validate the motif before anything dereferences it.
  assert(motif != NULL);
  assert(motif->freqs != NULL);
  assert(motif->length == motif->freqs->num_rows);
  size = (motif->flags & MOTIF_HAS_AMBIGS ? ALL_SIZE : ALPH_SIZE);
  assert(alph_size(motif->alph, size) == motif->freqs->num_cols);

  // Only the core (non-ambiguous) letters are candidates.
  asize = alph_size(motif->alph, ALPH_SIZE);

  match_string = mm_malloc(sizeof(char) * (motif->length + 1));

  // Find the highest scoring character at each position in the motif.
  for (mpos = 0; mpos < motif->length; ++mpos) {
    ARRAY_T *row = motif->freqs->rows[mpos];
    double max_v = row->items[0];
    int max_i = 0;
    for (apos = 1; apos < asize; ++apos) {
      if (row->items[apos] >= max_v) {
        max_i = apos;
        max_v = row->items[apos];
      }
    }
    match_string[mpos] = alph_char(motif->alph, max_i);
  }

  // Add null termination
  match_string[motif->length] = '\0';

  return match_string;
}
/**************************************************************************
 * Get pseudocount frequencies.
 *
 * For each core letter i the pseudocount is
 *     g_i = sum_j (f_j / b_j) * q_ij
 * where q is the target frequency matrix, f the foreground and b the
 * background distribution.  The target_freq matrix only has values for
 * the basic alphabet, so the ambiguous entries of the result are filled
 * in afterwards by calc_ambigs().
 *
 * Returns a newly allocated array sized for the full alphabet
 * (core letters plus ambiguous characters).
 **************************************************************************/
ARRAY_T *get_pseudocount_freqs(
  ALPH_T alph,
  ARRAY_T * f,              /* Foreground distribution. */
  ARRAY_T * b,              /* Background distribution. */
  MATRIX_T * target_freq    /* Target frequency matrix. */
)
{
  const int core_size = alph_size(alph, ALPH_SIZE);            // excludes ambigs
  ARRAY_T *pseudo = allocate_array(alph_size(alph, ALL_SIZE)); // includes ambigs
  int row;

  /* One pseudocount per non-ambiguous letter. */
  for (row = 0; row < core_size; row++) {
    double acc = 0;
    int col;

    for (col = 0; col < core_size; col++) {
      double qij = get_matrix_cell(row, col, target_freq);
      double fj = get_array_item(col, f);
      double bj = get_array_item(col, b);
      acc += (fj/bj) * qij;
    }
    set_array_item(row, acc, pseudo);

    if (SUBST_MATRIX_DEBUG) {
      printf("%g %g, ", get_array_item(row, f), acc);
    }
  }

  /* Derive the entries for the ambiguous characters. */
  calc_ambigs(alph, FALSE, pseudo);

  if (SUBST_MATRIX_DEBUG) {
    printf("\n");
  }

  return pseudo;
} /* get_pseudocount_freqs */
/***********************************************************************
 * Turn a given motif into its own reverse complement.
 *
 * The rows of the frequency matrix (and of the score matrix, when one
 * is present) are reversed and complemented in place, the left/right
 * trim values are exchanged and the strand indicator is flipped.
 * The motif must use the DNA alphabet (asserted).
 ***********************************************************************/
void reverse_complement_motif
  (MOTIF_T* a_motif)
{
  ARRAY_T* front_row;
  ARRAY_T* back_row;
  ARRAY_T* scratch;   // Holds one row while a pair is being exchanged.
  ALPH_SIZE_T width;
  int half, pos, old_trim_left;

  assert(a_motif->alph == DNA_ALPH);

  // Number of row pairs to process; the middle row of an odd-length
  // motif is paired with itself.
  half = (int)((a_motif->length + 1) / 2);

  // Frequencies: the row width depends on whether ambigs are stored.
  width = (a_motif->flags & MOTIF_HAS_AMBIGS ? ALL_SIZE : ALPH_SIZE);
  scratch = allocate_array(alph_size(a_motif->alph, width));
  for (pos = 0; pos < half; pos++) {
    front_row = get_matrix_row(pos, a_motif->freqs);
    back_row = get_matrix_row(a_motif->length - (pos + 1), a_motif->freqs);

    // Save the front row, then complement in both directions.
    copy_array(front_row, scratch);
    complement_dna_freqs(back_row, front_row);
    complement_dna_freqs(scratch, back_row);
  }
  free_array(scratch);

  // Scores: same procedure, using core-alphabet-width rows.
  if (a_motif->scores) {
    scratch = allocate_array(alph_size(a_motif->alph, ALPH_SIZE));
    for (pos = 0; pos < half; pos++) {
      front_row = get_matrix_row(pos, a_motif->scores);
      back_row = get_matrix_row(a_motif->length - (pos + 1), a_motif->scores);

      copy_array(front_row, scratch);
      complement_dna_freqs(back_row, front_row);
      complement_dna_freqs(scratch, back_row);
    }
    free_array(scratch);
  }

  // Exchange the trimming variables.
  old_trim_left = a_motif->trim_left;
  a_motif->trim_left = a_motif->trim_right;
  a_motif->trim_right = old_trim_left;

  // Flip the strand indicator; anything other than '-' (including '?')
  // is treated as '+'.
  set_motif_strand((get_motif_strand(a_motif) == '-') ? '+' : '-', a_motif);
}
/***********************************************************************
 * Read the background letter frequencies from XML.
 *
 * The number of <value> nodes under
 * background_frequencies/alphabet_array must equal the size of the core
 * alphabet (asserted).  Each letter is then looked up individually by
 * its letter_id attribute so that values are stored in alphabet order
 * regardless of their order in the document.  The core frequencies are
 * normalized and the ambiguous characters are filled in by
 * calc_ambigs().
 *
 * Returns an array sized for the full alphabet.
 * Caller is responsible for freeing the returned array.
 ***********************************************************************/
ARRAY_T* read_bg_freqs_from_xml(xmlXPathContextPtr xpath_ctxt, ALPH_T alph) {

  // Size of the buffer used to build per-letter XPath expressions.
  // An enum constant keeps the buffer a plain array rather than a VLA.
  enum { MAX_XPATH_EXPRESSION = 200 };

  xmlXPathObjectPtr xpathObj = NULL;
  ATYPE value;
  ARRAY_T* bg_freqs;
  int a_size = alph_size(alph, ALPH_SIZE);

  // Use XPATH to get the background frequencies from XML
  xpathObj = xpath_query(
    xpath_ctxt,
    "//*/background_frequencies/alphabet_array/value"
  );
  int num_values = (xpathObj->nodesetval ? xpathObj->nodesetval->nodeNr : 0);
  xmlXPathFreeObject(xpathObj);

  // The number of background frequences should match the alphabet size.
  assert(num_values == a_size);

  // Allocate the array.
  bg_freqs = allocate_array(alph_size(alph, ALL_SIZE));

  // XML doesn't enforce any order on the emission probability values,
  // so force reading bg frequency values in alphabet order.
  char xpath_expression[MAX_XPATH_EXPRESSION];
  xmlNodePtr currValueNode = NULL;
  int i_node = 0;
  for (i_node = 0; i_node < a_size; i_node++) {
    // Build the XPATH expression to get bg freq for a character.
    snprintf(
      xpath_expression,
      MAX_XPATH_EXPRESSION,
      "//*/background_frequencies/"
      "alphabet_array/value[@letter_id='letter_%c']",
      alph_char(alph, i_node)
    );
    // Read the selected bg frequency.
    xpathObj = xpath_query(xpath_ctxt, xpath_expression);
    // Should only find one node.  The node set pointer may be NULL when
    // nothing matched, so check it before looking at the count.
    assert(xpathObj->nodesetval != NULL);
    assert(xpathObj->nodesetval->nodeNr == 1);
    // Decode from node set to numeric value for bg freq while the
    // query result is still alive, then release it.
    currValueNode = xpathObj->nodesetval->nodeTab[0];
    value = xmlXPathCastNodeToNumber(currValueNode);
    xmlXPathFreeObject(xpathObj);
    set_array_item(i_node, value, bg_freqs);
  }

  // Make sure the frequencies add up to 1.0.
  normalize_subarray(0, a_size, 0.0, bg_freqs);

  // Fill in ambiguous characters.
  calc_ambigs(alph, FALSE, bg_freqs);

  return bg_freqs;
}
/*
 * Load uniform frequencies into the array.
 *
 * When freqs is NULL a new array sized for the full alphabet is
 * allocated; otherwise the supplied array must be at least that long
 * (asserted).  Each core letter receives 1/n, where n is the core
 * alphabet size, and calc_ambigs() then fills the ambiguous entries.
 * Returns the array that was filled.
 */
ARRAY_T* get_uniform_frequencies(ALPH_T alph, ARRAY_T *freqs) {
  const int core_count = ALPH_ASIZE[alph];
  int idx = 0;

  if (freqs == NULL) {
    freqs = allocate_array(alph_size(alph, ALL_SIZE));
  }
  assert(get_array_length(freqs) >= alph_size(alph, ALL_SIZE));

  while (idx < core_count) {
    set_array_item(idx, 1.0/core_count, freqs);
    idx++;
  }

  calc_ambigs(alph, FALSE, freqs);
  return freqs;
}
/*
 * Load the non-redundant database frequencies into the array.
 *
 * When freqs is NULL a new array sized for the full alphabet is
 * allocated; otherwise the supplied array must be at least that long
 * (asserted).  The core entries are copied from the ALPH_NRDB table for
 * the alphabet, normalized, and the ambiguous entries are then filled
 * in by calc_ambigs().  Returns the array that was filled.
 */
ARRAY_T* get_nrdb_frequencies(ALPH_T alph, ARRAY_T *freqs) {
  const int core_count = ALPH_ASIZE[alph];
  const PROB_T *table;
  int idx;

  if (freqs == NULL) {
    freqs = allocate_array(alph_size(alph, ALL_SIZE));
  }
  assert(get_array_length(freqs) >= alph_size(alph, ALL_SIZE));

  table = ALPH_NRDB[alph];
  for (idx = 0; idx < core_count; idx++) {
    set_array_item(idx, table[idx], freqs);
  }

  normalize_subarray(0, core_count, 0.0, freqs);
  calc_ambigs(alph, FALSE, freqs);
  return freqs;
}
/***********************************************************************
 * Compute the complexity of a motif.
 *
 * Motif complexity is the average K-L distance between the "motif
 * background distribution" and each column of the motif.  The motif
 * background is just the average distribution of all the columns.  The
 * K-L distance, which measures the difference between two
 * distributions, is the same as the information content:
 *
 *  \sum_i p_i log(p_i/f_i)
 *
 * Only the core-alphabet columns contribute.  The summed distance is
 * divided by the motif length before it is returned.
 * This value increases with increasing complexity.
 ***********************************************************************/
double compute_motif_complexity
  (MOTIF_T* a_motif)
{
  const int letters = alph_size(a_motif->alph, ALPH_SIZE);
  const int positions = a_motif->length;
  ARRAY_T* mean_dist;   // Mean emission distribution.
  double kl_sum = 0;
  int pos, letter;

  // Average the rows of the frequency matrix.
  mean_dist = get_matrix_col_sums(a_motif->freqs);
  scalar_mult(1.0 / (double)positions, mean_dist);

  // Accumulate the K-L distance of every position w.r.t. the mean.
  for (pos = 0; pos < positions; pos++) {
    ARRAY_T* emission = get_matrix_row(pos, a_motif->freqs);
    for (letter = 0; letter < letters; letter++) {
      ATYPE this_item = get_array_item(letter, emission);
      ATYPE background_item = get_array_item(letter, mean_dist);

      // Use two logs to avoid handling divide-by-zero as a special case.
      kl_sum += this_item * (my_log(this_item) - my_log(background_item));
    }
  }

  free_array(mean_dist);
  return(kl_sum / (double)positions);
}
/***********************************************************************
 * Apply a pseudocount to the motif pspm.
 *
 * Every core-alphabet frequency p is replaced by
 *     (p * sites + pseudocount * background[letter]) / (sites + pseudocount)
 * where sites is the motif's site count, or DEFAULT_SITE_COUNT when the
 * motif does not have a positive site count.
 *
 * motif        the motif whose frequency matrix is updated in place
 * background   background distribution; NULL selects a uniform one
 * pseudocount  total pseudocount to add; must be >= 0, 0 is a no-op
 ***********************************************************************/
void apply_pseudocount_to_motif
  (MOTIF_T* motif, ARRAY_T *background, double pseudocount)
{
  int pos, letter, len, asize, sites;
  double prob, count, total;
  ARRAY_T *temp;

  // no point in doing work when it makes no difference
  if (pseudocount == 0) return;
  assert(pseudocount > 0);
  // motif dimensions
  asize = alph_size(motif->alph, ALPH_SIZE);
  len = motif->length;
  // create a uniform background if none is given
  temp = NULL;
  if (background == NULL) {
    // get_uniform_frequencies() requires (and fills) an array covering
    // the full alphabet including ambiguous characters, so the
    // temporary must be allocated with ALL_SIZE entries, not ALPH_SIZE.
    temp = allocate_array(alph_size(motif->alph, ALL_SIZE));
    get_uniform_frequencies(motif->alph, temp);
    background = temp;
  }
  // calculate the counts
  sites = (motif->num_sites > 0 ? motif->num_sites : DEFAULT_SITE_COUNT);
  total = sites + pseudocount;
  for (pos = 0; pos < len; ++pos) {
    for (letter = 0; letter < asize; ++letter) {
      prob = get_matrix_cell(pos, letter, motif->freqs);
      count = (prob * sites) + (pseudocount * get_array_item(letter, background));
      prob = count / total;
      set_matrix_cell(pos, letter, prob, motif->freqs);
    }
  }
  if (temp) free_array(temp);
}
void dxml_start_motif(void *ctx, char *id, char *seq, int length, long num_sites, long p_hits, long n_hits, double pvalue, double evalue, double uevalue) { CTX_T *data; MOTIF_T *motif; data = (CTX_T*)ctx; data->motif = (MOTIF_T*)mm_malloc(sizeof(MOTIF_T)); motif = data->motif; memset(motif, 0, sizeof(MOTIF_T)); set_motif_id(seq, strlen(seq), motif); set_motif_id2("", 0, motif); set_motif_strand('+', motif); motif->length = length; motif->num_sites = num_sites; motif->evalue = evalue; // both DNA and RNA have 4 letters motif->alph = data->fscope.alphabet; motif->flags = MOTIF_BOTH_STRANDS; // DREME does not support the concept of single strand scanning (yet) // allocate the matrix motif->freqs = allocate_matrix(motif->length, alph_size(motif->alph, ALPH_SIZE)); motif->scores = NULL; // no scores in DREME xml // no url in DREME motif->url = strdup(""); // set by postprocessing motif->complexity = -1; motif->trim_left = 0; motif->trim_right = 0; }
/*************************************************************************
 * Convert an integer representing a column in a PSSM into the
 * corresponding alignment column string.
 * If the alphabet has m characters, and the alignment columns have n entries,
 * the array of all alignment columns is conveniently numbered by the set of
 * consecutive n-digit base m numerals:
 *   AAAA = 0000, AAAC = 0001, ..., TTTG = 3332, TTTT = 3333.
 * The caller must allocate the memory for the alignment column string.
 * The memory required is the number of sequences in the alignment, plus one
 * for the terminating null.
 *
 * alph                  alphabet supplying the digit characters
 * alignment_col_index   column number; must lie in [0, m^n)
 * alignment_col         output buffer of at least alignment_col_size + 1 chars
 * alignment_col_size    number of entries (n) in the column; must be >= 1
 *************************************************************************/
void unhash_alignment_col(
  ALPH_T alph,
  int alignment_col_index,
  char *alignment_col,
  int alignment_col_size
) {
  int asize = alph_size(alph, ALPH_SIZE);

  assert(alignment_col_index >= 0);
  assert(alignment_col != NULL);
  assert(alignment_col_size >= 1);
  // The index must be expressible with alignment_col_size base-asize
  // digits, so the bound is asize^alignment_col_size.
  assert(
    alignment_col_index < pow((double) asize, (double) alignment_col_size)
  );

  alignment_col[alignment_col_size] = '\0';

  // Peel off base-asize digits, least significant (rightmost) first.
  int i, j;
  for (i = alignment_col_size - 1; i >= 0; i--) {
    j = alignment_col_index % asize;
    alignment_col_index -= j;
    alignment_col[i] = alph_char(alph, j);
    alignment_col_index /= asize;
  }
} // unhash_alignment_col
/*
 * When the parser has been selected do some processing.
 *
 * Asks the selected format for its alphabet and background.  A supplied
 * background is normalized over the core alphabet, resized to the full
 * alphabet and completed with calc_ambigs(); when the format provides
 * none, uniform frequencies are loaded instead.  Finally the pseudocount
 * background is set up.
 */
static void parser_selected(MREAD_T *mread) {
  MFORMAT_T *chosen = mread->formats;
  // get the alphabet
  ALPH_T alph = chosen->get_alphabet(mread->formats->data);

  // get the background
  if (!chosen->get_bg(chosen->data, &(mread->motif_bg))) {
    mread->motif_bg = get_uniform_frequencies(alph, mread->motif_bg);
  } else {
    normalize_subarray(0, alph_size(alph, ALPH_SIZE), 0.0, mread->motif_bg);
    resize_array(mread->motif_bg, alph_size(alph, ALL_SIZE));
    calc_ambigs(alph, FALSE, mread->motif_bg);
  }

  set_pseudo_bg(mread);
}
/**************************************************************************
 *
 * reverse_complement_pssm
 *
 * Turn a pssm matrix into its own reverse complement, in place.
 * Rows are exchanged end-for-end and each row is complemented with
 * complement_dna_freqs().
 *
 *************************************************************************/
static void reverse_complement_pssm (
  ALPH_T alph,
  MATRIX_T* pssm_matrix
)
{
  const int n_rows = get_num_rows(pssm_matrix);
  // Temporary space for one row during the exchange.
  ARRAY_T* scratch = allocate_array(alph_size(alph, ALL_SIZE));
  const int half = (int)((n_rows+1) / 2);
  int pos;

  // Walk inward from both ends; the middle row of an odd-length
  // matrix is paired with itself.
  for (pos = 0; pos < half; pos++) {
    ARRAY_T* front_row = get_matrix_row(pos, pssm_matrix);
    ARRAY_T* back_row = get_matrix_row(n_rows - (pos + 1), pssm_matrix);

    // Save the front row, then complement in both directions.
    copy_array(front_row, scratch);
    complement_dna_freqs(back_row, front_row);
    complement_dna_freqs(scratch, back_row);
  }

  free_array(scratch);
} // reverse_complement_pssm
/*************************************************************************
 * Calculate the log odds score for a single motif-sized window.
 *
 * Sums the PSSM entries selected by the letters of seq over the width
 * of the PSSM.  Returns FALSE (leaving *score untouched) as soon as a
 * character is not a core-alphabet letter, e.g. a gap or an ambiguity
 * code.  Otherwise stores the unscaled score in *score and returns
 * TRUE; a scaled score beyond the end of the p-value table is first
 * clamped to the table's last index.
 *************************************************************************/
static inline BOOLEAN_T score_motif_site(
  ALPH_T alph,
  char *seq,
  PSSM_T *pssm,
  double *score // OUT
) {
  const int core_size = alph_size(alph, ALPH_SIZE);
  MATRIX_T* scores = pssm->matrix;
  double scaled_sum = 0.0;
  int pos = 0;

  // Accumulate the scaled score position by position.
  while (pos < pssm->w) {
    char letter = seq[pos];
    int letter_idx = alph_index(alph, letter);
    // Check for gaps and ambiguity codes at this site
    if (letter_idx == -1 || letter_idx >= core_size) {
      return FALSE;
    }
    scaled_sum += get_matrix_cell(pos, letter_idx, scores);
    pos++;
  }

  *score = get_unscaled_pssm_score(scaled_sum, pssm);

  // Handle scores that are out of range
  if ((int) scaled_sum >= get_array_length(pssm->pv)) {
    scaled_sum = (float)(get_array_length(pssm->pv) - 1);
    *score = scaled_to_raw(scaled_sum, pssm->w, pssm->scale, pssm->offset);
  }

  return TRUE;
}
/**************************************************************************
 *
 * hash_sequence
 *
 * Hash a sequence, compressing hash_n letters into 1.
 *
 * Position i of the result encodes letters i .. i+hash_n-1 of the input
 * as a base-(full alphabet size + 1) numeral, most significant letter
 * first.  Where the window runs off the end of the sequence the missing
 * letters contribute 0.
 *
 * Return the newly allocated sequence (seq_length entries).
 *
 *************************************************************************/
static int* hash_sequence(
  ALPH_T alph,
  int *int_sequence,  // Sequence in integer format.
  int seq_length,     // Length of sequence.
  int hash_n          // Number of letters to compress to 1.
)
{
  int i, j;
  int base = alph_size(alph, ALL_SIZE) + 1; // Base to hash to.
  int* hashed_sequence = NULL;

  // Allocate the hashed sequence.
  mm_resize(hashed_sequence, seq_length, int);

  for (i = 0; i < seq_length; i++) {
    int c = int_sequence[i]; // Character in hashed alphabet.

    // Shift in the remaining letters of the window.  Every read is
    // bounds-checked so a window that runs off the end of the sequence
    // never touches memory beyond int_sequence.
    for (j = 1; j < hash_n; j++) {
      c = (base * c);
      if (i + j < seq_length) {
        c += int_sequence[i + j];
      }
    }

    hashed_sequence[i] = c; // Record the hashed character.
  }
  return(hashed_sequence);
} // hash_sequence
/*
 * Tests the letter against the alphabet. If the alphabet is unknown
 * it attempts to work it out and set it from the letter.
 * For simplicity this assumes you will pass indexes in ascending order.
 *
 * While the alphabet is unknown: index 0 must be 'A', index 1 must be
 * 'C', index 2 selects protein on 'D' or is accepted as nucleotide on
 * 'G', and index 3 selects DNA on 'T' or 'U'.  Any later index while
 * the alphabet is still unknown is a fatal error.
 * Once the alphabet is known the letter must equal the alphabet's
 * letter at that index.  The comparison is case-insensitive.
 *
 * Returns false if the letter is unacceptable
 */
BOOLEAN_T alph_test(ALPH_T *alpha, int index, char letter) {
  char uc_letter;
  // toupper() is only defined for values representable as unsigned char
  // (or EOF), so convert before the call.
  uc_letter = toupper((unsigned char)letter);
  if (*alpha == INVALID_ALPH) {
    switch (index) {
      case 0:
        return (uc_letter == 'A');
      case 1:
        return (uc_letter == 'C');
      case 2:
        if (uc_letter == 'D') {
          *alpha = PROTEIN_ALPH;
          return TRUE;
        }
        return (uc_letter == 'G'); // DNA or RNA
      case 3:
        if (uc_letter == 'T') {
          *alpha = DNA_ALPH;
        } else if (uc_letter == 'U') {
          *alpha = DNA_ALPH; //FIXME need RNA but substitute DNA for now
        } else {
          return FALSE;
        }
        return TRUE;
      default: // Bad state!
        die("Should not still be attempting to guess by the 5th letter "
            "(index = %d).", index);
        return FALSE;
    }
  } else {
    if (index >= alph_size(*alpha, ALPH_SIZE)) return FALSE; // index too big
    return (uc_letter == alph_char(*alpha, index));
  }
}
/*
 * Generate a PAM matrix for the DNA or protein alphabet (asserted).
 *
 * The PAM 1 conditional matrix is built from the trans (DNA) or dayhoff
 * (protein) table, raised to the requested distance by repeated matrix
 * multiplication, and finally converted either to a rounded log-odds
 * matrix or to a joint (target frequency) matrix.
 *
 * Returns a newly allocated matrix.
 */
extern MATRIX_T* gen_pam_matrix(
  ALPH_T alph,                  /* alphabet */
  int dist,                     /* PAM distance */
  BOOLEAN_T logodds             /* true: generate log-odds matrix
                                   false: generate target frequency matrix
                                */
)
{
  assert(alph == DNA_ALPH || alph == PROTEIN_ALPH);
  int row, col, remaining;
  MATRIX_T *pam, *pam1;
  BOOLEAN_T is_dna = (alph == DNA_ALPH);
  /* standard frequencies */
  double *std_freq = is_dna ? pam_dna_freq : pam_prot_freq;
  /* length of standard alphabet */
  int n_letters = alph_size(alph, ALPH_SIZE);
  /* same as in "pam" Version 1.0.6 */
  double scale;
  if (dist < 170) {
    scale = 2/log(2);
  } else {
    scale = 3/log(2);
  }

  /* create the array for the joint probability matrix */
  pam = allocate_matrix(n_letters, n_letters);
  pam1 = allocate_matrix(n_letters, n_letters);

  /* initialize the matrix: PAM 1:
     due to roundoff, take the average of the two estimates of the joint
     frequency of i and j as the joint, then compute the conditionals for
     the matrix
  */
  for (row = 0; row < n_letters; row++) {
    for (col = 0; col <= row; col++) {
      double vij = is_dna ? trans[row][col] : dayhoff[row][col];
      double vji = is_dna ? trans[col][row] : dayhoff[col][row];
      /* use average to fix rndoff */
      double joint = ((vij * std_freq[col]) + (vji * std_freq[row]))/20000;
      set_matrix_cell(row, col, joint/std_freq[col], pam);
      if (row != col) {
        set_matrix_cell(col, row, joint/std_freq[row], pam);
      }
    }
  }

  /* take PAM matrix to desired power to scale it */
  copy_matrix(pam, pam1);
  remaining = dist;
  while (remaining > 1) {
    MATRIX_T *product = matrix_multiply(pam, pam1);
    /* keep the product, release the previous power */
    SWAP(MATRIX_T*, product, pam)
    free_matrix(product);
    remaining--;
  }
  free_matrix(pam1);

  /* convert to joint or logodds matrix:
     target:  J_ij = Pr(i,j) = Mij pr(j)
     logodds: L_ij = log (Pr(i,j)/(Pr(i)Pr(j)) = log (Mij Pr(j)/Pr(i)Pr(j))
                   = log(Mij/pr(i))
  */
  for (row = 0; row < n_letters; row++) {
    for (col = 0; col < n_letters; col++) {
      double cell = get_matrix_cell(row, col, pam);
      if (logodds) {
        cell = nint(scale * log((cell+EPSILON)/std_freq[row]));
      } else {
        cell = cell * std_freq[col];
      }
      set_matrix_cell(row, col, cell, pam);
    }
  }

  return pam;
} /* gen_pam_matrix */
/***********************************************************************
 * Normalize the motif's pspm.
 *
 * Calls normalize_subarray() on the core-alphabet portion of every row
 * of the frequency matrix, passing the given tolerance through.
 ***********************************************************************/
void normalize_motif
  (MOTIF_T *motif, double tolerance)
{
  const int core_size = alph_size(motif->alph, ALPH_SIZE);
  int pos = 0;

  while (pos < motif->length) {
    ARRAY_T *row = get_matrix_row(pos, motif->freqs);
    normalize_subarray(0, core_size, tolerance, row);
    ++pos;
  }
}
/***********************************************************************
 * Calculate the ambiguous letters from the concrete ones.
 *
 * Resizes the frequency matrix to the full alphabet width, marks the
 * motif as containing ambigs and runs calc_ambigs() on every row.
 ***********************************************************************/
void calc_motif_ambigs
  (MOTIF_T *motif)
{
  const int full_width = alph_size(motif->alph, ALL_SIZE);
  int pos;

  resize_matrix(motif->length, full_width, 0, motif->freqs);
  motif->flags |= MOTIF_HAS_AMBIGS;

  for (pos = 0; pos < motif->length; pos++) {
    ARRAY_T *row = get_matrix_row(pos, motif->freqs);
    calc_ambigs(motif->alph, FALSE, row);
  }
}
/***********************************************************************
 * Copy a motif from one place to another.
 *
 * dest is zeroed first, so anything it previously held is not released
 * here.  Scalar fields and ids are copied directly; the frequency and
 * score matrices, when present in source, are duplicated into newly
 * allocated matrices; the url is copied with copy_string().
 ***********************************************************************/
void copy_motif
  (MOTIF_T* source,
   MOTIF_T* dest)
{
  memset(dest, 0, sizeof(MOTIF_T));

  // Identifiers.
  strcpy(dest->id, source->id);
  strcpy(dest->id2, source->id2);

  // Plain value fields.
  dest->length = source->length;
  dest->alph = source->alph;
  dest->flags = source->flags;
  dest->evalue = source->evalue;
  dest->num_sites = source->num_sites;
  dest->complexity = source->complexity;
  dest->trim_left = source->trim_left;
  dest->trim_right = source->trim_right;

  // Frequencies: width depends on whether ambigs are stored.
  if (!source->freqs) {
    dest->freqs = NULL;
  } else {
    ALPH_SIZE_T width = (dest->flags & MOTIF_HAS_AMBIGS ? ALL_SIZE : ALPH_SIZE);
    dest->freqs = allocate_matrix(dest->length, alph_size(dest->alph, width));
    copy_matrix(source->freqs, dest->freqs);
  }

  // Scores: note that scores don't contain ambigs.
  if (!source->scores) {
    dest->scores = NULL;
  } else {
    dest->scores = allocate_matrix(dest->length, alph_size(dest->alph, ALPH_SIZE));
    copy_matrix(source->scores, dest->scores);
  }

  // Release any url already attached to dest before copying.
  if (dest->url != NULL) {
    free(dest->url);
    dest->url = NULL;
  }
  copy_string(&(dest->url), source->url);
}
/*************************************************************************
 * Converts the motif frequency matrix into a odds matrix:
 * taken from old ama-scan.c
 *
 * Every core-alphabet entry of the motif's frequency matrix is divided,
 * in place, by the background frequency of its letter.
 *************************************************************************/
void convert_to_odds_matrix(MOTIF_T* motif, ARRAY_T* bg_freqs){
  const int core_size = alph_size(get_motif_alph(motif), ALPH_SIZE);
  MATRIX_T *matrix = get_motif_freqs(motif);
  const int n_positions = get_num_rows(matrix);
  int letter, pos;

  for (letter = 0; letter < core_size; letter++) {
    const double bg = get_array_item(letter, bg_freqs);
    for (pos = 0; pos < n_positions; pos++) {
      matrix->rows[pos]->items[letter] /= bg;
    }
  }
}
/*************************************************************************
 * Copies the motif frequency matrix and converts it into a odds matrix
 *
 * The motif itself is left unchanged.  In the returned duplicate every
 * core-alphabet entry has been divided by the background frequency of
 * its letter.
 *************************************************************************/
MATRIX_T* create_odds_matrix(MOTIF_T *motif, ARRAY_T* bg_freqs){
  const int core_size = alph_size(get_motif_alph(motif), ALPH_SIZE);
  MATRIX_T *result = duplicate_matrix(get_motif_freqs(motif));
  const int n_positions = get_num_rows(result);
  int letter;

  for (letter = 0; letter < core_size; letter++) {
    const double bg = get_array_item(letter, bg_freqs);
    int row = 0;
    while (row < n_positions) {
      result->rows[row]->items[letter] /= bg;
      row++;
    }
  }
  return result;
}
/***********************************************************************
 * Return one column of a motif, as a newly allocated array of counts.
 *
 * Each count is the motif's num_sites multiplied by the frequency of
 * the letter at the given position; only core-alphabet letters are
 * included.
 * This assumes that num_sites is a reasonable value and not zero...
 ***********************************************************************/
ARRAY_T* get_motif_counts
  (int position,
   MOTIF_T* motif)
{
  const int core_size = alph_size(motif->alph, ALPH_SIZE);
  ARRAY_T* counts = allocate_array(core_size);
  int letter = 0;

  while (letter < core_size) {
    set_array_item(letter,
                   motif->num_sites * get_matrix_cell(position, letter, motif->freqs),
                   counts);
    letter++;
  }
  return(counts);
}
/***********************************************************************
 * Calculates the information content of a position of the motif.
 *
 * Computed as my_log2(alphabet size) minus the entropy of the
 * core-alphabet frequencies at the position, where the entropy is the
 * negated sum of p * my_log2(p).
 ***********************************************************************/
static inline double position_information_content(
  MOTIF_T *a_motif,
  int position
) {
  const int core_size = alph_size(a_motif->alph, ALPH_SIZE);
  ARRAY_T *row = get_matrix_row(position, a_motif->freqs);
  double entropy = 0;
  double p;
  int letter;

  for (letter = 0; letter < core_size; letter++) {
    p = get_array_item(letter, row);
    entropy -= p*my_log2(p);
  }
  return my_log2(core_size) - entropy;
}
/*************************************************************************
 * Output JSON data for a motif
 *
 * Writes one JSON object holding the motif's identification, its
 * core-alphabet frequency matrix ("pwm"), the summary statistics taken
 * from stats, and every second entry of the site count array between
 * index (len - 1) and (allocated - (len - 1)).
 *************************************************************************/
static void output_motif_json(JSONWR_T* json, MOTIF_STATS_T* stats,
    SITE_COUNTS_T* counts) {
  MOTIF_T *motif = stats->motif;
  MATRIX_T *pwm = get_motif_freqs(motif);
  int core_size = alph_size(get_motif_alph(motif), ALPH_SIZE);
  int width, row, col, idx, stop;

  jsonwr_start_object_value(json);

  // identification
  jsonwr_lng_prop(json, "db", stats->db->id);
  jsonwr_str_prop(json, "id", get_motif_id(motif));
  if (*(get_motif_id2(motif))) {
    jsonwr_str_prop(json, "alt", get_motif_id2(motif));
  }
  width = get_motif_length(motif);
  jsonwr_lng_prop(json, "len", width);
  jsonwr_dbl_prop(json, "motif_evalue", get_motif_evalue(motif));
  jsonwr_dbl_prop(json, "motif_nsites", get_motif_nsites(motif));
  if (get_motif_url(motif) && *get_motif_url(motif)) {
    jsonwr_str_prop(json, "url", get_motif_url(motif));
  }

  // frequency matrix: one inner array per motif position
  jsonwr_property(json, "pwm");
  jsonwr_start_array_value(json);
  row = 0;
  while (row < width) {
    jsonwr_start_array_value(json);
    for (col = 0; col < core_size; col++) {
      jsonwr_dbl_value(json, get_matrix_cell(row, col, pwm));
    }
    jsonwr_end_array_value(json);
    row++;
  }
  jsonwr_end_array_value(json);

  // summary statistics
  jsonwr_lng_prop(json, "bin_width", stats->central_window+1);
  jsonwr_dbl_prop(json, "bin_sites", stats->central_sites);
  jsonwr_lng_prop(json, "total_sites", counts->total_sites);
  jsonwr_dbl_prop(json, "log_pvalue", stats->log_adj_pvalue);
  jsonwr_dbl_prop(json, "max_prob", stats->max_prob);

  // site counts: every second entry, skipping (width - 1) at each end
  jsonwr_property(json, "sites");
  jsonwr_start_array_value(json);
  stop = counts->allocated - (width - 1);
  idx = width - 1;
  while (idx < stop) {
    jsonwr_dbl_value(json, counts->sites[idx]);
    idx += 2;
  }
  jsonwr_end_array_value(json);

  jsonwr_end_object_value(json);
}
/*
 * Build the substitution target frequency matrix.
 *
 * Obtains a score matrix with get_score_matrix() (from the named file
 * or for the given PAM distance) and converts it to target frequencies
 * with convert_score_to_target().  When SUBST_MATRIX_DEBUG is set the
 * score matrix and the sum of its entries are printed.
 *
 * Returns the target frequency matrix; the score matrix is freed here.
 */
MATRIX_T *get_subst_target_matrix(
  char *score_filename,         /* name of score file */
  ALPH_T alph,                  /* alphabet */
  int dist,                     /* PAM distance (ignored if score_filename != NULL) */
  ARRAY_T *back                 /* background frequencies of standard alphabet */
)
{
  MATRIX_T *scores = get_score_matrix(score_filename, alph, dist);
  MATRIX_T *targets = convert_score_to_target(scores, back);

  if (SUBST_MATRIX_DEBUG) {
    int row, col;
    int n_letters = alph_size(alph, ALPH_SIZE);
    double total;

    /* where the scores came from */
    if (!score_filename) {
      printf("Generated PAM %d\n", dist);
    } else {
      printf("From file %s\n", score_filename);
    }

    /* header row of letters */
    printf("%6c ", ' ');
    for (col = 0; col < n_letters; col++) {
      printf("%6c ", alph_char(alph, col));
    }
    printf("\n");

    /* one labelled row per letter, accumulating the grand total */
    total = 0;
    for (row = 0; row < n_letters; row++) {
      printf("%6c ", alph_char(alph, row));
      for (col = 0; col < n_letters; col++) {
        double x = get_matrix_cell(row, col, scores);
        total += x;
        printf("%6.4f ", x);
      }
      printf("\n");
    }
    printf("sum of entries = %f\n", total);
  }

  free_matrix(scores);

  return(targets);
} /* get_subst_target_matrix */
/*
 * Print the background frequencies of the core alphabet letters.
 *
 * Each letter is written as "<letter>: <freq> " with three decimals.
 * A newline is emitted before every group of nine letters, including
 * the first; no trailing newline is written.
 *
 * output    stream to write to
 * bgfreqs   background frequencies, indexed by alphabet position
 * options   supplies the alphabet in use
 */
void mcast_print_bg_freqs(
  FILE *output,
  ARRAY_T *bgfreqs,
  MHMMSCAN_OPTIONS_T *options
) {
  int asize = alph_size(options->alphabet, ALPH_SIZE);
  int i;
  for (i = 0; i < asize; i++) {
    // Start a new output line every nine letters.
    if (i % 9 == 0) {
      fputc('\n', output);
    }
    fprintf(
      output,
      "%c: %1.3f ",
      alph_char(options->alphabet, i),
      get_array_item(i, bgfreqs)
    );
  }
}
/***********************************************************************
 * Takes a matrix of meme scores and converts them into letter
 * probabilities.
 *
 * The probablility can be got by:
 * p = (2 ^ (s / 100)) * bg
 *
 * The pseudocount contribution is then removed again, the value is
 * clamped to [0, 1], and each row is normalized over the core alphabet.
 * Returns a newly allocated matrix with one column per core letter.
 ***********************************************************************/
MATRIX_T* convert_scores_into_freqs
  (ALPH_T alph,
   MATRIX_T *scores,
   ARRAY_T *bg,
   int site_count,
   double pseudo_count)
{
  int n_letters, n_rows;
  int pos, letter;
  double all_counts;
  MATRIX_T *result;

  assert(alph != INVALID_ALPH);
  assert(scores != NULL);
  assert(bg != NULL);

  n_rows = get_num_rows(scores);
  n_letters = alph_size(alph, ALPH_SIZE);
  result = allocate_matrix(n_rows, n_letters);
  all_counts = site_count + pseudo_count;

  for (letter = 0; letter < n_letters; letter++) {
    const double bg_freq = get_array_item(letter, bg);
    for (pos = 0; pos < n_rows; pos++) {
      double score = get_matrix_cell(pos, letter, scores);
      // convert to a probability
      double freq = pow(2.0, score / 100.0) * bg_freq;
      // remove the pseudo count
      freq = ((freq * all_counts) - (bg_freq * pseudo_count)) / site_count;
      // clamp into the valid probability range
      if (freq < 0) {
        freq = 0;
      } else if (freq > 1) {
        freq = 1;
      }
      set_matrix_cell(pos, letter, freq, result);
    }
  }

  // make every row sum to one
  for (pos = 0; pos < n_rows; pos++) {
    normalize_subarray(0, n_letters, 0.0, get_matrix_row(pos, result));
  }

  return result;
}
/*******************************************************************
  Print the column frequency distribution.

  For every entry of alignment_column_freqs one line is printed with
  the alignment column string for that index, the 1-based index and
  the stored frequency.  The number of letters per column is derived
  from the array length as log(length) / log(alphabet size).
 ********************************************************************/
static void print_col_frequencies(
  ALPH_T alph,
  ARRAY_T* alignment_column_freqs
)
{
  int i;
  int num_freqs = get_array_length(alignment_column_freqs);
  int asize = alph_size(alph, ALPH_SIZE);
  int num_leaves = NINT(log(num_freqs)/log(asize));
  // Scratch buffer for one column string plus its terminating null.
  char* alignment_col = mm_malloc((num_leaves + 1) * sizeof(char));
  for (i=0; i<num_freqs; i++) {
    unhash_alignment_col(
      alph,
      i,                              //col_index
      alignment_col,
      num_leaves
    );
    printf("%s %d %g\n", alignment_col, i+1,
      get_array_item(i, alignment_column_freqs));
  }
  // Release the scratch buffer; it is not needed after printing.
  free(alignment_col);
} // print_col_frequencies
/*************************************************************************
 * Build array containing the counts of columns in the alignment
 * Caller is responsible for freeing the returned array.
 * If input parameter "counts" is NULL, allocates the array
 * (alphabet size ^ number of sequences entries).
 * Otherwise, the counts are added to the existing counts in the counts
 * array. Ignores all columns containing gaps or ambiguity characters:
 * [.-nNxX]
 *************************************************************************/
static ARRAY_T* build_alignment_column_counts(
  ALPH_T alph,
  ALIGNMENT_T* alignment,
  ARRAY_T* counts
)
{

  assert(alignment != NULL);

  int asize = alph_size(alph, ALPH_SIZE);

  // Calculate number of possible alignment columns
  // and create storage for counting occurences.
  int num_seqs = get_num_aligned_sequences(alignment);
  int num_alignment_cols = (int) pow((double) asize, (double) num_seqs);
  if (counts == NULL) {
    counts = allocate_array(num_alignment_cols);
  }

  // Count how many examples of each column occur in the alignment.
  // Skip columns that contain gaps or ambiguity characters.
  int alignment_length = get_alignment_length(alignment);
  char* alignment_col = mm_malloc(sizeof(char) * (num_seqs + 1));
  alignment_col[num_seqs] = 0;
  int i, h;
  for (i = 0; i < alignment_length; i++) {
    get_alignment_col(i, alignment_col, alignment);
    // One scan for any of the skipped characters.
    if (strpbrk(alignment_col, "-.NnXx") != NULL) { continue; }
    h = hash_alignment_col(alph, alignment_col, num_seqs);
    incr_array_item(h, 1, counts);
  }

  // The column buffer is only needed while counting.
  free(alignment_col);

  return counts;
} // build_alignment_column_counts
/****************************************************************************
 *  Return an array containing the frequencies in the alignment for each
 *  character of the core alphabet.  Gaps and ambiguity characters are not
 *  counted.  The returned array has exactly one entry per core letter.
 *
 *  If the alignment contains no countable characters every frequency is
 *  reported as 0 rather than dividing zero by zero.
 *
 *  Caller is responsible for freeing the returned array.
 ****************************************************************************/
ARRAY_T* get_alignment_freqs(ALPH_T alph, ALIGNMENT_T* alignment) {
  char c = 0;
  int aindex = 0;
  int asize = 0;
  int i = 0;
  int s = 0;
  int total_bases = 0;
  int* num_bases = NULL;
  ARRAY_T* freqs = NULL;

  // Initialize counts for each character in the alphabet
  asize = alph_size(alph, ALPH_SIZE);
  num_bases = mm_malloc(asize * sizeof(int));
  for (i = 0; i < asize; i++) {
    num_bases[i] = 0;
  }

  for (s = 0; s < alignment->num_sequences; s++) {
    for (i = 0; i < alignment->length; i++) {
      c = get_seq_char(i, alignment->sequences[s]);
      aindex = alph_index(alph, c);
      // c might be an ambiguity code. We don't count ambiguity codes.
      if (aindex != -1 && aindex < asize) {
        num_bases[aindex]++;
        total_bases++;
      }
    }
  }

  freqs = allocate_array(asize);
  for (i = 0; i < asize; i++) {
    // Guard against an alignment with nothing to count (0/0 is NaN).
    double freq = 0.0;
    if (total_bases > 0) {
      freq = (double) num_bases[i] / (double) total_bases;
    }
    set_array_item(i, freq, freqs);
  }

  // Clean up the count of characters
  myfree(num_bases);

  return freqs;
}