/**************************************************************************
 * Get pseudocount frequencies.
 *
 * For every core (non-ambiguous) letter i this computes
 *     g[i] = sum over core letters j of (f[j] / b[j]) * target_freq[i][j]
 *
 * The target_freq matrix only has values for the basic alphabet, so the
 * ambiguous character pseudocounts are filled in afterwards by
 * calc_ambigs() from the values of the letters they match.
 *
 * Returns a newly allocated array sized for the full alphabet
 * (core letters plus ambiguous letters).
 **************************************************************************/
ARRAY_T *get_pseudocount_freqs(
  ALPH_T alph,
  ARRAY_T *f,             /* Foreground distribution. */
  ARRAY_T *b,             /* Background distribution. */
  MATRIX_T *target_freq   /* Target frequency matrix. */
)
{
  const int core_size = alph_size(alph, ALPH_SIZE);            /* excludes ambigs */
  ARRAY_T *pseudo = allocate_array(alph_size(alph, ALL_SIZE)); /* includes ambigs */
  int row = 0;

  /* One pseudocount per core letter. */
  while (row < core_size) {
    double total = 0;
    int col;

    for (col = 0; col < core_size; ++col) {
      const double target = get_matrix_cell(row, col, target_freq);
      const double fg = get_array_item(col, f);
      const double bg = get_array_item(col, b);
      total += (fg / bg) * target;
    }
    set_array_item(row, total, pseudo);

    if (SUBST_MATRIX_DEBUG) {
      printf("%g %g, ", get_array_item(row, f), total);
    }
    ++row;
  }

  /* Derive the ambiguous-letter entries from the core ones. */
  calc_ambigs(alph, FALSE, pseudo);
  if (SUBST_MATRIX_DEBUG) {
    printf("\n");
  }

  return pseudo;
} /* get_pseudocount_freqs */
/***********************************************************************
 * Read the background letter frequencies from XML.
 *
 * The values are looked up one letter at a time (by letter_id) so that
 * they land in alphabet order regardless of their order in the document.
 * The core-letter frequencies are then normalized and the ambiguous
 * letter entries are filled in by calc_ambigs().
 *
 * Returns an array sized for the full alphabet (core plus ambiguous).
 * Caller is responsible for freeing the returned array.
 ***********************************************************************/
ARRAY_T* read_bg_freqs_from_xml(xmlXPathContextPtr xpath_ctxt, ALPH_T alph) {

  // A true compile-time constant so the buffer below is a plain array
  // rather than a variable length array.
  enum { MAX_XPATH_EXPRESSION = 200 };

  xmlXPathObjectPtr xpathObj = NULL;
  xmlNodePtr currValueNode = NULL;
  char xpath_expression[MAX_XPATH_EXPRESSION];
  ATYPE value;
  ARRAY_T* bg_freqs;
  int num_values;
  int i_node;
  int a_size = alph_size(alph, ALPH_SIZE);

  // Use XPATH to get the background frequencies from XML
  // NOTE(review): assumes xpath_query never returns NULL - confirm.
  xpathObj = xpath_query(
    xpath_ctxt,
    "//*/background_frequencies/alphabet_array/value"
  );
  num_values = (xpathObj->nodesetval ? xpathObj->nodesetval->nodeNr : 0);
  xmlXPathFreeObject(xpathObj);

  // The number of background frequencies should match the alphabet size.
  assert(num_values == a_size);

  // Allocate the array.
  bg_freqs = allocate_array(alph_size(alph, ALL_SIZE));

  // XML doesn't enforce any order on the emission probability values,
  // so force reading bg frequency values in alphabet order.
  for (i_node = 0; i_node < a_size; i_node++) {
    // Build the XPATH expression to get bg freq for a character.
    snprintf(
      xpath_expression,
      MAX_XPATH_EXPRESSION,
      "//*/background_frequencies/"
      "alphabet_array/value[@letter_id='letter_%c']",
      alph_char(alph, i_node)
    );
    // Read the selected bg frequency.
    xpathObj = xpath_query(xpath_ctxt, xpath_expression);
    // Should only find one node. An empty result may be reported with a
    // NULL node set, so check the pointer before looking at the count.
    assert(xpathObj->nodesetval != NULL);
    assert(xpathObj->nodesetval->nodeNr == 1);
    // Decode from node set to numeric value for bg freq.
    // The value is extracted before the query result is released so that
    // nothing obtained from the result is touched after it is freed.
    currValueNode = xpathObj->nodesetval->nodeTab[0];
    value = xmlXPathCastNodeToNumber(currValueNode);
    xmlXPathFreeObject(xpathObj);
    set_array_item(i_node, value, bg_freqs);
  }

  // Make sure the frequencies add up to 1.0.
  normalize_subarray(0, a_size, 0.0, bg_freqs);

  // Fill in ambiguous characters.
  calc_ambigs(alph, FALSE, bg_freqs);

  return bg_freqs;
}
/***********************************************************************
 * Calculate the ambiguous letters from the concrete ones.
 *
 * Widens the motif's frequency matrix to the full alphabet size, flags
 * the motif as having ambiguous columns, and then runs calc_ambigs()
 * on every row of the matrix.
 ***********************************************************************/
void calc_motif_ambigs(MOTIF_T *motif) {
  int row = 0;

  // make room for the ambiguous letter columns
  resize_matrix(
    motif->length,
    alph_size(motif->alph, ALL_SIZE),
    0,
    motif->freqs
  );
  motif->flags |= MOTIF_HAS_AMBIGS;

  // fill in the ambiguous letters position by position
  while (row < motif->length) {
    calc_ambigs(motif->alph, FALSE, get_matrix_row(row, motif->freqs));
    ++row;
  }
}
/*
 * Load uniform frequencies into the array.
 *
 * Each of the ALPH_ASIZE[alph] core letters gets the same share (1/n);
 * the ambiguous letter entries are then filled in by calc_ambigs().
 * When freqs is NULL a new array sized for the full alphabet is
 * allocated; otherwise the supplied array is filled and must be at least
 * that long. Returns the array that was filled.
 */
ARRAY_T* get_uniform_frequencies(ALPH_T alph, ARRAY_T *freqs) {
  const int core_count = ALPH_ASIZE[alph];
  int letter = 0;

  if (freqs == NULL) {
    freqs = allocate_array(alph_size(alph, ALL_SIZE));
  }
  assert(get_array_length(freqs) >= alph_size(alph, ALL_SIZE));

  while (letter < core_count) {
    set_array_item(letter, 1.0/core_count, freqs);
    letter++;
  }

  calc_ambigs(alph, FALSE, freqs);
  return freqs;
}
/***********************************************************************
 * Convert an array by averaging the frequencies of complementary DNA
 * letters: A and T both become (A+T)/2, C and G both become (C+G)/2.
 * The ambiguous letter entries are then recomputed by calc_ambigs().
 *
 * Apparently no-one uses this.
 *
 * Assumes DNA alphabet in order ACGT.
 ***********************************************************************/
void balance_complementary_dna_freqs(ARRAY_T* source) {
  // positions of the core letters in the ACGT ordering
  enum { IDX_A = 0, IDX_C = 1, IDX_G = 2, IDX_T = 3 };

  const double at_mean =
    (get_array_item(IDX_A, source)+get_array_item(IDX_T, source))/2.0;
  const double cg_mean =
    (get_array_item(IDX_C, source)+get_array_item(IDX_G, source))/2.0;

  // each letter of a complementary pair receives the pair's mean
  set_array_item(IDX_A, at_mean, source);
  set_array_item(IDX_T, at_mean, source);
  set_array_item(IDX_C, cg_mean, source);
  set_array_item(IDX_G, cg_mean, source);

  calc_ambigs(DNA_ALPH, FALSE, source);
}
/*
 * Load the non-redundant database frequencies into the array.
 *
 * Copies the ALPH_NRDB[alph] table into the first ALPH_ASIZE[alph]
 * entries, normalizes that core range, and then fills in the ambiguous
 * letter entries with calc_ambigs(). When freqs is NULL a new array sized
 * for the full alphabet is allocated; otherwise the supplied array is
 * filled and must be at least that long. Returns the array that was
 * filled.
 */
ARRAY_T* get_nrdb_frequencies(ALPH_T alph, ARRAY_T *freqs) {
  const int core_count = ALPH_ASIZE[alph];
  const PROB_T *table;
  int letter;

  if (freqs == NULL) {
    freqs = allocate_array(alph_size(alph, ALL_SIZE));
  }
  assert(get_array_length(freqs) >= alph_size(alph, ALL_SIZE));

  table = ALPH_NRDB[alph];
  letter = 0;
  while (letter < core_count) {
    set_array_item(letter, table[letter], freqs);
    ++letter;
  }

  normalize_subarray(0, core_count, 0.0, freqs);
  calc_ambigs(alph, FALSE, freqs);
  return freqs;
}
/***********************************************************************
 * Compute the complement of one DNA frequency distribution.
 *
 * dest receives source with the A/T entries swapped and the C/G entries
 * swapped. All four source values are read before anything is written,
 * so it is safe to pass the same array as both source and dest.
 *
 * If source has more than the four core entries, the ambiguous letter
 * entries of dest are recomputed with calc_ambigs().
 * NOTE(review): that step presumably requires dest to be as long as a
 * full DNA frequency array - confirm against callers.
 *
 * Assumes DNA alphabet in order ACGT.
 ***********************************************************************/
void complement_dna_freqs(ARRAY_T* source, ARRAY_T* dest) {
  // Snapshot the inputs first: writing dest[0] before reading source[3]
  // would corrupt the result when source and dest are the same array.
  const double a = get_array_item(0, source);
  const double c = get_array_item(1, source);
  const double g = get_array_item(2, source);
  const double t = get_array_item(3, source);

  set_array_item(0, t, dest); // A takes the frequency of T
  set_array_item(1, g, dest); // C takes the frequency of G
  set_array_item(2, c, dest); // G takes the frequency of C
  set_array_item(3, a, dest); // T takes the frequency of A

  // check if the frequencies have ambiguous characters
  // for example meme does not use ambiguous characters
  if (get_array_length(source) > 4) {
    calc_ambigs(DNA_ALPH, FALSE, dest);
  }
}
/*
 * When the parser has been selected do some processing.
 *
 * Asks the selected format for the alphabet and the background. A
 * background supplied by the format is normalized over the core letters,
 * resized to the full alphabet and given ambiguous letter values; when
 * the format supplies none, uniform frequencies are used instead.
 * Finally set_pseudo_bg() is called on the reader.
 */
static void parser_selected(MREAD_T *mread) {
  MFORMAT_T *fmt = mread->formats;
  // get the alphabet
  ALPH_T alph = fmt->get_alphabet(fmt->data);

  // get the background
  if (!fmt->get_bg(fmt->data, &(mread->motif_bg))) {
    // nothing provided by the format, fall back to uniform
    mread->motif_bg = get_uniform_frequencies(alph, mread->motif_bg);
  } else {
    normalize_subarray(0, alph_size(alph, ALPH_SIZE), 0.0, mread->motif_bg);
    resize_array(mread->motif_bg, alph_size(alph, ALL_SIZE));
    calc_ambigs(alph, FALSE, mread->motif_bg);
  }

  set_pseudo_bg(mread);
}
/**************************************************************************** * Return an array containing the frequencies in the sequences for each * character of the alphabet. Characters not in the alphabet are not * counted. * * When seq is provided it returns null, otherwise it converts the accumulated * result in bgcalc into a background. * * * Pseudocode example: * ALPH_T alph = ... * BGCALC_T *bgcalc = NULL; * for each seq: * calculate_background(alph, seq, &bgcalc); * ARRAY_T *bg = calculate_background(NULL, &bgcalc); ****************************************************************************/ ARRAY_T* calculate_background( ALPH_T alph, SEQ_T* seq, BGCALC_T** bgcalc ){ BGCALC_T *calc; int a_size, i, a_index; char c; double freq, chunk_part, chunk_freq; ARRAY_T *background; assert(bgcalc != NULL); assert(seq != NULL || *bgcalc != NULL); // get the alphabet // get the alphabet size a_size = alph_size(alph, ALPH_SIZE); if (*bgcalc == NULL) { //allocate and initialize calc calc = mm_malloc(sizeof(BGCALC_T)); calc->alph = alph; calc->chunk_seen = 0; calc->weight = 0; calc->chunk_counts = mm_malloc(a_size * sizeof(long)); calc->bg = mm_malloc(a_size * sizeof(double)); for (i = 0; i < a_size; ++i) { calc->chunk_counts[i] = 0; calc->bg[i] = 0; } *bgcalc = calc; } else { calc = *bgcalc; assert(alph == calc->alph); if (calc->weight == LONG_MAX) return NULL; } if (seq == NULL) { // no sequence so calculate the final result background = allocate_array(alph_size(alph, ALL_SIZE)); if (calc->weight == 0) { if (calc->chunk_seen > 0) { // when we haven't had to approximate yet // just do a normal background calculation for (i = 0; i < a_size; i++) { freq = (double) calc->chunk_counts[i] / (double) calc->chunk_seen; set_array_item(i, freq, background); } } else { fputs("Uniform\n", stdout); // when there are no counts then return uniform freq = (double) 1 / (double) a_size; for (i = 0; i < a_size; i++) { set_array_item(i, freq, background); } } } else { if (calc->chunk_seen 
> 0) { // combine the frequencies for the existing chunks with the counts // for the partially completed chunk chunk_part = (double) calc->chunk_seen / (double) BG_CALC_CHUNK; for (i = 0; i < a_size; i++) { chunk_freq = (double) calc->chunk_counts[i] / (double) calc->chunk_seen; freq = ((calc->bg[i] * calc->weight) + (chunk_freq * chunk_part)) / (calc->weight + chunk_part); set_array_item(i, freq, background); } } else { // in the odd case we get to an integer number of chunks for (i = 0; i < a_size; i++) { set_array_item(i, calc->bg[i], background); } } } calc_ambigs(alph, FALSE, background); // free bgcalc structure free(calc->bg); free(calc->chunk_counts); free(calc); *bgcalc = NULL; return background; } // we have a sequence to add to the background calculation for (i = 0; i < seq->length; i++) { c = get_seq_char(i, seq); a_index = alph_index(alph, c); if (a_index == -1 || a_index >= a_size) continue; calc->chunk_counts[a_index]++; calc->chunk_seen++; if (calc->chunk_seen == BG_CALC_CHUNK) { if (calc->weight == 0) { for (i = 0; i < a_size; i++) { calc->bg[i] = (double) calc->chunk_counts[i] / (double) BG_CALC_CHUNK; } } else { for (i = 0; i < a_size; i++) { chunk_freq = (double) calc->chunk_counts[i] / (double) BG_CALC_CHUNK; calc->bg[i] = (calc->bg[i] * calc->weight + chunk_freq) / (calc->weight + 1); } } calc->weight++; // reset the counts for the next chunk for (i = 0; i < a_size; i++) { calc->chunk_counts[i] = 0; } calc->chunk_seen = 0; // I don't think it is feasible to reach this limit // but I guess I'd better check anyway if (calc->weight == LONG_MAX) { fprintf(stderr, "Sequence data set is so large that even the " "approximation designed for large datasets can't handle it!"); return NULL; } } } return NULL; }
/* * Load background file frequencies into the array. */ ARRAY_T* get_file_frequencies(ALPH_T *alph, char *bg_filename, ARRAY_T *freqs) { regmatch_t matches[4]; STR_T *line; char chunk[BG_CHUNK_SIZE+1], letter[2], *key; int size, terminate, offset, i; FILE *fp; regex_t bgfreq; double freq; RBTREE_T *letters; RBNODE_T *node; regcomp_or_die("bg freq", &bgfreq, BGFREQ_RE, REG_EXTENDED); letters = rbtree_create(rbtree_strcasecmp, rbtree_strcpy, free, rbtree_dblcpy, free); line = str_create(100); if (!(fp = fopen(bg_filename, "r"))) { die("Unable to open background file \"%s\" for reading.\n", bg_filename); } terminate = feof(fp); while (!terminate) { size = fread(chunk, sizeof(char), BG_CHUNK_SIZE, fp); chunk[size] = '\0'; terminate = feof(fp); offset = 0; while (offset < size) { // skip mac newline if (str_len(line) == 0 && chunk[offset] == '\r') { offset++; continue; } // find next new line for (i = offset; i < size; ++i) { if (chunk[i] == '\n') break; } // append portion up to the new line or end of chunk str_append(line, chunk+offset, i - offset); // read more if we didn't find a new line if (i == size && !terminate) break; // move the offset past the new line offset = i + 1; // handle windows new line if (str_char(line, -1) == '\r') str_truncate(line, -1); // remove everything to the right of a comment character for (i = 0; i < str_len(line); ++i) { if (str_char(line, i) == '#') { str_truncate(line, i); break; } } // check the line for a single letter followed by a number if (regexec_or_die("bg freq", &bgfreq, str_internal(line), 4, matches, 0)) { // parse the letter and frequency value regex_strncpy(matches+1, str_internal(line), letter, 2); freq = regex_dbl(matches+2, str_internal(line)); // check the frequency is acceptable if (freq < 0 || freq > 1) { die("The background file lists the illegal probability %g for " "the letter %s.\n", freq, letter); } else if (freq == 0) { die("The background file lists a probability of zero for the " "letter %s\n", letter); } 
if (freq >= 0 && freq <= 1) rbtree_put(letters, letter, &freq); } str_clear(line); } } // finished with the file so clean up file parsing stuff fclose(fp); str_destroy(line, FALSE); regfree(&bgfreq); // guess the alphabet if (*alph == INVALID_ALPH) { switch (rbtree_size(letters)) { case PROTEIN_ASIZE: *alph = PROTEIN_ALPH; break; case DNA_ASIZE: *alph = DNA_ALPH; break; default: die("Number of single character entries in background does not match " "an alphabet.\n"); } } // make the background if (freqs == NULL) freqs = allocate_array(alph_size(*alph, ALL_SIZE)); assert(get_array_length(freqs) >= alph_size(*alph, ALL_SIZE)); init_array(-1, freqs); for (node = rbtree_first(letters); node != NULL; node = rbtree_next(node)) { key = (char*)rbtree_key(node); i = alph_index(*alph, key[0]); freq = *((double*)rbtree_value(node)); if (i == -1) { die("Background contains letter %s which is not in the %s alphabet.\n", key, alph_name(*alph)); } if (get_array_item(i, freqs) != -1) { die("Background contains letter %s which has the same meaning as an " "already listed letter.\n", key); } set_array_item(i, freq, freqs); } // check that all items were set for (i = 0; i < alph_size(*alph, ALPH_SIZE); i++) { if (get_array_item(i, freqs) == -1) { die("Background is missing letter %c.\n", alph_char(*alph, i)); } } // disabled for backwards compatability (AMA test was failing) //normalize_subarray(0, ALPH_ASIZE[*alph], 0.0, freqs); // calculate the values of the ambiguous letters from the concrete ones calc_ambigs(*alph, FALSE, freqs); // cleanup rbtree_destroy(letters); // return result return freqs; }
/************************************************************************* * Entry point for centrimo *************************************************************************/ int main(int argc, char *argv[]) { CENTRIMO_OPTIONS_T options; SEQ_SITES_T seq_sites; SITE_COUNTS_T counts; int seqN, motifN, seqlen, db_i, motif_i, i; double log_pvalue_thresh; SEQ_T** sequences = NULL; ARRAY_T* bg_freqs = NULL; ARRAYLST_T *stats_list; MOTIF_DB_T **dbs, *db; MREAD_T *mread; MOTIF_STATS_T *stats; MOTIF_T *motif, *rev_motif; PSSM_T *pos_pssm, *rev_pssm; char *sites_path, *desc; FILE *sites_file; HTMLWR_T *html; JSONWR_T *json; // COMMAND LINE PROCESSING process_command_line(argc, argv, &options); // load the sequences read_sequences(options.alphabet, options.seq_source, &sequences, &seqN); seqlen = (seqN ? get_seq_length(sequences[0]) : 0); // calculate a sequence background (unless other background is given) if (!options.bg_source) { bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences); } // load the motifs motifN = 0; dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources)); for (i = 0; i < arraylst_size(options.motif_sources); i++) { char* db_source; db_source = (char*)arraylst_get(i, options.motif_sources); dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, options.pseudocount, options.selected_motifs, options.alphabet); motifN += arraylst_size(dbs[i]->motifs); } log_pvalue_thresh = log(options.evalue_thresh) - log(motifN); // Setup some things for double strand scanning if (options.scan_both_strands == TRUE) { // Set up hash tables for computing reverse complement setup_hash_alph(DNAB); setalph(0); // Correct background by averaging on freq. for both strands. 
average_freq_with_complement(options.alphabet, bg_freqs); normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs); calc_ambigs(options.alphabet, FALSE, bg_freqs); } // Create output directory if (create_output_directory(options.output_dirname, options.allow_clobber, (verbosity >= NORMAL_VERBOSE))) { die("Couldn't create output directory %s.\n", options.output_dirname); } // open output files sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME); sites_file = fopen(sites_path, "w"); free(sites_path); // setup html monolith writer json = NULL; if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) { htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME); htmlwr_replace(html, "centrimo_data.js", "data"); json = htmlwr_output(html); if (json == NULL) die("Template does not contain data section.\n"); } else { DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n"); } if (json) { // output some top level variables jsonwr_str_prop(json, "version", VERSION); jsonwr_str_prop(json, "revision", REVISION); jsonwr_str_prop(json, "release", ARCHIVE_DATE); jsonwr_str_array_prop(json, "cmd", argv, argc); jsonwr_property(json, "options"); jsonwr_start_object_value(json); jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount); jsonwr_dbl_prop(json, "score", options.score_thresh); jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh); jsonwr_lng_prop(json, "maxbin", options.max_window+1); jsonwr_bool_prop(json, "norc", !options.scan_both_strands); jsonwr_bool_prop(json, "noflip", options.no_flip); jsonwr_end_object_value(json); // output the description desc = prepare_description(&options); if (desc) { jsonwr_str_prop(json, "job_description", desc); free(desc); } // output size metrics jsonwr_lng_prop(json, "seqlen", seqlen); jsonwr_lng_prop(json, "tested", motifN); // output the fasta db jsonwr_property(json, "sequence_db"); jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", 
options.seq_source); jsonwr_lng_prop(json, "count", seqN); jsonwr_end_object_value(json); // output the motif dbs jsonwr_property(json, "motif_dbs"); jsonwr_start_array_value(json); for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", db->source); jsonwr_lng_prop(json, "count", arraylst_size(db->motifs)); jsonwr_end_object_value(json); } jsonwr_end_array_value(json); // start the motif array jsonwr_property(json, "motifs"); jsonwr_start_array_value(json); } /************************************************************** * Tally the positions of the best sites for each of the * selected motifs. **************************************************************/ // prepare the sequence sites memset(&seq_sites, 0, sizeof(SEQ_SITES_T)); // prepare the site counts counts.allocated = ((2 * seqlen) - 1); counts.sites = mm_malloc(sizeof(double) * counts.allocated); // prepare the motifs stats list stats_list = arraylst_create(); // prepare the other vars motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL; for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) { motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs); DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n", get_motif_id(motif), get_motif_length(motif)); // reset the counts for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0; counts.total_sites = 0; // create the pssm pos_pssm = make_pssm(bg_freqs, motif); // If required, do the same for the reverse complement motif. 
if (options.scan_both_strands) { rev_motif = dup_rc_motif(motif); rev_pssm = make_pssm(bg_freqs, rev_motif); } // scan the sequences for (i = 0; i < seqN; i++) score_sequence(&options, sequences[i], pos_pssm, rev_pssm, &seq_sites, &counts); // DEBUG check that the sum of the sites is close to the site count double sum_check = 0, sum_diff; for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i]; sum_diff = counts.total_sites - sum_check; if (sum_diff < 0) sum_diff = -sum_diff; if (sum_diff > 0.1) { fprintf(stderr, "Warning: site counts don't sum to accurate value! " "%g != %ld", sum_check, counts.total_sites); } // output the plain text site counts output_site_counts(sites_file, seqlen, db, motif, &counts); // compute the best central window stats = compute_stats(options.max_window, seqlen, db, motif, &counts); // check if it passes the threshold if (json && stats->log_adj_pvalue <= log_pvalue_thresh) { output_motif_json(json, stats, &counts); arraylst_add(stats, stats_list); } else { free(stats); } // Free memory associated with this motif. free_pssm(pos_pssm); free_pssm(rev_pssm); destroy_motif(rev_motif); } } if (json) jsonwr_end_array_value(json); // finish writing sites fclose(sites_file); // finish writing html file if (html) { if (htmlwr_output(html) != NULL) { die("Found another JSON replacement!\n"); } htmlwr_destroy(html); } // write text file output_centrimo_text(&options, motifN, stats_list); // Clean up. for (i = 0; i < seqN; ++i) { free_seq(sequences[i]); } free(sequences); for (i = 0; i < arraylst_size(options.motif_sources); i++) { free_db(dbs[i]); } free(dbs); free_array(bg_freqs); free(counts.sites); free(seq_sites.sites); arraylst_destroy(free, stats_list); cleanup_options(&options); return 0; }