/********************************************************************** ramen_sequence_scan() scan a given sequence with a specified motif using either average motif affinity scoring or maximum one. In addition z-scores may be calculated. The motif has to be converted to log odds in advance (in order to speed up the scanning). Use convert_to_odds_matrix() once for each motif. **********************************************************************/ double ramen_sequence_scan( SEQ_T* sequence, // the sequence to scan INPUT MOTIF_T* motif, // the motif to scan with (converted to odds matrix) INPUT MOTIF_T* rev_motif, // the reversed motif PSSM_T* pssm, PSSM_T* rev_pssm, int scoring, // the scoring function to apply AVG_ODDS, MAX_ODDS or TOTAL_HITS int zscoring, // the number of shuffled sequences used for z-score computation INPUT BOOLEAN_T scan_both_strands, //Should we scan with both motifs and combine scores double threshold, // Threshold to use in TOTAL_HITS mode with a PWM ARRAY_T* bg_freqs //background model ){ assert(zscoring >= 0); char* seq_name = get_seq_name(sequence); // Score the forward strand. double odds = score_sequence( sequence, motif, pssm, motif_name, seq_name, scoring, threshold, bg_freqs ); // Score the reverse strand. if (scan_both_strands) { double rev_odds = score_sequence( sequence, rev_motif, rev_pssm, motif_name, seq_name, scoring, threshold, bg_freqs ); if (scoring == AVG_ODDS){ odds = (odds+rev_odds)/2.0; } else if (scoring == MAX_ODDS){ odds = max(odds,rev_odds); } else if (scoring == TOTAL_HITS) { odds = odds + rev_odds; } } return odds; }
/*
 * Unit test for score_sequence().
 *
 * Two 24-character sequences that differ only at index 1 are compared.
 * The expected per-position score array is 1 everywhere except -1 at
 * the mismatch, so the expected total is 23 - 1 = 22.
 *
 * score_sequence() hands back a heap-allocated array that the caller
 * must release (other callers in this code base free() it), so both
 * result arrays are freed once the checks have run.
 */
void test_score_sequence(void) {
  char *seqa = "ATCGATCGATCGATCGATCGATCG";
  char *seqb = "AACGATCGATCGATCGATCGATCG";
  int cmp1[] = {1, -1, 1, 1, 1, 1, 1, 1,
                1,  1, 1, 1, 1, 1, 1, 1,
                1,  1, 1, 1, 1, 1, 1, 1};
  int *cmp1_t = score_sequence(seqa, seqb, strlen(seqa));
  /* Second, independent call whose result is only summed. */
  int *sum_t = score_sequence(seqa, seqb, strlen(seqa));

  TEST_INIT;
  TEST_ARRAY(cmp1, cmp1_t, 24, "score_sequence");
  TEST(sum(sum_t, strlen(seqa)) == 22, "sum of score_sequence");

  /* Release the score arrays; previously both were leaked. */
  free(cmp1_t);
  free(sum_t);
  TEST_CLOSE;
}
// // Outlier filtering // void filter_outlier_data(std::vector<HMMInputData>& input, const std::string& sequence) { std::vector<HMMInputData> out_rs; for(uint32_t ri = 0; ri < input.size(); ++ri) { const HMMInputData& rs = input[ri]; double curr = score_sequence(sequence, rs); double n_events = abs(rs.event_start_idx - rs.event_stop_idx) + 1.0f; double lp_per_event = curr / n_events; if(opt::verbose >= 1) { fprintf(stderr, "OUTLIER_FILTER %d %.2lf %.2lf %.2lf\n", ri, curr, n_events, lp_per_event); } double threshold = model_stdv() ? 7.0f : 3.5f; // TODO: check if(fabs(lp_per_event) < threshold) { out_rs.push_back(rs); } } input.swap(out_rs); }
// This scores each path using the HMM and // sorts the paths into ascending order by score void score_paths(PathConsVector& paths, const std::vector<HMMInputData>& input) { PROFILE_FUNC("score_paths") size_t CULL_RATE = 5; double CULL_MIN_SCORE = -30.0f; double CULL_MIN_IMPROVED_FRACTION = 0.2f; // cache the initial sequence std::string first = paths[0].path; PathConsVector dedup_paths; // initialize and deduplicate paths to avoid redundant computation std::set<std::string> path_string_set; for(size_t pi = 0; pi < paths.size(); ++pi) { if(path_string_set.find(paths[pi].path) == path_string_set.end()) { paths[pi].score = 0; paths[pi].sum_rank = 0; paths[pi].num_improved = 0; paths[pi].num_scored = 0; dedup_paths.push_back(paths[pi]); path_string_set.insert(paths[pi].path); } } paths.clear(); paths.swap(dedup_paths); // Score all reads for(uint32_t ri = 0; ri < input.size(); ++ri) { if(opt::verbose > 2) { fprintf(stderr, "Scoring %d\n", ri); } //const HMMInputData& data = input[ri]; std::vector<IndexedPathScore> result(paths.size()); // Score all paths #pragma omp parallel for for(size_t pi = 0; pi < paths.size(); ++pi) { double curr = score_sequence(paths[pi].path, input[ri]); result[pi].score = curr; result[pi].path_index = pi; } // Save score of first path double first_path_score = result[0].score; // Sort result by score std::stable_sort(result.begin(), result.end(), sortIndexedPathScoreDesc); for(size_t pri = 0; pri < result.size(); ++pri) { size_t pi = result[pri].path_index; paths[pi].score += (result[pri].score - first_path_score); uint32_t rank_score = pri; paths[pi].sum_rank += rank_score; paths[pi].num_improved += (result[pri].score > first_path_score); paths[pi].num_scored += 1; } // Cull paths if(ri > 0 && ri % CULL_RATE == 0) { PathConsVector retained_paths; for(size_t pi = 0; pi < paths.size(); ++pi) { // We keep a path if any of these conditions are met: // 1) it is the original unmodified sequence // 2) its score is greater than CULL_MIN_SCORE // 3) 
the fraction of reads that score better on this // path compared to the original sequence is greater // than CULL_MIN_IMPROVED_FRACTION double f = (double)paths[pi].num_improved / (double)paths[pi].num_scored; if(pi == 0 || paths[pi].score > CULL_MIN_SCORE || f >= CULL_MIN_IMPROVED_FRACTION) { retained_paths.push_back(paths[pi]); } } paths.swap(retained_paths); } } // select new sequence //std::stable_sort(paths.begin(), paths.end(), sortPathConsRankAsc); std::stable_sort(paths.begin(), paths.end(), sortPathConsScoreDesc); #if DEBUG_PATH_SELECTION double MIN_FIT = INFINITY; for(size_t pi = 0; pi < paths.size(); ++pi) { // Calculate the length of the matching prefix with the initial sequence const std::string& s = paths[pi].path; char initial = s == first ? 'I' : ' '; printf("%zu\t%s\t%.1lf\t%zu %c %s", pi, paths[pi].path.c_str(), paths[pi].score, paths[pi].sum_rank, initial, paths[pi].mutdesc.c_str()); // If this is the truth path or the best path, show the scores for all reads if(pi <= 1 || initial == 'I') { for(uint32_t ri = 0; ri < input.size(); ++ri) { const HMMInputData& data = input[ri]; const KHMMParameters& parameters = data.read->parameters[data.strand]; if( fabs(parameters.fit_quality) > MIN_FIT) continue; double curr = score_sequence(paths[pi].path, input[ri]); printf("%.1lf,%.2lf ", parameters.fit_quality, curr); } } printf("\n"); } #endif }
/********************************************************************** ama_sequence_scan() Scan a given sequence with a specified motif using either average motif affinity scoring or maximum one. In addition z-scores may be calculated. Also the scan can be limited to only the end of the passed sequences. The motif has to be converted to odds in advance (in order to speed up the scanning). The result will be stored in the scanned_sequence parameter. **********************************************************************/ void ama_sequence_scan( ALPH_T* alph, // alphabet SEQ_T *sequence, // the sequence to scan (IN) double *logcumback, // cumulative bkg probability of sequence (IN) PSSM_PAIR_T *pssm_pair, // the pos/neg pssms (IN) int scoring, // AVG_ODDS or MAX_ODDS (IN) BOOLEAN_T pvalues, // compute p-values (IN) int last, // use only last <n> sequence positions // or 0 if all positions should be used SCANNED_SEQUENCE_T* scanned_seq,// the scanned sequence results (OUT) BOOLEAN_T* need_postprocessing // Flag indicating the need for postprocessing (OUT) ) { assert(sequence != NULL); assert(pssm_pair != NULL); // FLAG indicates if sequence was suitable for motif matching BOOLEAN_T isFeasible = true; // Score the sequence. double odds = score_sequence(alph, sequence, logcumback, pssm_pair, scoring, last, &isFeasible); // Compute the p-value of the AVG_ODDS score. if (get_scanned_sequence_num_scanned_positions(scanned_seq) == 0L) { set_scanned_sequence_score(scanned_seq, odds); // sequence has not been scanned before if (!isFeasible) { if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr,"Sequence '%s' not suited for motif. P-value " "set to 1.0!\n", get_scanned_sequence_accession(scanned_seq)); } set_scanned_sequence_pvalue(scanned_seq, 1.0); } else if (odds < 0.0){ if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr,"Sequence '%s' got invalid (negative) odds " "score. 
P-value set to 1.0!\n", get_scanned_sequence_accession(scanned_seq)); } set_scanned_sequence_pvalue(scanned_seq, 1.0); } else if (pvalues && scoring == AVG_ODDS) { double pvalue = get_ama_pv(odds, get_scanned_sequence_length(scanned_seq), get_total_gc_sequence(sequence), pssm_pair); set_scanned_sequence_pvalue(scanned_seq, pvalue); } // scanned_position is used to keep track how often a sequence has been scored // this feature is used in downstream gomo where a one2many homolog relationship // is encoded through the same sequence identifier add_scanned_sequence_scanned_position(scanned_seq); } else { // sequence has been scored before if(!has_scanned_sequence_score(scanned_seq)) { // no score set yet, so do set_scanned_sequence_score(scanned_seq, odds); } else { // sum scores (take average later) set_scanned_sequence_score(scanned_seq, odds + get_scanned_sequence_score(scanned_seq)); } if (!isFeasible) { if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr,"Sequence '%s' not suited for motif. P-value set " "to 1.0!\n", get_scanned_sequence_accession(scanned_seq)); } if (!has_scanned_sequence_pvalue(scanned_seq)) { set_scanned_sequence_pvalue(scanned_seq, 1.0); } } else if (odds < 0.0) { if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr,"Sequence '%s' got invalid (negative) odds score. " "P-value set to 1.0!\n", get_scanned_sequence_accession(scanned_seq)); } if (!has_scanned_sequence_pvalue(scanned_seq)) { set_scanned_sequence_pvalue(scanned_seq, 1.0); } } else if (pvalues && scoring == AVG_ODDS) { double pvalue = get_ama_pv(odds, get_scanned_sequence_length(scanned_seq), get_total_gc_sequence(sequence), pssm_pair); if (!has_scanned_sequence_pvalue(scanned_seq)) { set_scanned_sequence_pvalue(scanned_seq, pvalue); } else { // keep minimum p-value only set_scanned_sequence_pvalue(scanned_seq, min(pvalue, get_scanned_sequence_pvalue(scanned_seq))); } } add_scanned_sequence_scanned_position(scanned_seq); *need_postprocessing = true; } } // ama_sequence_scan
match *find_best_match(const adapter_array *aa, const char *read, float *p_quals, float prior, float p_match, int min_l) { /* Take an adapter array, and check the read against all adapters. Brute force string matching is used. This is to avoid approximate matching algorithms which required an a priori specified number mismatches. */ match *best_match=NULL; int i, shift, max_shift, found_contam=0; int *best_arr=NULL, best_adapter=0, best_length=0, best_shift=0, best_score=INT_MIN; int al, curr_score, *curr_arr=NULL; int rl = strlen(read); posterior_set *ps=NULL; float *best_p_quals=NULL; max_shift = rl - min_l; for (shift = 0; shift < max_shift; shift++) { for (i = 0; i < aa->n; i++) { if (min_l >= aa->adapters[i].length) { fprintf(stderr, "Minimum match length (option -n) greater than or " \ "equal to length of adapter.\n"); exit(EXIT_FAILURE); } al = min(aa->adapters[i].length, strlen(&(read)[shift])); curr_arr = score_sequence(&(read)[shift], (aa->adapters[i]).seq, al); curr_score = sum(curr_arr, al); if (curr_score > best_score) { best_score = curr_score; best_length = al; best_shift = shift; best_p_quals = &(p_quals)[shift]; best_arr = curr_arr; best_adapter = i; ps = posterior(best_arr, best_p_quals, prior, 0.25, best_length); found_contam = ps->is_contam; if (found_contam) { break; } else { free(ps); ps=NULL; free(best_arr); } } else free(curr_arr); } if (found_contam) break; } if (!found_contam) /* no match found */ return NULL; /* save this match */ best_match = xmalloc(sizeof(match)); best_match->match = best_arr; best_match->shift = best_shift; best_match->length = best_length; best_match->ps = ps; best_match->score = best_score; best_match->adapter_index = best_adapter; best_match->p_quals = best_p_quals; best_match->match_pos = calloc(best_length, sizeof(int)); for (i = 0; i < best_length; i++) best_match->match_pos[i] = best_match->match[i] == MATCH_SCORE; return best_match; }
/************************************************************************* * Entry point for centrimo *************************************************************************/ int main(int argc, char *argv[]) { CENTRIMO_OPTIONS_T options; SEQ_SITES_T seq_sites; SITE_COUNTS_T counts; int seqN, motifN, seqlen, db_i, motif_i, i; double log_pvalue_thresh; SEQ_T** sequences = NULL; ARRAY_T* bg_freqs = NULL; ARRAYLST_T *stats_list; MOTIF_DB_T **dbs, *db; MREAD_T *mread; MOTIF_STATS_T *stats; MOTIF_T *motif, *rev_motif; PSSM_T *pos_pssm, *rev_pssm; char *sites_path, *desc; FILE *sites_file; HTMLWR_T *html; JSONWR_T *json; // COMMAND LINE PROCESSING process_command_line(argc, argv, &options); // load the sequences read_sequences(options.alphabet, options.seq_source, &sequences, &seqN); seqlen = (seqN ? get_seq_length(sequences[0]) : 0); // calculate a sequence background (unless other background is given) if (!options.bg_source) { bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences); } // load the motifs motifN = 0; dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources)); for (i = 0; i < arraylst_size(options.motif_sources); i++) { char* db_source; db_source = (char*)arraylst_get(i, options.motif_sources); dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, options.pseudocount, options.selected_motifs, options.alphabet); motifN += arraylst_size(dbs[i]->motifs); } log_pvalue_thresh = log(options.evalue_thresh) - log(motifN); // Setup some things for double strand scanning if (options.scan_both_strands == TRUE) { // Set up hash tables for computing reverse complement setup_hash_alph(DNAB); setalph(0); // Correct background by averaging on freq. for both strands. 
average_freq_with_complement(options.alphabet, bg_freqs); normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs); calc_ambigs(options.alphabet, FALSE, bg_freqs); } // Create output directory if (create_output_directory(options.output_dirname, options.allow_clobber, (verbosity >= NORMAL_VERBOSE))) { die("Couldn't create output directory %s.\n", options.output_dirname); } // open output files sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME); sites_file = fopen(sites_path, "w"); free(sites_path); // setup html monolith writer json = NULL; if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) { htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME); htmlwr_replace(html, "centrimo_data.js", "data"); json = htmlwr_output(html); if (json == NULL) die("Template does not contain data section.\n"); } else { DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n"); } if (json) { // output some top level variables jsonwr_str_prop(json, "version", VERSION); jsonwr_str_prop(json, "revision", REVISION); jsonwr_str_prop(json, "release", ARCHIVE_DATE); jsonwr_str_array_prop(json, "cmd", argv, argc); jsonwr_property(json, "options"); jsonwr_start_object_value(json); jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount); jsonwr_dbl_prop(json, "score", options.score_thresh); jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh); jsonwr_lng_prop(json, "maxbin", options.max_window+1); jsonwr_bool_prop(json, "norc", !options.scan_both_strands); jsonwr_bool_prop(json, "noflip", options.no_flip); jsonwr_end_object_value(json); // output the description desc = prepare_description(&options); if (desc) { jsonwr_str_prop(json, "job_description", desc); free(desc); } // output size metrics jsonwr_lng_prop(json, "seqlen", seqlen); jsonwr_lng_prop(json, "tested", motifN); // output the fasta db jsonwr_property(json, "sequence_db"); jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", 
options.seq_source); jsonwr_lng_prop(json, "count", seqN); jsonwr_end_object_value(json); // output the motif dbs jsonwr_property(json, "motif_dbs"); jsonwr_start_array_value(json); for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", db->source); jsonwr_lng_prop(json, "count", arraylst_size(db->motifs)); jsonwr_end_object_value(json); } jsonwr_end_array_value(json); // start the motif array jsonwr_property(json, "motifs"); jsonwr_start_array_value(json); } /************************************************************** * Tally the positions of the best sites for each of the * selected motifs. **************************************************************/ // prepare the sequence sites memset(&seq_sites, 0, sizeof(SEQ_SITES_T)); // prepare the site counts counts.allocated = ((2 * seqlen) - 1); counts.sites = mm_malloc(sizeof(double) * counts.allocated); // prepare the motifs stats list stats_list = arraylst_create(); // prepare the other vars motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL; for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) { motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs); DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n", get_motif_id(motif), get_motif_length(motif)); // reset the counts for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0; counts.total_sites = 0; // create the pssm pos_pssm = make_pssm(bg_freqs, motif); // If required, do the same for the reverse complement motif. 
if (options.scan_both_strands) { rev_motif = dup_rc_motif(motif); rev_pssm = make_pssm(bg_freqs, rev_motif); } // scan the sequences for (i = 0; i < seqN; i++) score_sequence(&options, sequences[i], pos_pssm, rev_pssm, &seq_sites, &counts); // DEBUG check that the sum of the sites is close to the site count double sum_check = 0, sum_diff; for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i]; sum_diff = counts.total_sites - sum_check; if (sum_diff < 0) sum_diff = -sum_diff; if (sum_diff > 0.1) { fprintf(stderr, "Warning: site counts don't sum to accurate value! " "%g != %ld", sum_check, counts.total_sites); } // output the plain text site counts output_site_counts(sites_file, seqlen, db, motif, &counts); // compute the best central window stats = compute_stats(options.max_window, seqlen, db, motif, &counts); // check if it passes the threshold if (json && stats->log_adj_pvalue <= log_pvalue_thresh) { output_motif_json(json, stats, &counts); arraylst_add(stats, stats_list); } else { free(stats); } // Free memory associated with this motif. free_pssm(pos_pssm); free_pssm(rev_pssm); destroy_motif(rev_motif); } } if (json) jsonwr_end_array_value(json); // finish writing sites fclose(sites_file); // finish writing html file if (html) { if (htmlwr_output(html) != NULL) { die("Found another JSON replacement!\n"); } htmlwr_destroy(html); } // write text file output_centrimo_text(&options, motifN, stats_list); // Clean up. for (i = 0; i < seqN; ++i) { free_seq(sequences[i]); } free(sequences); for (i = 0; i < arraylst_size(options.motif_sources); i++) { free_db(dbs[i]); } free(dbs); free_array(bg_freqs); free(counts.sites); free(seq_sites.sites); arraylst_destroy(free, stats_list); cleanup_options(&options); return 0; }