예제 #1
0
파일: ramen_scan.c 프로젝트: CPFL/gmeme
/**********************************************************************
  ramen_sequence_scan()

  Score one sequence against a motif and return the resulting score.

  The forward strand is always scored through score_sequence(). When
  scan_both_strands is set, the reverse motif/PSSM is scored too and the
  two per-strand scores are merged according to the scoring mode:

    AVG_ODDS   - arithmetic mean of the two strand scores
    MAX_ODDS   - the larger of the two strand scores
    TOTAL_HITS - the sum of the two strand scores

  Any other scoring mode yields the forward-strand score unchanged.

  zscoring is only sanity-checked (it must be >= 0); this function does
  not itself compute any z-score.

  The motif is expected to have been converted to (log) odds beforehand,
  e.g. with convert_to_odds_matrix(), once per motif.

 **********************************************************************/
double ramen_sequence_scan(
		SEQ_T* sequence,	// the sequence to scan INPUT
		MOTIF_T* motif,		// the motif to scan with (converted to odds matrix) INPUT
		MOTIF_T* rev_motif, // the reversed motif
		PSSM_T* pssm,
		PSSM_T* rev_pssm,
		int scoring,		// the scoring function to apply AVG_ODDS, MAX_ODDS or TOTAL_HITS
		int zscoring,		// the number of shuffled sequences used for z-score computation INPUT
		BOOLEAN_T scan_both_strands,			//Should we scan with both motifs and combine scores
		double threshold,	// Threshold to use in TOTAL_HITS mode with a PWM
		ARRAY_T* bg_freqs //background model
		){
	assert(zscoring >= 0);
	char* seq_name = get_seq_name(sequence);

	// Forward strand. Note: motif_name is not declared in this function;
	// it is picked up from an enclosing scope.
	double fwd_score = score_sequence(sequence, motif, pssm, motif_name,
			seq_name, scoring, threshold, bg_freqs);

	// Single-strand scan: nothing to combine.
	if (!scan_both_strands) {
		return fwd_score;
	}

	// Reverse strand, scored with the reversed motif and its PSSM.
	double rc_score = score_sequence(sequence, rev_motif, rev_pssm, motif_name,
			seq_name, scoring, threshold, bg_freqs);

	// Merge the two strand scores as dictated by the scoring mode.
	if (scoring == AVG_ODDS) {
		return (fwd_score+rc_score)/2.0;
	}
	if (scoring == MAX_ODDS) {
		return max(fwd_score,rc_score);
	}
	if (scoring == TOTAL_HITS) {
		return fwd_score + rc_score;
	}

	// Unrecognised mode: the reverse score is ignored.
	return fwd_score;

}
예제 #2
0
/*
 * Unit test for score_sequence().
 *
 * Two 24-base strings that differ only at the second position are scored
 * against each other. The per-position result is compared with the
 * expected array (1 everywhere except -1 at index 1) and its sum is
 * checked to be 22.
 *
 * The array returned by score_sequence() is scored once, reused for both
 * checks and released afterwards; other callers of score_sequence()
 * free() its result, so it is treated as caller-owned heap memory here.
 */
void test_score_sequence(void) {
  char *seqa = "ATCGATCGATCGATCGATCGATCG";
  char *seqb = "AACGATCGATCGATCGATCGATCG";
  int cmp1[] = {1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
  int *cmp1_t = score_sequence(seqa, seqb, strlen(seqa));

  TEST_INIT;
  TEST_ARRAY(cmp1, cmp1_t, 24, "score_sequence");

  /* Reuse the array scored above instead of allocating a second one. */
  TEST(sum(cmp1_t, strlen(seqa)) == 22, "sum of score_sequence");

  /* Release the result before the test is closed. */
  free(cmp1_t);

  TEST_CLOSE;
}
예제 #3
0
//
// Outlier filtering
//
// Scores every read in `input` against `sequence` and keeps only those whose
// absolute score per event is below a threshold. `input` is replaced by the
// retained reads, in their original order.
//
void filter_outlier_data(std::vector<HMMInputData>& input, const std::string& sequence)
{
    std::vector<HMMInputData> out_rs;
    for(uint32_t ri = 0; ri < input.size(); ++ri) {
        const HMMInputData& rs = input[ri];

        double curr = score_sequence(sequence, rs);

        // Number of events spanned by the read, inclusive of both ends.
        // The indices are widened to double before subtracting so the
        // difference cannot wrap around if they are unsigned and the read
        // runs in the reverse direction (start > stop or vice versa).
        double n_events = fabs(static_cast<double>(rs.event_start_idx) -
                               static_cast<double>(rs.event_stop_idx)) + 1.0;
        double lp_per_event = curr / n_events;

        if(opt::verbose >= 1) {
            // ri is unsigned, so print it with %u rather than %d.
            fprintf(stderr, "OUTLIER_FILTER %u %.2lf %.2lf %.2lf\n", static_cast<unsigned>(ri), curr, n_events, lp_per_event);
        }

        // The cut-off depends on whether model_stdv() is enabled.
        double threshold = model_stdv() ? 7.0f : 3.5f; // TODO: check
        if(fabs(lp_per_event) < threshold) {
            out_rs.push_back(rs);
        }
    }
    input.swap(out_rs);
}
예제 #4
0
// This scores each candidate path against every read using the HMM,
// periodically discards poorly performing candidates, and finally orders
// the surviving paths with sortPathConsScoreDesc.
//
// Scores are accumulated relative to the first path (the original,
// unmodified sequence), which is never culled. Duplicate path strings are
// removed up front so each distinct sequence is scored only once.
//
// An empty `paths` is left untouched.
void score_paths(PathConsVector& paths, const std::vector<HMMInputData>& input)
{
    PROFILE_FUNC("score_paths")

    // Everything below indexes element 0 of paths (and of the per-read
    // result vector), so there is nothing safe to do without a path.
    if(paths.empty()) {
        return;
    }

    const size_t CULL_RATE = 5;
    const double CULL_MIN_SCORE = -30.0f;
    const double CULL_MIN_IMPROVED_FRACTION = 0.2f;

#if DEBUG_PATH_SELECTION
    // cache the initial sequence (only needed for the debug dump below)
    std::string first = paths[0].path;
#endif

    PathConsVector dedup_paths;

    // initialize and deduplicate paths to avoid redundant computation
    std::set<std::string> path_string_set;
    for(size_t pi = 0; pi < paths.size(); ++pi) {

        // insert() reports whether the string was new, so a single lookup
        // both tests for and records the path.
        if(path_string_set.insert(paths[pi].path).second) {
            paths[pi].score = 0;
            paths[pi].sum_rank = 0;
            paths[pi].num_improved = 0;
            paths[pi].num_scored = 0;
            dedup_paths.push_back(paths[pi]);
        }
    }
    paths.clear();
    paths.swap(dedup_paths);


    // Score all reads
    for(uint32_t ri = 0; ri < input.size(); ++ri) {

        if(opt::verbose > 2) {
            // ri is unsigned, so print it with %u rather than %d.
            fprintf(stderr, "Scoring %u\n", static_cast<unsigned>(ri));
        }

        std::vector<IndexedPathScore> result(paths.size());

        // Score all paths; each iteration writes only its own result slot.
        #pragma omp parallel for
        for(size_t pi = 0; pi < paths.size(); ++pi) {
            double curr = score_sequence(paths[pi].path, input[ri]);
            result[pi].score = curr;
            result[pi].path_index = pi;
        }

        // Save score of first path (must be read before sorting reorders result)
        double first_path_score = result[0].score;

        // Sort result by score
        std::stable_sort(result.begin(), result.end(), sortIndexedPathScoreDesc);

        // Fold this read's scores and ranks into the per-path totals.
        for(size_t pri = 0; pri < result.size(); ++pri) {
            size_t pi = result[pri].path_index;

            paths[pi].score += (result[pri].score - first_path_score);
            uint32_t rank_score = pri;
            paths[pi].sum_rank += rank_score;
            paths[pi].num_improved += (result[pri].score > first_path_score);
            paths[pi].num_scored += 1;
        }

        // Cull paths
        if(ri > 0 && ri % CULL_RATE == 0) {
            PathConsVector retained_paths;
            for(size_t pi = 0; pi < paths.size(); ++pi) {

                // We keep a path if any of these conditions are met:
                //  1) it is the original unmodified sequence
                //  2) its score is greater than CULL_MIN_SCORE
                //  3) the fraction of reads that score better on this
                //     path compared to the original sequence is at least
                //     CULL_MIN_IMPROVED_FRACTION
                double f = (double)paths[pi].num_improved / (double)paths[pi].num_scored;
                if(pi == 0 || paths[pi].score > CULL_MIN_SCORE || f >= CULL_MIN_IMPROVED_FRACTION) {
                    retained_paths.push_back(paths[pi]);
                }
            }
            paths.swap(retained_paths);
        }
    }

    // select new sequence
    //std::stable_sort(paths.begin(), paths.end(), sortPathConsRankAsc);
    std::stable_sort(paths.begin(), paths.end(), sortPathConsScoreDesc);

#if DEBUG_PATH_SELECTION
    double MIN_FIT = INFINITY;
    for(size_t pi = 0; pi < paths.size(); ++pi) {

        // Mark the path that is identical to the initial sequence
        const std::string& s = paths[pi].path;

        char initial = s == first ? 'I' : ' ';

        printf("%zu\t%s\t%.1lf\t%zu %c %s", pi, paths[pi].path.c_str(), paths[pi].score, paths[pi].sum_rank, initial, paths[pi].mutdesc.c_str());
        // For the first two sorted paths and for the initial path, show the scores for all reads
        if(pi <= 1 || initial == 'I') {
            for(uint32_t ri = 0; ri < input.size(); ++ri) {
                const HMMInputData& data = input[ri];
                const KHMMParameters& parameters = data.read->parameters[data.strand];
                if( fabs(parameters.fit_quality) > MIN_FIT)
                    continue;

                double curr = score_sequence(paths[pi].path, input[ri]);
                printf("%.1lf,%.2lf ", parameters.fit_quality, curr);
            }
        }
        printf("\n");
    }
#endif

}
예제 #5
0
/**********************************************************************
  ama_sequence_scan()

  Score a sequence with score_sequence() and record the outcome in
  scanned_seq.

  First scan of a sequence (no scanned positions recorded yet):
    - the score is stored as is;
    - the p-value is set to 1.0 if the sequence was flagged as not
      feasible or the score is negative; otherwise, when pvalues is set
      and scoring is AVG_ODDS, it is set from get_ama_pv().

  Repeated scan of the same sequence:
    - the score is added to the stored score (or stored if none is set);
    - a p-value of 1.0 (infeasible / negative score) is only written if
      no p-value is stored yet; a computed p-value replaces the stored
      one only if it is smaller;
    - *need_postprocessing is set to true.

  In both cases one more scanned position is registered on scanned_seq.
 **********************************************************************/
void ama_sequence_scan(
  ALPH_T*     alph,         // alphabet
  SEQ_T       *sequence,    // the sequence to scan (IN)
  double      *logcumback,  // cumulative bkg probability of sequence (IN)
  PSSM_PAIR_T *pssm_pair,   // the pos/neg pssms (IN)
  int         scoring,      // AVG_ODDS or MAX_ODDS (IN)
  BOOLEAN_T   pvalues,      // compute p-values (IN)
  int         last,         // use only last <n> sequence positions
                            // or 0 if all positions should be used
  SCANNED_SEQUENCE_T* scanned_seq,// the scanned sequence results (OUT)
  BOOLEAN_T* need_postprocessing // Flag indicating the need for postprocessing (OUT)
)
{
  assert(sequence != NULL);
  assert(pssm_pair != NULL);

  // Starts out true; score_sequence() receives its address and reports
  // through it whether the sequence was suitable for motif matching.
  BOOLEAN_T feasible = true;

  double score = score_sequence(alph, sequence, logcumback,
      pssm_pair, scoring, last, &feasible);

  if (get_scanned_sequence_num_scanned_positions(scanned_seq) != 0L) {
    // ---- The sequence has been scored before: merge with stored values.

    // Accumulate the score (the average is taken later).
    double total = score;
    if (has_scanned_sequence_score(scanned_seq)) {
      total = score + get_scanned_sequence_score(scanned_seq);
    }
    set_scanned_sequence_score(scanned_seq, total);

    if (!feasible) {
      if (verbosity >= NORMAL_VERBOSE) {
        fprintf(stderr,
            "Sequence '%s' not suited for motif. P-value set to 1.0!\n",
            get_scanned_sequence_accession(scanned_seq));
      }
      // Never overwrite a p-value from an earlier scan with the fallback.
      if (!has_scanned_sequence_pvalue(scanned_seq)) {
        set_scanned_sequence_pvalue(scanned_seq, 1.0);
      }
    } else if (score < 0.0) {
      if (verbosity >= NORMAL_VERBOSE) {
        fprintf(stderr,
            "Sequence '%s' got invalid (negative) odds score. "
            "P-value set to 1.0!\n",
            get_scanned_sequence_accession(scanned_seq));
      }
      if (!has_scanned_sequence_pvalue(scanned_seq)) {
        set_scanned_sequence_pvalue(scanned_seq, 1.0);
      }
    } else if (pvalues && scoring == AVG_ODDS) {
      double pv = get_ama_pv(score, get_scanned_sequence_length(scanned_seq),
          get_total_gc_sequence(sequence), pssm_pair);
      if (has_scanned_sequence_pvalue(scanned_seq)) {
        // keep minimum p-value only
        set_scanned_sequence_pvalue(scanned_seq, min(pv,
              get_scanned_sequence_pvalue(scanned_seq)));
      } else {
        set_scanned_sequence_pvalue(scanned_seq, pv);
      }
    }

    add_scanned_sequence_scanned_position(scanned_seq);
    *need_postprocessing = true;
    return;
  }

  // ---- First time this sequence is scanned.
  set_scanned_sequence_score(scanned_seq, score);

  if (!feasible) {
    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr,
          "Sequence '%s' not suited for motif. P-value set to 1.0!\n",
          get_scanned_sequence_accession(scanned_seq));
    }
    set_scanned_sequence_pvalue(scanned_seq, 1.0);
  } else if (score < 0.0) {
    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr,
          "Sequence '%s' got invalid (negative) odds score. "
          "P-value set to 1.0!\n",
          get_scanned_sequence_accession(scanned_seq));
    }
    set_scanned_sequence_pvalue(scanned_seq, 1.0);
  } else if (pvalues && scoring == AVG_ODDS) {
    // Compute the p-value of the AVG_ODDS score.
    double pv = get_ama_pv(score, get_scanned_sequence_length(scanned_seq),
        get_total_gc_sequence(sequence), pssm_pair);
    set_scanned_sequence_pvalue(scanned_seq, pv);
  }

  // The scanned-position count tracks how often a sequence has been scored.
  // Downstream gomo relies on it where a one2many homolog relationship
  // is encoded through the same sequence identifier.
  add_scanned_sequence_scanned_position(scanned_seq);
} // ama_sequence_scan
예제 #6
0
파일: match.c 프로젝트: feltstykket/scythe
/*
 * Search a read for adapter contamination.
 *
 * Every adapter in `aa` is compared against every suffix of `read`
 * (shifts 0 .. strlen(read) - min_l - 1) by brute-force string matching.
 * Whenever a comparison beats the best score seen so far, posterior() is
 * consulted; the search stops at the first such candidate it classifies
 * as contamination.
 *
 * Returns a newly allocated match describing that candidate, or NULL if
 * no candidate was classified as contamination. Exits the process if an
 * adapter is not longer than min_l or if memory runs out.
 *
 * The returned match borrows `p_quals` (it stores a pointer into it).
 */
match *find_best_match(const adapter_array *aa, const char *read,
                       float *p_quals, float prior, float p_match, int min_l) {
  /*
   Take an adapter array, and check the read against all
   adapters. Brute force string matching is used. This is to avoid
   approximate matching algorithms which required an a priori
   specified number mismatches.

  */

  match *best_match=NULL;
  int i, shift, max_shift, found_contam=0;
  int *best_arr=NULL, best_adapter=0, best_length=0, best_shift=0, best_score=INT_MIN;
  int al, remaining, curr_score, *curr_arr=NULL;
  int rl = strlen(read);
  posterior_set *ps=NULL;
  float *best_p_quals=NULL;

  max_shift = rl - min_l;
  for (shift = 0; shift < max_shift; shift++) {
    /* Length of the read suffix starting at this shift; computed once per
       shift instead of calling strlen() for every adapter. */
    remaining = rl - shift;
    for (i = 0; i < aa->n; i++) {
      if (min_l >= aa->adapters[i].length) {
        fprintf(stderr, "Minimum match length (option -n) greater than or " \
                "equal to length of adapter.\n");
        exit(EXIT_FAILURE);
      }
      /* Compare only the overlap of the adapter and the read suffix. */
      al = min(aa->adapters[i].length, remaining);
      curr_arr = score_sequence(&(read)[shift], (aa->adapters[i]).seq, al);
      curr_score = sum(curr_arr, al);
      if (curr_score > best_score) {
        best_score = curr_score;
        best_length = al;
        best_shift = shift;
        best_p_quals = &(p_quals)[shift];
        best_arr = curr_arr;
        best_adapter = i;
        /* NOTE(review): the p_match parameter is never used and 0.25 is
           hard-coded here; confirm against posterior()'s signature whether
           p_match should be passed instead. */
        ps = posterior(best_arr, best_p_quals, prior, 0.25, best_length);
        found_contam = ps->is_contam;
        if (found_contam) {
          break;
        } else {
          free(ps);
          ps=NULL;
          free(best_arr);
          best_arr=NULL; /* do not keep a pointer to freed memory */
        }
      } else free(curr_arr);
    }
    if (found_contam)
      break;
  }

  if (!found_contam) /* no match found */
    return NULL;

  /* save this match */
  best_match = xmalloc(sizeof(match));
  best_match->match = best_arr;
  best_match->shift = best_shift;
  best_match->length = best_length;
  best_match->ps = ps;
  best_match->score = best_score;
  best_match->adapter_index = best_adapter;
  best_match->p_quals = best_p_quals;
  best_match->match_pos = calloc(best_length, sizeof(int));
  if (best_match->match_pos == NULL) {
    /* calloc() result was previously written through unchecked */
    fprintf(stderr, "Out of memory while recording match positions.\n");
    exit(EXIT_FAILURE);
  }
  /* Flag each compared position that scored as a match. */
  for (i = 0; i < best_length; i++)
    best_match->match_pos[i] = best_match->match[i] == MATCH_SCORE;
  return best_match;
}
예제 #7
0
파일: centrimo.c 프로젝트: CPFL/gmeme
/*************************************************************************
 * Entry point for centrimo
 *
 * Reads the sequences and motif databases named on the command line,
 * scans every sequence with every motif, tallies the site positions,
 * and writes the per-motif site counts, the HTML/JSON report (when the
 * template can be opened) and the text report into the output directory.
 *
 * Returns 0 on completion; fatal setup problems terminate via die().
 *************************************************************************/
int main(int argc, char *argv[]) {
  CENTRIMO_OPTIONS_T options;
  SEQ_SITES_T seq_sites;
  SITE_COUNTS_T counts;
  int seqN, motifN, seqlen, db_i, motif_i, i;
  double log_pvalue_thresh;
  SEQ_T** sequences = NULL;
  ARRAY_T* bg_freqs = NULL;
  ARRAYLST_T *stats_list;
  MOTIF_DB_T **dbs, *db;
  MREAD_T *mread;
  MOTIF_STATS_T *stats;
  MOTIF_T *motif, *rev_motif;
  PSSM_T *pos_pssm, *rev_pssm;
  char *sites_path, *desc;
  FILE *sites_file;
  HTMLWR_T *html;
  JSONWR_T *json;

  // COMMAND LINE PROCESSING
  process_command_line(argc, argv, &options);

  // load the sequences
  read_sequences(options.alphabet, options.seq_source, &sequences, &seqN);
  // the length of the first sequence is used for all sizing below
  seqlen = (seqN ? get_seq_length(sequences[0]) : 0);
  // calculate a sequence background (unless other background is given)
  if (!options.bg_source) {
    bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences);
  }

  // load the motifs
  motifN = 0;
  dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources));
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    char* db_source;
    db_source = (char*)arraylst_get(i, options.motif_sources);
    dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, 
        options.pseudocount, options.selected_motifs, options.alphabet);
    motifN += arraylst_size(dbs[i]->motifs);
  }
  // E-value threshold divided by the number of motifs, in log space
  log_pvalue_thresh = log(options.evalue_thresh) - log(motifN);
  // Setup some things for double strand scanning
  if (options.scan_both_strands == TRUE) {
    // Set up hash tables for computing reverse complement
    setup_hash_alph(DNAB);
    setalph(0);
    // Correct background by averaging on freq. for both strands.
    average_freq_with_complement(options.alphabet, bg_freqs);
    normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs);
    calc_ambigs(options.alphabet, FALSE, bg_freqs);
  }
  // Create output directory
  if (create_output_directory(options.output_dirname, options.allow_clobber, 
        (verbosity >= NORMAL_VERBOSE))) {
    die("Couldn't create output directory %s.\n", options.output_dirname);
  }
  // open output files
  sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME);
  sites_file = fopen(sites_path, "w");
  if (sites_file == NULL) {
    // the handle is written to and closed unconditionally further down,
    // so a failed open has to stop the run here
    die("Couldn't open %s for writing.\n", sites_path);
  }
  free(sites_path);
  // setup html monolith writer
  json = NULL;
  if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) {
    htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME);
    htmlwr_replace(html, "centrimo_data.js", "data");
    json = htmlwr_output(html);
    if (json == NULL) die("Template does not contain data section.\n");
  } else {
    DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n");
  }
  if (json) {
    // output some top level variables
    jsonwr_str_prop(json, "version", VERSION);
    jsonwr_str_prop(json, "revision", REVISION);
    jsonwr_str_prop(json, "release", ARCHIVE_DATE);
    jsonwr_str_array_prop(json, "cmd", argv, argc);
    jsonwr_property(json, "options");
    jsonwr_start_object_value(json);
    jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount);
    jsonwr_dbl_prop(json, "score", options.score_thresh);
    jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh);
    jsonwr_lng_prop(json, "maxbin", options.max_window+1);
    jsonwr_bool_prop(json, "norc", !options.scan_both_strands);
    jsonwr_bool_prop(json, "noflip", options.no_flip);
    jsonwr_end_object_value(json);
    // output the description
    desc = prepare_description(&options);
    if (desc) {
      jsonwr_str_prop(json, "job_description", desc);
      free(desc);
    }
    // output size metrics
    jsonwr_lng_prop(json, "seqlen", seqlen);
    jsonwr_lng_prop(json, "tested", motifN);
    // output the fasta db
    jsonwr_property(json, "sequence_db");
    jsonwr_start_object_value(json);
    jsonwr_str_prop(json, "source", options.seq_source);
    jsonwr_lng_prop(json, "count", seqN);
    jsonwr_end_object_value(json);
    // output the motif dbs
    jsonwr_property(json, "motif_dbs");
    jsonwr_start_array_value(json);
    for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
      db = dbs[db_i];
      jsonwr_start_object_value(json);
      jsonwr_str_prop(json, "source", db->source);
      jsonwr_lng_prop(json, "count", arraylst_size(db->motifs));
      jsonwr_end_object_value(json);
    }
    jsonwr_end_array_value(json);
    // start the motif array
    jsonwr_property(json, "motifs");
    jsonwr_start_array_value(json);
  }
  /**************************************************************
   * Tally the positions of the best sites for each of the 
   * selected motifs.
   **************************************************************/
  // prepare the sequence sites
  memset(&seq_sites, 0, sizeof(SEQ_SITES_T));
  // prepare the site counts
  // NOTE(review): with no sequences seqlen is 0 and this becomes -1;
  // confirm that read_sequences() rejects empty input.
  counts.allocated = ((2 * seqlen) - 1);
  counts.sites = mm_malloc(sizeof(double) * counts.allocated);
  // prepare the motifs stats list
  stats_list = arraylst_create();
  // prepare the other vars
  motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL;
  for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
    db = dbs[db_i];
    for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) {
      motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs);
      DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n",  
          get_motif_id(motif), get_motif_length(motif));
      // reset the counts
      for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0;
      counts.total_sites = 0;
      // create the pssm 
      pos_pssm = make_pssm(bg_freqs, motif);
      // If required, do the same for the reverse complement motif.
      if (options.scan_both_strands) {
        rev_motif = dup_rc_motif(motif);
        rev_pssm = make_pssm(bg_freqs, rev_motif);
      }
      // scan the sequences
      for (i = 0; i < seqN; i++)
        score_sequence(&options, sequences[i], pos_pssm, rev_pssm, 
            &seq_sites, &counts);
      // DEBUG check that the sum of the sites is close to the site count
      double sum_check = 0, sum_diff;
      for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i];
      sum_diff = counts.total_sites - sum_check;
      if (sum_diff < 0) sum_diff = -sum_diff;
      if (sum_diff > 0.1) {
        // terminate the line so the warning does not run into later output
        fprintf(stderr, "Warning: site counts don't sum to accurate value! "
            "%g != %ld\n", sum_check, counts.total_sites);
      }
      // output the plain text site counts
      output_site_counts(sites_file, seqlen, db, motif, &counts);
      // compute the best central window
      stats = compute_stats(options.max_window, seqlen, db, motif, &counts);
      // check if it passes the threshold
      // NOTE(review): stats are only kept when the JSON writer exists, so
      // the text report receives an empty list if the HTML template could
      // not be opened; confirm this is intended.
      if (json && stats->log_adj_pvalue <= log_pvalue_thresh) {
        output_motif_json(json, stats, &counts);
        arraylst_add(stats, stats_list);
      } else {
        free(stats);
      }
      // Free memory associated with this motif.
      // (rev_pssm / rev_motif are still NULL here for single-strand scans)
      free_pssm(pos_pssm);
      free_pssm(rev_pssm);
      destroy_motif(rev_motif);
    }
  }
  if (json) jsonwr_end_array_value(json);
  // finish writing sites
  fclose(sites_file);
  // finish writing html file
  if (html) {
    if (htmlwr_output(html) != NULL) {
      die("Found another JSON replacement!\n");
    }
    htmlwr_destroy(html);
  }
  // write text file
  output_centrimo_text(&options, motifN, stats_list);
  // Clean up.
  for (i = 0; i < seqN; ++i) {
    free_seq(sequences[i]); 
  }
  free(sequences);
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    free_db(dbs[i]);
  }
  free(dbs);
  free_array(bg_freqs);
  free(counts.sites);
  free(seq_sites.sites);
  arraylst_destroy(free, stats_list);
  cleanup_options(&options);
  return 0;

}