/**************************************************************************
 * Prepare a sequence for recognition by
 *  - making sure it is uppercase,
 *  - making sure it doesn't contain illegal characters,
 *  - adding flanking Xs to match START/END states,
 *  - converting it to an integer format, and
 *  - computing cumulative GC counts (only when alph == DNA_ALPH).
 *
 * In the integer form, each character in the sequence is replaced by
 * the index of that character in the alphabet array.  Thus, if the
 * alphabet is 'ACGT', every occurrence of the letter 'G' in the
 * sequence will be represented by the index 2.
 *
 * Parameters:
 *   sequence  the sequence to prepare; its raw characters are modified in
 *             place and its intseq (and, for DNA, gc) arrays are allocated
 *             here with mm_malloc.
 *   alph      the alphabet used to validate and index the characters.
 *
 * Every character for which alph_index() returns -1 is replaced by the
 * alphabet's wildcard character and reported on stderr.
 **************************************************************************/
void prepare_sequence
  (SEQ_T* sequence, ALPH_T alph)
{
  int i_seq;        // Index in the sequence.
  int seq_len;      // Current length of the sequence.
  int badchar;      // Number of characters converted.
  char wildcard;    // Wildcard character

  wildcard = alph_wildcard(alph);
  badchar = 0;

  // The length does not change while the characters are being cleaned up.
  seq_len = get_seq_length(sequence);
  for (i_seq = 0; i_seq < seq_len; i_seq++) {
    // The <ctype.h> functions are only defined for values representable as
    // unsigned char (or EOF), so never hand them a possibly negative char.
    unsigned char uc = (unsigned char)(sequence->sequence)[i_seq];

    // Make sure the sequence is uppercase.
    if (islower(uc)) {
      (sequence->sequence)[i_seq] = (char)toupper(uc);
    }

    // Convert non-alphabetic characters to ambiguous.
    if (alph_index(alph, (sequence->sequence)[i_seq]) == -1) {
      fprintf(stderr, "%c -> %c\n", (sequence->sequence)[i_seq], wildcard);
      (sequence->sequence)[i_seq] = wildcard;
      badchar++;
    }
  }

  // Tell the user about the conversions.
  if (badchar) {
    fprintf(stderr, "Warning: converted %d non-alphabetic ", badchar);
    fprintf(stderr, "characters to %c in sequence %s.\n", wildcard,
            get_seq_name(sequence));
  }

  // Add flanking X's.
  add_flanking_xs(sequence, alph);

  // The flanking characters may have changed the length, so ask again.
  seq_len = get_seq_length(sequence);

  // Make the integer sequence.
  sequence->intseq = (int *)mm_malloc(sizeof(int) * seq_len);
  for (i_seq = 0; i_seq < seq_len; i_seq++) {
    (sequence->intseq)[i_seq]
      = alph_index(alph, (sequence->sequence)[i_seq]);
  }

  //
  // Get cumulative GC counts.
  //
  if (alph == DNA_ALPH) {
    char c = (sequence->sequence)[0];   // first character

    sequence->gc = (int *)mm_malloc(sizeof(int) * seq_len);

    // set count at first position
    (sequence->gc)[0] = (c == 'G' || c == 'C') ? 1 : 0;
    // set cumulative counts at rest of positions
    for (i_seq = 1; i_seq < seq_len; i_seq++) {
      c = (sequence->sequence)[i_seq];
      (sequence->gc)[i_seq] = (c == 'G' || c == 'C') ?
        (sequence->gc)[i_seq-1] + 1 : (sequence->gc)[i_seq-1];
    }
  }
}
/**********************************************************************
  shuffle_sequence()

  Build a new sequence object whose characters are a random permutation
  of the characters of the given sequence.

  The new object is created with allocate_seq() using the name and
  offset of the original and the description "shuffled"; its raw
  character buffer is then overwritten with the permutation.

  NOTE(review): the generator is seeded through my_srand() but the
  picks are drawn with rand(); confirm both refer to the same generator.
  NOTE(review): the assert rejects a non-NULL *target, so the free_seq()
  branch is only reachable when assertions are compiled out.
**********************************************************************/
void shuffle_sequence(
  SEQ_T* seq,         /* original sequence IN */
  unsigned int seed,  /* seed IN */
  SEQ_T** target      /* target sequence OUT */
){
  my_srand(seed);
  assert(*target==NULL);
  // reset target if not null
  if (*target != NULL){
    free_seq(*target);
  }

  *target = allocate_seq(get_seq_name(seq), "shuffled", get_seq_offset(seq),
      get_raw_sequence(seq));
  char *shuffled = get_raw_sequence(*target);

  /* working copy of the original characters; picked ones get removed */
  const int length = get_seq_length(seq);
  char *pool = (char*)mm_calloc(length + 1, sizeof(char));
  strcpy(pool, get_raw_sequence(seq));
  pool[length] = '\0';

  int out = 0;
  int remaining;
  for (remaining = length; remaining > 0; remaining--) {
    // Pick a random position among the characters still in the pool.
    const int pick = rand() % remaining;
    char *hole = pool + pick;

    shuffled[out] = *hole;
    out++;

    // Close the gap: slide every following character (terminator
    // included) one place to the left.
    while (*hole) {
      hole[0] = hole[1];
      hole++;
    }
  }
  myfree(pool);
}
/****************************************************************************
 * Create a new alignment with any sequence that contains nothing but
 * gap ('-') characters removed.  Returns the new alignment.  Does not
 * change the old alignment.
 * If there are no all-gap sequences, the returned alignment is the
 * same object as the original alignment.
 *
 * The temporary list of names to keep is released on every path.
 ****************************************************************************/
static ALIGNMENT_T* remove_allgap_sequences(ALIGNMENT_T* alignment)
{
  ALIGNMENT_T* new_alignment;
  int i_aln;
  int l_aln = get_num_aligned_sequences(alignment);
  STRING_LIST_T* keeper_seqs = new_string_list();

  // Identify the all-gap sequences.
  for (i_aln = 0; i_aln < l_aln; i_aln++) {
    SEQ_T* sequence = get_alignment_sequence(i_aln, alignment);
    int i_seq;
    int l_seq = get_seq_length(sequence);
    // Add sequence to keepers if it contains a non-gap.
    for (i_seq = 0; i_seq < l_seq; i_seq++) {
      if (get_seq_char(i_seq, sequence) != '-') {           // not gap?
        add_string(get_seq_name(sequence), keeper_seqs);    // non-gap: keeper
        break;
      }
    }
  }

  // Remove any sequences not in keeper list.
  if (get_num_strings(keeper_seqs) < l_aln) {
    new_alignment = remove_alignment_seqs(keeper_seqs, alignment);
  } else {
    new_alignment = alignment;
  }

  // The keeper list is only a scratch structure; previously it was leaked
  // whenever no sequence had to be removed.
  free_string_list(keeper_seqs);

  return(new_alignment);
} // remove_allgap_sequences
void CompTool::search(int argc, char** argv){ if(argc < 3) {cout << "File not enough" << endl; exit(1);} const string seq1_file = argv[1]; const string seq2_file = argv[2]; const string seq1_name = basename(seq1_file); const string seq2_name = basename(seq2_file); // Options int kmer_size = 15; int slide_letters = 1; int bwt_interval = 1; int max_num_matches = 1000000000; bool search_forward = true; bool search_reverse = true; if(argc > 3){ for(int i = 3; i < argc; i++){ if (argv[i][1] == 'k') kmer_size = atoi(argv[++i]); else if(argv[i][1] == 'l') slide_letters = atoi(argv[++i]); else if(argv[i][1] == 'i') bwt_interval = atoi(argv[++i]); else if(argv[i][1] == 'm') max_num_matches = atoi(argv[++i]); else if(argv[i][1] == 'f') search_reverse = false; else if(argv[i][1] == 'r') search_forward = false; } } seq1_size_ = get_seq_length(seq1_file) + 1; seq2_size_ = get_seq_length(seq2_file) + 1; seq1_ = read_fasta_and_create_int8_t_array(seq1_file, seq1_size_); seq2_ = read_fasta_and_create_int8_t_array(seq2_file, seq2_size_); int* SA = create_SA(seq1_file, seq1_size_); BWT bwt(seq1_, SA, seq1_size_, num_char_, bwt_interval); if(search_forward) search_forward_matches(seq1_name, seq2_name, SA, bwt, kmer_size, slide_letters, max_num_matches); if(search_reverse) search_reverse_matches(seq1_name, seq2_name, SA, bwt, kmer_size, slide_letters, max_num_matches); delete SA; }
/*************************************************************************
 * Read all the sequences into an array of SEQ_T
 *
 * Opens seq_file_name, loads its sequences with read_many_fastas() and
 * then keeps only the sequences whose length equals the length of the
 * first one.  Kept sequences are compacted to the front of the array,
 * *seq_num is reduced accordingly and the vacated trailing slots are set
 * to NULL.  Each skipped sequence is reported on stderr.
 *
 * Dies if the file cannot be opened or closed.  If no sequences were
 * read the function returns immediately with *seq_num left as reported
 * by read_many_fastas().
 *
 * NOTE(review): pointers to skipped sequences are dropped without being
 * released here - confirm whether they should be freed.
 *************************************************************************/
static void read_sequences(ALPH_T alph, char *seq_file_name,
    SEQ_T ***sequences, int *seq_num) {
  const int max_sequence = 32768; // unlikely to be this big
  int i, seq_len, move;
  FILE * seq_fh = fopen(seq_file_name, "r");
  if (!seq_fh) die("failed to open sequence file `%s'", seq_file_name);
  read_many_fastas(alph, seq_fh, max_sequence, seq_num, sequences);
  if (fclose(seq_fh) != 0) die("failed to close sequence file\n");

  // Nothing to filter; in particular there is no first sequence to use as
  // the reference length, and no slot 0 that could safely be touched.
  if (*seq_num <= 0) return;

  seq_len = get_seq_length((*sequences)[0]);
  move = 0; // number of sequences skipped so far
  for (i = 1; i < *seq_num; i++) {
    if (seq_len == get_seq_length((*sequences)[i])) {
      // shift the kept sequence down over the gap left by skipped ones
      if (move > 0) (*sequences)[i-move] = (*sequences)[i];
    } else {
      fprintf(stderr, "Skipping sequence %s as its length (%d) does not "
          "match the first sequence (%d).\n", get_seq_name((*sequences)[i]),
          get_seq_length((*sequences)[i]), seq_len);
      move++;
    }
  }
  *seq_num -= move;
  // clear the now unused slots at the end of the array
  for (i--; i >= *seq_num; i--) (*sequences)[i] = NULL;
}
/***************************************************************************
 * Return the length of the longest sequence among the first num_seqs
 * entries of the given array, or 0 when num_seqs is not positive.
 ***************************************************************************/
int get_max_seq_length
  (int     num_seqs,
   SEQ_T** sequences)
{
  int longest = 0;
  int idx = 0;

  while (idx < num_seqs) {
    const int current = get_seq_length(sequences[idx]);
    longest = (current > longest) ? current : longest;
    idx++;
  }
  return longest;
}
/*************************************************************************
 * Build a Markov background model of the given order from the sequence
 * itself and return an array filled in by
 * calculate_log_cumulative_background() for that sequence.
 *
 * The returned array has get_seq_length(sequence)+1 entries, is allocated
 * with mm_malloc and is owned by the caller.  Dies if sdbg_order is
 * negative.
 *************************************************************************/
static double * log_cumulative_background(ALPH_T *alph, const int sdbg_order, SEQ_T *sequence) {
  BGCALC_T *model_state = NULL;
  ARRAY_T *cond_probs;
  double *result;
  const char *residues;
  int prefix_start;

  if (sdbg_order < 0) die("No such thing as a negative background order");

  result = (double *) mm_malloc(sizeof(double) * (get_seq_length(sequence)+1));
  residues = get_raw_sequence(sequence);

  // calculate background model: one call with the sequence, followed by a
  // call without a sequence whose return value is the model array
  calculate_markov_model(alph, sdbg_order, 1.0, false, residues, &model_state);
  cond_probs = calculate_markov_model(alph, sdbg_order, 1.0, false, NULL, &model_state);

  // add x-tuples to model
  extend_markov_model(alph, true, SUM_FREQS, cond_probs);

  // normalize for each prefix (convert to conditional probability);
  // the wildcard entry of every prefix is forced to 1.0
  prefix_start = 0;
  while (prefix_start < get_array_length(cond_probs)) {
    normalize_subarray(prefix_start, alph_size_core(alph), 0, cond_probs);
    set_array_item(prefix_start + alph_wild(alph), 1.0, cond_probs);
    prefix_start += alph_size_wild(alph);
  }

  calculate_log_cumulative_background(alph, true, sdbg_order, cond_probs, residues, result);
  free_array(cond_probs);
  return result;
}
/************************************************************************* * Calculate the odds score for each motif-sized window at each * site in the sequence using the given nucleotide frequencies. * * This function is a lightweight version based on the one contained in * motiph-scoring. Several calculations that are unnecessary for gomo * have been removed in order to speed up the process *************************************************************************/ static double score_sequence( SEQ_T *seq, // sequence to scan (IN) MOTIF_T *motif, // motif already converted to odds values (IN) PSSM_T *m_pssm, // motif pssm (IN) MATRIX_T *m_odds, // motif odds (IN) int method, // method used for scoring (IN) double threshold, // Threshold to use in TOTAL_HITS mode with a PWM ARRAY_T *bg_freqs //background model ) { assert(seq != NULL); assert(motif != NULL); assert((method == TOTAL_HITS && m_pssm) || (method != TOTAL_HITS && m_odds)); char* raw_seq = get_raw_sequence(seq); int seq_length = get_seq_length(seq); // Get the pv lookup table ARRAY_T* pv_lookup = NULL; if (NULL != m_pssm) { pv_lookup = m_pssm->pv; assert(get_array_length(pv_lookup) > 0); } // Prepare storage for the string representing the portion // of the reference sequence within the window. 
char* window_seq = (char *) mm_malloc(sizeof(char) * (get_motif_length(motif) + 1)); window_seq[get_motif_length(motif)] = '\0'; int max_index = seq_length - get_motif_length(motif); if (max_index < 0) max_index = 0; const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE); double* odds = (double*) mm_malloc(sizeof(double)*max_index); double* scaled_log_odds = (double*) mm_malloc(sizeof(double)*max_index); // For each site in the sequence int seq_index; for (seq_index = 0; seq_index < max_index; seq_index++) { double odd = 1.0; scaled_log_odds[seq_index] = 0; // For each site in the motif window int motif_position; for (motif_position = 0; motif_position < get_motif_length(motif); motif_position++) { char c = raw_seq[seq_index + motif_position]; window_seq[motif_position] = c; // Check for gaps at this site if(c == '-' || c == '.') { break; } // Check for ambiguity codes at this site //TODO: This next call is very expensive - it takes up approx. 10% of a // programme's running time. It should be fixed up somehow. int aindex = alph_index(get_motif_alph(motif), c); if (aindex > asize) { break; } if (method == TOTAL_HITS) { //If we're in this mode, then we're using LOG ODDS. 
//scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, get_motif_freqs(motif)); scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, m_pssm->matrix); } else { odd *= get_matrix_cell(motif_position, aindex, m_odds); } } odds[seq_index] = odd; } // return odds as requested (MAX or AVG scoring) double requested_odds = 0.0; if (method == AVG_ODDS){ for (seq_index = 0; seq_index < max_index; seq_index++) { requested_odds += odds[seq_index]; } requested_odds /= max_index + 1; // Divide by 0 if max_index==0 } else if (method == MAX_ODDS){ for (seq_index = 0; seq_index < max_index; seq_index++) { if (odds[seq_index] > requested_odds){ requested_odds = odds[seq_index]; } } } else if (method == SUM_ODDS) { for (seq_index = 0; seq_index < max_index; seq_index++) { requested_odds += odds[seq_index]; } } else if (method == TOTAL_HITS) { for (seq_index = 0; seq_index < max_index; seq_index++) { if (scaled_log_odds[seq_index] >= (double)get_array_length(pv_lookup)) { scaled_log_odds[seq_index] = (double)(get_array_length(pv_lookup) - 1); } double pvalue = get_array_item((int) scaled_log_odds[seq_index], pv_lookup); //Figure out how to calculate the p-value of a hit //fprintf(stderr, "m: %s pv_l len: %i scaled_log_odds: %g seq index: %i pvalue: %g\n", // get_motif_id(motif), get_array_length(pv_lookup), scaled_log_odds[seq_index], seq_index, pvalue); if (pvalue < threshold) { requested_odds++; //Add another hit. } if (verbosity > HIGHER_VERBOSE) { fprintf(stderr, "Window Data: %s\t%s\t%i\t%g\t%g\t%g\n", get_seq_name(seq), get_motif_id(motif), seq_index, scaled_log_odds[seq_index], pvalue, threshold); } } } myfree(odds); myfree(scaled_log_odds); myfree(window_seq); return requested_odds; }
/************************************************************************* * Entry point for ama *************************************************************************/ int main(int argc, char **argv) { AMA_OPTIONS_T options; ARRAYLST_T *motifs; clock_t c0, c1; // measuring cpu_time MOTIF_AND_PSSM_T *combo; CISML_T *cisml; PATTERN_T** patterns; PATTERN_T *pattern; FILE *fasta_file, *text_output, *cisml_output; int i, seq_loading_num, seq_counter, unique_seqs, seq_len, scan_len, x1, x2, y1, y2; char *seq_name, *path; bool need_postprocessing, created; SEQ_T *sequence; RBTREE_T *seq_ids; RBNODE_T *seq_node; double *logcumback; ALPH_T *alph; // process the command process_command_line(argc, argv, &options); // load DNA motifs motifs = load_motifs(&options); // get the alphabet if (arraylst_size(motifs) > 0) { combo = (MOTIF_AND_PSSM_T*)arraylst_get(0, motifs); alph = alph_hold(get_motif_alph(combo->motif)); } else { alph = alph_dna(); } // pick columns for GC operations x1 = -1; x2 = -1; y1 = -1; y2 = -1; if (alph_size_core(alph) == 4 && alph_size_pairs(alph) == 2) { x1 = 0; // A x2 = alph_complement(alph, x1); // T y1 = (x2 == 1 ? 2 : 1); // C y2 = alph_complement(alph, y1); // G assert(x1 != x2 && y1 != y2 && x1 != y1 && x2 != y2 && x1 != y2 && x2 != y1); } // record starting time c0 = clock(); // Create cisml data structure for recording results cisml = allocate_cisml(PROGRAM_NAME, options.command_line, options.motif_filename, options.fasta_filename); set_cisml_background_file(cisml, options.bg_filename); // make a CISML pattern to hold scores for each motif for (i = 0; i < arraylst_size(motifs); i++) { combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs); add_cisml_pattern(cisml, allocate_pattern(get_motif_id(combo->motif), "")); } // Open the FASTA file for reading. 
fasta_file = NULL; if (!open_file(options.fasta_filename, "r", false, "FASTA", "sequences", &fasta_file)) { die("Couldn't open the file %s.\n", options.fasta_filename); } if (verbosity >= NORMAL_VERBOSE) { if (options.last == 0) { fprintf(stderr, "Using entire sequence\n"); } else { fprintf(stderr, "Limiting sequence to last %d positions.\n", options.last); } } // // Read in all sequences and score with all motifs // seq_loading_num = 0; // keeps track on the number of sequences read in total seq_counter = 0; // holds the index to the seq in the pattern unique_seqs = 0; // keeps track on the number of unique sequences need_postprocessing = false; sequence = NULL; logcumback = NULL; seq_ids = rbtree_create(rbtree_strcasecmp,rbtree_strcpy,free,rbtree_intcpy,free); while (read_one_fasta(alph, fasta_file, options.max_seq_length, &sequence)) { ++seq_loading_num; seq_name = get_seq_name(sequence); seq_len = get_seq_length(sequence); scan_len = (options.last != 0 ? options.last : seq_len); // red-black trees are only required if duplicates should be combined if (options.combine_duplicates){ //lookup seq id and create new entry if required, return sequence index seq_node = rbtree_lookup(seq_ids, get_seq_name(sequence), true, &created); if (created) { // assign it a loading number rbtree_set(seq_ids, seq_node, &unique_seqs); seq_counter = unique_seqs; ++unique_seqs; } else { seq_counter = *((int*)rbnode_get(seq_node)); } } // // Set up sequence-dependent background model and compute // log cumulative probability of sequence. // This needs the sequence in raw format. // if (options.sdbg_order >= 0) logcumback = log_cumulative_background(alph, options.sdbg_order, sequence); // Index the sequence, throwing away the raw format and ambiguous characters index_sequence(sequence, alph, SEQ_NOAMBIG); // Get the GC content of the sequence if binning p-values by GC // and store it in the sequence object. 
if (options.num_gc_bins > 1) { ARRAY_T *freqs = get_sequence_freqs(sequence, alph); set_total_gc_sequence(sequence, get_array_item(y1, freqs) + get_array_item(y2, freqs)); // f(C) + f(G) free_array(freqs); // clean up } else { set_total_gc_sequence(sequence, -1); // flag ignore } // Scan with motifs. for (i = 0; i < arraylst_size(motifs); i++) { pattern = get_cisml_patterns(cisml)[i]; combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs); if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scanning %s sequence with length %d " "abbreviated to %d with motif %s with length %d.\n", seq_name, seq_len, scan_len, get_motif_id(combo->motif), get_motif_length(combo->motif)); } SCANNED_SEQUENCE_T* scanned_seq = NULL; if (!options.combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter) { // Create a scanned_sequence record and save it in the pattern. scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern); set_scanned_sequence_length(scanned_seq, scan_len); } else { // get existing sequence record scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter]; set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq))); } // check if scanned component of sequence has sufficient length for the motif if (scan_len < get_motif_length(combo->motif)) { // set score to zero and p-value to 1 if not set yet if(!has_scanned_sequence_score(scanned_seq)){ set_scanned_sequence_score(scanned_seq, 0.0); } if(options.pvalues && !has_scanned_sequence_pvalue(scanned_seq)){ set_scanned_sequence_pvalue(scanned_seq, 1.0); } add_scanned_sequence_scanned_position(scanned_seq); if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) { need_postprocessing = true; } if (verbosity >= HIGH_VERBOSE) { fprintf(stderr, "%s too short for motif %s. 
Score set to 0.\n", seq_name, get_motif_id(combo->motif)); } } else { // scan the sequence using average/maximum motif affinity ama_sequence_scan(alph, sequence, logcumback, combo->pssm_pair, options.scoring, options.pvalues, options.last, scanned_seq, &need_postprocessing); } } // All motifs scanned free_seq(sequence); if (options.sdbg_order >= 0) myfree(logcumback); } // read sequences fclose(fasta_file); if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "(%d) sequences read in.\n", seq_loading_num); if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Finished \n"); // if any sequence identifier was multiple times in the sequence set then // postprocess of the data is required if (need_postprocessing || options.normalize_scores) { post_process(cisml, motifs, options.normalize_scores); } // output results if (options.output_format == DIRECTORY_FORMAT) { if (create_output_directory(options.out_dir, options.clobber, verbosity > QUIET_VERBOSE)) { // only warn in higher verbose modes fprintf(stderr, "failed to create output directory `%s' or already exists\n", options.out_dir); exit(1); } path = make_path_to_file(options.out_dir, text_filename); //FIXME check for errors: MEME doesn't either and we at least know we have a good directory text_output = fopen(path, "w"); free(path); path = make_path_to_file(options.out_dir, cisml_filename); //FIXME check for errors cisml_output = fopen(path, "w"); free(path); print_cisml(cisml_output, cisml, true, NULL, false); print_score(cisml, text_output); fclose(cisml_output); fclose(text_output); } else if (options.output_format == GFF_FORMAT) { print_score(cisml, stdout); } else if (options.output_format == CISML_FORMAT) { print_cisml(stdout, cisml, true, NULL, false); } else { die("Output format invalid!\n"); } // // Clean up. 
// rbtree_destroy(seq_ids); arraylst_destroy(motif_and_pssm_destroy, motifs); free_cisml(cisml); rbtree_destroy(options.selected_motifs); alph_release(alph); // measure time if (verbosity >= NORMAL_VERBOSE) { // starting time c1 = clock(); fprintf(stderr, "cycles (CPU); %ld cycles\n", (long) c1); fprintf(stderr, "elapsed CPU time: %f seconds\n", (float) (c1-c0) / CLOCKS_PER_SEC); } return 0; }
/************************************************************************* * Calculate the odds score for each motif-sized window at each * site in the sequence using the given nucleotide frequencies. * * This function is a lightweight version based on the one contained in * motiph-scoring. Several calculations that are unnecessary for gomo * have been removed in order to speed up the process. * Scores sequence with up to two motifs. *************************************************************************/ static double score_sequence( ALPH_T* alph, // alphabet (IN) SEQ_T* seq, // sequence to scan (IN) double *logcumback, // cumulative bkg probability of sequence (IN) PSSM_PAIR_T *pssm_pair, // pos and neg pssms (IN) SCORING_EN method, // method used for scoring (IN) int last, // score only last <n> or score all if <n> // is zero (IN) BOOLEAN_T* isFeasible // FLAG indicated if there is at least one position // where the motif could be matched against (OUT) ) { PSSM_T *pos_pssm, *neg_pssm, *pssm; int strands, seq_length, w, n, asize, strand, start, N_scored, s_pos, m_pos; double max_odds, sum_odds, requested_odds, odds, adjust, log_p; int8_t *isequence, *iseq; assert(pssm_pair != NULL); assert(seq != NULL); asize = alph_size_core(alph); pos_pssm = pssm_pair->pos_pssm; assert(pos_pssm != NULL); neg_pssm = pssm_pair->neg_pssm; strands = neg_pssm ? 
2 : 1; isequence = get_isequence(seq); seq_length = get_seq_length(seq); w = get_num_rows(pos_pssm->matrix); n = seq_length - w + 1; if (verbosity >= DUMP_VERBOSE) { fprintf(stderr, "Debug strands: %d seq_length: %d w: %d n: %d.\n", strands, seq_length, w, n); } // Dependent on the "last" parameter, change the starting point if (last > 0 && last < seq_length) { start = seq_length - last; N_scored = strands * (last - w + 1); // number of sites scored } else { start = 0; N_scored = strands * n; // number of sites scored } // For each motif (positive and reverse complement) max_odds = 0.0; sum_odds = 0.0; if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Starting scan at position %d .\n", start); } for (strand = 0; strand < strands; strand++) { // pos (and negative) motif pssm = (strand == 0 ? pos_pssm : neg_pssm); // choose +/- motif // For each site in the sequence for (s_pos = start; s_pos < n; s_pos++) { odds = 1.0; // For each position in the motif window for (m_pos = 0, iseq = isequence+s_pos; m_pos < w; m_pos++, iseq++) { if (*iseq == -1) { N_scored--; odds = 0; break; } // multiple odds by value in appropriate motif cell odds *= get_matrix_cell(m_pos, *iseq, pssm->matrix); } // Apply sequence-dependent background model. if (logcumback) { log_p = logcumback[s_pos+w] - logcumback[s_pos]; // log Pr(x | background) //printf("log_p:: %g motif_pos %d\n", log_p, m_pos); adjust = exp(w*log(1/4.0) - log_p); // Pr(x | uniform) / Pr(x | background) odds *= adjust; } // Add odds to growing sum. sum_odds += odds; // sum of odds if (odds > max_odds) max_odds = odds; // max of odds } // site } // strand if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scored %d positions with the sum odds %f and the " "max odds %f.\n", N_scored, sum_odds, max_odds); } // has there been anything matched at all? 
if (N_scored == 0) { if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr,"Sequence \'%s\' offers no location to match " "the motif against (sequence length too short?)\n", get_seq_name(seq)); } *isFeasible = false; return 0.0; // return odds as requested (MAX or AVG scoring) } else if (method == AVG_ODDS) { return sum_odds / N_scored; // mean } else if (method == MAX_ODDS) { return max_odds; // maximum } else if (method == SUM_ODDS) { return sum_odds; // sum } else { die("Unknown scoring method"); // should not get here... but the compiler will complain if I don't handle this case *isFeasible = false; return 0.0; } } // score_sequence
/************************************************************************* * Entry point for ama *************************************************************************/ int main(int argc, char *argv[]) { int max_seq_length = MAX_SEQ; STRING_LIST_T* selected_motifs = NULL; double pseudocount = 0.01; int output_format = CISML_FORMAT; program_name = "ama"; int scoring = AVG_ODDS; BOOLEAN_T pvalues = FALSE; BOOLEAN_T normalize_scores = FALSE; BOOLEAN_T combine_duplicates = FALSE; int num_gc_bins = 1; int sdbg_order = -1; // don't use sequence background BOOLEAN_T scan_both_strands = TRUE; ARRAY_T* pos_bg_freqs = NULL; ARRAY_T* rev_bg_freqs = NULL; clock_t c0, c1; /* measuring cpu_time */ CISML_T *cisml; char * out_dir = NULL; BOOLEAN_T clobber = FALSE; int i; int last = 0; ALPH_T alph = INVALID_ALPH; /********************************************** * COMMAND LINE PROCESSING **********************************************/ const int num_options = 16; cmdoption const motif_scan_options[] = { { "max-seq-length", REQUIRED_VALUE }, { "motif", REQUIRED_VALUE }, { "motif-pseudo", REQUIRED_VALUE }, { "rma", NO_VALUE }, { "pvalues", NO_VALUE }, { "sdbg", REQUIRED_VALUE }, { "norc", NO_VALUE }, { "cs", NO_VALUE }, { "o-format", REQUIRED_VALUE }, { "o", REQUIRED_VALUE }, { "oc", REQUIRED_VALUE }, { "scoring", REQUIRED_VALUE }, { "verbosity", REQUIRED_VALUE }, { "gcbins", REQUIRED_VALUE }, { "last", REQUIRED_VALUE }, { "version", NO_VALUE } }; int option_index = 0; // Define the usage message. 
char usage[] = "USAGE: ama [options] <motif file> <sequence file> [<background file>]\n" "\n" " Options:\n" " --sdbg <order>\t\t\tUse Markov background model of\n" " \t\t\t\t\torder <order> derived from the sequence\n" " \t\t\t\t\tto compute its likelihood ratios.\n" " \t\t\t\t\tOverrides --pvalues, --gcbins and --rma;\n" " \t\t\t\t\t<background file> is required unless\n" " \t\t\t\t\t--sdbg is given.\n" " --motif <id>\t\t\tUse only the motif identified by <id>.\n" " \t\t\t\t\tThis option may be repeated.\n" " --motif-pseudo <float>\t\tThe value <float> times the background\n" " \t\t\t\t\tfrequency is added to the count of each\n" " \t\t\t\t\tletter when creating the likelihood \n" " \t\t\t\t\tratio matrix (default: %g).\n" " --norc\t\t\t\tDisables the scanning of the reverse\n" " \t\t\t\t\tcomplement strand.\n" " --scoring [avg-odds|max-odds]\tIndicates whether the average or \n" " \t\t\t\t\tthe maximum odds should be calculated\n" " \t\t\t\t\t(default: avg-odds)\n" " --rma\t\t\t\tScale motif scores to the range 0-1.\n" " \t\t\t\t\t(Relative Motif Affinity).\n" " \t\t\t\t\tMotif scores are scaled by the maximum\n" " \t\t\t\t\tscore achievable by that PWM. (default:\n" " \t\t\t\t\tmotif scores are not normalized)\n" " --pvalues\t\t\t\tPrint p-value of avg-odds score in cisml\n" " \t\t\t\t\toutput. Ignored for max-odds scoring.\n" " \t\t\t\t\t(default: p-values are not printed)\n" " --gcbins <bins>\t\t\tCompensate p-values for GC content of\n" " \t\t\t\t\teach sequence using given number of \n" " \t\t\t\t\tGC range bins. 
Recommended bins: 41.\n" " \t\t\t\t\t(default: p-values are based on\n" " \t\t\t\t\tfrequencies in background file)\n" " --cs\t\t\t\tEnable combining sequences with same\n" " \t\t\t\t\tidentifier by taking the average score\n" " \t\t\t\t\tand the Sidac corrected p-value.\n" " --o-format [gff|cisml]\t\tOutput file format (default: cisml)\n" " \t\t\t\t\tignored if --o or --oc option used\n" " --o <directory>\t\t\tOutput all available formats to\n" " \t\t\t\t\t<directory>; give up if <directory>\n" " \t\t\t\t\texists\n" " --oc <directory>\t\t\tOutput all available formats to\n" " \t\t\t\t\t<directory>; if <directory> exists\n" " \t\t\t\t\toverwrite contents\n" " --verbosity [1|2|3|4]\t\tControls amount of screen output\n" " \t\t\t\t\t(default: %d)\n" " --max-seq-length <int>\t\tSet the maximum length allowed for \n" " \t\t\t\t\tinput sequences. (default: %d)\n" " --last <int>\t\t\tUse only scores of (up to) last <n>\n" " \t\t\t\t\tsequence positions to compute AMA.\n" " --version \t\t\tPrint version and exit.\n" "\n"; // Parse the command line. if (simple_setopt(argc, argv, num_options, motif_scan_options) != NO_ERROR) { die("Error processing command line options: option name too long.\n"); } BOOLEAN_T setoutputformat = FALSE; BOOLEAN_T setoutputdirectory = FALSE; while (TRUE) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. 
c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { (void) simple_getopterror(&message); die("Error processing command line options (%s).\n", message); } else if (strcmp(option_name, "max-seq-length") == 0) { max_seq_length = atoi(option_value); } else if (strcmp(option_name, "norc") == 0) { scan_both_strands = FALSE; } else if (strcmp(option_name, "cs") == 0) { combine_duplicates = TRUE; } else if (strcmp(option_name, "motif") == 0) { if (selected_motifs == NULL) { selected_motifs = new_string_list(); } add_string(option_value, selected_motifs); } else if (strcmp(option_name, "motif-pseudo") == 0) { pseudocount = atof(option_value); } else if (strcmp(option_name, "o-format") == 0) { if (setoutputdirectory) { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "output directory specified, ignoring --o-format\n"); } else { setoutputformat = TRUE; if (strcmp(option_value, "gff") == 0) output_format = GFF_FORMAT; else if (strcmp(option_value, "cisml") == 0) output_format = CISML_FORMAT; else { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Output format not known. 
Using standard instead (cisML).\n"); output_format = CISML_FORMAT; } } } else if (strcmp(option_name, "o") == 0 || strcmp(option_name, "oc") == 0) { setoutputdirectory = TRUE; if (setoutputformat) { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "output directory specified, ignoring --o-format\n"); } clobber = strcmp(option_name, "oc") == 0; out_dir = (char*) malloc (sizeof(char)*(strlen(option_value)+1)); strcpy(out_dir, option_value); output_format = DIRECTORY_FORMAT; } else if (strcmp(option_name, "verbosity") == 0) { verbosity = atoi(option_value); } else if (strcmp(option_name, "scoring") == 0) { if (strcmp(option_value, "max-odds") == 0) scoring = MAX_ODDS; else if (strcmp(option_value, "avg-odds") == 0) scoring = AVG_ODDS; else if (strcmp(option_value, "sum-odds") == 0) scoring = SUM_ODDS; else die("Specified scoring scheme not known.\n", message); } else if (strcmp(option_name, "pvalues") == 0) { pvalues = TRUE; } else if (strcmp(option_name, "rma") == 0) { normalize_scores = TRUE; fprintf(stderr, "Normalizing motif scores using RMA method.\n"); } else if (strcmp(option_name, "gcbins") == 0) { num_gc_bins = atoi(option_value); pvalues = TRUE; if (num_gc_bins <= 1) die("Number of bins in --gcbins must be greater than 1.\n", message); } else if (strcmp(option_name, "sdbg") == 0) { sdbg_order = atoi(option_value); // >=0 means use sequence bkg } else if (strcmp(option_name, "last") == 0) { int i = 0; if (option_value[0] == '-') ++i; while (option_value[i] != '\0') { if (!isdigit(option_value[i])) { die("Specified parameter 'last' contains non-numeric characters.\n"); } ++i; } last = atoi(option_value); if (errno != 0) { die("Specified parameter 'last' could not be parsed as a number as:\n%s\n",strerror(errno)); } if (last < 0) { die("Specified parameter 'last' had negative value (%d) when only postive or zero values are allowed \n", last); } } else if (strcmp(option_name, "version") == 0) { fprintf(stdout, VERSION "\n"); exit(EXIT_SUCCESS); } } // --sdbg 
overrides --pvalues and --gcbins and --rma int req_args = 3; if (sdbg_order >= 0) { pvalues = FALSE; normalize_scores = FALSE; num_gc_bins = 1; req_args = 2; } // Check all required arguments given if (sdbg_order >= 0 && argc > option_index + req_args) { die("<background file> cannot be given together with --sdbg.\n"); } else if (argc != option_index + req_args) { fprintf(stderr, usage, pseudocount, verbosity, max_seq_length); exit(EXIT_FAILURE); } // Get required arguments. char* motif_filename = argv[option_index]; option_index++; char* fasta_filename = argv[option_index]; option_index++; char* bg_filename; if (req_args == 3) { // required unless --sdbg given bg_filename = argv[option_index]; option_index++; } else { bg_filename = "--uniform--"; // So PSSMs will use uniform background; // we can multiply them out later. } // measure time c0 = clock(); // Set up hash tables for computing reverse complement if doing --sdbg if (sdbg_order >= 0) setup_hash_alph(DNAB); // Create cisml data structure for recording results cisml = allocate_cisml(program_name, motif_filename, fasta_filename); set_cisml_background_file(cisml, bg_filename); /********************************************** * Read the motifs and background model. 
**********************************************/ int num_motifs = 0; MREAD_T *mread; ARRAYLST_T *motifs; PSSM_PAIR_T** pssm_pairs; // note pssm_pairs is an array of pointers //this reads any meme file, xml, txt and html mread = mread_create(motif_filename, OPEN_MFILE); mread_set_bg_source(mread, bg_filename); mread_set_pseudocount(mread, pseudocount); motifs = mread_load(mread, NULL); alph = mread_get_alphabet(mread); pos_bg_freqs = mread_get_background(mread); mread_destroy(mread); num_motifs = arraylst_size(motifs); // allocate memory for PSSM pairs pssm_pairs = (PSSM_PAIR_T**)mm_malloc(sizeof(PSSM_PAIR_T*) * num_motifs); if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Number of motifs in file %d.\n", num_motifs); // make a CISML pattern to hold scores for each motif PATTERN_T** patterns = NULL; Resize(patterns, num_motifs, PATTERN_T*); int motif_index; for (motif_index = 0; motif_index < num_motifs; motif_index++) { MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs); patterns[motif_index] = allocate_pattern(get_motif_id(motif), ""); add_cisml_pattern(cisml, patterns[motif_index]); } // make reverse complement motifs and background frequencies. if (scan_both_strands == TRUE) { add_reverse_complements(motifs); assert(arraylst_size(motifs) == (2 * num_motifs)); rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs)); complement_dna_freqs(pos_bg_freqs, rev_bg_freqs); } /************************************************************** * Convert motif matrices into log-odds matrices. * Scale them. * Compute the lookup tables for the PDF of scaled log-odds scores. **************************************************************/ int ns = scan_both_strands ? 
2 : 1; // number of strands for (motif_index = 0; motif_index < num_motifs; motif_index++) { MOTIF_T *motif, *motif_rc; motif = (MOTIF_T*)arraylst_get(motif_index*ns, motifs); if (scan_both_strands) motif_rc = (MOTIF_T*)arraylst_get(motif_index*ns + 1, motifs); else motif_rc = NULL; /* * Note: If scanning both strands, we complement the motif frequencies * but not the background frequencies so the motif looks the same. * However, the given frequencies are used in computing the p-values * since they represent the frequencies on the negative strands. * (If we instead were to complement the input sequence, keeping the * the motif fixed, we would need to use the complemented frequencies * in computing the p-values. Is that any clearer?) */ double range = 300; // 100 is not very good; 1000 is great but too slow PSSM_T* pos_pssm = build_motif_pssm( motif, pos_bg_freqs, pos_bg_freqs, NULL, // Priors not used 0.0L, // alpha not used range, num_gc_bins, TRUE ); PSSM_T* neg_pssm = (scan_both_strands ? build_motif_pssm( motif_rc, rev_bg_freqs, pos_bg_freqs, NULL, // Priors not used 0.0L, // alpha not used range, num_gc_bins, TRUE ) : NULL ); pssm_pairs[motif_index] = create_pssm_pair(pos_pssm, neg_pssm); } // Open the FASTA file for reading. 
FILE* fasta_file = NULL; if (open_file(fasta_filename, "r", FALSE, "FASTA", "sequences", &fasta_file) == 0) { die("Couldn't open the file %s.\n", fasta_filename); } if (verbosity >= NORMAL_VERBOSE) { if (last == 0) { fprintf(stderr, "Using entire sequence\n"); } else { fprintf(stderr, "Limiting sequence to last %d positions.\n", last); } } /************************************************************** * Read in all sequences and score with all motifs **************************************************************/ int seq_loading_num = 0; // keeps track on the number of sequences read in total int seq_counter = 0; // holds the index to the seq in the pattern int unique_seqs = 0; // keeps track on the number of unique sequences BOOLEAN_T need_postprocessing = FALSE; SEQ_T* sequence = NULL; RBTREE_T* seq_ids = rbtree_create(rbtree_strcasecmp,NULL,free,rbtree_intcpy,free); RBNODE_T* seq_node; BOOLEAN_T created; while (read_one_fasta(alph, fasta_file, max_seq_length, &sequence)) { ++seq_loading_num; created = FALSE; char* seq_name = get_seq_name(sequence); int seq_len = get_seq_length(sequence); int scan_len; if (last != 0) { scan_len = last; } else { scan_len = seq_len; } // red-black trees are only required if duplicates should be combined if (combine_duplicates){ //lookup seq id and create new entry if required, return sequence index char *tmp_id = mm_malloc(strlen(seq_name)+1); // required copy for rb-tree strncpy(tmp_id,seq_name,strlen(seq_name)+1); seq_node = rbtree_lookup(seq_ids, tmp_id, TRUE, &created); if (created) {// assign it a loading number rbtree_set(seq_ids, seq_node, &unique_seqs); seq_counter = unique_seqs; ++unique_seqs; } else { seq_counter = *((int*)rbnode_get(seq_node)); } } // // Set up sequence-dependent background model and compute // log cumulative probability of sequence. // double *logcumback = NULL; // array of log cumulative probs. 
if (sdbg_order >= 0) { Resize(logcumback, seq_len+1, double); char* raw_seq = get_raw_sequence(sequence); BOOLEAN rc = FALSE; double *a_cp = get_markov_from_sequence(raw_seq, alph_string(alph), rc, sdbg_order, 0); log_cum_back(raw_seq, a_cp, sdbg_order, logcumback); myfree(a_cp); } // Get the GC content of the sequence if binning p-values by GC // and store it in the sequence object. if (num_gc_bins > 1) { ARRAY_T *freqs = get_sequence_freqs(sequence, alph); set_total_gc_sequence(sequence, get_array_item(1,freqs) + get_array_item(2,freqs)); // f(C) + f(G) free_array(freqs); // clean up } else { set_total_gc_sequence(sequence, -1); // flag ignore } /************************************************************** * Process all motifs. **************************************************************/ int ns = scan_both_strands ? 2 : 1; for (motif_index = 0; motif_index < num_motifs; motif_index++) { PATTERN_T *pattern = patterns[motif_index]; MOTIF_T* motif = (MOTIF_T*)arraylst_get(ns*motif_index, motifs); char* motif_id = (scan_both_strands ? get_motif_st_id(motif) : get_motif_id(motif)); if (verbosity >= HIGH_VERBOSE) { fprintf(stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif)); } if ((selected_motifs == NULL) || (have_string(get_motif_id(motif), selected_motifs) == TRUE)) { if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scanning %s sequence with length %d " "abbreviated to %d with motif %s with length %d.\n", seq_name, seq_len, scan_len, motif_id, get_motif_length(motif)); } SCANNED_SEQUENCE_T* scanned_seq = NULL; if (!combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter){ // Create a scanned_sequence record and save it in the pattern. 
scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern); set_scanned_sequence_length(scanned_seq, scan_len); } else { // get existing sequence record scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter]; set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq))); } // check if scanned component of sequence has sufficient length for the motif if (scan_len < get_motif_length(motif)) { // set score to zero and p-value to 1 if not set yet if(!has_scanned_sequence_score(scanned_seq)){ set_scanned_sequence_score(scanned_seq, 0.0); } if(pvalues && !has_scanned_sequence_pvalue(scanned_seq)){ set_scanned_sequence_pvalue(scanned_seq, 1.0); } add_scanned_sequence_scanned_position(scanned_seq); if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) need_postprocessing = TRUE; if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "%s too short for motif %s. Score set to 0!\n", seq_name, motif_id); } else { // scan the sequence using average/maximum motif affinity ama_sequence_scan(alph, sequence, logcumback, pssm_pairs[motif_index], scoring, pvalues, last, scanned_seq, &need_postprocessing); } } else { if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "Skipping motif %s.\n", motif_id); } } // All motifs parsed free_seq(sequence); if (sdbg_order >= 0) myfree(logcumback); } // read sequences
void ramen_scan_sequences() { FILE* seq_file = NULL; MOTIF_T* motif = NULL; MOTIF_T* rev_motif = NULL; SEQ_T* sequence = NULL; SCANNED_SEQUENCE_T* scanned_seq = NULL; PATTERN_T* pattern; int i; int j; SEQ_T** seq_list; int num_seqs; int seq_len; //For the bdb_bg mode: ARRAY_T* seq_bg_freqs; double atcontent; double roundatcontent; double avg_seq_length = 0; //Open the file. if (open_file(args.sequence_filename, "r", FALSE, "FASTA", "sequences", &seq_file) == 0) { fprintf(stderr, "Couldn't open the file %s.\n", args.sequence_filename); ramen_terminate(1); } //Start reading in the sequences read_many_fastas(ramen_alph, seq_file, MAX_SEQ_LENGTH, &num_seqs, &seq_list); seq_ids = new_string_list(); seq_fscores = allocate_array(num_seqs); //Allocate the required space for results results = malloc(sizeof(double*) * motifs.num); for (i=0;i<motifs.num;i++) { results[i] = malloc(sizeof(double)*num_seqs); } for (j=0;j<num_seqs;j++) { fprintf(stderr, "\rScanning %i of %i sequences...", j+1, num_seqs); //copy the pointer into our current object for clarity sequence = seq_list[j]; //Read the fluorescence data from the description field. add_string(get_seq_name(sequence),seq_ids); seq_len = get_seq_length(sequence); set_array_item(j,atof(get_seq_description(sequence)),seq_fscores); //Scan with each motif. for (i=0;i<motifs.num;i++) { int motifindex = i*2; results[i][j] = ramen_sequence_scan(sequence, motif_at(motifs.motifs, motifindex), motif_at(motifs.motifs, motifindex+1), NULL, NULL, //No need to pass PSSM. 
AVG_ODDS, 0, TRUE, 0, motifs.bg_freqs); if (TRUE == args.linreg_normalise) { int k; double maxscore = 1; motif = motif_at(motifs.motifs,motifindex); for (k=0;k<get_motif_length(motif);k++) { double maxprob = 0; if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif)); maxscore *= maxprob; } results[i][j] /= maxscore; } } } }
/************************************************************************* * Calculate the odds score for each motif-sized window at each * site in the sequence using the given nucleotide frequencies. * * This function is a lightweight version based on the one contained in * motiph-scoring. Several calculations that are unnecessary for gomo * have been removed in order to speed up the process. * Scores sequence with up to two motifs. *************************************************************************/ double score_sequence( SEQ_T* seq, // sequence to scan (IN) double *logcumback, // cumulative bkg probability of sequence (IN) PSSM_PAIR_T* pssm_pair, // pos and neg pssms (IN) int method, // method used for scoring (IN) int last, //score only last <n> or //score all if <n> is zero (IN) BOOLEAN_T* isFeasible // FLAG indicated if there is at least one position // where the motif could be matched against (OUT) ) { assert(pssm_pair != NULL); assert(seq != NULL); PSSM_T* pos_pssm = pssm_pair->pos_pssm; assert(pos_pssm != NULL); PSSM_T* neg_pssm = pssm_pair->neg_pssm; int n_motifs = neg_pssm ? 
2 : 1; char* raw_seq = get_raw_sequence(seq); int seq_length = get_seq_length(seq); int w = get_num_rows(pos_pssm->matrix); int n = seq_length - w + 1; if (verbosity >= DUMP_VERBOSE) { fprintf(stderr, "Debug n_motifs: %d seq_length: %d w: %d n: %d.\n", n_motifs, seq_length, w, n); } // Get alphabet; char* alphabet = get_alphabet(FALSE); int alph_size = get_alph_size(ALPH_SIZE); // Dependent on the "last" parameter, change the starting point int start; int N_scored; if (last > 0 && last < seq_length) { start = seq_length - last; N_scored = n_motifs * (last - w + 1); // number of sites scored } else { start = 0; N_scored = n_motifs * n; // number of sites scored } // For each motif (positive and reverse complement) double max_odds = 0.0; double sum_odds = 0.0; double requested_odds = 0.0; int i; if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Starting scan at position %d .\n", start); } for (i=0; i<n_motifs; i++) { // pos (and negative) motif PSSM_T* pssm = (i==0 ? pos_pssm : neg_pssm); // choose +/- motif // For each site in the sequence int seq_index; for (seq_index = start; seq_index < n; seq_index++) { // site double odds = 1.0; // For each position in the motif window int motif_position; for (motif_position = 0; motif_position < w; motif_position++) { // column int i_site = seq_index + motif_position; char c = raw_seq[i_site]; // Check for gaps at this site if (c == '-' || c == '.') { N_scored--; odds = 0; break; } // Check for ambiguity codes at this site int alph_index = alphabet_index(c, alphabet); if (alph_index >= alph_size || alph_index < 0) { N_scored--; odds = 0; break; } // multiple odds by value in appropriate motif cell odds *= get_matrix_cell(motif_position, alph_index, pssm->matrix); } // column // // Apply sequence-dependent background model. 
// if (logcumback) { int i_site = seq_index; double log_p = logcumback[i_site+w] - logcumback[i_site]; // log Pr(x | background) //printf("log_p:: %g motif_pos %d\n", log_p, motif_position); double adjust = exp(w*log(1/4.0) - log_p); // Pr(x | uniform) / Pr(x | background) odds *= adjust; } // Add odds to growing sum. sum_odds += odds; // sum of odds if (odds > max_odds) max_odds = odds; // max of odds } // site } // motif if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scored %d positions with the sum odds %f and the max odds %f.\n", N_scored, sum_odds, max_odds); } // has there been anything matched at all? if (N_scored == 0){ if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr,"Sequence \'%s\' offers no location to match the motif against (sequence length too short?)\n",get_seq_name(seq)); } *isFeasible = FALSE; return 0.0; // return odds as requested (MAX or AVG scoring) } else if (method == AVG_ODDS) { requested_odds = sum_odds / N_scored; // mean } else if (method == MAX_ODDS) { requested_odds = max_odds; // maximum } else if (method == SUM_ODDS) { requested_odds = sum_odds ; // sum } return(requested_odds); } // score_sequence
/************************************************************************* * Entry point for centrimo *************************************************************************/ int main(int argc, char *argv[]) { CENTRIMO_OPTIONS_T options; SEQ_SITES_T seq_sites; SITE_COUNTS_T counts; int seqN, motifN, seqlen, db_i, motif_i, i; double log_pvalue_thresh; SEQ_T** sequences = NULL; ARRAY_T* bg_freqs = NULL; ARRAYLST_T *stats_list; MOTIF_DB_T **dbs, *db; MREAD_T *mread; MOTIF_STATS_T *stats; MOTIF_T *motif, *rev_motif; PSSM_T *pos_pssm, *rev_pssm; char *sites_path, *desc; FILE *sites_file; HTMLWR_T *html; JSONWR_T *json; // COMMAND LINE PROCESSING process_command_line(argc, argv, &options); // load the sequences read_sequences(options.alphabet, options.seq_source, &sequences, &seqN); seqlen = (seqN ? get_seq_length(sequences[0]) : 0); // calculate a sequence background (unless other background is given) if (!options.bg_source) { bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences); } // load the motifs motifN = 0; dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources)); for (i = 0; i < arraylst_size(options.motif_sources); i++) { char* db_source; db_source = (char*)arraylst_get(i, options.motif_sources); dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, options.pseudocount, options.selected_motifs, options.alphabet); motifN += arraylst_size(dbs[i]->motifs); } log_pvalue_thresh = log(options.evalue_thresh) - log(motifN); // Setup some things for double strand scanning if (options.scan_both_strands == TRUE) { // Set up hash tables for computing reverse complement setup_hash_alph(DNAB); setalph(0); // Correct background by averaging on freq. for both strands. 
average_freq_with_complement(options.alphabet, bg_freqs); normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs); calc_ambigs(options.alphabet, FALSE, bg_freqs); } // Create output directory if (create_output_directory(options.output_dirname, options.allow_clobber, (verbosity >= NORMAL_VERBOSE))) { die("Couldn't create output directory %s.\n", options.output_dirname); } // open output files sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME); sites_file = fopen(sites_path, "w"); free(sites_path); // setup html monolith writer json = NULL; if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) { htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME); htmlwr_replace(html, "centrimo_data.js", "data"); json = htmlwr_output(html); if (json == NULL) die("Template does not contain data section.\n"); } else { DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n"); } if (json) { // output some top level variables jsonwr_str_prop(json, "version", VERSION); jsonwr_str_prop(json, "revision", REVISION); jsonwr_str_prop(json, "release", ARCHIVE_DATE); jsonwr_str_array_prop(json, "cmd", argv, argc); jsonwr_property(json, "options"); jsonwr_start_object_value(json); jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount); jsonwr_dbl_prop(json, "score", options.score_thresh); jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh); jsonwr_lng_prop(json, "maxbin", options.max_window+1); jsonwr_bool_prop(json, "norc", !options.scan_both_strands); jsonwr_bool_prop(json, "noflip", options.no_flip); jsonwr_end_object_value(json); // output the description desc = prepare_description(&options); if (desc) { jsonwr_str_prop(json, "job_description", desc); free(desc); } // output size metrics jsonwr_lng_prop(json, "seqlen", seqlen); jsonwr_lng_prop(json, "tested", motifN); // output the fasta db jsonwr_property(json, "sequence_db"); jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", 
options.seq_source); jsonwr_lng_prop(json, "count", seqN); jsonwr_end_object_value(json); // output the motif dbs jsonwr_property(json, "motif_dbs"); jsonwr_start_array_value(json); for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", db->source); jsonwr_lng_prop(json, "count", arraylst_size(db->motifs)); jsonwr_end_object_value(json); } jsonwr_end_array_value(json); // start the motif array jsonwr_property(json, "motifs"); jsonwr_start_array_value(json); } /************************************************************** * Tally the positions of the best sites for each of the * selected motifs. **************************************************************/ // prepare the sequence sites memset(&seq_sites, 0, sizeof(SEQ_SITES_T)); // prepare the site counts counts.allocated = ((2 * seqlen) - 1); counts.sites = mm_malloc(sizeof(double) * counts.allocated); // prepare the motifs stats list stats_list = arraylst_create(); // prepare the other vars motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL; for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) { motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs); DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n", get_motif_id(motif), get_motif_length(motif)); // reset the counts for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0; counts.total_sites = 0; // create the pssm pos_pssm = make_pssm(bg_freqs, motif); // If required, do the same for the reverse complement motif. 
if (options.scan_both_strands) { rev_motif = dup_rc_motif(motif); rev_pssm = make_pssm(bg_freqs, rev_motif); } // scan the sequences for (i = 0; i < seqN; i++) score_sequence(&options, sequences[i], pos_pssm, rev_pssm, &seq_sites, &counts); // DEBUG check that the sum of the sites is close to the site count double sum_check = 0, sum_diff; for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i]; sum_diff = counts.total_sites - sum_check; if (sum_diff < 0) sum_diff = -sum_diff; if (sum_diff > 0.1) { fprintf(stderr, "Warning: site counts don't sum to accurate value! " "%g != %ld", sum_check, counts.total_sites); } // output the plain text site counts output_site_counts(sites_file, seqlen, db, motif, &counts); // compute the best central window stats = compute_stats(options.max_window, seqlen, db, motif, &counts); // check if it passes the threshold if (json && stats->log_adj_pvalue <= log_pvalue_thresh) { output_motif_json(json, stats, &counts); arraylst_add(stats, stats_list); } else { free(stats); } // Free memory associated with this motif. free_pssm(pos_pssm); free_pssm(rev_pssm); destroy_motif(rev_motif); } } if (json) jsonwr_end_array_value(json); // finish writing sites fclose(sites_file); // finish writing html file if (html) { if (htmlwr_output(html) != NULL) { die("Found another JSON replacement!\n"); } htmlwr_destroy(html); } // write text file output_centrimo_text(&options, motifN, stats_list); // Clean up. for (i = 0; i < seqN; ++i) { free_seq(sequences[i]); } free(sequences); for (i = 0; i < arraylst_size(options.motif_sources); i++) { free_db(dbs[i]); } free(dbs); free_array(bg_freqs); free(counts.sites); free(seq_sites.sites); arraylst_destroy(free, stats_list); cleanup_options(&options); return 0; }
/************************************************************************* * Calculate the log-odds score for each possible motif site in the * sequence and record the sites of the best. Apply a count to each * best site and increment the total site count. *************************************************************************/ static void score_sequence( CENTRIMO_OPTIONS_T *options, SEQ_T* sequence, PSSM_T* pssm, PSSM_T* rev_pssm, SEQ_SITES_T* seq_sites, SITE_COUNTS_T* counts ) { char *raw_seq, *seg; int i, L, w, pos; double score; double count; SEQ_SITE_T *site; // check we got passed stuff assert(options != NULL); assert(sequence != NULL); assert(pssm != NULL); assert(seq_sites != NULL); assert(counts != NULL); // make Mac OS compiler happy. score = -BIG; // Score and record each possible motif site in the sequence raw_seq = get_raw_sequence(sequence); L = get_seq_length(sequence); w = pssm->w; // Reset the sequence stats structure seq_sites->best = -BIG; seq_sites->used = 0; // Read and score each position in the sequence. for (i = 0; i < L - w + 1; i++) { seg = raw_seq+i; // Score and record forward strand if (score_motif_site(options->alphabet, seg, pssm, &score)) track_site(seq_sites, score, i, '+'); // Score and record reverse strand if appropriate. if (rev_pssm && score_motif_site(options->alphabet, seg, rev_pssm, &score)) track_site(seq_sites, score, i, '-'); } // Record the position of best site, averaging ties // and using position in RC of sequence if site on reverse strand // unless no_flip is true. 
if (seq_sites->used && seq_sites->best >= options->score_thresh) { // add 1/n_ties to each tied position's count, // averaging rather than random choice count = (double)1.0 / (double)seq_sites->used; for (i = 0; i < seq_sites->used; i++) { site = seq_sites->sites+i; if (options->no_flip || site->strand == '+') { //pos = 2 * (site->start + w/2 - 1/2); // a motif of width 1 can have sites at the first index pos = 2 * site->start + w - 1; // a motif of width 1 can have sites at the first index } else { //pos = 2 * (L - (site->start + w/2) - 1; // a motif of width 1 can have sites at the first index pos = 2 * (L - site->start) - w - 1; } //record the count counts->sites[pos] += count; } counts->total_sites++; } }