/**********************************************************************
  shuffle_sequence()

  Builds a new sequence in *target that holds the characters of seq in
  a random order. The new sequence takes its name and offset from seq
  and carries the description "shuffled".

  *target is expected to be NULL on entry (asserted); when assertions
  are compiled out, a non-NULL *target is released before being
  replaced.

  NOTE(review): the generator is seeded through my_srand() but the
  draws come from rand(); confirm that my_srand() seeds the generator
  that rand() reads.
**********************************************************************/
void shuffle_sequence(
  SEQ_T* seq,         /* original sequence IN */
  unsigned int seed,  /* seed IN */
  SEQ_T** target      /* target sequence OUT */
){
  my_srand(seed);
  assert(*target==NULL);
  // Only reachable when assertions are disabled: drop the stale target.
  if (*target != NULL){
    free_seq(*target);
  }
  *target = allocate_seq(get_seq_name(seq), "shuffled", get_seq_offset(seq),
      get_raw_sequence(seq));
  char *shuffled = get_raw_sequence(*target);

  /* Pool of characters that have not been drawn yet. */
  char *pool = (char*)mm_calloc(get_seq_length(seq)+1, sizeof(char));
  strcpy(pool, get_raw_sequence(seq));
  pool[get_seq_length(seq)] = '\0';

  int out = 0;
  int remaining;
  for (remaining = get_seq_length(seq); remaining > 0; remaining--) {
    // Draw one of the characters still in the pool.
    int pick = rand() % remaining;
    shuffled[out++] = pool[pick];

    // Close the gap left by the drawn character: slide the tail of the
    // pool (terminator included) one position to the left.
    char *dst = pool + pick;
    char *src = dst + 1;
    while (*dst) {
      *dst++ = *src++;
    }
  }
  myfree(pool);
}
/**
 * Creates a motif for a given mod using a simple frequency matrix.
 *
 * The count matrix of every sequence in mod_info->seq_list is normalized
 * row by row into frequencies, wrapped in a motif named after the mod, and
 * appended to mod_info->motifinfos together with the raw sequences that
 * contributed to it.
 *
 * summary   supplies the alphabet and is forwarded to get_count_matrix()
 * options   supplies eliminate_repeats, which decides whether list entries
 *           are hash table entries wrapping a sequence or sequences directly
 * mod_info  source of the sequences; receives the new motif info
 */
void create_simple_motif(SUMMARY_T* summary, MOMO_OPTIONS_T* options, MOD_INFO_T * mod_info) {
  int i;

  // Create the frequency matrix
  MATRIX_T* freqs = get_count_matrix(NULL, mod_info->seq_list, NULL, options, summary);
  normalize_rows(0.0, freqs);

  // Create the motif
  MOTIF_INFO_T* motifinfo = mm_malloc(sizeof(MOTIF_INFO_T));
  motifinfo->motif = allocate_motif(mod_info->mod_name, "", summary->alph, freqs, NULL);
  motifinfo->seqs = arraylst_create();

  // The block comes from mm_malloc, so fields this algorithm does not
  // compute are zeroed rather than left indeterminate for later readers.
  motifinfo->score = 0;
  motifinfo->fg_match = 0;
  motifinfo->bg_match = 0;
  motifinfo->bg_size = 0;

  const int num_seqs = arraylst_size(mod_info->seq_list);
  for (i = 0; i < num_seqs; ++i) {
    // With eliminate_repeats the list holds hash table entries whose value
    // is the sequence; otherwise it holds the sequences themselves.
    SEQ_T* seqobject = options->eliminate_repeats
      ? hash_get_entry_value(arraylst_get(i, mod_info->seq_list))
      : arraylst_get(i, mod_info->seq_list);
    arraylst_add(get_raw_sequence(seqobject), motifinfo->seqs);
  }
  motifinfo->fg_size = num_seqs;
  arraylst_add(motifinfo, mod_info->motifinfos);

  // clean up: the matrix is released here after allocate_motif() has used it
  free_matrix(freqs);
}
/****************************************************************************
 * Create a lookup table for converting an index into a sequence to an index
 * into the alignment. Note that because there are many alignment positions
 * that correspond to a sequence position we take the first occurrence.
 * JCH: I have added this function for the sake of the BLS scan mode
 * so that single mode matches in each sequence can be mapped back
 * to positions in the alignment.
 *
 * Layout of the returned table:
 *   table[0]          is always 0 (placeholder entry),
 *   table[k], k >= 1  is the alignment column of the k-th character of the
 *                     reference sequence that is neither '-' nor '.'.
 * Entries beyond the number of such characters are left uninitialized.
 *
 * The table has (alignment length + 1) entries and is allocated with
 * mm_malloc(); the caller owns it.
 ****************************************************************************/
int* make_seq_to_alignment_table(int ref_seq_index, ALIGNMENT_T* an_alignment) {
  int align_length = get_alignment_length(an_alignment);
  SEQ_T* seq = get_alignment_sequence(ref_seq_index, an_alignment);
  char* raw_seq = get_raw_sequence(seq);

  // Entries are stored 1-based, so a reference sequence without any gaps
  // fills indices 1..align_length. One extra slot is therefore required;
  // allocating only align_length entries let the last store run past the
  // end of the block (and left no room for table[0] in an empty alignment).
  int *table = (int *) mm_malloc((align_length + 1) * sizeof(int));

  int seq_index = 0;
  table[seq_index] = 0;

  int align_index;
  for (align_index = 0; align_index < align_length; align_index++) {
    if (raw_seq[align_index] != '-' && raw_seq[align_index] != '.') {
      seq_index++;
      table[seq_index] = align_index;
    }
  }
  return table;
}
/*
 * Writes an alignment to outfile in blocks of OUTPUT_WIDTH columns.
 *
 * The first output line holds the number of sequences and the alignment
 * length. In the first block every row starts with the sequence name,
 * truncated or padded to 10 characters; rows of later blocks start with a
 * single space. Every block is followed by a blank line.
 */
void print_phylip_alignment
  (ALIGNMENT_T* the_alignment,
   FILE*        outfile)
{
  char chunk[OUTPUT_WIDTH+1];
  int  block_start;
  int  row;

  fprintf(outfile, "%d %d\n", the_alignment->num_sequences,
      the_alignment->length);

  /* The first block is always written, even for an empty alignment. */
  for (block_start = 0;
       block_start == 0 || block_start < the_alignment->length;
       block_start += OUTPUT_WIDTH) {

    for (row = 0; row < the_alignment->num_sequences; row++) {
      char* residues;

      if (block_start == 0) {
        /* Leading block: emit the 10-character ID column first. */
        strncpy(chunk, get_seq_name(the_alignment->sequences[row]), 10);
        chunk[10] = '\0';
        fprintf(outfile, "%-10s ", chunk);
      }

      /* Copy at most one block of residues and terminate it explicitly,
         since strncpy does not do so when the source is longer. */
      residues = get_raw_sequence(the_alignment->sequences[row]);
      strncpy(chunk, &(residues[block_start]), OUTPUT_WIDTH);
      chunk[OUTPUT_WIDTH] = '\0';

      if (block_start == 0) {
        fprintf(outfile, "%s\n", chunk);
      } else {
        fprintf(outfile, " %s\n", chunk);
      }
    }

    /* Blank line between blocks. */
    fprintf(outfile, "\n");
  }
}
/*
 * Returns a newly allocated, NUL-terminated copy of the characters of
 * a_sequence from index start to index stop, both inclusive.
 *
 * a_sequence must not be NULL and stop must not be smaller than start
 * (both asserted). start and stop are not checked against the length of
 * the sequence. The copy is allocated with mm_malloc(); the caller owns it.
 */
char* get_raw_subsequence
  (int start,
   int stop,
   SEQ_T* a_sequence)
{
  assert(a_sequence != NULL);
  assert((stop - start) >= 0);

  /* Both ends are inclusive, hence the +1. */
  const int num_chars = stop - start + 1;

  char *source = get_raw_sequence(a_sequence);
  char *copy = mm_malloc((num_chars + 1) * sizeof(char));

  strncpy(copy, source + start, num_chars);
  copy[num_chars] = 0;

  return copy;
}
/****************************************************************************
 * Extract a small alignment out of the middle of a larger alignment.
 *
 * The result covers `width` columns beginning at column `start` of every
 * sequence and of the consensus string. Name and description are taken
 * from the source alignment; each sequence keeps its name, description
 * and offset.
 *
 * The temporary sequences and the scratch buffer are released before
 * returning, which relies on allocate_seq() and allocate_alignment()
 * copying what they are given.
 *
 * NOTE(review): subconsensus is not released here. Whether
 * allocate_alignment() copies it or adopts it is not visible from this
 * function -- confirm.
 ****************************************************************************/
ALIGNMENT_T* extract_subalignment
  (int start,
   int width,
   ALIGNMENT_T* alignment)
{
  const int n_seqs = get_num_aligned_sequences(alignment);
  SEQ_T** full_seqs = get_alignment_sequences(alignment);
  SEQ_T** cut_seqs = (SEQ_T**)mm_malloc(n_seqs * sizeof(SEQ_T*));

  // Cut the requested columns out of every sequence, reusing one scratch
  // buffer for all of them.
  char* scratch = mm_malloc((width + 1) * sizeof(char));
  int idx;
  for (idx = 0; idx < n_seqs; idx++) {
    SEQ_T* full = full_seqs[idx];
    strncpy(scratch, get_raw_sequence(full) + start, width);
    scratch[width] = '\0';
    cut_seqs[idx] = allocate_seq(get_seq_name(full),
                                 get_seq_description(full),
                                 get_seq_offset(full),
                                 scratch);
  }

  // Cut the same columns out of the consensus string.
  char* full_consensus = get_consensus_string(alignment);
  char* subconsensus = mm_malloc(sizeof(char) * (width + 1));
  strncpy(subconsensus, full_consensus + start, width);
  subconsensus[width] = '\0';

  // Assemble the new alignment.
  ALIGNMENT_T* result = allocate_alignment(get_alignment_name(alignment),
                                           get_alignment_description(alignment),
                                           n_seqs,
                                           cut_seqs,
                                           subconsensus);

  // Release the temporaries.
  for (idx = 0; idx < n_seqs; idx++) {
    free_seq(cut_seqs[idx]);
  }
  myfree(cut_seqs);
  myfree(scratch);

  return result;
}
/*
 * Builds a Markov background model of order sdbg_order from the sequence
 * itself and uses it to fill a newly allocated array with the values
 * produced by calculate_log_cumulative_background().
 *
 * The returned array has (sequence length + 1) entries and is allocated
 * with mm_malloc(); the caller owns it. A negative sdbg_order is reported
 * through die().
 */
static double * log_cumulative_background(ALPH_T *alph, const int sdbg_order, SEQ_T *sequence) {
  if (sdbg_order < 0) die("No such thing as a negative background order");

  double *logcumback = mm_malloc(sizeof(double) * (get_seq_length(sequence)+1));
  const char *raw_seq = get_raw_sequence(sequence);

  // Two-step model construction: the first call feeds the sequence into
  // the calculation state, the second call (NULL sequence) returns the model.
  // NOTE(review): ownership of `calc` after the second call is not visible
  // here; this function never releases it.
  BGCALC_T *calc = NULL;
  calculate_markov_model(alph, sdbg_order, 1.0, false, raw_seq, &calc);
  ARRAY_T *cp = calculate_markov_model(alph, sdbg_order, 1.0, false, NULL, &calc);

  // add x-tuples to model
  extend_markov_model(alph, true, SUM_FREQS, cp);

  // Walk the model one prefix at a time: normalize the core letters of the
  // prefix (convert to conditional probability) and set its wildcard
  // entry to 1.
  int offset = 0;
  while (offset < get_array_length(cp)) {
    normalize_subarray(offset, alph_size_core(alph), 0, cp);
    set_array_item(offset + alph_wild(alph), 1.0, cp);
    offset += alph_size_wild(alph);
  }

  calculate_log_cumulative_background(alph, true, sdbg_order, cp, raw_seq, logcumback);
  free_array(cp);
  return logcumback;
}
/**
 * Deactivates every active sequence that does not have `letter` at position
 * `pos`, then recomputes the count matrix for the list.
 *
 * letter        required character
 * pos           index into each raw sequence that is compared to letter
 * seqs          sequences (or hash table entries wrapping sequences when
 *               options->eliminate_repeats is set)
 * status_array  per-sequence status; ACTIVE entries that fail the test are
 *               set to INACTIVE
 * num_active    decremented once for each sequence that is deactivated
 * count         count matrix handed to get_count_matrix()
 *
 * NOTE(review): `count` is received by value, so the caller only sees the
 * recomputed counts if get_count_matrix() updates the matrix it is given in
 * place. The original stored the return value into the parameter, which
 * never reached the caller; that dead store has been dropped.
 */
void remove_sequences_and_update_matrix(char letter, int pos, ARRAYLST_T* seqs, MOTIFX_STATUS_T** status_array, int* num_active, MATRIX_T* count, SUMMARY_T* summary, MOMO_OPTIONS_T* options) {
  int i;
  const int num_seqs = arraylst_size(seqs);

  for (i = 0; i < num_seqs; ++i) {
    // Only active sequences can change state; skip the rest without
    // fetching them.
    if ((*status_array)[i] != ACTIVE) {
      continue;
    }

    char* curr_seq = get_raw_sequence((SEQ_T*) (options->eliminate_repeats
          ? hash_get_entry_value((HASH_TABLE_ENTRY*) arraylst_get(i, seqs))
          : arraylst_get(i, seqs)));

    // Anything active that does not match the pattern becomes inactive.
    if (curr_seq[pos] != letter) {
      *num_active = *num_active - 1;
      (*status_array)[i] = INACTIVE;
    }
  }

  get_count_matrix(count, seqs, status_array, options, summary);
}
/************************************************************************* * Calculate the odds score for each motif-sized window at each * site in the sequence using the given nucleotide frequencies. * * This function is a lightweight version based on the one contained in * motiph-scoring. Several calculations that are unnecessary for gomo * have been removed in order to speed up the process *************************************************************************/ static double score_sequence( SEQ_T *seq, // sequence to scan (IN) MOTIF_T *motif, // motif already converted to odds values (IN) PSSM_T *m_pssm, // motif pssm (IN) MATRIX_T *m_odds, // motif odds (IN) int method, // method used for scoring (IN) double threshold, // Threshold to use in TOTAL_HITS mode with a PWM ARRAY_T *bg_freqs //background model ) { assert(seq != NULL); assert(motif != NULL); assert((method == TOTAL_HITS && m_pssm) || (method != TOTAL_HITS && m_odds)); char* raw_seq = get_raw_sequence(seq); int seq_length = get_seq_length(seq); // Get the pv lookup table ARRAY_T* pv_lookup = NULL; if (NULL != m_pssm) { pv_lookup = m_pssm->pv; assert(get_array_length(pv_lookup) > 0); } // Prepare storage for the string representing the portion // of the reference sequence within the window. 
char* window_seq = (char *) mm_malloc(sizeof(char) * (get_motif_length(motif) + 1)); window_seq[get_motif_length(motif)] = '\0'; int max_index = seq_length - get_motif_length(motif); if (max_index < 0) max_index = 0; const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE); double* odds = (double*) mm_malloc(sizeof(double)*max_index); double* scaled_log_odds = (double*) mm_malloc(sizeof(double)*max_index); // For each site in the sequence int seq_index; for (seq_index = 0; seq_index < max_index; seq_index++) { double odd = 1.0; scaled_log_odds[seq_index] = 0; // For each site in the motif window int motif_position; for (motif_position = 0; motif_position < get_motif_length(motif); motif_position++) { char c = raw_seq[seq_index + motif_position]; window_seq[motif_position] = c; // Check for gaps at this site if(c == '-' || c == '.') { break; } // Check for ambiguity codes at this site //TODO: This next call is very expensive - it takes up approx. 10% of a // programme's running time. It should be fixed up somehow. int aindex = alph_index(get_motif_alph(motif), c); if (aindex > asize) { break; } if (method == TOTAL_HITS) { //If we're in this mode, then we're using LOG ODDS. 
//scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, get_motif_freqs(motif)); scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, m_pssm->matrix); } else { odd *= get_matrix_cell(motif_position, aindex, m_odds); } } odds[seq_index] = odd; } // return odds as requested (MAX or AVG scoring) double requested_odds = 0.0; if (method == AVG_ODDS){ for (seq_index = 0; seq_index < max_index; seq_index++) { requested_odds += odds[seq_index]; } requested_odds /= max_index + 1; // Divide by 0 if max_index==0 } else if (method == MAX_ODDS){ for (seq_index = 0; seq_index < max_index; seq_index++) { if (odds[seq_index] > requested_odds){ requested_odds = odds[seq_index]; } } } else if (method == SUM_ODDS) { for (seq_index = 0; seq_index < max_index; seq_index++) { requested_odds += odds[seq_index]; } } else if (method == TOTAL_HITS) { for (seq_index = 0; seq_index < max_index; seq_index++) { if (scaled_log_odds[seq_index] >= (double)get_array_length(pv_lookup)) { scaled_log_odds[seq_index] = (double)(get_array_length(pv_lookup) - 1); } double pvalue = get_array_item((int) scaled_log_odds[seq_index], pv_lookup); //Figure out how to calculate the p-value of a hit //fprintf(stderr, "m: %s pv_l len: %i scaled_log_odds: %g seq index: %i pvalue: %g\n", // get_motif_id(motif), get_array_length(pv_lookup), scaled_log_odds[seq_index], seq_index, pvalue); if (pvalue < threshold) { requested_odds++; //Add another hit. } if (verbosity > HIGHER_VERBOSE) { fprintf(stderr, "Window Data: %s\t%s\t%i\t%g\t%g\t%g\n", get_seq_name(seq), get_motif_id(motif), seq_index, scaled_log_odds[seq_index], pvalue, threshold); } } } myfree(odds); myfree(scaled_log_odds); myfree(window_seq); return requested_odds; }
/** * Recursive function. Creates and stores a motif using the motif-x * algorithm until no more are left. */ void create_motifx_motif(ARRAYLST_T* phospho_seqs, ARRAYLST_T* bg_seqs, MOTIFX_STATUS_T** phospho_status, MOTIFX_STATUS_T** bg_status, MATRIX_T* phospho_count, MATRIX_T* bg_count, int* num_active, int* num_bg_active, char* modname, MOD_INFO_T* mod_info, MOMO_OPTIONS_T* options, SUMMARY_T* summary) { int i; int j; const char* alph_letters = summary->alph_letters; // Initialize pattern, sequence count, bg sequence count, and overall score for this motif. char* pattern = mm_malloc(options->width + 1); for (i = 0; i < options->width; ++i) { pattern[i] = 'X'; } pattern[options->width] = '\0'; int* num_active_copy = mm_malloc(sizeof(int)); *num_active_copy = *num_active; int* num_bg_active_copy = mm_malloc(sizeof(int)); *num_bg_active_copy = *num_bg_active; double* motif_score = mm_malloc(sizeof(double)); *motif_score = 0; // Set the pattern, num active copy, num bg active copy, motif score, and get a count of the sequences MATRIX_T* result_count_matrix = add_to_pattern(pattern, phospho_seqs, bg_seqs, phospho_status, bg_status, num_active_copy, num_bg_active_copy, phospho_count, bg_count, motif_score, summary, options); // If any of the characters are not X, then we have found a pattern BOOLEAN_T found_pattern = FALSE; for (i = 0; i < options->width; ++i) { if (pattern[i] != 'X') { found_pattern = TRUE; } } // If there is a pattern, store the pattern and call create_motifx_motif again. if (found_pattern) { // fill out the rest of the pattern (e.g. 
if you have pattern ..ASAAA, and realize the actual pattern is A.ASAAA for (i = 0; i < options->width; i++) { for (j = 0; j < strlen(alph_letters); j++) { if ((int) get_matrix_cell_defcheck(i, j, result_count_matrix) == *num_active_copy) { pattern[i] = alph_letters[j]; } } } // create the pattern name char* pattern_name = mm_malloc(strlen(pattern) + strlen(modname) + 3); pattern_name[0] = '\0'; strncat(pattern_name, pattern, strlen(pattern)/2); strncat(pattern_name, "_", 1); strncat(pattern_name, modname, strlen(modname)); strncat(pattern_name, "_", 1); strncat(pattern_name, pattern + strlen(pattern)/2 + 1, strlen(pattern)/2); // convert this count matrix into frequencies normalize_rows(0.0, result_count_matrix); // Store this motif MOTIF_INFO_T* motifinfo = mm_malloc(sizeof(MOTIF_INFO_T)); MOTIF_T* motif = allocate_motif(pattern_name, "", summary->alph, result_count_matrix, NULL); set_motif_nsites(motif, *num_active_copy); motifinfo->motif = motif; motifinfo->seqs = arraylst_create(); motifinfo->score = *motif_score; motifinfo->fg_match = *num_active_copy; motifinfo->fg_size = *num_active; motifinfo->bg_match = *num_bg_active_copy; motifinfo->bg_size = *num_bg_active; for (i = 0; i < arraylst_size(phospho_seqs); ++i) { MOTIFX_STATUS_T status = (*phospho_status)[i]; if (status == ACTIVE) { SEQ_T* active_sequence = (options->eliminate_repeats) ? hash_get_entry_value(arraylst_get(i, phospho_seqs)) : arraylst_get(i, phospho_seqs); arraylst_add(get_raw_sequence(active_sequence), motifinfo->seqs); } } arraylst_add(motifinfo, mod_info->motifinfos); // delete the sequences from this motif. turn inactive into active. delete_sequences(phospho_status, arraylst_size(phospho_seqs)); delete_sequences(bg_status, arraylst_size(bg_seqs)); // update the count of number of actives *num_active = *num_active - *num_active_copy; *num_bg_active = *num_bg_active - *num_bg_active_copy; // recalculate phospho count and bg count. 
phospho_count = get_count_matrix(phospho_count, phospho_seqs, phospho_status, options, summary); bg_count = get_count_matrix(bg_count, bg_seqs, bg_status, options, summary); // free up space myfree(pattern); myfree(num_active_copy); myfree(num_bg_active_copy); myfree(motif_score); myfree(pattern_name); // try to create another motif. create_motifx_motif(phospho_seqs, bg_seqs, phospho_status, bg_status, phospho_count, bg_count, num_active, num_bg_active, modname, mod_info, options, summary); } // free up space myfree(pattern); myfree(num_active_copy); myfree(num_bg_active_copy); myfree(motif_score); }
/************************************************************************* * Entry point for ama *************************************************************************/ int main(int argc, char *argv[]) { int max_seq_length = MAX_SEQ; STRING_LIST_T* selected_motifs = NULL; double pseudocount = 0.01; int output_format = CISML_FORMAT; program_name = "ama"; int scoring = AVG_ODDS; BOOLEAN_T pvalues = FALSE; BOOLEAN_T normalize_scores = FALSE; BOOLEAN_T combine_duplicates = FALSE; int num_gc_bins = 1; int sdbg_order = -1; // don't use sequence background BOOLEAN_T scan_both_strands = TRUE; ARRAY_T* pos_bg_freqs = NULL; ARRAY_T* rev_bg_freqs = NULL; clock_t c0, c1; /* measuring cpu_time */ CISML_T *cisml; char * out_dir = NULL; BOOLEAN_T clobber = FALSE; int i; int last = 0; ALPH_T alph = INVALID_ALPH; /********************************************** * COMMAND LINE PROCESSING **********************************************/ const int num_options = 16; cmdoption const motif_scan_options[] = { { "max-seq-length", REQUIRED_VALUE }, { "motif", REQUIRED_VALUE }, { "motif-pseudo", REQUIRED_VALUE }, { "rma", NO_VALUE }, { "pvalues", NO_VALUE }, { "sdbg", REQUIRED_VALUE }, { "norc", NO_VALUE }, { "cs", NO_VALUE }, { "o-format", REQUIRED_VALUE }, { "o", REQUIRED_VALUE }, { "oc", REQUIRED_VALUE }, { "scoring", REQUIRED_VALUE }, { "verbosity", REQUIRED_VALUE }, { "gcbins", REQUIRED_VALUE }, { "last", REQUIRED_VALUE }, { "version", NO_VALUE } }; int option_index = 0; // Define the usage message. 
char usage[] = "USAGE: ama [options] <motif file> <sequence file> [<background file>]\n" "\n" " Options:\n" " --sdbg <order>\t\t\tUse Markov background model of\n" " \t\t\t\t\torder <order> derived from the sequence\n" " \t\t\t\t\tto compute its likelihood ratios.\n" " \t\t\t\t\tOverrides --pvalues, --gcbins and --rma;\n" " \t\t\t\t\t<background file> is required unless\n" " \t\t\t\t\t--sdbg is given.\n" " --motif <id>\t\t\tUse only the motif identified by <id>.\n" " \t\t\t\t\tThis option may be repeated.\n" " --motif-pseudo <float>\t\tThe value <float> times the background\n" " \t\t\t\t\tfrequency is added to the count of each\n" " \t\t\t\t\tletter when creating the likelihood \n" " \t\t\t\t\tratio matrix (default: %g).\n" " --norc\t\t\t\tDisables the scanning of the reverse\n" " \t\t\t\t\tcomplement strand.\n" " --scoring [avg-odds|max-odds]\tIndicates whether the average or \n" " \t\t\t\t\tthe maximum odds should be calculated\n" " \t\t\t\t\t(default: avg-odds)\n" " --rma\t\t\t\tScale motif scores to the range 0-1.\n" " \t\t\t\t\t(Relative Motif Affinity).\n" " \t\t\t\t\tMotif scores are scaled by the maximum\n" " \t\t\t\t\tscore achievable by that PWM. (default:\n" " \t\t\t\t\tmotif scores are not normalized)\n" " --pvalues\t\t\t\tPrint p-value of avg-odds score in cisml\n" " \t\t\t\t\toutput. Ignored for max-odds scoring.\n" " \t\t\t\t\t(default: p-values are not printed)\n" " --gcbins <bins>\t\t\tCompensate p-values for GC content of\n" " \t\t\t\t\teach sequence using given number of \n" " \t\t\t\t\tGC range bins. 
Recommended bins: 41.\n" " \t\t\t\t\t(default: p-values are based on\n" " \t\t\t\t\tfrequencies in background file)\n" " --cs\t\t\t\tEnable combining sequences with same\n" " \t\t\t\t\tidentifier by taking the average score\n" " \t\t\t\t\tand the Sidac corrected p-value.\n" " --o-format [gff|cisml]\t\tOutput file format (default: cisml)\n" " \t\t\t\t\tignored if --o or --oc option used\n" " --o <directory>\t\t\tOutput all available formats to\n" " \t\t\t\t\t<directory>; give up if <directory>\n" " \t\t\t\t\texists\n" " --oc <directory>\t\t\tOutput all available formats to\n" " \t\t\t\t\t<directory>; if <directory> exists\n" " \t\t\t\t\toverwrite contents\n" " --verbosity [1|2|3|4]\t\tControls amount of screen output\n" " \t\t\t\t\t(default: %d)\n" " --max-seq-length <int>\t\tSet the maximum length allowed for \n" " \t\t\t\t\tinput sequences. (default: %d)\n" " --last <int>\t\t\tUse only scores of (up to) last <n>\n" " \t\t\t\t\tsequence positions to compute AMA.\n" " --version \t\t\tPrint version and exit.\n" "\n"; // Parse the command line. if (simple_setopt(argc, argv, num_options, motif_scan_options) != NO_ERROR) { die("Error processing command line options: option name too long.\n"); } BOOLEAN_T setoutputformat = FALSE; BOOLEAN_T setoutputdirectory = FALSE; while (TRUE) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. 
c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { (void) simple_getopterror(&message); die("Error processing command line options (%s).\n", message); } else if (strcmp(option_name, "max-seq-length") == 0) { max_seq_length = atoi(option_value); } else if (strcmp(option_name, "norc") == 0) { scan_both_strands = FALSE; } else if (strcmp(option_name, "cs") == 0) { combine_duplicates = TRUE; } else if (strcmp(option_name, "motif") == 0) { if (selected_motifs == NULL) { selected_motifs = new_string_list(); } add_string(option_value, selected_motifs); } else if (strcmp(option_name, "motif-pseudo") == 0) { pseudocount = atof(option_value); } else if (strcmp(option_name, "o-format") == 0) { if (setoutputdirectory) { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "output directory specified, ignoring --o-format\n"); } else { setoutputformat = TRUE; if (strcmp(option_value, "gff") == 0) output_format = GFF_FORMAT; else if (strcmp(option_value, "cisml") == 0) output_format = CISML_FORMAT; else { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Output format not known. 
Using standard instead (cisML).\n"); output_format = CISML_FORMAT; } } } else if (strcmp(option_name, "o") == 0 || strcmp(option_name, "oc") == 0) { setoutputdirectory = TRUE; if (setoutputformat) { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "output directory specified, ignoring --o-format\n"); } clobber = strcmp(option_name, "oc") == 0; out_dir = (char*) malloc (sizeof(char)*(strlen(option_value)+1)); strcpy(out_dir, option_value); output_format = DIRECTORY_FORMAT; } else if (strcmp(option_name, "verbosity") == 0) { verbosity = atoi(option_value); } else if (strcmp(option_name, "scoring") == 0) { if (strcmp(option_value, "max-odds") == 0) scoring = MAX_ODDS; else if (strcmp(option_value, "avg-odds") == 0) scoring = AVG_ODDS; else if (strcmp(option_value, "sum-odds") == 0) scoring = SUM_ODDS; else die("Specified scoring scheme not known.\n", message); } else if (strcmp(option_name, "pvalues") == 0) { pvalues = TRUE; } else if (strcmp(option_name, "rma") == 0) { normalize_scores = TRUE; fprintf(stderr, "Normalizing motif scores using RMA method.\n"); } else if (strcmp(option_name, "gcbins") == 0) { num_gc_bins = atoi(option_value); pvalues = TRUE; if (num_gc_bins <= 1) die("Number of bins in --gcbins must be greater than 1.\n", message); } else if (strcmp(option_name, "sdbg") == 0) { sdbg_order = atoi(option_value); // >=0 means use sequence bkg } else if (strcmp(option_name, "last") == 0) { int i = 0; if (option_value[0] == '-') ++i; while (option_value[i] != '\0') { if (!isdigit(option_value[i])) { die("Specified parameter 'last' contains non-numeric characters.\n"); } ++i; } last = atoi(option_value); if (errno != 0) { die("Specified parameter 'last' could not be parsed as a number as:\n%s\n",strerror(errno)); } if (last < 0) { die("Specified parameter 'last' had negative value (%d) when only postive or zero values are allowed \n", last); } } else if (strcmp(option_name, "version") == 0) { fprintf(stdout, VERSION "\n"); exit(EXIT_SUCCESS); } } // --sdbg 
overrides --pvalues and --gcbins and --rma int req_args = 3; if (sdbg_order >= 0) { pvalues = FALSE; normalize_scores = FALSE; num_gc_bins = 1; req_args = 2; } // Check all required arguments given if (sdbg_order >= 0 && argc > option_index + req_args) { die("<background file> cannot be given together with --sdbg.\n"); } else if (argc != option_index + req_args) { fprintf(stderr, usage, pseudocount, verbosity, max_seq_length); exit(EXIT_FAILURE); } // Get required arguments. char* motif_filename = argv[option_index]; option_index++; char* fasta_filename = argv[option_index]; option_index++; char* bg_filename; if (req_args == 3) { // required unless --sdbg given bg_filename = argv[option_index]; option_index++; } else { bg_filename = "--uniform--"; // So PSSMs will use uniform background; // we can multiply them out later. } // measure time c0 = clock(); // Set up hash tables for computing reverse complement if doing --sdbg if (sdbg_order >= 0) setup_hash_alph(DNAB); // Create cisml data structure for recording results cisml = allocate_cisml(program_name, motif_filename, fasta_filename); set_cisml_background_file(cisml, bg_filename); /********************************************** * Read the motifs and background model. 
**********************************************/ int num_motifs = 0; MREAD_T *mread; ARRAYLST_T *motifs; PSSM_PAIR_T** pssm_pairs; // note pssm_pairs is an array of pointers //this reads any meme file, xml, txt and html mread = mread_create(motif_filename, OPEN_MFILE); mread_set_bg_source(mread, bg_filename); mread_set_pseudocount(mread, pseudocount); motifs = mread_load(mread, NULL); alph = mread_get_alphabet(mread); pos_bg_freqs = mread_get_background(mread); mread_destroy(mread); num_motifs = arraylst_size(motifs); // allocate memory for PSSM pairs pssm_pairs = (PSSM_PAIR_T**)mm_malloc(sizeof(PSSM_PAIR_T*) * num_motifs); if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Number of motifs in file %d.\n", num_motifs); // make a CISML pattern to hold scores for each motif PATTERN_T** patterns = NULL; Resize(patterns, num_motifs, PATTERN_T*); int motif_index; for (motif_index = 0; motif_index < num_motifs; motif_index++) { MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs); patterns[motif_index] = allocate_pattern(get_motif_id(motif), ""); add_cisml_pattern(cisml, patterns[motif_index]); } // make reverse complement motifs and background frequencies. if (scan_both_strands == TRUE) { add_reverse_complements(motifs); assert(arraylst_size(motifs) == (2 * num_motifs)); rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs)); complement_dna_freqs(pos_bg_freqs, rev_bg_freqs); } /************************************************************** * Convert motif matrices into log-odds matrices. * Scale them. * Compute the lookup tables for the PDF of scaled log-odds scores. **************************************************************/ int ns = scan_both_strands ? 
2 : 1; // number of strands for (motif_index = 0; motif_index < num_motifs; motif_index++) { MOTIF_T *motif, *motif_rc; motif = (MOTIF_T*)arraylst_get(motif_index*ns, motifs); if (scan_both_strands) motif_rc = (MOTIF_T*)arraylst_get(motif_index*ns + 1, motifs); else motif_rc = NULL; /* * Note: If scanning both strands, we complement the motif frequencies * but not the background frequencies so the motif looks the same. * However, the given frequencies are used in computing the p-values * since they represent the frequencies on the negative strands. * (If we instead were to complement the input sequence, keeping the * the motif fixed, we would need to use the complemented frequencies * in computing the p-values. Is that any clearer?) */ double range = 300; // 100 is not very good; 1000 is great but too slow PSSM_T* pos_pssm = build_motif_pssm( motif, pos_bg_freqs, pos_bg_freqs, NULL, // Priors not used 0.0L, // alpha not used range, num_gc_bins, TRUE ); PSSM_T* neg_pssm = (scan_both_strands ? build_motif_pssm( motif_rc, rev_bg_freqs, pos_bg_freqs, NULL, // Priors not used 0.0L, // alpha not used range, num_gc_bins, TRUE ) : NULL ); pssm_pairs[motif_index] = create_pssm_pair(pos_pssm, neg_pssm); } // Open the FASTA file for reading. 
FILE* fasta_file = NULL; if (open_file(fasta_filename, "r", FALSE, "FASTA", "sequences", &fasta_file) == 0) { die("Couldn't open the file %s.\n", fasta_filename); } if (verbosity >= NORMAL_VERBOSE) { if (last == 0) { fprintf(stderr, "Using entire sequence\n"); } else { fprintf(stderr, "Limiting sequence to last %d positions.\n", last); } } /************************************************************** * Read in all sequences and score with all motifs **************************************************************/ int seq_loading_num = 0; // keeps track on the number of sequences read in total int seq_counter = 0; // holds the index to the seq in the pattern int unique_seqs = 0; // keeps track on the number of unique sequences BOOLEAN_T need_postprocessing = FALSE; SEQ_T* sequence = NULL; RBTREE_T* seq_ids = rbtree_create(rbtree_strcasecmp,NULL,free,rbtree_intcpy,free); RBNODE_T* seq_node; BOOLEAN_T created; while (read_one_fasta(alph, fasta_file, max_seq_length, &sequence)) { ++seq_loading_num; created = FALSE; char* seq_name = get_seq_name(sequence); int seq_len = get_seq_length(sequence); int scan_len; if (last != 0) { scan_len = last; } else { scan_len = seq_len; } // red-black trees are only required if duplicates should be combined if (combine_duplicates){ //lookup seq id and create new entry if required, return sequence index char *tmp_id = mm_malloc(strlen(seq_name)+1); // required copy for rb-tree strncpy(tmp_id,seq_name,strlen(seq_name)+1); seq_node = rbtree_lookup(seq_ids, tmp_id, TRUE, &created); if (created) {// assign it a loading number rbtree_set(seq_ids, seq_node, &unique_seqs); seq_counter = unique_seqs; ++unique_seqs; } else { seq_counter = *((int*)rbnode_get(seq_node)); } } // // Set up sequence-dependent background model and compute // log cumulative probability of sequence. // double *logcumback = NULL; // array of log cumulative probs. 
if (sdbg_order >= 0) { Resize(logcumback, seq_len+1, double); char* raw_seq = get_raw_sequence(sequence); BOOLEAN rc = FALSE; double *a_cp = get_markov_from_sequence(raw_seq, alph_string(alph), rc, sdbg_order, 0); log_cum_back(raw_seq, a_cp, sdbg_order, logcumback); myfree(a_cp); } // Get the GC content of the sequence if binning p-values by GC // and store it in the sequence object. if (num_gc_bins > 1) { ARRAY_T *freqs = get_sequence_freqs(sequence, alph); set_total_gc_sequence(sequence, get_array_item(1,freqs) + get_array_item(2,freqs)); // f(C) + f(G) free_array(freqs); // clean up } else { set_total_gc_sequence(sequence, -1); // flag ignore } /************************************************************** * Process all motifs. **************************************************************/ int ns = scan_both_strands ? 2 : 1; for (motif_index = 0; motif_index < num_motifs; motif_index++) { PATTERN_T *pattern = patterns[motif_index]; MOTIF_T* motif = (MOTIF_T*)arraylst_get(ns*motif_index, motifs); char* motif_id = (scan_both_strands ? get_motif_st_id(motif) : get_motif_id(motif)); if (verbosity >= HIGH_VERBOSE) { fprintf(stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif)); } if ((selected_motifs == NULL) || (have_string(get_motif_id(motif), selected_motifs) == TRUE)) { if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scanning %s sequence with length %d " "abbreviated to %d with motif %s with length %d.\n", seq_name, seq_len, scan_len, motif_id, get_motif_length(motif)); } SCANNED_SEQUENCE_T* scanned_seq = NULL; if (!combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter){ // Create a scanned_sequence record and save it in the pattern. 
scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern); set_scanned_sequence_length(scanned_seq, scan_len); } else { // get existing sequence record scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter]; set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq))); } // check if scanned component of sequence has sufficient length for the motif if (scan_len < get_motif_length(motif)) { // set score to zero and p-value to 1 if not set yet if(!has_scanned_sequence_score(scanned_seq)){ set_scanned_sequence_score(scanned_seq, 0.0); } if(pvalues && !has_scanned_sequence_pvalue(scanned_seq)){ set_scanned_sequence_pvalue(scanned_seq, 1.0); } add_scanned_sequence_scanned_position(scanned_seq); if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) need_postprocessing = TRUE; if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "%s too short for motif %s. Score set to 0!\n", seq_name, motif_id); } else { // scan the sequence using average/maximum motif affinity ama_sequence_scan(alph, sequence, logcumback, pssm_pairs[motif_index], scoring, pvalues, last, scanned_seq, &need_postprocessing); } } else { if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "Skipping motif %s.\n", motif_id); } } // All motifs parsed free_seq(sequence); if (sdbg_order >= 0) myfree(logcumback); } // read sequences
/************************************************************************* * Calculate the odds score for each motif-sized window at each * site in the sequence using the given nucleotide frequencies. * * This function is a lightweight version based on the one contained in * motiph-scoring. Several calculations that are unnecessary for gomo * have been removed in order to speed up the process. * Scores sequence with up to two motifs. *************************************************************************/ double score_sequence( SEQ_T* seq, // sequence to scan (IN) double *logcumback, // cumulative bkg probability of sequence (IN) PSSM_PAIR_T* pssm_pair, // pos and neg pssms (IN) int method, // method used for scoring (IN) int last, //score only last <n> or //score all if <n> is zero (IN) BOOLEAN_T* isFeasible // FLAG indicated if there is at least one position // where the motif could be matched against (OUT) ) { assert(pssm_pair != NULL); assert(seq != NULL); PSSM_T* pos_pssm = pssm_pair->pos_pssm; assert(pos_pssm != NULL); PSSM_T* neg_pssm = pssm_pair->neg_pssm; int n_motifs = neg_pssm ? 
2 : 1; char* raw_seq = get_raw_sequence(seq); int seq_length = get_seq_length(seq); int w = get_num_rows(pos_pssm->matrix); int n = seq_length - w + 1; if (verbosity >= DUMP_VERBOSE) { fprintf(stderr, "Debug n_motifs: %d seq_length: %d w: %d n: %d.\n", n_motifs, seq_length, w, n); } // Get alphabet; char* alphabet = get_alphabet(FALSE); int alph_size = get_alph_size(ALPH_SIZE); // Dependent on the "last" parameter, change the starting point int start; int N_scored; if (last > 0 && last < seq_length) { start = seq_length - last; N_scored = n_motifs * (last - w + 1); // number of sites scored } else { start = 0; N_scored = n_motifs * n; // number of sites scored } // For each motif (positive and reverse complement) double max_odds = 0.0; double sum_odds = 0.0; double requested_odds = 0.0; int i; if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Starting scan at position %d .\n", start); } for (i=0; i<n_motifs; i++) { // pos (and negative) motif PSSM_T* pssm = (i==0 ? pos_pssm : neg_pssm); // choose +/- motif // For each site in the sequence int seq_index; for (seq_index = start; seq_index < n; seq_index++) { // site double odds = 1.0; // For each position in the motif window int motif_position; for (motif_position = 0; motif_position < w; motif_position++) { // column int i_site = seq_index + motif_position; char c = raw_seq[i_site]; // Check for gaps at this site if (c == '-' || c == '.') { N_scored--; odds = 0; break; } // Check for ambiguity codes at this site int alph_index = alphabet_index(c, alphabet); if (alph_index >= alph_size || alph_index < 0) { N_scored--; odds = 0; break; } // multiple odds by value in appropriate motif cell odds *= get_matrix_cell(motif_position, alph_index, pssm->matrix); } // column // // Apply sequence-dependent background model. 
// if (logcumback) { int i_site = seq_index; double log_p = logcumback[i_site+w] - logcumback[i_site]; // log Pr(x | background) //printf("log_p:: %g motif_pos %d\n", log_p, motif_position); double adjust = exp(w*log(1/4.0) - log_p); // Pr(x | uniform) / Pr(x | background) odds *= adjust; } // Add odds to growing sum. sum_odds += odds; // sum of odds if (odds > max_odds) max_odds = odds; // max of odds } // site } // motif if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scored %d positions with the sum odds %f and the max odds %f.\n", N_scored, sum_odds, max_odds); } // has there been anything matched at all? if (N_scored == 0){ if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr,"Sequence \'%s\' offers no location to match the motif against (sequence length too short?)\n",get_seq_name(seq)); } *isFeasible = FALSE; return 0.0; // return odds as requested (MAX or AVG scoring) } else if (method == AVG_ODDS) { requested_odds = sum_odds / N_scored; // mean } else if (method == MAX_ODDS) { requested_odds = max_odds; // maximum } else if (method == SUM_ODDS) { requested_odds = sum_odds ; // sum } return(requested_odds); } // score_sequence
/************************************************************************* * Calculate the log-odds score for each possible motif site in the * sequence and record the sites of the best. Apply a count to each * best site and increment the total site count. *************************************************************************/ static void score_sequence( CENTRIMO_OPTIONS_T *options, SEQ_T* sequence, PSSM_T* pssm, PSSM_T* rev_pssm, SEQ_SITES_T* seq_sites, SITE_COUNTS_T* counts ) { char *raw_seq, *seg; int i, L, w, pos; double score; double count; SEQ_SITE_T *site; // check we got passed stuff assert(options != NULL); assert(sequence != NULL); assert(pssm != NULL); assert(seq_sites != NULL); assert(counts != NULL); // make Mac OS compiler happy. score = -BIG; // Score and record each possible motif site in the sequence raw_seq = get_raw_sequence(sequence); L = get_seq_length(sequence); w = pssm->w; // Reset the sequence stats structure seq_sites->best = -BIG; seq_sites->used = 0; // Read and score each position in the sequence. for (i = 0; i < L - w + 1; i++) { seg = raw_seq+i; // Score and record forward strand if (score_motif_site(options->alphabet, seg, pssm, &score)) track_site(seq_sites, score, i, '+'); // Score and record reverse strand if appropriate. if (rev_pssm && score_motif_site(options->alphabet, seg, rev_pssm, &score)) track_site(seq_sites, score, i, '-'); } // Record the position of best site, averaging ties // and using position in RC of sequence if site on reverse strand // unless no_flip is true. 
if (seq_sites->used && seq_sites->best >= options->score_thresh) { // add 1/n_ties to each tied position's count, // averaging rather than random choice count = (double)1.0 / (double)seq_sites->used; for (i = 0; i < seq_sites->used; i++) { site = seq_sites->sites+i; if (options->no_flip || site->strand == '+') { //pos = 2 * (site->start + w/2 - 1/2); // a motif of width 1 can have sites at the first index pos = 2 * site->start + w - 1; // a motif of width 1 can have sites at the first index } else { //pos = 2 * (L - (site->start + w/2) - 1; // a motif of width 1 can have sites at the first index pos = 2 * (L - site->start) - w - 1; } //record the count counts->sites[pos] += count; } counts->total_sites++; } }
/****************************************************************************
 * Allocate one alignment object. Name and description may be NULL.
 *
 * The name and description are copied into the alignment and truncated to
 * MAX_ALIGNMENT_NAME and MAX_ALIGNMENT_COMMENT characters respectively; a
 * warning is printed to stderr when the name had to be shortened.
 * Every input sequence is duplicated with allocate_seq(), and the
 * consensus string is duplicated with copy_string().
 *
 * All sequences are expected to have the same raw length as the first
 * one; a mismatch is reported through myassert().
 *
 * Returns a pointer to the newly created alignment.
 ****************************************************************************/
ALIGNMENT_T* allocate_alignment(
  char*   name,
  char*   description,
  int     num_sequences,
  SEQ_T** sequences,
  char*   consensus_string
)
{
  assert(num_sequences > 0);
  assert(sequences != NULL);
  assert(consensus_string != NULL);

  // Allocate the alignment object.
  ALIGNMENT_T* new_alignment = (ALIGNMENT_T*)mm_malloc(sizeof(ALIGNMENT_T));
  if (new_alignment == NULL) {
    die("Error allocating alignment\n");
  }

  // Store the name, truncating if necessary.
  if (name != NULL) {
    strncpy(new_alignment->name, name, MAX_ALIGNMENT_NAME);
    // strncpy does not terminate the string when the source is too long
    new_alignment->name[MAX_ALIGNMENT_NAME] = '\0';
    if (strlen(new_alignment->name) != strlen(name)) {
      fprintf(stderr, "Warning: truncating alignment program name %s to %s.\n",
              name, new_alignment->name);
    }
  } else {
    new_alignment->name[0] = '\0';
  }

  // Store the description, truncating if necessary.
  if (description != NULL) {
    strncpy(new_alignment->desc, description, MAX_ALIGNMENT_COMMENT);
    new_alignment->desc[MAX_ALIGNMENT_COMMENT] = '\0';
  } else {
    new_alignment->desc[0] = '\0';
  }

  // Store the sequences.
  new_alignment->sequences = (SEQ_T**) mm_malloc(num_sequences * sizeof(SEQ_T*));
  if (new_alignment->sequences == NULL) {
    die("Error allocating sequences\n");
  }
  new_alignment->num_sequences = num_sequences;

  // The first sequence defines the length every other one must match.
  int seq_length = (int)strlen(get_raw_sequence(sequences[0]));
  int i;
  for (i = 0; i < num_sequences; i++) {
    char* raw = get_raw_sequence(sequences[i]);
    // Measure once and keep it as int: the message below prints it with
    // %d, and handing a size_t to %d is a format/argument mismatch.
    int this_length = (int)strlen(raw);

    // Sequences are numbered from 1 in the message (the reference
    // sequence is called "#1"), hence i + 1.
    myassert(TRUE,
             this_length == seq_length,
             "Sequence #1 (%s) is length=%d, but sequence #%d (%s) is length=%d.\n<%s>\n",
             get_seq_name(sequences[0]),
             seq_length,
             i + 1,
             get_seq_name(sequences[i]),
             this_length,
             raw);

    new_alignment->sequences[i] =
      allocate_seq(get_seq_name(sequences[i]),
                   get_seq_description(sequences[i]),
                   get_seq_offset(sequences[i]),
                   raw);
  }

  // Fill in the remaining fields.
  new_alignment->length = seq_length;
  copy_string(&(new_alignment->consensus_string), consensus_string);

  return(new_alignment);
}