/****************************************************************************
 * Remove from an alignment any sequence whose ID is not in a given list.
 *
 * Returns a newly allocated alignment built from copies (copy_sequence) of
 * the retained sequences, in the order in which their IDs appear in
 * seqs_to_keep.  The name, description and a copy of the consensus string
 * are taken from the given alignment.  If an ID occurs more than once in
 * seqs_to_keep, the matching sequence is copied once per occurrence.
 *
 * N.B. It is NOT an error for the given list to contain sequence IDs that
 * are not in the alignment.
 ****************************************************************************/
ALIGNMENT_T* remove_alignment_seqs
  (STRING_LIST_T* seqs_to_keep,
   ALIGNMENT_T*   alignment)
{
  // Extract the names of the sequences in the alignment.
  // NOTE(review): if get_species_names() returns a freshly allocated list,
  // it is never released by this function -- confirm ownership.
  STRING_LIST_T* alignment_species = get_species_names(alignment);
  int num_species = get_num_strings(alignment_species);

  // Report the sequences that will not be carried over.
  int i_species;
  for (i_species = 0; i_species < num_species; i_species++) {
    char* this_species = get_nth_string(i_species, alignment_species);
    if (!have_string(this_species, seqs_to_keep)) {
      if (verbosity >= NORMAL_VERBOSE) {
        fprintf(stderr, "Removing %s from alignment.\n", this_species);
      }
    }
  }

  // Count how many sequences will be in the new alignment.  The count uses
  // exactly the same test as the copy loop below, so the array can be
  // neither overrun (duplicate IDs in seqs_to_keep) nor left partly
  // uninitialised (duplicate names in the alignment).
  int num_requested = get_num_strings(seqs_to_keep);
  int num_final = 0;
  for (i_species = 0; i_species < num_requested; i_species++) {
    char* this_species = get_nth_string(i_species, seqs_to_keep);
    if (have_string(this_species, alignment_species)) {
      num_final++;
    }
  }

  // Allocate space for the new sequences.
  SEQ_T** new_sequences = (SEQ_T**)mm_malloc(num_final * sizeof(SEQ_T*));

  // Copy the sequences.
  int final_index = 0;
  for (i_species = 0; i_species < num_requested; i_species++) {
    char* this_species = get_nth_string(i_species, seqs_to_keep);

    // If the requested ID is in the alignment, then copy over the sequence.
    if (have_string(this_species, alignment_species)) {
      SEQ_T* this_seq
        = get_alignment_sequence_by_name(this_species, alignment);
      new_sequences[final_index] = copy_sequence(this_seq);
      final_index++;
    }
  }

  // Allocate and return the new alignment.
  char *consensus = NULL;
  copy_string(&consensus, get_consensus_string(alignment));
  ALIGNMENT_T* new_alignment
    = allocate_alignment(get_alignment_name(alignment),
                         get_alignment_description(alignment),
                         num_final,
                         new_sequences,
                         consensus);

  return(new_alignment);
}
MATRIX_T *read_score_matrix( char *score_filename, /* name of score file */ char **alpha1 /* alphabet in score matrix */ ) { int i; char *alpha; /* alphabet in file */ int alen; FILE *score_file; RDB_MATRIX_T *rdb_matrix; /* open the score file */ if (open_file(score_filename, "r", FALSE, "score", "substitution scores", &score_file) == 0) exit(1); /* read in the score file */ rdb_matrix = read_rdb_matrix(" ", FALSE, 0, FALSE, NULL, score_file); /* get alphabet */ alen = get_num_strings(rdb_matrix->col_names); alpha = (char *)mm_malloc(sizeof(char) * (alen+1)); for (i=0; i<alen; i++) alpha[i] = get_nth_string(i, rdb_matrix->col_names)[0]; alpha[i] = '\0'; *alpha1 = alpha; /* return alphabet */ return(rdb_matrix->matrix); } /* read_score_matrix */
/****************************************************************************
 * Create a new alignment with any sequence that contains nothing but
 * gap ('-') characters removed.  Returns the new alignment.  Does not
 * change the old alignment.
 * If there are no all-gap sequences, the returned alignment is the
 * same object as the original alignment.
 ****************************************************************************/
static ALIGNMENT_T* remove_allgap_sequences(ALIGNMENT_T* alignment)
{
  ALIGNMENT_T* new_alignment;
  int i_aln;
  int l_aln = get_num_aligned_sequences(alignment);
  STRING_LIST_T* keeper_seqs = new_string_list();

  // Identify the all-gap sequences.
  for (i_aln=0; i_aln<l_aln; i_aln++) {
    SEQ_T* sequence = get_alignment_sequence(i_aln, alignment);
    int i_seq;
    int l_seq = get_seq_length(sequence);
    // Add sequence to keepers if it contains a non-gap.
    for (i_seq=0; i_seq<l_seq; i_seq++) {
      if (get_seq_char(i_seq, sequence) != '-') {           // not gap?
        add_string(get_seq_name(sequence), keeper_seqs);    // non-gap: keeper
        break;
      }
    }
  }

  // Remove any sequences not in keeper list.
  if (get_num_strings(keeper_seqs) < l_aln) {
    new_alignment = remove_alignment_seqs(keeper_seqs, alignment);
  } else {
    // Nothing to drop: hand back the original object.
    new_alignment = alignment;
  }

  // The keeper list is scratch data for this function; release it on both
  // paths (previously it was leaked when no sequence was removed).
  free_string_list(keeper_seqs);

  return(new_alignment);
} // remove_allgap_sequences
void ramen_get_scores() { int i; int seq_num; seq_num = get_num_strings(seq_ids); //number of sequences //allocate space for final one result per motif array. rsr = malloc(sizeof(ramen_result_t*)*motifs.num); for(i=0;i<motifs.num;i++) { fprintf(stderr, "\rScoring %i of %i motifs...", i+1, motifs.num); rsr[i] = ramen_do_linreg_test(i); } //Order by MSE. qsort(rsr, motifs.num, sizeof(ramen_result_t*), ramen_compare_mse); fprintf(stderr, "\n"); }
/*********************************************************************** * Should the given motif be inserted into the model? * FIXME: These tests needn't be mutually exclusive. ***********************************************************************/ static BOOLEAN_T retain_motif( STRING_LIST_T* requested_motifs, // IDs of motifs to include. double e_threshold, // E-value to include motifs. double complexity_threshold, // Complexity threshold to include. ORDER_T* order_spacing, // Motif order and spacing (linear HMM). MOTIF_T* motif // The motif. ) { int num_requested; int i; char* motif_id; /* Method 1: Select motifs by index. */ num_requested = get_num_strings(requested_motifs); if (num_requested > 0) { motif_id = get_motif_id(motif); for (i = 0; i < num_requested; i++) { if (strcmp(get_nth_string(i, requested_motifs), motif_id) == 0) { return(TRUE); } } return(FALSE); } /* Method 2: Select motifs below a certain E-value threshold. */ else if (e_threshold != 0.0) { return (get_motif_evalue(motif) <= e_threshold); } /* Method 3: Select motifs that are included in the order string. */ else if (order_spacing != NULL) { return order_contains(get_motif_id(motif), order_spacing); } // Method 4: Select motifs by their complexity score. else if (complexity_threshold != 0.0) { return(motif->complexity >= complexity_threshold); } /* Default is to include all motifs. */ return(TRUE); }
/************************************************************************* * int main *************************************************************************/ int main(int argc, char *argv[]) { /* Data structures. */ int num_motifs; /* The number of motifs in the model. */ MOTIF_T motifs[2 * MAX_MOTIFS]; /* The motifs. */ STRING_LIST_T* motif_occurrences = NULL; /* Strings describing occurrences of motifs */ BOOLEAN_T has_reverse_strand = FALSE; /* MEME file contained both strands */ ARRAY_T* background; /* Background probs for alphabet. */ ORDER_T* order_spacing; /* Linear HMM order and spacing. */ MATRIX_T* transp_freq = NULL; /* Matrix of inter-motif transitions freqs. */ MATRIX_T* spacer_ave = NULL; /* Matrix of average spacer lengths. */ MHMM_T * the_hmm = NULL; /* The HMM being constructed. */ /* Command line parameters. */ char * meme_filename; /* Input file containg motifs. */ char * hmm_type_str; /* HMM type. */ HMM_T hmm_type; STRING_LIST_T* requested_motifs; /* Indices of requested motifs. */ int request_n; /* The user asked for the first n motifs. */ double e_threshold; /* E-value threshold for motif inclusion. */ double complexity_threshold; // For eliminating low-complexity motifs. double p_threshold; /* p-value threshold for motif occurences. */ char* order_string; /* Motif order and spacing. */ int spacer_states; /* Number of states in each spacer. */ BOOLEAN_T fim; /* Represent spacers as free insertion modules? */ BOOLEAN_T keep_unused; // Drop unused inter-motif transitions? double trans_pseudo; /* Transition pseudocount. */ double spacer_pseudo; // Spacer (self-loop) pseudocount. */ char* description; // Descriptive text to be stored in model. BOOLEAN_T print_header; /* Print file header? */ BOOLEAN_T print_params; /* Print parameter summary? */ BOOLEAN_T print_time; /* Print timing data (dummy: always false). */ /* Local variables. 
*/ int i_motif; /********************************************** * COMMAND LINE PROCESSING **********************************************/ // Define command line options. cmdoption const options[] = { {"type", OPTIONAL_VALUE}, {"description", REQUIRED_VALUE}, {"motif", REQUIRED_VALUE}, {"nmotifs", REQUIRED_VALUE}, {"ethresh", REQUIRED_VALUE}, {"lowcomp", REQUIRED_VALUE}, {"pthresh", REQUIRED_VALUE}, {"order", REQUIRED_VALUE}, {"nspacer", REQUIRED_VALUE}, {"fim", NO_VALUE}, {"keep-unused", NO_VALUE}, {"transpseudo", REQUIRED_VALUE}, {"spacerpseudo", REQUIRED_VALUE}, {"verbosity", REQUIRED_VALUE}, {"noheader", NO_VALUE}, {"noparams", NO_VALUE}, {"notime", NO_VALUE}, {"quiet", NO_VALUE}, }; int option_count = 18; int option_index = 0; // Define the usage message. char usage[1000] = ""; strcat(usage, "USAGE: mhmm [options] <MEME file>\n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); strcat(usage, " --type [linear|complete|star] (default=linear)\n"); strcat(usage, " --description <string> (may be repeated)\n"); strcat(usage, " --motif <motif #> (may be repeated)\n"); strcat(usage, " --nmotifs <#>\n"); strcat(usage, " --ethresh <E-value>\n"); strcat(usage, " --lowcomp <value>\n"); strcat(usage, " --pthresh <p-value>\n"); strcat(usage, " --order <string>\n"); strcat(usage, " --nspacer <spacer length> (default=1)\n"); strcat(usage, " --fim\n"); strcat(usage, " --keep-unused\n"); strcat(usage, " --transpseudo <pseudocount>\n"); strcat(usage, " --spacerpseudo <pseudocount>\n"); strcat(usage, " --verbosity 1|2|3|4|5 (default=2)\n"); strcat(usage, " --noheader\n"); strcat(usage, " --noparams\n"); strcat(usage, " --notime\n"); strcat(usage, " --quiet\n"); strcat(usage, "\n"); /* Make sure various options are set to NULL or defaults. 
*/ meme_filename = NULL; hmm_type_str = NULL; hmm_type = INVALID_HMM; requested_motifs = new_string_list(); request_n = 0; e_threshold = 0.0; complexity_threshold = 0.0; p_threshold = 0.0; order_string = NULL; spacer_states = DEFAULT_SPACER_STATES, fim = FALSE; keep_unused = FALSE; trans_pseudo = DEFAULT_TRANS_PSEUDO; spacer_pseudo = DEFAULT_SPACER_PSEUDO; description = NULL; print_header = TRUE; print_params = TRUE; print_time = FALSE; simple_setopt(argc, argv, option_count, options); // Parse the command line. while (1) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { simple_getopterror(&message); die("Error processing command line options (%s)\n", message); } if (strcmp(option_name, "type") == 0) { if (option_value != NULL) { hmm_type_str = option_value; } } else if (strcmp(option_name, "description") == 0) { description = option_value; } else if (strcmp(option_name, "motif") == 0) { add_string(option_value, requested_motifs); } else if (strcmp(option_name, "nmotifs") == 0) { request_n = atoi(option_value); } else if (strcmp(option_name, "ethresh") == 0) { e_threshold = atof(option_value); } else if (strcmp(option_name, "lowcomp") == 0) { complexity_threshold = atof(option_value); } else if (strcmp(option_name, "pthresh") == 0) { p_threshold = atof(option_value); } else if (strcmp(option_name, "order") == 0) { order_string = option_value; } else if (strcmp(option_name, "nspacer") == 0) { spacer_states = atoi(option_value); } else if (strcmp(option_name, "fim") == 0) { fim = TRUE; } else if (strcmp(option_name, "keep-unused") == 0) { keep_unused = TRUE; } else if (strcmp(option_name, "transpseudo") == 0) { trans_pseudo = atof(option_value); } else if (strcmp(option_name, "spacerpseudo") == 0) { spacer_pseudo = atof(option_value); } else if (strcmp(option_name, 
"verbosity") == 0) { verbosity = (VERBOSE_T)atoi(option_value); } else if (strcmp(option_name, "noheader") == 0) { print_header = FALSE; } else if (strcmp(option_name, "noparams") == 0) { print_params = FALSE; } else if (strcmp(option_name, "notime") == 0) { print_time = FALSE; } else if (strcmp(option_name, "quiet") == 0) { print_header = print_params = print_time = FALSE; verbosity = QUIET_VERBOSE; } } // Read the single required argument. if (option_index + 1 != argc) { fprintf(stderr, "%s", usage); exit(1); } meme_filename = argv[option_index]; // Set up motif requests. if (request_n != 0) { if (get_num_strings(requested_motifs) != 0) { die("Can't combine the -motif and -nmotifs options.\n"); } else { for (i_motif = 0; i_motif < request_n; i_motif++) { char motif_id[MAX_MOTIF_ID_LENGTH + 1]; sprintf(motif_id, "%d", i_motif + 1); add_string(motif_id, requested_motifs); } } } /* Set the model type. */ hmm_type = convert_enum_type_str(hmm_type_str, LINEAR_HMM, HMM_STRS, NUM_HMM_T); /* Gotta have positive spacer length. */ if (spacer_states <= 0) { die("Negative spacer length (%d).\n", spacer_states); } /* Make sure motifs weren't selected redundantly. */ // FIXME: Add tests for complexity threshold. if ((get_num_strings(requested_motifs) != 0) && (e_threshold != 0.0)) { die("Can't use -motif or -nmotifs with -ethresh."); } if ((get_num_strings(requested_motifs) != 0) && (order_string != NULL)) { die("Can't use -motif or -nmotifs with -order."); } if ((order_string != NULL) && (e_threshold != 0.0)) { die("Can't use -ethresh and -order."); } /* Prevent trying to build a complete or star model with ordering. */ if (order_string != NULL) { if (hmm_type == COMPLETE_HMM) die("Can't specify motif order with a completely connected model."); else if (hmm_type == STAR_HMM) die("Can't specify motif order with a star model."); } // Parse the order string. 
order_spacing = create_order(order_string); /********************************************** * READING THE MOTIFS **********************************************/ BOOLEAN_T read_file = FALSE; double pseudocount = 0; read_meme_file( meme_filename, "motif-file", // Take bg freq. from motif file. pseudocount, REQUIRE_PSPM, &num_motifs, motifs, &motif_occurrences, &has_reverse_strand, &background ); process_raw_motifs_for_model( &num_motifs, motifs, motif_occurrences, requested_motifs, has_reverse_strand, keep_unused, p_threshold, e_threshold, complexity_threshold, &order_spacing, &transp_freq, &spacer_ave, trans_pseudo, spacer_pseudo ); /********************************************** * BUILDING THE HMM **********************************************/ /* Build the motif-based HMM. */ if (hmm_type == LINEAR_HMM) { if (order_spacing != NULL) { reorder_motifs(order_spacing, &num_motifs, motifs); } else { die("No order specified for the motifs.\n" "For the linear model the motif file must contain motif occurence\n" "data or the motif order must be specified using " "the --order option."); } build_linear_hmm( background, order_spacing, spacer_states, motifs, num_motifs, fim, &the_hmm ); } else if (hmm_type == COMPLETE_HMM) { build_complete_hmm( background, spacer_states, motifs, num_motifs, transp_freq, spacer_ave, fim, &the_hmm ); } else if (hmm_type == STAR_HMM) { build_star_hmm( background, spacer_states, motifs, num_motifs, fim, &the_hmm ); } // Add some global information. copy_string(&(the_hmm->motif_file), meme_filename); /********************************************** * WRITING THE HMM **********************************************/ /* Print the header. */ if (print_header) write_header( program, "", description, meme_filename, NULL, NULL, stdout ); /* Write the HMM. */ write_mhmm(verbosity, the_hmm, stdout); /* Print the program parameters. 
*/ if (print_params) { printf("Program parameters for mhmm\n"); printf(" MEME file: %s\n", meme_filename); printf(" Motifs:"); write_string_list(" ", requested_motifs, stdout); printf("\n"); printf(" Model topology: %s\n", convert_enum_type(hmm_type, HMM_STRS, NUM_HMM_T)); printf(" States per spacer: %d\n", spacer_states); printf(" Spacers are free-insertion modules: %s\n", boolean_to_string(fim)); printf("\n"); } free_array(background); free_string_list(requested_motifs); free_order(order_spacing); free_matrix(transp_freq); free_matrix(spacer_ave); for (i_motif = 0; i_motif < num_motifs; i_motif++) free_motif(&(motifs[i_motif])); free_mhmm(the_hmm); return(0); }
/*********************************************************************** * Select the motifs used to build the model, parse any motif * occurences, build the motif order object, and the motif * and spacer frequency matrices. ***********************************************************************/ void process_raw_motifs_for_model( int* num_motifs, // Number of motifs. IN, OUT MOTIF_T* motifs, // Array of motifs IN, OUT STRING_LIST_T* motif_occurrences, // List of motif occurrences. OUT STRING_LIST_T* requested_motifs, // Explicitly requested motifs. IN BOOLEAN_T has_reverse_strand, // Did file contain both strands? IN BOOLEAN_T keep_unused, // Retain unsed motifs? IN double p_threshold, // Motif p-value threshold IN double e_threshold, // Motif e-value threshold IN double complexity_threshold, // Motif complexity threshold IN ORDER_T** order_spacing, // Motif/spacer order IN, OUT MATRIX_T** transp_freq, // Motif transition freqs OUT MATRIX_T** spacer_ave, // Spacer transition freqs OUT double trans_pseudo, // Motif transition pseudo-counts IN double spacer_pseudo // Spacer transition pseudo-counts IN ) { // If both strands, make reverse complements. if (has_reverse_strand) { add_reverse_complements(num_motifs, motifs); } /* Remove motifs not allowed by the command line parameters */ filter_motifs( requested_motifs, e_threshold, complexity_threshold, order_spacing, num_motifs, motifs ); /* Turn the raw motifs and motif occurences into the */ /* elements of the model */ if (motif_occurrences != NULL && get_num_strings(motif_occurrences) > 0) { parse_motif_occurrences( motif_occurrences, has_reverse_strand, p_threshold, order_spacing, transp_freq, spacer_ave, *num_motifs, motifs ); } else { // If no occurrences are found, initialize matrices uniformly. compute_naive_transitions_and_spacers( *num_motifs, transp_freq, spacer_ave ); } // Convert spacer info to probabilities. 
normalize_spacer_counts( trans_pseudo, spacer_pseudo, keep_unused, *transp_freq, *spacer_ave); // Throw out unused motifs. throw_out_unused_motifs(*transp_freq, *spacer_ave, num_motifs, motifs); }
/*********************************************************************** * Parse the motif occurrences. * * Each motif occurence string contains the following items * - sequence id, * - sequence p-value, * - number n of motif occurrences, and * - length of sequence. * * This is followed by n triples containing * - motif id, * - occurrence position, and * - occurrence p-value. * ***********************************************************************/ static void parse_motif_occurrences( STRING_LIST_T* motif_occurrences, // List of motif occurences OUT BOOLEAN_T has_reverse_strand, // File included both strands? IN double p_threshold, // P-value to include motif occurences. OUT ORDER_T** order_spacing, // Motif order and spacing (linear HMM) // IN OUT. MATRIX_T** transp_freq, // Motif-to-motif transitions. OUT MATRIX_T** spacer_ave, // Average inter-motif distances. OUT int num_motifs, // Number of motifs retrieved. IN MOTIF_T* motifs // The retrieved motifs. IN ) { ORDER_T* new_order; // New order and spacing. BOOLEAN_T find_order; // Should we look for the motif order? // If we already have a motif order and spacing, don't find any more. if (*order_spacing == NULL) { find_order = TRUE; } else { find_order = FALSE; } new_order = NULL; // Allocate the matrices. *transp_freq = allocate_matrix(num_motifs + 2, num_motifs + 2); *spacer_ave = allocate_matrix(num_motifs + 2, num_motifs + 2); init_matrix(0.0, *transp_freq); init_matrix(0.0, *spacer_ave); int num_occurrence_strings = get_num_strings(motif_occurrences); int i; for (i = 0; i < num_occurrence_strings; i++) { char* sequence_id; // ID of the current sequence. float sequence_p; // pvalue of the entire sequence. int num_occurs; // Number of motif occurences in this sequence. int seq_length; // Length of the current sequence. int i_occur; // Index of the current occurrence. char prev_motif[MAX_MOTIF_ID_LENGTH + 1]; // Index of the previous motif. int prev_position; // Location of the right edge of previous motif. 
float motif_p; // P-value of the current occurrence. char *c; // Dummy to hold return of strtok. char* line = get_nth_string(i, motif_occurrences); /* Read the sequence identifier, p-value, number of occurrences and length. */ // tlb; sscanf crashes if strtok returns NULL so pass it "" then sequence_id = strtok(line, " "); if (sequence_id == NULL) { die("Error reading motif occurrences.\n%s", line); } if (sscanf((c=strtok(NULL, " "))?c:"", "%f", &sequence_p) != 1) { die("Can't read p-value of sequence %s.", sequence_id); } if (sscanf((c=strtok(NULL, " "))?c:"", "%d", &num_occurs) != 1) { die("Can't read number of motif occurences in sequence %s.", sequence_id); } if (sscanf((c=strtok(NULL, " "))?c:"", "%d", &seq_length) != 1) { die("Can't read length of sequence %s.", sequence_id); } if (verbosity > NORMAL_VERBOSE) { fprintf(stderr, "Reading motif occurrences for sequence %s.\n", sequence_id); } // If requested, try to create an order string. if (find_order) { new_order = create_empty_order(num_occurs, sequence_p); } // Accumulate motif occurence data. sprintf(prev_motif, "%d", 0); prev_position = 0; for (i_occur = 0; i_occur < num_occurs; i_occur++) { char motif_id[MAX_MOTIF_ID_LENGTH + 1]; // ID of the current motif. int motif_position; // Position of the current motif occurrence. char *c; // Dummy to hold return of strtok. // Read the three values. if (sscanf((c=strtok(NULL, " "))?c:"", "%s", motif_id) != 1) { die("Can't read index of occurrence %d in sequence %s.", i_occur, sequence_id); } if (sscanf((c=strtok(NULL, " "))?c:"", "%d", &motif_position) != 1) { die("Can't read position of occurrence %d in sequence %s.", i_occur, sequence_id); } if (sscanf((c=strtok(NULL, " "))?c:"", "%f", &motif_p) != 1) { die("Can't read p-value of occurrence %d in sequence %s.", i_occur, sequence_id); } // Only include motifs that have been retained if (have_motif(motif_id, num_motifs, motifs)) { // Make sure we have strand information in the ID. 
if (has_reverse_strand) { add_strand(motif_id); } // Record this occurrence. record_occurrence(sequence_id, motif_id, p_threshold, motif_p, prev_motif, &prev_position, motif_position, *transp_freq, *spacer_ave, new_order, num_motifs, motifs); /* Motifs are stored in order of their motif IDs, but they are indexed from zero rather than one. */ prev_position = motif_position + (motifs[find_matrix_location(motifs, motif_id, num_motifs) - 1]).length; } } assert(seq_length >= prev_position); // Record the transition to the end state. record_occurrence(sequence_id, END_TRANSITION, p_threshold, motif_p, prev_motif, &prev_position, seq_length, *transp_freq, *spacer_ave, new_order, num_motifs, motifs); // Decide whether to keep the new order object. if (find_order) { if ((get_num_distinct(new_order) > get_num_distinct(*order_spacing)) || (((get_num_distinct(new_order) == get_num_distinct(*order_spacing)) && (get_pvalue(new_order) < get_pvalue(*order_spacing))))) { if (verbosity > NORMAL_VERBOSE) { fprintf(stderr, "Storing order from sequence %s (%g < %g).\n", sequence_id, get_pvalue(new_order), get_pvalue(*order_spacing)); print_order_and_spacing(stderr, new_order); } free_order(*order_spacing); *order_spacing = new_order; } else { free_order(new_order); } } } }
/* * Using the linreg test, * * this method returns the lowest scoring subdivision of a set of sequences for a given motif. * It's not self-contained, as it requires to hook into the global variables results, motifs, seq_ids. */ ramen_result_t* ramen_do_linreg_test(int motif_num) { //Assorted vars int seq_num; int j,k; int motif_index = motif_num * 2; //This is a workaround to the change in the motif datastructure where it now // goes +MOTIFA -MOTIFA +MOTIFB etc. rather than all + then all - motifs. //Vars for the regression double* x; double* y; double m = 0; double b = 0; double mse = 0; //Vars for scoring ramen_result_t* r; //Allocate memory or set initial values seq_num = get_num_strings(seq_ids); //number of sequences r = malloc(sizeof(ramen_result_t)); //allocate space, as a ptr to this will go in the array later //that's why we don't free it in this loop. x = malloc(sizeof(double)*seq_num); y = malloc(sizeof(double)*seq_num); //Now we need to copy the scores into two double arrays //Use LOG macro so that log(0) 'works' for (j=0; j < seq_num; j++) { if (args.log_fscores == TRUE) { y[j] = LOG(get_array_item(j, seq_fscores)); } else { y[j] = get_array_item(j, seq_fscores); } if (args.log_pwmscores == TRUE) { x[j] = LOG(results[motif_num][j]); } else { x[j] = results[motif_num][j]; } } //Switch x&y if they're to be switched if (args.linreg_switchxy) { SWAP(double*, x, y); } // TODO: Tidy and/or remove this for production if(args.linreg_dump_dir > 0) { FILE *fh; char* filename; filename = malloc(sizeof(char)*(strlen(args.linreg_dump_dir) + 50)); sprintf(filename, "%s/%s.tsv", args.linreg_dump_dir, get_motif_id(motif_at(motifs.motifs, motif_index))); fh = fopen(filename, "w"); fputs("PWM_Score\tFluorescence_Score\n", fh); for (j=0; j < seq_num; j++) { fprintf(fh, "%.10e %.10e\n", x[j], y[j]); } fclose(fh); free(filename); } /*extern double regress( int n, / number of points / double *x, / x values / double *y, / y values / double *m, / slope / double *b / y 
intercept / );*/ mse = regress(seq_num, x, y, &m, &b); if (args.verbose >= 3) { printf("LinReg MSE of motif %s on %i seqs: %.4g (m: %.4g b: %.4g)\n", get_motif_id(motif_at(motifs.motifs, motif_index)), seq_num, mse, m, b); } //Add to our motif list if lowest MSE r->motif_id = strdup(get_motif_id(motif_at(motifs.motifs, motif_index))); r->m = m; //Not p-values, but they'll do when we re-use this structure... r->b = b; r->mse = mse; r->p = -1; //Do stochastic sampling if required. if (args.repeats > 0) { int repeat_wins = 0; for (j=0;j<args.repeats;j++) { double repeat_mse = 0; shuffle(x,seq_num); //Shuffle and break the associations between x and y repeat_mse = regress(seq_num, x, y, &m, &b); //fprintf(stderr, "Motif %d Repeat %d RMSE: %g MSE: %g\n",motif_index,j,repeat_mse,mse); if (repeat_mse <= mse) { repeat_wins++; } } r->p = repeat_wins*1.0/ args.repeats*1.0; } free(x); free(y); return r; }
/*******************************************************************************
 * Report how many species strings a TRANSFAC motif carries.
 * A motif whose species list is NULL has zero species.
 ******************************************************************************/
int get_transfac_num_species(TRANSFAC_MOTIF_T *motif) {
  if (motif->species_list != NULL) {
    return get_num_strings(motif->species_list);
  }
  return 0;
}
/**************************************************************************** * Return a list containing the empirical column frequency distributions * for all alignments in the input. * * Each file in the list of filenames is read and the species list is * determined. The counts of each occurring column are tallied. * All files with the same species lists get their counts combined. * * The returned list contains one distribution per species list that * occurs in some alignment. ****************************************************************************/ OBJECT_LIST_T* get_alignment_column_freqs_list (ALPH_T alph, STRING_LIST_T* filenames, BOOLEAN_T remove_allgap_seqs) { int file_index; int num_filenames = get_num_strings(filenames); ARRAY_T* alignment_column_freqs = NULL; OBJECT_LIST_T* alignment_column_freqs_list = new_object_list(equal_string_lists, (void*)copy_string_list, free_string_list, free_array); // Consider each alignment in turn. for(file_index = 0; file_index < num_filenames; file_index++) { char* filename = get_nth_string(file_index, filenames); if (verbosity >= NORMAL_VERBOSE && !(file_index % 1)) { fprintf( stderr, "Computing column freqs: alignment file number %d of %d total files.\n", file_index+1, num_filenames ); } // Read the alignment int ref_seq_index = 0; ALIGNMENT_T* alignment = read_alignment_from_file(filename, TRUE, remove_allgap_seqs, &ref_seq_index); STRING_LIST_T* alignment_species = get_species_names(alignment); // Try to retrieve the counts so far for this list of species. alignment_column_freqs = (ARRAY_T*)retrieve_object( alignment_species, alignment_column_freqs_list ); // Found counts for current species list? if (alignment_column_freqs) { // Add counts from current alignment. (void) build_alignment_column_counts(alph, alignment, alignment_column_freqs); // Note: objects in lists are references, so no need to re-store // after modification. } // Didn't find counts for this species list, so create new array of counts. 
else { alignment_column_freqs = build_alignment_column_counts(alph, alignment, NULL); store_object( (void*)alignment_column_freqs, (void*)alignment_species, 0.0, // Score alignment_column_freqs_list ); } // free space used by alignment free_alignment(alignment); } // each filename fprintf(stderr, "\n"); // Convert counts to frequencies by retrieving each array of counts // and dividing by the total counts for that list of species. while ( (alignment_column_freqs = retrieve_next_object(alignment_column_freqs_list) ) != NULL ) { int i; int num_freqs = get_array_length(alignment_column_freqs); double total_counts; // Get total counts. for (i=total_counts=0; i<num_freqs; i++) { total_counts += get_array_item(i, alignment_column_freqs); } // Get frequencies. for (i=0; i<num_freqs; i++) { double f = get_array_item(i, alignment_column_freqs); set_array_item(i, f/total_counts, alignment_column_freqs); #ifdef DEBUG int asize = alph_size(alph, ALPH_SIZE); int num_leaves = NINT(log(num_freqs)/log(asize)); char* alignment_col = mm_malloc((num_leaves + 1) * sizeof(char)); unhash_alignment_col( alph, i, //col_index alignment_col, num_leaves ); printf("%s %g %g\n", alignment_col, f, f/total_counts); myfree(alignment_col); #endif } // get frequencies } // while more species lists return(alignment_column_freqs_list); } // get_alignment_column_freqs_list