/************************************************************************* * Converts the motif frequency matrix into a odds matrix: taken from old ama-scan.c *************************************************************************/ void convert_to_odds_matrix(MOTIF_T* motif, ARRAY_T* bg_freqs){ const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE); int motif_position_index,alph_index; MATRIX_T *freqs; freqs = get_motif_freqs(motif); const int num_motif_positions = get_num_rows(freqs); for (alph_index=0;alph_index<asize;++alph_index){ double bg_likelihood = get_array_item(alph_index, bg_freqs); for (motif_position_index=0;motif_position_index<num_motif_positions;++motif_position_index){ freqs->rows[motif_position_index]->items[alph_index] /= bg_likelihood; } } }
/************************************************************************* * Copies the motif frequency matrix and converts it into a odds matrix *************************************************************************/ MATRIX_T* create_odds_matrix(MOTIF_T *motif, ARRAY_T* bg_freqs){ const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE); int pos, aidx; MATRIX_T *odds; odds = duplicate_matrix(get_motif_freqs(motif)); const int num_pos = get_num_rows(odds); for (aidx = 0; aidx < asize; ++aidx) { double bg_likelihood = get_array_item(aidx, bg_freqs); for (pos = 0; pos < num_pos; ++pos) { odds->rows[pos]->items[aidx] /= bg_likelihood; } } return odds; }
/************************************************************************* * Output JSON data for a motif *************************************************************************/ static void output_motif_json(JSONWR_T* json, MOTIF_STATS_T* stats, SITE_COUNTS_T* counts) { //vars MOTIF_T *motif; MATRIX_T *freqs; int i, j, mlen, asize, end; motif = stats->motif; freqs = get_motif_freqs(motif); asize = alph_size(get_motif_alph(motif), ALPH_SIZE); jsonwr_start_object_value(json); jsonwr_lng_prop(json, "db", stats->db->id); jsonwr_str_prop(json, "id", get_motif_id(motif)); if (*(get_motif_id2(motif))) { jsonwr_str_prop(json, "alt", get_motif_id2(motif)); } mlen = get_motif_length(motif); jsonwr_lng_prop(json, "len", mlen); jsonwr_dbl_prop(json, "motif_evalue", get_motif_evalue(motif)); jsonwr_dbl_prop(json, "motif_nsites", get_motif_nsites(motif)); if (get_motif_url(motif) && *get_motif_url(motif)) { jsonwr_str_prop(json, "url", get_motif_url(motif)); } jsonwr_property(json, "pwm"); jsonwr_start_array_value(json); for (i = 0; i < mlen; i++) { jsonwr_start_array_value(json); for (j = 0; j < asize; j++) { jsonwr_dbl_value(json, get_matrix_cell(i, j, freqs)); } jsonwr_end_array_value(json); } jsonwr_end_array_value(json); jsonwr_lng_prop(json, "bin_width", stats->central_window+1); jsonwr_dbl_prop(json, "bin_sites", stats->central_sites); jsonwr_lng_prop(json, "total_sites", counts->total_sites); jsonwr_dbl_prop(json, "log_pvalue", stats->log_adj_pvalue); jsonwr_dbl_prop(json, "max_prob", stats->max_prob); jsonwr_property(json, "sites"); jsonwr_start_array_value(json); end = counts->allocated - (mlen - 1); for (i = (mlen - 1); i < end; i += 2) { jsonwr_dbl_value(json, counts->sites[i]); } jsonwr_end_array_value(json); jsonwr_end_object_value(json); }
/************************************************************************* * Entry point for pmp_bf *************************************************************************/ int main(int argc, char *argv[]) { char* bg_filename = NULL; char* motif_name = "motif"; // Use this motif name in the output. STRING_LIST_T* selected_motifs = NULL; double fg_rate = 1.0; double bg_rate = 1.0; double purine_pyrimidine = 1.0; // r double transition_transversion = 0.5; // R double pseudocount = 0.1; GAP_SUPPORT_T gap_support = SKIP_GAPS; MODEL_TYPE_T model_type = F81_MODEL; BOOLEAN_T use_halpern_bruno = FALSE; char* ustar_label = NULL; // TLB; create uniform star tree int i; program_name = "pmp_bf"; /********************************************** * COMMAND LINE PROCESSING **********************************************/ // Define command line options. (FIXME: Repeated code) // FIXME: Note that if you add or remove options you // must change n_options. int n_options = 12; cmdoption const pmp_options[] = { {"hb", NO_VALUE}, {"ustar", REQUIRED_VALUE}, {"model", REQUIRED_VALUE}, {"pur-pyr", REQUIRED_VALUE}, {"transition-transversion", REQUIRED_VALUE}, {"bg", REQUIRED_VALUE}, {"fg", REQUIRED_VALUE}, {"motif", REQUIRED_VALUE}, {"motif-name", REQUIRED_VALUE}, {"bgfile", REQUIRED_VALUE}, {"pseudocount", REQUIRED_VALUE}, {"verbosity", REQUIRED_VALUE} }; int option_index = 0; // Define the usage message. char usage[1000] = ""; strcat(usage, "USAGE: pmp [options] <tree file> <MEME file>\n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); // Evolutionary model parameters. strcat(usage, " --hb\n"); strcat(usage, " --model single|average|jc|k2|f81|f84|hky|tn"); strcat(usage, " (default=f81)\n"); strcat(usage, " --pur-pyr <float> (default=1.0)\n"); strcat(usage, " --transition-transversion <float> (default=0.5)\n"); strcat(usage, " --bg <float> (default=1.0)\n"); strcat(usage, " --fg <float> (default=1.0)\n"); // Motif parameters. strcat(usage, " --motif <id> (default=all)\n"); strcat(usage, " --motif-name <string> (default from motif file)\n"); // Miscellaneous parameters strcat(usage, " --bgfile <background> (default from motif file)\n"); strcat(usage, " --pseudocount <float> (default=0.1)\n"); strcat(usage, " --ustar <label>\n"); // TLB; create uniform star tree strcat(usage, " --verbosity [1|2|3|4] (default 2)\n"); strcat(usage, "\n Prints the FP and FN rate at each of 10000 score values.\n"); strcat(usage, "\n Output format: [<motif_id> score <score> FPR <fpr> TPR <tpr>]+\n"); // Parse the command line. if (simple_setopt(argc, argv, n_options, pmp_options) != NO_ERROR) { die("Error processing command line options: option name too long.\n"); } while (TRUE) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { (void) simple_getopterror(&message); die("Error processing command line options (%s)\n", message); } if (strcmp(option_name, "model") == 0) { if (strcmp(option_value, "jc") == 0) { model_type = JC_MODEL; } else if (strcmp(option_value, "k2") == 0) { model_type = K2_MODEL; } else if (strcmp(option_value, "f81") == 0) { model_type = F81_MODEL; } else if (strcmp(option_value, "f84") == 0) { model_type = F84_MODEL; } else if (strcmp(option_value, "hky") == 0) { model_type = HKY_MODEL; } else if (strcmp(option_value, "tn") == 0) { model_type = TAMURA_NEI_MODEL; } else if (strcmp(option_value, "single") == 0) { model_type = SINGLE_MODEL; } else if (strcmp(option_value, "average") == 0) { model_type = AVERAGE_MODEL; } else { die("Unknown model: %s\n", option_value); } } else if (strcmp(option_name, "hb") == 0){ use_halpern_bruno = TRUE; } else if (strcmp(option_name, "ustar") == 0){ // TLB; create uniform star tree ustar_label = option_value; } else if (strcmp(option_name, "pur-pyr") == 0){ purine_pyrimidine = atof(option_value); } else if (strcmp(option_name, "transition-transversion") == 0){ transition_transversion = atof(option_value); } else if (strcmp(option_name, "bg") == 0){ bg_rate = atof(option_value); } else if (strcmp(option_name, "fg") == 0){ fg_rate = atof(option_value); } else if (strcmp(option_name, "motif") == 0){ if (selected_motifs == NULL) { selected_motifs = new_string_list(); } add_string(option_value, selected_motifs); } else if (strcmp(option_name, "motif-name") == 0){ motif_name = option_value; } else if (strcmp(option_name, "bgfile") == 0){ bg_filename = option_value; } else if (strcmp(option_name, "pseudocount") == 0){ pseudocount = atof(option_value); } else if (strcmp(option_name, "verbosity") == 0){ verbosity = atoi(option_value); } } // Must have tree and motif file names if (argc != option_index + 2) { fprintf(stderr, "%s", usage); exit(EXIT_FAILURE); } /********************************************** * Read the phylogenetic tree. **********************************************/ char* tree_filename = NULL; TREE_T* tree = NULL; tree_filename = argv[option_index]; option_index++; tree = read_tree_from_file(tree_filename); // get the species names STRING_LIST_T* alignment_species = make_leaf_list(tree); char *root_label = get_label(tree); // in case target in center if (strlen(root_label)>0) add_string(root_label, alignment_species); //write_string_list(" ", alignment_species, stderr); // TLB; Convert the tree to a uniform star tree with // the target sequence at its center. if (ustar_label != NULL) { tree = convert_to_uniform_star_tree(tree, ustar_label); if (tree == NULL) die("Tree or alignment missing target %s\n", ustar_label); if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr, "Target %s placed at center of uniform (d=%.3f) star tree:\n", ustar_label, get_total_length(tree) / get_num_children(tree) ); write_tree(tree, stderr); } } /********************************************** * Read the motifs. **********************************************/ char* meme_filename = argv[option_index]; option_index++; int num_motifs = 0; MREAD_T *mread; ALPH_T alph; ARRAYLST_T *motifs; ARRAY_T *bg_freqs; mread = mread_create(meme_filename, OPEN_MFILE); mread_set_bg_source(mread, bg_filename); mread_set_pseudocount(mread, pseudocount); // read motifs motifs = mread_load(mread, NULL); alph = mread_get_alphabet(mread); bg_freqs = mread_get_background(mread); // check if (arraylst_size(motifs) == 0) die("No motifs in %s.", meme_filename); // TLB; need to resize bg_freqs array to ALPH_SIZE items // or copy array breaks in HB mode. This throws away // the freqs for the ambiguous characters; int asize = alph_size(alph, ALPH_SIZE); resize_array(bg_freqs, asize); /************************************************************** * Compute probability distributions for each of the selected motifs. **************************************************************/ int motif_index; for (motif_index = 0; motif_index < arraylst_size(motifs); motif_index++) { MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs); char* motif_id = get_motif_id(motif); char* bare_motif_id = motif_id; // We may have specified on the command line that // only certain motifs were to be used. if (selected_motifs != NULL) { if (*bare_motif_id == '+' || *bare_motif_id == '-') { // The selected motif id won't included a strand indicator. bare_motif_id++; } if (have_string(bare_motif_id, selected_motifs) == FALSE) { continue; } } if (verbosity >= NORMAL_VERBOSE) { fprintf( stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif) ); } // Build an array of evolutionary models for each position in the motif. EVOMODEL_T** models = make_motif_models( motif, bg_freqs, model_type, fg_rate, bg_rate, purine_pyrimidine, transition_transversion, use_halpern_bruno ); // Get the frequencies under the background model (row 0) // and position-dependent scores (rows 1..w) // for each possible alignment column. MATRIX_T* pssm_matrix = build_alignment_pssm_matrix( alph, alignment_species, get_motif_length(motif) + 1, models, tree, gap_support ); ARRAY_T* alignment_col_freqs = allocate_array(get_num_cols(pssm_matrix)); copy_array(get_matrix_row(0, pssm_matrix), alignment_col_freqs); remove_matrix_row(0, pssm_matrix); // throw away first row //print_col_frequencies(alph, alignment_col_freqs); // // Get the position-dependent null model alignment column frequencies // int w = get_motif_length(motif); int ncols = get_num_cols(pssm_matrix); MATRIX_T* pos_dep_bkg = allocate_matrix(w, ncols); for (i=0; i<w; i++) { // get the evo model corresponding to this column of the motif // and store it as the first evolutionary model. myfree(models[0]); // Use motif PSFM for equilibrium freqs. for model. ARRAY_T* site_specific_freqs = allocate_array(asize); int j = 0; for(j = 0; j < asize; j++) { double value = get_matrix_cell(i, j, get_motif_freqs(motif)); set_array_item(j, value, site_specific_freqs); } if (use_halpern_bruno == FALSE) { models[0] = make_model( model_type, fg_rate, transition_transversion, purine_pyrimidine, site_specific_freqs, NULL ); } else { models[0] = make_model( model_type, fg_rate, transition_transversion, purine_pyrimidine, bg_freqs, site_specific_freqs ); } // get the alignment column frequencies using this model MATRIX_T* tmp_pssm_matrix = build_alignment_pssm_matrix( alph, alignment_species, 2, // only interested in freqs under bkg models, tree, gap_support ); // assemble the position-dependent background alignment column freqs. set_matrix_row(i, get_matrix_row(0, tmp_pssm_matrix), pos_dep_bkg); // chuck the pssm (not his real name) free_matrix(tmp_pssm_matrix); } // // Compute and print the score distribution under the background model // and under the (position-dependent) motif model. // int range = 10000; // 10^4 gives same result as 10^5, but 10^3 differs // under background model PSSM_T* pssm = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range); // under position-dependent background (motif) model PSSM_T* pssm_pos_dep = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range); get_pv_lookup_pos_dep( pssm_pos_dep, pos_dep_bkg, NULL // no priors used ); // print FP and FN distributions int num_items = get_pssm_pv_length(pssm_pos_dep); for (i=0; i<num_items; i++) { double pvf = get_pssm_pv(i, pssm); double pvt = get_pssm_pv(i, pssm_pos_dep); double fpr = pvf; double fnr = 1 - pvt; if (fpr >= 0.99999 || fnr == 0) continue; printf("%s score %d FPR %.3g FNR %.3g\n", motif_id, i, fpr, fnr); } // free stuff free_pssm(pssm); free_pssm(pssm_pos_dep); if (models != NULL) { int model_index; int num_models = get_motif_length(motif) + 1; for (model_index = 0; model_index < num_models; model_index++) { free_model(models[model_index]); } myfree(models); } } // motif arraylst_destroy(destroy_motif, motifs); /********************************************** * Clean up. **********************************************/ // TLB may have encountered a memory corruption bug here // CEG has not been able to reproduce it. valgrind says all is well. free_array(bg_freqs); free_tree(TRUE, tree); free_string_list(selected_motifs); return(0); } // main
/************************************************************************* * Build a completely connected HMM. *************************************************************************/ void build_complete_hmm (ARRAY_T* background, int spacer_states, MOTIF_T *motifs, int nmotifs, MATRIX_T *transp_freq, MATRIX_T *spacer_ave, BOOLEAN_T fim, MHMM_T **the_hmm) { ALPH_T alph; int motif_states; // Total length of the motifs. int num_spacers; // Total number of spacer states. int num_states; // Total number of states in the model. int i_motif; // Index of the current "from" motif. int j_motif; // Index of the current "to" motif. int i_position; // Index within the current motif or spacer. int i_state = 0; // Index of the current state. assert(nmotifs > 0); alph = get_motif_alph(motifs);// get the alphabet from the first motif // Count the width of the motifs. for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++) motif_states += get_motif_length(motif_at(motifs, i_motif)); // Count the spacer states adjacent to begin and end. num_spacers = nmotifs * 2; // Add the spacer states between motifs. num_spacers += nmotifs * nmotifs; // Total states = motifs + spacer_states + begin/end num_states = motif_states + (num_spacers * spacer_states) + 2; // Allocate the model. *the_hmm = allocate_mhmm(alph, num_states); // Record that this is a completely connected model. (*the_hmm)->type = COMPLETE_HMM; // Record the number of motifs in the model. (*the_hmm)->num_motifs = nmotifs; // Record the number of states in the model. (*the_hmm)->num_states = num_states; (*the_hmm)->num_spacers = ((nmotifs + 1) * (nmotifs + 1)) - 1; (*the_hmm)->spacer_states = spacer_states; // Put the background distribution into the model. copy_array(background, (*the_hmm)->background); // Build the begin state. build_complete_state( START_STATE, i_state, alph, 0, // expected length NULL, // Emissions. 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, 0, // previous motif 0, // next motif transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); i_state++; int from_motif_state, to_motif_state; // Build the spacer states. No transitions from the end state. for (i_motif = 0; i_motif <= nmotifs; i_motif++) { // No transitions to the start state. for (j_motif = 1; j_motif <= nmotifs+1; j_motif++) { // No transitions from start to end. if ((i_motif == 0) && (j_motif == nmotifs+1)) continue; // Allow multi-state spacers. for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { build_complete_state( SPACER_STATE, i_state, alph, get_matrix_cell(i_motif, j_motif, spacer_ave), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, nmotifs, i_motif, j_motif, transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); } } } // Build the motif states. for (i_motif = 0; i_motif < nmotifs; i_motif++) { MOTIF_T *this_motif = motif_at(motifs, i_motif); STATE_T state; for (i_position = 0; i_position < get_motif_length(this_motif); i_position++, i_state++) { if (i_position == 0) { state = START_MOTIF_STATE; } else if (i_position == (get_motif_length(this_motif) - 1)) { state = END_MOTIF_STATE; } else { state = MID_MOTIF_STATE; } build_complete_state( MID_MOTIF_STATE, i_state, alph, 0, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(this_motif)), get_motif_nsites(this_motif), i_motif, i_position, nmotifs, 0, // Previous motif index. 0, // Next motif index. transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); } } // Build the end state. build_complete_state( END_STATE, i_state, alph, 0, // Expected spacer length. NULL, // Emissions 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, 0, // Previous motif index. 0, // Next motif index. transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); i_state++; // Convert spacers to FIMs if requested. if (fim) { convert_to_fims(*the_hmm); } // Fill in the transition matrix. build_transition_matrix(*the_hmm); }
/************************************************************************* * Build a linear HMM. *************************************************************************/ void build_linear_hmm (ARRAY_T* background, ORDER_T* order_spacing, int spacer_states, RBTREE_T* motifs, // motifs with key as in order_spacing BOOLEAN_T fim, MHMM_T** the_hmm) { ALPH_T alph; int model_length; // Total number of states in the model. int i_state; // Index of the current state. int i_order; // Index within the order and spacing. int i_position; // Index within the current motif or spacer. int motif_i; // motif key in order spacing MOTIF_T *motif; // motif RBNODE_T *node; alph = get_motif_alph((MOTIF_T*)rbtree_value(rbtree_first(motifs))); // Calculate the total length of the model. model_length = 2; // start and end state for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) { motif_i = get_order_motif(order_spacing, i_order); motif = (MOTIF_T*)rbtree_get(motifs, &motif_i); model_length += get_motif_length(motif); } model_length += (get_order_occurs(order_spacing) + 1) * spacer_states; // Allocate the model. *the_hmm = allocate_mhmm(alph, model_length); check_sq_matrix((*the_hmm)->trans, model_length); // Record that this is a linear model. (*the_hmm)->type = LINEAR_HMM; // Record the number of motifs in the model. // It doesn't want the distinct count (*the_hmm)->num_motifs = get_order_occurs(order_spacing); // Record the number of states in the model. (*the_hmm)->num_states = model_length; (*the_hmm)->num_spacers = get_order_occurs(order_spacing) + 1; (*the_hmm)->spacer_states = spacer_states; // Put the background distribution into the model. copy_array(background, (*the_hmm)->background); // Begin the model with a non-emitting state. i_state = 0; check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, START_STATE, i_state, get_spacer_length(order_spacing, 0), NULL, // Emissions. 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, // position within state (not relevant to start state) NULL, // no motif &((*the_hmm)->states[i_state])); ++i_state; // Build the first spacer. for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, SPACER_STATE, i_state, get_spacer_length(order_spacing, 0), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, // position within spacer NULL, // no motif &((*the_hmm)->states[i_state])); } // Build each motif and subsequent spacer. for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) { STATE_T state; int spacer_len; motif_i = get_order_motif(order_spacing, i_order); motif = (MOTIF_T*)rbtree_get(motifs, &motif_i); // Build the motif. for (i_position = 0; i_position < get_motif_length(motif); i_position++, i_state++) { if (i_position == 0) { state = START_MOTIF_STATE; spacer_len = get_spacer_length(order_spacing, i_order); } else if (i_position == (get_motif_length(motif) - 1)) { state = END_MOTIF_STATE; spacer_len = get_spacer_length(order_spacing, i_order+1); } else { state = MID_MOTIF_STATE; spacer_len = 0; } check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, state, i_state, spacer_len, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(motif)), get_motif_nsites(motif), i_order, i_position, // position within motif (middle) motif, &((*the_hmm)->states[i_state])); } // Build the following spacer. for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, SPACER_STATE, i_state, get_spacer_length(order_spacing, i_order+1), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, // position within spacer NULL, // no motif &((*the_hmm)->states[i_state])); } } check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); // Finish up the model with a non-emitting end state. build_linear_state( alph, END_STATE, i_state, get_spacer_length(order_spacing, i_order), NULL, // Emissions. 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, // position within state (not relevant to end state) NULL, // no motif &((*the_hmm)->states[i_state])); ++i_state; assert(i_state == model_length); check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); // Convert spacers to FIMs if requested. if (fim) { convert_to_fims(*the_hmm); } // Fill in the transition matrix. build_transition_matrix(*the_hmm); }
/************************************************************************* * Build a star topology HMM. *************************************************************************/ void build_star_hmm (ARRAY_T* background, int spacer_states, MOTIF_T* motifs, int nmotifs, BOOLEAN_T fim, MHMM_T** the_hmm) { ALPH_T alph; int motif_states; /* Total length of the motifs. */ int num_spacers; /* Total number of spacer states. */ int num_states; /* Total number of states in the model. */ int i_motif; /* Index of the current "from" motif. */ int i_position; /* Index within the current motif or spacer. */ int i_state = 0; /* Index of the current state. */ alph = get_motif_alph(motif_at(motifs, 0)); /* Count the width of the motifs. */ for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++) motif_states += get_motif_length(motif_at(motifs, i_motif)); // Only 1 spacer. num_spacers = 1; /* Total states = motifs + spacer_states + begin/end */ num_states = motif_states + (num_spacers * spacer_states) + 2; /* fprintf(stderr, "motif_states=%d num_spacers=%d num_states=%d\n", motif_states, num_spacers, num_states); */ /* Allocate the model. */ *the_hmm = allocate_mhmm(alph, num_states); /* Record that this is a star model. */ (*the_hmm)->type = STAR_HMM; /* Record the number of motifs in the model. */ (*the_hmm)->num_motifs = nmotifs; /* Record the number of states in the model. */ (*the_hmm)->num_states = num_states; (*the_hmm)->num_spacers = 1; (*the_hmm)->spacer_states = spacer_states; // Put the background distribution into the model. copy_array(background, (*the_hmm)->background); /* Build the begin state. */ build_star_state( alph, START_STATE, i_state, 0, // expected length NULL, 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, spacer_states, motifs, &((*the_hmm)->states[i_state]) ); i_state++; // Build the spacer state (state 0). Allow multi-state spacers. for (i_position = 0; i_position < spacer_states; i_position++) { build_star_state( alph, SPACER_STATE, i_state, DEFAULT_SPACER_LENGTH, background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, nmotifs, spacer_states, motifs, &((*the_hmm)->states[i_state]) ); i_state++; } /* Build the motif states. */ for (i_motif = 0; i_motif < nmotifs; i_motif++) { MOTIF_T *this_motif = motif_at(motifs, i_motif); assert(get_motif_length(this_motif) > 1); i_position = 0; build_star_state( alph, START_MOTIF_STATE, i_state, 0, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(this_motif)), get_motif_nsites(this_motif), i_motif, i_position, nmotifs, spacer_states, motifs, &((*the_hmm)->states[i_state]) ); i_state++; for (i_position = 1; i_position < get_motif_length(this_motif) - 1; i_position++) { build_star_state( alph, MID_MOTIF_STATE, i_state, 0, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(this_motif)), get_motif_nsites(this_motif), i_motif, i_position, nmotifs, spacer_states, motifs, &((*the_hmm)->states[i_state]) ); i_state++; } build_star_state( alph, END_MOTIF_STATE, i_state, 0, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(this_motif)), get_motif_nsites(this_motif), i_motif, i_position, nmotifs, spacer_states, motifs, &((*the_hmm)->states[i_state]) ); i_state++; } /* Build the end state. */ build_star_state( alph, END_STATE, i_state, 0, // Expected spacer length. NULL, // Emissions 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, spacer_states, motifs, &((*the_hmm)->states[i_state]) ); i_state++; /* Convert spacers to FIMs if requested. */ if (fim) { convert_to_fims(*the_hmm); } /* Fill in the transition matrix. */ build_transition_matrix(*the_hmm); } // build_star_hmm
void ramen_scan_sequences() { FILE* seq_file = NULL; MOTIF_T* motif = NULL; MOTIF_T* rev_motif = NULL; SEQ_T* sequence = NULL; SCANNED_SEQUENCE_T* scanned_seq = NULL; PATTERN_T* pattern; int i; int j; SEQ_T** seq_list; int num_seqs; int seq_len; //For the bdb_bg mode: ARRAY_T* seq_bg_freqs; double atcontent; double roundatcontent; double avg_seq_length = 0; //Open the file. if (open_file(args.sequence_filename, "r", FALSE, "FASTA", "sequences", &seq_file) == 0) { fprintf(stderr, "Couldn't open the file %s.\n", args.sequence_filename); ramen_terminate(1); } //Start reading in the sequences read_many_fastas(ramen_alph, seq_file, MAX_SEQ_LENGTH, &num_seqs, &seq_list); seq_ids = new_string_list(); seq_fscores = allocate_array(num_seqs); //Allocate the required space for results results = malloc(sizeof(double*) * motifs.num); for (i=0;i<motifs.num;i++) { results[i] = malloc(sizeof(double)*num_seqs); } for (j=0;j<num_seqs;j++) { fprintf(stderr, "\rScanning %i of %i sequences...", j+1, num_seqs); //copy the pointer into our current object for clarity sequence = seq_list[j]; //Read the fluorescence data from the description field. add_string(get_seq_name(sequence),seq_ids); seq_len = get_seq_length(sequence); set_array_item(j,atof(get_seq_description(sequence)),seq_fscores); //Scan with each motif. for (i=0;i<motifs.num;i++) { int motifindex = i*2; results[i][j] = ramen_sequence_scan(sequence, motif_at(motifs.motifs, motifindex), motif_at(motifs.motifs, motifindex+1), NULL, NULL, //No need to pass PSSM. AVG_ODDS, 0, TRUE, 0, motifs.bg_freqs); if (TRUE == args.linreg_normalise) { int k; double maxscore = 1; motif = motif_at(motifs.motifs,motifindex); for (k=0;k<get_motif_length(motif);k++) { double maxprob = 0; if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif)); if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif))) maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif)); maxscore *= maxprob; } results[i][j] /= maxscore; } } } }
/************************************************************************* * Calculate the odds score for each motif-sized window at each * site in the sequence using the given nucleotide frequencies. * * This function is a lightweight version based on the one contained in * motiph-scoring. Several calculations that are unnecessary for gomo * have been removed in order to speed up the process *************************************************************************/ double score_sequence( SEQ_T* seq, // sequence to scan (IN) MOTIF_T* motif, // motif already converted to odds values (IN) PSSM_T* pssm, // motif pssm char* feature_type, // motif name (IN) char* seq_name, // sequence name (IN) int method, // method used for scoring (IN) double threshold, // Threshold to use in TOTAL_HITS mode with a PWM ARRAY_T* bg_freqs //background model ) { assert(seq != NULL); assert(motif != NULL); // Get data for sequence if (seq_name == NULL) { seq_name = get_seq_name(seq); } char* raw_seq = get_raw_sequence(seq); int seq_length = get_seq_length(seq); // Get the pv lookup table ARRAY_T* pv_lookup = NULL; if (NULL != pssm) { pv_lookup = pssm->pv; assert(get_array_length(pv_lookup) > 0); } // Prepare storage for the string representing the portion // of the reference sequence within the window. char* window_seq = (char *) mm_malloc(sizeof(char) * (get_motif_length(motif) + 1)); window_seq[get_motif_length(motif)] = '\0'; // Get a character for the strand. char strand = 0; if (get_motif_id(motif)!= NULL) { get_motif_strand(motif); } else { strand = '.'; } int max_index = seq_length - get_motif_length(motif); const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE); double* odds = (double*) mm_malloc(sizeof(double)*max_index); double* scaled_log_odds = (double*) mm_malloc(sizeof(double)*max_index); // For each site in the sequence int seq_index; for (seq_index = 0; seq_index < max_index; seq_index++) { BOOLEAN_T skip_window = FALSE; BOOLEAN_T found_gap = FALSE; BOOLEAN_T found_ambiguity = FALSE; double odd = 1.0; scaled_log_odds[seq_index] = 0; // For each site in the motif window int motif_position; for (motif_position = 0; motif_position < get_motif_length(motif); motif_position++) { char c = raw_seq[seq_index + motif_position]; window_seq[motif_position] = c; // Check for gaps at this site if(c == '-' || c == '.') { found_gap = TRUE; skip_window = TRUE; break; } // Check for ambiguity codes at this site //TODO: This next call is very expensive - it takes up approx. 10% of a // programme's running time. It should be fixed up somehow. int aindex = alph_index(get_motif_alph(motif), c); if (aindex > asize) { found_ambiguity = TRUE; skip_window = TRUE; break; } if (method == TOTAL_HITS) { //If we're in this mode, then we're using LOG ODDS. //scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, get_motif_freqs(motif)); scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, pssm->matrix); } else { odd *= get_matrix_cell(motif_position, aindex, get_motif_freqs(motif)); } } odds[seq_index] = odd; } // return odds as requested (MAX or AVG scoring) double requested_odds = 0.0; if (method == AVG_ODDS){ for (seq_index = 0; seq_index < max_index; seq_index++) { requested_odds += odds[seq_index]; } requested_odds /= max_index; } else if (method == MAX_ODDS){ for (seq_index = 0; seq_index < max_index; seq_index++) { if (odds[seq_index] > requested_odds){ requested_odds = odds[seq_index]; } } } else if (method == SUM_ODDS) { for (seq_index = 0; seq_index < max_index; seq_index++) { requested_odds += odds[seq_index]; } } else if (method == TOTAL_HITS) { for (seq_index = 0; seq_index < max_index; seq_index++) { if (scaled_log_odds[seq_index] >= (double)get_array_length(pv_lookup)) { scaled_log_odds[seq_index] = (double)(get_array_length(pv_lookup) - 1); } double pvalue = get_array_item((int) scaled_log_odds[seq_index], pv_lookup); //Figure out how to calculate the p-value of a hit //fprintf(stderr, "m: %s pv_l len: %i scaled_log_odds: %g seq index: %i pvalue: %g\n", // get_motif_id(motif), get_array_length(pv_lookup), scaled_log_odds[seq_index], seq_index, pvalue); if (pvalue < threshold) { requested_odds++; //Add another hit. } if (verbosity > HIGHER_VERBOSE) { fprintf(stderr, "Window Data: %s\t%s\t%i\t%g\t%g\t%g\n", get_seq_name(seq), get_motif_id(motif), seq_index, scaled_log_odds[seq_index], pvalue, threshold); } } } myfree(odds); myfree(scaled_log_odds); myfree(window_seq); return requested_odds; }