/***********************************************************************
 * Reverse-complement a motif in place.
 *
 * Rows of the frequency matrix are paired from the two ends inwards.
 * For each pair the front row is saved, then each row is overwritten
 * with the complement of its partner. The left/right trim counts are
 * exchanged afterwards.
 *
 * TODO the scores matrix is not handled here, and it should be.
 ***********************************************************************/
void reverse_complement_motif
  (MOTIF_T* a_motif)
{
  int pair;              // Index of the row pair being processed.
  int pair_count;        // Number of row pairs (middle row included).
  int saved_trim;
  ARRAY_T* front_row;
  ARRAY_T* back_row;
  ARRAY_T* scratch;      // Holds the front row while it is overwritten.

  // The scratch row is sized from the motif itself rather than from the
  // global alphabet size, which assumes DNA ambiguity codes that MEME
  // does not seem to use.
  scratch = allocate_array(a_motif->alph_size + a_motif->ambigs);

  // For an odd length the last pair is the middle row paired with itself.
  pair_count = (int)((a_motif->length + 1) / 2);
  for (pair = 0; pair < pair_count; pair++) {
    front_row = get_matrix_row(pair, a_motif->freqs);
    back_row = get_matrix_row(a_motif->length - (pair + 1), a_motif->freqs);

    // Save the front row before it is overwritten.
    copy_array(front_row, scratch);

    // Complement back -> front, then the saved front -> back.
    complement_dna_freqs(back_row, front_row);
    complement_dna_freqs(scratch, back_row);
  }
  free_array(scratch);

  // The trimmed ends change sides along with the rows.
  saved_trim = a_motif->trim_left;
  a_motif->trim_left = a_motif->trim_right;
  a_motif->trim_right = saved_trim;
}
/*
 * Copy one row of `from_matrix` into a row of `to_matrix`.
 *
 * from_matrix, to_matrix   matrices handed to get_matrix_row()
 * from_row_idx, to_row_idx row indices in the source / destination
 * num_cols                 number of int elements per row
 *
 * Both row pointers are obtained from get_matrix_row() with the same
 * `num_cols`. A non-positive `num_cols` copies nothing. memmove() is used
 * instead of memcpy() so the call stays well defined when the two rows
 * share storage (for example a row copied onto itself).
 */
void copy_row(int *from_matrix, int *to_matrix, int from_row_idx, int to_row_idx, int num_cols)
{
  int *from_row;
  int *to_row;

  /* A negative count would turn into a huge size_t in the byte count. */
  if (num_cols <= 0) {
    return;
  }

  from_row = get_matrix_row(from_matrix, from_row_idx, num_cols);
  to_row = get_matrix_row(to_matrix, to_row_idx, num_cols);

  memmove(to_row, from_row, sizeof(int) * (size_t)num_cols);
}
/**************************************************************************
 *
 * reverse_complement_pssm
 *
 * Reverse-complement a PSSM matrix in place: rows are paired from the two
 * ends inwards and each row is overwritten with the complement of its
 * partner. `alph` is only used to size the scratch row (ALL_SIZE).
 *
 *************************************************************************/
static void reverse_complement_pssm (
  ALPH_T alph,
  MATRIX_T* pssm_matrix
)
{
  const int row_count = get_num_rows(pssm_matrix);
  const int pair_count = (int)((row_count + 1) / 2);
  int pair;
  ARRAY_T* front_row;
  ARRAY_T* back_row;
  ARRAY_T* scratch;   // Holds the front row while it is overwritten.

  scratch = allocate_array(alph_size(alph, ALL_SIZE));

  // For an odd row count the last pair is the middle row with itself.
  for (pair = 0; pair < pair_count; pair++) {
    front_row = get_matrix_row(pair, pssm_matrix);
    back_row = get_matrix_row(row_count - (pair + 1), pssm_matrix);

    // Save the front row, then complement back -> front and
    // the saved front -> back.
    copy_array(front_row, scratch);
    complement_dna_freqs(back_row, front_row);
    complement_dna_freqs(scratch, back_row);
  }

  free_array(scratch);

} // reverse_complement_pssm
/*********************************************************************** * Turn a given motif into its own reverse complement. ***********************************************************************/ void reverse_complement_motif (MOTIF_T* a_motif) { ALPH_SIZE_T size; int i, temp_trim; ARRAY_T* left_freqs; ARRAY_T* right_freqs; ARRAY_T* temp_freqs; // Temporary space during swap. assert(a_motif->alph == DNA_ALPH); // Allocate space. size = (a_motif->flags & MOTIF_HAS_AMBIGS ? ALL_SIZE : ALPH_SIZE); temp_freqs = allocate_array(alph_size(a_motif->alph, size)); // Consider each row (position) in the motif. for (i = 0; i < (int)((a_motif->length + 1) / 2); i++) { left_freqs = get_matrix_row(i, a_motif->freqs); right_freqs = get_matrix_row(a_motif->length - (i + 1), a_motif->freqs); // Make a temporary copy of one row. copy_array(left_freqs, temp_freqs); // Compute reverse complements in both directions. complement_dna_freqs(right_freqs, left_freqs); complement_dna_freqs(temp_freqs, right_freqs); } free_array(temp_freqs); if (a_motif->scores) { // Allocate space. temp_freqs = allocate_array(alph_size(a_motif->alph, ALPH_SIZE)); // Consider each row (position) in the motif. for (i = 0; i < (int)((a_motif->length + 1) / 2); i++) { left_freqs = get_matrix_row(i, a_motif->scores); right_freqs = get_matrix_row(a_motif->length - (i + 1), a_motif->scores); // Make a temporary copy of one row. copy_array(left_freqs, temp_freqs); // Compute reverse complements in both directions. complement_dna_freqs(right_freqs, left_freqs); complement_dna_freqs(temp_freqs, right_freqs); } free_array(temp_freqs); } //swap the trimming variables temp_trim = a_motif->trim_left; a_motif->trim_left = a_motif->trim_right; a_motif->trim_right = temp_trim; //swap the strand indicator //this assumes a ? is equalivant to + if (get_motif_strand(a_motif) == '-') { set_motif_strand('+', a_motif); } else { set_motif_strand('-', a_motif); } }
/*********************************************************************** * Convert transition counts to transition probabilities, and compute * average spacer lengths. * * Each matrix is indexed 0 ... n+1, where n is the number of motifs. * The entry at [i,j] corresponds to the transition from motif i to * motif j. Hence, after normalization, each row in the transition * matrix should sum to 1. ***********************************************************************/ static void normalize_spacer_counts( double trans_pseudo, double spacer_pseudo, // Pseudocount for self-loop. BOOLEAN_T keep_unused, MATRIX_T* transp_freq, MATRIX_T* spacer_ave ) { int i_row; int i_col; int num_rows; double total_spacer; double num_transitions; double ave_spacer; /* Divide the spacer lengths by the number of occurrences. */ num_rows = get_num_rows(transp_freq); for (i_row = 0; i_row < num_rows; i_row++) { for (i_col = 0; i_col < num_rows; i_col++) { total_spacer = get_matrix_cell(i_row, i_col, spacer_ave) + spacer_pseudo; num_transitions = get_matrix_cell(i_row, i_col, transp_freq); if (spacer_pseudo > 0) num_transitions++; if (num_transitions != 0.0) { ave_spacer = total_spacer / num_transitions; set_matrix_cell(i_row, i_col, ave_spacer, spacer_ave); } } } // Add pseudocounts. for (i_row = 0; i_row < num_rows; i_row++) { for (i_col = 0; i_col < num_rows; i_col++) { // Force some transitions to zero. if (// No transitions to the start state. (i_col == 0) || // No transitions from the end state. (i_row == num_rows - 1) || // No transition from start to end. ((i_row == 0) && (i_col == num_rows - 1))) { set_matrix_cell(i_row, i_col, 0.0, transp_freq); } else { // Only increment the used transitions. if ((keep_unused) || (get_matrix_cell(i_row, i_col, transp_freq) > 0.0)) { incr_matrix_cell(i_row, i_col, trans_pseudo, transp_freq); } } } } // Normalize rows. 
for (i_row = 0; i_row < num_rows - 1; i_row++) { if (array_total(get_matrix_row(i_row, transp_freq)) > 0.0) { normalize(SLOP, get_matrix_row(i_row, transp_freq)); } } }
/***********************************************************************
 * Sanity-check a square transition matrix.
 *
 * For every state except the last ("end") state:
 *   - the row total must be one of the accepted values
 *       linear form: 1.0, 2.0 (FIMs) or 0.0 (inaccessible motifs);
 *       log form:    0.0, 1.0 (FIMs), or a total whose EXP2() is 0.0;
 *   - the cell in the end-state row for that column must be zero
 *     (log form: not greater than LOG_SMALL).
 *
 * Cf. Rabiner, formula (43b), p. 265.
 *
 * On the first violation a warning is written to stderr and FALSE is
 * returned; TRUE is returned if no violation is found.
 ***********************************************************************/
BOOLEAN_T verify_trans_matrix
  (BOOLEAN_T log_form,    /* Is the transition matrix in log form? */
   int       num_states,  /* Number of states in the (square) matrix. */
   MATRIX_T* trans)       /* The matrix. */
{
  const int end_state = num_states - 1;
  int state;
  PROB_T row_total;

  for (state = 0; state < end_state; state++) {

    /* Row total check. */
    if (log_form) {
      row_total = log_array_total(get_matrix_row(state, trans));
      if (!(almost_equal(row_total, 0.0, SLOP)
            || almost_equal(row_total, 1.0, SLOP)           // Allow for FIMs.
            || almost_equal(EXP2(row_total), 0.0, SLOP))) {
        fprintf(stderr,
          "Warning: Row %d of transition matrix differs from 0.0 by %g.\n",
          state, EXP2(row_total));
        return(FALSE);
      }
    } else {
      row_total = array_total(get_matrix_row(state, trans));
      if (!(almost_equal(row_total, 1.0, SLOP)
            || almost_equal(row_total, 2.0, SLOP)    // Allow FIMs.
            || almost_equal(row_total, 0.0, SLOP))) { // Allow inaccessible motifs.
        fprintf(stderr,
          "Warning: Row %d of transition matrix differs from 1.0 by %g.\n",
          state, 1.0 - row_total);
        return(FALSE);
      }
    }

    /* All transitions from the end state must be zero. */
    if (log_form) {
      if (get_matrix_cell(end_state, state, trans) > LOG_SMALL) {
        fprintf(stderr,
          "Warning: Transition %d from end state is non-zero (%g).\n",
          state, get_matrix_cell(end_state, state, trans));
        return(FALSE);
      }
    } else {
      if (!almost_equal(get_matrix_cell(end_state, state, trans), 0.0, SLOP)) {
        fprintf(stderr,
          "Warning: Transition %d from end state is non-zero (%g).\n",
          state, get_matrix_cell(end_state, state, trans));
        return(FALSE);
      }
    }
  }

  return(TRUE);
}
/***********************************************************************
 * Compute the complexity of a motif (described by the original author
 * as a number between 0 and 1).
 *
 * The "motif background" is the mean of all rows of the frequency
 * matrix. The complexity is the average, over rows, of the K-L distance
 * between a row p and that background f:
 *
 *   \sum_i p_i log(p_i/f_i)
 *
 * which increases with increasing complexity. Only the first
 * get_alph_size(ALPH_SIZE) columns take part in the sum.
 ***********************************************************************/
double compute_motif_complexity
  (MOTIF_T* a_motif)
{
  const int width = get_alph_size(ALPH_SIZE);
  const int positions = get_num_rows(a_motif->freqs);
  ARRAY_T* mean_row;       // Mean emission distribution.
  double kl_sum = 0;
  int pos;
  int letter;

  // Column sums divided by the row count give the mean distribution.
  mean_row = get_matrix_col_sums(a_motif->freqs);
  scalar_mult(1.0 / (double)positions, mean_row);

  // Accumulate the K-L distance of every row w.r.t. the mean.
  for (pos = 0; pos < positions; pos++) {
    ARRAY_T* emission = get_matrix_row(pos, a_motif->freqs);
    for (letter = 0; letter < width; letter++) {
      ATYPE p = get_array_item(letter, emission);
      ATYPE f = get_array_item(letter, mean_row);

      // Two logs are used to avoid handling divide-by-zero as a special case.
      kl_sum += p * (my_log(p) - my_log(f));
    }
  }

  free_array(mean_row);
  return(kl_sum / (double)positions);
}
/*
 * Assert that `sq` is an expected_size x expected_size matrix: the row
 * count, the column count and the length of every row array must all
 * equal `expected_size`. The checks are plain assert()s.
 */
void check_sq_matrix(MATRIX_T *sq, int expected_size) {
  int row = 0;

  assert(get_num_rows(sq) == expected_size);
  assert(get_num_cols(sq) == expected_size);

  while (row < expected_size) {
    assert(get_array_length(get_matrix_row(row, sq)) == expected_size);
    ++row;
  }
}
/***********************************************************************
 * Normalize the motif's PSPM.
 *
 * For each of the motif->length rows of motif->freqs, the first
 * alph_size(motif->alph, ALPH_SIZE) entries are normalized with
 * normalize_subarray(), passing `tolerance` through unchanged.
 ***********************************************************************/
void normalize_motif
  (MOTIF_T *motif, double tolerance)
{
  const int core_size = alph_size(motif->alph, ALPH_SIZE);
  int pos;

  for (pos = 0; pos < motif->length; pos++) {
    ARRAY_T *row = get_matrix_row(pos, motif->freqs);
    normalize_subarray(0, core_size, tolerance, row);
  }
}
/***********************************************************************
 * Calculate the ambiguous letters from the concrete ones.
 *
 * The frequency matrix is resized to motif->length rows by
 * alph_size(motif->alph, ALL_SIZE) columns, the motif is flagged with
 * MOTIF_HAS_AMBIGS, and calc_ambigs() is then run on every row.
 ***********************************************************************/
void calc_motif_ambigs
  (MOTIF_T *motif)
{
  const int full_width = alph_size(motif->alph, ALL_SIZE);
  int pos = 0;

  resize_matrix(motif->length, full_width, 0, motif->freqs);
  motif->flags |= MOTIF_HAS_AMBIGS;

  while (pos < motif->length) {
    ARRAY_T *row = get_matrix_row(pos, motif->freqs);
    calc_ambigs(motif->alph, FALSE, row);
    ++pos;
  }
}
/*
 * Parser callback: store the four probabilities of one motif position.
 *
 * `ctx` is the CTX_T parser context. `pos` is used as row index pos - 1
 * of the current motif's frequency matrix, i.e. it is treated as
 * 1-based. A, C, G and T are written to columns 0, 1, 2 and 3.
 */
void dxml_handle_pos(void *ctx, int pos, double A, double C, double G, double T) {
  CTX_T *data = (CTX_T*)ctx;
  MOTIF_T *motif = data->motif;
  ARRAY_T *freq_row = get_matrix_row(pos - 1, motif->freqs);

  set_array_item(0, A, freq_row);
  set_array_item(1, C, freq_row);
  set_array_item(2, G, freq_row);
  set_array_item(3, T, freq_row);
}
/***************************************************************************** * MEME > motifs > motif > probabilities > alphabet_matrix > /alphabet_array * Check that all letters have a probability and update the current matrix row. ****************************************************************************/ void mxml_end_probability_pos(void *ctx) { CTX_T *data; ARRAY_T *pos; int i; data = (CTX_T*)ctx; pos = get_matrix_row(data->current_pos, data->mscope.motif->freqs); for (i = 0; i < get_array_length(pos); i++) { if (get_array_item(i, pos) == -1) { local_error(data, "Probability for letter %c in position %d is missing.\n", alph_char(data->alph, i), i + 1); } } data->current_pos++; }
/***********************************************************************
 * Information content of one position (row) of the motif:
 *
 *   my_log2(alph_size) - H,   H = -\sum_i p_i my_log2(p_i)
 *
 * where p is the frequency row at `position`.
 *
 * Assumes that alph_size does not include ambiguous characters.
 ***********************************************************************/
static inline double position_information_content(
  MOTIF_T *a_motif,
  int position
) {
  ARRAY_T *row = get_matrix_row(position, a_motif->freqs);
  double H = 0;      // Entropy of the row, built up term by term.
  double p;
  int letter;

  for (letter = 0; letter < a_motif->alph_size; ++letter) {
    p = get_array_item(letter, row);
    H -= p*my_log2(p);
  }

  return my_log2(a_motif->alph_size) - H;
}
/***********************************************************************
 * Convert a matrix of MEME scores into letter probabilities.
 *
 * A score s for a letter with background frequency bg corresponds to
 * the (pseudocount-smoothed) probability
 *
 *   p = (2 ^ (s / 100)) * bg
 *
 * The pseudocount contribution is then removed using site_count and
 * pseudo_count, the result is clamped to [0, 1], and each row is
 * normalized over the core alphabet.
 *
 * Returns a newly allocated matrix with one row per row of `scores`
 * and alph_size(alph, ALPH_SIZE) columns.
 ***********************************************************************/
MATRIX_T* convert_scores_into_freqs
  (ALPH_T alph,
   MATRIX_T *scores,
   ARRAY_T *bg,
   int site_count,
   double pseudo_count)
{
  int width, positions;
  int pos, letter;
  double weight;        // Sites plus pseudocount.
  double letter_bg;
  double s, p;
  MATRIX_T *result;

  assert(alph != INVALID_ALPH);
  assert(scores != NULL);
  assert(bg != NULL);

  positions = get_num_rows(scores);
  width = alph_size(alph, ALPH_SIZE);
  result = allocate_matrix(positions, width);
  weight = site_count + pseudo_count;

  // Column by column, so each background frequency is fetched once.
  for (letter = 0; letter < width; ++letter) {
    letter_bg = get_array_item(letter, bg);
    for (pos = 0; pos < positions; ++pos) {
      s = get_matrix_cell(pos, letter, scores);

      // Score -> smoothed probability.
      p = pow(2.0, s / 100.0) * letter_bg;

      // Take the pseudocount back out.
      p = ((p * weight) - (letter_bg * pseudo_count)) / site_count;

      // Clamp to a valid probability.
      if (p < 0) {
        p = 0;
      } else if (p > 1) {
        p = 1;
      }

      set_matrix_cell(pos, letter, p, result);
    }
  }

  // Rows are normalized once every cell has been filled in.
  for (pos = 0; pos < positions; ++pos) {
    normalize_subarray(0, width, 0.0, get_matrix_row(pos, result));
  }

  return result;
}
/************************************************************************* * Entry point for pmp_bf *************************************************************************/ int main(int argc, char *argv[]) { char* bg_filename = NULL; char* motif_name = "motif"; // Use this motif name in the output. STRING_LIST_T* selected_motifs = NULL; double fg_rate = 1.0; double bg_rate = 1.0; double purine_pyrimidine = 1.0; // r double transition_transversion = 0.5; // R double pseudocount = 0.1; GAP_SUPPORT_T gap_support = SKIP_GAPS; MODEL_TYPE_T model_type = F81_MODEL; BOOLEAN_T use_halpern_bruno = FALSE; char* ustar_label = NULL; // TLB; create uniform star tree int i; program_name = "pmp_bf"; /********************************************** * COMMAND LINE PROCESSING **********************************************/ // Define command line options. (FIXME: Repeated code) // FIXME: Note that if you add or remove options you // must change n_options. int n_options = 12; cmdoption const pmp_options[] = { {"hb", NO_VALUE}, {"ustar", REQUIRED_VALUE}, {"model", REQUIRED_VALUE}, {"pur-pyr", REQUIRED_VALUE}, {"transition-transversion", REQUIRED_VALUE}, {"bg", REQUIRED_VALUE}, {"fg", REQUIRED_VALUE}, {"motif", REQUIRED_VALUE}, {"motif-name", REQUIRED_VALUE}, {"bgfile", REQUIRED_VALUE}, {"pseudocount", REQUIRED_VALUE}, {"verbosity", REQUIRED_VALUE} }; int option_index = 0; // Define the usage message. char usage[1000] = ""; strcat(usage, "USAGE: pmp [options] <tree file> <MEME file>\n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); // Evolutionary model parameters. strcat(usage, " --hb\n"); strcat(usage, " --model single|average|jc|k2|f81|f84|hky|tn"); strcat(usage, " (default=f81)\n"); strcat(usage, " --pur-pyr <float> (default=1.0)\n"); strcat(usage, " --transition-transversion <float> (default=0.5)\n"); strcat(usage, " --bg <float> (default=1.0)\n"); strcat(usage, " --fg <float> (default=1.0)\n"); // Motif parameters. 
strcat(usage, " --motif <id> (default=all)\n"); strcat(usage, " --motif-name <string> (default from motif file)\n"); // Miscellaneous parameters strcat(usage, " --bgfile <background> (default from motif file)\n"); strcat(usage, " --pseudocount <float> (default=0.1)\n"); strcat(usage, " --ustar <label>\n"); // TLB; create uniform star tree strcat(usage, " --verbosity [1|2|3|4] (default 2)\n"); strcat(usage, "\n Prints the FP and FN rate at each of 10000 score values.\n"); strcat(usage, "\n Output format: [<motif_id> score <score> FPR <fpr> TPR <tpr>]+\n"); // Parse the command line. if (simple_setopt(argc, argv, n_options, pmp_options) != NO_ERROR) { die("Error processing command line options: option name too long.\n"); } while (TRUE) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { (void) simple_getopterror(&message); die("Error processing command line options (%s)\n", message); } if (strcmp(option_name, "model") == 0) { if (strcmp(option_value, "jc") == 0) { model_type = JC_MODEL; } else if (strcmp(option_value, "k2") == 0) { model_type = K2_MODEL; } else if (strcmp(option_value, "f81") == 0) { model_type = F81_MODEL; } else if (strcmp(option_value, "f84") == 0) { model_type = F84_MODEL; } else if (strcmp(option_value, "hky") == 0) { model_type = HKY_MODEL; } else if (strcmp(option_value, "tn") == 0) { model_type = TAMURA_NEI_MODEL; } else if (strcmp(option_value, "single") == 0) { model_type = SINGLE_MODEL; } else if (strcmp(option_value, "average") == 0) { model_type = AVERAGE_MODEL; } else { die("Unknown model: %s\n", option_value); } } else if (strcmp(option_name, "hb") == 0){ use_halpern_bruno = TRUE; } else if (strcmp(option_name, "ustar") == 0){ // TLB; create uniform star tree ustar_label = option_value; } else if (strcmp(option_name, "pur-pyr") == 0){ 
purine_pyrimidine = atof(option_value); } else if (strcmp(option_name, "transition-transversion") == 0){ transition_transversion = atof(option_value); } else if (strcmp(option_name, "bg") == 0){ bg_rate = atof(option_value); } else if (strcmp(option_name, "fg") == 0){ fg_rate = atof(option_value); } else if (strcmp(option_name, "motif") == 0){ if (selected_motifs == NULL) { selected_motifs = new_string_list(); } add_string(option_value, selected_motifs); } else if (strcmp(option_name, "motif-name") == 0){ motif_name = option_value; } else if (strcmp(option_name, "bgfile") == 0){ bg_filename = option_value; } else if (strcmp(option_name, "pseudocount") == 0){ pseudocount = atof(option_value); } else if (strcmp(option_name, "verbosity") == 0){ verbosity = atoi(option_value); } } // Must have tree and motif file names if (argc != option_index + 2) { fprintf(stderr, "%s", usage); exit(EXIT_FAILURE); } /********************************************** * Read the phylogenetic tree. **********************************************/ char* tree_filename = NULL; TREE_T* tree = NULL; tree_filename = argv[option_index]; option_index++; tree = read_tree_from_file(tree_filename); // get the species names STRING_LIST_T* alignment_species = make_leaf_list(tree); char *root_label = get_label(tree); // in case target in center if (strlen(root_label)>0) add_string(root_label, alignment_species); //write_string_list(" ", alignment_species, stderr); // TLB; Convert the tree to a uniform star tree with // the target sequence at its center. if (ustar_label != NULL) { tree = convert_to_uniform_star_tree(tree, ustar_label); if (tree == NULL) die("Tree or alignment missing target %s\n", ustar_label); if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr, "Target %s placed at center of uniform (d=%.3f) star tree:\n", ustar_label, get_total_length(tree) / get_num_children(tree) ); write_tree(tree, stderr); } } /********************************************** * Read the motifs. 
**********************************************/ char* meme_filename = argv[option_index]; option_index++; int num_motifs = 0; MREAD_T *mread; ALPH_T alph; ARRAYLST_T *motifs; ARRAY_T *bg_freqs; mread = mread_create(meme_filename, OPEN_MFILE); mread_set_bg_source(mread, bg_filename); mread_set_pseudocount(mread, pseudocount); // read motifs motifs = mread_load(mread, NULL); alph = mread_get_alphabet(mread); bg_freqs = mread_get_background(mread); // check if (arraylst_size(motifs) == 0) die("No motifs in %s.", meme_filename); // TLB; need to resize bg_freqs array to ALPH_SIZE items // or copy array breaks in HB mode. This throws away // the freqs for the ambiguous characters; int asize = alph_size(alph, ALPH_SIZE); resize_array(bg_freqs, asize); /************************************************************** * Compute probability distributions for each of the selected motifs. **************************************************************/ int motif_index; for (motif_index = 0; motif_index < arraylst_size(motifs); motif_index++) { MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs); char* motif_id = get_motif_id(motif); char* bare_motif_id = motif_id; // We may have specified on the command line that // only certain motifs were to be used. if (selected_motifs != NULL) { if (*bare_motif_id == '+' || *bare_motif_id == '-') { // The selected motif id won't included a strand indicator. bare_motif_id++; } if (have_string(bare_motif_id, selected_motifs) == FALSE) { continue; } } if (verbosity >= NORMAL_VERBOSE) { fprintf( stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif) ); } // Build an array of evolutionary models for each position in the motif. EVOMODEL_T** models = make_motif_models( motif, bg_freqs, model_type, fg_rate, bg_rate, purine_pyrimidine, transition_transversion, use_halpern_bruno ); // Get the frequencies under the background model (row 0) // and position-dependent scores (rows 1..w) // for each possible alignment column. 
MATRIX_T* pssm_matrix = build_alignment_pssm_matrix( alph, alignment_species, get_motif_length(motif) + 1, models, tree, gap_support ); ARRAY_T* alignment_col_freqs = allocate_array(get_num_cols(pssm_matrix)); copy_array(get_matrix_row(0, pssm_matrix), alignment_col_freqs); remove_matrix_row(0, pssm_matrix); // throw away first row //print_col_frequencies(alph, alignment_col_freqs); // // Get the position-dependent null model alignment column frequencies // int w = get_motif_length(motif); int ncols = get_num_cols(pssm_matrix); MATRIX_T* pos_dep_bkg = allocate_matrix(w, ncols); for (i=0; i<w; i++) { // get the evo model corresponding to this column of the motif // and store it as the first evolutionary model. myfree(models[0]); // Use motif PSFM for equilibrium freqs. for model. ARRAY_T* site_specific_freqs = allocate_array(asize); int j = 0; for(j = 0; j < asize; j++) { double value = get_matrix_cell(i, j, get_motif_freqs(motif)); set_array_item(j, value, site_specific_freqs); } if (use_halpern_bruno == FALSE) { models[0] = make_model( model_type, fg_rate, transition_transversion, purine_pyrimidine, site_specific_freqs, NULL ); } else { models[0] = make_model( model_type, fg_rate, transition_transversion, purine_pyrimidine, bg_freqs, site_specific_freqs ); } // get the alignment column frequencies using this model MATRIX_T* tmp_pssm_matrix = build_alignment_pssm_matrix( alph, alignment_species, 2, // only interested in freqs under bkg models, tree, gap_support ); // assemble the position-dependent background alignment column freqs. set_matrix_row(i, get_matrix_row(0, tmp_pssm_matrix), pos_dep_bkg); // chuck the pssm (not his real name) free_matrix(tmp_pssm_matrix); } // // Compute and print the score distribution under the background model // and under the (position-dependent) motif model. 
// int range = 10000; // 10^4 gives same result as 10^5, but 10^3 differs // under background model PSSM_T* pssm = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range); // under position-dependent background (motif) model PSSM_T* pssm_pos_dep = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range); get_pv_lookup_pos_dep( pssm_pos_dep, pos_dep_bkg, NULL // no priors used ); // print FP and FN distributions int num_items = get_pssm_pv_length(pssm_pos_dep); for (i=0; i<num_items; i++) { double pvf = get_pssm_pv(i, pssm); double pvt = get_pssm_pv(i, pssm_pos_dep); double fpr = pvf; double fnr = 1 - pvt; if (fpr >= 0.99999 || fnr == 0) continue; printf("%s score %d FPR %.3g FNR %.3g\n", motif_id, i, fpr, fnr); } // free stuff free_pssm(pssm); free_pssm(pssm_pos_dep); if (models != NULL) { int model_index; int num_models = get_motif_length(motif) + 1; for (model_index = 0; model_index < num_models; model_index++) { free_model(models[model_index]); } myfree(models); } } // motif arraylst_destroy(destroy_motif, motifs); /********************************************** * Clean up. **********************************************/ // TLB may have encountered a memory corruption bug here // CEG has not been able to reproduce it. valgrind says all is well. free_array(bg_freqs); free_tree(TRUE, tree); free_string_list(selected_motifs); return(0); } // main
/*
 * Compute the optimal reading-frame path over a contig by dynamic
 * programming, then hand the result to mg_compute_gene_prediction().
 *
 * combinedscore_matrix  per-frame, per-position combined scores
 * hit_information       passed through to the gene prediction
 * rows                  number of matrix rows (reading frames) to fill
 * contig_len            number of columns (positions in the query DNA)
 * parsestruct_ptr       passed through to the gene prediction
 * err                   error object, checked after the allocation
 *
 * The first column of the path matrix is a copy of the first column of
 * the combined-score matrix. Every later cell stores the best score
 * reachable from the precursor frames reported by compute_precursors(),
 * together with the row that score came from. Leaving or entering a
 * gene costs q; switching strand costs 2*q.
 *
 * Returns the value returned by mg_compute_gene_prediction().
 *
 * NOTE(review): the path matrix is allocated with a fixed 7 rows while
 * the loops run to `rows` -- presumably rows is always 7; confirm.
 * NOTE(review): the running maximum starts at 1 for the very first cell
 * and is reset to DBL_MIN afterwards. DBL_MIN is the smallest positive
 * double, not the most negative value, so candidates below it never win
 * -- looks unintended, confirm before changing.
 */
int mg_computepath(CombinedScoreMatrixEntry **combinedscore_matrix,
                   HitInformation *hit_information,
                   unsigned long rows,
                   unsigned long contig_len,
                   ParseStruct *parsestruct_ptr, GtError * err)
{
  int had_err = 0;

  /* Matrix holding the path computation. */
  PathMatrixEntry **path_matrix;

  /* row: matrix row counter; k: precursor counter (0 .. at most 2);
     prev_row: row of the precursor frame; best_row: row the current
     maximum was computed from. */
  unsigned short row = 0,
    k = 0,
    prev_row = 0,
    best_row = 0;

  /* Position in the query DNA. */
  unsigned long col = 0;

  /* Current frame, precursor frame (one value out of precursors[]),
     and the list of possible precursor frames. */
  short cur_frame = 0,
    prev_frame = 0,
    precursors[NUM_PRECURSORS];

  /* q is the value applied when a gene is left or entered on the
     forward or reverse strand. */
  double q = ARGUMENTSSTRUCT(leavegene_value),
    candidate = 1,
    best = 1;

  /* Allocate the path matrix (intended to match the size of the
     combined-score matrix). */
  gt_array2dim_calloc(path_matrix, 7, contig_len);

  gt_error_check(err);

  /* The first column of the path matrix is taken over from the first
     column of the combined-score matrix. */
  for (row = 0; row < rows; row++)
  {
    path_matrix[row][0].score = combinedscore_matrix[row][0].matrix_score;
    path_matrix[row][0].path_frame = row;
  }

  /* Compute the optimal path column by column. */
  for (col = 1; col < contig_len; col++)
  {
    for (row = 0; row < rows; row++)
    {
      /* Convert the row counter into the corresponding reading frame. */
      cur_frame = get_current_frame(row);

      /* Determine the possible precursor frames from the current frame
         and the position in the query DNA. */
      compute_precursors(cur_frame, col, precursors);

      /* Take the maximum over the possible precursors (at most 3). */
      for (k = 0; k < NUM_PRECURSORS; ++k)
      {
        if (precursors[k] == UNDEFINED)
        {
          break;
        }

        prev_frame = precursors[k];

        /* Convert the precursor frame into its matrix row. */
        prev_row = get_matrix_row(prev_frame);

        if ((cur_frame < 0 && prev_frame > 0) ||
            (cur_frame > 0 && prev_frame < 0))
        {
          /* Case 1: switch between reverse and forward strand. */
          candidate = path_matrix[prev_row][col-1].score +
                      combinedscore_matrix[row][col].matrix_score + 2*q;
        }
        else if (cur_frame != 0 && prev_frame != cur_frame)
        {
          /* Case 2: plain frame change (+ to +, or - to -). */
          candidate = path_matrix[prev_row][col-1].score +
                      combinedscore_matrix[row][col].matrix_score + q;
        }
        else
        {
          /* Case 3: frame is kept, or change between coding and
             non-coding in either direction. */
          candidate = path_matrix[prev_row][col-1].score +
                      combinedscore_matrix[row][col].matrix_score;
        }

        /* Keep the maximum and the row it came from. */
        if (gt_double_compare(candidate, best) > 0)
        {
          best = candidate;
          best_row = prev_row;
        }
      }

      /* Store the maximum and its precursor row, then reset the
         running state for the next cell. */
      path_matrix[row][col].score = best;
      path_matrix[row][col].path_frame = best_row;

      candidate = DBL_MIN;
      best = DBL_MIN;
      best_row = 0;
    }
  }

  /* Run the gene prediction on the finished path matrix. */
  had_err = mg_compute_gene_prediction(combinedscore_matrix,
                                       path_matrix,
                                       contig_len,
                                       hit_information,
                                       parsestruct_ptr, err);

  gt_array2dim_delete(path_matrix);

  return had_err;
}
int main(int argc, char **argv) { gettimeofday(&start_global, NULL); print_lib_version(); mpz_init(N); mpz_t B; mpz_init(B); unsigned long int uBase; int64_t nb_primes; modular_root_t *modular_roots; uint64_t i, j; if (mpz_init_set_str(N, argv[1], 10) == -1) { printf("Cannot load N %s\n", argv[1]); exit(2); } mpz_t sqrtN, rem; mpz_init(sqrtN); mpz_init(rem); mpz_sqrtrem(sqrtN, rem, N); if (mpz_cmp_ui(rem, 0) != 0) /* if not perfect square, calculate the ceiling */ mpz_add_ui(sqrtN, sqrtN, 1); else /* N is a perfect square, factored! */ { printf("\n<<<[FACTOR]>>> %s\n", mpz_get_str(NULL, 10, sqrtN)); return 0; } if (mpz_probab_prime_p(N, 10) > 0) /* don't bother factoring */ { printf("N:%s is prime\n", mpz_get_str(NULL, 10, N)); exit(0); } OPEN_LOG_FILE("freq"); //-------------------------------------------------------- // calculate the smoothness base for the given N //-------------------------------------------------------- get_smoothness_base(B, N); /* if N is too small, the program will surely fail, please consider a pen and paper instead */ uBase = mpz_get_ui(B); printf("n: %s\tBase: %s\n", mpz_get_str(NULL, 10, N), mpz_get_str(NULL, 10, B)); //-------------------------------------------------------- // sieve primes that are less than the smoothness base using Eratosthenes sieve //-------------------------------------------------------- START_TIMER(); nb_primes = sieve_primes_up_to((int64_t) (uBase)); printf("\nPrimes found %" PRId64 " [Smoothness Base %lu]\n", nb_primes, uBase); STOP_TIMER_PRINT_TIME("\tEratosthenes Sieving done"); //-------------------------------------------------------- // fill the primes array with primes to which n is a quadratic residue //-------------------------------------------------------- START_TIMER(); primes = calloc(nb_primes, sizeof(int64_t)); nb_qr_primes = fill_primes_with_quadratic_residue(primes, N); /*for(i=0; i<nb_qr_primes; i++) printf("%" PRId64 "\n", primes[i]);*/ printf("\nN-Quadratic primes found %" PRId64 "\n", 
nb_qr_primes); STOP_TIMER_PRINT_TIME("\tQuadratic prime filtering done"); //-------------------------------------------------------- // calculate modular roots //-------------------------------------------------------- START_TIMER(); modular_roots = calloc(nb_qr_primes, sizeof(modular_root_t)); mpz_t tmp, r1, r2; mpz_init(tmp); mpz_init(r1); mpz_init(r2); for (i = 0; i < nb_qr_primes; i++) { mpz_set_ui(tmp, (unsigned long) primes[i]); mpz_sqrtm(r1, N, tmp); /* calculate the modular root */ mpz_neg(r2, r1); /* -q mod n */ mpz_mod(r2, r2, tmp); modular_roots[i].root1 = mpz_get_ui(r1); modular_roots[i].root2 = mpz_get_ui(r2); } mpz_clear(tmp); mpz_clear(r1); mpz_clear(r2); STOP_TIMER_PRINT_TIME("\nModular roots calculation done"); /*for(i=0; i<nb_qr_primes; i++) { printf("[%10" PRId64 "-> roots: %10u - %10u]\n", primes[i], modular_roots[i].root1, modular_roots[i].root2); }*/ //-------------------------------------------------------- // ***** initialize the matrix ***** //-------------------------------------------------------- START_TIMER(); init_matrix(&matrix, nb_qr_primes + NB_VECTORS_OFFSET, nb_qr_primes); mpz_init2(tmp_matrix_row, nb_qr_primes); STOP_TIMER_PRINT_TIME("\nMatrix initialized"); //-------------------------------------------------------- // [Sieving] //-------------------------------------------------------- START_TIMER(); mpz_t x, sieving_index, next_sieving_index; unsigned long ui_index, SIEVING_STEP = 50000; /* we sieve for 50000 elements at each loop */ uint64_t p_pow; smooth_number_t *x_squared; x_squared = calloc(SIEVING_STEP, sizeof(smooth_number_t)); smooth_numbers = calloc(nb_qr_primes + NB_VECTORS_OFFSET, sizeof(smooth_number_t)); mpz_init_set(x, sqrtN); mpz_init_set(sieving_index, x); mpz_init_set(next_sieving_index, x); mpz_t p; mpz_init(p); mpz_t str; mpz_init_set(str, sieving_index); printf("\nSieving ...\n"); //-------------------------------------------------------- // Init before sieving 
//-------------------------------------------------------- for (i = 0; i < SIEVING_STEP; i++) { mpz_init(x_squared[i].value_x); mpz_init(x_squared[i].value_x_squared); /* the factors_exp array is used to keep track of exponents */ //x_squared[i].factors_exp = calloc(nb_qr_primes, sizeof(uint64_t)); /* we use directly the exponents vector modulo 2 to preserve space */mpz_init2( x_squared[i].factors_vect, nb_qr_primes); mpz_add_ui(x, x, 1); } int nb_smooth_per_round = 0; char s[512]; //-------------------------------------------------------- // WHILE smooth numbers found less than the primes in the smooth base + NB_VECTORS_OFFSET //-------------------------------------------------------- while (nb_smooth_numbers_found < nb_qr_primes + NB_VECTORS_OFFSET) { nb_smooth_per_round = 0; mpz_set(x, next_sieving_index); /* sieve numbers from sieving_index to sieving_index + sieving_step */ mpz_set(sieving_index, next_sieving_index); printf("\r"); printf( "\t\tSieving at: %s30 <--> Smooth numbers found: %" PRId64 "/%" PRId64 "", mpz_get_str(NULL, 10, sieving_index), nb_smooth_numbers_found, nb_qr_primes); fflush(stdout); for (i = 0; i < SIEVING_STEP; i++) { mpz_set(x_squared[i].value_x, x); mpz_pow_ui(x_squared[i].value_x_squared, x, 2); /* calculate value_x_squared <- x²-n */ mpz_sub(x_squared[i].value_x_squared, x_squared[i].value_x_squared, N); mpz_clear(x_squared[i].factors_vect); mpz_init2(x_squared[i].factors_vect, nb_qr_primes); /* reconstruct a new fresh 0ed vector of size nb_qr_primes bits */ mpz_add_ui(x, x, 1); } mpz_set(next_sieving_index, x); //-------------------------------------------------------- // eliminate factors in the x_squared array, those who are 'destructed' to 1 are smooth //-------------------------------------------------------- for (i = 0; i < nb_qr_primes; i++) { mpz_set_ui(p, (unsigned long) primes[i]); mpz_set(x, sieving_index); /* get the first multiple of p that is directly larger that sieving_index * Quadratic SIEVING: all elements from this 
number and in positions multiples of root1 and root2 * are also multiples of p */ get_sieving_start_index(x, x, p, modular_roots[i].root1); mpz_set(str, x); mpz_sub(x, x, sieving_index); /* x contains index of first number that is divisible by p */ for (j = mpz_get_ui(x); j < SIEVING_STEP; j += primes[i]) { p_pow = mpz_remove(x_squared[j].value_x_squared, x_squared[j].value_x_squared, p); /* eliminate all factors of p */ if (p_pow & 1) /* mark bit if odd power of p exists in this x_squared[j] */ { mpz_setbit(x_squared[j].factors_vect, i); } if (mpz_cmp_ui(x_squared[j].value_x_squared, 1) == 0) { save_smooth_number(x_squared[j]); nb_smooth_per_round++; } /* sieve next element located p steps from here */ } /* same goes for root2 */ if (modular_roots[i].root2 == modular_roots[i].root1) continue; mpz_set(x, sieving_index); get_sieving_start_index(x, x, p, modular_roots[i].root2); mpz_set(str, x); mpz_sub(x, x, sieving_index); for (j = mpz_get_ui(x); j < SIEVING_STEP; j += primes[i]) { p_pow = mpz_remove(x_squared[j].value_x_squared, x_squared[j].value_x_squared, p); if (p_pow & 1) { mpz_setbit(x_squared[j].factors_vect, i); } if (mpz_cmp_ui(x_squared[j].value_x_squared, 1) == 0) { save_smooth_number(x_squared[j]); nb_smooth_per_round++; } } } //printf("\tSmooth numbers found %" PRId64 "\n", nb_smooth_numbers_found); /*sprintf(s, "[start: %s - end: %s - step: %" PRId64 "] nb_smooth_per_round: %d", mpz_get_str(NULL, 10, sieving_index), mpz_get_str(NULL, 10, next_sieving_index), SIEVING_STEP, nb_smooth_per_round); APPEND_TO_LOG_FILE(s);*/ } STOP_TIMER_PRINT_TIME("\nSieving DONE"); uint64_t t = 0; //-------------------------------------------------------- //the matrix ready, start Gauss elimination. 
The Matrix is filled on the call of save_smooth_number() //-------------------------------------------------------- START_TIMER(); gauss_elimination(&matrix); STOP_TIMER_PRINT_TIME("\nGauss elimination done"); //print_matrix_matrix(&matrix); //print_matrix_identity(&matrix); uint64_t row_index = nb_qr_primes + NB_VECTORS_OFFSET - 1; /* last row in the matrix */ int nb_linear_relations = 0; mpz_t linear_relation_z, solution_z; mpz_init(linear_relation_z); mpz_init(solution_z); get_matrix_row(linear_relation_z, &matrix, row_index--); /* get the last few rows in the Gauss eliminated matrix*/ while (mpz_cmp_ui(linear_relation_z, 0) == 0) { nb_linear_relations++; get_matrix_row(linear_relation_z, &matrix, row_index--); } printf("\tLinear dependent relations found : %d\n", nb_linear_relations); //-------------------------------------------------------- // Factor //-------------------------------------------------------- //We use the last linear relation to reconstruct our solution START_TIMER(); printf("\nFactorizing..\n"); mpz_t solution_X, solution_Y; mpz_init(solution_X); mpz_init(solution_Y); /* we start testing from the first linear relation encountered in the matrix */ for (j = nb_linear_relations; j > 0; j--) { printf("Trying %d..\n", nb_linear_relations - j + 1); mpz_set_ui(solution_X, 1); mpz_set_ui(solution_Y, 1); get_identity_row(solution_z, &matrix, nb_qr_primes + NB_VECTORS_OFFSET - j + 1); for (i = 0; i < nb_qr_primes; i++) { if (mpz_tstbit(solution_z, i)) { mpz_mul(solution_X, solution_X, smooth_numbers[i].value_x); mpz_mod(solution_X, solution_X, N); /* reduce x to modulo N */ mpz_mul(solution_Y, solution_Y, smooth_numbers[i].value_x_squared); /*TODO: handling huge stuff here, there is no modulo N like in the solution_X case! 
* eliminate squares as long as you go*/ } } mpz_sqrt(solution_Y, solution_Y); mpz_mod(solution_Y, solution_Y, N); /* y = sqrt(MUL(xi²-n)) mod N */ mpz_sub(solution_X, solution_X, solution_Y); mpz_gcd(solution_X, solution_X, N); if (mpz_cmp(solution_X, N) != 0 && mpz_cmp_ui(solution_X, 1) != 0) /* factor can be 1 or N, try another relation */ break; } mpz_cdiv_q(solution_Y, N, solution_X); printf("\n>>>>>>>>>>> FACTORED %s =\n", mpz_get_str(NULL, 10, N)); printf("\tFactor 1: %s \n\tFactor 2: %s", mpz_get_str(NULL, 10, solution_X), mpz_get_str(NULL, 10, solution_Y)); /*sprintf(s, "\n>>>>>>>>>>> FACTORED %s =\n", mpz_get_str(NULL, 10, N)); APPEND_TO_LOG_FILE(s); sprintf(s, "\tFactor 1: %s \n\tFactor 2: %s", mpz_get_str(NULL, 10, solution_X), mpz_get_str(NULL, 10, solution_Y)); APPEND_TO_LOG_FILE(s); gettimeofday(&end_global, NULL); timersub(&end_global, &start_global, &elapsed); sprintf(s, "****** TOTAL TIME: %.3f ms\n", elapsed.tv_sec * 1000 + elapsed.tv_usec / (double) 1000); APPEND_TO_LOG_FILE(s);*/ STOP_TIMER_PRINT_TIME("\nFactorizing done"); printf("Cleaning memory..\n"); /********************** clear the x_squared array **********************/ for (i = 0; i < SIEVING_STEP; i++) { mpz_clear(x_squared[i].value_x); mpz_clear(x_squared[i].value_x_squared); //free(x_squared[i].factors_exp); mpz_clear(x_squared[i].factors_vect); } free(x_squared); /********************** clear the x_squared array **********************/ free(modular_roots); /********************** clear the smooth_numbers array **********************/ for (i = 0; i < nb_qr_primes + NB_VECTORS_OFFSET; i++) { mpz_clear(smooth_numbers[i].value_x); mpz_clear(smooth_numbers[i].value_x_squared); //free(smooth_numbers[i].factors_exp); } free(smooth_numbers); /********************** clear the smooth_numbers array **********************/ free(primes); /********************** clear mpz _t **********************/mpz_clear(B); mpz_clear(N); sqrtN, rem; mpz_clear(x); mpz_clear(sieving_index); 
mpz_clear(next_sieving_index); mpz_clear(p); mpz_clear(str); /********************** clear mpz _t **********************/ free_matrix(&matrix); gettimeofday(&end_global, NULL); timersub(&end_global, &start_global, &elapsed); printf("****** TOTAL TIME: %.3f ms\n", elapsed.tv_sec * 1000 + elapsed.tv_usec / (double) 1000); show_mem_usage(); return 0; }
/************************************************************************* * Build a completely connected HMM. *************************************************************************/ void build_complete_hmm (ARRAY_T* background, int spacer_states, MOTIF_T *motifs, int nmotifs, MATRIX_T *transp_freq, MATRIX_T *spacer_ave, BOOLEAN_T fim, MHMM_T **the_hmm) { ALPH_T alph; int motif_states; // Total length of the motifs. int num_spacers; // Total number of spacer states. int num_states; // Total number of states in the model. int i_motif; // Index of the current "from" motif. int j_motif; // Index of the current "to" motif. int i_position; // Index within the current motif or spacer. int i_state = 0; // Index of the current state. assert(nmotifs > 0); alph = get_motif_alph(motifs);// get the alphabet from the first motif // Count the width of the motifs. for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++) motif_states += get_motif_length(motif_at(motifs, i_motif)); // Count the spacer states adjacent to begin and end. num_spacers = nmotifs * 2; // Add the spacer states between motifs. num_spacers += nmotifs * nmotifs; // Total states = motifs + spacer_states + begin/end num_states = motif_states + (num_spacers * spacer_states) + 2; // Allocate the model. *the_hmm = allocate_mhmm(alph, num_states); // Record that this is a completely connected model. (*the_hmm)->type = COMPLETE_HMM; // Record the number of motifs in the model. (*the_hmm)->num_motifs = nmotifs; // Record the number of states in the model. (*the_hmm)->num_states = num_states; (*the_hmm)->num_spacers = ((nmotifs + 1) * (nmotifs + 1)) - 1; (*the_hmm)->spacer_states = spacer_states; // Put the background distribution into the model. copy_array(background, (*the_hmm)->background); // Build the begin state. build_complete_state( START_STATE, i_state, alph, 0, // expected length NULL, // Emissions. 0, // Number of sites. 
NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, 0, // previous motif 0, // next motif transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); i_state++; int from_motif_state, to_motif_state; // Build the spacer states. No transitions from the end state. for (i_motif = 0; i_motif <= nmotifs; i_motif++) { // No transitions to the start state. for (j_motif = 1; j_motif <= nmotifs+1; j_motif++) { // No transitions from start to end. if ((i_motif == 0) && (j_motif == nmotifs+1)) continue; // Allow multi-state spacers. for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { build_complete_state( SPACER_STATE, i_state, alph, get_matrix_cell(i_motif, j_motif, spacer_ave), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, nmotifs, i_motif, j_motif, transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); } } } // Build the motif states. for (i_motif = 0; i_motif < nmotifs; i_motif++) { MOTIF_T *this_motif = motif_at(motifs, i_motif); STATE_T state; for (i_position = 0; i_position < get_motif_length(this_motif); i_position++, i_state++) { if (i_position == 0) { state = START_MOTIF_STATE; } else if (i_position == (get_motif_length(this_motif) - 1)) { state = END_MOTIF_STATE; } else { state = MID_MOTIF_STATE; } build_complete_state( MID_MOTIF_STATE, i_state, alph, 0, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(this_motif)), get_motif_nsites(this_motif), i_motif, i_position, nmotifs, 0, // Previous motif index. 0, // Next motif index. transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); } } // Build the end state. build_complete_state( END_STATE, i_state, alph, 0, // Expected spacer length. NULL, // Emissions 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, nmotifs, 0, // Previous motif index. 0, // Next motif index. 
transp_freq, spacer_states, num_spacers, motifs, &((*the_hmm)->states[i_state])); i_state++; // Convert spacers to FIMs if requested. if (fim) { convert_to_fims(*the_hmm); } // Fill in the transition matrix. build_transition_matrix(*the_hmm); }
/************************************************************************** * * get_pdf_table * * Compute the pdf of a pssm. * * Returns an array of pdf values: * pdf[x] = Pr(score == x) * for 0<=x<=range*w. * * Assumes: * 1) motif scores are non-negative, integral * 2) background model is position-dependent, 0-order Markov * **************************************************************************/ static ARRAY_T* get_pdf_table( PSSM_T* pssm, // The PSSM. MATRIX_T* background_matrix, // Background model PSSM matrix. ARRAY_T* scaled_lo_prior_dist // Scaled distribution of log odds priors. ) { int i, j, k; MATRIX_T* matrix = pssm->matrix;// The PSSM matrix. int w = pssm->w;// PSSM width. int alen = pssm->alphsize; if (alen == alph_size(pssm->alph, ALL_SIZE)) { // CONSIDER We need to review how ambiguity characters are used // in probability calculation throughout the code. // We don't want to include the background probabilities for ambiguity // characters in this calculation, only the primary charcters of the alphabet. // However, Motiph the motiph 'alphabet' actually includes all possible // columns of the mutiple alignment, so we skip this clause if // pssm->alphsize > get_alph_size(ALL_SIZE) alen = alph_size(pssm->alph, ALPH_SIZE); } int range = pssm->range;// Maximum score in PSSM. int size = w*range+1; if (scaled_lo_prior_dist != NULL) { // Having priors expands the range of possible scores size += range; } ARRAY_T* pdf_old = allocate_array(size); ARRAY_T* pdf_new = allocate_array(size); init_array(0, pdf_new); set_array_item(0, 1, pdf_new); // Prob(0) if (scaled_lo_prior_dist != NULL) { // Use distribution of log odds priors to // initialize starting probabilities. for (k=0; k<=range; k++) { double prob = get_array_item(k, scaled_lo_prior_dist); set_array_item(k, prob, pdf_new); } } // Compute the pdf recursively. 
for (i=0; i<w; i++) { int max; if (scaled_lo_prior_dist == NULL) { max = i * range; } else { // Having priors expands the range of possible scores max = (i + 1) * range; } // get position dependent background model ARRAY_T* background = get_matrix_row(i, background_matrix); SWAP(ARRAY_T*, pdf_new, pdf_old) for (k=0; k<=max+range; k++) { set_array_item(k, 0, pdf_new); } for (j=0; j<alen; j++) { int s = (int) get_matrix_cell(i, j, matrix); for(k=0; k<=max; k++) { double old = get_array_item(k, pdf_old); if (old != 0) { double new = get_array_item(k+s, pdf_new) + (old * get_array_item(j, background)); set_array_item(k+s, new, pdf_new); } // old } // k } // j } // i
/************************************************************************* * Build a linear HMM. *************************************************************************/ void build_linear_hmm (ARRAY_T* background, ORDER_T* order_spacing, int spacer_states, RBTREE_T* motifs, // motifs with key as in order_spacing BOOLEAN_T fim, MHMM_T** the_hmm) { ALPH_T alph; int model_length; // Total number of states in the model. int i_state; // Index of the current state. int i_order; // Index within the order and spacing. int i_position; // Index within the current motif or spacer. int motif_i; // motif key in order spacing MOTIF_T *motif; // motif RBNODE_T *node; alph = get_motif_alph((MOTIF_T*)rbtree_value(rbtree_first(motifs))); // Calculate the total length of the model. model_length = 2; // start and end state for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) { motif_i = get_order_motif(order_spacing, i_order); motif = (MOTIF_T*)rbtree_get(motifs, &motif_i); model_length += get_motif_length(motif); } model_length += (get_order_occurs(order_spacing) + 1) * spacer_states; // Allocate the model. *the_hmm = allocate_mhmm(alph, model_length); check_sq_matrix((*the_hmm)->trans, model_length); // Record that this is a linear model. (*the_hmm)->type = LINEAR_HMM; // Record the number of motifs in the model. // It doesn't want the distinct count (*the_hmm)->num_motifs = get_order_occurs(order_spacing); // Record the number of states in the model. (*the_hmm)->num_states = model_length; (*the_hmm)->num_spacers = get_order_occurs(order_spacing) + 1; (*the_hmm)->spacer_states = spacer_states; // Put the background distribution into the model. copy_array(background, (*the_hmm)->background); // Begin the model with a non-emitting state. i_state = 0; check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, START_STATE, i_state, get_spacer_length(order_spacing, 0), NULL, // Emissions. 0, // Number of sites. 
NON_MOTIF_INDEX, NON_MOTIF_POSITION, // position within state (not relevant to start state) NULL, // no motif &((*the_hmm)->states[i_state])); ++i_state; // Build the first spacer. for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, SPACER_STATE, i_state, get_spacer_length(order_spacing, 0), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, // position within spacer NULL, // no motif &((*the_hmm)->states[i_state])); } // Build each motif and subsequent spacer. for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) { STATE_T state; int spacer_len; motif_i = get_order_motif(order_spacing, i_order); motif = (MOTIF_T*)rbtree_get(motifs, &motif_i); // Build the motif. for (i_position = 0; i_position < get_motif_length(motif); i_position++, i_state++) { if (i_position == 0) { state = START_MOTIF_STATE; spacer_len = get_spacer_length(order_spacing, i_order); } else if (i_position == (get_motif_length(motif) - 1)) { state = END_MOTIF_STATE; spacer_len = get_spacer_length(order_spacing, i_order+1); } else { state = MID_MOTIF_STATE; spacer_len = 0; } check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, state, i_state, spacer_len, // Expected spacer length. get_matrix_row(i_position, get_motif_freqs(motif)), get_motif_nsites(motif), i_order, i_position, // position within motif (middle) motif, &((*the_hmm)->states[i_state])); } // Build the following spacer. 
for (i_position = 0; i_position < spacer_states; i_position++, i_state++) { check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); build_linear_state( alph, SPACER_STATE, i_state, get_spacer_length(order_spacing, i_order+1), background, SPACER_NUMSITES, NON_MOTIF_INDEX, i_position, // position within spacer NULL, // no motif &((*the_hmm)->states[i_state])); } } check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); // Finish up the model with a non-emitting end state. build_linear_state( alph, END_STATE, i_state, get_spacer_length(order_spacing, i_order), NULL, // Emissions. 0, // Number of sites. NON_MOTIF_INDEX, NON_MOTIF_POSITION, // position within state (not relevant to end state) NULL, // no motif &((*the_hmm)->states[i_state])); ++i_state; assert(i_state == model_length); check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states); // Convert spacers to FIMs if requested. if (fim) { convert_to_fims(*the_hmm); } // Fill in the transition matrix. build_transition_matrix(*the_hmm); }
/*********************************************************************** * Compute the number of positions from the start or end of a motif * that contain a given percentage of the information content. * * Information content is the same as relative entropy, and is computed * as * * \sum_i p_i log(p_i/f_i) * ***********************************************************************/ int get_info_content_position (BOOLEAN_T from_start, // Count from start? Otherwise, count from end. float threshold, // Information content threshold (in 0-100). ARRAY_T* background, // Background distribution. MOTIF_T* a_motif) { // Make sure the given threshold is in the right range. if ((threshold < 0.0) || (threshold > 100.0)) { die( "Information threshold (%g) must be a percentage between 0 and 100.\n", threshold ); } // Get the dimensions of the motif. int num_cols = get_alph_size(ALPH_SIZE); int num_rows = get_num_rows(a_motif->freqs); // Compute and store the information content for each row // and the total information content for the motif. ATYPE total_information_content = 0.0; ARRAY_T* information_content = allocate_array(num_rows); int i_row; int i_col; for (i_row = 0; i_row < num_rows; i_row++) { ATYPE row_content = 0.0; ARRAY_T* this_emission = get_matrix_row(i_row, a_motif->freqs); for (i_col = 0; i_col < num_cols; i_col++) { ATYPE this_item = get_array_item(i_col, this_emission); ATYPE background_item = get_array_item(i_col, background); // Use two logs to avoid handling divide-by-zero as a special case. ATYPE partial_row_content = this_item * (my_log(this_item) - my_log(background_item)); row_content += partial_row_content; total_information_content += partial_row_content; } set_array_item(i_row, row_content, information_content); } // Search for the target position. int return_value = -1; ATYPE cumulative_content = 0.0; ATYPE percent = 0.0; if (from_start) { // Search from start for IC exceeding threshold. 
for (i_row = 0; i_row < num_rows; i_row++) { cumulative_content += get_array_item(i_row, information_content); percent = 100 * cumulative_content / total_information_content; if (percent >= threshold) { return_value = i_row; break; } } } else { // Search from end for IC exceeding threshold. for (i_row = num_rows - 1; i_row >= 0; i_row--) { cumulative_content += get_array_item(i_row, information_content); percent = 100 * cumulative_content / total_information_content; if (percent >= threshold) { return_value = i_row; break; } } } if (return_value == -1) { die( "Can't find a position that accounts for %g of information content.", threshold ); } free_array(information_content); return(return_value); }
/* assuming slaves (workers)) are all homogenous, let them all do the calculations regarding primes sieving, calculating the smoothness base and the modular roots */ int main(int argc, char **argv) { MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); MPI_Comm_size(MPI_COMM_WORLD, &mpi_group_size); int len; MPI_Get_processor_name(processor_name, &len); gettimeofday(&start_global, NULL); print_lib_version(); mpz_init(N); mpz_t B; mpz_init(B); unsigned long int uBase; int64_t nb_primes; modular_root_t *modular_roots; uint64_t i, j; if (argc < 2) { PRINT(my_rank, "usage: %s Number_to_factorize\n", argv[0]); exit(2); } if (mpz_init_set_str(N, argv[1], 10) == -1) { PRINT(my_rank, "Cannot load N %s\n", argv[1]); exit(2); } mpz_t sqrtN, rem; mpz_init(sqrtN); mpz_init(rem); mpz_sqrtrem(sqrtN, rem, N); if (mpz_cmp_ui(rem, 0) != 0) /* if not perfect square, calculate the ceiling */ mpz_add_ui(sqrtN, sqrtN, 1); else /* N is a perfect square, factored! */ { PRINT(my_rank, "\n<<<[FACTOR]>>> %s\n", mpz_get_str(NULL, 10, sqrtN)); return 0; } if (mpz_probab_prime_p(N, 10) > 0) /* don't bother factoring */ { PRINT(my_rank, "N:%s is prime\n", mpz_get_str(NULL, 10, N)); exit(0); } OPEN_LOG_FILE("freq"); //-------------------------------------------------------- // calculate the smoothness base for the given N //-------------------------------------------------------- get_smoothness_base(B, N); /* if N is too small, the program will surely fail, please consider a pen and paper instead */ uBase = mpz_get_ui(B); PRINT(my_rank, "n: %s\tBase: %s\n", mpz_get_str(NULL, 10, N), mpz_get_str(NULL, 10, B)); //-------------------------------------------------------- // sieve primes that are less than the smoothness base using Eratosthenes sieve //-------------------------------------------------------- START_TIMER(); nb_primes = sieve_primes_up_to((int64_t) (uBase)); PRINT(my_rank, "\tPrimes found %" PRId64 " [Smoothness Base %lu]\n", nb_primes, uBase); 
STOP_TIMER_PRINT_TIME("\tEratosthenes Sieving done"); //-------------------------------------------------------- // fill the primes array with primes to which n is a quadratic residue //-------------------------------------------------------- START_TIMER(); primes = calloc(nb_primes, sizeof(int64_t)); nb_qr_primes = fill_primes_with_quadratic_residue(primes, N); /*for(i=0; i<nb_qr_primes; i++) PRINT(my_rank, "%" PRId64 "\n", primes[i]);*/ PRINT(my_rank, "\tN-Quadratic primes found %" PRId64 "\n", nb_qr_primes); STOP_TIMER_PRINT_TIME("\tQuadratic prime filtering done"); //-------------------------------------------------------- // calculate modular roots //-------------------------------------------------------- START_TIMER(); modular_roots = calloc(nb_qr_primes, sizeof(modular_root_t)); mpz_t tmp, r1, r2; mpz_init(tmp); mpz_init(r1); mpz_init(r2); for (i = 0; i < nb_qr_primes; i++) { mpz_set_ui(tmp, (unsigned long) primes[i]); mpz_sqrtm(r1, N, tmp); /* calculate the modular root */ mpz_neg(r2, r1); /* -q mod n */ mpz_mod(r2, r2, tmp); modular_roots[i].root1 = mpz_get_ui(r1); modular_roots[i].root2 = mpz_get_ui(r2); } mpz_clear(tmp); mpz_clear(r1); mpz_clear(r2); STOP_TIMER_PRINT_TIME("Modular roots calculation done"); //-------------------------------------------------------- // ***** initialize the matrix ***** //-------------------------------------------------------- if (my_rank == 0) /* only the master have the matrix */ { START_TIMER(); init_matrix(&matrix, nb_qr_primes + NB_VECTORS_OFFSET, nb_qr_primes); mpz_init2(tmp_matrix_row, nb_qr_primes); STOP_TIMER_PRINT_TIME("Matrix initialized"); } //-------------------------------------------------------- // [Sieving] - everyones sieves including the master //-------------------------------------------------------- START_TIMER(); mpz_t x, sieving_index, next_sieving_index, relative_start, global_step; unsigned long ui_index, SIEVING_STEP = 50000; /* we sieve for 50000 elements at each loop */ int 
LOCAL_SIEVING_ROUNDS = 10; /* number of iterations a worker sieves before communicating results to the master */ unsigned long sieving_round = 0; unsigned long nb_big_rounds = 0; uint64_t p_pow; smooth_number_t *x_squared; x_squared = calloc(SIEVING_STEP, sizeof(smooth_number_t)); if (my_rank == 0) smooth_numbers = calloc(nb_qr_primes + NB_VECTORS_OFFSET, sizeof(smooth_number_t)); else temp_slaves_smooth_numbers = calloc(500, sizeof(smooth_number_t)); /* TODO: this is not properly correct, using a linkedlist is better to keep track of temporary * smooth numbers at the slaves nodes however it's pretty rare to find 500 smooth numbers in * 50000 * 10 interval. */ mpz_init_set(x, sqrtN); mpz_init(global_step); mpz_init(relative_start); mpz_init(sieving_index); mpz_init(next_sieving_index); mpz_t p; mpz_init(p); mpz_t str; mpz_init_set(str, sieving_index); PRINT(my_rank, "\n[%s] Sieving ...\n", processor_name); //-------------------------------------------------------- // Init before sieving //-------------------------------------------------------- for (i = 0; i < SIEVING_STEP; i++) { mpz_init(x_squared[i].value_x); mpz_init(x_squared[i].value_x_squared); mpz_init2(x_squared[i].factors_vect, nb_qr_primes); mpz_add_ui(x, x, 1); } int nb_smooth_per_round = 0; char s[512]; //-------------------------------------------------------- // WHILE smooth numbers found less than the primes in the smooth base + NB_VECTORS_OFFSET for master // Or master asked for more smooth numbers from slaves //-------------------------------------------------------- while (1) { mpz_set_ui(global_step, nb_big_rounds); /* calculates the coordinate where the workers start sieving from */ mpz_mul_ui(global_step, global_step, (unsigned long) mpi_group_size); mpz_mul_ui(global_step, global_step, SIEVING_STEP); mpz_mul_ui(global_step, global_step, LOCAL_SIEVING_ROUNDS); mpz_add(global_step, global_step, sqrtN); mpz_set_ui(relative_start, SIEVING_STEP); mpz_mul_ui(relative_start, relative_start, 
LOCAL_SIEVING_ROUNDS); mpz_mul_ui(relative_start, relative_start, (unsigned long) my_rank); mpz_add(relative_start, relative_start, global_step); mpz_set(sieving_index, relative_start); mpz_set(next_sieving_index, relative_start); for (sieving_round = 0; sieving_round < LOCAL_SIEVING_ROUNDS; /* each slave sieves for LOCAL_SIEVING_ROUNDS rounds */ sieving_round++) { nb_smooth_per_round = 0; mpz_set(x, next_sieving_index); /* sieve numbers from sieving_index to sieving_index + sieving_step */ mpz_set(sieving_index, next_sieving_index); if (my_rank == 0) { printf("\r"); printf( "\t\tSieving at: %s30 <--> Smooth numbers found: %" PRId64 "/%" PRId64 "", mpz_get_str(NULL, 10, sieving_index), nb_global_smooth_numbers_found, nb_qr_primes); fflush(stdout); } for (i = 0; i < SIEVING_STEP; i++) { mpz_set(x_squared[i].value_x, x); mpz_pow_ui(x_squared[i].value_x_squared, x, 2); /* calculate value_x_squared <- x²-n */ mpz_sub(x_squared[i].value_x_squared, x_squared[i].value_x_squared, N); mpz_clear(x_squared[i].factors_vect); mpz_init2(x_squared[i].factors_vect, nb_qr_primes); /* reconstruct a new fresh 0ed vector of size nb_qr_primes bits */ mpz_add_ui(x, x, 1); } mpz_set(next_sieving_index, x); //-------------------------------------------------------- // eliminate factors in the x_squared array, those who are 'destructed' to 1 are smooth //-------------------------------------------------------- for (i = 0; i < nb_qr_primes; i++) { mpz_set_ui(p, (unsigned long) primes[i]); mpz_set(x, sieving_index); /* get the first multiple of p that is directly larger that sieving_index * Quadratic SIEVING: all elements from this number and in positions multiples of root1 and root2 * are also multiples of p */ get_sieving_start_index(x, x, p, modular_roots[i].root1); mpz_set(str, x); mpz_sub(x, x, sieving_index); /* x contains index of first number that is divisible by p */ for (j = mpz_get_ui(x); j < SIEVING_STEP; j += primes[i]) { p_pow = mpz_remove(x_squared[j].value_x_squared, 
x_squared[j].value_x_squared, p); /* eliminate all factors of p */ if (p_pow & 1) /* mark bit if odd power of p exists in this x_squared[j] */ { mpz_setbit(x_squared[j].factors_vect, i); } if (mpz_cmp_ui(x_squared[j].value_x_squared, 1) == 0) { save_smooth_number(x_squared[j]); nb_smooth_per_round++; } /* sieve next element located p steps from here */ } /* same goes for root2 */ if (modular_roots[i].root2 == modular_roots[i].root1) continue; mpz_set(x, sieving_index); get_sieving_start_index(x, x, p, modular_roots[i].root2); mpz_set(str, x); mpz_sub(x, x, sieving_index); for (j = mpz_get_ui(x); j < SIEVING_STEP; j += primes[i]) { p_pow = mpz_remove(x_squared[j].value_x_squared, x_squared[j].value_x_squared, p); if (p_pow & 1) { mpz_setbit(x_squared[j].factors_vect, i); } if (mpz_cmp_ui(x_squared[j].value_x_squared, 1) == 0) { save_smooth_number(x_squared[j]); nb_smooth_per_round++; } } } } if (my_rank == 0) /* master gathers smooth numbers from slaves */ { gather_smooth_numbers(); notify_slaves(); } else /* slaves send their smooth numbers to master */ { send_smooth_numbers_to_master(); nb_global_smooth_numbers_found = get_server_notification(); } if (nb_global_smooth_numbers_found >= nb_qr_primes + NB_VECTORS_OFFSET) break; nb_big_rounds++; } STOP_TIMER_PRINT_TIME("\nSieving DONE"); if (my_rank == 0) { uint64_t t = 0; //-------------------------------------------------------- //the matrix ready, start Gauss elimination. 
The Matrix is filled on the call of save_smooth_number() //-------------------------------------------------------- START_TIMER(); gauss_elimination(&matrix); STOP_TIMER_PRINT_TIME("\nGauss elimination done"); uint64_t row_index = nb_qr_primes + NB_VECTORS_OFFSET - 1; /* last row in the matrix */ int nb_linear_relations = 0; mpz_t linear_relation_z, solution_z; mpz_init(linear_relation_z); mpz_init(solution_z); get_matrix_row(linear_relation_z, &matrix, row_index--); /* get the last few rows in the Gauss eliminated matrix*/ while (mpz_cmp_ui(linear_relation_z, 0) == 0) { nb_linear_relations++; get_matrix_row(linear_relation_z, &matrix, row_index--); } PRINT(my_rank, "\tLinear dependent relations found : %d\n", nb_linear_relations); //-------------------------------------------------------- // Factor //-------------------------------------------------------- //We use the last linear relation to reconstruct our solution START_TIMER(); PRINT(my_rank, "%s", "\nFactorizing..\n"); mpz_t solution_X, solution_Y; mpz_init(solution_X); mpz_init(solution_Y); /* we start testing from the first linear relation encountered in the matrix */ for (j = nb_linear_relations; j > 0; j--) { PRINT(my_rank, "Trying %d..\n", nb_linear_relations - j + 1); mpz_set_ui(solution_X, 1); mpz_set_ui(solution_Y, 1); get_identity_row(solution_z, &matrix, nb_qr_primes + NB_VECTORS_OFFSET - j + 1); for (i = 0; i < nb_qr_primes; i++) { if (mpz_tstbit(solution_z, i)) { mpz_mul(solution_X, solution_X, smooth_numbers[i].value_x); mpz_mod(solution_X, solution_X, N); /* reduce x to modulo N */ mpz_mul(solution_Y, solution_Y, smooth_numbers[i].value_x_squared); /*TODO: handling huge stuff here, there is no modulo N like in the solution_X case! 
* eliminate squares as long as you go*/ } } mpz_sqrt(solution_Y, solution_Y); mpz_mod(solution_Y, solution_Y, N); /* y = sqrt(MUL(xi²-n)) mod N */ mpz_sub(solution_X, solution_X, solution_Y); mpz_gcd(solution_X, solution_X, N); if (mpz_cmp(solution_X, N) != 0 && mpz_cmp_ui(solution_X, 1) != 0) /* factor can be 1 or N, try another relation */ break; } mpz_cdiv_q(solution_Y, N, solution_X); PRINT(my_rank, "\n>>>>>>>>>>> FACTORED %s =\n", mpz_get_str(NULL, 10, N)); PRINT( my_rank, "\tFactor 1: %s \n\tFactor 2: %s", mpz_get_str(NULL, 10, solution_X), mpz_get_str(NULL, 10, solution_Y)); sprintf(s, "\n>>>>>>>>>>> FACTORED %s =\n", mpz_get_str(NULL, 10, N)); APPEND_TO_LOG_FILE(s); sprintf(s, "\tFactor 1: %s \n\tFactor 2: %s", mpz_get_str(NULL, 10, solution_X), mpz_get_str(NULL, 10, solution_Y)); APPEND_TO_LOG_FILE(s); gettimeofday(&end_global, NULL); timersub(&end_global, &start_global, &elapsed); sprintf(s, "****** TOTAL TIME: %.3f ms\n", elapsed.tv_sec * 1000 + elapsed.tv_usec / (double) 1000); APPEND_TO_LOG_FILE(s); STOP_TIMER_PRINT_TIME("\nFactorizing done"); } PRINT(my_rank, "%s", "\nCleaning memory..\n"); /********************** clear the x_squared array **********************/ for (i = 0; i < SIEVING_STEP; i++) { mpz_clear(x_squared[i].value_x); mpz_clear(x_squared[i].value_x_squared); //free(x_squared[i].factors_exp); mpz_clear(x_squared[i].factors_vect); } free(x_squared); /********************** clear the x_squared array **********************/ free(modular_roots); /********************** clear the smooth_numbers array **********************/ if (my_rank == 0) { for (i = 0; i < nb_qr_primes + NB_VECTORS_OFFSET; i++) { mpz_clear(smooth_numbers[i].value_x); mpz_clear(smooth_numbers[i].value_x_squared); mpz_clear(smooth_numbers[i].factors_vect); //free(smooth_numbers[i].factors_exp); } free(smooth_numbers); } else { for (i = 0; i < 500; i++) { mpz_clear(temp_slaves_smooth_numbers[i].value_x); mpz_clear(temp_slaves_smooth_numbers[i].value_x_squared); 
mpz_clear(temp_slaves_smooth_numbers[i].factors_vect); } free(temp_slaves_smooth_numbers); } /********************** clear the smooth_numbers array **********************/ free(primes); /********************** clear mpz _t **********************/mpz_clear(B); mpz_clear(N); sqrtN, rem; mpz_clear(x); mpz_clear(sieving_index); mpz_clear(next_sieving_index); mpz_clear(p); mpz_clear(str); /********************** clear mpz _t **********************/ free_matrix(&matrix); gettimeofday(&end_global, NULL); timersub(&end_global, &start_global, &elapsed); PRINT(my_rank, "****** TOTAL TIME: %.3f ms\n", elapsed.tv_sec * 1000 + elapsed.tv_usec / (double) 1000); show_mem_usage(); MPI_Finalize(); return 0; }
/*************************************************************************
 * Construct a star-topology HMM from a collection of motifs.
 *
 * States are created in this order: one begin state, `spacer_states`
 * spacer states, then for each motif a start-motif state, its interior
 * states and an end-motif state, and finally one end state.
 *
 *   background     copied into the model's background distribution and
 *                  passed as the emission argument of each spacer state
 *   spacer_states  number of states that make up the single spacer
 *   motifs         motif collection, accessed through motif_at()
 *   nmotifs        number of motifs in the collection
 *   fim            when true, convert_to_fims() is applied to the model
 *   the_hmm        OUT: receives the model returned by allocate_mhmm()
 *
 * The alphabet is taken from the first motif, and every motif is asserted
 * to be longer than one position.
 *************************************************************************/
void build_star_hmm
  (ARRAY_T*  background,
   int       spacer_states,
   MOTIF_T*  motifs,
   int       nmotifs,
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  const int num_spacers = 1;  // Only one spacer in this topology.
  ALPH_T    alph = get_motif_alph(motif_at(motifs, 0));
  MHMM_T*   model;            // The model under construction.
  int       total_width = 0;  // Summed length of all motifs.
  int       state_count;      // Number of states in the whole model.
  int       m;                // Index of the motif being processed.
  int       pos;              // Position within the current motif or spacer.
  int       next_state = 0;   // Index of the state to be built next.

  // Add up the motif widths.
  m = 0;
  while (m < nmotifs) {
    total_width += get_motif_length(motif_at(motifs, m));
    m++;
  }

  // Motif states, plus the spacer states, plus the begin and end states.
  state_count = total_width + (num_spacers * spacer_states) + 2;

  // Allocate the model and hand it back to the caller.
  model = allocate_mhmm(alph, state_count);
  *the_hmm = model;

  // Record the model type and its dimensions.
  model->type = STAR_HMM;
  model->num_motifs = nmotifs;
  model->num_states = state_count;
  model->num_spacers = num_spacers;
  model->spacer_states = spacer_states;

  // Store the background distribution in the model.
  copy_array(background, model->background);

  // Begin state.
  build_star_state(
    alph,
    START_STATE,
    next_state,
    0,     // Expected length.
    NULL,  // Emissions.
    0,     // Number of sites.
    NON_MOTIF_INDEX,
    NON_MOTIF_POSITION,
    nmotifs,
    spacer_states,
    motifs,
    &(model->states[next_state])
  );
  next_state++;

  // Spacer; it may consist of several states.
  pos = 0;
  while (pos < spacer_states) {
    build_star_state(
      alph,
      SPACER_STATE,
      next_state,
      DEFAULT_SPACER_LENGTH,
      background,
      SPACER_NUMSITES,
      NON_MOTIF_INDEX,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(model->states[next_state])
    );
    next_state++;
    pos++;
  }

  // Motif states: first position, interior positions, last position.
  for (m = 0; m < nmotifs; m++) {
    MOTIF_T *motif = motif_at(motifs, m);
    assert(get_motif_length(motif) > 1);

    pos = 0;
    build_star_state(
      alph,
      START_MOTIF_STATE,
      next_state,
      0,  // Expected spacer length.
      get_matrix_row(pos, get_motif_freqs(motif)),
      get_motif_nsites(motif),
      m,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(model->states[next_state])
    );
    next_state++;

    pos = 1;
    while (pos < get_motif_length(motif) - 1) {
      build_star_state(
        alph,
        MID_MOTIF_STATE,
        next_state,
        0,  // Expected spacer length.
        get_matrix_row(pos, get_motif_freqs(motif)),
        get_motif_nsites(motif),
        m,
        pos,
        nmotifs,
        spacer_states,
        motifs,
        &(model->states[next_state])
      );
      next_state++;
      pos++;
    }

    // The loop above leaves pos on the final motif position.
    build_star_state(
      alph,
      END_MOTIF_STATE,
      next_state,
      0,  // Expected spacer length.
      get_matrix_row(pos, get_motif_freqs(motif)),
      get_motif_nsites(motif),
      m,
      pos,
      nmotifs,
      spacer_states,
      motifs,
      &(model->states[next_state])
    );
    next_state++;
  }

  // End state.
  build_star_state(
    alph,
    END_STATE,
    next_state,
    0,     // Expected spacer length.
    NULL,  // Emissions.
    0,     // Number of sites.
    NON_MOTIF_INDEX,
    NON_MOTIF_POSITION,
    nmotifs,
    spacer_states,
    motifs,
    &(model->states[next_state])
  );
  next_state++;

  // Convert spacers to FIMs if requested.
  if (fim) {
    convert_to_fims(model);
  }

  // Fill in the transition matrix.
  build_transition_matrix(model);
} // build_star_hmm
void read_regexp_file( char* filename, // Name of MEME file IN int* num_motifs, // Number of motifs retrieved OUT MOTIF_T* motifs // The retrieved motifs - NOT ALLOCATED! ) { FILE* motif_file; // MEME file containing the motifs. char motif_name[MAX_MOTIF_ID_LENGTH+1]; char motif_regexp[MAX_MOTIF_WIDTH]; ARRAY_T* these_freqs; MOTIF_T* m; int i; //Set things to the defaults. *num_motifs = 0; // Open the given MEME file. if (open_file(filename, "r", TRUE, "motif", "motifs", &motif_file) == 0) exit(1); //Set alphabet - ONLY supports dna. set_alphabet(verbosity, "ACGT"); while (fscanf(motif_file, "%s\t%s", motif_name, motif_regexp) == 2) { /* * Now we: * 1. Fill in new motif (preallocated) * 2. Assign name * 3. Convert regexp into frequency table. */ m = &(motifs[*num_motifs]); set_motif_id(motif_name, m); m->length = strlen(motif_regexp); /* Store the alphabet size in the motif. */ m->alph_size = get_alph_size(ALPH_SIZE); m->ambigs = get_alph_size(AMBIG_SIZE); /* Allocate memory for the matrix. */ m->freqs = allocate_matrix(m->length, get_alph_size(ALL_SIZE)); //Set motif frequencies here. 
for (i=0;i<strlen(motif_regexp);i++) { switch(toupper(motif_regexp[i])) { case 'A': set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); break; case 'C': set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); break; case 'G': set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); break; case 'T': set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); break; case 'U': set_matrix_cell(i,alphabet_index('U',get_alphabet(TRUE)),1,m->freqs); break; case 'R': //purines set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); break; case 'Y': //pyramidines set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); break; case 'K': //keto set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); break; case 'M': //amino set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); break; case 'S': //strong set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); break; case 'W': //weak set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); break; case 'B': set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); break; case 'D': set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); break; case 'H': 
set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); break; case 'V': set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); break; case 'N': set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs); set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs); break; } } /* Compute values for ambiguous characters. */ for (i = 0; i < m->length; i++) { these_freqs = get_matrix_row(i, m->freqs); fill_in_ambiguous_chars(FALSE, these_freqs); } /* Compute and store the motif complexity. */ m->complexity = compute_motif_complexity(m); //Move our pointer along to do the next motif. (*num_motifs)++; } }