/************************************************************************* * Entry point for pmp_bf *************************************************************************/ int main(int argc, char *argv[]) { char* bg_filename = NULL; char* motif_name = "motif"; // Use this motif name in the output. STRING_LIST_T* selected_motifs = NULL; double fg_rate = 1.0; double bg_rate = 1.0; double purine_pyrimidine = 1.0; // r double transition_transversion = 0.5; // R double pseudocount = 0.1; GAP_SUPPORT_T gap_support = SKIP_GAPS; MODEL_TYPE_T model_type = F81_MODEL; BOOLEAN_T use_halpern_bruno = FALSE; char* ustar_label = NULL; // TLB; create uniform star tree int i; program_name = "pmp_bf"; /********************************************** * COMMAND LINE PROCESSING **********************************************/ // Define command line options. (FIXME: Repeated code) // FIXME: Note that if you add or remove options you // must change n_options. int n_options = 12; cmdoption const pmp_options[] = { {"hb", NO_VALUE}, {"ustar", REQUIRED_VALUE}, {"model", REQUIRED_VALUE}, {"pur-pyr", REQUIRED_VALUE}, {"transition-transversion", REQUIRED_VALUE}, {"bg", REQUIRED_VALUE}, {"fg", REQUIRED_VALUE}, {"motif", REQUIRED_VALUE}, {"motif-name", REQUIRED_VALUE}, {"bgfile", REQUIRED_VALUE}, {"pseudocount", REQUIRED_VALUE}, {"verbosity", REQUIRED_VALUE} }; int option_index = 0; // Define the usage message. char usage[1000] = ""; strcat(usage, "USAGE: pmp [options] <tree file> <MEME file>\n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); // Evolutionary model parameters. strcat(usage, " --hb\n"); strcat(usage, " --model single|average|jc|k2|f81|f84|hky|tn"); strcat(usage, " (default=f81)\n"); strcat(usage, " --pur-pyr <float> (default=1.0)\n"); strcat(usage, " --transition-transversion <float> (default=0.5)\n"); strcat(usage, " --bg <float> (default=1.0)\n"); strcat(usage, " --fg <float> (default=1.0)\n"); // Motif parameters. strcat(usage, " --motif <id> (default=all)\n"); strcat(usage, " --motif-name <string> (default from motif file)\n"); // Miscellaneous parameters strcat(usage, " --bgfile <background> (default from motif file)\n"); strcat(usage, " --pseudocount <float> (default=0.1)\n"); strcat(usage, " --ustar <label>\n"); // TLB; create uniform star tree strcat(usage, " --verbosity [1|2|3|4] (default 2)\n"); strcat(usage, "\n Prints the FP and FN rate at each of 10000 score values.\n"); strcat(usage, "\n Output format: [<motif_id> score <score> FPR <fpr> TPR <tpr>]+\n"); // Parse the command line. if (simple_setopt(argc, argv, n_options, pmp_options) != NO_ERROR) { die("Error processing command line options: option name too long.\n"); } while (TRUE) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { (void) simple_getopterror(&message); die("Error processing command line options (%s)\n", message); } if (strcmp(option_name, "model") == 0) { if (strcmp(option_value, "jc") == 0) { model_type = JC_MODEL; } else if (strcmp(option_value, "k2") == 0) { model_type = K2_MODEL; } else if (strcmp(option_value, "f81") == 0) { model_type = F81_MODEL; } else if (strcmp(option_value, "f84") == 0) { model_type = F84_MODEL; } else if (strcmp(option_value, "hky") == 0) { model_type = HKY_MODEL; } else if (strcmp(option_value, "tn") == 0) { model_type = TAMURA_NEI_MODEL; } else if (strcmp(option_value, "single") == 0) { model_type = SINGLE_MODEL; } else if (strcmp(option_value, "average") == 0) { model_type = AVERAGE_MODEL; } else { die("Unknown model: %s\n", option_value); } } else if (strcmp(option_name, "hb") == 0){ use_halpern_bruno = TRUE; } else if (strcmp(option_name, "ustar") == 0){ // TLB; create uniform star tree ustar_label = option_value; } else if (strcmp(option_name, "pur-pyr") == 0){ purine_pyrimidine = atof(option_value); } else if (strcmp(option_name, "transition-transversion") == 0){ transition_transversion = atof(option_value); } else if (strcmp(option_name, "bg") == 0){ bg_rate = atof(option_value); } else if (strcmp(option_name, "fg") == 0){ fg_rate = atof(option_value); } else if (strcmp(option_name, "motif") == 0){ if (selected_motifs == NULL) { selected_motifs = new_string_list(); } add_string(option_value, selected_motifs); } else if (strcmp(option_name, "motif-name") == 0){ motif_name = option_value; } else if (strcmp(option_name, "bgfile") == 0){ bg_filename = option_value; } else if (strcmp(option_name, "pseudocount") == 0){ pseudocount = atof(option_value); } else if (strcmp(option_name, "verbosity") == 0){ verbosity = atoi(option_value); } } // Must have tree and motif file names if (argc != option_index + 2) { fprintf(stderr, "%s", usage); exit(EXIT_FAILURE); } /********************************************** * Read the phylogenetic tree. **********************************************/ char* tree_filename = NULL; TREE_T* tree = NULL; tree_filename = argv[option_index]; option_index++; tree = read_tree_from_file(tree_filename); // get the species names STRING_LIST_T* alignment_species = make_leaf_list(tree); char *root_label = get_label(tree); // in case target in center if (strlen(root_label)>0) add_string(root_label, alignment_species); //write_string_list(" ", alignment_species, stderr); // TLB; Convert the tree to a uniform star tree with // the target sequence at its center. if (ustar_label != NULL) { tree = convert_to_uniform_star_tree(tree, ustar_label); if (tree == NULL) die("Tree or alignment missing target %s\n", ustar_label); if (verbosity >= NORMAL_VERBOSE) { fprintf(stderr, "Target %s placed at center of uniform (d=%.3f) star tree:\n", ustar_label, get_total_length(tree) / get_num_children(tree) ); write_tree(tree, stderr); } } /********************************************** * Read the motifs. **********************************************/ char* meme_filename = argv[option_index]; option_index++; int num_motifs = 0; MREAD_T *mread; ALPH_T alph; ARRAYLST_T *motifs; ARRAY_T *bg_freqs; mread = mread_create(meme_filename, OPEN_MFILE); mread_set_bg_source(mread, bg_filename); mread_set_pseudocount(mread, pseudocount); // read motifs motifs = mread_load(mread, NULL); alph = mread_get_alphabet(mread); bg_freqs = mread_get_background(mread); // check if (arraylst_size(motifs) == 0) die("No motifs in %s.", meme_filename); // TLB; need to resize bg_freqs array to ALPH_SIZE items // or copy array breaks in HB mode. This throws away // the freqs for the ambiguous characters; int asize = alph_size(alph, ALPH_SIZE); resize_array(bg_freqs, asize); /************************************************************** * Compute probability distributions for each of the selected motifs. **************************************************************/ int motif_index; for (motif_index = 0; motif_index < arraylst_size(motifs); motif_index++) { MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs); char* motif_id = get_motif_id(motif); char* bare_motif_id = motif_id; // We may have specified on the command line that // only certain motifs were to be used. if (selected_motifs != NULL) { if (*bare_motif_id == '+' || *bare_motif_id == '-') { // The selected motif id won't included a strand indicator. bare_motif_id++; } if (have_string(bare_motif_id, selected_motifs) == FALSE) { continue; } } if (verbosity >= NORMAL_VERBOSE) { fprintf( stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif) ); } // Build an array of evolutionary models for each position in the motif. EVOMODEL_T** models = make_motif_models( motif, bg_freqs, model_type, fg_rate, bg_rate, purine_pyrimidine, transition_transversion, use_halpern_bruno ); // Get the frequencies under the background model (row 0) // and position-dependent scores (rows 1..w) // for each possible alignment column. MATRIX_T* pssm_matrix = build_alignment_pssm_matrix( alph, alignment_species, get_motif_length(motif) + 1, models, tree, gap_support ); ARRAY_T* alignment_col_freqs = allocate_array(get_num_cols(pssm_matrix)); copy_array(get_matrix_row(0, pssm_matrix), alignment_col_freqs); remove_matrix_row(0, pssm_matrix); // throw away first row //print_col_frequencies(alph, alignment_col_freqs); // // Get the position-dependent null model alignment column frequencies // int w = get_motif_length(motif); int ncols = get_num_cols(pssm_matrix); MATRIX_T* pos_dep_bkg = allocate_matrix(w, ncols); for (i=0; i<w; i++) { // get the evo model corresponding to this column of the motif // and store it as the first evolutionary model. myfree(models[0]); // Use motif PSFM for equilibrium freqs. for model. ARRAY_T* site_specific_freqs = allocate_array(asize); int j = 0; for(j = 0; j < asize; j++) { double value = get_matrix_cell(i, j, get_motif_freqs(motif)); set_array_item(j, value, site_specific_freqs); } if (use_halpern_bruno == FALSE) { models[0] = make_model( model_type, fg_rate, transition_transversion, purine_pyrimidine, site_specific_freqs, NULL ); } else { models[0] = make_model( model_type, fg_rate, transition_transversion, purine_pyrimidine, bg_freqs, site_specific_freqs ); } // get the alignment column frequencies using this model MATRIX_T* tmp_pssm_matrix = build_alignment_pssm_matrix( alph, alignment_species, 2, // only interested in freqs under bkg models, tree, gap_support ); // assemble the position-dependent background alignment column freqs. set_matrix_row(i, get_matrix_row(0, tmp_pssm_matrix), pos_dep_bkg); // chuck the pssm (not his real name) free_matrix(tmp_pssm_matrix); } // // Compute and print the score distribution under the background model // and under the (position-dependent) motif model. // int range = 10000; // 10^4 gives same result as 10^5, but 10^3 differs // under background model PSSM_T* pssm = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range); // under position-dependent background (motif) model PSSM_T* pssm_pos_dep = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range); get_pv_lookup_pos_dep( pssm_pos_dep, pos_dep_bkg, NULL // no priors used ); // print FP and FN distributions int num_items = get_pssm_pv_length(pssm_pos_dep); for (i=0; i<num_items; i++) { double pvf = get_pssm_pv(i, pssm); double pvt = get_pssm_pv(i, pssm_pos_dep); double fpr = pvf; double fnr = 1 - pvt; if (fpr >= 0.99999 || fnr == 0) continue; printf("%s score %d FPR %.3g FNR %.3g\n", motif_id, i, fpr, fnr); } // free stuff free_pssm(pssm); free_pssm(pssm_pos_dep); if (models != NULL) { int model_index; int num_models = get_motif_length(motif) + 1; for (model_index = 0; model_index < num_models; model_index++) { free_model(models[model_index]); } myfree(models); } } // motif arraylst_destroy(destroy_motif, motifs); /********************************************** * Clean up. **********************************************/ // TLB may have encountered a memory corruption bug here // CEG has not been able to reproduce it. valgrind says all is well. free_array(bg_freqs); free_tree(TRUE, tree); free_string_list(selected_motifs); return(0); } // main
/*********************************************************************** Process command line options ***********************************************************************/ static void process_command_line( int argc, char* argv[], MCAST_OPTIONS_T *options ) { // Set default values for command line arguments options->allow_clobber = TRUE; options->text_only = FALSE; options->motif_format = MEME_FORMAT; options->bg_filename = NULL; options->motif_filename = NULL; options->output_dirname = "mcast_out"; options->seq_filename = NULL; options->max_gap = "50"; options->pseudo_weight = "4.0"; options->e_thresh = "10.0"; options->p_thresh = "0.0005"; options->quiet = FALSE; options->use_synth = FALSE; // Define command line options. int option_count = 12; int option_index = 0; cmdoption const cmdoptions[] = { {"bgfile", REQUIRED_VALUE}, {"bgweight", REQUIRED_VALUE}, {"e-thresh", REQUIRED_VALUE}, {"max-gap", REQUIRED_VALUE}, {"o", REQUIRED_VALUE}, {"oc", REQUIRED_VALUE}, {"p-thresh", REQUIRED_VALUE}, {"synth", NO_VALUE}, {"text", NO_VALUE}, {"transfac", NO_VALUE}, {"verbosity", REQUIRED_VALUE}, {"quiet", NO_VALUE}, }; simple_setopt(argc, argv, option_count, cmdoptions); // Define the usage message. const char *usage = "USAGE: mcast [options] <query> <database>\n" "\n" " [--bgfile <file>] File containing n-order Markov background model\n" " [--bgweight <b>] Add b * background frequency to each count in query\n" " (default: 4.0 )\n" " [--e-thresh <value>] Print matches with E-values less than E\n" " (default = 10)\n" " [--max-gap <value>] Maximum allowed distance between adjacent hits\n" " (default = 50)\n" " [--o <output dir>] Name of output directory. Existing files will not be\n" " overwritten (default=mcast_out)\n" " [--oc <output dir>] Name of output directory. Existing files will be\n" " overwritten.\n" " [--p-thresh <value>] p-value threshold for motif hits\n" " (default = 0.0005).\n" " [--synth] Use synthetic scores for distribution\n" " [--text] Output plain text rather than HTML.\n" " [--transfac] Query is in TRANSFAC format (default: MEME format)\n" " [--verbosity <value>] Verbosity of error messagess\n"; // Parse the command line. while (1) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char* message = NULL; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { simple_getopterror(&message); die("Error in command line arguments.\n%s", usage); } // Assign the parsed value to the appropriate variable if (strcmp(option_name, "bgfile") == 0) { options->bg_filename = option_value; } else if (strcmp(option_name, "bgweight") == 0) { options->pseudo_weight = option_value; } else if (strcmp(option_name, "e-thresh") == 0) { options->e_thresh = option_value; } else if (strcmp(option_name, "max-gap") == 0) { options->max_gap = option_value; } else if (strcmp(option_name, "o") == 0){ // Set output directory with no clobber options->output_dirname = option_value; options->allow_clobber = FALSE; } else if (strcmp(option_name, "oc") == 0){ // Set output directory with clobber options->output_dirname = option_value; options->allow_clobber = TRUE; } else if (strcmp(option_name, "p-thresh") == 0) { options->p_thresh = option_value; } else if (strcmp(option_name, "synth") == 0) { options->use_synth = TRUE; } else if (strcmp(option_name, "text") == 0) { options->text_only = TRUE; } else if (strcmp(option_name, "transfac") == 0) { options->motif_format = TRANSFAC_FORMAT; } else if (strcmp(option_name, "quiet") == 0) { options->quiet = TRUE; } else if (strcmp(option_name, "verbosity") == 0) { verbosity = (VERBOSE_T) atoi(option_value); } } // Read and verify the two required arguments. if (option_index + 2 != argc) { fprintf(stderr, "%s", usage); exit(1); } options->motif_filename = argv[option_index]; options->seq_filename = argv[option_index+1]; if (options->motif_filename == NULL) { die("No motif file given.\n%s", usage); } if (options->seq_filename == NULL) { die("No sequence file given.\n%s", usage); } }
/************************************************************************* * int main *************************************************************************/ int main(int argc, char *argv[]) { /* Data structures. */ int num_motifs; /* The number of motifs in the model. */ MOTIF_T motifs[2 * MAX_MOTIFS]; /* The motifs. */ STRING_LIST_T* motif_occurrences = NULL; /* Strings describing occurrences of motifs */ BOOLEAN_T has_reverse_strand = FALSE; /* MEME file contained both strands */ ARRAY_T* background; /* Background probs for alphabet. */ ORDER_T* order_spacing; /* Linear HMM order and spacing. */ MATRIX_T* transp_freq = NULL; /* Matrix of inter-motif transitions freqs. */ MATRIX_T* spacer_ave = NULL; /* Matrix of average spacer lengths. */ MHMM_T * the_hmm = NULL; /* The HMM being constructed. */ /* Command line parameters. */ char * meme_filename; /* Input file containg motifs. */ char * hmm_type_str; /* HMM type. */ HMM_T hmm_type; STRING_LIST_T* requested_motifs; /* Indices of requested motifs. */ int request_n; /* The user asked for the first n motifs. */ double e_threshold; /* E-value threshold for motif inclusion. */ double complexity_threshold; // For eliminating low-complexity motifs. double p_threshold; /* p-value threshold for motif occurences. */ char* order_string; /* Motif order and spacing. */ int spacer_states; /* Number of states in each spacer. */ BOOLEAN_T fim; /* Represent spacers as free insertion modules? */ BOOLEAN_T keep_unused; // Drop unused inter-motif transitions? double trans_pseudo; /* Transition pseudocount. */ double spacer_pseudo; // Spacer (self-loop) pseudocount. */ char* description; // Descriptive text to be stored in model. BOOLEAN_T print_header; /* Print file header? */ BOOLEAN_T print_params; /* Print parameter summary? */ BOOLEAN_T print_time; /* Print timing data (dummy: always false). */ /* Local variables. */ int i_motif; /********************************************** * COMMAND LINE PROCESSING **********************************************/ // Define command line options. cmdoption const options[] = { {"type", OPTIONAL_VALUE}, {"description", REQUIRED_VALUE}, {"motif", REQUIRED_VALUE}, {"nmotifs", REQUIRED_VALUE}, {"ethresh", REQUIRED_VALUE}, {"lowcomp", REQUIRED_VALUE}, {"pthresh", REQUIRED_VALUE}, {"order", REQUIRED_VALUE}, {"nspacer", REQUIRED_VALUE}, {"fim", NO_VALUE}, {"keep-unused", NO_VALUE}, {"transpseudo", REQUIRED_VALUE}, {"spacerpseudo", REQUIRED_VALUE}, {"verbosity", REQUIRED_VALUE}, {"noheader", NO_VALUE}, {"noparams", NO_VALUE}, {"notime", NO_VALUE}, {"quiet", NO_VALUE}, }; int option_count = 18; int option_index = 0; // Define the usage message. char usage[1000] = ""; strcat(usage, "USAGE: mhmm [options] <MEME file>\n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); strcat(usage, " --type [linear|complete|star] (default=linear)\n"); strcat(usage, " --description <string> (may be repeated)\n"); strcat(usage, " --motif <motif #> (may be repeated)\n"); strcat(usage, " --nmotifs <#>\n"); strcat(usage, " --ethresh <E-value>\n"); strcat(usage, " --lowcomp <value>\n"); strcat(usage, " --pthresh <p-value>\n"); strcat(usage, " --order <string>\n"); strcat(usage, " --nspacer <spacer length> (default=1)\n"); strcat(usage, " --fim\n"); strcat(usage, " --keep-unused\n"); strcat(usage, " --transpseudo <pseudocount>\n"); strcat(usage, " --spacerpseudo <pseudocount>\n"); strcat(usage, " --verbosity 1|2|3|4|5 (default=2)\n"); strcat(usage, " --noheader\n"); strcat(usage, " --noparams\n"); strcat(usage, " --notime\n"); strcat(usage, " --quiet\n"); strcat(usage, "\n"); /* Make sure various options are set to NULL or defaults. */ meme_filename = NULL; hmm_type_str = NULL; hmm_type = INVALID_HMM; requested_motifs = new_string_list(); request_n = 0; e_threshold = 0.0; complexity_threshold = 0.0; p_threshold = 0.0; order_string = NULL; spacer_states = DEFAULT_SPACER_STATES, fim = FALSE; keep_unused = FALSE; trans_pseudo = DEFAULT_TRANS_PSEUDO; spacer_pseudo = DEFAULT_SPACER_PSEUDO; description = NULL; print_header = TRUE; print_params = TRUE; print_time = FALSE; simple_setopt(argc, argv, option_count, options); // Parse the command line. while (1) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { simple_getopterror(&message); die("Error processing command line options (%s)\n", message); } if (strcmp(option_name, "type") == 0) { if (option_value != NULL) { hmm_type_str = option_value; } } else if (strcmp(option_name, "description") == 0) { description = option_value; } else if (strcmp(option_name, "motif") == 0) { add_string(option_value, requested_motifs); } else if (strcmp(option_name, "nmotifs") == 0) { request_n = atoi(option_value); } else if (strcmp(option_name, "ethresh") == 0) { e_threshold = atof(option_value); } else if (strcmp(option_name, "lowcomp") == 0) { complexity_threshold = atof(option_value); } else if (strcmp(option_name, "pthresh") == 0) { p_threshold = atof(option_value); } else if (strcmp(option_name, "order") == 0) { order_string = option_value; } else if (strcmp(option_name, "nspacer") == 0) { spacer_states = atoi(option_value); } else if (strcmp(option_name, "fim") == 0) { fim = TRUE; } else if (strcmp(option_name, "keep-unused") == 0) { keep_unused = TRUE; } else if (strcmp(option_name, "transpseudo") == 0) { trans_pseudo = atof(option_value); } else if (strcmp(option_name, "spacerpseudo") == 0) { spacer_pseudo = atof(option_value); } else if (strcmp(option_name, "verbosity") == 0) { verbosity = (VERBOSE_T)atoi(option_value); } else if (strcmp(option_name, "noheader") == 0) { print_header = FALSE; } else if (strcmp(option_name, "noparams") == 0) { print_params = FALSE; } else if (strcmp(option_name, "notime") == 0) { print_time = FALSE; } else if (strcmp(option_name, "quiet") == 0) { print_header = print_params = print_time = FALSE; verbosity = QUIET_VERBOSE; } } // Read the single required argument. if (option_index + 1 != argc) { fprintf(stderr, "%s", usage); exit(1); } meme_filename = argv[option_index]; // Set up motif requests. if (request_n != 0) { if (get_num_strings(requested_motifs) != 0) { die("Can't combine the -motif and -nmotifs options.\n"); } else { for (i_motif = 0; i_motif < request_n; i_motif++) { char motif_id[MAX_MOTIF_ID_LENGTH + 1]; sprintf(motif_id, "%d", i_motif + 1); add_string(motif_id, requested_motifs); } } } /* Set the model type. */ hmm_type = convert_enum_type_str(hmm_type_str, LINEAR_HMM, HMM_STRS, NUM_HMM_T); /* Gotta have positive spacer length. */ if (spacer_states <= 0) { die("Negative spacer length (%d).\n", spacer_states); } /* Make sure motifs weren't selected redundantly. */ // FIXME: Add tests for complexity threshold. if ((get_num_strings(requested_motifs) != 0) && (e_threshold != 0.0)) { die("Can't use -motif or -nmotifs with -ethresh."); } if ((get_num_strings(requested_motifs) != 0) && (order_string != NULL)) { die("Can't use -motif or -nmotifs with -order."); } if ((order_string != NULL) && (e_threshold != 0.0)) { die("Can't use -ethresh and -order."); } /* Prevent trying to build a complete or star model with ordering. */ if (order_string != NULL) { if (hmm_type == COMPLETE_HMM) die("Can't specify motif order with a completely connected model."); else if (hmm_type == STAR_HMM) die("Can't specify motif order with a star model."); } // Parse the order string. order_spacing = create_order(order_string); /********************************************** * READING THE MOTIFS **********************************************/ BOOLEAN_T read_file = FALSE; double pseudocount = 0; read_meme_file( meme_filename, "motif-file", // Take bg freq. from motif file. pseudocount, REQUIRE_PSPM, &num_motifs, motifs, &motif_occurrences, &has_reverse_strand, &background ); process_raw_motifs_for_model( &num_motifs, motifs, motif_occurrences, requested_motifs, has_reverse_strand, keep_unused, p_threshold, e_threshold, complexity_threshold, &order_spacing, &transp_freq, &spacer_ave, trans_pseudo, spacer_pseudo ); /********************************************** * BUILDING THE HMM **********************************************/ /* Build the motif-based HMM. */ if (hmm_type == LINEAR_HMM) { if (order_spacing != NULL) { reorder_motifs(order_spacing, &num_motifs, motifs); } else { die("No order specified for the motifs.\n" "For the linear model the motif file must contain motif occurence\n" "data or the motif order must be specified using " "the --order option."); } build_linear_hmm( background, order_spacing, spacer_states, motifs, num_motifs, fim, &the_hmm ); } else if (hmm_type == COMPLETE_HMM) { build_complete_hmm( background, spacer_states, motifs, num_motifs, transp_freq, spacer_ave, fim, &the_hmm ); } else if (hmm_type == STAR_HMM) { build_star_hmm( background, spacer_states, motifs, num_motifs, fim, &the_hmm ); } // Add some global information. copy_string(&(the_hmm->motif_file), meme_filename); /********************************************** * WRITING THE HMM **********************************************/ /* Print the header. */ if (print_header) write_header( program, "", description, meme_filename, NULL, NULL, stdout ); /* Write the HMM. */ write_mhmm(verbosity, the_hmm, stdout); /* Print the program parameters. */ if (print_params) { printf("Program parameters for mhmm\n"); printf(" MEME file: %s\n", meme_filename); printf(" Motifs:"); write_string_list(" ", requested_motifs, stdout); printf("\n"); printf(" Model topology: %s\n", convert_enum_type(hmm_type, HMM_STRS, NUM_HMM_T)); printf(" States per spacer: %d\n", spacer_states); printf(" Spacers are free-insertion modules: %s\n", boolean_to_string(fim)); printf("\n"); } free_array(background); free_string_list(requested_motifs); free_order(order_spacing); free_matrix(transp_freq); free_matrix(spacer_ave); for (i_motif = 0; i_motif < num_motifs; i_motif++) free_motif(&(motifs[i_motif])); free_mhmm(the_hmm); return(0); }
int main (int argc, char * argv[]) { const int P_LEFT = -1; const int P_ONETAILED = 0; const int P_RIGHT = 1; const int P_TWOTAILED = 2; const int P_ALL = 3; // Default parameter settings. verbosity = NORMAL_VERBOSE; int requested_p_value = P_ALL; const int num_options = 2; cmdoption const options[] = { { "verbosity", REQUIRED_VALUE }, { "p-value", REQUIRED_VALUE } }; // Define the usage message. char usage[400] = ""; strcat(usage, "USAGE: ranksum_test [options] <n> <p> <r>\n"); strcat(usage, "\n"); strcat(usage, " <n> number of samples \n"); strcat(usage, " <p> number of positives\n"); strcat(usage, " <r> ranksum of positives (may be a real number)\n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); strcat(usage, " --p-value -1|0|1|2|3 (-1=left, 0=one-tailed,1=right,\n" " 2=two-tailed, 3=all (default))\n"); strcat(usage, " --verbosity 1|2|3|4 (default = 2)\n"); strcat(usage, "\n"); // Parse the command line. int option_index = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; char line[MAX_LINE]; // Buffer for reading. simple_setopt(argc, argv, num_options, options); while(1) { // Read the next option, and break if we're done. int c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { simple_getopterror(&message); die("Error processing command line options (%s)\n", message); } if (strcmp(option_name, "p-value") == 0) { requested_p_value = atoi(option_value); if (requested_p_value < -1 || requested_p_value > 3) die("Error requested p-value parameter unknown (%d)\n", requested_p_value); } else if (strcmp(option_name, "verbosity") == 0) { verbosity = atoi(option_value); } } // Read the single required argument. if (option_index + 3 != argc) { fprintf(stderr, "%s", usage); exit(1); } int n = atoi(argv[option_index]);option_index++; int p = atoi(argv[option_index]);option_index++; double r = atof(argv[option_index]);option_index++; RSR_T* mww = ranksum_from_stats(n,p,r); // Print to stdout. if (requested_p_value == P_RIGHT){ printf("%g\n",RSR_get_p_right(mww)); } else if (requested_p_value == P_LEFT){ printf("%g\n",RSR_get_p_left(mww)); } else if (requested_p_value == P_ONETAILED){ printf("%g\n",RSR_get_p_onetailed(mww)); } else if (requested_p_value == P_TWOTAILED){ printf("%g\n",RSR_get_p_twotailed(mww)); } else { printf("p-left\t%g\tp-right\t%g\tone-tailed\t%g\ttwo-tailed\t%g\tU-value\t%g\n", RSR_get_p_left(mww), RSR_get_p_right(mww), RSR_get_p_onetailed(mww), RSR_get_p_twotailed(mww), RSR_get_u(mww)); } return(0); }
/************************************************************************* * Entry point for ama *************************************************************************/ int main(int argc, char *argv[]) { int max_seq_length = MAX_SEQ; STRING_LIST_T* selected_motifs = NULL; double pseudocount = 0.01; int output_format = CISML_FORMAT; program_name = "ama"; int scoring = AVG_ODDS; BOOLEAN_T pvalues = FALSE; BOOLEAN_T normalize_scores = FALSE; BOOLEAN_T combine_duplicates = FALSE; int num_gc_bins = 1; int sdbg_order = -1; // don't use sequence background BOOLEAN_T scan_both_strands = TRUE; ARRAY_T* pos_bg_freqs = NULL; ARRAY_T* rev_bg_freqs = NULL; clock_t c0, c1; /* measuring cpu_time */ CISML_T *cisml; char * out_dir = NULL; BOOLEAN_T clobber = FALSE; int i; int last = 0; ALPH_T alph = INVALID_ALPH; /********************************************** * COMMAND LINE PROCESSING **********************************************/ const int num_options = 16; cmdoption const motif_scan_options[] = { { "max-seq-length", REQUIRED_VALUE }, { "motif", REQUIRED_VALUE }, { "motif-pseudo", REQUIRED_VALUE }, { "rma", NO_VALUE }, { "pvalues", NO_VALUE }, { "sdbg", REQUIRED_VALUE }, { "norc", NO_VALUE }, { "cs", NO_VALUE }, { "o-format", REQUIRED_VALUE }, { "o", REQUIRED_VALUE }, { "oc", REQUIRED_VALUE }, { "scoring", REQUIRED_VALUE }, { "verbosity", REQUIRED_VALUE }, { "gcbins", REQUIRED_VALUE }, { "last", REQUIRED_VALUE }, { "version", NO_VALUE } }; int option_index = 0; // Define the usage message. char usage[] = "USAGE: ama [options] <motif file> <sequence file> [<background file>]\n" "\n" " Options:\n" " --sdbg <order>\t\t\tUse Markov background model of\n" " \t\t\t\t\torder <order> derived from the sequence\n" " \t\t\t\t\tto compute its likelihood ratios.\n" " \t\t\t\t\tOverrides --pvalues, --gcbins and --rma;\n" " \t\t\t\t\t<background file> is required unless\n" " \t\t\t\t\t--sdbg is given.\n" " --motif <id>\t\t\tUse only the motif identified by <id>.\n" " \t\t\t\t\tThis option may be repeated.\n" " --motif-pseudo <float>\t\tThe value <float> times the background\n" " \t\t\t\t\tfrequency is added to the count of each\n" " \t\t\t\t\tletter when creating the likelihood \n" " \t\t\t\t\tratio matrix (default: %g).\n" " --norc\t\t\t\tDisables the scanning of the reverse\n" " \t\t\t\t\tcomplement strand.\n" " --scoring [avg-odds|max-odds]\tIndicates whether the average or \n" " \t\t\t\t\tthe maximum odds should be calculated\n" " \t\t\t\t\t(default: avg-odds)\n" " --rma\t\t\t\tScale motif scores to the range 0-1.\n" " \t\t\t\t\t(Relative Motif Affinity).\n" " \t\t\t\t\tMotif scores are scaled by the maximum\n" " \t\t\t\t\tscore achievable by that PWM. (default:\n" " \t\t\t\t\tmotif scores are not normalized)\n" " --pvalues\t\t\t\tPrint p-value of avg-odds score in cisml\n" " \t\t\t\t\toutput. Ignored for max-odds scoring.\n" " \t\t\t\t\t(default: p-values are not printed)\n" " --gcbins <bins>\t\t\tCompensate p-values for GC content of\n" " \t\t\t\t\teach sequence using given number of \n" " \t\t\t\t\tGC range bins. Recommended bins: 41.\n" " \t\t\t\t\t(default: p-values are based on\n" " \t\t\t\t\tfrequencies in background file)\n" " --cs\t\t\t\tEnable combining sequences with same\n" " \t\t\t\t\tidentifier by taking the average score\n" " \t\t\t\t\tand the Sidac corrected p-value.\n" " --o-format [gff|cisml]\t\tOutput file format (default: cisml)\n" " \t\t\t\t\tignored if --o or --oc option used\n" " --o <directory>\t\t\tOutput all available formats to\n" " \t\t\t\t\t<directory>; give up if <directory>\n" " \t\t\t\t\texists\n" " --oc <directory>\t\t\tOutput all available formats to\n" " \t\t\t\t\t<directory>; if <directory> exists\n" " \t\t\t\t\toverwrite contents\n" " --verbosity [1|2|3|4]\t\tControls amount of screen output\n" " \t\t\t\t\t(default: %d)\n" " --max-seq-length <int>\t\tSet the maximum length allowed for \n" " \t\t\t\t\tinput sequences. (default: %d)\n" " --last <int>\t\t\tUse only scores of (up to) last <n>\n" " \t\t\t\t\tsequence positions to compute AMA.\n" " --version \t\t\tPrint version and exit.\n" "\n"; // Parse the command line. if (simple_setopt(argc, argv, num_options, motif_scan_options) != NO_ERROR) { die("Error processing command line options: option name too long.\n"); } BOOLEAN_T setoutputformat = FALSE; BOOLEAN_T setoutputdirectory = FALSE; while (TRUE) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { (void) simple_getopterror(&message); die("Error processing command line options (%s).\n", message); } else if (strcmp(option_name, "max-seq-length") == 0) { max_seq_length = atoi(option_value); } else if (strcmp(option_name, "norc") == 0) { scan_both_strands = FALSE; } else if (strcmp(option_name, "cs") == 0) { combine_duplicates = TRUE; } else if (strcmp(option_name, "motif") == 0) { if (selected_motifs == NULL) { selected_motifs = new_string_list(); } add_string(option_value, selected_motifs); } else if (strcmp(option_name, "motif-pseudo") == 0) { pseudocount = atof(option_value); } else if (strcmp(option_name, "o-format") == 0) { if (setoutputdirectory) { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "output directory specified, ignoring --o-format\n"); } else { setoutputformat = TRUE; if (strcmp(option_value, "gff") == 0) output_format = GFF_FORMAT; else if (strcmp(option_value, "cisml") == 0) output_format = CISML_FORMAT; else { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Output format not known. Using standard instead (cisML).\n"); output_format = CISML_FORMAT; } } } else if (strcmp(option_name, "o") == 0 || strcmp(option_name, "oc") == 0) { setoutputdirectory = TRUE; if (setoutputformat) { if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "output directory specified, ignoring --o-format\n"); } clobber = strcmp(option_name, "oc") == 0; out_dir = (char*) malloc (sizeof(char)*(strlen(option_value)+1)); strcpy(out_dir, option_value); output_format = DIRECTORY_FORMAT; } else if (strcmp(option_name, "verbosity") == 0) { verbosity = atoi(option_value); } else if (strcmp(option_name, "scoring") == 0) { if (strcmp(option_value, "max-odds") == 0) scoring = MAX_ODDS; else if (strcmp(option_value, "avg-odds") == 0) scoring = AVG_ODDS; else if (strcmp(option_value, "sum-odds") == 0) scoring = SUM_ODDS; else die("Specified scoring scheme not known.\n", message); } else if (strcmp(option_name, "pvalues") == 0) { pvalues = TRUE; } else if (strcmp(option_name, "rma") == 0) { normalize_scores = TRUE; fprintf(stderr, "Normalizing motif scores using RMA method.\n"); } else if (strcmp(option_name, "gcbins") == 0) { num_gc_bins = atoi(option_value); pvalues = TRUE; if (num_gc_bins <= 1) die("Number of bins in --gcbins must be greater than 1.\n", message); } else if (strcmp(option_name, "sdbg") == 0) { sdbg_order = atoi(option_value); // >=0 means use sequence bkg } else if (strcmp(option_name, "last") == 0) { int i = 0; if (option_value[0] == '-') ++i; while (option_value[i] != '\0') { if (!isdigit(option_value[i])) { die("Specified parameter 'last' contains non-numeric characters.\n"); } ++i; } last = atoi(option_value); if (errno != 0) { die("Specified parameter 'last' could not be parsed as a number as:\n%s\n",strerror(errno)); } if (last < 0) { die("Specified parameter 'last' had negative value (%d) when only postive or zero values are allowed \n", last); } } else if (strcmp(option_name, "version") == 0) { fprintf(stdout, VERSION "\n"); exit(EXIT_SUCCESS); } } // --sdbg overrides --pvalues and --gcbins and --rma int req_args = 3; if (sdbg_order >= 0) { pvalues = FALSE; normalize_scores = FALSE; num_gc_bins = 1; req_args = 2; } // Check all required arguments given if (sdbg_order >= 0 && argc > option_index + req_args) { die("<background file> cannot be given together with --sdbg.\n"); } else if (argc != option_index + req_args) { fprintf(stderr, usage, pseudocount, verbosity, max_seq_length); exit(EXIT_FAILURE); } // Get required arguments. char* motif_filename = argv[option_index]; option_index++; char* fasta_filename = argv[option_index]; option_index++; char* bg_filename; if (req_args == 3) { // required unless --sdbg given bg_filename = argv[option_index]; option_index++; } else { bg_filename = "--uniform--"; // So PSSMs will use uniform background; // we can multiply them out later. } // measure time c0 = clock(); // Set up hash tables for computing reverse complement if doing --sdbg if (sdbg_order >= 0) setup_hash_alph(DNAB); // Create cisml data structure for recording results cisml = allocate_cisml(program_name, motif_filename, fasta_filename); set_cisml_background_file(cisml, bg_filename); /********************************************** * Read the motifs and background model. **********************************************/ int num_motifs = 0; MREAD_T *mread; ARRAYLST_T *motifs; PSSM_PAIR_T** pssm_pairs; // note pssm_pairs is an array of pointers //this reads any meme file, xml, txt and html mread = mread_create(motif_filename, OPEN_MFILE); mread_set_bg_source(mread, bg_filename); mread_set_pseudocount(mread, pseudocount); motifs = mread_load(mread, NULL); alph = mread_get_alphabet(mread); pos_bg_freqs = mread_get_background(mread); mread_destroy(mread); num_motifs = arraylst_size(motifs); // allocate memory for PSSM pairs pssm_pairs = (PSSM_PAIR_T**)mm_malloc(sizeof(PSSM_PAIR_T*) * num_motifs); if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Number of motifs in file %d.\n", num_motifs); // make a CISML pattern to hold scores for each motif PATTERN_T** patterns = NULL; Resize(patterns, num_motifs, PATTERN_T*); int motif_index; for (motif_index = 0; motif_index < num_motifs; motif_index++) { MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs); patterns[motif_index] = allocate_pattern(get_motif_id(motif), ""); add_cisml_pattern(cisml, patterns[motif_index]); } // make reverse complement motifs and background frequencies. if (scan_both_strands == TRUE) { add_reverse_complements(motifs); assert(arraylst_size(motifs) == (2 * num_motifs)); rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs)); complement_dna_freqs(pos_bg_freqs, rev_bg_freqs); } /************************************************************** * Convert motif matrices into log-odds matrices. * Scale them. * Compute the lookup tables for the PDF of scaled log-odds scores. **************************************************************/ int ns = scan_both_strands ? 2 : 1; // number of strands for (motif_index = 0; motif_index < num_motifs; motif_index++) { MOTIF_T *motif, *motif_rc; motif = (MOTIF_T*)arraylst_get(motif_index*ns, motifs); if (scan_both_strands) motif_rc = (MOTIF_T*)arraylst_get(motif_index*ns + 1, motifs); else motif_rc = NULL; /* * Note: If scanning both strands, we complement the motif frequencies * but not the background frequencies so the motif looks the same. * However, the given frequencies are used in computing the p-values * since they represent the frequencies on the negative strands. * (If we instead were to complement the input sequence, keeping the * the motif fixed, we would need to use the complemented frequencies * in computing the p-values. Is that any clearer?) */ double range = 300; // 100 is not very good; 1000 is great but too slow PSSM_T* pos_pssm = build_motif_pssm( motif, pos_bg_freqs, pos_bg_freqs, NULL, // Priors not used 0.0L, // alpha not used range, num_gc_bins, TRUE ); PSSM_T* neg_pssm = (scan_both_strands ? build_motif_pssm( motif_rc, rev_bg_freqs, pos_bg_freqs, NULL, // Priors not used 0.0L, // alpha not used range, num_gc_bins, TRUE ) : NULL ); pssm_pairs[motif_index] = create_pssm_pair(pos_pssm, neg_pssm); } // Open the FASTA file for reading. FILE* fasta_file = NULL; if (open_file(fasta_filename, "r", FALSE, "FASTA", "sequences", &fasta_file) == 0) { die("Couldn't open the file %s.\n", fasta_filename); } if (verbosity >= NORMAL_VERBOSE) { if (last == 0) { fprintf(stderr, "Using entire sequence\n"); } else { fprintf(stderr, "Limiting sequence to last %d positions.\n", last); } } /************************************************************** * Read in all sequences and score with all motifs **************************************************************/ int seq_loading_num = 0; // keeps track on the number of sequences read in total int seq_counter = 0; // holds the index to the seq in the pattern int unique_seqs = 0; // keeps track on the number of unique sequences BOOLEAN_T need_postprocessing = FALSE; SEQ_T* sequence = NULL; RBTREE_T* seq_ids = rbtree_create(rbtree_strcasecmp,NULL,free,rbtree_intcpy,free); RBNODE_T* seq_node; BOOLEAN_T created; while (read_one_fasta(alph, fasta_file, max_seq_length, &sequence)) { ++seq_loading_num; created = FALSE; char* seq_name = get_seq_name(sequence); int seq_len = get_seq_length(sequence); int scan_len; if (last != 0) { scan_len = last; } else { scan_len = seq_len; } // red-black trees are only required if duplicates should be combined if (combine_duplicates){ //lookup seq id and create new entry if required, return sequence index char *tmp_id = mm_malloc(strlen(seq_name)+1); // required copy for rb-tree strncpy(tmp_id,seq_name,strlen(seq_name)+1); seq_node = rbtree_lookup(seq_ids, tmp_id, TRUE, &created); if (created) {// assign it a loading number rbtree_set(seq_ids, seq_node, &unique_seqs); seq_counter = unique_seqs; ++unique_seqs; } else { seq_counter = *((int*)rbnode_get(seq_node)); } } // // Set up sequence-dependent background model and compute // log cumulative probability of sequence. // double *logcumback = NULL; // array of log cumulative probs. if (sdbg_order >= 0) { Resize(logcumback, seq_len+1, double); char* raw_seq = get_raw_sequence(sequence); BOOLEAN rc = FALSE; double *a_cp = get_markov_from_sequence(raw_seq, alph_string(alph), rc, sdbg_order, 0); log_cum_back(raw_seq, a_cp, sdbg_order, logcumback); myfree(a_cp); } // Get the GC content of the sequence if binning p-values by GC // and store it in the sequence object. if (num_gc_bins > 1) { ARRAY_T *freqs = get_sequence_freqs(sequence, alph); set_total_gc_sequence(sequence, get_array_item(1,freqs) + get_array_item(2,freqs)); // f(C) + f(G) free_array(freqs); // clean up } else { set_total_gc_sequence(sequence, -1); // flag ignore } /************************************************************** * Process all motifs. **************************************************************/ int ns = scan_both_strands ? 2 : 1; for (motif_index = 0; motif_index < num_motifs; motif_index++) { PATTERN_T *pattern = patterns[motif_index]; MOTIF_T* motif = (MOTIF_T*)arraylst_get(ns*motif_index, motifs); char* motif_id = (scan_both_strands ? get_motif_st_id(motif) : get_motif_id(motif)); if (verbosity >= HIGH_VERBOSE) { fprintf(stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif)); } if ((selected_motifs == NULL) || (have_string(get_motif_id(motif), selected_motifs) == TRUE)) { if (verbosity >= HIGHER_VERBOSE) { fprintf(stderr, "Scanning %s sequence with length %d " "abbreviated to %d with motif %s with length %d.\n", seq_name, seq_len, scan_len, motif_id, get_motif_length(motif)); } SCANNED_SEQUENCE_T* scanned_seq = NULL; if (!combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter){ // Create a scanned_sequence record and save it in the pattern. scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern); set_scanned_sequence_length(scanned_seq, scan_len); } else { // get existing sequence record scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter]; set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq))); } // check if scanned component of sequence has sufficient length for the motif if (scan_len < get_motif_length(motif)) { // set score to zero and p-value to 1 if not set yet if(!has_scanned_sequence_score(scanned_seq)){ set_scanned_sequence_score(scanned_seq, 0.0); } if(pvalues && !has_scanned_sequence_pvalue(scanned_seq)){ set_scanned_sequence_pvalue(scanned_seq, 1.0); } add_scanned_sequence_scanned_position(scanned_seq); if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) need_postprocessing = TRUE; if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "%s too short for motif %s. Score set to 0!\n", seq_name, motif_id); } else { // scan the sequence using average/maximum motif affinity ama_sequence_scan(alph, sequence, logcumback, pssm_pairs[motif_index], scoring, pvalues, last, scanned_seq, &need_postprocessing); } } else { if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "Skipping motif %s.\n", motif_id); } } // All motifs parsed free_seq(sequence); if (sdbg_order >= 0) myfree(logcumback); } // read sequences
int main(int argc, char **argv) { // Defaults for command line options. char* bfile = NULL; // Background file. int seed = 0; // For random numbers. int min = LOWEST; // Minimum sequence length. int max = HIGHEST; // Maximum sequence length. int use_order = -1; // Use order defined in bfile. int type = 0; // (See usage statement.) BOOLEAN_T dummy = FALSE; // Don't print dummy sequence. // Define command line options. const int num_options = 7; cmdoption const options[] = { {"bfile", REQUIRED_VALUE}, {"seed", REQUIRED_VALUE}, {"minseq", REQUIRED_VALUE}, {"maxseq", REQUIRED_VALUE}, {"order", REQUIRED_VALUE}, {"type", REQUIRED_VALUE}, {"dummy", NO_VALUE} }; int option_index = 0; simple_setopt(argc, argv, num_options, options); // Define the usage message. char usage[1000] = ""; strcpy(usage, "USAGE: gendb [options] <numseqs>\n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); strcat(usage, " --bfile <file>\n"); strcat(usage, " --seed <int> (default=0)\n"); strcat(usage, " --minseq <int> (default=50)\n"); strcat(usage, " --maxseq <int> (default=2000)\n"); strcat(usage, " --order <int> use Markov model of given order\n"); strcat(usage, " --type 0|1|2|3|4\n"); strcat(usage, " 0=protein (default)\n"); strcat(usage, " 1=dna with ambiguous characters\n"); strcat(usage, " 2=codons\n"); strcat(usage, " 3=dna w/o ambiguous characters\n"); strcat(usage, " 4=protein w/o ambiguous characters\n"); strcat(usage, " --dummy\n"); // Parse the command line. while (1) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char* message = ""; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { die("Error processing command line options (%s)\n", message); } if (strcmp(option_name, "bfile") == 0) { bfile = option_value; } else if (strcmp(option_name, "seed") == 0) { seed = atoi(option_value); } else if (strcmp(option_name, "minseq") == 0) { min = (int)atof(option_value); } else if (strcmp(option_name, "maxseq") == 0) { max = (int)atof(option_value); } else if (strcmp(option_name, "order") == 0) { use_order = atoi(option_value); } else if (strcmp(option_name, "type") == 0) { type = atoi(option_value); } else if (strcmp(option_name, "dummy") == 0) { dummy = TRUE; } } if (option_index + 1 != argc) { fprintf(stderr, "%s", usage); exit(EXIT_FAILURE); } int nseqs = atoi(argv[option_index]); fprintf(stderr, "Generating %d sequences.\n", nseqs); // Print dummy sequence if asked to. if (dummy) { printf(">SEQ_0 n= %d s= %d l= %d seed= %d\n", nseqs, min, max, seed); } gendb( stdout, type, bfile, use_order, NULL, // 0-order model. -- WSN 8/13/03 nseqs, min, max, seed ); exit(0); } // main
void ramen_getopt(int argc, char *argv[]) { const int num_options = 12; cmdoption const motif_scan_options[] = { {"bgfile", REQUIRED_VALUE}, {"bgformat", REQUIRED_VALUE}, {"repeats", REQUIRED_VALUE}, {"pseudocount", REQUIRED_VALUE}, {"pvalue-cutoff", REQUIRED_VALUE}, {"verbose", REQUIRED_VALUE}, {"linreg-dumpdir", REQUIRED_VALUE}, {"linreg-switchxy", REQUIRED_VALUE}, {"log-fscores", REQUIRED_VALUE}, {"log-pwmscores", REQUIRED_VALUE}, {"normalise-motifs", REQUIRED_VALUE}, {"help", NO_VALUE}, }; int option_index = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; BOOLEAN_T bad_options = FALSE; int i; if (simple_setopt(argc, argv, num_options, motif_scan_options) != NO_ERROR) { die("Error processing command line options: option name too long.\n"); } /* * Now parse the command line options */ //simple_getopt will return 0 when there are no more options to parse while(simple_getopt(&option_name, &option_value, &option_index) > 0) { if (strcmp(option_name, "bgfile") == 0) { args.bg_filename = option_value; } else if (strcmp(option_name, "bgformat") == 0) { if (atoi(option_value)==MOTIF_BG) { args.bg_format = MOTIF_BG; } else if (atoi(option_value)==FILE_BG) { args.bg_format = FILE_BG; } else if (atoi(option_value)==UNIFORM_BG) { args.bg_format = UNIFORM_BG; } else { ramen_usage(); ramen_terminate(1); } } else if (strcmp(option_name, "pvalue-cutoff") == 0) { args.pvalue_cutoff = atof(option_value); } else if (strcmp(option_name, "repeats") == 0) { args.repeats = atof(option_value); } else if (strcmp(option_name, "pseudocount") == 0) { args.pseudocount = atof(option_value); } else if (strcmp(option_name, "verbose") == 0) { args.verbose = atoi(option_value); verbosity = args.verbose; if (args.verbose <= INVALID_VERBOSE || args.verbose > DUMP_VERBOSE) { ramen_usage(); ramen_terminate(1); } } else if (strcmp(option_name, "log-fscores") == 0) { if (strcmp(option_value,"on")==0) { args.log_fscores = TRUE; } else if (strcmp(option_value,"off")==0){ args.log_fscores = FALSE; } else { ramen_usage(); ramen_terminate(1); } } else if (strcmp(option_name, "log-pwmscores") == 0) { if (strcmp(option_value,"on")==0) { args.log_pwmscores = TRUE; } else if (strcmp(option_value,"off")==0){ args.log_pwmscores = FALSE; } else { ramen_usage(); ramen_terminate(1); } } else if (strcmp(option_name, "normalise-motifs") == 0) { if (strcmp(option_value,"on")==0) { args.linreg_normalise = TRUE; } else if (strcmp(option_value,"off")==0){ args.linreg_normalise = FALSE; } else { ramen_usage(); ramen_terminate(1); } } else if (strcmp(option_name, "linreg-dumpdir") == 0) { args.linreg_dump_dir = option_value; } else if (strcmp(option_name, "linreg-switchxy") == 0) { if (strcmp(option_value,"on")==0) { args.linreg_switchxy = TRUE; } else if (strcmp(option_value,"off")==0){ args.linreg_switchxy = FALSE; } else { ramen_usage(); ramen_terminate(1); } } else if (strcmp(option_name, "help") == 0) { printf("%s",ramen_get_usage()); //not to stderr ramen_terminate(0); } else { printf("Error: %s is not a recognised switch.\n", option_name); ramen_usage(); ramen_terminate(1); } option_index++; } args.sequence_filename = argv[option_index]; args.motif_filenames = &argv[option_index+1]; // FIXME: must now iterate until argc getting all motif DB filenames args.number_motif_files = argc-option_index-1; /* Now validate the options. * * Illegal combinations are: * - sequence bg and no sequence * - no motif * - no sequences * - each file exists. */ if (args.motif_filenames == NULL) { fprintf(stderr, "Error: Motif file not specified.\n"); bad_options = TRUE; } else { int i; for (i = 0; i < args.number_motif_files; i++) { if (!file_exists(args.motif_filenames[i])) { fprintf(stderr, "Error: Specified motif '%s' file does not exist.\n", args.motif_filenames[i]); bad_options = TRUE; } } } if (args.sequence_filename == NULL) { fprintf(stderr, "Error: Sequence file not specified.\n"); bad_options = TRUE; } else if (!file_exists(args.sequence_filename)) { fprintf(stderr, "Error: Specified sequence file does not exist.\n"); bad_options = TRUE; } if (args.bg_format == MOTIF_BG) { //bgfile is the same as the motif file. //TODO: make more robust args.bg_filename = argv[option_index]; args.bg_filename = NULL; } if (bad_options) { ramen_usage(); ramen_terminate(1); } }
/*********************************************************************** Process command line options ***********************************************************************/ static void process_command_line( int argc, char* argv[], CENTRIMO_OPTIONS_T *options ) { // Define command line options. const int num_options = 12; cmdoption const centrimo_options[] = { {"bgfile", REQUIRED_VALUE}, {"o", REQUIRED_VALUE}, {"oc", REQUIRED_VALUE}, {"score", REQUIRED_VALUE}, {"motif-pseudo", REQUIRED_VALUE}, {"ethresh", REQUIRED_VALUE}, {"maxbin", REQUIRED_VALUE}, {"norc", NO_VALUE}, {"noflip", NO_VALUE}, {"desc", REQUIRED_VALUE}, {"dfile", REQUIRED_VALUE}, {"verbosity", REQUIRED_VALUE} }; int option_index = 0; /* Make sure various options are set to NULL or defaults. */ options->alphabet = DNA_ALPH; options->allow_clobber = TRUE; options->scan_both_strands = TRUE; options->no_flip = FALSE; options->description = NULL; options->desc_file = NULL; options->bg_source = NULL; options->output_dirname = "centrimo_out"; options->seq_source = NULL; options->motif_sources = arraylst_create(); options->score_thresh = DEFAULT_SCORE_THRESH; options->pseudocount = DEFAULT_PSEUDOCOUNT; options->evalue_thresh = DEFAULT_EVALUE_THRESH; options->max_window = DEFAULT_MAX_WINDOW; // no need to copy, as string is declared in argv array options->selected_motifs = rbtree_create(rbtree_strcmp, NULL, NULL, NULL, NULL); verbosity = NORMAL_VERBOSE; simple_setopt(argc, argv, num_options, centrimo_options); // Parse the command line. while (TRUE) { int c = 0; char* option_name = NULL; char* option_value = NULL; const char * message = NULL; // Read the next option, and break if we're done. c = simple_getopt(&option_name, &option_value, &option_index); if (c == 0) { break; } else if (c < 0) { (void) simple_getopterror(&message); fprintf(stderr, "Error processing command line options (%s)\n", message); fprintf(stderr, CENTRIMO_USAGE, DEFAULT_PSEUDOCOUNT, DEFAULT_SCORE_THRESH, DEFAULT_EVALUE_THRESH, NORMAL_VERBOSE); exit(EXIT_FAILURE); } if (strcmp(option_name, "bgfile") == 0){ options->bg_source = option_value; } else if (strcmp(option_name, "ethresh") == 0){ options->evalue_thresh = atof(option_value); } else if (strcmp(option_name, "maxbin") == 0){ // max_window is one less than the number of places a motif can align // within the central window options->max_window = atoi(option_value) - 1; } else if (strcmp(option_name, "motif") == 0){ rbtree_put(options->selected_motifs, option_value, NULL); } else if (strcmp(option_name, "motif-pseudo") == 0){ options->pseudocount = atof(option_value); } else if (strcmp(option_name, "norc") == 0){ options->scan_both_strands = FALSE; } else if (strcmp(option_name, "noflip") == 0){ options->no_flip = TRUE; } else if (strcmp(option_name, "o") == 0){ // Set output directory with no clobber options->output_dirname = option_value; options->allow_clobber = FALSE; } else if (strcmp(option_name, "oc") == 0){ // Set output directory with clobber options->output_dirname = option_value; options->allow_clobber = TRUE; } else if (strcmp(option_name, "score") == 0){ options->score_thresh = atof(option_value); } else if (strcmp(option_name, "desc") == 0) { options->description = option_value; } else if (strcmp(option_name, "dfile") == 0) { options->desc_file = option_value; } else if (strcmp(option_name, "verbosity") == 0){ verbosity = atoi(option_value); } } // Must have sequence and motif file names if (argc < option_index + 2) { fprintf(stderr, "Sequences and motifs are both required\n"); fprintf(stderr, CENTRIMO_USAGE, DEFAULT_PSEUDOCOUNT, DEFAULT_SCORE_THRESH, DEFAULT_EVALUE_THRESH, NORMAL_VERBOSE); exit(EXIT_FAILURE); } // Record the input file names options->seq_source = argv[option_index++]; for (;option_index < argc; option_index++) arraylst_add(argv[option_index], options->motif_sources); // Set up path values for needed stylesheets and output files. }