/************************************************************************* * Entry point for centrimo *************************************************************************/ int main(int argc, char *argv[]) { CENTRIMO_OPTIONS_T options; SEQ_SITES_T seq_sites; SITE_COUNTS_T counts; int seqN, motifN, seqlen, db_i, motif_i, i; double log_pvalue_thresh; SEQ_T** sequences = NULL; ARRAY_T* bg_freqs = NULL; ARRAYLST_T *stats_list; MOTIF_DB_T **dbs, *db; MREAD_T *mread; MOTIF_STATS_T *stats; MOTIF_T *motif, *rev_motif; PSSM_T *pos_pssm, *rev_pssm; char *sites_path, *desc; FILE *sites_file; HTMLWR_T *html; JSONWR_T *json; // COMMAND LINE PROCESSING process_command_line(argc, argv, &options); // load the sequences read_sequences(options.alphabet, options.seq_source, &sequences, &seqN); seqlen = (seqN ? get_seq_length(sequences[0]) : 0); // calculate a sequence background (unless other background is given) if (!options.bg_source) { bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences); } // load the motifs motifN = 0; dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources)); for (i = 0; i < arraylst_size(options.motif_sources); i++) { char* db_source; db_source = (char*)arraylst_get(i, options.motif_sources); dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, options.pseudocount, options.selected_motifs, options.alphabet); motifN += arraylst_size(dbs[i]->motifs); } log_pvalue_thresh = log(options.evalue_thresh) - log(motifN); // Setup some things for double strand scanning if (options.scan_both_strands == TRUE) { // Set up hash tables for computing reverse complement setup_hash_alph(DNAB); setalph(0); // Correct background by averaging on freq. for both strands. average_freq_with_complement(options.alphabet, bg_freqs); normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs); calc_ambigs(options.alphabet, FALSE, bg_freqs); } // Create output directory if (create_output_directory(options.output_dirname, options.allow_clobber, (verbosity >= NORMAL_VERBOSE))) { die("Couldn't create output directory %s.\n", options.output_dirname); } // open output files sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME); sites_file = fopen(sites_path, "w"); free(sites_path); // setup html monolith writer json = NULL; if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) { htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME); htmlwr_replace(html, "centrimo_data.js", "data"); json = htmlwr_output(html); if (json == NULL) die("Template does not contain data section.\n"); } else { DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n"); } if (json) { // output some top level variables jsonwr_str_prop(json, "version", VERSION); jsonwr_str_prop(json, "revision", REVISION); jsonwr_str_prop(json, "release", ARCHIVE_DATE); jsonwr_str_array_prop(json, "cmd", argv, argc); jsonwr_property(json, "options"); jsonwr_start_object_value(json); jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount); jsonwr_dbl_prop(json, "score", options.score_thresh); jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh); jsonwr_lng_prop(json, "maxbin", options.max_window+1); jsonwr_bool_prop(json, "norc", !options.scan_both_strands); jsonwr_bool_prop(json, "noflip", options.no_flip); jsonwr_end_object_value(json); // output the description desc = prepare_description(&options); if (desc) { jsonwr_str_prop(json, "job_description", desc); free(desc); } // output size metrics jsonwr_lng_prop(json, "seqlen", seqlen); jsonwr_lng_prop(json, "tested", motifN); // output the fasta db jsonwr_property(json, "sequence_db"); jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", options.seq_source); jsonwr_lng_prop(json, "count", seqN); jsonwr_end_object_value(json); // output the motif dbs jsonwr_property(json, "motif_dbs"); jsonwr_start_array_value(json); for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", db->source); jsonwr_lng_prop(json, "count", arraylst_size(db->motifs)); jsonwr_end_object_value(json); } jsonwr_end_array_value(json); // start the motif array jsonwr_property(json, "motifs"); jsonwr_start_array_value(json); } /************************************************************** * Tally the positions of the best sites for each of the * selected motifs. **************************************************************/ // prepare the sequence sites memset(&seq_sites, 0, sizeof(SEQ_SITES_T)); // prepare the site counts counts.allocated = ((2 * seqlen) - 1); counts.sites = mm_malloc(sizeof(double) * counts.allocated); // prepare the motifs stats list stats_list = arraylst_create(); // prepare the other vars motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL; for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) { motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs); DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n", get_motif_id(motif), get_motif_length(motif)); // reset the counts for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0; counts.total_sites = 0; // create the pssm pos_pssm = make_pssm(bg_freqs, motif); // If required, do the same for the reverse complement motif. if (options.scan_both_strands) { rev_motif = dup_rc_motif(motif); rev_pssm = make_pssm(bg_freqs, rev_motif); } // scan the sequences for (i = 0; i < seqN; i++) score_sequence(&options, sequences[i], pos_pssm, rev_pssm, &seq_sites, &counts); // DEBUG check that the sum of the sites is close to the site count double sum_check = 0, sum_diff; for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i]; sum_diff = counts.total_sites - sum_check; if (sum_diff < 0) sum_diff = -sum_diff; if (sum_diff > 0.1) { fprintf(stderr, "Warning: site counts don't sum to accurate value! " "%g != %ld", sum_check, counts.total_sites); } // output the plain text site counts output_site_counts(sites_file, seqlen, db, motif, &counts); // compute the best central window stats = compute_stats(options.max_window, seqlen, db, motif, &counts); // check if it passes the threshold if (json && stats->log_adj_pvalue <= log_pvalue_thresh) { output_motif_json(json, stats, &counts); arraylst_add(stats, stats_list); } else { free(stats); } // Free memory associated with this motif. free_pssm(pos_pssm); free_pssm(rev_pssm); destroy_motif(rev_motif); } } if (json) jsonwr_end_array_value(json); // finish writing sites fclose(sites_file); // finish writing html file if (html) { if (htmlwr_output(html) != NULL) { die("Found another JSON replacement!\n"); } htmlwr_destroy(html); } // write text file output_centrimo_text(&options, motifN, stats_list); // Clean up. for (i = 0; i < seqN; ++i) { free_seq(sequences[i]); } free(sequences); for (i = 0; i < arraylst_size(options.motif_sources); i++) { free_db(dbs[i]); } free(dbs); free_array(bg_freqs); free(counts.sites); free(seq_sites.sites); arraylst_destroy(free, stats_list); cleanup_options(&options); return 0; }
int main(int argc, char *argv[]) { set_new_handler(alloc_error); if(argc < 6) { print_usage(cout); exit(0); } string seqfile; // file with sequences string exprfile; // file with expression data string subsetfile; // file with subset of sequence names to search string scorefile; // file with scores if(! GetArg2(argc, argv, "-s", seqfile)) { cerr << "Please specify sequence file\n\n"; print_usage(cout); exit(0); } if(! GetArg2(argc, argv, "-o", outfile)) { cerr << "Please specify output file\n\n"; print_usage(cout); exit(0); } int search_type = UNDEFINED; if(GetArg2(argc, argv, "-ex", exprfile)) { search_type = EXPRESSION; } if(GetArg2(argc, argv, "-su", subsetfile)) { search_type = SUBSET; } if(GetArg2(argc, argv, "-sc", scorefile)) { search_type = SCORE; } if(search_type == UNDEFINED) { cerr << "Please specify either an expression data file, a file with a subset of sequence names, or a file with sequence scores.\n\n"; print_usage(cout); exit(0); } // Decide mode of running archive = false; if(! GetArg2(argc, argv, "-worker", worker)) { worker = -1; archive = true; } // Read parameters vector<string> seqs; cerr << "Reading sequence data from '" << seqfile << "'... "; get_fasta_fast(seqfile.c_str(), seqs, seq_nameset); cerr << "done.\n"; ngenes = seq_nameset.size(); npoints = 0; if(search_type == EXPRESSION) { cerr << "Reading expression data from '" << exprfile << "'... "; get_expr(exprfile.c_str(), expr, data_nameset); cerr << "done.\n"; npoints = expr[0].size(); nsubset = 0; } else if(search_type == SUBSET) { cerr << "Reading subset of sequences to search from '" << subsetfile << "'... "; get_list(subsetfile.c_str(), subset); cerr << "done.\n"; npoints = 0; for(int i = 0; i < ngenes; i++) { vector<float> row(0); expr.push_back(row); } nsubset = subset.size(); sort(subset.begin(), subset.end()); } else if(search_type == SCORE) { cerr << "Reading sequence scores from '" << scorefile << "'... "; get_scores(scorefile.c_str(), scores, data_nameset); cerr << "done.\n"; npoints = 1; nsubset = 0; } vector<vector <float> > newexpr; vector<float> newscores; if(search_type == EXPRESSION) { order_data_expr(newexpr); } if(search_type == SCORE) { order_data_scores(newscores); } if(search_type == EXPRESSION) { cerr << "Successfully read input files -- dataset size is " << ngenes << " sequences X " << npoints << " timepoints\n"; } else if(search_type == SUBSET) { cerr << "Successfully read input files -- dataset size is " << ngenes << " sequences, with " << nsubset << " to be searched\n"; } else if(search_type == SCORE) { cerr << "Successfully read input files -- dataset size is " << ngenes << " scored sequences\n"; } cerr << "Setting up MotifSearch... "; if(! GetArg2(argc, argv, "-numcols", ncol)) ncol = 10; if(! GetArg2(argc, argv, "-order", order)) order = 0; if(! GetArg2(argc, argv, "-simcut", simcut)) simcut = 0.8; if(! GetArg2(argc, argv, "-maxm", maxm)) maxm = 20; MotifSearch* ms; if(search_type == EXPRESSION) { ms = new MotifSearchExpr(seq_nameset, seqs, ncol, order, simcut, maxm, newexpr, npoints); } else if(search_type == SCORE) { ms = new MotifSearchScore(seq_nameset, seqs, ncol, order, simcut, maxm, newscores); } else { ms = new MotifSearchSubset(seq_nameset, seqs, ncol, order, simcut, maxm, subset); } ms->modify_params(argc, argv); ms->set_final_params(); ms->ace_initialize(); cerr << "done.\n"; cerr << "Random seed: " << ms->get_params().seed << '\n'; if(archive) { cerr << "Running in archive mode...\n"; string archinstr(outfile); archinstr.append(".ms"); ifstream archin(archinstr.c_str()); if(archin) { cerr << "Refreshing from existing archive file " << archinstr << "... "; ms->get_archive().read(archin); cerr << "done.\n"; } while(true) { int found = read_motifs(ms); if(found > 0) output(ms); if(found < 50) sleep(10); } } else { cerr << "Running as worker " << worker << "...\n"; int nruns = ms->positions_in_search_space()/(ms->get_params().expect * ncol); nruns *= ms->get_params().oversample; nruns /= ms->get_params().undersample; cerr << "Restarts planned: " << nruns << '\n'; string archinstr(outfile); archinstr.append(".ms"); string lockstr(outfile); lockstr.append(".lock"); for(int j = 1; j <= nruns; j++) { if(j == 1 || j % 50 == 0 || search_type == SUBSET) { struct flock fl; int fd; fl.l_type = F_RDLCK; fl.l_whence = SEEK_SET; fl.l_start = 0; fl.l_len = 0; fl.l_pid = getpid(); fd = open(lockstr.c_str(), O_RDONLY); if(fd == -1) { if(errno != ENOENT) cerr << "\t\tUnable to read lock file, error was " << strerror(errno) << "\n"; } else { while(fcntl(fd, F_SETLK, &fl) == -1) { cerr << "\t\tWaiting for lock release on archive file... \n"; sleep(10); } ifstream archin(archinstr.c_str()); if(archin) { cerr << "\t\tRefreshing archive from " << archinstr << "..."; ms->get_archive().clear(); ms->get_archive().read(archin); archin.close(); cerr << "done.\n"; } fl.l_type = F_UNLCK; fcntl(fd, F_SETLK, &fl); close(fd); cerr << "\t\tArchive now has " << ms->get_archive().nmots() << " motifs\n"; } } cerr << "\t\tSearch restart #" << j << "/" << nruns << "\n"; ms->search_for_motif(worker, j, outfile); } } delete ms; return 0; }
int main(int argc, char ** argv) { int i ; char *seedfile ; char *rna_fastafile ; char *expfile ; char *dataoutfile ; char *motifoutfile ; char *location ; char *icshape_file; //new line int dooptimize = 1 ; int doonlypositive = 0 ; float *E ; int *E_q ; int *M_q ; struct my_hsearch_data *h_rna_ind ; ENTRY e ; ENTRY* ep ; int hashret =0 ; int quantized = 1 ; int ebins = 0 ; int mbins = 2 ; int divbins = 50 ; float* E_q_bins = 0 ; int shuffle = 1000000 ; int rnd_fasta = 0 ; float max_p = 0.0000001 ; float max_z = -100 ; int mincount = 5 ; double minr = 5.0 ; double minratio = 0; int midx; float maxfreq = 0.5; float myfreq; float lastmyfreq; float best_lastmyfreq; int jn = 10; int jn_t = 6; int jn_f = 3; s_sequence **sequences ; s_motif **motifs ; s_motif **opt_motifs ; char **seq_names ; int seq_count ; int t_seq_count ; int icshape_count; //new line int motif_count = 0 ; seedfile = get_parameter(argc, argv, "-seedfile") ; rna_fastafile = get_parameter(argc, argv, "-rna_fastafile") ; dataoutfile = get_parameter(argc, argv, "-dataoutfile") ; motifoutfile = get_parameter(argc, argv, "-motifoutfile") ; location = get_parameter(argc, argv, "-location") ; icshape_file = get_parameter(argc, argv, "-icshapefile"); //new line expfile = get_parameter(argc, argv, "-expfile") ; quantized = atoi(get_parameter(argc, argv, "-quantized")); if (exist_parameter(argc, argv, "-max_p")) { max_p = atof(get_parameter(argc, argv, "-max_p")); } if (exist_parameter(argc, argv, "-max_z")) { max_z = atof(get_parameter(argc, argv, "-max_z")); } if (exist_parameter(argc, argv, "-minr")) { minr = atof(get_parameter(argc, argv, "-minr")); } if (exist_parameter(argc, argv, "-ebins")) { ebins = atoi(get_parameter(argc, argv, "-ebins")); } if (exist_parameter(argc, argv, "-jn_t")) { jn_t = atoi(get_parameter(argc, argv, "-jn_t")); } if (exist_parameter(argc, argv, "-shuffle")) { shuffle = atoi(get_parameter(argc, argv, "-shuffle")); } if (exist_parameter(argc, argv, "-mincount")) { mincount = atoi(get_parameter(argc, argv, "-mincount")); } if (exist_parameter(argc, argv, "-dooptimize")) { dooptimize = atoi(get_parameter(argc, argv, "-dooptimize")); } if (exist_parameter(argc, argv, "-doonlypositive")) { doonlypositive = atoi(get_parameter(argc, argv, "-doonlypositive")); } FILE *f, *fmotif ; FILE *fptr = fopen ( seedfile, "rb") ; if (!fptr){ printf("Could not open the seed file: %s\n", seedfile) ; exit(0) ; } motif_count = read_motifs( fptr, &motifs ) ; printf("%d seeds were loaded...\n", motif_count) ; fflush(stdout) ; fclose(fptr) ; /*Read Fasta here*/ t_seq_count = read_FASTA ( rna_fastafile, &sequences, rnd_fasta) ; printf("%d sequences loaded...\n", t_seq_count) ; fflush(stdout) ; /*we should read icSHAPE Value here*/ printf("Read icSHAPE Begin.....\n"); //new line icshape_count = read_icSHAPE(icshape_file,sequences,t_seq_count); //new line printf("%d icSHAPE Values loaded",icshape_count); //new line fflush(stdout); //new line E = read_expfile (expfile, sequences, t_seq_count, &seq_names, &seq_count) ; printf("Expfile loaded: %d values...\n", seq_count) ; fflush(stdout) ; if ((quantized == 0) && (ebins == 0)) { ebins = (int)(0.5 + (float)seq_count / ( divbins * mbins )); } if (quantized == 0) { printf("Adding small values...\n") ; add_small_values_to_identical_floats(E, seq_count); } printf("Quantizing the input vector...") ; E_q = (int*)malloc((seq_count) * sizeof(int)) ; quantize_E(E, seq_count, quantized, &ebins, &E_q, &E_q_bins); printf("Done\n") ; fflush(stdout) ; h_rna_ind = (struct my_hsearch_data*)calloc(1, sizeof(struct my_hsearch_data)); hashret = my_hcreate_r(100000, h_rna_ind); if (hashret == 0) { printf("main: couldn't make the hashtable...\n"); exit(0); } for (i=0 ; i<seq_count ; i++){ e.key = strdup(seq_names[i]) ; e.data = (char*) i ; hashret = my_hsearch_r(e, ENTER, &ep, h_rna_ind); if (hashret == 0){ printf("main: couldn't add the data to hashtable...\n"); exit(0); } } int opt_count=0 ; int hits =0; float init_best_mymi = 0 ; opt_motifs = (s_motif**) malloc (motif_count*sizeof(s_motif*)) ; int *jnres = (int*) malloc (motif_count*sizeof(int)) ; for (i=0 ; i<motif_count ; i++){ M_q = get_motif_profile (motifs[i], sequences, t_seq_count, h_rna_ind, &hits) ; // check how much information it adds to the previous guys if (opt_count > 0){ minratio = minCondInfoNormalized(opt_motifs, mbins, opt_count, M_q, mbins, E_q, ebins, seq_count, minr, &midx, sequences, t_seq_count, h_rna_ind) ; }else{ minratio = minr+1 ; } if (minratio < minr){ free(M_q) ; printf("seed %d killed by motif %d.\n", i, midx) ; continue ; }else{ printf("optimizing.\n") ; } if (doonlypositive==1){ float r = pearson_int (M_q, E_q, seq_count) ; if (r<0){ free(M_q) ; printf("seed %d killed due to negative association (pearson=%4.3f)\n", i, r) ; continue ; } } lastmyfreq = (float)(hits) / seq_count ; best_lastmyfreq = lastmyfreq; s_motif *bestmotif = copy_motif(motifs[i]) ; if (dooptimize==1){ // initial mi value init_best_mymi = CalculateMIbasic(M_q, E_q, seq_count, mbins, ebins) ; printf("Initial MI = %3.4f\n", init_best_mymi) ; // create a random index int *k_shu ; int *k_inc = (int*) malloc (sizeof(int)*motifs[i]->num_phrases) ; int k ; for (k=0; k<motifs[i]->num_phrases; k++) { k_inc[k] = k; } k_shu = shuffleInt(k_inc, motifs[i]->num_phrases); // optimize motif printf("Optimzing the sequence of motif %d/%d...\n", i+1, motif_count) ; float bestmi = init_best_mymi ; printf("initial motif (mi = %3.3f): %s\n", bestmi, print_motif_to_char(bestmotif)) ; for (k=0 ; k<motifs[i]->num_phrases ; k++){ int pos = k_shu[k] ; s_motif *modified_motifs [15] ; modify_base( bestmotif, modified_motifs, pos ) ; int j=0 ; for (j=0 ; j<15 ; j++){ int h ; int *M_q_t = get_motif_profile (modified_motifs[j], sequences, t_seq_count, h_rna_ind, &h) ; myfreq = (float)(h) / seq_count ; float tempmi = CalculateMIbasic(M_q_t, E_q, seq_count, mbins, ebins) ; if (tempmi>bestmi && h>10 && (myfreq<maxfreq || myfreq<lastmyfreq)){ free(bestmotif->phrases) ; free(bestmotif) ; bestmotif = copy_motif(modified_motifs[j]) ; bestmi = tempmi ; lastmyfreq = myfreq ; printf("new motif (mi = %3.3f): %s\n", bestmi, print_motif_to_char(bestmotif)) ; } free (M_q_t) ; } } /* Elongating the motif here */ printf("Elongating the motif...\n") ; float premi = bestmi ; do{ s_motif *modified_motifs [46] ; elongate_motif( bestmotif, modified_motifs) ; int j ; for (j=0 ; j<46 ; j++){ int h ; int *M_q_t = get_motif_profile (modified_motifs[j], sequences, t_seq_count, h_rna_ind, &h) ; float tempmi = CalculateMIbasic(M_q_t, E_q, seq_count, mbins, ebins) ; if (tempmi>bestmi && h>10){ free(bestmotif->phrases) ; free(bestmotif) ; bestmotif = copy_motif(modified_motifs[j]) ; bestmi = tempmi ; printf("new motif (mi = %3.3f): %s\n", bestmi, print_motif_to_char(bestmotif)) ; } free(M_q_t) ; } } while (premi >= bestmi && bestmotif->num_phrases > motifs[i]->num_phrases) ; } free(M_q) ; M_q = get_motif_profile (bestmotif, sequences, t_seq_count, h_rna_ind, &hits) ; float mi = CalculateMIbasic(M_q, E_q, seq_count, mbins, ebins) ; float z = teiser_z_score_test(mi, M_q, mbins, E_q, ebins, seq_count, 10000) ; printf("Final z-score = %4.3f (threshold=%3.3f)\n", z, max_z) ; if (z>max_z){ printf("z-score passed the test\n.") ; int jn_test = teiser_jn_max_rank_test(M_q, mbins, E_q, ebins, seq_count, shuffle/10, max_p, jn, jn_f) ; if (jn_test>=jn_t){ opt_motifs[opt_count] = copy_motif(bestmotif) ; jnres[opt_count] = jn_test ; opt_count++ ; }else{ printf("robustness=%d/%d\n.", jn_test, jn_t) ; } } free(bestmotif->phrases) ; free(bestmotif) ; free(M_q) ; } f = fopen(dataoutfile, "w") ; fmotif = fopen(motifoutfile, "wb") ; if (!f || !fmotif) die("Cannot open datafile\n"); fprintf(f, "index\tlocation\tmotif-seq\tmotif_structure\tmi-value\tseq mi-value\tfrequency\tz-score\trobustness\tp-value\n") ; for (i=0 ; i<opt_count ; i++){ int *M_qs = get_motif_profile_seq_only (opt_motifs[i], sequences, t_seq_count, h_rna_ind, &hits) ; float mis = CalculateMIbasic(M_qs, E_q, seq_count, mbins, ebins) ; M_q = get_motif_profile (opt_motifs[i], sequences, t_seq_count, h_rna_ind, &hits) ; float mi = CalculateMIbasic(M_q, E_q, seq_count, mbins, ebins) ; double pass = evalSeed(M_q, seq_count, mi, mbins, E_q, ebins, shuffle) ; float z = teiser_z_score_test(mi, M_q, mbins, E_q, ebins, seq_count, 10000) ; char p_val[100] ; if (pass == 0){ sprintf(p_val, "<%.2e", 1.0/shuffle) ; }else{ sprintf(p_val, "<%.6e", pass) ; } printf("motif %d/%d\t%s\tmi=%3.6f\thits=%d\t%3.4f\n", i+1, opt_count, print_motif_to_char(opt_motifs[i]), mi, hits, z) ; fprintf(f, "%d\t%s\t%s\t%3.6f\t%3.6f\t%3.3f\t%3.3f\t%d/10\t%s\n", i, location, print_motif_to_char(opt_motifs[i]), mi, mis, ((float)hits)/seq_count, z, jnres[i], p_val) ; free(M_q) ; free(M_qs) ; } write_motifs (fmotif, opt_motifs, opt_count) ; free(E) ; free(E_q) ; fclose(fmotif) ; fclose(f) ; return (0) ; }