/************************************************************************* * Output JSON data for a motif *************************************************************************/ static void output_motif_json(JSONWR_T* json, MOTIF_STATS_T* stats, SITE_COUNTS_T* counts) { //vars MOTIF_T *motif; MATRIX_T *freqs; int i, j, mlen, asize, end; motif = stats->motif; freqs = get_motif_freqs(motif); asize = alph_size(get_motif_alph(motif), ALPH_SIZE); jsonwr_start_object_value(json); jsonwr_lng_prop(json, "db", stats->db->id); jsonwr_str_prop(json, "id", get_motif_id(motif)); if (*(get_motif_id2(motif))) { jsonwr_str_prop(json, "alt", get_motif_id2(motif)); } mlen = get_motif_length(motif); jsonwr_lng_prop(json, "len", mlen); jsonwr_dbl_prop(json, "motif_evalue", get_motif_evalue(motif)); jsonwr_dbl_prop(json, "motif_nsites", get_motif_nsites(motif)); if (get_motif_url(motif) && *get_motif_url(motif)) { jsonwr_str_prop(json, "url", get_motif_url(motif)); } jsonwr_property(json, "pwm"); jsonwr_start_array_value(json); for (i = 0; i < mlen; i++) { jsonwr_start_array_value(json); for (j = 0; j < asize; j++) { jsonwr_dbl_value(json, get_matrix_cell(i, j, freqs)); } jsonwr_end_array_value(json); } jsonwr_end_array_value(json); jsonwr_lng_prop(json, "bin_width", stats->central_window+1); jsonwr_dbl_prop(json, "bin_sites", stats->central_sites); jsonwr_lng_prop(json, "total_sites", counts->total_sites); jsonwr_dbl_prop(json, "log_pvalue", stats->log_adj_pvalue); jsonwr_dbl_prop(json, "max_prob", stats->max_prob); jsonwr_property(json, "sites"); jsonwr_start_array_value(json); end = counts->allocated - (mlen - 1); for (i = (mlen - 1); i < end; i += 2) { jsonwr_dbl_value(json, counts->sites[i]); } jsonwr_end_array_value(json); jsonwr_end_object_value(json); }
/* * jsonwr_arg_prop * * Outputs an argv array while removing the path from the program name */ void jsonwr_args_prop(JSONWR_T* jsonwr, const char* property, int argc, char** argv) { char *prog; int i; jsonwr_property(jsonwr, property); jsonwr_start_array_value(jsonwr); prog = strdup(argv[0]); jsonwr_str_value(jsonwr, basename(prog)); free(prog); for (i = 1; i < argc; i++) { jsonwr_str_value(jsonwr, argv[i]); } jsonwr_end_array_value(jsonwr); }
/* * jsonwr_bool_array_prop */ void jsonwr_bool_array_prop(JSONWR_T* jsonwr, char* property, int* values, int count) { jsonwr_property(jsonwr, property); jsonwr_bool_array_value(jsonwr, values, count); }
/* * jsonwr_bool_prop */ void jsonwr_bool_prop(JSONWR_T* jsonwr, char* property, int value) { jsonwr_property(jsonwr, property); jsonwr_bool_value(jsonwr, value); }
/* * jsonwr_dbl_prop */ void jsonwr_dbl_prop(JSONWR_T* jsonwr, char* property, double value) { jsonwr_property(jsonwr, property); jsonwr_dbl_value(jsonwr, value); }
/* * jsonwr_lng_prop */ void jsonwr_lng_prop(JSONWR_T* jsonwr, char* property, long value) { jsonwr_property(jsonwr, property); jsonwr_lng_value(jsonwr, value); }
/* * jsonwr_nstr_prop */ void jsonwr_nstr_prop(JSONWR_T* jsonwr, char* property, char* value) { jsonwr_property(jsonwr, property); jsonwr_nstr_value(jsonwr, value); }
/* * jsonwr_null_prop */ void jsonwr_null_prop(JSONWR_T* jsonwr, char* property) { jsonwr_property(jsonwr, property); jsonwr_null_value(jsonwr); }
/* * jsonwr_log10num_array_prop */ void jsonwr_log10num_array_prop(JSONWR_T* jsonwr, const char* property, double* values, int prec, int count) { jsonwr_property(jsonwr, property); jsonwr_log10num_array_value(jsonwr, values, prec, count); }
/* * jsonwr_dbl_array_prop */ void jsonwr_dbl_array_prop(JSONWR_T* jsonwr, const char* property, double* values, int count) { jsonwr_property(jsonwr, property); jsonwr_dbl_array_value(jsonwr, values, count); }
/* * jsonwr_lng_array_prop */ void jsonwr_lng_array_prop(JSONWR_T* jsonwr, const char* property, long* values, int count) { jsonwr_property(jsonwr, property); jsonwr_lng_array_value(jsonwr, values, count); }
/* * jsonwr_log10num_prop */ void jsonwr_log10num_prop(JSONWR_T* jsonwr, const char* property, double value, int prec) { jsonwr_property(jsonwr, property); jsonwr_log10num_value(jsonwr, value, prec); }
/************************************************************************* * Entry point for centrimo *************************************************************************/ int main(int argc, char *argv[]) { CENTRIMO_OPTIONS_T options; SEQ_SITES_T seq_sites; SITE_COUNTS_T counts; int seqN, motifN, seqlen, db_i, motif_i, i; double log_pvalue_thresh; SEQ_T** sequences = NULL; ARRAY_T* bg_freqs = NULL; ARRAYLST_T *stats_list; MOTIF_DB_T **dbs, *db; MREAD_T *mread; MOTIF_STATS_T *stats; MOTIF_T *motif, *rev_motif; PSSM_T *pos_pssm, *rev_pssm; char *sites_path, *desc; FILE *sites_file; HTMLWR_T *html; JSONWR_T *json; // COMMAND LINE PROCESSING process_command_line(argc, argv, &options); // load the sequences read_sequences(options.alphabet, options.seq_source, &sequences, &seqN); seqlen = (seqN ? get_seq_length(sequences[0]) : 0); // calculate a sequence background (unless other background is given) if (!options.bg_source) { bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences); } // load the motifs motifN = 0; dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources)); for (i = 0; i < arraylst_size(options.motif_sources); i++) { char* db_source; db_source = (char*)arraylst_get(i, options.motif_sources); dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, options.pseudocount, options.selected_motifs, options.alphabet); motifN += arraylst_size(dbs[i]->motifs); } log_pvalue_thresh = log(options.evalue_thresh) - log(motifN); // Setup some things for double strand scanning if (options.scan_both_strands == TRUE) { // Set up hash tables for computing reverse complement setup_hash_alph(DNAB); setalph(0); // Correct background by averaging on freq. for both strands. average_freq_with_complement(options.alphabet, bg_freqs); normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs); calc_ambigs(options.alphabet, FALSE, bg_freqs); } // Create output directory if (create_output_directory(options.output_dirname, options.allow_clobber, (verbosity >= NORMAL_VERBOSE))) { die("Couldn't create output directory %s.\n", options.output_dirname); } // open output files sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME); sites_file = fopen(sites_path, "w"); free(sites_path); // setup html monolith writer json = NULL; if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) { htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME); htmlwr_replace(html, "centrimo_data.js", "data"); json = htmlwr_output(html); if (json == NULL) die("Template does not contain data section.\n"); } else { DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n"); } if (json) { // output some top level variables jsonwr_str_prop(json, "version", VERSION); jsonwr_str_prop(json, "revision", REVISION); jsonwr_str_prop(json, "release", ARCHIVE_DATE); jsonwr_str_array_prop(json, "cmd", argv, argc); jsonwr_property(json, "options"); jsonwr_start_object_value(json); jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount); jsonwr_dbl_prop(json, "score", options.score_thresh); jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh); jsonwr_lng_prop(json, "maxbin", options.max_window+1); jsonwr_bool_prop(json, "norc", !options.scan_both_strands); jsonwr_bool_prop(json, "noflip", options.no_flip); jsonwr_end_object_value(json); // output the description desc = prepare_description(&options); if (desc) { jsonwr_str_prop(json, "job_description", desc); free(desc); } // output size metrics jsonwr_lng_prop(json, "seqlen", seqlen); jsonwr_lng_prop(json, "tested", motifN); // output the fasta db jsonwr_property(json, "sequence_db"); jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", options.seq_source); jsonwr_lng_prop(json, "count", seqN); jsonwr_end_object_value(json); // output the motif dbs jsonwr_property(json, "motif_dbs"); jsonwr_start_array_value(json); for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; jsonwr_start_object_value(json); jsonwr_str_prop(json, "source", db->source); jsonwr_lng_prop(json, "count", arraylst_size(db->motifs)); jsonwr_end_object_value(json); } jsonwr_end_array_value(json); // start the motif array jsonwr_property(json, "motifs"); jsonwr_start_array_value(json); } /************************************************************** * Tally the positions of the best sites for each of the * selected motifs. **************************************************************/ // prepare the sequence sites memset(&seq_sites, 0, sizeof(SEQ_SITES_T)); // prepare the site counts counts.allocated = ((2 * seqlen) - 1); counts.sites = mm_malloc(sizeof(double) * counts.allocated); // prepare the motifs stats list stats_list = arraylst_create(); // prepare the other vars motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL; for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) { db = dbs[db_i]; for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) { motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs); DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n", get_motif_id(motif), get_motif_length(motif)); // reset the counts for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0; counts.total_sites = 0; // create the pssm pos_pssm = make_pssm(bg_freqs, motif); // If required, do the same for the reverse complement motif. if (options.scan_both_strands) { rev_motif = dup_rc_motif(motif); rev_pssm = make_pssm(bg_freqs, rev_motif); } // scan the sequences for (i = 0; i < seqN; i++) score_sequence(&options, sequences[i], pos_pssm, rev_pssm, &seq_sites, &counts); // DEBUG check that the sum of the sites is close to the site count double sum_check = 0, sum_diff; for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i]; sum_diff = counts.total_sites - sum_check; if (sum_diff < 0) sum_diff = -sum_diff; if (sum_diff > 0.1) { fprintf(stderr, "Warning: site counts don't sum to accurate value! " "%g != %ld", sum_check, counts.total_sites); } // output the plain text site counts output_site_counts(sites_file, seqlen, db, motif, &counts); // compute the best central window stats = compute_stats(options.max_window, seqlen, db, motif, &counts); // check if it passes the threshold if (json && stats->log_adj_pvalue <= log_pvalue_thresh) { output_motif_json(json, stats, &counts); arraylst_add(stats, stats_list); } else { free(stats); } // Free memory associated with this motif. free_pssm(pos_pssm); free_pssm(rev_pssm); destroy_motif(rev_motif); } } if (json) jsonwr_end_array_value(json); // finish writing sites fclose(sites_file); // finish writing html file if (html) { if (htmlwr_output(html) != NULL) { die("Found another JSON replacement!\n"); } htmlwr_destroy(html); } // write text file output_centrimo_text(&options, motifN, stats_list); // Clean up. for (i = 0; i < seqN; ++i) { free_seq(sequences[i]); } free(sequences); for (i = 0; i < arraylst_size(options.motif_sources); i++) { free_db(dbs[i]); } free(dbs); free_array(bg_freqs); free(counts.sites); free(seq_sites.sites); arraylst_destroy(free, stats_list); cleanup_options(&options); return 0; }