/***********************************************************************
 * Build the reverse complement of a motif as a separate object.
 *
 * A new MOTIF_T is obtained from mm_malloc, filled from the given motif
 * with copy_motif, and then reverse complemented in place with
 * reverse_complement_motif. The pointer to that new motif is returned.
 * NOTE(review): the caller presumably owns the returned motif and must
 * release it - confirm against the motif API.
 ***********************************************************************/
MOTIF_T* dup_rc_motif
  (MOTIF_T *motif)
{
  MOTIF_T *flipped = mm_malloc(sizeof(MOTIF_T));

  copy_motif(motif, flipped);
  reverse_complement_motif(flipped);

  return flipped;
}
/***********************************************************************
 * Remove every motif that has no transitions into it or out of it.
 *
 * A motif is considered unused when its row sum or its column sum in
 * transp_freq is exactly zero. For each such motif the matching row and
 * column are deleted from both transp_freq and spacer_ave, the motif is
 * dropped from the motifs array (later entries are shifted down by one),
 * and *num_motifs is decremented.
 *
 * Motif k of the array corresponds to row/column k + 1 of the matrices.
 ***********************************************************************/
static void throw_out_unused_motifs
  (MATRIX_T* transp_freq,
   MATRIX_T* spacer_ave,
   int*      num_motifs,
   MOTIF_T*  motifs)
{
  // Margins of the transition matrix; rebuilt after every removal.
  ARRAY_T* row_totals = get_matrix_row_sums(transp_freq);
  ARRAY_T* col_totals = get_matrix_col_sums(transp_freq);
  int current = 0;

  while (current < *num_motifs) {
    const int matrix_index = current + 1;
    int slot;

    // Both margins non-zero: the motif is in use, move on to the next one.
    if ((get_array_item(matrix_index, row_totals) != 0.0) &&
        (get_array_item(matrix_index, col_totals) != 0.0)) {
      current++;
      continue;
    }

    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, "Removing unused motif %s. No occurrences of this motif were found.\n",
              get_motif_id(&(motifs[current])));
    }

    // Drop the motif's row and column from both square matrices.
    remove_matrix_row(matrix_index, transp_freq);
    remove_matrix_col(matrix_index, transp_freq);
    assert(get_num_rows(transp_freq) == get_num_cols(transp_freq));
    remove_matrix_row(matrix_index, spacer_ave);
    remove_matrix_col(matrix_index, spacer_ave);
    assert(get_num_rows(spacer_ave) == get_num_cols(spacer_ave));

    // Close the gap in the array: each slot is freed and then refilled
    // with a copy of its right-hand neighbour.
    for (slot = current; slot + 1 < *num_motifs; slot++) {
      free_motif(&(motifs[slot]));
      copy_motif(&(motifs[slot + 1]), &(motifs[slot]));
    }
    // The last slot is now a stale duplicate (or the removed motif itself).
    free_motif(&(motifs[slot]));
    (*num_motifs)--;

    // The matrices changed shape, so the margins must be recomputed.
    // 'current' is deliberately not advanced: a new motif now lives there.
    free_array(row_totals);
    free_array(col_totals);
    row_totals = get_matrix_row_sums(transp_freq);
    col_totals = get_matrix_col_sums(transp_freq);
  }

  free_array(row_totals);
  free_array(col_totals);
}
/***********************************************************************
 * Flatten an array list of motifs into a plain C array.
 *
 * Each element of motif_list is copied with copy_motif into a freshly
 * mm_malloc'ed block. The block is returned through motif_array and the
 * number of motifs through num.
 * Kept for backwards compatibility with code written against the older
 * array based interface.
 ***********************************************************************/
void motif_list_to_array(ARRAYLST_T *motif_list, MOTIF_T **motif_array, int *num) {
  const int total = arraylst_size(motif_list);
  MOTIF_T *flat = (MOTIF_T*)mm_malloc(sizeof(MOTIF_T) * total);
  MOTIF_T *dest = flat;
  int idx = 0;

  while (idx < total) {
    copy_motif((MOTIF_T*)arraylst_get(idx, motif_list), dest);
    idx++;
    dest++;
  }

  *motif_array = flat;
  *num = total;
}
/***********************************************************************
 * Flatten a red-black tree of motifs into a plain C array.
 *
 * The tree is walked from rbtree_first via rbtree_next and each node's
 * value is copied with copy_motif into a freshly mm_malloc'ed block
 * sized from rbtree_size. The block is returned through motif_array and
 * the tree size through num.
 * Kept for backwards compatibility with code written against the older
 * array based interface.
 ***********************************************************************/
void motif_tree_to_array(RBTREE_T *motif_tree, MOTIF_T **motif_array, int *num) {
  const int total = rbtree_size(motif_tree);
  MOTIF_T *flat = mm_malloc(sizeof(MOTIF_T) * total);
  MOTIF_T *dest = flat;
  RBNODE_T *cursor = rbtree_first(motif_tree);

  while (cursor != NULL) {
    copy_motif((MOTIF_T*)rbtree_value(cursor), dest);
    dest++;
    cursor = rbtree_next(cursor);
  }

  *motif_array = flat;
  *num = total;
}
int main(int argc, char ** argv) { int i ; char *seedfile ; char *rna_fastafile ; char *expfile ; char *dataoutfile ; char *motifoutfile ; char *location ; char *icshape_file; //new line int dooptimize = 1 ; int doonlypositive = 0 ; float *E ; int *E_q ; int *M_q ; struct my_hsearch_data *h_rna_ind ; ENTRY e ; ENTRY* ep ; int hashret =0 ; int quantized = 1 ; int ebins = 0 ; int mbins = 2 ; int divbins = 50 ; float* E_q_bins = 0 ; int shuffle = 1000000 ; int rnd_fasta = 0 ; float max_p = 0.0000001 ; float max_z = -100 ; int mincount = 5 ; double minr = 5.0 ; double minratio = 0; int midx; float maxfreq = 0.5; float myfreq; float lastmyfreq; float best_lastmyfreq; int jn = 10; int jn_t = 6; int jn_f = 3; s_sequence **sequences ; s_motif **motifs ; s_motif **opt_motifs ; char **seq_names ; int seq_count ; int t_seq_count ; int icshape_count; //new line int motif_count = 0 ; seedfile = get_parameter(argc, argv, "-seedfile") ; rna_fastafile = get_parameter(argc, argv, "-rna_fastafile") ; dataoutfile = get_parameter(argc, argv, "-dataoutfile") ; motifoutfile = get_parameter(argc, argv, "-motifoutfile") ; location = get_parameter(argc, argv, "-location") ; icshape_file = get_parameter(argc, argv, "-icshapefile"); //new line expfile = get_parameter(argc, argv, "-expfile") ; quantized = atoi(get_parameter(argc, argv, "-quantized")); if (exist_parameter(argc, argv, "-max_p")) { max_p = atof(get_parameter(argc, argv, "-max_p")); } if (exist_parameter(argc, argv, "-max_z")) { max_z = atof(get_parameter(argc, argv, "-max_z")); } if (exist_parameter(argc, argv, "-minr")) { minr = atof(get_parameter(argc, argv, "-minr")); } if (exist_parameter(argc, argv, "-ebins")) { ebins = atoi(get_parameter(argc, argv, "-ebins")); } if (exist_parameter(argc, argv, "-jn_t")) { jn_t = atoi(get_parameter(argc, argv, "-jn_t")); } if (exist_parameter(argc, argv, "-shuffle")) { shuffle = atoi(get_parameter(argc, argv, "-shuffle")); } if (exist_parameter(argc, argv, "-mincount")) { mincount = 
atoi(get_parameter(argc, argv, "-mincount")); } if (exist_parameter(argc, argv, "-dooptimize")) { dooptimize = atoi(get_parameter(argc, argv, "-dooptimize")); } if (exist_parameter(argc, argv, "-doonlypositive")) { doonlypositive = atoi(get_parameter(argc, argv, "-doonlypositive")); } FILE *f, *fmotif ; FILE *fptr = fopen ( seedfile, "rb") ; if (!fptr){ printf("Could not open the seed file: %s\n", seedfile) ; exit(0) ; } motif_count = read_motifs( fptr, &motifs ) ; printf("%d seeds were loaded...\n", motif_count) ; fflush(stdout) ; fclose(fptr) ; /*Read Fasta here*/ t_seq_count = read_FASTA ( rna_fastafile, &sequences, rnd_fasta) ; printf("%d sequences loaded...\n", t_seq_count) ; fflush(stdout) ; /*we should read icSHAPE Value here*/ printf("Read icSHAPE Begin.....\n"); //new line icshape_count = read_icSHAPE(icshape_file,sequences,t_seq_count); //new line printf("%d icSHAPE Values loaded",icshape_count); //new line fflush(stdout); //new line E = read_expfile (expfile, sequences, t_seq_count, &seq_names, &seq_count) ; printf("Expfile loaded: %d values...\n", seq_count) ; fflush(stdout) ; if ((quantized == 0) && (ebins == 0)) { ebins = (int)(0.5 + (float)seq_count / ( divbins * mbins )); } if (quantized == 0) { printf("Adding small values...\n") ; add_small_values_to_identical_floats(E, seq_count); } printf("Quantizing the input vector...") ; E_q = (int*)malloc((seq_count) * sizeof(int)) ; quantize_E(E, seq_count, quantized, &ebins, &E_q, &E_q_bins); printf("Done\n") ; fflush(stdout) ; h_rna_ind = (struct my_hsearch_data*)calloc(1, sizeof(struct my_hsearch_data)); hashret = my_hcreate_r(100000, h_rna_ind); if (hashret == 0) { printf("main: couldn't make the hashtable...\n"); exit(0); } for (i=0 ; i<seq_count ; i++){ e.key = strdup(seq_names[i]) ; e.data = (char*) i ; hashret = my_hsearch_r(e, ENTER, &ep, h_rna_ind); if (hashret == 0){ printf("main: couldn't add the data to hashtable...\n"); exit(0); } } int opt_count=0 ; int hits =0; float init_best_mymi = 0 ; 
opt_motifs = (s_motif**) malloc (motif_count*sizeof(s_motif*)) ; int *jnres = (int*) malloc (motif_count*sizeof(int)) ; for (i=0 ; i<motif_count ; i++){ M_q = get_motif_profile (motifs[i], sequences, t_seq_count, h_rna_ind, &hits) ; // check how much information it adds to the previous guys if (opt_count > 0){ minratio = minCondInfoNormalized(opt_motifs, mbins, opt_count, M_q, mbins, E_q, ebins, seq_count, minr, &midx, sequences, t_seq_count, h_rna_ind) ; }else{ minratio = minr+1 ; } if (minratio < minr){ free(M_q) ; printf("seed %d killed by motif %d.\n", i, midx) ; continue ; }else{ printf("optimizing.\n") ; } if (doonlypositive==1){ float r = pearson_int (M_q, E_q, seq_count) ; if (r<0){ free(M_q) ; printf("seed %d killed due to negative association (pearson=%4.3f)\n", i, r) ; continue ; } } lastmyfreq = (float)(hits) / seq_count ; best_lastmyfreq = lastmyfreq; s_motif *bestmotif = copy_motif(motifs[i]) ; if (dooptimize==1){ // initial mi value init_best_mymi = CalculateMIbasic(M_q, E_q, seq_count, mbins, ebins) ; printf("Initial MI = %3.4f\n", init_best_mymi) ; // create a random index int *k_shu ; int *k_inc = (int*) malloc (sizeof(int)*motifs[i]->num_phrases) ; int k ; for (k=0; k<motifs[i]->num_phrases; k++) { k_inc[k] = k; } k_shu = shuffleInt(k_inc, motifs[i]->num_phrases); // optimize motif printf("Optimzing the sequence of motif %d/%d...\n", i+1, motif_count) ; float bestmi = init_best_mymi ; printf("initial motif (mi = %3.3f): %s\n", bestmi, print_motif_to_char(bestmotif)) ; for (k=0 ; k<motifs[i]->num_phrases ; k++){ int pos = k_shu[k] ; s_motif *modified_motifs [15] ; modify_base( bestmotif, modified_motifs, pos ) ; int j=0 ; for (j=0 ; j<15 ; j++){ int h ; int *M_q_t = get_motif_profile (modified_motifs[j], sequences, t_seq_count, h_rna_ind, &h) ; myfreq = (float)(h) / seq_count ; float tempmi = CalculateMIbasic(M_q_t, E_q, seq_count, mbins, ebins) ; if (tempmi>bestmi && h>10 && (myfreq<maxfreq || myfreq<lastmyfreq)){ free(bestmotif->phrases) ; 
free(bestmotif) ; bestmotif = copy_motif(modified_motifs[j]) ; bestmi = tempmi ; lastmyfreq = myfreq ; printf("new motif (mi = %3.3f): %s\n", bestmi, print_motif_to_char(bestmotif)) ; } free (M_q_t) ; } } /* Elongating the motif here */ printf("Elongating the motif...\n") ; float premi = bestmi ; do{ s_motif *modified_motifs [46] ; elongate_motif( bestmotif, modified_motifs) ; int j ; for (j=0 ; j<46 ; j++){ int h ; int *M_q_t = get_motif_profile (modified_motifs[j], sequences, t_seq_count, h_rna_ind, &h) ; float tempmi = CalculateMIbasic(M_q_t, E_q, seq_count, mbins, ebins) ; if (tempmi>bestmi && h>10){ free(bestmotif->phrases) ; free(bestmotif) ; bestmotif = copy_motif(modified_motifs[j]) ; bestmi = tempmi ; printf("new motif (mi = %3.3f): %s\n", bestmi, print_motif_to_char(bestmotif)) ; } free(M_q_t) ; } } while (premi >= bestmi && bestmotif->num_phrases > motifs[i]->num_phrases) ; } free(M_q) ; M_q = get_motif_profile (bestmotif, sequences, t_seq_count, h_rna_ind, &hits) ; float mi = CalculateMIbasic(M_q, E_q, seq_count, mbins, ebins) ; float z = teiser_z_score_test(mi, M_q, mbins, E_q, ebins, seq_count, 10000) ; printf("Final z-score = %4.3f (threshold=%3.3f)\n", z, max_z) ; if (z>max_z){ printf("z-score passed the test\n.") ; int jn_test = teiser_jn_max_rank_test(M_q, mbins, E_q, ebins, seq_count, shuffle/10, max_p, jn, jn_f) ; if (jn_test>=jn_t){ opt_motifs[opt_count] = copy_motif(bestmotif) ; jnres[opt_count] = jn_test ; opt_count++ ; }else{ printf("robustness=%d/%d\n.", jn_test, jn_t) ; } } free(bestmotif->phrases) ; free(bestmotif) ; free(M_q) ; } f = fopen(dataoutfile, "w") ; fmotif = fopen(motifoutfile, "wb") ; if (!f || !fmotif) die("Cannot open datafile\n"); fprintf(f, "index\tlocation\tmotif-seq\tmotif_structure\tmi-value\tseq mi-value\tfrequency\tz-score\trobustness\tp-value\n") ; for (i=0 ; i<opt_count ; i++){ int *M_qs = get_motif_profile_seq_only (opt_motifs[i], sequences, t_seq_count, h_rna_ind, &hits) ; float mis = CalculateMIbasic(M_qs, E_q, 
seq_count, mbins, ebins) ; M_q = get_motif_profile (opt_motifs[i], sequences, t_seq_count, h_rna_ind, &hits) ; float mi = CalculateMIbasic(M_q, E_q, seq_count, mbins, ebins) ; double pass = evalSeed(M_q, seq_count, mi, mbins, E_q, ebins, shuffle) ; float z = teiser_z_score_test(mi, M_q, mbins, E_q, ebins, seq_count, 10000) ; char p_val[100] ; if (pass == 0){ sprintf(p_val, "<%.2e", 1.0/shuffle) ; }else{ sprintf(p_val, "<%.6e", pass) ; } printf("motif %d/%d\t%s\tmi=%3.6f\thits=%d\t%3.4f\n", i+1, opt_count, print_motif_to_char(opt_motifs[i]), mi, hits, z) ; fprintf(f, "%d\t%s\t%s\t%3.6f\t%3.6f\t%3.3f\t%3.3f\t%d/10\t%s\n", i, location, print_motif_to_char(opt_motifs[i]), mi, mis, ((float)hits)/seq_count, z, jnres[i], p_val) ; free(M_q) ; free(M_qs) ; } write_motifs (fmotif, opt_motifs, opt_count) ; free(E) ; free(E_q) ; fclose(fmotif) ; fclose(f) ; return (0) ; }