/** * print_site_array * * Prints the specified array of sites to the specified file handle. */ extern void print_site_array( P_PROB sites, ///< An array of sites to be printed int nsites, ///< Length of the array FILE *outfile, ///< The stream for output int w, ///< The size of each of the sites DATASET *dataset ///< Contains the sequences which contain the sites ) { // Print out the sites predicted by the seed: fprintf(outfile, "###########################\n"); int site_idx; for (site_idx = 0; site_idx < nsites; site_idx++) { int seq_num = sites[site_idx].x; int site_loc = sites[site_idx].y; char *e_site = (dataset->samples[seq_num]->res)+site_loc; char *curr_site = to_str_seed(e_site, w); SAMPLE *seq = dataset->samples[seq_num]; char *seq_name = seq->sample_name; fprintf(stdout, "%7s %4i %s\n", seq_name, site_loc, curr_site); } fprintf(stdout, "---------------------------\n"); } // print_site_array
extern void subseq7( MODEL *model, // the model DATASET *dataset, /* the dataset */ int w, // w to use int n_nsites0, /* number of nsites0 values to try */ S_POINT s_points[], /* array of starting points: 1 per nsites0 */ HASH_TABLE evaluated_seed_ht /* A hash table used for remembering which seeds have been evaluated previously */ ) { MOTYPE mtype = model->mtype; /* type of model */ BOOLEAN ic = model->invcomp; /* use reverse complement strand of DNA, too */ THETA map = dataset->map; /* freq x letter map */ LOG_THETA_TYPE(ltheta); /* integer encoded log theta */ int iseq, ioff; int alength = dataset->alength; /* length of alphabet */ int n_samples = dataset->n_samples; /* number of samples in dataset */ SAMPLE **samples = dataset->samples; /* samples in dataset */ int n_starts = 0; /* number of sampled start subseq */ int n_maxima = ps(dataset, w); /* upper bound on # maxima */ /* the local maxima positions */ P_PROB maxima = (P_PROB) mymalloc(n_maxima * sizeof(p_prob)); int lmap[MAXALPH][MAXALPH]; /* consensus letter vs. log frequency matrix */ double col_scores[MAXSITE]; /* not used */ #ifdef PARALLEL int start_seq, start_off=0, end_seq, end_off=0; #endif char *str_seed; // A string representation of a seed. // PRECONDITIONS: // 1. If the sequence model is oops, then n_nsites0 is exactly 1: if (mtype == Oops) { assert(n_nsites0 == 1); } convert_to_lmap(map, lmap, alength); if (TRACE) { printf("w= %d\n", w); } /* get the probability that a site starting at position x_ij would NOT overlap a previously found motif. */ get_not_o(dataset, w); // Set up log_not_o: log_not_o[site] is: // log ( Pr(site not overlapped) * scaled_to_one_Pr(site) ) if (model->mtype != Tcm) { add_psp_to_log_not_o(dataset, w, model->invcomp, model->mtype); } /* score all the sampled positions saving the best position for each value of NSITES0 */ #ifdef PARALLEL /* Retrieve the previously-calculated starting and ending points. */ get_start_n_end(&start_seq, &start_off, &end_seq, &end_off); /* Divide the various samples among processors. */ for (iseq = start_seq; iseq <= end_seq; iseq++) { /* sequence */ #else /* not PARALLEL */ for (iseq = 0; iseq < n_samples; iseq++) { /* sequence */ #endif /* PARALLEL */ SAMPLE *s = samples[iseq]; int lseq = s->length; char *res = s->res; /* left to right */ char *name = s->sample_name; double *not_o = s->not_o; int max_off, init_off; if (lseq < w) continue; /* shorter than motif */ #ifdef PARALLEL if (mpMyID() == 0) #endif if ((!NO_STATUS) && ((iseq % 5) == 0)) { fprintf(stderr, "starts: w=%d, seq=%d, l=%d \r", w, iseq, lseq); } /* Set the appropriate starting and ending points. */ #ifdef PARALLEL if (iseq == start_seq) init_off = start_off; else #endif init_off = 0; #ifdef PARALLEL if (iseq == end_seq) max_off = MIN(end_off, lseq - w); else #endif max_off = lseq - w; /* Loop over all subsequences in the current sequence testing them each as "starting points" (inital values) for theta */ for (ioff = init_off; ioff <= max_off; ioff++) {/* subsequence */ /* warning: always do the next step; don't ever "continue" or the value of pY will not be correct since it is computed based the previous value */ /* convert subsequence in dataset to starting point for EM */ init_theta_1(w, res+ioff, <heta[1][0], lmap); if (ioff == init_off) { /* new sequence */ /* Compute p(Y_ij | theta_1^0) */ if (!ic) { get_pY(dataset, <heta[1][0], w, 0); } else { get_pY(dataset, <heta[1][0], w, 1); get_pY(dataset, <heta[1][0], w, 2); } } else { /* same sequence */ /* get theta[0][0]^{k-1} */ init_theta_1(1, res+ioff-1, <heta[0][0], lmap); /* compute p(Y_ij | theta_1^k) */ if (!ic) { next_pY(dataset, <heta[1][0], w, <heta[0][0][0], 0); } else { next_pY(dataset, <heta[1][0], w, <heta[0][0][0], 1); next_pY(dataset, <heta[1][0], w, <heta[0][0][0], 2); } } /* same sequence */ /* skip if there is a high probability that this subsequence is part of a site which has already been found */ if (not_o[ioff] < MIN_NOT_O) continue; /*fprintf(stderr, "subseq: %d %d\r", iseq+1, ioff+1);*/ // Put highest pY into first scratch array if using both DNA strands: if (ic) { combine_strands(samples, n_samples, w); } /* get a sorted list of the maxima of pY */ n_maxima = get_max(mtype, dataset, w, maxima, ic, TRUE); /* "fake out" align_top_subsequences by setting each of the scores in the s_points objects to LITTLE, thereby forcing align_top_subsequences to record the attributes for the current seed in the s_points, rather than the seed with the highest respective scores: */ int sp_idx; for (sp_idx = 0; sp_idx < n_nsites0; sp_idx++) { s_points[sp_idx].score = LITTLE; } /* align the top nsites0 subsequences for each value of nsites0 and save the alignments with the highest likelihood */ n_starts += align_top_subsequences( mtype, w, dataset, iseq, ioff, res+ioff, name, n_nsites0, n_maxima, maxima, col_scores, s_points ); /* A string version of the current seed is required for updating the S_POINT heaps: */ str_seed = to_str_seed(res+ioff, w); /* For each of the S_POINT objects, add the current seed to that S_POINT'S heap. Also, branching search will require a hash_table of all seeds that have been evaluated prior to when branching search is called. Hence also record the current seed (string, nsites0) combination in the hash_table, for all nsites0, unless that seed was already in the hash_table: */ hash_insert_str(str_seed, evaluated_seed_ht); update_s_point_heaps(s_points, str_seed, n_nsites0); myfree(str_seed); } /* subsequence */ } /* sequence */ #ifdef PARALLEL reduce_across_heaps(s_points, n_nsites0); #endif // PARALLEL // Print the sites predicted using the seed after subsequence search, for // each of the starting points, if requested: if (dataset->print_pred) { int sp_idx; for (sp_idx = 0; sp_idx < n_nsites0; sp_idx++) { // Retrieve the best seed, from the heap: HEAP *heap = s_points[sp_idx].seed_heap; // Only print sites for the s_point if its heap was non-empty: if (get_num_nodes(heap) > 0) { SEED *best_seed = (SEED *)get_node(heap, get_best_node(heap)); char *seed = get_str_seed(best_seed); /* Print the sites predicted using the motif corresponding to that seed, according to the sequence model being used: */ int nsites0 = s_points[sp_idx].nsites0; fprintf(stdout, "PREDICTED SITES AFTER SUBSEQUENCE SEARCH WITH W = %i " "NSITES = %i MOTIF = %i\n", w, nsites0, dataset->imotif); int n_maxima = ps(dataset, w); // upper bound on number of maxima P_PROB psites = (P_PROB) mymalloc(n_maxima * sizeof(p_prob)); n_maxima = get_pred_sites(psites, mtype, w, seed, ltheta[1], lmap, dataset, ic); print_site_array(psites, nsites0, stdout, w, dataset); myfree(psites); } // get_num_nodes > 0 } //sp_idx } // print_pred if (TRACE){ printf("Tested %d possible starts...\n", n_starts); } myfree(maxima); } // subseq7 /**********************************************************************/ /* next_pY Compute the value of p(Y_ij | theta_1^{k+1}) from p(Y_ij | theta_1^{k} and the probability of first letter of Y_ij given theta_1^k, p(Y_ij^0 | theta_1^k). */ /**********************************************************************/ static void next_pY( DATASET *dataset, /* the dataset */ LOG_THETAG_TYPE(theta_1), /* integer log theta_1 */ int w, /* width of motif */ int *theta_0, /* first column of previous theta_1 */ int pYindex /* which pY array to use */ ) { int i, k; int *theta_last = theta_1[w-1]; /* last column of theta_1 */ int n_samples = dataset->n_samples; SAMPLE **samples = dataset->samples; for (i=0; i < n_samples; i++) { /* sequence */ SAMPLE *s = samples[i]; /* sequence */ int lseq = s->length; /* length of sequence */ char *res = pYindex<2 ? s->res : s->resic; /* integer sequence */ int *pY = s->pY[pYindex]; /* log p(Y_j | theta_1) */ char *r = res+lseq-1; /* last position in sequence */ char *r0 = res+lseq-w-1; /* prior to start of last subsequence */ int j, p; if (lseq < w) continue; /* skip if sequence too short */ /* calculate p(Y_ij | theta_1) */ int *pY_shifted_1 = pY - 1; for (j=lseq-w; j>0; j--) { pY[j] = pY_shifted_1[j] + theta_last[(int)(*r--)] - theta_0[(int)(*r0--)]; } /* calculate log p(Y_i0 | theta_1) */ p = 0; r = res; for (k=0; k<w; k++) { /* position in site */ p += theta_1[k][(int)(*r++)]; } pY[0] = p; } }