/**
 * union_seed_packets
 *
 * Find the union of two seed_packet arrays. On return, f_result holds the
 * seed_packets that survived in a heap of capacity *f_length after every
 * seed from both arrays was offered to it.
 *
 * This function is used in reduce_across_heaps, where it is registered as a
 * user-defined MPI reduction operation.
 *
 * \param f_data    Input array of SEED_PACKETs. Element 0's num_seed_packets
 *                  gives the number of filled packets in the array.
 * \param f_result  In/out array of SEED_PACKETs. Read in the same way as
 *                  f_data, then overwritten with the packed heap contents.
 * \param f_length  Pointer to the number of packets per array; used as the
 *                  maximum size of the temporary heap.
 * \param datatype  Unused.
 *
 * NOTE(review): the temporary heap (and the seeds placed in it) is not
 * released anywhere in this function, and the value returned by
 * add_node_heap is discarded. Confirm the ownership rules of the heap API
 * and whether a heap destructor should be called before returning.
 */
void union_seed_packets(void *f_data, void *f_result, int *f_length,
                        MPI_Datatype *datatype)
{
  SEED_PACKET *data_packets = (SEED_PACKET *)f_data;
  SEED_PACKET *result_packets = (SEED_PACKET *)f_result;
  int i;
  int num_seed_packets;

  (void)datatype; // required by the reduction-function signature only

  // create a heap to do the heap union
  HEAP *heap = create_heap(
    *f_length,
    (int (*) (void *, void*))compare_seed,
    (void *)copy_seed,
    (void (*)(void*))free_seed,
    (char* (*)(void*))get_str_seed,
    (void (*)(FILE *, void*))print_seed
  );

  // unpack the seeds from f_data and add them to the heap
  num_seed_packets = data_packets[0].num_seed_packets;
  for (i = 0; i < num_seed_packets; i++) {
    SEED *data_seed = new_seed(data_packets[i].seed, data_packets[i].score);
    (void)add_node_heap(heap, data_seed);
  }

  // unpack the seeds from f_result and add them to the heap
  num_seed_packets = result_packets[0].num_seed_packets;
  for (i = 0; i < num_seed_packets; i++) {
    SEED *result_seed = new_seed(result_packets[i].seed,
                                 result_packets[i].score);
    (void)add_node_heap(heap, result_seed);
  }

  // pack the heap back into f_result
  int num_seeds = get_num_nodes(heap);

  // set the number of filled packets (in case the heap is empty)
  result_packets[0].num_seed_packets = num_seeds;

  for (i = 0; i < num_seeds; i++) {
    // populated heap nodes are at index 1 to num_seeds
    SEED *curr_seed = get_node(heap, i + 1);

    result_packets[i].num_seed_packets = num_seeds;
    result_packets[i].score = get_seed_score(curr_seed);
    // Refresh the width as well: otherwise the packet would keep the width
    // of whichever seed previously occupied this slot of f_result.
    result_packets[i].width = get_width(curr_seed);
    strcpy(result_packets[i].seed, get_str_seed(curr_seed));
  }
} // union_seed_packets
/**
 * copy_seed builds a new seed object from the fields of an existing one.
 *
 * The new seed is constructed by new_seed from the original's string
 * representation, its score, and its iseq and ipos fields. The intent is an
 * EXACT copy, such that compare_seed between the original and the copy
 * yields zero.
 *
 * \return A pointer to a new seed that is a copy of the original.
 */
SEED *copy_seed(
  SEED *orig_object ///< The existing seed object that will be copied
)
{
  return new_seed(
    get_str_seed(orig_object),
    get_seed_score(orig_object),
    orig_object->iseq,
    orig_object->ipos
  );
}
/**
 * transfer_final_scores
 *
 * Walk every S_POINT of the matrix. Where the S_POINT's seed heap holds at
 * least one seed, copy the details of the heap's best seed (score, e_cons0
 * and consensus string) into the S_POINT itself; iseq and ioff are set to
 * -1. S_POINTs with an empty heap are left untouched (and reported on
 * stderr when TRACE is on).
 */
void transfer_final_scores (
  SP_MATRIX *sp_matrix ///< This object
)
{
  int r;
  int c;

  for (r = 0; r < sp_get_num_rows(sp_matrix); r++) {
    S_POINT *row = sp_matrix->matrix[r];

    for (c = 0; c < sp_get_num_cols(sp_matrix); c++) {
      S_POINT *sp = &row[c];
      HEAP *heap = sp->seed_heap;

      if (get_num_nodes(heap) < 1) {
        /* An empty seed heap could mean that no seeds added to the s_point
           had enough maxima to be evaluated by align_top_subsequences.
           Report this situation: */
        if (TRACE) {
          fprintf(stderr,
                  "Heap of spoint was empty, possibly because no seeds had"
                  " enough local maxima. w = %i. nsites0 = %f.\n",
                  sp->w0, sp->nsites0);
        }
        continue;
      }

      SEED *best = (SEED *)(get_node(heap, get_best_node(heap)));

      sp->score = get_seed_score(best);
      // The seed does not correspond to a location in the dataset:
      sp->iseq = -1;
      sp->ioff = -1;
      sp->e_cons0 = get_e_seed(best);

      // Replace the old consensus string with a private copy of the seed's.
      free(sp->cons0);
      sp->cons0 = strdup(get_str_seed(best));
    } // columns
  } // rows
} // transfer_final_scores
/**
 * reduce_across_heaps
 *
 * Do a reduction across an array of S_POINT heaps. For each S_POINT in the
 * array, all the seeds from the heaps on each node are combined (using the
 * union_seed_packets reduction operation). A heap containing the best seeds
 * from every node is then propagated to all nodes via MPI_Allreduce.
 *
 * On the first call the derived MPI datatype describing a SEED_PACKET and
 * the reduction operation are created; they are kept in function-level
 * statics and reused by later calls.
 *
 * For each S_POINT the local heap is emptied into an array of packets, the
 * packets are reduced across MPI_COMM_WORLD, and the resulting seeds are
 * added back into the (now empty) heap.
 *
 * \param s_points   an array of S_POINTS
 * \param n_nsites0  the number of S_POINTS in the s_points array
 */
void reduce_across_heaps(
  S_POINT *s_points,     // an array of S_POINTS
  int n_nsites0          // the number of S_POINTS in the s_points array
)
{
  static int init;
  static MPI_Datatype seed_packet_type;
  static MPI_Op union_seed_packets_op;
  int i_packet;

  // Initialise MPI stuff (first call only)
  if (init == 0) {
    init = 1;
    SEED_PACKET seed_packet;
    int block_lengths[4];
    MPI_Aint displacements[4];
    MPI_Aint address[4];
    MPI_Datatype typelist[4];

    // Build the derived datatype
    // set the types
    typelist[0] = MPI_DOUBLE;
    typelist[1] = MPI_INT;
    typelist[2] = MPI_INT;
    typelist[3] = MPI_CHAR;

    // set number of elements of each type
    block_lengths[0] = block_lengths[1] = block_lengths[2] = 1;
    block_lengths[3] = MAXSITE; // the maximum length of a seed

    // calculate the displacements
    // (MPI_Get_address replaces MPI_Address, which was removed in MPI-3.0)
    MPI_Get_address(&seed_packet.score, &address[0]);
    MPI_Get_address(&seed_packet.width, &address[1]);
    MPI_Get_address(&seed_packet.num_seed_packets, &address[2]);
    MPI_Get_address(&seed_packet.seed, &address[3]);

    // Displacements are measured from the `score` member.
    // NOTE(review): this matches the buffers passed to MPI_Allreduce only if
    // `score` is the first member of SEED_PACKET — confirm.
    displacements[0] = 0;
    displacements[1] = address[1] - address[0];
    displacements[2] = address[2] - address[0];
    displacements[3] = address[3] - address[0];

    // create the derived type
    // (MPI_Type_create_struct replaces MPI_Type_struct, removed in MPI-3.0)
    MPI_Type_create_struct(4, block_lengths, displacements, typelist,
                           &seed_packet_type);

    // commit the derived type
    MPI_Type_commit(&seed_packet_type);

    // set the MPI reduction operation
    MPI_Op_create(union_seed_packets, FALSE, &union_seed_packets_op);
  } // initialise MPI

  // do a reduction for each s_point in the s_point list
  int sp_idx;
  for (sp_idx = 0; sp_idx < n_nsites0; sp_idx++) {
    // package the heap for the spoint at sp_idx in the s_points list
    HEAP *seed_heap = s_points[sp_idx].seed_heap;

    // get the maximum heap size and the number of seeds in the heap
    int max_heap_size = get_max_size(seed_heap);
    int num_seeds = get_num_nodes(seed_heap);

    // one packet per heap slot, filled or not
    SEED_PACKET packets[max_heap_size], best_packets[max_heap_size];

    // set num_seed_packets to the number of filled nodes in the heap (in
    // case the heap is empty)
    packets[0].num_seed_packets = num_seeds;

    // package each seed in the heap into a seed packet
    for (i_packet = 0; i_packet < num_seeds; i_packet++) {
      // get the seed at the root (this removes it from the heap)
      // NOTE(review): the popped seed is not freed after being packed —
      // confirm whether pop_heap_root hands ownership to the caller.
      SEED *curr_seed = pop_heap_root(seed_heap);

      // set the number of seed_packets that will be filled
      packets[i_packet].num_seed_packets = num_seeds;
      // set the seed packet score
      packets[i_packet].score = get_seed_score(curr_seed);
      // set the width of the string
      packets[i_packet].width = get_width(curr_seed);
      // set the seed
      strcpy(packets[i_packet].seed, get_str_seed(curr_seed));
    }

    // Do the reduction
    MPI_Allreduce((void *)&packets, (void *)&best_packets, max_heap_size,
                  seed_packet_type, union_seed_packets_op, MPI_COMM_WORLD);

    // Unpack the best seed packets into the heap
    // Get the number of filled packets
    int num_seed_packets = best_packets[0].num_seed_packets;

    // Add the best seeds to the heap
    for (i_packet = 0; i_packet < num_seed_packets; i_packet++) {
      SEED *best_seed = new_seed(best_packets[i_packet].seed,
                                 best_packets[i_packet].score);
      // the value returned by add_node_heap is deliberately ignored
      (void)add_node_heap(seed_heap, best_seed);
    }
  } // end n_nsites0
} // reduce_across_heaps
extern void subseq7( MODEL *model, // the model DATASET *dataset, /* the dataset */ int w, // w to use int n_nsites0, /* number of nsites0 values to try */ S_POINT s_points[], /* array of starting points: 1 per nsites0 */ HASH_TABLE evaluated_seed_ht /* A hash table used for remembering which seeds have been evaluated previously */ ) { MOTYPE mtype = model->mtype; /* type of model */ BOOLEAN ic = model->invcomp; /* use reverse complement strand of DNA, too */ THETA map = dataset->map; /* freq x letter map */ LOG_THETA_TYPE(ltheta); /* integer encoded log theta */ int iseq, ioff; int alength = dataset->alength; /* length of alphabet */ int n_samples = dataset->n_samples; /* number of samples in dataset */ SAMPLE **samples = dataset->samples; /* samples in dataset */ int n_starts = 0; /* number of sampled start subseq */ int n_maxima = ps(dataset, w); /* upper bound on # maxima */ /* the local maxima positions */ P_PROB maxima = (P_PROB) mymalloc(n_maxima * sizeof(p_prob)); int lmap[MAXALPH][MAXALPH]; /* consensus letter vs. log frequency matrix */ double col_scores[MAXSITE]; /* not used */ #ifdef PARALLEL int start_seq, start_off=0, end_seq, end_off=0; #endif char *str_seed; // A string representation of a seed. // PRECONDITIONS: // 1. If the sequence model is oops, then n_nsites0 is exactly 1: if (mtype == Oops) { assert(n_nsites0 == 1); } convert_to_lmap(map, lmap, alength); if (TRACE) { printf("w= %d\n", w); } /* get the probability that a site starting at position x_ij would NOT overlap a previously found motif. */ get_not_o(dataset, w); // Set up log_not_o: log_not_o[site] is: // log ( Pr(site not overlapped) * scaled_to_one_Pr(site) ) if (model->mtype != Tcm) { add_psp_to_log_not_o(dataset, w, model->invcomp, model->mtype); } /* score all the sampled positions saving the best position for each value of NSITES0 */ #ifdef PARALLEL /* Retrieve the previously-calculated starting and ending points. 
*/ get_start_n_end(&start_seq, &start_off, &end_seq, &end_off); /* Divide the various samples among processors. */ for (iseq = start_seq; iseq <= end_seq; iseq++) { /* sequence */ #else /* not PARALLEL */ for (iseq = 0; iseq < n_samples; iseq++) { /* sequence */ #endif /* PARALLEL */ SAMPLE *s = samples[iseq]; int lseq = s->length; char *res = s->res; /* left to right */ char *name = s->sample_name; double *not_o = s->not_o; int max_off, init_off; if (lseq < w) continue; /* shorter than motif */ #ifdef PARALLEL if (mpMyID() == 0) #endif if ((!NO_STATUS) && ((iseq % 5) == 0)) { fprintf(stderr, "starts: w=%d, seq=%d, l=%d \r", w, iseq, lseq); } /* Set the appropriate starting and ending points. */ #ifdef PARALLEL if (iseq == start_seq) init_off = start_off; else #endif init_off = 0; #ifdef PARALLEL if (iseq == end_seq) max_off = MIN(end_off, lseq - w); else #endif max_off = lseq - w; /* Loop over all subsequences in the current sequence testing them each as "starting points" (inital values) for theta */ for (ioff = init_off; ioff <= max_off; ioff++) {/* subsequence */ /* warning: always do the next step; don't ever "continue" or the value of pY will not be correct since it is computed based the previous value */ /* convert subsequence in dataset to starting point for EM */ init_theta_1(w, res+ioff, <heta[1][0], lmap); if (ioff == init_off) { /* new sequence */ /* Compute p(Y_ij | theta_1^0) */ if (!ic) { get_pY(dataset, <heta[1][0], w, 0); } else { get_pY(dataset, <heta[1][0], w, 1); get_pY(dataset, <heta[1][0], w, 2); } } else { /* same sequence */ /* get theta[0][0]^{k-1} */ init_theta_1(1, res+ioff-1, <heta[0][0], lmap); /* compute p(Y_ij | theta_1^k) */ if (!ic) { next_pY(dataset, <heta[1][0], w, <heta[0][0][0], 0); } else { next_pY(dataset, <heta[1][0], w, <heta[0][0][0], 1); next_pY(dataset, <heta[1][0], w, <heta[0][0][0], 2); } } /* same sequence */ /* skip if there is a high probability that this subsequence is part of a site which has already been found */ 
if (not_o[ioff] < MIN_NOT_O) continue; /*fprintf(stderr, "subseq: %d %d\r", iseq+1, ioff+1);*/ // Put highest pY into first scratch array if using both DNA strands: if (ic) { combine_strands(samples, n_samples, w); } /* get a sorted list of the maxima of pY */ n_maxima = get_max(mtype, dataset, w, maxima, ic, TRUE); /* "fake out" align_top_subsequences by setting each of the scores in the s_points objects to LITTLE, thereby forcing align_top_subsequences to record the attributes for the current seed in the s_points, rather than the seed with the highest respective scores: */ int sp_idx; for (sp_idx = 0; sp_idx < n_nsites0; sp_idx++) { s_points[sp_idx].score = LITTLE; } /* align the top nsites0 subsequences for each value of nsites0 and save the alignments with the highest likelihood */ n_starts += align_top_subsequences( mtype, w, dataset, iseq, ioff, res+ioff, name, n_nsites0, n_maxima, maxima, col_scores, s_points ); /* A string version of the current seed is required for updating the S_POINT heaps: */ str_seed = to_str_seed(res+ioff, w); /* For each of the S_POINT objects, add the current seed to that S_POINT'S heap. Also, branching search will require a hash_table of all seeds that have been evaluated prior to when branching search is called. 
Hence also record the current seed (string, nsites0) combination in the hash_table, for all nsites0, unless that seed was already in the hash_table: */ hash_insert_str(str_seed, evaluated_seed_ht); update_s_point_heaps(s_points, str_seed, n_nsites0); myfree(str_seed); } /* subsequence */ } /* sequence */ #ifdef PARALLEL reduce_across_heaps(s_points, n_nsites0); #endif // PARALLEL // Print the sites predicted using the seed after subsequence search, for // each of the starting points, if requested: if (dataset->print_pred) { int sp_idx; for (sp_idx = 0; sp_idx < n_nsites0; sp_idx++) { // Retrieve the best seed, from the heap: HEAP *heap = s_points[sp_idx].seed_heap; // Only print sites for the s_point if its heap was non-empty: if (get_num_nodes(heap) > 0) { SEED *best_seed = (SEED *)get_node(heap, get_best_node(heap)); char *seed = get_str_seed(best_seed); /* Print the sites predicted using the motif corresponding to that seed, according to the sequence model being used: */ int nsites0 = s_points[sp_idx].nsites0; fprintf(stdout, "PREDICTED SITES AFTER SUBSEQUENCE SEARCH WITH W = %i " "NSITES = %i MOTIF = %i\n", w, nsites0, dataset->imotif); int n_maxima = ps(dataset, w); // upper bound on number of maxima P_PROB psites = (P_PROB) mymalloc(n_maxima * sizeof(p_prob)); n_maxima = get_pred_sites(psites, mtype, w, seed, ltheta[1], lmap, dataset, ic); print_site_array(psites, nsites0, stdout, w, dataset); myfree(psites); } // get_num_nodes > 0 } //sp_idx } // print_pred if (TRACE){ printf("Tested %d possible starts...\n", n_starts); } myfree(maxima); } // subseq7 /**********************************************************************/ /* next_pY Compute the value of p(Y_ij | theta_1^{k+1}) from p(Y_ij | theta_1^{k} and the probability of first letter of Y_ij given theta_1^k, p(Y_ij^0 | theta_1^k). 
*/ /**********************************************************************/ static void next_pY( DATASET *dataset, /* the dataset */ LOG_THETAG_TYPE(theta_1), /* integer log theta_1 */ int w, /* width of motif */ int *theta_0, /* first column of previous theta_1 */ int pYindex /* which pY array to use */ ) { int i, k; int *theta_last = theta_1[w-1]; /* last column of theta_1 */ int n_samples = dataset->n_samples; SAMPLE **samples = dataset->samples; for (i=0; i < n_samples; i++) { /* sequence */ SAMPLE *s = samples[i]; /* sequence */ int lseq = s->length; /* length of sequence */ char *res = pYindex<2 ? s->res : s->resic; /* integer sequence */ int *pY = s->pY[pYindex]; /* log p(Y_j | theta_1) */ char *r = res+lseq-1; /* last position in sequence */ char *r0 = res+lseq-w-1; /* prior to start of last subsequence */ int j, p; if (lseq < w) continue; /* skip if sequence too short */ /* calculate p(Y_ij | theta_1) */ int *pY_shifted_1 = pY - 1; for (j=lseq-w; j>0; j--) { pY[j] = pY_shifted_1[j] + theta_last[(int)(*r--)] - theta_0[(int)(*r0--)]; } /* calculate log p(Y_i0 | theta_1) */ p = 0; r = res; for (k=0; k<w; k++) { /* position in site */ p += theta_1[k][(int)(*r++)]; } pY[0] = p; } }