예제 #1
0
파일: sp_matrix.c 프로젝트: a1aks/Haystack
/**
 * union_seed_packets
 *
 * Merge two seed_packet arrays and write the best seed_packets of their
 * union back into the second array. The signature matches what
 * MPI_Op_create expects for a user-defined reduction function.
 *
 * Both buffers are treated as arrays of SEED_PACKET. The num_seed_packets
 * field of element 0 of each array says how many leading elements of that
 * array are filled. Each filled packet is converted to a SEED and added to
 * a temporary heap created with *f_length as its size argument. The seeds
 * left in the heap are then written to f_result[0..n-1], and every written
 * packet (plus element 0, even when n is 0) gets num_seed_packets = n.
 * The width field of the output packets is not touched.
 *
 * This function is used in reduce_across_heaps.
 *
 * \param f_data    First packet array (only read).
 * \param f_result  Second packet array on entry; the merged result on return.
 * \param f_length  Pointer to the size passed to create_heap.
 * \param datatype  Not used.
 */
void union_seed_packets(void *f_data, void *f_result, int *f_length,
                   MPI_Datatype *datatype)
{
  SEED_PACKET *in_packets = (SEED_PACKET *)f_data;
  SEED_PACKET *out_packets = (SEED_PACKET *)f_result;
  int i;
  int num_seed_packets;

  // create a heap to do the heap union
  HEAP *heap = create_heap(
      *f_length,
      (int (*) (void *, void*))compare_seed,
      (void *)copy_seed,
      (void (*)(void*))free_seed,
      (char* (*)(void*))get_str_seed,
      (void (*)(FILE *, void*))print_seed
    );

  // unpack the filled packets of f_data and add them to the heap
  num_seed_packets = in_packets[0].num_seed_packets;
  for (i = 0; i < num_seed_packets; i++) {
    SEED *data_seed = new_seed(in_packets[i].seed, in_packets[i].score);
    // NOTE(review): the value returned by add_node_heap was stored in an
    // unused variable before; whether the caller must free it cannot be
    // determined from this function -- confirm against the heap API.
    (void)add_node_heap(heap, data_seed);
  }

  // unpack the filled packets of f_result and add them to the heap;
  // this must finish before f_result is overwritten below
  num_seed_packets = out_packets[0].num_seed_packets;
  for (i = 0; i < num_seed_packets; i++) {
    SEED *result_seed = new_seed(out_packets[i].seed, out_packets[i].score);
    (void)add_node_heap(heap, result_seed);
  }

  // pack the heap back into f_result
  int num_seeds = get_num_nodes(heap);
  // set the number of filled packets (in case the heap is empty)
  out_packets[0].num_seed_packets = num_seeds;
  for (i = 0; i < num_seeds; i++) {
    // populated heap nodes are at index 1 to num_seeds
    SEED *curr_seed = get_node(heap, i + 1);

    out_packets[i].num_seed_packets = num_seeds;
    out_packets[i].score = get_seed_score(curr_seed);
    // Bounded copy: the seed field is a fixed-size character array inside
    // the packet, so never write past it and always NUL-terminate.
    snprintf(out_packets[i].seed, sizeof out_packets[i].seed, "%s",
             get_str_seed(curr_seed));
  }

  // NOTE(review): the temporary heap and the seeds it holds are not
  // released here; if the heap module provides a destructor it should
  // probably be called at this point -- confirm.
} // union_seed_packets
예제 #2
0
/**
 * copy_seed creates a new seed object whose fields duplicate those of an
 * existing seed object.
 * The copy is EXACT, so compare_seed applied to the original and to the
 * copy yields zero.
 * \return A pointer to a new seed that is a copy of the original.
 */
SEED *copy_seed(
  SEED *orig_object  ///< The existing seed object that will be copied
)
{
  // Read the string and score through the accessors; the sequence index
  // and position are taken straight from the original's fields.
  char *seed_str = get_str_seed(orig_object);
  double seed_score = get_seed_score(orig_object);

  return new_seed(seed_str, seed_score, orig_object->iseq, orig_object->ipos);
}
예제 #3
0
파일: sp_matrix.c 프로젝트: a1aks/Haystack
/**
 * transfer_final_scores
 *
 * Visit every S_POINT in the matrix and, when its seed heap holds at least
 * one seed, copy the details of the heap's best seed (score, e-value and
 * consensus string) into the S_POINT itself. iseq and ioff are set to -1
 * in that case. S_POINTs whose heap is empty are left unchanged; a message
 * is written to stderr for them when TRACE is set.
 */
void transfer_final_scores (
  SP_MATRIX *sp_matrix ///< This object
) {
  int r;
  int c;

  for (r = 0; r < sp_get_num_rows(sp_matrix); r++) {
    S_POINT *row = sp_matrix->matrix[r];

    for (c = 0; c < sp_get_num_cols(sp_matrix); c++) {
      S_POINT *sp = &row[c];
      HEAP *heap = sp->seed_heap;

      /* An empty heap could mean that no seeds added to the s_point had
         enough maxima to be evaluated by align_top_subsequences. Report
         this situation (only when tracing) and move on:
      */
      if (get_num_nodes(heap) < 1) {
        if (TRACE) {
          fprintf(stderr,
                  "Heap of spoint was empty, possibly because no seeds had"
                  " enough local maxima. w = %i. nsites0 = %f.\n", sp->w0,
                  sp->nsites0);
        }
        continue;
      }

      // Non-empty heap: transfer the details of its best seed.
      SEED *best = (SEED *)(get_node(heap, get_best_node(heap)));
      sp->score = get_seed_score(best);
      // The seed does not correspond to a location in the dataset:
      sp->iseq = -1;
      sp->ioff = -1;
      sp->e_cons0 = get_e_seed(best);
      // Replace the previous consensus string with a fresh copy.
      free(sp->cons0);
      sp->cons0 = strdup(get_str_seed(best));
    } // c
  } // r
} // transfer_final_scores
예제 #4
0
파일: sp_matrix.c 프로젝트: a1aks/Haystack
/**
 * reduce_across_heaps
 *
 * Do a reduction across an array of S_POINT heaps. For each S_POINT in the
 * array, the seeds in its heap are packed into an array of SEED_PACKETs,
 * combined with the packets of every other process by MPI_Allreduce using
 * union_seed_packets as the reduction operator, and the resulting packets
 * are added back into the heap.
 *
 * On the first call the function builds and commits the MPI derived
 * datatype describing a SEED_PACKET and registers the reduction operator;
 * both are kept in static variables for later calls.
 *
 * Packing removes seeds from the heap with pop_heap_root, one per filled
 * node counted by get_num_nodes.
 *
 * NOTE(review): the seeds returned by pop_heap_root and any value returned
 * by add_node_heap are not released here; whether this function owns them
 * cannot be determined from this code -- confirm against the heap API.
 * NOTE(review): a heap whose get_max_size is 0 would give zero-length
 * arrays below; presumably that never happens -- confirm.
 *
 * \param s_points   An array of S_POINTs.
 * \param n_nsites0  The number of S_POINTs in the s_points array.
 */
void reduce_across_heaps(
  S_POINT *s_points,     // an array of S_POINTS
  int n_nsites0          // the number of S_POINTS in the s_points array
) 
{
  static int init;
  static MPI_Datatype seed_packet_type;
  static MPI_Op union_seed_packets_op;
  int i_packet;

  // Initialise MPI stuff (once)
  if (init==0){
    init = 1;
    SEED_PACKET seed_packet;
    int block_lengths[4];
    MPI_Aint displacements[4];
    MPI_Aint address[4];
    MPI_Datatype typelist[4];

    // Build the derived datatype
    // set the types
    typelist[0]=MPI_DOUBLE;
    typelist[1]=MPI_INT;
    typelist[2]=MPI_INT;
    typelist[3]=MPI_CHAR;

    // set number of elements of each type
    block_lengths[0] = block_lengths[1] = block_lengths[2] = 1;
    block_lengths[3] = MAXSITE;	// the maximum length of a seed

    // calculate the displacements relative to the score field
    // (MPI_Get_address replaces MPI_Address, which was removed in MPI-3.0)
    MPI_Get_address(&seed_packet.score, &address[0]);
    MPI_Get_address(&seed_packet.width, &address[1]);
    MPI_Get_address(&seed_packet.num_seed_packets, &address[2]);
    MPI_Get_address(&seed_packet.seed, &address[3]);
    displacements[0]=0;
    displacements[1]=address[1]-address[0];
    displacements[2]=address[2]-address[0];
    displacements[3]=address[3]-address[0];

    // create the derived type
    // (MPI_Type_create_struct replaces MPI_Type_struct, removed in MPI-3.0)
    MPI_Type_create_struct(4, block_lengths, displacements, typelist,
                           &seed_packet_type);

    // commit the derived type
    MPI_Type_commit(&seed_packet_type);

    // set the MPI reduction operation
    MPI_Op_create(union_seed_packets, FALSE, &union_seed_packets_op);
  } // initialise MPI

  // do a reduction for each s_point in the s_point list
  int sp_idx;
  for (sp_idx = 0; sp_idx < n_nsites0; sp_idx++){
    // package the heap for the spoint at sp_idx in the s_points list
    HEAP *seed_heap = s_points[sp_idx].seed_heap;
    // get the maximum heap size and the number of seeds in the heap
    int max_heap_size = get_max_size(seed_heap);
    int num_seeds = get_num_nodes(seed_heap);

    // one packet per possible heap node
    SEED_PACKET packets[max_heap_size], best_packets[max_heap_size];
    // All max_heap_size packets are handed to MPI_Allreduce, including the
    // ones that are not filled below, so give every byte a defined value.
    memset(packets, 0, sizeof packets);
    memset(best_packets, 0, sizeof best_packets);

    // set num_seed_packets to the number of filled nodes in the heap (in 
    // case the heap is empty)
    packets[0].num_seed_packets = num_seeds;

    // package each seed in the heap into a seed packet
    for (i_packet = 0; i_packet < num_seeds; i_packet++){
      // get the seed at the root
      SEED *curr_seed = pop_heap_root(seed_heap);

      // set the number of seed_packets that will be filled
      packets[i_packet].num_seed_packets = num_seeds;
      // set the seed packet score
      packets[i_packet].score = get_seed_score(curr_seed);
      // set the width of the string
      packets[i_packet].width = get_width(curr_seed);
      // set the seed; bounded so the fixed-size field cannot be overrun
      snprintf(packets[i_packet].seed, sizeof packets[i_packet].seed, "%s",
               get_str_seed(curr_seed));
    }

    // Do the reduction
    MPI_Allreduce(packets, best_packets, max_heap_size,
                  seed_packet_type, union_seed_packets_op, MPI_COMM_WORLD);

    // Unpack the best seed packets into the heap

    // Get the number of filled packets
    int num_seed_packets = best_packets[0].num_seed_packets;

    // Add the best seeds to the heap
    for (i_packet = 0; i_packet < num_seed_packets; i_packet++){
      SEED *best_seed = new_seed(best_packets[i_packet].seed,
                                 best_packets[i_packet].score);
      (void)add_node_heap(seed_heap, best_seed);
    }
  } // end n_nsites0

} // reduce_across_heaps
예제 #5
0
파일: subseq7.c 프로젝트: CPFL/gmeme
/**
 * subseq7
 *
 * Try the width-w subsequences of the dataset as starting points (seeds).
 *
 * For each sequence (all of them, or the range returned by get_start_n_end
 * when PARALLEL is defined) and each offset at which a width-w subsequence
 * fits, the function:
 *   - builds ltheta[1] from the subsequence with init_theta_1,
 *   - computes pY with get_pY at the first offset processed in a sequence
 *     and updates it with next_pY at every later offset,
 *   - skips the rest of the work if not_o[ioff] < MIN_NOT_O,
 *   - resets every s_points[].score to LITTLE, then calls get_max and
 *     align_top_subsequences,
 *   - inserts the seed string into evaluated_seed_ht and passes it to
 *     update_s_point_heaps.
 * When PARALLEL is defined, reduce_across_heaps is called after the loops.
 * If dataset->print_pred is set, the sites predicted from the best seed of
 * each non-empty s_point heap are printed to stdout.
 */
extern void subseq7(
  MODEL *model,			// the model
  DATASET *dataset,		/* the dataset */
  int w,			// w to use
  int n_nsites0,		/* number of nsites0 values to try */
  S_POINT s_points[],           /* array of starting points: 1 per nsites0 */
  HASH_TABLE evaluated_seed_ht 	/* A hash table used for remembering which seeds
                                   have been evaluated previously */
)
{
  MOTYPE mtype = model->mtype;		/* type of model */
  BOOLEAN ic = model->invcomp;		/* use reverse complement strand of DNA, too */
  THETA map = dataset->map;		/* freq x letter map */
  LOG_THETA_TYPE(ltheta);		/* integer encoded log theta */
  int iseq, ioff;
  int alength = dataset->alength;	/* length of alphabet */
  int n_samples = dataset->n_samples;	/* number of samples in dataset */
  SAMPLE **samples = dataset->samples;	/* samples in dataset */
  int n_starts = 0;			/* number of sampled start subseq */
  int n_maxima = ps(dataset, w);	/* upper bound on # maxima */
  /* the local maxima positions */
  P_PROB maxima = (P_PROB) mymalloc(n_maxima * sizeof(p_prob));
  int lmap[MAXALPH][MAXALPH];	/* consensus letter vs. log frequency matrix */
  double col_scores[MAXSITE];		/* not used */
#ifdef PARALLEL
  int start_seq, start_off=0, end_seq, end_off=0;
#endif
  char *str_seed;                       // A string representation of a seed.

  // PRECONDITIONS:

  // 1. If the sequence model is oops, then n_nsites0 is exactly 1:
  if (mtype == Oops) {
    assert(n_nsites0 == 1);
  }

  convert_to_lmap(map, lmap, alength);

  if (TRACE) { printf("w= %d\n", w); }

  /* get the probability that a site starting at position x_ij would
     NOT overlap a previously found motif.
  */
  get_not_o(dataset, w);

  // Set up log_not_o: log_not_o[site] is:
  // log ( Pr(site not overlapped) * scaled_to_one_Pr(site) )
  if (model->mtype != Tcm) {
    add_psp_to_log_not_o(dataset, w, model->invcomp, model->mtype);
  }

  /* score all the sampled positions saving the best position for
     each value of NSITES0 */
  // Note: the opening of the sequence loop below is selected by the
  // preprocessor; both variants share the same loop body and closing brace.
#ifdef PARALLEL
  /* Retrieve the previously-calculated starting and ending points. */
  get_start_n_end(&start_seq, &start_off, &end_seq, &end_off);
  /* Divide the various samples among processors. */
  for (iseq = start_seq; iseq <= end_seq; iseq++) { /* sequence */
#else /* not PARALLEL */
  for (iseq = 0; iseq < n_samples; iseq++) {	/* sequence */
#endif /* PARALLEL */

    SAMPLE *s = samples[iseq];
    int lseq = s->length;
    char *res = s->res;				/* left to right */
    char *name = s->sample_name;
    double *not_o = s->not_o;
    int max_off, init_off;

    if (lseq < w) continue;			/* shorter than motif */

    // Under PARALLEL the status line below is printed only when mpMyID()
    // returns 0: the two if statements nest.
#ifdef PARALLEL
    if (mpMyID() == 0)
#endif
    if ((!NO_STATUS) && ((iseq % 5) == 0)) {
      fprintf(stderr, "starts: w=%d, seq=%d, l=%d          \r", w, iseq, lseq); 
    }
    /* Set the appropriate starting and ending points. */
    // Under PARALLEL the first sequence starts at start_off and the last
    // one stops at min(end_off, lseq - w); otherwise the whole sequence
    // is covered.
#ifdef PARALLEL
    if (iseq == start_seq)
      init_off = start_off;
    else
#endif
      init_off = 0;

#ifdef PARALLEL
    if (iseq == end_seq)
      max_off = MIN(end_off, lseq - w);
    else
#endif
      max_off = lseq - w;

    /*
      Loop over all subsequences in the current sequence testing them
      each as "starting points" (inital values) for theta
    */
    for (ioff = init_off; ioff <= max_off; ioff++) {/* subsequence */ 
      /* warning: always do the next step; don't ever
         "continue" or the value of pY will not be correct since
         it is computed based the previous value 
      */

      /* convert subsequence in dataset to starting point for EM */
      init_theta_1(w, res+ioff, &ltheta[1][0], lmap);

      if (ioff == init_off) { 			/* new sequence */

        /* Compute p(Y_ij | theta_1^0) */
        if (!ic) {
          get_pY(dataset, &ltheta[1][0], w, 0);
        } else {
          get_pY(dataset, &ltheta[1][0], w, 1);
          get_pY(dataset, &ltheta[1][0], w, 2);
        }

      } else {					/* same sequence */
        
        /* get theta[0][0]^{k-1} */
        init_theta_1(1, res+ioff-1, &ltheta[0][0], lmap);

        /* compute p(Y_ij | theta_1^k) */
        if (!ic) {
          next_pY(dataset, &ltheta[1][0], w, &ltheta[0][0][0], 0);
        } else {
          next_pY(dataset, &ltheta[1][0], w, &ltheta[0][0][0], 1);
          next_pY(dataset, &ltheta[1][0], w, &ltheta[0][0][0], 2);
        }
      } /* same sequence */

      /* skip if there is a high probability that this subsequence
         is part of a site which has already been found 
      */
      // This continue comes after pY has been updated for this offset, so
      // it does not break the "always do the next step" rule stated above.
      if (not_o[ioff] < MIN_NOT_O) continue;

      /*fprintf(stderr, "subseq: %d %d\r", iseq+1, ioff+1);*/

      // Put highest pY into first scratch array if using both DNA strands:
      if (ic) {
        combine_strands(samples, n_samples, w);
      }

      /* get a sorted list of the maxima of pY */
      n_maxima = get_max(mtype, dataset, w, maxima, ic, TRUE);

      /* "fake out" align_top_subsequences by setting each of the scores in
         the s_points objects to LITTLE, thereby forcing
         align_top_subsequences to record the attributes for the current seed
         in the s_points, rather than the seed with the highest respective
         scores: */
      int sp_idx;
      for (sp_idx = 0; sp_idx < n_nsites0; sp_idx++) {
        s_points[sp_idx].score = LITTLE;
      }

      /* align the top nsites0 subsequences for each value
         of nsites0 and save the alignments with the highest likelihood 
      */
      n_starts += align_top_subsequences(
        mtype,
        w,
        dataset,
        iseq,
        ioff, 
        res+ioff,
        name,
        n_nsites0,
        n_maxima,
        maxima,
        col_scores,
        s_points
      );

      /* A string version of the current seed is required for updating the
         S_POINT heaps: */
      str_seed = to_str_seed(res+ioff, w);

      /* For each of the S_POINT objects, add the current seed to that
         S_POINT'S heap.
         Also, branching search will require a hash_table of all seeds that
         have been evaluated prior to when branching search is called. Hence
         also record the current seed (string, nsites0) combination in the
         hash_table, for all nsites0, unless that seed was already in the
         hash_table:
      */
      hash_insert_str(str_seed, evaluated_seed_ht);
      update_s_point_heaps(s_points, str_seed, n_nsites0);

      // The string is released here, after the hash table and the heaps
      // have been given it.
      myfree(str_seed);
    } /* subsequence */
  } /* sequence */

#ifdef PARALLEL
  reduce_across_heaps(s_points, n_nsites0);
#endif // PARALLEL 

  // Print the sites predicted using the seed after subsequence search, for
  // each of the starting points, if requested:
  if (dataset->print_pred) {
    int sp_idx;
    for (sp_idx = 0; sp_idx < n_nsites0; sp_idx++) {
      // Retrieve the best seed, from the heap:
      HEAP *heap = s_points[sp_idx].seed_heap;
      // Only print sites for the s_point if its heap was non-empty:
      if (get_num_nodes(heap) > 0) {
        SEED *best_seed = (SEED *)get_node(heap, get_best_node(heap));
        char *seed = get_str_seed(best_seed);

        /* Print the sites predicted using the motif corresponding to that seed,
           according to the sequence model being used:
        */
        int nsites0 = s_points[sp_idx].nsites0;
        fprintf(stdout,
                "PREDICTED SITES AFTER SUBSEQUENCE SEARCH WITH W = %i "
                "NSITES = %i MOTIF = %i\n", w, nsites0, dataset->imotif);
        // This n_maxima shadows the function-scope variable of the same
        // name; the value assigned from get_pred_sites is not read again.
        int n_maxima = ps(dataset, w); // upper bound on number of maxima
        P_PROB psites = (P_PROB) mymalloc(n_maxima * sizeof(p_prob));
        n_maxima = get_pred_sites(psites, mtype, w, seed, ltheta[1], lmap,
                                  dataset, ic);
        print_site_array(psites, nsites0, stdout, w, dataset);
        myfree(psites);
      } // get_num_nodes > 0
    } //sp_idx
  } // print_pred

  if (TRACE){
    printf("Tested %d possible starts...\n", n_starts);
    }

  myfree(maxima);
} // subseq7


/**********************************************************************/
/*
	next_pY

	Compute the value of p(Y_ij | theta_1^{k+1})
	from p(Y_ij | theta_1^{k}) and the probability
	of first letter of Y_ij given theta_1^k,
	p(Y_ij^0 | theta_1^k).

	For every sequence at least w long, pY[j] for j = lseq-w down to 1
	is replaced by
	  pY[j-1] + theta_last[res[j+w-1]] - theta_0[res[j-1]]
	where theta_last is column w-1 of theta_1. The loop runs from high j
	to low j so that pY[j-1] still holds its previous value when it is
	read. pY[0] is then recomputed directly as the sum of
	theta_1[k][res[k]] for k = 0..w-1.

	The sequence is addressed by index rather than by moving pointers so
	that no pointer before the start of res is ever formed.
*/
/**********************************************************************/
static void next_pY(
  DATASET *dataset,			/* the dataset */
  LOG_THETAG_TYPE(theta_1),		/* integer log theta_1 */
  int w,				/* width of motif */
  int *theta_0,				/* first column of previous theta_1 */
  int pYindex				/* which pY array to use */
) {
  int i, k;
  int *theta_last = theta_1[w-1];	/* last column of theta_1 */
  int n_samples = dataset->n_samples;
  SAMPLE **samples = dataset->samples;

  for (i=0; i < n_samples; i++) { 	/* sequence */
    SAMPLE *s = samples[i];		/* sequence */
    int lseq = s->length;		/* length of sequence */
    char *res = pYindex<2 ? s->res : s->resic;	/* integer sequence */
    int *pY = s->pY[pYindex];		/* log p(Y_j | theta_1) */
    int j, p;

    if (lseq < w) continue;		/* skip if sequence too short */

    /* calculate p(Y_ij | theta_1) */
    for (j=lseq-w; j>0; j--) {
      /* res[j+w-1]: last letter of the subsequence starting at j;
         res[j-1]: first letter of the subsequence starting at j-1 */
      pY[j] = pY[j-1] + theta_last[(int)res[j+w-1]] - theta_0[(int)res[j-1]];
    }

    /* calculate log p(Y_i0 | theta_1) */
    p = 0;
    for (k=0; k<w; k++) {     		/* position in site */
      p += theta_1[k][(int)res[k]];
    }
    pY[0] = p;
  }
}