DATASET *read_seq_file(
  char *file_name,		/* name of file to open */
  char *alpha,			/* alphabet used in sequences */
  BOOLEAN use_comp,		/* use complementary strands, too */
  double seqfrac 		/* fraction of input sequences to use */
)
{
  int i, j;
  FILE *data_file;		/* file with samples to read */
  FILE *prior_file=NULL;	/* file with positional priors to read */
  char *sample_name;		/* name of sample read */
  char *sample_de;		/* descriptor text for sample */
  char *sequence;		/* sequence read */
  long length;			/* length of sequence */
  BOOLEAN error=FALSE;		/* none yet */
  SAMPLE *sample;		/* sample created */
  DATASET *dataset;		/* dataset created */
  int n_samples=0;		/* number of samples read */
  double *seq_weights=NULL;	/* sequence weights */
  int n_wgts=0;			/* number of sequence weights given */

  /* create a hash table of sequence names */
  if (!ht_seq_names) ht_seq_names = hash_create(DATA_HASH_SIZE);

  /* create a dataset */
  dataset = (DATASET *) mymalloc(sizeof(DATASET));
  dataset->alength = strlen(alpha);
  dataset->alphabet = alpha;
  dataset->psp_w = 0;			// indicates no PSP was read
  dataset->log_psp_w = 0;		// so log_psp will get initialized

  /* open data file */
  if (file_name == NULL) {
    fprintf(stderr, "You must specify a data file or `stdin'.\n");
    exit(1);
  } else if (strcmp(file_name, "stdin")) {
    data_file = fopen(file_name, "r"); 
    if (data_file == NULL) {
      fprintf(stderr, "Cannot open file `%s'.\n", file_name);
      exit(1);
    }
  } else {
    data_file = stdin;
  }

  /* initialize maximum length of sequences */
  dataset->max_slength = 0;
  dataset->min_slength = 10000000;

  dataset->n_samples = 0;	/* no samples yet */
  dataset->samples = NULL;	/* no samples */

  while (read_sequence(data_file, &sample_name, &sample_de, &sequence, 
    &length)) {

    /* skip sequence if an error occurred */
    if (length < 0) continue;

    /* parse weights if given; make (more than enough) room in array */
    if (strcmp(sample_name, "WEIGHTS")==0) {
      double wgt; 
      char *wgt_str = sample_de;
      Resize(seq_weights, n_wgts+(int)strlen(wgt_str), double);
      while (sscanf(wgt_str, "%lf", &wgt) == 1) {
        if (wgt <= 0 || wgt > 1) {
	  fprintf(stderr, 
            "Weights must be larger than zero and no greater than 1.\n");
	  exit(1);
        }
        seq_weights[n_wgts++] = wgt;			/* save weight */
        wgt_str += strspn(wgt_str, "      ");		/* skip white */
        wgt_str += strcspn(wgt_str, "     ");		/* skip token */
      }
      myfree(sample_name);
      myfree(sample_de);
      myfree(sequence);
      continue;
    }

    /* ignore duplicate (same sample name) sequences */ 
    if (hash_lookup_str(sample_name, ht_seq_names) != NULL) {
      fprintf(stderr, "Skipping sequence '%s'.\n", sample_name);
      myfree(sample_name);
      myfree(sample_de);
      myfree(sequence);
      continue;
    }
    hash_insert_str(sample_name, ht_seq_names);  /* put name in hash table */

    n_samples++;

    /* see if sequence will be used in random sample; store it if yes */
    if (drand48() >= 1 - seqfrac) {

      HASH_TABLE_ENTRY *hash_entry; // needed to add pointer to sample

      /* create the sample */
      sample = create_sample(alpha, length, sample_name, sequence, sample_de, use_comp);
      if (sample == NULL) {error = TRUE; continue;}

      /* record maximum length of actual sequences */
      dataset->max_slength = MAX(sample->length, dataset->max_slength);
      dataset->min_slength = MIN(sample->length, dataset->min_slength);

      /* put the sample in the array of samples */
      if ((dataset->n_samples % RCHUNK) == 0) {
        Resize(dataset->samples, dataset->n_samples + RCHUNK, SAMPLE *);
      }
Esempio n. 2
0
File: subseq7.c Progetto: CPFL/gmeme
/**********************************************************************/
/*
	subseq7

	Try the width-w subsequences of the dataset as candidate
	starting points ("seeds") for the motif model.

	For every sample that is at least w long, and every offset
	scanned in it:
	  1. init_theta_1 converts the subsequence into ltheta[1];
	  2. pY is computed with get_pY at the first offset scanned
	     in the sequence and updated with next_pY at later offsets;
	  3. offsets whose not_o value is below MIN_NOT_O are dropped
	     (only after step 2 has been done);
	  4. get_max fills `maxima', the s_points scores are reset to
	     LITTLE and align_top_subsequences is called; its return
	     value is accumulated in n_starts;
	  5. the string form of the seed is inserted in
	     evaluated_seed_ht and handed to update_s_point_heaps.

	When PARALLEL is defined only the range returned by
	get_start_n_end is scanned, and reduce_across_heaps is called
	once the scan is finished.

	If dataset->print_pred is set, the sites predicted from the
	best seed of each non-empty s_point heap are printed to stdout.
*/
/**********************************************************************/
extern void subseq7(
  MODEL *model,			// the model (mtype and invcomp are read)
  DATASET *dataset,		/* the dataset */
  int w,			// motif width to use
  int n_nsites0,		/* number of nsites0 values to try */
  S_POINT s_points[],           /* array of starting points: 1 per nsites0 */
  HASH_TABLE evaluated_seed_ht 	/* A hash table used for remembering which seeds
                                   have been evaluated previously */
)
{
  MOTYPE mtype = model->mtype;		/* type of model */
  BOOLEAN ic = model->invcomp;		/* use reverse complement strand of DNA, too */
  THETA map = dataset->map;		/* freq x letter map */
  /* ltheta[1] receives the current seed; ltheta[0][0] receives the first
     column of the previous seed (see the calls to init_theta_1 below) */
  LOG_THETA_TYPE(ltheta);		/* integer encoded log theta */
  int iseq, ioff;
  int alength = dataset->alength;	/* length of alphabet */
  int n_samples = dataset->n_samples;	/* number of samples in dataset */
  SAMPLE **samples = dataset->samples;	/* samples in dataset */
  int n_starts = 0;			/* number of sampled start subseq */
  int n_maxima = ps(dataset, w);	/* upper bound on # maxima */
  /* the local maxima positions */
  P_PROB maxima = (P_PROB) mymalloc(n_maxima * sizeof(p_prob));
  int lmap[MAXALPH][MAXALPH];	/* consensus letter vs. log frequency matrix */
  /* never read in this function; only passed to align_top_subsequences */
  double col_scores[MAXSITE];		/* not used */
#ifdef PARALLEL
  int start_seq, start_off=0, end_seq, end_off=0;
#endif
  char *str_seed;                       // A string representation of a seed.

  // PRECONDITIONS:

  // 1. If the sequence model is oops, then n_nsites0 is exactly 1:
  if (mtype == Oops) {
    assert(n_nsites0 == 1);
  }

  /* fill lmap from the dataset's frequency x letter map */
  convert_to_lmap(map, lmap, alength);

  if (TRACE) { printf("w= %d\n", w); }

  /* get the probability that a site starting at position x_ij would
     NOT overlap a previously found motif.
  */
  get_not_o(dataset, w);

  // Set up log_not_o: log_not_o[site] is:
  // log ( Pr(site not overlapped) * scaled_to_one_Pr(site) )
  // (skipped for the Tcm model type)
  if (model->mtype != Tcm) {
    add_psp_to_log_not_o(dataset, w, model->invcomp, model->mtype);
  }

  /* score all the sampled positions saving the best position for
     each value of NSITES0 */
  /* NOTE: the two preprocessor branches below supply alternative headers
     for the SAME loop body; exactly one `for' is compiled. */
#ifdef PARALLEL
  /* Retrieve the previously-calculated starting and ending points. */
  get_start_n_end(&start_seq, &start_off, &end_seq, &end_off);
  /* Divide the various samples among processors. */
  for (iseq = start_seq; iseq <= end_seq; iseq++) { /* sequence */
#else /* not PARALLEL */
  for (iseq = 0; iseq < n_samples; iseq++) {	/* sequence */
#endif /* PARALLEL */

    SAMPLE *s = samples[iseq];
    int lseq = s->length;
    char *res = s->res;				/* left to right */
    char *name = s->sample_name;
    double *not_o = s->not_o;
    int max_off, init_off;			/* last and first offset to scan */

    if (lseq < w) continue;			/* shorter than motif */

    /* progress line on stderr every 5th sequence; under PARALLEL the
       extra `if' restricts it to the process whose mpMyID() is 0 */
#ifdef PARALLEL
    if (mpMyID() == 0)
#endif
    if ((!NO_STATUS) && ((iseq % 5) == 0)) {
      fprintf(stderr, "starts: w=%d, seq=%d, l=%d          \r", w, iseq, lseq); 
    }
    /* Set the appropriate starting and ending points. */
    /* Under PARALLEL the `else' inside each #ifdef binds to the
       assignment that follows the #endif; without PARALLEL only that
       assignment is compiled. */
#ifdef PARALLEL
    if (iseq == start_seq)
      init_off = start_off;
    else
#endif
      init_off = 0;

#ifdef PARALLEL
    if (iseq == end_seq)
      max_off = MIN(end_off, lseq - w);
    else
#endif
      max_off = lseq - w;

    /*
      Loop over all subsequences in the current sequence testing them
      each as "starting points" (initial values) for theta
    */
    for (ioff = init_off; ioff <= max_off; ioff++) {/* subsequence */ 
      /* warning: always do the next step; don't ever
         "continue" or the value of pY will not be correct since
         it is computed based on the previous value 
      */

      /* convert subsequence in dataset to starting point for EM */
      init_theta_1(w, res+ioff, &ltheta[1][0], lmap);

      if (ioff == init_off) { 			/* new sequence */

        /* Compute p(Y_ij | theta_1^0) */
        /* pY index 0 is used for a single strand, indices 1 and 2 when
           both strands are searched */
        if (!ic) {
          get_pY(dataset, &ltheta[1][0], w, 0);
        } else {
          get_pY(dataset, &ltheta[1][0], w, 1);
          get_pY(dataset, &ltheta[1][0], w, 2);
        }

      } else {					/* same sequence */
        
        /* get theta[0][0]^{k-1} */
        /* i.e. the single column for the letter just before the
           current subsequence (res[ioff-1]) */
        init_theta_1(1, res+ioff-1, &ltheta[0][0], lmap);

        /* compute p(Y_ij | theta_1^k) */
        if (!ic) {
          next_pY(dataset, &ltheta[1][0], w, &ltheta[0][0][0], 0);
        } else {
          next_pY(dataset, &ltheta[1][0], w, &ltheta[0][0][0], 1);
          next_pY(dataset, &ltheta[1][0], w, &ltheta[0][0][0], 2);
        }
      } /* same sequence */

      /* skip if there is a high probability that this subsequence
         is part of a site which has already been found 
      */
      /* (this `continue' is placed after the pY update above, so it
         does not break the incremental computation) */
      if (not_o[ioff] < MIN_NOT_O) continue;

      /*fprintf(stderr, "subseq: %d %d\r", iseq+1, ioff+1);*/

      // Put highest pY into first scratch array if using both DNA strands:
      if (ic) {
        combine_strands(samples, n_samples, w);
      }

      /* get a sorted list of the maxima of pY */
      /* overwrites the upper bound obtained from ps() with the value
         returned by get_max */
      n_maxima = get_max(mtype, dataset, w, maxima, ic, TRUE);

      /* "fake out" align_top_subsequences by setting each of the scores in
         the s_points objects to LITTLE, thereby forcing
         align_top_subsequences to record the attributes for the current seed
         in the s_points, rather than the seed with the highest respective
         scores: */
      int sp_idx;
      for (sp_idx = 0; sp_idx < n_nsites0; sp_idx++) {
        s_points[sp_idx].score = LITTLE;
      }

      /* align the top nsites0 subsequences for each value
         of nsites0 and save the alignments with the highest likelihood 
      */
      n_starts += align_top_subsequences(
        mtype,
        w,
        dataset,
        iseq,
        ioff, 
        res+ioff,
        name,
        n_nsites0,
        n_maxima,
        maxima,
        col_scores,
        s_points
      );

      /* A string version of the current seed is required for updating the
         S_POINT heaps: */
      str_seed = to_str_seed(res+ioff, w);

      /* For each of the S_POINT objects, add the current seed to that
         S_POINT'S heap.
         Also, branching search will require a hash_table of all seeds that
         have been evaluated prior to when branching search is called. Hence
         also record the current seed (string, nsites0) combination in the
         hash_table, for all nsites0, unless that seed was already in the
         hash_table:
      */
      /* NOTE(review): the insert below is unconditional and is keyed by
         the seed string only (no nsites0, no prior lookup), which does not
         match the description above. Whether hash_insert_str tolerates
         duplicate keys is not visible here -- confirm. */
      hash_insert_str(str_seed, evaluated_seed_ht);
      update_s_point_heaps(s_points, str_seed, n_nsites0);

      /* release the string made by to_str_seed; presumably the hash table
         and the heaps keep their own copies -- verify */
      myfree(str_seed);
    } /* subsequence */
  } /* sequence */

#ifdef PARALLEL
  reduce_across_heaps(s_points, n_nsites0);
#endif // PARALLEL 

  // Print the sites predicted using the seed after subsequence search, for
  // each of the starting points, if requested:
  if (dataset->print_pred) {
    int sp_idx;
    for (sp_idx = 0; sp_idx < n_nsites0; sp_idx++) {
      // Retrieve the best seed, from the heap:
      HEAP *heap = s_points[sp_idx].seed_heap;
      // Only print sites for the s_point if its heap was non-empty:
      if (get_num_nodes(heap) > 0) {
        SEED *best_seed = (SEED *)get_node(heap, get_best_node(heap));
        char *seed = get_str_seed(best_seed);

        /* Print the sites predicted using the motif corresponding to that seed,
           according to the sequence model being used:
        */
        int nsites0 = s_points[sp_idx].nsites0;
        fprintf(stdout,
                "PREDICTED SITES AFTER SUBSEQUENCE SEARCH WITH W = %i "
                "NSITES = %i MOTIF = %i\n", w, nsites0, dataset->imotif);
        // NOTE(review): this declaration shadows the function-level n_maxima,
        // and the value assigned from get_pred_sites below is never read.
        int n_maxima = ps(dataset, w); // upper bound on number of maxima
        P_PROB psites = (P_PROB) mymalloc(n_maxima * sizeof(p_prob));
        n_maxima = get_pred_sites(psites, mtype, w, seed, ltheta[1], lmap,
                                  dataset, ic);
        print_site_array(psites, nsites0, stdout, w, dataset);
        myfree(psites);
      } // get_num_nodes > 0
    } //sp_idx
  } // print_pred

  if (TRACE){
    printf("Tested %d possible starts...\n", n_starts);
    }

  /* release the maxima array allocated at the top of the function */
  myfree(maxima);
} // subseq7


/**********************************************************************/
/*
	next_pY

	Compute the value of p(Y_ij | theta_1^{k+1})
	from p(Y_ij | theta_1^{k}) and the probability
	of the first letter of Y_ij given theta_1^k,
	p(Y_ij^0 | theta_1^k).
*/
/**********************************************************************/
static void next_pY(
  DATASET *dataset,			/* the dataset */
  LOG_THETAG_TYPE(theta_1),		/* integer log theta_1 */
  int w,				/* width of motif */
  int *theta_0,				/* first column of previous theta_1 */
  int pYindex				/* which pY array to use */
) {
  /*
    Updates s->pY[pYindex] in place for every sample that is at least
    w long; shorter samples are left untouched.

    For j > 0 the new pY[j] is the old pY[j-1], plus the last-column
    (theta_1[w-1]) score of the letter at res[j+w-1], minus the theta_0
    score of the letter at res[j-1].  pY[0] is recomputed from scratch
    by summing theta_1[k][res[k]] over the w columns.

    pYindex < 2 reads the letters from s->res, otherwise from s->resic.

    All accesses use plain indices so that no pointer outside the res
    or pY buffers is ever formed, not even for samples shorter than w.
  */
  int i, k;
  int *theta_last = theta_1[w-1];	/* last column of theta_1 */
  int n_samples = dataset->n_samples;
  SAMPLE **samples = dataset->samples;

  for (i=0; i < n_samples; i++) {	/* sequence */
    SAMPLE *s = samples[i];		/* current sample */
    int lseq = s->length;		/* length of sequence */
    char *res;				/* integer sequence */
    int *pY;				/* log p(Y_j | theta_1) */
    int j, p;

    /* check the length before doing any address arithmetic */
    if (lseq < w) continue;		/* skip if sequence too short */

    res = pYindex<2 ? s->res : s->resic;
    pY = s->pY[pYindex];

    /* calculate p(Y_ij | theta_1); go right to left so that pY[j-1]
       still holds the value from the previous call when it is read */
    for (j=lseq-w; j>0; j--) {
      pY[j] = pY[j-1] + theta_last[(int)res[j+w-1]] - theta_0[(int)res[j-1]];
    }

    /* calculate log p(Y_i0 | theta_1) */
    p = 0;
    for (k=0; k<w; k++) {		/* position in site */
      p += theta_1[k][(int)res[k]];
    }
    pY[0] = p;
  }
}