Esempio n. 1
0
/* Function:  esl_msashuffle_Shuffle()
 * Synopsis:  Shuffle an alignment's columns.
 *
 * Purpose:   Returns a column-shuffled version of <msa> in <shuf>,
 *            using random generator <r>. Shuffling by columns
 *            preserves the \% identity of the original
 *            alignment. <msa> and <shuf> can be identical, to shuffle
 *            in place.
 *            
 *            The caller sets up the rest of the data (everything but
 *            the alignment itself) in <shuf> the way it wants,
 *            including sequence names, MSA name, and other
 *            annotation. The easy thing to do is to make <shuf>
 *            a copy of <msa>: the caller might create <shuf> by
 *            a call to <esl_msa_Clone()>.
 *            
 *            The alignments <msa> and <shuf> can both be in digital
 *            mode, or can both be in text mode; you cannot mix
 *            digital and text modes.
 *
 *            The aligned sequences are first copied from <msa> to
 *            <shuf> (unless they are the same object); the shuffle
 *            is then a Fisher-Yates column permutation carried out
 *            entirely inside <shuf>.
 *
 * Args:      r    - source of random numbers
 *            msa  - alignment to shuffle (read only, unless msa == shuf)
 *            shuf - destination alignment; its aligned sequences must
 *                   have room for <msa->alen> columns
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEINVAL> if <msa>,<shuf> aren't in the same mode (digital vs. text).
 */
int
esl_msashuffle_Shuffle(ESL_RANDOMNESS *r, ESL_MSA *msa, ESL_MSA *shuf)
{
  int i, pos, alen;

  if (! (msa->flags & eslMSA_DIGITAL))
    {
      char c;
      if (shuf->flags & eslMSA_DIGITAL) ESL_EXCEPTION(eslEINVAL, "<shuf> must be in text mode if <msa> is");
      if (msa != shuf) {
        for (i = 0; i < msa->nseq; i++)
          strcpy(shuf->aseq[i], msa->aseq[i]);
      }

      for (i = 0; i < msa->nseq; i++)
        shuf->aseq[i][msa->alen] = '\0';

      /* Text sequences are 0..alen-1; columns alen-1 and up are already final. */
      for (alen = msa->alen; alen > 1; alen--)
        {
          pos = esl_rnd_Roll(r, alen);
          for (i = 0; i < msa->nseq; i++)
            {
              /* Swap inside <shuf> only: after the first exchange <shuf> no
               * longer mirrors <msa>, so reading the column from <msa> would
               * duplicate some columns and drop others when msa != shuf.
               */
              c                     = shuf->aseq[i][pos];
              shuf->aseq[i][pos]    = shuf->aseq[i][alen-1];
              shuf->aseq[i][alen-1] = c;
            }
        }
    }
#ifdef eslAUGMENT_ALPHABET
  else 
    {
      ESL_DSQ x;
      if (! (shuf->flags & eslMSA_DIGITAL)) ESL_EXCEPTION(eslEINVAL, "<shuf> must be in digital mode if <msa> is");

      /* Copy alen+2 codes: positions 0 and alen+1 are included in the copy. */
      if (msa != shuf) {
        for (i = 0; i < msa->nseq; i++)
          memcpy(shuf->ax[i], msa->ax[i], (msa->alen + 2) * sizeof(ESL_DSQ));
      }

      for (i = 0; i < msa->nseq; i++)
        shuf->ax[i][msa->alen+1] = eslDSQ_SENTINEL;

      /* Digital sequences are 1..alen, hence the +1 on the sampled position. */
      for (alen = msa->alen; alen > 1; alen--)
        {
          pos = esl_rnd_Roll(r, alen) + 1;
          for (i = 0; i < msa->nseq; i++)
            {
              x                 = shuf->ax[i][pos];   /* read from <shuf>, same reason as in text mode */
              shuf->ax[i][pos]  = shuf->ax[i][alen];
              shuf->ax[i][alen] = x;
            }
        }
    }
#endif /*eslAUGMENT_ALPHABET*/

  return eslOK;
}
Esempio n. 2
0
/* Fetch in a random sequence of length <L> from the pre-digitized
 * concatenated sequence database, select a random subseq, shuffle it
 * by the chosen algorithm; set dsq[0..L-1] of the caller's <dsq>
 * pointer to the resulting randomized segment (callers pass a pointer
 * into the interior of a larger digital sequence).
 *
 * If <logfp> is non-NULL, append one or more "<sqname> <from> <to>"
 * fields to current line, to record where the random segment was
 * selected from. This is useful in cases where we want to track back
 * the origin of a high-scoring segment, in case the randomization
 * wasn't good enough to obscure the identity of a segment.
 *
 * With "--iid" set, no database sequence is fetched (and nothing is
 * logged); the segment is generated from <cfg->fq> instead.
 *
 * A zero-length request is a no-op. All failures are fatal.
 *
 * Returns <eslOK>.
 */
static int
set_random_segment(ESL_GETOPTS *go, struct cfg_s *cfg, FILE *logfp, ESL_DSQ *dsq, int L)
{
    ESL_SQ  *sq           = NULL;
    int      minDPL       = esl_opt_GetInteger(go, "--minDPL");
    int      db_dependent = (esl_opt_GetBoolean(go, "--iid") == TRUE ? FALSE : TRUE);
    char    *pkey         = NULL;
    int      start        = 0;
    int      end          = 0;
    int64_t  Lseq;
    int      status;

    if (L==0) return eslOK;
    if (L > cfg->db_maxL) esl_fatal("can't fetch a segment of length %d; database max is %d\n", L, cfg->db_maxL);

    /* Allocate the working sequence only after the early exit above, so a
     * zero-length segment does not leak an ESL_SQ.
     */
    if ((sq = esl_sq_CreateDigital(cfg->abc)) == NULL) esl_fatal("failed to allocate working sequence");
    if (esl_sq_GrowTo(sq, L)                  != eslOK) esl_fatal("failed to allocate working sequence");

    /* fetch a random subseq from the source database */
    if (db_dependent)
    {
        /* Keep drawing random database entries until one is long enough. */
        do {
            free(pkey);
            if (esl_ssi_FindNumber(cfg->dbfp->data.ascii.ssi, esl_rnd_Roll(cfg->r, cfg->db_nseq), NULL, NULL, NULL, &Lseq, &pkey) != eslOK)
                esl_fatal("failed to look up a random seq");
        } while (Lseq < L);

        /* NOTE(review): if esl_rnd_Roll(r,n) yields 0..n-1, <start> ranges over
         * 1..Lseq-L, so the last possible window (start = Lseq-L+1) is never
         * sampled - confirm whether that is intended before changing it.
         */
        start = 1 + esl_rnd_Roll(cfg->r, Lseq-L);
        end   = start + L - 1;
        if (esl_sqio_FetchSubseq(cfg->dbfp, pkey, start, end, sq) != eslOK) esl_fatal("failed to fetch subseq");
        esl_sq_ConvertDegen2X(sq);
    }

    /* log sequence source info: <name> <start> <end> */
    if (logfp != NULL && db_dependent)
        fprintf(logfp, " %-15s %5d %5d", pkey, start, end);

    /* Now apply the appropriate randomization algorithm, in place on sq->dsq */
    if      (esl_opt_GetBoolean(go, "--mono"))    status = esl_rsq_XShuffle  (cfg->r, sq->dsq, L, sq->dsq);
    else if (esl_opt_GetBoolean(go, "--di")) {
        /* segments shorter than --minDPL fall back to a mono-residue shuffle */
        if (L < minDPL)                             status = esl_rsq_XShuffle  (cfg->r, sq->dsq, L, sq->dsq);
        else                                        status = esl_rsq_XShuffleDP(cfg->r, sq->dsq, L, cfg->abc->Kp, sq->dsq);
    }
    else if (esl_opt_GetBoolean(go, "--markov0")) status = esl_rsq_XMarkov0  (cfg->r, sq->dsq, L, cfg->abc->Kp, sq->dsq);
    else if (esl_opt_GetBoolean(go, "--markov1")) status = esl_rsq_XMarkov1  (cfg->r, sq->dsq, L, cfg->abc->Kp, sq->dsq);
    else if (esl_opt_GetBoolean(go, "--reverse")) status = esl_rsq_XReverse  (sq->dsq, L, sq->dsq);
    else if (esl_opt_GetBoolean(go, "--iid"))     status = esl_rsq_xIID      (cfg->r, cfg->fq, cfg->abc->K, L, sq->dsq);
    else                                          status = eslEINCONCEIVABLE;
    if (status != eslOK) esl_fatal("esl's shuffling failed");

    /* sq->dsq is 1..L; skip its leading position when copying out */
    memcpy(dsq, sq->dsq+1, sizeof(ESL_DSQ) * L);
    esl_sq_Destroy(sq);
    free(pkey);
    return eslOK;
}
Esempio n. 3
0
/* Function:  p7_anchors_SampleFromTrace()
 * Synopsis:  Make a reasonable anchor set from a trace.
 *
 * Purpose:   Build an anchor set for trace <tr> by picking, for each
 *            of its domains, one match state uniformly at random
 *            (using <rng>) and recording that state's sequence and
 *            model coordinates. The result is stored in <anch>,
 *            which is resized as needed.
 *
 *            <tr> must be indexed by the caller with <p7_trace_Index()>.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on reallocation failure.
 */
int
p7_anchors_SampleFromTrace(P7_ANCHORS *anch, ESL_RANDOMNESS *rng, const P7_TRACE *tr)
{
    const int ndom = tr->ndom;
    int       dom;        /* domain index, 0..ndom-1 as P7_TRACE numbers them */
    int       z;          /* position in the trace                            */
    int       nmatch;     /* number of match states in the current domain     */
    int       pick;       /* countdown to the chosen match state              */
    int       status;

    status = p7_anchors_Resize(anch, ndom);
    if (status != eslOK) return status;

    for (dom = 0; dom < ndom; dom++)
    {
        /* Count the match states of this domain. */
        nmatch = 0;
        for (z = tr->tfrom[dom]; z <= tr->tto[dom]; z++)
            if (p7_trace_IsM(tr->st[z])) nmatch++;
        ESL_DASSERT1(( nmatch ));

        /* Choose the pick'th match state, pick = 1..nmatch. */
        pick = 1 + esl_rnd_Roll(rng, nmatch);

        /* Walk the domain again, counting down on each match state. <z> is
         * advanced after every test, so it ends one past the chosen state.
         */
        z = tr->tfrom[dom];
        while (pick)
        {
            if (p7_trace_IsM(tr->st[z])) pick--;
            z++;
        }
        ESL_DASSERT1(( p7_trace_IsM(tr->st[z-1]) ));

        /* P7_ANCHORS numbers its anchors 1..D, one off from the trace's domains. */
        anch->a[dom+1].i0 = tr->i[z-1];
        anch->a[dom+1].k0 = tr->k[z-1];
    }

    p7_anchor_SetSentinels(anch->a, ndom, tr->L, tr->M);
    anch->D = ndom;
    return eslOK;
}
Esempio n. 4
0
/* utest_ReadWrite()
 * Round-trip test for P7_BG file I/O: sample random background
 * frequencies, write the background to a temp file, zero the in-memory
 * copy, read it back, and require that the frequencies match the
 * sampled ones within a tolerance of 0.01. Any failure is fatal.
 */
static void
utest_ReadWrite(ESL_RANDOMNESS *rng)
{
  char          msg[]       = "bg Read/Write unit test failed";
  char          tmpfile[32] = "esltmpXXXXXX";
  FILE         *fp          = NULL;
  ESL_ALPHABET *abc         = NULL;
  float        *fq          = NULL;
  P7_BG        *bg          = NULL; 
  int           abctype     = 1 + esl_rnd_Roll(rng, 5);   /* alphabet type code, 1..5 */

  abc = esl_alphabet_Create(abctype);
  if (abc == NULL) esl_fatal(msg);

  bg = p7_bg_Create(abc);
  if (bg == NULL) esl_fatal(msg);

  fq = malloc(sizeof(float) * abc->K);
  if (fq == NULL) esl_fatal(msg);

  /* Resample until every frequency is >= 0.001: smaller p's will get
   * rounded off and fail FCompare().
   */
  for (;;)
    {
      if (esl_dirichlet_FSampleUniform(rng, abc->K, fq) != eslOK) esl_fatal(msg);
      if (! (esl_vec_FMin(fq, abc->K) < 0.001)) break;
    }
  esl_vec_FCopy(fq, abc->K, bg->f);

  /* Write the background out... */
  if (esl_tmpfile_named(tmpfile, &fp) != eslOK) esl_fatal(msg);
  if (p7_bg_Write(fp, bg)             != eslOK) esl_fatal(msg);
  fclose(fp);

  /* ...wipe the in-memory frequencies, then read them back and compare. */
  esl_vec_FSet(bg->f, bg->abc->K, 0.0);
  if (p7_bg_Read(tmpfile, bg, NULL) != eslOK) esl_fatal(msg);
  if (esl_vec_FCompare(fq, bg->f, bg->abc->K, 0.01) != eslOK) esl_fatal(msg);

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  free(fq);
  remove(tmpfile);
}
Esempio n. 5
0
/* sample_endpoints()
 * Incept:    SRE, Mon Jan 22 10:43:20 2007 [Janelia]
 *
 * Purpose:   Sample a start node and an end node for a pass through
 *            profile <gm>, using random number source <r>; return
 *            them via <ret_kstart> and <ret_kend>.
 *
 *            The start node is drawn from a distribution
 *            back-calculated from the profile's B->Mk scores, each
 *            weighted by the number of exits reachable from that
 *            entry. The end node is then drawn uniformly from
 *            kstart..M, i.e. exits j are assumed equiprobable for a
 *            given entry i: $a_{ij} =$ constant $\forall j$.
 *
 *            By construction, the entry at node <kstart> is into a
 *            match state, but the exit from node <kend> might turn
 *            out to be from either a match or delete state.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation error; both results are then
 *            set to 0.
 *            
 * Xref:      STL11/138           
 */
static int
sample_endpoints(ESL_RANDOMNESS *r, const P7_PROFILE *gm, int *ret_kstart, int *ret_kend)
{
  const int M       = gm->M;
  float    *entry_p = NULL;     /* unnormalized entry weights, indexed 0..M */
  int       node;
  int       nexits;
  int       kstart, kend;
  int       status;

  /* Rebuilding the entry distribution from the B->Mk scores costs a
   * little time on every call, but this isn't called often.
   */
  ESL_ALLOC(entry_p, sizeof(float) * (M+1));
  entry_p[0] = 0.0f;
  for (node = 1; node <= M; node++)
    {
      nexits        = M - node + 1;                                 /* exits available from this entry */
      entry_p[node] = exp(p7P_TSC(gm, node-1, p7P_BM)) * nexits;
    }

  kstart = esl_rnd_FChoose(r, entry_p, M+1);          /* entry node, from that distribution */
  kend   = kstart + esl_rnd_Roll(r, M-kstart+1);      /* exit node, uniform on kstart..M    */

  free(entry_p);
  *ret_kstart = kstart;
  *ret_kend   = kend;
  return eslOK;
  
 ERROR:
  free(entry_p);
  *ret_kstart = 0;
  *ret_kend   = 0;
  return status;
}
/* Function:  esl_dst_XAverageId()
 * Synopsis:  Calculate avg identity for digital MSA 
 * Incept:    SRE, Fri May 18 15:19:14 2007 [Janelia]
 *
 * Purpose:   Calculates the average pairwise fractional identity in
 *            a digital multiple sequence alignment <ax>, consisting of <N>
 *            aligned digital sequences of identical length.
 *            
 *            If an exhaustive calculation would require more than
 *            <max_comparisons> pairwise comparisons, then instead of
 *            looking at all pairs, calculate the average over a
 *            stochastic sample of <max_comparisons> random pairs.
 *            This allows the routine to work efficiently even on very
 *            deep MSAs.
 *            
 *            Each fractional pairwise identity (range $[0..$ pid $..1]$)
 *            is calculated using <esl_dst_XPairId()>.
 *
 *            For <N> <= 1 there are no pairs; the average is defined
 *            as 1.0.
 *
 * Args:      abc             - digital alphabet of the alignment
 *            ax              - aligned digital sequences, [0..N-1]
 *            N               - number of sequences
 *            max_comparisons - cap on the number of pairwise comparisons
 *            ret_id          - RETURN: average fractional identity
 *
 * Returns:   <eslOK> on success, and <*ret_id> contains the average
 *            fractional identity.
 *
 * Throws:    <eslEMEM> on allocation failure.
 *            <eslEINVAL> if any of the aligned sequence pairs aren't 
 *            of the same length.
 *            In either case, <*ret_id> is set to 0.
 */
int
esl_dst_XAverageId(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, int max_comparisons, double *ret_id)
{
  ESL_RANDOMNESS *r   = NULL;
  double          id;
  double          sum = 0.;     /* running total of pairwise identities; must start at zero */
  int             i,j,n;
  int             status;
  
  if (N <= 1) { *ret_id = 1.; return eslOK; }
  *ret_id = 0.;

  /* Is N small enough that we can average over all pairwise comparisons? 
     watch out for numerical overflow in this: Pfam N's easily overflow when squared.
     The first two tests bound N before N*(N-1) is ever evaluated.
   */
  if (N <= max_comparisons &&
      N <= sqrt(2. * max_comparisons) &&
      (N * (N-1) / 2) <= max_comparisons)
    {
      for (i = 0; i < N; i++)
        for (j = i+1; j < N; j++)
          {
            if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) goto ERROR;
            sum += id;
          }
      sum /= (double) (N * (N-1) / 2);
    }

  /* If nseq is large, calculate average over a stochastic sample. */
  else				
    {
      if ((r = esl_randomness_CreateTimeseeded()) == NULL) { status = eslEMEM; goto ERROR; }

      for (n = 0; n < max_comparisons; n++)
        {
          do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
          if ((status = esl_dst_XPairId(abc, ax[i], ax[j], &id, NULL, NULL)) != eslOK) goto ERROR;
          sum += id;
        }
      sum /= (double) max_comparisons;
      esl_randomness_Destroy(r);
    }

  *ret_id = sum;
  return eslOK;

 ERROR:
  /* release the generator if the sampling branch failed part way through */
  if (r != NULL) esl_randomness_Destroy(r);
  *ret_id = 0.;
  return status;
}
Esempio n. 7
0
/* Function:  esl_msashuffle_Bootstrap()
 * Synopsis:  Bootstrap sample an MSA.
 * Incept:    SRE, Tue Jan 22 11:05:07 2008 [Janelia]
 *
 * Purpose:   Fill <bootsample> with a bootstrap resample of <msa>:
 *            each of its <alen> columns is a copy of a column of
 *            <msa> chosen uniformly at random, with replacement,
 *            using random generator <r>.
 *            
 *            <bootsample> is allocated by the caller and must not be
 *            the same space as <msa>; a bootstrap sample cannot be
 *            taken "in place". Everything other than the aligned
 *            sequences themselves (sequence names, MSA name, other
 *            annotation) is the caller's to set up; cloning <msa>
 *            into <bootsample> beforehand is the easy way.
 *
 *            <msa> and <bootsample> must be in the same mode, both
 *            digital or both text.
 *
 * Returns:   <eslOK> on success, and the alignment in <bootsample> is
 *            set to be a bootstrap resample of the alignment in <msa>.
 *
 * Throws:    <eslEINVAL> if <msa>,<bootsample> aren't in the same mode
 *            (digital vs. text).
 */
int 
esl_msashuffle_Bootstrap(ESL_RANDOMNESS *r, ESL_MSA *msa, ESL_MSA *bootsample)
{
  int src_digital = ((msa->flags        & eslMSA_DIGITAL) != 0);
  int dst_digital = ((bootsample->flags & eslMSA_DIGITAL) != 0);
  int seq;      /* sequence index            */
  int dest;     /* column being filled       */
  int src;      /* column sampled from <msa> */

  /* contract check: modes must agree */
  if (src_digital != dst_digital)
    ESL_EXCEPTION(eslEINVAL, "<msa> and <bootsample> must both be in digital or text mode");

  if (! src_digital)
    {
      /* text mode: columns are 0..alen-1, then a NUL terminator */
      dest = 0;
      while (dest < msa->alen)
        {
          src = esl_rnd_Roll(r, msa->alen);
          for (seq = 0; seq < msa->nseq; seq++)
            bootsample->aseq[seq][dest] = msa->aseq[seq][src];
          dest++;
        }

      for (seq = 0; seq < msa->nseq; seq++)
        bootsample->aseq[seq][msa->alen] = '\0';
    }
#ifdef eslAUGMENT_ALPHABET
  else
    {
      /* digital mode: columns are 1..alen, bracketed by sentinels at 0 and alen+1 */
      for (seq = 0; seq < msa->nseq; seq++)
        {
          bootsample->ax[seq][0]           = eslDSQ_SENTINEL;
          bootsample->ax[seq][msa->alen+1] = eslDSQ_SENTINEL;
        }

      dest = 1;
      while (dest <= msa->alen)
        {
          src = 1 + esl_rnd_Roll(r, msa->alen);
          for (seq = 0; seq < msa->nseq; seq++)
            bootsample->ax[seq][dest] = msa->ax[seq][src];
          dest++;
        }
    }
#endif /*eslAUGMENT_ALPHABET*/

  return eslOK;
}
/* Function:  esl_dst_CAverageId()
 * Synopsis:  Calculate avg identity for multiple alignment
 * Incept:    SRE, Fri May 18 15:02:38 2007 [Janelia]
 *
 * Purpose:   Calculates the average pairwise fractional identity in
 *            a multiple sequence alignment <as>, consisting of <N>
 *            aligned character sequences of identical length.
 *            
 *            If an exhaustive calculation would require more than
 *            <max_comparisons> pairwise comparisons, then instead of
 *            looking at all pairs, calculate the average over a
 *            stochastic sample of <max_comparisons> random pairs.
 *            This allows the routine to work efficiently even on very
 *            deep MSAs.
 *            
 *            Each fractional pairwise identity (range $[0..$ pid $..1]$
 *            is calculated using <esl_dsq_CPairId()>.
 *
 * Returns:   <eslOK> on success, and <*ret_id> contains the average
 *            fractional identity.
 *
 * Throws:    <eslEMEM> on allocation failure.
 *            <eslEINVAL> if any of the aligned sequence pairs aren't 
 *            of the same length.
 *            In either case, <*ret_id> is set to 0.
 */
int
esl_dst_CAverageId(char **as, int N, int max_comparisons, double *ret_id)
{
  int    status;
  double id;
  double sum;
  int    i,j,n;
  
  if (N <= 1) { *ret_id = 1.; return eslOK; }
  *ret_id = 0.;

  /* Is nseq small enough that we can average over all pairwise comparisons? */
  if ((N * (N-1) / 2) <= max_comparisons)
    {
      for (i = 0; i < N; i++)
	for (j = i+1; j < N; j++)
	  {
	    if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status;
	    sum += id;
	  }
      id /= (double) (N * (N-1) / 2);
    }

  /* If nseq is large, calculate average over a stochastic sample. */
  else				
    {
      ESL_RANDOMNESS *r = esl_randomness_CreateTimeseeded();

      for (n = 0; n < max_comparisons; n++)
	{
	  do { i = esl_rnd_Roll(r, N); j = esl_rnd_Roll(r, N); } while (j == i); /* make sure j != i */
	  if ((status = esl_dst_CPairId(as[i], as[j], &id, NULL, NULL)) != eslOK) return status;
	  sum += id;
	}
      id /= (double) max_comparisons;
      esl_randomness_Destroy(r);
    }

  *ret_id = id;
  return eslOK;
}
Esempio n. 9
0
/* generate_testfile()
 * Write an <nlines>-line test file to a new temp file, whose name is
 * returned in <tmpfile>. Lines come in alternating runs of 1..10
 * lines each; lines in one kind of run start with '#', lines in the
 * other kind start with a space, and every line then carries its
 * 0-based line number. <is_data[i]> is set to TRUE for each '#' line
 * and FALSE for the others. Failure to open the temp file is fatal.
 */
static void
generate_testfile(ESL_RANDOMNESS *rng, char *tmpfile, int *is_data, int nlines)
{
  char *msg = "esl_recorder:: test file generator failed";
  FILE *fp  = NULL;
  int   in_block;     /* TRUE | FALSE: are we in a '#' run?    */
  int   run_left;     /* lines remaining in the current run    */
  int   line;
  char  mark;

  in_block = esl_rnd_Roll(rng, 2);
  run_left = 1 + esl_rnd_Roll(rng, 10);      /* 1..10 */

  if (esl_tmpfile_named(tmpfile, &fp) != eslOK) esl_fatal(msg);

  line = 0;
  while (line < nlines)
    {
      if (in_block) { is_data[line] = TRUE;  mark = '#'; }
      else          { is_data[line] = FALSE; mark = ' '; }
      fprintf(fp, "%c%d\n", mark, line);

      /* At the end of a run, flip to the other kind and draw a new run length. */
      run_left--;
      if (run_left == 0)
        {
          in_block = ! in_block;
          run_left = 1 + esl_rnd_Roll(rng, 10);  /* 1..10 */
        }
      line++;
    }
  fclose(fp);
}
Esempio n. 10
0
/* Function:  p7_anchors_Sample()
 * Synopsis:  Sample a randomized anchor set, for testing.
 *
 * Purpose:   Randomly generate an anchor set for a profile of
 *            length <M> compared to a sequence of length <L>,
 *            with a random number of up to <maxD> anchors.
 *
 *            The anchors' sequence positions <i0> are distinct and
 *            sorted, so there can be no more than <L> of them: the
 *            sampled number of anchors is capped at <L>.
 *
 * Args:      rng  - random number generator
 *            L    - sequence length
 *            M    - profile length
 *            maxD - maximum number of anchors
 *            anch - RETURN: the sampled anchor set (resized as needed)
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    whatever <p7_anchors_Resize()> or the allocation of
 *            temporary space reports on failure.
 */
int
p7_anchors_Sample(ESL_RANDOMNESS *rng, int L, int M, int maxD, P7_ANCHORS *anch)
{
    int      D   = 1 + esl_rnd_Roll(rng, maxD);
    int32_t *tmp = NULL;
    int      i,d,r;
    int      status;

    /* Only the first min(D,L) slots of <tmp> get filled by the loop below;
     * cap D so no uninitialized slot is ever sorted or copied out.
     */
    if (D > L) D = (L > 0 ? L : 0);

    if ((status = p7_anchors_Resize(anch, D)) != eslOK) goto ERROR;

    /* A reservoir sort like algorithm samples a combination of <D> i0 anchors, w/o replacement.
     * NOTE(review): the replacement slot is drawn with Roll(L) rather than the
     * Roll(i+1) of textbook reservoir sampling, so the combination is random
     * but presumably not uniform; left as is since this is a test sampler.
     */
    ESL_ALLOC(tmp, sizeof(int32_t) * (D > 0 ? D : 1));   /* never request a zero-size block */
    for (i = 0; i < L; i++)
    {
        if (i < D) tmp[i] = i+1;
        else {
            r = esl_rnd_Roll(rng, L);
            if (r < D) tmp[r] = i+1;
        }
    }
    esl_vec_ISortIncreasing(tmp, D);

    for (d = 1; d <= D; d++) {
        anch->a[d].i0 = tmp[d-1];                   // the <D> i0's are sorted
        anch->a[d].k0 = 1 + esl_rnd_Roll(rng, M);   // k0's are independent, uniform on 1..M
    }

    p7_anchor_SetSentinels(anch->a, D, L, M);
    anch->D = D;

    free(tmp);
    return eslOK;

ERROR:
    free(tmp);
    return status;
}
Esempio n. 11
0
/* Function:  p7_tophits_TestSample()
 * Synopsis:  Sample a random, bogus, mostly-syntactically-valid P7_TOPHITS
 *
 * Purpose:   Using random number generator <rng>, build a random
 *            <P7_TOPHITS> object holding a fixed number of randomly
 *            sampled hits, and hand it back through <*ret_th>. The
 *            object is allocated here; the caller frees it with
 *            <p7_tophits_Destroy()>.
 *            
 *            The <th->hit[]> array of 'sorted' pointers is left in a
 *            random permutation. The reported/included counts and the
 *            <th->is_sorted*> flags are drawn at random too, and bear
 *            no relation to the actual order of <th->hit[]>. (The
 *            main use is testing faithful communication of the
 *            object, including its sorted ptrs.)
 *
 * Returns:   <eslOK> on success, and <*ret_th> points to the sampled
 *            <P7_TOPHITS> object.
 *
 * Throws:    <eslEMEM> if the object can't be created, or the status
 *            of a failed <p7_hit_TestSample()>; <*ret_th> is then NULL.
 * 
 * Notes:     Easel code spec requires that TestSample() generates an
 *            object that passes whatever Validate() looks for.
 */
int
p7_tophits_TestSample(ESL_RANDOMNESS *rng, P7_TOPHITS **ret_th)
{
  const int   nhits = 1049;	/* prime. don't make it divisible by any chunk size. */
  P7_TOPHITS *th    = NULL;
  P7_HIT     *held;             /* temporary for pointer swaps */
  int         idx, pick, left;
  int         status;

  th = p7_tophits_Create(nhits);
  if (th == NULL) { status = eslEMEM; goto ERROR; }

  th->nreported            = 1 + esl_rnd_Roll(rng, nhits);
  th->nincluded            = 1 + esl_rnd_Roll(rng, nhits);
  th->is_sorted_by_sortkey = esl_rnd_Roll(rng, 2);
  if (! th->is_sorted_by_sortkey)
    th->is_sorted_by_seqidx = esl_rnd_Roll(rng, 2);
  
  /* Sample the hits one at a time. th->N is bumped after each success so
   * that p7_tophits_Destroy() works on a partially built <th> on error.
   */
  idx = 0;
  while (idx < nhits)
    {
      status = p7_hit_TestSample(rng, &(th->unsrt[idx]));
      if (status != eslOK) goto ERROR;
      th->N++;
      idx++;
    }

  /* Point hit[] at the unsorted hits, then scramble it: Fisher-Yates shuffle */
  for (idx = 0; idx < th->N; idx++)
    th->hit[idx] = &(th->unsrt[idx]);

  left = th->N;
  while (left > 1)
    {
      pick             = esl_rnd_Roll(rng, left);
      held             = th->hit[pick];
      th->hit[pick]    = th->hit[left-1];
      th->hit[left-1]  = held;
      left--;
    }

  *ret_th = th;
  return eslOK;

 ERROR:
  if (th) p7_tophits_Destroy(th);
  *ret_th = NULL;
  return status;
}
Esempio n. 12
0
/* synthesize_negatives()
 * Write <nneg> decoy sequences to <cfg->out_seqfp> in FASTA format.
 * Each decoy borrows the five segment lengths (L1/d1n/L2/d2n/L3) of a
 * randomly chosen test sequence, and each segment is filled
 * independently by set_random_segment(). One summary line per decoy
 * (name, length, segment lengths, plus whatever set_random_segment()
 * logs) goes to <cfg->negsummfp>.
 *
 * Returns <eslOK>.
 */
static int
synthesize_negatives(ESL_GETOPTS *go, struct cfg_s *cfg, int nneg)
{
  ESL_SQ *sq = esl_sq_CreateDigital(cfg->abc);
  int     which;                /* index of the test seq whose layout we copy */
  int     decoy;
  int     totL;                 /* total length of the decoy                  */
  int     L1,L2,L3,d1n,d2n;
  int     off;                  /* offset in sq->dsq of the next segment      */

  decoy = 0;
  while (decoy < nneg)
    {
      /* Select a random test seq, to use its same segments */
      which = esl_rnd_Roll(cfg->r, cfg->ntest);

      totL = cfg->test_lens[which].L;
      L1   = cfg->test_lens[which].L1;
      d1n  = cfg->test_lens[which].d1n;
      L2   = cfg->test_lens[which].L2;
      d2n  = cfg->test_lens[which].d2n;
      L3   = cfg->test_lens[which].L3;

      esl_sq_GrowTo(sq, totL);

      esl_sq_FormatName(sq, "decoy%d", decoy+1);
      esl_sq_FormatDesc(sq, "L=%d in segments: %d/%d/%d/%d/%d", totL, L1, d1n, L2, d2n, L3);
      sq->n = totL;

      fprintf(cfg->negsummfp, "%-15s %5d %5d %5d %5d %5d %5d", 
              sq->name, (int) sq->n,
              L1, d1n, L2, d2n, L3);

      /* Sentinels at both ends, then the five segments laid end to end from position 1 */
      sq->dsq[totL+1] = eslDSQ_SENTINEL;
      sq->dsq[0]      = sq->dsq[totL+1];

      off = 1;
      set_random_segment(go, cfg, cfg->negsummfp, sq->dsq + off, L1);   off += L1;
      set_random_segment(go, cfg, cfg->negsummfp, sq->dsq + off, d1n);  off += d1n;
      set_random_segment(go, cfg, cfg->negsummfp, sq->dsq + off, L2);   off += L2;
      set_random_segment(go, cfg, cfg->negsummfp, sq->dsq + off, d2n);  off += d2n;
      set_random_segment(go, cfg, cfg->negsummfp, sq->dsq + off, L3);

      fprintf(cfg->negsummfp, "\n");
  
      esl_sqio_Write(cfg->out_seqfp, sq, eslSQFILE_FASTA, FALSE);

      esl_sq_Reuse(sq);
      decoy++;
    }

  esl_sq_Destroy(sq);
  return eslOK;
}
Esempio n. 13
0
/* Exchange elements <pick> and <last> of array <arr>, using <tmpvar> as
 * scratch. Local to esl_msashuffle_PermuteSequenceOrder().
 */
#define PERMUTE_EXCHANGE(arr, tmpvar) \
  do { (tmpvar) = (arr)[pick]; (arr)[pick] = (arr)[last]; (arr)[last] = (tmpvar); } while (0)

/* Function:  esl_msashuffle_PermuteSequenceOrder()
 * Synopsis:  Permutes the order of the sequences.
 *
 * Purpose:   Put the sequences of <msa> into a random order, in
 *            place, using random generator <r>. Every per-sequence
 *            field travels with its sequence: name, weight, and
 *            whichever of the optional accession, description,
 *            ss/sa/pp annotation, length arrays, and GS/GR tags are
 *            present. If <msa> has a name index, it is rebuilt to
 *            match the new order.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    (no abnormal error conditions)
 */
int
esl_msashuffle_PermuteSequenceOrder(ESL_RANDOMNESS *r, ESL_MSA *msa)
{
  void   *ptr;        /* scratch for pointer-valued fields */
  double  wgt;        /* scratch for weights               */
  int64_t len;        /* scratch for length fields         */
  int     nleft;      /* sequences not yet placed          */
  int     pick;       /* randomly chosen index, 0..nleft-1 */
  int     last;       /* nleft-1: the slot being settled   */
  int     idx, tag;

  nleft = msa->nseq;
  while (nleft > 1)
    {
      pick = esl_rnd_Roll(r, nleft);
      last = nleft - 1;
      
      if ( ! (msa->flags & eslMSA_DIGITAL)) { PERMUTE_EXCHANGE(msa->aseq, ptr); }
#ifdef eslAUGMENT_ALPHABET
      else                                  { PERMUTE_EXCHANGE(msa->ax,   ptr); }
#endif
      /* mandatory per-sequence fields */
      PERMUTE_EXCHANGE(msa->sqname, ptr);
      PERMUTE_EXCHANGE(msa->wgt,    wgt);

      /* optional per-sequence fields, only if present */
      if (msa->sqacc)  PERMUTE_EXCHANGE(msa->sqacc,  ptr);
      if (msa->sqdesc) PERMUTE_EXCHANGE(msa->sqdesc, ptr);
      if (msa->ss)     PERMUTE_EXCHANGE(msa->ss,     ptr);
      if (msa->sa)     PERMUTE_EXCHANGE(msa->sa,     ptr);
      if (msa->pp)     PERMUTE_EXCHANGE(msa->pp,     ptr);
      if (msa->sqlen)  PERMUTE_EXCHANGE(msa->sqlen,  len);
      if (msa->sslen)  PERMUTE_EXCHANGE(msa->sslen,  len);
      if (msa->salen)  PERMUTE_EXCHANGE(msa->salen,  len);
      if (msa->pplen)  PERMUTE_EXCHANGE(msa->pplen,  len);

      /* per-sequence tagged annotation */
      for (tag = 0; tag < msa->ngs; tag++)
        if (msa->gs[tag]) PERMUTE_EXCHANGE(msa->gs[tag], ptr);
      for (tag = 0; tag < msa->ngr; tag++)
        if (msa->gr[tag]) PERMUTE_EXCHANGE(msa->gr[tag], ptr);

      nleft--;
    }

  /* if <msa> has a keyhash that maps seqname => seqidx, we'll need to rebuild it. */
  if (msa->index) 
    {
      esl_keyhash_Reuse(msa->index);
      for (idx = 0; idx < msa->nseq; idx++)
        esl_keyhash_Store(msa->index, msa->sqname[idx], -1, NULL);
    }

  return eslOK;
}
#undef PERMUTE_EXCHANGE
Esempio n. 14
0
/* Function:  p7_coords2_Sample()
 * Synopsis:  Sample random domain segment coords.
 *
 * Purpose:   Sample random domain segment positions, start/end pairs,
 *            sorted and nonoverlapping, on a sequence of length <L>,
 *            using random number generator <rng>. The number of
 *            segments is drawn from 1..<maxseg>. Results are stored
 *            in <c2>, which is grown as needed.
 *
 *            <byp_wrk> is an optional workspace of <L> int32_t's,
 *            handled with the bypass idiom: internal (allocated and
 *            freed here), returned (allocated here, handed to the
 *            caller), or provided (the caller's, reallocated here and
 *            handed back).
 *
 *            The first 2*nseg shuffled coords are used, so the caller
 *            must make sure that 2*<maxseg> <= <L>.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    the failure status of the workspace allocation or of
 *            <p7_coords2_GrowTo()>. A provided workspace remains the
 *            caller's; a workspace allocated here is freed.
 */
int
p7_coords2_Sample(ESL_RANDOMNESS *rng, P7_COORDS2 *c2, int32_t maxseg, int32_t L, int32_t **byp_wrk)
{
  int32_t *wrk  = NULL;
  int32_t  nseg = 1 + esl_rnd_Roll(rng, maxseg); /* 1..maxseg */
  int32_t  i;
  int      status;

  /* Using the bypass idiom, make sure we have a workspace for <L> coords */
  if      (esl_byp_IsInternal(byp_wrk) ) ESL_ALLOC(wrk, sizeof(int32_t) * L);
  else if (esl_byp_IsReturned(byp_wrk) ) ESL_ALLOC(wrk, sizeof(int32_t) * L);
  else if (esl_byp_IsProvided(byp_wrk) ) { wrk = *byp_wrk; ESL_REALLOC(wrk, sizeof(int32_t) * L); }
			      
  /* We put the numbers 1..L into the workspace <wrk>; shuffle them;
   * then sort the top nseg*2 of them. This gives us <nseg>
   * nonoverlapping start/end coords, in order.
   */
  for (i = 0; i < L; i++) wrk[i] = i+1;
  esl_vec_IShuffle(rng, wrk, L);
  esl_vec_ISortIncreasing(wrk, nseg*2);

  /* Store those randomized coords now in the data structure.
   * Don't touch c2->arr[] unless it was actually grown.
   */
  if ((status = p7_coords2_GrowTo(c2, nseg)) != eslOK) goto ERROR;
  c2->n    = nseg;
  for (i = 0; i < nseg; i++)
    {
      c2->arr[i].n1 = wrk[i*2];
      c2->arr[i].n2 = wrk[i*2+1];
    }
  
  /* Using the bypass idiom, recycle workspace, if we're supposed to */
  if      (esl_byp_IsInternal(byp_wrk)) free(wrk);
  else if (esl_byp_IsReturned(byp_wrk)) *byp_wrk = wrk;
  else if (esl_byp_IsProvided(byp_wrk)) *byp_wrk = wrk;
  return eslOK;

 ERROR:
  /* A provided workspace stays with the caller; pass back the current
   * pointer, since a successful realloc may have moved it. Anything we
   * allocated ourselves (internal or to-be-returned) is released here.
   */
  if (esl_byp_IsProvided(byp_wrk)) { if (wrk) *byp_wrk = wrk; }
  else                             free(wrk);
  return status;
}
Esempio n. 15
0
/* The esl_random() unit test:
 * a binned frequency test.
 *
 * Draws <n> values with esl_rnd_Roll() into <nbins> bins, computes
 * the chi-squared statistic against a uniform expectation, and fails
 * (fatally) if the resulting p-value is below 0.01. The generator is
 * initialized from <seed>; <be_verbose> prints the p-value.
 */
static void
utest_random(long seed, int n, int nbins, int be_verbose)
{
  ESL_RANDOMNESS *rng    = NULL;
  int            *tally  = NULL;    /* observed count per bin       */
  double          pvalue = 0.;
  double          chisq;
  double          expected, delta;
  int             draw, bin;

  tally = malloc(sizeof(int) * nbins);
  if (tally == NULL) esl_fatal("malloc failed");
  esl_vec_ISet(tally, nbins, 0);

  /* Creating a time-seeded generator and then re-initializing it with
   * <seed> exercises both CreateTimeseeded() and Init(), yet leaves a
   * reproducible chain; being reproducible, the test is known to
   * succeed even though it is statistical in nature.
   */
  rng = esl_randomness_CreateTimeseeded();
  if (rng == NULL)                            esl_fatal("randomness create failed");
  if (esl_randomness_Init(rng, seed) != eslOK) esl_fatal("randomness init failed");

  draw = 0;
  while (draw < n)
    {
      tally[esl_rnd_Roll(rng, nbins)]++;
      draw++;
    }

  /* X^2 value: \sum (o_i - e_i)^2 / e_i */
  chisq = 0.;
  for (bin = 0; bin < nbins; bin++)
    {
      expected = (double) n / (double) nbins;
      delta    = (double) tally[bin] - expected;
      chisq   += delta*delta/expected;
    }

  if (esl_stats_ChiSquaredTest(nbins, chisq, &pvalue) != eslOK) esl_fatal("chi squared eval failed");
  if (be_verbose) printf("random():  \t%g\n", pvalue);
  if (pvalue < 0.01) esl_fatal("chi squared test failed");

  esl_randomness_Destroy(rng);
  free(tally);
  return;
}
Esempio n. 16
0
/* qrna_shuffle_columns()
 * Shuffle the symbols of <xs> and <ys> among the <n> alignment columns
 * listed in <cols>, moving each column's x,y symbols together. Columns
 * not listed are untouched. The contents of <cols> are consumed.
 *
 * Each step picks a random entry <pos> of the still-active list
 * cols[0..n-1], exchanges the symbols of that column with those of the
 * column in the last entry, then retires the picked column by
 * overwriting its entry with the last one and shrinking the list.
 */
static void
qrna_shuffle_columns(ESL_RANDOMNESS *r, char *xs, char *ys, int *cols, int n)
{
  int  pos;
  int  a, b;
  char xsym, ysym;

  for (; n > 1; n--)
    {
      pos  = esl_rnd_Roll(r, n);
      a    = cols[pos];
      b    = cols[n-1];

      xsym  = xs[a];  ysym  = ys[a];
      xs[a] = xs[b];  ys[a] = ys[b];
      xs[b] = xsym;   ys[b] = ysym;

      cols[pos] = b;   /* entry n-1 drops out of the list on the next pass, so it needs no update */
    }
}

/* Function: esl_msashuffle_CQRNA()
 * Synopsis: Gap-preserving column shuffle of a pairwise alignment.
 * Incept:   SRE, Tue Jan 22 08:45:34 2008 [Market Street Cafe, Leesburg]
 *
 * Purpose:  Shuffle a pairwise alignment <x>,<y> while preserving the
 *           position of gaps, using the random number generator <r>.
 *           Return the shuffled alignment in <xs>,
 *           <ys>. Caller provides allocated space for <xs> and <ys>.
 *           
 *           An alphabet <abc> must also be provided, solely for the
 *           definition of gap characters. Because Easel's default
 *           alphabets (DNA, RNA, and protein) all use the same
 *           definition of gap characters <-_.>, you can actually
 *           provide any alphabet here, and get the same results.
 *           (This may save having to determine the alphabet of input
 *           sequences.)
 *           
 *           Works by doing three separate
 *           shuffles, of (1) columns with residues in both
 *           <x> and <y>, (2) columns with residue in <x> and gap in <y>,
 *           and (3) columns with gap in <x> and residue in <y>.
 *           Columns that are gaps in both are copied unchanged.
 *           
 *           <xs>,<x> and <ys>,<y> may be identical: that is, to shuffle
 *           an alignment "in place", destroying the original
 *           alignment, just call <esl_msashuffle_CQRNA(r, abc, x,y,x,y)>.
 *
 * Returns:  <eslOK> on success, and the shuffled alignment is 
 *           returned in <xs>, <ys>.
 *           
 * Throws:   <eslEINVAL> if <x> and <y> differ in length; <xs>,<ys>
 *           are not written in that case.
 *           <eslEMEM> on allocation failure.          
 */
int
esl_msashuffle_CQRNA(ESL_RANDOMNESS *r, ESL_ALPHABET *abc, char *x, char *y, char *xs, char *ys)
{
  int  L;
  int *xycol = NULL;   /* columns with residues in both x and y   */
  int *xcol  = NULL;   /* columns with residue in x, gap in y     */
  int *ycol  = NULL;   /* columns with gap in x, residue in y     */
  int  nxy, nx, ny;
  int  i;
  int  status;

  /* Validate before writing anything to the caller's output buffers. */
  L = strlen(x);
  if (strlen(y) != (size_t) L) ESL_XEXCEPTION(eslEINVAL, "sequences of different lengths in qrna shuffle");

  if (xs != x) strcpy(xs, x);
  if (ys != y) strcpy(ys, y);
  if (L == 0) return eslOK;    /* empty alignment: nothing to shuffle, nothing to allocate */

  /* First, construct three arrays containing lists of the column positions
   * of the three types of columns. (If a column contains gaps in both x and y,
   * we've already simply copied it to the shuffled sequence.)
   */
  ESL_ALLOC(xycol, sizeof(int) * L);
  ESL_ALLOC(xcol,  sizeof(int) * L);
  ESL_ALLOC(ycol,  sizeof(int) * L);
  nxy = nx = ny = 0;

  for (i = 0; i < L; i++)
    {
      if      (  esl_abc_CIsGap(abc, x[i]) &&   esl_abc_CIsGap(abc, y[i])) { continue; }
      else if (! esl_abc_CIsGap(abc, x[i]) && ! esl_abc_CIsGap(abc, y[i])) { xycol[nxy] = i; nxy++; }
      else if (  esl_abc_CIsGap(abc, x[i]))                                { ycol[ny] = i;   ny++;  }
      else if (  esl_abc_CIsGap(abc, y[i]))                                { xcol[nx] = i;   nx++;  }
    }

  /* Second, shuffle the sequences indirectly, one column class at a time.
   * The order of the three shuffles fixes how the random stream is consumed.
   */
  qrna_shuffle_columns(r, xs, ys, xycol, nxy);
  qrna_shuffle_columns(r, xs, ys, xcol,  nx);
  qrna_shuffle_columns(r, xs, ys, ycol,  ny);

  free(xycol); free(xcol); free(ycol);
  return eslOK;

 ERROR:
  free(xycol);
  free(xcol);
  free(ycol);
  return status;
}
Esempio n. 17
0
/* seq_shuffling()
 * SRE, Tue Jan 22 08:35:51 2008 [Market Street Cafe, Leesburg]
 *
 * Shuffling of input sequences.
 *
 * Purpose:  Open the sequence file named by command line argument 1
 *           of <go>, read it one sequence at a time, and for each
 *           input sequence write <-N> shuffled versions to <ofp> in
 *           format <outfmt>, using random number generator <r>.
 *           Which shuffling/randomization method is applied is chosen
 *           by the -m, -d, -k, -0, -1, -r, -w options in <go>.
 *
 *           Fixed-length (L>0) vs. full-length (L=0) modes are handled
 *           differently.
 *           In fixed-length mode:
 *             <shuff->seq> only needs to be allocated once, for L
 *             <targ> is an allocated copy of a random subseq of length L
 *             sequences < L residues long can't be shuffled (they are skipped)
 *           In full-length mode:
 *             <shuff->seq> is grown to length <sq->n> for each input seq
 *             <targ> just points to <sq->seq>
 *
 *           Output sequences are named "<name>-shuffled" when N is 1,
 *           and "<name>-shuffled-<i>" (i = 0..N-1) otherwise.
 *
 * Returns:  <eslOK> on success.
 *           Problems opening or parsing the sequence file are fatal
 *           (reported through esl_fatal()).
 *
 * Throws:   the status set by <ESL_ALLOC> if allocating <targ> fails.
 */
static int 
seq_shuffling(ESL_GETOPTS *go, ESL_RANDOMNESS *r, FILE *ofp, int outfmt)
{
  char       *seqfile = esl_opt_GetArg(go, 1);
  int         infmt   = eslSQFILE_UNKNOWN;
  ESL_SQFILE *sqfp    = NULL;
  ESL_SQ     *sq      = esl_sq_Create();
  ESL_SQ     *shuff   = esl_sq_Create();
  char       *targ    = NULL;
  int         N       = esl_opt_GetInteger(go, "-N");
  int         L       = esl_opt_GetInteger(go, "-L"); /* L>0 means select random fixed-len subseqs */
  int         kmers   = 0;
  int         i;
  int         status;
  
  if (esl_opt_GetString(go, "--informat") != NULL) {
    infmt = esl_sqio_EncodeFormat(esl_opt_GetString(go, "--informat"));
    /* The format string has a %s: it must be given the offending format name. */
    if (infmt == eslSQFILE_UNKNOWN) 
      esl_fatal("%s is not a valid input sequence file format for --informat", 
                esl_opt_GetString(go, "--informat")); 
  }

  if (esl_opt_IsOn(go, "-k")) kmers = esl_opt_GetInteger(go, "-k");

  status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile);
  else if (status == eslEFORMAT)   esl_fatal("Format of seqfile %s unrecognized.", seqfile);
  else if (status == eslEINVAL)    esl_fatal("Can't autodetect stdin or .gz.");
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  /* Fixed-length mode: output and target buffers are sized once, up front. */
  if (L>0) { 
    esl_sq_GrowTo(shuff, L);
    shuff->n = L;
    ESL_ALLOC(targ, sizeof(char) * (L+1));
  }

  while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
    {
      if (L == 0) {                  /* shuffling entire sequence   */
        esl_sq_GrowTo(shuff, sq->n); /* make sure shuff can hold sq */
        shuff->n = sq->n;
        targ = sq->seq;              /* borrowed pointer: not freed by us */
      } else if (sq->n < L) {        /* reject seqs < L long */
        esl_sq_Reuse(sq);            /* reset <sq> exactly as the normal path does before the next read */
        continue;
      }

      for (i = 0; i < N; i++)
        {
          if (L > 0) {          /* fixed-len mode: copy a random subseq */
            int pos = esl_rnd_Roll(r, sq->n - L + 1);
            strncpy(targ, sq->seq + pos, L);
            targ[L] = '\0';
          }

          /* Do the requested kind of shuffling */
          if      (esl_opt_GetBoolean(go, "-m"))  esl_rsq_CShuffle     (r, targ,        shuff->seq);  /* monoresidue shuffling */
          else if (esl_opt_GetBoolean(go, "-d"))  esl_rsq_CShuffleDP   (r, targ,        shuff->seq);  /* diresidue shuffling */
          else if (esl_opt_IsOn      (go, "-k"))  esl_rsq_CShuffleKmers(r, targ, kmers, shuff->seq);  /* shuffling of k-mers */
          else if (esl_opt_GetBoolean(go, "-0"))  esl_rsq_CMarkov0     (r, targ,        shuff->seq);  /* 0th order Markov */
          else if (esl_opt_GetBoolean(go, "-1"))  esl_rsq_CMarkov1     (r, targ,        shuff->seq);  /* 1st order Markov */
          else if (esl_opt_GetBoolean(go, "-r"))  esl_rsq_CReverse     (   targ,        shuff->seq);  /* reverse */
          else if (esl_opt_IsOn      (go, "-w")) { /* regionally shuffle */
            int W= esl_opt_GetInteger(go, "-w"); esl_rsq_CShuffleWindows(r, targ, W, shuff->seq);
          }

          /* Set the name of the shuffled sequence */
          if (N > 1) esl_sq_FormatName(shuff, "%s-shuffled-%d", sq->name, i);
          else       esl_sq_FormatName(shuff, "%s-shuffled", sq->name);

          /* Output the resulting sequence */
          esl_sqio_Write(ofp, shuff, outfmt, FALSE);

          /* don't need to reuse the shuffled sequence: we will use exactly the same memory */
        }
      esl_sq_Reuse(sq);
    }
  if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s):\n%s\n",
                                           sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
  else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s",
                                            status, sqfp->filename);

  if (L>0) free(targ);          /* only owned by us in fixed-length mode */
  esl_sq_Destroy(shuff);
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  return eslOK;

 ERROR:
  /* Only reachable from the ESL_ALLOC above, i.e. before <targ> can alias <sq->seq>. */
  if (targ != NULL) free(targ);
  esl_sq_Destroy(shuff);
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  return status;
}
Esempio n. 18
0
/* main()
 *
 * Select <nselect> lines at random from a file and print them to stdout.
 *
 * Command line: two arguments, the number of lines to select and the
 * name of the input file ("-" means read from stdin). The random number
 * generator is seeded from the --seed option.
 *
 * Each input line n (1-based) is given a random slot i in 0..n-1. If
 * i < nselect the line is inserted at slot i, the lines at and after
 * that slot are shifted down by one, and whatever falls off the end of
 * the array is discarded.
 *
 * If the input has fewer than <nselect> lines, all of its lines are
 * printed.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go       = NULL; 
  ESL_RANDOMNESS *r        = NULL;
  int             nselect  = 0;
  char           *filename = NULL;
  FILE           *fp       = NULL;
  char          **larr     = NULL;
  char           *buf      = NULL;
  int             buflen   = 0;
  char           *tmp      = NULL;
  int             i,j;
  int             n;
  int             nkept;	/* number of slots in larr[] that actually hold a line */

  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n",   go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") )                   cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go) != 2)                      cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");

  nselect  = atoi(esl_opt_GetArg(go, 1));
  filename = esl_opt_GetArg(go, 2);

  r = esl_randomness_Create(esl_opt_GetInteger(go, "--seed"));

  if ((larr = malloc(sizeof(char *) * nselect)) == NULL) esl_fatal("allocation failed");
  /* Slots must start out NULL: the shifting loop below moves the old
   * content of a not-yet-filled slot into <buf> and then free()s it.
   */
  for (i = 0; i < nselect; i++) larr[i] = NULL;

  if (strcmp(filename, "-") == 0) fp = stdin;
  else {
    if ((fp = fopen(filename, "r")) == NULL) esl_fatal("Failed to open file %s\n", filename);
  }

  n = 0;
  while (esl_fgets(&buf, &buflen, fp) == eslOK)
    {
      n++;
      i = esl_rnd_Roll(r, n);
      if (i < nselect) {
        /* Insert the new line at slot i, pushing later lines down one slot.
         * After the loop, <buf> holds whatever was pushed out (or NULL).
         */
        for (j = i; j < nselect && j < n; j++)
          {
            tmp     = larr[j];
            larr[j] = buf;
            buf     = tmp;
          }
        free(buf);
        buf    = NULL;	/* larr[] now owns the line; make esl_fgets() allocate a fresh buffer */
        buflen = 0; 
      }
    }  

  /* Only the first min(n, nselect) slots were ever filled. */
  nkept = (n < nselect) ? n : nselect;
  for (i = 0; i < nkept; i++) printf("%s", larr[i]);

  if (fp != stdin) fclose(fp);
  for (i = 0; i < nkept; i++) free(larr[i]);
  free(larr);
  free(buf);
  esl_randomness_Destroy(r);
  esl_getopts_Destroy(go);
  return 0;
}
Esempio n. 19
0
/* utest_block()
 *
 * Unit test of the block-marking interface of ESL_RECORDER.
 *
 * Reads <tmpfile> through a recorder created with a random initial
 * size in 1..N+1. The file is expected to have <N> lines; every line
 * carries its own 0-based line number starting at its second character
 * (checked with atoi(buf+1)), and lines belonging to a block start
 * with '#'. <is_data[i]> is nonzero for the lines that are expected to
 * show up inside blocks.
 *
 * The test alternates between skipping lines up to the next '#' line
 * and reading a run of '#' lines as a marked block, and verifies the
 * block contents returned by esl_recorder_GetBlock(). Up to
 * <max_reposition> times it jumps to a random line in the recorder's
 * current window, and up to <max_realloc> times it resizes the
 * recorder, each with probability 1/5 per iteration.
 *
 * Any failure is fatal (esl_fatal()); the function returns nothing.
 */
static void
utest_block(ESL_RANDOMNESS *rng, char *tmpfile, int *is_data, int N)
{
  char         *msg            = "esl_recorder:: block unit test failed";
  ESL_RECORDER *rc             = NULL;
  FILE         *fp             = NULL;
  int           linenumber     = 0; /* where we should be in the file */
  int           max_reposition = 2;
  int           max_realloc    = 2;
  int          *nseen1         = NULL; /* # of times we Read() each line */
  int          *nseen2         = NULL; /* # of times we see each line in a block */
  int           minalloc;
  int           roll;
  char         *buf;
  char        **block;
  int           from;
  int           n,i;
  int           status         = eslOK;

  if ((fp = fopen(tmpfile, "r"))           == NULL) esl_fatal(msg);
  roll = 1+esl_rnd_Roll(rng, N+1);	/* 1..N+1 */
  if ((rc = esl_recorder_Create(fp, roll)) == NULL) esl_fatal(msg);

  if ((nseen1  = malloc(sizeof(int) * N))   == NULL) esl_fatal(msg);
  if ((nseen2  = malloc(sizeof(int) * N))   == NULL) esl_fatal(msg);
  for (i = 0; i < N; i++) nseen1[i]  = 0;
  for (i = 0; i < N; i++) nseen2[i]  = 0;
  
  while (status == eslOK)
    {
      /* skip nondata lines (no # prefix) */
      /* this loop exits having just read the first '#' line of the next block */
      do {
	if (esl_recorder_Read(rc, &buf) == eslEOF)     goto DONE;   
	if (atoi(buf+1)                 != linenumber) esl_fatal(msg);
	if (esl_recorder_GetCurrent(rc) != linenumber) esl_fatal(msg);
	nseen1[linenumber]++;
	linenumber++;
      } while (*buf != '#');

      /* read block */
      /* the block starts at the '#' line we are sitting on; keep reading while lines start with '#' */
      from = esl_recorder_GetCurrent(rc);
      esl_recorder_MarkBlock(rc, from);
      do {
	if ((status = esl_recorder_Read(rc, &buf)) == eslEOF)   break;
	if (atoi(buf+1)                 != linenumber) esl_fatal(msg);
	if (esl_recorder_GetCurrent(rc) != linenumber) esl_fatal(msg);
	nseen1[linenumber]++;	
	linenumber++;
      } while (*buf == '#');
      
      /* get the block */
      /* status == eslOK means we stopped on a non-'#' line rather than at EOF;
       * presumably the returned count includes that terminating line, so drop it.
       */
      esl_recorder_GetBlock(rc, &block, NULL, NULL, &n);
      if (status == eslOK) n--;

      /* check the block */
      /* line i of the block must be file line from+i */
      for (i = 0; i < n; i++)
	{
	  if (atoi(block[i]+1) != from+i) esl_fatal(msg);
	  nseen2[from+i]++;	
	}

      /* unmark it */
      esl_recorder_UnmarkBlock(rc);

      /* some fraction of the time, reposition randomly */
      /* the new position is drawn from GetFirst()..GetLast(), and <linenumber> tracks it */
      if (status == eslOK && max_reposition && (roll = esl_rnd_Roll(rng, 5)) == 0)
	{
	  linenumber = esl_recorder_GetFirst(rc) + 
	    esl_rnd_Roll(rng, esl_recorder_GetLast(rc) - esl_recorder_GetFirst(rc) + 1);
	  if (esl_recorder_Position(rc, linenumber) != eslOK) esl_fatal(msg);
	  max_reposition--;
	}

      /* some fraction of the time, shrink the allocation */
      /* the new size is drawn from minalloc..rc->nalloc */
      if (status == eslOK && max_realloc && (roll = esl_rnd_Roll(rng, 5)) == 0)
	{
	  /* must keep at least nread-ncurr+1 lines, to keep curr line in window */
	  minalloc = rc->nread-rc->ncurr+1;
	  roll = minalloc + esl_rnd_Roll(rng, rc->nalloc-minalloc+1);
	  if (esl_recorder_ResizeTo(rc, roll) != eslOK) esl_fatal(msg);
	  max_realloc--;
	}
    }
  
 DONE:			
  /* we're EOF. We should be sitting on the last line. */
  if (esl_recorder_GetCurrent(rc) != N-1) esl_fatal(msg);

  /* We should have Read() every line at least once. */
  for (i = 0; i < N; i++) 
    if (! nseen1[i]) esl_fatal(msg);

  /* In reading blocks, we should have seen each "data" line at least
   * once; non-data lines, not at all.
   */
  for (i = 0; i < N; i++) {
    if (  is_data[i] && ! nseen2[i]) esl_fatal(msg);
    if (! is_data[i] &&   nseen2[i]) esl_fatal(msg);
  }

  fclose(fp);
  esl_recorder_Destroy(rc);
  free(nseen1);
  free(nseen2);
}
Esempio n. 20
0
/* ideal_local_endpoints()
 *
 * Purpose:  The "two-step" fragment sampling algorithm. First a
 *           complete sequence of length L is emitted from <hmm>;
 *           then a fragment <i1..i2> is drawn uniformly from the
 *           $\frac{L(L+1)}{2}$ possible fragments of that sequence;
 *           finally the fragment's local alignment coordinates on
 *           both model and sequence are determined, under the
 *           convention that a local alignment begins and ends on a
 *           match state. (If the sampled i1 or i2 was emitted by an
 *           insert state, the bound is moved inward to the first/last
 *           match state.) Fragments that contain no match state at
 *           all are rejected and the whole procedure is repeated.
 *
 *           <sq> and <tr> are caller-allocated storage handed to
 *           <p7_CoreEmit()>. On return they hold the generated
 *           global sequence and its trace (not a local trace).
 *
 *           Sequence endpoints are discretized to 1..<Lbins> so that
 *           statistics from sequences of different length L can be
 *           pooled. Note this causes discretization artifacts,
 *           leading to underrepresentation of j=M and
 *           overrepresentation of i=1.
 *
 *           This routine only collects endpoint statistics
 *           (i1,i2,k1,k2); it does not produce a local alignment
 *           trace. (xref milestone 2, STL11/115).
 *
 * Returns:  <eslOK> on success; <*ret_i1>, <*ret_i2> are the binned
 *           sequence coords in range <1..Lbins>, and <*ret_k1>,
 *           <*ret_k2> are the model entry/exit coords in range
 *           <1..M>. By the internal definition of local alignment
 *           endpoints, M_k1 emits residue x_i1 and M_k2 emits
 *           residue x_i2.
 *
 * Throws:   <eslENOHALT> if the rejection sampling has not produced a
 *           fragment containing a match state within its try limit;
 *           or whatever <p7_CoreEmit()> returns on failure. All four
 *           results are set to 0 in that case.
 *
 * Xref:     STL11/142-143
 */
static int
ideal_local_endpoints(ESL_RANDOMNESS *r, P7_HMM *hmm, ESL_SQ *sq, P7_TRACE *tr, int Lbins,
		      int *ret_i1, int *ret_i2, int *ret_k1, int *ret_k2)
{
  int status;
  int z;			/* position in the trace                      */
  int i1, i2;			/* fragment endpoints on the sequence         */
  int k1, k2;			/* fragment endpoints on the model            */
  int t1, t2;			/* fragment endpoints on the trace            */
  int all_insert;		/* TRUE if the fragment has no match state    */
  int ntries = 0;		/* failsafe counter for rejection sampling    */

  for (;;)
    {
      if (ntries++ == 1000) ESL_XEXCEPTION(eslENOHALT, "failed to obtain local alignment that wasn't all inserts");

      status = p7_CoreEmit(r, hmm, sq, tr);
      if (status != eslOK) goto ERROR;

      /* Uniform sample from the upper triangle i1 <= i2, by rejection.
       * This inner loop terminates on its own; it needs no failsafe.
       */
      do {
	i1 = 1 + esl_rnd_Roll(r, sq->n);
	i2 = 1 + esl_rnd_Roll(r, sq->n);
      } while (i1 > i2);

      /* Locate the endpoints on the core trace: first trace position
       * that carries i1, last trace position that carries i2. In a
       * core model each residue was emitted by an M or an I, so both
       * searches are expected to succeed. (k1,k2 are taken from the
       * trace further down, after the endpoints have been adjusted.)
       */
      for (z = 0; z < tr->N; z++)
	if (tr->i[z] == i1) { t1 = z; break; }
      for (z = tr->N-1; z >= 0; z--)
	if (tr->i[z] == i2) { t2 = z; break; }

      /* Local alignment endpoints are match-delimited: move t1 forward
       * and t2 backward past any I states. If the two cross, the
       * fragment held no match state at all.
       */
      while (t1 <= t2 && tr->st[t1] != p7T_M) t1++;
      while (t2 >= t1 && tr->st[t2] != p7T_M) t2--;
      all_insert = (t2 < t1) ? TRUE : FALSE;

      i1 = tr->i[t1];  k1 = tr->k[t1];
      i2 = tr->i[t2];  k2 = tr->k[t2];

      if (! all_insert) break;
    }

  /* Sequence coords are 1..L at this point; rescale them to 1..Lbins. */
  *ret_i1 = ((i1-1) * Lbins / sq->n) + 1;
  *ret_i2 = ((i2-1) * Lbins / sq->n) + 1;
  *ret_k1 = k1;
  *ret_k2 = k2;
  return eslOK;

 ERROR:
  *ret_i1 = 0;
  *ret_i2 = 0;
  *ret_k1 = 0;
  *ret_k2 = 0;
  return status;
}
Esempio n. 21
0
/* Function:  p7_hit_TestSample()
 * Synopsis:  Fill a P7_HIT with random, syntactically plausible values.
 *
 * Purpose:   Using random number generator <rng>, populate the
 *            caller-provided <hit> (typically one element of an array
 *            of <P7_HIT> that the caller owns) with random contents.
 *
 *            A name is always sampled. An accession and a description
 *            are each sampled with probability 1/2; when not sampled,
 *            <hit->acc> / <hit->desc> are left as the caller had them.
 *            Scores and log-probabilities are uniform on [-1000,1000).
 *            Between 1 and 10 domains are created in <hit->dcl>, each
 *            filled by <p7_domain_TestSample()>.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> if the domain array can't be created, or the
 *            failure status of <esl_rsq_Sample()> or
 *            <p7_domain_TestSample()>. Anything already allocated
 *            inside <hit> is not released here.
 */
int
p7_hit_TestSample(ESL_RANDOMNESS *rng, P7_HIT *hit)
{
  int len;			/* randomly chosen length of a string or domain */
  int d;
  int status;

  /* Strings: name always; accession and description on a coin flip each. */
  len    = 1 + esl_rnd_Roll(rng, 30);
  status = esl_rsq_Sample(rng, eslRSQ_SAMPLE_GRAPH, len, &(hit->name));
  if (status != eslOK) goto ERROR;

  if (esl_rnd_Roll(rng, 2))
    {
      len    = 1 + esl_rnd_Roll(rng, 10);
      status = esl_rsq_Sample(rng, eslRSQ_SAMPLE_ALNUM, len, &(hit->acc));
      if (status != eslOK) goto ERROR;
    }

  if (esl_rnd_Roll(rng, 2))
    {
      len    = 1 + esl_rnd_Roll(rng, 120);
      status = esl_rsq_Sample(rng, eslRSQ_SAMPLE_PRINT, len, &(hit->desc));
      if (status != eslOK) goto ERROR;
    }

  /* Scalars. The order of these statements fixes the order in which
   * random numbers are consumed, so don't reorder them.
   */
  hit->window_length = 1 + esl_rnd_Roll(rng, 100000);
  hit->sortkey       = -1000. + 2000. * esl_random(rng);
  hit->score         = -1000. + 2000. * esl_random(rng);
  hit->pre_score     = -1000. + 2000. * esl_random(rng);
  hit->sum_score     = -1000. + 2000. * esl_random(rng);
  hit->lnP           = -1000. + 2000. * esl_random(rng);
  hit->pre_lnP       = -1000. + 2000. * esl_random(rng);
  hit->sum_lnP       = -1000. + 2000. * esl_random(rng);
  hit->ndom          = 1 + esl_rnd_Roll(rng, 10);
  hit->noverlaps     = esl_rnd_Roll(rng, hit->ndom);
  hit->nexpected     = esl_random(rng)*10;

  /* Flags: start from the default, then turn each bit on with probability 1/2. */
  hit->flags = p7_HITFLAGS_DEFAULT;
  if (esl_rnd_Roll(rng, 2)) { hit->flags |= p7_IS_INCLUDED;  }
  if (esl_rnd_Roll(rng, 2)) { hit->flags |= p7_IS_REPORTED;  }
  if (esl_rnd_Roll(rng, 2)) { hit->flags |= p7_IS_NEW;       }
  if (esl_rnd_Roll(rng, 2)) { hit->flags |= p7_IS_DROPPED;   }
  if (esl_rnd_Roll(rng, 2)) { hit->flags |= p7_IS_DUPLICATE; }

  /* Per-domain counts and indices are kept consistent with <ndom>. */
  hit->nreported     = 1 + esl_rnd_Roll(rng, hit->ndom);
  hit->nincluded     = 1 + esl_rnd_Roll(rng, hit->ndom);
  hit->best_domain   =     esl_rnd_Roll(rng, hit->ndom);
  hit->seqidx        = 1 + esl_rnd_Roll(rng, 1000000);
  hit->subseq_start  = 1 + esl_rnd_Roll(rng, 1000000);
  hit->offset        = 1 + esl_rnd_Roll(rng, 1000000);

  /* Domains */
  hit->dcl = p7_domain_Create(hit->ndom);
  if (hit->dcl == NULL) { status = eslEMEM; goto ERROR; }

  for (d = 0; d < hit->ndom; d++)
    {
      status = p7_domain_TestSample(rng, 1 + esl_rnd_Roll(rng, 100), &(hit->dcl[d]));
      if (status != eslOK) goto ERROR;
    }
  return eslOK;

 ERROR:
  /* should free inside hit; caller has the shell of it though */
  return status;
}
Esempio n. 22
0
/* Step 2. Extract the training set and test set.
 *
 * Purpose:  Cluster the sequences of <msa> by single linkage at
 *           threshold <cfg->idthresh1>. The largest cluster becomes
 *           the training alignment. The remaining sequences are
 *           clustered again at <cfg->idthresh2>, and one randomly
 *           chosen sequence from each of those clusters (using
 *           <cfg->r>) is pushed onto a stack of test sequences.
 *
 * Returns:  <eslOK> on success. <*ret_trainmsa> is the new training
 *           alignment and <*ret_teststack> is a pointer stack of
 *           ESL_SQ test sequences; the stack is empty if every
 *           sequence ended up in the training alignment. The caller
 *           takes ownership of both.
 *
 * Throws:   <eslEMEM> if the stack can't be created; otherwise the
 *           failure status of whichever call failed. On failure
 *           everything allocated here is released and both results
 *           are set to NULL.
 */
static int
separate_sets(struct cfg_s *cfg, ESL_MSA *msa, ESL_MSA **ret_trainmsa, ESL_STACK **ret_teststack)
{      
  ESL_MSA   *trainmsa  = NULL;
  ESL_MSA   *test_msa  = NULL;
  ESL_STACK *teststack = NULL;
  ESL_SQ    *sq        = NULL;
  int *assignment = NULL;	/* cluster index assigned to each sequence */
  int *nin        = NULL;	/* number of sequences in each cluster     */
  int *useme      = NULL;	/* 1/0 flags for esl_msa_SequenceSubset()  */
  int  nc         = 0;		/* number of clusters                      */
  int  c;
  int  ctrain;			/* index of the cluster that becomes the training alignment */
  int  nskip;
  int  i;
  int  status;

  if ((teststack = esl_stack_PCreate()) == NULL) { status = eslEMEM; goto ERROR; }
  ESL_ALLOC(useme, sizeof(int) * msa->nseq);

  /* The largest cluster at idthresh1 is the training set. */
  if ((status = esl_msacluster_SingleLinkage(msa, cfg->idthresh1, &assignment, &nin, &nc)) != eslOK) goto ERROR;
  ctrain = esl_vec_IArgMax(nin, nc);

  for (i = 0; i < msa->nseq; i++) useme[i] = (assignment[i] == ctrain) ? 1 : 0;
  if ((status = esl_msa_SequenceSubset(msa, useme, &trainmsa)) != eslOK) goto ERROR;

  /* If all the seqs went into the training msa, none are left for testing; we're done here */
  if (trainmsa->nseq == msa->nseq) {
    free(useme);
    free(assignment);
    free(nin);
    *ret_trainmsa  = trainmsa;
    *ret_teststack = teststack;
    return eslOK;
  }

  /* Put all the other sequences into an MSA of their own; from these, we'll
   * choose test sequences.
   */
  for (i = 0; i < msa->nseq; i++) useme[i] = (assignment[i] != ctrain) ? 1 : 0;
  if ((status = esl_msa_SequenceSubset(msa, useme, &test_msa)) != eslOK) goto ERROR;

  /* Cluster those test sequences. */
  free(nin);         nin        = NULL;
  free(assignment);  assignment = NULL;
  if ((status = esl_msacluster_SingleLinkage(test_msa, cfg->idthresh2, &assignment, &nin, &nc)) != eslOK) goto ERROR;
  for (c = 0; c < nc; c++)
    {
      nskip = esl_rnd_Roll(cfg->r, nin[c]); /* pick a random seq in this cluster to be the test. */
      for (i = 0; i < test_msa->nseq; i++)
        {
          if (assignment[i] != c) continue;
          if (nskip > 0) { nskip--; continue; }

          /* This is the chosen member of cluster c. If the push fails the
           * sequence isn't on the stack yet, so it must be freed here.
           */
          if ((status = esl_sq_FetchFromMSA(test_msa, i, &sq))   != eslOK) goto ERROR;
          if ((status = esl_stack_PPush(teststack, (void *) sq)) != eslOK) { esl_sq_Destroy(sq); goto ERROR; }
          break;
        }
    }

  esl_msa_Destroy(test_msa);
  free(useme);
  free(nin);
  free(assignment);

  *ret_trainmsa  = trainmsa;
  *ret_teststack = teststack;
  return eslOK;

 ERROR:
  if (useme      != NULL) free(useme);
  if (assignment != NULL) free(assignment);
  if (nin        != NULL) free(nin);
  esl_msa_Destroy(trainmsa); 
  esl_msa_Destroy(test_msa); 
  /* <teststack> is NULL if its creation was what failed: don't pop from it. */
  if (teststack != NULL) {
    while (esl_stack_PPop(teststack, (void **) &sq) == eslOK) esl_sq_Destroy(sq);
    esl_stack_Destroy(teststack);
  }
  *ret_trainmsa  = NULL;
  *ret_teststack = NULL;
  return status;
}
Esempio n. 23
0
/* synthesize_positives()
 *
 * Each test sequence will contain one or two domains, depending on whether --single is set.
 *
 * Pops test domains off <teststack> (one or two at a time) until fewer
 * than that many remain. For each group, a total length L is taken from
 * a randomly chosen sequence in the database index (retrying until L is
 * long enough to hold the domains), the domains are embedded at random
 * positions, and the flanking/intervening regions are filled in by
 * set_random_segment(). Each synthetic sequence is written in FASTA
 * format to <cfg->out_seqfp>, one summary line per sequence is written
 * to <cfg->possummfp>, and its segment lengths are appended to
 * <cfg->test_lens> (incrementing <cfg->ntest>). Sequence names start
 * with <testname>. The popped domain sequences are destroyed here.
 *
 * Returns <eslOK> and the number of sequences made in <*ret_ntest>.
 * Failures are fatal (esl_fatal()).
 */
static int
synthesize_positives(ESL_GETOPTS *go, struct cfg_s *cfg, char *testname, ESL_STACK *teststack, int *ret_ntest)
{
  ESL_SQ *domain1, *domain2;
  ESL_SQ *sq;
  void   *p;
  int64_t L;			/* total length of synthetic test seq */
  int     d1n, d2n;		/* lengths of two domains             */
  int     L1,L2,L3;		/* lengths of three random regions    */
  int     i,j;
  int     ntest = 0;
  int     ndomains = ( (esl_opt_GetBoolean(go, "--single") == TRUE) ? 1 : 2);
  int     status;

  while (esl_stack_ObjectCount(teststack) >= ndomains)
    {
      /* make room for one more record of segment lengths */
      ESL_RALLOC(cfg->test_lens, p, (cfg->ntest+1) * sizeof(struct testseq_s));

      /* Pop our one or two test domains off the stack */
      esl_stack_PPop(teststack, &p);   
      domain1 = p; 
      d1n     = domain1->n;

      if (ndomains == 2)
	{
	  esl_stack_PPop(teststack, &p); 
	  domain2 = p;
	  d2n = domain2->n;
	}
      else
	{
	  domain2 = NULL;
	  d2n     = 0;
	}

      /* Select a random total sequence length */
      /* L is the length of a randomly chosen database sequence; resample until the domains fit */
      if (d1n+d2n > cfg->db_maxL) esl_fatal("can't construct test seq; no db seq >= %d residues\n", d1n+d2n);
      do {                                                     
	if (esl_ssi_FindNumber(cfg->dbfp->data.ascii.ssi, esl_rnd_Roll(cfg->r, cfg->db_nseq), NULL, NULL, NULL, &L, NULL) != eslOK)
	  esl_fatal("failed to look up a random seq");
      } while (L < d1n+d2n);

      /* Now figure out the embedding */
      if (ndomains == 2) 
	{
	  /* Select random lengths of three flanking domains;
	   * Imagine picking two "insert after" points i,j in sequence 1..L', for
	   * L' = L-d1n-d2n (the total length of nonhomologous test seq)
	   */
	  do {
	    i = esl_rnd_Roll(cfg->r, L - d1n - d2n + 1 ); /* i = 0..L' */
	    j = esl_rnd_Roll(cfg->r, L - d1n - d2n + 1 ); /* j = 0..L' */
	  } while (i > j);

	  /* now 1           .. i         = random region 1 (if i==0, there's none); 
	   *     i+1         .. i+d1n     = domain 1
	   *     i+d1n+1     .. j+d1n     = random region 2 (if i==j, there's none);
	   *     j+d1n+1     .. j+d1n+d2n = domain 2
	   *     j+d1n+d2n+1 .. L         = random region 3 (if j == L-d1n-d2n, there's none);
	   */
	  L1 = i;			
	  L2 = j-i;
	  L3 = L - d1n - d2n - j;
	}
      else 
	{ /* embedding one domain */
	  i = esl_rnd_Roll(cfg->r, L - d1n + 1 ); /* i = 0..L', for L' = L-d1n */
	  /* now 1           .. i         = random region 1 (if i==0, there's none); 
	   *     i+1         .. i+d1n     = domain 1
	   *     i+d1n+1     .. L         = random region 2 (if i == L-d1n, there's none);
	   * (j is not used in this case.)
	   */
	  L1 = i;			
	  L2 = L - d1n - L1;
	  L3 = 0;
	}
      
      sq = esl_sq_CreateDigital(cfg->abc);
      esl_sq_GrowTo(sq, L);
      sq->n = L;
      /* the name encodes the test set name, the sequence's index, and the 1-based coords of each embedded domain */
      if (ndomains == 2) 
	{
	  esl_sq_FormatName(sq, "%s/%d/%d-%d/%d-%d", testname, cfg->ntest, i+1, i+d1n, j+d1n+1, j+d1n+d2n);
	  esl_sq_FormatDesc(sq, "domains: %s %s", domain1->name, domain2->name);
	}
      else
	{
	  esl_sq_FormatName(sq, "%s/%d/%d-%d",   testname, cfg->ntest, i+1, i+d1n);
	  esl_sq_FormatDesc(sq, "domain: %s", domain1->name);
	}

      fprintf(cfg->possummfp, "%-35s %5d %5d %5d %5d %5d %5d", sq->name, (int) sq->n, L1, d1n, L2, d2n, L3);


      /* Fill in the digital sequence left to right. The summary line is
       * built up in the same order; set_random_segment() is handed the
       * summary file too, so the order of these calls matters.
       */
      sq->dsq[0] = sq->dsq[L+1] = eslDSQ_SENTINEL;
      set_random_segment(go, cfg, cfg->possummfp, sq->dsq+1,           L1);
      memcpy(sq->dsq+i+1,     domain1->dsq+1, sizeof(ESL_DSQ) * d1n);
      fprintf(cfg->possummfp, " %-24s %5d %5d", domain1->name, 1, d1n);
      set_random_segment(go, cfg, cfg->possummfp, sq->dsq+i+d1n+1,     L2);
      if (ndomains == 2) 
	{
	  memcpy(sq->dsq+j+d1n+1, domain2->dsq+1, sizeof(ESL_DSQ) * d2n);
	  fprintf(cfg->possummfp, " %-24s %5d %5d", domain2->name, 1, d2n);
	  set_random_segment(go, cfg, cfg->possummfp, sq->dsq+j+d1n+d2n+1, L3);
	}
      fprintf(cfg->possummfp, "\n");

      /* record the layout of this test sequence */
      cfg->test_lens[cfg->ntest].L   = L;
      cfg->test_lens[cfg->ntest].L1  = L1;
      cfg->test_lens[cfg->ntest].d1n = d1n;
      cfg->test_lens[cfg->ntest].L2  = L2;
      cfg->test_lens[cfg->ntest].d2n = d2n;
      cfg->test_lens[cfg->ntest].L3  = L3;
      cfg->ntest++;
      ntest++;

      esl_sqio_Write(cfg->out_seqfp, sq, eslSQFILE_FASTA, FALSE);

      esl_sq_Destroy(domain1);
      if (ndomains == 2) esl_sq_Destroy(domain2);
      esl_sq_Destroy(sq);
    }

  *ret_ntest = ntest;
  return eslOK;

 ERROR:
  /* reached only from the ESL_RALLOC above */
  esl_fatal("Failure in synthesize_positives");
  return status;
}
/* main()
 *
 * Unit test driver. Builds a random DNA alignment of <-N> sequences,
 * each of aligned length <-L>, using a generator seeded with <--seed>,
 * and runs the utest_*() functions on it.
 *
 * The alignment is forced to obey the conventions the unit tests rely
 * on: sequences 0 and 1 are identical, and sequences 0 and 2 differ at
 * every column. That takes at least three sequences, so N < 3 is
 * rejected.
 *
 * Returns <eslOK> if all tests pass; <eslFAIL> if a test or an
 * allocation fails; 0 after printing help; 1 on a command line error.
 */
int 
main(int argc, char **argv)
{
  ESL_GETOPTS   *go = NULL;
  ESL_RANDOMNESS *r = NULL;
  char  **as = NULL;		/* aligned character seqs (random, iid) */
  int     N,L;			/* # of seqs, and their aligned lengths */
  int seed;
  int i,j;
  int status;
  double p[4];			/* ACGT probabilities */
#ifdef eslAUGMENT_ALPHABET
  ESL_DSQ      **ax = NULL;		/* digitized alignment                  */
  ESL_ALPHABET *abc = NULL;
#endif

  /* Process command line
   */
  go = esl_getopts_Create(options);
  esl_opt_ProcessCmdline(go, argc, argv);
  esl_opt_VerifyConfig(go);
  if (esl_opt_GetBoolean(go, "-h") == TRUE) {
    puts(usage); 
    puts("\n  where options are:");
    esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
    return 0;
  }
  L    = esl_opt_GetInteger(go, "-L");
  N    = esl_opt_GetInteger(go, "-N");
  seed = esl_opt_GetInteger(go, "--seed");
  if (esl_opt_ArgNumber(go) != 0) {
    puts("Incorrect number of command line arguments.");
    puts(usage);
    return 1;
  }
  esl_getopts_Destroy(go);

  /* as[0], as[1], and as[2] are written unconditionally below. */
  if (N < 3) {
    puts("-N must be at least 3.");
    puts(usage);
    return 1;
  }

  /* Create a random DNA alignment;
   * force it to obey the conventions of the unit tests:
   *   0,1 are identical
   *   0,2 are completely dissimilar
   */
  r   = esl_randomness_Create(seed);
  for (i = 0; i < 4; i++) p[i] = 0.25;
  ESL_ALLOC(as, sizeof(char *) * N);
  for (i = 0; i < N; i++) 
    ESL_ALLOC(as[i], sizeof(char) * (L+1));
  esl_rsq_IID(r, "ACGT", p, 4, L, as[0]);
  strcpy(as[1], as[0]);
  esl_rsq_IID(r, "ACGT", p, 4, L, as[2]);
  /* resample any column of seq 2 that happens to match seq 0 */
  for (j = 0; j < L; j++)
    while (as[2][j] == as[0][j])
      as[2][j] = "ACGT"[esl_rnd_Roll(r, 4)];
  for (i = 3; i < N; i++)
    esl_rsq_IID(r, "ACGT", p, 4, L, as[i]);

#ifdef eslAUGMENT_ALPHABET
  abc = esl_alphabet_Create(eslDNA);
  ESL_ALLOC(ax, sizeof(ESL_DSQ *) * N);
  for (i = 0; i < N; i++) 
    esl_abc_CreateDsq(abc, as[i], &(ax[i]));
#endif /*eslAUGMENT_ALPHABET*/


  /* Unit tests
   */
  if (utest_CPairId(as, N)               != eslOK) return eslFAIL;
  if (utest_CJukesCantor(4, as, N)       != eslOK) return eslFAIL;

#ifdef eslAUGMENT_ALPHABET
  if (utest_XPairId(abc, as, ax, N)      != eslOK) return eslFAIL;
  if (utest_XJukesCantor(abc, as, ax, N) != eslOK) return eslFAIL;
#endif /*eslAUGMENT_ALPHABET*/

#ifdef eslAUGMENT_DMATRIX
  if (utest_CPairIdMx(as, N)             != eslOK) return eslFAIL;
  if (utest_CDiffMx(as, N)               != eslOK) return eslFAIL;
  if (utest_CJukesCantorMx(4, as, N)     != eslOK) return eslFAIL;
#endif /* eslAUGMENT_DMATRIX*/

#if defined (eslAUGMENT_ALPHABET) && defined (eslAUGMENT_DMATRIX)
  if (utest_XPairIdMx(abc, as, ax, N)       != eslOK) return eslFAIL;
  if (utest_XDiffMx(abc, as, ax, N)         != eslOK) return eslFAIL;
  if (utest_XJukesCantorMx(abc, as, ax, N)  != eslOK) return eslFAIL;
#endif

  esl_randomness_Destroy(r);
  esl_Free2D((void **) as, N);
#ifdef eslAUGMENT_ALPHABET
  esl_alphabet_Destroy(abc);
  esl_Free2D((void **) ax, N);
#endif
  return eslOK;

 ERROR:
  /* reached from a failed ESL_ALLOC */
  return eslFAIL;
}