Пример #1
0
/* The MSV score can be validated against Viterbi (provided we trust
 * Viterbi), by creating a multihit local profile in which:
 *   1. All t_MM scores = 0
 *   2. All other core transitions = -inf
 *   3. All t_BMk entries uniformly log 2/(M(M+1))
 */
static void
utest_msv(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, P7_BG *bg, P7_PROFILE *gm, int nseq, int L)
{
  P7_PROFILE *g2 = NULL;
  ESL_DSQ   *dsq = NULL;
  P7_GMX    *gx  = NULL;
  float     sc1, sc2;
  int       k, idx;

  if ((dsq    = malloc(sizeof(ESL_DSQ) *(L+2))) == NULL)  esl_fatal("malloc failed");
  if ((gx     = p7_gmx_Create(gm->M, L))        == NULL)  esl_fatal("matrix creation failed");
  if ((g2     = p7_profile_Clone(gm))           == NULL)  esl_fatal("profile clone failed");

  /* Make g2's scores appropriate for simulating the MSV algorithm in Viterbi */
  esl_vec_FSet(g2->tsc, p7P_NTRANS * g2->M, -eslINFINITY);
  for (k = 1; k <  g2->M; k++) p7P_TSC(g2, k, p7P_MM) = 0.0f;
  for (k = 0; k <  g2->M; k++) p7P_TSC(g2, k, p7P_BM) = log(2.0f / ((float) g2->M * (float) (g2->M+1)));

  for (idx = 0; idx < nseq; idx++)
    {
      if (esl_rsq_xfIID(r, bg->f, abc->K, L, dsq) != eslOK) esl_fatal("seq generation failed");

      if (p7_GMSV    (dsq, L, gm, gx, 2.0, &sc1)       != eslOK) esl_fatal("MSV failed");
      if (p7_GViterbi(dsq, L, g2, gx,      &sc2)       != eslOK) esl_fatal("viterbi failed");
      if (fabs(sc1-sc2) > 0.0001) esl_fatal("MSV score not equal to Viterbi score");
    }

  p7_gmx_Destroy(gx);
  p7_profile_Destroy(g2);
  free(dsq);
  return;
}
Пример #2
0
/* Function:  p7_profile_SameAsMF()
 * Synopsis:  Set a generic profile's scores to give MSV scores.
 * Incept:    MSF Tue Nov 3, 2009 [Janelia]
 *
 * Purpose:   Set a generic profile's scores so that the normal <dp_generic> DP 
 *            algorithms will give the same score as <p7_MSVFilter()>:
 *            all t_MM scores = 0; all other core transitions = -inf;
 *            multihit local mode; all <t_BMk> entries uniformly <log 2/(M(M+1))>;
 *            <tCC, tNN, tJJ> scores 0; total approximated later as -3;
 *            rounded in the same way as the 8-bit limited precision.
 *
 * Returns:   <eslOK> on success.
 */
int
p7_profile_SameAsMF(const P7_OPROFILE *om, P7_PROFILE *gm)
{
  int    k;
  float  tbm = log(2.0f / ((float) gm->M * (float) (gm->M+1)));

  /* Transitions */
  esl_vec_FSet(gm->tsc, p7P_NTRANS * gm->M, -eslINFINITY);
  for (k = 1; k <  gm->M; k++) p7P_TSC(gm, k, p7P_MM) = 0.0f;
  for (k = 0; k <  gm->M; k++) p7P_TSC(gm, k, p7P_BM) = tbm;
  
  return eslOK;
}
Пример #3
0
/* sample_endpoints()
 * Incept:    SRE, Mon Jan 22 10:43:20 2007 [Janelia]
 *
 * Purpose:   Given a profile <gm> and random number source <r>, sample
 *            a begin transition from the implicit probabilistic profile
 *            model, yielding a sampled start and end node; return these
 *            via <ret_kstart> and <ret_kend>.
 *            
 *            By construction, the entry at node <kstart> is into a
 *            match state, but the exit from node <kend> might turn
 *            out to be from either a match or delete state.
 *            
 *            We assume that exits j are uniformly distributed for a
 *            particular entry point i: $a_{ij} =$ constant $\forall
 *            j$.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation error.
 *            
 * Xref:      STL11/138           
 */
static int
sample_endpoints(ESL_RANDOMNESS *r, const P7_PROFILE *gm, int *ret_kstart, int *ret_kend)
{
  float *pstart = NULL;
  int    k;
  int    kstart, kend;
  int    status;

  /* We have to backcalculate a probability distribution from the
   * lod B->Mk scores in a local model; this is a little time consuming,
   * but we don't have to do it often.
   */
  ESL_ALLOC(pstart, sizeof(float) * (gm->M+1));
  pstart[0] = 0.0f;
  for (k = 1; k <= gm->M; k++)
    pstart[k] = exp(p7P_TSC(gm, k-1, p7P_BM)) * (gm->M - k + 1); /* multiply p_ij by the number of exits j */
  kstart = esl_rnd_FChoose(r, pstart, gm->M+1);          	 /* sample the starting position from that distribution */
  kend   = kstart + esl_rnd_Roll(r, gm->M-kstart+1);           /* and the exit uniformly from possible exits for it */

  free(pstart);
  *ret_kstart = kstart;
  *ret_kend   = kend;
  return eslOK;
  
 ERROR:
  if (pstart != NULL) free(pstart);
  *ret_kstart = 0;
  *ret_kend   = 0;
  return status;
}
Пример #4
0
/* Function:  p7_oprofile_GetFwdTransitionArray()
 * Synopsis:  Retrieve full 32-bit float transition probabilities from a
 *            profile into a flat array
 *
 * Purpose:   Extract an array of <type> (e.g. p7O_II) transition probabilities
 *            from the underlying <om> profile. In SIMD implementations,
 *            these are striped and interleaved, making them difficult to
 *            directly access. Here, this is trivial.
 *
 * Args:      <om>   - optimized profile, containing transition information
 *            <type> - transition type (e.g. p7O_II)
 *            <arr>  - preallocated array into which floats will be placed
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_oprofile_GetFwdTransitionArray(const P7_OPROFILE *om, int type, float *arr )
{
  int i;

  for (i=0; i<om->M; i++) {
    arr[i] = exp(p7P_TSC(om, i, type));
  }

  return eslOK;

}
Пример #5
0
/* Function:  p7_ProfileConfig()
 * Synopsis:  Configure a search profile.
 *
 * Purpose:   Given a model <hmm> with core probabilities, the null1
 *            model <bg>, a desired search <mode> (one of <p7_LOCAL>,
 *            <p7_GLOCAL>, <p7_UNILOCAL>, or <p7_UNIGLOCAL>), and an
 *            expected target sequence length <L>; configure the
 *            search model in <gm> with lod scores relative to the
 *            background frequencies in <bg>.
 *            
 * Returns:   <eslOK> on success; the profile <gm> now contains 
 *            scores and is ready for searching target sequences.
 *            
 * Throws:    <eslEMEM> on allocation error.
 */
int
p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode)
{
  int   k, x, z;	/* counters over states, residues, annotation */
  int   status;
  float *occ = NULL;
  float *tp, *rp;
  float  sc[p7_MAXCODE];
  float  Z;
 
  /* Contract checks */
  if (gm->abc->type != hmm->abc->type) ESL_XEXCEPTION(eslEINVAL, "HMM and profile alphabet don't match");
  if (hmm->M > gm->allocM)             ESL_XEXCEPTION(eslEINVAL, "profile too small to hold HMM");
  if (! (hmm->flags & p7H_CONS))       ESL_XEXCEPTION(eslEINVAL, "HMM must have a consensus to transfer to the profile");

  /* Copy some pointer references and other info across from HMM  */
  gm->M                = hmm->M;
  gm->max_length       = hmm->max_length;
  gm->mode             = mode;
  gm->roff             = -1;
  gm->eoff             = -1;
  gm->offs[p7_MOFFSET] = -1;
  gm->offs[p7_FOFFSET] = -1;
  gm->offs[p7_POFFSET] = -1;
  if (gm->name != NULL) free(gm->name);
  if (gm->acc  != NULL) free(gm->acc);
  if (gm->desc != NULL) free(gm->desc);
  if ((status = esl_strdup(hmm->name,   -1, &(gm->name))) != eslOK) goto ERROR;
  if ((status = esl_strdup(hmm->acc,    -1, &(gm->acc)))  != eslOK) goto ERROR;
  if ((status = esl_strdup(hmm->desc,   -1, &(gm->desc))) != eslOK) goto ERROR;
  if (hmm->flags & p7H_RF)    strcpy(gm->rf,        hmm->rf);
  if (hmm->flags & p7H_MMASK) strcpy(gm->mm,        hmm->mm);
  if (hmm->flags & p7H_CONS)  strcpy(gm->consensus, hmm->consensus); /* must be present, actually, so the flag test is just for symmetry w/ other optional HMM fields */
  if (hmm->flags & p7H_CS)    strcpy(gm->cs,        hmm->cs);
  for (z = 0; z < p7_NEVPARAM; z++) gm->evparam[z] = hmm->evparam[z];
  for (z = 0; z < p7_NCUTOFFS; z++) gm->cutoff[z]  = hmm->cutoff[z];
  for (z = 0; z < p7_MAXABET;  z++) gm->compo[z]   = hmm->compo[z];

  /* Entry scores. */
  if (p7_profile_IsLocal(gm))
    {
      /* Local mode entry:  occ[k] /( \sum_i occ[i] * (M-i+1))
       * (Reduces to uniform 2/(M(M+1)) for occupancies of 1.0)  */
      Z = 0.;
      ESL_ALLOC(occ, sizeof(float) * (hmm->M+1));

      if ((status = p7_hmm_CalculateOccupancy(hmm, occ, NULL)) != eslOK) goto ERROR;
      for (k = 1; k <= hmm->M; k++) 
	Z += occ[k] * (float) (hmm->M-k+1);
      for (k = 1; k <= hmm->M; k++) 
	p7P_TSC(gm, k-1, p7P_BM) = log(occ[k] / Z); /* note off-by-one: entry at Mk stored as [k-1][BM] */

      free(occ);
    }
  else	/* glocal modes: left wing retraction; must be in log space for precision */
    {
      Z = log(hmm->t[0][p7H_MD]);
      p7P_TSC(gm, 0, p7P_BM) = log(1.0 - hmm->t[0][p7H_MD]);
      for (k = 1; k < hmm->M; k++) 
	{
	   p7P_TSC(gm, k, p7P_BM) = Z + log(hmm->t[k][p7H_DM]);
	   Z += log(hmm->t[k][p7H_DD]);
	}
    }

  /* E state loop/move probabilities: nonzero for MOVE allows loops/multihits
   * N,C,J transitions are set later by length config 
   */
  if (p7_profile_IsMultihit(gm)) {
    gm->xsc[p7P_E][p7P_MOVE] = -eslCONST_LOG2;   
    gm->xsc[p7P_E][p7P_LOOP] = -eslCONST_LOG2;   
    gm->nj                   = 1.0f;
  } else {
    gm->xsc[p7P_E][p7P_MOVE] = 0.0f;   
    gm->xsc[p7P_E][p7P_LOOP] = -eslINFINITY;  
    gm->nj                   = 0.0f;
  }

  /* Transition scores. */
  for (k = 1; k < gm->M; k++) {
    tp = gm->tsc + k * p7P_NTRANS;
    tp[p7P_MM] = log(hmm->t[k][p7H_MM]);
    tp[p7P_MI] = log(hmm->t[k][p7H_MI]);
    tp[p7P_MD] = log(hmm->t[k][p7H_MD]);
    tp[p7P_IM] = log(hmm->t[k][p7H_IM]);
    tp[p7P_II] = log(hmm->t[k][p7H_II]);
    tp[p7P_DM] = log(hmm->t[k][p7H_DM]);
    tp[p7P_DD] = log(hmm->t[k][p7H_DD]);
  }
  
  /* Match emission scores. */
  sc[hmm->abc->K]     = -eslINFINITY; /* gap character */
  sc[hmm->abc->Kp-2]  = -eslINFINITY; /* nonresidue character */
  sc[hmm->abc->Kp-1]  = -eslINFINITY; /* missing data character */
  for (k = 1; k <= hmm->M; k++) {
    for (x = 0; x < hmm->abc->K; x++)
     sc[x] = log((double)hmm->mat[k][x] / bg->f[x]);

    esl_abc_FExpectScVec(hmm->abc, sc, bg->f); 

    for (x = 0; x < hmm->abc->Kp; x++) {
      rp = gm->rsc[x] + k * p7P_NR;
      rp[p7P_MSC] = sc[x];
    }
  }

  /* Insert emission scores */
  /* SRE, Fri Dec 5 08:41:08 2008: We currently hardwire insert scores
   * to 0, i.e. corresponding to the insertion emission probabilities
   * being equal to the background probabilities. Benchmarking shows
   * that setting inserts to informative emission distributions causes
   * more problems than it's worth: polar biased composition hits
   * driven by stretches of "insertion" occur, and are difficult to
   * correct for.
   */
  for (x = 0; x < gm->abc->Kp; x++)
    {
      for (k = 1; k < hmm->M; k++) p7P_ISC(gm, k, x) = 0.0f;
      p7P_ISC(gm, hmm->M, x) = -eslINFINITY;   /* init I_M to impossible.   */
    }
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->K)    = -eslINFINITY; /* gap symbol */
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-2) = -eslINFINITY; /* nonresidue symbol */
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-1) = -eslINFINITY; /* missing data symbol */


#if 0
  /* original (informative) insert setting: relies on sc[K, Kp-1] initialization to -inf above */
  for (k = 1; k < hmm->M; k++) {
    for (x = 0; x < hmm->abc->K; x++) 
      sc[x] = log(hmm->ins[k][x] / bg->f[x]); 
    esl_abc_FExpectScVec(hmm->abc, sc, bg->f); 
    for (x = 0; x < hmm->abc->Kp; x++) {
      rp = gm->rsc[x] + k*p7P_NR;
      rp[p7P_ISC] = sc[x];
    }
  }    
  for (x = 0; x < hmm->abc->Kp; x++)
    p7P_ISC(gm, hmm->M, x) = -eslINFINITY;   /* init I_M to impossible.   */
#endif

  /* Remaining specials, [NCJ][MOVE | LOOP] are set by ReconfigLength()
   */
  gm->L = 0;			/* force ReconfigLength to reconfig */
  if ((status = p7_ReconfigLength(gm, L)) != eslOK) goto ERROR;
  return eslOK;

 ERROR:
  if (occ != NULL) free(occ);
  return status;
}
Пример #6
0
/* Function:  p7_ProfileConfig()
* Synopsis:  Configure a search profile.
* Incept:    SRE, Sun Sep 25 12:21:25 2005 [St. Louis]
*
* Purpose:   Given a model <hmm> with core probabilities, the null1
*            model <bg>, a desired search <mode> (one of <p7_LOCAL>,
*            <p7_GLOCAL>, <p7_UNILOCAL>, or <p7_UNIGLOCAL>), and an
*            expected target sequence length <L>; configure the
*            search model in <gm> with lod scores relative to the
*            background frequencies in <bg>.
*            
* Returns:   <eslOK> on success; the profile <gm> now contains 
*            scores and is ready for searching target sequences.
*            
* Throws:    <eslEMEM> on allocation error.
*/
int
p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode)
{
    int   k, x, z;	/* counters over states, residues, annotation */
    int   status;
    float *occ = NULL;
    float *tp, *rp;
    float  sc[p7_MAXCODE];
    float  mthresh;
    float  Z;

    /* Contract checks */
    if (gm->abc->type != hmm->abc->type) ESL_XEXCEPTION(eslEINVAL, "HMM and profile alphabet don't match");
    if (hmm->M > gm->allocM)             ESL_XEXCEPTION(eslEINVAL, "profile too small to hold HMM");

    /* Copy some pointer references and other info across from HMM  */
    gm->M      = hmm->M;
    gm->mode   = mode;
    gm->roff   = -1;
    gm->eoff   = -1;
    gm->offs[p7_MOFFSET] = -1;
    gm->offs[p7_FOFFSET] = -1;
    gm->offs[p7_POFFSET] = -1;
    if (gm->name != NULL) free(gm->name);
    if (gm->acc  != NULL) free(gm->acc);
    if (gm->desc != NULL) free(gm->desc);
    if ((status = esl_strdup(hmm->name,   -1, &(gm->name))) != eslOK) goto ERROR;
    if ((status = esl_strdup(hmm->acc,    -1, &(gm->acc)))  != eslOK) goto ERROR;
    if ((status = esl_strdup(hmm->desc,   -1, &(gm->desc))) != eslOK) goto ERROR;
    if (hmm->flags & p7H_RF) strcpy(gm->rf, hmm->rf);
    if (hmm->flags & p7H_CS) strcpy(gm->cs, hmm->cs);
    for (z = 0; z < p7_NEVPARAM; z++) gm->evparam[z] = hmm->evparam[z];
    for (z = 0; z < p7_NCUTOFFS; z++) gm->cutoff[z]  = hmm->cutoff[z];
    for (z = 0; z < p7_MAXABET;  z++) gm->compo[z]   = hmm->compo[z];

    /* Determine the "consensus" residue for each match position.
    * This is only used for alignment displays, not in any calculations.
    */
    if      (hmm->abc->type == eslAMINO) mthresh = 0.5;
    else if (hmm->abc->type == eslDNA)   mthresh = 0.9;
    else if (hmm->abc->type == eslRNA)   mthresh = 0.9;
    else                                 mthresh = 0.5;
    gm->consensus[0] = ' ';
    for (k = 1; k <= hmm->M; k++) {
        x = esl_vec_FArgMax(hmm->mat[k], hmm->abc->K);
        gm->consensus[k] = ((hmm->mat[k][x] > mthresh) ? toupper(hmm->abc->sym[x]) : tolower(hmm->abc->sym[x]));
    }
    gm->consensus[hmm->M+1] = '\0';

    /* Entry scores. */
    if (p7_profile_IsLocal(gm))
    {
        /* Local mode entry:  occ[k] /( \sum_i occ[i] * (M-i+1))
        * (Reduces to uniform 2/(M(M+1)) for occupancies of 1.0)  */
        Z = 0.;
        ESL_ALLOC_WITH_TYPE(occ, float*, sizeof(float) * (hmm->M+1));

        if ((status = p7_hmm_CalculateOccupancy(hmm, occ, NULL)) != eslOK) goto ERROR;
        for (k = 1; k <= hmm->M; k++) 
            Z += occ[k] * (float) (hmm->M-k+1);
        for (k = 1; k <= hmm->M; k++) 
            p7P_TSC(gm, k-1, p7P_BM) = log((double)(occ[k] / Z)); /* note off-by-one: entry at Mk stored as [k-1][BM] */

        free(occ);
    }
    else	/* glocal modes: left wing retraction; must be in log space for precision */
    {