Пример #1
0
/* Function:  p7_GNull2_ByTrace()
 * Synopsis:  Null2 odds ratios from the state usage of one trace segment.
 *
 * Purpose:   Tally how many residues each emitting state accounts for
 *            in elements <zstart..zend> of traceback <tr>, convert the
 *            tallies to usage frequencies, and set <null2[x]> to the
 *            usage-weighted sum of the emission odds ratios of <gm>
 *            for each residue <x>. The N, C and J states contribute
 *            their combined usage directly (i.e. an odds ratio of 1).
 *
 *            Only state types and node indices are read from <tr>,
 *            and only emission scores are read from <gm>; no target
 *            sequence is consulted.
 *
 * Args:      gm     - profile; its match/insert emission scores are used
 *            tr     - traceback to collect state usage from
 *            zstart - first element of <tr> to include
 *            zend   - last element of <tr> to include
 *            wrk    - DP matrix; row 0 (<dp[0]> and <xmx>) is overwritten as scratch
 *            null2  - RESULT: odds ratios; entries <0..K-1> are computed here,
 *                     degenerate symbols are filled by <esl_abc_FAvgScVec()>,
 *                     and gap / nonresidue / missing-data are set to 1.0
 *
 * Returns:   <eslOK>.
 *
 * Note:      Usage counts are divided by the number of emitted residues
 *            found in the segment, so the segment is expected to contain
 *            at least one emitting state.
 */
int
p7_GNull2_ByTrace(const P7_PROFILE *gm, const P7_TRACE *tr, int zstart, int zend, P7_GMX *wrk, float *null2)
{
  float  **dp     = wrk->dp;    /* name required by the MMX()/IMX() macros */
  float   *xmx    = wrk->xmx;   /* name required by the XMX() macro        */
  const int M     = gm->M;
  int      nemit  = 0;          /* residues emitted within the segment     */
  int      pos;                 /* position in the trace                   */
  int      node;                /* model node                              */
  int      res;                 /* residue code                            */
  float    flank_usage;         /* combined usage of N, C and J            */

  /* Row 0 of the workspace holds the per-state usage counts. */
  esl_vec_FSet(wrk->dp[0], (M+1)*p7G_NSCELLS, 0.0);
  esl_vec_FSet(wrk->xmx,   p7G_NXCELLS,       0.0);

  /* Count emissions. An N/C/J element only counts as an emission when
   * the element before it is the same state.
   */
  for (pos = zstart; pos <= zend; pos++)
    {
      const int st = tr->st[pos];

      if      (st == p7T_M) { nemit++; MMX(0,tr->k[pos]) += 1.0; }
      else if (st == p7T_I) { nemit++; IMX(0,tr->k[pos]) += 1.0; }
      else if (st == p7T_N) { if (tr->st[pos-1] == p7T_N) { nemit++; XMX(0,p7G_N) += 1.0; } }
      else if (st == p7T_C) { if (tr->st[pos-1] == p7T_C) { nemit++; XMX(0,p7G_C) += 1.0; } }
      else if (st == p7T_J) { if (tr->st[pos-1] == p7T_J) { nemit++; XMX(0,p7G_J) += 1.0; } }
    }

  /* Counts -> frequencies. */
  esl_vec_FScale(wrk->dp[0], (M+1)*p7G_NSCELLS, (1.0 / (float) nemit));
  esl_vec_FScale(wrk->xmx,   p7G_NXCELLS,       (1.0 / (float) nemit));

  /* Usage-weighted sum of emission odds ratios for each canonical residue.
   * Node M has a match state but no insert term in this sum.
   */
  esl_vec_FSet(null2, gm->abc->K, 0.0);
  flank_usage = XMX(0,p7G_N) + XMX(0,p7G_C) + XMX(0,p7G_J);
  for (res = 0; res < gm->abc->K; res++)
    {
      for (node = 1; node < M; node++)
	{
	  null2[res] += MMX(0,node) * expf(p7P_MSC(gm, node, res));
	  null2[res] += IMX(0,node) * expf(p7P_ISC(gm, node, res));
	}
      null2[res] += MMX(0,M) * expf(p7P_MSC(gm, M, res));
      null2[res] += flank_usage;
    }

  /* Extend to the rest of the alphabet: degeneracies by averaging,
   * and the three special symbols fixed at 1.0.
   */
  esl_abc_FAvgScVec(gm->abc, null2);
  null2[gm->abc->K]    = 1.0;        /* gap character    */
  null2[gm->abc->Kp-2] = 1.0;        /* nonresidue "*"   */
  null2[gm->abc->Kp-1] = 1.0;        /* missing data "~" */

  return eslOK;
}
Пример #2
0
/* Function:  p7_masstrace_Zero()
 * Synopsis:  Reset a mass trace object's cumulative distributions.
 *
 * Purpose:   Set the first <M+2> entries of <mt->kmass> to zero and,
 *            when <mt->imass> is present, the first <L+2> entries of
 *            <mt->imass> as well. Record <L> and <M> in <mt>.
 *
 * Args:      mt - mass trace object to reset
 *            M  - profile length
 *            L  - sequence length
 *
 * Returns:   <eslOK>.
 */
int
p7_masstrace_Zero(P7_MASSTRACE *mt, int M, int L)
{
  /* The caller must have allocated enough room (checked in debug builds). */
  ESL_DASSERT1( (mt->imass == NULL || L+2 <= mt->ialloc ) );
  ESL_DASSERT1( (M+2 <= mt->kalloc) );

  mt->M = M;
  mt->L = L;

  esl_vec_FSet(mt->kmass, M+2, 0.0f);
  if (mt->imass != NULL)          /* the sequence-coordinate array is optional */
    {
      esl_vec_FSet(mt->imass, L+2, 0.0f);
    }
  return eslOK;
}
Пример #3
0
/* utest_msv()
 *
 * Cross-check p7_GMSV() against p7_GViterbi(). A clone of <gm> is
 * rewired so that Viterbi on the clone follows the MSV model:
 *   - every core transition score is -inf, except
 *   - t_MM = 0 for nodes 1..M-1, and
 *   - t_BMk = log 2/(M(M+1)) for k = 0..M-1.
 * <nseq> i.i.d. sequences of length <L> are then scored both ways and
 * the two scores must agree to within 0.0001. Any failure is fatal.
 */
static void
utest_msv(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, P7_BG *bg, P7_PROFILE *gm, int nseq, int L)
{
  P7_PROFILE *msvlike = NULL;   /* clone of <gm> reconfigured to mimic MSV */
  ESL_DSQ    *seq     = NULL;
  P7_GMX     *mx      = NULL;
  float       msv_sc;
  float       vit_sc;
  float       entry_sc;
  int         node;
  int         n;

  seq = malloc(sizeof(ESL_DSQ) *(L+2));
  if (seq == NULL)     esl_fatal("malloc failed");
  mx = p7_gmx_Create(gm->M, L);
  if (mx == NULL)      esl_fatal("matrix creation failed");
  msvlike = p7_profile_Clone(gm);
  if (msvlike == NULL) esl_fatal("profile clone failed");

  /* Rewire the clone's transitions: everything off, then MM and BM back on. */
  esl_vec_FSet(msvlike->tsc, p7P_NTRANS * msvlike->M, -eslINFINITY);
  entry_sc = log(2.0f / ((float) msvlike->M * (float) (msvlike->M+1)));
  for (node = 0; node < msvlike->M; node++)
    {
      if (node >= 1) p7P_TSC(msvlike, node, p7P_MM) = 0.0f;
      p7P_TSC(msvlike, node, p7P_BM) = entry_sc;
    }

  for (n = 0; n < nseq; n++)
    {
      if (esl_rsq_xfIID(r, bg->f, abc->K, L, seq) != eslOK) esl_fatal("seq generation failed");

      if (p7_GMSV    (seq, L, gm,      mx, 2.0, &msv_sc) != eslOK) esl_fatal("MSV failed");
      if (p7_GViterbi(seq, L, msvlike, mx,      &vit_sc) != eslOK) esl_fatal("viterbi failed");
      if (fabs(msv_sc-vit_sc) > 0.0001) esl_fatal("MSV score not equal to Viterbi score");
    }

  p7_gmx_Destroy(mx);
  p7_profile_Destroy(msvlike);
  free(seq);
  return;
}
Пример #4
0
/* Function:  p7_bg_Create()
 * Incept:    SRE, Fri Jan 12 13:32:51 2007 [Janelia]
 *
 * Purpose:   Allocate a <P7_BG> object for digital alphabet <abc>,
 *            initializes it to appropriate default values, and
 *            returns a pointer to it.
 *            
 *            For protein models, default iid background frequencies
 *            are set (by <p7_AminoFrequencies()>) to average
 *            SwissProt residue composition. For DNA, RNA and other
 *            alphabets, default frequencies are set to a uniform
 *            distribution.
 *            
 *            The model composition <bg->mcomp[]> is not initialized
 *            here; neither is the filter null model <bg->fhmm>.  To
 *            use the filter null model, caller will want to
 *            initialize these fields by calling
 *            <p7_bg_SetFilterByHMM()>.
 *
 * Throws:    <NULL> on allocation failure.
 *
 * Xref:      STL11/125.
 */
P7_BG *
p7_bg_Create(const ESL_ALPHABET *abc)
{
  P7_BG *bg = NULL;
  int    status;

  ESL_ALLOC(bg, sizeof(P7_BG));
  bg->f     = NULL;
  bg->fhmm  = NULL;

  ESL_ALLOC(bg->f,     sizeof(float) * abc->K);
  if ((bg->fhmm = esl_hmm_Create(abc, 2)) == NULL) goto ERROR;

  if       (abc->type == eslAMINO)
    {
      if (p7_AminoFrequencies(bg->f) != eslOK) goto ERROR;
    }
  else
    esl_vec_FSet(bg->f, abc->K, 1. / (float) abc->K);

  bg->p1    = 350./351.;
  bg->omega = 1./256.;
  bg->abc   = abc;
  return bg;

 ERROR:
  p7_bg_Destroy(bg);
  return NULL;
}
Пример #5
0
/* utest_ReadWrite()
 *
 * Round-trip test for p7_bg_Write() / p7_bg_Read(): sample random
 * background frequencies for a randomly chosen alphabet, write the
 * null model to a named temporary file, zero the frequencies, read
 * the file back, and require the recovered frequencies to match the
 * sampled ones to within 0.01. Any failure is fatal.
 */
static void
utest_ReadWrite(ESL_RANDOMNESS *rng)
{
  char          msg[]       = "bg Read/Write unit test failed";
  char          tmpfile[32] = "esltmpXXXXXX";
  ESL_ALPHABET *abc         = NULL;
  P7_BG        *bg          = NULL;
  float        *expect      = NULL;   /* the frequencies we wrote out */
  FILE         *out         = NULL;

  abc = esl_alphabet_Create(esl_rnd_Roll(rng, 5) + 1);
  if (abc == NULL)    esl_fatal(msg);
  bg = p7_bg_Create(abc);
  if (bg == NULL)     esl_fatal(msg);
  expect = malloc(sizeof(float) * abc->K);
  if (expect == NULL) esl_fatal(msg);

  /* Resample until no frequency is below 0.001: tiny values get
   * rounded off in the file and would fail the comparison below.
   */
  for (;;)
    {
      if (esl_dirichlet_FSampleUniform(rng, abc->K, expect) != eslOK) esl_fatal(msg);
      if (! (esl_vec_FMin(expect, abc->K) < 0.001)) break;
    }
  esl_vec_FCopy(expect, abc->K, bg->f);

  /* Write ... */
  if (esl_tmpfile_named(tmpfile, &out) != eslOK) esl_fatal(msg);
  if (p7_bg_Write(out, bg)             != eslOK) esl_fatal(msg);
  fclose(out);

  /* ... wipe, read back, compare. */
  esl_vec_FSet(bg->f, bg->abc->K, 0.0);
  if (p7_bg_Read(tmpfile, bg, NULL)                      != eslOK) esl_fatal(msg);
  if (esl_vec_FCompare(expect, bg->f, bg->abc->K, 0.01)  != eslOK) esl_fatal(msg);

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  free(expect);
  remove(tmpfile);
}
Пример #6
0
/* Function:  p7_ParameterEstimation()
 *
 * Purpose:   Convert the weighted counts stored in <hmm> into mean
 *            posterior probability parameters under the mixture
 *            Dirichlet priors in <pri>. Each count vector is copied to
 *            double precision, passed through
 *            <esl_mixdchlet_MPParameters()> with the matching prior
 *            (<tm>, <ti>, <td>, <em>, <ei>), and the result is written
 *            back over the counts.
 *
 *            Layout of <hmm->t[k]> as used here: 3 match transitions
 *            at offset 0, 2 insert transitions at offset 3, 2 delete
 *            transitions at offset 5.
 *
 * Args:      hmm - model holding counts; overwritten with probabilities
 *            pri - mixture Dirichlet priors
 *
 * Returns:   <eslOK>.
 */
int
p7_ParameterEstimation(P7_HMM *hmm, const P7_PRIOR *pri)
{
  const int M = hmm->M;
  double counts[p7_MAXABET];     /* one count vector, in double precision      */
  double post[p7_MAXABET];       /* mean posterior estimate for that vector     */
  double mixq[p7_MAXDCHLET];     /* mixture coefficients (workspace for Easel)  */
  int    node;

  /* Match transitions, nodes 0..M (node 0 is the B state).
   * Node M has no following delete state, so M->D is then forced to
   * zero and the remaining match transitions renormalized.
   */
  for (node = 0; node <= M; node++)
    {
      float *tm = hmm->t[node];

      esl_vec_F2D(tm, 3, counts);
      esl_mixdchlet_MPParameters(counts, 3, pri->tm, mixq, post);
      esl_vec_D2F(post, 3, tm);
    }
  hmm->t[M][p7H_MD] = 0.0;
  esl_vec_FNorm(hmm->t[M], 3);

  /* Insert transitions, nodes 0..M. */
  for (node = 0; node <= M; node++)
    {
      float *ti = hmm->t[node] + 3;

      esl_vec_F2D(ti, 2, counts);
      esl_mixdchlet_MPParameters(counts, 2, pri->ti, mixq, post);
      esl_vec_D2F(post, 2, ti);
    }

  /* Delete transitions, nodes 1..M-1 only. Nodes 0 and M are fixed by
   * convention: D->M = 1.0, D->D = 0.0.
   */
  for (node = 1; node < M; node++)
    {
      float *td = hmm->t[node] + 5;

      esl_vec_F2D(td, 2, counts);
      esl_mixdchlet_MPParameters(counts, 2, pri->td, mixq, post);
      esl_vec_D2F(post, 2, td);
    }
  hmm->t[M][p7H_DM] = 1.0;
  hmm->t[0][p7H_DM] = 1.0;
  hmm->t[M][p7H_DD] = 0.0;
  hmm->t[0][p7H_DD] = 0.0;

  /* Match emissions, nodes 1..M. mat[0] is unused; by convention it is
   * kept a valid probability vector (first element 1, the rest 0).
   */
  for (node = 1; node <= M; node++)
    {
      esl_vec_F2D(hmm->mat[node], hmm->abc->K, counts);
      esl_mixdchlet_MPParameters(counts, hmm->abc->K, pri->em, mixq, post);
      esl_vec_D2F(post, hmm->abc->K, hmm->mat[node]);
    }
  esl_vec_FSet(hmm->mat[0], hmm->abc->K, 0.);
  hmm->mat[0][0] = 1.0;

  /* Insert emissions, nodes 0..M. */
  for (node = 0; node <= M; node++)
    {
      esl_vec_F2D(hmm->ins[node], hmm->abc->K, counts);
      esl_mixdchlet_MPParameters(counts, hmm->abc->K, pri->ei, mixq, post);
      esl_vec_D2F(post, hmm->abc->K, hmm->ins[node]);
    }
  return eslOK;
}
Пример #7
0
/* Function: ZeroCPlan9()
 *
 * Purpose:  Reset a CM Plan 9 model to an empty state: zero every
 *           transition vector and insert emission vector (nodes 0..M),
 *           every match emission vector (nodes 1..M), and the begin
 *           and end probabilities (1..M). Clear the EL bookkeeping for
 *           nodes 0..M+1, set the EL self-transition to 0, and lower
 *           the HASBITS and HASPROB flags.
 *
 *           The null model is not touched.
 *
 * Args:     hmm - model to zero
 */
void
ZeroCPlan9(CP9_t *hmm)
{
    const int M = hmm->M;
    int       node;

    for (node = 0; node <= M; node++)
    {
        /* counts / probabilities; node 0 has no match emissions to clear */
        esl_vec_FSet(hmm->t[node],   cp9_NTRANS,  0.);
        esl_vec_FSet(hmm->ins[node], hmm->abc->K, 0.);
        if (node > 0)
            esl_vec_FSet(hmm->mat[node], hmm->abc->K, 0.);

        /* EL data depend on the CM guide tree; they are filled in when
         * the CP9 is built from the CM.
         */
        hmm->has_el[node]       = FALSE;
        hmm->el_from_ct[node]   = 0;
        hmm->el_from_idx[node]  = NULL;
        hmm->el_from_cmnd[node] = NULL;
    }

    /* index M+1 stands for the E state in the el_from_* arrays */
    hmm->el_from_ct[M+1]   = 0;
    hmm->el_from_idx[M+1]  = NULL;
    hmm->el_from_cmnd[M+1] = NULL;

    esl_vec_FSet(hmm->begin+1, M, 0.);
    esl_vec_FSet(hmm->end+1,   M, 0.);

    hmm->el_self = 0.;                                  /* EL self transition probability   */
    hmm->flags  &= ~(CPLAN9_HASBITS | CPLAN9_HASPROB);  /* scores and probs are now invalid */
}
/* Function: CPlan9SWConfig()
 * EPN 05.30.06
 * based on SRE's Plan7SWConfig() from HMMER's plan7.c
 *
 * Purpose:  Put a CM Plan 9 model into local (Smith/Waterman style)
 *           configuration by setting its begin and end probabilities,
 *           then recompute its log-odds scores.
 *
 * Notes:    Entry: <begin[1]> receives (1 - pentry) and <begin[2..M]>
 *           share <pentry> equally, both scaled by the probability
 *           that M_0 does not take its insert, delete or EL transition.
 *
 *           Exit: a flat distribution over exit points requires
 *           correcting each exit probability for the chance of having
 *           already left the model:
 *                      p_1 = pexit / (M-1)
 *                      p_x = p_1 / (1 - (x-1) p_1)
 *           This is applied to <end[1..M-1]>; <end[M]> is left alone.
 *           <CPlan9RenormalizeExits()> is then called.
 *
 *           If <do_match_local_cm> is TRUE the model is first trimmed
 *           to resemble a locally configured CM: the M_0->I_0,
 *           M_M->I_M and D_M->I_M transitions are zeroed, the CTMD
 *           transition of node 0 and/or node M is zeroed depending on
 *           <first_cm_ndtype>, and the transitions out of M_M and D_M
 *           are renormalized. Callers configuring the CP9 for sub CM
 *           alignment pass FALSE so that I_0 and I_M stay reachable.
 *
 * Args:     hmm               - the CM Plan 9 model w/ data-dep prob's valid
 *           pentry            - probability of an internal entry somewhere;
 *                               spread evenly over M-1 match states
 *           pexit             - probability of an internal exit somewhere;
 *                               spread over M-1 match states
 *           do_match_local_cm - TRUE to trim the model as described above
 *           first_cm_ndtype   - only used if <do_match_local_cm> is TRUE:
 *                               MATL_nd or MATP_nd zeroes t[0][CTMD];
 *                               MATR_nd or MATP_nd zeroes t[M][CTMD]
 *
 * Return:   (void)
 *           HMM probabilities are modified; HASBITS is lowered and the
 *           LOCAL_BEGIN / LOCAL_END flags are raised before
 *           <CP9Logoddsify()> is called.
 */
void
CPlan9SWConfig(CP9_t *hmm, float pentry, float pexit, int do_match_local_cm, int first_cm_ndtype)
{
  const int M = hmm->M;
  float     p1;        /* base exit probability, p_1 in the notes above */
  float     denom;     /* normalization constant                        */
  int       node;

  /* A CM Plan 9 has none of Plan 7's special states to configure. */

  if (do_match_local_cm)
    {
      const int zero_first_md = ((first_cm_ndtype == MATL_nd) || (first_cm_ndtype == MATP_nd));
      const int zero_last_md  = ((first_cm_ndtype == MATR_nd) || (first_cm_ndtype == MATP_nd));

      /* out of M_0: no insert; MM and EL are expected to be 0 already
       * (M_0 -> M_1 is carried by begin[1], and M_0 cannot local-end)
       */
      hmm->t[0][CTMI]  = 0.;
      hmm->t[0][CTMM]  = 0.;
      hmm->t[0][CTMEL] = 0.;
      if (zero_first_md) hmm->t[0][CTMD] = 0.;

      /* out of node M: no way into I_M */
      hmm->t[M][CTMI] = 0.;
      hmm->t[M][CTDI] = 0.;
      if (zero_last_md) hmm->t[M][CTMD] = 0.;

      /* renormalize what is left out of M_M (its end probability included) ... */
      denom = esl_vec_FSum(hmm->t[M], cp9_TRANS_NMATCH) + hmm->end[M];
      esl_vec_FScale(hmm->t[M], cp9_TRANS_NMATCH, 1./denom);
      hmm->end[M] /= denom;

      /* ... and out of D_M */
      esl_vec_FNorm(hmm->t[M] + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE);
    }

  /* Entry. t[0][CTMEL] is expected to be 0, and with do_match_local_cm
   * t[0][CTMI] was just zeroed (t[0][CTMD] too, for MATL/MATP).
   */
  hmm->begin[1] = (1. - pentry) * (1. - (hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL]));
  esl_vec_FSet(hmm->begin+2, M-1, (pentry * (1.- (hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL]))) / (float)(M-1));

  /* Exit, nodes 1..M-1; end[M] is deliberately not touched. */
  p1 = pexit / (float) (M-1);
  for (node = 1; node < M; node++)
    hmm->end[node] = p1 / (1. - p1 * (float) (node-1));
  CPlan9RenormalizeExits(hmm, 1);

  hmm->flags &= ~CPLAN9_HASBITS;       /* reconfig invalidates log-odds scores */
  hmm->flags |= CPLAN9_LOCAL_BEGIN;    /* local begins now on */
  hmm->flags |= CPLAN9_LOCAL_END;      /* local ends now on   */

  CP9Logoddsify(hmm);
}
Пример #9
0
/* Function: CPlan9Renormalize()
 *
 * Purpose:  Turn a counts-form CM Plan 9 model into probabilities by
 *           normalizing every distribution in it:
 *             - match emissions 1..M (mat[0] is zeroed: M_0 is the
 *               non-emitting B state) and insert emissions 0..M;
 *             - the begin distribution, which is begin[1..M] together
 *               with the M_0 insert, delete and EL transitions;
 *             - for nodes 1..M, the match transitions together with
 *               end[k], the insert transitions, and the delete
 *               transitions;
 *             - the null model emissions.
 *           Node 0's insert transitions are normalized and its delete
 *           transitions are set to zero.
 *
 * Args:     hmm - the model to renormalize.
 *
 * Return:   (void)
 *           hmm is changed; HASBITS is lowered and HASPROB raised.
 */
void
CPlan9Renormalize(CP9_t *hmm)
{
    const int K = hmm->abc->K;
    float     denom;
    int       node;

    /* emissions */
    esl_vec_FSet (hmm->mat[0], K, 0.);       /* M_0 is the B state, a non-emitter */
    esl_vec_FNorm(hmm->ins[0], K);
    for (node = 1; node <= hmm->M; node++)
    {
        esl_vec_FNorm(hmm->mat[node], K);
        esl_vec_FNorm(hmm->ins[node], K);
    }

    /* everything leaving M_0: the begins plus its insert/delete/EL transitions
     * (t[0][CTMEL] is expected to be 0: no local end from the B state)
     */
    denom = esl_vec_FSum(hmm->begin+1, hmm->M) + hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL];
    esl_vec_FScale(hmm->begin+1, hmm->M, 1./denom);
    hmm->t[0][CTMI]  /= denom;
    hmm->t[0][CTMD]  /= denom;
    hmm->t[0][CTMEL] /= denom;

    /* node 0 insert transitions are normalized; its delete transitions are cleared */
    esl_vec_FNorm(hmm->t[0] + cp9_TRANS_INSERT_OFFSET, cp9_TRANS_NINSERT);
    esl_vec_FSet (hmm->t[0] + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE, 0.);

    /* nodes 1..M; fine for node M as well, where t[M][CTMM] should be 0 */
    for (node = 1; node <= hmm->M; node++)
    {
        float *tnode = hmm->t[node];

        denom = esl_vec_FSum(tnode, cp9_TRANS_NMATCH) + hmm->end[node];
        esl_vec_FScale(tnode, cp9_TRANS_NMATCH, 1./denom);
        hmm->end[node] /= denom;

        esl_vec_FNorm(tnode + cp9_TRANS_INSERT_OFFSET, cp9_TRANS_NINSERT);
        esl_vec_FNorm(tnode + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE);
    }

    /* null model emissions */
    esl_vec_FNorm(hmm->null, K);

    hmm->flags &= ~CPLAN9_HASBITS;     /* log-odds scores are stale now */
    hmm->flags |= CPLAN9_HASPROB;      /* probabilities are valid       */
}
Пример #10
0
/* Function:  p7_profile_SameAsMF()
 * Synopsis:  Set a generic profile's scores to give MSV scores.
 * Incept:    MSF Tue Nov 3, 2009 [Janelia]
 *
 * Purpose:   Rewrite the core transition scores of generic profile
 *            <gm> so that the generic DP algorithms behave like the
 *            MSV filter on it: every transition score is set to
 *            -infinity, then <t_MM> is set to 0 for nodes <1..M-1>
 *            and <t_BMk> to <log 2/(M(M+1))> for <k = 0..M-1>.
 *
 * Args:      om - optimized profile; not read by the code below
 *            gm - generic profile to modify
 *
 * Returns:   <eslOK> on success.
 */
int
p7_profile_SameAsMF(const P7_OPROFILE *om, P7_PROFILE *gm)
{
  const float entry_sc = log(2.0f / ((float) gm->M * (float) (gm->M+1)));
  int         node;

  /* Transitions: switch everything off, then re-enable M->M and B->Mk. */
  esl_vec_FSet(gm->tsc, p7P_NTRANS * gm->M, -eslINFINITY);
  for (node = 0; node < gm->M; node++)
    {
      if (node >= 1) p7P_TSC(gm, node, p7P_MM) = 0.0f;
      p7P_TSC(gm, node, p7P_BM) = entry_sc;
    }

  return eslOK;
}
/* Function: CPlan9CMLocalBeginConfig()
 * Incept:   EPN, Thu Jun 21 15:43:29 2007
 * based on SRE's Plan7SWConfig() from HMMER's plan7.c
 * 
 * Purpose:  Set up a CM Plan 9 HMM to mimic CM local begins as closely
 *           as it can. We can't enforce that a begin/end point are chosen
 *           the same way a CM's are, as the choice of a CM local begin
 *           (in non-truncated CYK mode) defines both a start and end point,
 *           and some start/end combinations are impossible. For the CP9
 *           we allow all possible start/end combos.
 *
 *           All CP9 begin probabilities are reset to zero, then each
 *           CM node's local begin probability (when not IMPOSSIBLE) is
 *           added to the CP9 begin probability of the consensus
 *           position <emap->lpos[nd]> of that node.
 *           
 * Args:     cm    - the CM, must have valid cm->cp9 and cm->cp9map,
 *                   with the CMH_CP9, CMH_LOCAL_BEGIN and CMH_LOCAL_END
 *                   flags up, and with CPLAN9_LOCAL_END not yet up in
 *                   cm->cp9->flags. The CM local begin probs are used
 *                   to set the cm->cp9 begin probs.
 *                    
 * Return:   (void)
 *           HMM probabilities are modified; the CP9's HASBITS flag is
 *           lowered, its LOCAL_BEGIN and LOCAL_END flags are raised,
 *           and its scores are recomputed with CP9Logoddsify().
 *           Dies with cm_Fail() if the contract is violated.
 */
void
CPlan9CMLocalBeginConfig(CM_t *cm)
{
  CMEmitMap_t *emap;            /* consensus emit map for the CM */
  int nd;

  /* Contract checks */
  if(cm->cp9 == NULL)
    cm_Fail("ERROR in CPlan9CMLocalBeginConfig, cm->cp9 is NULL.\n");
  if(cm->cp9map == NULL)
    cm_Fail("ERROR in CPlan9CMLocalBeginConfig, cm->cp9map is NULL.\n");
  if(!(cm->flags & CMH_CP9))
     cm_Fail("ERROR in CPlan9CMLocalBeginConfig, CMH_CP9 flag is down.");
  if(!(cm->flags & CMH_LOCAL_BEGIN))
     cm_Fail("ERROR in CPlan9CMLocalBeginConfig, CMH_LOCAL_BEGIN flag is down.");
  /* The message now names the flag that is actually tested. */
  if(!(cm->flags & CMH_LOCAL_END))
     cm_Fail("ERROR in CPlan9CMLocalBeginConfig, CMH_LOCAL_END flag is down.");
  if(cm->cp9->flags & CPLAN9_LOCAL_END)
     cm_Fail("ERROR in CPlan9CMLocalBeginConfig, CP9_LOCAL_END flag is already up.");

  /* Configure entry.
   * To match CM, we enforce the only way out of the B state (M_0)
   * is through a local begin into a match state.
   *
   * Begin probabilities are indexed 1..M, so M+1 elements must be
   * cleared; clearing only M of them would leave begin[M] holding its
   * previous value, which the += below would then build on.
   */
  esl_vec_FSet(cm->cp9->begin, cm->cp9->M+1, 0.);
  emap = CreateEmitMap(cm); 
  if(emap == NULL)
    cm_Fail("ERROR in CPlan9CMLocalBeginConfig, unable to create the emit map.");
  for (nd = 1; nd < cm->nodes; nd++) {
    if(NOT_IMPOSSIBLE(cm->begin[cm->nodemap[nd]])) {
      cm->cp9->begin[emap->lpos[nd]] += cm->begin[cm->nodemap[nd]]; /* we do += b/c for lpos of BIFs, there's > 1 way to enter there, the BIF and the first MATP or MATL of the left child of the BIF */
    }
  }
  /* NOTE(review): <emap> is not released before returning; it presumably
   * needs to be handed to the emit map's destructor here -- confirm
   * against the emit map API.
   */

  cm->cp9->flags       &= ~CPLAN9_HASBITS; /* reconfig invalidates log-odds scores */
  cm->cp9->flags       |= CPLAN9_LOCAL_BEGIN; /* local begins now on */
  cm->cp9->flags       |= CPLAN9_LOCAL_END;   /* local ends now on */

  CP9Logoddsify(cm->cp9);
}
Пример #12
0
/* Function:  p7_bg_CreateUniform()
 * Synopsis:  Creates background model with uniform freqs.
 * Incept:    SRE, Sat Jun 30 10:25:27 2007 [Janelia]
 *
 * Purpose:   Creates a background model for alphabet <abc>
 *            with uniform residue frequencies.
 */
P7_BG *
p7_bg_CreateUniform(const ESL_ALPHABET *abc)
{
  P7_BG *bg = NULL;
  int    status;

  ESL_ALLOC(bg, sizeof(P7_BG));
  bg->f     = NULL;
  bg->fhmm  = NULL;

  ESL_ALLOC(bg->f,     sizeof(float) * abc->K);
  if ((bg->fhmm = esl_hmm_Create(abc, 2)) == NULL) goto ERROR;

  esl_vec_FSet(bg->f, abc->K, 1. / (float) abc->K);
  bg->p1    = 350./351.;
  bg->omega = 1./256.;
  bg->abc = (ESL_ALPHABET *) abc; /* safe: we're just keeping a reference */
  return bg;

 ERROR:
  p7_bg_Destroy(bg);
  return NULL;
}
Пример #13
0
/* Function:  p7_bg_Create()
 * Synopsis:  Create a <P7_BG> null model object.
 *
 * Purpose:   Allocate a <P7_BG> object for digital alphabet <abc>,
 *            initializes it to appropriate default values, and
 *            returns a pointer to it.
 *            
 *            For protein models, default iid background frequencies
 *            are set (by <p7_AminoFrequencies()>) to average
 *            Swiss-Prot residue composition. For DNA, RNA and other
 *            alphabets, default frequencies are set to a uniform
 *            distribution.
 *            
 *            The model composition <bg->mcomp[]> is not initialized
 *            here; neither is the filter null model <bg->fhmm>.  To
 *            use the filter null model, caller will want to
 *            initialize these fields by calling
 *            <p7_bg_SetFilter()>.
 *
 * Throws:    <NULL> on allocation failure.
 *
 * Xref:      STL11/125.
 */
P7_BG *
p7_bg_Create(const ESL_ALPHABET *abc)
{
  P7_BG *bg = NULL;
  int    status;

  ESL_ALLOC(bg, sizeof(P7_BG));
  bg->f     = NULL;
  bg->fhmm  = NULL;

  // this is not hard-coded for alphabet size...

  ESL_ALLOC(bg->f,     sizeof(float) * abc->K);
  if ((bg->fhmm = esl_hmm_Create(abc, 2)) == NULL) goto ERROR;

  if       (abc->type == eslAMINO)
    {
      if (p7_AminoFrequencies(bg->f) != eslOK) goto ERROR;
    }
  // adding in background probabilities for music intervals
  else if (abc->type == eslMUSIC)
    {
      if (p7_MusicFrequencies(bg->f) != eslOK) goto ERROR;
    }

  else
    esl_vec_FSet(bg->f, abc->K, 1. / (float) abc->K);

  bg->p1    = 350./351.;
  bg->omega = 1./256.;
  bg->abc   = abc;
  return bg;

 ERROR:
  p7_bg_Destroy(bg);
  return NULL;
}
Пример #14
0
/* Function: p7_null3_score()
 *
 * Purpose:  Calculate a correction (in log_2 odds) to be applied
 *           to a sequence, using a null model based on the
 *           composition of the target sequence.
 *           The null model is constructed /post hoc/ as the
 *           distribution of the target sequence; if the target
 *           sequence is 40% A, 5% C, 5% G, 40% T, then the null
 *           model is (0.4, 0.05, 0.05, 0.4). This function is
 *           based heavily on Infernal's ScoreCorrectionNull3(),
 *           with two important changes:
 *            - it leaves the log2 conversion from NATS to BITS
 *              for the calling function.
 *            - it doesn't include the omega score modifier
 *              (based on prior probability of using the null3
 *              model), again leaving this to the calling function.
 *
 * Args:     abc   - alphabet for hit (only used to get alphabet size);
 *                   must be RNA or DNA
 *           dsq   - the sequence the hit resides in
 *           tr   - trace of the alignment, used to find the match states
 *                  (non-match chars are ignored in computing freq, not used if NULL)
 *           start - start position of hit in dsq
 *           stop  - end  position of hit in dsq; may be less than <start>,
 *                   in which case the hit is walked in the reverse direction
 *           bg    - background, used for the default null model's emission freq
 *           ret_sc - RETURN: the correction to the score (in NATS);
 *                   caller subtracts this from hit score to get
 *                   corrected score.
 *
 * Return:   void, ret_sc: the log-odds score correction (in NATS).
 *           <*ret_sc> is left untouched if <abc> or <dsq> is NULL or if
 *           the workspace cannot be allocated; an exception is raised
 *           through esl_exception() in each of those cases.
 */
void
p7_null3_score(const ESL_ALPHABET *abc, const ESL_DSQ *dsq, P7_TRACE *tr, int start, int stop, P7_BG *bg, float *ret_sc)
{
    float score = 0.;
    int status;            /* used by ESL_ALLOC() */
    int i;
    float *freq = NULL;    /* residue composition of the hit; freed before returning */
    int dir;
    int tr_pos;

    /* contract check -- done before <abc> is dereferenced by the allocation
     * below. If the installed exception handler returns, we cannot go on
     * with a NULL <abc> or <dsq>.
     */
    if(abc == NULL) { esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() alphabet is NULL.%s\n", ""); return; }
    if(dsq == NULL) { esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() dsq alphabet is NULL.%s\n", ""); return; }
    if(abc->type != eslRNA && abc->type != eslDNA) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() expects alphabet of RNA or DNA.%s\n", "");

    ESL_ALLOC(freq, sizeof(float) * abc->K);
    esl_vec_FSet(freq, abc->K, 0.0);

    dir = start < stop ? 1 : -1;

    if (tr != NULL) {
        /* skip the parts of the trace that precede the first match state;
         * the walk is bounded by the trace length so that a trace without
         * a match state cannot be overrun
         */
        tr_pos = 2;
        i = start;
        while (tr_pos < tr->N && tr->st[tr_pos] != p7T_M) {
            if (tr->st[tr_pos] == p7T_N)
                i += dir;
            tr_pos++;
        }

        /* tally frequencies from characters hitting match state*/
        while (tr_pos < tr->N && tr->st[tr_pos] != p7T_E) {
            if (tr->st[tr_pos] == p7T_M) {
                /* both conversions in the format now have a matching argument */
                if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", i, "");
                esl_abc_FCount(abc, freq, dsq[i], 1.);
            }
            if (tr->st[tr_pos] != p7T_D )
                i += dir;
            tr_pos++;
        }
    } else {
        /* tally frequencies from the full envelope */
        for (i=ESL_MIN(start,stop); i <= ESL_MAX(start,stop); i++)
        {
            if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", i, "");
            esl_abc_FCount(abc, freq, dsq[i], 1.);
        }
    }

    esl_vec_FNorm(freq, abc->K);


    /* now compute score modifier (nats) - note: even with tr!=NULL, this includes the unmatched characters*/
    for (i = 0; i < abc->K; i++)
        score += freq[i]==0 ? 0.0 : esl_logf( freq[i]/bg->f[i] ) * freq[i] * ( (stop-start)*dir +1) ;

    /* Return the correction to the bit score. */
    score = p7_FLogsum(0., score);
    *ret_sc = score;

    free(freq);
    return;

ERROR:
    /* NOTE(review): the code reported here is eslEINVAL although the
     * failure is an allocation error; kept as-is for existing handlers.
     */
    free(freq);
    esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() memory allocation error.%s\n", "");
    return; /* never reached */

}
Пример #15
0
/* Function: cm_tr_penalties_Create()
 * Date:     EPN, Sat Jan 21 12:03:52 2012
 *
 * Purpose:  Allocate and initialize a CM_TR_PENALTIES object.
 *           A CM and its emit map are required to determine
 *           truncation penalty scores. This is annoyingly
 *           complex, see verbose notes within code below.
 * 
 *           Some of the code in this function, specifically
 *           that which calculates the probability of a fragment
 *           aligning at a given node, is checkable, but only
 *           if we disallow truncated begins into insert states.
 *           However, we want to allow truncated begins in reality.
 *           I've left in a flag for ignoring inserts (<ignore_inserts>)
 *           I used in testing this function. Set it to TRUE to 
 *           perform the test. 
 *
 * Args:     cm             - the model; <cm> and <cm->emap> must be non-NULL
 *           ignore_inserts - if TRUE, parent insert states get a zero share
 *                            of each penalty probability, and the result is
 *                            checked with cm_tr_penalties_Validate()
 *           errbuf         - handed to cm_tr_penalties_Validate(); only
 *                            used when <ignore_inserts> is TRUE
 *
 * Returns:  Newly allocated CM_TR_PENALTIES object. NULL if <cm> or
 *           its emit map is NULL, if an allocation or the occupancy
 *           calculation fails, if an internal sanity check fails, or
 *           if validation fails.
 */
CM_TR_PENALTIES *
cm_tr_penalties_Create(CM_t *cm, int ignore_inserts, char *errbuf)
{
  int status;
  int v, nd, m, i1, i2;
  int lpos, rpos;
  int i;

  /* variables used for determining ratio of inserts to match at each consensus position */
  float   *mexpocc  = NULL;  /* [0..c..clen] probability match  state is used to emit at cons posn c */     
  float   *iexpocc  = NULL;  /* [0..c..clen] probability insert state is used to emit after cons posn c */     
  double  *psi      = NULL;  /* [0..v..M-1]  expected occupancy of state v */
  float    m_psi, i1_psi, i2_psi; /* temp psi values */
  float    summed_psi; 
  CM_TR_PENALTIES *trp = NULL;

  /* variables used for calculating global truncation penalties */
  float g_5and3; /* fragment probability if 5' and 3' truncation are allowed */
  float g_5or3;  /* fragment probability if 5' or  3' truncation are allowed */

  /* variables used for calculating local truncation penalties */
  float *begin = NULL;           /* local begin probabilities 0..v..M-1 */
  int   subtree_clen;            /* consensus length of subtree under this node */
  float prv53, prv5, prv3;       /* previous node's fragment probability, 5'&3', 5' only, 3'only */
  float cur53, cur5, cur3;       /* current node's fragment probability, 5'&3', 5' only, 3'only */
  int   nfrag53, nfrag5, nfrag3; /* number of fragments, 5'&3', 5' only, 3'only */

  if(cm == NULL || cm->emap == NULL) goto ERROR;

  ESL_ALLOC(trp, sizeof(CM_TR_PENALTIES));

  /* NULL every pointer member before anything else can fail, so the
   * ERROR path never hands cm_tr_penalties_Destroy() an uninitialized
   * pointer.
   */
  trp->g_ptyAA  = NULL;
  trp->l_ptyAA  = NULL;
  trp->ig_ptyAA = NULL;
  trp->il_ptyAA = NULL;

  trp->M = cm->M;
  trp->ignored_inserts = ignore_inserts;

  /* Define truncation penalties for each state v. This will be 
   * the score for doing a truncated begin into state v.
   * 
   * Important note: For this discussion we assume that sequences can
   * only be truncated at consensus positions, which means we don't
   * have to worry about truncated begins into inserts. This is an
   * approximation (also made by Diana and Sean in the 2009 trCYK
   * paper) that greatly simplifies the explanation of the calculation
   * of the truncation penalties.  The examples in my ELN3 notebook
   * also use this simplification. However, I need to be able to do
   * truncated begins into insert states in some cases (some pass/mode
   * combinations see ELN bottom of p.47). I explain first the
   * rationale for calculating truncation penalties ignoring inserts
   * and then I describe how I adapt those penalties to allow
   * for inserts. 
   * 
   * This is a lengthy comment. I've divided it into 3 sections:
   * Section 1. Global mode truncation penalties, ignoring inserts.
   * Section 2. Local mode truncation penalties, ignoring inserts.
   * Section 3. Adapting truncation penalties to allow for inserts.
   *
   **************************************************************
   * Section 1. Global mode truncation penalties, ignoring inserts.
   *
   * We want the truncation penalty to be the log of the probability
   * that the particular fragment we're aligning was generated from
   * the following generative process. The generative process differs
   * between global and local mode. 
   * 
   * In global mode: 
   * o Sample global parsetree which spans consensus positions 1..clen.
   * o Randomly choose g and h in range 1..clen, where h >= g and
   *   truncate sequence from g..h. The first residue will either be
   *   an insert before position g, or a match at position g of the 
   *   model. The final residue will either be an insert after position
   *   h or a match at position h of the model.
   * 
   * All g,h fragments are equiprobable, so the probability of any
   * particular fragment is 2 / (clen * (clen+1)). So log_2 of this
   * value is the truncation penalty for all truncated alignments in
   * global mode where both 5' and 3' truncation are allowed. 
   * 
   * We store this penalty, per-state in the
   * g_ptyAA[TRPENALTY_5P_AND_3P][0..v..M-1].  The penalty is
   * identical for all emitting states. The penalty value for
   * non-emitters is IMPOSSIBLE because truncated begins are 
   * not allowed into non-emitters. 
   * 
   * If only 5' OR 3' truncation is allowed, we only truncate at g or
   * h, which means there are clen possible fragments, each with
   * probability 1/clen, and log_2 (1/clen) is our global truncation
   * penalty. 
   * 
   * However, if 5' truncation is allowed we can only do a truncated
   * begin into states with a consensus subtree that spans
   * position clen (since we don't allow a truncation at the 3' end).
   * Thus any state whose subtree doesn't span clen gets
   * an IMPOSSIBLE value for its truncation score in:
   * g_ptyAA[TRPENALTY_5P_ONLY][0..v..M-1].
   * 
   * Likewise, if 3' truncation is allowed we can only do a truncated
   * begin into states with a consensus subtree that spans
   * position 1 (since we don't allow a truncation at the 5' end).
   *
   * There's an example of computing all three types of penalties for
   * a simple CM in ELN 3 p43.
   * 
   ************************************************************
   * Section 2. Local mode truncation penalties, ignoring inserts.
   * 
   * Generative process that generates fragments in local mode:
   * o Sample local begin state b with consensus subtree from i..j from
   *   local begin state distribution.
   * o Randomly choose g and h in range i..j, where h >= g and
   *   truncate sequence from g..h. The first residue will either be
   *   an insert before position g, or a match at position g of the 
   *   model. The final residue will either be an insert after position
   *   h or a match at position h of the model.
   * 
   * Unlike in global mode, in local mode all fragments are not
   * equiprobable since the local begin state distribution can be
   * anything, and each b allows different sets of fragments to be
   * generated (because they can only span from i to j).
   * 
   * The truncation penalty should be the log of the probability of
   * aligning the current fragment to the model. So we need to know 
   * the probability of generating each possible fragment. 
   * We could calculate probability of any fragment g,h with the 
   * following inefficient algorithm:
   *
   * For each start fragment point g,
   *   For each end fragment point h,
   *     For each state v,
   *       If lpos[v] <= g && rpos[v] >= h, then
   *       prob[g][h] += begin[v] * 2. / (st_clen[v] * (st_clen[v]+1));
   * 
   * Where lpos[v]/rpos[v] are the left/right consensus positions in
   * consensus subtree rooted at state v. And st_clen[v] is rpos[v] -
   * lpos[v] + 1, the consensus length of that subtree. 
   *  
   * This gives us prob[g][h], the probability of generating fragment
   * g,h. But we want to apply the penalty to a state, not to a
   * fragment, to avoid needing to know the fragment boundaries g,h
   * during the DP recursion when applying the penalty.  
   * 
   * To facilitate this, we need to find state t, the state with
   * smallest subtree that contains g,h. State t is relevant because
   * it is the state which will root the alignment of the fragment g,h
   * by using a truncated begin transition into t. This gives a new
   * algorithm:
   *
   * For each start fragment point g,
   *   For each end fragment point h,
   *     Identify state t, the max valued state for which 
   *       lpos[v] <= g && rpos[v] >= h, then {
   *         prob[t] += prob[g][h]
   *         fcount[t]++;
   *       }
   * 
   * prob[t] will be the probability of observing an alignment that
   * uses a truncated begin into t to align any fragment. Then we take
   * average over all fragments: prob[t] / fcount[t] (since we'll only
   * be aligning one of those fragments) and use the log of that
   * probability as the penalty for observing a truncated alignment
   * rooted at state t. Conveniently, it turns out that all fragments
   * that share t are equiprobable (have equal prob[g][h] values), so
   * the average probability is the actual probability for each
   * fragment, and thus the correct penalty to apply.
   * 
   * Fortunately, we can compute the correct penalty much more
   * efficiently than the two algorithms shown above. The
   * efficient way is implemented below. A test that the penalties
   * are correctly computed is in cm_tr_penalties_Validate().
   * 
   * This discussion assumes we're truncating 5' and 3', but if we're
   * only truncating 5' or 3' the situation is a little different.
   * 
   * There's an example of computing all three types of penalties for
   * a simple CM in ELN3 p44-45.
   *
   ************************************************************
   * Section 3. Adapting truncation penalties to allow for inserts.
   * 
   * We need to be able to do truncated begins into insert states
   * because we enforce that the first/final residue of a sequence be
   * included in 5'/3' truncated alignments and we want to be able
   * to properly align those residues if they're probably emitted
   * by insert states. 
   * 
   * The methods/logic explained in sections 1 and 2 above I believe
   * is correct IF we ignore inserts (assume truncated begins into
   * them are impossible). But we need to allow inserts, so I modify
   * the truncation penalties as described above to allow for inserts
   * as follows. We can calculate the appropriate truncated begin
   * penalty for all MATP_MP, MATL_ML, MATR_MR, BIF_B states as with
   * the methods described above by ignoring inserts. This gives us a
   * probability p of using that state as the root of the truncated
   * alignment, i.e. the truncated begin state. (The log_2 of this
   * probability is the penalty.) We then partition p amongst the
   * MATP_MP, MATL_ML, MATR_MR, BIF_B states and any parent insert
   * states, i.e. any insert state that can transition into the
   * match/bif state. For each match/bif state there's 0, 1 or 2
   * parent inserts. We then partition p based on the relative
   * expected occupancy of these inserts versus the match/bif state.
   * 
   * This is certainly 'incorrect' in that it doesn't reflect the
   * true probability of a fragment being aligned to each of the
   * states, but it should be a close approximation. I think doing
   * it correctly is basically impossible in the context of a single
   * state-specific penalty (i.e. the penalty would have to be per-fragment
   * which would be hard to deal with in the DP functions).
   */ 

  /* allocate and initialize the penalty arrays */
  ESL_ALLOC(trp->g_ptyAA,  sizeof(float *) * NTRPENALTY); 
  for(i = 0; i < NTRPENALTY; i++) trp->g_ptyAA[i]  = NULL;
  ESL_ALLOC(trp->l_ptyAA,  sizeof(float *) * NTRPENALTY); 
  for(i = 0; i < NTRPENALTY; i++) trp->l_ptyAA[i]  = NULL;
  ESL_ALLOC(trp->ig_ptyAA, sizeof(int *)   * NTRPENALTY); 
  for(i = 0; i < NTRPENALTY; i++) trp->ig_ptyAA[i] = NULL;
  ESL_ALLOC(trp->il_ptyAA, sizeof(int *)   * NTRPENALTY); 
  for(i = 0; i < NTRPENALTY; i++) trp->il_ptyAA[i] = NULL;

  /* every row pointer is NULL by now, so a failure part way through
   * this loop leaves no garbage pointers behind for cleanup
   */
  for(i = 0; i < NTRPENALTY; i++) { 
    ESL_ALLOC(trp->g_ptyAA[i],  sizeof(float) * cm->M);
    ESL_ALLOC(trp->l_ptyAA[i],  sizeof(float) * cm->M);
    ESL_ALLOC(trp->ig_ptyAA[i], sizeof(int)   * cm->M);
    ESL_ALLOC(trp->il_ptyAA[i], sizeof(int)   * cm->M);
    esl_vec_FSet(trp->g_ptyAA[i],   cm->M, IMPOSSIBLE);
    esl_vec_FSet(trp->l_ptyAA[i],   cm->M, IMPOSSIBLE);
    esl_vec_ISet(trp->ig_ptyAA[i],  cm->M, -INFTY);
    esl_vec_ISet(trp->il_ptyAA[i],  cm->M, -INFTY);
  }

  /* DumpEmitMap(stdout, cm->emap, cm); */

  /* Calculate local begin probabilities and expected occupancy */
  ESL_ALLOC(begin, sizeof(float) * cm->M);
  cm_CalculateLocalBeginProbs(cm, cm->pbegin, cm->t, begin);
  if((status = cm_ExpectedPositionOccupancy(cm, &mexpocc, &iexpocc, &psi, NULL, NULL, NULL)) != eslOK) goto ERROR;

  /* Fill global and local truncation penalties in a single loop. We
   * step through all nodes and set the truncation penalties for the
   * MATP_MP, MATL_ML, MATR_MR, and BIF_B states and any parent
   * inserts (i1, i2) of those states.
   */
  g_5and3 = 2. / (cm->clen * (cm->clen+1)); /* for global mode: probability of all fragments if we're truncating 5' and 3' */
  g_5or3  = 1. / cm->clen;                  /* for global mode: probability of all fragments if we're only truncating 5' or  3' */

  prv5 = prv3 = prv53 = 0.; /* initialize 'previous' probability values used for calc'ing local truncation penalties */
  for(nd = 0; nd < cm->nodes; nd++) { 
    lpos = (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) ? cm->emap->lpos[nd] : cm->emap->lpos[nd] + 1;
    rpos = (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) ? cm->emap->rpos[nd] : cm->emap->rpos[nd] - 1;

    /* now set penalties for match and insert states m, i1 and maybe i2 (if we're a MATP_MP or BIF_B) */
    if(cm->ndtype[nd] == END_nd) { 
      prv5 = prv3 = prv53 = 0.;
    }
    else if(cm->ndtype[nd] == BEGL_nd || cm->ndtype[nd] == BEGR_nd) {
      /* seed the 'previous' values from the parent BIF_B's probability */
      prv5  = (cm->ndtype[nd] == BEGL_nd) ? 0. : trp->l_ptyAA[TRPENALTY_5P_ONLY][cm->plast[cm->nodemap[nd]]];
      prv3  = (cm->ndtype[nd] == BEGR_nd) ? 0. : trp->l_ptyAA[TRPENALTY_3P_ONLY][cm->plast[cm->nodemap[nd]]];
      prv53 = trp->l_ptyAA[TRPENALTY_5P_AND_3P][cm->plast[cm->nodemap[nd]]];
    }
    else if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BIF_nd) { 
      /* determine match states and insert states that pertain to this node */
      m = cm->nodemap[nd]; /* MATP_MP, MATL_ML, MATR_MR, or BIF_B */
      InsertsGivenNodeIndex(cm, nd-1, &i1, &i2);

      m_psi = psi[m];
      /* ndtype[] holds node types (*_nd), so the MATP test must use
       * MATP_nd; MATP_MP is a state id and belongs with cm->stid[].
       */
      if(cm->ndtype[nd] == MATP_nd) { m_psi += (psi[m+1] + psi[m+2]); } /* include MATP_ML and MATP_MR psi */
      i1_psi = (i1 == -1) ? 0. : psi[i1];
      i2_psi = (i2 == -1) ? 0. : psi[i2]; 
      summed_psi = m_psi + i1_psi + i2_psi; 
      if(ignore_inserts) { 
        i1_psi = i2_psi = 0.;
        summed_psi = m_psi;
      }

      /* Global penalties */
      /* sanity check, we should only set truncation penalty once per state */
      if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][m]))  goto ERROR;
      if((i1 != -1) && NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][i1])) goto ERROR;
      if((i2 != -1) && NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][i2])) goto ERROR;
      /* divide up the probability g_5and3 amongst relevant states m, i1, i2, weighted by psi */
      trp->g_ptyAA[TRPENALTY_5P_AND_3P][m]  = (m_psi  / summed_psi) * g_5and3;
      if(i1 != -1) trp->g_ptyAA[TRPENALTY_5P_AND_3P][i1] = (i1_psi / summed_psi) * g_5and3;
      if(i2 != -1) trp->g_ptyAA[TRPENALTY_5P_AND_3P][i2] = (i2_psi / summed_psi) * g_5and3;

      /* same thing, for 5P only and 3P only */
      if(rpos == cm->clen) { /* else it will remain IMPOSSIBLE */
        trp->g_ptyAA[TRPENALTY_5P_ONLY][m]  = (m_psi / summed_psi) * g_5or3;
        if(i1 != -1) trp->g_ptyAA[TRPENALTY_5P_ONLY][i1] = (i1_psi / summed_psi) * g_5or3;
        if(i2 != -1) trp->g_ptyAA[TRPENALTY_5P_ONLY][i2] = (i2_psi / summed_psi) * g_5or3;
      }
      if(lpos == 1) { /* else it will remain IMPOSSIBLE */
        trp->g_ptyAA[TRPENALTY_3P_ONLY][m]  = (m_psi  / summed_psi) * g_5or3;
        if(i1 != -1) trp->g_ptyAA[TRPENALTY_3P_ONLY][i1] = (i1_psi / summed_psi) * g_5or3;
        if(i2 != -1) trp->g_ptyAA[TRPENALTY_3P_ONLY][i2] = (i2_psi / summed_psi) * g_5or3;
      }

      /* Local penalties */
      subtree_clen = rpos - lpos + 1;
      nfrag5  = subtree_clen;
      nfrag3  = subtree_clen;
      nfrag53 = (subtree_clen * (subtree_clen+1)) / 2;

      /* determine probability of observing a fragment aligned at
       * state m (here, m is what I call t above and in notes) and
       * partition that probability between m and i1 and/or i2 by
       * relative occupancy of match versus inserts
       */
      cur5  = begin[m] / (float) nfrag5  + prv5;
      cur3  = begin[m] / (float) nfrag3  + prv3;
      cur53 = begin[m] / (float) nfrag53 + prv53;

      /* sanity check, we should only set truncation penalty once per state */
      if(NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][m]))  goto ERROR;
      if((i1 != -1) && NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][i1])) goto ERROR;
      if((i2 != -1) && NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][i2])) goto ERROR;

      trp->l_ptyAA[TRPENALTY_5P_AND_3P][m]  = (m_psi  / summed_psi) * cur53;
      if(i1 != -1) trp->l_ptyAA[TRPENALTY_5P_AND_3P][i1] = (i1_psi / summed_psi) * cur53;
      if(i2 != -1) trp->l_ptyAA[TRPENALTY_5P_AND_3P][i2] = (i2_psi / summed_psi) * cur53;

      trp->l_ptyAA[TRPENALTY_5P_ONLY][m]  = (m_psi  / summed_psi) * cur5;
      if(i1 != -1) trp->l_ptyAA[TRPENALTY_5P_ONLY][i1] = (i1_psi / summed_psi) * cur5;
      if(i2 != -1) trp->l_ptyAA[TRPENALTY_5P_ONLY][i2] = (i2_psi / summed_psi) * cur5;

      trp->l_ptyAA[TRPENALTY_3P_ONLY][m]  = (m_psi  / summed_psi) * cur3;
      if(i1 != -1) trp->l_ptyAA[TRPENALTY_3P_ONLY][i1] = (i1_psi / summed_psi) * cur3;
      if(i2 != -1) trp->l_ptyAA[TRPENALTY_3P_ONLY][i2] = (i2_psi / summed_psi) * cur3;

      prv5  = (cm->ndtype[nd] == MATL_nd) ? cur5 : 0.;
      prv3  = (cm->ndtype[nd] == MATR_nd) ? cur3 : 0.;
      prv53 = cur53;
    }
  }

  /* all penalties are currently probabilities, convert them to log
   * probs and set integer penalties (careful, we have to check if
   * IMPOSSIBLE first)
   */
  for(v = 0; v < cm->M; v++) 
    { 
      if((cm->stid[v] == MATP_MP || cm->stid[v] == MATL_ML || cm->stid[v] == MATR_MR || cm->stid[v] == BIF_B) || 
         ((cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) && (! StateIsDetached(cm, v)))) 
        {
          /* Check for rare special case: if we're a MATP_IL and next
           * two states are MATP_IR and END_E, then we won't have set
           * a truncation penalty. This state will keep an impossible
           * truncated begin score, if we did a truncated begin into
           * it we'd just emit from the MATP_IL and then go to the
           * END_E anyway (the MATP_IR will be detached).
           */
          if(cm->stid[v] == MATP_IL && cm->ndtype[cm->ndidx[v]+1] == END_nd) continue;

          /* glocal 5P AND 3P: all of these should have been set to a non-IMPOSSIBLE value */
          if(! NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v])) goto ERROR;
          trp->ig_ptyAA[TRPENALTY_5P_AND_3P][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v], 1.0);
          trp->g_ptyAA[TRPENALTY_5P_AND_3P][v]  = sreLOG2(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v]);
          
          /* glocal 5P only: some may be IMPOSSIBLE */
          if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_ONLY][v])) { 
            trp->ig_ptyAA[TRPENALTY_5P_ONLY][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_5P_ONLY][v], 1.0);
            trp->g_ptyAA[TRPENALTY_5P_ONLY][v]  = sreLOG2(trp->g_ptyAA[TRPENALTY_5P_ONLY][v]);
          }
          /* glocal 3P only: some may be IMPOSSIBLE */
          if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_3P_ONLY][v])) { 
            trp->ig_ptyAA[TRPENALTY_3P_ONLY][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_3P_ONLY][v], 1.0);
            trp->g_ptyAA[TRPENALTY_3P_ONLY][v]  = sreLOG2(trp->g_ptyAA[TRPENALTY_3P_ONLY][v]);
          }

          /* local penalties: all of these should have been set to a
           * non-IMPOSSIBLE value. Check the float probabilities
           * (l_ptyAA) that are about to be converted; the integer
           * array il_ptyAA still holds its initial fill here.
           */
          if(! NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v])) goto ERROR;
          if(! NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_ONLY][v]))   goto ERROR;
          if(! NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_3P_ONLY][v]))   goto ERROR;

          trp->il_ptyAA[TRPENALTY_5P_AND_3P][v] = Prob2Score(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v], 1.0);
          trp->il_ptyAA[TRPENALTY_5P_ONLY][v]   = Prob2Score(trp->l_ptyAA[TRPENALTY_5P_ONLY][v], 1.0);
          trp->il_ptyAA[TRPENALTY_3P_ONLY][v]   = Prob2Score(trp->l_ptyAA[TRPENALTY_3P_ONLY][v], 1.0);
          trp->l_ptyAA[TRPENALTY_5P_AND_3P][v]  = sreLOG2(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v]);
          trp->l_ptyAA[TRPENALTY_5P_ONLY][v]    = sreLOG2(trp->l_ptyAA[TRPENALTY_5P_ONLY][v]);
          trp->l_ptyAA[TRPENALTY_3P_ONLY][v]    = sreLOG2(trp->l_ptyAA[TRPENALTY_3P_ONLY][v]);
        }
    }

  if(ignore_inserts) { 
    if((status = cm_tr_penalties_Validate(trp, cm, 0.0001, errbuf)) != eslOK) { printf("%s", errbuf);  goto ERROR; }
  }

  /* cm_tr_penalties_Dump(stdout, cm, trp); */

  if(mexpocc != NULL) free(mexpocc);
  if(iexpocc != NULL) free(iexpocc);
  if(psi     != NULL) free(psi);
  if(begin   != NULL) free(begin);

  return trp;

 ERROR:
  if(mexpocc != NULL) free(mexpocc);
  if(iexpocc != NULL) free(iexpocc);
  if(psi     != NULL) free(psi);
  if(begin   != NULL) free(begin);
  if(trp     != NULL) cm_tr_penalties_Destroy(trp);

  return NULL;
}
/* Function:  main()
 * Synopsis:  Driver: average each HMM's internal transitions and save the result.
 *
 * Purpose:   Read every HMM from <input hmmfile> (command line
 *            argument 1). For each one, sum the transition vectors
 *            <t[k]> over k=1..M-1, normalize the summed vector in three
 *            separate groups (3 match, 2 insert, 2 delete
 *            transitions), and overwrite every <t[k]>, k=1..M-1, with
 *            that average. The modified HMM is validated, written in
 *            ASCII format to <output hmmfile> (argument 2), and one
 *            line of summary statistics is printed to stdout.
 *
 * Returns:   Does not return normally: calls exit(0) on success and
 *            exit(1) on a command line error. All other failures go
 *            through p7_Fail() or esl_fatal(), which are relied on to
 *            report the message and terminate the program.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS     *go         = NULL;      /* command line processing                 */
  ESL_ALPHABET    *abc        = NULL;
  char            *hmmfile    = NULL;
  char            *outhmmfile = NULL;
  P7_HMMFILE      *hfp        = NULL;
  FILE            *outhmmfp   = NULL;      /* HMM output file handle                  */
  P7_HMM          *hmm        = NULL;
  P7_BG           *bg         = NULL;
  int              nhmm;	
  double           x;
  float            KL;
  int              status;
  int              k;
  char             errbuf[eslERRBUFSIZE];
  float            average_internal_transitions[ p7H_NTRANSITIONS ];

  /* Process the command line options.
   */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || 
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  if (esl_opt_GetBoolean(go, "-h") == TRUE) 
    {
      profillic_p7_banner(stdout, argv[0], banner);
      esl_usage(stdout, argv[0], usage);
      puts("\nOptions:");
      esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/
      exit(0);
    }
  if (esl_opt_ArgNumber(go) != 2) 
    {
      puts("Incorrect number of command line arguments.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) 
    {
      puts("Failed to read <input hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((outhmmfile = esl_opt_GetArg(go, 2)) == NULL) 
    {
      puts("Failed to read <output hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  profillic_p7_banner(stdout, argv[0], banner);
  
  /* Initializations: open the input HMM file for reading
   */
  status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);  

  /* Initializations: open the output HMM file for writing.
   * <status> is eslOK at this point, so failing with it would end the
   * program silently with a success code; report and fail instead.
   */
  if ((outhmmfp = fopen(outhmmfile, "w")) == NULL) p7_Fail("Failed to open HMM file %s for writing\n", outhmmfile);

  /* Main body: read HMMs one at a time, print one line of stats
   */
  printf("#\n");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "idx",  "name",                 "accession",    "nseq",     "eff_nseq", "M",      "relent", "info",   "p relE", "compKL");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------");

  nhmm = 0;
  while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) 
    {
      if      (status == eslEOD)       esl_fatal("read failed, HMM file %s may be truncated?", hmmfile);
      else if (status == eslEFORMAT)   esl_fatal("bad file format in HMM file %s",             hmmfile);
      else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets",   hmmfile);
      else if (status != eslOK)        esl_fatal("Unexpected error in reading HMMs from %s",   hmmfile);
      nhmm++;

      if (bg == NULL) bg = p7_bg_Create(abc);

      /* Sum the transition vectors of nodes 1..M-1 ... */
      esl_vec_FSet(average_internal_transitions, p7H_NTRANSITIONS, 0.);
      for( k = 1; k < hmm->M; k++ ) {
        esl_vec_FAdd(average_internal_transitions, hmm->t[k], p7H_NTRANSITIONS);
      }
      /* ... and renormalize each group of the summed vector on its own. */
      // Match transitions
      esl_vec_FNorm(average_internal_transitions, 3);
      // Insert transitions
      esl_vec_FNorm(average_internal_transitions + 3, 2);
      // Delete transitions
      esl_vec_FNorm(average_internal_transitions + 5, 2);
      // Ok now set them.
      for( k = 1; k < hmm->M; k++ ) {
        esl_vec_FCopy( average_internal_transitions, p7H_NTRANSITIONS, hmm->t[k] );
      }

      /* Fail loudly: returning a status from main() would drop the message. */
      if ((status = p7_hmm_Validate(hmm, errbuf, 0.0001))     != eslOK) esl_fatal("HMM %s failed validation after averaging transitions: %s", hmm->name, errbuf);
      if ((status = p7_hmmfile_WriteASCII(outhmmfp, -1, hmm)) != eslOK) esl_fatal("HMM save failed, writing to %s", outhmmfile);
  
      p7_MeanPositionRelativeEntropy(hmm, bg, &x); 
      p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL);

      printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f\n",
	     nhmm,
	     hmm->name,
	     hmm->acc == NULL ? "-" : hmm->acc,
	     hmm->nseq,
	     hmm->eff_nseq,
	     hmm->M,
	     p7_MeanMatchRelativeEntropy(hmm, bg),
	     p7_MeanMatchInfo(hmm, bg),
	     x,
	     KL);

	     /*	     p7_MeanForwardScore(hmm, bg)); */

      p7_hmm_Destroy(hmm);
    }

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  p7_hmmfile_Close(hfp);
  /* fclose() flushes buffered output; a failure here means the saved file is incomplete */
  if (fclose(outhmmfp) != 0) esl_fatal("Failed to close HMM file %s after writing", outhmmfile);
  esl_getopts_Destroy(go);
  exit(0);
}
Пример #17
0
/* Function:  p7_GNull2_ByExpectation()
 * Synopsis:  Calculate null2 model from posterior probabilities.
 * Incept:    SRE, Thu Feb 28 09:52:28 2008 [Janelia]
 *
 * Purpose:   Compute the "null2" odds ratios $\frac{f'{x}}{f{x}}$ for
 *            the envelope covered by posterior probability matrix
 *            <pp> of profile <gm>, and store them in <null2>, which
 *            the caller provides with room for at least
 *            <alphabet->Kp> residues.
 *
 *            This expectation method is meant for envelopes in
 *            simple, well resolved regions (a single envelope, no
 *            stochastic traceback clustering needed).
 *
 *            The caller must have computed <pp> for the envelope
 *            alone, so its rows are numbered <1..Ld> for an envelope
 *            <ienv..jenv> of length <Ld=jenv-ienv+1>.
 *
 * Args:      gm    - profile, in any mode, target length model set to <L>
 *            pp    - posterior prob matrix, for <gm> against domain envelope <dsq+i-1> (offset)
 *            null2 - RETURN: null2 odds ratios per residue; <0..Kp-1>; caller allocated space
 *
 * Returns:   <eslOK> on success; <null2> holds the null2 scores. Row 0
 *            of <pp> is overwritten as scratch space: on return it
 *            holds the log of the per-position average of rows
 *            <1..Ld>, for both the main cells and the special states.
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_GNull2_ByExpectation(const P7_PROFILE *gm, P7_GMX *pp, float *null2)
{
  /* The MMX()/IMX()/XMX() accessor macros are expected to expand to
   * uses of locals named <dp> and <xmx>; keep these two names. */
  float  **dp       = pp->dp;
  float   *xmx      = pp->xmx;
  int      M        = gm->M;
  int      Ld       = pp->L;
  int      K        = gm->abc->K;               /* number of canonical residues           */
  int      ncells   = (M+1) * p7G_NSCELLS;      /* width of one main-state row            */
  float    neglogLd = -log((float) Ld);         /* turns summed counts into a mean (log)  */
  float    specials;                            /* log weight shared by N, C and J        */
  float    acc;                                 /* running log-sum for the current symbol */
  int      sym;                                 /* over symbols 0..K-1                    */
  int      row;                                 /* over envelope rows 2..Ld               */
  int      k;                                   /* over model nodes                       */

  /* Accumulate rows 1..Ld into scratch row 0: start from a copy of
   * row 1, then add each of the remaining rows in turn.
   */
  esl_vec_FCopy(dp[1],             ncells,      dp[0]);
  esl_vec_FCopy(xmx + p7G_NXCELLS, p7G_NXCELLS, xmx);
  for (row = 2; row <= Ld; row++)
    {
      esl_vec_FAdd(dp[0], dp[row],                 ncells);
      esl_vec_FAdd(xmx,   xmx + row * p7G_NXCELLS, p7G_NXCELLS);
    }

  /* Move to log space, then subtract log(Ld) so that row 0 holds log
   * frequencies; these serve as the log posterior weights below.
   */
  esl_vec_FLog(dp[0], ncells);
  esl_vec_FLog(xmx,   p7G_NXCELLS);

  esl_vec_FIncrement(dp[0], ncells,      neglogLd);
  esl_vec_FIncrement(xmx,   p7G_NXCELLS, neglogLd);

  /* N, C and J contribute the same term to every symbol, so combine
   * their log weights once, up front.
   */
  specials = XMX(0,p7G_N);
  specials = p7_FLogsum(specials, XMX(0,p7G_C));
  specials = p7_FLogsum(specials, XMX(0,p7G_J));

  /* For each canonical symbol, log-sum the weighted emission scores in
   * the order M1, I1, ..., M(M-1), I(M-1), then the final match state,
   * then the specials. Slow, but simple.
   */
  for (sym = 0; sym < K; sym++)
    {
      acc = -eslINFINITY;
      for (k = 1; k < M; k++)
        {
          acc = p7_FLogsum(acc, MMX(0,k) + p7P_MSC(gm, k, sym));
          acc = p7_FLogsum(acc, IMX(0,k) + p7P_ISC(gm, k, sym));
        }
      /* <k> is deliberately reused with its value on loop exit for
       * the emission score of the last match state */
      acc = p7_FLogsum(acc, MMX(0,M) + p7P_MSC(gm, k, sym));
      acc = p7_FLogsum(acc, specials);
      null2[sym] = acc;
    }

  /* Back out of log space: null2[x] is now \frac{f_d(x)}{f_0(x)} for
   * each x in 0..K-1, with f_d(x) the ad hoc "null2" residue
   * frequencies of this envelope.
   */
  esl_vec_FExp(null2, K);

  /* Degenerate residues get averaged odds ratios; that call leaves
   * gap, nonresidue and missing data unset, so fix them at 1.0 here.
   */
  esl_abc_FAvgScVec(gm->abc, null2);
  null2[gm->abc->K]    = 1.0;        /* gap character    */
  null2[gm->abc->Kp-2] = 1.0;        /* nonresidue "*"   */
  null2[gm->abc->Kp-1] = 1.0;        /* missing data "~" */

  return eslOK;
}
Пример #18
0
/* Function:  p7_bg_Read()
 * Synopsis:  Read background frequencies from a file.
 *
 * Purpose:   Read new background frequencies from file <bgfile>,
 *            overwriting the frequencies previously in the 
 *            <P7_BG> object <bg>.
 *            
 *            Note that <bg> is already created by the caller, not
 *            created here. Also note that <p7_bg_Read()> only reads
 *            residue background frequencies used for the "null
 *            model", whereas a <P7_BG> object contains additional
 *            information for the bias filter and for the biased
 *            composition correction.
 *
 *            The file is parsed with '#' as the comment character.
 *            Its first token names the alphabet type, which must
 *            match <bg->abc>. Each following line holds exactly two
 *            tokens: one canonical residue letter and its frequency.
 *            Every canonical residue must appear exactly once, each
 *            frequency must be nonnegative, and the frequencies must
 *            sum to 1.0 within a tolerance of 0.001; they are
 *            renormalized before being stored.
 *            
 * Args:      bgfile  - file to read.
 *            bg      - existing <P7_BG> object provided by the caller.
 *            errbuf  - OPTIONAL: space for an error message, upon parse errors; or NULL.
 *
 * Returns:   <eslOK> on success, and background frequencies in <bg>
 *            are overwritten.
 * 
 *            <eslENOTFOUND> if <bgfile> can't be opened for reading.
 *            <eslEFORMAT> if parsing of <bgfile> fails for some
 *            reason.  In both cases, <errbuf> contains a
 *            user-directed error message upon return, including (if
 *            relevant) the file name <bgfile> and the line number on
 *            which an error was detected. <bg> is unmodified.
 *
 * Throws:    <eslEMEM> on allocation failure; <bg> is unmodified,
 *            and <errbuf> is empty.
 */
int
p7_bg_Read(char *bgfile, P7_BG *bg, char *errbuf)
{
  ESL_FILEPARSER *efp   = NULL;
  float          *fq    = NULL;      /* frequencies parsed so far; -1.0 marks "not seen yet" */
  int             n     = 0;         /* number of residue lines accepted                      */
  char           *tok;
  int             toklen;
  int             alphatype;
  ESL_DSQ         x;
  int             status;

  if (errbuf) errbuf[0] = '\0';

  status =  esl_fileparser_Open(bgfile, NULL, &efp);
  if      (status == eslENOTFOUND) ESL_XFAIL(eslENOTFOUND, errbuf, "couldn't open bg file  %s for reading", bgfile);
  else if (status != eslOK)        goto ERROR;

  esl_fileparser_SetCommentChar(efp, '#');

  /* First token is alphabet type: amino | DNA | RNA */
  status = esl_fileparser_GetToken(efp, &tok, &toklen);
  if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
  else if (status != eslOK)  goto ERROR;

  alphatype = esl_abc_EncodeType(tok);
  if      (alphatype == eslUNKNOWN)    ESL_XFAIL(eslEFORMAT, errbuf, "expected alphabet type but saw \"%s\" [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
  else if (alphatype != bg->abc->type) ESL_XFAIL(eslEFORMAT, errbuf, "bg file's alphabet is %s; expected %s [line %d, %s]", tok, esl_abc_DecodeType(bg->abc->type), efp->linenumber, bgfile);
  
  /* Parse into a scratch vector so <bg> stays untouched until every check has passed. */
  ESL_ALLOC(fq, sizeof(float) * bg->abc->K);
  esl_vec_FSet(fq, bg->abc->K, -1.0);

  while ((status = esl_fileparser_NextLine(efp)) == eslOK)
    {
      /* field 1: a single canonical residue letter, not seen before */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;

      if      (toklen != 1 ||   ! esl_abc_CIsCanonical(bg->abc, *tok))
	ESL_XFAIL(eslEFORMAT, errbuf, "expected to parse a residue letter; saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      x = esl_abc_DigitizeSymbol(bg->abc, *tok);
      if (fq[x] != -1.0)         ESL_XFAIL(eslEFORMAT, errbuf, "already parsed probability of %c [line %d of bgfile %s]", bg->abc->sym[x], efp->linenumber, bgfile);
      n++;

      /* field 2: its frequency */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file, expected a probability [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;
      if (! esl_str_IsReal(tok)) ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      fq[x] = atof(tok);

      /* A negative value is not a probability; moreover a literal -1.0
       * would be indistinguishable from the "not seen yet" marker and
       * would defeat the duplicate check above.
       */
      if (fq[x] < 0.0)           ESL_XFAIL(eslEFORMAT, errbuf, "expected a nonnegative probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      /* nothing else is allowed on the line */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslOK)  ESL_XFAIL(eslEFORMAT, errbuf, "extra unexpected data found [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslEOL) goto ERROR;
    }
  if (status != eslEOF) goto ERROR;

  if ( n != bg->abc->K) 
    ESL_XFAIL(eslEFORMAT, errbuf, "expected %d residue frequencies, but found %d in bgfile %s", bg->abc->K, n, bgfile);
  if ( esl_FCompare(esl_vec_FSum(fq, bg->abc->K), 1.0, 0.001) != eslOK) 
    ESL_XFAIL(eslEFORMAT, errbuf, "residue frequencies do not sum to 1.0 in bgfile %s", bgfile);
  
  /* all checking complete. no more error cases. overwrite bg with the new frequencies */
  esl_vec_FNorm(fq, bg->abc->K);
  esl_vec_FCopy(fq, bg->abc->K, bg->f);

  free(fq);
  esl_fileparser_Close(efp);
  return eslOK;

 ERROR:
  if (fq)  free(fq);
  if (efp) esl_fileparser_Close(efp);
  return status;
}
Пример #19
0
/* Function:  p7_profile_Create()
 * Synopsis:  Allocates a profile.
 *
 * Purpose:   Allocates for a profile of up to <allocM> nodes, for digital
 *            alphabet <abc>.
 *            
 *            Because this function might be in the critical path (in
 *            hmmscan, for example), we leave much of the model
 *            uninitialized, including scores and length model
 *            probabilities. The <p7_profile_Config()> call is what
 *            sets these.
 *            
 *            The reference pointer <gm->abc> is set to <abc>.
 *
 * Args:      allocM - maximum number of model nodes to allocate for
 *            abc    - digital alphabet; kept by reference in <gm->abc>
 *
 * Returns:   a pointer to the newly allocated profile. Its <M> is 0,
 *            and its length/mode-related fields, E-value parameters,
 *            cutoffs and composition carry their "unset" values.
 *
 * Throws:    <NULL> on allocation error.
 */
P7_PROFILE *
p7_profile_Create(int allocM, const ESL_ALPHABET *abc)
{
  P7_PROFILE *gm = NULL;
  int         x;
  int         status;		/* set by ESL_ALLOC() on failure */

  /* level 0: null every owned pointer first, so the ERROR path can hand a
   * partially built profile to p7_profile_Destroy().
   */
  ESL_ALLOC(gm, sizeof(P7_PROFILE));
  gm->tsc       = NULL;
  gm->rsc       = NULL;
  gm->name      = NULL;
  gm->acc       = NULL;
  gm->desc      = NULL;
  gm->rf        = NULL;
  gm->mm        = NULL;
  gm->cs        = NULL;
  gm->consensus = NULL;

  /* level 1 */
  ESL_ALLOC(gm->tsc,       sizeof(float)   * (allocM+1) * p7P_NTRANS); /* 0..M */
  ESL_ALLOC(gm->rsc,       sizeof(float *) * abc->Kp);
  /* Null rsc[0] right away: if one of the allocations below fails, cleanup
   * must not see an uninitialized pointer in the freshly allocated rsc[].
   * (Presumably p7_profile_Destroy() inspects rsc[0]; it is not shown here.)
   */
  gm->rsc[0] = NULL;
  ESL_ALLOC(gm->rf,        sizeof(char)    * (allocM+2)); /* yes, +2: each is (0)1..M, +trailing \0  */
  ESL_ALLOC(gm->mm,        sizeof(char)    * (allocM+2));
  ESL_ALLOC(gm->cs,        sizeof(char)    * (allocM+2));
  ESL_ALLOC(gm->consensus, sizeof(char)    * (allocM+2));
  
  /* level 2: one contiguous block of emission scores; rsc[x] points at residue x's slice */
  ESL_ALLOC(gm->rsc[0], sizeof(float) * abc->Kp * (allocM+1) * p7P_NR);
  for (x = 1; x < abc->Kp; x++) 
    gm->rsc[x] = gm->rsc[0] + x * (allocM+1) * p7P_NR;

  /* Initialization of tsc[0], including removal of I0.  tsc[k-1,LM],tsc[k-1,GM] will be configured + overwritten later */
  esl_vec_FSet(gm->tsc, p7P_NTRANS, -eslINFINITY);  
  /* tsc[M] initialized and Im removed when we know actual M : see modelconfig.c */

  for (x = 0; x < abc->Kp; x++) {        
    P7P_MSC(gm, 0, x) = -eslINFINITY;                  /* no emissions from nonexistent M_0... */
    P7P_ISC(gm, 0, x) = -eslINFINITY;                  /* nor I_0...                           */
    /* I_M is initialized in profile config, when we know actual M, not just allocated max M   */
  }
  x = esl_abc_XGetGap(abc);	                       /* no emission can emit/score gap characters */
  esl_vec_FSet(gm->rsc[x], (allocM+1)*p7P_NR, -eslINFINITY);
  x = esl_abc_XGetMissing(abc);	                       /* no emission can emit/score missing data characters */
  esl_vec_FSet(gm->rsc[x], (allocM+1)*p7P_NR, -eslINFINITY);


  /* Set remaining info. <name>, <acc>, <desc> were already set to NULL at level 0. */
  gm->M                = 0;
  gm->allocM           = allocM;
  gm->L                = -1;  	   /* "unset" flag */
  gm->nj               = -1.0f;    /* "unset" flag */
  gm->pglocal          = -1.0f;    /* "unset" flag */

  gm->roff             = -1;
  gm->eoff             = -1;
  gm->offs[p7_MOFFSET] = -1;
  gm->offs[p7_FOFFSET] = -1;
  gm->offs[p7_POFFSET] = -1;

  gm->rf[0]            = 0;     /* RF line is optional annotation; this flags that it's not set yet */
  gm->mm[0]            = 0;     /* likewise for MM annotation line */
  gm->cs[0]            = 0;     /* likewise for CS annotation line */
  gm->consensus[0]     = 0;
  
  for (x = 0; x < p7_NEVPARAM; x++) gm->evparam[x] = p7_EVPARAM_UNSET;
  for (x = 0; x < p7_NCUTOFFS; x++) gm->cutoff[x]  = p7_CUTOFF_UNSET;
  for (x = 0; x < p7_MAXABET;  x++) gm->compo[x]   = p7_COMPO_UNSET;

  gm->max_length  = -1;		/* "unset" */
  gm->abc         = abc;

  return gm;

 ERROR:
  p7_profile_Destroy(gm);
  return NULL;
}
Пример #20
0
/* glocal_region_trace_ensemble()
 * EPN, Tue Oct  5 10:13:25 2010
 *
 * Based on p7_domaindef.c's region_trace_ensemble(). Modified so that
 * generic matrices (which can be used for glocally configured models)
 * can be used. An additional parameter <do_null2> has been added,
 * so that null2-related calculations are only done if necessary.
 * That is, they're skipped if null2 has been turned off in the pipeline.
 * 
 * Notes from p7_domaindef.c::region_trace_ensemble():
 *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * SRE, Fri Feb  8 11:49:44 2008 [Janelia]
 *
 * Here, we've decided that region <ireg>..<jreg> in sequence <dsq> might be
 * composed of more than one domain, and we're going to use clustering
 * of a posterior ensemble of stochastic tracebacks to sort it out.
 * 
 * Caller provides a filled Forward matrix in <fwd> for the sequence
 * region <dsq+ireg-1>, length <jreg-ireg+1>, for the model <om>
 * configured in multihit mode with its target length distribution
 * set to the total length of <dsq>: i.e., the same model
 * configuration used to score the complete sequence (if it weren't
 * multihit, we wouldn't be worried about multiple domains).
 * 
 * Caller also provides a DP matrix in <wrk> containing at least one
 * row, for use as temporary workspace. (This will typically be the
 * caller's Backwards matrix, which we haven't yet used at this point
 * in the processing pipeline.)
 * 
 * Caller provides <ddef>, which defines heuristic parameters that
 * control the clustering, and provides working space for the
 * calculation and the answers. The <ddef->sp> object must have been
 * reused (i.e., it needs to be fresh; we're going to use it here);
 * the caller needs to Reuse() it specifically, because it can't just
 * Reuse() the whole <ddef>, when it's in the process of analyzing
 * regions.
 * 
 * Upon return, <*ret_nc> contains the number of clusters that were
 * defined.
 * 
 * The caller can retrieve info on each cluster by calling
 * <p7_spensemble_GetClusterCoords(ddef->sp...)> on the
 * <P7_SPENSEMBLE> object in <ddef>.
 * 
 * Other information on what's happened in working memory:
 * 
 * <ddef->n2sc[ireg..jreg]> now contains log f'(x_i) / f(x_i) null2 scores
 *    for each residue.
 *
 * <ddef->sp> gets filled in, and upon return, it's holding the answers 
 *    (the cluster definitions). When the caller is done retrieving those
 *    answers, it needs to <p7_spensemble_Reuse()> it before calling
 *    <region_trace_ensemble()> again.
 *    
 * <ddef->tr> is used as working memory for sampled traces.
 *    
 * <wrk> has had its zero row clobbered as working space for a null2 calculation.
 *~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
static int
glocal_region_trace_ensemble(P7_DOMAINDEF *ddef, const P7_PROFILE *gm, const ESL_DSQ *dsq, int ireg, int jreg, 
			     const P7_GMX *fwd, P7_GMX *wrk, int do_null2, int *ret_nc)
{
  P7_TRACE            *tr     = ddef->tr;         /* scratch trace, refilled for every sample           */
  float               *n2sc   = ddef->n2sc;       /* per-residue null2 accumulator, full-sequence coords */
  struct p7_spcoord_s *sigc;                      /* cluster list; fetched after clustering              */
  int                  reglen = jreg - ireg + 1;  /* number of residues in the region                    */
  float                null2[p7_MAXCODE];
  int                  s;                         /* index over sampled traces                           */
  int                  dom;                       /* index over domains in one sampled trace             */
  int                  pos;                       /* region-relative residue position, 1..reglen         */
  int                  a, b;                      /* indices over clusters                               */
  int                  nclust;                    /* number of clusters before pruning                   */
  int                  nkept;                     /* number of clusters surviving pruning                */
  int                  overlap, shorter;
  int                  loser;

  /* Start the region's null2 accumulators from zero. */
  esl_vec_FSet(n2sc + ireg, reglen, 0.0);

  /* Reset the RNG to its original seed if asked to, so the sampled
   * ensemble (and hence the result) is reproducible.
   */
  if (ddef->do_reseeding) 
    esl_randomness_Init(ddef->r, esl_randomness_GetSeed(ddef->r));

  /* Sample traces. Every domain of every trace goes into the ensemble;
   * null2 odds ratios are accumulated per residue only when <do_null2> is set.
   */
  for (s = 0; s < ddef->nsamples; s++)
    {
      p7_GStochasticTrace(ddef->r, dsq+ireg-1, reglen, gm, fwd, tr);
      p7_trace_Index(tr);

      pos = 1;
      for (dom = 0; dom < tr->ndom; dom++)
	{
	  /* trace coords are region-relative; shift them to full-sequence coords */
	  p7_spensemble_Add(ddef->sp, s, tr->sqfrom[dom]+ireg-1, tr->sqto[dom]+ireg-1, tr->hmmfrom[dom], tr->hmmto[dom]);
	  if (! do_null2) continue;

	  p7_GNull2_ByTrace(gm, tr, tr->tfrom[dom], tr->tto[dom], wrk, null2);

	  /* residues before the domain count as ratio 1: f'(x) = f(x) there.
	   * NOTE(review): the bound is <=, so position sqfrom[dom] itself is
	   * also bumped by 1 rather than by its null2 ratio - confirm intended.
	   */
	  while (pos <= tr->sqfrom[dom]) { n2sc[ireg+pos-1] += 1.0; pos++; }

	  /* residues inside the domain count as their null2 ratio */
	  while (pos <= tr->sqto[dom])   { n2sc[ireg+pos-1] += null2[dsq[ireg+pos-1]]; pos++; }
	}

      /* whatever is left of the region after the last domain counts as ratio 1 */
      if (do_null2)
	while (pos <= reglen) { n2sc[ireg+pos-1] += 1.0; pos++; }

      p7_trace_Reuse(tr);
    }

  /* Average the accumulated ratios over the samples and take the log. */
  if (do_null2)
    for (pos = ireg; pos <= jreg; pos++)
      n2sc[pos] = logf(n2sc[pos] / (float) ddef->nsamples);

  /* Cluster the ensemble into envelopes. */
  p7_spensemble_Cluster(ddef->sp, ddef->min_overlap, ddef->of_smaller, ddef->max_diagdiff, ddef->min_posterior, ddef->min_endpointp, &nclust);
  sigc = ddef->sp->sigc;

  /* <assignment> is overloaded from here on: 1 flags a "dominated" cluster. */
  for (a = 0; a < nclust; a++) 
    ddef->sp->assignment[a] = 0;

  /* For each pair of clusters whose sequence overlap is at least 0.8 of the
   * shorter one, flag the one that does not have the strictly higher prob.
   */
  for (a = 0; a < nclust; a++)
    for (b = a+1; b < nclust; b++)
      {
	overlap = ESL_MIN(sigc[a].j, sigc[b].j) - ESL_MAX(sigc[a].i, sigc[b].i) + 1;
	/* NOTE(review): only an overlap of exactly 0 ends the scan for <a>;
	 * a negative value (a gap between the two) keeps scanning - confirm.
	 */
	if (overlap == 0) break;

	shorter = ESL_MIN(sigc[a].j - sigc[a].i + 1,  sigc[b].j - sigc[b].i + 1);
	if ((float) overlap / (float) shorter >= 0.8)
	  {
	    loser = (sigc[a].prob > sigc[b].prob) ? b : a;
	    ddef->sp->assignment[loser] = 1;
	  }
      }
      
  /* Compact the cluster list in place, dropping flagged clusters. */
  nkept = 0;
  for (a = 0; a < nclust; a++)
    {
      if (ddef->sp->assignment[a]) continue;
      if (nkept != a) memcpy(sigc + nkept, sigc + a, sizeof(struct p7_spcoord_s));
      nkept++;
    }
  ddef->sp->nc = nkept;
  *ret_nc      = nkept;
  return eslOK;
}
Пример #21
0
/* Function:  p7_domaindef_GlocalByPosteriorHeuristics()
 * Synopsis:  Define glocal domains in a sequence using posterior probs.
 * Incept:    EPN, Tue Oct  5 10:02:34 2010         
 *            SRE, Sat Feb 23 08:17:44 2008 [Janelia] (p7_domaindef_ByPosteriorHeuristics())
 *
 * Purpose:   Given a sequence <sq> and model <gm> for which we have
 *            already calculated Forward and Backward parsing
 *            matrices <gxf> and <gxb>; use posterior probability
 *            heuristics to determine an annotated domain structure;
 *            and for each domain found, score it (with null2
 *            calculations) and obtain an optimal accuracy alignment,
 *            using <fwd> and <bck> matrices as workspace for the
 *            necessary full-matrix DP calculations. Caller provides a
 *            new or reused <ddef> object to hold these results.
 *            
 *            As a special case, if the profile is in unihit mode
 *            upon entering, we don't ever modify its configuration.
 *            This is especially important if this function is 
 *            being used within a search/scan pipeline with a 
 *            specially configured p7 profile in which N->N and/or
 *            C->C transitions have been set to IMPOSSIBLE. (If
 *            we were to call ReconfigLength() on such a profile
 *            we would make those transitions possible.) 
 *
 *            One case in which profile reconfiguration is necessary
 *            is when multiple domains are suspected. However, we
 *            guard against this if the profile enters in unihit mode
 *            by not allowing multiple domains (in fact, it should
 *            never happen because J states are unreachable in unihit
 *            profiles). If multiple domains are suspected in this case,
 *            we return eslEINCONCEIVABLE.
 * 
 *            Upon return, <ddef> contains the definitions of all the
 *            domains: their bounds, their null-corrected Forward
 *            scores, and their optimal posterior accuracy alignments.
 *            
 *            <do_null2> is TRUE if we'll eventually apply a null2
 *            penalty, FALSE if not. If FALSE, we can save time by
 *            skipping Backward calls at some stages.
 *
 * Args:      sq       - target sequence; <sq->n> and <sq->dsq> are used
 *            gm       - profile; unless it enters in unihit mode, it is
 *                       reconfigured while regions are processed and put
 *                       back to multihit with its entry length on return
 *            gxf      - Forward matrix, passed to <p7_GDomainDecoding()>
 *            gxb      - Backward matrix, passed to <p7_GDomainDecoding()>
 *            fwd      - workspace matrix, grown to fit each region
 *            bck      - workspace matrix, grown to fit each region
 *            ddef     - domain definition object that receives the results
 *            do_null2 - TRUE to do null2-related calculations
 *
 * Returns:   <eslOK> on success.           
 *
 *            <eslERANGE> on numeric overflow in posterior
 *            decoding. This should not be possible for multihit
 *            models.
 *
 *            <eslEINCONCEIVABLE> if profile enters as unihit but
 *            multiple domains are suspected.
 *
 *            Any non-<eslOK> status from <p7_domaindef_GrowTo()>,
 *            <p7_GDomainDecoding()> or <p7_gmx_GrowTo()> is passed
 *            through unchanged (presumably <eslEMEM> for the resizing
 *            calls; their definitions are not shown here).
 */
int
p7_domaindef_GlocalByPosteriorHeuristics(const ESL_SQ *sq, P7_PROFILE *gm, 
					 P7_GMX *gxf, P7_GMX *gxb, P7_GMX *fwd, P7_GMX *bck, 
					 P7_DOMAINDEF *ddef, int do_null2)
{
  int i, j;
  int triggered;
  int d;
  int i2,j2;
  int last_j2;
  int nc;
  int saveL     = gm->L;	/* Save the length config of <gm>; will restore upon return */
  int save_mode = gm->mode;	/* Likewise for the mode. */
  int status;
  int save_mode_is_unihit;
  
  save_mode_is_unihit = (p7_IsMulti(save_mode)) ? FALSE : TRUE; /* if save_mode_is_unihit is TRUE, we never modify profile's configuration (length nor mode) */

  /* Nothing has been reconfigured yet, so these two failures can return directly. */
  if ((status = p7_domaindef_GrowTo(ddef, sq->n))       != eslOK) return status;  /* ddef's btot,etot,mocc now ready for seq of length n */
  if ((status = p7_GDomainDecoding(gm, gxf, gxb, ddef)) != eslOK) return status;  /* ddef->{btot,etot,mocc} now made.                    */

  esl_vec_FSet(ddef->n2sc, sq->n+1, 0.0);          /* ddef->n2sc null2 scores are initialized                        */
  ddef->nexpected = ddef->btot[sq->n];             /* posterior expectation for # of domains (same as etot[sq->n])   */

  if(! save_mode_is_unihit) p7_ReconfigUnihit(gm, saveL); /* process each domain in unihit mode, regardless of gm->mode     */
  i     = -1;
  triggered = FALSE;
  for (j = 1; j <= sq->n; j++)
    {
      if (! triggered) 
	{			/* xref J2/101 for what the logic below is: */
	  if       (ddef->mocc[j] - (ddef->btot[j] - ddef->btot[j-1]) <  ddef->rt2) i = j;
	  else if  (i == -1)                                                        i = j;
	  if       (ddef->mocc[j]                                     >= ddef->rt1) triggered = TRUE;  
	} 
      else if (ddef->mocc[j] - (ddef->etot[j] - ddef->etot[j-1])  <  ddef->rt2) 
	{
	  /* We have a region i..j to evaluate. The workspace matrices must
	   * really have been resized before any DP runs on them, so a failed
	   * resize ends the run instead of being ignored.
	   */
	  if ((status = p7_gmx_GrowTo(fwd, gm->M, j-i+1)) != eslOK) goto DONE;
	  if ((status = p7_gmx_GrowTo(bck, gm->M, j-i+1)) != eslOK) goto DONE;
	  ddef->nregions++;
	  if (is_multidomain_region(ddef, i, j))
	    {
	      /* A unihit profile is never reconfigured, so DONE leaves it as is. */
	      if(save_mode_is_unihit) { status = eslEINCONCEIVABLE; goto DONE; }

	      /* This region appears to contain more than one domain, so we have to 
               * resolve it by cluster analysis of posterior trace samples, to define
               * one or more domain envelopes.
	       */
	      ddef->nclustered++;

	      /* Resolve the region into domains by stochastic trace
	       * clustering; assign position-specific null2 model by
	       * stochastic trace clustering; there is redundancy
	       * here; we will consolidate later if null2 strategy
	       * works
	       */
	      p7_ReconfigMultihit(gm, saveL);
	      p7_GForward(sq->dsq+i-1, j-i+1, gm, fwd, NULL);
	      glocal_region_trace_ensemble(ddef, gm, sq->dsq, i, j, fwd, bck, do_null2, &nc);
	      p7_ReconfigUnihit(gm, saveL);
	      /* ddef->n2sc is now set on i..j by the traceback-dependent method */

	      last_j2 = 0;
	      for (d = 0; d < nc; d++) {
		p7_spensemble_GetClusterCoords(ddef->sp, d, &i2, &j2, NULL, NULL, NULL);
		if (i2 <= last_j2) ddef->noverlaps++;

		/* Note that k..m coords on model are available, but
                 * we're currently ignoring them.  This leads to a
                 * rare clustering bug that we eventually need to fix
                 * properly [xref J3/32]: two different regions in one
                 * profile HMM might have hit same seq domain, and
                 * when we now go to calculate an OA trace, nothing
                 * constrains us to find the two different alignments
                 * to the HMM; in fact, because OA is optimal, we'll
                 * find one and the *same* alignment, leading to an
                 * apparent duplicate alignment in the output.
                 * 
                 * Registered as #h74, Dec 2009, after EBI finds and
                 * reports it.  #h74 is worked around in p7_tophits.c
                 * by hiding all but one envelope with an identical
                 * alignment, in the rare event that this
                 * happens. [xref J5/130].
		 */
		ddef->nenvelopes++;
		/* only an envelope that rescored OK advances the overlap marker */
		if (glocal_rescore_isolated_domain(ddef, gm, sq, fwd, bck, i2, j2, TRUE, do_null2, FALSE) == eslOK) 
		  last_j2 = j2;
	      }
	      p7_spensemble_Reuse(ddef->sp);
	      p7_trace_Reuse(ddef->tr);
	    }
	  else 
	    {
	      /* The region looks simple, single domain; convert the region to an envelope. */
	      ddef->nenvelopes++;
	      glocal_rescore_isolated_domain(ddef, gm, sq, fwd, bck, i, j, FALSE, do_null2, FALSE);
	    }
	  i     = -1;
	  triggered = FALSE;
	}
    }
  status = eslOK;

 DONE:
  /* If profile was unihit upon entrance, we didn't modify its configuration (length nor mode),
   * else restore it to its original multihit mode, and to its original length model.
   * This runs on the error exits too, so the caller gets the profile back as it was.
   */
  if (! save_mode_is_unihit) { 
    p7_ReconfigMultihit(gm, saveL); 
  }

  return status;
}