Beispiel #1
0
/* D(i,k) is reached from M(i, k-1) or D(i,k-1). */
static inline int
select_d(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell D(i,k) */
  int     r     = (k-1) / Q;
  __m128  zerov = _mm_setzero_ps();
  __m128  mpv, dpv;
  __m128  tmdv, tddv;
  union { __m128 v; float p[4]; } u;
  float   path[2];
  int     state[2] = { p7T_M, p7T_D };

  if (q > 0) {
    mpv  = ox->dpf[i][(q-1)*3 + p7X_M];
    dpv  = ox->dpf[i][(q-1)*3 + p7X_D];
    tmdv = om->tfv[7*(q-1) + p7O_MD];
    tddv = om->tfv[7*Q + (q-1)];
  } else {
    mpv  = esl_sse_rightshift_ps(ox->dpf[i][(Q-1)*3 + p7X_M], zerov);
    dpv  = esl_sse_rightshift_ps(ox->dpf[i][(Q-1)*3 + p7X_D], zerov);
    tmdv = esl_sse_rightshift_ps(om->tfv[7*(Q-1) + p7O_MD],   zerov);
    tddv = esl_sse_rightshift_ps(om->tfv[8*Q-1],              zerov);
  }	  

  u.v = _mm_mul_ps(mpv, tmdv); path[0] = u.p[r];
  u.v = _mm_mul_ps(dpv, tddv); path[1] = u.p[r];
  esl_vec_FNorm(path, 2);
  return state[esl_rnd_FChoose(rng, path, 2)];
}
Beispiel #2
0
/* D(i,k) is reached from M(i, k-1) or D(i,k-1). */
static inline int
select_d(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell D(i,k) */
  int     r     = (k-1) / Q;
  vector float  zerov;
  vector float  mpv, dpv;
  vector float  tmdv, tddv;
  union { vector float v; float p[4]; } u;
  float   path[2];
  int     state[2] = { p7T_M, p7T_D };

  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv  = ox->dpf[i][(q-1)*3 + p7X_M];
    dpv  = ox->dpf[i][(q-1)*3 + p7X_D];
    tmdv = om->tfv[7*(q-1) + p7O_MD];
    tddv = om->tfv[7*Q + (q-1)];
  } else {
    mpv  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_M], 12);
    dpv  = vec_sld(zerov, ox->dpf[i][(Q-1)*3 + p7X_D], 12);
    tmdv = vec_sld(zerov, om->tfv[7*(Q-1) + p7O_MD],   12);
    tddv = vec_sld(zerov, om->tfv[8*Q-1],              12);
  }	  

  u.v = vec_madd(mpv, tmdv, zerov); path[0] = u.p[r];
  u.v = vec_madd(dpv, tddv, zerov); path[1] = u.p[r];
  esl_vec_FNorm(path, 2);
  return state[esl_rnd_FChoose(rng, path, 2)];
}
Beispiel #3
0
/* M(i,k) is reached from B(i-1), M(i-1,k-1), D(i-1,k-1), or I(i-1,k-1). */
static inline int
select_m(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell M(i,k) */
  int     r     = (k-1) / Q;
  __m128 *tp    = om->tfv + 7*q;       	/* *tp now at start of transitions to cur cell M(i,k) */
  __m128  xBv   = _mm_set1_ps(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]);
  __m128  zerov = _mm_setzero_ps();
  __m128  mpv, dpv, ipv;
  union { __m128 v; float p[4]; } u;
  float   path[4];
  int     state[4] = { p7T_B, p7T_M, p7T_I, p7T_D };
  
  if (q > 0) {
    mpv = ox->dpf[i-1][(q-1)*3 + p7X_M];
    dpv = ox->dpf[i-1][(q-1)*3 + p7X_D];
    ipv = ox->dpf[i-1][(q-1)*3 + p7X_I];
  } else {
    mpv = esl_sse_rightshift_ps(ox->dpf[i-1][(Q-1)*3 + p7X_M], zerov);
    dpv = esl_sse_rightshift_ps(ox->dpf[i-1][(Q-1)*3 + p7X_D], zerov);
    ipv = esl_sse_rightshift_ps(ox->dpf[i-1][(Q-1)*3 + p7X_I], zerov);
  }	  
  
  u.v = _mm_mul_ps(xBv, *tp); tp++;  path[0] = u.p[r];
  u.v = _mm_mul_ps(mpv, *tp); tp++;  path[1] = u.p[r];
  u.v = _mm_mul_ps(ipv, *tp); tp++;  path[2] = u.p[r];
  u.v = _mm_mul_ps(dpv, *tp);        path[3] = u.p[r];
  esl_vec_FNorm(path, 4);
  return state[esl_rnd_FChoose(rng, path, 4)];
}
Beispiel #4
0
/* M(i,k) is reached from B(i-1), M(i-1,k-1), D(i-1,k-1), or I(i-1,k-1). */
static inline int
select_m(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q     = (k-1) % Q;		/* (q,r) is position of the current DP cell M(i,k) */
  int     r     = (k-1) / Q;
  vector float *tp = om->tfv + 7*q;    	/* *tp now at start of transitions to cur cell M(i,k) */
  vector float  xBv;
  vector float  zerov;
  vector float  mpv, dpv, ipv;
  union { vector float v; float p[4]; } u;
  float   path[4];
  int     state[4] = { p7T_B, p7T_M, p7T_I, p7T_D };
  
  xBv   = esl_vmx_set_float(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]);
  zerov = (vector float) vec_splat_u32(0);

  if (q > 0) {
    mpv = ox->dpf[i-1][(q-1)*3 + p7X_M];
    dpv = ox->dpf[i-1][(q-1)*3 + p7X_D];
    ipv = ox->dpf[i-1][(q-1)*3 + p7X_I];
  } else {
    mpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_M], 12);
    dpv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_D], 12);
    ipv = vec_sld(zerov, ox->dpf[i-1][(Q-1)*3 + p7X_I], 12);
  }	  
  
  u.v = vec_madd(xBv, *tp, zerov); tp++;  path[0] = u.p[r];
  u.v = vec_madd(mpv, *tp, zerov); tp++;  path[1] = u.p[r];
  u.v = vec_madd(ipv, *tp, zerov); tp++;  path[2] = u.p[r];
  u.v = vec_madd(dpv, *tp, zerov);        path[3] = u.p[r];
  esl_vec_FNorm(path, 4);
  return state[esl_rnd_FChoose(rng, path, 4)];
}
Beispiel #5
0
/* Function:  p7_ParameterEstimation()
 * Incept:    SRE, Sat Mar 24 10:15:37 2007 [Janelia]
 *
 * Purpose:   Given an <hmm> containing collected, weighted counts;
 *            and given a mixture Dirichlet prior <pri>;
 *            calculate mean posterior parameter estimates for 
 *            all model parameters, converting the 
 *            HMM to a parameterized probabilistic model.
 *            
 * Returns:   <eslOK> on success.
 */
int
p7_ParameterEstimation(P7_HMM *hmm, const P7_PRIOR *pri)
{
  int   k;
  double c[p7_MAXABET];
  double p[p7_MAXABET];
  double mix[p7_MAXDCHLET];
  
  /* Match transitions 0,1..M: 0 is the B state
   * TMD at node M is 0.
   */
  for (k = 0; k <= hmm->M; k++) {
    esl_vec_F2D(hmm->t[k], 3, c);
    esl_mixdchlet_MPParameters(c, 3, pri->tm, mix, p);
    esl_vec_D2F(p, 3, hmm->t[k]);
  }
  hmm->t[hmm->M][p7H_MD] = 0.0;
  esl_vec_FNorm(hmm->t[hmm->M], 3);

  /* Insert transitions, 0..M
   */
  for (k = 0; k <= hmm->M; k++) {
    esl_vec_F2D(hmm->t[k]+3, 2, c);
    esl_mixdchlet_MPParameters(c, 2, pri->ti, mix, p);
    esl_vec_D2F(p, 2, hmm->t[k]+3);
  }

  /* Delete transitions, 1..M-1
   * For k=0, which is unused; convention sets TMM=1.0, TMD=0.0
   * For k=M, TMM = 1.0 (to the E state) and TMD=0.0 (no next D; must go to E).
   */
  for (k = 1; k < hmm->M; k++) {
    esl_vec_F2D(hmm->t[k]+5, 2, c);
    esl_mixdchlet_MPParameters(c, 2, pri->td, mix, p);
    esl_vec_D2F(p, 2, hmm->t[k]+5);
  }
  hmm->t[0][p7H_DM] = hmm->t[hmm->M][p7H_DM] = 1.0;
  hmm->t[0][p7H_DD] = hmm->t[hmm->M][p7H_DD] = 0.0;

  /* Match emissions, 1..M
   * Convention sets mat[0] to a valid pvector: first elem 1, the rest 0.
   */
  for (k = 1; k <= hmm->M; k++) {
    esl_vec_F2D(hmm->mat[k], hmm->abc->K, c);
    esl_mixdchlet_MPParameters(c, hmm->abc->K, pri->em, mix, p);
    esl_vec_D2F(p, hmm->abc->K, hmm->mat[k]);
  }
  esl_vec_FSet(hmm->mat[0], hmm->abc->K, 0.);
  hmm->mat[0][0] = 1.0;

  /* Insert emissions 0..M
   */
  for (k = 0; k <= hmm->M; k++) {
    esl_vec_F2D(hmm->ins[k], hmm->abc->K, c);
    esl_mixdchlet_MPParameters(c, hmm->abc->K, pri->ei, mix, p);
    esl_vec_D2F(p, hmm->abc->K, hmm->ins[k]);
  }
  return eslOK;
}
/* Function: CPlan9SWConfig()
 * EPN 05.30.06
 * based on SRE's Plan7SWConfig() from HMMER's plan7.c
 * 
 * Purpose:  Set the alignment independent parameters of
 *           a CM Plan 9 model to hmmsw (Smith/Waterman) configuration.
 *           
 * Notes:    The desideratum for begin/end probs is that all fragments ij
 *           (starting at match i, ending at match j) are
 *           equiprobable -- there is no information in the choice of
 *           entry/exit. There are M(M+1)/2 possible choices of ij, so
 *           each must get a probability of 2/M(M+1). This prob is the
 *           product of a begin, an end, and all the not-end probs in
 *           the path between i,j. 
 *            
 *           Thus: entry/exit is asymmetric because of the left/right
 *           nature of the HMM/profile. Entry probability is distributed
 *           simply by assigning p_x = pentry / (M-1) to M-1 
 *           internal match states. However, the same approach doesn't
 *           lead to a flat distribution over exit points. Exit p's
 *           must be corrected for the probability of a previous exit
 *           from the model. Requiring a flat distribution over exit
 *           points leads to an easily solved piece of algebra, giving:
 *                      p_1 = pexit / (M-1)
 *                      p_x = p_1 / (1 - (x-1) p_1)
 *
 *           Modified EPN, Thu Feb  7 15:54:16 2008, as follows:
 *           To better match a locally configured CM, if <do_match_local_cm>
 *           we disallow insertions before the first (emitting) match state, 
 *           (from I_0), and after the final (emitting) match state,
 *           (from I_M). I_0 maps to ROOT_IL and I_M maps to ROOT_IR
 *           which can never be entered in a locally configured CM
 *           (b/c the ROOT_S state MUST jump into a local begin state, which
 *            are always match states>). Also we disallow a M_0->D_1 transition
 *           because these would be impossible in a locally configured CM.
 *
 *           <do_match_local_cm> is usually TRUE, unless we're configuring
 *           the CP9 specifically for eventual sub CM alignment, where
 *           the goal is simply find the most likely start/end point
 *           of the alignment with this CP9 (in that case we want
 *           I_0 and I_M reachable).
 *           
 * Args:     hmm    - the CM Plan 9 model w/ data-dep prob's valid
 *           pentry - probability of an internal entry somewhere;
 *                    will be evenly distributed over M-1 match states
 *           pexit  - probability of an internal exit somewhere; 
 *                    will be distributed over M-1 match states.
 *           do_match_local_cm - TRUE to make I_0, D_1 and I_M unreachable
 *                    to better match a locally configured CM.
 *           first_cm_ndtype - only used if do_match_local_cm is TRUE
 *                             if it's MATL or MATP then D_1 should be unreachable (it is in the CM)
 *                             if it's MATR or MATP then D_M should be unreachable (it is in the CM)
 *                    
 * Return:   (void)
 *           HMM probabilities are modified.
 */
void
CPlan9SWConfig(CP9_t *hmm, float pentry, float pexit, int do_match_local_cm, int first_cm_ndtype)
{
  float basep;			/* p1 for exits: the base p */
  int   k;			/* counter over states      */
  float d;
  
  /* No special (*x* states in Plan 7) states in CM Plan 9 */

  /*for (k = 1; k <= hmm->M; k++) printf("before anything: end[%d]: %f\n", k, hmm->end[k]);*/
  /* Configure entry.
   */
  if(do_match_local_cm) { 
    hmm->t[0][CTMI] = 0.;
    hmm->t[0][CTMM] = 0.;  /* already was 0.0, transition from M_0 to M_1 is begin[1] */
    hmm->t[0][CTMEL] = 0.; /* already was 0.0, can never do a local end from M_0 */
    if((first_cm_ndtype == MATL_nd) || (first_cm_ndtype == MATP_nd)) { /* CM can't possibly reach the CM delete state that maps to D_1, make D_1 unreachable too */
      hmm->t[0][CTMD] = 0.;
    }

    hmm->t[hmm->M][CTMI] = 0.;
    hmm->t[hmm->M][CTDI] = 0.;
    if((first_cm_ndtype == MATR_nd) || (first_cm_ndtype == MATP_nd)) { /* CM can't possibly reach the CM delete state that maps to D_M, make D_M unreachable too */
      hmm->t[hmm->M][CTMD] = 0.;
    }

    /* renormalize transitions out of M_M */
    d = esl_vec_FSum(hmm->t[hmm->M], cp9_TRANS_NMATCH) + hmm->end[hmm->M]; 
    esl_vec_FScale(hmm->t[hmm->M], cp9_TRANS_NMATCH, 1./d);
    hmm->end[hmm->M] /= d;
    
    /* renormalize transitions out of D_M */
    esl_vec_FNorm(hmm->t[hmm->M] + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE);	/* delete */
  }

  hmm->begin[1] = (1. - pentry) * (1. - (hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL]));
  esl_vec_FSet(hmm->begin+2, hmm->M-1, (pentry * (1.- (hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL]))) / (float)(hmm->M-1));
  /* note: hmm->t[0][CTMEL] == 0. (can't locally end from begin) 
   *       and if do_match_local_cm, hmm->t[0][CTMI] and hmm->t[0][CTMD] were just set to 0. 
   */
  
  /* Configure exit.
   * Don't touch hmm->end[hmm->M]
   */

  basep = pexit / (float) (hmm->M-1);
  for (k = 1; k < hmm->M; k++)
    hmm->end[k] = basep / (1. - basep * (float) (k-1));
  CPlan9RenormalizeExits(hmm, 1);
  /*for (k = 1; k <= hmm->M; k++) printf("after renormalizing: end[%d]: %f\n", k, hmm->end[k]);*/

  hmm->flags       &= ~CPLAN9_HASBITS; /* reconfig invalidates log-odds scores */
  hmm->flags       |= CPLAN9_LOCAL_BEGIN; /* local begins now on */
  hmm->flags       |= CPLAN9_LOCAL_END;   /* local ends now on */

  CP9Logoddsify(hmm);
}
Beispiel #7
0
void
esl_vec_FLogNorm(float *vec, int n)
{
  float denom;
  
  denom = esl_vec_FLogSum(vec, n);
  esl_vec_FIncrement(vec, n, -1.*denom);
  esl_vec_FExp (vec, n);
  esl_vec_FNorm(vec, n);
}
Beispiel #8
0
/* J(i) is reached from E(i) or J(i-1). */
static inline int
select_j(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i)
{
  float path[2];
  int   state[2] = { p7T_J, p7T_E };

  path[0] = ox->xmx[(i-1)*p7X_NXCELLS+p7X_J] * om->xf[p7O_J][p7O_LOOP];
  path[1] = ox->xmx[    i*p7X_NXCELLS+p7X_E] * om->xf[p7O_E][p7O_LOOP] * ox->xmx[i*p7X_NXCELLS+p7X_SCALE];
  esl_vec_FNorm(path, 2);
  return state[esl_rnd_FChoose(rng, path, 2)];
}
Beispiel #9
0
/* B(i) is reached from N(i) or J(i). */
static inline int
select_b(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i)
{
  float path[2];
  int   state[2] = { p7T_N, p7T_J };

  path[0] = ox->xmx[i*p7X_NXCELLS+p7X_N] * om->xf[p7O_N][p7O_MOVE];
  path[1] = ox->xmx[i*p7X_NXCELLS+p7X_J] * om->xf[p7O_J][p7O_MOVE];
  esl_vec_FNorm(path, 2);
  return state[esl_rnd_FChoose(rng, path, 2)];
}
Beispiel #10
0
int
esl_hmm_PosteriorDecoding(const ESL_DSQ *dsq, int L, const ESL_HMM *hmm, ESL_HMX *fwd, ESL_HMX *bck, ESL_HMX *pp)
{
  int i,k;

  for (i = 1; i <= L; i++)
    {
      for (k = 0; k < hmm->M; k++)
	pp->dp[i][k] = fwd->dp[i][k] * bck->dp[i][k];
      esl_vec_FNorm(pp->dp[i], hmm->M);
    }
  return eslOK;
}
Beispiel #11
0
/* I(i,k) is reached from M(i-1, k) or I(i-1,k). */
static inline int
select_i(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q    = (k-1) % Q;		/* (q,r) is position of the current DP cell D(i,k) */
  int     r    = (k-1) / Q;
  __m128  mpv  = ox->dpf[i-1][q*3 + p7X_M];
  __m128  ipv  = ox->dpf[i-1][q*3 + p7X_I];
  __m128 *tp   = om->tfv + 7*q + p7O_MI;
  union { __m128 v; float p[4]; } u;
  float   path[2];
  int     state[2] = { p7T_M, p7T_I };

  u.v = _mm_mul_ps(mpv, *tp); tp++;  path[0] = u.p[r];
  u.v = _mm_mul_ps(ipv, *tp);        path[1] = u.p[r];
  esl_vec_FNorm(path, 2);
  return state[esl_rnd_FChoose(rng, path, 2)];
}
Beispiel #12
0
/* I(i,k) is reached from M(i-1, k) or I(i-1,k). */
static inline int
select_i(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  int     Q     = p7O_NQF(ox->M);
  int     q    = (k-1) % Q;		/* (q,r) is position of the current DP cell D(i,k) */
  int     r    = (k-1) / Q;
  vector float  zerov;
  vector float  mpv  = ox->dpf[i-1][q*3 + p7X_M];
  vector float  ipv  = ox->dpf[i-1][q*3 + p7X_I];
  vector float *tp   = om->tfv + 7*q + p7O_MI;
  union { vector float v; float p[4]; } u;
  float   path[2];
  int     state[2] = { p7T_M, p7T_I };

  zerov = (vector float) vec_splat_u32(0);

  u.v = vec_madd(mpv, *tp, zerov); tp++;  path[0] = u.p[r];
  u.v = vec_madd(ipv, *tp, zerov);        path[1] = u.p[r];
  esl_vec_FNorm(path, 2);
  return state[esl_rnd_FChoose(rng, path, 2)];
}
Beispiel #13
0
/* Function: CPlan9Renormalize()
 *
 * Purpose:  Take an HMM in counts form, and renormalize
 *           all of its probability vectors. Also enforces
 *           CM Plan9 restrictions on nonexistent transitions.
 *
 * Args:     hmm - the model to renormalize.
 *
 * Return:   (void)
 *           hmm is changed.
 */
void
CPlan9Renormalize(CP9_t *hmm)
{
    int   k;			/* counter for model position */
    float d;			/* denominator */

    /* match emissions */
    esl_vec_FSet(hmm->mat[0], hmm->abc->K, 0.);   /*M_0 is B state, non-emitter*/
    for (k = 1; k <= hmm->M; k++)
        esl_vec_FNorm(hmm->mat[k], hmm->abc->K);
    /* insert emissions */
    for (k = 0; k <= hmm->M; k++)
        esl_vec_FNorm(hmm->ins[k], hmm->abc->K);

    /* begin transitions */
    d = esl_vec_FSum(hmm->begin+1, hmm->M) + hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL];
    /* hmm->t[0][CTMEL] should always be 0., can't local end from the M_0 == B state */
    esl_vec_FScale(hmm->begin+1, hmm->M, 1./d);
    hmm->t[0][CTMI] /= d;
    hmm->t[0][CTMD] /= d;
    hmm->t[0][CTMEL] /= d;

    esl_vec_FNorm(hmm->t[0] + cp9_TRANS_INSERT_OFFSET, cp9_TRANS_NINSERT);	        /* transitions out of insert for node 0 (state N)*/
    esl_vec_FSet (hmm->t[0] + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE, 0.);
    /* main model transitions */
    for (k = 1; k <= hmm->M; k++) /* safe for node M too, hmm->t[hmm->M][CTMM] should be 0.*/
    {
        d = esl_vec_FSum(hmm->t[k], cp9_TRANS_NMATCH) + hmm->end[k];
        esl_vec_FScale(hmm->t[k], cp9_TRANS_NMATCH, 1./d);
        hmm->end[k] /= d;

        esl_vec_FNorm(hmm->t[k] + cp9_TRANS_INSERT_OFFSET, cp9_TRANS_NINSERT);	/* insert */
        esl_vec_FNorm(hmm->t[k] + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE);	/* delete */
    }
    /* null model emissions */
    esl_vec_FNorm(hmm->null, hmm->abc->K);

    hmm->flags &= ~CPLAN9_HASBITS;	/* clear the log-odds ready flag */
    hmm->flags |= CPLAN9_HASPROB;	/* set the probabilities OK flag */
}
/* Function: p7_null3_score()
 *
 * Purpose:  Calculate a correction (in log_2 odds) to be applied
 *           to a sequence, using a null model based on the
 *           composition of the target sequence.
 *           The null model is constructed /post hoc/ as the
 *           distribution of the target sequence; if the target
 *           sequence is 40% A, 5% C, 5% G, 40% T, then the null
 *           model is (0.4, 0.05, 0.05, 0.4). This function is
 *           based heavily on Infernal's ScoreCorrectionNull3(),
 *           with two important changes:
 *            - it leaves the log2 conversion from NATS to BITS
 *              for the calling function.
 *            - it doesn't include the omega score modifier
 *              (based on prior probability of using the null3
 *              model), again leaving this to the calling function.
 *
 * Args:     abc   - alphabet for hit (only used to get alphabet size)
 *           dsq   - the sequence the hit resides in
 *           tr   - trace of the alignment, used to find the match states
 *                  (non-match chars are ignored in computing freq, not used if NULL)
 *           start - start position of hit in dsq
 *           stop  - end  position of hit in dsq
 *           bg    - background, used for the default null model's emission freq
 *           ret_sc - RETURN: the correction to the score (in NATS);
 *                   caller subtracts this from hit score to get
 *                   corrected score.
 * Return:   void, ret_sc: the log-odds score correction (in NATS).
 */
void
p7_null3_score(const ESL_ALPHABET *abc, const ESL_DSQ *dsq, P7_TRACE *tr, int start, int stop, P7_BG *bg, float *ret_sc)
{
    float score = 0.;
    int status;
    int i;
    float *freq;
    int dir;
    int tr_pos;

    ESL_ALLOC(freq, sizeof(float) * abc->K);
    esl_vec_FSet(freq, abc->K, 0.0);

    /* contract check */
    if(abc == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() alphabet is NULL.%s\n", "");
    if(dsq == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() dsq alphabet is NULL.%s\n", "");
    if(abc->type != eslRNA && abc->type != eslDNA) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() expects alphabet of RNA or DNA.%s\n", "");

    dir = start < stop ? 1 : -1;

    if (tr != NULL) {
        /* skip the parts of the trace that precede the first match state */
        tr_pos = 2;
        i = start;
        while (tr->st[tr_pos] != p7T_M) {
            if (tr->st[tr_pos] == p7T_N)
                i += dir;
            tr_pos++;
        }

        /* tally frequencies from characters hitting match state*/
        while (tr->st[tr_pos] != p7T_E) {
            if (tr->st[tr_pos] == p7T_M) {
                if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", "");
                esl_abc_FCount(abc, freq, dsq[i], 1.);
            }
            if (tr->st[tr_pos] != p7T_D )
                i += dir;
            tr_pos++;
        }
    } else {
        /* tally frequencies from the full envelope */
        for (i=ESL_MIN(start,stop); i <= ESL_MAX(start,stop); i++)
        {
            if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", "");
            esl_abc_FCount(abc, freq, dsq[i], 1.);
        }
    }

    esl_vec_FNorm(freq, abc->K);


    /* now compute score modifier (nats) - note: even with tr!=NULL, this includes the unmatched characters*/
    for (i = 0; i < abc->K; i++)
        score += freq[i]==0 ? 0.0 : esl_logf( freq[i]/bg->f[i] ) * freq[i] * ( (stop-start)*dir +1) ;

    /* Return the correction to the bit score. */
    score = p7_FLogsum(0., score);
    *ret_sc = score;

    return;

ERROR:
    esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() memory allocation error.%s\n", "");
    return; /* never reached */

}
Beispiel #15
0
static void
utest_pvectors(void)
{
  char  *msg   = "pvector unit test failed";
  double p1[4] = { 0.25, 0.25, 0.25, 0.25 };
  double p2[4];
  double p3[4];
  float  p1f[4]; 
  float  p2f[4] = { 0.0,   0.5, 0.5,  0.0  };
  float  p3f[4];
  int    n = 4;
  double result;

  esl_vec_D2F(p1,  n, p1f);
  esl_vec_F2D(p2f, n, p2);  

  if (esl_vec_DValidate(p1,  n, 1e-12, NULL) != eslOK) esl_fatal(msg);
  if (esl_vec_FValidate(p1f, n, 1e-7,  NULL) != eslOK) esl_fatal(msg);

  result = esl_vec_DEntropy(p1,  n);          if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg);
  result = esl_vec_FEntropy(p1f, n);          if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg);
  result = esl_vec_DEntropy(p2,  n);          if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg);
  result = esl_vec_FEntropy(p2f, n);          if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg);

  result = esl_vec_DRelEntropy(p2,  p1,  n);  if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg);
  result = esl_vec_FRelEntropy(p2f, p1f, n);  if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg);

  result = esl_vec_DRelEntropy(p1,  p2,  n);  if (result != eslINFINITY)  esl_fatal(msg);
  result = esl_vec_FRelEntropy(p1f, p2f, n);  if (result != eslINFINITY)  esl_fatal(msg);

  esl_vec_DLog(p2, n);
  if (esl_vec_DLogValidate(p2, n, 1e-12, NULL) != eslOK) esl_fatal(msg);
  esl_vec_DExp(p2, n);
  if (p2[0] != 0.) esl_fatal(msg);

  esl_vec_FLog(p2f, n);
  if (esl_vec_FLogValidate(p2f, n, 1e-7, NULL) != eslOK) esl_fatal(msg);
  esl_vec_FExp(p2f, n);
  if (p2f[0] != 0.) esl_fatal(msg);

  esl_vec_DCopy(p2, n, p3);
  esl_vec_DScale(p3, n, 10.);
  esl_vec_DNorm(p3, n);
  if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg);

  esl_vec_DLog(p3, n);
  result = esl_vec_DLogSum(p3, n); if (esl_DCompare(0.0, result, 1e-12) != eslOK) esl_fatal(msg);
  esl_vec_DIncrement(p3, n, 2.0);
  esl_vec_DLogNorm(p3, n);
  if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg);

  esl_vec_FCopy(p2f, n, p3f);
  esl_vec_FScale(p3f, n, 10.);
  esl_vec_FNorm(p3f, n);
  if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg);

  esl_vec_FLog(p3f, n);
  result = esl_vec_FLogSum(p3f, n); if (esl_DCompare(0.0, result, 1e-7) != eslOK) esl_fatal(msg);
  esl_vec_FIncrement(p3f, n, 2.0);
  esl_vec_FLogNorm(p3f, n);
  if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg);

  return;
}
Beispiel #16
0
/* Function:  p7_bg_Read()
 * Synopsis:  Read background frequencies from a file.
 *
 * Purpose:   Read new background frequencies from file <bgfile>,
 *            overwriting the frequencies previously in the 
 *            <P7_BG> object <bg>.
 *            
 *            Note that <bg> is already created by the caller, not
 *            created here. Also note that <p7_bg_Read()> only reads
 *            residue background frequencies used for the "null
 *            model", whereas a <P7_BG> object contains additional
 *            information for the bias filter and for the biased
 *            composition correction.
 *            
 * Args:      bgfile  - file to read.
 *            bg      - existing <P7_BG> object provided by the caller.
 *            errbuf  - OPTIONAL: space for an error message, upon parse errors; or NULL.
 *
 * Returns:   <eslOK> on success, and background frequencies in <bg>
 *            are overwritten.
 * 
 *            <eslENOTFOUND> if <bgfile> can't be opened for reading.
 *            <eslEFORMAT> if parsing of <bgfile> fails for some
 *            reason.  In both cases, <errbuf> contains a
 *            user-directed error message upon return, including (if
 *            relevant) the file name <bgfile> and the line number on
 *            which an error was detected. <bg> is unmodified.
 *
 * Throws:    <eslEMEM> on allocation failure; <bg> is unmodified,
 *            and <errbuf> is empty.
 */
int
p7_bg_Read(char *bgfile, P7_BG *bg, char *errbuf)
{
  ESL_FILEPARSER *efp   = NULL;
  float          *fq    = NULL;
  int             n     = 0;
  char           *tok;
  int             toklen;
  int             alphatype;
  ESL_DSQ         x;
  int             status;

  if (errbuf) errbuf[0] = '\0';

  status =  esl_fileparser_Open(bgfile, NULL, &efp);
  if      (status == eslENOTFOUND) ESL_XFAIL(eslENOTFOUND, errbuf, "couldn't open bg file  %s for reading", bgfile);
  else if (status != eslOK)        goto ERROR;

  esl_fileparser_SetCommentChar(efp, '#');

  /* First token is alphabet type: amino | DNA | RNA */
  status = esl_fileparser_GetToken(efp, &tok, &toklen);
  if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
  else if (status != eslOK)  goto ERROR;

  alphatype = esl_abc_EncodeType(tok);
  if      (alphatype == eslUNKNOWN)    ESL_XFAIL(eslEFORMAT, errbuf, "expected alphabet type but saw \"%s\" [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
  else if (alphatype != bg->abc->type) ESL_XFAIL(eslEFORMAT, errbuf, "bg file's alphabet is %s; expected %s [line %d, %s]", tok, esl_abc_DecodeType(bg->abc->type), efp->linenumber, bgfile);
  
  ESL_ALLOC(fq, sizeof(float) * bg->abc->K);
  esl_vec_FSet(fq, bg->abc->K, -1.0);

  while ((status = esl_fileparser_NextLine(efp)) == eslOK)
    {
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;

      if      (toklen != 1 ||   ! esl_abc_CIsCanonical(bg->abc, *tok))
	ESL_XFAIL(eslEFORMAT, errbuf, "expected to parse a residue letter; saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      x = esl_abc_DigitizeSymbol(bg->abc, *tok);
      if (fq[x] != -1.0)         ESL_XFAIL(eslEFORMAT, errbuf, "already parsed probability of %c [line %d of bgfile %s]", bg->abc->sym[x], efp->linenumber, bgfile);
      n++;

      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file, expected a probability [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;
      if (! esl_str_IsReal(tok)) ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      fq[x] = atof(tok);

      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslOK)  ESL_XFAIL(eslEFORMAT, errbuf, "extra unexpected data found [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslEOL) goto ERROR;
    }
  if (status != eslEOF) goto ERROR;

  if ( n != bg->abc->K) 
    ESL_XFAIL(eslEFORMAT, errbuf, "expected %d residue frequencies, but found %d in bgfile %s", bg->abc->K, n, bgfile);
  if ( esl_FCompare(esl_vec_FSum(fq, bg->abc->K), 1.0, 0.001) != eslOK) 
    ESL_XFAIL(eslEFORMAT, errbuf, "residue frequencies do not sum to 1.0 in bgfile %s", bgfile);
  
  /* all checking complete. no more error cases. overwrite bg with the new frequencies */
  esl_vec_FNorm(fq, bg->abc->K);
  esl_vec_FCopy(fq, bg->abc->K, bg->f);

  free(fq);
  esl_fileparser_Close(efp);
  return eslOK;

 ERROR:
  if (fq)  free(fq);
  if (efp) esl_fileparser_Close(efp);
  return status;
}
Beispiel #17
0
int 
main(int argc, char **argv)
{
  ESL_GETOPTS  *go;
  char         *msafile;
  ESLX_MSAFILE *afp;
  ESL_MSA      *msa;
  float        *sqd;
  int          status;
  int          nbad;
  int          nali    = 0;
  int          nbadali = 0;
  int          nwgt    = 0;
  int          nbadwgt = 0;
  int i;
  int be_quiet;
  int do_gsc;
  int do_pb;
  int do_blosum;
  double maxid;
  double tol;
  int    maxN;

  /* Process command line
   */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") == TRUE) {
    puts(usage); 
    puts("\n  where options are:");
    esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */
    return 0;
  }
  be_quiet  = esl_opt_GetBoolean(go, "-q");
  do_blosum = esl_opt_GetBoolean(go, "--blosum");
  do_gsc    = esl_opt_GetBoolean(go, "--gsc");
  do_pb     = esl_opt_GetBoolean(go, "--pb");
  maxid     = esl_opt_GetReal   (go, "--id");
  tol       = esl_opt_GetReal   (go, "--tol");
  maxN      = esl_opt_GetInteger(go, "--maxN");
  if (esl_opt_ArgNumber(go) != 1) {
    puts("Incorrect number of command line arguments.");
    puts(usage);
    return 1;
  }
  msafile = esl_opt_GetArg(go, 1);
  esl_getopts_Destroy(go);

  /* Weight one or more alignments from input file
   */
  if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK)
    eslx_msafile_OpenFailure(afp, status);

  while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF)
    {
      if (status != eslOK) eslx_msafile_ReadFailure(afp, status);

      if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; }

      nali++;
      nwgt += msa->nseq;
      ESL_ALLOC(sqd, sizeof(float) * msa->nseq);

      if (do_gsc) {
	esl_msaweight_GSC(msa);
	GSCWeights(msa->aseq, msa->nseq, msa->alen, sqd);
      } else if (do_pb) {
	esl_msaweight_PB(msa);
	PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, sqd);
      } else if (do_blosum) {
	esl_msaweight_BLOSUM(msa, maxid);
	BlosumWeights(msa->aseq, msa->nseq, msa->alen, maxid, sqd);
	/* workaround SQUID bug: BLOSUM weights weren't renormalized to sum to nseq. */
	esl_vec_FNorm (sqd, msa->nseq);
	esl_vec_FScale(sqd, msa->nseq, (float) msa->nseq);	
      }

      if (! be_quiet) {
	for (i = 0; i < msa->nseq; i++)
	  fprintf(stdout, "%-20s  %.3f  %.3f\n",
		  msa->sqname[i], msa->wgt[i], sqd[i]);
      }
	
      nbad = 0;
      for (i = 0; i < msa->nseq; i++)
	if (esl_DCompare((double) sqd[i], msa->wgt[i], tol) != eslOK) 
	  nbad++;
      if (nbad > 0) nbadali++;
      nbadwgt += nbad;

      if (nbad > 0) printf("%-20s  :: alignment shows %d weights that differ (out of %d) \n", 
			   msa->name, nbad, msa->nseq);

      esl_msa_Destroy(msa);
      free(sqd);
    } 
  eslx_msafile_Close(afp);

  if (nbadali == 0) 
    printf("OK: all weights identical between squid and Easel in %d alignment(s)\n", nali);
  else {
    printf("%d of %d weights mismatched at (> %f fractional difference)\n",
	   nbadwgt, nwgt, tol);
    printf("involving %d of %d total alignments\n", nbadali, nali);
  }
  return eslOK;

 ERROR:
  return status;
}
/**
 * int main(int argc, char **argv)
 * Main driver
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS     *go	   = NULL;      /* command line processing                   */
  ESL_ALPHABET    *abc     = NULL;
  char            *hmmfile = NULL;
  char            *outhmmfile = NULL;
  P7_HMMFILE      *hfp     = NULL;
  FILE         *outhmmfp;          /* HMM output file handle                  */
  P7_HMM          *hmm     = NULL;
  P7_BG           *bg      = NULL;
  int              nhmm;	
  double           x;
  float            KL;
  int              status;
  char             errbuf[eslERRBUFSIZE];

  float average_internal_transitions[ p7H_NTRANSITIONS ];
  int k;

  char        errmsg[eslERRBUFSIZE];

  /* Process the command line options.
   */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || 
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  if (esl_opt_GetBoolean(go, "-h") == TRUE) 
    {
      profillic_p7_banner(stdout, argv[0], banner);
      esl_usage(stdout, argv[0], usage);
      puts("\nOptions:");
      esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/
      exit(0);
    }
  if (esl_opt_ArgNumber(go) != 2) 
    {
      puts("Incorrect number of command line arguments.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) 
    {
      puts("Failed to read <input hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((outhmmfile = esl_opt_GetArg(go, 2)) == NULL) 
    {
      puts("Failed to read <output hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  profillic_p7_banner(stdout, argv[0], banner);
  
  /* Initializations: open the input HMM file for reading
   */
  status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);  

  /* Initializations: open the output HMM file for writing
   */
  if ((outhmmfp = fopen(outhmmfile, "w")) == NULL) ESL_FAIL(status, errmsg, "Failed to open HMM file %s for writing", outhmmfile);

  /* Main body: read HMMs one at a time, print one line of stats
   */
  printf("#\n");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "idx",  "name",                 "accession",    "nseq",     "eff_nseq", "M",      "relent", "info",   "p relE", "compKL");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------");

  nhmm = 0;
  while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) 
    {
      if      (status == eslEOD)       esl_fatal("read failed, HMM file %s may be truncated?", hmmfile);
      else if (status == eslEFORMAT)   esl_fatal("bad file format in HMM file %s",             hmmfile);
      else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets",   hmmfile);
      else if (status != eslOK)        esl_fatal("Unexpected error in reading HMMs from %s",   hmmfile);
      nhmm++;

      if (bg == NULL) bg = p7_bg_Create(abc);

      esl_vec_FSet(average_internal_transitions, p7H_NTRANSITIONS, 0.);
      for( k = 1; k < hmm->M; k++ ) {
        esl_vec_FAdd(average_internal_transitions, hmm->t[k], p7H_NTRANSITIONS);
      }
      // Match transitions
      esl_vec_FNorm(average_internal_transitions, 3);
      // Insert transitions
      esl_vec_FNorm(average_internal_transitions + 3, 2);
      // Delete transitions
      esl_vec_FNorm(average_internal_transitions + 5, 2);
      // Ok now set them.
      for( k = 1; k < hmm->M; k++ ) {
        esl_vec_FCopy( average_internal_transitions, p7H_NTRANSITIONS, hmm->t[k] );
      }

      if ((status = p7_hmm_Validate(hmm, errmsg, 0.0001))       != eslOK) return status;
      if ((status = p7_hmmfile_WriteASCII(outhmmfp, -1, hmm)) != eslOK) ESL_FAIL(status, errmsg, "HMM save failed");
  
      p7_MeanPositionRelativeEntropy(hmm, bg, &x); 
      p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL);

      printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f\n",
	     nhmm,
	     hmm->name,
	     hmm->acc == NULL ? "-" : hmm->acc,
	     hmm->nseq,
	     hmm->eff_nseq,
	     hmm->M,
	     p7_MeanMatchRelativeEntropy(hmm, bg),
	     p7_MeanMatchInfo(hmm, bg),
	     x,
	     KL);

	     /*	     p7_MeanForwardScore(hmm, bg)); */

      p7_hmm_Destroy(hmm);
    }

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  p7_hmmfile_Close(hfp);
  if (outhmmfp != NULL) fclose(outhmmfp);
 esl_getopts_Destroy(go);
  exit(0);
}