/* Function: P7PriorifyEmissionVector()
 * 
 * Purpose:  Combine an observed emission count vector with the
 *           pseudocounts of a (mixture) prior and renormalize the
 *           result, so that vec comes back as probabilities.
 *
 *           The posterior mixture probabilities P(q | counts) that
 *           were used can be handed back through ret_mix; pass
 *           NULL if they are not wanted.
 * 
 * Args:     vec     - count vector to modify; Alphabet_size entries are used
 *           pri     - prior data structure (only pri->strategy is read here)
 *           num     - pri->mnum or pri->inum; # of mixture components
 *           eq      - pri->mq or pri->iq; prior mixture probabilities
 *           e       - pri->i or pri->m; Dirichlet components          
 *           ret_mix - if non-NULL, receives num posterior mixture
 *                     probabilities
 *                   
 * Return:   (void)
 *           vec is overwritten with normalized probabilities.
 */                  
void
P7PriorifyEmissionVector(float *vec, struct p7prior_s *pri, 
		       int num, float eq[MAXDCHLET], float e[MAXDCHLET][MAXABET],
		       float *ret_mix)
{
  float post[MAXDCHLET];        /* P(q | counts) for each component     */
  float nobs;                   /* total observed counts in vec         */
  float nalpha;                 /* summed alpha terms of one component  */
  float term;                   /* running X_i term, Sjolander eq. 41   */
  int   sym;                    /* index over symbols in vec            */
  int   comp;                   /* index over mixture components        */

  /* Posterior over mixture components. With a single component
   * (or an unrecognized strategy) only post[0] = 1 is set.
   */
  post[0] = 1.0;
  if (pri->strategy == PRI_DCHLET && num > 1) 
    {
      /* Work in log space: log prior weight plus log likelihood of
       * the counts under each component, then normalize.
       */
      for (comp = 0; comp < num; comp++) 
        {
          if (eq[comp] > 0.0) post[comp] = log(eq[comp]);
          else                post[comp] = -999.;
          post[comp] += Logp_cvec(vec, Alphabet_size, e[comp]);
        }
      LogNorm(post, num);
    }
  else if (pri->strategy == PRI_PAM && num > 1) 
    {
      /* PAM prior: the normalized observed counts themselves
       * play the role of `P(q|n)'.
       */
      for (sym = 0; sym < Alphabet_size; sym++) 
        post[sym] = vec[sym];
      FNorm(post, Alphabet_size);
    }

  /* Counts -> probabilities, following Sjolander (1996): a
   * posterior-weighted average of the per-component mean estimates.
   */
  nobs = FSum(vec, Alphabet_size);
  for (sym = 0; sym < Alphabet_size; sym++)
    {
      term = 0.0;
      for (comp = 0; comp < num; comp++)
        {
          nalpha = FSum(e[comp], Alphabet_size);
          term  += post[comp] * (vec[sym] + e[comp][sym]) / (nobs + nalpha);
        }
      vec[sym] = term;
    }
  FNorm(vec, Alphabet_size);

  /* Optionally report the mixture posterior to the caller. */
  if (ret_mix != NULL)
    {
      for (comp = 0; comp < num; comp++)
        ret_mix[comp] = post[comp];
    }
}
/* Function: P7PriorifyTransitionVector()
 * 
 * Purpose:  Add prior pseudocounts to a transition count vector and
 *           renormalize. The vector packs three separate
 *           distributions: t[0..2] leaving match, t[3..4] leaving
 *           insert, and t[5..6] leaving delete; each is normalized
 *           on its own.
 *           
 * Args:     t     - state transitions, counts: 3 for M, 2 for I, 2 for D.   
 *           prior - Dirichlet prior information; prior->tnum components
 *                   with pseudocounts prior->t[q][0..6]
 *           tq    - prior distribution over the transition Dirichlet
 *                   components. Supplied by the caller rather than read
 *                   from the prior, so that alternative methods of
 *                   conditioning the prior on structural data can be used.
 *                   (The original note said this overrides prior->iq[];
 *                   presumably prior->tq[] was meant -- confirm.)
 *           
 * Return:   (void)
 *           t is changed, and renormalized -- comes back as
 *           probability vectors.
 */          
void
P7PriorifyTransitionVector(float *t, struct p7prior_s *prior, 
			   float tq[MAXDCHLET])
{
  int   ts;                     /* counter over the 7 transitions        */
  int   q;                      /* counter over mixture components       */
  float mix[MAXDCHLET];         /* posterior P(q | counts)               */
  float totm, totd, toti;       /* total counts in three transition vecs */
  float xi;                     /* Sjolander's X_i term */

  mix[0] = 1.0;			/* default is simple one component */

  /* The posterior over components is needed whenever the *transition*
   * prior is a mixture. The test must be on tnum, the same bound used
   * by the loops below: testing the match emission count (mnum) would
   * leave mix[1..tnum-1] uninitialized when tnum > 1 but mnum == 1.
   */
  if ((prior->strategy == PRI_DCHLET || prior->strategy == PRI_PAM) && prior->tnum > 1)
    {
      for (q = 0; q < prior->tnum; q++)
        {
          mix[q] =  tq[q] > 0.0 ? log(tq[q]) : -999.;
          mix[q] += Logp_cvec(t,   3, prior->t[q]);   /* 3 match  */
          mix[q] += Logp_cvec(t+3, 2, prior->t[q]+3); /* 2 insert */
          mix[q] += Logp_cvec(t+5, 2, prior->t[q]+5); /* 2 delete */
        }
      LogNorm(mix, prior->tnum); /* mix[q] is now P(q | counts) */
    }

				/* precalc some denominators */
  totm = FSum(t,3);		
  toti = t[TIM] + t[TII];
  totd = t[TDM] + t[TDD];

  /* Each transition becomes the posterior-weighted average of the
   * per-component mean estimates; the denominator depends on which
   * of the three distributions the transition belongs to.
   */
  for (ts = 0; ts < 7; ts++)  
    {
      xi = 0.0;
      for (q = 0; q < prior->tnum; q++)
        {
          switch (ts) {
          case TMM: case TMI: case TMD: 
            xi += mix[q] * (t[ts] + prior->t[q][ts]) / 
              (totm + FSum(prior->t[q], 3)); 
            break;
          case TIM: case TII: 
            xi += mix[q] * (t[ts] + prior->t[q][ts]) / 
              (toti + prior->t[q][TIM] + prior->t[q][TII]);
            break;
          case TDM: case TDD: 
            xi += mix[q] * (t[ts] + prior->t[q][ts]) / 
              (totd + prior->t[q][TDM] + prior->t[q][TDD]);
            break;
          default:
            break;
          }
        }
      t[ts] = xi;
    }
  FNorm(t,   3);		/* match  */
  FNorm(t+3, 2);		/* insert */
  FNorm(t+5, 2);		/* delete */
}
Ejemplo n.º 3
0
/* Function: StrMarkov0()
 * Date:     SRE, Fri Oct 29 11:08:31 1999 [St. Louis]
 *
 * Purpose:  Returns a random string s1 with the same
 *           length and zero-th order Markov properties
 *           (i.e. the same single-letter composition, case
 *           folded to upper case) as s2. 
 *           
 *           s1 and s2 may be identical, to randomize s2
 *           in place.
 *
 * Args:     s1 - allocated space for random string; needs room
 *                for strlen(s2)+1 chars. Receives upper case letters.
 *           s2 - string to base s1's properties on.
 *
 * Returns:  1 on success; 0 if s2 doesn't look alphabetical
 *           (contains anything that does not fold to 'A'..'Z').
 *           On a 0 return s1 has not been written.
 */
int 
StrMarkov0(char *s1, char *s2)
{
  int   len;
  int   pos; 
  int   c;			/* one symbol of s2, upper-cased */
  float p[26];			/* symbol probabilities */

  /* First, verify that the string is entirely alphabetic.
   * The <ctype.h> functions require an unsigned char value, so
   * cast before calling them. Also insist the folded letter is
   * in 'A'..'Z': a locale-specific letter would otherwise index
   * outside p[] below.
   */
  len = strlen(s2);
  for (pos = 0; pos < len; pos++)
    {
      if (! isalpha((unsigned char) s2[pos])) return 0;
      c = toupper((unsigned char) s2[pos]);
      if (c < 'A' || c > 'Z') return 0;
    }

  /* Collect zeroth order counts and convert to frequencies.
   */
  FSet(p, 26, 0.);
  for (pos = 0; pos < len; pos++)
    {
      c = toupper((unsigned char) s2[pos]);
      p[c - 'A'] += 1.0;
    }
  FNorm(p, 26);

  /* Generate a random string using those p's.
   * s2 is not read again, so writing s1 is safe even if s1 == s2.
   */
  for (pos = 0; pos < len; pos++)
    s1[pos] = FChoose(p, 26) + 'A';
  s1[pos] = '\0';

  return 1;
}
Ejemplo n.º 4
0
/* Function: Renormalize()
 * 
 * Purpose:  Pass every probability vector of the model through
 *           FNorm(): the 3-long transition vectors of the match,
 *           insert and delete states, and the Alphabet_size-long
 *           emission vectors of the match and insert states, for
 *           positions 0..M. Position 0 has its delete transitions
 *           and match emissions skipped.
 *
 *           What happens to degenerate vectors (all zero, negative
 *           entries) is whatever FNorm() does with them.
 * 
 * Args:     hmm - model to renormalize; changed in place.
 *
 * Return:   (void)
 */
void
Renormalize(struct hmm_struc *hmm)
{
  int pos;			/* model position, 0..M */

  pos = 0;
  while (pos <= hmm->M)
    {
				/* state transition frequencies */
      FNorm(hmm->mat[pos].t, 3);
      FNorm(hmm->ins[pos].t, 3);

				/* these two are skipped at position 0 */
      if (pos > 0)
	{
	  FNorm(hmm->del[pos].t, 3);
	  FNorm(hmm->mat[pos].p, Alphabet_size);
	}

				/* insert emissions, every position */
      FNorm(hmm->ins[pos].p, Alphabet_size);

      pos++;
    }
}
/* Function: PAMPrior()
 * 
 * Purpose:  Build an ad hoc "Dirichlet mixture" prior for match
 *           emissions out of a PAM scoring matrix, and install it
 *           in an existing prior structure.
 *
 *           The matrix file is looked for as given, then via the
 *           BLASTMAT environment variable, then in the "aa"
 *           subdirectory via BLASTMAT.
 *           
 *           Side effects: the match emission section of pri is
 *           overwritten (pri->m, pri->mq, pri->mnum) and
 *           pri->strategy becomes PRI_PAM. The incoming prior must
 *           be a Dirichlet prior whose insert emissions are a
 *           single component; the alphabet must be amino acid.
 *           Violations of these requirements, and failures to open
 *           or parse the file, are reported through Die().
 *
 * Args:     pamfile - name of the PAM scoring matrix file
 *           pri     - prior to modify
 *           wt      - weight; each row of pseudocounts sums to wt
 *
 * Return:   (void)
 */
void
PAMPrior(char *pamfile, struct p7prior_s *pri, float wt)
{
  FILE  *pamfp;                   /* open matrix file                     */
  char  *altpath;                 /* "aa"/pamfile, as BLAST lays it out   */
  int  **scores;                  /* parsed matrix, indexed by letter-'A' */
  float  lambda;                  /* scale factor reported by the parser  */
  int    row, col;                /* residue indices in Alphabet order    */
  int    rowidx, colidx;          /* matching indices into scores[][]     */

  /* Preconditions.
   */
  if (Alphabet_type != hmmAMINO)
    Die("PAM prior is only valid for protein sequences");
  if (pri->strategy != PRI_DCHLET)
    Die("PAM prior may only be applied over an existing Dirichlet prior");
  if (pri->inum != 1)
    Die("PAM prior requires that the insert emissions be a single Dirichlet");
  if (MAXDCHLET < 20)
    Die("Whoa, code is misconfigured; MAXDCHLET must be >= 20 for PAM prior");

  /* Locate, read and release the matrix file.
   */
  altpath = FileConcat("aa", pamfile);

  pamfp = fopen(pamfile, "r");
  if (pamfp == NULL)
    pamfp = EnvFileOpen(pamfile, "BLASTMAT", NULL);
  if (pamfp == NULL)
    pamfp = EnvFileOpen(altpath, "BLASTMAT", NULL);
  if (pamfp == NULL)
    Die("Failed to open PAM scoring matrix file %s", pamfile);

  if (! ParsePAMFile(pamfp, &scores, &lambda))
    Die("Failed to parse PAM scoring matrix file %s", pamfile);
  fclose(pamfp);
  free(altpath);

  pri->strategy = PRI_PAM;
  pri->mnum     = 20;
  
  /* One pseudocount row per residue: convert the PAM scores back
   * to conditional probabilities P(col | row), then rescale the row
   * so it sums to wt. Every row gets the same mixture weight.
   */
  for (row = 0; row < Alphabet_size; row++)
    {
      rowidx = Alphabet[row] - 'A';
      for (col = 0; col < Alphabet_size; col++)
        {
          colidx = Alphabet[col] - 'A';
          pri->m[row][col] = aafq[col] * exp((float) scores[rowidx][colidx] * lambda);
        }

      pri->mq[row] = 1. / Alphabet_size;
      FNorm(pri->m[row], Alphabet_size);
      FScale(pri->m[row], Alphabet_size, wt);
    }

  Free2DArray((void **)scores,27);
}
Ejemplo n.º 6
0
/* Function: StrMarkov1()
 * Date:     SRE, Fri Oct 29 11:22:20 1999 [St. Louis]
 *
 * Purpose:  Returns a random string s1 with the same
 *           length and first order Markov properties
 *           (letter-to-next-letter transition frequencies, case
 *           folded to upper case) as s2. The first symbol of s1
 *           is the (upper-cased) first symbol of s2.
 *           
 *           s1 and s2 may be identical, to randomize s2
 *           in place.
 *
 * Args:     s1 - allocated space for random string; needs room
 *                for strlen(s2)+1 chars. Receives upper case letters.
 *           s2 - string to base s1's properties on.
 *
 * Returns:  1 on success; 0 if s2 doesn't look alphabetical
 *           (contains anything that does not fold to 'A'..'Z').
 *           On a 0 return s1 has not been written.
 *           An empty s2 succeeds and yields an empty s1.
 */
int 
StrMarkov1(char *s1, char *s2)
{
  int   len;
  int   pos; 
  int   x,y;
  int   i;			/* initial symbol */
  float p[26][26];		/* symbol probabilities */

  /* First, verify that the string is entirely alphabetic.
   * The <ctype.h> functions require an unsigned char value, so
   * cast before calling them. Also insist the folded letter is
   * in 'A'..'Z': a locale-specific letter would otherwise index
   * outside p[][] below.
   */
  len = strlen(s2);
  for (pos = 0; pos < len; pos++)
    {
      if (! isalpha((unsigned char) s2[pos])) return 0;
      x = toupper((unsigned char) s2[pos]);
      if (x < 'A' || x > 'Z') return 0;
    }

  /* Nothing to model in an empty string. Handle it here: the code
   * below reads s2[0] as a letter and writes s1[1], which is past
   * the end of a buffer sized for the empty string.
   */
  if (len == 0)
    {
      s1[0] = '\0';
      return 1;
    }

  /* Collect first order counts and convert to frequencies.
   */
  for (x = 0; x < 26; x++) FSet(p[x], 26, 0.);

  i = x = toupper((unsigned char) s2[0]) - 'A';
  for (pos = 1; pos < len; pos++)
    {
      y = toupper((unsigned char) s2[pos]) - 'A';
      p[x][y] += 1.0; 
      x = y;
    }
  for (x = 0; x < 26; x++) 
    FNorm(p[x], 26);

  /* Generate a random string using those p's.
   * s2 is not read again, so writing s1 is safe even if s1 == s2.
   */
  x = i;
  s1[0] = x + 'A';
  for (pos = 1; pos < len; pos++)
    {
      y = FChoose(p[x], 26);
      s1[pos] = y + 'A';
      x = y;
    } 
  s1[pos] = '\0';

  return 1;
}
Ejemplo n.º 7
0
/* Function: Plan7Renormalize()
 * 
 * Purpose:  Renormalize the probability vectors of a model that
 *           is in counts form: match and insert emissions, begin
 *           transitions (together with tbd1), per-node transitions
 *           (match transitions together with end[k]), the null
 *           model and the four special-state transition pairs.
 *           Also zeroes the D->M and D->D transitions of node 0
 *           and updates the model's status flags.
 *           
 * Args:     hmm - the model to renormalize.
 *                 
 * Return:   (void)
 *           hmm is changed.
 */                          
void
Plan7Renormalize(struct plan7_s *hmm)
{
  int    node;			/* model position                      */
  int    xs;			/* special state index                 */
  float  denom;			/* normalizing sum                     */
  float *tr;			/* transition vector of current node   */

  /* Emissions: match for nodes 1..M, insert for nodes 1..M-1.
   */
  for (node = 1; node <= hmm->M; node++) 
    FNorm(hmm->mat[node], Alphabet_size);
  for (node = 1; node < hmm->M; node++)
    FNorm(hmm->ins[node], Alphabet_size);

  /* Begin transitions share one distribution with tbd1.
   */
  denom = FSum(hmm->begin+1, hmm->M) + hmm->tbd1;
  FScale(hmm->begin+1, hmm->M, 1./denom);
  hmm->tbd1 /= denom;

  /* Main model, nodes 1..M-1: the three match transitions share a
   * distribution with end[node]; the insert and delete pairs are
   * each normalized on their own.
   */
  for (node = 1; node < hmm->M; node++)
    {
      tr    = hmm->t[node];
      denom = FSum(tr, 3) + hmm->end[node]; 
      FScale(tr, 3, 1./denom);
      hmm->end[node] /= denom;

      FNorm(tr + 3, 2);		/* insert */
      FNorm(tr + 5, 2);		/* delete */
    }

  /* Null model emissions, then the special state transitions.
   */
  FNorm(hmm->null, Alphabet_size);
  for (xs = 0; xs < 4; xs++)
    FNorm(hmm->xt[xs], 2);

  /* Enforce nonexistent transitions at node 0
   * (the original author was unsure whether this is necessary).
   */
  hmm->t[0][TDD] = 0.0;
  hmm->t[0][TDM] = 0.0;

  /* Status flags: log-odds scores are no longer valid,
   * probabilities now are.
   */
  hmm->flags &= ~PLAN7_HASBITS;
  hmm->flags |= PLAN7_HASPROB;
}
Ejemplo n.º 8
0
//-----------------------------------------------------------------------------
// TestMatrixFNorm
//-----------------------------------------------------------------------------
bool TestMatrixFNorm()
{
   Matrix A("1,2,3;4,5,6;7,8,9");

   return ApproxEqual( FNorm(A), 16.8819430161341, TOLERANCE);
}