Ejemplo n.º 1
0
/* Function:  p7_ParameterEstimation()
 * Incept:    SRE, Sat Mar 24 10:15:37 2007 [Janelia]
 *
 * Purpose:   Given an <hmm> containing collected, weighted counts;
 *            and given a mixture Dirichlet prior <pri>;
 *            calculate mean posterior parameter estimates for 
 *            all model parameters, converting the 
 *            HMM to a parameterized probabilistic model.
 *            
 * Returns:   <eslOK> on success.
 */
int
p7_ParameterEstimation(P7_HMM *hmm, const P7_PRIOR *pri)
{
  int   k;
  double c[p7_MAXABET];
  double p[p7_MAXABET];
  double mix[p7_MAXDCHLET];
  
  /* Match transitions 0,1..M: 0 is the B state
   * TMD at node M is 0.
   */
  for (k = 0; k <= hmm->M; k++) {
    esl_vec_F2D(hmm->t[k], 3, c);
    esl_mixdchlet_MPParameters(c, 3, pri->tm, mix, p);
    esl_vec_D2F(p, 3, hmm->t[k]);
  }
  hmm->t[hmm->M][p7H_MD] = 0.0;
  esl_vec_FNorm(hmm->t[hmm->M], 3);

  /* Insert transitions, 0..M
   */
  for (k = 0; k <= hmm->M; k++) {
    esl_vec_F2D(hmm->t[k]+3, 2, c);
    esl_mixdchlet_MPParameters(c, 2, pri->ti, mix, p);
    esl_vec_D2F(p, 2, hmm->t[k]+3);
  }

  /* Delete transitions, 1..M-1
   * For k=0, which is unused; convention sets TMM=1.0, TMD=0.0
   * For k=M, TMM = 1.0 (to the E state) and TMD=0.0 (no next D; must go to E).
   */
  for (k = 1; k < hmm->M; k++) {
    esl_vec_F2D(hmm->t[k]+5, 2, c);
    esl_mixdchlet_MPParameters(c, 2, pri->td, mix, p);
    esl_vec_D2F(p, 2, hmm->t[k]+5);
  }
  hmm->t[0][p7H_DM] = hmm->t[hmm->M][p7H_DM] = 1.0;
  hmm->t[0][p7H_DD] = hmm->t[hmm->M][p7H_DD] = 0.0;

  /* Match emissions, 1..M
   * Convention sets mat[0] to a valid pvector: first elem 1, the rest 0.
   */
  for (k = 1; k <= hmm->M; k++) {
    esl_vec_F2D(hmm->mat[k], hmm->abc->K, c);
    esl_mixdchlet_MPParameters(c, hmm->abc->K, pri->em, mix, p);
    esl_vec_D2F(p, hmm->abc->K, hmm->mat[k]);
  }
  esl_vec_FSet(hmm->mat[0], hmm->abc->K, 0.);
  hmm->mat[0][0] = 1.0;

  /* Insert emissions 0..M
   */
  for (k = 0; k <= hmm->M; k++) {
    esl_vec_F2D(hmm->ins[k], hmm->abc->K, c);
    esl_mixdchlet_MPParameters(c, hmm->abc->K, pri->ei, mix, p);
    esl_vec_D2F(p, hmm->abc->K, hmm->ins[k]);
  }
  return eslOK;
}
Ejemplo n.º 2
0
/* Function:  p7_Seqmodel()
 * Synopsis:  Make a profile HMM from a single sequence.
 *
 * Purpose:   Make a profile HMM from a single sequence, for
 *            probabilistic Smith/Waterman alignment, HMMER3-style.
 *            
 *            The query is digital sequence <dsq> of length <M>
 *            residues in alphabet <abc>, named <name>. 
 *            
 *            The scoring system is given by <Q>, <f>, <popen>, and
 *            <pextend>. <Q> is a $K \times K$ matrix giving
 *            conditional residue probabilities $P(a \mid b)}$; these
 *            are typically obtained by reverse engineering a score
 *            matrix like BLOSUM62. <f> is a vector of $K$ background
 *            frequencies $p_a$. <popen> and <pextend> are the
 *            probabilities assigned to gap-open ($t_{MI}$ and
 *            $t_{MD}$) and gap-extend ($t_{II}$ and $t_{DD}$)
 *            transitions.
 *            
 *            The <p7H_SINGLE> flag is set on the <hmm>. Model
 *            configuration (<p7_profile_Config(), friends> detects
 *            this flag. <B->Mk> entry transitions include a match
 *            state occupancy term for profile HMMs, but for single
 *            queries, that <occ[]> term is assumed 1.0 for all
 *            positions. See commentary in modelconfig.c.
 *            
 * Args:      
 *
 * Returns:   <eslOK> on success, and a newly allocated HMM is returned
 *            in <ret_hmm>. 
 *
 * Throws:    <eslEMEM> on allocation error, and <*ret_hmm> is <NULL>.
 */
int
p7_Seqmodel(const ESL_ALPHABET *abc, ESL_DSQ *dsq, int M, char *name,
	    ESL_DMATRIX *Q, float *f, double popen, double pextend,
	    P7_HMM **ret_hmm)
{
  int     status;
  P7_HMM *hmm    = NULL;
  char   *logmsg = "[HMM created from a query sequence]";
  int     k;

  if ((hmm = p7_hmm_Create(M, abc)) == NULL) { status = eslEMEM; goto ERROR; }
  
  for (k = 0; k <= M; k++)
    {
      /* Use rows of P matrix as source of match emission vectors */
      if (k > 0) esl_vec_D2F(Q->mx[(int) dsq[k]], abc->K, hmm->mat[k]);

      /* Set inserts to background for now. This will be improved. */
      esl_vec_FCopy(f, abc->K, hmm->ins[k]);

      hmm->t[k][p7H_MM] = 1.0 - 2 * popen;
      hmm->t[k][p7H_MI] = popen;
      hmm->t[k][p7H_MD] = popen;
      hmm->t[k][p7H_IM] = 1.0 - pextend;
      hmm->t[k][p7H_II] = pextend;
      hmm->t[k][p7H_DM] = 1.0 - pextend;
      hmm->t[k][p7H_DD] = pextend;
    }

  /* Deal w/ special stuff at node M, overwriting a little of what we
   * just did. 
   */
  hmm->t[M][p7H_MM] = 1.0 - popen;
  hmm->t[M][p7H_MD] = 0.;
  hmm->t[M][p7H_DM] = 1.0;
  hmm->t[M][p7H_DD] = 0.;
  
  /* Add mandatory annotation
   */
  p7_hmm_SetName(hmm, name);
  p7_hmm_AppendComlog(hmm, 1, &logmsg);
  hmm->nseq     = 1;
  p7_hmm_SetCtime(hmm);
  hmm->checksum = 0;

  hmm->flags |= p7H_SINGLE;
  *ret_hmm = hmm;
  return eslOK;
  
 ERROR:
  if (hmm != NULL) p7_hmm_Destroy(hmm);
  *ret_hmm = NULL;
  return status;
}
Ejemplo n.º 3
0
/* The DChoose() and FChoose() unit tests.
 */
static void
utest_choose(ESL_RANDOMNESS *r, int n, int nbins, int be_verbose)
{
  double *pd = NULL;
  float  *pf = NULL;
  int    *ct = NULL;
  int     i;
  double  X2, diff, exp, X2p;

  if ((pd = malloc(sizeof(double) * nbins)) == NULL) esl_fatal("malloc failed"); 
  if ((pf = malloc(sizeof(float)  * nbins)) == NULL) esl_fatal("malloc failed");
  if ((ct = malloc(sizeof(int)    * nbins)) == NULL) esl_fatal("malloc failed");

  /* Sample a random multinomial probability vector.  */
  if (esl_dirichlet_DSampleUniform(r, nbins, pd) != eslOK) esl_fatal("dirichlet sample failed");
  esl_vec_D2F(pd, nbins, pf);

  /* Sample observed counts using DChoose(). */
  esl_vec_ISet(ct, nbins, 0);
  for (i = 0; i < n; i++)
    ct[esl_rnd_DChoose(r, pd, nbins)]++;

  /* X^2 test on those observed counts. */
  for (X2 = 0., i=0; i < nbins; i++) {
    exp = (double) n * pd[i];
    diff = (double) ct[i] - exp;
    X2 += diff*diff/exp;
  }
  if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi square eval failed");
  if (be_verbose) printf("DChoose():  \t%g\n", X2p);
  if (X2p < 0.01) esl_fatal("chi squared test failed");

  /* Repeat above for FChoose(). */
  esl_vec_ISet(ct, nbins, 0);
  for (i = 0; i < n; i++)
    ct[esl_rnd_FChoose(r, pf, nbins)]++;
  for (X2 = 0., i=0; i < nbins; i++) {
    exp = (double) n * pd[i];
    diff = (double) ct[i] - exp;
    X2 += diff*diff/exp;
  }
  if (esl_stats_ChiSquaredTest(nbins, X2, &X2p) != eslOK) esl_fatal("chi square eval failed");
  if (be_verbose) printf("FChoose():  \t%g\n", X2p);
  if (X2p < 0.01) esl_fatal("chi squared test failed");
  
  free(pd);
  free(pf);
  free(ct);
  return;
}
Ejemplo n.º 4
0
static void
utest_pvectors(void)
{
  char  *msg   = "pvector unit test failed";
  double p1[4] = { 0.25, 0.25, 0.25, 0.25 };
  double p2[4];
  double p3[4];
  float  p1f[4]; 
  float  p2f[4] = { 0.0,   0.5, 0.5,  0.0  };
  float  p3f[4];
  int    n = 4;
  double result;

  esl_vec_D2F(p1,  n, p1f);
  esl_vec_F2D(p2f, n, p2);  

  if (esl_vec_DValidate(p1,  n, 1e-12, NULL) != eslOK) esl_fatal(msg);
  if (esl_vec_FValidate(p1f, n, 1e-7,  NULL) != eslOK) esl_fatal(msg);

  result = esl_vec_DEntropy(p1,  n);          if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg);
  result = esl_vec_FEntropy(p1f, n);          if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg);
  result = esl_vec_DEntropy(p2,  n);          if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg);
  result = esl_vec_FEntropy(p2f, n);          if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg);

  result = esl_vec_DRelEntropy(p2,  p1,  n);  if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg);
  result = esl_vec_FRelEntropy(p2f, p1f, n);  if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg);

  result = esl_vec_DRelEntropy(p1,  p2,  n);  if (result != eslINFINITY)  esl_fatal(msg);
  result = esl_vec_FRelEntropy(p1f, p2f, n);  if (result != eslINFINITY)  esl_fatal(msg);

  esl_vec_DLog(p2, n);
  if (esl_vec_DLogValidate(p2, n, 1e-12, NULL) != eslOK) esl_fatal(msg);
  esl_vec_DExp(p2, n);
  if (p2[0] != 0.) esl_fatal(msg);

  esl_vec_FLog(p2f, n);
  if (esl_vec_FLogValidate(p2f, n, 1e-7, NULL) != eslOK) esl_fatal(msg);
  esl_vec_FExp(p2f, n);
  if (p2f[0] != 0.) esl_fatal(msg);

  esl_vec_DCopy(p2, n, p3);
  esl_vec_DScale(p3, n, 10.);
  esl_vec_DNorm(p3, n);
  if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg);

  esl_vec_DLog(p3, n);
  result = esl_vec_DLogSum(p3, n); if (esl_DCompare(0.0, result, 1e-12) != eslOK) esl_fatal(msg);
  esl_vec_DIncrement(p3, n, 2.0);
  esl_vec_DLogNorm(p3, n);
  if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg);

  esl_vec_FCopy(p2f, n, p3f);
  esl_vec_FScale(p3f, n, 10.);
  esl_vec_FNorm(p3f, n);
  if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg);

  esl_vec_FLog(p3f, n);
  result = esl_vec_FLogSum(p3f, n); if (esl_DCompare(0.0, result, 1e-7) != eslOK) esl_fatal(msg);
  esl_vec_FIncrement(p3f, n, 2.0);
  esl_vec_FLogNorm(p3f, n);
  if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg);

  return;
}
Ejemplo n.º 5
0
int 
main(int argc, char **argv)
{
  ESL_GETOPTS    *go      = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage);
  ESL_ALPHABET   *abc     = esl_alphabet_Create(eslAMINO);
  char           *qfile   = esl_opt_GetArg(go, 1);
  char           *tfile   = esl_opt_GetArg(go, 2);
  ESL_SQFILE     *qfp     = NULL;
  ESL_SQFILE     *tfp     = NULL;
  ESL_SQ         *qsq     = esl_sq_CreateDigital(abc);
  ESL_SQ         *tsq     = esl_sq_CreateDigital(abc);
  ESL_SCOREMATRIX *S      = esl_scorematrix_Create(abc);
  ESL_DMATRIX     *Q      = NULL;
  P7_BG           *bg     = p7_bg_Create(abc);
  P7_HMM          *hmm    = NULL;
  P7_PROFILE      *gm     = NULL;
  P7_REFMX        *vit    = p7_refmx_Create(200, 400); /* will grow as needed */
  double          *fa     = malloc(sizeof(double) * abc->K);
  double          popen   = 0.02;
  double          pextend = 0.4;
  double          lambda;
  float           vsc;
  float           nullsc;
  int             status;

  esl_composition_BL62(fa);
  esl_vec_D2F(fa, abc->K, bg->f);
  esl_scorematrix_Set("BLOSUM62", S);
  esl_scorematrix_ProbifyGivenBG(S, fa, fa, &lambda, &Q); 
  esl_scorematrix_JointToConditionalOnQuery(abc, Q);

  if (esl_sqfile_OpenDigital(abc, qfile, eslSQFILE_UNKNOWN, NULL, &qfp) != eslOK) esl_fatal("failed to open %s", qfile);
  if (esl_sqio_Read(qfp, qsq) != eslOK) esl_fatal("failed to read query seq");

  p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm);  
  p7_hmm_SetComposition(hmm);
  p7_hmm_SetConsensus(hmm, qsq); 
  
  gm = p7_profile_Create(hmm->M, abc);
  p7_profile_ConfigUnilocal(gm, hmm, bg, 400);
  
  if (esl_sqfile_OpenDigital(abc, tfile, eslSQFILE_UNKNOWN, NULL, &tfp) != eslOK) esl_fatal("failed to open %s", tfile);
  while ((status = esl_sqio_Read(tfp, tsq)) == eslOK)
    {
      p7_bg_SetLength     (bg, tsq->n);
      p7_profile_SetLength(gm, tsq->n);
      
      p7_ReferenceViterbi(tsq->dsq, tsq->n, gm, vit, NULL, &vsc);    
      p7_bg_NullOne(bg, tsq->dsq, tsq->n, &nullsc);

      printf("%.4f %-25s %-25s\n", (vsc - nullsc) / eslCONST_LOG2, tsq->name, gm->name);
   
      esl_sq_Reuse(tsq);
      p7_refmx_Reuse(vit);
    }
  
  p7_refmx_Destroy(vit);
  p7_profile_Destroy(gm);
  p7_hmm_Destroy(hmm);
  p7_bg_Destroy(bg);
  esl_dmatrix_Destroy(Q);
  esl_scorematrix_Destroy(S);
  free(fa);
  esl_sq_Destroy(qsq);
  esl_sq_Destroy(tsq);
  esl_sqfile_Close(qfp);
  esl_sqfile_Close(tfp);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}