Exemplo n.º 1
0
/* Function:  p7_Builder()
 * Synopsis:  Build a new HMM from an MSA.
 *
 * Purpose:   Take the multiple sequence alignment <msa> and a build configuration <bld>,
 *            and build a new HMM. 
 * 
 *            Effective sequence number determination and calibration steps require
 *            additionally providing a null model <bg>.
 *
 * Args:      bld         - build configuration
 *            msa         - multiple sequence alignment
 *            bg          - null model
 *            opt_hmm     - optRETURN: new HMM
 *            opt_trarr   - optRETURN: array of faux tracebacks, <0..nseq-1>
 *            opt_gm      - optRETURN: profile corresponding to <hmm>
 *            opt_om      - optRETURN: optimized profile corresponding to <gm>
 *            opt_postmsa - optRETURN: RF-annotated, possibly modified MSA 
 *
 * Returns:   <eslOK> on success. The new HMM is optionally returned in
 *            <*opt_hmm>, along with optional returns of an array of faux tracebacks
 *            for each sequence in <*opt_trarr>, the annotated MSA used to construct
 *            the model in <*opt_postmsa>, a configured search profile in 
 *            <*opt_gm>, and an optimized search profile in <*opt_om>. These are
 *            all optional returns because the caller may, for example, be interested
 *            only in an optimized profile, or may only be interested in the HMM.
 *            
 *            Returns <eslENORESULT> if no consensus columns were annotated.
 *            Returns <eslEFORMAT> on MSA format problems, such as a missing RF annotation
 *            line in hand architecture construction. On any returned error,
 *            <bld->errbuf> contains an informative error message.
 *
 * Throws:    <eslEMEM> on allocation error.
 *            <eslEINVAL> if relative weights couldn't be calculated from <msa>.
 *
 * Xref:      J4/30.
 */
int
p7_Builder(P7_BUILDER *bld, ESL_MSA *msa, P7_BG *bg,
	   P7_HMM **opt_hmm, P7_TRACE ***opt_trarr, P7_PROFILE **opt_gm, P7_OPROFILE **opt_om,
	   ESL_MSA **opt_postmsa)
{
  int i,j;
  uint32_t    checksum = 0;	/* checksum calculated for the input MSA. hmmalign --mapali verifies against this. */
  P7_HMM     *hmm      = NULL;
  P7_TRACE  **tr       = NULL;
  P7_TRACE ***tr_ptr   = (opt_trarr != NULL || opt_postmsa != NULL) ? &tr : NULL;
  int         status;
  if ((status =  validate_msa         (bld, msa))                       != eslOK) goto ERROR;
  if ((status =  esl_msa_Checksum     (msa, &checksum))                 != eslOK) ESL_XFAIL(status, bld->errbuf, "Failed to calculate checksum"); 
  if ((status =  relative_weights     (bld, msa))                       != eslOK) goto ERROR;
  if ((status =  esl_msa_MarkFragments(msa, bld->fragthresh))           != eslOK) goto ERROR;
  if ((status =  build_model          (bld, msa, &hmm, tr_ptr))         != eslOK) goto ERROR;

  //Ensures that the weighted-average I->I count <=  bld->max_insert_len
  //(MI currently contains the number of observed insert-starts)
  if (bld->max_insert_len>0)
    for (i=1; i<hmm->M; i++ )
      hmm->t[i][p7H_II] = ESL_MIN(hmm->t[i][p7H_II], bld->max_insert_len*hmm->t[i][p7H_MI]);

  if ((status =  effective_seqnumber  (bld, msa, hmm, bg))              != eslOK) goto ERROR;
  if ((status =  parameterize         (bld, hmm))                       != eslOK) goto ERROR;
  if ((status =  annotate             (bld, msa, hmm))                  != eslOK) goto ERROR;
  if ((status =  calibrate            (bld, hmm, bg, opt_gm, opt_om))   != eslOK) goto ERROR;
  if ((status =  make_post_msa        (bld, msa, hmm, tr, opt_postmsa)) != eslOK) goto ERROR;

  //force masked positions to background  (it'll be close already, so no relevant impact on weighting)
  if (hmm->mm != NULL)
    for (i=1; i<hmm->M; i++ )
      if (hmm->mm[i] == 'm')
        for (j=0; j<hmm->abc->K; j++)
          hmm->mat[i][j] = bg->f[j];

  if ( bld->abc->type == eslDNA ||  bld->abc->type == eslRNA ) {
	  if (bld->w_len > 0)           hmm->max_length = bld->w_len;
	  else if (bld->w_beta == 0.0)  hmm->max_length = hmm->M *4;
	  else if ( (status =  p7_Builder_MaxLength(hmm, bld->w_beta)) != eslOK) goto ERROR;
  }

  hmm->checksum = checksum;
  hmm->flags   |= p7H_CHKSUM;

  if (opt_hmm   != NULL) *opt_hmm   = hmm; else p7_hmm_Destroy(hmm);
  if (opt_trarr != NULL) *opt_trarr = tr;  else p7_trace_DestroyArray(tr, msa->nseq);
  return eslOK;

 ERROR:
  p7_hmm_Destroy(hmm);
  p7_trace_DestroyArray(tr, msa->nseq);
  if (opt_gm    != NULL) p7_profile_Destroy(*opt_gm);
  if (opt_om    != NULL) p7_oprofile_Destroy(*opt_om);
  return status;
}
Exemplo n.º 2
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go        = p7_CreateDefaultApp(options, 1, argc, argv, banner, usage);
  char         *msafile   = esl_opt_GetArg(go, 1);
  int           fmt       = eslMSAFILE_UNKNOWN;
  ESL_ALPHABET *abc       = NULL;
  ESL_MSAFILE  *afp       = NULL;
  ESL_MSA      *msa       = NULL;
  P7_HMM       *hmm       = NULL;
  P7_PRIOR     *prior     = NULL;
  P7_TRACE    **trarr     = NULL;
  P7_BG        *bg        = NULL;
  P7_PROFILE   *gm        = NULL;
  ESL_MSA      *postmsa   = NULL;
  int           i;
  int           status;
  
  /* Standard idioms for opening and reading a digital MSA. (See esl_msafile.c example). */
  if      (esl_opt_GetBoolean(go, "--rna"))   abc = esl_alphabet_Create(eslRNA);
  else if (esl_opt_GetBoolean(go, "--dna"))   abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);

  if ((status = esl_msafile_Open(&abc, msafile, NULL, fmt, NULL, &afp)) != eslOK)
    esl_msafile_OpenFailure(afp, status);

  bg  = p7_bg_Create(abc);

  switch (abc->type) {
  case eslAMINO: prior = p7_prior_CreateAmino();      break;
  case eslDNA:   prior = p7_prior_CreateNucleic();    break;
  case eslRNA:   prior = p7_prior_CreateNucleic();    break;
  default:       prior = p7_prior_CreateLaplace(abc); break;
  }
  if (prior == NULL) esl_fatal("Failed to initialize prior");

  while ((status = esl_msafile_Read(afp, &msa)) != eslEOF)
    {
      if (status != eslOK) esl_msafile_ReadFailure(afp, status);

      /* The modelmakers collect counts in an HMM structure */
      status = p7_Handmodelmaker(msa, NULL, &hmm, &trarr);
      if      (status == eslENORESULT) esl_fatal("no consensus columns in alignment %s\n",  msa->name);
      else if (status != eslOK)        esl_fatal("failed to build HMM from alignment %s\n", msa->name);

      printf("COUNTS:\n");
      p7_hmm_Dump(stdout, hmm);

      /* These counts, in combination with a prior, are converted to probability parameters */
      status = p7_ParameterEstimation(hmm, prior);
      if (status != eslOK)             esl_fatal("failed to parameterize HMM for %s", msa->name);

      printf("PROBABILITIES:\n");
      p7_hmm_Dump(stdout, hmm);

      /* Just so we can dump a more informatively annotated trace - build a profile */
      gm = p7_profile_Create(hmm->M, abc);
      p7_profile_Config   (gm, hmm, bg);
      p7_profile_SetLength(gm, 400);

      /* Dump the individual traces */
      for (i = 0; i < msa->nseq; i++)
	{
	  printf("Trace %d: %s\n", i+1, msa->sqname[i]);
	  p7_trace_DumpAnnotated(stdout, trarr[i], gm, msa->ax[i]);
	}
      
      /* Create an MSA from the individual traces */
      status = p7_tracealign_MSA(msa, trarr, hmm->M, p7_DEFAULT, &postmsa);
      if (status != eslOK) esl_fatal("failed to create new MSA from traces\n");

      esl_msafile_Write(stdout, postmsa, eslMSAFILE_PFAM);

      p7_profile_Destroy(gm);
      p7_hmm_Destroy(hmm);
      p7_trace_DestroyArray(trarr, msa->nseq);
      esl_msa_Destroy(postmsa);
      esl_msa_Destroy(msa);
    }

  esl_msafile_Close(afp);
  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
Exemplo n.º 3
0
/* utest_fragments()
 * This exercises the building code that deals with fragments,
 * creating traces with B->X->{MDI}k and {MDI}k->X->E 
 * transitions, and making sure we can make MSAs correctly
 * from them using p7_tracealign_MSA(). This code was initially
 * buggy when first written; bugs first detected by Elena, 
 * Nov 2009
 */
static void
utest_fragments(void)
{
  char         *failmsg      = "failure in build.c::utest_fragments() unit test";
  char          msafile[16]  = "p7tmpXXXXXX"; /* tmpfile name template */
  FILE         *ofp          = NULL;
  ESL_ALPHABET *abc          = esl_alphabet_Create(eslAMINO);
  ESL_MSAFILE  *afp          = NULL;
  ESL_MSA      *msa          = NULL;
  ESL_MSA      *dmsa         = NULL;
  ESL_MSA      *postmsa      = NULL;
  P7_HMM       *hmm          = NULL;
  P7_TRACE    **trarr        = NULL;
  int           i;

  /* Write an MSA that tests fragment/missing data transitions. 
   * When built with Handmodelmaker (using the RF line):
   *   seq1 forces B->X->Mk and Mk->X->E missing data transitions; 
   *   seq2 forces B->X->Ik and Ik->X->E missing data transitions;
   *   seq3 forces B->X->Dk and Dk->X->E missing data transitions.
   *
   * The first two cases can arise from fragment definition in
   * model construction, or in an input file. 
   *
   * The X->Dk and Dk->X cases should never happen, but we don't
   * prohibit them. They can only arise in an input file, because
   * esl_msa_MarkFragments() converts everything before/after
   * first/last residue to ~, and won't leave a gap character in
   * between.
   *
   * There's nothing being tested by seq4 and seq5; they're just there.
   */
  if (esl_tmpfile_named(msafile, &ofp) != eslOK) esl_fatal(failmsg);
  fprintf(ofp, "# STOCKHOLM 1.0\n");
  fprintf(ofp, "#=GC RF xxxxx.xxxxxxxxxxxx.xxx\n");
  fprintf(ofp, "seq1    ~~~~~~GHIKLMNPQRST~~~~\n");
  fprintf(ofp, "seq2    ~~~~~aGHIKLMNPQRSTa~~~\n");
  fprintf(ofp, "seq3    ~~~~~~~HIKLMNPQRS~~~~~\n");
  fprintf(ofp, "seq4    ACDEF.GHIKLMNPQRST.VWY\n");
  fprintf(ofp, "seq5    ACDEF.GHIKLMNPQRST.VWY\n");
  fprintf(ofp, "//\n");
  fclose(ofp);

  /* Read the original as text for comparison to postmsa. Make a digital copy for construction */
  if (esl_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)!= eslOK) esl_fatal(failmsg);
  if (esl_msafile_Read(afp, &msa)                                          != eslOK) esl_fatal(failmsg);
  if ((dmsa = esl_msa_Clone(msa))                                           == NULL)  esl_fatal(failmsg);
  if (esl_msa_Digitize(abc, dmsa, NULL)                                     != eslOK) esl_fatal(failmsg);

  if (p7_Handmodelmaker(dmsa, NULL, &hmm, &trarr)                           != eslOK) esl_fatal(failmsg);
  for (i = 0; i < dmsa->nseq; i++)
    if (p7_trace_Validate(trarr[i], abc, dmsa->ax[i], NULL)                 != eslOK) esl_fatal(failmsg);

  /* The example is contrived such that the traces should give exactly the
   * same (text) alignment as the input alignment; no tracedoctoring.
   * Not a trivial test; for example, sequence 2 has a B->X->I transition that 
   * can be problematic to handle.
   */
  if (p7_tracealign_MSA(dmsa, trarr, hmm->M, p7_DEFAULT, &postmsa)          != eslOK) esl_fatal(failmsg);
  for (i = 0; i < msa->nseq; i++)
    if (strcmp(msa->aseq[i], postmsa->aseq[i]) != 0) esl_fatal(failmsg);

  p7_trace_DestroyArray(trarr, msa->nseq);
  p7_hmm_Destroy(hmm);
  esl_msa_Destroy(msa);
  esl_msa_Destroy(dmsa);
  esl_msa_Destroy(postmsa);
  esl_msafile_Close(afp);
  esl_alphabet_Destroy(abc);
  remove(msafile);
  return;
}
Exemplo n.º 4
0
/* Function: matassign2hmm()
 * 
 * Purpose:  Given an assignment of alignment columns to match vs.
 *           insert, finish the final part of the model construction 
 *           calculation that is constant between model construction
 *           algorithms.
 *           
 * Args:     msa       - multiple sequence alignment
 *           matassign - 1..alen bit flags for column assignments
 *           ret_hmm   - RETURN: counts-form HMM
 *           opt_tr    - optRETURN: array of tracebacks for aseq's
 *                         
 * Return:   <eslOK> on success.
 *           <eslENORESULT> if no consensus columns are identified.
 *
 *           ret_hmm and opt_tr alloc'ed here.
 */
static int
matassign2hmm(ESL_MSA *msa, int *matassign, P7_HMM **ret_hmm, P7_TRACE ***opt_tr)
{
  int        status;		/* return status                       */
  P7_HMM    *hmm = NULL;        /* RETURN: new hmm                     */
  P7_TRACE **tr  = NULL;        /* RETURN: 0..nseq-1 fake traces       */
  int      M;                   /* length of new model in match states */
  int      idx;                 /* counter over sequences              */
  int      apos;                /* counter for aligned columns         */
#ifdef p7_DEBUGGING
  char     errbuf[eslERRBUFSIZE];
#endif

  /* apply the model mask in the 'GC MM' row */
  do_modelmask(msa);

  /* How many match states in the HMM? */
  for (M = 0, apos = 1; apos <= msa->alen; apos++) 
    if (matassign[apos]) M++;
  if (M == 0) { status = eslENORESULT; goto ERROR; }

  /* Make fake tracebacks for each seq */
  ESL_ALLOC(tr, sizeof(P7_TRACE *) * msa->nseq);
  if ((status = p7_trace_FauxFromMSA(msa, matassign, p7_MSA_COORDS, tr))        != eslOK) goto ERROR;
  for (idx = 0; idx < msa->nseq; idx++)
    {
      if ((status = p7_trace_Doctor(tr[idx], NULL, NULL))                       != eslOK) goto ERROR;
#ifdef p7_DEBUGGING
      if ((status = p7_trace_Validate(tr[idx], msa->abc, msa->ax[idx], errbuf)) != eslOK) 
	ESL_XEXCEPTION(eslFAIL, "validation failed: %s", errbuf);
#endif
    }

  /* Build count model from tracebacks */
  if ((hmm    = p7_hmm_Create(M, msa->abc)) == NULL)  { status = eslEMEM; goto ERROR; }
  if ((status = p7_hmm_Zero(hmm))           != eslOK) goto ERROR;
  for (idx = 0; idx < msa->nseq; idx++) {
    if (tr[idx] == NULL) continue; /* skip rare examples of empty sequences */
    if ((status = p7_trace_Count(hmm, msa->ax[idx], msa->wgt[idx], tr[idx])) != eslOK) goto ERROR;
  }

  hmm->nseq     = msa->nseq;
  hmm->eff_nseq = msa->nseq;

  /* Transfer annotation from the MSA to the new model
   */
  if ((status = annotate_model(hmm, matassign, msa)) != eslOK) goto ERROR;

  /* Reset #=RF line of alignment to reflect our assignment
   * of match, delete. matassign is valid from 1..alen and is off
   * by one from msa->rf.
   */
  if (msa->rf == NULL)  ESL_ALLOC(msa->rf, sizeof(char) * (msa->alen + 1));
  for (apos = 1; apos <= msa->alen; apos++)
    msa->rf[apos-1] = matassign[apos] ? 'x' : '.';
  msa->rf[msa->alen] = '\0';

  if (opt_tr  != NULL) *opt_tr  = tr; 
  else                  p7_trace_DestroyArray(tr, msa->nseq);
  *ret_hmm = hmm;
  return eslOK;

 ERROR:
  if (tr     != NULL) p7_trace_DestroyArray(tr, msa->nseq);
  if (hmm    != NULL) p7_hmm_Destroy(hmm);
  if (opt_tr != NULL) *opt_tr = NULL;
  *ret_hmm = NULL;
  return status;
}