/**
 * int main(int argc, char **argv)
 * Main driver
 *
 * Usage: two positional arguments, <input hmmfile> and <output hmmfile>.
 *
 * For every HMM read from <input hmmfile>:
 *   - sum the transition vectors t[k] over k = 1..M-1, renormalize the
 *     match (3), insert (2) and delete (2) groups of that sum separately,
 *     and overwrite each t[k], k = 1..M-1, with the averaged vector;
 *   - validate the modified HMM and append it, in ASCII format, to
 *     <output hmmfile>;
 *   - print one line of summary statistics to stdout.
 *
 * Exit status is 0 on success. Command line problems exit(1) after
 * printing usage to stdout; file, validation and write problems are
 * reported through p7_Fail() / esl_fatal().
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS     *go         = NULL;   /* command line processing                 */
  ESL_ALPHABET    *abc        = NULL;   /* alphabet, set by the first HMM read     */
  char            *hmmfile    = NULL;   /* input HMM file name  (argument 1)       */
  char            *outhmmfile = NULL;   /* output HMM file name (argument 2)       */
  P7_HMMFILE      *hfp        = NULL;   /* open input HMM file                     */
  FILE            *outhmmfp   = NULL;   /* HMM output file handle                  */
  P7_HMM          *hmm        = NULL;   /* HMM currently being processed           */
  P7_BG           *bg         = NULL;   /* background model, created on first HMM  */
  int              nhmm;                /* number of HMMs processed so far         */
  double           x;                   /* mean position relative entropy          */
  float            KL;                  /* composition KL distance                 */
  int              status;
  char             errbuf[eslERRBUFSIZE];   /* message buffer for opening input    */
  char             errmsg[eslERRBUFSIZE];   /* message buffer for HMM validation   */

  float average_internal_transitions[ p7H_NTRANSITIONS ];
  int   k;

  /* Process the command line options.
   */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  if (esl_opt_GetBoolean(go, "-h") == TRUE)
    {
      profillic_p7_banner(stdout, argv[0], banner);
      esl_usage(stdout, argv[0], usage);
      puts("\nOptions:");
      esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/
      exit(0);
    }
  if (esl_opt_ArgNumber(go) != 2)
    {
      puts("Incorrect number of command line arguments.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL)
    {
      puts("Failed to read <input hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((outhmmfile = esl_opt_GetArg(go, 2)) == NULL)
    {
      puts("Failed to read <output hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  profillic_p7_banner(stdout, argv[0], banner);

  /* Initializations: open the input HMM file for reading
   */
  status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);

  /* Initializations: open the output HMM file for writing.
   * Report the failure the same way as for the input file; <status> is
   * eslOK at this point, so it must not be used as the exit code.
   */
  if ((outhmmfp = fopen(outhmmfile, "w")) == NULL)
    p7_Fail("Failed to open HMM file %s for writing\n", outhmmfile);

  /* Main body: read HMMs one at a time, average their internal
   * transitions, save them, and print one line of stats for each.
   */
  printf("#\n");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "idx",  "name",                 "accession",    "nseq",     "eff_nseq", "M",      "relent", "info",   "p relE", "compKL");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------");

  nhmm = 0;
  while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF)
    {
      if      (status == eslEOD)       esl_fatal("read failed, HMM file %s may be truncated?", hmmfile);
      else if (status == eslEFORMAT)   esl_fatal("bad file format in HMM file %s",             hmmfile);
      else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets",   hmmfile);
      else if (status != eslOK)        esl_fatal("Unexpected error in reading HMMs from %s",   hmmfile);
      nhmm++;

      if (bg == NULL) bg = p7_bg_Create(abc);

      /* Accumulate the transition vectors of nodes 1..M-1. */
      esl_vec_FSet(average_internal_transitions, p7H_NTRANSITIONS, 0.);
      for( k = 1; k < hmm->M; k++ ) {
        esl_vec_FAdd(average_internal_transitions, hmm->t[k], p7H_NTRANSITIONS);
      }
      /* Renormalize each group of the summed vector on its own. */
      // Match transitions
      esl_vec_FNorm(average_internal_transitions, 3);
      // Insert transitions
      esl_vec_FNorm(average_internal_transitions + 3, 2);
      // Delete transitions
      esl_vec_FNorm(average_internal_transitions + 5, 2);
      // Ok now set them.
      for( k = 1; k < hmm->M; k++ ) {
        esl_vec_FCopy( average_internal_transitions, p7H_NTRANSITIONS, hmm->t[k] );
      }

      /* Refuse to save a model that no longer validates, and say why. */
      errmsg[0] = '\0';
      if ((status = p7_hmm_Validate(hmm, errmsg, 0.0001)) != eslOK)
        p7_Fail("HMM #%d from %s failed validation after averaging transitions.\n%s\n", nhmm, hmmfile, errmsg);
      if ((status = p7_hmmfile_WriteASCII(outhmmfp, -1, hmm)) != eslOK)
        p7_Fail("HMM save failed\n");

      p7_MeanPositionRelativeEntropy(hmm, bg, &x);
      p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL);

      printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f\n",
             nhmm,
             hmm->name,
             hmm->acc == NULL ? "-" : hmm->acc,
             hmm->nseq,
             hmm->eff_nseq,
             hmm->M,
             p7_MeanMatchRelativeEntropy(hmm, bg),
             p7_MeanMatchInfo(hmm, bg),
             x,
             KL);

      p7_hmm_Destroy(hmm);
    }

  /* fclose() flushes buffered output, so a failure here means the saved
   * HMM file may be incomplete. <outhmmfile> is still valid because
   * <go> has not been destroyed yet.
   */
  if (fclose(outhmmfp) != 0)
    p7_Fail("Failed to close HMM file %s after writing; output may be incomplete\n", outhmmfile);

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  p7_hmmfile_Close(hfp);
  esl_getopts_Destroy(go);
  exit(0);
}
示例#2
0
/* Function:  p7_GNull2_ByExpectation()
 * Synopsis:  Calculate null2 model from posterior probabilities.
 * Incept:    SRE, Thu Feb 28 09:52:28 2008 [Janelia]
 *
 * Purpose:   Compute the "null2" model for the envelope covered by the
 *            posterior probability matrix <pp>, calculated for model
 *            <gm>. The null2 odds emission probabilities
 *            $\frac{f_d(x)}{f_0(x)}$ are stored in <null2>, for which
 *            the caller provides space for at least <alphabet->Kp>
 *            residues.
 *
 *            The expectation method is meant for envelopes in simple,
 *            well resolved regions: regions holding a single envelope,
 *            where no stochastic traceback clustering was needed.
 *
 *            The caller must have calculated <pp> for the envelope
 *            only, so that its rows are numbered <1..Ld> for envelope
 *            <ienv..jenv> of length <Ld=jenv-ienv+1>.
 *
 * Args:      gm    - profile, in any mode, target length model set to <L>
 *            pp    - posterior prob matrix, for <gm> against domain envelope <dsq+i-1> (offset)
 *            null2 - RETURN: null2 odds ratios per residue; <0..Kp-1>; caller allocated space
 *
 * Returns:   <eslOK> on success; <null2> contains the null2 scores.
 *            Row 0 of <pp> is overwritten: it is used as scratch space
 *            and ends up holding the log of the expected frequency
 *            with which each M,I,N,C,J state is used in this <pp>
 *            matrix to generate residues.
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_GNull2_ByExpectation(const P7_PROFILE *gm, P7_GMX *pp, float *null2)
{
  const int M       = gm->M;
  const int Ld      = pp->L;
  const int K       = gm->abc->K;              /* canonical residues 0..K-1              */
  const int Kp      = gm->abc->Kp;
  const int rowlen  = (M+1) * p7G_NSCELLS;     /* number of cells in one main-state row  */
  /* NOTE(review): <dp> and <xmx> are not named explicitly below, but the
   * MMX()/IMX()/XMX() macros presumably refer to them by name -- do not
   * rename them without checking the macro definitions.
   */
  float   **dp      = pp->dp;
  float    *xmx     = pp->xmx;
  float     neg_log_Ld;                        /* -log(Ld), added to every row-0 cell    */
  float     xfactor;                           /* logsum of the N, C, J weights          */
  int       x;                                 /* over symbols 0..K-1                    */
  int       i;                                 /* over envelope positions 2..Ld          */
  int       k;                                 /* over model nodes 1..M-1                */

  /* Expected number of times each emitting state generated one of the
   * Ld residues: the column sums of rows 1..Ld, accumulated into row 0
   * of <pp>.
   */
  esl_vec_FCopy(dp[1],             rowlen,      dp[0]);
  esl_vec_FCopy(xmx + p7G_NXCELLS, p7G_NXCELLS, xmx);
  for (i = 2; i <= Ld; i++)
    {
      esl_vec_FAdd(dp[0], dp[i],                 rowlen);
      esl_vec_FAdd(xmx,   xmx + i * p7G_NXCELLS, p7G_NXCELLS);
    }

  /* Turn the expected counts into log frequencies (log count - log Ld);
   * these are the log posterior weights.
   */
  neg_log_Ld = -log((float)Ld);

  esl_vec_FLog(dp[0], rowlen);
  esl_vec_FLog(xmx,   p7G_NXCELLS);

  esl_vec_FIncrement(dp[0], rowlen,      neg_log_Ld);
  esl_vec_FIncrement(xmx,   p7G_NXCELLS, neg_log_Ld);

  /* Log odds emission probability of each residue: the posterior
   * weighted logsum over every emission vector used in paths
   * explaining the domain. Slow; a candidate for later optimization.
   */
  xfactor = p7_FLogsum(p7_FLogsum(XMX(0,p7G_N), XMX(0,p7G_C)), XMX(0,p7G_J));

  for (x = 0; x < K; x++)
    {
      float acc = -eslINFINITY;

      /* nodes 1..M-1 contribute a match and an insert term each */
      for (k = 1; k < M; k++)
        {
          acc = p7_FLogsum(acc, MMX(0,k) + p7P_MSC(gm, k, x));
          acc = p7_FLogsum(acc, IMX(0,k) + p7P_ISC(gm, k, x));
        }
      /* node M contributes its match term only */
      acc = p7_FLogsum(acc, MMX(0,M) + p7P_MSC(gm, M, x));

      null2[x] = p7_FLogsum(acc, xfactor);
    }

  /* Leave log space: null2[x] = \frac{f_d(x)}{f_0(x)} for x in 0..K-1,
   * where f_d(x) are the ad hoc "null2" residue frequencies for this
   * envelope.
   */
  esl_vec_FExp(null2, K);

  /* Degenerate symbols get the average odds ratio; the averaging call
   * does not set gap, nonresidue or missing data, so those are fixed
   * at 1.0 here.
   */
  esl_abc_FAvgScVec(gm->abc, null2);
  null2[K]      = 1.0;    /* gap character    */
  null2[Kp - 2] = 1.0;    /* nonresidue "*"   */
  null2[Kp - 1] = 1.0;    /* missing data "~" */

  return eslOK;
}