/* Function:  p7_sparsemask_Compare_avx512()
 * Synopsis:  Compare the AVX-512 parts of two sparse masks for equality.
 *
 * Purpose:   Compare the AVX-512 fields of <sm1> and <sm2>: the
 *            dimensions <L>, <M>, the segment count, row count and
 *            cell count, every segment's <ia>/<ib> coordinates
 *            (entries 0..S+1), the per-row cell counts <n[0..L]>,
 *            and each row's list of marked cells <k[i][0..n[i]-1]>.
 *
 * Returns:   <eslOK> if all compared fields are equal.
 *            <eslFAIL> if any differ, or if <sm2> is not flagged as
 *            an AVX512 mask.
 *            <eslENORESULT> if compiled without <HAVE_AVX512>.
 */
int
p7_sparsemask_Compare_avx512(const P7_SPARSEMASK *sm1, const P7_SPARSEMASK *sm2)
{
#ifdef HAVE_AVX512
  char msg[] = "P7_SPARSEMASK comparison failed";
  int  row;
  int  seg;

  if (sm2->simd != AVX512)
    ESL_FAIL(eslFAIL, NULL, "Can't compare sparsemasks generated for different SIMD instruction sets");

  /* Scalar dimensions first: everything below indexes by them. */
  if (sm1->L              != sm2->L)              ESL_FAIL(eslFAIL, NULL, msg);
  if (sm1->M              != sm2->M)              ESL_FAIL(eslFAIL, NULL, msg);
  if (sm1->S_AVX_512      != sm2->S_AVX_512)      ESL_FAIL(eslFAIL, NULL, msg);
  if (sm1->nrow_AVX_512   != sm2->nrow_AVX_512)   ESL_FAIL(eslFAIL, NULL, msg);
  if (sm1->ncells_AVX_512 != sm2->ncells_AVX_512) ESL_FAIL(eslFAIL, NULL, msg);

  /* Segment coordinates, including entries 0 and S+1. */
  for (seg = 0; seg <= sm1->S_AVX_512+1; seg++)
    {
      if (sm1->seg_AVX_512[seg].ia != sm2->seg_AVX_512[seg].ia ||
          sm1->seg_AVX_512[seg].ib != sm2->seg_AVX_512[seg].ib)
        ESL_FAIL(eslFAIL, NULL, msg);
    }

  /* Per-row cell counts, then the cell index list of each row. */
  if (esl_vec_ICompare(sm1->n_AVX_512, sm2->n_AVX_512, sm1->L+1) != eslOK)
    ESL_FAIL(eslFAIL, NULL, msg);

  for (row = 0; row <= sm1->L; row++)
    {
      if (esl_vec_ICompare(sm1->k_AVX_512[row], sm2->k_AVX_512[row], sm1->n_AVX_512[row]) != eslOK)
        ESL_FAIL(eslFAIL, NULL, msg);
    }
  return eslOK;
#else
  return eslENORESULT;
#endif
}
Beispiel #2
0
/* Function:  esl_rmx_ValidateQ()
 * Incept:    SRE, Sun Mar 11 10:30:50 2007 [Janelia]
 *
 * Purpose:   Validates an instantaneous rate matrix <Q> for a
 *            continuous-time Markov process, whose elements $q_{ij}$
 *            represent instantaneous transition rates $i \rightarrow
 *            j$. 
 *            
 *            Rows satisfy the condition that
 *            $q_{ii} = -\sum_{j \neq i} q_{ij}$, and also
 *            that $q_{ij} \geq 0$ for all $j \neq i$. As a
 *            consequence each diagonal element must be $\leq 0$,
 *            which is checked explicitly.
 *            
 *            <tol> specifies the floating-point tolerance to which
 *            the row-sum condition must hold: 
 *            <fabs(sum + q_ii) <= tol>, where <sum> is the sum of
 *            the off-diagonal elements of the row.
 *            
 *            <errbuf> is an optional error message buffer. The caller
 *            may pass <NULL> or a pointer to a buffer of at least
 *            <eslERRBUFSIZE> characters.
 *            
 * Args:      Q      - rate matrix to validate
 *            tol    - floating-point tolerance (0.00001, for example)      
 *            errbuf - OPTIONAL: ptr to an error buffer of at least
 *                     <eslERRBUFSIZE> characters.
 *
 * Returns:   <eslOK> on successful validation. 
 *            <eslFAIL> on failure, and if a non-<NULL> <errbuf> was
 *            provided by the caller, a message describing
 *            the reason for the failure is put there.
 *
 * Throws:    <eslEINVAL> (via <ESL_EXCEPTION>) if <Q> is not of type
 *            <eslGENERAL>, or is not square.
 */
int
esl_rmx_ValidateQ(ESL_DMATRIX *Q, double tol, char *errbuf)
{
  int    i,j;
  double qi;			/* sum of off-diagonal elements in row i */

  if (Q->type != eslGENERAL) ESL_EXCEPTION(eslEINVAL, "Q must be type eslGENERAL to be validated");
  if (Q->n    != Q->m)       ESL_EXCEPTION(eslEINVAL, "a rate matrix Q must be square");

  for (i = 0; i < Q->n; i++)
    {
      qi = 0.;
      for (j = 0; j < Q->m; j++)
	{
	  if (i != j) {
	    if (Q->mx[i][j] < 0.)       ESL_FAIL(eslFAIL, errbuf, "offdiag elem %d,%d < 0",i,j);
	    qi += Q->mx[i][j];
	  } else {
	    /* The failing condition is a positive diagonal; the message must say so. */
	    if (Q->mx[i][j] > 0.)       ESL_FAIL(eslFAIL, errbuf, "diag elem %d,%d > 0", i,j);
	  }
	}
      /* Off-diagonal sum plus the diagonal element must cancel to ~0. */
      if (fabs(qi + Q->mx[i][i]) > tol) ESL_FAIL(eslFAIL, errbuf, "row %d does not sum to 0.0", i);
    }
  return eslOK;
}
/* Function:  p7_sparsemask_Validate_avx512()
 * Synopsis:  Validate the AVX-512 part of a P7_SPARSEMASK sparse DP mask.
 *
 * Purpose:   Check the AVX-512 fields of sparse mask <sm> for
 *            internal consistency:
 *              - <L> and <M> are at least 1, and the segment count is
 *                non-negative;
 *              - segments 1..S are ordered, non-overlapping, and lie
 *                within 1..L;
 *              - rows outside every segment have zero marked cells,
 *                and rows inside a segment have at least one.
 *
 * Args:      sm      - sparse DP mask to validate
 *            errbuf  - [eslERRBUFSIZE] space for an error msg; or NULL
 *
 * Returns:   <eslOK> on success; <errbuf>, if provided, is set
 *            to an empty string.
 *
 *            <eslFAIL> on failure; <errbuf>, if provided, contains an
 *            informative error message.
 *
 *            <eslENORESULT> if compiled without <HAVE_AVX512>; in
 *            that case <errbuf> is not touched.
 *
 * Note:      Not every possible inconsistency is checked; the aim is
 *            to catch recurrences of problems already seen in
 *            debugging and testing.
 */
int
p7_sparsemask_Validate_avx512(const P7_SPARSEMASK *sm, char *errbuf)
{
#ifdef HAVE_AVX512
  int seg;    /* segment index, 1..S */
  int row;    /* row (sequence position) index */

  if (errbuf != NULL) errbuf[0] = '\0';

  if (sm->L < 1)         ESL_FAIL(eslFAIL, errbuf, "L must be >=1");
  if (sm->M < 1)         ESL_FAIL(eslFAIL, errbuf, "M must be >=1");
  if (sm->S_AVX_512 < 0) ESL_FAIL(eslFAIL, errbuf, "S must be >=0");

  seg = 1;
  while (seg <= sm->S_AVX_512)
    {
      /* Per the original note, seg[0].ib is a sentinel of -1, which makes seg==1 work here. */
      if (sm->seg_AVX_512[seg-1].ib >= sm->seg_AVX_512[seg].ia)
        ESL_FAIL(eslFAIL, errbuf, "seg %d overlaps with previous one", seg);
      if (sm->seg_AVX_512[seg].ia > sm->seg_AVX_512[seg].ib)
        ESL_FAIL(eslFAIL, errbuf, "ia..ib are not in order for seg %d", seg);
      if (! (sm->seg_AVX_512[seg].ia >= 1 && sm->seg_AVX_512[seg].ia <= sm->L))
        ESL_FAIL(eslFAIL, errbuf, "ia[%d] is invalid", seg);
      if (! (sm->seg_AVX_512[seg].ib >= 1 && sm->seg_AVX_512[seg].ib <= sm->L))
        ESL_FAIL(eslFAIL, errbuf, "ib[%d] is invalid", seg);

      /* Gap rows between the previous segment and this one must be empty.
       * With the seg[0].ib == -1 sentinel, the first gap starts at row 0.
       */
      for (row = sm->seg_AVX_512[seg-1].ib + 1; row < sm->seg_AVX_512[seg].ia; row++)
        {
          if (sm->n_AVX_512[row] != 0)
            ESL_FAIL(eslFAIL, errbuf, "n[i] != 0 for i unmarked, not in sparse segment");
        }

      /* Rows inside the segment must each have at least one marked cell. */
      for (row = sm->seg_AVX_512[seg].ia; row <= sm->seg_AVX_512[seg].ib; row++)
        {
          if (sm->n_AVX_512[row] == 0)
            ESL_FAIL(eslFAIL, errbuf, "n[i] == 0 for i supposedly marked in sparse seg");
        }

      seg++;
    }

  /* Rows after the last segment must be empty too. */
  for (row = sm->seg_AVX_512[sm->S_AVX_512].ib + 1; row <= sm->L; row++)
    {
      if (sm->n_AVX_512[row] != 0)
        ESL_FAIL(eslFAIL, errbuf, "n[i] != 0 for i unmarked, not in sparse segment");
    }

  return eslOK;
#else
  return eslENORESULT;
#endif
}
Beispiel #4
0
/* p7_masstrace_Compare()
 *
 * Compare two mass trace objects <mte> and <mta>.
 *
 * The integer fields <L>, <M>, <i0>, <k0>, <st0> must match exactly.
 * The <kmass[1..M]> arrays, and the <imass[1..L]> arrays when both
 * objects have one, are compared elementwise with
 * esl_FCompareAbs() at tolerance <tol>. In addition, an element
 * that is exactly 0.0 in <mte> must not be positive in <mta>,
 * regardless of <tol>.
 *
 * Returns <eslOK> if the objects compare equal, <eslFAIL> otherwise.
 */
int
p7_masstrace_Compare(const P7_MASSTRACE *mte, const P7_MASSTRACE *mta, float tol)
{
  char msg[] = "masstrace object comparison failed";
  int  pos;

  if (mte->L   != mta->L  ||
      mte->M   != mta->M  ||
      mte->i0  != mta->i0 ||
      mte->k0  != mta->k0 ||
      mte->st0 != mta->st0)
    ESL_FAIL(eslFAIL, NULL, msg);

  /* imass is only compared when both objects carry it. */
  if (mte->imass != NULL && mta->imass != NULL)
    {
      for (pos = 1; pos <= mte->L; pos++)
        {
          if ((mte->imass[pos] == 0.0 && mta->imass[pos] > 0.0) ||
              esl_FCompareAbs(mte->imass[pos], mta->imass[pos], tol) != eslOK)
            ESL_FAIL(eslFAIL, NULL, msg);
        }
    }

  for (pos = 1; pos <= mte->M; pos++)
    {
      if ((mte->kmass[pos] == 0.0 && mta->kmass[pos] > 0.0) ||
          esl_FCompareAbs(mte->kmass[pos], mta->kmass[pos], tol) != eslOK)
        ESL_FAIL(eslFAIL, NULL, msg);
    }
  return eslOK;
}
Beispiel #5
0
/* init_master_cfg()
 * Called by either master version, mpi or serial.
 *
 * On entry:
 *    cfg->hmmfile - HMM file name (command line arg)
 *
 * On success, sets:
 *    cfg->hfp     - open HMM stream
 *    cfg->ofp     - open output stream (-o file, else stdout)
 *    cfg->survfp  - open --pfile output file, if requested
 *    cfg->efp     - open --efile output file, if requested
 *    cfg->ffp     - open --ffile output file, if requested
 *    cfg->xfp     - open --xfile output file, if requested
 *    cfg->alfp    - open --afile output file, if requested
 *
 * Optional streams whose option was not given are left untouched, so
 * the caller must have initialized those pointers to NULL.
 *
 * Failures are reported as (code, message) through ESL_FAIL rather
 * than being fatal here; per the original design note this lets an
 * MPI master shut its workers down cleanly before printing the error.
 *
 * Returns <eslOK> on success; <eslFAIL> with a message in <errbuf>
 * if any file cannot be opened.
 */
static int
init_master_cfg(ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf)
{
  char *path;
  int   status;

  /* The HMM input file. */
  status = p7_hmmfile_OpenE(cfg->hmmfile, NULL, &(cfg->hfp), NULL);
  if (status != eslOK)
    {
      if (status == eslENOTFOUND) ESL_FAIL(eslFAIL, errbuf, "Failed to open HMM file %s for reading.\n",                   cfg->hmmfile);
      if (status == eslEFORMAT)   ESL_FAIL(eslFAIL, errbuf, "File %s does not appear to be in a recognized HMM format.\n", cfg->hmmfile);
      ESL_FAIL(eslFAIL, errbuf, "Unexpected error %d in opening HMM file %s.\n",       status, cfg->hmmfile);
    }

  /* Main output: a named file, or stdout by default. */
  path = esl_opt_GetString(go, "-o");
  if (path == NULL)
    cfg->ofp = stdout;
  else if ((cfg->ofp = fopen(path, "w")) == NULL)
    ESL_FAIL(eslFAIL, errbuf, "Failed to open -o output file %s\n", path);

  /* Optional side outputs: each is opened only if its option was given. */
  path = esl_opt_GetString(go, "--pfile");
  if (path != NULL && (cfg->survfp = fopen(path, "w")) == NULL)
    ESL_FAIL(eslFAIL, errbuf, "Failed to open --pfile output file %s\n", path);

  path = esl_opt_GetString(go, "--efile");
  if (path != NULL && (cfg->efp = fopen(path, "w")) == NULL)
    ESL_FAIL(eslFAIL, errbuf, "Failed to open --efile output file %s\n", path);

  path = esl_opt_GetString(go, "--ffile");
  if (path != NULL && (cfg->ffp = fopen(path, "w")) == NULL)
    ESL_FAIL(eslFAIL, errbuf, "Failed to open --ffile output file %s\n", path);

  path = esl_opt_GetString(go, "--xfile");
  if (path != NULL && (cfg->xfp = fopen(path, "w")) == NULL)
    ESL_FAIL(eslFAIL, errbuf, "Failed to open --xfile output file %s\n", path);

  path = esl_opt_GetString(go, "--afile");
  if (path != NULL && (cfg->alfp = fopen(path, "w")) == NULL)
    ESL_FAIL(eslFAIL, errbuf, "Failed to open --afile output file %s\n", path);

  return eslOK;
}
Beispiel #6
0
int
p7_hit_Validate(const P7_HIT *hit, char *errbuf)
{
  int d;
  int status;

  if (hit->name == NULL) ESL_FAIL(eslFAIL, errbuf, "name cannot be NULL");
  if (isnan(hit->sortkey) ||
      isnan(hit->score) ||
      isnan(hit->pre_score) ||
      isnan(hit->sum_score) ||
      isnan(hit->lnP) ||
      isnan(hit->pre_lnP) ||
      isnan(hit->sum_lnP) ||
      isnan(hit->nexpected)) ESL_FAIL(eslFAIL, errbuf, "NaN found");
  
  if ( (hit->flags & (! ( p7_IS_REPORTED | p7_IS_INCLUDED | p7_IS_NEW | p7_IS_DROPPED | p7_IS_DUPLICATE))) != 0)
    ESL_FAIL(eslFAIL, errbuf, "unrecognized flag is up");
  
  if (hit->ndom < 0)  ESL_FAIL(eslFAIL, errbuf, "negative ndom");
  if (hit->noverlaps   < 0 || hit->noverlaps   >  hit->ndom) ESL_FAIL(eslFAIL, errbuf, "bad noverlaps");
  if (hit->nreported   < 0 || hit->nreported   >  hit->ndom) ESL_FAIL(eslFAIL, errbuf, "bad nreported");
  if (hit->nincluded   < 0 || hit->nincluded   >  hit->ndom) ESL_FAIL(eslFAIL, errbuf, "bad nincluded");
  if (hit->best_domain < 0 || hit->best_domain >= hit->ndom) ESL_FAIL(eslFAIL, errbuf, "bad best_domain");

  for (d = 0; d < hit->ndom; d++)
    if (( status = p7_domain_Validate(&(hit->dcl[d]), errbuf)) != eslOK) return status;
  
  return eslOK;
}
Beispiel #7
0
/* output_filter_power()
 *
 * Measures how many simulated scores pass a filter's P-value
 * threshold. This tests whether a filter has the power it is
 * supposed to have: e.g. with a threshold of 0.02, about 2% of
 * sequences should pass, independent of model and target length.
 *
 * For each of the cfg->N values in <scores>, a P-value is computed
 * from the HMM's own calibration parameters: a Gumbel survival
 * function for --vit and --msv, an exponential one for --fwd. Scores
 * with P <= --pthresh are counted as passing.
 *
 * One tab-delimited line is written to cfg->ffp:
 *    <hmm name>  <# of seqs passing threshold>  <fraction of seqs passing threshold>
 *
 * Returns <eslOK> on success; <eslEINVAL>, with a message in
 * <errbuf>, if none of --vit, --msv, --fwd is set.
 *
 * SRE, Thu Apr  9 08:57:32 2009 [Janelia] xref J4/133
 */
static int
output_filter_power(ESL_GETOPTS *go, struct cfg_s *cfg, char *errbuf, P7_HMM *hmm, double *scores)
{
  double pthresh = esl_opt_GetReal(go, "--pthresh");   /* P-value cutoff for the filter score      */
  double loc;                                          /* location parameter (mu or tau)           */
  double lambda;                                       /* slope parameter                          */
  double pval;                                         /* P-value of the current score             */
  int    use_gumbel;                                   /* TRUE: Gumbel tail; FALSE: exponential    */
  int    npass = 0;                                    /* how many scores pass the cutoff          */
  int    n;                                            /* index over scores                        */

  /* Pick the calibration parameters that match the scoring algorithm in use. */
  if (esl_opt_GetBoolean(go, "--vit"))
    {
      loc        = hmm->evparam[p7_VMU];
      lambda     = hmm->evparam[p7_VLAMBDA];
      use_gumbel = TRUE;
    }
  else if (esl_opt_GetBoolean(go, "--msv"))
    {
      loc        = hmm->evparam[p7_MMU];
      lambda     = hmm->evparam[p7_MLAMBDA];
      use_gumbel = TRUE;
    }
  else if (esl_opt_GetBoolean(go, "--fwd"))
    {
      loc        = hmm->evparam[p7_FTAU];
      lambda     = hmm->evparam[p7_FLAMBDA];
      use_gumbel = FALSE;
    }
  else
    ESL_FAIL(eslEINVAL, errbuf, "can only use --ffile with viterbi, msv, or fwd scores");

  for (n = 0; n < cfg->N; n++)
    {
      if (use_gumbel) pval = esl_gumbel_surv(scores[n], loc, lambda);
      else            pval = esl_exp_surv   (scores[n], loc, lambda);

      if (pval <= pthresh) npass++;
    }

  fprintf(cfg->ffp, "%s\t%d\t%.4f\n", hmm->name, npass, (double) npass / (double) cfg->N);
  return eslOK;
}
/* map_rfpos_to_apos
 *
 * Given an MSA with RF (reference) annotation, work out which
 * alignment columns are non-gap RF positions. A column counts as an
 * RF position when its RF character is neither a gap, a missing-data
 * character, nor a nonresidue character in alphabet <abc>.
 * (esl_abc_CIsResidue() is deliberately not used: per the original
 * note it would return FALSE for 'x' with RNA and DNA.)
 *
 * On success the caller receives two newly allocated arrays:
 *
 *   i_am_rf[0..alen-1]   TRUE if alignment column apos is an RF
 *                        position, else FALSE.
 *   rf2a_map[0..rflen-1] rf2a_map[rfpos] is the alignment column
 *                        (0..alen-1) of RF position rfpos+1.
 *
 * and <*ret_rflen>, the number of RF positions.
 *
 * Returns <eslOK> on success. Returns <eslEINVAL> if msa->rf is NULL.
 * If an ESL_ALLOC fails, both arrays are freed and the allocation
 * status is returned with an out-of-memory message in <errbuf>.
 */
static int map_rfpos_to_apos(ESL_MSA *msa, ESL_ALPHABET *abc, char *errbuf, int64_t alen, int **ret_i_am_rf, int **ret_rf2a_map, int *ret_rflen)
{
  int  status;
  int *is_rf   = NULL;   /* per-column RF flag          */
  int *map     = NULL;   /* RF position -> column       */
  int  nrf     = 0;      /* number of RF columns found  */
  int  nfilled = 0;      /* next free slot in map       */
  int  col     = 0;      /* alignment column index      */

  /* contract check */
  if (msa->rf == NULL) ESL_FAIL(eslEINVAL, errbuf, "Error, trying to map RF positions to alignment positions, but msa->rf is NULL.");

  /* Pass 1: classify every column once, counting RF columns as we go. */
  ESL_ALLOC(is_rf, sizeof(int) * alen);
  for (col = 0; col < alen; col++)
    {
      if (esl_abc_CIsGap(abc, msa->rf[col])     ||
          esl_abc_CIsMissing(abc, msa->rf[col]) ||
          esl_abc_CIsNonresidue(abc, msa->rf[col]))
        {
          is_rf[col] = FALSE;
        }
      else
        {
          is_rf[col] = TRUE;
          nrf++;
        }
    }

  /* Pass 2: now that the count is known, build the RF -> column map. */
  ESL_ALLOC(map, sizeof(int) * nrf);
  for (col = 0; col < alen; col++)
    {
      if (is_rf[col]) map[nfilled++] = col;
    }

  *ret_i_am_rf  = is_rf;
  *ret_rf2a_map = map;
  *ret_rflen    = nrf;
  return eslOK;

 ERROR:
  free(is_rf);
  free(map);
  ESL_FAIL(status, errbuf, "Error, out of memory while mapping RF positions to alignment positions.");
}
Beispiel #9
0
/* Function:  esl_rmx_ValidateP()
 * Incept:    SRE, Sun Mar 11 10:30:50 2007 [Janelia]
 *
 * Purpose:   Validates a conditional probability matrix <P>, in
 *            which element $P_{ij}$ is the conditional probability
 *            $P(j \mid i)$ -- as in a first-order Markov chain, or a
 *            continuous-time Markov transition process evaluated at
 *            a particular time $t$.
 *
 *            Each row must sum to one within tolerance <tol>, i.e.
 *            <fabs(sum-1.0) <= tol>, and each element must be a
 *            probability, $0 \leq P_{ij} \leq 1$. For each row the
 *            sum is tested before the individual elements.
 *
 *            <errbuf> is an optional error message buffer. The caller
 *            may pass <NULL> or a pointer to a buffer of at least
 *            <eslERRBUFSIZE> characters.
 *
 * Args:      P      - matrix to validate
 *            tol    - floating-point tolerance (0.00001, for example)
 *            errbuf - OPTIONAL: ptr to an error buffer of at least
 *                     <eslERRBUFSIZE> characters.
 *
 * Returns:   <eslOK> on successful validation.
 *            <eslFAIL> on failure, and if a non-<NULL> <errbuf> was
 *            provided by the caller, a message describing
 *            the reason for the failure is put there.
 *
 * Throws:    <eslEINVAL> (via <ESL_EXCEPTION>) if <P> is not of type
 *            <eslGENERAL>.
 */
int
esl_rmx_ValidateP(ESL_DMATRIX *P, double tol, char *errbuf)
{
  int    r;        /* row index    */
  int    c;        /* column index */
  double rowsum;

  if (P->type != eslGENERAL)
    ESL_EXCEPTION(eslEINVAL, "P must be type eslGENERAL to be validated");

  for (r = 0; r < P->n; r++)
    {
      /* Row normalization first ... */
      rowsum = esl_vec_DSum(P->mx[r], P->m);
      if (fabs(rowsum-1.0) > tol)
        ESL_FAIL(eslFAIL, errbuf, "row %d does not sum to 1.0", r);

      /* ... then each element must lie in [0,1]. */
      for (c = 0; c < P->m; c++)
        {
          if (P->mx[r][c] < 0.0 || P->mx[r][c] > 1.0)
            ESL_FAIL(eslFAIL, errbuf, "element %d,%d is not a probability (%f)", r,c,P->mx[r][c]);
        }
    }
  return eslOK;
}
Beispiel #10
0
/* Function: cp9_GetNCalcsPerResidue()
 * Date:     EPN, Thu Jan 17 06:12:37 2008
 *
 * Purpose:  Estimate the number of DP calculations per residue for
 *           a CP9 HMM, in millions. The count is the number of
 *           transitions per node times the model length <cp9->M>.
 *           The transition count starts at
 *           NHMMSTATETYPES * NHMMSTATETYPES and is incremented once
 *           for each of the CPLAN9_LOCAL_BEGIN, CPLAN9_LOCAL_END and
 *           CPLAN9_EL flags set in <cp9->flags>.
 *
 * Returns:  eslOK on success, with <*ret_cp9_ncalcs_per_res> set.
 *           eslEINCOMPAT on contract violation (<cp9> or
 *           <ret_cp9_ncalcs_per_res> is NULL), with a message in
 *           <errbuf> if one was provided.
 */
int
cp9_GetNCalcsPerResidue(CP9_t *cp9, char *errbuf, float *ret_cp9_ncalcs_per_res)
{
    int ntrans;   /* transitions considered per model node */

    /* contract checks */
    if (cp9 == NULL)
        ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_GetNCalcsPerRes(), cp9 == NULL.");
    if (ret_cp9_ncalcs_per_res == NULL)
        ESL_FAIL(eslEINCOMPAT, errbuf, "cp9_GetNCalcsPerRes(), ret_cp9_ncalcs_per_res == NULL.");

    /* Base count for global mode (3*3 = 9 per the original note),
     * plus one for each local-mode feature that is switched on.
     */
    ntrans  = NHMMSTATETYPES * NHMMSTATETYPES;
    ntrans += (cp9->flags & CPLAN9_LOCAL_BEGIN) ? 1 : 0;
    ntrans += (cp9->flags & CPLAN9_LOCAL_END)   ? 1 : 0;
    ntrans += (cp9->flags & CPLAN9_EL)          ? 1 : 0;

    /* Scale to millions of calculations per residue. */
    *ret_cp9_ncalcs_per_res = (float) ((ntrans * cp9->M) / 1000000.);
    return eslOK;
}
Beispiel #11
0
/* init_master_cfg()
 * Called by masters, mpi or serial.
 * Already set:
 *    cfg->hmmfile     - command line arg 1
 *    cfg->alifile     - command line arg 2
 *    cfg->postmsafile - option -O (default NULL)
 *    cfg->fmt         - format of alignment file
 * Sets: 
 *    cfg->ofp       - open output file (-o), or stdout
 *    cfg->afp       - open alignment file                
 *    cfg->abc       - digital alphabet
 *    cfg->hmmfp     - open HMM file
 *    cfg->postmsafp - open MSA resave file, or NULL
 *                   
 * Errors in the MPI master here are considered to be "recoverable",
 * in the sense that we'll try to delay output of the error message
 * until we've cleanly shut down the worker processes. Therefore
 * errors return (code, errmsg) by the ESL_FAIL mech.
 *
 * Returns <eslOK> on success. On failure, returns a non-OK code
 * with a message in <errmsg>: the status from the alignment-file
 * open or alphabet guess for those failures, and <eslFAIL> when one
 * of the output files cannot be opened.
 */
static int
init_master_cfg(const ESL_GETOPTS *go, struct cfg_s *cfg, char *errmsg)
{
  int status;

  if (esl_opt_GetString(go, "-o") != NULL) {
    if ((cfg->ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) 
      ESL_FAIL(eslFAIL, errmsg, "Failed to open -o output file %s\n", esl_opt_GetString(go, "-o"));
  } else cfg->ofp = stdout;

  status = esl_msafile_Open(cfg->alifile, cfg->fmt, NULL, &(cfg->afp));
  if (status == eslENOTFOUND)    ESL_FAIL(status, errmsg, "Alignment file %s doesn't exist or is not readable\n", cfg->alifile);
  else if (status == eslEFORMAT) ESL_FAIL(status, errmsg, "Couldn't determine format of alignment %s\n", cfg->alifile);
  else if (status != eslOK)      ESL_FAIL(status, errmsg, "Alignment file open failed with error %d\n", status);

  /* Alphabet: explicit option wins; otherwise guess from the alignment file. */
  if      (esl_opt_GetBoolean(go, "--amino"))   cfg->abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     cfg->abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     cfg->abc = esl_alphabet_Create(eslRNA);
  else {
    int type;
    status = esl_msafile_GuessAlphabet(cfg->afp, &type);
    if (status == eslEAMBIGUOUS)    ESL_FAIL(status, errmsg, "Failed to guess the bio alphabet used in %s.\nUse --dna, --rna, or --amino option to specify it.", cfg->alifile);
    else if (status == eslEFORMAT)  ESL_FAIL(status, errmsg, "Alignment file parse failed: %s\n", cfg->afp->errbuf);
    else if (status == eslENODATA)  ESL_FAIL(status, errmsg, "Alignment file %s is empty\n", cfg->alifile);
    else if (status != eslOK)       ESL_FAIL(status, errmsg, "Failed to read alignment file %s\n", cfg->alifile);
    cfg->abc = esl_alphabet_Create(type);
  }
  esl_msafile_SetDigital(cfg->afp, cfg->abc);

  /* <status> is eslOK by the time we get here, so the fopen() failures
   * below must return an explicit error code; returning <status> would
   * report success to the caller despite the failed open.
   */
  if ((cfg->hmmfp = fopen(cfg->hmmfile, "w")) == NULL) ESL_FAIL(eslFAIL, errmsg, "Failed to open HMM file %s for writing", cfg->hmmfile);

  if (cfg->postmsafile != NULL) {
    if ((cfg->postmsafp = fopen(cfg->postmsafile, "w")) == NULL) ESL_FAIL(eslFAIL, errmsg, "Failed to open MSA resave file %s for writing", cfg->postmsafile);
  } else cfg->postmsafp = NULL;

  output_header(go, cfg);

  /* with msa == NULL, output_result() prints the tabular results header, if needed */
  output_result(cfg, errmsg, 0, NULL, NULL, NULL, 0.0);
  return eslOK;
}
Beispiel #12
0
/* Function:  p7_anchors_Validate()
 * Synopsis:  Validate an anchor set object.
 *
 * Purpose:   Validate anchor set <anch>, which holds anchors
 *            <a[1..D]> bracketed by sentinels at <a[0]> and
 *            <a[D+1]>.
 *
 *            Checks, in order: the <i0> coordinates of
 *            <a[0..D+1]> are strictly increasing; each real anchor
 *            has <i0> in 1..L and <k0> in 1..M; sentinel 0 is
 *            (i0=0, k0=M+1) and sentinel D+1 is (i0=L+1, k0=0).
 *
 *            When <L> or <M> is passed as 0 (unknown), it is
 *            recovered from the corresponding sentinel. The
 *            sentinel test for that dimension then passes
 *            trivially, so the validation is weaker.
 *
 * Args:      anch   - anchors to validate
 *            L      - sequence length if known; else 0
 *            M      - profile length if known; else 0
 *            errbuf - optional error message, allocated for eslERRBUFSIZE; or NULL
 *
 * Returns:   <eslOK> on success.
 *            <eslFAIL> on failure, and if <errbuf> was provided, it contains
 *            an informative error message.
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_anchors_Validate(P7_ANCHORS *anch, int L, int M, char *errbuf)
{
    const int ndom = anch->D;
    int       idx;

    /* Unknown dimensions are inferred from the sentinels. */
    if (L == 0) L = anch->a[ndom+1].i0 - 1;
    if (M == 0) M = anch->a[0].k0 - 1;

    /* i0 must strictly increase across a[0..D+1], sentinels included. */
    idx = 0;
    while (idx <= ndom)
    {
        if (anch->a[idx].i0 >= anch->a[idx+1].i0)
            ESL_FAIL(eslFAIL, errbuf, "i0 anchors not sorted");
        idx++;
    }

    /* Real anchors must lie inside the L x M grid. */
    idx = 1;
    while (idx <= ndom)
    {
        if (anch->a[idx].i0 < 1 || anch->a[idx].i0 > L)
            ESL_FAIL(eslFAIL, errbuf, "i0 %d not in range 1..L", idx);
        if (anch->a[idx].k0 < 1 || anch->a[idx].k0 > M)
            ESL_FAIL(eslFAIL, errbuf, "k0 %d not in range 1..M", idx);
        idx++;
    }

    /* Sentinel values. */
    if (anch->a[0].i0 != 0 || anch->a[0].k0 != M+1)
        ESL_FAIL(eslFAIL, errbuf, "sentinel 0 invalid");
    if (anch->a[ndom+1].i0 != L+1 || anch->a[ndom+1].k0 != 0)
        ESL_FAIL(eslFAIL, errbuf, "sentinel D+1 invalid");

    return eslOK;
}
Beispiel #13
0
/* set_msa_name() 
 * Make sure the alignment has a name; this name will
 * then be transferred to the model.
 * 
 * We can only do this for a single alignment in a file. For multi-MSA
 * files, each MSA is required to have a name already.
 *
 * Priority is:
 *      1. Use -n <name> if set, overriding any name the alignment might already have. 
 *      2. Use alignment's existing name, if non-NULL.
 *      3. Make a name, from alignment file name without path and without filename extension 
 *         (e.g. "/usr/foo/globins.slx" gets named "globins")
 * If none of these succeeds, return <eslEINVAL>.
 *         
 * If a multiple MSA database (e.g. Stockholm/Pfam), and we encounter
 * an MSA that doesn't already have a name, return <eslEINVAL> if nali > 1.
 * (We don't know we're in a multiple MSA database until we're on the second
 * alignment.)
 * 
 * If we're in MPI mode, we assume we're in a multiple MSA database,
 * even on the first alignment.
 * 
 * Because we can't tell whether we've got more than one
 * alignment 'til we're on the second one, these fatal errors
 * only happen after the first HMM has already been built.
 * Oh well.
 *
 * Returns <eslOK> on success; <eslEINVAL> with a message in <errbuf>
 * for the naming failures described above; or the non-OK status of
 * esl_FileTail() / esl_msa_SetName() if one of those fails.
 */
static int
set_msa_name(struct cfg_s *cfg, char *errbuf, ESL_MSA *msa)
{
  char *name = NULL;
  int   status;

  if (cfg->do_mpi == FALSE && cfg->nali == 1) /* first (only?) HMM in file: */
    {
      if  (cfg->hmmName != NULL)
	{
	  if ((status = esl_msa_SetName(msa, cfg->hmmName)) != eslOK) return status;
	}
      else if (msa->name != NULL) 
	{
	  cfg->nnamed++;
	}
      else if (! cfg->afp->do_stdin)
	{
	  if ((status = esl_FileTail(cfg->afp->fname, TRUE, &name)) != eslOK) return status; /* TRUE=nosuffix */	  
	  /* Release the temporary name before testing the status, so it
	   * is not leaked when esl_msa_SetName() fails.
	   */
	  status = esl_msa_SetName(msa, name);
	  free(name);
	  if (status != eslOK) return status;
	}
      else ESL_FAIL(eslEINVAL, errbuf, "Failed to set model name: msa has no name, no msa filename, and no -n");
    }
  else 
    {
      if (cfg->hmmName   != NULL) ESL_FAIL(eslEINVAL, errbuf, "Oops. Wait. You can't use -n with an alignment database.");
      else if (msa->name != NULL) cfg->nnamed++;
      else                        ESL_FAIL(eslEINVAL, errbuf, "Oops. Wait. I need name annotation on each alignment in a multi MSA file; failed on #%d", cfg->nali+1);

      /* special kind of failure: the *first* alignment didn't have a name, and we used the filename to
       * construct one; now that we see a second alignment, we realize this was a boo-boo*/
      if (cfg->nnamed != cfg->nali)            ESL_FAIL(eslEINVAL, errbuf, "Oops. Wait. I need name annotation on each alignment in a multi MSA file; first MSA didn't have one");
    }
  return eslOK;
}
Beispiel #14
0
/* p7_tophits_Validate()
 *
 * Validate a top hits list <th>.
 *
 * Checks performed:
 *   - if either sort flag is up, every pointer in the sorted array
 *     <hit[0..N-1]> must point into the unsorted array
 *     <unsrt[0..N-1]>;
 *   - <nreported> and <nincluded> lie in 0..N;
 *   - the two sort flags are not both up;
 *   - every hit in <unsrt[0..N-1]> passes p7_hit_Validate().
 *
 * Args:     th     - hit list to validate
 *           errbuf - optional buffer for an error message; or NULL
 *
 * Returns:  <eslOK> if the list passes.
 *           <eslFAIL> on the first failed check, with a message in
 *           <errbuf> if one was provided; or whatever non-OK status
 *           p7_hit_Validate() returned for a bad hit.
 */
int
p7_tophits_Validate(const P7_TOPHITS *th, char *errbuf)
{
  int i;
  int idx;
  int status;

  if (th->is_sorted_by_sortkey || th->is_sorted_by_seqidx)
    {
      for (i = 0; i < th->N; i++)
	{
	  idx = th->hit[i] - th->unsrt; /* i.e., by ptr arithmetic: #i in sorted list is #idx in unsorted list */
	  if (idx < 0 || idx >= th->N) ESL_FAIL(eslFAIL, errbuf, "sorted hit number %d points to bad address", i);
	  /* TestSample() currently doesn't sort its sampled hit array, so we don't test for proper sortedness */
	}
    }
  if (th->nreported < 0 || th->nreported > th->N) ESL_FAIL(eslFAIL, errbuf, "bad nreported field");
  /* Name the field that actually failed, not its neighbour. */
  if (th->nincluded < 0 || th->nincluded > th->N) ESL_FAIL(eslFAIL, errbuf, "bad nincluded field");
  if (th->is_sorted_by_sortkey && th->is_sorted_by_seqidx) ESL_FAIL(eslFAIL, errbuf, "both sort type flags are up");
  for (i = 0; i < th->N; i++)
    if (( status = p7_hit_Validate( &(th->unsrt[i]), errbuf)) != eslOK) return status;

  return eslOK;
}
Beispiel #15
0
/* relative_weights():
 * Set the msa->wgt vector according to the relative weighting
 * strategy chosen in <bld->wgt_strategy>:
 *
 *    p7_WGT_NONE    every weight is set to 1.0
 *    p7_WGT_GIVEN   weights already in the MSA are left as they are
 *    p7_WGT_PB      esl_msaweight_PB()
 *    p7_WGT_GSC     esl_msaweight_GSC()
 *    p7_WGT_BLOSUM  esl_msaweight_BLOSUM(), using <bld->wid>
 *
 * Returns <eslOK> on success. If the weighting routine fails, its
 * status is returned with a message in <bld->errbuf>. An unknown
 * strategy raises <eslEINCONCEIVABLE> via ESL_EXCEPTION.
 */
static int
relative_weights(P7_BUILDER *bld, ESL_MSA *msa)
{
  int status = eslOK;

  switch (bld->wgt_strategy)
    {
    case p7_WGT_NONE:
      esl_vec_DSet(msa->wgt, msa->nseq, 1.);
      break;

    case p7_WGT_GIVEN:          /* nothing to do: keep the weights as given */
      break;

    case p7_WGT_PB:
      status = esl_msaweight_PB(msa);
      break;

    case p7_WGT_GSC:
      status = esl_msaweight_GSC(msa);
      break;

    case p7_WGT_BLOSUM:
      status = esl_msaweight_BLOSUM(msa, bld->wid);
      break;

    default:
      ESL_EXCEPTION(eslEINCONCEIVABLE, "no such weighting strategy");
    }

  if (status != eslOK)
    ESL_FAIL(status, bld->errbuf, "failed to set relative weights in alignment");
  return eslOK;
}
Beispiel #16
0
/* validate_msa:
 * SRE, Thu Dec  3 16:10:31 2009 [J5/119; bug #h70 fix]
 * 
 * HMMER uses a convention for missing data characters: they
 * indicate that a sequence is a fragment.  (See
 * esl_msa_MarkFragments()).
 *
 * Because of the way these fragments will be handled in tracebacks,
 * we reject any alignment that uses missing data characters in any
 * other way.
 *
 * Each digital sequence <ax[idx][1..alen]> must therefore have the
 * form: a (possibly empty) run of missing-data characters, then a
 * run with none, then a (possibly empty) trailing run of
 * missing-data characters, covering all <alen> columns.
 * 
 * This validation step costs negligible time.
 *
 * Returns <eslOK> if every sequence passes; <eslEINVAL>, with a
 * message in <bld->errbuf>, for the first sequence that does not.
 */
static int
validate_msa(P7_BUILDER *bld, ESL_MSA *msa)
{
  int     idx;
  int64_t apos;

  for (idx = 0; idx < msa->nseq; idx++)
    {
      apos = 1;
      /* The bound is tested before the element is read, so the scan
       * never looks at a position past <alen>.
       */
      while (apos <= msa->alen &&   esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) apos++;  /* leading  ~'s      */
      while (apos <= msa->alen && ! esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) apos++;  /* body, no ~ allowed */
      while (apos <= msa->alen &&   esl_abc_XIsMissing(msa->abc, msa->ax[idx][apos])) apos++;  /* trailing ~'s      */

      /* Anything left over means a ~ run started somewhere inside the body. */
      if (apos != msa->alen+1) ESL_FAIL(eslEINVAL, bld->errbuf, "msa %s; sequence %s\nhas missing data chars (~) other than at fragment edges", msa->name, msa->sqname[idx]);
    }
  
  return eslOK;
}
Beispiel #17
0
/* map_sub_msas
 *                   
 * msa1 and msa2 contain the same named sequences, msa1 contains a superset 
 * of the columns in msa2. Determine which of the msa1 columns the msa2
 * columns correspond to.
 *
 * Both alignments are scanned left to right. An msa1 column is taken
 * to be the current msa2 column when every sequence carries the same
 * digital symbol in both; otherwise the msa1 column is skipped.
 *
 * Args:    go     - options; used here only to name the input files
 *                   in error messages
 *          errbuf - filled with a message on failure
 *          msa1   - digitized alignment holding the larger column set
 *          msa2   - digitized alignment holding the column subset
 *          ret_msa1_to_msa2_mask
 *                 - RETURN: string allocated here, msa1->alen chars
 *                   plus terminator; '1' where the msa1 column maps to
 *                   an msa2 column, '0' elsewhere. Assigned only on
 *                   success and not freed here.
 *
 * Returns: eslOK on success;
 *          eslEINVAL, with errbuf filled, if an alignment is not
 *          digitized, msa1 is not longer than msa2, the sequence
 *          counts or names differ, or some msa2 column cannot be
 *          placed in msa1;
 *          the status set by ESL_ALLOC() if allocation fails.
 */
static int
map_sub_msas(const ESL_GETOPTS *go, char *errbuf, ESL_MSA *msa1, ESL_MSA *msa2, char **ret_msa1_to_msa2_mask)
{
  int   status;
  int   apos1, apos2;          /* counters over alignment position in msa1, msa2 respectively */
  int   i;
  char *mask = NULL;           /* NULL until allocated, so ERROR can free it unconditionally */

  /* contract check; all of it happens before allocation, so these early exits cannot leak */
  if(! (msa1->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 (%s) not digitized.\n", esl_opt_GetArg(go, 1));
  if(! (msa2->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa2 (%s) not digitized.\n", esl_opt_GetString(go, "--submap"));
  if(msa1->alen <= msa2->alen) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() alignment length for msa1 (%" PRId64 ") <= length for msa2 (%" PRId64 ")\n", msa1->alen, msa2->alen);

  /* both alignments must have same 'named' sequences in same order */
  if(msa1->nseq != msa2->nseq) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 has %d sequences, msa2 has %d sequences\n", msa1->nseq, msa2->nseq);
  for(i = 0; i < msa1->nseq; i++) { 
    if(strcmp(msa1->sqname[i], msa2->sqname[i]) != 0) ESL_FAIL(eslEINVAL, errbuf, "in map_sub_msas() msa1 seq %d is named %s, msa2 seq %d is named %s\n", i, msa1->sqname[i], i, msa2->sqname[i]);
  }

  ESL_ALLOC(mask, sizeof(char) * (msa1->alen+1));
  for(apos1 = 0; apos1 < msa1->alen; apos1++) mask[apos1] = '0';
  mask[msa1->alen] = '\0';

  /* Walk both alignments. The loop stops as soon as EITHER alignment
   * is exhausted, so neither index is ever used past its alignment
   * length, even when the mapping fails.
   */
  apos1 = 1;
  apos2 = 1;
  while((apos1 <= msa1->alen) && (apos2 <= msa2->alen)) { 
    for(i = 0; i < msa1->nseq; i++) { 
      if(msa1->ax[i][apos1] != msa2->ax[i][apos2]) break; /* columns differ, try next apos1 */
    }
    if(i == msa1->nseq) { /* found a match */
      mask[(apos1-1)] = '1';
      apos2++;
    }
    apos1++;
  }
  /* If every msa2 column was placed, any msa1 columns left over are
   * simply unmatched (their mask entries stay '0'); account for them
   * so the consistency check below sees both alignments fully consumed.
   */
  if(apos2 == (msa2->alen+1)) apos1 = (int) (msa1->alen+1);

  if((apos1 != (msa1->alen+1)) || (apos2 != (msa2->alen+1))) ESL_XFAIL(eslEINVAL, errbuf, "in map_sub_msas(), failure mapping alignments, end of loop apos1-1 = %d (msa1->alen: %" PRId64 ") and apos2-1 = %d (msa2->alen: %" PRId64 ")\n", apos1-1, msa1->alen, apos2-1, msa2->alen);

  *ret_msa1_to_msa2_mask = mask;
  return eslOK;
  
 ERROR: 
  free(mask);
  return status;
}
Beispiel #18
0
/* Function: DispatchSqBlockAlignment()
 * Date:     EPN, Fri Dec 30 14:59:43 2011
 *
 * Purpose:  Align every sequence in <sq_block> to the CM <cm> by
 *           calling DispatchSqAlignment() once per sequence, in
 *           order, and collect the per-sequence results into a new
 *           array returned in <ret_dataA>.
 *
 *           The <mode>, <cp9b_valid> and <pass_idx> arguments handed
 *           to DispatchSqAlignment() are fixed here: TRMODE_UNKNOWN,
 *           FALSE, and either PLI_PASS_5P_AND_3P_FORCE (if
 *           cm->align_opts & CM_ALIGN_TRUNC) or PLI_PASS_STD_ANY (if
 *           !(cm->align_opts & CM_ALIGN_TRUNC)). This is because
 *           this function is only used by the alignment pipeline, in
 *           which these values are correct. If this changes, we may
 *           want caller to pass in an array of modes, cp9b_valids and
 *           pass_idx values, one per sq.
 *
 *           If (cm->flags & CM_ALIGN_XTAU) we'll potentially tighten
 *           HMM bands until the required DP matrices are below our
 *           limit (<mxsize>). cm->maxtau is the max allowed tau value
 *           during this iterative band tightening, and cm->xtau is
 *           the factor by which we multiply cm->tau at each iteration
 *           during band tightening.
 *
 * Args:     cm        - the covariance model
 *           errbuf    - char buffer for reporting errors
 *           sq_block  - block of sequences to align
 *           mxsize    - max size in Mb of allowable DP mx
 *           w         - stopwatch for timing individual stages
 *           w_tot     - stopwatch for timing total time per seq
 *           r         - RNG, req'd if CM_ALIGN_SAMPLE, can be NULL otherwise
 *           ret_dataA - RETURN: newly created array of CM_ALNDATA objects
 *
 * Returns:  eslOK on success; <ret_dataA> is alloc'ed and filled with
 *           sq_block->count CM_ALNDATA objects.
 *           On any failure every result built so far is destroyed,
 *           <*ret_dataA> is set to NULL, and the failing status is
 *           returned: eslEMEM if we run out of memory (errbuf filled
 *           here), otherwise whatever DispatchSqAlignment() returned
 *           (errbuf filled there).
 */
int
DispatchSqBlockAlignment(CM_t *cm, char *errbuf, ESL_SQ_BLOCK *sq_block, float mxsize, ESL_STOPWATCH *w, 
			 ESL_STOPWATCH *w_tot, ESL_RANDOMNESS *r, CM_ALNDATA ***ret_dataA)
{
  int           status;              /* easel status */
  int           n;                   /* index of current sequence in the block */
  CM_ALNDATA  **alndataA   = NULL;   /* result array, one slot per sequence */
  char          mode;                /* alignment mode passed down */
  int           pass_idx;            /* pipeline pass index passed down */
  int           cp9b_valid;          /* whether cm->cp9b is valid for the sequence, passed down */

  /* at least one slot is requested so a 0-byte allocation is never made */
  ESL_ALLOC(alndataA, sizeof(CM_ALNDATA *) * ESL_MAX(1, sq_block->count));
  for(n = 0; n < sq_block->count; n++) alndataA[n] = NULL;

  /* fixed settings for DispatchSqAlignment(); see 'Purpose' above */
  mode       = TRMODE_UNKNOWN;
  cp9b_valid = FALSE;
  if(cm->align_opts & CM_ALIGN_TRUNC) pass_idx = PLI_PASS_5P_AND_3P_FORCE;
  else                                pass_idx = PLI_PASS_STD_ANY;

  /* align the sequences one at a time, stopping at the first failure */
  for(n = 0; n < sq_block->count; n++) { 
    status = DispatchSqAlignment(cm, errbuf, &(sq_block->list[n]), sq_block->first_seqidx + n, mxsize, mode, pass_idx, cp9b_valid, w, w_tot, r, &(alndataA[n]));
    if(status != eslOK) goto ERROR;
  }

  *ret_dataA = alndataA;
  return eslOK;

 ERROR: 
  /* release whatever was built before the failure */
  if(alndataA != NULL) { 
    for(n = 0; n < sq_block->count; n++) { 
      if(alndataA[n] != NULL) cm_alndata_Destroy(alndataA[n], FALSE);
    }
    free(alndataA);
  }
  *ret_dataA = NULL;
  if(status != eslEMEM) return status; /* errbuf was filled by DispatchSqAlignment() */
  ESL_FAIL(status, errbuf, "DispatchSqBlockAlignment(), out of memory");
}
Beispiel #19
0
/* output_result()
 *
 * Called with <msa> == NULL, prints only the two header lines of the
 * tabular output to cfg->ofp.
 *
 * Otherwise: validates <hmm>, writes it in ASCII form to cfg->hmmfp,
 * prints one tabular line for this alignment/model to cfg->ofp, and,
 * if both cfg->postmsafp and <postmsa> are non-NULL, writes <postmsa>
 * there in Stockholm format.
 *
 * Returns eslOK on success. Returns the status from p7_hmm_Validate()
 * if validation fails, or the status from p7_hmmfile_WriteASCII()
 * (with "HMM save failed" in <errbuf>) if the save fails.
 */
static int
output_result(const struct cfg_s *cfg, char *errbuf, int msaidx, ESL_MSA *msa, P7_HMM *hmm, ESL_MSA *postmsa, double entropy)
{
  int         status;
  const char *name_str;     /* alignment name, or "" if it has none */
  const char *desc_str;     /* alignment description, or "" if it has none */

  /* Header request. The label row and the data row further down share
   * the same column widths; keep them in sync when editing either.
   */
  if (msa == NULL)
    {
      fprintf(cfg->ofp, "#%4s %-20s %5s %5s %5s %8s %6s %s\n", " idx", "name",                 "nseq",  "alen",  "mlen",  "eff_nseq",  "re/pos",  "description");
      fprintf(cfg->ofp, "#%4s %-20s %5s %5s %5s %8s %6s %s\n", "----", "--------------------", "-----", "-----", "-----", "--------",  "------",  "-----------");
      return eslOK;
    }

  status = p7_hmm_Validate(hmm, errbuf, 0.0001);
  if (status != eslOK) return status;

  status = p7_hmmfile_WriteASCII(cfg->hmmfp, -1, hmm);
  if (status != eslOK) ESL_FAIL(status, errbuf, "HMM save failed");

  name_str = (msa->name != NULL) ? msa->name : "";
  desc_str = (msa->desc != NULL) ? msa->desc : "";

  /* columns: idx name nseq alen M eff_nseq re/pos description */
  fprintf(cfg->ofp, "%-5d %-20s %5d %5" PRId64 " %5d %8.2f %6.3f %s\n",
          msaidx, name_str, msa->nseq, msa->alen, hmm->M, hmm->eff_nseq, entropy, desc_str);

  /* the annotated alignment is optional: it needs both a destination and data */
  if (cfg->postmsafp == NULL || postmsa == NULL) return eslOK;

  esl_msa_Write(cfg->postmsafp, postmsa, eslMSAFILE_STOCKHOLM);
  return eslOK;
}
Beispiel #20
0
/* Function: GrowCP9Matrix()
 *
 * Purpose:  Reallocate a CP9 dp matrix, if necessary, for seq for
 *           length N, or 2 rows (if we're scanning in memory 
 *           efficient mode, in this case N == 1, nrows = N+1).
 * 
 *           Note: unlike HMMER, M never changes, so we only have
 *           to worry about increasing the number of rows if nec.
 *           
 *           Returns individual ptrs to the five matrix components
 *           as a convenience.
 *           
 *           This function allocates the requested matrix regardless
 *           of its size.
 * 
 *           If kmin and kmax are both non-NULL, the matrix will be a
 *           p7 HMM banded matrix as defined by bands in kmin, kmax:
 *           row i holds kmax[i] - kmin[i] + 1 cells. In this case N
 *           must be length of the sequence. If caller wants a
 *           non-banded CP9 matrix, pass kmin = kmax = NULL; every row
 *           then holds M+1 cells.
 *
 * Args:     mx     - an already allocated matrix to grow.
 *           errbuf - filled with a message on failure
 *           N      - seq length to allocate for; N+1 rows
 *           M      - size of model, contract enforces this must == mx->M
 *           kmin   - OPTIONAL: [0..1..i..N] minimum k for residue i
 *           kmax   - OPTIONAL: [0..1..i..N] maximum k for residue i
 *           mmx, imx, dmx, elmx, erow 
 *                  - OPTIONAL RETURN: ptrs to the five mx components
 *                    as a convenience; each may be NULL if unwanted
 *                   
 * Return:   eslOK on success, mx is (re)allocated here;
 *           eslEINCOMPAT if contract is violated;
 *           the ESL_RALLOC() failure status if reallocation fails.
 *           errbuf is filled in both failure cases.
 */
int
GrowCP9Matrix(CP9_MX *mx, char *errbuf, int N, int M, int *kmin, int *kmax, int ***mmx, int ***imx, int ***dmx, int ***elmx, int **erow)
{
  int status;
  void *p;
  int i;
  int ncells_needed = 0;
  int do_banded;
  int cur_ncells = 0;
  int do_reallocate;

  if(mx->M != M) ESL_FAIL(eslEINCOMPAT, errbuf, "GrowCP9Matrix(), mx->M: %d != M passed in: %d\n", mx->M, M);
  if(N < 0)      ESL_FAIL(eslEINCOMPAT, errbuf, "GrowCP9Matrix(), N: %d < 0\n", N);

  /* Banded layout needs BOTH band arrays: kmin[i] and kmax[i] are read
   * for every row below, so neither may be NULL in banded mode.
   */
  do_banded = (kmin != NULL && kmax != NULL) ?  TRUE : FALSE;
  if(do_banded) { 
    for (i = 0; i <= N; i++) ncells_needed += (kmax[i] - kmin[i] + 1);
  }
  else ncells_needed = (N+1) * (M+1);
  do_reallocate = (ncells_needed <= mx->ncells_allocated) ? FALSE : TRUE;

  /* NOTE(review): the row-pointer arrays and erow are resized to N+1
   * only when the cell count grows. A banded call that needs no more
   * cells than are allocated still writes row pointers 1..N below;
   * confirm callers cannot reach that with N larger than the row
   * count of the last reallocation.
   */
  if(do_reallocate) { 
    /* we need more space */
    ESL_RALLOC(mx->mmx,  p, sizeof(int *) * (N+1));
    ESL_RALLOC(mx->imx,  p, sizeof(int *) * (N+1));
    ESL_RALLOC(mx->dmx,  p, sizeof(int *) * (N+1));
    ESL_RALLOC(mx->elmx, p, sizeof(int *) * (N+1)); 
    ESL_RALLOC(mx->erow, p, sizeof(int)   * (N+1));
    ESL_RALLOC(mx->mmx_mem,  p, sizeof(int) * ncells_needed);
    ESL_RALLOC(mx->imx_mem,  p, sizeof(int) * ncells_needed);
    ESL_RALLOC(mx->dmx_mem,  p, sizeof(int) * ncells_needed);
    ESL_RALLOC(mx->elmx_mem, p, sizeof(int) * ncells_needed);
    mx->ncells_allocated = ncells_needed;

    /* update size */
    mx->size_Mb =  (float) sizeof(CP9_MX);
    mx->size_Mb += (float) (sizeof(int *) * (N+1) * 4);           /* mx->*mx ptrs */
    mx->size_Mb += (float) (sizeof(int)   * (ncells_needed * 4)); /* mx->*mx_mem */
    mx->size_Mb += (float) (sizeof(int)   * (N+1));               /* mx->erow */
    mx->size_Mb /= 1000000.;
  }

  if(do_banded || do_reallocate) { /* rearrange pointers */
    mx->mmx[0]  = mx->mmx_mem;
    mx->imx[0]  = mx->imx_mem;
    mx->dmx[0]  = mx->dmx_mem;
    mx->elmx[0] = mx->elmx_mem;

    if(do_banded) { 
      /* row i starts right after the band cells of rows 0..i-1 */
      cur_ncells = kmax[0] - kmin[0] + 1;
      for (i = 1; i <= N; i++) {
        mx->mmx[i] = mx->mmx[0] + cur_ncells;
        mx->imx[i] = mx->imx[0] + cur_ncells;
        mx->dmx[i] = mx->dmx[0] + cur_ncells;
        mx->elmx[i]= mx->elmx[0]+ cur_ncells;
        cur_ncells += kmax[i] - kmin[i] + 1;
      }
    }
    else { /* non-banded, we only get here if we reallocated; rows are M+1 cells apart */
      for (i = 1; i <= N; i++) {
        mx->mmx[i] = mx->mmx[0] + (i*(M+1));
        mx->imx[i] = mx->imx[0] + (i*(M+1));
        mx->dmx[i] = mx->dmx[0] + (i*(M+1));
        mx->elmx[i]= mx->elmx[0]+ (i*(M+1));
      }
    }
  }

  mx->rows = N;
  mx->kmin = kmin; /* could be NULL */
  mx->kmax = kmax; /* could be NULL */
  mx->ncells_valid = ncells_needed;
  if (mmx != NULL) *mmx = mx->mmx;
  if (imx != NULL) *imx = mx->imx;
  if (dmx != NULL) *dmx = mx->dmx;
  if (elmx!= NULL) *elmx= mx->elmx;
  if (erow != NULL) *erow = mx->erow;
  return eslOK;

 ERROR:
  ESL_FAIL(status, errbuf, ("GrowCP9Matrix(), memory reallocation error."));
}
/* dump_infocontent_info
 *                   
 * Dump information content per column data to an open output file.
 * For each alignment column the line holds the 1-based column index,
 * the fraction of counts that are non-gap, and the information
 * content: the entropy of a uniform background over the abc->K
 * residues minus the entropy of the column's normalized residue
 * counts. When <i_am_rf> is non-NULL each line is prefixed with the
 * 1-based index of the column among those flagged in <i_am_rf>, or
 * "-" for unflagged columns.
 *
 * Args:    fp          - open output stream
 *          abc         - alphabet; abc->K residue classes
 *          abc_ct      - [0..alen-1][0..K] counts per column; entries
 *                        0..K-1 are summed as non-gap counts and entry
 *                        K is used as the gap count
 *          use_weights - only selects which explanatory line is printed
 *          nali        - alignment index, printed in the header
 *          alen        - number of alignment columns
 *          nseq        - number of sequences, printed in the header
 *          i_am_rf     - OPTIONAL: [0..alen-1] nonzero for RF columns
 *          msa_name    - OPTIONAL: alignment name, printed if non-NULL
 *          alifile     - alignment file name, printed in the header
 *          errbuf      - filled with a message on failure
 *
 * Returns: eslOK on success; on allocation failure, the status set by
 *          ESL_ALLOC() with "out of memory" in errbuf.
 */
static int dump_infocontent_info(FILE *fp, ESL_ALPHABET *abc, double **abc_ct, int use_weights, int nali, int64_t alen, int nseq, int *i_am_rf, char *msa_name, char *alifile, char *errbuf)
{
  int status;
  int apos, rfpos;
  double bg_ent;
  double *bg = NULL;
  double *abc_freq = NULL;
  double nnongap;

  /* entropy of the uniform background; the vector is needed only for this */
  ESL_ALLOC(bg, sizeof(double) * abc->K);
  esl_vec_DSet(bg, abc->K, 1./(abc->K));
  bg_ent = esl_vec_DEntropy(bg, abc->K);
  free(bg);
  bg = NULL;

  ESL_ALLOC(abc_freq, sizeof(double) * abc->K);

  fprintf(fp, "# Information content per column (bits):\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx:  %d\n", nali);
  if(msa_name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa_name); }
  fprintf(fp, "# Number of sequences: %d\n", nseq);
  if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else            { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");

  if(i_am_rf != NULL) { 
    fprintf(fp, "# %7s  %7s  %10s  %10s\n", "rfpos",    "alnpos",  "freqnongap", "info(bits)");
    fprintf(fp, "# %7s  %7s  %10s  %10s\n", "-------", "-------",  "----------", "----------");
  }  
  else { 
    fprintf(fp, "# %7s  %10s  %10s\n", "alnpos",  "freqnongap", "info(bits)");
    fprintf(fp, "# %7s  %10s  %10s\n", "-------", "----------", "----------");
  }

  rfpos = 0;
  for(apos = 0; apos < alen; apos++) {
    if(i_am_rf != NULL) { 
      if(i_am_rf[apos]) { 
        fprintf(fp, "  %7d", rfpos+1);
        rfpos++; 
      }
      else { 
        fprintf(fp, "  %7s", "-");
      }
    }
    /* work on a normalized copy so the caller's counts are left untouched */
    nnongap = esl_vec_DSum(abc_ct[apos], abc->K);
    esl_vec_DCopy(abc_ct[apos], abc->K, abc_freq);
    esl_vec_DNorm(abc_freq, abc->K);
    fprintf(fp, "  %7d  %10.8f  %10.8f\n", apos+1, 
            nnongap / (nnongap + abc_ct[apos][abc->K]),
            (bg_ent - esl_vec_DEntropy(abc_freq, abc->K)));
  }
  fprintf(fp, "//\n");

  free(abc_freq);

  return eslOK;

 ERROR:
  /* Only the ESL_ALLOC() calls jump here; report their status rather
   * than a hard-coded eslEINVAL. Nothing is held at either jump:
   * bg is already freed before abc_freq is requested.
   */
  ESL_FAIL(status, errbuf, "out of memory");
  return status; /* NEVERREACHED */
}
/* dump_posterior_sequence_info
 *                   
 * Dump per-sequence posterior probability data to a file.
 *
 * For every sequence that has a PP annotation line, prints its
 * 1-based index, name, the number of non-gap PP characters, the count
 * of each of the 11 non-gap PP classes ('0'..'9','*'), and an average
 * PP obtained by weighting each class count by a representative
 * value for that class. Sequences whose msa->pp[i] is NULL are
 * skipped.
 *
 * Returns eslOK on success; eslEFORMAT (message in <errbuf>) if
 * get_pp_idx() reports -1 for a PP character.
 */      
static int dump_posterior_sequence_info(FILE *fp, ESL_MSA *msa, int nali, char *alifile, char *errbuf)
{
  /* representative posterior probability for PP classes '0'..'9','*' */
  float       class_avg[11]   = { 0.025, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.975 };
  const char *class_chars     = "0123456789*.";   /* last one is the gap class */
  int         nclasses        = 12;               /* all PP classes, gap included */
  int         nshown          = nclasses - 1;     /* gaps are left out of per-sequence output */
  int         class_ct[12];                       /* per-sequence count of each PP class */
  int         s, c, col;                          /* counters over sequences, PP classes, columns */
  int         cls;                                /* PP class index of the current character */
  int         nres;                               /* non-gap PP characters in the current sequence */
  double      wsum;                               /* running sum of count * class_avg */

  fprintf(fp, "# Posterior probability stats per sequence:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx:  %d\n", nali);
  if(msa->name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa->name); }
  fprintf(fp, "# Number of sequences: %d\n", msa->nseq);

  /* column labels */
  fprintf(fp, "# %7s  %-40s  %7s", "seqidx", "seqname", "nnongap");
  for(c = 0; c < nshown; c++) fprintf(fp, "  %7c", class_chars[c]);
  fprintf(fp, "  %7s\n", "avgPP");

  /* underline row */
  fprintf(fp, "# %7s  %40s  %7s", "-------", "----------------------------------------", "-------");
  for(c = 0; c < nshown; c++) fprintf(fp, "  %7s", "-------");
  fprintf(fp, "  %7s\n", "-------");

  for(s = 0; s < msa->nseq; s++) { 
    if(msa->pp[s] == NULL) continue;   /* no PP annotation for this sequence */

    fprintf(fp, "  %7d  %-40s", s+1, msa->sqname[s]);

    /* tally the PP class of every aligned column */
    esl_vec_ISet(class_ct, nclasses, 0);
    for(col = 0; col < msa->alen; col++) { 
      cls = get_pp_idx(msa->abc, msa->pp[s][col]);
      if(cls == -1) ESL_FAIL(eslEFORMAT, errbuf, "bad #=GR PP char: %c", msa->pp[s][col]);
      class_ct[cls]++;
    }

    nres = esl_vec_ISum(class_ct, nshown);
    fprintf(fp, "  %7d", nres);

    wsum = 0.;
    for(c = 0; c < nshown; c++) { 
      fprintf(fp, "  %7d", class_ct[c]);
      wsum += (float) class_ct[c] * class_avg[c];
    }
    fprintf(fp, "  %.5f\n", wsum / (float) nres);
  }
  fprintf(fp, "//\n");

  return eslOK;
}
/* dump_insert_info
 *                   
 * Given an MSA with RF annotation, print out information about how many 'insertions' come
 * after each non-gap RF column (consensus column). 
 *
 * Every residue found in a column not flagged in <i_am_rf> is counted
 * as inserted after the most recent flagged column (position 0 means
 * before the first one). One line is printed for each position where
 * at least one sequence has an insert: the position, the (optionally
 * weighted) number of sequences with an insert there, that number
 * divided by msa->nseq, and the (optionally weighted) total inserted
 * residues divided by that number.
 *
 * Args:    fp          - open output stream
 *          msa         - digitized alignment; msa->rf must be present
 *          use_weights - TRUE to weight sequences by msa->wgt
 *          nali        - alignment index, printed in the header
 *          i_am_rf     - [0..alen-1] nonzero for non-gap RF columns
 *          alifile     - alignment file name, printed in the header
 *          errbuf      - filled with a message on failure
 *
 * Returns: eslOK on success; eslEINVAL or eslEINCOMPAT (errbuf filled)
 *          on contract violation; the status set by ESL_ALLOC() on
 *          allocation failure. Everything allocated here is freed on
 *          all of these paths.
 */
static int dump_insert_info(FILE *fp, ESL_MSA *msa, int use_weights, int nali, int *i_am_rf, char *alifile, char *errbuf)
{
  int status;
  int apos, rfpos;
  double **ict = NULL;        /* [0..rfpos..alen][0..i..nseq-1] residues seq i inserts after rfpos */
  double *total_ict = NULL;   /* [0..rfpos..alen+1] weighted total of inserted residues after rfpos */
  int i;
  int rflen;
  double seqwt; /* weight of current sequence */
  double nseq;

  /* contract check */
  if(! (msa->flags & eslMSA_DIGITAL)) ESL_XFAIL(eslEINVAL, errbuf, "in dump_insert_info(), msa must be digitized.");
  if(msa->rf == NULL) ESL_XFAIL(eslEINVAL, errbuf, "No #=GC RF markup in alignment, it is needed for --iinfo.");
  if(i_am_rf == NULL) ESL_XFAIL(eslEINVAL, errbuf, "internal error, dump_insert_info() i_am_rf is NULL.");
  if(use_weights && msa->wgt == NULL) ESL_XFAIL(eslEINCOMPAT, errbuf, "dump_insert_info(): use_weights==TRUE but msa->wgt == NULL");

  ESL_ALLOC(total_ict, sizeof(double) * (msa->alen+2));
  esl_vec_DSet(total_ict, (msa->alen+2), 0.);

  ESL_ALLOC(ict,  sizeof(double *) * (msa->alen+2));
  /* clear every slot first so ERROR can free a partially built table */
  for(i = 0; i <= msa->alen+1; i++) ict[i] = NULL;
  for(i = 0; i <= msa->alen; i++)
    {
      ESL_ALLOC(ict[i],  sizeof(double) * (msa->nseq));
      esl_vec_DSet(ict[i], (msa->nseq), 0.);
    }

  fprintf(fp, "# Insert information:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx:  %d\n", nali);
  if(msa->name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa->name); }
  fprintf(fp, "# rfpos is the nongap RF position after which insertions occur\n");
  fprintf(fp, "# An rfpos of '0' indicates insertions before the first nongap RF position\n");
  fprintf(fp, "# Number of sequences: %d\n", msa->nseq);
  if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else            { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");

  fprintf(fp, "# %8s  %10s  %8s  %8s\n", "rfpos",    "nseq w/ins",  "freq ins", "avg len");
  fprintf(fp, "# %8s  %10s  %8s  %8s\n", "--------", "----------", "--------", "--------");

  /* apos is 1-based to index msa->ax; i_am_rf is 0-based, hence apos-1 */
  rfpos = 0;
  for(apos = 1; apos <= msa->alen; apos++)
    {
      if(i_am_rf[apos-1]) rfpos++;
      else {
        for(i = 0; i < msa->nseq; i++) { 
          seqwt = use_weights ? msa->wgt[i] : 1.0;
          if(esl_abc_XIsResidue(msa->abc, msa->ax[i][apos])) { 
            ict[rfpos][i]++;             /* unweighted: only tested for >= 1 below */
            total_ict[rfpos] += seqwt;
          }
        }
      }  
    }
  rflen = rfpos;   /* number of columns flagged in i_am_rf */

  for(rfpos = 0; rfpos <= rflen; rfpos++)
    {
      nseq = 0.;
      for(i = 0; i < msa->nseq; i++) { 
        if(ict[rfpos][i] >= 1) { 
          seqwt = use_weights ? msa->wgt[i] : 1.0;
          nseq += seqwt;
        }
      }
      if(nseq > 0.) 
        fprintf(fp, "  %8d  %10.1f  %8.6f  %8.3f\n", rfpos, nseq, nseq / (float) msa->nseq, ((float) total_ict[rfpos] / (float) nseq));
    }
  fprintf(fp, "//\n");

  for(i = 0; i <= msa->alen; i++) free(ict[i]);
  free(ict);
  free(total_ict);

  return eslOK;

 ERROR:
  /* ict is NULL for contract failures; unfilled slots are NULL, and free(NULL) is a no-op */
  if(ict != NULL) { 
    for(i = 0; i <= msa->alen; i++) free(ict[i]);
    free(ict);
  }
  free(total_ict);
  return status;
}
Beispiel #24
0
/* map_msas
 *                   
 * Align msa1 and msa2.
 * For each column in msa1, determine the corresponding column
 * in msa2. This implementation requires:
 *  - msa1 and msa2 contain exactly the same sequences in the same order
 * Note: the seqs in msa1 and msa2 do not have to have the same names.
 *
 * Uses a DP algorithm similar to Needleman-Wunsch, but that's aligning
 * two alignment columns at a time instead of two residues. 
 *
 * Unless "-q" is set, one line is printed to stdout for every pair of
 * columns joined by a diagonal traceback move. The coverage summary
 * is printed regardless of "-q". map2masks() is called at the end
 * with the finished map.
 *
 * Args:    go     - command-line options; "-q" and args 1 and 2 are
 *                   read here, and <go> is handed on to map2masks()
 *          errbuf - buffer passed to ESL_FAIL() and map2masks()
 *          msa1   - first alignment, must have eslMSA_DIGITAL set
 *          msa2   - second alignment, must have eslMSA_DIGITAL set
 *          ret_msa1_to_msa2_map
 *                 - RETURN: array allocated here, [0..apos1..alen1];
 *                   element 0 is -1; element apos1 is the msa2 column
 *                   that msa1 column apos1 was paired with on a
 *                   diagonal move, or 0 if it was not paired. It is
 *                   not freed in this function.
 *
 * Returns: eslOK on success. ESL_FAIL() is invoked with eslEINVAL for
 *          the failures flagged below; a non-eslOK status from
 *          map_rfpos_to_apos() or map2masks() is passed through.
 *
 * NOTE(review): none of the failure exits (the ESL_FAIL() calls after
 * allocation has begun, the ERROR label, the early return after
 * map2masks()) free the arrays allocated earlier in this function.
 * DIAG, VERT, HORZ and NCHOICES are defined outside this function.
 */
static int
map_msas(const ESL_GETOPTS *go, char *errbuf, ESL_MSA *msa1, ESL_MSA *msa2, int **ret_msa1_to_msa2_map)
{
  int status;
  int **one2two;              /* [0..apos1..alen1][0..apos2..alen2] number of sequences whose residue in column
			       * apos1 of msa1 is the residue in column apos2 of msa2 */
  int *rf2a_map1 = NULL;       /* msa1 map of reference columns (non-gap RF residues) to alignment columns, NULL if msa1->rf == NULL */
  int *rf2a_map2 = NULL;       /* msa2 map of reference columns (non-gap RF residues) to alignment columns, NULL if msa2->rf == NULL */
  int *a2rf_map1 = NULL;       /* msa1 map of alignment columns to reference columns, NULL if msa1->rf == NULL */
  int *a2rf_map2 = NULL;       /* msa2 map of alignment columns to reference columns, NULL if msa2->rf == NULL */
  int apos1, apos2;           /* counters over alignment position in msa1, msa2 respectively */
  int alen1, alen2;           /* alignment lengths */
  int rfpos1, rfpos2;           /* counters over reference positions */
  int rflen1, rflen2;           /* reference (non-gap RF) lengths */
  int **mx;                   /* [0..apos1..alen1][0..apos2..alen2] dp matrix, score of max scoring aln 
			       * from 1..apos1 in msa1 and 1..apos2 in msa 2 */
  int **tb;                   /* [0..apos1..alen1][0..apos2..alen2] traceback ptrs: DIAG, VERT or HORZ; -1 marks cell [0][0] */
  char *seq1, *seq2;          /* temporary strings for ensuring dealigned sequences in msa1 and msa2 are identical */
  int64_t len1, len2;         /* length of seq1, seq2 */
  int isgap1, isgap2;         /* is this residue a gap in msa1, msa2? */
  int i;                      /* counter over sequences */
  int *res1_per_apos;         /* [0..apos..alen1] sum of one2two[apos][1..alen2] for column apos of msa1 */
  int sc;                     /* max score of full path (alignment) through dp mx */
  int tb_sc;                  /* score of traceback, should equal sc */
  int *one2two_map;           /* [0..a..alen1] the alignment, msa2 column that column apos1 in msa1 maps to */
  int total_res = 0;          /* total number of residues in msa1 */
  float coverage;             /* fraction of total_res that are within mapped msa2 columns from one2two_map, 
			       * this is tb_sc / total_res */
  int  total_cres1=0;         /* total number of residues in reference positions in msa1 */ 
  int  covered_cres1 = 0;     /* number of residues in reference positions in msa1 that also appear in the corresponding
			       * mapped column of msa2 
			       */
  int be_quiet = esl_opt_GetBoolean(go, "-q");
  int *choices;               /* [0..NCHOICES-1] candidate scores for the current dp cell, indexed by DIAG, VERT, HORZ */
  int i_choice;               /* index into choices[] selected by esl_vec_IArgMax() */

  /* contract check */
  if(! (msa1->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa1 (%s) not digitized.\n", esl_opt_GetArg(go, 1));
  if(! (msa2->flags & eslMSA_DIGITAL)) ESL_FAIL(eslEINVAL, errbuf, "in map_msas() msa2 (%s) not digitized.\n", esl_opt_GetArg(go, 2));
  alen1 = msa1->alen;
  alen2 = msa2->alen;
  
  /* Map msa1 (reference) columns to alignment positions */
  rflen1 = rflen2 = 0;
  if(msa1->rf != NULL) if((status = map_rfpos_to_apos(msa1, &rf2a_map1, &a2rf_map1, &rflen1)) != eslOK) goto ERROR;
  if(msa2->rf != NULL) if((status = map_rfpos_to_apos(msa2, &rf2a_map2, &a2rf_map2, &rflen2)) != eslOK) goto ERROR;
  if(! be_quiet) {
    printf("# %-25s alignment length:              %d\n", esl_opt_GetArg(go, 1), alen1);
    printf("# %-25s alignment length:              %d\n", esl_opt_GetArg(go, 2), alen2);
  }
  /* collect counts in one2two[i][j]: number of sequences for which residue aligned in msa1 column i
   * is aligned in msa2 alignment column j.
   */
  ESL_ALLOC(seq1, sizeof(char) * (alen1+1));
  ESL_ALLOC(seq2, sizeof(char) * (alen2+1));
  ESL_ALLOC(one2two, sizeof(int *) * (alen1+1));
  for(apos1 = 0; apos1 <= alen1; apos1++) { 
    ESL_ALLOC(one2two[apos1], sizeof(int) * (alen2+1));
    esl_vec_ISet(one2two[apos1], (alen2+1), 0);
  }

  total_res = 0;
  for(i = 0; i < msa1->nseq; i++) { 
    /* ensure raw (unaligned) seq i in the 2 msas is the same */
    esl_abc_Textize(msa1->abc, msa1->ax[i], alen1, seq1); 
    esl_abc_Textize(msa1->abc, msa2->ax[i], alen2, seq2); /* note: msa*1*->abc used on purpose, allows DNA/RNA to peacefully coexist in this func */
    esl_strdealign(seq1, seq1, "-_.~", &len1);
    esl_strdealign(seq2, seq2, "-_.~", &len2);

    if(len1 != len2) { 
      ESL_FAIL(eslEINVAL, errbuf, "unaligned seq number %d (msa1: %s, msa2: %s) differs in length %s (%" PRId64 ") and %s (%" PRId64 "), those files must contain identical raw seqs\n",
	       i, msa1->sqname[i], msa2->sqname[i], esl_opt_GetArg(go, 1), len1, esl_opt_GetArg(go, 2), len2);
    }
    if(strncmp(seq1, seq2, len1) != 0)  ESL_FAIL(eslEINVAL, errbuf, "unaligned seq number %d differs between %s and %s, those files must contain identical raw seqs\n", i, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
    total_res += len1;
    
    /* Walk sequence i through both alignments in step, skipping gaps
     * on either side, and count each pair of columns that hold the
     * same residue.
     * NOTE(review): the loop runs while EITHER index is in range, so
     * one index can be used at alen+1 or beyond while the other
     * catches up; and when neither symbol is a gap and the two
     * differ, no branch advances either index. Termination appears to
     * rely on the identical-raw-sequence check above - confirm for
     * symbols that are removed by esl_strdealign() but are not gaps
     * for esl_abc_XIsGap().
     */
    apos1 = apos2 = 1;
    while((apos1 <= alen1) || (apos2 <= alen2)) {
      isgap1 = esl_abc_XIsGap(msa1->abc, msa1->ax[i][apos1]);
      isgap2 = esl_abc_XIsGap(msa2->abc, msa2->ax[i][apos2]);
      if      ( isgap1 &&  isgap2) { apos1++; apos2++; }
      else if ( isgap1 && !isgap2) { apos1++;          }
      else if (!isgap1 &&  isgap2) {          apos2++; }
      else if ( msa1->ax[i][apos1] == msa2->ax[i][apos2]) { 
	one2two[apos1++][apos2++]++;
	/* two2one[apos2][apos1]++; */
      }
    }
  }

  /******************************************************************
   * DP alignment of msa1 to msa2
   * dp matrix: mx[apos1][apos2] apos1=1..msa->alen1, apos2=1..alen2 (apos1=0 || apos2=0 is invalid)
   * mx[apos1][apos2] = score of maximal alignment for apos1=1..apos1, apos2'=1..apos2 INCLUDING
   *                    apos1 and apos2. Score is number of residues from msa1 columns
   *                    1..apos1 that exist in their respective aligned columns in msa2 (the growing
   *                    maximally scoring alignment).
   */

  /******************************************************************
   * initialization 
   */
  ESL_ALLOC(mx, sizeof(int *) * (alen1+1));
  ESL_ALLOC(tb, sizeof(int *) * (alen1+1));
  for(apos1 = 0; apos1 <= alen1; apos1++) { 
    ESL_ALLOC(mx[apos1], sizeof(int) * (alen2+1));
    ESL_ALLOC(tb[apos1], sizeof(int) * (alen2+1));
    esl_vec_ISet(mx[apos1], (alen2+1), 0);
    esl_vec_ISet(tb[apos1], (alen2+1), -2); /* -2 is a bogus value, if we see it during traceback, there's a problem */
    tb[apos1][0] = HORZ; /* special case, if we hit apos2==0 and apos1 > 0, we have to do HORZ moves until apos1==1 */
  }
  esl_vec_ISet(tb[0], (alen2+1), VERT); /* special case, if we hit apos1==0 and apos2 > 0, we have to do VERT moves until apos2==1 */
  tb[0][0] = -2; /* all alignments must end here; overwritten with -1 a few lines below */

  ESL_ALLOC(res1_per_apos, sizeof(int) * (alen1+1));
  esl_vec_ISet(res1_per_apos, (alen1+1), 0);
  mx[0][0] = 0;
  tb[0][0] = -1; /* last cell, special value: the traceback loop stops when it reads -1 */

  /*****************************************************************
   * recursion
   */
  ESL_ALLOC(choices, sizeof(int) * NCHOICES);
  for(apos1 = 1; apos1 <= alen1; apos1++) {
    for(apos2 = 1; apos2 <= alen2; apos2++) {
      /* only the diagonal move scores; VERT and HORZ carry the neighbouring cell's score over unchanged */
      choices[DIAG] = mx[(apos1-1)][(apos2-1)] + one2two[apos1][apos2];
      choices[VERT] = mx[ apos1   ][(apos2-1)];
      choices[HORZ] = mx[(apos1-1)][ apos2   ];
      i_choice  = esl_vec_IArgMax(choices, NCHOICES); /* presumably the index of the largest entry - confirm tie-breaking in easel */
      mx[apos1][apos2] = choices[i_choice];
      tb[apos1][apos2] = i_choice; 
      res1_per_apos[apos1] += one2two[apos1][apos2];
      /*printf("mx[%3d][%3d]: %5d (%d)\n", apos1, apos2, mx[apos1][apos2], tb[apos1][apos2]);*/
    }
  }
  free(choices);

  /* total residues of msa1 that sit in its reference columns; stays 0 when msa1 has no RF map */
  total_cres1 = 0;
  if(rf2a_map1 != NULL) { 
    for(rfpos1 = 1; rfpos1 <= rflen1; rfpos1++) total_cres1 += res1_per_apos[rf2a_map1[rfpos1]];
  }

  /*****************************************************************
   * traceback 
   */
  
  sc = mx[alen1][alen2];
  if(!be_quiet) {
    /* printf("score %d\n", sc);*/
    /* the column headers depend on which of the two alignments has an RF map */
    if(a2rf_map1 != NULL && a2rf_map2 != NULL) { 
      printf("# %12s       %12s  %22s\n", "   msa 1   ", "   msa 2   ", "");
      printf("# %12s       %12s  %22s\n", "------------", "------------", "");
      printf("# %5s  %5s       %5s  %5s  %22s\n", "rfpos",  "apos",  "rfpos",  "apos",  " num common residues");
      printf("# %5s  %5s       %5s  %5s  %22s\n", "-----", "-----", "-----", "-----", "---------------------");
    }
    else if(a2rf_map1 != NULL) { 
      printf("# %12s        %5s  %22s\n", "   msa 1   ", "msa 2", "");
      printf("# %12s        %5s  %22s\n", "------------", "-----", "");
      printf("# %5s  %5s       %5s  %22s\n", "rfpos",  "apos",  "apos",  " num common residues");
      printf("# %5s  %5s       %5s  %22s\n", "-----", "-----", "-----", "---------------------");
    }
    else if (a2rf_map2 != NULL) { 
      printf("# %5s        %12s  %22s\n", "msa 1", "   msa 2   ", "");
      printf("# %5s        %12s  %22s\n", "-----", "------------", "");
      printf("# %5s        %5s  %5s  %22s\n", "apos",  "rfpos",  "apos",  " num common residues");
      printf("# %5s        %5s  %5s  %22s\n", "-----", "-----", "-----", "---------------------");
    }
    else {
      printf("# %5s        %5s  %22s\n", "msa 1", "msa 2", "");
      printf("# %5s        %5s  %22s\n", "-----", "-----", "");
      printf("# %5s        %5s  %22s\n", "apos",  "apos",  " num common residues");
      printf("# %5s        %5s  %22s\n", "-----", "-----", "---------------------");
    }
  }

  /* traceback, and build one2two_map[] */
  apos1 = alen1;
  apos2 = alen2;
  tb_sc = 0;
  covered_cres1 = 0;
  ESL_ALLOC(one2two_map, sizeof(int) * (alen1+1));
  esl_vec_ISet(one2two_map, (alen1+1), 0);
  one2two_map[0] = -1; /* invalid */

  /* start at the last cell and follow tb[][] until cell [0][0] (value -1) is reached */
  while(tb[apos1][apos2] != -1) {
    if(tb[apos1][apos2] == DIAG) { /* diagonal move */
      /* with no RF map the position is -1; the "-" branches below also treat a -1 read from the map as a non-reference column */
      rfpos1 = (a2rf_map1 == NULL) ? -1 : a2rf_map1[apos1];
      rfpos2 = (a2rf_map2 == NULL) ? -1 : a2rf_map2[apos2];
      if(!be_quiet) { 
	if(a2rf_map1 != NULL && a2rf_map2 != NULL) { 
	  if(rfpos1 == -1 && rfpos2 == -1) { 
	    printf("  %5s  %5d  -->  %5s  %5d  %5d / %5d (%.4f)\n", "-",    apos1, "-",    apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	  else if (rfpos1 == -1) { 
	    printf("  %5s  %5d  -->  %5d  %5d  %5d / %5d (%.4f)\n", "-",    apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	  else if (rfpos2 == -1) { 
	    printf("  %5d  %5d  -->  %5s  %5d  %5d / %5d (%.4f)\n", rfpos1, apos1, "-",    apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	  else { 
	    printf("  %5d  %5d  -->  %5d  %5d  %5d / %5d (%.4f)\n", rfpos1, apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	}	
	else if(a2rf_map1 != NULL) { 
	  if (rfpos1 == -1) { 
	    printf("  %5s  %5d  -->  %5d  %5d / %5d (%.4f)\n", "-",   apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	  else { 
	    printf("  %5d  %5d  -->  %5d  %5d / %5d (%.4f)\n", rfpos1, apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	}
	else if (a2rf_map2 != NULL) { 
	  if (rfpos2 == -1) { 
	    printf("  %5d  -->  %5s  %5d  %5d / %5d (%.4f)\n", apos1, "-",    apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	  else { 
	    printf("  %5d  -->  %5d  %5d  %5d / %5d (%.4f)\n", apos1, rfpos2, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	  }
	}
	else {
	  printf("  %5d  -->  %5d  %5d / %5d (%.4f)\n", apos1, apos2, one2two[apos1][apos2], res1_per_apos[apos1], (res1_per_apos[apos1] == 0) ? 0.0000 : ((float) one2two[apos1][apos2] / (float) res1_per_apos[apos1])); 
	}
      }
      tb_sc += one2two[apos1][apos2];
      one2two_map[apos1] = apos2;
      if(rfpos1 > 0) covered_cres1 += one2two[apos1][apos2]; /* apos1 is a rfpos */
      apos1--; apos2--;
    }
    else if(tb[apos1][apos2] == VERT) { 
      apos2--; /* vertical move */
    }
    else if(tb[apos1][apos2] == HORZ) { 
      apos1--; /* horizontal move */
    }
    else if(tb[apos1][apos2] != -1) /* shouldn't happen */
      ESL_FAIL(eslEINVAL, errbuf, "in dp traceback, tb[apos1: %d][apos2: %d] %d\n", apos1, apos2, tb[apos1][apos2]);
  }
  /* done DP code 
   **********************************/

  if(!be_quiet) printf("# Total trace back sc: %d\n", tb_sc);
  /* sanity check: the score re-accumulated along the traceback must equal the dp score of the last cell */
  if(tb_sc != sc) ESL_FAIL(eslEINVAL, errbuf, "in dp traceback, tb_sc (%d) != sc (%d)\n", tb_sc, sc);
  /* NOTE(review): total_res is 0 if msa1 has no sequences or no residues; the division is then 0/0 in float */
  coverage = (float) tb_sc / (float) total_res;
  printf("# Coverage: %6d / %6d (%.4f)\n# Coverage is fraction of residues from %s in optimally mapped columns in %s\n", tb_sc, total_res, coverage, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
  if(total_cres1 > 0) printf("# RF coverage: %6d / %6d (%.4f)\n# RF coverage is fraction of non-gap RF residues from %s in optimally mapped columns in %s\n", covered_cres1, total_cres1, (float) covered_cres1 / (float) total_cres1, esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2));
  /* print masks if nec */
  if((status = map2masks(go, errbuf, alen1, alen2, a2rf_map1, a2rf_map2, rf2a_map1, rf2a_map2, rflen1, rflen2, one2two_map)) != eslOK) return status;

  /* clean up and return */
  for(apos1 = 0; apos1 <= alen1; apos1++) { 
    free(mx[apos1]);
    free(tb[apos1]);
  }
  free(mx);
  free(tb);

  for(apos1 = 0; apos1 <= alen1; apos1++) free(one2two[apos1]);
  free(one2two);
  free(res1_per_apos);
  if(rf2a_map1 != NULL) free(rf2a_map1);
  if(rf2a_map2 != NULL) free(rf2a_map2);
  if(a2rf_map1 != NULL) free(a2rf_map1);
  if(a2rf_map2 != NULL) free(a2rf_map2);

  free(seq1);
  free(seq2);
  /* one2two_map is deliberately not freed: it is handed to the caller */
  *ret_msa1_to_msa2_map = one2two_map;
  return eslOK;
  
 ERROR: 
  return status;
}
Beispiel #25
0
/* map2masks
 *                   
 * Given a map of alignment columns in msa1 to alignment columns
 * in msa2, construct and output masks as per command-line options.
 * Each of --mask-a2a, --mask-a2rf, --mask-rf2a and --mask-rf2rf that
 * is set names an output file; a single line of '1'/'0' characters is
 * written to it and a summary line is printed to stdout.
 * 
 * Args:    go:               command-line options; the four --mask-* options
 *                            and command-line args 1 and 2 (used in messages) are read
 *          errbuf:           buffer for an error message on failure
 *          alen1:            number of alignment columns in msa1
 *          alen2:            not used by this function
 *          a2rf_map1:        only tested against NULL here; NULL is reported as
 *                            "<msafile1> has no #=GC RF annotation"
 *          a2rf_map2:        indexed by msa2 alignment position; a value <= 0 is
 *                            treated as a gap in RF (insert column). NULL is reported
 *                            as "<msafile2> has no #=GC RF annotation"
 *          rf2a_map1:        [1..rfpos..rflen1]: msa1 alignment position of RF position rfpos
 *          rf2a_map2:        not used by this function
 *          rflen1:           number of RF positions iterated over for the rf2* masks
 *          rflen2:           not used by this function
 *          msa1_to_msa2_map: [1..apos..msa1->alen]: '0': msa1 apos maps to a gap in msa2 (doesn't map to any column in msa2)
 *                                                   'x': msa1 apos maps to posn x in msa2 (x>0)
 *
 * Returns: eslOK on success.
 *          eslFAIL, with errbuf filled, if a required RF annotation is missing
 *          or an output file cannot be opened.
 *          eslEMEM, with errbuf filled, on allocation failure.
 */
static int
map2masks(const ESL_GETOPTS *go, char *errbuf, int alen1, int alen2, int *a2rf_map1, int *a2rf_map2, int *rf2a_map1, int *rf2a_map2, int rflen1, int rflen2, int *msa1_to_msa2_map)
{
  int status;
  int apos1, apos2;           /* counters over alignment position in msa1, msa2 respectively */
  int rfpos1, rfpos2;         /* counters over reference positions */
  int num_ones;               /* number of 1s in current mask */
  int num_zeroes;             /* number of 0s in current mask */
  FILE *fp   = NULL;          /* currently open mask output file; NULL when none is open */
  char *mask = NULL;          /* current mask string; NULL when none is allocated */

  if(esl_opt_GetString(go, "--mask-a2a")) { 
    if ((fp = fopen(esl_opt_GetString(go, "--mask-a2a"), "w")) == NULL) 
      ESL_FAIL(eslFAIL, errbuf, "Failed to open --mask-a2a mask output file %s", esl_opt_GetString(go, "--mask-a2a"));
    /* construct mask as follows:
     * mask[0..apos1..alen1-1] = '1' if column apos1+1 maps to an alignment column of msa2 
     *                         = '0' if column apos1+1 maps to a gap in msa2 (doesn't map to any column in msa2) 
     */
    ESL_ALLOC(mask, sizeof(char) * (alen1+1));
    num_ones = num_zeroes = 0;
    for(apos1 = 1; apos1 <= alen1; apos1++) { 
      if(msa1_to_msa2_map[apos1] == 0) { mask[(apos1-1)] = '0'; num_zeroes++; }
      else                             { mask[(apos1-1)] = '1'; num_ones++; }
    }
    mask[alen1] = '\0';
    fprintf(fp, "%s\n", mask);
    free(mask);  mask = NULL;
    fclose(fp);  fp   = NULL;
    printf("# Mask of 1/0s with 1 indicating aln column in %s maps to aln column in %s saved to file %s.\n# (Length: %d; '1's: %d; '0's: %d)\n", esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2), esl_opt_GetString(go, "--mask-a2a"), (num_ones+num_zeroes), num_ones, num_zeroes);
  }

  if(esl_opt_GetString(go, "--mask-a2rf")) { 
    if (a2rf_map2 == NULL) ESL_FAIL(eslFAIL, errbuf, "with --mask-a2rf, <msafile2> %s must have #=GC RF annotation, but it doesn't.", esl_opt_GetArg(go, 2));
    if ((fp = fopen(esl_opt_GetString(go, "--mask-a2rf"), "w")) == NULL) 
      ESL_FAIL(eslFAIL, errbuf, "Failed to open --mask-a2rf mask output file %s\n", esl_opt_GetString(go, "--mask-a2rf"));
    /* construct mask as follows:
     * mask[0..apos1..alen1-1] = '1' if column apos1+1 maps to a reference column (non-gap in RF) of msa2 
     *                         = '0' if column apos1+1 maps to a gap (doesn't map to any column in msa2) or an insert (gap in RF) in msa2 
     */
    ESL_ALLOC(mask, sizeof(char) * (alen1+1));
    num_ones = num_zeroes = 0;
    for(apos1 = 1; apos1 <= alen1; apos1++) { 
      apos2 = msa1_to_msa2_map[apos1];
      if(apos2 == 0) { mask[(apos1-1)] = '0'; num_zeroes++; } /* apos1 doesn't map to any column in msa2 */
      else { 
        rfpos2 = a2rf_map2[apos2];
        if(rfpos2 <= 0) { mask[(apos1-1)] = '0'; num_zeroes++; } /* apos1 maps to a gap RF (insert) in msa2 */
        else            { mask[(apos1-1)] = '1'; num_ones++; }   /* apos1 maps to a non-gap RF (reference) column in msa2 */
      }
    }
    mask[alen1] = '\0';
    fprintf(fp, "%s\n", mask);
    free(mask);  mask = NULL;
    fclose(fp);  fp   = NULL;
    printf("# Mask of 1/0s with 1 indicating aln column in %s maps to reference (non-gap RF) column in %s saved to file %s.\n# (Length: %d; '1's: %d; '0's: %d)\n", esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2), esl_opt_GetString(go, "--mask-a2rf"), (num_ones+num_zeroes), num_ones, num_zeroes);
  }

  if(esl_opt_GetString(go, "--mask-rf2a")) { 
    if (a2rf_map1 == NULL) ESL_FAIL(eslFAIL, errbuf, "with --mask-rf2a, <msafile1> %s must have #=GC RF annotation, but it doesn't.", esl_opt_GetArg(go, 1));
    if ((fp = fopen(esl_opt_GetString(go, "--mask-rf2a"), "w")) == NULL) 
      ESL_FAIL(eslFAIL, errbuf, "Failed to open --mask-rf2a mask output file %s\n", esl_opt_GetString(go, "--mask-rf2a"));
    /* construct mask as follows:
     * mask[0..rfpos1..rflen1-1] = '1' if non-gap RF msa1 column rfpos1+1 maps to an alignment column of msa2 
     *                           = '0' if non-gap RF msa1 column rfpos1+1 maps to a gap in msa2 (doesn't map to any column in msa2)
     */
    ESL_ALLOC(mask, sizeof(char) * (rflen1+1));
    num_ones = num_zeroes = 0;
    for(rfpos1 = 1; rfpos1 <= rflen1; rfpos1++) { 
      apos1 = rf2a_map1[rfpos1];
      apos2 = msa1_to_msa2_map[apos1];
      if(apos2 == 0) { mask[(rfpos1-1)] = '0'; num_zeroes++; } 
      else           { mask[(rfpos1-1)] = '1'; num_ones++; } 
    }
    mask[rflen1] = '\0';
    fprintf(fp, "%s\n", mask);
    free(mask);  mask = NULL;
    fclose(fp);  fp   = NULL;
    printf("# Mask of 1/0s with 1 indicating reference (non-gap RF) column in %s maps to aln column in %s saved to file %s.\n# (Length: %d; '1's: %d; '0's: %d)\n", esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2), esl_opt_GetString(go, "--mask-rf2a"), (num_ones+num_zeroes), num_ones, num_zeroes);
  }

  if(esl_opt_GetString(go, "--mask-rf2rf")) { 
    if (a2rf_map1 == NULL) ESL_FAIL(eslFAIL, errbuf, "with --mask-rf2rf, <msafile1> %s must have #=GC RF annotation, but it doesn't.", esl_opt_GetArg(go, 1));
    if (a2rf_map2 == NULL) ESL_FAIL(eslFAIL, errbuf, "with --mask-rf2rf, <msafile2> %s must have #=GC RF annotation, but it doesn't.", esl_opt_GetArg(go, 2));
    if ((fp = fopen(esl_opt_GetString(go, "--mask-rf2rf"), "w")) == NULL) 
      ESL_FAIL(eslFAIL, errbuf, "Failed to open --mask-rf2rf mask output file %s\n", esl_opt_GetString(go, "--mask-rf2rf"));
    /* construct mask as follows:
     * mask[0..rfpos1..rflen1-1] = '1' if non-gap RF msa1 column rfpos1+1 maps to a reference column (non-gap in RF) of msa2 
     *                           = '0' if it maps to a gap (doesn't map to any column in msa2) or an insert (gap in RF) in msa2 
     */
    /* the mask is indexed and terminated by rflen1, so size it by rflen1 (as --mask-rf2a does) */
    ESL_ALLOC(mask, sizeof(char) * (rflen1+1));
    num_ones = num_zeroes = 0;
    for(rfpos1 = 1; rfpos1 <= rflen1; rfpos1++) { 
      apos1 = rf2a_map1[rfpos1];
      apos2 = msa1_to_msa2_map[apos1];
      if(apos2 == 0) { mask[(rfpos1-1)] = '0'; num_zeroes++; } /* rfpos1 doesn't map to any column in msa2 */
      else { 
        rfpos2 = a2rf_map2[apos2];
        if(rfpos2 <= 0) { mask[(rfpos1-1)] = '0'; num_zeroes++; } /* rfpos1 maps to a gap RF (insert) in msa2 */
        else            { mask[(rfpos1-1)] = '1'; num_ones++; }   /* rfpos1 maps to a non-gap RF (reference) column in msa2 */
      }
    }
    mask[rflen1] = '\0';
    fprintf(fp, "%s\n", mask);
    free(mask);  mask = NULL;
    fclose(fp);  fp   = NULL;
    printf("# Mask of 1/0s with 1 indicating reference (non-gap RF) column in %s maps to reference (non-gap RF) column in %s saved to file %s.\n# (Length: %d; '1's: %d; '0's: %d)\n", esl_opt_GetArg(go, 1), esl_opt_GetArg(go, 2), esl_opt_GetString(go, "--mask-rf2rf"), (num_ones+num_zeroes), num_ones, num_zeroes);
  }
  return eslOK;

 ERROR: 
  /* Only ESL_ALLOC jumps here, and it does so after the output file was opened:
   * close that file (and drop any mask) so the failure path leaks nothing.
   */
  if (fp   != NULL) fclose(fp);
  if (mask != NULL) free(mask);
  ESL_FAIL(eslEMEM, errbuf, "map2masks(): memory allocation error.");
  return status; /* NEVERREACHED */
}
Beispiel #26
0
/* main()
 *
 * Reads the first alignment from each of two Stockholm-format files
 * (command-line args 1 and 2). Without --submap, the two alignments
 * are mapped with map_msas(). With --submap <f>, map_sub_msas() is
 * called instead and the resulting 1/0 mask string is written to <f>.
 *
 * Returns 0 on success. Command-line errors exit(1) after printing
 * usage. Any later failure releases what was acquired and ends in
 * esl_fatal() with the message accumulated in <errbuf>.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	/* application configuration       */
  ESL_ALPHABET *abc     = NULL;	/* biological alphabet             */
  char         *alifile1= NULL;	/* alignment 1 file name           */
  char         *alifile2= NULL;	/* alignment 2 file name           */
  int           fmt;		/* format code for alifiles        */
  ESLX_MSAFILE *afp1    = NULL;	/* open alignment file 1           */
  ESLX_MSAFILE *afp2    = NULL;	/* open alignment file 2           */
  ESL_MSA      *msa1    = NULL;	/* multiple sequence alignment 1   */
  ESL_MSA      *msa2    = NULL;	/* multiple sequence alignment 2   */
  int           status;		/* easel return code               */
  char          errbuf[eslERRBUFSIZE*4];

  int  *msa1_to_msa2_map      = NULL;  /* map from <msafile1> to <msafile2> */
  char *sub_msa1_to_msa2_mask = NULL;  /* with --submap, the map from <msafile1> to <msafile2> in mask form */
  FILE *subfp = NULL;                  /* --submap output file; NULL whenever it is not open */

  /***********************************************
   * Parse command line
   ***********************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\nwhere basic options are:");
      esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
      exit(0);
    }

  if (esl_opt_ArgNumber(go) != 2) 
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  alifile1 = esl_opt_GetArg(go, 1);
  alifile2 = esl_opt_GetArg(go, 2);

  fmt             = eslMSAFILE_STOCKHOLM;

  /***********************************************
   * Open the MSA files
   ***********************************************/

  /* If none of these options is set, abc stays NULL and is passed by address to the open call. */
  if      (esl_opt_GetBoolean(go, "--amino"))   abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     abc = esl_alphabet_Create(eslRNA);

  if ( (status = eslx_msafile_Open(&abc, alifile1, NULL, fmt, NULL, &afp1)) != eslOK) eslx_msafile_OpenFailure(afp1, status);
  if ( (status = eslx_msafile_Open(&abc, alifile2, NULL, fmt, NULL, &afp2)) != eslOK) eslx_msafile_OpenFailure(afp2, status);

  /******************************************************************
   * Read first alignment from each file, we only use the first one 
   ******************************************************************/

  if ((status = eslx_msafile_Read(afp1, &msa1)) != eslOK) eslx_msafile_ReadFailure(afp1, status);
  if ((status = eslx_msafile_Read(afp2, &msa2)) != eslOK) eslx_msafile_ReadFailure(afp2, status);

  /* map the alignments in msa1 and msa2 */
  if(! esl_opt_IsOn(go, "--submap")) { 
    if((status = map_msas(go, errbuf, msa1, msa2, &msa1_to_msa2_map)) != eslOK) goto ERROR;
    free(msa1_to_msa2_map);
  }

  /* --submap: if nec, map <msafile1> to a subset of its own columns in <msafile2>  */
  else { /* --submap was enabled */
    if ((subfp = fopen(esl_opt_GetString(go, "--submap"), "w")) == NULL) {
      /* ESL_FAIL would return from main() here with the message unprinted and
       * nothing released; fill errbuf and take the common failure path instead.
       */
      status = eslFAIL;
      snprintf(errbuf, sizeof(errbuf), "Failed to open --submap output file %s\n", esl_opt_GetString(go, "--submap"));
      goto ERROR;
    }
    if((status = map_sub_msas(go, errbuf, msa1, msa2, &sub_msa1_to_msa2_mask)) != eslOK) goto ERROR;
    fprintf(subfp, "%s\n", sub_msa1_to_msa2_mask);
    fclose(subfp);
    subfp = NULL;
    printf("# Mask of 1/0s with 1 indicating aln column in %s maps to a column in %s saved to file %s.\n", alifile1, alifile2, esl_opt_GetString(go, "--submap")); 
    free(sub_msa1_to_msa2_mask);
  }
  
  /* Cleanup, normal return
   */
  eslx_msafile_Close(afp1);
  eslx_msafile_Close(afp2);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  esl_msa_Destroy(msa1);
  esl_msa_Destroy(msa2);
  return 0;
  
 ERROR:
  if (afp1)   eslx_msafile_Close(afp1);
  if (afp2)   eslx_msafile_Close(afp2);
  if (abc)    esl_alphabet_Destroy(abc);
  if (go)     esl_getopts_Destroy(go);
  if (msa1)   esl_msa_Destroy(msa1);
  if (msa2)   esl_msa_Destroy(msa2);
  if (subfp)  fclose(subfp);
  /* errbuf embeds user-supplied file names, so it must never be used as the format string */
  esl_fatal("%s", errbuf);
  return 1; /* never reached */
}
Beispiel #27
0
/* Function:  p7_masstrace_Validate()
 *
 * Purpose:   Check the internal consistency of mass trace <mt>:
 *              - <L> and <M> are positive;
 *              - <i0> lies in 1..L and <k0> in 1..M;
 *              - <st0> is accepted by p7_trace_IsMain();
 *              - <kmass[0]> and <kmass[M+1]> are exactly 0, and
 *                <kmass[k0]> is exactly 1;
 *              - every <kmass[0..M+1]> is finite and within [0, 1+tol];
 *              - the same boundary and range conditions hold for
 *                <imass[0..L+1]> (with <i0>), but only when <imass>
 *                is non-NULL.
 *            <tol> is a fixed 1e-3 slack on the upper bound.
 *
 * Returns:   <eslOK> if all checks pass. <eslFAIL> at the first
 *            failed check, with a short description left in <errbuf>.
 */
int
p7_masstrace_Validate(const P7_MASSTRACE *mt, char *errbuf)
{
  float tol = 1e-3;
  int i,k;

  if (mt->L  <= 0)                           ESL_FAIL(eslFAIL, errbuf, "L=0");
  if (mt->M  <= 0)                           ESL_FAIL(eslFAIL, errbuf, "M=0");   /* was misreported as "L=0" */
  if (mt->i0 < 1 || mt->i0 > mt->L)          ESL_FAIL(eslFAIL, errbuf, "i0 range");
  if (mt->k0 < 1 || mt->k0 > mt->M)          ESL_FAIL(eslFAIL, errbuf, "k0 range");
  if ( ! p7_trace_IsMain(mt->st0))           ESL_FAIL(eslFAIL, errbuf, "st0 not {MID}{LG}");

  /* Boundary and anchor cells are compared exactly, not within tol. */
  if (mt->imass && mt->imass[0]       != 0.) ESL_FAIL(eslFAIL, errbuf, "imass[0] not 0");
  if (mt->imass && mt->imass[mt->i0]  != 1.) ESL_FAIL(eslFAIL, errbuf, "imass[i0] not 1");
  if (mt->imass && mt->imass[mt->L+1] != 0.) ESL_FAIL(eslFAIL, errbuf, "imass[L+1] not 0");
  if (mt->kmass[0]       != 0.)              ESL_FAIL(eslFAIL, errbuf, "kmass[0] not 0");
  if (mt->kmass[mt->k0]  != 1.)              ESL_FAIL(eslFAIL, errbuf, "kmass[k0] not 1");
  if (mt->kmass[mt->M+1] != 0.)              ESL_FAIL(eslFAIL, errbuf, "kmass[M+1] not 0");

  /* Every cell must be a finite value in [0, 1+tol]. imass is checked only when present. */
  if (mt->imass) {
    for (i = 0; i <= mt->L+1; i++) 
      if (!isfinite(mt->imass[i]) || mt->imass[i] < 0.0 || mt->imass[i] > 1+tol) 
        ESL_FAIL(eslFAIL, errbuf, "imass[%d] isn't a probability: %f\n", i, mt->imass[i]);
  }
  for (k = 0; k <= mt->M+1; k++) 
    if (!isfinite(mt->kmass[k]) || mt->kmass[k] < 0.0 || mt->kmass[k] > 1+tol) 
      ESL_FAIL(eslFAIL, errbuf, "kmass[%d] isn't a probability: %f\n", k, mt->kmass[k]);
  return eslOK;
}
Beispiel #28
0
/* Function:  esl_min_ConjugateGradientDescent()
 * Incept:    SRE, Wed Jun 22 08:49:42 2005 [St. Louis]
 *
 * Purpose:   Minimize a function of <n> variables by conjugate
 *            gradient descent, using Polak-Ribiere direction updates.
 *
 *            <x> holds the starting point on entry and is overwritten
 *            with the current point after every line minimization.
 *            The objective is evaluated as <(*func)(x, n, prm)>. If
 *            <dfunc> is non-NULL, the gradient is obtained with
 *            <(*dfunc)(x, n, prm, dx)> and negated; if <dfunc> is
 *            NULL, numeric_derivative() is called instead. <prm> is
 *            passed through untouched to both callbacks.
 *
 *            Each iteration picks an initial step along the current
 *            direction <cg> as the smallest <|u[i] / cg[i]|> over all
 *            dimensions, so the first step moves no coordinate by
 *            more than its <u[i]>. That step seeds bracket(), and
 *            brent() then minimizes along the line.
 *
 *            <wrk> must provide room for four n-vectors of doubles
 *            (4 * n * sizeof(double)); it is carved into the negative
 *            gradient, the conjugate direction, and two scratch
 *            vectors.
 *
 *            Iteration stops when the relative change in f(x),
 *            <2|f_old - f_new| / (1e-10 + |f_old| + |f_new|)>, is
 *            <= <tol>, or when the search direction is exactly zero in
 *            every dimension. The 1e-10 term keeps the ratio defined
 *            when the minimum is at exactly f() = 0.
 *
 * Args:      x        - initial guess n-vector; RETURN: last point reached
 *            u        - "units": per-dimension bound on the initial bracketing step
 *            n        - dimensionality of all vectors
 *            *func()  - function for computing objective function f(x)
 *            *dfunc() - function for computing a gradient at x, or NULL
 *            prm      - void ptr to any data/params func,dfunc need 
 *            tol      - convergence criterion applied to f(x)
 *            wrk      - allocated 4xn-vector for workspace
 *            ret_fx   - optRETURN: f(x) at the last point reached
 *
 * Returns:   <eslOK> on success.
 *            <eslENOHALT> (through ESL_FAIL with no error buffer) if
 *            MAXITERATIONS iterations pass without meeting a stopping
 *            test; <x> and <ret_fx> still hold the last point.
 *
 * Throws:    <eslERANGE> if f(x) is +/-infinity at the starting point
 *            or after a line minimization.
 *
 * Xref:      STL9/101.
 */
int
esl_min_ConjugateGradientDescent(double *x, double *u, int n, 
       				 double (*func)(double *, int, void *),
				 void (*dfunc)(double *, int, void *, double *),
				 void *prm, double tol, double *wrk, double *ret_fx)
{
  double *grad = wrk;          /* negated gradient at the current point           */
  double *dir  = wrk + n;      /* current conjugate search direction              */
  double *tmpa = wrk + 2*n;    /* scratch for bracket(), then the new gradient    */
  double *tmpb = wrk + 3*n;    /* line-minimum point, then the new direction      */
  double  prev_fx;             /* f() at the previous point                       */
  double  fx;                  /* f() at the current point                        */
  double  beta;                /* Polak-Ribiere coefficient                       */
  double  cvg;                 /* relative change in f() for the convergence test */
  double  lo, mid, hi;         /* bracketing abscissae along <dir>                */
  double  f_lo, f_mid, f_hi;   /* f() at the bracketing abscissae                 */
  int     iter;                /* iteration counter                               */
  int     d;                   /* dimension counter                               */

  /* Starting value; an infinite one means the caller or the start point is broken. */
  prev_fx = (*func)(x, n, prm);
  if (prev_fx == eslINFINITY || prev_fx == -eslINFINITY)
    ESL_EXCEPTION(eslERANGE, "minimum not finite");

  /* First search direction is the (negated) gradient itself. */
  if (dfunc == NULL)
    numeric_derivative(x, u, n, func, prm, 1e-4, grad);
  else
    {
      (*dfunc)(x, n, prm, grad);
      esl_vec_DScale(grad, n, -1.0);
    }
  esl_vec_DCopy(grad, n, dir);

  /* Failsafe: with an all-zero direction there is nowhere to go. */
  for (d = 0; d < n && dir[d] == 0.; d++) ;
  if (d == n)
    {
      if (ret_fx != NULL) *ret_fx = prev_fx;
      return eslOK;
    }

  for (iter = 0; iter < MAXITERATIONS; iter++)
    {
      /* Initial step: the smallest |u/dir| ratio over all dimensions. */
      mid = fabs(u[0] / dir[0]);
      for (d = 1; d < n; d++)
        {
          hi = fabs(u[d] / dir[d]);
          if (hi < mid) mid = hi;
        }

      /* Bracket the minimum along <dir>, then minimize along that line. */
      bracket(x, dir, n, mid, func, prm, tmpa,
              &lo,   &mid,   &hi,
              &f_lo, &f_mid, &f_hi);
      brent(x, dir, n, func, prm, lo, hi, 1e-3, 1e-8, tmpb, NULL, &fx);
      esl_vec_DCopy(tmpb, n, x);

      if (fx == eslINFINITY || fx == -eslINFINITY)
        ESL_EXCEPTION(eslERANGE, "minimum not finite");

      /* Gradient at the new point goes into tmpa for now. */
      if (dfunc == NULL)
        numeric_derivative(x, u, n, func, prm, 1e-4, tmpa);
      else
        {
          (*dfunc)(x, n, prm, tmpa);
          esl_vec_DScale(tmpa, n, -1.0);
        }

      /* Polak-Ribiere: beta = (g_new - g_old).g_new / g_old.g_old */
      beta = 0.;
      for (d = 0; d < n; d++)
        beta += (tmpa[d] - grad[d]) * tmpa[d];
      beta /= esl_vec_DDot(grad, grad, n);

      /* New direction = new gradient + beta * old direction, built in tmpb. */
      esl_vec_DCopy(tmpa, n, tmpb);
      esl_vec_DAddScaled(tmpb, dir, beta, n);

      /* Commit both for the next iteration. */
      esl_vec_DCopy(tmpa, n, grad);
      esl_vec_DCopy(tmpb, n, dir);

      /* Relative change in f(); the 1e-10 term covers a minimum at exactly f()=0. */
      cvg = 2.0 * fabs((prev_fx-fx)) / (1e-10 + fabs(prev_fx) + fabs(fx));

#if eslDEBUGLEVEL >= 2
      printf("\nesl_min_ConjugateGradientDescent():\n");
      printf("new point:     ");
      for (d = 0; d < n; d++)
        printf("%g ", x[d]);

      printf("\nnew gradient:    ");
      for (d = 0; d < n; d++)
        printf("%g ", grad[d]);

      numeric_derivative(x, u, n, func, prm, 1e-4, tmpa);
      printf("\n(numeric grad):  ");
      for (d = 0; d < n; d++)
        printf("%g ", tmpa[d]);

      printf("\nnew direction: ");
      for (d = 0; d < n; d++)
        printf("%g ", dir[d]);

      printf("\nOld f() = %g    New f() = %g    Convergence = %g\n\n", prev_fx, fx, cvg);
#endif

      if (cvg <= tol) break;

      /* Second failsafe: stop if the new direction is zero everywhere. */
      for (d = 0; d < n && dir[d] == 0.; d++) ;
      if (d == n) break;

      prev_fx = fx;
    }

  if (ret_fx != NULL) *ret_fx = fx;

  /* Falling out of the loop without a break means no stopping test was met. */
  if (iter == MAXITERATIONS)
    ESL_FAIL(eslENOHALT, NULL, " ");

  return eslOK;
}
Beispiel #29
0
/* Function: DispatchSqAlignment()
 * Date:     EPN, Thu Jan 12 14:47:26 2012
 *
 * Purpose:  Given a CM and a sequence, align the sequence(s) using
 *           the appropriate alignment function and return relevant
 *           data for eventual output in <ret_data>. 
 *
 *           This function can be called from either an alignment
 *           pipeline (i.e. cmalign) or a search/scan pipeline
 *           (i.e. cmsearch or cmscan). <idx> is the (overloaded) flag
 *           for determining which, if -1, we're a search/scan
 *           pipeline. This is only relevant because in a search/scan
 *           pipeline we don't care about determining spos/epos so we
 *           don't call ParsetreeToCMBounds().
 *                        
 *           If (cm->align_opts & CM_ALIGN_XTAU) we'll potentially tighten
 *           HMM bands until the required DP matrices are below our
 *           limit (<mxsize>). cm->maxtau is the max allowed tau value
 *           during this iterative band tightening, and cm->xtau is
 *           the factor by which we multiply cm->tau at each iteration
 *           during band tightening.
 *
 * Args:     cm         - the covariance model
 *           errbuf     - char buffer for reporting errors
 *           sq         - sequence to align
 *           idx        - index of sequence (may be used to reorder data later)
 *           mxsize     - max size in Mb of allowable DP mx 
 *           mode       - preset mode of alignment (TRMODE_UNKNOWN if unknown)
 *           pass_idx   - pipeline pass index, determines trunc penalty
 *           cp9b_valid - TRUE if cm->cp9b are valid, don't compute HMM bands
 *           w          - stopwatch for timing individual stages, can be NULL
 *           w_tot      - stopwatch for timing total time per seq, can be NULL
 *           r          - RNG, req'd if CM_ALIGN_SAMPLE, can be NULL otherwise
 *           ret_data   - RETURN: newly created CM_ALNDATA object
 *
 * Returns:  eslOK on success;
 *           eslEINCOMPAT on contract violation, errbuf is filled;
 *           eslEMEM if we run out of memory;
 *           <ret_data> is alloc'ed and filled.
 */
int
DispatchSqAlignment(CM_t *cm, char *errbuf, ESL_SQ *sq, int64_t idx, float mxsize, char mode, int pass_idx, 
		    int cp9b_valid, ESL_STOPWATCH *w, ESL_STOPWATCH *w_tot, ESL_RANDOMNESS *r, CM_ALNDATA **ret_data)
{
  int           status;            /* easel status */
  CM_ALNDATA   *data         = NULL; /* CM_ALNDATA we'll create and fill */
  float         sc           = 0.;   /* score from alignment function */
  float         pp           = 0.;   /* average PP from alignment function */
  Parsetree_t  *tr           = NULL; /* ptr to a parsetree */
  char         *ppstr        = NULL; /* ptr to a PP string */
  float         secs_bands   = 0.;   /* seconds elapsed for band calculation */
  float         secs_aln     = 0.;   /* seconds elapsed for alignment calculation */
  float         mb_tot       = 0.;   /* size of all DP matrices used for alignment */
  double        tau          = -1.;  /* tau used for calculating bands */
  float         thresh1      = -1.;  /* cp9b->thresh1 used for calculating bands */
  float         thresh2      = -1.;  /* cp9b->thresh2 used for calculating bands */
  int           spos         = -1;   /* start posn: first non-gap CM consensus position */
  int           epos         = -1;   /* end   posn: final non-gap CM consensus position */
  double        save_tau     = cm->tau; /* cm->tau upon entrance, we restore before leaving */
  float         save_thresh1 = (cm->cp9b == NULL) ? -1. : cm->cp9b->thresh1;
  float         save_thresh2 = (cm->cp9b == NULL) ? -1. : cm->cp9b->thresh2;

  /* alignment options */
  int do_nonbanded = (cm->align_opts & CM_ALIGN_NONBANDED) ? TRUE  : FALSE;
  int do_qdb       = (cm->align_opts & CM_ALIGN_QDB)       ? TRUE  : FALSE;
  int do_hbanded   = (do_nonbanded || do_qdb)              ? FALSE : TRUE;
  int do_optacc    = (cm->align_opts & CM_ALIGN_OPTACC)    ? TRUE  : FALSE;
  int do_sample    = (cm->align_opts & CM_ALIGN_SAMPLE)    ? TRUE  : FALSE;
  int do_post      = (cm->align_opts & CM_ALIGN_POST)      ? TRUE  : FALSE;
  int do_sub       = (cm->align_opts & CM_ALIGN_SUB)       ? TRUE  : FALSE;
  int do_small     = (cm->align_opts & CM_ALIGN_SMALL)     ? TRUE  : FALSE;
  int do_trunc     = (cm->align_opts & CM_ALIGN_TRUNC)     ? TRUE  : FALSE;
  int do_xtau      = (cm->align_opts & CM_ALIGN_XTAU)      ? TRUE  : FALSE;
  int doing_search = FALSE;

#if eslDEBUGLEVEL >= 1
  printf("in DispatchSqAlignment() %s\n", sq->name);
  printf("\tdo_nonbanded: %d\n", do_nonbanded);
  printf("\tdo_optacc:    %d\n", do_optacc);
  printf("\tdo_sample:    %d\n", do_sample);
  printf("\tdo_post:      %d\n", do_post);
  printf("\tdo_sub:       %d\n", do_sub);
  printf("\tdo_small:     %d\n", do_small);
  printf("\tdo_trunc:     %d\n", do_trunc);
  printf("\tdo_qdb:       %d\n", do_qdb);
  printf("\tdoing_search: %d\n", doing_search);
#endif
  
  /* sub-mode specific variables (wouldn't be needed if sub mode were not supported) */
  CM_t        *orig_cm = cm;      /* pointer to the original CM */
  CM_t        *sub_cm  = NULL;    /* the sub CM */
  CMSubMap_t  *submap  = NULL;    /* map from mother CM to sub CM, and vice versa */
  Parsetree_t *full_tr = NULL;    /* converted parsetree to full CM */

  /* contract check */
  if(do_small  && do_hbanded)       ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do small and HMM banded alignment");
  if(do_small  && do_optacc)        ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do small and opt acc alignment");
  if(do_post   && do_small)         ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do PP and small alignment");
  if(do_optacc && do_sample)        ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to sample and do optacc alignment");
  if(do_sub    && do_small)         ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do sub and small alignment");
  if(do_sub    && do_trunc)         ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do sub and truncated alignment");
  if(do_sample && r == NULL)        ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to sample but RNG r == NULL");
  if(do_xtau   && ! do_hbanded)     ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to multiply tau without HMM banded alignment");
  if(do_xtau   && cp9b_valid)       ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to multiply tau but HMM bands already valid");
  if(do_qdb    && do_nonbanded)     ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do qdb and nonbanded alignment");
  if(do_qdb    && do_trunc)         ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to use qdbs and truncated alignment");
  /* qdb + trunc combo disallowed only b/c no function exists for it yet */
  if(do_qdb    && (! do_small))     ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to use qdbs but not divide and conquer");
  /* qdb + non-small combo disallowed b/c the non-HMM banded, non-small alignment functions are not set up to use QDBs */
  if(do_qdb && cm->qdbinfo == NULL) { 
    ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to use qdbs but cm->qdbinfo is NULL");
  }
  if(do_qdb && (cm->qdbinfo->dmin2 == NULL || cm->qdbinfo->dmax2 == NULL)) { 
    ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to use qdbs but cm->qdbinfo is NULL");
  }
  if(do_trunc && (! cm_pli_PassAllowsTruncation(pass_idx))) { 
    ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() trying to do truncated alignment, but pass_idx doesn't allow truncation (PLI_PASS_STD_ANY)");
  }
  if(pass_idx == PLI_PASS_STD_ANY && (mode == TRMODE_L || mode == TRMODE_R || mode == TRMODE_T)) { 
    ESL_XFAIL(eslEINCOMPAT, errbuf, "DispatchSqAlignment() mode is L, R, or T, but pass_idx is PLI_PASS_STD_ANY");
  }

  if(w_tot != NULL) esl_stopwatch_Start(w_tot);

  /* do sub-mode specific pre-alignment steps, if nec */
  if(do_sub) { 
    if((status = sub_alignment_prep(cm, errbuf, sq, &submap, &sub_cm)) != eslOK) goto ERROR;
    cm = sub_cm;
  }

  if(w != NULL) esl_stopwatch_Start(w);
  /* do small D&C alignment, if nec */
  if(do_small) { 
    if(do_trunc) { 
      sc = TrCYK_DnC(cm, sq->dsq, sq->L, 0, 1, sq->L, pass_idx, FALSE, &tr); /* FALSE: don't reproduce 1.0 behavior */
      mb_tot = 4. * CYKNonQDBSmallMbNeeded(cm, sq->L); /* not sure how accurate this is */
    }
    else { 
      /* with QDB, always use dmin2/dmax2, the looser of the two sets of QDBs in cm->qdbinfo */
      sc = CYKDivideAndConquer(cm, sq->dsq, sq->L, 0, 1, sq->L, &tr, 
			       (do_qdb) ? cm->qdbinfo->dmin2 : NULL, 
			       (do_qdb) ? cm->qdbinfo->dmax2 : NULL);
      mb_tot = CYKNonQDBSmallMbNeeded(cm, sq->L);
    }
  }
  else { /* do_small is FALSE */
    if(do_nonbanded || do_qdb) { /* do not use HMM bands */
      if(do_trunc) { 
	if((status = cm_TrAlignSizeNeeded(cm, errbuf, sq->L, mxsize, do_sample, do_post, 
					  NULL, NULL, NULL, &mb_tot)) != eslOK) goto ERROR;
	if((status = cm_TrAlign(cm, errbuf, sq->dsq, sq->L, mxsize, mode, pass_idx, 
				do_optacc, do_sample, cm->trnb_mx, cm->trnb_shmx, cm->trnb_omx, 
				cm->trnb_emx, r, do_post ? &ppstr : NULL, &tr, NULL, &pp, &sc)) != eslOK) goto ERROR;
      }
      else {
	if((status = cm_AlignSizeNeeded(cm, errbuf, sq->L, mxsize, do_sample, do_post, 
					NULL, NULL, NULL, &mb_tot)) != eslOK) goto ERROR;
	if((status = cm_Align(cm, errbuf, sq->dsq, sq->L, mxsize, do_optacc, do_sample, cm->nb_mx, cm->nb_shmx, 
			      cm->nb_omx, cm->nb_emx, r, do_post ? &ppstr : NULL, &tr, &pp, &sc)) != eslOK) goto ERROR;
      }
    }
    else { /* use HMM bands */
      if(! cp9b_valid) { 
	if(do_xtau) { /* multiply tau (if nec) until required mx is below Mb limit (mxsize) */
	  if((status = cp9_IterateSeq2Bands(cm, errbuf, sq->dsq, 1, sq->L, pass_idx, mxsize, doing_search, do_sample, do_post, 
					    cm->maxtau, NULL)) != eslOK) goto ERROR;
	}
	else {
	  if((status = cp9_Seq2Bands(cm, errbuf, cm->cp9_mx, cm->cp9_bmx, cm->cp9_bmx, sq->dsq, 
				     1, sq->L, cm->cp9b, doing_search, pass_idx, 0)) != eslOK) goto ERROR;
	}
	if(w != NULL) esl_stopwatch_Stop(w);
	secs_bands = (w == NULL) ? 0. : w->elapsed;
	tau     = cm->tau; 
	thresh1 = cm->cp9b->thresh1;
	thresh2 = cm->cp9b->thresh2;
	/* note: we don't set these three if cp9b_valid is TRUE */
      }
      
      if(w != NULL) esl_stopwatch_Start(w);
      if(do_trunc) { 
	if((status = cm_TrAlignSizeNeededHB(cm, errbuf, sq->L, mxsize, do_sample, do_post, 
					    NULL, NULL, NULL, &mb_tot)) != eslOK) goto ERROR;
      	if((status = cm_TrAlignHB(cm, errbuf, sq->dsq, sq->L, mxsize, mode, pass_idx, 
				  do_optacc, do_sample, cm->trhb_mx, cm->trhb_shmx, cm->trhb_omx, 
				  cm->trhb_emx, r, do_post ? &ppstr : NULL, &tr, NULL, &pp, &sc)) != eslOK) goto ERROR;
      }
      else { 
	if((status = cm_AlignSizeNeededHB(cm, errbuf, sq->L, mxsize, do_sample, do_post, 
					  NULL, NULL, NULL, &mb_tot)) != eslOK) goto ERROR;
	if((status = cm_AlignHB(cm, errbuf, sq->dsq, sq->L, mxsize, do_optacc, do_sample, cm->hb_mx, cm->hb_shmx, 
				cm->hb_omx, cm->hb_emx, r, do_post ? &ppstr : NULL, &tr, &pp, &sc)) != eslOK) goto ERROR;
      }
      /* add size of CP9 matrices used for calculating bands */
      mb_tot += ((float) cm->cp9_mx->ncells_valid  * sizeof(int)) / 1000000.;
      mb_tot += ((float) cm->cp9_bmx->ncells_valid * sizeof(int)) / 1000000.;
      if(do_sub) { /* add size of original CM's CP9 matrices used for calculating start/end position */
	mb_tot += ((float) orig_cm->cp9_mx->ncells_valid  * sizeof(int)) / 1000000.;
	mb_tot += ((float) orig_cm->cp9_bmx->ncells_valid * sizeof(int)) / 1000000.;
      }
    }
  }
  if(w != NULL) esl_stopwatch_Stop(w);
  secs_aln = (w == NULL) ? 0. : w->elapsed;

  if(do_sub) { 
    /* convert sub cm parsetree to a full CM parsetree */
    if((status = sub_cm2cm_parsetree(orig_cm, cm, &full_tr, tr, submap, 0)) != eslOK) ESL_XFAIL(status, errbuf, "out of memory, converting sub parsetree to full parsetree");
    /* free sub data structures, we're done with them */
    FreeParsetree(tr);   tr     = full_tr;
    FreeCM(cm);          cm     = orig_cm;
    FreeSubMap(submap);  submap = NULL;
  }
  
  /* determine start and end points of the parsetree, 
   * but only if we're not in a search/scan pipeline 
   */
  if(idx != -1) { /* we're not in a search/scan pipeline */
    if((status = ParsetreeToCMBounds(cm, tr, TRUE, TRUE, errbuf, NULL, NULL, NULL, NULL, &spos, &epos)) != eslOK) goto ERROR;
  }
  
  /* create and fill data */
  ESL_ALLOC(data, sizeof(CM_ALNDATA));
  data->sq         = sq;
  data->idx        = idx;
  data->tr         = tr;
  data->sc         = sc;
  data->pp         = (do_post)      ? pp     : 0.;
  data->ppstr      = (do_post)      ? ppstr  : NULL;
  data->spos       = spos;
  data->epos       = epos;
  data->secs_bands = (do_nonbanded) ? 0.     : secs_bands;
  data->secs_aln   = secs_aln;
  data->mb_tot     = mb_tot;
  data->tau        = tau;
  data->thresh1    = thresh1;
  data->thresh2    = thresh2;
  if(w_tot != NULL) esl_stopwatch_Stop(w_tot);
  data->secs_tot   = (w_tot == NULL) ? 0. : w_tot->elapsed;

  *ret_data = data;

  cm->tau = save_tau;
  if(cm->cp9b != NULL) { 
    cm->cp9b->thresh1 = save_thresh1;
    cm->cp9b->thresh2 = save_thresh2;
  }
  return eslOK;

 ERROR: 
  cm->tau = save_tau;
  if(cm->cp9b != NULL) { 
    cm->cp9b->thresh1 = save_thresh1;
    cm->cp9b->thresh2 = save_thresh2;
  }
  if(data != NULL) cm_alndata_Destroy(data, FALSE);
  *ret_data = NULL;

  if(status == eslEMEM) ESL_FAIL(status, errbuf, "DispatchSqAlignment(), out of memory");

  return status; 
}
/**
 * int main(int argc, char **argv)
 * Main driver.
 *
 * Expects exactly two command-line arguments: an input HMM file and an
 * output HMM file. For each HMM read from the input file:
 *   1. the transition vectors of nodes 1..M-1 are summed, the sum is
 *      renormalized per transition group, and the result is copied back
 *      into every one of those nodes (so all internal nodes share the
 *      same averaged transition distribution);
 *   2. the modified HMM is validated and written to the output file;
 *   3. one line of summary statistics is printed to stdout.
 *
 * Any failure (bad command line, unreadable input, unwritable output,
 * invalid HMM) is reported and terminates the program with a nonzero
 * exit status; on success the program exits with status 0.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS     *go         = NULL;   /* command line processing                   */
  ESL_ALPHABET    *abc        = NULL;   /* alphabet, set by the first HMM read       */
  char            *hmmfile    = NULL;   /* name of the input HMM file                */
  char            *outhmmfile = NULL;   /* name of the output HMM file               */
  P7_HMMFILE      *hfp        = NULL;   /* open input HMM file                       */
  FILE            *outhmmfp   = NULL;   /* HMM output file handle                    */
  P7_HMM          *hmm        = NULL;   /* current HMM                               */
  P7_BG           *bg         = NULL;   /* null model, created from first alphabet   */
  int              nhmm;                /* number of HMMs processed so far           */
  double           x;                   /* mean position relative entropy            */
  float            KL;                  /* composition KL distance                   */
  int              status;
  char             errbuf[eslERRBUFSIZE];

  float average_internal_transitions[ p7H_NTRANSITIONS ];
  int   k;

  /* Process the command line options.
   */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || 
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  if (esl_opt_GetBoolean(go, "-h") == TRUE) 
    {
      profillic_p7_banner(stdout, argv[0], banner);
      esl_usage(stdout, argv[0], usage);
      puts("\nOptions:");
      esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/
      exit(0);
    }
  if (esl_opt_ArgNumber(go) != 2) 
    {
      puts("Incorrect number of command line arguments.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) 
    {
      puts("Failed to read <input hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((outhmmfile = esl_opt_GetArg(go, 2)) == NULL) 
    {
      puts("Failed to read <output hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  profillic_p7_banner(stdout, argv[0], banner);
  
  /* Initializations: open the input HMM file for reading
   */
  status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);  

  /* Initializations: open the output HMM file for writing.
   * Report the failure and exit nonzero; <status> is eslOK here, so
   * returning it would make the program look like it had succeeded.
   */
  if ((outhmmfp = fopen(outhmmfile, "w")) == NULL)
    p7_Fail("Failed to open HMM file %s for writing", outhmmfile);

  /* Main body: read HMMs one at a time, print one line of stats
   */
  printf("#\n");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "idx",  "name",                 "accession",    "nseq",     "eff_nseq", "M",      "relent", "info",   "p relE", "compKL");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------");

  nhmm = 0;
  while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) 
    {
      if      (status == eslEOD)       esl_fatal("read failed, HMM file %s may be truncated?", hmmfile);
      else if (status == eslEFORMAT)   esl_fatal("bad file format in HMM file %s",             hmmfile);
      else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets",   hmmfile);
      else if (status != eslOK)        esl_fatal("Unexpected error in reading HMMs from %s",   hmmfile);
      nhmm++;

      if (bg == NULL) bg = p7_bg_Create(abc);

      /* Accumulate the transition vectors of nodes 1..M-1 (nodes 0 and M
       * are left untouched), then renormalize each transition group of
       * the sum separately so each group is again a distribution.
       */
      esl_vec_FSet(average_internal_transitions, p7H_NTRANSITIONS, 0.);
      for (k = 1; k < hmm->M; k++) {
        esl_vec_FAdd(average_internal_transitions, hmm->t[k], p7H_NTRANSITIONS);
      }
      // Match transitions
      esl_vec_FNorm(average_internal_transitions, 3);
      // Insert transitions
      esl_vec_FNorm(average_internal_transitions + 3, 2);
      // Delete transitions
      esl_vec_FNorm(average_internal_transitions + 5, 2);
      // Ok now set them.
      for (k = 1; k < hmm->M; k++) {
        esl_vec_FCopy(average_internal_transitions, p7H_NTRANSITIONS, hmm->t[k]);
      }

      /* Validate and save; on failure, print the diagnostic instead of
       * silently returning a status code from main().
       */
      if ((status = p7_hmm_Validate(hmm, errbuf, 0.0001)) != eslOK)
        p7_Fail("HMM %s failed validation after averaging transitions:\n%s\n", hmm->name, errbuf);
      if ((status = p7_hmmfile_WriteASCII(outhmmfp, -1, hmm)) != eslOK)
        p7_Fail("HMM save failed (error %d) while writing to %s", status, outhmmfile);
  
      p7_MeanPositionRelativeEntropy(hmm, bg, &x); 
      p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL);

      printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f\n",
             nhmm,
             hmm->name,
             hmm->acc == NULL ? "-" : hmm->acc,
             hmm->nseq,
             hmm->eff_nseq,
             hmm->M,
             p7_MeanMatchRelativeEntropy(hmm, bg),
             p7_MeanMatchInfo(hmm, bg),
             x,
             KL);

      p7_hmm_Destroy(hmm);
    }

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  p7_hmmfile_Close(hfp);
  /* fclose() flushes buffered output; a failure here means the output
   * HMM file may be incomplete, so it must not be ignored.
   */
  if (fclose(outhmmfp) != 0) p7_Fail("Failed to close HMM file %s after writing", outhmmfile);
  esl_getopts_Destroy(go);
  exit(0);
}