Ejemplo n.º 1
0
/* The gradient of the NLL w.r.t. each free parameter in p.
 */
static void
hyperexp_complete_gradient(double *p, int np, void *dptr, double *dp)
{
  struct hyperexp_data *data = (struct hyperexp_data *) dptr;
  ESL_HYPEREXP         *h    = data->h;
  double pdf;
  int i,k;
  int pidx;			
  
  hyperexp_unpack_paramvector(p, np, h);
  esl_vec_DSet(dp, np, 0.);
  for (i = 0; i < data->n; i++)
    {
      /* FIXME: I think the calculation below may need to be done
       * in log space, to avoid underflow errors; see complete_binned_gradient()
       */
      /* Precalculate q_k PDF_k(x) terms, and their sum */
      for (k = 0; k < h->K; k++)
	h->wrk[k] = h->q[k] * esl_exp_pdf(data->x[i], h->mu, h->lambda[k]);
      pdf = esl_vec_DSum(h->wrk, h->K);

      pidx = 0;
      if (! h->fixmix) {
	for (k = 1; k < h->K; k++) /* generic d/dQ solution for mixture models */
	  dp[pidx++] -= h->wrk[k]/pdf - h->q[k];
      }
      
      for (k = 0; k < h->K; k++)
	if (! h->fixlambda[k])
	  dp[pidx++] -= (1.-h->lambda[k]*(data->x[i]-h->mu))*h->wrk[k]/pdf; /* d/dw */
    }
}
Ejemplo n.º 2
0
/* Function:  esl_rmx_E2Q()
 * Incept:    SRE, Tue Jul 13 15:52:41 2004 [St. Louis]
 *
 * Purpose:   Given a lower triangular matrix ($j<i$) of 
 *            residue exchangeabilities <E>, and a stationary residue
 *            frequency vector <pi>; assuming $E_{ij} = E_{ji}$;
 *            calculates a rate matrix <Q> as
 *            
 *            $Q_{ij} = E_{ij} * \pi_j$
 *            
 *            The resulting <Q> is not normalized to any particular
 *            number of substitutions/site/time unit. See
 *            <esl_rmx_ScaleTo()> for that.
 *            
 * Args:      E     - symmetric residue "exchangeabilities";
 *                    only lower triangular entries are used.
 *            pi    - residue frequencies at stationarity. 
 *            Q     - RETURN: rate matrix, square (NxN). 
 *                    Caller allocates the memory for this.
 *                    
 * Returns:   <eslOK> on success; Q is calculated and filled in.
 * 
 * Xref:      STL8/p56.
 */
int
esl_rmx_E2Q(ESL_DMATRIX *E, double *pi, ESL_DMATRIX *Q)
{
  int          i,j;

  if (E->n != Q->n) ESL_EXCEPTION(eslEINVAL, "E and Q sizes differ");

  /* Scale all off-diagonals to pi[j] * E[i][j].
   */
  for (i = 0; i < E->n; i++)
    for (j = 0; j < i; j++)	/* only look at lower triangle of E. */
      {
	Q->mx[i][j] = pi[j] * E->mx[i][j]; 
	Q->mx[j][i] = pi[i] * E->mx[i][j];
      }

  /* Set diagonal to  -\sum of all j != i.
   */
  for (i = 0; i < Q->n; i++)
    {
      Q->mx[i][i] = 0.;		/* makes the vector sum work for j != i */
      Q->mx[i][i] = -1. * esl_vec_DSum(Q->mx[i], Q->n);
    }
  return eslOK;
}
Ejemplo n.º 3
0
/* Function:  esl_rmx_SetWAG()
 * Incept:    SRE, Thu Mar  8 18:00:00 2007 [Janelia]
 *
 * Purpose:   Sets a $20 \times 20$ rate matrix <Q> to WAG parameters.
 *            The caller allocated <Q>.
 *
 *            If <pi> is non-<NULL>, it provides a vector of 20 amino
 *            acid stationary probabilities in Easel alphabetic order,
 *            A..Y, and the WAG stationary probabilities are set to
 *            these desired $\pi_i$. If <pi> is <NULL>, the default
 *            WAG stationary probabilities are used.
 *            
 *            The WAG parameters are a maximum likelihood
 *            parameterization obtained by Whelan and Goldman
 *            \citep{WhelanGoldman01}.
 *            
 * Note:      The data table was reformatted from wag.dat by the UTILITY1
 *            executable in the paml module. The wag.dat file was obtained from
 *            \url{http://www.ebi.ac.uk/goldman/WAG/wag.dat}. A copy
 *            is in formats/wag.dat.
 *
 * Args:      Q   - a 20x20 rate matrix to set, allocated by caller.
 *            pi  - desired stationary probabilities A..Y, or
 *                  NULL to use WAG defaults.
 *
 * Returns:   <eslOK> on success.
 * 
 * Throws:    <eslEINVAL> if <Q> isn't a 20x20 general matrix; and
 *            the state of <Q> is undefined.
 */
int
esl_rmx_SetWAG(ESL_DMATRIX *Q, double *pi)
{
  static double wagE[190] = {
    1.027040, 0.738998, 0.030295, 1.582850, 0.021352, 6.174160, 0.210494, 0.398020, 0.046730, 0.081134,
    1.416720, 0.306674, 0.865584, 0.567717, 0.049931, 0.316954, 0.248972, 0.930676, 0.570025, 0.679371,
    0.249410, 0.193335, 0.170135, 0.039437, 0.127395, 1.059470, 0.030450, 0.138190, 0.906265, 0.074034,
    0.479855, 2.584430, 0.088836, 0.373558, 0.890432, 0.323832, 0.397915, 0.384287, 0.084805, 0.154263,
    2.115170, 0.061304, 0.499462, 3.170970, 0.257555, 0.893496, 0.390482, 0.103754, 0.315124, 1.190630,
    0.174100, 0.404141, 4.257460, 0.934276, 4.854020, 0.509848, 0.265256, 5.429420, 0.947198, 0.096162,
    1.125560, 3.956290, 0.554236, 3.012010, 0.131528, 0.198221, 1.438550, 0.109404, 0.423984, 0.682355,
    0.161444, 0.243570, 0.696198, 0.099929, 0.556896, 0.415844, 0.171329, 0.195081, 0.908598, 0.098818,
    0.616783, 5.469470, 0.099921, 0.330052, 4.294110, 0.113917, 3.894900, 0.869489, 1.545260, 1.543640,
    0.933372, 0.551571, 0.528191, 0.147304, 0.439157, 0.102711, 0.584665, 2.137150, 0.186979, 5.351420,
    0.497671, 0.683162, 0.635346, 0.679489, 3.035500, 3.370790, 1.407660, 1.071760, 0.704939, 0.545931,
    1.341820, 0.740169, 0.319440, 0.967130, 0.344739, 0.493905, 3.974230, 1.613280, 1.028870, 1.224190,
    2.121110, 0.512984, 0.374866, 0.822765, 0.171903, 0.225833, 0.473307, 1.458160, 1.386980, 0.326622,
    1.516120, 2.030060, 0.795384, 0.857928, 0.554413, 4.378020, 2.006010, 1.002140, 0.152335, 0.588731,
    0.649892, 0.187247, 0.118358, 7.821300, 0.305434, 1.800340, 2.058450, 0.196246, 0.314887, 0.301281,
    0.251849, 0.232739, 1.388230, 0.113133, 0.717070, 0.129767, 0.156557, 1.529640, 0.336983, 0.262569,
    0.212483, 0.137505, 0.665309, 0.515706, 0.071917, 0.139405, 0.215737, 1.163920, 0.523742, 0.110864,
    0.365369, 0.240735, 0.543833, 0.325711, 0.196303, 6.454280, 0.103604, 3.873440, 0.420170, 0.133264,
    0.398618, 0.428437, 1.086000, 0.216046, 0.227710, 0.381533, 0.786993, 0.291148, 0.314730, 2.485390};
  static double wagpi[20];
  int i,j,z;
  
  if (Q->m != 20 || Q->n != 20 || Q->type != eslGENERAL)
    ESL_EXCEPTION(eslEINVAL, "Q must be a 20x20 general matrix");
  esl_composition_WAG(wagpi);

  /* 1. Transfer the wag E lower triagonal matrix directly into Q. */
  z = 0;
  for (i = 0; i < 20; i++)
    {
      Q->mx[i][i] = 0.; /* code below depends on this zero initialization */
      for (j = 0; j < i; j++) {
	Q->mx[i][j] = wagE[z++];
	Q->mx[j][i] = Q->mx[i][j];
      }
    }

  /* 2. Set offdiagonals Q_ij = E_ij * pi_j */
  for (i = 0; i < 20; i++)
    for (j = 0; j < 20; j++)
      if (pi != NULL) Q->mx[i][j] *= pi[j];
      else            Q->mx[i][j] *= wagpi[j];

  /* 3. Set diagonal Q_ii to -\sum_{i \neq j} Q_ij */
  for (i = 0; i < 20; i++)
    Q->mx[i][i] = -1. * esl_vec_DSum(Q->mx[i], 20);
  
  /* 4. Renormalize matrix to units of 1 substitution/site. */
  if (pi != NULL) esl_rmx_ScaleTo(Q, pi,    1.0);
  else            esl_rmx_ScaleTo(Q, wagpi, 1.0);

  return eslOK;
}
Ejemplo n.º 4
0
/* Function:  esl_vec_DNorm()
 * Synopsis:  Normalize probability vector.           
 *
 * Purpose:   Normalizes a probability vector <vec>,
 *            such that $\sum_{i=1}{n} \mathrm{vec}_i = 1.0$.
 *            
 *            <esl_vec_FNorm()> does the same, for a probability vector
 *            of floats.
 */
void
esl_vec_DNorm(double *vec, int n)
{
  int    x;
  double sum;

  sum = esl_vec_DSum(vec, n);
  if (sum != 0.0) for (x = 0; x < n; x++) vec[x] /= sum;
  else            for (x = 0; x < n; x++) vec[x] = 1. / (double) n;
}
Ejemplo n.º 5
0
/* Function:  esl_hyperexp_Read()
 *
 * Purpose:   Reads hyperexponential parameters from an open <e>.
 *            which is an <ESL_FILEPARSER> tokenizer for an open stream.
 *            
 *            The first token is <K>, the number of mixture components.
 *            The second token is <mu>, the x offset shared by all components.
 *            Then for each mixture component <k=1..K>, it reads
 *            a mixture coefficient <q[k]> and a decay parameter
 *            <lambda[k]>.
 *            
 *            The <2K+2> data tokens must occur in this order, but
 *            they can be grouped into any number of lines, because the
 *            parser ignores line breaks.
 *            
 *            Anything after a <\#> character on a line is a comment, and
 *            is ignored.
 *            
 * Returns:   <eslOK> on success, and <ret_hxp> points to a new <ESL_HYPEREXP>
 *            object.
 *            <eslEFORMAT> on "normal" parse failure caused by a bad file 
 *            format that's likely the user's fault.
 *
 * Throws:    <eslEMEM> if allocation of the new <ESL_HYPEREXP> fails.
 *
 * 
 * FIXME: All our mixture models (esl_dirichlet, for example) should be
 *        reconciled w/ identical interfaces & behaviour.
 */
int
esl_hyperexp_Read(ESL_FILEPARSER *e, ESL_HYPEREXP **ret_hxp)
{
  ESL_HYPEREXP   *hxp = NULL;
  char           *tok;
  int             status = eslOK;
  int             nc;
  int             k;
  double          sum;

  esl_fileparser_SetCommentChar(e, '#');

  if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
  nc = atoi(tok);
  if (nc < 1) {  
    sprintf(e->errbuf, "Expected # of components K >= 1 as first token");
    goto ERROR;
  }

  if ((hxp = esl_hyperexp_Create(nc)) == NULL) return eslEMEM; /* percolation */
  
  if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
  hxp->mu = atof(tok);

  for (k = 0; k < hxp->K; k++)
    {
      if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
      hxp->q[k] = atof(tok);
      
      if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
      hxp->lambda[k] = atof(tok);

      if (hxp->q[k] < 0. || hxp->q[k] > 1.) {
	sprintf(e->errbuf, "Expected a mixture coefficient q[k], 0<=q[k]<=1");
	goto ERROR;
      }
      if (hxp->lambda[k] <= 0.) {
	sprintf(e->errbuf, "Expected a lambda parameter, lambda>0");
	goto ERROR;
      }
    }
  sum = esl_vec_DSum(hxp->q, hxp->K);
  if (fabs(sum-1.0) > 0.05) {
    sprintf(e->errbuf, "Expected mixture coefficients to sum to 1");
    goto ERROR;
  }
  esl_vec_DNorm(hxp->q, hxp->K);
  *ret_hxp = hxp;
  return eslOK;

 ERROR:
  esl_hyperexp_Destroy(hxp); 
  return eslEFORMAT;
}
Ejemplo n.º 6
0
/* Function:  esl_rmx_SetHKY()
 * Incept:    SRE, Thu Aug 12 08:26:39 2004 [St. Louis]
 *
 * Purpose:   Given stationary base composition <pi> for ACGT, and
 *            transition and transversion relative rates <alpha> and
 *            <beta> respectively, sets the matrix <Q> to be the
 *            corresponding HKY (Hasegawa/Kishino/Yano) DNA rate
 *            matrix, scaled in units of 1t= 1.0 substitutions/site
 *            \citep{Hasegawa85}.        
 *
 * Args:      pi     - stationary base composition A..T
 *            alpha  - relative transition rate
 *            beta   - relative transversion rate
 *                     
 *
 * Returns:   <eslOK> 
 *
 * Xref:      
 */
int
esl_rmx_SetHKY( ESL_DMATRIX *Q, double *pi, double alpha, double beta)
{
  int i,j;

  if (Q->m != 4 || Q->n != 4 || Q->type != eslGENERAL)
    ESL_EXCEPTION(eslEINVAL, "Q must be a 4x4 general matrix");
  
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 4; j++)
      {
	if (i != j)  Q->mx[i][j] = ((i+j)%2)? pi[j]*beta : pi[j]*alpha; /* even=0=transition;odd=1=transversion */
	else         Q->mx[i][j] = 0.;
      }
    Q->mx[i][i] =  -1. * esl_vec_DSum(Q->mx[i], 4);
  }
  esl_rmx_ScaleTo(Q, pi, 1.0);
  return eslOK;
}
Ejemplo n.º 7
0
/* Function:  esl_rmx_ValidateP()
 * Incept:    SRE, Sun Mar 11 10:30:50 2007 [Janelia]
 *
 * Purpose:   Validates a conditional probability matrix <P>, whose
 *            elements $P_{ij}$ represent conditional probabilities
 *            $P(j \mid i)$; for example in a first-order Markov
 *            chain, or a continuous-time Markov transition process
 *            where <P> is for a particular $t$.
 *            
 *            Rows must sum to one, and each element $P_{ij}$ is a
 *            probability $0 \leq P_{ij} \leq 1$.
 *            
 *            <tol> specifies the floating-point tolerance to which
 *            the row sums must equal one: <fabs(sum-1.0) <= tol>.
 *            
 *            <errbuf> is an optional error message buffer. The caller
 *            may pass <NULL> or a pointer to a buffer of at least
 *            <eslERRBUFSIZE> characters.
 *            
 * Args:      P      - matrix to validate
 *            tol    - floating-point tolerance (0.00001, for example)      
 *            errbuf - OPTIONAL: ptr to an error buffer of at least
 *                     <eslERRBUFSIZE> characters.
 *
 * Returns:   <eslOK> on successful validation. 
 *            <eslFAIL> on failure, and if a non-<NULL> <errbuf> was
 *            provided by the caller, a message describing
 *            the reason for the failure is put there.
 *
 * Throws:    (no abnormal error conditions)
 */
int
esl_rmx_ValidateP(ESL_DMATRIX *P, double tol, char *errbuf)
{
  int    i,j;
  double sum;

  if (P->type != eslGENERAL) ESL_EXCEPTION(eslEINVAL, "P must be type eslGENERAL to be validated");

  for (i = 0; i < P->n; i++)
    {
      sum = esl_vec_DSum(P->mx[i], P->m);
      if (fabs(sum-1.0) > tol) ESL_FAIL(eslFAIL, errbuf, "row %d does not sum to 1.0", i);
      
      for (j = 0; j < P->m; j++)
	if (P->mx[i][j] < 0.0 || P->mx[i][j] > 1.0)
	  ESL_FAIL(eslFAIL, errbuf, "element %d,%d is not a probability (%f)", i,j,P->mx[i][j]);
    }
  return eslOK;
}
Ejemplo n.º 8
0
/* Function:  esl_rmx_SetF81()
 * Incept:    SRE, Thu Mar 15 13:33:30 2007 [Janelia]
 *
 * Purpose:   Sets a 4x4 rate matrix to the F81 model (aka
 *            equal-input model) given stationary base 
 *            compositions <pi>, 
 *            scaled to units of 1t = 1.0 substitutions/site.
 */
int
esl_rmx_SetF81(ESL_DMATRIX *Q, double *pi)
{
  int i,j;

  if (Q->m != 4 || Q->n != 4 || Q->type != eslGENERAL)
    ESL_EXCEPTION(eslEINVAL, "Q must be a 4x4 general matrix");
  
  for (i = 0; i < 4; i++) {
    for (j = 0; j < 4; j++)
      {
	if (i != j) Q->mx[i][j] = pi[j];
	else        Q->mx[i][j] = 0.0;
      }
    Q->mx[i][i] =  -1. * esl_vec_DSum(Q->mx[i], 4);
  }
  esl_rmx_ScaleTo(Q, pi, 1.0);
  return eslOK;
}
Ejemplo n.º 9
0
/* dump_residue_info
 *                   
 * Given an MSA, print out the number of sequences with
 * a non-gap residue in each column of the alignment.
 */
static int dump_residue_info(FILE *fp, ESL_ALPHABET *abc, double **abc_ct, int use_weights, int nali, int64_t alen, int nseq, int *i_am_rf, char *msa_name, char *alifile, char *errbuf)
{
  int apos, rfpos;
  double rct, gct;

  fprintf(fp, "# Insert information:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx:  %d\n", nali);
  if(msa_name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa_name); }
  fprintf(fp, "# Number of sequences: %d\n", nseq);
  if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else            { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");
  if(i_am_rf != NULL) { 
    fprintf(fp, "# %7s  %7s  %10s  %8s  %10s  %8s\n", "rfpos",    "alnpos",  "numres",    "freqres",  "numgap",     "freqgap");
    fprintf(fp, "# %7s  %7s  %10s  %8s  %10s  %8s\n", "-------", "-------", "----------", "--------", "----------", "--------");
  }  
  else { 
    fprintf(fp, "# %7s  %10s  %8s  %10s  %8s\n", "alnpos",  "numres",   "freqres", "numgap",    "freqgap");
    fprintf(fp, "# %7s  %10s  %8s  %10s  %8s\n", "-------", "----------", "--------", "----------", "--------");
  }
 
  rfpos = 0;
  for(apos = 0; apos < alen; apos++) {
    rct = esl_vec_DSum(abc_ct[apos], abc->K);
    gct = abc_ct[apos][abc->K];
    if(i_am_rf != NULL) { 
      if(i_am_rf[apos]) { 
	fprintf(fp, "  %7d", rfpos+1);
	rfpos++; 
      }
      else { 
	fprintf(fp, "  %7s", "-");
      }
    }
    fprintf(fp, "  %7d  %10.1f  %8.6f  %10.1f  %8.6f\n", apos+1, rct, rct / (float) nseq, gct, gct / (float) nseq);
  }
  fprintf(fp, "//\n");

  return eslOK;
}
Ejemplo n.º 10
0
/* dump_posterior_column_info
 *                   
 * Dump per-column posterior probability data to a file.
 *
 */      
static int dump_posterior_column_info(FILE *fp, double **pp_ct, int use_weights, int nali, int64_t alen, int nseq, int *i_am_rf, char *msa_name, char *alifile, char *errbuf)
{
  int    p,apos;     /* counters over sequences, columns of MSA */
  int    nppvals = 12;
  int    rfpos;
  double nnongap;
  double sum;
  float ppavgA[11];
  char ppstring[12] = "0123456789*.";

  ppavgA[0]  = 0.025;  
  ppavgA[1]  = 0.10;
  ppavgA[2]  = 0.20;
  ppavgA[3]  = 0.30;
  ppavgA[4]  = 0.40;
  ppavgA[5]  = 0.50;
  ppavgA[6]  = 0.60;
  ppavgA[7]  = 0.70;
  ppavgA[8]  = 0.80;
  ppavgA[9]  = 0.90;
  ppavgA[10] = 0.975;

  fprintf(fp, "# Posterior probability stats per column:\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx:  %d\n", nali);
  if(msa_name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa_name); }
  fprintf(fp, "# Number of sequences: %d\n", nseq);
  if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else            { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");

  fprintf(fp, "# %6s", "alnpos");
  if(i_am_rf != NULL) fprintf(fp, "  %6s", "rfpos");
  fprintf(fp, "  %9s", "nnongap");  
  for(p = 0; p < nppvals; p++) { 
    fprintf(fp, "  %9c", ppstring[p]);
  }
  fprintf(fp, "  %9s\n", "avgPP");

  fprintf(fp, "# %6s  %6s  %9s", "------", "------", "---------");
  for(p = 0; p < nppvals; p++) { 
    fprintf(fp, "  %9s", "---------");
  }
  fprintf(fp, "  %9s\n", "---------");

  rfpos = 1;
  for(apos = 0; apos < alen; apos++) { 
    sum = 0;
    fprintf(fp, "  %6d", apos+1);
    if(i_am_rf != NULL) { 
      if(i_am_rf[apos]) fprintf(fp, "  %6d", rfpos++);
      else              fprintf(fp, "  %6s", "-");
    }
    nnongap = esl_vec_DSum(pp_ct[apos], 11);
    fprintf(fp, "  %9.1f", nnongap);
    for(p = 0; p < nppvals; p++) { 
      fprintf(fp, "  %9.1f", pp_ct[apos][p]);
      if(p <= 10) sum += pp_ct[apos][p] * ppavgA[p];
    }
    fprintf(fp, "  %.5f\n", sum / nnongap);
  }
  fprintf(fp, "//\n");

  return eslOK;
}
Ejemplo n.º 11
0
/* dump_infocontent_info
 *                   
 * Given an MSA with RF annotation, dump information content per column data to 
 * an open output file.
 */
static int dump_infocontent_info(FILE *fp, ESL_ALPHABET *abc, double **abc_ct, int use_weights, int nali, int64_t alen, int nseq, int *i_am_rf, char *msa_name, char *alifile, char *errbuf)
{
  int status;
  int apos, rfpos;
  double bg_ent;
  double *bg = NULL;
  double *abc_freq = NULL;
  double nnongap;

  ESL_ALLOC(bg, sizeof(double) * abc->K);
  esl_vec_DSet(bg, abc->K, 1./(abc->K));
  bg_ent = esl_vec_DEntropy(bg, abc->K);
  free(bg);

  ESL_ALLOC(abc_freq, sizeof(double) * abc->K);


  fprintf(fp, "# Information content per column (bits):\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx:  %d\n", nali);
  if(msa_name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa_name); }
  fprintf(fp, "# Number of sequences: %d\n", nseq);
  if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else            { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");

  if(i_am_rf != NULL) { 
    fprintf(fp, "# %7s  %7s  %10s  %10s\n", "rfpos",    "alnpos",  "freqnongap", "info(bits)");
    fprintf(fp, "# %7s  %7s  %10s  %10s\n", "-------", "-------",  "----------", "----------");
  }  
  else { 
    fprintf(fp, "# %7s  %10s  %10s\n", "alnpos",  "freqnongap", "info(bits)");
    fprintf(fp, "# %7s  %10s  %10s\n", "-------", "----------", "----------");
  }

  rfpos = 0;
  for(apos = 0; apos < alen; apos++) {
    if(i_am_rf != NULL) { 
      if(i_am_rf[apos]) { 
	fprintf(fp, "  %7d", rfpos+1);
	rfpos++; 
      }
      else { 
	fprintf(fp, "  %7s", "-");
      }
    }
    nnongap = esl_vec_DSum(abc_ct[apos], abc->K);
    esl_vec_DCopy(abc_ct[apos], abc->K, abc_freq);
    esl_vec_DNorm(abc_freq, abc->K);
    fprintf(fp, "  %7d  %10.8f  %10.8f\n", apos+1, 
	    nnongap / (nnongap + abc_ct[apos][abc->K]),
	    (bg_ent - esl_vec_DEntropy(abc_freq, abc->K)));
  }
  fprintf(fp, "//\n");

  if(abc_freq != NULL) free(abc_freq);

  return eslOK;

 ERROR:
  ESL_FAIL(eslEINVAL, errbuf, "out of memory");
  return status; /* NEVERREACHED */
}
Ejemplo n.º 12
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go      = NULL;	               /* application configuration       */
  ESL_ALPHABET *abc     = NULL;      	       /* biological alphabet             */
  char         *alifile = NULL;	               /* alignment file name             */
  int           fmt     = eslMSAFILE_UNKNOWN;  /* format code for alifile         */
  ESLX_MSAFILE *afp     = NULL;		       /* open msa file                   */
  ESL_MSAFILE2 *old_afp = NULL;	               /* open msa file, legacy (--small) */
  ESL_MSA      *msa     = NULL;	               /* one multiple sequence alignment */
  int           nali;		               /* number of alignments read       */
  int           i;		               /* counter over seqs               */
  int64_t       alen;		               /* alignment length                */
  int           nseq;                          /* number of sequences in the msa */
  int64_t       rlen;		               /* a raw (unaligned) seq length    */
  int64_t       small, large;	               /* smallest, largest sequence      */
  int64_t       nres;		               /* total # of residues in msa      */
  double        avgid;		               /* average fractional pair id      */
  int           max_comparisons;               /* maximum # comparisons for avg id */
  int           do_stall;                      /* used to stall when debugging     */
  double      **abc_ct = NULL;                 /* [0..msa->alen-1][0..abc->K] number of each residue at each position (abc->K is gap) */
  double     ***bp_ct  = NULL;                 /* [0..msa->alen-1][0..abc->Kp-1][0..abc->Kp-1] per (non-pknotted) consensus basepair *
						* count of each possible basepair over all seqs basepairs are indexed by 'i' the minimum *
						* of 'i:j' for a pair between i and j, where i < j. */
  double       **pp_ct = NULL;                 /* [0..msa->alen-1][0..11], count of each posterior probability (PP) code, over all sequences, gap is 11 */  
  int  *i_am_rf = NULL;                        /* [0..i..msa->alen-1]: TRUE if pos i is non-gap RF posn, if msa->rf == NULL remains NULL */
  int  *rf2a_map = NULL;                       /* [0..rfpos..rflen-1] = apos,                     
						* apos is the alignment position (0..msa->alen-1) that     
						* is non-gap RF position rfpos+1 (for rfpos in 0..rflen-1) */
  int rflen = -1;                              /* nongap RF length */
  char          errbuf[eslERRBUFSIZE];
  int           status;		               /* easel return code               */

  /* optional output files */
  FILE *iinfofp  = NULL; /* output file for --iinfo */
  FILE *pcinfofp = NULL; /* output file for --pcinfo */
  FILE *psinfofp = NULL; /* output file for --psinfo */
  FILE *rinfofp  = NULL; /* output file for --rinfo */
  FILE *icinfofp = NULL; /* output file for --icinfo */
  FILE *listfp   = NULL; /* output file for --list */
  FILE *cinfofp  = NULL; /* output file for --cinfo */
  FILE *bpinfofp = NULL; /* output file for --bpinfo */
  int use_weights;       /* TRUE if --weight, reported weighted counts (using msa->wgt) to all output files */
  int weights_exist;     /* TRUE if at least one msa->wgt value differs from 1.0, FALSE if not (or if msa->wgt==NULL) */ 

  /***********************************************
   * Parse command line
   ***********************************************/

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if (esl_opt_GetBoolean(go, "-h") )
    {
      esl_banner(stdout, argv[0], banner);
      esl_usage (stdout, argv[0], usage);
      puts("\n where options are:");
      esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
      puts("\n small memory mode, requires --amino,--dna, or --rna and --informat pfam:");
      esl_opt_DisplayHelp(stdout, go, 2, 2, 80);
      puts("\n optional output files:");
      esl_opt_DisplayHelp(stdout, go, 3, 2, 80);
      exit(0);
    }

  if (esl_opt_ArgNumber(go) != 1) 
    {
      printf("Incorrect number of command line arguments.\n");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  alifile = esl_opt_GetArg(go, 1);

  if (esl_opt_IsOn(go, "--informat") &&
      (fmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"))) == eslMSAFILE_UNKNOWN)
    esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat")); 
    
  if (esl_opt_GetBoolean(go, "--small") && fmt != eslMSAFILE_PFAM) esl_fatal("--small requires --informat pfam\n"); 

  max_comparisons = 1000;

  do_stall = esl_opt_GetBoolean(go, "--stall"); /* a stall point for attaching gdb */
  while (do_stall); 

  /***********************************************
   * Open the MSA file; determine alphabet; set for digital input
   ***********************************************/

  if      (esl_opt_GetBoolean(go, "--amino"))   abc = esl_alphabet_Create(eslAMINO);
  else if (esl_opt_GetBoolean(go, "--dna"))     abc = esl_alphabet_Create(eslDNA);
  else if (esl_opt_GetBoolean(go, "--rna"))     abc = esl_alphabet_Create(eslRNA);

  /* We'd like to get rid of the legacy msafile interface, but it
   * includes small memory functionality for Pfam format which we have
   * to replace first. For now, use both interfaces, new and legacy
   */
  if ( esl_opt_GetBoolean(go, "--small") )
    {
      if (! abc) esl_fatal("--small requires one of --amino, --dna, --rna be specified.");

      status = esl_msafile2_OpenDigital(abc, alifile, NULL, &old_afp);
      if      (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile);
      else if (status == eslEFORMAT)   esl_fatal("Couldn't determine format of alignment %s\n", alifile);
      else if (status != eslOK)        esl_fatal("Alignment file open failed with error %d\n", status);
    }
  else
    {
      if ( (status = eslx_msafile_Open(&abc, alifile, NULL, fmt, NULL, &afp)) != eslOK)
	eslx_msafile_OpenFailure(afp, status);
    }

  /**************************************
   * Open optional output files, as nec *
   **************************************/
  /* determine name for first list file, if nec */
  if( esl_opt_IsOn(go, "--list")) {
    if ((listfp = fopen(esl_opt_GetString(go, "--list"), "w")) == NULL) 
      esl_fatal("Failed to open --list output file %s\n", esl_opt_GetString(go, "--list"));
  }
  if( esl_opt_IsOn(go, "--icinfo")) {
    if ((icinfofp = fopen(esl_opt_GetString(go, "--icinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --icinfo output file %s\n", esl_opt_GetString(go, "--icinfo"));
  }
  if( esl_opt_IsOn(go, "--rinfo")) {
    if ((rinfofp = fopen(esl_opt_GetString(go, "--rinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --rinfo output file %s\n", esl_opt_GetString(go, "--rinfo"));
  }
  if( esl_opt_IsOn(go, "--pcinfo")) {
    if ((pcinfofp = fopen(esl_opt_GetString(go, "--pcinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --pcinfo output file %s\n", esl_opt_GetString(go, "--pcinfo"));
  }
  if( esl_opt_IsOn(go, "--psinfo")) {
    if ((psinfofp = fopen(esl_opt_GetString(go, "--psinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --psinfo output file %s\n", esl_opt_GetString(go, "--psinfo"));
  }
  if( esl_opt_IsOn(go, "--iinfo")) {
    if ((iinfofp = fopen(esl_opt_GetString(go, "--iinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --iinfo output file %s\n", esl_opt_GetString(go, "--iinfo"));
  }
  if( esl_opt_IsOn(go, "--cinfo")) {
    if ((cinfofp = fopen(esl_opt_GetString(go, "--cinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --cinfo output file %s\n", esl_opt_GetString(go, "--cinfo"));
  }
  if( esl_opt_IsOn(go, "--bpinfo")) {
    if ((bpinfofp = fopen(esl_opt_GetString(go, "--bpinfo"), "w")) == NULL) 
      esl_fatal("Failed to open --bpinfo output file %s\n", esl_opt_GetString(go, "--bpinfo"));
  }

  /***********************************************
   * Read MSAs one at a time.
   ***********************************************/

  if (esl_opt_GetBoolean(go, "-1")) {
    puts("#");
    if(! esl_opt_GetBoolean(go, "--small")) { 
      printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "idx", "name", "format", "nseq", "alen", "nres", "small", "large", "avlen", "%id");
      printf("# %-4s %-20s %10s %7s %7s %12s %6s %6s %10s %3s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "------", "------", "----------", "---");
    }
    else { 
      printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "idx", "name", "format", "nseq", "alen", "nres", "avlen");
      printf("# %-4s %-20s %10s %7s %7s %12s %10s\n", "----", "--------------------", "----------", "-------", "-------", "------------", "----------");
    }
  }

  nali = 0;
  
  fmt = (esl_opt_GetBoolean(go, "--small") ? old_afp->format : afp->format);

  while ( (status = ( esl_opt_GetBoolean(go, "--small") ? 
		      esl_msafile2_ReadInfoPfam(old_afp, listfp, abc, -1, NULL, NULL, &msa, &nseq, &alen, NULL, NULL, NULL, NULL, NULL, &abc_ct, &pp_ct, NULL, NULL, NULL) :
		      eslx_msafile_Read        (afp, &msa))) == eslOK)
    { 
      nali++;
      nres = 0;

      if (! esl_opt_GetBoolean(go, "--small")) { 
	nseq = msa->nseq;
	alen = msa->alen;
	small = large = -1;
	for (i = 0; i < msa->nseq; i++)
	  {
	    rlen  = esl_abc_dsqrlen(msa->abc, msa->ax[i]);
	    nres += rlen;
	    if (small == -1 || rlen < small) small = rlen;
	    if (large == -1 || rlen > large) large = rlen;
	  }

	esl_dst_XAverageId(abc, msa->ax, msa->nseq, max_comparisons, &avgid);
      }
      else { /* --small invoked */
	for(i = 0; i < alen; i++) nres += (int) esl_vec_DSum(abc_ct[i], abc->K);
      }

      if (esl_opt_GetBoolean(go, "-1")) 
	{
	  printf("%-6d %-20s %10s %7d %7" PRId64 " %12" PRId64, 
		 nali, 
		 msa->name,
		 eslx_msafile_DecodeFormat(fmt),
		 nseq,
		 alen,
		 nres);
	  if (! esl_opt_GetBoolean(go, "--small")) { 
	    printf(" %6" PRId64 " %6" PRId64 " %10.1f %3.0f\n",
		   small,
		   large,
		   (double) nres / (double) msa->nseq,
		   100.*avgid);
	  }
	  else { 
	    printf(" %10.1f\n", (double) nres / (double) nseq);
	  }
	}
      else
	{
	  printf("Alignment number:    %d\n",     nali);
	  if (msa->name != NULL)
	    printf("Alignment name:      %s\n",        msa->name); 
	  printf("Format:              %s\n",          eslx_msafile_DecodeFormat(fmt));
	  printf("Number of sequences: %d\n",          nseq);
	  printf("Alignment length:    %" PRId64 "\n", alen);
	  printf("Total # residues:    %" PRId64 "\n", nres);
	  if(! esl_opt_GetBoolean(go, "--small")) { 
	    printf("Smallest:            %" PRId64 "\n", small);
	    printf("Largest:             %" PRId64 "\n", large);
	  }
	  printf("Average length:      %.1f\n",        (double) nres / (double) nseq);
	  if(! esl_opt_GetBoolean(go, "--small")) { 
	    printf("Average identity:    %.0f%%\n",      100.*avgid); 
	  }
	  printf("//\n");
	}

      /* Dump data to optional output files, if nec */
      if(esl_opt_IsOn(go, "--list")) {
	if(! esl_opt_GetBoolean(go, "--small")) { 
	    /* only print sequence name to list file if ! --small, else we already have in esl_msafile2_ReadInfoPfam() */
	    for(i = 0; i < msa->nseq; i++) fprintf(listfp, "%s\n", msa->sqname[i]);
	}
      }

      /* if RF exists, get i_am_rf array[0..alen] which tells us which positions are non-gap RF positions
       * and rf2a_map, a map of non-gap RF positions to overall alignment positions */
      if(msa->rf != NULL) {
	if((status = map_rfpos_to_apos(msa, abc, errbuf, alen, &i_am_rf, &rf2a_map, &rflen)) != eslOK) esl_fatal(errbuf);
      }
      else i_am_rf = NULL;

      weights_exist = check_msa_weights(msa);
      use_weights   = (weights_exist && esl_opt_GetBoolean(go, "--weight")) ? TRUE : FALSE;
      
      if( (! esl_opt_GetBoolean(go, "--small")) && 
	  (esl_opt_IsOn(go, "--icinfo") || esl_opt_IsOn(go, "--rinfo")  || esl_opt_IsOn(go, "--pcinfo") || 
	   esl_opt_IsOn(go, "--cinfo")  || esl_opt_IsOn(go, "--bpinfo")))  {
	/* collect counts of each residue and PPs (if they exist) from the msa */
	if(esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL) esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali);
	if((status = count_msa(msa, errbuf, nali, 
			       esl_opt_GetBoolean(go, "--noambig"), /* ignore ambiguous residues? */
			       esl_opt_GetBoolean(go, "--weight"),  /* use msa->wgt sequence weights? */
			       &abc_ct, 
			       ((bpinfofp != NULL && msa->ss_cons != NULL) ? &bp_ct : NULL), /* get basepair counts? */
			       (msa->pp != NULL ? &pp_ct : NULL)))   /* get PP counts? */
	   != eslOK) esl_fatal(errbuf);
      }

      if( esl_opt_IsOn(go, "--icinfo")) {
	if((status = dump_infocontent_info(icinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }
      if( esl_opt_IsOn(go, "--rinfo")) {
	if((status = dump_residue_info(rinfofp, abc, abc_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }
      if(esl_opt_IsOn(go, "--pcinfo")) {
	if(pp_ct == NULL) esl_fatal("Error: --pcinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali);
	if((status = dump_posterior_column_info(pcinfofp, pp_ct, use_weights, nali, alen, nseq, i_am_rf, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }
      if(esl_opt_IsOn(go, "--psinfo")) {
	if(msa->pp == NULL) esl_fatal("Error: --psinfo requires all alignments have #=GR PP annotation, but alignment %d does not", nali);
	if((status = dump_posterior_sequence_info(psinfofp, msa, nali, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }
      if( esl_opt_IsOn(go, "--iinfo")) {
	if(msa->rf == NULL) esl_fatal("--iinfo requires all alignments have #=GC RF annotation, but alignment %d does not", nali);
	if(esl_opt_GetBoolean(go, "--weight") && msa->wgt == NULL) esl_fatal("--weight requires all alignments have #=GS WT annotation, but aln %d does not", nali);
	if((status = dump_insert_info(iinfofp, msa, use_weights, nali, i_am_rf, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }
      if( esl_opt_IsOn(go, "--cinfo")) {
	if((status = dump_column_residue_counts(cinfofp, abc, abc_ct, esl_opt_GetBoolean(go, "--noambig"), use_weights, nali, alen, nseq, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }
      if( esl_opt_IsOn(go, "--bpinfo")) {
	if(msa->ss_cons == NULL) esl_fatal("--bpinfo requires all alignments have #=GC SS_cons annotation, but alignment %d does not", nali);
	if((status = dump_basepair_counts(bpinfofp, msa, abc, bp_ct, use_weights, nali, nseq, msa->name, alifile, errbuf) != eslOK)) esl_fatal(errbuf);
      }

      esl_msa_Destroy(msa);
      if(abc_ct != NULL)   { esl_Free2D((void **) abc_ct, alen);          abc_ct   = NULL; }
      if(bp_ct != NULL)    { esl_Free3D((void ***) bp_ct, alen, abc->Kp); bp_ct    = NULL; }
      if(pp_ct != NULL)    { esl_Free2D((void **) pp_ct, alen);           pp_ct    = NULL; }
      if(i_am_rf != NULL)  { free(i_am_rf);                               i_am_rf  = NULL; }
      if(rf2a_map != NULL) { free(rf2a_map);                              rf2a_map = NULL; }
    }
  
  /* If an msa read failed, we've dropped out to here with an informative status code. 
   * we have to handle failures from new vs. legacy msa parsing differently 
   */
  if (esl_opt_GetBoolean(go, "--small")) 
    {
      if      (status == eslEFORMAT)  esl_fatal("Alignment file parse error, line %d of file %s:\n%s\nOffending line is:\n%s\n", old_afp->linenumber, old_afp->fname, old_afp->errbuf, old_afp->buf);	
      else if (status != eslEOF)      esl_fatal("Alignment file read failed with error code %d\n", status);
      else if (nali   == 0)           esl_fatal("No alignments found in file %s\n", alifile);
    }
  else
    {
      if (nali == 0 || status != eslEOF) eslx_msafile_ReadFailure(afp, status);
    }

  /* Cleanup, normal return
   */
  if(listfp != NULL) { 
    fclose(listfp);
    printf("# List of sequences in %d alignment(s) saved to file %s\n", nali, esl_opt_GetString(go, "--list"));
  }
  if(icinfofp != NULL) { 
    fclose(icinfofp);
    printf("# Information content data saved to file %s.\n", esl_opt_GetString(go, "--icinfo")); 
  }
  if(rinfofp != NULL) { 
    fclose(rinfofp);
    printf("# Residue data saved to file %s.\n", esl_opt_GetString(go, "--rinfo")); 
  }
  if(pcinfofp != NULL) { 
    fclose(pcinfofp);
    printf("# Per-column posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--pcinfo")); 
  }
  if(psinfofp != NULL) { 
    fclose(psinfofp);
    printf("# Per-sequence posterior probability data saved to file %s.\n", esl_opt_GetString(go, "--psinfo")); 
  }
  if(iinfofp != NULL) { 
    printf("# Insert data saved to file %s.\n", esl_opt_GetString(go, "--iinfo")); 
    fclose(iinfofp);
  }
  if(cinfofp != NULL) { 
    printf("# Per-column counts data saved to file %s.\n", esl_opt_GetString(go, "--cinfo")); 
    fclose(cinfofp);
  }
  if(bpinfofp != NULL) { 
    printf("# Per-column basepair counts data saved to file %s.\n", esl_opt_GetString(go, "--bpinfo")); 
    fclose(bpinfofp);
  }


  if (afp)     eslx_msafile_Close(afp);
  if (old_afp) esl_msafile2_Close(old_afp);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}