Example #1
/* Function:  esl_hxp_FitGuess()
 *
 * Purpose:   Given a sorted vector of <n> observed data samples <x[]>,
 *            from smallest <x[0]> to largest <x[n-1]>, calculate a
 *            very crude guesstimate of a fit -- suitable only as a starting
 *            point for further optimization -- and return those parameters
 *            in <h>.
 *
 *            Assigns $q_k \propto \frac{1}{k}$ and $\mu = \min_i x_i$;
 *            splits $x$ into $K$ roughly equal-sized bins, and
 *            assigns $\lambda_k$ as the ML estimate from bin $k$.
 *            (If $q_k$ coefficients have already been fixed to 
 *            known values, this step is skipped.)
 */
int
esl_hxp_FitGuess(double *x, int n, ESL_HYPEREXP *h)
{
  double tmu;			/* current mu */
  double mean;			/* mean (x-tmu) in a bin */
  int    i,k;
  int    imin, imax;

  h->mu = x[0];  /* minimum */
  for (k = 0; k < h->K; k++)
    {
      if (! h->fixmix) 
	h->q[k] = 1 / (double)(k+1); /* priors ~ 1, 1/2, 1/3... */

      imin = (int) ((double)(k*n)/(double)h->K);
      imax = (int) ((double)((k+1)*n)/(double)h->K);
      tmu = x[imin];
      mean = 0.;
      for (i = imin; i < imax; i++)
	mean += x[i] - tmu;
      mean /= (double)(imax-imin);
      h->lambda[k] = 1 / mean;
    }
  esl_vec_DNorm(h->q, h->K);
  return eslOK;
}
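A minimal sketch (not part of Easel) of the prior initialization described above: build the unnormalized coefficients 1, 1/2, 1/3, ... for a hypothetical K, then normalize them with esl_vec_DNorm(), just as the loop in esl_hxp_FitGuess() does when the mixture is not fixed.

#include <stdio.h>
#include "easel.h"
#include "esl_vectorops.h"

/* Sketch: crude q_k ~ 1/k priors for K=3, then normalize to sum to 1.
 * Expected result: 6/11, 3/11, 2/11.
 */
int main(void)
{
  double q[3];
  int    K = 3;
  int    k;

  for (k = 0; k < K; k++) q[k] = 1.0 / (double)(k+1);  /* 1, 1/2, 1/3 */
  esl_vec_DNorm(q, K);                                 /* now sums to 1 */
  for (k = 0; k < K; k++) printf("q[%d] = %.4f\n", k, q[k]);
  return 0;
}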
Example #2
/* Function:  esl_vec_DLogNorm()
 * Synopsis:  Normalize a log p-vector, make it a p-vector.           
 * Incept:    SRE, Thu Apr  7 17:45:39 2005 [St. Louis]
 *
 * Purpose:   Given an unnormalized log probability vector <vec>   
 *            of length <n>, normalize it and make it a 
 *            probability vector. 
 *            
 *            <esl_vec_FLogNorm()> does the same, but for a vector
 *            of floats instead of doubles.
 *
 * Returns:   (void); <vec> is changed in place.
 */
void
esl_vec_DLogNorm(double *vec, int n)
{
  double denom;
  
  denom = esl_vec_DLogSum(vec, n);
  esl_vec_DIncrement(vec, n, -1.*denom);
  esl_vec_DExp (vec, n);
  esl_vec_DNorm(vec, n);
}
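A short usage sketch (illustrative, not from the library) of esl_vec_DLogNorm(): start with unnormalized log probabilities, normalize in log space, and the result is a probability vector summing to 1.

#include <math.h>
#include <stdio.h>
#include "easel.h"
#include "esl_vectorops.h"

int main(void)
{
  /* Unnormalized log weights; exp() of these are 1, 2, 4 -> probs 1/7, 2/7, 4/7 */
  double v[3] = { 0.0, log(2.0), log(4.0) };
  int    n    = 3;

  esl_vec_DLogNorm(v, n);    /* in place: log vector -> probability vector */
  printf("p = %.4f %.4f %.4f  (sum = %.4f)\n", v[0], v[1], v[2], esl_vec_DSum(v, n));
  return 0;
}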
Example #3
/* Function:  esl_hyperexp_Read()
 *
 * Purpose:   Reads hyperexponential parameters from an open <e>,
 *            which is an <ESL_FILEPARSER> tokenizer for an open stream.
 *            
 *            The first token is <K>, the number of mixture components.
 *            The second token is <mu>, the x offset shared by all components.
 *            Then for each mixture component <k=1..K>, it reads
 *            a mixture coefficient <q[k]> and a decay parameter
 *            <lambda[k]>.
 *            
 *            The <2K+2> data tokens must occur in this order, but
 *            they can be grouped into any number of lines, because the
 *            parser ignores line breaks.
 *            
 *            Anything after a <\#> character on a line is a comment, and
 *            is ignored.
 *            
 * Returns:   <eslOK> on success, and <ret_hxp> points to a new <ESL_HYPEREXP>
 *            object.
 *            <eslEFORMAT> on "normal" parse failure caused by a bad file 
 *            format that's likely the user's fault.
 *
 * Throws:    <eslEMEM> if allocation of the new <ESL_HYPEREXP> fails.
 *
 * 
 * FIXME: All our mixture models (esl_dirichlet, for example) should be
 *        reconciled w/ identical interfaces & behaviour.
 */
int
esl_hyperexp_Read(ESL_FILEPARSER *e, ESL_HYPEREXP **ret_hxp)
{
  ESL_HYPEREXP   *hxp = NULL;
  char           *tok;
  int             status = eslOK;
  int             nc;
  int             k;
  double          sum;

  esl_fileparser_SetCommentChar(e, '#');

  if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
  nc = atoi(tok);
  if (nc < 1) {  
    sprintf(e->errbuf, "Expected # of components K >= 1 as first token");
    goto ERROR;
  }

  if ((hxp = esl_hyperexp_Create(nc)) == NULL) return eslEMEM; /* percolation */
  
  if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
  hxp->mu = atof(tok);

  for (k = 0; k < hxp->K; k++)
    {
      if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
      hxp->q[k] = atof(tok);
      
      if ((status = esl_fileparser_GetToken(e, &tok, NULL)) != eslOK) goto ERROR;
      hxp->lambda[k] = atof(tok);

      if (hxp->q[k] < 0. || hxp->q[k] > 1.) {
	sprintf(e->errbuf, "Expected a mixture coefficient q[k], 0<=q[k]<=1");
	goto ERROR;
      }
      if (hxp->lambda[k] <= 0.) {
	sprintf(e->errbuf, "Expected a lambda parameter, lambda>0");
	goto ERROR;
      }
    }
  sum = esl_vec_DSum(hxp->q, hxp->K);
  if (fabs(sum-1.0) > 0.05) {
    sprintf(e->errbuf, "Expected mixture coefficients to sum to 1");
    goto ERROR;
  }
  esl_vec_DNorm(hxp->q, hxp->K);
  *ret_hxp = hxp;
  return eslOK;

 ERROR:
  esl_hyperexp_Destroy(hxp); 
  return eslEFORMAT;
}
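A hedged usage sketch for esl_hyperexp_Read(). The file name and its contents are hypothetical, chosen only to match the 2K+2 token layout described above; esl_fileparser_Open() is assumed to take (filename, envvar, &e) as in current Easel, and error handling is abbreviated.

/* hypothetical "test.hyperexp" parameter file ('#' comments, line breaks ignored):
 *
 *   # two-component hyperexponential
 *   2  -10.0
 *   0.75  0.56
 *   0.25  0.23
 */
#include <stdio.h>
#include "easel.h"
#include "esl_fileparser.h"
#include "esl_hyperexp.h"

int main(void)
{
  ESL_FILEPARSER *e   = NULL;
  ESL_HYPEREXP   *hxp = NULL;

  /* NOTE: 3-arg Open(filename, envvar, &e) assumed; older Easel versions take 2 args */
  if (esl_fileparser_Open("test.hyperexp", NULL, &e) != eslOK)
    esl_fatal("failed to open parameter file");

  if (esl_hyperexp_Read(e, &hxp) != eslOK)
    esl_fatal("parse failed: %s", e->errbuf);

  printf("K = %d, mu = %g\n", hxp->K, hxp->mu);

  esl_hyperexp_Destroy(hxp);
  esl_fileparser_Close(e);
  return 0;
}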
Example #4
#include <stdio.h>
#include <stdlib.h>

#include "easel.h"
#include "esl_vectorops.h"   /* esl_vec_* declarations */

int main(void)
{
  double *p;
  char    labels[] = "ACGT";
  int     n = 4;

  p = malloc(sizeof(double) * n);
  esl_vec_DSet(p, n, 1.0);
  esl_vec_DNorm(p, n);
  esl_vec_DDump(stdout, p, n, labels);
  free(p);
  return 0;
}
Example #5
/* Function:  esl_msaweight_BLOSUM()
 * Synopsis:  BLOSUM weights.
 * Incept:    SRE, Sun Nov  5 09:52:41 2006 [Janelia]
 *
 * Purpose:   Given a multiple sequence alignment <msa> and an identity
 *            threshold <maxid>, calculate sequence weights using the
 *            BLOSUM algorithm (Henikoff and Henikoff, PNAS
 *            89:10915-10919, 1992). These weights are stored
 *            internally in the <msa> object, replacing any weights
 *            that may have already been there. Weights are $\geq 0$
 *            and they sum to <msa->nseq>.
 *            
 *            The algorithm does a single linkage clustering by
 *            fractional id, defines clusters such that no two clusters
 *            have a pairwise link $\geq$ <maxid>, and assigns
 *            weights of $\frac{1}{M_i}$ to each of the $M_i$
 *            sequences in each cluster $i$. The <maxid> threshold
 *            is a fractional pairwise identity, in the range
 *            $0..1$.
 *            
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that the pairwise identity
 *            calculations deal with degenerate residue symbols
 *            properly.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified. 
 *            
 * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
 *            identity calculation fails because of corrupted sequence 
 *            data. In either case, the <msa> is unmodified.
 *
 * Xref:      [Henikoff92]; squid::weight.c::BlosumWeights().
 */
int
esl_msaweight_BLOSUM(ESL_MSA *msa, double maxid)
{
  int  *c    = NULL; /* cluster assignments for each sequence */
  int  *nmem = NULL; /* number of seqs in each cluster */
  int   nc;	     /* number of clusters  */
  int   i;           /* loop counter */
  int   status;

  /* Contract checks
   */
  ESL_DASSERT1( (maxid >= 0. && maxid <= 1.) );
  ESL_DASSERT1( (msa->nseq >= 1) );
  ESL_DASSERT1( (msa->alen >= 1) );
  if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

  if ((status = esl_msacluster_SingleLinkage(msa, maxid, &c, NULL, &nc)) != eslOK) goto ERROR;
  ESL_ALLOC(nmem, sizeof(int) * nc);
  esl_vec_ISet(nmem, nc, 0);
  for (i = 0; i < msa->nseq; i++) nmem[c[i]]++;
  for (i = 0; i < msa->nseq; i++) msa->wgt[i] = 1. / (double) nmem[c[i]];

  /* Make weights normalize up to nseq, and return.
   */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);	
  msa->flags |= eslMSA_HASWGTS;

  free(nmem);
  free(c);
  return eslOK;

 ERROR:
  if (c    != NULL) free(c);
  if (nmem != NULL) free(nmem);
  return status;
}
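A small worked sketch (only the two vector calls are Easel functions) of the BLOSUM weight rule described above. Suppose 4 hypothetical sequences cluster at <maxid> into one pair plus two singletons: the raw 1/M_i weights are {1/2, 1/2, 1, 1}, and the final normalize-and-scale step, mirroring the tail of esl_msaweight_BLOSUM(), rescales them to sum to nseq, giving {2/3, 2/3, 4/3, 4/3}.

#include <stdio.h>
#include "easel.h"
#include "esl_vectorops.h"

int main(void)
{
  /* Hypothetical single-linkage result for 4 seqs: cluster sizes M = {2, 2, 1, 1} */
  double wgt[4] = { 1./2., 1./2., 1., 1. };   /* 1/M_i per sequence */
  int    nseq   = 4;
  int    i;

  esl_vec_DNorm (wgt, nseq);                  /* weights sum to 1    */
  esl_vec_DScale(wgt, nseq, (double) nseq);   /* weights sum to nseq */
  for (i = 0; i < nseq; i++) printf("wgt[%d] = %.4f\n", i, wgt[i]);
  return 0;
}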
Example #6
static void
utest_pvectors(void)
{
  char  *msg   = "pvector unit test failed";
  double p1[4] = { 0.25, 0.25, 0.25, 0.25 };
  double p2[4];
  double p3[4];
  float  p1f[4]; 
  float  p2f[4] = { 0.0,   0.5, 0.5,  0.0  };
  float  p3f[4];
  int    n = 4;
  double result;

  esl_vec_D2F(p1,  n, p1f);
  esl_vec_F2D(p2f, n, p2);  

  if (esl_vec_DValidate(p1,  n, 1e-12, NULL) != eslOK) esl_fatal(msg);
  if (esl_vec_FValidate(p1f, n, 1e-7,  NULL) != eslOK) esl_fatal(msg);

  result = esl_vec_DEntropy(p1,  n);          if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg);
  result = esl_vec_FEntropy(p1f, n);          if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg);
  result = esl_vec_DEntropy(p2,  n);          if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg);
  result = esl_vec_FEntropy(p2f, n);          if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg);

  result = esl_vec_DRelEntropy(p2,  p1,  n);  if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg);
  result = esl_vec_FRelEntropy(p2f, p1f, n);  if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg);

  result = esl_vec_DRelEntropy(p1,  p2,  n);  if (result != eslINFINITY)  esl_fatal(msg);
  result = esl_vec_FRelEntropy(p1f, p2f, n);  if (result != eslINFINITY)  esl_fatal(msg);

  esl_vec_DLog(p2, n);
  if (esl_vec_DLogValidate(p2, n, 1e-12, NULL) != eslOK) esl_fatal(msg);
  esl_vec_DExp(p2, n);
  if (p2[0] != 0.) esl_fatal(msg);

  esl_vec_FLog(p2f, n);
  if (esl_vec_FLogValidate(p2f, n, 1e-7, NULL) != eslOK) esl_fatal(msg);
  esl_vec_FExp(p2f, n);
  if (p2f[0] != 0.) esl_fatal(msg);

  esl_vec_DCopy(p2, n, p3);
  esl_vec_DScale(p3, n, 10.);
  esl_vec_DNorm(p3, n);
  if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg);

  esl_vec_DLog(p3, n);
  result = esl_vec_DLogSum(p3, n); if (esl_DCompare(0.0, result, 1e-12) != eslOK) esl_fatal(msg);
  esl_vec_DIncrement(p3, n, 2.0);
  esl_vec_DLogNorm(p3, n);
  if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg);

  esl_vec_FCopy(p2f, n, p3f);
  esl_vec_FScale(p3f, n, 10.);
  esl_vec_FNorm(p3f, n);
  if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg);

  esl_vec_FLog(p3f, n);
  result = esl_vec_FLogSum(p3f, n); if (esl_DCompare(0.0, result, 1e-7) != eslOK) esl_fatal(msg);
  esl_vec_FIncrement(p3f, n, 2.0);
  esl_vec_FLogNorm(p3f, n);
  if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg);

  return;
}
Example #7
/* Function:  esl_msaweight_GSC()
 * Synopsis:  GSC weights.
 * Incept:    SRE, Fri Nov  3 13:31:14 2006 [Janelia]
 *
 * Purpose:   Given a multiple sequence alignment <msa>, calculate
 *            sequence weights according to the
 *            Gerstein/Sonnhammer/Chothia algorithm. These weights
 *            are stored internally in the <msa> object, replacing
 *            any weights that may have already been there. Weights
 *            are $\geq 0$ and they sum to <msa->nseq>.
 *            
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that distance calculations
 *            used by the GSC algorithm are robust against degenerate
 *            residue symbols.
 *
 *            This is an implementation of Gerstein et al., "A method to
 *            weight protein sequences to correct for unequal
 *            representation", JMB 236:1067-1078, 1994.
 *            
 *            The algorithm is $O(N^2)$ memory (it requires a pairwise
 *            distance matrix) and $O(N^3 + LN^2)$ time ($N^3$ for a UPGMA
 *            tree building step, $LN^2$ for distance matrix construction)
 *            for an alignment of N sequences and L columns. 
 *            
 *            In the current implementation, the actual memory
 *            requirement is dominated by two full NxN distance
 *            matrices (one tmp copy in UPGMA, and one here): for
 *            8-byte doubles, that's $16N^2$ bytes. To keep the
 *            calculation under memory limits, don't process large
 *            alignments: max 1400 sequences for 32 MB, max 4000
 *            sequences for 256 MB, max 8000 seqs for 1 GB. Watch
 *            out, because Pfam alignments can easily blow this up.
 *            
 * Note:      Memory usage could be improved. UPGMA consumes a distance
 *            matrix, but that can be D itself, not a copy, if the
 *            caller doesn't mind the destruction of D. Also, D is
 *            symmetrical, so we could use upper or lower triangular
 *            matrices if we rewrote dmatrix to allow them.
 *            
 *            I also think UPGMA can be reduced to O(N^2) time, by
 *            being more tricky about rapidly identifying the minimum
 *            element: could keep min of each row, and update that,
 *            I think.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified.  
 *
 * Throws:    <eslEINVAL> if the alignment data are somehow invalid and
 *            distance matrices can't be calculated. <eslEMEM> on an
 *            allocation error. In either case, the original <msa> is
 *            left unmodified.
 *
 * Xref:      [Gerstein94]; squid::weight.c::GSCWeights(); STL11/81.
 */
int
esl_msaweight_GSC(ESL_MSA *msa)
{
  ESL_DMATRIX *D = NULL;     /* distance matrix */
  ESL_TREE    *T = NULL;     /* UPGMA tree */
  double      *x = NULL;     /* storage per node, 0..N-2 */
  double       lw, rw;       /* total branchlen on left, right subtrees */
  double       lx, rx;	     /* distribution of weight to left, right side */
  int i;		     /* counter over nodes */
  int status;
  
  /* Contract checks
   */
  ESL_DASSERT1( (msa       != NULL) );
  ESL_DASSERT1( (msa->nseq >= 1)    );
  ESL_DASSERT1( (msa->alen >= 1)    );
  ESL_DASSERT1( (msa->wgt  != NULL) );
  if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

  /* GSC weights use a rooted tree with "branch lengths" calculated by
   * UPGMA on a fractional difference matrix - pretty crude.
   */
  if (! (msa->flags & eslMSA_DIGITAL)) {
    if ((status = esl_dst_CDiffMx(msa->aseq, msa->nseq, &D))         != eslOK) goto ERROR;
  } 
#ifdef eslAUGMENT_ALPHABET
  else {
    if ((status = esl_dst_XDiffMx(msa->abc, msa->ax, msa->nseq, &D)) != eslOK) goto ERROR;
  }
#endif

  /* oi, look out here.  UPGMA is correct, but old squid library uses
   * single linkage, so for regression tests ONLY, we use single link. 
   */
#ifdef  eslMSAWEIGHT_REGRESSION
  if ((status = esl_tree_SingleLinkage(D, &T)) != eslOK) goto ERROR; 
#else
  if ((status = esl_tree_UPGMA(D, &T)) != eslOK) goto ERROR; 
#endif
  esl_tree_SetCladesizes(T);	

  ESL_ALLOC(x, sizeof(double) * (T->N-1));
  
  /* Postorder traverse (leaves to root) to calculate the total branch
   * length under each internal node; store this in x[].  Remember the
   * total branch length (x[0]) for a future sanity check.
   */
  for (i = T->N-2; i >= 0; i--)
    {
      x[i] = T->ld[i] + T->rd[i];
      if (T->left[i]  > 0) x[i] += x[T->left[i]];
      if (T->right[i] > 0) x[i] += x[T->right[i]];
    }
  
  /* Preorder traverse (root to leaves) to calculate the weights.  Now
   * we use x[] to mean, the total weight *above* this node that we will
   * apportion to the node's left and right children. The two
   * meanings of x[] never cross: every x[] beneath x[i] is still a
   * total branch length.
   *
   * Because the API guarantees that msa is returned unmodified in case
   * of an exception, and we're touching msa->wgt here, no exceptions
   * may be thrown from now on in this function.
   */
  x[0] = 0;			/* initialize: no branch to the root. */
  for (i = 0; i <= T->N-2; i++)
    {
      lw = T->ld[i];   if (T->left[i]  > 0) lw += x[T->left[i]];
      rw = T->rd[i];   if (T->right[i] > 0) rw += x[T->right[i]];

      if (lw+rw == 0.) 
	{
	  /* A special case arises in GSC weights when all branch lengths in a subtree are 0.
	   * In this case, all seqs in this clade should get equal weights, sharing x[i] equally.
           * So, split x[i] in proportion to cladesize, not to branch weight.
	   */
	  if (T->left[i] > 0)  lx =  x[i] * ((double) T->cladesize[T->left[i]]  / (double) T->cladesize[i]);
	  else                 lx =  x[i] / (double) T->cladesize[i];

	  if (T->right[i] > 0) rx =  x[i] * ((double) T->cladesize[T->right[i]] / (double) T->cladesize[i]);
	  else                 rx =  x[i] / (double) T->cladesize[i];
	} 
      else /* normal case: x[i] split in proportion to branch weight. */
	{
	  lx = x[i] * lw/(lw+rw);
	  rx = x[i] * rw/(lw+rw);
	}
      
      if (T->left[i]  <= 0) msa->wgt[-(T->left[i])] = lx + T->ld[i];
      else                  x[T->left[i]] = lx + T->ld[i];

      if (T->right[i] <= 0) msa->wgt[-(T->right[i])] = rx + T->rd[i];
      else                  x[T->right[i]] = rx + T->rd[i];
    } 

  /* Renormalize weights to sum to N.
   */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
  msa->flags |= eslMSA_HASWGTS;

  free(x);
  esl_tree_Destroy(T);
  esl_dmatrix_Destroy(D);
  return eslOK;

 ERROR:
  if (x != NULL) free(x);
  if (T != NULL) esl_tree_Destroy(T);
  if (D != NULL) esl_dmatrix_Destroy(D);
  return status;
}
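A back-of-the-envelope check (plain C, no Easel calls) of the $16N^2$-byte memory estimate in the Purpose section above; it reproduces the ~32 MB / 256 MB / 1 GB guidance for 1400 / 4000 / 8000 sequences.

#include <stdio.h>

/* Two full NxN matrices of 8-byte doubles: ~16*N*N bytes, per the note above. */
int main(void)
{
  int nvals[3] = { 1400, 4000, 8000 };
  int i;

  for (i = 0; i < 3; i++) {
    double n     = (double) nvals[i];
    double bytes = 16.0 * n * n;
    printf("N = %5d : ~%.0f MB\n", nvals[i], bytes / 1e6);
  }
  return 0;
}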
Example #8
/* Function:  esl_msaweight_PB()
 * Synopsis:  PB (position-based) weights.
 * Incept:    SRE, Sun Nov  5 08:59:28 2006 [Janelia]
 *
 * Purpose:   Given a multiple alignment <msa>, calculate sequence
 *            weights according to the position-based weighting
 *            algorithm (Henikoff and Henikoff, JMB 243:574-578,
 *            1994). These weights are stored internally in the <msa>
 *            object, replacing any weights that may have already been
 *            there. Weights are $\geq 0$ and they sum to <msa->nseq>.
 *            
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that the algorithm
 *            deals with degenerate residue symbols properly.
 *            
 *            The Henikoffs' algorithm does not give rules for dealing
 *            with gaps or degenerate residue symbols. The rule here
 *            is to ignore them. This means that longer sequences
 *            initially get more weight; hence a "double
 *            normalization" in which the weights are first divided by
 *            sequence length in canonical residues (to compensate for
 *            that effect), then normalized to sum to nseq.
 *            
 *            An advantage of the PB method is efficiency.
 *            It is $O(1)$ in memory and $O(NL)$ time, for an alignment of
 *            N sequences and L columns. This makes it a good method 
 *            for ad hoc weighting of very deep alignments.
 *            
 *            When the alignment is in simple text mode, IUPAC
 *            degenerate symbols are not dealt with correctly; instead,
 *            the algorithm simply uses the 26 letters as "residues"
 *            (case-insensitively), and treats all other residues as
 *            gaps.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified. 
 *
 * Throws:    <eslEMEM> on allocation error, in which case <msa> is
 *            returned unmodified.
 *
 * Xref:      [Henikoff94b]; squid::weight.c::PositionBasedWeights().
 */
int
esl_msaweight_PB(ESL_MSA *msa)
{
  int    *nres = NULL;   	/* counts of each residue observed in a column */
  int     ntotal;		/* number of different symbols observed in a column */
  int     rlen;			/* number of residues in a sequence */
  int     idx, pos, i;
  int     K;			/* alphabet size */
  int     status;

  /* Contract checks
   */
  ESL_DASSERT1( (msa->nseq >= 1) );
  ESL_DASSERT1( (msa->alen >= 1) );
  if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

  /* Initialize
   */
  if (! (msa->flags & eslMSA_DIGITAL)) 
    { ESL_ALLOC(nres, sizeof(int) * 26);          K = 26;          }
#ifdef eslAUGMENT_ALPHABET
  else 
    { ESL_ALLOC(nres, sizeof(int) * msa->abc->K); K = msa->abc->K; }
#endif

  esl_vec_DSet(msa->wgt, msa->nseq, 0.);

  /* This section handles text alignments */
  if (! (msa->flags & eslMSA_DIGITAL)) 
    {
      for (pos = 0; pos < msa->alen; pos++)
	{
	  /* Collect # of letters A..Z in this column, and total */
	  esl_vec_ISet(nres, K, 0.);
	  for (idx = 0; idx < msa->nseq; idx++)
	    if (isalpha((int) msa->aseq[idx][pos]))
	      nres[toupper((int) msa->aseq[idx][pos]) - 'A'] ++;
	  for (ntotal = 0, i = 0; i < K; i++) if (nres[i] > 0) ntotal++;

	  /* Bump weight on each seq by PB rule */
	  if (ntotal > 0) {
	    for (idx = 0; idx < msa->nseq; idx++) {
	      if (isalpha((int) msa->aseq[idx][pos]))
		msa->wgt[idx] += 1. / 
		  (double) (ntotal * nres[toupper((int) msa->aseq[idx][pos]) - 'A'] );
	    }
	  }
	}

      /* first normalization by # of residues counted in each seq */
      for (idx = 0; idx < msa->nseq; idx++) {
	for (rlen = 0, pos = 0; pos < msa->alen; pos++) 
      	  if (isalpha((int) msa->aseq[idx][pos])) rlen++;
	if (rlen > 0) msa->wgt[idx] /= (double) rlen;
	/* if rlen == 0 for this seq, its weight is still 0.0, as initialized. */
      }
    }

  /* This section handles digital alignments. */
#ifdef eslAUGMENT_ALPHABET
  else
    {
      for (pos = 1; pos <= msa->alen; pos++)
	{
	  /* Collect # of residues 0..K-1 in this column, and total # */
	  esl_vec_ISet(nres, K, 0.);
	  for (idx = 0; idx < msa->nseq; idx++)
	    if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos]))
	      nres[(int) msa->ax[idx][pos]] ++;
	  for (ntotal = 0, i = 0; i < K; i++) if (nres[i] > 0) ntotal++;

	  /* Bump weight on each sequence by PB rule */
	  if (ntotal > 0) {
	    for (idx = 0; idx < msa->nseq; idx++) {
	      if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos]))
		msa->wgt[idx] += 1. / (double) (ntotal * nres[msa->ax[idx][pos]]);
	    }
	  }
	}

      /* first normalization by # of residues counted in each seq */
      for (idx = 0; idx < msa->nseq; idx++)
	{
	  for (rlen = 0, pos = 1; pos <= msa->alen; pos++) 
	    if (esl_abc_XIsCanonical(msa->abc, msa->ax[idx][pos])) rlen++;
	  if (rlen > 0) msa->wgt[idx] /= (double) rlen;
	  /* if rlen == 0 for this seq, its weight is still 0.0, as initialized. */
	}
    }
#endif

  /* Make weights normalize up to nseq, and return.  In pathological
   * case where all wgts were 0 (no seqs contain any unambiguous
   * residues), weights become 1.0.
   */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);	
  msa->flags |= eslMSA_HASWGTS;

  free(nres);
  return eslOK;

 ERROR:
  if (nres != NULL) free(nres);
  return status;
}
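A tiny worked sketch of the position-based rule used above, for one hypothetical column of a 3-sequence alignment containing residues A, A, C: ntotal = 2 distinct residues, nres[A] = 2, nres[C] = 1, so the per-sequence bumps are 1/(2*2), 1/(2*2), and 1/(2*1). The final normalize-and-scale step (the same two calls that end esl_msaweight_PB()) rescales the weights to sum to nseq.

#include <stdio.h>
#include "easel.h"
#include "esl_vectorops.h"

int main(void)
{
  /* One column: seqs 0 and 1 carry 'A', seq 2 carries 'C' (hypothetical data). */
  int    ntotal = 2;                      /* distinct residues in the column */
  double wgt[3];
  int    nseq   = 3;
  int    i;

  wgt[0] = 1. / (double) (ntotal * 2);    /* A: nres[A] = 2 -> 1/4 */
  wgt[1] = 1. / (double) (ntotal * 2);    /* A: nres[A] = 2 -> 1/4 */
  wgt[2] = 1. / (double) (ntotal * 1);    /* C: nres[C] = 1 -> 1/2 */

  esl_vec_DNorm (wgt, nseq);                 /* sum to 1    */
  esl_vec_DScale(wgt, nseq, (double) nseq);  /* sum to nseq */
  for (i = 0; i < nseq; i++) printf("wgt[%d] = %.4f\n", i, wgt[i]);
  return 0;
}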
Example #9
/* dump_infocontent_info
 *
 * Given an MSA (optionally with RF annotation), dump per-column
 * information content data to an open output file.
 */
static int dump_infocontent_info(FILE *fp, ESL_ALPHABET *abc, double **abc_ct, int use_weights, int nali, int64_t alen, int nseq, int *i_am_rf, char *msa_name, char *alifile, char *errbuf)
{
  int status;
  int apos, rfpos;
  double bg_ent;
  double *bg = NULL;
  double *abc_freq = NULL;
  double nnongap;

  ESL_ALLOC(bg, sizeof(double) * abc->K);
  esl_vec_DSet(bg, abc->K, 1./(abc->K));
  bg_ent = esl_vec_DEntropy(bg, abc->K);
  free(bg);

  ESL_ALLOC(abc_freq, sizeof(double) * abc->K);


  fprintf(fp, "# Information content per column (bits):\n");
  fprintf(fp, "# Alignment file: %s\n", alifile);
  fprintf(fp, "# Alignment idx:  %d\n", nali);
  if(msa_name != NULL) { fprintf(fp, "# Alignment name: %s\n", msa_name); }
  fprintf(fp, "# Number of sequences: %d\n", nseq);
  if(use_weights) { fprintf(fp, "# IMPORTANT: Counts are weighted based on sequence weights in alignment file.\n"); }
  else            { fprintf(fp, "# Sequence weights from alignment were ignored (if they existed).\n"); }
  fprintf(fp, "#\n");

  if(i_am_rf != NULL) { 
    fprintf(fp, "# %7s  %7s  %10s  %10s\n", "rfpos",    "alnpos",  "freqnongap", "info(bits)");
    fprintf(fp, "# %7s  %7s  %10s  %10s\n", "-------", "-------",  "----------", "----------");
  }  
  else { 
    fprintf(fp, "# %7s  %10s  %10s\n", "alnpos",  "freqnongap", "info(bits)");
    fprintf(fp, "# %7s  %10s  %10s\n", "-------", "----------", "----------");
  }

  rfpos = 0;
  for(apos = 0; apos < alen; apos++) {
    if(i_am_rf != NULL) { 
      if(i_am_rf[apos]) { 
	fprintf(fp, "  %7d", rfpos+1);
	rfpos++; 
      }
      else { 
	fprintf(fp, "  %7s", "-");
      }
    }
    nnongap = esl_vec_DSum(abc_ct[apos], abc->K);
    esl_vec_DCopy(abc_ct[apos], abc->K, abc_freq);
    esl_vec_DNorm(abc_freq, abc->K);
    fprintf(fp, "  %7d  %10.8f  %10.8f\n", apos+1, 
	    nnongap / (nnongap + abc_ct[apos][abc->K]),
	    (bg_ent - esl_vec_DEntropy(abc_freq, abc->K)));
  }
  fprintf(fp, "//\n");

  if(abc_freq != NULL) free(abc_freq);

  return eslOK;

 ERROR:
  ESL_FAIL(eslEMEM, errbuf, "out of memory");
  return status; /* NEVERREACHED */
}
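A short sketch of the per-column information content computed above: info = H(background) - H(column frequencies). For a DNA-sized alphabet (K = 4) the uniform background entropy is 2 bits; a hypothetical column with counts A:3, C:1 comes out at roughly 1.19 bits. Only esl_vec_DSet/DNorm/DEntropy are real Easel calls; the counts are made up.

#include <stdio.h>
#include "easel.h"
#include "esl_vectorops.h"

int main(void)
{
  int    K     = 4;                        /* DNA-sized alphabet           */
  double bg[4];
  double ct[4] = { 3., 1., 0., 0. };       /* hypothetical column counts   */
  double bg_ent, col_ent;

  esl_vec_DSet(bg, K, 1. / (double) K);    /* uniform background           */
  bg_ent  = esl_vec_DEntropy(bg, K);       /* = 2 bits for K = 4           */

  esl_vec_DNorm(ct, K);                    /* counts -> frequencies        */
  col_ent = esl_vec_DEntropy(ct, K);       /* ~0.81 bits for {.75,.25,0,0} */

  printf("info = %.4f bits\n", bg_ent - col_ent);
  return 0;
}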