/* Function:  esl_dst_CDiffMx()
 * Synopsis:  NxN difference matrix for N aligned text sequences.
 * Incept:    SRE, Fri Apr 28 06:27:20 2006 [New York]
 *
 * Purpose:   Same as <esl_dst_CPairIdMx()>, but calculates
 *            the fractional difference <d=1-s> instead of the
 *            fractional identity <s> for each pair.
 *
 * Args:      as      - aligned seqs (all same length), [0..N-1]
 *            N       - # of aligned sequences
 *            ret_D   - RETURN: symmetric fractional difference matrix
 *
 * Returns:   <eslOK> on success, and <ret_D> contains the
 *            fractional difference matrix. Caller free's <D> with 
 *            <esl_dmatrix_Destroy()>.
 *
 * Throws:    <eslEINVAL> if any seq has a different
 *            length than others. On failure, <ret_D> is returned <NULL>
 *            and state of inputs is unchanged.
 */
int
esl_dst_CDiffMx(char **as, int N, ESL_DMATRIX **ret_D)
{
  ESL_DMATRIX *D = NULL;
  int status;
  int i,j;

  status = esl_dst_CPairIdMx(as, N, &D);
  if (status != eslOK) goto ERROR;

  for (i = 0; i < N; i++)
    {
      D->mx[i][i] = 0.;
      for (j = i+1; j < N; j++) 
	{
	  D->mx[i][j] = 1. - D->mx[i][j];
	  D->mx[j][i] = D->mx[i][j];
	}
    }

  if (ret_D != NULL) *ret_D = D; else esl_dmatrix_Destroy(D);
  return eslOK;

 ERROR:
  if (D     != NULL)  esl_dmatrix_Destroy(D);
  if (ret_D != NULL) *ret_D = NULL;
  return status;

}
/* Function:  esl_dst_XDiffMx()
 * Synopsis:  NxN difference matrix for N aligned digital seqs.         
 * Incept:    SRE, Fri Apr 28 06:37:29 2006 [New York]
 *
 * Purpose:   Same as <esl_dst_XPairIdMx()>, but calculates fractional
 *            difference <1-s> instead of fractional identity <s> for
 *            each pair.
 *
 * Args:      abc   - digital alphabet in use
 *            ax    - aligned dsq's, [0..N-1][1..alen]                  
 *            N     - number of aligned sequences
 *            ret_D - RETURN: NxN matrix of fractional differences
 *            
 * Returns:   <eslOK> on success, and <ret_D> contains the difference
 *            matrix; caller is obligated to free <D> with 
 *            <esl_dmatrix_Destroy()>. 
 *
 * Throws:    <eslEINVAL> if a seq has a different
 *            length than others. On failure, <ret_D> is returned <NULL>
 *            and state of inputs is unchanged.
 */
int
esl_dst_XDiffMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **ret_D)
{
  int status;
  ESL_DMATRIX *D = NULL;
  int i,j;

  status = esl_dst_XPairIdMx(abc, ax, N, &D);
  if (status != eslOK) goto ERROR;

  for (i = 0; i < N; i++)
    {
      D->mx[i][i] = 0.;
      for (j = i+1; j < N; j++) 
	{
	  D->mx[i][j] = 1. - D->mx[i][j];
	  D->mx[j][i] = D->mx[i][j];
	}
    }
  if (ret_D != NULL) *ret_D = D; else esl_dmatrix_Destroy(D);
  return eslOK;

 ERROR:
  if (D     != NULL)  esl_dmatrix_Destroy(D);
  if (ret_D != NULL) *ret_D = NULL;
  return status;
}
/* Function:  esl_dst_XPairIdMx()
 * Synopsis:  NxN identity matrix for N aligned digital seqs.
 * Incept:    SRE, Thu Apr 27 09:08:11 2006 [New York]
 *
 * Purpose:   Given a digitized multiple sequence alignment <ax>, consisting
 *            of <N> aligned digital sequences in alphabet <abc>; calculate
 *            a symmetric pairwise fractional identity matrix by $N(N-1)/2$
 *            calls to <esl_dst_XPairId()>, and return it in <ret_S>.
 *            
 * Args:      abc   - digital alphabet in use
 *            ax    - aligned dsq's, [0..N-1][1..alen]                  
 *            N     - number of aligned sequences
 *            ret_S - RETURN: NxN matrix of fractional identities
 *
 * Returns:   <eslOK> on success, and <ret_S> contains the distance
 *            matrix. Caller is obligated to free <S> with 
 *            <esl_dmatrix_Destroy()>. 
 *
 * Throws:    <eslEINVAL> if a seq has a different
 *            length than others. On failure, <ret_S> is returned <NULL>
 *            and state of inputs is unchanged.
 */
int
esl_dst_XPairIdMx(const ESL_ALPHABET *abc,  ESL_DSQ **ax, int N, ESL_DMATRIX **ret_S)
{
  int status;
  ESL_DMATRIX *S = NULL;
  int i,j;

  if (( S = esl_dmatrix_Create(N,N) ) == NULL) goto ERROR;
  
  for (i = 0; i < N; i++)
    {
      S->mx[i][i] = 1.;
      for (j = i+1; j < N; j++)
	{
	  status = esl_dst_XPairId(abc, ax[i], ax[j], &(S->mx[i][j]), NULL, NULL);
	  if (status != eslOK)
	    ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
	  S->mx[j][i] =  S->mx[i][j];
	}
    }
  if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S);
  return eslOK;

 ERROR:
  if (S     != NULL)  esl_dmatrix_Destroy(S);
  if (ret_S != NULL) *ret_S = NULL;
  return status;
}
static int 
utest_CJukesCantorMx(int K, char **as, int N)
{
  ESL_DMATRIX *D, *V;
  /* just a crash test */
  if (esl_dst_CJukesCantorMx(K, as, N, &D, &V) != eslOK) abort();
  esl_dmatrix_Destroy(D);
  esl_dmatrix_Destroy(V);
  return eslOK;
}
int main(int argc, char **argv)
{
  ESL_MSAFILE *afp; 
  ESL_MSA     *msa;
  ESL_DMATRIX *P;
  int          status;
  int          i,j;
  double       min, avg, max;

  esl_msafile_Open(argv[1], eslMSAFILE_UNKNOWN, NULL, &afp);
  esl_msa_Read(afp, &msa);

  esl_dst_CPairIdMx(msa->aseq, msa->nseq, &P);

  min = 1.0;
  max = 0.0;
  avg = 0.0;
  for (i = 0; i < msa->nseq; i++)
    for (j = i+1; j < msa->nseq; j++)
      {
	avg += P->mx[i][j];
	if (P->mx[i][j] < min) min = P->mx[i][j];
	if (P->mx[i][j] > max) max = P->mx[i][j];
      }
  avg /= (double) (msa->nseq * (msa->nseq-1) / 2);

  printf("Average pairwise %% id:  %.1f%%\n", avg * 100.);
  printf("Minimum pairwise %% id:  %.1f%%\n", min * 100.);
  printf("Maximum pairwise %% id:  %.1f%%\n", max * 100.);

  esl_dmatrix_Destroy(P);
  esl_msa_Destroy(msa);
  esl_msafile_Close(afp);
  return 0;
}
Exemple #6
0
/* Function:  esl_dmatrix_Create()
 *
 * Purpose:   Creates a general <n> x <m> matrix (<n> rows, <m> 
 *            columns).
 *
 * Args:      <n> - number of rows;    $>= 1$
 *            <m> - number of columns; $>= 1$
 * 
 * Returns:   a pointer to a new <ESL_DMATRIX> object. Caller frees
 *            with <esl_dmatrix_Destroy()>.
 *
 * Throws:    <NULL> if an allocation failed.
 */
ESL_DMATRIX *
esl_dmatrix_Create(int n, int m)
{
  ESL_DMATRIX *A = NULL;
  int r;
  int status;

  ESL_ALLOC(A, sizeof(ESL_DMATRIX));
  A->mx = NULL;
  A->n  = n;
  A->m  = m;

  ESL_ALLOC(A->mx, sizeof(double *) * n);
  A->mx[0] = NULL;

  ESL_ALLOC(A->mx[0], sizeof(double) * n * m);
  for (r = 1; r < n; r++)
    A->mx[r] = A->mx[0] + r*m;

  A->type   = eslGENERAL;
  A->ncells = n * m; 
  return A;
  
 ERROR:
  esl_dmatrix_Destroy(A);
  return NULL;
}
static int 
utest_XDiffMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
  ESL_DMATRIX *D, *D2;
  int i, j;

  if (esl_dst_XDiffMx(abc, ax, N, &D) != eslOK) abort();
  if (esl_dst_CDiffMx(as, N, &D2)     != eslOK) abort();

  for (i = 0; i < N; i++) 
    for (j = i; j < N; j++)
      if (fabs(D->mx[i][j] - D2->mx[j][i]) > 0.01) abort();

  esl_dmatrix_Destroy(D);
  esl_dmatrix_Destroy(D2);
  return eslOK;
}
int
main(void)
{
  ESL_STOPWATCH *w = NULL;
  ESL_DMATRIX *Q  = NULL;
  ESL_DMATRIX *P  = NULL;
  double       t = 5.0;
  int          esl_iterations = 100;
  int          i;
#ifdef HAVE_LIBGSL
  gsl_matrix  *Qg = NULL;
  gsl_matrix  *Pg = NULL;
  int          gsl_iterations = 100;
#endif

  w = esl_stopwatch_Create();
  Q = esl_dmatrix_Create(20, 20);
  P = esl_dmatrix_Create(20, 20);
  esl_rmx_SetWAG(Q, NULL);

  esl_stopwatch_Start(w);
  for (i = 0; i < esl_iterations; i++)
    esl_dmx_Exp(Q, t, P);
  esl_stopwatch_Stop(w);
  printf("Easel takes:   %g sec\n", w->user / (double) esl_iterations);

#ifdef HAVE_LIBGSL
  if (esl_dmx_MorphGSL(Q, &Qg)             != eslOK) esl_fatal("morph to gsl_matrix failed");
  if ((Pg = gsl_matrix_alloc(20, 20))      == NULL)  esl_fatal("gsl alloc failed");
  gsl_matrix_scale(Qg, t);
  
  esl_stopwatch_Start(w);
  for (i = 0; i < gsl_iterations; i++)
    gsl_linalg_exponential_ss(Qg, Pg, GSL_PREC_DOUBLE);
  esl_stopwatch_Stop(w);
  printf("  GSL takes:   %g sec\n", w->user / (double) gsl_iterations);

  gsl_matrix_free(Qg);
  gsl_matrix_free(Pg);
#endif /*HAVE_LIBGSL*/

  esl_dmatrix_Destroy(Q);
  esl_dmatrix_Destroy(P);
  esl_stopwatch_Destroy(w);
  return 0;
}
static int 
utest_XPairIdMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
  ESL_DMATRIX *S, *S2;
  int i, j;

  if (esl_dst_XPairIdMx(abc, ax, N, &S) != eslOK) abort();
  if (esl_dst_CPairIdMx(as, N, &S2)     != eslOK) abort();

  for (i = 0; i < N; i++) 
    for (j = i; j < N; j++)
      if (fabs(S->mx[i][j] - S2->mx[j][i]) > 0.01) abort();

  esl_dmatrix_Destroy(S);
  esl_dmatrix_Destroy(S2);
  return eslOK;
}
Exemple #10
0
/* Function:  p7_builder_SetScoreSystem()
 * Synopsis:  Initialize score system for single sequence queries.
 *
 * Purpose:   Initialize the builder <bld> to be able to parameterize
 *            single sequence queries, using a substitution matrix
 *            from a file.
 *            
 *            Read a standard substitution score matrix from file
 *            <mxfile>. If <mxfile> is <NULL>, default to BLOSUM62
 *            scores. If <mxfile> is "-", read score matrix from
 *            <stdin> stream. If <env> is non-<NULL> and <mxfile> is
 *            not found in the current working directory, look for
 *            <mxfile> in colon-delimited directory list contained in
 *            environment variable <env>.
 *            
 *            Set the gap-open and gap-extend probabilities to
 *            <popen>, <pextend>, respectively.
 *            
 *            Use background residue frequencies in the null model
 *            <bg> to convert substitution matrix scores to
 *            conditional probability parameters.
 *
 * Args:      bld      - <P7_BUILDER> to initialize
 *            mxfile   - score matrix file to use, or NULL for BLOSUM62 default
 *            env      - env variable containing directory list where <mxfile> may reside
 *            popen    - gap open probability
 *            pextend  - gap extend probability
 *            bg       - null model, containing background frequencies
 *
 * Returns:   <eslOK> on success.
 *            
 *            <eslENOTFOUND> if <mxfile> can't be found or opened, even
 *            in any of the directories specified by the <env> variable.   
 *            
 *            <eslEINVAL> if the score matrix can't be converted into
 *            conditional probabilities; for example, if it has no valid
 *            solution for <lambda>.
 * 
 *            On either error, <bld->errbuf> contains a useful error message
 *            for the user.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_builder_SetScoreSystem(P7_BUILDER *bld, const char *mxfile, const char *env, double popen, double pextend, P7_BG *bg)
{
  ESL_FILEPARSER  *efp = NULL;
  double          *f   = NULL;
  double           slambda;
  int              status;

  bld->errbuf[0] = '\0';

  /* If a score system is already set, delete it. */
  if (bld->S != NULL) esl_scorematrix_Destroy(bld->S);
  if (bld->Q != NULL) esl_dmatrix_Destroy(bld->Q);

  /* Get the scoring matrix */
  if ((bld->S  = esl_scorematrix_Create(bld->abc)) == NULL) { status = eslEMEM; goto ERROR; }
  if (mxfile == NULL) 
    {
      if (bld->abc->type == eslAMINO) {
        if ((status = esl_scorematrix_Set("BLOSUM62", bld->S)) != eslOK) goto ERROR;
      } else {
        if ((status = esl_scorematrix_Set("DNA1", bld->S)) != eslOK) goto ERROR;
      }
    } 
  else 
    {
      if ((status = esl_fileparser_Open(mxfile, env, &efp))         != eslOK) ESL_XFAIL(status, bld->errbuf, "Failed to find or open matrix file %s", mxfile);
      if ((status = esl_scorematrix_Read(efp, bld->abc, &(bld->S))) != eslOK) ESL_XFAIL(status, bld->errbuf, "Failed to read matrix from %s:\n%s",    mxfile, efp->errbuf);
      esl_fileparser_Close(efp); 
      efp = NULL;
    }

  /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */
  ESL_ALLOC(f, sizeof(double) * bg->abc->K);
  esl_vec_F2D(bg->f, bg->abc->K, f);

  /* Backcalculate joint probability matrix Q, given scores S and background freqs bg->f.  */
  status = esl_scorematrix_ProbifyGivenBG(bld->S, f, f, &slambda, &(bld->Q));
  if      (status == eslEINVAL)  ESL_XFAIL(eslEINVAL, bld->errbuf, "input score matrix %s has no valid solution for lambda", mxfile);
  else if (status == eslENOHALT) ESL_XFAIL(eslEINVAL, bld->errbuf, "failed to solve input score matrix %s for lambda: are you sure it's valid?", mxfile);
  else if (status != eslOK)      ESL_XFAIL(eslEINVAL, bld->errbuf, "unexpected error in solving input score matrix %s for probability parameters", mxfile);

  /* Convert joint probabilities P(ab) to conditionals P(b|a) */
  esl_scorematrix_JointToConditionalOnQuery(bld->abc, bld->Q);

  bld->popen   = popen;
  bld->pextend = pextend;
  free(f);
  return eslOK;

 ERROR:
  if (efp) esl_fileparser_Close(efp);
  if (f)   free(f);
  return status;
}
Exemple #11
0
/* Function:  p7_builder_Destroy()
 * Synopsis:  Free a <P7_BUILDER>
 *
 * Purpose:   Frees a <P7_BUILDER> object.
 */
void
p7_builder_Destroy(P7_BUILDER *bld)
{
  if (bld == NULL) return;

  if (bld->prior   != NULL) p7_prior_Destroy(bld->prior);
  if (bld->r       != NULL) esl_randomness_Destroy(bld->r);
  if (bld->Q       != NULL) esl_dmatrix_Destroy(bld->Q);
  if (bld->S       != NULL) esl_scorematrix_Destroy(bld->S);

  free(bld);
  return;
}
static int 
utest_XJukesCantorMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
  ESL_DMATRIX *D, *D2, *V, *V2;
  int i, j;

  if (esl_dst_XJukesCantorMx(abc, ax, N, &D, &V)      != eslOK) abort();
  if (esl_dst_CJukesCantorMx(abc->K, as, N, &D2, &V2) != eslOK) abort();

  for (i = 0; i < N; i++) 
    for (j = i; j < N; j++)
      {
	if (fabs(D->mx[i][j] - D2->mx[j][i]) > 0.01) abort();
	if (fabs(V->mx[i][j] - V2->mx[j][i]) > 0.01) abort();
      }

  esl_dmatrix_Destroy(D);
  esl_dmatrix_Destroy(D2);
  esl_dmatrix_Destroy(V);
  esl_dmatrix_Destroy(V2);
  return eslOK;
}
int
main(void)
{
  char errbuf[eslERRBUFSIZE];
  char *alphabet = "ACDEFGHIKLMNPQRSTVWY";
  ESL_DMATRIX *Q  = NULL;
  ESL_DMATRIX *P  = NULL;
  gsl_matrix  *Qg = NULL;
  gsl_matrix  *Pg = NULL;
  ESL_DMATRIX *Pge = NULL;
  double       t = 15.0;

  if ((Q = esl_dmatrix_Create(20, 20))     == NULL)  esl_fatal("malloc failed");
  if ((P = esl_dmatrix_Create(20, 20))     == NULL)  esl_fatal("malloc failed");

  if (esl_rmx_SetWAG(Q, NULL)              != eslOK) esl_fatal("_SetWAG() failed");
  if (esl_rmx_ValidateQ(Q, 0.0001, errbuf) != eslOK) esl_fatal("Q validation failed: %s", errbuf);

  if (esl_dmx_Exp(Q, t, P)                 != eslOK) esl_fatal("matrix exponentiation failed");
  if (esl_rmx_ValidateP(P, 0.0001, errbuf) != eslOK) esl_fatal("P validation failed: %s", errbuf);

  if (esl_dmx_MorphGSL(Q, &Qg)             != eslOK) esl_fatal("morph to gsl_matrix failed");
  if ((Pg = gsl_matrix_alloc(20, 20))      == NULL)  esl_fatal("gsl alloc failed");
  gsl_matrix_scale(Qg, t);
  if (gsl_linalg_exponential_ss(Qg, Pg, GSL_PREC_DOUBLE) != 0) esl_fatal("gsl's exponentiation failed");
  if (esl_dmx_UnmorphGSL(Pg, &Pge)         != eslOK) esl_fatal("morph from gsl_matrix failed");  
  
  esl_dmatrix_Dump(stdout, P, alphabet, alphabet);

  if (esl_dmatrix_Compare(Pge, P, 0.00001) != eslOK) esl_fatal("whoops, different answers.");
  
  esl_dmatrix_Destroy(Q);
  esl_dmatrix_Destroy(P);
  esl_dmatrix_Destroy(Pge);
  gsl_matrix_free(Qg);
  gsl_matrix_free(Pg);
  return 0;
}
/* Function:  esl_dst_XJukesCantorMx()
 * Synopsis:  NxN Jukes/Cantor distance matrix for N aligned digital seqs.
 * Incept:    SRE, Thu Apr 27 08:38:08 2006 [New York City]
 *
 * Purpose:   Given a digitized multiple sequence alignment <ax>,
 *            consisting of <nseq> aligned digital sequences in
 *            bioalphabet <abc>, calculate a symmetric Jukes/Cantor
 *            pairwise distance matrix for all sequence pairs;
 *            optionally return the distance matrix in <ret_D> and 
 *            a matrix of the large-sample variances for those ML distance
 *            estimates in <ret_V>.
 *            
 *            Infinite distances (and variances) are possible. They
 *            are represented as <HUGE_VAL> in <D> and <V>. Caller must
 *            be prepared to deal with them as appropriate.
 *
 * Args:      abc    - bioalphabet for <aseq>
 *            ax     - aligned digital sequences [0.nseq-1][1..L]
 *            nseq   - number of aseqs
 *            opt_D  - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx
 *            opt_V  - optRETURN: matrix of variances.
 *
 * Returns:   <eslOK> on success. <D> (and optionally <V>) contain the
 *            distance matrix (and variances). Caller frees these with
 *            <esl_dmatrix_Destroy()>. 
 *
 * Throws:    <eslEINVAL> if any pair of sequences have differing lengths
 *            (and thus cannot have been properly aligned). 
 *            <eslEDIVZERO> if some pair of sequences had no aligned
 *            residues. On failure, <D> and <V> are both returned <NULL>
 *            and state of inputs is unchanged.
 */
int
esl_dst_XJukesCantorMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int nseq, 
		       ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V)
{
  ESL_DMATRIX *D = NULL;
  ESL_DMATRIX *V = NULL;
  int          status;
  int          i,j;

  if (( D = esl_dmatrix_Create(nseq, nseq) ) == NULL) goto ERROR;
  if (( V = esl_dmatrix_Create(nseq, nseq) ) == NULL) goto ERROR;

  for (i = 0; i < nseq; i++)
    {
      D->mx[i][i] = 0.;
      V->mx[i][i] = 0.;
      for (j = i+1; j < nseq; j++)
	{
	  status = esl_dst_XJukesCantor(abc, ax[i], ax[j], 
					&(D->mx[i][j]), &(V->mx[i][j]));
	  if (status != eslOK) 
	    ESL_XEXCEPTION(status, "J/C calculation failed at digital aseqs %d,%d", i,j);

	  D->mx[j][i] = D->mx[i][j];
	  V->mx[j][i] = V->mx[i][j];
	}
    }
  if (opt_D != NULL) *opt_D = D;  else esl_dmatrix_Destroy(D);
  if (opt_V != NULL) *opt_V = V;  else esl_dmatrix_Destroy(V);
  return eslOK;

 ERROR:
  if (D     != NULL) esl_dmatrix_Destroy(D);
  if (V     != NULL) esl_dmatrix_Destroy(V);
  if (opt_D != NULL) *opt_D = NULL;
  if (opt_V != NULL) *opt_V = NULL;
  return status;
}
static void
utest_SetWAG(void)
{
  char errbuf[eslERRBUFSIZE];
  ESL_DMATRIX *Q = NULL;
  ESL_DMATRIX *P = NULL;
  double       t = 50.0;	/* sufficiently large to drive e^tQ to stationarity  */
  double       pi[20];
  int          i;

  if ((Q = esl_dmatrix_Create(20, 20))     == NULL)  esl_fatal("malloc failed");
  if ((P = esl_dmatrix_Create(20, 20))     == NULL)  esl_fatal("malloc failed");

  /* This tests that exponentiating WAG gives a stable conditional 
   * probability matrix solution. (It doesn't particularly test that
   * WAG was set correctly, but how could we have screwed that up?)
   */
  if (esl_rmx_SetWAG(Q, NULL)              != eslOK) esl_fatal("_SetWAG() failed");
  if (esl_dmx_Exp(Q, t, P)                 != eslOK) esl_fatal("matrix exponentiation failed");
  if (esl_rmx_ValidateP(P, 1e-7, errbuf)   != eslOK) esl_fatal("P validation failed: %s", errbuf);
  if (esl_rmx_ValidateQ(Q, 1e-7, errbuf)   != eslOK) esl_fatal("Q validation failed: %s", errbuf);

  /* This tests setting WAG to different stationary pi's than default,
   * then tests that exponentiating to large t reaches those stationaries.
   */
  esl_vec_DSet(pi, 20, 0.05);
  if (esl_rmx_SetWAG(Q, pi)                != eslOK) esl_fatal("_SetWAG() failed");
  if (esl_dmx_Exp(Q, t, P)                 != eslOK) esl_fatal("matrix exponentiation failed");
  if (esl_rmx_ValidateP(P, 1e-7, errbuf)   != eslOK) esl_fatal("P validation failed: %s", errbuf);
  if (esl_rmx_ValidateQ(Q, 1e-7, errbuf)   != eslOK) esl_fatal("Q validation failed: %s", errbuf);
  for (i = 0; i < 20; i++)
    if (esl_vec_DCompare(P->mx[i], pi, 20, 1e-7) != eslOK) esl_fatal("P didn't converge to right pi's");

  esl_dmatrix_Destroy(Q);
  esl_dmatrix_Destroy(P);
  return;
}
Exemple #16
0
/* Function:  p7_builder_LoadScoreSystem()
 * Synopsis:  Load a standard score system for single sequence queries.
 *
 * Purpose:   Initialize the builder <bld> to be able to parameterize
 *            single sequence queries, using the standard (built-in) score
 *            matrix named <mx>.
 *            
 *            Available score matrices <mx> include PAM30, 70, 120, and 240;
 *            and BLOSUM45, 50, 62, 80, and 90. See <esl_scorematrix.c>.
 *
 *            Set the gap-open and gap-extend probabilities to
 *            <popen>, <pextend>, respectively.
 *            
 *            Use background residue frequencies in the null model
 *            <bg> to convert substitution matrix scores to
 *            conditional probability parameters.
 *
 * Args:      bld      - <P7_BUILDER> to initialize
 *            matrix   - score matrix file to use
 *            popen    - gap open probability
 *            pextend  - gap extend probability
 *            bg       - null model, containing background frequencies           
 *
 * Returns:   <eslOK> on success.
 *            
 *            <eslENOTFOUND> if <mxfile> can't be found or opened, even
 *            in any of the directories specified by the <env> variable.   
 *            
 *            <eslEINVAL> if the score matrix can't be converted into
 *            conditional probabilities; for example, if it has no valid
 *            solution for <lambda>.
 * 
 *            On either error, <bld->errbuf> contains a useful error message
 *            for the user.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_builder_LoadScoreSystem(P7_BUILDER *bld, const char *matrix, double popen, double pextend, P7_BG *bg)
{
  double  *f = NULL;
  double   slambda;
  int      status;

  bld->errbuf[0] = '\0';

  /* If a score system is already set, delete it. */
  if (bld->S != NULL) esl_scorematrix_Destroy(bld->S);
  if (bld->Q != NULL) esl_dmatrix_Destroy(bld->Q);

  /* Get the scoring matrix */
  if ((bld->S  = esl_scorematrix_Create(bld->abc)) == NULL) { status = eslEMEM; goto ERROR; }
  status =  esl_scorematrix_Set(matrix, bld->S);
  if      (status == eslENOTFOUND) ESL_XFAIL(status, bld->errbuf, "no matrix named %s is available as a built-in", matrix);
  else if (status != eslOK)        ESL_XFAIL(status, bld->errbuf, "failed to set score matrix %s as a built-in",   matrix);

  /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */
  ESL_ALLOC(f, sizeof(double) * bg->abc->K);
  esl_vec_F2D(bg->f, bg->abc->K, f);

  /* Backcalculate joint probability matrix Q, given scores S and background freqs bg->f.  */
  /* Failures shouldn't happen here: these are standard matrices.  */
  status = esl_scorematrix_ProbifyGivenBG(bld->S, f, f, &slambda, &(bld->Q));
  if      (status == eslEINVAL)  ESL_XFAIL(eslEINVAL, bld->errbuf, "built-in score matrix %s has no valid solution for lambda", matrix);
  else if (status == eslENOHALT) ESL_XFAIL(eslEINVAL, bld->errbuf, "failed to solve score matrix %s for lambda", matrix);
  else if (status != eslOK)      ESL_XFAIL(eslEINVAL, bld->errbuf, "unexpected error in solving score matrix %s for probability parameters", matrix);

  /* Convert joint probabilities P(ab) to conditionals P(b|a) */
  esl_scorematrix_JointToConditionalOnQuery(bld->abc, bld->Q);

  bld->popen   = popen;
  bld->pextend = pextend;
  free(f);
  return eslOK;

 ERROR:
  if (f) free(f);
  return status;
}
static void
utest_Diagonalization(void)
{
  ESL_DMATRIX *P      = NULL;
  ESL_DMATRIX *P2     = NULL;
  ESL_DMATRIX *C      = NULL;
  ESL_DMATRIX *D      = NULL;
  double      *lambda = NULL;		/* eigenvalues */
  ESL_DMATRIX *U      = NULL;		/* left eigenvectors */
  ESL_DMATRIX *Ui     = NULL;		/* inverse of U */
  int  i,j;

  /* Create a J/C probability matrix for t=1:
   *    1/4 + 3/4 e^{-4/3 at}
   *    1/4 - 1/4 e^{-4/3 at}
   */
  if ((P  = esl_dmatrix_Create(4, 4))    == NULL)  esl_fatal("malloc failed");
  if ((C  = esl_dmatrix_Create(4, 4))    == NULL)  esl_fatal("malloc failed");
  if ((Ui = esl_dmatrix_Create(4, 4))    == NULL)  esl_fatal("malloc failed");
  if ((D  = esl_dmatrix_Create(4, 4))    == NULL)  esl_fatal("malloc failed");
  if ((P2 = esl_dmatrix_Create(4, 4))    == NULL)  esl_fatal("malloc failed");
  for (i = 0; i < 4; i++)
    for (j = 0; j < 4; j++)
      if (i == j) P->mx[i][j] = 0.25 + 0.75 * exp(-4./3.);
      else        P->mx[i][j] = 0.25 - 0.25 * exp(-4./3.);

  /* Diagonalize it
   */
  if (esl_dmx_Diagonalize(P, &lambda, NULL, &U, NULL) != eslOK) esl_fatal("diagonalization failed");

  /* Calculate P^k by U [diag(lambda_i)]^k U^{-1}
   */
  esl_dmatrix_SetZero(D);
  for (i = 0; i < P->n; i++) D->mx[i][i] = lambda[i];
  esl_dmx_Invert(U, Ui);
  esl_dmx_Multiply(U, D,  C);
  esl_dmx_Multiply(C, Ui, P2);

  if (esl_dmatrix_Compare(P, P2, 1e-7) != eslOK) esl_fatal("diagonalization unit test failed");

  free(lambda);
  esl_dmatrix_Destroy(P2);
  esl_dmatrix_Destroy(Ui);
  esl_dmatrix_Destroy(U);
  esl_dmatrix_Destroy(D);
  esl_dmatrix_Destroy(C);
  esl_dmatrix_Destroy(P);
  return;
}
static int 
utest_CPairIdMx(char **as, int N)
{
  ESL_DMATRIX *S;
  int          i,j;
  double       pid;

  if (esl_dst_CPairIdMx(as, N, &S) != eslOK) abort();

  for (i = 0; i < N; i++) 
    if (S->mx[i][i] != 1.0) abort();

  pid = 0.;
  for (i = 3; i < N; i++)
    for (j = i+1; j < N; j++)
      pid += S->mx[i][j];
  pid /= (double) ((N-3) * (N-4) / 2); /* first 3 don't count */
  if (pid < 0.15 || pid > 0.35) abort(); /* should be 0.25 */

  esl_dmatrix_Destroy(S);
  return eslOK;
}
static int 
utest_CDiffMx(char **as, int N)
{
  ESL_DMATRIX *D;
  int          i,j;
  double       diff;

  if (esl_dst_CDiffMx(as, N, &D) != eslOK) abort();

  for (i = 0; i < N; i++) 
    if (D->mx[i][i] != 0.0) abort();

  diff = 0.;
  for (i = 3; i < N; i++)
    for (j = i+1; j < N; j++)
      diff += D->mx[i][j];
  diff /= (double) ((N-3) * (N-4) / 2);	/* first 3 don't count */
  if (diff < 0.65 || diff > 0.85) abort(); /* should be 0.75 */

  esl_dmatrix_Destroy(D);
  return eslOK;
}
Exemple #20
0
/* Function:  esl_dmatrix_CreateUpper()
 * Incept:    SRE, Wed Feb 28 08:45:45 2007 [Janelia]
 *
 * Purpose:   Creates a packed upper triangular matrix of <n> rows and
 *            <n> columns. Caller may only access cells $i \leq j$.
 *            Cells $i > j$ are not stored and are implicitly 0.
 *            
 *            Not all matrix operations in Easel can work on packed
 *            upper triangular matrices.
 *
 * Returns:   a pointer to a new <ESL_DMATRIX> object of type
 *            <eslUPPER>. Caller frees with <esl_dmatrix_Destroy()>.
 *
 * Throws:    <NULL> if allocation fails.
 *
 * Xref:      J1/10
 */
ESL_DMATRIX *
esl_dmatrix_CreateUpper(int n)
{
  int status;
  ESL_DMATRIX *A = NULL;
  int r;			/* counter over rows */
  int nc;			/* cell counter */

  /* matrix structure allocation */
  ESL_ALLOC(A, sizeof(ESL_DMATRIX)); 
  A->mx = NULL;
  A->n  = n;
  A->m  = n;

  /* n row ptrs */
  ESL_ALLOC(A->mx, sizeof(double *) * n); 
  A->mx[0] = NULL;

  /* cell storage */
  ESL_ALLOC(A->mx[0], sizeof(double) * n * (n+1) / 2);
  
  /* row pointers set in a tricksy overlapping way, so
   * mx[i][j] access works normally but only i<=j are valid.
   * xref J1/10.
   */
  nc = n;  /* nc is the number of valid cells assigned to rows so far */
  for (r = 1; r < n; r++) {
    A->mx[r] = A->mx[0] + nc - r; /* -r overlaps this row w/ previous row */
    nc += n-r;
  }
  A->type   = eslUPPER;
  A->ncells = n * (n+1) / 2; 
  return A;

 ERROR:
  esl_dmatrix_Destroy(A);
  return NULL;
}
Exemple #21
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go       = NULL; 
  char           *keyfile  = NULL;
  char           *tabfile  = NULL;
  ESL_KEYHASH    *kh       = esl_keyhash_Create();
  int             nkeys    = 0;
  ESL_DMATRIX    *D        = NULL;
  ESL_TREE       *T        = NULL;

  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], go, "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], go, "Error in app configuration: %s\n",   go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") )                   cmdline_help   (argv[0], go);
  if (esl_opt_ArgNumber(go) != 2)                      cmdline_failure(argv[0], go, "Incorrect number of command line arguments.\n");
  keyfile = esl_opt_GetArg(go, 1);
  tabfile = esl_opt_GetArg(go, 2);

  read_keyfile(go, keyfile, kh);
  nkeys = esl_keyhash_GetNumber(kh);

  D = esl_dmatrix_Create(nkeys, nkeys);
  read_tabfile(go, tabfile, kh, D);

  esl_tree_SingleLinkage(D, &T);
    
  //esl_tree_WriteNewick(stdout, T);
  output_clusters(go, T, kh);


  esl_tree_Destroy(T);
  esl_dmatrix_Destroy(D);
  esl_keyhash_Destroy(kh);
  esl_getopts_Destroy(go);
  return 0;
}
Exemple #22
0
int 
main(int argc, char **argv)
{
  char        *filename = argv[1];
  FILE        *fp       = NULL;
  ESL_DMATRIX *E        = NULL;
  double      *pi       = NULL;
  int          i,j,n;

  E = esl_dmatrix_Create(20, 20);
  pi = malloc(20 * sizeof(double));
  if ((fp = fopen(filename, "r")) == NULL) esl_fatal("open failed");
  if (esl_paml_ReadE(fp, E, pi) != eslOK)  esl_fatal("parse failed");

  n = 1;
  for (i = 1; i < 20; i++)
    for (j = 0; j < i; j++)
      {
	printf("%8.6f, ", E->mx[i][j]);
	if (n++ == 10) { puts(""); n=1; }
      }
  
  puts("");

  n = 1;
  for (i = 0; i < 20; i++)
    {
      printf("%8.6f, ", pi[i]);
      if (n++ == 10) { puts(""); n=1; }
    }
  
  fclose(fp);
  free(pi);
  esl_dmatrix_Destroy(E);
  return 0;
}
Exemple #23
0
int
main(int argc, char **argv)
{
  ESL_ALPHABET    *abc     = NULL;     /* sequence alphabet                       */
  ESL_GETOPTS     *go      = NULL;     /* command line processing                 */
  ESL_RANDOMNESS  *r       = NULL;     /* source of randomness                    */
  P7_HMM          *hmm     = NULL;     /* sampled HMM to emit from                */
  P7_HMM          *core    = NULL;     /* safe copy of the HMM, before config     */
  P7_BG           *bg      = NULL;     /* null model                              */
  ESL_SQ          *sq      = NULL;     /* sampled sequence                        */
  P7_TRACE        *tr      = NULL;     /* sampled trace                           */
  P7_PROFILE      *gm      = NULL;     /* profile                                 */
  int              i,j;
  int              i1,i2;
  int              k1,k2;
  int              iseq;
  FILE            *fp      = NULL;
  double           expected;

  int              do_ilocal;
  char            *hmmfile = NULL;
  int              nseq;
  int              do_swlike;
  int              do_ungapped;
  int              L;
  int              M;
  int              do_h2;
  char            *ipsfile = NULL;
  char            *kpsfile = NULL;
  ESL_DMATRIX     *imx     = NULL;
  ESL_DMATRIX     *kmx     = NULL;
  ESL_DMATRIX     *iref    = NULL; /* reference matrix: expected i distribution under ideality */
  int              Lbins;
  int              status;
  char             errbuf[eslERRBUFSIZE];
  
  /*****************************************************************
   * Parse the command line
   *****************************************************************/
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) esl_fatal("Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") == TRUE) {
    puts(usage);
    puts("\n  where options are:\n");
    esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2 = indentation; 80=textwidth*/
    return eslOK;
  }
  do_ilocal   = esl_opt_GetBoolean(go, "-i");
  hmmfile     = esl_opt_GetString (go, "-m");
  nseq        = esl_opt_GetInteger(go, "-n");
  do_swlike   = esl_opt_GetBoolean(go, "-s");
  do_ungapped = esl_opt_GetBoolean(go, "-u");
  L           = esl_opt_GetInteger(go, "-L");
  M           = esl_opt_GetInteger(go, "-M");
  do_h2       = esl_opt_GetBoolean(go, "-2");
  ipsfile     = esl_opt_GetString (go, "--ips");
  kpsfile     = esl_opt_GetString (go, "--kps");

  if (esl_opt_ArgNumber(go) != 0) {
    puts("Incorrect number of command line arguments.");
    printf("Usage: %s [options]\n", argv[0]);
    return eslFAIL;
  }

  r = esl_randomness_CreateFast(0);

  if (hmmfile != NULL)
    {	/* Read the HMM (and get alphabet from it) */
      P7_HMMFILE      *hfp     = NULL;

      status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
      if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
      else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
      else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);  
    
      if ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslOK) {
	if      (status == eslEOD)       esl_fatal("read failed, HMM file %s may be truncated?", hmmfile);
	else if (status == eslEFORMAT)   esl_fatal("bad file format in HMM file %s", hmmfile);
	else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets", hmmfile);
	else                             esl_fatal("Unexpected error in reading HMMs");
      }
      M = hmm->M;
      p7_hmmfile_Close(hfp);
    }
  else
    {			/* Or sample the HMM (create alphabet first) */
      abc = esl_alphabet_Create(eslAMINO);    
      if      (do_ungapped) p7_hmm_SampleUngapped(r, M, abc, &hmm);
      else if (do_swlike)   p7_hmm_SampleUniform (r, M, abc, 0.05, 0.5, 0.05, 0.2, &hmm); /* tmi, tii, tmd, tdd */
      else                  p7_hmm_Sample        (r, M, abc, &hmm);
    }

  Lbins = M;
  imx  = esl_dmatrix_Create(Lbins, Lbins);
  iref = esl_dmatrix_Create(Lbins, Lbins);
  kmx  = esl_dmatrix_Create(M, M);
  esl_dmatrix_SetZero(imx);
  esl_dmatrix_SetZero(iref);
  esl_dmatrix_SetZero(kmx);
  tr    = p7_trace_Create();
  sq    = esl_sq_CreateDigital(abc);
  bg    = p7_bg_Create(abc);
  core  = p7_hmm_Clone(hmm);

  if (do_h2) {
    gm = p7_profile_Create(hmm->M, abc);
    p7_H2_ProfileConfig(hmm, bg, gm, p7_UNILOCAL);
  } else {
    gm = p7_profile_Create(hmm->M, abc);
    p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL);
    if (p7_hmm_Validate    (hmm, NULL, 0.0001) != eslOK) esl_fatal("whoops, HMM is bad!");
    if (p7_profile_Validate(gm,  NULL, 0.0001) != eslOK) esl_fatal("whoops, profile is bad!");
  }

  /* Sample endpoints.
   * Also sample an ideal reference distribution for i endpoints.  i
   * endpoints are prone to discretization artifacts, when emitted
   * sequences have varying lengths. Taking log odds w.r.t. an ideal
   * reference that is subject to the same discretization artifacts 
   * cancels out the effect.
   */
  for (iseq = 0; iseq < nseq; iseq++)
    {				
      if (do_ilocal) ideal_local_endpoints  (r, core,     sq, tr, Lbins, &i1, &i2, &k1, &k2);
      else           profile_local_endpoints(r, core, gm, sq, tr, Lbins, &i1, &i2, &k1, &k2);

      imx->mx[i1-1][i2-1] += 1.;
      kmx->mx[k1-1][k2-1] += 1.; 

      /* reference distribution for i */
      ideal_local_endpoints  (r, core, sq, tr, Lbins, &i1, &i2, &k1, &k2);
      iref->mx[i1-1][i2-1] += 1.;
    }


  /* Adjust both mx's to log_2(obs/exp) ratio */
  printf("Before normalization/log-odds:\n");
  printf("   i matrix values range from %f to %f\n", dmx_upper_min(imx), dmx_upper_max(imx));
  printf("   k matrix values range from %f to %f\n", dmx_upper_min(kmx), dmx_upper_max(kmx));
  printf("iref matrix values range from %f to %f\n", dmx_upper_min(iref), dmx_upper_max(iref));

  expected = (double) nseq * 2. / (double) (M*(M+1));
  for (i = 0; i < kmx->m; i++)
    for (j = i; j < kmx->n; j++)
      kmx->mx[i][j] = log(kmx->mx[i][j] / expected) / log(2.0);

  for (i = 0; i < imx->m; i++)
    for (j = i; j < imx->m; j++)
      if (iref->mx[i][j] == 0. && imx->mx[i][j] == 0.) 
	imx->mx[i][j] = 0.;
      else if (iref->mx[i][j] == 0.)
	imx->mx[i][j] = eslINFINITY;
      else if (imx->mx[i][j] == 0.)
	imx->mx[i][j] = -eslINFINITY;
      else
	imx->mx[i][j] = log(imx->mx[i][j] / iref->mx[i][j]) / log(2.0);
  
  /* Print ps files */
  if (kpsfile != NULL) {
    if ((fp = fopen(kpsfile, "w")) == NULL) esl_fatal("Failed to open output postscript file %s", kpsfile);
    dmx_Visualize(fp, kmx, -4., 5.);
    fclose(fp);
  }
  if (ipsfile != NULL) {
    if ((fp = fopen(ipsfile, "w")) == NULL) esl_fatal("Failed to open output postscript file %s", ipsfile);
    dmx_Visualize(fp, imx, -4., 5.); 
    /* dmx_Visualize(fp, imx, dmx_upper_min(imx), dmx_upper_max(imx)); */
    fclose(fp);
  }

  printf("After normalization/log-odds:\n");
  printf("i matrix values range from %f to %f\n", dmx_upper_min(imx), dmx_upper_max(imx));
  printf("k matrix values range from %f to %f\n", dmx_upper_min(kmx), dmx_upper_max(kmx));

  
  p7_profile_Destroy(gm);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(core);
  p7_hmm_Destroy(hmm);
  p7_trace_Destroy(tr);
  esl_sq_Destroy(sq);
  esl_dmatrix_Destroy(imx);
  esl_dmatrix_Destroy(kmx);
  esl_alphabet_Destroy(abc);
  esl_randomness_Destroy(r);
  esl_getopts_Destroy(go);
  return eslOK;
}
/* A:       nxn real matrix
 * ret_Er:  RETURN: vector of eigenvalues, real part, allocated 0..n-1
 * ret_Ei:  RETURN: vector of eigenvalues, imaginary part, allocated 0..n-1
 * ret_VL:  RETURN: left eigenvectors
 * ret_VR:  RETURN: right eigenvectors
 */
int
esl_lapack_dgeev(ESL_DMATRIX *A, double **ret_Er, double **ret_Ei, ESL_DMATRIX **ret_VL, ESL_DMATRIX **ret_VR)
{
  double      *Er   = NULL;
  double      *Ei   = NULL;
  ESL_DMATRIX *VL   = NULL;
  ESL_DMATRIX *VR   = NULL;
  double      *work = NULL;
  char   jobvl, jobvr;
  int    lda;
  int    ldvl, ldvr;
  int    lwork;
  int    info;
  int    status;

  if ((VL = esl_dmatrix_Create(A->n,A->n)) == NULL)       { status = eslEMEM; goto ERROR; }
  if ((VR = esl_dmatrix_Create(A->n,A->n)) == NULL)       { status = eslEMEM; goto ERROR; }
  ESL_ALLOC(Er,   sizeof(double) * A->n);
  ESL_ALLOC(Ei,   sizeof(double) * A->n);
  ESL_ALLOC(work, sizeof(double) * 4 * A->n);

  jobvl = (ret_VL == NULL) ? 'N' : 'V';	/* do we want left eigenvectors? */
  jobvr = (ret_VR == NULL) ? 'N' : 'V'; /* do we want right eigenvectors? */
  lda   = A->n; 
  ldvl  = A->n;
  ldvr  = A->n;
  lwork = 4*A->n;

  /* Fortran convention is colxrow, not rowxcol; so transpose
   * A before passing it to a Fortran routine.
   */
  esl_dmx_Transpose(A);

  /* The actual Fortran77 interface call to LAPACK.
   * All args must be passed by reference.
   * Fortran 2D arrays are 1D: so pass the A[0] part of a DSMX.
   */
  dgeev_(&jobvl, &jobvr, &(A->n), A->mx[0], &lda, Er, Ei, VL->mx[0], &ldvl, VR->mx[0], &ldvr, work, &lwork, &info);

  /* Now, VL, VR are transposed (col x row), so transpose them back to
   * C convention.
   */
  esl_dmx_Transpose(VL);
  esl_dmx_Transpose(VR);

  if (ret_VL != NULL) *ret_VL = VL; else esl_dmatrix_Destroy(VL);
  if (ret_VR != NULL) *ret_VR = VR; else esl_dmatrix_Destroy(VR);
  if (ret_Er != NULL) *ret_Er = Er; else free(Er);
  if (ret_Ei != NULL) *ret_Ei = Ei; else free(Ei);
  free(work);
  return eslOK;

 ERROR:
  if (ret_VL != NULL) *ret_VL = NULL;
  if (ret_VR != NULL) *ret_VR = NULL;
  if (ret_Er != NULL) *ret_Er = NULL;
  if (ret_Ei != NULL) *ret_Ei = NULL;
  if (VL   != NULL) free(VL);
  if (VR   != NULL) free(VR);
  if (Er   != NULL) free(Er);
  if (Ei   != NULL) free(Ei);
  if (work != NULL) free(work);
  return status;
}
Exemple #25
0
int 
main(int argc, char **argv)
{
  ESL_GETOPTS    *go    = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage);
  ESL_ALPHABET   *abc   = esl_alphabet_Create(eslAMINO);
  char           *hmmfile = esl_opt_GetArg(go, 1);
  char           *qfile = esl_opt_GetArg(go, 2);
  ESL_SQ         *qsq   = esl_sq_CreateDigital(abc);
  ESL_SQFILE     *qfp   = NULL;
  FILE           *hmmfp = NULL;
  ESL_SCOREMATRIX *S    = esl_scorematrix_Create(abc);
  ESL_DMATRIX     *Q    = NULL;
  P7_BG           *bg   = p7_bg_Create(abc);		
  P7_HMM          *hmm  = NULL;
  double          *fa   = NULL;
  double          popen   = esl_opt_GetReal  (go, "-q");
  double          pextend = esl_opt_GetReal  (go, "-r");
  char            *mxfile = esl_opt_GetString(go, "-m");
  char            errbuf[eslERRBUFSIZE];
  double          slambda;
  int             a,b;
  int             status;

  /* Reverse engineer a scoring matrix to obtain conditional prob's
   * that we'll use for the single-seq query HMM. Because score mx is
   * symmetric, we can set up P[a][b] = P(b | a), so we can use the
   * matrix rows as HMM match emission vectors. This means dividing
   * the joint probs through by f_a.
   */
  if (mxfile == NULL) {
    if (esl_scorematrix_Set("BLOSUM62", S) != eslOK) esl_fatal("failed to set BLOSUM62 scores");
  } else {
    ESL_FILEPARSER *efp = NULL;

    if ( esl_fileparser_Open(mxfile, NULL,  &efp) != eslOK) esl_fatal("failed to open score file %s",  mxfile);
    if ( esl_scorematrix_Read(efp, abc, &S)               != eslOK) esl_fatal("failed to read matrix from %s", mxfile);
    esl_fileparser_Close(efp);
  }

  /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */
  ESL_ALLOC(fa, sizeof(double) * bg->abc->K);
  esl_vec_F2D(bg->f, bg->abc->K, fa);

  /* Backcalculate joint probabilities Q, given score matrix S and background frequencies fa */
  status = esl_scorematrix_ProbifyGivenBG(S, fa, fa, &slambda, &Q); 
  if      (status == eslEINVAL)  esl_fatal("built-in score matrix %s has no valid solution for lambda", matrix);
  else if (status == eslENOHALT) esl_fatal("failed to solve score matrix %s for lambda", matrix);
  else if (status != eslOK)      esl_fatal("unexpected error in solving score matrix %s for probability parameters", matrix);

  esl_scorematrix_JointToConditionalOnQuery(abc, Q);

  /* Open the query sequence file in FASTA format */
  status = esl_sqfile_Open(qfile, eslSQFILE_FASTA, NULL, &qfp);
  if      (status == eslENOTFOUND) esl_fatal("No such file %s.", qfile);
  else if (status == eslEFORMAT)   esl_fatal("Format of %s unrecognized.", qfile);
  else if (status == eslEINVAL)    esl_fatal("Can't autodetect stdin or .gz.");
  else if (status != eslOK)        esl_fatal("Open of %s failed, code %d.", qfile, status);

  /* Open the output HMM file */
  if ((hmmfp = fopen(hmmfile, "w")) == NULL) esl_fatal("Failed to open output HMM file %s", hmmfile);

  /* For each sequence, build a model and save it. 
   */
  while ((status = esl_sqio_Read(qfp, qsq)) == eslOK)
    {
      p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm);
      if ( p7_hmm_Validate(hmm, errbuf, 1e-5)     != eslOK) esl_fatal("HMM validation failed: %s\n", errbuf);
      if ( p7_hmmfile_WriteASCII(hmmfp, -1, hmm)  != eslOK) esl_fatal("HMM save failed");

      p7_hmm_Destroy(hmm);
    }
  if      (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s line %" PRId64 "):\n%s\n",
					    qfp->filename, qfp->linenumber, qfp->errbuf);     
  else if (status != eslEOF)     esl_fatal("Unexpected error %d reading sequence file %s",
					    status, qfp->filename);
  
  esl_dmatrix_Destroy(Q);
  esl_scorematrix_Destroy(S);
  free(fa);
  free(fb);
  esl_sq_Destroy(qsq);
  esl_sqfile_Close(qfp);
  fclose(hmmfp);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
Exemple #26
0
int 
main(int argc, char **argv)
{
  ESL_GETOPTS    *go      = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage);
  ESL_ALPHABET   *abc     = esl_alphabet_Create(eslAMINO);
  char           *qfile   = esl_opt_GetArg(go, 1);
  char           *tfile   = esl_opt_GetArg(go, 2);
  ESL_SQFILE     *qfp     = NULL;
  ESL_SQFILE     *tfp     = NULL;
  ESL_SQ         *qsq     = esl_sq_CreateDigital(abc);
  ESL_SQ         *tsq     = esl_sq_CreateDigital(abc);
  ESL_SCOREMATRIX *S      = esl_scorematrix_Create(abc);
  ESL_DMATRIX     *Q      = NULL;
  P7_BG           *bg     = p7_bg_Create(abc);
  P7_HMM          *hmm    = NULL;
  P7_PROFILE      *gm     = NULL;
  P7_REFMX        *vit    = p7_refmx_Create(200, 400); /* will grow as needed */
  double          *fa     = malloc(sizeof(double) * abc->K);
  double          popen   = 0.02;
  double          pextend = 0.4;
  double          lambda;
  float           vsc;
  float           nullsc;
  int             status;

  esl_composition_BL62(fa);
  esl_vec_D2F(fa, abc->K, bg->f);
  esl_scorematrix_Set("BLOSUM62", S);
  esl_scorematrix_ProbifyGivenBG(S, fa, fa, &lambda, &Q); 
  esl_scorematrix_JointToConditionalOnQuery(abc, Q);

  if (esl_sqfile_OpenDigital(abc, qfile, eslSQFILE_UNKNOWN, NULL, &qfp) != eslOK) esl_fatal("failed to open %s", qfile);
  if (esl_sqio_Read(qfp, qsq) != eslOK) esl_fatal("failed to read query seq");

  p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm);  
  p7_hmm_SetComposition(hmm);
  p7_hmm_SetConsensus(hmm, qsq); 
  
  gm = p7_profile_Create(hmm->M, abc);
  p7_profile_ConfigUnilocal(gm, hmm, bg, 400);
  
  if (esl_sqfile_OpenDigital(abc, tfile, eslSQFILE_UNKNOWN, NULL, &tfp) != eslOK) esl_fatal("failed to open %s", tfile);
  while ((status = esl_sqio_Read(tfp, tsq)) == eslOK)
    {
      p7_bg_SetLength     (bg, tsq->n);
      p7_profile_SetLength(gm, tsq->n);
      
      p7_ReferenceViterbi(tsq->dsq, tsq->n, gm, vit, NULL, &vsc);    
      p7_bg_NullOne(bg, tsq->dsq, tsq->n, &nullsc);

      printf("%.4f %-25s %-25s\n", (vsc - nullsc) / eslCONST_LOG2, tsq->name, gm->name);
   
      esl_sq_Reuse(tsq);
      p7_refmx_Reuse(vit);
    }
  
  p7_refmx_Destroy(vit);
  p7_profile_Destroy(gm);
  p7_hmm_Destroy(hmm);
  p7_bg_Destroy(bg);
  esl_dmatrix_Destroy(Q);
  esl_scorematrix_Destroy(S);
  free(fa);
  esl_sq_Destroy(qsq);
  esl_sq_Destroy(tsq);
  esl_sqfile_Close(qfp);
  esl_sqfile_Close(tfp);
  esl_alphabet_Destroy(abc);
  esl_getopts_Destroy(go);
  return 0;
}
Exemple #27
0
/* Function:  esl_msaweight_GSC()
 * Synopsis:  GSC weights.
 * Incept:    SRE, Fri Nov  3 13:31:14 2006 [Janelia]
 *
 * Purpose:   Given a multiple sequence alignment <msa>, calculate
 *            sequence weights according to the
 *            Gerstein/Sonnhammer/Chothia algorithm. These weights
 *            are stored internally in the <msa> object, replacing
 *            any weights that may have already been there. Weights
 *            are $\geq 0$ and they sum to <msa->nseq>.
 *            
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that distance calculations
 *            used by the GSC algorithm are robust against degenerate
 *            residue symbols.
 *
 *            This is an implementation of Gerstein et al., "A method to
 *            weight protein sequences to correct for unequal
 *            representation", JMB 236:1067-1078, 1994.
 *            
 *            The algorithm is $O(N^2)$ memory (it requires a pairwise
 *            distance matrix) and $O(N^3 + LN^2)$ time ($N^3$ for a UPGMA
 *            tree building step, $LN^2$ for distance matrix construction)
 *            for an alignment of N sequences and L columns. 
 *            
 *            In the current implementation, the actual memory
 *            requirement is dominated by two full NxN distance
 *            matrices (one tmp copy in UPGMA, and one here): for
 *            8-byte doubles, that's $16N^2$ bytes. To keep the
 *            calculation under memory limits, don't process large
 *            alignments: max 1400 sequences for 32 MB, max 4000
 *            sequences for 256 MB, max 8000 seqs for 1 GB. Watch
 *            out, because Pfam alignments can easily blow this up.
 *            
 * Note:      Memory usage could be improved. UPGMA consumes a distance
 *            matrix, but that can be D itself, not a copy, if the
 *            caller doesn't mind the destruction of D. Also, D is
 *            symmetrical, so we could use upper or lower triangular
 *            matrices if we rewrote dmatrix to allow them.
 *            
 *            I also think UPGMA can be reduced to O(N^2) time, by
 *            being more tricky about rapidly identifying the minimum
 *            element: could keep min of each row, and update that,
 *            I think.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified.  
 *
 * Throws:    <eslEINVAL> if the alignment data are somehow invalid and
 *            distance matrices can't be calculated. <eslEMEM> on an
 *            allocation error. In either case, the original <msa> is
 *            left unmodified.
 *
 * Xref:      [Gerstein94]; squid::weight.c::GSCWeights(); STL11/81.
 */
int
esl_msaweight_GSC(ESL_MSA *msa)
{
  ESL_DMATRIX *D = NULL;     /* distance matrix */
  ESL_TREE    *T = NULL;     /* UPGMA tree */
  double      *x = NULL;     /* storage per node, 0..N-2 */
  double       lw, rw;       /* total branchlen on left, right subtrees */
  double       lx, rx;	     /* distribution of weight to left, right side */
  int i;		     /* counter over nodes */
  int status;
  
  /* Contract checks
   */
  ESL_DASSERT1( (msa       != NULL) );
  ESL_DASSERT1( (msa->nseq >= 1)    );
  ESL_DASSERT1( (msa->alen >= 1)    );
  ESL_DASSERT1( (msa->wgt  != NULL) );
  if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

  /* GSC weights use a rooted tree with "branch lengths" calculated by
   * UPGMA on a fractional difference matrix - pretty crude.
   */
  if (! (msa->flags & eslMSA_DIGITAL)) {
    if ((status = esl_dst_CDiffMx(msa->aseq, msa->nseq, &D))         != eslOK) goto ERROR;
  } 
#ifdef eslAUGMENT_ALPHABET
  else {
    if ((status = esl_dst_XDiffMx(msa->abc, msa->ax, msa->nseq, &D)) != eslOK) goto ERROR;
  }
#endif

  /* oi, look out here.  UPGMA is correct, but old squid library uses
   * single linkage, so for regression tests ONLY, we use single link. 
   */
#ifdef  eslMSAWEIGHT_REGRESSION
  if ((status = esl_tree_SingleLinkage(D, &T)) != eslOK) goto ERROR; 
#else
  if ((status = esl_tree_UPGMA(D, &T)) != eslOK) goto ERROR; 
#endif
  esl_tree_SetCladesizes(T);	

  ESL_ALLOC(x, sizeof(double) * (T->N-1));
  
  /* Postorder traverse (leaves to root) to calculate the total branch
   * length under each internal node; store this in x[].  Remember the
   * total branch length (x[0]) for a future sanity check.
   */
  for (i = T->N-2; i >= 0; i--)
    {
      x[i] = T->ld[i] + T->rd[i];
      if (T->left[i]  > 0) x[i] += x[T->left[i]];
      if (T->right[i] > 0) x[i] += x[T->right[i]];
    }
  
  /* Preorder traverse (root to leaves) to calculate the weights.  Now
   * we use x[] to mean, the total weight *above* this node that we will
   * apportion to the node's left and right children. The two
   * meanings of x[] never cross: every x[] beneath x[i] is still a
   * total branch length.
   *
   * Because the API guarantees that msa is returned unmodified in case
   * of an exception, and we're touching msa->wgt here, no exceptions
   * may be thrown from now on in this function.
   */
  x[0] = 0;			/* initialize: no branch to the root. */
  for (i = 0; i <= T->N-2; i++)
    {
      lw = T->ld[i];   if (T->left[i]  > 0) lw += x[T->left[i]];
      rw = T->rd[i];   if (T->right[i] > 0) rw += x[T->right[i]];

      if (lw+rw == 0.) 
	{
	  /* A special case arises in GSC weights when all branch lengths in a subtree are 0.
	   * In this case, all seqs in this clade should get equal weights, sharing x[i] equally.
           * So, split x[i] in proportion to cladesize, not to branch weight.
	   */
	  if (T->left[i] > 0)  lx =  x[i] * ((double) T->cladesize[T->left[i]]  / (double) T->cladesize[i]);
	  else                 lx =  x[i] / (double) T->cladesize[i];

	  if (T->right[i] > 0) rx =  x[i] * ((double) T->cladesize[T->right[i]] / (double) T->cladesize[i]);
	  else                 rx =  x[i] / (double) T->cladesize[i];
	} 
      else /* normal case: x[i] split in proportion to branch weight. */
	{
	  lx = x[i] * lw/(lw+rw);
	  rx = x[i] * rw/(lw+rw);
	}
      
      if (T->left[i]  <= 0) msa->wgt[-(T->left[i])] = lx + T->ld[i];
      else                  x[T->left[i]] = lx + T->ld[i];

      if (T->right[i] <= 0) msa->wgt[-(T->right[i])] = rx + T->rd[i];
      else                  x[T->right[i]] = rx + T->rd[i];
    } 

  /* Renormalize weights to sum to N.
   */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
  msa->flags |= eslMSA_HASWGTS;

  free(x);
  esl_tree_Destroy(T);
  esl_dmatrix_Destroy(D);
  return eslOK;

 ERROR:
  if (x != NULL) free(x);
  if (T != NULL) esl_tree_Destroy(T);
  if (D != NULL) esl_dmatrix_Destroy(D);
  return status;
}