/* Function:  esl_dst_CDiffMx()
 * Synopsis:  Pairwise fractional difference matrix for aligned text seqs.
 *
 * Purpose:   Obtain the NxN fractional identity matrix for the aligned
 *            text sequences <as> from <esl_dst_CPairIdMx()>, then
 *            convert it in place to fractional differences <d = 1-s>.
 *            Diagonal cells are set to 0, and each off-diagonal value
 *            is written to both [i][j] and [j][i].
 *
 * Args:      as     - aligned seqs (all same length), [0..N-1]
 *            N      - # of aligned sequences
 *            ret_D  - RETURN: symmetric fractional difference matrix;
 *                     may be <NULL>, in which case the matrix is freed
 *
 * Returns:   <eslOK> on success, and <*ret_D> holds the matrix. Caller
 *            frees it with <esl_dmatrix_Destroy()>.
 *
 * Throws:    Any non-OK status from <esl_dst_CPairIdMx()> is passed
 *            through unchanged (documented upstream as <eslEINVAL> for
 *            sequences of unequal length). On failure <*ret_D> is <NULL>.
 */
int
esl_dst_CDiffMx(char **as, int N, ESL_DMATRIX **ret_D)
{
  ESL_DMATRIX *diff = NULL;
  int          row, col;
  int          status;

  if ((status = esl_dst_CPairIdMx(as, N, &diff)) != eslOK) goto ERROR;

  for (row = 0; row < N; row++)
    {
      diff->mx[row][row] = 0.;
      for (col = row+1; col < N; col++)
        {
          double d = 1. - diff->mx[row][col];   /* identity -> difference */

          diff->mx[row][col] = d;
          diff->mx[col][row] = d;
        }
    }

  if (ret_D == NULL) esl_dmatrix_Destroy(diff);
  else               *ret_D = diff;
  return eslOK;

 ERROR:
  if (diff  != NULL) esl_dmatrix_Destroy(diff);
  if (ret_D != NULL) *ret_D = NULL;
  return status;
}
/* Function:  esl_dst_XDiffMx()
 * Synopsis:  Pairwise fractional difference matrix for aligned digital seqs.
 *
 * Purpose:   Digital-sequence counterpart of <esl_dst_CDiffMx()>. Gets
 *            the NxN fractional identity matrix from
 *            <esl_dst_XPairIdMx()> and converts each pair's identity
 *            <s> to a difference <1-s> in place. Diagonal cells are
 *            set to 0; off-diagonal cells are mirrored.
 *
 * Args:      abc    - digital alphabet in use
 *            ax     - aligned dsq's, [0..N-1][1..alen]
 *            N      - number of aligned sequences
 *            ret_D  - RETURN: NxN matrix of fractional differences;
 *                     may be <NULL>, in which case the matrix is freed
 *
 * Returns:   <eslOK> on success, and <*ret_D> holds the matrix. Caller
 *            frees it with <esl_dmatrix_Destroy()>.
 *
 * Throws:    Any non-OK status from <esl_dst_XPairIdMx()> is passed
 *            through unchanged (documented upstream as <eslEINVAL> for
 *            sequences of unequal length). On failure <*ret_D> is <NULL>.
 */
int
esl_dst_XDiffMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **ret_D)
{
  ESL_DMATRIX *diff = NULL;
  int          row, col;
  int          status;

  if ((status = esl_dst_XPairIdMx(abc, ax, N, &diff)) != eslOK) goto ERROR;

  for (row = 0; row < N; row++)
    {
      diff->mx[row][row] = 0.;
      for (col = row+1; col < N; col++)
        {
          double d = 1. - diff->mx[row][col];   /* identity -> difference */

          diff->mx[row][col] = d;
          diff->mx[col][row] = d;
        }
    }

  if (ret_D == NULL) esl_dmatrix_Destroy(diff);
  else               *ret_D = diff;
  return eslOK;

 ERROR:
  if (diff  != NULL) esl_dmatrix_Destroy(diff);
  if (ret_D != NULL) *ret_D = NULL;
  return status;
}
/* Function:  esl_dst_XPairIdMx()
 * Synopsis:  NxN identity matrix for N aligned digital seqs.
 * Incept:    SRE, Thu Apr 27 09:08:11 2006 [New York]
 *
 * Purpose:   Given a digitized multiple sequence alignment <ax>, consisting
 *            of <N> aligned digital sequences in alphabet <abc>; calculate
 *            a symmetric pairwise fractional identity matrix by $N(N-1)/2$
 *            calls to <esl_dst_XPairId()>, and return it in <ret_S>.
 *            Diagonal cells are set to 1.
 *
 * Args:      abc   - digital alphabet in use
 *            ax    - aligned dsq's, [0..N-1][1..alen]
 *            N     - number of aligned sequences
 *            ret_S - RETURN: NxN matrix of fractional identities;
 *                    may be <NULL>, in which case the matrix is freed
 *
 * Returns:   <eslOK> on success, and <ret_S> contains the identity
 *            matrix. Caller is obligated to free <S> with
 *            <esl_dmatrix_Destroy()>.
 *
 * Throws:    <eslEMEM> if the matrix cannot be allocated.
 *            Any non-OK status from <esl_dst_XPairId()> (documented as
 *            <eslEINVAL> if a seq has a different length than others).
 *            On failure, <ret_S> is returned <NULL> and state of inputs
 *            is unchanged.
 */
int
esl_dst_XPairIdMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int N, ESL_DMATRIX **ret_S)
{
  int          status;
  ESL_DMATRIX *S = NULL;
  int          i,j;

  /* esl_dmatrix_Create() signals failure by returning NULL; it does not
   * set our <status>, so assign it before jumping to cleanup. Without
   * this, an indeterminate value would be returned to the caller.
   */
  if (( S = esl_dmatrix_Create(N,N) ) == NULL) { status = eslEMEM; goto ERROR; }

  for (i = 0; i < N; i++)
    {
      S->mx[i][i] = 1.;
      for (j = i+1; j < N; j++)
        {
          status = esl_dst_XPairId(abc, ax[i], ax[j], &(S->mx[i][j]), NULL, NULL);
          if (status != eslOK)
            ESL_XEXCEPTION(status, "Pairwise identity calculation failed at seqs %d,%d\n", i,j);
          S->mx[j][i] = S->mx[i][j];
        }
    }

  if (ret_S != NULL) *ret_S = S; else esl_dmatrix_Destroy(S);
  return eslOK;

 ERROR:
  if (S     != NULL) esl_dmatrix_Destroy(S);
  if (ret_S != NULL) *ret_S = NULL;
  return status;
}
/* utest_CJukesCantorMx()
 * Crash test only: runs esl_dst_CJukesCantorMx() on the text alignment
 * <as> (<N> seqs, alphabet size <K>), aborts if it does not return
 * <eslOK>, and frees both matrices it produced. No values are checked.
 */
static int
utest_CJukesCantorMx(int K, char **as, int N)
{
  ESL_DMATRIX *dist;
  ESL_DMATRIX *var;
  int          status;

  status = esl_dst_CJukesCantorMx(K, as, N, &dist, &var);
  if (status != eslOK) abort();

  esl_dmatrix_Destroy(dist);
  esl_dmatrix_Destroy(var);
  return eslOK;
}
int main(int argc, char **argv) { ESL_MSAFILE *afp; ESL_MSA *msa; ESL_DMATRIX *P; int status; int i,j; double min, avg, max; esl_msafile_Open(argv[1], eslMSAFILE_UNKNOWN, NULL, &afp); esl_msa_Read(afp, &msa); esl_dst_CPairIdMx(msa->aseq, msa->nseq, &P); min = 1.0; max = 0.0; avg = 0.0; for (i = 0; i < msa->nseq; i++) for (j = i+1; j < msa->nseq; j++) { avg += P->mx[i][j]; if (P->mx[i][j] < min) min = P->mx[i][j]; if (P->mx[i][j] > max) max = P->mx[i][j]; } avg /= (double) (msa->nseq * (msa->nseq-1) / 2); printf("Average pairwise %% id: %.1f%%\n", avg * 100.); printf("Minimum pairwise %% id: %.1f%%\n", min * 100.); printf("Maximum pairwise %% id: %.1f%%\n", max * 100.); esl_dmatrix_Destroy(P); esl_msa_Destroy(msa); esl_msafile_Close(afp); return 0; }
/* Function: esl_dmatrix_Create() * * Purpose: Creates a general <n> x <m> matrix (<n> rows, <m> * columns). * * Args: <n> - number of rows; $>= 1$ * <m> - number of columns; $>= 1$ * * Returns: a pointer to a new <ESL_DMATRIX> object. Caller frees * with <esl_dmatrix_Destroy()>. * * Throws: <NULL> if an allocation failed. */ ESL_DMATRIX * esl_dmatrix_Create(int n, int m) { ESL_DMATRIX *A = NULL; int r; int status; ESL_ALLOC(A, sizeof(ESL_DMATRIX)); A->mx = NULL; A->n = n; A->m = m; ESL_ALLOC(A->mx, sizeof(double *) * n); A->mx[0] = NULL; ESL_ALLOC(A->mx[0], sizeof(double) * n * m); for (r = 1; r < n; r++) A->mx[r] = A->mx[0] + r*m; A->type = eslGENERAL; A->ncells = n * m; return A; ERROR: esl_dmatrix_Destroy(A); return NULL; }
/* utest_XDiffMx()
 * Computes the difference matrix for the same alignment twice -- from
 * the digital form <ax> via esl_dst_XDiffMx() and from the text form
 * <as> via esl_dst_CDiffMx() -- and aborts unless every upper-triangle
 * cell (diagonal included) of the first agrees with the transposed cell
 * of the second to within 0.01.
 */
static int
utest_XDiffMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
  ESL_DMATRIX *Dx;              /* from digital sequences */
  ESL_DMATRIX *Dc;              /* from text sequences    */
  int          row, col;

  if (esl_dst_XDiffMx(abc, ax, N, &Dx) != eslOK) abort();
  if (esl_dst_CDiffMx(as, N, &Dc)      != eslOK) abort();

  for (row = 0; row < N; row++)
    {
      for (col = row; col < N; col++)
        {
          double delta = fabs(Dx->mx[row][col] - Dc->mx[col][row]);

          if (delta > 0.01) abort();
        }
    }

  esl_dmatrix_Destroy(Dx);
  esl_dmatrix_Destroy(Dc);
  return eslOK;
}
int main(void) { ESL_STOPWATCH *w = NULL; ESL_DMATRIX *Q = NULL; ESL_DMATRIX *P = NULL; double t = 5.0; int esl_iterations = 100; int i; #ifdef HAVE_LIBGSL gsl_matrix *Qg = NULL; gsl_matrix *Pg = NULL; int gsl_iterations = 100; #endif w = esl_stopwatch_Create(); Q = esl_dmatrix_Create(20, 20); P = esl_dmatrix_Create(20, 20); esl_rmx_SetWAG(Q, NULL); esl_stopwatch_Start(w); for (i = 0; i < esl_iterations; i++) esl_dmx_Exp(Q, t, P); esl_stopwatch_Stop(w); printf("Easel takes: %g sec\n", w->user / (double) esl_iterations); #ifdef HAVE_LIBGSL if (esl_dmx_MorphGSL(Q, &Qg) != eslOK) esl_fatal("morph to gsl_matrix failed"); if ((Pg = gsl_matrix_alloc(20, 20)) == NULL) esl_fatal("gsl alloc failed"); gsl_matrix_scale(Qg, t); esl_stopwatch_Start(w); for (i = 0; i < gsl_iterations; i++) gsl_linalg_exponential_ss(Qg, Pg, GSL_PREC_DOUBLE); esl_stopwatch_Stop(w); printf(" GSL takes: %g sec\n", w->user / (double) gsl_iterations); gsl_matrix_free(Qg); gsl_matrix_free(Pg); #endif /*HAVE_LIBGSL*/ esl_dmatrix_Destroy(Q); esl_dmatrix_Destroy(P); esl_stopwatch_Destroy(w); return 0; }
/* utest_XPairIdMx()
 * Computes the pairwise identity matrix for the same alignment twice --
 * from the digital form <ax> via esl_dst_XPairIdMx() and from the text
 * form <as> via esl_dst_CPairIdMx() -- and aborts unless every
 * upper-triangle cell (diagonal included) of the first agrees with the
 * transposed cell of the second to within 0.01.
 */
static int
utest_XPairIdMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
  ESL_DMATRIX *Sx;              /* from digital sequences */
  ESL_DMATRIX *Sc;              /* from text sequences    */
  int          row, col;

  if (esl_dst_XPairIdMx(abc, ax, N, &Sx) != eslOK) abort();
  if (esl_dst_CPairIdMx(as, N, &Sc)      != eslOK) abort();

  for (row = 0; row < N; row++)
    {
      for (col = row; col < N; col++)
        {
          double delta = fabs(Sx->mx[row][col] - Sc->mx[col][row]);

          if (delta > 0.01) abort();
        }
    }

  esl_dmatrix_Destroy(Sx);
  esl_dmatrix_Destroy(Sc);
  return eslOK;
}
/* Function:  p7_builder_SetScoreSystem()
 * Synopsis:  Initialize score system for single sequence queries.
 *
 * Purpose:   Initialize the builder <bld> to be able to parameterize
 *            single sequence queries, using a substitution matrix
 *            from a file.
 *
 *            Read a standard substitution score matrix from file
 *            <mxfile>. If <mxfile> is <NULL>, default to BLOSUM62
 *            scores when the builder's alphabet is <eslAMINO>, and to
 *            the built-in "DNA1" matrix otherwise. If <mxfile> is "-",
 *            read score matrix from <stdin> stream. If <env> is
 *            non-<NULL> and <mxfile> is not found in the current
 *            working directory, look for <mxfile> in colon-delimited
 *            directory list contained in environment variable <env>.
 *
 *            Set the gap-open and gap-extend probabilities to
 *            <popen>, <pextend>, respectively.
 *
 *            Use background residue frequencies in the null model
 *            <bg> to convert substitution matrix scores to
 *            conditional probability parameters.
 *
 *            Any score system previously held by <bld> is destroyed
 *            first, and <bld->S>, <bld->Q> are reset to <NULL> so a
 *            later failure cannot leave them pointing at freed memory.
 *
 * Args:      bld      - <P7_BUILDER> to initialize
 *            mxfile   - score matrix file to use, or NULL for the default
 *            env      - env variable containing directory list where <mxfile> may reside
 *            popen    - gap open probability
 *            pextend  - gap extend probability
 *            bg       - null model, containing background frequencies
 *
 * Returns:   <eslOK> on success.
 *
 *            <eslENOTFOUND> if <mxfile> can't be found or opened, even
 *            in any of the directories specified by the <env> variable.
 *
 *            <eslEINVAL> if the score matrix can't be converted into
 *            conditional probabilities; for example, if it has no valid
 *            solution for <lambda>.
 *
 *            Other non-OK codes from opening or reading <mxfile>, or
 *            from setting a built-in matrix, are passed through.
 *
 *            On file and conversion errors, <bld->errbuf> contains a
 *            useful error message for the user.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_builder_SetScoreSystem(P7_BUILDER *bld, const char *mxfile, const char *env, double popen, double pextend, P7_BG *bg)
{
  ESL_FILEPARSER *efp     = NULL;
  double         *f       = NULL;
  const char     *mxname  = mxfile;     /* name for error messages; set to the built-in's name when <mxfile> is NULL */
  double          slambda;
  int             status;

  bld->errbuf[0] = '\0';

  /* If a score system is already set, delete it. Clear the pointers too:
   * if we fail before they are reassigned, p7_builder_Destroy() must not
   * free them a second time.
   */
  if (bld->S != NULL) { esl_scorematrix_Destroy(bld->S); bld->S = NULL; }
  if (bld->Q != NULL) { esl_dmatrix_Destroy(bld->Q);     bld->Q = NULL; }

  /* Get the scoring matrix */
  if (mxfile == NULL)
    {
      mxname = (bld->abc->type == eslAMINO) ? "BLOSUM62" : "DNA1";

      if ((bld->S = esl_scorematrix_Create(bld->abc)) == NULL) { status = eslEMEM; goto ERROR; }
      if ((status = esl_scorematrix_Set(mxname, bld->S)) != eslOK) goto ERROR;
    }
  else
    {
      /* esl_scorematrix_Read() hands back its own newly allocated matrix
       * through its last argument, so nothing is preallocated here; a
       * matrix created beforehand would be overwritten and leaked.
       */
      if ((status = esl_fileparser_Open(mxfile, env, &efp)) != eslOK)
        ESL_XFAIL(status, bld->errbuf, "Failed to find or open matrix file %s", mxfile);
      if ((status = esl_scorematrix_Read(efp, bld->abc, &(bld->S))) != eslOK)
        ESL_XFAIL(status, bld->errbuf, "Failed to read matrix from %s:\n%s", mxfile, efp->errbuf);
      esl_fileparser_Close(efp);
      efp = NULL;
    }

  /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */
  ESL_ALLOC(f, sizeof(double) * bg->abc->K);
  esl_vec_F2D(bg->f, bg->abc->K, f);

  /* Backcalculate joint probability matrix Q, given scores S and background freqs bg->f.
   * Messages use <mxname>, never <mxfile>: <mxfile> may be NULL here,
   * and passing NULL for a %s conversion is undefined behavior.
   */
  status = esl_scorematrix_ProbifyGivenBG(bld->S, f, f, &slambda, &(bld->Q));
  if      (status == eslEINVAL)  ESL_XFAIL(eslEINVAL, bld->errbuf, "input score matrix %s has no valid solution for lambda", mxname);
  else if (status == eslENOHALT) ESL_XFAIL(eslEINVAL, bld->errbuf, "failed to solve input score matrix %s for lambda: are you sure it's valid?", mxname);
  else if (status != eslOK)      ESL_XFAIL(eslEINVAL, bld->errbuf, "unexpected error in solving input score matrix %s for probability parameters", mxname);

  /* Convert joint probabilities P(ab) to conditionals P(b|a) */
  esl_scorematrix_JointToConditionalOnQuery(bld->abc, bld->Q);

  bld->popen   = popen;
  bld->pextend = pextend;
  free(f);
  return eslOK;

 ERROR:
  if (efp) esl_fileparser_Close(efp);
  if (f)   free(f);
  return status;
}
/* Function:  p7_builder_Destroy()
 * Synopsis:  Free a <P7_BUILDER>
 *
 * Purpose:   Release the builder <bld> together with the prior,
 *            random number generator, joint probability matrix, and
 *            score matrix it holds, each only if non-<NULL>. A <NULL>
 *            <bld> is accepted and ignored.
 */
void
p7_builder_Destroy(P7_BUILDER *bld)
{
  if (bld != NULL)
    {
      if (bld->prior) p7_prior_Destroy(bld->prior);
      if (bld->r)     esl_randomness_Destroy(bld->r);
      if (bld->Q)     esl_dmatrix_Destroy(bld->Q);
      if (bld->S)     esl_scorematrix_Destroy(bld->S);

      free(bld);
    }
}
/* utest_XJukesCantorMx()
 * Computes Jukes/Cantor distance and variance matrices for the same
 * alignment from its digital form <ax> (esl_dst_XJukesCantorMx()) and
 * from its text form <as> (esl_dst_CJukesCantorMx(), with alphabet size
 * <abc->K>). Aborts unless, over the upper triangle including the
 * diagonal, each distance and each variance agrees with the transposed
 * cell of the other calculation to within 0.01.
 */
static int
utest_XJukesCantorMx(ESL_ALPHABET *abc, char **as, ESL_DSQ **ax, int N)
{
  ESL_DMATRIX *Dx, *Vx;         /* digital-mode distances, variances */
  ESL_DMATRIX *Dc, *Vc;         /* text-mode distances, variances    */
  int          row, col;

  if (esl_dst_XJukesCantorMx(abc, ax, N, &Dx, &Vx)    != eslOK) abort();
  if (esl_dst_CJukesCantorMx(abc->K, as, N, &Dc, &Vc) != eslOK) abort();

  for (row = 0; row < N; row++)
    {
      for (col = row; col < N; col++)
        {
          if (fabs(Dx->mx[row][col] - Dc->mx[col][row]) > 0.01 ||
              fabs(Vx->mx[row][col] - Vc->mx[col][row]) > 0.01)
            abort();
        }
    }

  esl_dmatrix_Destroy(Dx);
  esl_dmatrix_Destroy(Dc);
  esl_dmatrix_Destroy(Vx);
  esl_dmatrix_Destroy(Vc);
  return eslOK;
}
int main(void) { char errbuf[eslERRBUFSIZE]; char *alphabet = "ACDEFGHIKLMNPQRSTVWY"; ESL_DMATRIX *Q = NULL; ESL_DMATRIX *P = NULL; gsl_matrix *Qg = NULL; gsl_matrix *Pg = NULL; ESL_DMATRIX *Pge = NULL; double t = 15.0; if ((Q = esl_dmatrix_Create(20, 20)) == NULL) esl_fatal("malloc failed"); if ((P = esl_dmatrix_Create(20, 20)) == NULL) esl_fatal("malloc failed"); if (esl_rmx_SetWAG(Q, NULL) != eslOK) esl_fatal("_SetWAG() failed"); if (esl_rmx_ValidateQ(Q, 0.0001, errbuf) != eslOK) esl_fatal("Q validation failed: %s", errbuf); if (esl_dmx_Exp(Q, t, P) != eslOK) esl_fatal("matrix exponentiation failed"); if (esl_rmx_ValidateP(P, 0.0001, errbuf) != eslOK) esl_fatal("P validation failed: %s", errbuf); if (esl_dmx_MorphGSL(Q, &Qg) != eslOK) esl_fatal("morph to gsl_matrix failed"); if ((Pg = gsl_matrix_alloc(20, 20)) == NULL) esl_fatal("gsl alloc failed"); gsl_matrix_scale(Qg, t); if (gsl_linalg_exponential_ss(Qg, Pg, GSL_PREC_DOUBLE) != 0) esl_fatal("gsl's exponentiation failed"); if (esl_dmx_UnmorphGSL(Pg, &Pge) != eslOK) esl_fatal("morph from gsl_matrix failed"); esl_dmatrix_Dump(stdout, P, alphabet, alphabet); if (esl_dmatrix_Compare(Pge, P, 0.00001) != eslOK) esl_fatal("whoops, different answers."); esl_dmatrix_Destroy(Q); esl_dmatrix_Destroy(P); esl_dmatrix_Destroy(Pge); gsl_matrix_free(Qg); gsl_matrix_free(Pg); return 0; }
/* Function:  esl_dst_XJukesCantorMx()
 * Synopsis:  NxN Jukes/Cantor distance matrix for N aligned digital seqs.
 * Incept:    SRE, Thu Apr 27 08:38:08 2006 [New York City]
 *
 * Purpose:   Given a digitized multiple sequence alignment <ax>,
 *            consisting of <nseq> aligned digital sequences in
 *            bioalphabet <abc>, calculate a symmetric Jukes/Cantor
 *            pairwise distance matrix for all sequence pairs;
 *            optionally return the distance matrix in <opt_D> and
 *            a matrix of the large-sample variances for those ML distance
 *            estimates in <opt_V>.
 *
 *            Infinite distances (and variances) are possible. They
 *            are represented as <HUGE_VAL> in <D> and <V>. Caller must
 *            be prepared to deal with them as appropriate.
 *
 * Args:      abc    - bioalphabet for <ax>
 *            ax     - aligned digital sequences [0..nseq-1][1..L]
 *            nseq   - number of aseqs
 *            opt_D  - optRETURN: [0..nseq-1]x[0..nseq-1] symmetric distance mx
 *            opt_V  - optRETURN: matrix of variances.
 *
 * Returns:   <eslOK> on success. <D> and <V>, where requested, contain
 *            the distance matrix and variances; a matrix whose output
 *            pointer is <NULL> is freed internally. Caller frees the
 *            returned matrices with <esl_dmatrix_Destroy()>.
 *
 * Throws:    <eslEMEM> if either matrix cannot be allocated.
 *            <eslEINVAL> if any pair of sequences have differing lengths
 *            (and thus cannot have been properly aligned).
 *            <eslEDIVZERO> if some pair of sequences had no aligned
 *            residues. On failure, <D> and <V> are both returned <NULL>
 *            and state of inputs is unchanged.
 */
int
esl_dst_XJukesCantorMx(const ESL_ALPHABET *abc, ESL_DSQ **ax, int nseq,
                       ESL_DMATRIX **opt_D, ESL_DMATRIX **opt_V)
{
  ESL_DMATRIX *D = NULL;
  ESL_DMATRIX *V = NULL;
  int          status;
  int          i,j;

  /* esl_dmatrix_Create() reports failure only by returning NULL, so
   * <status> must be set explicitly; otherwise the ERROR path would
   * return an indeterminate value.
   */
  if (( D = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }
  if (( V = esl_dmatrix_Create(nseq, nseq) ) == NULL) { status = eslEMEM; goto ERROR; }

  for (i = 0; i < nseq; i++)
    {
      D->mx[i][i] = 0.;
      V->mx[i][i] = 0.;
      for (j = i+1; j < nseq; j++)
        {
          status = esl_dst_XJukesCantor(abc, ax[i], ax[j],
                                        &(D->mx[i][j]), &(V->mx[i][j]));
          if (status != eslOK)
            ESL_XEXCEPTION(status, "J/C calculation failed at digital aseqs %d,%d", i,j);

          D->mx[j][i] = D->mx[i][j];
          V->mx[j][i] = V->mx[i][j];
        }
    }

  if (opt_D != NULL) *opt_D = D; else esl_dmatrix_Destroy(D);
  if (opt_V != NULL) *opt_V = V; else esl_dmatrix_Destroy(V);
  return eslOK;

 ERROR:
  if (D     != NULL) esl_dmatrix_Destroy(D);
  if (V     != NULL) esl_dmatrix_Destroy(V);
  if (opt_D != NULL) *opt_D = NULL;
  if (opt_V != NULL) *opt_V = NULL;
  return status;
}
static void utest_SetWAG(void) { char errbuf[eslERRBUFSIZE]; ESL_DMATRIX *Q = NULL; ESL_DMATRIX *P = NULL; double t = 50.0; /* sufficiently large to drive e^tQ to stationarity */ double pi[20]; int i; if ((Q = esl_dmatrix_Create(20, 20)) == NULL) esl_fatal("malloc failed"); if ((P = esl_dmatrix_Create(20, 20)) == NULL) esl_fatal("malloc failed"); /* This tests that exponentiating WAG gives a stable conditional * probability matrix solution. (It doesn't particularly test that * WAG was set correctly, but how could we have screwed that up?) */ if (esl_rmx_SetWAG(Q, NULL) != eslOK) esl_fatal("_SetWAG() failed"); if (esl_dmx_Exp(Q, t, P) != eslOK) esl_fatal("matrix exponentiation failed"); if (esl_rmx_ValidateP(P, 1e-7, errbuf) != eslOK) esl_fatal("P validation failed: %s", errbuf); if (esl_rmx_ValidateQ(Q, 1e-7, errbuf) != eslOK) esl_fatal("Q validation failed: %s", errbuf); /* This tests setting WAG to different stationary pi's than default, * then tests that exponentiating to large t reaches those stationaries. */ esl_vec_DSet(pi, 20, 0.05); if (esl_rmx_SetWAG(Q, pi) != eslOK) esl_fatal("_SetWAG() failed"); if (esl_dmx_Exp(Q, t, P) != eslOK) esl_fatal("matrix exponentiation failed"); if (esl_rmx_ValidateP(P, 1e-7, errbuf) != eslOK) esl_fatal("P validation failed: %s", errbuf); if (esl_rmx_ValidateQ(Q, 1e-7, errbuf) != eslOK) esl_fatal("Q validation failed: %s", errbuf); for (i = 0; i < 20; i++) if (esl_vec_DCompare(P->mx[i], pi, 20, 1e-7) != eslOK) esl_fatal("P didn't converge to right pi's"); esl_dmatrix_Destroy(Q); esl_dmatrix_Destroy(P); return; }
/* Function:  p7_builder_LoadScoreSystem()
 * Synopsis:  Load a standard score system for single sequence queries.
 *
 * Purpose:   Initialize the builder <bld> to be able to parameterize
 *            single sequence queries, using the standard (built-in) score
 *            matrix named <matrix>.
 *
 *            Available score matrices include PAM30, 70, 120, and 240;
 *            and BLOSUM45, 50, 62, 80, and 90. See <esl_scorematrix.c>.
 *
 *            Set the gap-open and gap-extend probabilities to
 *            <popen>, <pextend>, respectively.
 *
 *            Use background residue frequencies in the null model
 *            <bg> to convert substitution matrix scores to
 *            conditional probability parameters.
 *
 *            Any score system previously held by <bld> is destroyed
 *            first, and <bld->S>, <bld->Q> are reset to <NULL> so a
 *            later failure cannot leave them pointing at freed memory.
 *
 * Args:      bld      - <P7_BUILDER> to initialize
 *            matrix   - name of the built-in score matrix to use
 *            popen    - gap open probability
 *            pextend  - gap extend probability
 *            bg       - null model, containing background frequencies
 *
 * Returns:   <eslOK> on success.
 *
 *            <eslENOTFOUND> if no built-in matrix named <matrix> is
 *            available. Any other non-OK code from setting the matrix
 *            is passed through.
 *
 *            <eslEINVAL> if the score matrix can't be converted into
 *            conditional probabilities; for example, if it has no valid
 *            solution for <lambda>.
 *
 *            On these errors, <bld->errbuf> contains a useful error
 *            message for the user.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_builder_LoadScoreSystem(P7_BUILDER *bld, const char *matrix, double popen, double pextend, P7_BG *bg)
{
  double *f = NULL;
  double  slambda;
  int     status;

  bld->errbuf[0] = '\0';

  /* If a score system is already set, delete it. Clear the pointers too:
   * if we fail before they are reassigned, p7_builder_Destroy() must not
   * free them a second time.
   */
  if (bld->S != NULL) { esl_scorematrix_Destroy(bld->S); bld->S = NULL; }
  if (bld->Q != NULL) { esl_dmatrix_Destroy(bld->Q);     bld->Q = NULL; }

  /* Get the scoring matrix */
  if ((bld->S = esl_scorematrix_Create(bld->abc)) == NULL) { status = eslEMEM; goto ERROR; }
  status = esl_scorematrix_Set(matrix, bld->S);
  if      (status == eslENOTFOUND) ESL_XFAIL(status, bld->errbuf, "no matrix named %s is available as a built-in", matrix);
  else if (status != eslOK)        ESL_XFAIL(status, bld->errbuf, "failed to set score matrix %s as a built-in",   matrix);

  /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */
  ESL_ALLOC(f, sizeof(double) * bg->abc->K);
  esl_vec_F2D(bg->f, bg->abc->K, f);

  /* Backcalculate joint probability matrix Q, given scores S and background freqs bg->f.  */
  /* Failures shouldn't happen here: these are standard matrices.  */
  status = esl_scorematrix_ProbifyGivenBG(bld->S, f, f, &slambda, &(bld->Q));
  if      (status == eslEINVAL)  ESL_XFAIL(eslEINVAL, bld->errbuf, "built-in score matrix %s has no valid solution for lambda", matrix);
  else if (status == eslENOHALT) ESL_XFAIL(eslEINVAL, bld->errbuf, "failed to solve score matrix %s for lambda", matrix);
  else if (status != eslOK)      ESL_XFAIL(eslEINVAL, bld->errbuf, "unexpected error in solving score matrix %s for probability parameters", matrix);

  /* Convert joint probabilities P(ab) to conditionals P(b|a) */
  esl_scorematrix_JointToConditionalOnQuery(bld->abc, bld->Q);

  bld->popen   = popen;
  bld->pextend = pextend;
  free(f);
  return eslOK;

 ERROR:
  if (f) free(f);
  return status;
}
static void utest_Diagonalization(void) { ESL_DMATRIX *P = NULL; ESL_DMATRIX *P2 = NULL; ESL_DMATRIX *C = NULL; ESL_DMATRIX *D = NULL; double *lambda = NULL; /* eigenvalues */ ESL_DMATRIX *U = NULL; /* left eigenvectors */ ESL_DMATRIX *Ui = NULL; /* inverse of U */ int i,j; /* Create a J/C probability matrix for t=1: * 1/4 + 3/4 e^{-4/3 at} * 1/4 - 1/4 e^{-4/3 at} */ if ((P = esl_dmatrix_Create(4, 4)) == NULL) esl_fatal("malloc failed"); if ((C = esl_dmatrix_Create(4, 4)) == NULL) esl_fatal("malloc failed"); if ((Ui = esl_dmatrix_Create(4, 4)) == NULL) esl_fatal("malloc failed"); if ((D = esl_dmatrix_Create(4, 4)) == NULL) esl_fatal("malloc failed"); if ((P2 = esl_dmatrix_Create(4, 4)) == NULL) esl_fatal("malloc failed"); for (i = 0; i < 4; i++) for (j = 0; j < 4; j++) if (i == j) P->mx[i][j] = 0.25 + 0.75 * exp(-4./3.); else P->mx[i][j] = 0.25 - 0.25 * exp(-4./3.); /* Diagonalize it */ if (esl_dmx_Diagonalize(P, &lambda, NULL, &U, NULL) != eslOK) esl_fatal("diagonalization failed"); /* Calculate P^k by U [diag(lambda_i)]^k U^{-1} */ esl_dmatrix_SetZero(D); for (i = 0; i < P->n; i++) D->mx[i][i] = lambda[i]; esl_dmx_Invert(U, Ui); esl_dmx_Multiply(U, D, C); esl_dmx_Multiply(C, Ui, P2); if (esl_dmatrix_Compare(P, P2, 1e-7) != eslOK) esl_fatal("diagonalization unit test failed"); free(lambda); esl_dmatrix_Destroy(P2); esl_dmatrix_Destroy(Ui); esl_dmatrix_Destroy(U); esl_dmatrix_Destroy(D); esl_dmatrix_Destroy(C); esl_dmatrix_Destroy(P); return; }
/* utest_CPairIdMx()
 * Runs esl_dst_CPairIdMx() on the text alignment <as> and aborts unless
 *   - every diagonal cell is exactly 1.0, and
 *   - the mean identity over pairs i<j with i >= 3 is not below 0.15
 *     and not above 0.35.
 * The first three sequences are excluded from the mean; the original
 * author's note gives 0.25 as the expected value -- presumably a
 * property of the test alignment built by the caller.
 */
static int
utest_CPairIdMx(char **as, int N)
{
  ESL_DMATRIX *S;
  int          a, b;
  int          npairs;
  double       mean;

  if (esl_dst_CPairIdMx(as, N, &S) != eslOK) abort();

  a = 0;
  while (a < N)
    {
      if (S->mx[a][a] != 1.0) abort();
      a++;
    }

  mean = 0.;
  for (a = 3; a < N; a++)
    {
      for (b = a+1; b < N; b++)
        mean += S->mx[a][b];
    }
  npairs = (N-3) * (N-4) / 2;   /* first 3 don't count */
  mean  /= (double) npairs;

  if (mean < 0.15 || mean > 0.35) abort();

  esl_dmatrix_Destroy(S);
  return eslOK;
}
/* utest_CDiffMx()
 * Runs esl_dst_CDiffMx() on the text alignment <as> and aborts unless
 *   - every diagonal cell is exactly 0.0, and
 *   - the mean difference over pairs i<j with i >= 3 is not below 0.65
 *     and not above 0.85.
 * The first three sequences are excluded from the mean; the original
 * author's note gives 0.75 as the expected value -- presumably a
 * property of the test alignment built by the caller.
 */
static int
utest_CDiffMx(char **as, int N)
{
  ESL_DMATRIX *D;
  int          a, b;
  int          npairs;
  double       mean;

  if (esl_dst_CDiffMx(as, N, &D) != eslOK) abort();

  a = 0;
  while (a < N)
    {
      if (D->mx[a][a] != 0.0) abort();
      a++;
    }

  mean = 0.;
  for (a = 3; a < N; a++)
    {
      for (b = a+1; b < N; b++)
        mean += D->mx[a][b];
    }
  npairs = (N-3) * (N-4) / 2;   /* first 3 don't count */
  mean  /= (double) npairs;

  if (mean < 0.65 || mean > 0.85) abort();

  esl_dmatrix_Destroy(D);
  return eslOK;
}
/* Function: esl_dmatrix_CreateUpper() * Incept: SRE, Wed Feb 28 08:45:45 2007 [Janelia] * * Purpose: Creates a packed upper triangular matrix of <n> rows and * <n> columns. Caller may only access cells $i \leq j$. * Cells $i > j$ are not stored and are implicitly 0. * * Not all matrix operations in Easel can work on packed * upper triangular matrices. * * Returns: a pointer to a new <ESL_DMATRIX> object of type * <eslUPPER>. Caller frees with <esl_dmatrix_Destroy()>. * * Throws: <NULL> if allocation fails. * * Xref: J1/10 */ ESL_DMATRIX * esl_dmatrix_CreateUpper(int n) { int status; ESL_DMATRIX *A = NULL; int r; /* counter over rows */ int nc; /* cell counter */ /* matrix structure allocation */ ESL_ALLOC(A, sizeof(ESL_DMATRIX)); A->mx = NULL; A->n = n; A->m = n; /* n row ptrs */ ESL_ALLOC(A->mx, sizeof(double *) * n); A->mx[0] = NULL; /* cell storage */ ESL_ALLOC(A->mx[0], sizeof(double) * n * (n+1) / 2); /* row pointers set in a tricksy overlapping way, so * mx[i][j] access works normally but only i<=j are valid. * xref J1/10. */ nc = n; /* nc is the number of valid cells assigned to rows so far */ for (r = 1; r < n; r++) { A->mx[r] = A->mx[0] + nc - r; /* -r overlaps this row w/ previous row */ nc += n-r; } A->type = eslUPPER; A->ncells = n * (n+1) / 2; return A; ERROR: esl_dmatrix_Destroy(A); return NULL; }
int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; char *keyfile = NULL; char *tabfile = NULL; ESL_KEYHASH *kh = esl_keyhash_Create(); int nkeys = 0; ESL_DMATRIX *D = NULL; ESL_TREE *T = NULL; go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], go, "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], go, "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], go, "Incorrect number of command line arguments.\n"); keyfile = esl_opt_GetArg(go, 1); tabfile = esl_opt_GetArg(go, 2); read_keyfile(go, keyfile, kh); nkeys = esl_keyhash_GetNumber(kh); D = esl_dmatrix_Create(nkeys, nkeys); read_tabfile(go, tabfile, kh, D); esl_tree_SingleLinkage(D, &T); //esl_tree_WriteNewick(stdout, T); output_clusters(go, T, kh); esl_tree_Destroy(T); esl_dmatrix_Destroy(D); esl_keyhash_Destroy(kh); esl_getopts_Destroy(go); return 0; }
/* Utility driver: parse a PAML-format file named on the command line
 * with esl_paml_ReadE() and print, as comma-separated values ten per
 * line, first the strictly lower triangle of the 20x20 matrix E
 * (row by row), then the 20 values of pi.
 *
 * Exits via esl_fatal() if the argument is missing, if an allocation
 * fails, or if the file cannot be opened or parsed.
 */
int
main(int argc, char **argv)
{
  char        *filename = NULL;
  FILE        *fp       = NULL;
  ESL_DMATRIX *E        = NULL;
  double      *pi       = NULL;
  int          i,j,n;

  /* argv[1] is only valid if the user actually supplied it */
  if (argc < 2)
    esl_fatal("Usage: %s <PAML data file>", (argc > 0 ? argv[0] : "example"));
  filename = argv[1];

  /* Both allocations are written through by esl_paml_ReadE(), so a
   * failure here must not be allowed to pass unnoticed.
   */
  if ((E  = esl_dmatrix_Create(20, 20))   == NULL) esl_fatal("malloc failed");
  if ((pi = malloc(20 * sizeof(double)))  == NULL) esl_fatal("malloc failed");

  if ((fp = fopen(filename, "r")) == NULL)  esl_fatal("open failed");
  if (esl_paml_ReadE(fp, E, pi)   != eslOK) esl_fatal("parse failed");

  /* lower triangle of E, 10 values per output line */
  n = 1;
  for (i = 1; i < 20; i++)
    for (j = 0; j < i; j++)
      {
        printf("%8.6f, ", E->mx[i][j]);
        if (n++ == 10) { puts(""); n=1; }
      }

  puts("");

  /* pi, 10 values per output line */
  n = 1;
  for (i = 0; i < 20; i++)
    {
      printf("%8.6f, ", pi[i]);
      if (n++ == 10) { puts(""); n=1; }
    }

  fclose(fp);
  free(pi);
  esl_dmatrix_Destroy(E);
  return 0;
}
/* Endpoint statistics driver.
 *
 * Reads an HMM from the file given with -m, or samples one of length M;
 * configures a unilocal profile from it; then, for each of <nseq>
 * iterations, collects one pair of sequence endpoints (i1,i2) and model
 * endpoints (k1,k2) and tallies them in the count matrices <imx> and
 * <kmx>. A second, "ideal" sample per iteration is tallied in <iref> as
 * the reference distribution for the i endpoints.
 *
 * Counts are then converted to log2 odds: <kmx> against a uniform
 * expectation over the M(M+1)/2 upper-triangle cells, <imx> against
 * <iref>. The matrices are optionally rendered to PostScript (--kps,
 * --ips), and value ranges are printed before and after conversion.
 *
 * Returns <eslOK> on success or after printing help, <eslFAIL> on a
 * wrong argument count; other failures exit through esl_fatal() or
 * p7_Fail().
 */
int
main(int argc, char **argv)
{
  ESL_ALPHABET    *abc     = NULL;     /* sequence alphabet                       */
  ESL_GETOPTS     *go      = NULL;     /* command line processing                 */
  ESL_RANDOMNESS  *r       = NULL;     /* source of randomness                    */
  P7_HMM          *hmm     = NULL;     /* sampled HMM to emit from                */
  P7_HMM          *core    = NULL;     /* safe copy of the HMM, before config     */
  P7_BG           *bg      = NULL;     /* null model                              */
  ESL_SQ          *sq      = NULL;     /* sampled sequence                        */
  P7_TRACE        *tr      = NULL;     /* sampled trace                           */
  P7_PROFILE      *gm      = NULL;     /* profile                                 */
  int              i,j;
  int              i1,i2;
  int              k1,k2;
  int              iseq;
  FILE            *fp      = NULL;
  double           expected;

  int              do_ilocal;
  char            *hmmfile = NULL;
  int              nseq;
  int              do_swlike;
  int              do_ungapped;
  int              L;
  int              M;
  int              do_h2;
  char            *ipsfile = NULL;
  char            *kpsfile = NULL;
  ESL_DMATRIX     *imx     = NULL;
  ESL_DMATRIX     *kmx     = NULL;
  ESL_DMATRIX     *iref    = NULL;     /* reference matrix: expected i distribution under ideality */
  int              Lbins;
  int              status;
  char             errbuf[eslERRBUFSIZE];

  /*****************************************************************
   * Parse the command line
   *****************************************************************/
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) esl_fatal("Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") == TRUE) {
    puts(usage);
    puts("\n where options are:\n");
    esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2 = indentation; 80=textwidth*/
    esl_getopts_Destroy(go);                   /* nothing else has been allocated yet */
    return eslOK;
  }
  do_ilocal   = esl_opt_GetBoolean(go, "-i");
  hmmfile     = esl_opt_GetString (go, "-m");
  nseq        = esl_opt_GetInteger(go, "-n");
  do_swlike   = esl_opt_GetBoolean(go, "-s");
  do_ungapped = esl_opt_GetBoolean(go, "-u");
  L           = esl_opt_GetInteger(go, "-L");
  M           = esl_opt_GetInteger(go, "-M");
  do_h2       = esl_opt_GetBoolean(go, "-2");
  ipsfile     = esl_opt_GetString (go, "--ips");
  kpsfile     = esl_opt_GetString (go, "--kps");

  if (esl_opt_ArgNumber(go) != 0) {
    puts("Incorrect number of command line arguments.");
    printf("Usage: %s [options]\n", argv[0]);
    esl_getopts_Destroy(go);                   /* nothing else has been allocated yet */
    return eslFAIL;
  }

  r = esl_randomness_CreateFast(0);

  if (hmmfile != NULL)
    { /* Read the HMM (and get alphabet from it) */
      P7_HMMFILE *hfp = NULL;

      status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
      if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
      else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
      else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf);

      if ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslOK) {
        if      (status == eslEOD)       esl_fatal("read failed, HMM file %s may be truncated?", hmmfile);
        else if (status == eslEFORMAT)   esl_fatal("bad file format in HMM file %s", hmmfile);
        else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets", hmmfile);
        else                             esl_fatal("Unexpected error in reading HMMs");
      }
      M = hmm->M;            /* the file's model length overrides -M */
      p7_hmmfile_Close(hfp);
    }
  else
    { /* Or sample the HMM (create alphabet first) */
      abc = esl_alphabet_Create(eslAMINO);
      if      (do_ungapped) p7_hmm_SampleUngapped(r, M, abc, &hmm);
      else if (do_swlike)   p7_hmm_SampleUniform (r, M, abc, 0.05, 0.5, 0.05, 0.2, &hmm); /* tmi, tii, tmd, tdd */
      else                  p7_hmm_Sample        (r, M, abc, &hmm);
    }

  Lbins = M;
  imx  = esl_dmatrix_Create(Lbins, Lbins);
  iref = esl_dmatrix_Create(Lbins, Lbins);
  kmx  = esl_dmatrix_Create(M, M);
  esl_dmatrix_SetZero(imx);
  esl_dmatrix_SetZero(iref);
  esl_dmatrix_SetZero(kmx);
  tr    = p7_trace_Create();
  sq    = esl_sq_CreateDigital(abc);
  bg    = p7_bg_Create(abc);
  core  = p7_hmm_Clone(hmm);

  if (do_h2) {
    gm = p7_profile_Create(hmm->M, abc);
    p7_H2_ProfileConfig(hmm, bg, gm, p7_UNILOCAL);
  } else {
    gm = p7_profile_Create(hmm->M, abc);
    p7_ProfileConfig(hmm, bg, gm, L, p7_UNILOCAL);
    if (p7_hmm_Validate    (hmm, NULL, 0.0001) != eslOK) esl_fatal("whoops, HMM is bad!");
    if (p7_profile_Validate(gm,  NULL, 0.0001) != eslOK) esl_fatal("whoops, profile is bad!");
  }

  /* Sample endpoints.
   * Also sample an ideal reference distribution for i endpoints.  i
   * endpoints are prone to discretization artifacts, when emitted
   * sequences have varying lengths. Taking log odds w.r.t. an ideal
   * reference that is subject to the same discretization artifacts
   * cancels out the effect.
   */
  for (iseq = 0; iseq < nseq; iseq++)
    {
      if (do_ilocal) ideal_local_endpoints  (r, core,     sq, tr, Lbins, &i1, &i2, &k1, &k2);
      else           profile_local_endpoints(r, core, gm, sq, tr, Lbins, &i1, &i2, &k1, &k2);

      /* endpoints come back 1-based; matrices are 0-based */
      imx->mx[i1-1][i2-1] += 1.;
      kmx->mx[k1-1][k2-1] += 1.;

      /* reference distribution for i */
      ideal_local_endpoints  (r, core, sq, tr, Lbins, &i1, &i2, &k1, &k2);
      iref->mx[i1-1][i2-1] += 1.;
    }

  /* Adjust both mx's to log_2(obs/exp) ratio */
  printf("Before normalization/log-odds:\n");
  printf(" i matrix values range from %f to %f\n", dmx_upper_min(imx), dmx_upper_max(imx));
  printf(" k matrix values range from %f to %f\n", dmx_upper_min(kmx), dmx_upper_max(kmx));
  printf("iref matrix values range from %f to %f\n", dmx_upper_min(iref), dmx_upper_max(iref));

  /* uniform expectation: nseq counts spread over M(M+1)/2 cells */
  expected = (double) nseq * 2. / (double) (M*(M+1));
  for (i = 0; i < kmx->m; i++)
    for (j = i; j < kmx->n; j++)
      kmx->mx[i][j] = log(kmx->mx[i][j] / expected) / log(2.0);

  /* imx vs. iref, with explicit handling of empty cells on either side */
  for (i = 0; i < imx->m; i++)
    for (j = i; j < imx->m; j++)
      if      (iref->mx[i][j] == 0. && imx->mx[i][j] == 0.) imx->mx[i][j] = 0.;
      else if (iref->mx[i][j] == 0.)                        imx->mx[i][j] = eslINFINITY;
      else if (imx->mx[i][j]  == 0.)                        imx->mx[i][j] = -eslINFINITY;
      else                                                  imx->mx[i][j] = log(imx->mx[i][j] / iref->mx[i][j]) / log(2.0);

  /* Print ps files */
  if (kpsfile != NULL) {
    if ((fp = fopen(kpsfile, "w")) == NULL) esl_fatal("Failed to open output postscript file %s", kpsfile);
    dmx_Visualize(fp, kmx, -4., 5.);
    fclose(fp);
  }
  if (ipsfile != NULL) {
    if ((fp = fopen(ipsfile, "w")) == NULL) esl_fatal("Failed to open output postscript file %s", ipsfile);
    dmx_Visualize(fp, imx, -4., 5.);
    /* dmx_Visualize(fp, imx, dmx_upper_min(imx), dmx_upper_max(imx)); */
    fclose(fp);
  }

  printf("After normalization/log-odds:\n");
  printf("i matrix values range from %f to %f\n", dmx_upper_min(imx), dmx_upper_max(imx));
  printf("k matrix values range from %f to %f\n", dmx_upper_min(kmx), dmx_upper_max(kmx));

  p7_profile_Destroy(gm);
  p7_bg_Destroy(bg);
  p7_hmm_Destroy(core);
  p7_hmm_Destroy(hmm);
  p7_trace_Destroy(tr);
  esl_sq_Destroy(sq);
  esl_dmatrix_Destroy(imx);
  esl_dmatrix_Destroy(kmx);
  esl_dmatrix_Destroy(iref);    /* was allocated above but never released */
  esl_alphabet_Destroy(abc);
  esl_randomness_Destroy(r);
  esl_getopts_Destroy(go);
  return eslOK;
}
/* esl_lapack_dgeev()
 * C wrapper around LAPACK's Fortran77 dgeev_() eigenvalue routine.
 *
 * A:       nxn real matrix. Transposed in place before the call and
 *          passed to dgeev_() as its working array, so its contents
 *          are not preserved.
 * ret_Er:  RETURN: vector of eigenvalues, real part, allocated 0..n-1
 * ret_Ei:  RETURN: vector of eigenvalues, imaginary part, allocated 0..n-1
 * ret_VL:  RETURN: left eigenvectors
 * ret_VR:  RETURN: right eigenvectors
 *
 * Any of the four output pointers may be NULL; that result is then
 * freed internally, and for <ret_VL>/<ret_VR> the corresponding
 * eigenvectors are not requested from LAPACK at all.
 *
 * Returns: <eslOK> on success. Caller frees the vectors with free()
 *          and the matrices with esl_dmatrix_Destroy().
 *
 * Throws:  <eslEMEM> on allocation failure.
 *          <eslEINVAL> if dgeev_() reports info < 0 (an illegal argument).
 *          <eslENOHALT> if dgeev_() reports info > 0 (the QR algorithm
 *          failed to compute all eigenvalues).
 *          On failure, all non-NULL output pointers are set to NULL.
 */
int
esl_lapack_dgeev(ESL_DMATRIX *A, double **ret_Er, double **ret_Ei, ESL_DMATRIX **ret_VL, ESL_DMATRIX **ret_VR)
{
  double      *Er   = NULL;
  double      *Ei   = NULL;
  ESL_DMATRIX *VL   = NULL;
  ESL_DMATRIX *VR   = NULL;
  double      *work = NULL;
  char   jobvl, jobvr;
  int    lda;
  int    ldvl, ldvr;
  int    lwork;
  int    info;
  int    status;

  if ((VL = esl_dmatrix_Create(A->n,A->n)) == NULL)       { status = eslEMEM; goto ERROR; }
  if ((VR = esl_dmatrix_Create(A->n,A->n)) == NULL)       { status = eslEMEM; goto ERROR; }
  ESL_ALLOC(Er,   sizeof(double) * A->n);
  ESL_ALLOC(Ei,   sizeof(double) * A->n);
  ESL_ALLOC(work, sizeof(double) * 4 * A->n);

  jobvl = (ret_VL == NULL) ? 'N' : 'V';	/* do we want left eigenvectors? */
  jobvr = (ret_VR == NULL) ? 'N' : 'V'; /* do we want right eigenvectors? */
  lda   = A->n;
  ldvl  = A->n;
  ldvr  = A->n;
  lwork = 4*A->n;                       /* must match the <work> allocation above */

  /* Fortran convention is colxrow, not rowxcol; so transpose
   * A before passing it to a Fortran routine.
   */
  esl_dmx_Transpose(A);

  /* The actual Fortran77 interface call to LAPACK.
   * All args must be passed by reference.
   * Fortran 2D arrays are 1D: so pass the A[0] part of a DSMX.
   */
  dgeev_(&jobvl, &jobvr, &(A->n), A->mx[0], &lda, Er, Ei, VL->mx[0], &ldvl, VR->mx[0], &ldvr, work, &lwork, &info);

  /* dgeev_() reports failure only through <info>. Without this check,
   * meaningless results would be handed back with an <eslOK> status.
   */
  if (info != 0) { status = (info < 0) ? eslEINVAL : eslENOHALT; goto ERROR; }

  /* Now, VL, VR are transposed (col x row), so transpose them back to
   * C convention.
   */
  esl_dmx_Transpose(VL);
  esl_dmx_Transpose(VR);

  if (ret_VL != NULL) *ret_VL = VL; else esl_dmatrix_Destroy(VL);
  if (ret_VR != NULL) *ret_VR = VR; else esl_dmatrix_Destroy(VR);
  if (ret_Er != NULL) *ret_Er = Er; else free(Er);
  if (ret_Ei != NULL) *ret_Ei = Ei; else free(Ei);
  free(work);
  return eslOK;

 ERROR:
  if (ret_VL != NULL) *ret_VL = NULL;
  if (ret_VR != NULL) *ret_VR = NULL;
  if (ret_Er != NULL) *ret_Er = NULL;
  if (ret_Ei != NULL) *ret_Ei = NULL;
  /* VL and VR are ESL_DMATRIX objects that own row pointers and cell
   * storage; a bare free() would release only the shell and leak the rest.
   */
  if (VL   != NULL) esl_dmatrix_Destroy(VL);
  if (VR   != NULL) esl_dmatrix_Destroy(VR);
  if (Er   != NULL) free(Er);
  if (Ei   != NULL) free(Ei);
  if (work != NULL) free(work);
  return status;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); char *hmmfile = esl_opt_GetArg(go, 1); char *qfile = esl_opt_GetArg(go, 2); ESL_SQ *qsq = esl_sq_CreateDigital(abc); ESL_SQFILE *qfp = NULL; FILE *hmmfp = NULL; ESL_SCOREMATRIX *S = esl_scorematrix_Create(abc); ESL_DMATRIX *Q = NULL; P7_BG *bg = p7_bg_Create(abc); P7_HMM *hmm = NULL; double *fa = NULL; double popen = esl_opt_GetReal (go, "-q"); double pextend = esl_opt_GetReal (go, "-r"); char *mxfile = esl_opt_GetString(go, "-m"); char errbuf[eslERRBUFSIZE]; double slambda; int a,b; int status; /* Reverse engineer a scoring matrix to obtain conditional prob's * that we'll use for the single-seq query HMM. Because score mx is * symmetric, we can set up P[a][b] = P(b | a), so we can use the * matrix rows as HMM match emission vectors. This means dividing * the joint probs through by f_a. */ if (mxfile == NULL) { if (esl_scorematrix_Set("BLOSUM62", S) != eslOK) esl_fatal("failed to set BLOSUM62 scores"); } else { ESL_FILEPARSER *efp = NULL; if ( esl_fileparser_Open(mxfile, NULL, &efp) != eslOK) esl_fatal("failed to open score file %s", mxfile); if ( esl_scorematrix_Read(efp, abc, &S) != eslOK) esl_fatal("failed to read matrix from %s", mxfile); esl_fileparser_Close(efp); } /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */ ESL_ALLOC(fa, sizeof(double) * bg->abc->K); esl_vec_F2D(bg->f, bg->abc->K, fa); /* Backcalculate joint probabilities Q, given score matrix S and background frequencies fa */ status = esl_scorematrix_ProbifyGivenBG(S, fa, fa, &slambda, &Q); if (status == eslEINVAL) esl_fatal("built-in score matrix %s has no valid solution for lambda", matrix); else if (status == eslENOHALT) esl_fatal("failed to solve score matrix %s for lambda", matrix); else if (status != eslOK) esl_fatal("unexpected error in solving score matrix %s for probability parameters", 
matrix); esl_scorematrix_JointToConditionalOnQuery(abc, Q); /* Open the query sequence file in FASTA format */ status = esl_sqfile_Open(qfile, eslSQFILE_FASTA, NULL, &qfp); if (status == eslENOTFOUND) esl_fatal("No such file %s.", qfile); else if (status == eslEFORMAT) esl_fatal("Format of %s unrecognized.", qfile); else if (status == eslEINVAL) esl_fatal("Can't autodetect stdin or .gz."); else if (status != eslOK) esl_fatal("Open of %s failed, code %d.", qfile, status); /* Open the output HMM file */ if ((hmmfp = fopen(hmmfile, "w")) == NULL) esl_fatal("Failed to open output HMM file %s", hmmfile); /* For each sequence, build a model and save it. */ while ((status = esl_sqio_Read(qfp, qsq)) == eslOK) { p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm); if ( p7_hmm_Validate(hmm, errbuf, 1e-5) != eslOK) esl_fatal("HMM validation failed: %s\n", errbuf); if ( p7_hmmfile_WriteASCII(hmmfp, -1, hmm) != eslOK) esl_fatal("HMM save failed"); p7_hmm_Destroy(hmm); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s line %" PRId64 "):\n%s\n", qfp->filename, qfp->linenumber, qfp->errbuf); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, qfp->filename); esl_dmatrix_Destroy(Q); esl_scorematrix_Destroy(S); free(fa); free(fb); esl_sq_Destroy(qsq); esl_sqfile_Close(qfp); fclose(hmmfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); char *qfile = esl_opt_GetArg(go, 1); char *tfile = esl_opt_GetArg(go, 2); ESL_SQFILE *qfp = NULL; ESL_SQFILE *tfp = NULL; ESL_SQ *qsq = esl_sq_CreateDigital(abc); ESL_SQ *tsq = esl_sq_CreateDigital(abc); ESL_SCOREMATRIX *S = esl_scorematrix_Create(abc); ESL_DMATRIX *Q = NULL; P7_BG *bg = p7_bg_Create(abc); P7_HMM *hmm = NULL; P7_PROFILE *gm = NULL; P7_REFMX *vit = p7_refmx_Create(200, 400); /* will grow as needed */ double *fa = malloc(sizeof(double) * abc->K); double popen = 0.02; double pextend = 0.4; double lambda; float vsc; float nullsc; int status; esl_composition_BL62(fa); esl_vec_D2F(fa, abc->K, bg->f); esl_scorematrix_Set("BLOSUM62", S); esl_scorematrix_ProbifyGivenBG(S, fa, fa, &lambda, &Q); esl_scorematrix_JointToConditionalOnQuery(abc, Q); if (esl_sqfile_OpenDigital(abc, qfile, eslSQFILE_UNKNOWN, NULL, &qfp) != eslOK) esl_fatal("failed to open %s", qfile); if (esl_sqio_Read(qfp, qsq) != eslOK) esl_fatal("failed to read query seq"); p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm); p7_hmm_SetComposition(hmm); p7_hmm_SetConsensus(hmm, qsq); gm = p7_profile_Create(hmm->M, abc); p7_profile_ConfigUnilocal(gm, hmm, bg, 400); if (esl_sqfile_OpenDigital(abc, tfile, eslSQFILE_UNKNOWN, NULL, &tfp) != eslOK) esl_fatal("failed to open %s", tfile); while ((status = esl_sqio_Read(tfp, tsq)) == eslOK) { p7_bg_SetLength (bg, tsq->n); p7_profile_SetLength(gm, tsq->n); p7_ReferenceViterbi(tsq->dsq, tsq->n, gm, vit, NULL, &vsc); p7_bg_NullOne(bg, tsq->dsq, tsq->n, &nullsc); printf("%.4f %-25s %-25s\n", (vsc - nullsc) / eslCONST_LOG2, tsq->name, gm->name); esl_sq_Reuse(tsq); p7_refmx_Reuse(vit); } p7_refmx_Destroy(vit); p7_profile_Destroy(gm); p7_hmm_Destroy(hmm); p7_bg_Destroy(bg); esl_dmatrix_Destroy(Q); esl_scorematrix_Destroy(S); free(fa); esl_sq_Destroy(qsq); esl_sq_Destroy(tsq); 
esl_sqfile_Close(qfp); esl_sqfile_Close(tfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
/* Function:  esl_msaweight_GSC()
 * Synopsis:  GSC weights.
 * Incept:    SRE, Fri Nov  3 13:31:14 2006 [Janelia]
 *
 * Purpose:   Given a multiple sequence alignment <msa>, calculate
 *            sequence weights according to the
 *            Gerstein/Sonnhammer/Chothia algorithm. These weights
 *            are stored internally in the <msa> object, replacing
 *            any weights that may have already been there. Weights
 *            are $\geq 0$ and they sum to <msa->nseq>.
 *
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that distance calculations
 *            used by the GSC algorithm are robust against degenerate
 *            residue symbols.
 *
 *            This is an implementation of Gerstein et al., "A method to
 *            weight protein sequences to correct for unequal
 *            representation", JMB 236:1067-1078, 1994.
 *
 *            The algorithm is $O(N^2)$ memory (it requires a pairwise
 *            distance matrix) and $O(N^3 + LN^2)$ time ($N^3$ for a UPGMA
 *            tree building step, $LN^2$ for distance matrix construction)
 *            for an alignment of N sequences and L columns.
 *
 *            In the current implementation, the actual memory
 *            requirement is dominated by two full NxN distance
 *            matrices (one tmp copy in UPGMA, and one here): for
 *            8-byte doubles, that's $16N^2$ bytes. To keep the
 *            calculation under memory limits, don't process large
 *            alignments: max 1400 sequences for 32 MB, max 4000
 *            sequences for 256 MB, max 8000 seqs for 1 GB. Watch
 *            out, because Pfam alignments can easily blow this up.
 *
 * Note:      Memory usage could be improved. UPGMA consumes a distance
 *            matrix, but that can be D itself, not a copy, if the
 *            caller doesn't mind the destruction of D. Also, D is
 *            symmetrical, so we could use upper or lower triangular
 *            matrices if we rewrote dmatrix to allow them.
 *
 *            I also think UPGMA can be reduced to O(N^2) time, by
 *            being more tricky about rapidly identifying the minimum
 *            element: could keep min of each row, and update that,
 *            I think.
 *
 * Args:      msa - alignment to weight; <msa->wgt> must already be
 *                  allocated, and <nseq>, <alen> must be >= 1.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified.
 *
 * Throws:    <eslEINVAL> if the alignment data are somehow invalid and
 *            distance matrices can't be calculated. <eslEMEM> on an
 *            allocation error. In either case, the original <msa> is
 *            left unmodified.
 *
 * Xref:      [Gerstein94]; squid::weight.c::GSCWeights(); STL11/81.
 */
int
esl_msaweight_GSC(ESL_MSA *msa)
{
  ESL_DMATRIX *D = NULL;     /* distance matrix */
  ESL_TREE    *T = NULL;     /* UPGMA tree */
  double      *x = NULL;     /* storage per node, 0..N-2 */
  double       lw, rw;       /* total branchlen on left, right subtrees */
  double       lx, rx;       /* distribution of weight to left, right side */
  int          i;            /* counter over nodes */
  int          status;

  /* Contract checks
   */
  ESL_DASSERT1( (msa       != NULL) );
  ESL_DASSERT1( (msa->nseq >= 1)    );
  ESL_DASSERT1( (msa->alen >= 1)    );
  ESL_DASSERT1( (msa->wgt  != NULL) );
  if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

  /* GSC weights use a rooted tree with "branch lengths" calculated by
   * UPGMA on a fractional difference matrix - pretty crude.
   */
  if (! (msa->flags & eslMSA_DIGITAL)) {
    if ((status = esl_dst_CDiffMx(msa->aseq, msa->nseq, &D)) != eslOK) goto ERROR;
  }
#ifdef eslAUGMENT_ALPHABET
  else {
    if ((status = esl_dst_XDiffMx(msa->abc, msa->ax, msa->nseq, &D)) != eslOK) goto ERROR;
  }
#endif

  /* oi, look out here.  UPGMA is correct, but old squid library uses
   * single linkage, so for regression tests ONLY, we use single link.
   */
#ifdef  eslMSAWEIGHT_REGRESSION
  if ((status = esl_tree_SingleLinkage(D, &T)) != eslOK) goto ERROR;
#else
  if ((status = esl_tree_UPGMA(D, &T)) != eslOK) goto ERROR;
#endif

  /* T->cladesize[] is read below in the zero-branch-length case, so a
   * failure to set it must stop us here, while msa is still untouched.
   */
  if ((status = esl_tree_SetCladesizes(T)) != eslOK) goto ERROR;

  ESL_ALLOC(x, sizeof(double) * (T->N-1));

  /* Postorder traverse (leaves to root) to calculate the total branch
   * length under each internal node; store this in x[]. A child index
   * > 0 is an internal node, whose own total is added in.
   */
  for (i = T->N-2; i >= 0; i--)
    {
      x[i] = T->ld[i] + T->rd[i];
      if (T->left[i]  > 0) x[i] += x[T->left[i]];
      if (T->right[i] > 0) x[i] += x[T->right[i]];
    }

  /* Preorder traverse (root to leaves) to calculate the weights.  Now
   * we use x[] to mean, the total weight *above* this node that we will
   * apportion to the node's left and right children. The two
   * meanings of x[] never cross: every x[] beneath x[i] is still a
   * total branch length.
   *
   * Because the API guarantees that msa is returned unmodified in case
   * of an exception, and we're touching msa->wgt here, no exceptions
   * may be thrown from now on in this function.
   */
  x[0] = 0;			/* initialize: no branch to the root. */
  for (i = 0; i <= T->N-2; i++)
    {
      lw = T->ld[i];   if (T->left[i]  > 0) lw += x[T->left[i]];
      rw = T->rd[i];   if (T->right[i] > 0) rw += x[T->right[i]];

      if (lw+rw == 0.)
	{
	  /* A special case arises in GSC weights when all branch lengths in a subtree are 0.
	   * In this case, all seqs in this clade should get equal weights, sharing x[i] equally.
	   * So, split x[i] in proportion to cladesize, not to branch weight.
	   */
	  if (T->left[i] > 0)  lx =  x[i] * ((double) T->cladesize[T->left[i]]  / (double) T->cladesize[i]);
	  else                 lx =  x[i] / (double) T->cladesize[i];

	  if (T->right[i] > 0) rx =  x[i] * ((double) T->cladesize[T->right[i]] / (double) T->cladesize[i]);
	  else                 rx =  x[i] / (double) T->cladesize[i];
	}
      else /* normal case: x[i] split in proportion to branch weight. */
	{
	  lx = x[i] * lw/(lw+rw);
	  rx = x[i] * rw/(lw+rw);
	}

      /* A child index <= 0 is a leaf: its negation indexes the sequence
       * that receives the weight. Otherwise pass the weight down to the
       * child internal node.
       */
      if (T->left[i]  <= 0) msa->wgt[-(T->left[i])] = lx + T->ld[i];
      else                  x[T->left[i]] = lx + T->ld[i];

      if (T->right[i] <= 0) msa->wgt[-(T->right[i])] = rx + T->rd[i];
      else                  x[T->right[i]] = rx + T->rd[i];
    }

  /* Renormalize weights to sum to N.
   */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
  msa->flags |= eslMSA_HASWGTS;

  free(x);
  esl_tree_Destroy(T);
  esl_dmatrix_Destroy(D);
  return eslOK;

 ERROR:
  free(x);
  if (T != NULL) esl_tree_Destroy(T);
  if (D != NULL) esl_dmatrix_Destroy(D);
  return status;
}